From d4df639120d818af67176a4817f13d6a793b950e Mon Sep 17 00:00:00 2001 From: yinbin Date: Wed, 9 Jul 2025 18:43:01 +0800 Subject: [PATCH] sync sockio: fix rpc_send coredump after RPC_MSG_EXIT --- ...am-stuck-while-gazelle-exit-in-multi.patch | 134 + 0323-cleancode-add-GAZELLE_SAME_NODE.patch | 958 +++ ...-cleancode-remove-gazelle_light_ring.patch | 104 + ...-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch | 292 + 0326-socket-refactor-sock_event.patch | 1613 +++++ 0327-socket-adapt-to-sock_event.patch | 2955 +++++++++ 0328-socket-refactor-tcp-and-udp.patch | 4524 ++++++++++++++ 0329-socket-adapt-to-tcp-and-udp.patch | 5305 +++++++++++++++++ ...e_max-and-change-default-rpc_msg_max.patch | 157 + 0331-cfg-add-mem_async_mode.patch | 98 + 0332-mempool-add-mem_thread_cache_flush.patch | 617 ++ 0333-dfx-support-sk_wait-stat.patch | 526 ++ 0334-mempool-fix-copy_mbuf_private.patch | 75 + 0335-socket-fix-connect-blocking.patch | 198 + ...ck_tcp_read-do-not-recv_finish_burst.patch | 32 + 0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch | 167 + 0338-socket-fix-tcp-closed.patch | 95 + ...ait-cannot-be-interrupted-by-signals.patch | 139 + ...pool-modify-mbuf-num-and-rpc_msg-num.patch | 98 + ..._ring-free-not-call-recv_finish_burs.patch | 125 + ...empool-mem_get_rpc-add-reserve-limit.patch | 363 ++ ...ead_tryjoin_np-coredump-when-mysqld-.patch | 59 + ...op-using-cache-when-too-many-threads.patch | 158 + ...ait-fix-lwip_tcp_allow_send-coredump.patch | 45 + ...ng_common_free-coredump-when-rte_rin.patch | 52 + 0347-sk_wait-fix-sock_wait_common_free.patch | 62 + ...ket-fix-stack_udp_readmsg-return-len.patch | 60 + ..._wait_foreach_notify-coredump-at-sta.patch | 61 + ...epoll-fix-do_lwip_connected_callback.patch | 212 + ...tack_tcp_write-wrong-copied_total-af.patch | 36 + ...-epoll-wrong-event-notify-and-remove.patch | 349 ++ ..._connected_callback-not-delete-poll-.patch | 158 + ...fix-rtw-broadcast-close-and-shutdown.patch | 53 + ...-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch | 26 + ...ack_tcp_send-output-too-many-at-once.patch | 76 + ...fix-sendmbox-full-return-EWOULDBLOCK.patch | 336 ++ ..._wait-igonre-mem_thread-flush-signal.patch | 112 + 0359-fix-20.03-LTS-build-failed.patch | 25 + ...io-fix-tcp_write-not-remove-EPOLLOUT.patch | 108 + ...rpc_send-coredump-after-RPC_MSG_EXIT.patch | 335 ++ gazelle.spec | 84 +- 41 files changed, 20981 insertions(+), 1 deletion(-) create mode 100644 0322-RTC-fixing-program-stuck-while-gazelle-exit-in-multi.patch create mode 100644 0323-cleancode-add-GAZELLE_SAME_NODE.patch create mode 100644 0324-cleancode-remove-gazelle_light_ring.patch create mode 100644 0325-cleancode-remove-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch create mode 100644 0326-socket-refactor-sock_event.patch create mode 100644 0327-socket-adapt-to-sock_event.patch create mode 100644 0328-socket-refactor-tcp-and-udp.patch create mode 100644 0329-socket-adapt-to-tcp-and-udp.patch create mode 100644 0330-cfg-add-mem_cache_max-and-change-default-rpc_msg_max.patch create mode 100644 0331-cfg-add-mem_async_mode.patch create mode 100644 0332-mempool-add-mem_thread_cache_flush.patch create mode 100644 0333-dfx-support-sk_wait-stat.patch create mode 100644 0334-mempool-fix-copy_mbuf_private.patch create mode 100644 0335-socket-fix-connect-blocking.patch create mode 100644 0336-socket-fix-stack_tcp_read-do-not-recv_finish_burst.patch create mode 100644 0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch create mode 100644 0338-socket-fix-tcp-closed.patch create mode 100644 0339-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch 
create mode 100644 0340-mempool-modify-mbuf-num-and-rpc_msg-num.patch create mode 100644 0341-mempool-fix-mbox_ring-free-not-call-recv_finish_burs.patch create mode 100644 0342-mempool-mem_get_rpc-add-reserve-limit.patch create mode 100644 0343-mempool-fix-pthread_tryjoin_np-coredump-when-mysqld-.patch create mode 100644 0344-mempool-stop-using-cache-when-too-many-threads.patch create mode 100644 0345-sk_wait-fix-lwip_tcp_allow_send-coredump.patch create mode 100644 0346-mbox-fix-mbox_ring_common_free-coredump-when-rte_rin.patch create mode 100644 0347-sk_wait-fix-sock_wait_common_free.patch create mode 100644 0348-socket-fix-stack_udp_readmsg-return-len.patch create mode 100644 0349-sk_wait-fix-lwip_wait_foreach_notify-coredump-at-sta.patch create mode 100644 0350-epoll-fix-do_lwip_connected_callback.patch create mode 100644 0351-sockio-fix-rtw_stack_tcp_write-wrong-copied_total-af.patch create mode 100644 0352-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch create mode 100644 0353-poll-fix-do_lwip_connected_callback-not-delete-poll-.patch create mode 100644 0354-sockctl-fix-rtw-broadcast-close-and-shutdown.patch create mode 100644 0355-mempool-increase-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch create mode 100644 0356-sockio-fix-callback_tcp_send-output-too-many-at-once.patch create mode 100644 0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch create mode 100644 0358-sk_wait-igonre-mem_thread-flush-signal.patch create mode 100644 0359-fix-20.03-LTS-build-failed.patch create mode 100644 0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch create mode 100644 0361-sockio-fix-rpc_send-coredump-after-RPC_MSG_EXIT.patch diff --git a/0322-RTC-fixing-program-stuck-while-gazelle-exit-in-multi.patch b/0322-RTC-fixing-program-stuck-while-gazelle-exit-in-multi.patch new file mode 100644 index 0000000..1706f31 --- /dev/null +++ b/0322-RTC-fixing-program-stuck-while-gazelle-exit-in-multi.patch @@ -0,0 +1,134 @@ +From e09eda4fb531f63836f0b2c2ed4bfd4f769a67e3 Mon Sep 17 00:00:00 2001 +From: yinbin +Date: Tue, 3 Jun 2025 21:05:13 +0800 +Subject: [PATCH] RTC: fixing program stuck while gazelle exit in multi-threaded + environment + +--- + src/common/gazelle_base_func.h | 2 ++ + src/lstack/api/lstack_dummy_api.c | 35 +++++++++++++++++++++---- + src/lstack/core/lstack_dump.c | 2 +- + src/lstack/core/lstack_protocol_stack.c | 12 +++++++-- + 4 files changed, 43 insertions(+), 8 deletions(-) + +diff --git a/src/common/gazelle_base_func.h b/src/common/gazelle_base_func.h +index a579cd4..0ccb34b 100644 +--- a/src/common/gazelle_base_func.h ++++ b/src/common/gazelle_base_func.h +@@ -15,6 +15,8 @@ + + #include + ++#define US_PER_MS 1000 ++ + #define GAZELLE_FREE(p) do { \ + if (p) { \ + free(p); \ +diff --git a/src/lstack/api/lstack_dummy_api.c b/src/lstack/api/lstack_dummy_api.c +index 3a867b3..004a3aa 100644 +--- a/src/lstack/api/lstack_dummy_api.c ++++ b/src/lstack/api/lstack_dummy_api.c +@@ -17,18 +17,45 @@ + #include + #include + +-#define DUMMY_SLEEP_S 5 ++#include "lstack_log.h" ++#include "lstack_protocol_stack.h" ++#include "common/gazelle_base_func.h" ++ ++#define DUMMY_WAIT_TIMEOUT_MS 5000 ++static void waiting_exit_msg(void) ++{ ++ int time = 0; ++ int sleep_interval = 10; ++ ++ while (time < DUMMY_WAIT_TIMEOUT_MS) { ++ time += sleep_interval; ++ usleep(sleep_interval * US_PER_MS); ++ /* Must be in a secure context before closing sockets */ ++ if (get_protocol_stack() && stack_polling(0) != 0) { ++ /* Means stack has closed all fds */ ++ stack_wait(); ++ break; ++ } ++ } ++ ++ if (time >=
DUMMY_WAIT_TIMEOUT_MS) { ++ LSTACK_LOG(ERR, LSTACK, "APP thread didn't receive 'stack_exit' message, will force quit within 5 seconds.\n"); ++ stack_wait(); ++ } ++ ++ usleep(DUMMY_WAIT_TIMEOUT_MS * US_PER_MS); ++} + + static inline ssize_t dummy_exit(void) + { +- sleep(DUMMY_SLEEP_S); ++ waiting_exit_msg(); + errno = ENOTCONN; + return -1; + } + + static int dummy_socket(int domain, int type, int protocol) + { +- sleep(DUMMY_SLEEP_S); ++ waiting_exit_msg(); + return -1; + } + +@@ -68,6 +95,4 @@ void dummy_api_init(posix_api_t *api) + api->sendto_fn = dummy_sendto; + + rte_wmb(); +- /* 1: wait until app thread call send functio complete */ +- sleep(1); + } +diff --git a/src/lstack/core/lstack_dump.c b/src/lstack/core/lstack_dump.c +index da9da28..7092871 100644 +--- a/src/lstack/core/lstack_dump.c ++++ b/src/lstack/core/lstack_dump.c +@@ -18,6 +18,7 @@ + + #include "lstack_cfg.h" + #include "lstack_log.h" ++#include "common/gazelle_base_func.h" + + #define DUMP_COMMAND_TIMEOUT_MS 2000 + #define DUMP_COMMAND_INTERVAL_MS 1 +@@ -50,7 +51,6 @@ static int dump_lstack_check(void) + return 0; + } + +-#define US_PER_MS (MS_PER_S) + static long timeval_diff_ms(struct timeval *end, struct timeval *begin) + { + struct timeval result; +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index cb1b2b8..ed36890 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -56,10 +56,18 @@ enum rte_lcore_state_t stack_get_state(struct protocol_stack *stack) + return __atomic_load_n(&stack->state, __ATOMIC_ACQUIRE); + } + ++#define STACK_WAIT_TIMEOUT_MS 5000 + static void stack_wait_quit(struct protocol_stack *stack) + { +- while (__atomic_load_n(&stack->state, __ATOMIC_ACQUIRE) != WAIT) { +- rte_pause(); ++ int timeout = 0; ++ int sleep_interval = 10; ++ while (stack_get_state(stack) != WAIT && timeout < STACK_WAIT_TIMEOUT_MS) { ++ timeout += sleep_interval; ++ usleep(sleep_interval * US_PER_MS); ++ } ++ ++ if (timeout >= STACK_WAIT_TIMEOUT_MS) { ++ LSTACK_LOG(ERR, LSTACK, "stack %p exit timed out!\n", stack); + } + } + +-- +2.33.0 + diff --git a/0323-cleancode-add-GAZELLE_SAME_NODE.patch b/0323-cleancode-add-GAZELLE_SAME_NODE.patch new file mode 100644 index 0000000..650b5e5 --- /dev/null +++ b/0323-cleancode-add-GAZELLE_SAME_NODE.patch @@ -0,0 +1,958 @@ +From ff922a7e3085c37ed70add038a306ef519983a06 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 11 Mar 2025 14:30:02 +0800 +Subject: [PATCH] cleancode: add GAZELLE_SAME_NODE + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_wrap.c | 2 + + src/lstack/core/lstack_lwip.c | 363 +------------------- + src/lstack/core/lstack_protocol_stack.c | 4 + + src/lstack/core/same_node.c | 376 +++++++++++++++++++++ + src/lstack/include/lstack_ethdev.h | 2 - + src/lstack/include/lstack_lwip.h | 6 +- + src/lstack/include/lstack_protocol_stack.h | 2 + + src/lstack/include/same_node.h | 35 ++ + 8 files changed, 427 insertions(+), 363 deletions(-) + create mode 100644 src/lstack/core/same_node.c + create mode 100644 src/lstack/include/same_node.h + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index e90c523..8a88c47 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -362,6 +362,7 @@ static bool kernel_ip_route(const struct sockaddr *dst_addr) + + static bool should_enter_kernel_connect(const struct sockaddr *addr) + { ++#if GAZELLE_SAME_NODE + int32_t remote_port; + char listen_ring_name[RING_NAME_LEN]; + +@@ -371,6
+372,7 @@ static bool should_enter_kernel_connect(const struct sockaddr *addr) + if (kernel_ip_match(addr) && rte_ring_lookup(listen_ring_name) == NULL) { + return true; + } ++#endif /* GAZELLE_SAME_NODE */ + + if (lwip_ip_route(addr)) { + return false; +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 4f5c4cc..d0e51b2 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -814,76 +814,6 @@ static inline void notice_stack_send(struct lwip_sock *sock, int32_t fd, int32_t + } + } + +-/* process on same node use ring to recv data */ +-ssize_t gazelle_same_node_ring_recv(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags) +-{ +- unsigned long long cur_begin = sock->same_node_rx_ring->sndbegin; +- unsigned long long cur_end; +- unsigned long long index = cur_begin + 1; +- size_t act_len = 0; +- +- cur_end = __atomic_load_n(&sock->same_node_rx_ring->sndend, __ATOMIC_ACQUIRE); +- if (cur_begin == cur_end) { +- errno = EAGAIN; +- act_len = -1; +- goto END; +- } +- act_len = cur_end - index + 1; +- act_len = RTE_MIN(act_len, len); +- if ((index & SAME_NODE_RING_MASK) + act_len > SAME_NODE_RING_LEN) { +- size_t act_len1 = SAME_NODE_RING_LEN - (index & SAME_NODE_RING_MASK); +- size_t act_len2 = act_len - act_len1; +- rte_memcpy((char *)buf, (char *)sock->same_node_rx_ring->mz->addr + (index & SAME_NODE_RING_MASK), act_len1); +- rte_memcpy((char *)buf + act_len1, (char *)sock->same_node_rx_ring->mz->addr, act_len2); +- } else { +- rte_memcpy((char *)buf, (char *)sock->same_node_rx_ring->mz->addr + (index & SAME_NODE_RING_MASK), act_len); +- } +- +- index += act_len; +- __atomic_store_n(&sock->same_node_rx_ring->sndbegin, index - 1, __ATOMIC_RELEASE); +- +-END: +- /* rte_ring_count reduce lock */ +- if (sock->wakeup && sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLIN) +- && (!NETCONN_IS_DATAIN(sock))) { +- del_sock_event(sock, EPOLLIN); +- } +- return act_len; +-} +- +-/* processes on same node use ring to send data */ +-ssize_t gazelle_same_node_ring_send(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags) +-{ +- unsigned long long cur_begin = __atomic_load_n(&sock->same_node_tx_ring->sndbegin, __ATOMIC_ACQUIRE); +- unsigned long long cur_end = sock->same_node_tx_ring->sndend; +- if (cur_end >= cur_begin + SAME_NODE_RING_LEN) { +- errno = EAGAIN; +- return -1; +- } +- +- unsigned long long index = cur_end + 1; +- size_t act_len = SAME_NODE_RING_LEN - (cur_end - cur_begin); +- act_len = RTE_MIN(act_len, len); +- +- if ((index & SAME_NODE_RING_MASK) + act_len > SAME_NODE_RING_LEN) { +- size_t act_len1 = SAME_NODE_RING_LEN - (index & SAME_NODE_RING_MASK); +- size_t act_len2 = act_len - act_len1; +- rte_memcpy((char *)sock->same_node_tx_ring->mz->addr + (index & SAME_NODE_RING_MASK), buf, act_len1); +- rte_memcpy((char *)sock->same_node_tx_ring->mz->addr, (char *)buf + act_len1, act_len2); +- } else { +- rte_memcpy((char *)sock->same_node_tx_ring->mz->addr + (index & SAME_NODE_RING_MASK), buf, act_len); +- } +- +- index += act_len; +- __atomic_store_n(&sock->same_node_tx_ring->sndend, index - 1, __ATOMIC_RELEASE); +- if (act_len == 0) { +- errno = EAGAIN; +- return -1; +- } +- +- return act_len; +-} +- + ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t flags, + const struct sockaddr *addr, socklen_t addrlen) + { +@@ -903,9 +833,11 @@ ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t f + sock->already_bind_numa = 1; + } + ++#if 
GAZELLE_SAME_NODE + if (sock->same_node_tx_ring != NULL) { + return gazelle_same_node_ring_send(sock, buf, len, flags); + } ++#endif /* GAZELLE_SAME_NODE */ + if (sock->errevent > 0 || sock->stack == NULL) { + GAZELLE_RETURN(ENOTCONN); + } +@@ -1186,10 +1118,11 @@ ssize_t do_lwip_read_from_stack(int32_t fd, void *buf, size_t len, int32_t flags + sock->already_bind_numa = 1; + } + ++#if GAZELLE_SAME_NODE + if (sock->same_node_rx_ring != NULL) { +- return gazelle_same_node_ring_recv(sock, buf, len, flags); +- } +- ++ recvd = gazelle_same_node_ring_recv(sock, buf, len, flags); ++ } else ++#endif /* GAZELLE_SAME_NODE */ + if (NETCONN_IS_UDP(sock)) { + recvd = recv_ring_udp_read(sock, buf, len, noblock, addr, addrlen); + } else { +@@ -1220,21 +1153,6 @@ void do_lwip_add_recvlist(int32_t fd) + } + } + +-void read_same_node_recv_list(struct protocol_stack *stack) +-{ +- struct list_node *list = &(stack->same_node_recv_list); +- struct list_node *node, *temp; +- struct lwip_sock *sock; +- +- list_for_each_node(node, temp, list) { +- sock = list_entry(node, struct lwip_sock, recv_list); +- +- if (sock->same_node_rx_ring != NULL && same_node_ring_count(sock)) { +- add_sock_event(sock, EPOLLIN); +- } +- } +-} +- + void do_lwip_read_recvlist(struct protocol_stack *stack, uint32_t max_num) + { + struct list_node *list = &(stack->recv_list); +@@ -1417,272 +1335,3 @@ uint32_t do_lwip_get_connnum(void) + return conn_num; + } + +-void netif_poll(struct netif *netif) +-{ +- struct tcp_pcb *pcb = NULL; +- struct tcp_pcb_listen *pcbl = NULL; +- +- for (pcb = tcp_active_pcbs; pcb != NULL; pcb = pcb->next) { +-#define NETIF_POLL_READ_COUNT 32 +- struct pbuf *pbufs[NETIF_POLL_READ_COUNT]; +- int ret; +- +- if (pcb->client_rx_ring != NULL) { +- ret = rte_ring_sc_dequeue_burst(pcb->client_rx_ring, (void **)pbufs, NETIF_POLL_READ_COUNT, NULL); +- for (int i = 0; i < ret; i++) { +- if (ip_input(pbufs[i], netif) != 0) { +- LSTACK_LOG(INFO, LSTACK, "ip_input return err\n"); +- pbuf_free(pbufs[i]); +- } +- } +- } +- } +- for (pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL; pcbl = pcbl->next) { +- if (pcbl->listen_rx_ring != NULL) { +- struct pbuf *pbuf; +- if (rte_ring_sc_dequeue(pcbl->listen_rx_ring, (void **)&pbuf) == 0) { +- if (ip_input(pbuf, netif) != ERR_OK) { +- pbuf_free(pbuf); +- } +- } +- } +- } +-} +- +-/* processes on same node handshake packet use this function */ +-err_t netif_loop_output(struct netif *netif, struct pbuf *p) +-{ +- if (!p) { +- return ERR_ARG; +- } +- const struct ip_hdr *iphdr; +- iphdr = (const struct ip_hdr *)p->payload; +- if (IPH_PROTO(iphdr) == IP_PROTO_UDP) { +- return udp_netif_loop_output(netif, p); +- } +- +- struct tcp_pcb *pcb = p->pcb; +- struct pbuf *head = NULL; +- +- if (pcb == NULL || pcb->client_tx_ring == NULL) { +- LSTACK_LOG(ERR, LSTACK, "pcb is null\n"); +- return ERR_ARG; +- } +- +- if (p->next != NULL) { +- LSTACK_LOG(ERR, LSTACK, "netif_loop_output: not support chained pbuf\n"); +- return ERR_ARG; +- } +- +- struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)((char *)p->payload + sizeof(struct ip_hdr)); +- uint8_t flags = TCPH_FLAGS(tcp_hdr); +- +- head = pbuf_alloc(0, p->len, PBUF_RAM); +- if (head == NULL) { +- LSTACK_LOG(ERR, LSTACK, "netif_loop_output: pbuf_alloc failed\n"); +- return ERR_MEM; +- } +- memcpy_s(head->payload, head->len, p->payload, p->len); +- +- if ((flags & TCP_SYN) && !(flags & TCP_ACK)) { +- /* SYN packet, send to listen_ring */ +- char ring_name[RING_NAME_LEN] = {0}; +- snprintf_s(ring_name, sizeof(ring_name), sizeof(ring_name) - 1, 
"listen_rx_ring_%d", pcb->remote_port); +- struct rte_ring *ring = rte_ring_lookup(ring_name); +- if (ring == NULL) { +- LSTACK_LOG(INFO, LSTACK, "netif_loop_output: cant find listen_rx_ring %d\n", pcb->remote_port); +- pbuf_free(head); +- } else { +- if (rte_ring_mp_enqueue(ring, head) != 0) { +- LSTACK_LOG(INFO, LSTACK, "enqueue sync packet failed\n"); +- pbuf_free(head); +- } +- } +- } else { +- /* send other type packet to tx_ring */ +- if (rte_ring_sp_enqueue(pcb->client_tx_ring, head) != 0) { +- LSTACK_LOG(INFO, LSTACK, "client tx ring full\n"); +- pbuf_free(head); +- } +- } +- +- return ERR_OK; +-} +- +-err_t find_same_node_memzone(struct tcp_pcb *pcb, struct lwip_sock *nsock) +-{ +- char name[RING_NAME_LEN]; +- snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_rx_%u", pcb->remote_port); +- if ((nsock->same_node_tx_ring_mz = rte_memzone_lookup(name)) == NULL) { +- LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); +- return -1; +- } else { +- LSTACK_LOG(INFO, LSTACK, "lookup %s success\n", name); +- } +- nsock->same_node_tx_ring = (struct same_node_ring *)nsock->same_node_tx_ring_mz->addr; +- +- snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_buf_rx_%u", pcb->remote_port); +- if ((nsock->same_node_tx_ring->mz = rte_memzone_lookup(name)) == NULL) { +- LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); +- return -1; +- } +- +- snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_tx_%u", pcb->remote_port); +- if ((nsock->same_node_rx_ring_mz = rte_memzone_lookup(name)) == NULL) { +- LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); +- return -1; +- } else { +- LSTACK_LOG(INFO, LSTACK, "lookup %s success\n", name); +- } +- nsock->same_node_rx_ring = (struct same_node_ring *)nsock->same_node_rx_ring_mz->addr; +- +- snprintf_s(name, sizeof(name), sizeof(name) - 1,"rte_mz_buf_tx_%u", pcb->remote_port); +- if ((nsock->same_node_rx_ring->mz = rte_memzone_lookup(name)) == NULL) { +- LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); +- return -1; +- } +- +- /* rcvlink init in alloc_socket() */ +- /* remove from g_rcv_process_list in free_socket */ +- list_add_node(&nsock->recv_list, &nsock->stack->same_node_recv_list); +- return 0; +-} +- +-err_t same_node_memzone_create(const struct rte_memzone **zone, int size, int port, char *name, char *rx) +-{ +- char mem_name[RING_NAME_LEN] = {0}; +- snprintf_s(mem_name, sizeof(mem_name), sizeof(mem_name) - 1, "%s_%s_%d", name, rx, port); +- +- *zone = rte_memzone_reserve_aligned(mem_name, size, rte_socket_id(), 0, RTE_CACHE_LINE_SIZE); +- if (*zone == NULL) { +- LSTACK_LOG(ERR, LSTACK, "cannot reserve memzone:%s, errno is %d\n", mem_name, rte_errno); +- return ERR_MEM; +- } +- +- LSTACK_LOG(INFO, LSTACK, "lstack id %d, reserve %s(%p) success, addr is %p, size is %u\n", +- rte_socket_id(), mem_name, *zone, (*zone)->addr, size); +- +- return ERR_OK; +-} +- +-err_t same_node_ring_create(struct rte_ring **ring, int size, int port, char *name, char *rx) +-{ +- if (!get_global_cfg_params()->use_sockmap) { +- *ring = NULL; +- return -1; +- } +- +- unsigned flags; +- char ring_name[RING_NAME_LEN] = {0}; +- if (strcmp(name, "listen") == 0) { +- flags = RING_F_SC_DEQ; +- } else { +- flags = RING_F_SP_ENQ | RING_F_SC_DEQ; +- } +- +- snprintf_s(ring_name, sizeof(ring_name), sizeof(ring_name) - 1, "%s_%s_ring_%d", name, rx, port); +- *ring = rte_ring_create(ring_name, size, rte_socket_id(), flags); +- if (*ring == NULL) { +- LSTACK_LOG(ERR, LSTACK, "cannot create rte_ring %s, errno is %d\n", 
ring_name, rte_errno); +- return ERR_MEM; +- } +- LSTACK_LOG(INFO, LSTACK, "lstack socket id:%d, create %s(%p) success\n", rte_socket_id(), ring_name, *ring); +- return ERR_OK; +-} +- +-static void init_same_node_ring(struct tcp_pcb *pcb) +-{ +- struct netconn *netconn = (struct netconn *)pcb->callback_arg; +- struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); +- +- pcb->client_rx_ring = NULL; +- pcb->client_tx_ring = NULL; +- pcb->free_ring = 0; +- sock->same_node_rx_ring = NULL; +- sock->same_node_rx_ring_mz = NULL; +- sock->same_node_tx_ring = NULL; +- sock->same_node_tx_ring_mz = NULL; +-} +- +-#define CLIENT_RING_SIZE 512 +-err_t create_same_node_ring(struct tcp_pcb *pcb) +-{ +- struct netconn *netconn = (struct netconn *)pcb->callback_arg; +- struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); +- +- if (same_node_ring_create(&pcb->client_rx_ring, CLIENT_RING_SIZE, pcb->local_port, "client", "rx") != 0) { +- goto END; +- } +- if (same_node_ring_create(&pcb->client_tx_ring, CLIENT_RING_SIZE, pcb->local_port, "client", "tx") != 0) { +- goto END; +- } +- pcb->free_ring = 1; +- +- if (same_node_memzone_create(&sock->same_node_rx_ring_mz, sizeof(struct same_node_ring), +- pcb->local_port, "rte_mz", "rx") != 0) { +- goto END; +- } +- sock->same_node_rx_ring = (struct same_node_ring*)sock->same_node_rx_ring_mz->addr; +- +- if (same_node_memzone_create(&sock->same_node_rx_ring->mz, SAME_NODE_RING_LEN, +- pcb->local_port, "rte_mz_buf", "rx") != 0) { +- goto END; +- } +- +- sock->same_node_rx_ring->sndbegin = 0; +- sock->same_node_rx_ring->sndend = 0; +- +- if (same_node_memzone_create(&sock->same_node_tx_ring_mz, sizeof(struct same_node_ring), +- pcb->local_port, "rte_mz", "tx") != 0) { +- goto END; +- } +- sock->same_node_tx_ring = (struct same_node_ring*)sock->same_node_tx_ring_mz->addr; +- +- if (same_node_memzone_create(&sock->same_node_tx_ring->mz, SAME_NODE_RING_LEN, +- pcb->local_port, "rte_mz_buf", "tx") != 0) { +- goto END; +- } +- +- sock->same_node_tx_ring->sndbegin = 0; +- sock->same_node_tx_ring->sndend = 0; +- +- return 0; +-END: +- rte_ring_free(pcb->client_rx_ring); +- rte_ring_free(pcb->client_tx_ring); +- rte_memzone_free(sock->same_node_rx_ring->mz); +- rte_memzone_free(sock->same_node_rx_ring_mz); +- rte_memzone_free(sock->same_node_tx_ring->mz); +- rte_memzone_free(sock->same_node_tx_ring_mz); +- init_same_node_ring(pcb); +- return ERR_BUF; +-} +- +-err_t find_same_node_ring(struct tcp_pcb *npcb) +-{ +- char name[RING_NAME_LEN] = {0}; +- snprintf_s(name, sizeof(name), sizeof(name) - 1, "client_tx_ring_%u", npcb->remote_port); +- npcb->client_rx_ring = rte_ring_lookup(name); +- memset_s(name, sizeof(name), 0, sizeof(name)); +- snprintf_s(name, sizeof(name), sizeof(name) - 1, "client_rx_ring_%u", npcb->remote_port); +- npcb->client_tx_ring = rte_ring_lookup(name); +- npcb->free_ring = 0; +- if (npcb->client_tx_ring == NULL || +- npcb->client_rx_ring == NULL) { +- LSTACK_LOG(INFO, LSTACK, "lookup client rxtx ring failed, port is %d\n", npcb->remote_port); +- tcp_abandon(npcb, 0); +- return ERR_CONN; +- } else { +- LSTACK_LOG(INFO, LSTACK, "find client_tx_ring_%u and client_rx_ring_%u\n", +- npcb->remote_port, npcb->remote_port); +- } +- return 0; +-} +- +-unsigned same_node_ring_count(struct lwip_sock *sock) +-{ +- const unsigned long long cur_begin = __atomic_load_n(&sock->same_node_rx_ring->sndbegin, __ATOMIC_RELAXED); +- const unsigned long long cur_end = __atomic_load_n(&sock->same_node_rx_ring->sndend, __ATOMIC_RELAXED); +- +- 
return cur_end - cur_begin; +-} +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index ed36890..b4fb2fd 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -146,6 +146,7 @@ struct protocol_stack *get_bind_protocol_stack(void) + return stack_group->stacks[index]; + } + ++#if GAZELLE_TCP_REUSE_IPPORT + int get_min_conn_stack(struct protocol_stack_group *stack_group) + { + struct protocol_stack* stack; +@@ -161,6 +162,7 @@ int get_min_conn_stack(struct protocol_stack_group *stack_group) + } + return min_conn_stk_idx; + } ++#endif /* GAZELLE_TCP_REUSE_IPPORT */ + + void bind_to_stack_numa(struct protocol_stack *stack) + { +@@ -566,6 +568,7 @@ int stack_polling(unsigned wakeup_tick) + } + } + ++#if GAZELLE_SAME_NODE + /* run to completion mode currently does not support sockmap */ + if (use_sockmap) { + netif_poll(&stack->netif); +@@ -574,6 +577,7 @@ int stack_polling(unsigned wakeup_tick) + read_same_node_recv_list(stack); + } + } ++#endif /* GAZELLE_SAME_NODE */ + + if (cfg->udp_enable) { + udp_netif_poll(&stack->netif); +diff --git a/src/lstack/core/same_node.c b/src/lstack/core/same_node.c +new file mode 100644 +index 0000000..0fe0fa8 +--- /dev/null ++++ b/src/lstack/core/same_node.c +@@ -0,0 +1,376 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. 
++*/ ++ ++#include ++#include ++ ++#include ++ ++#include "lstack_log.h" ++#include "lstack_cfg.h" ++#include "lstack_protocol_stack.h" ++#include "lstack_stack_stat.h" ++#include "same_node.h" ++#include "lstack_epoll.h" ++#include "lstack_lwip.h" ++ ++#if GAZELLE_SAME_NODE ++void read_same_node_recv_list(struct protocol_stack *stack) ++{ ++ struct list_node *list = &(stack->same_node_recv_list); ++ struct list_node *node, *temp; ++ struct lwip_sock *sock; ++ ++ list_for_each_node(node, temp, list) { ++ sock = list_entry(node, struct lwip_sock, recv_list); ++ ++ if (sock->same_node_rx_ring != NULL && same_node_ring_count(sock)) { ++ add_sock_event(sock, EPOLLIN); ++ } ++ } ++} ++ ++/* process on same node use ring to recv data */ ++ssize_t gazelle_same_node_ring_recv(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags) ++{ ++ unsigned long long cur_begin = sock->same_node_rx_ring->sndbegin; ++ unsigned long long cur_end; ++ unsigned long long index = cur_begin + 1; ++ size_t act_len = 0; ++ ++ cur_end = __atomic_load_n(&sock->same_node_rx_ring->sndend, __ATOMIC_ACQUIRE); ++ if (cur_begin == cur_end) { ++ errno = EAGAIN; ++ act_len = -1; ++ goto END; ++ } ++ act_len = cur_end - index + 1; ++ act_len = RTE_MIN(act_len, len); ++ if ((index & SAME_NODE_RING_MASK) + act_len > SAME_NODE_RING_LEN) { ++ size_t act_len1 = SAME_NODE_RING_LEN - (index & SAME_NODE_RING_MASK); ++ size_t act_len2 = act_len - act_len1; ++ rte_memcpy((char *)buf, (char *)sock->same_node_rx_ring->mz->addr + (index & SAME_NODE_RING_MASK), act_len1); ++ rte_memcpy((char *)buf + act_len1, (char *)sock->same_node_rx_ring->mz->addr, act_len2); ++ } else { ++ rte_memcpy((char *)buf, (char *)sock->same_node_rx_ring->mz->addr + (index & SAME_NODE_RING_MASK), act_len); ++ } ++ ++ index += act_len; ++ __atomic_store_n(&sock->same_node_rx_ring->sndbegin, index - 1, __ATOMIC_RELEASE); ++ ++END: ++ return act_len; ++} ++ ++/* processes on same node use ring to send data */ ++ssize_t gazelle_same_node_ring_send(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags) ++{ ++ unsigned long long cur_begin = __atomic_load_n(&sock->same_node_tx_ring->sndbegin, __ATOMIC_ACQUIRE); ++ unsigned long long cur_end = sock->same_node_tx_ring->sndend; ++ if (cur_end >= cur_begin + SAME_NODE_RING_LEN) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ unsigned long long index = cur_end + 1; ++ size_t act_len = SAME_NODE_RING_LEN - (cur_end - cur_begin); ++ act_len = RTE_MIN(act_len, len); ++ ++ if ((index & SAME_NODE_RING_MASK) + act_len > SAME_NODE_RING_LEN) { ++ size_t act_len1 = SAME_NODE_RING_LEN - (index & SAME_NODE_RING_MASK); ++ size_t act_len2 = act_len - act_len1; ++ rte_memcpy((char *)sock->same_node_tx_ring->mz->addr + (index & SAME_NODE_RING_MASK), buf, act_len1); ++ rte_memcpy((char *)sock->same_node_tx_ring->mz->addr, (char *)buf + act_len1, act_len2); ++ } else { ++ rte_memcpy((char *)sock->same_node_tx_ring->mz->addr + (index & SAME_NODE_RING_MASK), buf, act_len); ++ } ++ ++ index += act_len; ++ __atomic_store_n(&sock->same_node_tx_ring->sndend, index - 1, __ATOMIC_RELEASE); ++ if (act_len == 0) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ return act_len; ++} ++ ++void netif_poll(struct netif *netif) ++{ ++ struct tcp_pcb *pcb = NULL; ++ struct tcp_pcb_listen *pcbl = NULL; ++ ++ for (pcb = tcp_active_pcbs; pcb != NULL; pcb = pcb->next) { ++#define NETIF_POLL_READ_COUNT 32 ++ struct pbuf *pbufs[NETIF_POLL_READ_COUNT]; ++ int ret; ++ ++ if (pcb->client_rx_ring != NULL) { ++ ret = 
rte_ring_sc_dequeue_burst(pcb->client_rx_ring, (void **)pbufs, NETIF_POLL_READ_COUNT, NULL); ++ for (int i = 0; i < ret; i++) { ++ if (ip_input(pbufs[i], netif) != 0) { ++ LSTACK_LOG(INFO, LSTACK, "ip_input return err\n"); ++ pbuf_free(pbufs[i]); ++ } ++ } ++ } ++ } ++ for (pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL; pcbl = pcbl->next) { ++ if (pcbl->listen_rx_ring != NULL) { ++ struct pbuf *pbuf; ++ if (rte_ring_sc_dequeue(pcbl->listen_rx_ring, (void **)&pbuf) == 0) { ++ if (ip_input(pbuf, netif) != ERR_OK) { ++ pbuf_free(pbuf); ++ } ++ } ++ } ++ } ++} ++ ++/* processes on same node handshake packet use this function */ ++err_t netif_loop_output(struct netif *netif, struct pbuf *p) ++{ ++ if (!p) { ++ return ERR_ARG; ++ } ++ const struct ip_hdr *iphdr; ++ iphdr = (const struct ip_hdr *)p->payload; ++ if (IPH_PROTO(iphdr) == IP_PROTO_UDP) { ++ return udp_netif_loop_output(netif, p); ++ } ++ ++ struct tcp_pcb *pcb = p->pcb; ++ struct pbuf *head = NULL; ++ ++ if (pcb == NULL || pcb->client_tx_ring == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "pcb is null\n"); ++ return ERR_ARG; ++ } ++ ++ if (p->next != NULL) { ++ LSTACK_LOG(ERR, LSTACK, "netif_loop_output: not support chained pbuf\n"); ++ return ERR_ARG; ++ } ++ ++ struct tcp_hdr *tcp_hdr = (struct tcp_hdr *)((char *)p->payload + sizeof(struct ip_hdr)); ++ uint8_t flags = TCPH_FLAGS(tcp_hdr); ++ ++ head = pbuf_alloc(0, p->len, PBUF_RAM); ++ if (head == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "netif_loop_output: pbuf_alloc failed\n"); ++ return ERR_MEM; ++ } ++ memcpy_s(head->payload, head->len, p->payload, p->len); ++ ++ if ((flags & TCP_SYN) && !(flags & TCP_ACK)) { ++ /* SYN packet, send to listen_ring */ ++ char ring_name[RING_NAME_LEN] = {0}; ++ snprintf_s(ring_name, sizeof(ring_name), sizeof(ring_name) - 1, "listen_rx_ring_%d", pcb->remote_port); ++ struct rte_ring *ring = rte_ring_lookup(ring_name); ++ if (ring == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "netif_loop_output: cant find listen_rx_ring %d\n", pcb->remote_port); ++ pbuf_free(head); ++ } else { ++ if (rte_ring_mp_enqueue(ring, head) != 0) { ++ LSTACK_LOG(INFO, LSTACK, "enqueue sync packet failed\n"); ++ pbuf_free(head); ++ } ++ } ++ } else { ++ /* send other type packet to tx_ring */ ++ if (rte_ring_sp_enqueue(pcb->client_tx_ring, head) != 0) { ++ LSTACK_LOG(INFO, LSTACK, "client tx ring full\n"); ++ pbuf_free(head); ++ } ++ } ++ ++ return ERR_OK; ++} ++ ++err_t find_same_node_memzone(struct tcp_pcb *pcb, struct lwip_sock *nsock) ++{ ++ char name[RING_NAME_LEN]; ++ snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_rx_%u", pcb->remote_port); ++ if ((nsock->same_node_tx_ring_mz = rte_memzone_lookup(name)) == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); ++ return -1; ++ } else { ++ LSTACK_LOG(INFO, LSTACK, "lookup %s success\n", name); ++ } ++ nsock->same_node_tx_ring = (struct same_node_ring *)nsock->same_node_tx_ring_mz->addr; ++ ++ snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_buf_rx_%u", pcb->remote_port); ++ if ((nsock->same_node_tx_ring->mz = rte_memzone_lookup(name)) == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); ++ return -1; ++ } ++ ++ snprintf_s(name, sizeof(name), sizeof(name) - 1, "rte_mz_tx_%u", pcb->remote_port); ++ if ((nsock->same_node_rx_ring_mz = rte_memzone_lookup(name)) == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); ++ return -1; ++ } else { ++ LSTACK_LOG(INFO, LSTACK, "lookup %s success\n", name); ++ } ++ nsock->same_node_rx_ring = (struct same_node_ring 
*)nsock->same_node_rx_ring_mz->addr; ++ ++ snprintf_s(name, sizeof(name), sizeof(name) - 1,"rte_mz_buf_tx_%u", pcb->remote_port); ++ if ((nsock->same_node_rx_ring->mz = rte_memzone_lookup(name)) == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "lwip_accept: can't find %s\n",name); ++ return -1; ++ } ++ ++ /* rcvlink init in alloc_socket() */ ++ /* remove from g_rcv_process_list in free_socket */ ++ list_add_node(&nsock->recv_list, &nsock->stack->same_node_recv_list); ++ return 0; ++} ++ ++err_t same_node_memzone_create(const struct rte_memzone **zone, int size, int port, char *name, char *rx) ++{ ++ char mem_name[RING_NAME_LEN] = {0}; ++ snprintf_s(mem_name, sizeof(mem_name), sizeof(mem_name) - 1, "%s_%s_%d", name, rx, port); ++ ++ *zone = rte_memzone_reserve_aligned(mem_name, size, rte_socket_id(), 0, RTE_CACHE_LINE_SIZE); ++ if (*zone == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "cannot reserve memzone:%s, errno is %d\n", mem_name, rte_errno); ++ return ERR_MEM; ++ } ++ ++ LSTACK_LOG(INFO, LSTACK, "lstack id %d, reserve %s(%p) success, addr is %p, size is %u\n", ++ rte_socket_id(), mem_name, *zone, (*zone)->addr, size); ++ ++ return ERR_OK; ++} ++ ++err_t same_node_ring_create(struct rte_ring **ring, int size, int port, char *name, char *rx) ++{ ++ if (!get_global_cfg_params()->use_sockmap) { ++ *ring = NULL; ++ return -1; ++ } ++ ++ unsigned flags; ++ char ring_name[RING_NAME_LEN] = {0}; ++ if (strcmp(name, "listen") == 0) { ++ flags = RING_F_SC_DEQ; ++ } else { ++ flags = RING_F_SP_ENQ | RING_F_SC_DEQ; ++ } ++ ++ snprintf_s(ring_name, sizeof(ring_name), sizeof(ring_name) - 1, "%s_%s_ring_%d", name, rx, port); ++ *ring = rte_ring_create(ring_name, size, rte_socket_id(), flags); ++ if (*ring == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "cannot create rte_ring %s, errno is %d\n", ring_name, rte_errno); ++ return ERR_MEM; ++ } ++ LSTACK_LOG(INFO, LSTACK, "lstack socket id:%d, create %s(%p) success\n", rte_socket_id(), ring_name, *ring); ++ return ERR_OK; ++} ++ ++static void init_same_node_ring(struct tcp_pcb *pcb) ++{ ++ struct netconn *netconn = (struct netconn *)pcb->callback_arg; ++ struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); ++ ++ pcb->client_rx_ring = NULL; ++ pcb->client_tx_ring = NULL; ++ pcb->free_ring = 0; ++ sock->same_node_rx_ring = NULL; ++ sock->same_node_rx_ring_mz = NULL; ++ sock->same_node_tx_ring = NULL; ++ sock->same_node_tx_ring_mz = NULL; ++} ++ ++#define CLIENT_RING_SIZE 512 ++err_t create_same_node_ring(struct tcp_pcb *pcb) ++{ ++ struct netconn *netconn = (struct netconn *)pcb->callback_arg; ++ struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); ++ ++ if (same_node_ring_create(&pcb->client_rx_ring, CLIENT_RING_SIZE, pcb->local_port, "client", "rx") != 0) { ++ goto END; ++ } ++ if (same_node_ring_create(&pcb->client_tx_ring, CLIENT_RING_SIZE, pcb->local_port, "client", "tx") != 0) { ++ goto END; ++ } ++ pcb->free_ring = 1; ++ ++ if (same_node_memzone_create(&sock->same_node_rx_ring_mz, sizeof(struct same_node_ring), ++ pcb->local_port, "rte_mz", "rx") != 0) { ++ goto END; ++ } ++ sock->same_node_rx_ring = (struct same_node_ring*)sock->same_node_rx_ring_mz->addr; ++ ++ if (same_node_memzone_create(&sock->same_node_rx_ring->mz, SAME_NODE_RING_LEN, ++ pcb->local_port, "rte_mz_buf", "rx") != 0) { ++ goto END; ++ } ++ ++ sock->same_node_rx_ring->sndbegin = 0; ++ sock->same_node_rx_ring->sndend = 0; ++ ++ if (same_node_memzone_create(&sock->same_node_tx_ring_mz, sizeof(struct same_node_ring), ++ pcb->local_port, "rte_mz", "tx") != 0) { ++ goto END; 
++ } ++ sock->same_node_tx_ring = (struct same_node_ring*)sock->same_node_tx_ring_mz->addr; ++ ++ if (same_node_memzone_create(&sock->same_node_tx_ring->mz, SAME_NODE_RING_LEN, ++ pcb->local_port, "rte_mz_buf", "tx") != 0) { ++ goto END; ++ } ++ ++ sock->same_node_tx_ring->sndbegin = 0; ++ sock->same_node_tx_ring->sndend = 0; ++ ++ return 0; ++END: ++ rte_ring_free(pcb->client_rx_ring); ++ rte_ring_free(pcb->client_tx_ring); ++ rte_memzone_free(sock->same_node_rx_ring->mz); ++ rte_memzone_free(sock->same_node_rx_ring_mz); ++ rte_memzone_free(sock->same_node_tx_ring->mz); ++ rte_memzone_free(sock->same_node_tx_ring_mz); ++ init_same_node_ring(pcb); ++ return ERR_BUF; ++} ++ ++err_t find_same_node_ring(struct tcp_pcb *npcb) ++{ ++ char name[RING_NAME_LEN] = {0}; ++ snprintf_s(name, sizeof(name), sizeof(name) - 1, "client_tx_ring_%u", npcb->remote_port); ++ npcb->client_rx_ring = rte_ring_lookup(name); ++ memset_s(name, sizeof(name), 0, sizeof(name)); ++ snprintf_s(name, sizeof(name), sizeof(name) - 1, "client_rx_ring_%u", npcb->remote_port); ++ npcb->client_tx_ring = rte_ring_lookup(name); ++ npcb->free_ring = 0; ++ if (npcb->client_tx_ring == NULL || ++ npcb->client_rx_ring == NULL) { ++ LSTACK_LOG(INFO, LSTACK, "lookup client rxtx ring failed, port is %d\n", npcb->remote_port); ++ tcp_abandon(npcb, 0); ++ return ERR_CONN; ++ } else { ++ LSTACK_LOG(INFO, LSTACK, "find client_tx_ring_%u and client_rx_ring_%u\n", ++ npcb->remote_port, npcb->remote_port); ++ } ++ return 0; ++} ++ ++unsigned same_node_ring_count(const struct lwip_sock *sock) ++{ ++ const unsigned long long cur_begin = __atomic_load_n(&sock->same_node_rx_ring->sndbegin, __ATOMIC_RELAXED); ++ const unsigned long long cur_end = __atomic_load_n(&sock->same_node_rx_ring->sndend, __ATOMIC_RELAXED); ++ ++ return cur_end - cur_begin; ++} ++#endif /* GAZELLE_SAME_NODE */ +diff --git a/src/lstack/include/lstack_ethdev.h b/src/lstack/include/lstack_ethdev.h +index 0c3d906..5aeb80d 100644 +--- a/src/lstack/include/lstack_ethdev.h ++++ b/src/lstack/include/lstack_ethdev.h +@@ -32,6 +32,4 @@ void kni_handle_rx(uint16_t port_id); + void kni_handle_tx(struct rte_mbuf *mbuf); + #endif + +-void netif_poll(struct netif *netif); +- + #endif /* __GAZELLE_ETHDEV_H__ */ +diff --git a/src/lstack/include/lstack_lwip.h b/src/lstack/include/lstack_lwip.h +index dcb7dac..f2524e4 100644 +--- a/src/lstack/include/lstack_lwip.h ++++ b/src/lstack/include/lstack_lwip.h +@@ -16,15 +16,15 @@ + + #include "common/gazelle_dfx_msg.h" + #include "common/dpdk_common.h" ++#include "same_node.h" + + struct lwip_sock; + struct rpc_msg; + struct protocol_stack; + +-unsigned same_node_ring_count(struct lwip_sock *sock); + + #define NETCONN_IS_ACCEPTIN(sock) (((sock)->conn->acceptmbox != NULL) && !sys_mbox_empty((sock)->conn->acceptmbox)) +-#define NETCONN_IS_DATAIN(sock) ((gazelle_ring_readable_count((sock)->recv_ring) || (sock)->recv_lastdata) || (sock->same_node_rx_ring != NULL && same_node_ring_count(sock))) ++#define NETCONN_IS_DATAIN(sock) ((gazelle_ring_readable_count((sock)->recv_ring) || (sock)->recv_lastdata) || NETCONN_NEED_SAME_NODE(sock)) + #define NETCONN_IS_DATAOUT(sock) (gazelle_ring_readover_count((sock)->send_ring) || (sock)->send_pre_del) + #define NETCONN_IS_OUTIDLE(sock) gazelle_ring_readable_count((sock)->send_ring) + #define NETCONN_IS_UDP(sock) (NETCONNTYPE_GROUP(netconn_type((sock)->conn)) == NETCONN_UDP) +@@ -63,6 +63,4 @@ void do_lwip_clone_sockopt(struct lwip_sock *dst_sock, struct lwip_sock *src_soc + uint32_t do_lwip_get_conntable(struct 
gazelle_stat_lstack_conn_info *conn, uint32_t max_num); + uint32_t do_lwip_get_connnum(void); + +-void read_same_node_recv_list(struct protocol_stack *stack); +- + #endif +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index c9c50c9..a278d7a 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -113,7 +113,9 @@ struct protocol_stack *get_protocol_stack_by_fd(int fd); + struct protocol_stack *get_bind_protocol_stack(void); + struct protocol_stack_group *get_protocol_stack_group(void); + ++#if GAZELLE_TCP_REUSE_IPPORT + int get_min_conn_stack(struct protocol_stack_group *stack_group); ++#endif /* GAZELLE_TCP_REUSE_IPPORT */ + void bind_to_stack_numa(struct protocol_stack *stack); + void thread_bind_stack(struct protocol_stack *stack); + +diff --git a/src/lstack/include/same_node.h b/src/lstack/include/same_node.h +new file mode 100644 +index 0000000..90a5b76 +--- /dev/null ++++ b/src/lstack/include/same_node.h +@@ -0,0 +1,35 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. ++*/ ++ ++#ifndef __GAZELLE_SAME_NODE_H__ ++#define __GAZELLE_SAME_NODE_H__ ++ ++#include ++ ++#if GAZELLE_SAME_NODE ++ ++unsigned same_node_ring_count(const struct lwip_sock *sock); ++ ++void read_same_node_recv_list(struct protocol_stack *stack); ++ssize_t gazelle_same_node_ring_recv(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags); ++ssize_t gazelle_same_node_ring_send(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags); ++ ++#define NETCONN_NEED_SAME_NODE(sock) \ ++ ( (sock->same_node_rx_ring && same_node_ring_count(sock)) ) ++ ++#else /* GAZELLE_SAME_NODE */ ++ ++#define NETCONN_NEED_SAME_NODE(sock) false ++ ++#endif /* GAZELLE_SAME_NODE */ ++ ++#endif /* __GAZELLE_SAME_NODE_H__ */ +-- +2.33.0 + diff --git a/0324-cleancode-remove-gazelle_light_ring.patch b/0324-cleancode-remove-gazelle_light_ring.patch new file mode 100644 index 0000000..abaa10d --- /dev/null +++ b/0324-cleancode-remove-gazelle_light_ring.patch @@ -0,0 +1,104 @@ +From 791872dcb02eacc2bc0e43deb97ecb9cf3cd9711 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 11 Mar 2025 15:06:10 +0800 +Subject: [PATCH] cleancode: remove gazelle_light_ring + +Signed-off-by: Lemmy Huang +--- + src/common/dpdk_common.h | 60 ++++++---------------------------------- + 1 file changed, 8 insertions(+), 52 deletions(-) + +diff --git a/src/common/dpdk_common.h b/src/common/dpdk_common.h +index 7a05342..8609216 100644 +--- a/src/common/dpdk_common.h ++++ b/src/common/dpdk_common.h +@@ -123,49 +123,6 @@ struct rte_eth_conf; + struct rte_eth_dev_info; + void eth_params_checksum(struct rte_eth_conf *conf, struct rte_eth_dev_info *dev_info); + +-/* +- gazelle custom rte ring interface +- lightweight ring reduce atomic and smp_mb. +- only surpport single-consumers or the single-consumer. 
+- */ +-static __rte_always_inline uint32_t gazelle_light_ring_enqueue_busrt(struct rte_ring *r, void **obj_table, uint32_t n) +-{ +- uint32_t cons = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); +- uint32_t prod = r->prod.tail; +- uint32_t free_entries = r->capacity + cons - prod; +- +- if (n > free_entries) { +- return 0; +- } +- +- __rte_ring_enqueue_elems(r, prod, obj_table, sizeof(void *), n); +- +- __atomic_store_n(&r->prod.tail, prod + n, __ATOMIC_RELEASE); +- +- return n; +-} +- +-static __rte_always_inline uint32_t gazelle_light_ring_dequeue_burst(struct rte_ring *r, void **obj_table, uint32_t n) +-{ +- uint32_t prod = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); +- uint32_t cons = r->cons.tail; +- uint32_t entries = prod - cons; +- +- if (n > entries) { +- n = entries; +- } +- +- if (n == 0) { +- return 0; +- } +- +- __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); +- +- __atomic_store_n(&r->cons.tail, cons + n, __ATOMIC_RELEASE); +- +- return n; +-} +- + /* + gazelle custom rte ring interface + one thread enqueue and dequeue, other thread read object use and object still in queue. +@@ -177,15 +134,16 @@ static __rte_always_inline uint32_t gazelle_light_ring_dequeue_burst(struct rte_ + gazelle_ring_read: prod.head-->> cons.head, read object, prod.head = prod.tail + N + gazelle_ring_read_over: prod.tail = prod.head, update prod.tail + */ +-static __rte_always_inline uint32_t gazelle_ring_sp_enqueue(struct rte_ring *r, void **obj_table, uint32_t n) ++static __rte_always_inline uint32_t gazelle_ring_sp_enqueue(struct rte_ring *r, void *const *obj_table, uint32_t n) + { + uint32_t head = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE); + uint32_t tail = r->cons.tail; + +- uint32_t entries = r->capacity + tail - head; +- if (n > entries) { ++ uint32_t free_entries = r->capacity + tail - head; ++ if (unlikely(free_entries == 0)) + return 0; +- } ++ if (n > free_entries) ++ n = free_entries; + + __rte_ring_enqueue_elems(r, head, obj_table, sizeof(void *), n); + +@@ -200,12 +158,10 @@ static __rte_always_inline uint32_t gazelle_ring_sc_dequeue(struct rte_ring *r, + uint32_t cons = r->cons.tail; + + uint32_t entries = prod - cons; +- if (n > entries) { +- n = entries; +- } +- if (unlikely(n == 0)) { ++ if (unlikely(entries == 0)) + return 0; +- } ++ if (n > entries) ++ n = entries; + + __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); + +-- +2.33.0 + diff --git a/0325-cleancode-remove-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch b/0325-cleancode-remove-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch new file mode 100644 index 0000000..29914c6 --- /dev/null +++ b/0325-cleancode-remove-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch @@ -0,0 +1,292 @@ +From 33b786bd43e769351f89e9500f78819c3550949b Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 11 Mar 2025 15:09:56 +0800 +Subject: [PATCH] cleancode: remove get_stack_tid DPDK_PKT_BURST_SIZE + PACKET_READ_SIZE + +Signed-off-by: Lemmy Huang +--- + src/common/gazelle_base_func.h | 3 --- + src/common/gazelle_opt.h | 10 ++++------ + src/lstack/api/lstack_rtw_api.c | 4 ++-- + src/lstack/core/lstack_protocol_stack.c | 11 ----------- + src/lstack/core/lstack_thread_rpc.c | 16 ++++++++-------- + src/lstack/core/lstack_virtio.c | 3 ++- + src/lstack/include/lstack_protocol_stack.h | 2 -- + src/lstack/netif/lstack_ethdev.c | 5 ++--- + src/lstack/netif/lstack_vdev.c | 8 ++++---- + 9 files changed, 22 insertions(+), 40 deletions(-) + +diff --git a/src/common/gazelle_base_func.h b/src/common/gazelle_base_func.h +index 
0ccb34b..29534a7 100644 +--- a/src/common/gazelle_base_func.h ++++ b/src/common/gazelle_base_func.h +@@ -29,9 +29,6 @@ + return -1; \ + } while (0) + +-#define NODE_ENTRY(node, type, member) \ +- ((type*)((char*)(node) - (size_t)&((type*)0)->member)) +- + #define MB_IN_BYTES (1024 * 1024) + static inline int bytes_to_mb(uint32_t bytes) + { +diff --git a/src/common/gazelle_opt.h b/src/common/gazelle_opt.h +index d6b1c44..4406831 100644 +--- a/src/common/gazelle_opt.h ++++ b/src/common/gazelle_opt.h +@@ -23,6 +23,9 @@ + #define GAZELLE_TRUE 1 + #define GAZELLE_FALSE 0 + ++#define CPUS_MAX_NUM 640 ++#define GAZELLE_MAX_NUMA_NODES 8 ++ + #define PROTOCOL_STACK_MAX 32 + #define KERNEL_EPOLL_MAX 512 + +@@ -41,7 +44,7 @@ + #define VDEV_IDLE_QUEUE_SZ DEFAULT_RING_SIZE + + #define VDEV_TX_QUEUE_SZ DEFAULT_RING_SIZE +-#define FREE_RX_QUEUE_SZ DPDK_PKT_BURST_SIZE ++#define FREE_RX_QUEUE_SZ DEFAULT_RING_SIZE + + #define NIC_QUEUE_SIZE_MAX 8192 + #define NIC_QUEUE_SIZE_MIN 512 +@@ -58,8 +61,6 @@ + #define IPV6_EXTRA_HEAD_LEN 20 + #define MBUF_MAX_DATA_LEN (MTU_DEFAULT_DATA_LEN - VLAN_HEAD_LEN - IPV6_EXTRA_HEAD_LEN) + +-#define DPDK_PKT_BURST_SIZE 512 +- + #define GAZELLE_UDP_PKGLEN_MAX (65535 - IP_HLEN - UDP_HLEN) + + /* total:33 client, index 32 is invaild client */ +@@ -109,7 +110,4 @@ + + #define SLEEP_US_BEFORE_LINK_UP 10000 + +-#define CPUS_MAX_NUM 640 +-#define GAZELLE_MAX_NUMA_NODES 8 +- + #endif /* _GAZELLE_OPT_H_ */ +diff --git a/src/lstack/api/lstack_rtw_api.c b/src/lstack/api/lstack_rtw_api.c +index 6d0bd05..35439bc 100644 +--- a/src/lstack/api/lstack_rtw_api.c ++++ b/src/lstack/api/lstack_rtw_api.c +@@ -92,7 +92,7 @@ static int stack_broadcast_bind(int fd, const struct sockaddr *name, socklen_t n + + struct lwip_sock *sock = lwip_get_socket(fd); + if (sock == NULL || cur_stack == NULL) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, %d get sock null or stack null\n", get_stack_tid(), fd); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); + GAZELLE_RETURN(EBADF); + } + +@@ -229,7 +229,7 @@ static int stack_broadcast_listen(int fd, int backlog) + + struct lwip_sock *sock = lwip_get_socket(fd); + if (sock == NULL || cur_stack == NULL) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, %d get sock null or stack null\n", get_stack_tid(), fd); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); + GAZELLE_RETURN(EBADF); + } + +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index b4fb2fd..3bb1eeb 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -76,17 +76,6 @@ static inline void set_stack_idx(uint16_t idx) + g_stack_p = g_stack_group.stacks[idx]; + } + +-long get_stack_tid(void) +-{ +- static PER_THREAD int32_t g_stack_tid = 0; +- +- if (g_stack_tid == 0) { +- g_stack_tid = rte_gettid(); +- } +- +- return g_stack_tid; +-} +- + struct protocol_stack_group *get_protocol_stack_group(void) + { + return &g_stack_group; +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 050594e..26ae501 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -235,7 +235,7 @@ static void callback_socket(struct rpc_msg *msg) + { + msg->result = lwip_socket(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i); + if (msg->result < 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, %ld socket failed\n", get_stack_tid(), msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %ld 
socket failed\n", rte_gettid(), msg->result); + } + } + +@@ -253,7 +253,7 @@ static void callback_close(struct rpc_msg *msg) + + msg->result = lwip_close(fd); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d failed %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); + } + } + +@@ -272,7 +272,7 @@ static void callback_shutdown(struct rpc_msg *msg) + + msg->result = lwip_shutdown(fd, how); + if (msg->result != 0 && errno != ENOTCONN) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d fail %ld\n", get_stack_tid(), fd, msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), fd, msg->result); + } + + posix_api->shutdown_fn(fd, how); +@@ -282,7 +282,7 @@ static void callback_bind(struct rpc_msg *msg) + { + msg->result = lwip_bind(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].cp, msg->args[MSG_ARG_2].u); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d failed %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); + } + } + +@@ -300,7 +300,7 @@ static void callback_listen(struct rpc_msg *msg) + /* new listen add to stack listen list */ + msg->result = lwip_listen(fd, backlog); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d failed %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); + } + } + +@@ -509,7 +509,7 @@ static void callback_getsockname(struct rpc_msg *msg) + { + msg->result = lwip_getsockname(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d fail %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); + } + } + +@@ -518,7 +518,7 @@ static void callback_getsockopt(struct rpc_msg *msg) + msg->result = lwip_getsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, + msg->args[MSG_ARG_3].p, msg->args[MSG_ARG_4].p); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d, level %d, optname %d, fail %ld\n", get_stack_tid(), ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), + msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); + } + } +@@ -528,7 +528,7 @@ static void callback_setsockopt(struct rpc_msg *msg) + msg->result = lwip_setsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, + msg->args[MSG_ARG_3].cp, msg->args[MSG_ARG_4].u); + if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d, level %d, optname %d, fail %ld\n", get_stack_tid(), ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), + msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); + } + } +diff --git a/src/lstack/core/lstack_virtio.c b/src/lstack/core/lstack_virtio.c +index 75a23f2..f6855d1 100644 +--- a/src/lstack/core/lstack_virtio.c ++++ b/src/lstack/core/lstack_virtio.c +@@ -17,12 +17,13 @@ + #include + #include + #include ++#include ++ + #include "lstack_cfg.h" + #include "lstack_log.h" + #include "lstack_port_map.h" + #include "lstack_interrupt.h" + #include 
"lstack_virtio.h" +-#include "securec.h" + + #define VIRTIO_USER_NAME "virtio_user" + #define VIRTIO_DPDK_PARA_LEN 256 +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index a278d7a..3f6e3d3e 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -106,8 +106,6 @@ struct protocol_stack_group { + pthread_spinlock_t socket_lock; + }; + +-long get_stack_tid(void); +- + struct protocol_stack *get_protocol_stack(void); + struct protocol_stack *get_protocol_stack_by_fd(int fd); + struct protocol_stack *get_bind_protocol_stack(void); +diff --git a/src/lstack/netif/lstack_ethdev.c b/src/lstack/netif/lstack_ethdev.c +index 3b859d2..a370714 100644 +--- a/src/lstack/netif/lstack_ethdev.c ++++ b/src/lstack/netif/lstack_ethdev.c +@@ -41,7 +41,6 @@ + + /* FRAME_MTU + 14byte header */ + #define MBUF_MAX_LEN 1514 +-#define PACKET_READ_SIZE 32 + + /* any protocol stack thread receives arp packet and sync it to other threads, + * so that it can have the arp table */ +@@ -150,11 +149,11 @@ void eth_dev_recv(struct rte_mbuf *mbuf, struct protocol_stack *stack) + #if RTE_VERSION < RTE_VERSION_NUM(23, 11, 0, 0) + void kni_handle_rx(uint16_t port_id) + { +- struct rte_mbuf *pkts_burst[PACKET_READ_SIZE]; ++ struct rte_mbuf *pkts_burst[GAZELLE_PACKET_READ_SIZE]; + struct rte_kni* kni = get_gazelle_kni(); + uint32_t nb_kni_rx = 0; + if (kni) { +- nb_kni_rx = rte_kni_rx_burst(kni, pkts_burst, PACKET_READ_SIZE); ++ nb_kni_rx = rte_kni_rx_burst(kni, pkts_burst, GAZELLE_PACKET_READ_SIZE); + } + if (nb_kni_rx > 0) { + uint16_t nb_rx = rte_eth_tx_burst(port_id, 0, pkts_burst, nb_kni_rx); +diff --git a/src/lstack/netif/lstack_vdev.c b/src/lstack/netif/lstack_vdev.c +index 2eaeb1f..14d8cc6 100644 +--- a/src/lstack/netif/lstack_vdev.c ++++ b/src/lstack/netif/lstack_vdev.c +@@ -57,13 +57,13 @@ static uint32_t ltran_rx_poll(struct protocol_stack *stack, struct rte_mbuf **pk + { + uint32_t rcvd_pkts; + uint32_t nr_pkts; +- struct rte_mbuf *free_buf[DPDK_PKT_BURST_SIZE]; ++ struct rte_mbuf *free_buf[VDEV_RX_QUEUE_SZ]; + + rcvd_pkts = gazelle_ring_sc_dequeue(stack->rx_ring, (void **)pkts, max_mbuf); + + stack->rx_ring_used += rcvd_pkts; + if (unlikely(stack->rx_ring_used >= USED_RX_PKTS_WATERMARK)) { +- uint32_t free_cnt = LWIP_MIN(stack->rx_ring_used, RING_SIZE(DPDK_PKT_BURST_SIZE)); ++ uint32_t free_cnt = LWIP_MIN(stack->rx_ring_used, RING_SIZE(VDEV_RX_QUEUE_SZ)); + int32_t ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, (struct rte_mbuf **)free_buf, free_cnt, true); + if (likely(ret == 0)) { + nr_pkts = gazelle_ring_sp_enqueue(stack->rx_ring, (void **)free_buf, free_cnt); +@@ -161,7 +161,7 @@ static uint32_t vdev_rx_poll(struct protocol_stack *stack, struct rte_mbuf **pkt + static uint32_t ltran_tx_xmit(struct protocol_stack *stack, struct rte_mbuf **pkts, uint32_t nr_pkts) + { + uint32_t sent_pkts = 0; +- struct rte_mbuf *free_buf[DPDK_PKT_BURST_SIZE]; ++ struct rte_mbuf *free_buf[VDEV_TX_QUEUE_SZ]; + const uint32_t tbegin = sys_now(); + + do { +@@ -292,7 +292,7 @@ int32_t vdev_reg_xmit(enum reg_ring_type type, struct gazelle_quintuple *qtuple) + + tmp_buf = &stack->reg_buf[reg_index]; + tmp_buf->type = type; +- tmp_buf->tid = get_stack_tid(); ++ tmp_buf->tid = rte_gettid(); + ret = memcpy_s(&tmp_buf->qtuple, sizeof(*qtuple), qtuple, sizeof(struct gazelle_quintuple)); + if (ret != EOK) { + LSTACK_LOG(ERR, LSTACK, "memcpy_s failed ret=%d.\n", ret); +-- +2.33.0 + diff --git a/0326-socket-refactor-sock_event.patch 
b/0326-socket-refactor-sock_event.patch new file mode 100644 index 0000000..007cbc9 --- /dev/null +++ b/0326-socket-refactor-sock_event.patch @@ -0,0 +1,1613 @@ +From 2168db467896885c571dafb98ed03a90101c98d2 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 21 Mar 2025 16:42:43 +0800 +Subject: [PATCH] socket: refactor sock_event + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 846 ++++++++++++++++++++++++++++++ + src/lstack/core/lstack_wait.c | 550 +++++++++++++++++++ + src/lstack/include/lstack_epoll.h | 18 + + src/lstack/include/lstack_wait.h | 138 +++++ + 4 files changed, 1552 insertions(+) + create mode 100644 src/lstack/core/lstack_wait.c + create mode 100644 src/lstack/include/lstack_wait.h + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 644efc0..a110aa1 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -10,6 +10,8 @@ + * See the Mulan PSL v2 for more details. + */ + ++#if !SOCK_EVENT_V2 ++ + #include + #include + #include +@@ -1038,3 +1040,847 @@ int lstack_select(int maxfd, fd_set *readfds, fd_set *writefds, fd_set *exceptfd + + return event_num; + } ++ ++#else /* SOCK_EVENT_V2 */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++ ++#include "lstack_epoll.h" ++#include "common/dpdk_common.h" ++#include "common/gazelle_base_func.h" ++#include "lstack_preload.h" ++#include "lstack_cfg.h" ++#include "lstack_log.h" ++#include "lstack_protocol_stack.h" ++ ++#define POLL_MAX_EVENTS 32 ++ ++static PER_THREAD struct sock_wait *g_sk_wait = NULL; ++ ++ ++static int rtc_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint32_t start) ++{ ++ stack_polling(0); ++ ++ if (timeout > 0 && timeout <= (int)(sys_now() - start)) { ++ timeout = 0; ++ } ++ return timeout; ++} ++ ++static int rtw_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint32_t start) ++{ ++ return sys_mutex_timedlock_internal(&sk_wait->mutex, timeout); ++} ++ ++static void rtc_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, ++ unsigned pending, int stack_id) ++{ ++ sk_event->pending |= pending; ++ if (list_node_null(&sk_event->event_node)) { ++ list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ } ++} ++ ++static void rtc_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++{ ++ sk_event->pending &= ~pending; ++ if (sk_event->pending == 0) { ++ list_del_node(&sk_event->event_node); ++ } ++} ++ ++static void rtw_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, ++ unsigned pending, int stack_id) ++{ ++#if SOCK_WAIT_BATCH_NOTIFY ++ if (likely(stack_id >= 0)) { ++ lwip_wait_add_notify(sk_wait, sk_event, pending, stack_id); ++ return; ++ } ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ ++ rte_spinlock_lock(&sk_wait->epcb.lock); ++ sk_event->pending |= pending; ++ if (list_node_null(&sk_event->event_node)) { ++ list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ } ++ rte_spinlock_unlock(&sk_wait->epcb.lock); ++ ++ sys_mutex_unlock_internal(&sk_wait->mutex); ++} ++ ++static void rtw_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++{ ++ rte_spinlock_lock(&sk_wait->epcb.lock); ++ sk_event->pending &= ~pending; ++ if (sk_event->pending == 0) { ++ list_del_node(&sk_event->event_node); ++ } ++ rte_spinlock_unlock(&sk_wait->epcb.lock); ++} ++ ++static void rtc_poll_notify_event(struct sock_wait *sk_wait, struct 
sock_event *sk_event, ++ unsigned pending, int stack_id) ++{ ++} ++static void rtc_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++{ ++} ++static void rtw_poll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, ++ unsigned pending, int stack_id) ++{ ++#if SOCK_WAIT_BATCH_NOTIFY ++ if (likely(stack_id >= 0)) { ++ lwip_wait_add_notify(sk_wait, NULL, 0, stack_id); ++ return; ++ } ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ sys_mutex_unlock_internal(&sk_wait->mutex); ++} ++static void rtw_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++{ ++} ++ ++/* Cannot support the same sock being waited by both epoll/poll/select or multiple epollfd. */ ++static void sock_wait_check_change(struct sock_wait *new_sk_wait, struct sock_wait *old_sk_wait) ++{ ++ if (old_sk_wait == NULL || new_sk_wait == old_sk_wait || ++ old_sk_wait->type == WAIT_CLOSE) { ++ return; ++ } ++ ++ if (new_sk_wait->type & WAIT_EPOLL) { ++ if (old_sk_wait->type & WAIT_EPOLL) { ++ LSTACK_LOG(ERR, LSTACK, "Cannot support the same sock being waited by multiple epollfd! \n"); ++ } else { ++ LSTACK_LOG(ERR, LSTACK, "Cannot support the same sock being waited by both epoll/poll/select! \n"); ++ } ++ } ++} ++ ++ ++static int epoll_cb_init(struct epoll_cb *epcb) ++{ ++ list_init_head(&epcb->event_list); ++ rte_spinlock_init(&epcb->lock); ++ return 0; ++} ++ ++static void epoll_cb_free(struct epoll_cb *epcb) ++{ ++ struct list_node *node, *temp; ++ struct sock_event *sk_event; ++ ++ rte_spinlock_lock(&epcb->lock); ++ ++ list_for_each_node(node, temp, &epcb->event_list) { ++ sk_event = list_entry(node, struct sock_event, event_node); ++ list_del_node(&sk_event->event_node); ++ } ++ ++ rte_spinlock_unlock(&epcb->lock); ++} ++ ++static int epoll_create_internal(int epfd) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ bool rtc_mode = get_global_cfg_params()->stack_mode_rtc; ++ struct sock_wait *sk_wait; ++ struct lwip_sock *epsock; ++ ++ epsock = lwip_get_socket(epfd); ++ if (epsock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "epfd=%d sock is NULL errno=%d\n", epfd, errno); ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ /* calloc will memset to zero */ ++ sk_wait = calloc(1, sizeof(struct sock_wait)); ++ if (sk_wait == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc null\n"); ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ sk_wait->type = WAIT_EPOLL; ++ sock_wait_common_init(sk_wait); ++ sock_wait_kernel_init(sk_wait, epfd, stack_group->stack_num); ++ epoll_cb_init(&sk_wait->epcb); ++ ++ if (rtc_mode) { ++ sk_wait->timedwait_fn = rtc_sock_wait_timedwait; ++ sk_wait->notify_fn = rtc_epoll_notify_event; ++ sk_wait->remove_fn = rtc_epoll_remove_event; ++ } else { ++ sk_wait->timedwait_fn = rtw_sock_wait_timedwait; ++ sk_wait->notify_fn = rtw_epoll_notify_event; ++ sk_wait->remove_fn = rtw_epoll_remove_event; ++ } ++ ++ epsock->sk_wait = sk_wait; ++ return 0; ++} ++ ++static int epoll_close_internal(int epfd) ++{ ++ struct sock_wait *sk_wait; ++ struct lwip_sock *epsock; ++ ++ epsock = lwip_get_socket(epfd); ++ if (epsock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "epfd=%d sock is NULL errno=%d\n", epfd, errno); ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ sk_wait = epsock->sk_wait; ++ if (sk_wait == NULL) { ++ return 0; ++ } ++ ++ sk_wait->type = WAIT_CLOSE; ++ epoll_cb_free(&sk_wait->epcb); ++ ++ posix_api->close_fn(sk_wait->epfd); ++ sock_wait_kernel_free(sk_wait); ++ sock_wait_common_free(sk_wait); ++ ++ sk_wait->timedwait_fn = NULL; ++ 
sk_wait->notify_fn = NULL;
++    sk_wait->remove_fn = NULL;
++
++    /* FIXME: set all 'sock->sk_wait = NULL' before free. */
++    free(sk_wait);
++    epsock->sk_wait = NULL;
++
++    return 0;
++}
++
++int lstack_epoll_create1(int flags)
++{
++    int epfd = posix_api->epoll_create1_fn(flags);
++    if (epfd != -1) {
++        if (epoll_create_internal(epfd) != 0) {
++            posix_api->close_fn(epfd);
++            epfd = -1;
++        }
++    }
++    return epfd;
++}
++
++int lstack_epoll_create(int size)
++{
++    /* Since Linux 2.6.8, the size argument is ignored,
++     * but must be greater than zero. */
++    return size <= 0 ? -1 : lstack_epoll_create1(0);
++}
++
++int lstack_epoll_close(int epfd)
++{
++    epoll_close_internal(epfd);
++    return posix_api->close_fn(epfd);
++}
++
++int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
++{
++    int ret;
++    struct lwip_sock *epsock = lwip_get_socket(epfd);
++    struct lwip_sock *sock = lwip_get_socket(fd);
++    struct sock_wait *sk_wait = epsock->sk_wait;
++    struct sock_event *sk_event;
++    unsigned pending;
++
++    if (epfd < 0 || fd < 0 || epfd == fd ||
++        (event == NULL && op != EPOLL_CTL_DEL)) {
++        LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op);
++        GAZELLE_RETURN(EINVAL);
++    }
++
++    LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(epfd=%d, op=%d, fd=%d, event=%p)\n",
++        __FUNCTION__, epfd, op, fd, event));
++
++    enum posix_type sk_type = select_sock_posix_path(sock);
++    /* has POSIX_KERNEL */
++    if (sk_type != POSIX_LWIP) {
++        ret = posix_api->epoll_ctl_fn(epfd, op, fd, event);
++        if (ret != 0) {
++            LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn failed, fd=%d epfd=%d op=%d\n", fd, epfd, op);
++            return ret;
++        }
++        if (op == EPOLL_CTL_ADD) {
++            sk_wait->kernel_nfds++;
++        } else if (op == EPOLL_CTL_DEL) {
++            sk_wait->kernel_nfds--;
++        }
++    }
++    /* is POSIX_KERNEL */
++    if (sk_type == POSIX_KERNEL)
++        return ret;
++
++    for (; sock != NULL; sock = sock->listen_next) {
++        sk_event = &sock->sk_event;
++
++        switch (op) {
++        case EPOLL_CTL_ADD:
++            sock_wait_check_change(sk_wait, sock->sk_wait);
++            sock->sk_wait = sk_wait;
++            /* fall through */
++        case EPOLL_CTL_MOD:
++            sk_event->events = event->events | EPOLLERR | EPOLLHUP;
++            sk_event->ep_data = event->data;
++
++            pending = sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_RCVPLUS, 0) |
++                sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_SENDPLUS, 0) |
++                sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_ERROR, 0);
++            sk_wait->notify_fn(sk_wait, sk_event, pending, -1);
++
++            sk_wait->lwip_nfds++;
++            sk_wait->affinity.stack_nfds[sock->stack_id]++;
++            break;
++        case EPOLL_CTL_DEL:
++            sk_event->events = 0;
++
++            pending = sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_RCVMINUS, 0) |
++                sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_SENDMINUS, 0) |
++                sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_ERROR, 0);
++            sk_wait->remove_fn(sk_wait, sk_event, pending);
++
++            sk_wait->lwip_nfds--;
++            sk_wait->affinity.stack_nfds[sock->stack_id]--;
++            break;
++        default:
++            GAZELLE_RETURN(EINVAL);
++        }
++    }
++
++    if (get_global_cfg_params()->app_bind_numa) {
++        affinity_update_max_stack(&sk_wait->affinity);
++    }
++    return 0;
++}
++
++static int epoll_scan_lwip_event(struct epoll_cb *epcb, struct epoll_event *events, int maxevents)
++{
++    bool rtc_mode = get_global_cfg_params()->stack_mode_rtc;
++    struct list_node *node, *temp;
++    struct sock_event *sk_event;
++    int num = 0;
++
++    if (!rtc_mode)
++        rte_spinlock_lock(&epcb->lock);
++
++    list_for_each_node(node, temp, &epcb->event_list) {
++        sk_event = list_entry(node, struct sock_event, event_node);
++        if (num >= maxevents) {
++            /* move list head after the current node,
++             * and start traversing from this node next time */
++            list_del_node(&epcb->event_list);
++            list_add_node(&epcb->event_list, node);
++            break;
++        }
++
++        if ((sk_event->events & sk_event->pending) == 0) {
++            // LSTACK_LOG(WARNING, LSTACK, "get empty event\n");
++            list_del_node(node);
++            continue;
++        }
++
++        events[num].events = sk_event->pending;
++        events[num].data = sk_event->ep_data;
++        num++;
++
++        if (sk_event->events & EPOLLET) {
++            sk_event->pending = 0;
++            list_del_node(node);
++        }
++
++        /* EPOLLONESHOT: generate the event again only after epoll_ctl add/mod re-arms it;
++         * set epoll_event to 0 to avoid generating events until epoll_ctl resets epoll_event */
++        if (sk_event->events & EPOLLONESHOT) {
++            sk_event->events = 0;
++            list_del_node(node);
++        }
++    }
++
++    if (!rtc_mode)
++        rte_spinlock_unlock(&epcb->lock);
++
++    return num;
++}
++
++int lstack_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout)
++{
++    bool rtc_mode = get_global_cfg_params()->stack_mode_rtc;
++    struct lwip_sock *epsock = lwip_get_socket(epfd);
++    struct sock_wait *sk_wait = epsock->sk_wait;
++    int kernel_num = 0;
++    int lwip_num = 0;
++    int lwip_maxevents;
++    uint32_t start;
++
++    if (unlikely(epfd < 0)) {
++        GAZELLE_RETURN(EBADF);
++    }
++    if (unlikely(events == NULL || maxevents <= 0 || timeout < -1)) {
++        GAZELLE_RETURN(EINVAL);
++    }
++
++    if (get_global_cfg_params()->app_bind_numa) {
++        affinity_bind_stack(sk_wait, &sk_wait->affinity);
++    }
++
++    /* avoid RTC app process events for a long time */
++    if (rtc_mode && maxevents > POLL_MAX_EVENTS) {
++        maxevents = POLL_MAX_EVENTS;
++    }
++    /* avoid the starvation of poll events from both kernel and lwip */
++    lwip_maxevents = (maxevents >> 1) + 1;
++
++    start = sys_now();
++
++    /* RTC try to recv polling.
*/ ++ sk_wait->timedwait_fn(sk_wait, 0, start); ++ do { ++ if (likely(sk_wait->lwip_nfds > 0)) { ++ lwip_num = epoll_scan_lwip_event(&sk_wait->epcb, events, lwip_maxevents); ++ } ++ ++ if (sk_wait->kernel_nfds > 0 && rte_atomic16_read(&sk_wait->kernel_pending)) { ++ kernel_num = posix_api->epoll_wait_fn( ++ sk_wait->epfd, &events[lwip_num], maxevents - lwip_num, 0); ++ if (unlikely(kernel_num == 0) && errno != EINTR && errno != EAGAIN) { ++ rte_atomic16_set(&sk_wait->kernel_pending, false); ++ } ++ } ++ ++ if (lwip_num + kernel_num > 0) { ++ break; ++ } ++ ++ timeout = sk_wait->timedwait_fn(sk_wait, timeout, start); ++ } while (timeout != 0); ++ ++ sk_wait->stat.app_events += lwip_num; ++ sk_wait->stat.kernel_events += kernel_num; ++ ++ return lwip_num + kernel_num; ++} ++ ++static void poll_cb_free(struct poll_cb *pcb) ++{ ++ if (pcb->lwip_p_fds != NULL) { ++ free(pcb->lwip_p_fds); ++ pcb->lwip_p_fds = NULL; ++ } ++ if (pcb->kernel_fds != NULL) { ++ free(pcb->kernel_fds); ++ pcb->kernel_fds = NULL; ++ } ++} ++ ++static int poll_cb_init(struct poll_cb *pcb, int nfds) ++{ ++ if (nfds <= 0) ++ return 0; ++ ++ pcb->lwip_p_fds = calloc(1, sizeof(*pcb->lwip_p_fds) * nfds); ++ pcb->kernel_fds = calloc(1, sizeof(*pcb->kernel_fds) * nfds); ++ ++ if (pcb->lwip_p_fds == NULL || pcb->kernel_fds == NULL) { ++ poll_cb_free(pcb); ++ return -1; ++ } ++ ++ pcb->max_nfds = nfds; ++ return 0; ++} ++ ++static int poll_init_wait(struct sock_wait *sk_wait, int nfds) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ bool rtc_mode = get_global_cfg_params()->stack_mode_rtc; ++ int epfd; ++ ++ epfd = posix_api->epoll_create_fn(POLL_MAX_EVENTS); ++ if (epfd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_create failed, errno %d\n", errno); ++ return -1; ++ } ++ ++ sk_wait->type = WAIT_POLL; ++ sock_wait_common_init(sk_wait); ++ sock_wait_kernel_init(sk_wait, epfd, stack_group->stack_num); ++ ++ if (rtc_mode) { ++ sk_wait->timedwait_fn = rtc_sock_wait_timedwait; ++ sk_wait->notify_fn = rtc_poll_notify_event; ++ sk_wait->remove_fn = rtc_poll_remove_event; ++ } else { ++ sk_wait->timedwait_fn = rtw_sock_wait_timedwait; ++ sk_wait->notify_fn = rtw_poll_notify_event; ++ sk_wait->remove_fn = rtw_poll_remove_event; ++ } ++ ++ return poll_cb_init(&sk_wait->pcb, nfds); ++} ++ ++static void poll_free_wait(struct sock_wait *sk_wait) ++{ ++ sk_wait->type = WAIT_CLOSE; ++ poll_cb_free(&sk_wait->pcb); ++ ++ posix_api->close_fn(sk_wait->epfd); ++ sock_wait_kernel_free(sk_wait); ++ sock_wait_common_free(sk_wait); ++ ++ sk_wait->timedwait_fn = NULL; ++ sk_wait->notify_fn = NULL; ++ sk_wait->remove_fn = NULL; ++} ++ ++void poll_destruct_wait(void) ++{ ++ if (unlikely(g_sk_wait == NULL)) { ++ return; ++ } ++ ++ poll_free_wait(g_sk_wait); ++ ++ /* FIXME: set all 'sock->sk_wait = NULL' before free. 
*/ ++ free(g_sk_wait); ++ g_sk_wait = NULL; ++} ++ ++struct sock_wait *poll_construct_wait(int nfds) ++{ ++ if (unlikely(g_sk_wait == NULL)) { ++ g_sk_wait = calloc(1, sizeof(struct sock_wait)); ++ if (g_sk_wait == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ return NULL; ++ } ++ ++ if (poll_init_wait(g_sk_wait, nfds) < 0) { ++ free(g_sk_wait); ++ return NULL; ++ } ++ } ++ ++ /* resize poll_cb */ ++ if (g_sk_wait->pcb.max_nfds < nfds) { ++ poll_cb_free(&g_sk_wait->pcb); ++ if (poll_cb_init(&g_sk_wait->pcb, nfds) != 0) { ++ return NULL; ++ } ++ } ++ ++ return g_sk_wait; ++} ++ ++static bool poll_ctl_kernel_event(int epfd, int fds_id, ++ const struct pollfd *new_fds, const struct pollfd *old_fds) ++{ ++ int ret; ++ struct epoll_event epevent; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(epfd=%d, old_fd=%d, new_fd=%d)\n", ++ __FUNCTION__, epfd, old_fds->fd, new_fds->fd)); ++ ++ epevent.data.fd = fds_id; ++ epevent.events = new_fds->events; ++ ++ /* EPOLL_CTL_MOD may not be any events, but why? */ ++ if (old_fds->fd == 0) { ++ ret = posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, new_fds->fd, &epevent); ++ } else { ++ ret = posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_DEL, old_fds->fd, NULL); ++ ret |= posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, new_fds->fd, &epevent); ++ } ++ ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d, new_fd %d, old_fd %d\n", ++ errno, new_fds->fd, old_fds->fd); ++ } ++ return true; ++} ++ ++static int poll_wait_kernel_event(int epfd, struct pollfd *fds, int maxevents) ++{ ++ struct epoll_event epevents[POLL_MAX_EVENTS]; ++ int num = 0; ++ int i, fds_id; ++ ++ num = posix_api->epoll_wait_fn(epfd, epevents, maxevents, 0); ++ for (i = 0; i < num; ++i) { ++ fds_id = epevents[i].data.fd; ++ fds[fds_id].revents = epevents[i].events; ++ } ++ ++ return num; ++} ++ ++static void poll_prepare_wait(struct sock_wait *sk_wait, struct pollfd *fds, nfds_t nfds) ++{ ++ struct poll_cb *pcb = &sk_wait->pcb; ++ struct lwip_sock *sock; ++ enum posix_type sk_type; ++ int fd, i; ++ ++ sk_wait->lwip_nfds = 0; ++ sk_wait->kernel_nfds = 0; ++ memset_s(&sk_wait->affinity.stack_nfds, sizeof(sk_wait->affinity.stack_nfds), ++ 0, sizeof(sk_wait->affinity.stack_nfds)); ++ ++ for (i = 0; i < nfds; ++i) { ++ fd = fds[i].fd; ++ sock = lwip_get_socket(fd); ++ sk_type = select_sock_posix_path(sock); ++ ++ if (sk_type & POSIX_KERNEL) { ++ poll_ctl_kernel_event(sk_wait->epfd, i, &fds[i], ++ &pcb->kernel_fds[sk_wait->kernel_nfds]); ++ pcb->kernel_fds[sk_wait->kernel_nfds] = fds[i]; ++ sk_wait->kernel_nfds++; ++ } ++ ++ if (sk_type & POSIX_LWIP) { ++ pcb->lwip_p_fds[sk_wait->lwip_nfds] = &fds[i]; ++ sk_wait->lwip_nfds++; ++ ++ for (; sock != NULL; sock = sock->listen_next) { ++ sock->sk_event.events = fds[i].events | POLLERR; ++ sock_wait_check_change(sk_wait, sock->sk_wait); ++ sock->sk_wait = sk_wait; ++ sk_wait->affinity.stack_nfds[sock->stack_id]++; ++ } ++ } ++ } ++ ++ if (get_global_cfg_params()->app_bind_numa) { ++ affinity_update_max_stack(&sk_wait->affinity); ++ affinity_bind_stack(sk_wait, &sk_wait->affinity); ++ } ++ ++ return; ++} ++ ++static int poll_scan_lwip_event(struct poll_cb *pcb, int nfds, int maxevents) ++{ ++ struct lwip_sock *sock; ++ struct pollfd *fds; ++ int num = 0; ++ int i; ++ ++ for (i = 0; i < nfds && num < maxevents; ++i) { ++ fds = pcb->lwip_p_fds[i]; ++ sock = lwip_get_socket(fds->fd); ++ ++ for (; !POSIX_IS_CLOSED(sock); sock = sock->listen_next) { ++ fds->revents = sock_event_hold_pending(sock, WAIT_POLL, NETCONN_EVT_RCVPLUS, 0) | 
++ sock_event_hold_pending(sock, WAIT_POLL, NETCONN_EVT_SENDPLUS, 0) | ++ sock_event_hold_pending(sock, WAIT_POLL, NETCONN_EVT_ERROR, 0); ++ if (fds->revents != 0) { ++ num++; ++ break; ++ } ++ } ++ } ++ ++ return num; ++} ++ ++int lstack_poll(struct pollfd *fds, nfds_t nfds, int timeout) ++{ ++ struct sock_wait *sk_wait; ++ int kernel_num = 0; ++ int lwip_num = 0; ++ uint32_t start; ++ ++ if (unlikely(fds == NULL || nfds == 0 || timeout < -1)) { ++ GAZELLE_RETURN(EINVAL); ++ } ++ ++ sk_wait = poll_construct_wait(nfds); ++ if (unlikely(sk_wait == NULL)) { ++ return -1; ++ } ++ poll_prepare_wait(sk_wait, fds, nfds); ++ ++ if (sk_wait->lwip_nfds == 0 && sk_wait->kernel_nfds > 0) { ++ return posix_api->poll_fn(fds, nfds, timeout); ++ } ++ ++ start = sys_now(); ++ ++ /* RTC try to recv polling. */ ++ sk_wait->timedwait_fn(sk_wait, 0, start); ++ do { ++ if (sk_wait->lwip_nfds > 0) { ++ lwip_num = poll_scan_lwip_event(&sk_wait->pcb, sk_wait->lwip_nfds, nfds); ++ } ++ ++ if (sk_wait->kernel_nfds > 0 && rte_atomic16_read(&sk_wait->kernel_pending)) { ++ kernel_num = poll_wait_kernel_event(sk_wait->epfd, fds, sk_wait->kernel_nfds); ++ if (kernel_num == 0 && errno != EINTR && errno != EAGAIN) { ++ rte_atomic16_set(&sk_wait->kernel_pending, false); ++ } ++ } ++ ++ if (lwip_num + kernel_num > 0) { ++ break; ++ } ++ ++ timeout = sk_wait->timedwait_fn(sk_wait, timeout, start); ++ } while (timeout != 0); ++ ++ sk_wait->stat.app_events += lwip_num; ++ sk_wait->stat.kernel_events += kernel_num; ++ ++ return lwip_num + kernel_num; ++} ++ ++/* refer to linux kernel */ ++#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) ++#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) ++#define POLLEX_SET (EPOLLPRI) ++ ++static int fds_select2poll(struct pollfd *fds, int maxfd, ++ fd_set *readfds, fd_set *writefds, fd_set *exceptfds) ++{ ++ int nfds = 0; ++ ++ for (int i = 0; i < maxfd; i++) { ++ if (readfds && FD_ISSET(i, readfds)) { ++ fds[nfds].events = POLLIN_SET; ++ } ++ if (writefds && FD_ISSET(i, writefds)) { ++ fds[nfds].events |= POLLOUT_SET; ++ } ++ if (exceptfds && FD_ISSET(i, exceptfds)) { ++ fds[nfds].events |= POLLEX_SET; ++ } ++ if (fds[nfds].events > 0) { ++ fds[nfds].fd = i; ++ nfds++; ++ } ++ } ++ ++ return nfds; ++} ++ ++static void fds_poll2select(const struct pollfd *fds, int nfds, ++ fd_set *readfds, fd_set *writefds, fd_set *exceptfds) ++{ ++ if (readfds) ++ FD_ZERO(readfds); ++ if (writefds) ++ FD_ZERO(writefds); ++ if (exceptfds) ++ FD_ZERO(exceptfds); ++ ++ for (int i = 0; i < nfds; ++i) { ++ if (readfds && fds[i].revents & POLLIN_SET) { ++ FD_SET(fds[i].fd, readfds); ++ } ++ if (writefds && fds[i].revents & POLLOUT_SET) { ++ FD_SET(fds[i].fd, writefds); ++ } ++ if (exceptfds && fds[i].revents & POLLEX_SET) { ++ FD_SET(fds[i].fd, exceptfds); ++ } ++ } ++} ++ ++static inline int timeval2ms(struct timeval *timeval) ++{ ++ if (timeval == NULL) { ++ return -1; ++ } ++ return timeval->tv_sec * MS_PER_S + timeval->tv_usec / (US_PER_S / MS_PER_S); ++} ++ ++int lstack_select(int nfds, fd_set *readfds, fd_set *writefds, ++ fd_set *exceptfds, struct timeval *timeout) ++{ ++ struct pollfd poll_fds[FD_SETSIZE] = {0}; ++ int poll_nfds, num; ++ int time_ms; ++ ++ if (unlikely(nfds < 0 || nfds > FD_SETSIZE)) { ++ LSTACK_LOG(ERR, LSTACK, "select invalid args, nfds=%d\n", nfds); ++ GAZELLE_RETURN(EINVAL); ++ } ++ if (timeout != NULL && unlikely(timeout->tv_sec < 0 || timeout->tv_usec < 0)) { ++ LSTACK_LOG(ERR, LSTACK, "select invalid args, timeout\n"); ++ 
GAZELLE_RETURN(EINVAL); ++ } ++ /* empty fds, just timeout */ ++ if (!readfds && !writefds && !exceptfds) { ++ return posix_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); ++ } ++ ++ time_ms = timeval2ms(timeout); ++ ++ poll_nfds = fds_select2poll(poll_fds, nfds, readfds, writefds, exceptfds); ++ num = lstack_poll(poll_fds, poll_nfds, time_ms); ++ fds_poll2select(poll_fds, poll_nfds, readfds, writefds, exceptfds); ++ ++ return num; ++} ++ ++void epoll_api_init(posix_api_t *api) ++{ ++ api->epoll_create1_fn = lstack_epoll_create1; ++ api->epoll_create_fn = lstack_epoll_create; ++ api->epoll_ctl_fn = lstack_epoll_ctl; ++ api->epoll_wait_fn = lstack_epoll_wait; ++ ++ api->poll_fn = lstack_poll; ++ api->select_fn = lstack_select; ++} ++ ++bool sock_event_wait(struct lwip_sock *sock, bool noblocking) ++{ ++ bool rtc_mode = get_global_cfg_params()->stack_mode_rtc; ++ uint32_t start; ++ int timeout; ++ unsigned pending; ++ ++ if (!rtc_mode && noblocking) ++ return false; ++ ++ if (unlikely(sock->sk_wait == NULL) || sock->sk_wait->type == WAIT_CLOSE) { ++ sock->sk_wait = poll_construct_wait(0); ++ } ++ if (!(sock->sk_wait->type & WAIT_BLOCK)) { ++ sock->sk_wait->type |= WAIT_BLOCK; ++ } ++ ++ if (rtc_mode) { ++ /* RTC try to recv polling. */ ++ sock->sk_wait->timedwait_fn(sock->sk_wait, 0, 0); ++ return true; ++ } ++ ++ timeout = sock->conn->recv_timeout == 0 ? -1 : sock->conn->recv_timeout; ++ start = sys_now(); ++ do { ++ pending = sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0) | ++ sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0); ++ if (pending) { ++ return true; ++ } ++ timeout = sock->sk_wait->timedwait_fn(sock->sk_wait, timeout, start); ++ } while (timeout != 0); ++ ++ return false; ++} ++#endif /* SOCK_EVENT_V2 */ +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +new file mode 100644 +index 0000000..c67df93 +--- /dev/null ++++ b/src/lstack/core/lstack_wait.c +@@ -0,0 +1,550 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. 
++*/ ++ ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "common/gazelle_base_func.h" ++#include "lstack_wait.h" ++#include "lstack_log.h" ++#include "lstack_cfg.h" ++#include "same_node.h" ++#include "mbox_ring.h" ++ ++#define KERNEL_EVENT_WAIT_US 10 ++#define LWIP_EVENT_WAIT_US 10 ++ ++struct kernel_wait { ++ int epfd; ++}; ++ ++struct lwip_wait { ++#if SOCK_WAIT_BATCH_NOTIFY ++ struct list_node stk_notify_list; ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++} __rte_cache_aligned; ++ ++struct sock_wait_group { ++ struct kernel_wait kwaits[PROTOCOL_STACK_MAX]; ++ struct lwip_wait lwaits[PROTOCOL_STACK_MAX]; ++ ++ _Atomic uint16_t affinity_tick; ++ ++ /* new cache line */ ++ char pad0 __rte_cache_aligned; ++ ++ /* dfx stat */ ++ struct list_node group_list; ++ rte_spinlock_t group_list_lock; ++}; ++static struct sock_wait_group g_wait_group = {0}; ++ ++static inline struct kernel_wait *kernel_wait_get(int stack_id) ++{ ++ if (unlikely(stack_id < 0 || stack_id >= PROTOCOL_STACK_MAX)) { ++ return NULL; ++ } ++ return &g_wait_group.kwaits[stack_id]; ++} ++ ++static inline struct lwip_wait *lwip_wait_get(int stack_id) ++{ ++ if (unlikely(stack_id < 0 || stack_id >= PROTOCOL_STACK_MAX)) { ++ return NULL; ++ } ++ return &g_wait_group.lwaits[stack_id]; ++} ++ ++static int lwip_wait_init(int stack_id) ++{ ++ struct lwip_wait *lwait = lwip_wait_get(stack_id); ++ LWIP_UNUSED_ARG(lwait); ++ ++#if SOCK_WAIT_BATCH_NOTIFY ++ list_init_head(&lwait->stk_notify_list); ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ return 0; ++} ++ ++static int kernel_wait_init(int stack_id) ++{ ++ struct kernel_wait *kwait = kernel_wait_get(stack_id); ++ ++ kwait->epfd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); ++ if (kwait->epfd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_create failed, errno %d\n", errno); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int sock_wait_group_init(void) ++{ ++ list_init_head(&g_wait_group.group_list); ++ rte_spinlock_init(&g_wait_group.group_list_lock); ++ return 0; ++} ++ ++static inline void sock_wait_group_add(struct sock_wait *sk_wait) ++{ ++ list_init_node(&sk_wait->group_node); ++ ++ rte_spinlock_lock(&g_wait_group.group_list_lock); ++ list_add_node(&sk_wait->group_node, &g_wait_group.group_list); ++ rte_spinlock_unlock(&g_wait_group.group_list_lock); ++} ++ ++static inline void sock_wait_group_del(struct sock_wait *sk_wait) ++{ ++ rte_spinlock_lock(&g_wait_group.group_list_lock); ++ list_del_node(&sk_wait->group_node); ++ rte_spinlock_unlock(&g_wait_group.group_list_lock); ++} ++ ++void sock_wait_group_stat(int stack_id, struct gazelle_wakeup_stat *stat) ++{ ++ struct sock_wait *sk_wait; ++ struct list_node *node, *next; ++ ++ rte_spinlock_lock(&g_wait_group.group_list_lock); ++ ++ list_for_each_node(node, next, &g_wait_group.group_list) { ++ sk_wait = list_entry(node, struct sock_wait, group_node); ++ ++ if (sk_wait->affinity.bind_stack_id == stack_id) { ++ memcpy_s(stat, sizeof(struct gazelle_wakeup_stat), ++ &sk_wait->stat, sizeof(struct gazelle_wakeup_stat)); ++ } ++ } ++ ++ rte_spinlock_unlock(&g_wait_group.group_list_lock); ++} ++ ++int kernel_wait_ctl(struct sock_wait *sk_wait, int new_stack_id, int old_stack_id) ++{ ++ int ret; ++ struct kernel_wait *old_kwait = kernel_wait_get(old_stack_id); ++ struct kernel_wait *new_kwait = kernel_wait_get(new_stack_id); ++ struct epoll_event epevent; ++ ++ /* not change */ ++ if (old_kwait != NULL && old_kwait == new_kwait) { ++ return 0; ++ } ++ ++ if (old_kwait) { ++ ret = 
posix_api->epoll_ctl_fn(old_kwait->epfd, EPOLL_CTL_DEL, sk_wait->epfd, NULL);
++        if (ret != 0) {
++            LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d\n", errno);
++            return -1;
++        }
++    }
++
++    if (new_kwait) {
++        epevent.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET;
++        epevent.data.ptr = sk_wait;
++        ret = posix_api->epoll_ctl_fn(new_kwait->epfd, EPOLL_CTL_ADD, sk_wait->epfd, &epevent);
++        if (ret != 0) {
++            LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d\n", errno);
++            return -1;
++        }
++    }
++
++    return 0;
++}
++
++void* kernel_wait_thread(void *arg)
++{
++    struct thread_params *t_params = (struct thread_params*) arg;
++    unsigned stack_id = t_params->idx;
++    struct epoll_event kernel_events[KERNEL_EPOLL_MAX];
++    int num, i;
++    struct kernel_wait *kwait;
++    struct sock_wait *sk_wait;
++
++    bind_to_stack_numa(stack_id);
++    free(arg);
++    sem_post(&get_protocol_stack_group()->sem_stack_setup);
++
++    lwip_wait_init(stack_id);
++    kernel_wait_init(stack_id);
++    kwait = kernel_wait_get(stack_id);
++
++    LSTACK_LOG(INFO, LSTACK, "kernelevent_%02u start\n", stack_id);
++
++    for (;;) {
++        num = posix_api->epoll_wait_fn(kwait->epfd, kernel_events, KERNEL_EPOLL_MAX, -1);
++        if (num < 0 && errno != EINTR && errno != EAGAIN) {
++            LSTACK_LOG(ERR, LSTACK, "epoll_wait failed, errno %d\n", errno);
++        }
++
++        for (i = 0; i < num; ++i) {
++            sk_wait = kernel_events[i].data.ptr;
++            if (sk_wait->type == WAIT_CLOSE)
++                continue;
++            rte_atomic16_set(&sk_wait->kernel_pending, true);
++            sys_mutex_unlock_internal(&sk_wait->mutex);
++        }
++        usleep(KERNEL_EVENT_WAIT_US);
++    }
++
++    return NULL;
++}
++
++static unsigned affinity_choice_stack(int stack_num)
++{
++    if (get_global_cfg_params()->stack_mode_rtc) {
++        return get_protocol_stack()->stack_idx;
++    }
++    return atomic_fetch_add(&g_wait_group.affinity_tick, 1) % stack_num;
++}
++
++static void affinity_find_max_stack(struct wait_affinity *affinity, int stack_num)
++{
++    int max_stack_id = affinity->max_stack_id;
++
++    for (int i = 0; i < stack_num; i++) {
++        if (affinity->stack_nfds[i] > affinity->stack_nfds[max_stack_id]) {
++            max_stack_id = i;
++        }
++    }
++    affinity->max_stack_id = max_stack_id;
++}
++
++void affinity_update_max_stack(struct wait_affinity *affinity)
++{
++    struct protocol_stack_group *stack_group;
++
++    if (get_global_cfg_params()->stack_mode_rtc) {
++        affinity->max_stack_id = get_protocol_stack()->stack_idx;
++    } else {
++        stack_group = get_protocol_stack_group();
++        affinity_find_max_stack(affinity, stack_group->stack_num);
++    }
++}
++
++void affinity_bind_stack(struct sock_wait *sk_wait, struct wait_affinity *affinity)
++{
++    if (affinity->max_stack_id != affinity->bind_stack_id) {
++        bind_to_stack_numa(affinity->max_stack_id);
++        kernel_wait_ctl(sk_wait, affinity->max_stack_id, affinity->bind_stack_id);
++        affinity->bind_stack_id = affinity->max_stack_id;
++    }
++}
++
++int sock_event_init(struct sock_event *sk_event)
++{
++    memset_s(sk_event, sizeof(struct sock_event), 0, sizeof(struct sock_event));
++
++    list_init_node(&sk_event->event_node);
++#if SOCK_WAIT_BATCH_NOTIFY
++    list_init_node(&sk_event->stk_event_node);
++#endif /* SOCK_WAIT_BATCH_NOTIFY */
++    return 0;
++}
++
++void sock_event_free(struct sock_event *sk_event, struct sock_wait *sk_wait)
++{
++    if (sk_wait && sk_wait->type & WAIT_EPOLL) {
++        rte_spinlock_lock(&sk_wait->epcb.lock);
++        list_del_node(&sk_event->event_node);
++        rte_spinlock_unlock(&sk_wait->epcb.lock);
++
++#if SOCK_WAIT_BATCH_NOTIFY
++        list_del_node(&sk_event->stk_event_node);
++#endif /*
SOCK_WAIT_BATCH_NOTIFY */ ++ } ++} ++ ++int sock_wait_common_init(struct sock_wait *sk_wait) ++{ ++ sk_wait->lwip_nfds = 0; ++ sk_wait->kernel_nfds = 0; ++ sys_mutex_new_internal(&sk_wait->mutex); ++ ++#if SOCK_WAIT_BATCH_NOTIFY ++ for (int i = 0; i < PROTOCOL_STACK_MAX; ++i) { ++ list_init_node(&sk_wait->stk_notify_node[i]); ++ list_init_head(&sk_wait->stk_event_list[i]); ++ } ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ sock_wait_group_add(sk_wait); ++ ++ return 0; ++} ++ ++void sock_wait_common_free(struct sock_wait *sk_wait) ++{ ++#if SOCK_WAIT_BATCH_NOTIFY ++ bool wait_stack; ++ ++ /* wait lwip_wait_foreach_notify() finish. */ ++ do { ++ wait_stack = false; ++ for (int i = 0; i < PROTOCOL_STACK_MAX; ++i) { ++ rte_mb(); ++ if (!list_node_null(&sk_wait->stk_notify_node[i])) { ++ wait_stack = true; ++ usleep(LWIP_EVENT_WAIT_US); ++ break; ++ } ++ } ++ } while (wait_stack); ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ ++ sock_wait_group_del(sk_wait); ++ sys_mutex_free_internal(&sk_wait->mutex); ++} ++ ++int sock_wait_kernel_init(struct sock_wait *sk_wait, int epfd, int stack_num) ++{ ++ sk_wait->epfd = epfd; ++ sk_wait->affinity.max_stack_id = affinity_choice_stack(stack_num); ++ kernel_wait_ctl(sk_wait, sk_wait->affinity.max_stack_id, -1); ++ sk_wait->affinity.bind_stack_id = sk_wait->affinity.max_stack_id; ++ ++ rte_atomic16_init(&sk_wait->kernel_pending); ++ rte_atomic16_set(&sk_wait->kernel_pending, true); ++ return 0; ++} ++ ++void sock_wait_kernel_free(struct sock_wait *sk_wait) ++{ ++ kernel_wait_ctl(sk_wait, -1, sk_wait->affinity.bind_stack_id); ++ sk_wait->epfd = -1; ++ sk_wait->affinity.bind_stack_id = -1; ++ sk_wait->affinity.max_stack_id = -1; ++ ++ rte_atomic16_clear(&sk_wait->kernel_pending); ++} ++ ++ ++static inline bool NETCONN_NEED_ACCEPT(const struct lwip_sock *sock) ++{ ++ if (sys_mbox_valid(&sock->conn->acceptmbox)) { ++ const struct mbox_ring *mr = &sock->conn->acceptmbox->mring; ++ return mr->ops->count(mr) > 0; ++ } ++ return false; ++} ++ ++static inline bool NETCONN_NEED_RECV(const struct lwip_sock *sock) ++{ ++ if (sock->lastdata.pbuf != NULL) ++ return true; ++ if (sys_mbox_valid(&sock->conn->recvmbox)) { ++ const struct mbox_ring *mr = &sock->conn->recvmbox->mring; ++ return mr->ops->recv_count(mr) > 0; ++ } ++ return false; ++} ++ ++static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) ++{ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ if (NETCONN_TYPE(sock->conn) == NETCONN_TCP) ++ return lwip_tcp_allow_send(sock->conn->pcb.tcp); ++ return false; ++ } ++ if (sys_mbox_valid(&sock->conn->sendmbox)) { ++ const struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ return mr->ops->free_count(mr) > 0; ++ } ++ return false; ++} ++ ++static unsigned sock_event_lose_pending(const struct lwip_sock *sock, enum netconn_evt evt, unsigned len) ++{ ++ unsigned event = 0; ++ ++ switch (evt) { ++ case NETCONN_EVT_RCVMINUS: ++ if (sock->sk_event.events & EPOLLIN) { ++ if (!NETCONN_NEED_RECV(sock) && ++ !NETCONN_NEED_ACCEPT(sock)) { ++ event = EPOLLIN; ++ } ++ } ++ break; ++ case NETCONN_EVT_SENDMINUS: ++ if (sock->sk_event.events & EPOLLOUT) { ++ if (!NETCONN_ALLOW_SEND(sock)) { ++ event = EPOLLOUT; ++ } ++ } ++ break; ++ default: ++ break; ++ } ++ ++ return event; ++} ++ ++unsigned sock_event_hold_pending(const struct lwip_sock *sock, ++ enum sock_wait_type type, enum netconn_evt evt, unsigned len) ++{ ++ unsigned event = 0; ++ ++ switch (evt) { ++ case NETCONN_EVT_RCVPLUS: ++ if (sock->sk_event.events & EPOLLIN || type & WAIT_BLOCK) { ++ if (len > 0 || ++ 
NETCONN_NEED_RECV(sock) || ++ NETCONN_NEED_ACCEPT(sock)) { ++ event = EPOLLIN; ++ } ++ } ++ break; ++ case NETCONN_EVT_SENDPLUS: ++ if (sock->sk_event.events & EPOLLOUT) { ++ if (len > 0 || ++ NETCONN_ALLOW_SEND(sock)) { ++ event = EPOLLOUT; ++ } ++ } ++ break; ++ case NETCONN_EVT_ERROR: ++ if (sock->errevent) { ++ event = EPOLLERR | EPOLLHUP | EPOLLIN; ++ } ++ break; ++ default: ++ break; ++ } ++ ++ return event; ++} ++ ++void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len) ++{ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, sk_wait=%p, evt=%d, len=%u)\n", ++ __FUNCTION__, sock, sock->sk_wait, evt, len)); ++ ++ if (sock->sk_wait == NULL) { ++ return; ++ } ++ if (unlikely(sock->sk_wait->type == WAIT_CLOSE)) { ++ sock->sk_wait = NULL; ++ return; ++ } ++ ++ unsigned pending = sock_event_lose_pending(sock, evt, 0); ++ if (pending) { ++ sock->sk_wait->remove_fn(sock->sk_wait, &sock->sk_event, pending); ++ } ++} ++ ++void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len) ++{ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, sk_wait=%p, evt=%d, len=%u)\n", ++ __FUNCTION__, sock, sock->sk_wait, evt, len)); ++ ++ if (sock->sk_wait == NULL) { ++ return; ++ } ++ if (unlikely(sock->sk_wait->type == WAIT_CLOSE)) { ++ sock->sk_wait = NULL; ++ return; ++ } ++ ++ unsigned pending = sock_event_hold_pending(sock, sock->sk_wait->type, evt, len); ++ if (pending) { ++ sock->sk_wait->notify_fn(sock->sk_wait, &sock->sk_event, pending, sock->stack_id); ++ } ++} ++ ++#if SOCK_WAIT_BATCH_NOTIFY ++/* Only allow stack call */ ++void lwip_wait_add_notify(struct sock_wait *sk_wait, struct sock_event *sk_event, ++ unsigned pending, int stack_id) ++{ ++ struct lwip_wait *lwait = lwip_wait_get(stack_id); ++ ++ if (sk_event != NULL) { ++ sk_event->stk_pending |= pending; ++ if (list_node_null(&sk_event->stk_event_node)) { ++ list_add_node(&sk_event->stk_event_node, &sk_wait->stk_event_list[stack_id]); ++ } ++ } ++ ++ if (list_node_null(&sk_wait->stk_notify_node[stack_id])) { ++ list_add_node(&sk_wait->stk_notify_node[stack_id], &lwait->stk_notify_list); ++ } ++} ++ ++static inline ++unsigned sock_wait_foreach_event(struct sock_wait *sk_wait, int stack_id) ++{ ++ struct list_node *node, *next; ++ struct sock_event *sk_event; ++ unsigned count = 0; ++ ++ /* only rtw epoll need */ ++ if (list_head_empty(&sk_wait->stk_event_list[stack_id])) ++ return 0; ++ ++ rte_spinlock_lock(&sk_wait->epcb.lock); ++ ++ list_for_each_node(node, next, &sk_wait->stk_event_list[stack_id]) { ++ list_del_node(node); ++ sk_event = container_of(node, struct sock_event, stk_event_node); ++ ++ /* see rtw_epoll_notify_event() */ ++ sk_event->pending |= sk_event->stk_pending; ++ if (list_node_null(&sk_event->event_node)) { ++ list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ } ++ ++ sk_event->stk_pending = 0; ++ count++; ++ } ++ ++ rte_spinlock_unlock(&sk_wait->epcb.lock); ++ ++ return count; ++} ++ ++/* Only allow stack call */ ++unsigned lwip_wait_foreach_notify(int stack_id) ++{ ++ struct lwip_wait *lwait = lwip_wait_get(stack_id); ++ struct sock_wait *sk_wait; ++ struct list_node *node, *next; ++ unsigned count = 0; ++ ++ list_for_each_node(node, next, &lwait->stk_notify_list) { ++ list_del_node(node); ++ sk_wait = container_of_uncheck_ptr((node - stack_id), struct sock_wait, stk_notify_node); ++ ++ sock_wait_foreach_event(sk_wait, stack_id); ++ ++ sys_mutex_unlock_internal(&sk_wait->mutex); ++ count++; ++ } ++ return count; ++} ++ ++bool lwip_wait_notify_empty(int 
stack_id)
++{
++    struct lwip_wait *lwait = lwip_wait_get(stack_id);
++    return list_head_empty(&lwait->stk_notify_list);
++}
++#endif /* SOCK_WAIT_BATCH_NOTIFY */
+diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h
+index 83eace7..9a5d15a 100644
+--- a/src/lstack/include/lstack_epoll.h
++++ b/src/lstack/include/lstack_epoll.h
+@@ -13,6 +13,8 @@
+ #ifndef _GAZELLE_EPOLL_H_
+ #define _GAZELLE_EPOLL_H_
+ 
++#if !SOCK_EVENT_V2
++
+ #include
+ #include
+ #include
+@@ -88,4 +90,20 @@ static inline void lstack_block_wakeup(struct wakeup_poll *wakeup)
+ }
+ }
+ 
++#else /* SOCK_EVENT_V2 */
++
++#include
++#include
++
++#include "lstack_wait.h"
++
++struct sock_wait *poll_construct_wait(int nfds);
++void poll_destruct_wait(void);
++
++int lstack_epoll_close(int epfd);
++void epoll_api_init(posix_api_t *api);
++bool sock_event_wait(struct lwip_sock *sock, bool noblocking);
++
++#endif /* SOCK_EVENT_V2 */
++
+ #endif /* _GAZELLE_EPOLL_H_ */
+diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h
+new file mode 100644
+index 0000000..ed154b4
+--- /dev/null
++++ b/src/lstack/include/lstack_wait.h
+@@ -0,0 +1,138 @@
++/*
++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
++* gazelle is licensed under the Mulan PSL v2.
++* You can use this software according to the terms and conditions of the Mulan PSL v2.
++* You may obtain a copy of Mulan PSL v2 at:
++* http://license.coscl.org.cn/MulanPSL2
++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
++* PURPOSE.
++* See the Mulan PSL v2 for more details.
++*/
++
++#ifndef _LSTACK_WAIT_H_
++#define _LSTACK_WAIT_H_
++
++#include
++#include
++#include
++#include
++
++#include
++#include
++
++#include
++#include
++#include
++#include
++
++#include "common/gazelle_dfx_msg.h"
++#include "lstack_protocol_stack.h"
++#include "lstack_cfg.h"
++
++
++#define NETCONN_TYPE(conn) NETCONNTYPE_GROUP(netconn_type((conn)))
++
++enum sock_wait_type {
++    WAIT_CLOSE = 0x00,
++    WAIT_POLL = 0x01,
++    WAIT_EPOLL = 0x02,
++    WAIT_BLOCK = 0x04,
++    WAIT_MAX = 0x08,
++};
++
++struct wait_affinity {
++    int bind_stack_id;
++    int max_stack_id;
++    unsigned stack_nfds[PROTOCOL_STACK_MAX];
++};
++
++/* epoll control block */
++struct epoll_cb {
++    struct list_node event_list;
++    rte_spinlock_t lock;
++};
++
++/* poll control block */
++struct poll_cb {
++    int max_nfds;
++    struct pollfd **lwip_p_fds;
++    struct pollfd *kernel_fds;
++};
++
++struct sock_wait {
++    enum sock_wait_type type;
++
++    /* blocks; returns 0 on timeout */
++    int (*timedwait_fn)(struct sock_wait *sk_wait, int timeout, uint32_t start);
++    /* trigger event */
++    void (*notify_fn)(struct sock_wait *sk_wait, struct sock_event *sk_event,
++        unsigned pending, int stack_id);
++    /* remove event */
++    void (*remove_fn)(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending);
++
++    /* dfx stat */
++    struct list_node group_node;
++    struct gazelle_wakeup_stat stat;
++
++    /* epoll kernel fd */
++    int epfd;
++
++    /* socket count */
++    unsigned lwip_nfds;
++    unsigned kernel_nfds;
++    struct wait_affinity affinity;
++
++    char pad0 __rte_cache_aligned; /* new cache line */
++
++#if SOCK_WAIT_BATCH_NOTIFY
++    /* lwip event foreach notify list */
++    struct list_node __rte_cache_aligned stk_notify_node[PROTOCOL_STACK_MAX];
++    struct list_node __rte_cache_aligned
stk_event_list[PROTOCOL_STACK_MAX]; ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ ++ char pad1 __rte_cache_aligned; /* new cache line */ ++ ++ /* kernel event flag */ ++ rte_atomic16_t kernel_pending; ++ /* run-to-wakeup blocking lock */ ++ struct sys_mutex mutex; ++ ++ union { ++ struct epoll_cb epcb; ++ struct poll_cb pcb; ++ }; ++}; ++ ++ ++int sock_wait_group_init(void); ++void sock_wait_group_stat(int stack_id, struct gazelle_wakeup_stat *stat); ++ ++void* kernel_wait_thread(void *arg); ++int kernel_wait_ctl(struct sock_wait *sk_wait, int new_stack_id, int old_stack_id); ++ ++#if SOCK_WAIT_BATCH_NOTIFY ++void lwip_wait_add_notify(struct sock_wait *sk_wait, struct sock_event *sk_event, ++ unsigned pending, int stack_id); ++unsigned lwip_wait_foreach_notify(int stack_id); ++bool lwip_wait_notify_empty(int stack_id); ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ ++ ++unsigned sock_event_hold_pending(const struct lwip_sock *sock, ++ enum sock_wait_type type, enum netconn_evt evt, unsigned len); ++void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); ++void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); ++ ++int sock_event_init(struct sock_event *sk_event); ++void sock_event_free(struct sock_event *sk_event, struct sock_wait *sk_wait); ++ ++int sock_wait_common_init(struct sock_wait *sk_wait); ++void sock_wait_common_free(struct sock_wait *sk_wait); ++ ++int sock_wait_kernel_init(struct sock_wait *sk_wait, int epfd, int stack_num); ++void sock_wait_kernel_free(struct sock_wait *sk_wait); ++ ++void affinity_update_max_stack(struct wait_affinity *affinity); ++void affinity_bind_stack(struct sock_wait *sk_wait, struct wait_affinity *affinity); ++ ++#endif /* _LSTACK_WAIT_H_ */ +-- +2.33.0 + diff --git a/0327-socket-adapt-to-sock_event.patch b/0327-socket-adapt-to-sock_event.patch new file mode 100644 index 0000000..c025539 --- /dev/null +++ b/0327-socket-adapt-to-sock_event.patch @@ -0,0 +1,2955 @@ +From 9098cb36f9a28fcc64b90168c71dfa4910c607ec Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 21 Mar 2025 16:53:37 +0800 +Subject: [PATCH] socket: adapt to sock_event + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 1034 -------------------- + src/lstack/api/lstack_rtc_api.c | 79 +- + src/lstack/api/lstack_rtw_api.c | 189 ++-- + src/lstack/api/lstack_wrap.c | 153 +-- + src/lstack/core/lstack_lwip.c | 217 ++-- + src/lstack/core/lstack_protocol_stack.c | 116 +-- + src/lstack/core/lstack_stack_stat.c | 32 +- + src/lstack/core/lstack_thread_rpc.c | 29 +- + src/lstack/core/lstack_wait.c | 26 +- + src/lstack/core/same_node.c | 5 +- + src/lstack/include/lstack_epoll.h | 81 -- + src/lstack/include/lstack_lwip.h | 4 - + src/lstack/include/lstack_protocol_stack.h | 52 +- + src/lstack/include/lstack_thread_rpc.h | 1 - + 14 files changed, 312 insertions(+), 1706 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index a110aa1..3940f43 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -10,1039 +10,6 @@ + * See the Mulan PSL v2 for more details. 
+ */ + +-#if !SOCK_EVENT_V2 +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +- +-#include "lstack_stack_stat.h" +-#include "lstack_cfg.h" +-#include "lstack_log.h" +-#include "common/dpdk_common.h" +-#include "common/gazelle_base_func.h" +-#include "lstack_lwip.h" +-#include "lstack_protocol_stack.h" +-#include "lstack_epoll.h" +- +-#define EPOLL_KERNEL_INTERVAL 10 /* ms */ +-#define SEC_TO_NSEC 1000000000 +-#define SEC_TO_MSEC 1000 +-#define MSEC_TO_NSEC 1000000 +-#define POLL_KERNEL_EVENTS 32 +- +-static void update_epoll_max_stack(struct wakeup_poll *wakeup); +-static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, +- struct protocol_stack *new_stack); +- +-static inline void add_wakeup_to_stack_wakeuplist(struct wakeup_poll *wakeup, struct protocol_stack *stack) +-{ +- if (list_node_null(&wakeup->wakeup_list[stack->stack_idx])) { +- list_add_node(&wakeup->wakeup_list[stack->stack_idx], &stack->wakeup_list); +- } +-} +- +-void add_sock_event_nolock(struct lwip_sock *sock, uint32_t event) +-{ +- struct wakeup_poll *wakeup = sock->wakeup; +- +- if (wakeup == NULL || wakeup->type == WAKEUP_CLOSE || (event & sock->epoll_events) == 0) { +- return; +- } +- +- if (!get_global_cfg_params()->stack_mode_rtc) { +- if (event == EPOLLIN && !NETCONN_IS_DATAIN(sock) && !NETCONN_IS_ACCEPTIN(sock)) { +- return; +- } +- +- if (event == EPOLLOUT && !NETCONN_IS_OUTIDLE(sock)) { +- return; +- } +- } +- +- sock->events |= (event == EPOLLERR) ? (EPOLLIN | EPOLLERR) : (event & sock->epoll_events); +- if (list_node_null(&sock->event_list)) { +- list_add_node(&sock->event_list, &wakeup->event_list); +- } +- return; +-} +- +-static void _add_sock_event(struct lwip_sock *sock, struct wakeup_poll *wakeup, uint32_t event) +-{ +- struct protocol_stack *stack = sock->stack; +- if (wakeup == NULL || wakeup->type == WAKEUP_CLOSE) { +- return; +- } +- +- if (wakeup->type == WAKEUP_BLOCK) { +- if (!(event & (EPOLLIN | EPOLLERR))) { +- return; +- } +- } else if (!(event & sock->epoll_events)) { +- return; +- } +- +- if (wakeup->type == WAKEUP_EPOLL) { +- pthread_spin_lock(&wakeup->event_list_lock); +- add_sock_event_nolock(sock, event); +- pthread_spin_unlock(&wakeup->event_list_lock); +- } +- +- add_wakeup_to_stack_wakeuplist(wakeup, stack); +- return; +-} +- +-void add_sock_event(struct lwip_sock *sock, uint32_t event) +-{ +- _add_sock_event(sock, sock->wakeup, event); +- _add_sock_event(sock, sock->recv_block, event); +-} +- +-void del_sock_event_nolock(struct lwip_sock *sock, uint32_t event) +-{ +- if (get_global_cfg_params()->stack_mode_rtc) { +- sock->events &= ~event; +- } else { +- if ((event & EPOLLOUT) && !NETCONN_IS_OUTIDLE(sock)) { +- sock->events &= ~EPOLLOUT; +- } +- if ((event & EPOLLIN) && !NETCONN_IS_DATAIN(sock) && !NETCONN_IS_ACCEPTIN(sock)) { +- sock->events &= ~EPOLLIN; +- } +- } +- +- if (sock->events == 0) { +- list_del_node(&sock->event_list); +- } +- return; +-} +- +-void del_sock_event(struct lwip_sock *sock, uint32_t event) +-{ +- pthread_spin_lock(&sock->wakeup->event_list_lock); +- del_sock_event_nolock(sock, event); +- pthread_spin_unlock(&sock->wakeup->event_list_lock); +-} +- +-void wakeup_stack_epoll(struct protocol_stack *stack) +-{ +- struct list_node *node, *temp; +- +- list_for_each_node(node, temp, &stack->wakeup_list) { +- struct wakeup_poll *wakeup = container_of_uncheck_ptr((node - stack->stack_idx), struct wakeup_poll, wakeup_list); +- +- if 
(__atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- rte_mb(); +- sem_post(&wakeup->wait); +- stack->stats.wakeup_events++; +- } +- +- list_del_node(&wakeup->wakeup_list[stack->stack_idx]); +- } +-} +- +-static uint32_t update_events(struct lwip_sock *sock) +-{ +- uint32_t event = 0; +- +- if (sock->epoll_events & EPOLLIN) { +- if (NETCONN_IS_DATAIN(sock) || NETCONN_IS_ACCEPTIN(sock)) { +- event |= EPOLLIN; +- } +- } +- +- if ((sock->epoll_events & EPOLLOUT) && NETCONN_IS_OUTIDLE(sock)) { +- /* lwip_netconn_do_connected set LIBOS FLAGS when connected */ +- if (!POSIX_IS_CLOSED(sock) && POSIX_IS_TYPE(sock, POSIX_LWIP)) { +- event |= EPOLLOUT; +- } +- } +- +- if (sock->errevent > 0) { +- event |= EPOLLERR | EPOLLIN; +- } +- +- return event; +-} +- +-static void rtc_raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *sock) +-{ +- uint32_t event = 0; +- +- if (sock->rcvevent) { +- event |= EPOLLIN; +- } +- +- if (sock->errevent > 0) { +- event |= EPOLLERR | EPOLLIN; +- } +- +- if (sock->sendevent) { +- /* lwip_netconn_do_connected set LIBOS FLAGS when connected */ +- if (!POSIX_IS_CLOSED(sock) && POSIX_IS_TYPE(sock, POSIX_LWIP)) { +- event |= EPOLLOUT; +- } +- } +- +- if (event) { +- sock->events = event; +- if (wakeup->type == WAKEUP_EPOLL && (sock->events & sock->epoll_events) && +- list_node_null(&sock->event_list)) { +- list_add_node(&sock->event_list, &wakeup->event_list); +- } +- } +-} +- +-static void raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *sock) +-{ +- uint32_t event = 0; +- +- pthread_spin_lock(&wakeup->event_list_lock); +- if (NETCONN_IS_DATAIN(sock) || NETCONN_IS_ACCEPTIN(sock)) { +- event |= EPOLLIN; +- } +- +- if (sock->errevent > 0) { +- event |= EPOLLERR | EPOLLIN; +- } +- +- if (NETCONN_IS_OUTIDLE(sock)) { +- /* lwip_netconn_do_connected set LIBOS FLAGS when connected */ +- if (!POSIX_IS_CLOSED(sock) && POSIX_IS_TYPE(sock, POSIX_LWIP)) { +- event |= EPOLLOUT; +- } +- } +- +- if (event) { +- sock->events = event; +- if (wakeup->type == WAKEUP_EPOLL && (sock->events & sock->epoll_events) && +- list_node_null(&sock->event_list)) { +- list_add_node(&sock->event_list, &wakeup->event_list); +- rte_mb(); +- sem_post(&wakeup->wait); +- } +- } +- pthread_spin_unlock(&wakeup->event_list_lock); +-} +- +-int32_t lstack_do_epoll_create(int32_t fd) +-{ +- if (fd < 0) { +- return fd; +- } +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "fd=%d sock is NULL errno=%d\n", fd, errno); +- posix_api->close_fn(fd); +- GAZELLE_RETURN(EINVAL); +- } +- +- struct wakeup_poll *wakeup = calloc(1, sizeof(struct wakeup_poll)); +- if (wakeup == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc null\n"); +- posix_api->close_fn(fd); +- GAZELLE_RETURN(EINVAL); +- } +- +- for (uint32_t i = 0; i < PROTOCOL_STACK_MAX; i++) { +- list_init_node(&wakeup->wakeup_list[i]); +- } +- +- if (sem_init(&wakeup->wait, 0, 0) != 0) { +- posix_api->close_fn(fd); +- free(wakeup); +- GAZELLE_RETURN(EINVAL); +- } +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- list_init_node(&wakeup->poll_list); +- pthread_spin_lock(&stack_group->poll_list_lock); +- list_add_node(&wakeup->poll_list, &stack_group->poll_list); +- pthread_spin_unlock(&stack_group->poll_list_lock); +- +- list_init_head(&wakeup->event_list); +- pthread_spin_init(&wakeup->event_list_lock, PTHREAD_PROCESS_PRIVATE); +- +- 
wakeup->type = WAKEUP_EPOLL; +- wakeup->epollfd = fd; +- sock->wakeup = wakeup; +- +- if (!get_global_cfg_params()->stack_mode_rtc) { +- update_epoll_max_stack(wakeup); +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); +- wakeup->bind_stack = wakeup->max_stack; +- if (get_global_cfg_params()->app_bind_numa) { +- bind_to_stack_numa(wakeup->bind_stack); +- } +- } else { +- wakeup->bind_stack = wakeup->max_stack = get_protocol_stack(); +- change_epollfd_kernel_thread(wakeup, NULL, wakeup->max_stack); +- } +- +- return fd; +-} +- +-int32_t lstack_epoll_create1(int32_t flags) +-{ +- int32_t fd = posix_api->epoll_create1_fn(flags); +- return lstack_do_epoll_create(fd); +-} +- +-int32_t lstack_epoll_create(int32_t flags) +-{ +- int32_t fd = posix_api->epoll_create_fn(flags); +- return lstack_do_epoll_create(fd); +-} +- +-static void stack_broadcast_clean_epoll(struct wakeup_poll *wakeup) +-{ +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- struct protocol_stack *stack = NULL; +- +- for (int32_t i = 0; i < stack_group->stack_num; i++) { +- stack = stack_group->stacks[i]; +- rpc_call_clean_epoll(&stack->rpc_queue, wakeup); +- } +-} +- +-int32_t lstack_epoll_close(int32_t fd) +-{ +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "fd=%d sock is NULL errno=%d\n", fd, errno); +- GAZELLE_RETURN(EINVAL); +- } +- +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- struct wakeup_poll *wakeup = sock->wakeup; +- if (wakeup == NULL) { +- return 0; +- } +- +- wakeup->type = WAKEUP_CLOSE; +- +- if (!get_global_cfg_params()->stack_mode_rtc) { +- stack_broadcast_clean_epoll(wakeup); +- } +- +- struct list_node *node, *temp; +- pthread_spin_lock(&wakeup->event_list_lock); +- list_for_each_node(node, temp, &wakeup->event_list) { +- struct lwip_sock *sock = list_entry(node, struct lwip_sock, event_list); +- list_del_node(&sock->event_list); +- } +- pthread_spin_unlock(&wakeup->event_list_lock); +- pthread_spin_destroy(&wakeup->event_list_lock); +- +- pthread_spin_lock(&stack_group->poll_list_lock); +- list_del_node(&wakeup->poll_list); +- pthread_spin_unlock(&stack_group->poll_list_lock); +- +- sem_destroy(&wakeup->wait); +- +- free(wakeup); +- sock->wakeup = NULL; +- +- posix_api->close_fn(fd); +- return 0; +-} +- +-static uint16_t find_max_cnt_stack(int32_t *stack_count, uint16_t stack_num, struct protocol_stack *last_stack) +-{ +- uint16_t max_index = 0; +- bool all_same_cnt = true; +- +- for (uint16_t i = 1; i < stack_num; i++) { +- if (stack_count[i] != stack_count[0]) { +- all_same_cnt = false; +- } +- +- if (stack_count[i] > stack_count[max_index]) { +- max_index = i; +- } +- } +- +- /* all stack same, don't change */ +- if (all_same_cnt && last_stack) { +- return last_stack->stack_idx; +- } +- +- /* first bind and all stack same. choice tick as queue_id, avoid all bind to statck_0. 
*/ +- static _Atomic uint16_t tick = 0; +- if (all_same_cnt && stack_num) { +- max_index = atomic_fetch_add(&tick, 1) % stack_num; +- } +- +- return max_index; +-} +- +-static void update_epoll_max_stack(struct wakeup_poll *wakeup) +-{ +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- uint16_t bind_id = find_max_cnt_stack(wakeup->stack_fd_cnt, stack_group->stack_num, wakeup->max_stack); +- +- wakeup->max_stack = stack_group->stacks[bind_id]; +-} +- +-int32_t lstack_rtc_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event) +-{ +- if (epfd < 0 || fd < 0 || epfd == fd || (event == NULL && op != EPOLL_CTL_DEL)) { +- LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op); +- GAZELLE_RETURN(EINVAL); +- } +- +- struct lwip_sock *epoll_sock = lwip_get_socket(epfd); +- if (epoll_sock == NULL || epoll_sock->wakeup == NULL) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- +- struct wakeup_poll *wakeup = epoll_sock->wakeup; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock)) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- +- switch (op) { +- case EPOLL_CTL_ADD: +- sock->wakeup = wakeup; +- /* fall through */ +- case EPOLL_CTL_MOD: +- sock->epoll_events = event->events | EPOLLERR | EPOLLHUP; +- sock->ep_data = event->data; +- rtc_raise_pending_events(wakeup, sock); +- break; +- case EPOLL_CTL_DEL: +- sock->epoll_events = 0; +- list_del_node(&sock->event_list); +- break; +- default: +- GAZELLE_RETURN(EINVAL); +- } +- +- return 0; +-} +- +-int32_t lstack_rtw_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event) +-{ +- LSTACK_LOG(DEBUG, LSTACK, "op=%d events: fd: %d\n", op, fd); +- +- if (epfd < 0 || fd < 0 || epfd == fd || (event == NULL && op != EPOLL_CTL_DEL)) { +- LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op); +- GAZELLE_RETURN(EINVAL); +- } +- +- struct lwip_sock *epoll_sock = lwip_get_socket(epfd); +- if (epoll_sock == NULL || epoll_sock->wakeup == NULL) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- +- struct wakeup_poll *wakeup = epoll_sock->wakeup; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock) || POSIX_IS_TYPE(sock, POSIX_KERNEL)) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- +- if (POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { +- int32_t ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d errno=%d\n", fd, epfd, op, errno); +- } +- } +- +- do { +- switch (op) { +- case EPOLL_CTL_ADD: +- sock->wakeup = wakeup; +- wakeup->stack_fd_cnt[sock->stack->stack_idx]++; +- /* fall through */ +- case EPOLL_CTL_MOD: +- sock->epoll_events = event->events | EPOLLERR | EPOLLHUP; +- sock->ep_data = event->data; +- raise_pending_events(wakeup, sock); +- break; +- case EPOLL_CTL_DEL: +- sock->epoll_events = 0; +- wakeup->stack_fd_cnt[sock->stack->stack_idx]--; +- pthread_spin_lock(&wakeup->event_list_lock); +- list_del_node(&sock->event_list); +- pthread_spin_unlock(&wakeup->event_list_lock); +- break; +- default: +- GAZELLE_RETURN(EINVAL); +- } +- sock = sock->listen_next; +- } while (sock != NULL); +- +- update_epoll_max_stack(wakeup); +- return 0; +-} +- +-int32_t epoll_lwip_event_nolock(struct wakeup_poll *wakeup, struct epoll_event *events, uint32_t maxevents) +-{ +- int32_t event_num = 0; +- struct list_node *node, *temp; +- +- list_for_each_node(node, temp, &wakeup->event_list) { +- struct lwip_sock *sock = list_entry(node, 
struct lwip_sock, event_list); +- +- if ((sock->epoll_events & sock->events) == 0) { +- list_del_node(node); +- continue; +- } +- +- if (event_num >= maxevents) { +- /* move list head after the current node, and start traversing from this node next time */ +- list_del_node(&wakeup->event_list); +- list_add_node(&wakeup->event_list, node); +- break; +- } +- +- events[event_num].events = sock->events & sock->epoll_events; +- events[event_num].data = sock->ep_data; +- event_num++; +- +- if (sock->epoll_events & EPOLLET) { +- list_del_node(node); +- sock->events = 0; +- } +- +- /* EPOLLONESHOT: generate event after epoll_ctl add/mod event again +- epoll_event set 0 avoid generating event util epoll_ctl reset epoll_event */ +- if (sock->epoll_events & EPOLLONESHOT) { +- list_del_node(node); +- sock->epoll_events = 0; +- } +- } +- +- return event_num; +-} +- +-static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event *events, uint32_t maxevents) +-{ +- int32_t event_num; +- +- pthread_spin_lock(&wakeup->event_list_lock); +- event_num = epoll_lwip_event_nolock(wakeup, events, maxevents); +- pthread_spin_unlock(&wakeup->event_list_lock); +- +- return event_num; +-} +- +-static int32_t poll_lwip_event(struct pollfd *fds, nfds_t nfds) +-{ +- int32_t event_num = 0; +- +- for (uint32_t i = 0; i < nfds; i++) { +- /* sock->listen_next pointerto next stack listen */ +- int32_t fd = fds[i].fd; +- struct lwip_sock *sock = lwip_get_socket(fd); +- while (!POSIX_IS_CLOSED(sock)) { +- uint32_t events = update_events(sock); +- if (events) { +- fds[i].revents = events; +- event_num++; +- break; +- } +- +- sock = sock->listen_next; +- } +- } +- +- return event_num; +-} +- +-static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, +- struct protocol_stack *new_stack) +-{ +- if (old_stack) { +- if (posix_api->epoll_ctl_fn(old_stack->epollfd, EPOLL_CTL_DEL, wakeup->epollfd, NULL) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } +- } +- +- /* avoid kernel thread post too much, use EPOLLET */ +- struct epoll_event event; +- event.data.ptr = wakeup; +- event.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET; +- if (posix_api->epoll_ctl_fn(new_stack->epollfd, EPOLL_CTL_ADD, wakeup->epollfd, &event) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } +-} +- +-static void epoll_bind_statck(struct wakeup_poll *wakeup) +-{ +- if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { +- bind_to_stack_numa(wakeup->max_stack); +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); +- wakeup->bind_stack = wakeup->max_stack; +- } +-} +- +-static void ms_to_timespec(struct timespec *timespec, int32_t timeout) +-{ +- clock_gettime(CLOCK_REALTIME, timespec); +- timespec->tv_sec += timeout / SEC_TO_MSEC; +- timespec->tv_nsec += (timeout % SEC_TO_MSEC) * MSEC_TO_NSEC; +- timespec->tv_sec += timespec->tv_nsec / SEC_TO_NSEC; +- timespec->tv_nsec = timespec->tv_nsec % SEC_TO_NSEC; +-} +- +-/** +- * Block lstack thread +- * +- * @param wakeup +- * The pointer to the wakeup_poll. 
+- * @param timeout +- * The time to wait, if 'timeout <= 0' will block until unlock +- * +- * @return +- * - return '0' on unlock +- * - return 'ETIMEDOUT' on timeout +- */ +-int32_t lstack_block_wait(struct wakeup_poll *wakeup, int32_t timeout) +-{ +- int ret = 0; +- if (wakeup == NULL) { +- return ret; +- } +- +- __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); +- if (timeout > 0) { +- struct timespec timespec; +- ms_to_timespec(&timespec, timeout); +- ret = sem_timedwait(&wakeup->wait, &timespec); +- } else { +- ret = sem_wait(&wakeup->wait); +- } +- +- if (__atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- } +- +- return ret; +-} +- +-int32_t lstack_rtc_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) +-{ +- struct lwip_sock *sock = lwip_get_socket(epfd); +- +- if (sock == NULL || sock->wakeup == NULL) { +- return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); +- } +- +- struct wakeup_poll *wakeup = sock->wakeup; +- int32_t lwip_num = 0; +- /* 16: avoid app process events for a long time */ +- int32_t tmpmaxevents = 16; +- /* avoid the starvation of epoll events from both netstack */ +- int host_maxevents = tmpmaxevents / 2; +- uint32_t poll_ts = sys_now(); +- bool loop_flag; +- int32_t kernel_num = 0; +- int32_t tmptimeout = timeout; +- +- do { +- stack_polling(0); +- if (__atomic_load_n(&wakeup->have_kernel_event, __ATOMIC_ACQUIRE)) { +- kernel_num = posix_api->epoll_wait_fn(epfd, events, host_maxevents, 0); +- if (!kernel_num) { +- __atomic_store_n(&wakeup->have_kernel_event, false, __ATOMIC_RELEASE); +- } +- } +- if (tmptimeout > 0) { +- if (tmptimeout <= sys_now() - poll_ts) { +- tmptimeout = 0; +- } +- } +- +- loop_flag = false; +- if (!kernel_num && list_head_empty(&wakeup->event_list) && tmptimeout != 0) { +- loop_flag = true; +- } +- } while (loop_flag); +- +- if (kernel_num < 0) { +- LSTACK_LOG(ERR, LSTACK, "lstack_rtc_epoll_wait: kernel event failed\n"); +- return kernel_num; +- } +- +- lwip_num = epoll_lwip_event_nolock(wakeup, &events[kernel_num], tmpmaxevents - kernel_num); +- wakeup->stat.app_events += lwip_num; +- wakeup->stat.kernel_events += kernel_num; +- +- return lwip_num + kernel_num; +-} +- +-int32_t lstack_rtw_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) +-{ +- struct lwip_sock *sock = lwip_get_socket(epfd); +- if (sock == NULL || sock->wakeup == NULL) { +- return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); +- } +- +- struct wakeup_poll *wakeup = sock->wakeup; +- int32_t kernel_num = 0; +- int32_t lwip_num = 0; +- +- if (get_global_cfg_params()->app_bind_numa) { +- epoll_bind_statck(sock->wakeup); +- } +- +- do { +- __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); +- lwip_num = epoll_lwip_event(wakeup, events, maxevents); +- +- if (__atomic_load_n(&wakeup->have_kernel_event, __ATOMIC_ACQUIRE)) { +- kernel_num = posix_api->epoll_wait_fn(epfd, &events[lwip_num], maxevents - lwip_num, 0); +- if (!kernel_num) { +- __atomic_store_n(&wakeup->have_kernel_event, false, __ATOMIC_RELEASE); +- } +- } +- +- if (lwip_num + kernel_num > 0) { +- break; +- } +- +- if (timeout == 0) { +- break; +- } +- } while (lstack_block_wait(wakeup, timeout) == 0); +- +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- wakeup->stat.app_events += lwip_num; +- wakeup->stat.kernel_events += kernel_num; +- +- return lwip_num + kernel_num; +-} +- +-static int32_t 
init_poll_wakeup_data(struct wakeup_poll *wakeup) +-{ +- if (sem_init(&wakeup->wait, 0, 0) != 0) { +- GAZELLE_RETURN(EINVAL); +- } +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- +- for (uint32_t i = 0; i < PROTOCOL_STACK_MAX; i++) { +- list_init_node(&wakeup->wakeup_list[i]); +- } +- +- wakeup->epollfd = posix_api->epoll_create_fn(POLL_KERNEL_EVENTS); +- if (wakeup->epollfd < 0) { +- GAZELLE_RETURN(EINVAL); +- } +- +- wakeup->type = WAKEUP_POLL; +- +- wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); +- if (wakeup->last_fds == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- wakeup->last_max_nfds = POLL_KERNEL_EVENTS; +- +- wakeup->events = calloc(POLL_KERNEL_EVENTS, sizeof(struct epoll_event)); +- if (wakeup->events == NULL) { +- free(wakeup->last_fds); +- wakeup->last_fds = NULL; +- GAZELLE_RETURN(EINVAL); +- } +- +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- list_init_node(&wakeup->poll_list); +- pthread_spin_lock(&stack_group->poll_list_lock); +- list_add_node(&wakeup->poll_list, &stack_group->poll_list); +- pthread_spin_unlock(&stack_group->poll_list_lock); +- +- int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; +- uint16_t bind_id = find_max_cnt_stack(stack_count, stack_group->stack_num, wakeup->bind_stack); +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); +- wakeup->bind_stack = stack_group->stacks[bind_id]; +- if (get_global_cfg_params()->app_bind_numa) { +- bind_to_stack_numa(wakeup->bind_stack); +- } +- +- return 0; +-} +- +-static int resize_kernel_poll(struct wakeup_poll *wakeup, nfds_t nfds) +-{ +- if (wakeup->last_fds) { +- free(wakeup->last_fds); +- } +- wakeup->last_fds = calloc(nfds, sizeof(struct pollfd)); +- if (wakeup->last_fds == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); +- return -1; +- } +- +- if (wakeup->events) { +- free(wakeup->events); +- } +- wakeup->events = calloc(nfds, sizeof(struct epoll_event)); +- if (wakeup->events == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); +- free(wakeup->last_fds); +- wakeup->last_fds = NULL; +- return -1; +- } +- +- wakeup->last_max_nfds = nfds; +- return 0; +-} +- +-static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) +-{ +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- +- uint16_t bind_id = find_max_cnt_stack(stack_count, stack_group->stack_num, wakeup->bind_stack); +- if (wakeup->bind_stack && wakeup->bind_stack->queue_id == bind_id) { +- return; +- } +- +- change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); +- bind_to_stack_numa(stack_group->stacks[bind_id]); +- wakeup->bind_stack = stack_group->stacks[bind_id]; +-} +- +-static void update_kernel_poll(struct wakeup_poll *wakeup, uint32_t index, struct pollfd *new_fd) +-{ +- posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_DEL, wakeup->last_fds[index].fd, NULL); +- +- if (new_fd == NULL) { +- return; +- } +- +- struct epoll_event event; +- event.data.u32 = index; +- event.events = new_fd->events; +- if (posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_ADD, new_fd->fd, &event) != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +- } +-} +- +-static int poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfds) +-{ +- int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; +- int32_t poll_change = 0; +- int ret = 0; +- +- /* poll fds num more, recalloc fds size */ +- if (nfds > wakeup->last_max_nfds) { +- ret = 
resize_kernel_poll(wakeup, nfds); +- if (ret < 0) { +- return -1; +- } +- poll_change = 1; +- } +- +- if (nfds < wakeup->last_nfds) { +- poll_change = 1; +- } +- +- for (uint32_t i = 0; i < nfds; i++) { +- int32_t fd = fds[i].fd; +- fds[i].revents = 0; +- struct lwip_sock *sock = lwip_get_socket(fd); +- +- if (fd == wakeup->last_fds[i].fd && fds[i].events == wakeup->last_fds[i].events) { +- /* fd close then socket may get same fd. */ +- if (sock == NULL || sock->wakeup != NULL) { +- continue; +- } +- } +- +- if (POSIX_IS_CLOSED(sock) || POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { +- update_kernel_poll(wakeup, i, fds + i); +- } +- +- wakeup->last_fds[i].fd = fd; +- wakeup->last_fds[i].events = fds[i].events; +- poll_change = 1; +- +- while (!POSIX_IS_CLOSED(sock)) { +- sock->epoll_events = fds[i].events | POLLERR; +- sock->wakeup = wakeup; +- stack_count[sock->stack->stack_idx]++; +- sock = sock->listen_next; +- } +- } +- +- if (poll_change == 0) { +- return 0; +- } +- wakeup->last_nfds = nfds; +- +- if (get_global_cfg_params()->app_bind_numa) { +- poll_bind_statck(wakeup, stack_count); +- } +- return 0; +-} +- +-struct wakeup_poll* poll_construct_wakeup(void) +-{ +- static PER_THREAD struct wakeup_poll *wakeup = NULL; +- if (wakeup == NULL) { +- wakeup = calloc(1, sizeof(struct wakeup_poll)); +- if (wakeup == NULL) { +- LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); +- return NULL; +- } +- +- if (init_poll_wakeup_data(wakeup) < 0) { +- free(wakeup); +- return NULL; +- } +- } +- return wakeup; +-} +- +-int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) +-{ +- struct wakeup_poll *wakeup = poll_construct_wakeup(); +- if (wakeup == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- +- if (poll_init(wakeup, fds, nfds) < 0) { +- free(wakeup); +- GAZELLE_RETURN(EINVAL); +- } +- +- int32_t kernel_num = 0; +- int32_t lwip_num = 0; +- +- do { +- __atomic_store_n(&wakeup->in_wait, true, __ATOMIC_RELEASE); +- lwip_num = poll_lwip_event(fds, nfds); +- +- if (__atomic_load_n(&wakeup->have_kernel_event, __ATOMIC_ACQUIRE)) { +- kernel_num = posix_api->epoll_wait_fn(wakeup->epollfd, wakeup->events, nfds, 0); +- for (int32_t i = 0; i < kernel_num; i++) { +- uint32_t index = wakeup->events[i].data.u32; +- fds[index].revents = wakeup->events[i].events; +- } +- if (!kernel_num) { +- __atomic_store_n(&wakeup->have_kernel_event, false, __ATOMIC_RELEASE); +- } +- } +- +- if (lwip_num + kernel_num > 0) { +- break; +- } +- +- if (timeout == 0) { +- break; +- } +- } while (lstack_block_wait(wakeup, timeout) == 0); +- +- __atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- wakeup->stat.app_events += lwip_num; +- wakeup->stat.kernel_events += kernel_num; +- +- return lwip_num + kernel_num; +-} +- +-static void select_set_revent_fdset(struct pollfd *fds, nfds_t nfds, fd_set *eventfds, uint32_t event) +-{ +- FD_ZERO(eventfds); +- +- /* Set the fd_set parameter based on the actual revents. 
*/ +- for (int i = 0; i < nfds; i++) { +- if (fds[i].revents & event) { +- FD_SET(fds[i].fd, eventfds); +- } +- } +-} +- +-static void fds_poll2select(struct pollfd *fds, nfds_t nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds) +-{ +- if (fds == NULL || nfds == 0) { +- return; +- } +- +- if (readfds) { +- select_set_revent_fdset(fds, nfds, readfds, EPOLLIN); +- } +- if (writefds) { +- select_set_revent_fdset(fds, nfds, writefds, EPOLLOUT); +- } +- if (exceptfds) { +- select_set_revent_fdset(fds, nfds, exceptfds, EPOLLERR); +- } +-} +- +-static inline int timeval_to_ms(struct timeval *timeval, int32_t *timeout) +-{ +- if (!timeval) { +- *timeout = -1; +- return 0; +- } +- if (unlikely((timeval->tv_sec < 0 || timeval->tv_usec < 0 || timeval->tv_usec >= 1000000))) { +- return -1; +- } +- *timeout = timeval->tv_sec * 1000 + timeval->tv_usec / 1000; +- return 0; +-} +- +-static nfds_t fds_select2poll(int maxfd, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct pollfd *fds) +-{ +- struct pollfd *pollfds = fds; +- nfds_t nfds = 0; +- +- for (int i = 0; i < maxfd; i++) { +- if (readfds && FD_ISSET(i, readfds)) { +- pollfds[nfds].events = POLLIN; +- } +- if (writefds && FD_ISSET(i, writefds)) { +- pollfds[nfds].events |= POLLOUT; +- } +- if (exceptfds && FD_ISSET(i, exceptfds)) { +- pollfds[nfds].events |= POLLERR; +- } +- if (pollfds[nfds].events > 0) { +- pollfds[nfds].fd = i; +- nfds++; +- } +- } +- return nfds; +-} +- +-int lstack_select(int maxfd, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeval) +-{ +- if (maxfd < 0 || maxfd > FD_SETSIZE) { +- LSTACK_LOG(ERR, LSTACK, "select input param error, fd num=%d\n", maxfd); +- GAZELLE_RETURN(EINVAL); +- } +- +- /* Convert the select parameter to the poll parameter. */ +- struct pollfd fds[FD_SETSIZE] = { 0 }; +- nfds_t nfds = fds_select2poll(maxfd, readfds, writefds, exceptfds, fds); +- int timeout = 0; +- if (timeval_to_ms(timeval, &timeout)) { +- LSTACK_LOG(ERR, LSTACK, "select input param timeout error.\n"); +- GAZELLE_RETURN(EINVAL); +- } +- +- int event_num = lstack_poll(fds, nfds, timeout); +- +- /* After poll, set select fd_set by fds.revents. 
*/ +- fds_poll2select(fds, nfds, readfds, writefds, exceptfds); +- +- return event_num; +-} +- +-#else /* SOCK_EVENT_V2 */ +- + #include + #include + #include +@@ -1883,4 +850,3 @@ bool sock_event_wait(struct lwip_sock *sock, bool noblocking) + + return false; + } +-#endif /* SOCK_EVENT_V2 */ +diff --git a/src/lstack/api/lstack_rtc_api.c b/src/lstack/api/lstack_rtc_api.c +index 60d3b23..4a962e1 100644 +--- a/src/lstack/api/lstack_rtc_api.c ++++ b/src/lstack/api/lstack_rtc_api.c +@@ -13,80 +13,15 @@ + #include + #include + +-#include "lstack_epoll.h" + #include "lstack_log.h" +-#include "lstack_cfg.h" +-#include "lstack_protocol_stack.h" + #include "lstack_rtc_api.h" + +-static int rtc_socket(int domain, int type, int protocol) +-{ +- int ret; +- +- if (stack_setup_app_thread() < 0) { +- exit(1); +- } +- +- /* need call stack thread init function */ +- ret = lwip_socket(domain, type, protocol); +- return ret; +-} +- +-static int rtc_close(int s) +-{ +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock != NULL && sock->wakeup != NULL && sock->wakeup->epollfd == s) { +- return lstack_epoll_close(s); +- } +- +- return lwip_close(s); +-} +- +-static int rtc_epoll_create(int flags) +-{ +- if (stack_setup_app_thread() < 0) { +- exit(1); +- } +- +- return lstack_epoll_create(flags); +-} +- +-static int rtc_epoll_create1(int flags) +-{ +- if (stack_setup_app_thread() < 0) { +- exit(1); +- } +- +- return lstack_epoll_create1(flags); +-} +- +-static int rtc_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) +-{ +- return lstack_rtc_epoll_ctl(epfd, op, fd, event); +-} +- +-static int rtc_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout) +-{ +- return lstack_rtc_epoll_wait(epfd, events, maxevents, timeout); +-} +- +-static int rtc_poll(struct pollfd *fds, nfds_t nfds, int timeout) +-{ +- LSTACK_LOG(ERR, LSTACK, "rtc_poll: rtc currently does not support poll\n"); +- return -1; +-} +- +-static int rtc_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) +-{ +- LSTACK_LOG(ERR, LSTACK, "rtc_select: rtc currently does not support select\n"); +- return -1; +-} + + void rtc_api_init(posix_api_t *api) + { +- api->close_fn = rtc_close; ++ api->close_fn = lwip_close; + api->shutdown_fn = lwip_shutdown; +- api->socket_fn = rtc_socket; ++ api->socket_fn = lwip_socket; + api->accept_fn = lwip_accept; + api->accept4_fn = lwip_accept4; + api->bind_fn = lwip_bind; +@@ -104,16 +39,8 @@ void rtc_api_init(posix_api_t *api) + api->writev_fn = lwip_writev; + api->recv_fn = lwip_recv; + api->send_fn = lwip_send; +- api->recvmsg_fn = (ssize_t (*)(int, const struct msghdr *, int))lwip_recvmsg; // TODO: fix unnecessary 'const' in lwipgz_posix_api.h ++ api->recvmsg_fn = lwip_recvmsg; + api->sendmsg_fn = lwip_sendmsg; + api->recvfrom_fn = lwip_recvfrom; + api->sendto_fn = lwip_sendto; +- +- api->epoll_ctl_fn = rtc_epoll_ctl; +- api->epoll_create1_fn = rtc_epoll_create1; +- api->epoll_create_fn = rtc_epoll_create; +- api->epoll_wait_fn = rtc_epoll_wait; +- +- api->poll_fn = rtc_poll; +- api->select_fn = rtc_select; + } +diff --git a/src/lstack/api/lstack_rtw_api.c b/src/lstack/api/lstack_rtw_api.c +index 35439bc..7b8dec2 100644 +--- a/src/lstack/api/lstack_rtw_api.c ++++ b/src/lstack/api/lstack_rtw_api.c +@@ -19,30 +19,31 @@ + #include "lstack_thread_rpc.h" + #include "lstack_protocol_stack.h" + #include "lstack_lwip.h" +-#include "lstack_epoll.h" + #include "lstack_rtw_api.h" ++#include "lstack_epoll.h" ++#include "lstack_wait.h" + + /* when 
fd is listenfd, listenfd of all protocol stack thread will be closed */ + static int stack_broadcast_close(int fd) + { + int ret = 0; ++ struct protocol_stack *stack; + struct lwip_sock *sock = lwip_get_socket(fd); +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); + if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } + + do { +- sock = sock->listen_next; ++ if (POSIX_IS_CLOSED(sock)) { ++ break; ++ } ++ stack = get_protocol_stack_by_id(sock->stack_id); + if (stack == NULL || rpc_call_close(&stack->rpc_queue, fd)) { + ret = -1; + } + +- if (POSIX_IS_CLOSED(sock)) { +- break; +- } ++ sock = sock->listen_next; + fd = sock->conn->callback_arg.socket; +- stack = get_protocol_stack_by_fd(fd); + } while (1); + + return ret; +@@ -51,23 +52,23 @@ static int stack_broadcast_close(int fd) + static int stack_broadcast_shutdown(int fd, int how) + { + int32_t ret = 0; ++ struct protocol_stack *stack; + struct lwip_sock *sock = lwip_get_socket(fd); +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); + if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } + + do { +- sock = sock->listen_next; ++ if (POSIX_IS_CLOSED(sock)) { ++ break; ++ } ++ stack = get_protocol_stack_by_id(sock->stack_id); + if (stack == NULL || rpc_call_shutdown(&stack->rpc_queue, fd, how)) { + ret = -1; + } + +- if (POSIX_IS_CLOSED(sock)) { +- break; +- } ++ sock = sock->listen_next; + fd = sock->conn->callback_arg.socket; +- stack = get_protocol_stack_by_fd(fd); + } while (1); + + return ret; +@@ -76,26 +77,29 @@ static int stack_broadcast_shutdown(int fd, int how) + /* choice one stack bind */ + static int stack_single_bind(int fd, const struct sockaddr *name, socklen_t namelen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_bind(&stack->rpc_queue, fd, name, namelen); + } + + /* bind sync to all protocol stack thread, so that any protocol stack thread can build connect */ + static int stack_broadcast_bind(int fd, const struct sockaddr *name, socklen_t namelen) + { +- struct protocol_stack *cur_stack = get_protocol_stack_by_fd(fd); ++ struct protocol_stack *cur_stack; + struct protocol_stack *stack = NULL; + int ret, clone_fd; + + struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL || cur_stack == NULL) { ++ if (sock == NULL) { + LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); + GAZELLE_RETURN(EBADF); + } + ++ cur_stack = get_protocol_stack_by_id(sock->stack_id); + ret = rpc_call_bind(&cur_stack->rpc_queue, fd, name, namelen); + if (ret < 0) { + close(fd); +@@ -116,45 +120,30 @@ static int stack_broadcast_bind(int fd, const struct sockaddr *name, socklen_t n + return 0; + } + +-static void inline del_accept_in_event(struct lwip_sock *sock) +-{ +- pthread_spin_lock(&sock->wakeup->event_list_lock); +- +- if (!NETCONN_IS_ACCEPTIN(sock)) { +- sock->events &= ~EPOLLIN; +- if (sock->events == 0) { +- list_del_node(&sock->event_list); +- } +- } +- +- pthread_spin_unlock(&sock->wakeup->event_list_lock); +-} +- + static struct lwip_sock *get_min_accept_sock(int fd) + { +- struct lwip_sock *sock = lwip_get_socket(fd); ++ struct lwip_sock *sock; + struct lwip_sock *min_sock = NULL; + +- while (sock) { ++ for (sock = lwip_get_socket(fd); sock != NULL; sock = sock->listen_next) { + if (!netconn_is_nonblocking(sock->conn)) { +- if 
(sock->wakeup == NULL) { +- sock->wakeup = poll_construct_wakeup(); +- if (sock->wakeup == NULL) { +- return NULL; +- } +- sock->epoll_events = POLLIN | POLLERR; ++ /* init all sock sk_wait */ ++ if (unlikely(sock->sk_wait == NULL) || sock->sk_wait->type == WAIT_CLOSE) { ++ sock->sk_wait = poll_construct_wait(0); ++ } ++ if (!(sock->sk_wait->type & WAIT_BLOCK)) { ++ sock->sk_wait->type |= WAIT_BLOCK; + } + } +- if (!NETCONN_IS_ACCEPTIN(sock)) { +- sock = sock->listen_next; ++ ++ if (!sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0)) { + continue; + } + +- if (min_sock == NULL || min_sock->stack->conn_num > sock->stack->conn_num) { ++ if (min_sock == NULL || ++ get_protocol_stack_by_id(min_sock->stack_id)->conn_num > get_protocol_stack_by_id(sock->stack_id)->conn_num) { + min_sock = sock; + } +- +- sock = sock->listen_next; + } + + return min_sock; +@@ -171,24 +160,16 @@ static int stack_broadcast_accept4(int fd, struct sockaddr *addr, socklen_t *add + GAZELLE_RETURN(EBADF); + } + +- if (netconn_is_nonblocking(sock->conn)) { +- min_sock = get_min_accept_sock(fd); +- } else { +- while ((min_sock = get_min_accept_sock(fd)) == NULL) { +- lstack_block_wait(sock->wakeup, 0); +- } +- } +- +- if (min_sock && min_sock->conn) { +- stack = get_protocol_stack_by_fd(min_sock->conn->callback_arg.socket); +- if (stack == NULL) { +- GAZELLE_RETURN(EBADF); ++ min_sock = get_min_accept_sock(fd); ++ if (min_sock == NULL) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ min_sock = get_min_accept_sock(fd); + } +- ret = rpc_call_accept(&stack->rpc_queue, min_sock->conn->callback_arg.socket, addr, addrlen, flags); + } + +- if (min_sock && min_sock->wakeup && min_sock->wakeup->type == WAKEUP_EPOLL) { +- del_accept_in_event(min_sock); ++ if (!POSIX_IS_CLOSED(min_sock)) { ++ stack = get_protocol_stack_by_id(min_sock->stack_id); ++ ret = rpc_call_accept(&stack->rpc_queue, min_sock->conn->callback_arg.socket, addr, addrlen, flags); + } + + if (ret < 0) { +@@ -205,10 +186,12 @@ static int stack_broadcast_accept(int fd, struct sockaddr *addr, socklen_t *addr + /* choice one stack listen */ + static int stack_single_listen(int fd, int backlog) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_listen(&stack->rpc_queue, fd, backlog); + } + +@@ -221,18 +204,19 @@ static int stack_broadcast_listen(int fd, int backlog) + struct sockaddr_in6 in6; + } sockaddr_t; + +- struct protocol_stack *cur_stack = get_protocol_stack_by_fd(fd); ++ struct protocol_stack *cur_stack; + struct protocol_stack *stack = NULL; + sockaddr_t addr; + socklen_t addr_len = sizeof(addr); + int ret, clone_fd; + + struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL || cur_stack == NULL) { ++ if (sock == NULL) { + LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); + GAZELLE_RETURN(EBADF); + } + ++ cur_stack = get_protocol_stack_by_id(sock->stack_id); + ret = rpc_call_getsockname(&cur_stack->rpc_queue, fd, (struct sockaddr *)&addr, &addr_len); + if (ret != 0) { + return ret; +@@ -311,46 +295,56 @@ static int rtw_listen(int s, int backlog) + + static int rtw_connect(int s, const struct sockaddr *name, socklen_t namelen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(s); +- if (stack 
== NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_connect(&stack->rpc_queue, s, name, namelen); + } + + static int rtw_setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(s); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_setsockopt(&stack->rpc_queue, s, level, optname, optval, optlen); + } + + static int rtw_getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(s); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_getsockopt(&stack->rpc_queue, s, level, optname, optval, optlen); + } + + static int rtw_getpeername(int s, struct sockaddr *name, socklen_t *namelen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(s); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_getpeername(&stack->rpc_queue, s, name, namelen); + } + + static int rtw_getsockname(int s, struct sockaddr *name, socklen_t *namelen) + { +- struct protocol_stack *stack = get_protocol_stack_by_fd(s); +- if (stack == NULL) { ++ struct protocol_stack *stack; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock == NULL) { + GAZELLE_RETURN(EBADF); + } ++ stack = get_protocol_stack_by_id(sock->stack_id); + return rpc_call_getsockname(&stack->rpc_queue, s, name, namelen); + } + +@@ -467,55 +461,16 @@ static ssize_t rtw_sendto(int sockfd, const void *buf, size_t len, int flags, + return do_lwip_send_to_stack(sockfd, buf, len, flags, addr, addrlen); + } + +-static int rtw_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout) +-{ +- return lstack_rtw_epoll_wait(epfd, events, maxevents, timeout); +-} +- +-static int rtw_poll(struct pollfd *fds, nfds_t nfds, int timeout) +-{ +- return lstack_poll(fds, nfds, timeout); +-} +- +-static int rtw_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) +-{ +- return lstack_select(nfds, readfds, writefds, exceptfds, timeout); +-} +- + static int rtw_close(int s) + { +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock && sock->wakeup && sock->wakeup->epollfd == s) { +- return lstack_epoll_close(s); +- } + return stack_broadcast_close(s); + } + + static int rtw_shutdown(int fd, int how) + { +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock && sock->wakeup && sock->wakeup->epollfd == fd) { +- GAZELLE_RETURN(ENOTSOCK); +- } +- + return stack_broadcast_shutdown(fd, how); + } + +-static int rtw_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) +-{ +- return lstack_rtw_epoll_ctl(epfd, op, fd, event); +-} +- +-static int rtw_epoll_create1(int flags) +-{ +- return lstack_epoll_create1(flags); +-} +- +-static int rtw_epoll_create(int flags) +-{ +- return lstack_epoll_create(flags); +-} +- + void rtw_api_init(posix_api_t *api) + { + 
api->close_fn = rtw_close; +@@ -538,16 +493,8 @@ void rtw_api_init(posix_api_t *api) + api->writev_fn = rtw_writev; + api->recv_fn = rtw_recv; + api->send_fn = rtw_send; +- api->recvmsg_fn = (ssize_t (*)(int, const struct msghdr *, int))rtw_recvmsg; // TODO: fix unnecessary 'const' in lwipgz_posix_api.h ++ api->recvmsg_fn = rtw_recvmsg; + api->sendmsg_fn = rtw_sendmsg; + api->recvfrom_fn = rtw_recvfrom; + api->sendto_fn = rtw_sendto; +- +- api->epoll_ctl_fn = rtw_epoll_ctl; +- api->epoll_create1_fn = rtw_epoll_create1; +- api->epoll_create_fn = rtw_epoll_create; +- api->epoll_wait_fn = rtw_epoll_wait; +- +- api->poll_fn = rtw_poll; +- api->select_fn = rtw_select; + } +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 8a88c47..5869d6b 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -27,6 +27,7 @@ + #include "lstack_lwip.h" + #include "lstack_preload.h" + #include "lstack_unistd.h" ++#include "lstack_epoll.h" + #include "lstack_rtc_api.h" + #include "lstack_rtw_api.h" + #include "lstack_dummy_api.h" +@@ -35,8 +36,8 @@ + #define SOL_XDP 283 /* same as define in bits/socket.h */ + #endif + +-static posix_api_t g_wrap_api_value; +-static posix_api_t *g_wrap_api; ++static posix_api_t g_wrap_api_value = {0}; ++static posix_api_t *g_wrap_api = NULL; + + void wrap_api_init(void) + { +@@ -50,6 +51,8 @@ void wrap_api_init(void) + } else { + rtw_api_init(g_wrap_api); + } ++ ++ epoll_api_init(g_wrap_api); + } + + void wrap_api_exit(void) +@@ -57,50 +60,6 @@ void wrap_api_exit(void) + dummy_api_init(g_wrap_api); + } + +-static inline int32_t do_epoll_create1(int32_t flags) +-{ +- if (select_posix_path() == POSIX_KERNEL) { +- return posix_api->epoll_create1_fn(flags); +- } +- +- return g_wrap_api->epoll_create1_fn(flags); +-} +- +-static inline int32_t do_epoll_create(int32_t size) +-{ +- if (select_posix_path() == POSIX_KERNEL) { +- return posix_api->epoll_create_fn(size); +- } +- +- return g_wrap_api->epoll_create_fn(size); +-} +- +-static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event* event) +-{ +- if (select_posix_path() == POSIX_KERNEL) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- +- return g_wrap_api->epoll_ctl_fn(epfd, op, fd, event); +-} +- +-static inline int32_t do_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) +-{ +- if (select_posix_path() == POSIX_KERNEL) { +- return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); +- } +- +- if (epfd < 0) { +- GAZELLE_RETURN(EBADF); +- } +- +- if ((events == NULL) || (timeout < -1) || (maxevents <= 0)) { +- GAZELLE_RETURN(EINVAL); +- } +- +- return g_wrap_api->epoll_wait_fn(epfd, events, maxevents, timeout); +-} +- + static inline int32_t do_accept(int32_t s, struct sockaddr *addr, socklen_t *addrlen) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_KERNEL) { +@@ -515,9 +474,14 @@ static bool unsupport_optname(int32_t level, int32_t optname) + static inline int32_t do_getsockopt(int32_t s, int32_t level, int32_t optname, void *optval, socklen_t *optlen) + { + #define SO_NUMA_ID 0x100c +- if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP && !unsupport_optname(level, optname)) { ++ const struct protocol_stack *stack; ++ struct lwip_sock *sock; ++ ++ sock = lwip_get_socket(s); ++ if (select_sock_posix_path(sock) == POSIX_LWIP && !unsupport_optname(level, optname)) { + if (level == IPPROTO_IP && optname == SO_NUMA_ID) { +- return 
lwip_get_socket(s)->stack->numa_id; ++ stack = get_protocol_stack_by_id(sock->stack_id); ++ return stack->numa_id; + } + return g_wrap_api->getsockopt_fn(s, level, optname, optval, optlen); + } +@@ -551,6 +515,12 @@ static inline int32_t do_socket(int32_t domain, int32_t type, int32_t protocol) + return posix_api->socket_fn(domain, type, protocol); + } + ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ if (stack_setup_app_thread() != 0) { ++ LSTACK_EXIT(1, "stack_setup_app_thread failed\n"); ++ } ++ } ++ + ret = g_wrap_api->socket_fn(domain, type, protocol); + if (ret >= 0) { + struct lwip_sock *sock = lwip_get_socket(ret); +@@ -677,14 +647,20 @@ static inline ssize_t do_sendto(int32_t sockfd, const void *buf, size_t len, int + + static inline int32_t do_close(int fd) + { ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ + /* Can not use select_sock_posix_path() ! + * When fd created by lwip_stocket() set as POSIX_KERNEL, + * lwip_close() is still required. + */ + if (select_posix_path() == POSIX_KERNEL || +- POSIX_IS_CLOSED(lwip_get_socket(fd))) { ++ POSIX_IS_CLOSED(sock)) { + return posix_api->close_fn(fd); + } ++ ++ if (select_sock_posix_path(sock) == POSIX_EPOLL) { ++ return lstack_epoll_close(fd); ++ } + return g_wrap_api->close_fn(fd); + } + +@@ -701,9 +677,56 @@ static int32_t do_shutdown(int fd, int how) + return g_wrap_api->shutdown_fn(fd, how); + } + ++ ++static inline int do_epoll_create1(int flags) ++{ ++ int epfd; ++ ++ if (select_posix_path() == POSIX_KERNEL) { ++ return posix_api->epoll_create1_fn(flags); ++ } ++ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ if (stack_setup_app_thread() != 0) { ++ LSTACK_EXIT(1, "stack_setup_app_thread failed\n"); ++ } ++ } ++ ++ epfd = g_wrap_api->epoll_create1_fn(flags); ++ if (epfd > 0) { ++ POSIX_SET_TYPE(lwip_get_socket(epfd), POSIX_EPOLL); ++ } ++ return epfd; ++} ++ ++static inline int do_epoll_create(int size) ++{ ++ /* Since Linux 2.6.8, the size argument is ignored, ++ * but must be greater than zero. */ ++ return size <= 0 ? -1 : do_epoll_create1(0); ++} ++ ++static inline int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event* event) ++{ ++ if (select_sock_posix_path(lwip_get_socket(epfd)) == POSIX_KERNEL) { ++ return posix_api->epoll_ctl_fn(epfd, op, fd, event); ++ } ++ ++ return g_wrap_api->epoll_ctl_fn(epfd, op, fd, event); ++} ++ ++static inline int do_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int timeout) ++{ ++ if (select_sock_posix_path(lwip_get_socket(epfd)) == POSIX_KERNEL) { ++ return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); ++ } ++ ++ return g_wrap_api->epoll_wait_fn(epfd, events, maxevents, timeout); ++} ++ + static int32_t do_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + { +- if ((select_posix_path() == POSIX_KERNEL) || fds == NULL || nfds == 0) { ++ if ((select_posix_path() == POSIX_KERNEL)) { + return posix_api->poll_fn(fds, nfds, timeout); + } + +@@ -712,18 +735,21 @@ static int32_t do_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + + static int32_t do_ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *tmo_p, const sigset_t *sigmask) + { +- int32_t ready; + int32_t timeout; + +- if (fds == NULL || tmo_p == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- + // s * 1000 and ns / 1000000 -> ms + timeout = (tmo_p == NULL) ? 
-1 : (tmo_p->tv_sec * 1000 + tmo_p->tv_nsec / 1000000); +- ready = do_poll(fds, nfds, timeout); + +- return ready; ++ return do_poll(fds, nfds, timeout); ++} ++ ++static int32_t do_select(int32_t nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) ++{ ++ if (select_posix_path() == POSIX_KERNEL) { ++ return posix_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); ++ } ++ ++ return g_wrap_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); + } + + static int32_t do_sigaction(int32_t signum, const struct sigaction *act, struct sigaction *oldact) +@@ -738,19 +764,6 @@ static int32_t do_sigaction(int32_t signum, const struct sigaction *act, struct + return lstack_sigaction(signum, act, oldact); + } + +-static int32_t do_select(int32_t nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) +-{ +- /* while input args are invalid, param timeout will steal be executed in kernel */ +- if (nfds <= 0 || !(readfds || writefds || exceptfds)) { +- return posix_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); +- } +- +- if (select_posix_path() == POSIX_KERNEL) { +- return posix_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); +- } +- +- return g_wrap_api->select_fn(nfds, readfds, writefds, exceptfds, timeout); +-} + + #define POSIX_VA_PARAM(fd, cmd, type, lwip_fn, kernel_fn) \ + do { \ +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index d0e51b2..047dfdf 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -78,14 +78,12 @@ static void reset_sock_data(struct lwip_sock *sock) + } + + sock->type = 0; +- sock->stack = NULL; +- sock->wakeup = NULL; ++ sock->stack_id = 0; ++ sock->affinity_numa = 0; ++ sock->sk_wait = NULL; + sock->listen_next = NULL; +- sock->epoll_events = 0; +- sock->events = 0; + sock->call_num = 0; + sock->remain_len = 0; +- sock->already_bind_numa = 0; + + if (sock->recv_lastdata && sock->recv_lastdata != (void *)&fin_packet) { + pbuf_free(sock->recv_lastdata); +@@ -176,12 +174,14 @@ int do_lwip_init_sock(int32_t fd) + return -1; + } + ++ sock->stack_id = stack->stack_idx; ++ sock->sk_wait = NULL; ++ if (sock_event_init(&sock->sk_event) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "sock_event_init failed\n"); ++ return -1; ++ } ++ + if (get_global_cfg_params()->stack_mode_rtc) { +- sock->stack = stack; +- sock->epoll_events = 0; +- sock->events = 0; +- sock->wakeup = NULL; +- list_init_node(&sock->event_list); + return 0; + } + +@@ -207,31 +207,25 @@ int do_lwip_init_sock(int32_t fd) + } + (void)replenish_send_idlembuf(stack, sock); + +- sock->stack = stack; +- + list_init_node(&sock->recv_list); +- list_init_node(&sock->event_list); + return 0; + } + + void do_lwip_clean_sock(int fd) + { + struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL || sock->stack == NULL) { ++ if (POSIX_IS_CLOSED(sock)) { + return; + } + +- if (sock->wakeup && sock->wakeup->type == WAKEUP_EPOLL) { +- pthread_spin_lock(&sock->wakeup->event_list_lock); +- list_del_node(&sock->event_list); +- pthread_spin_unlock(&sock->wakeup->event_list_lock); +- } +- +- sock->stack->conn_num--; ++ sock_event_free(&sock->sk_event, sock->sk_wait); ++ sock->sk_wait = NULL; + + reset_sock_data(sock); + + list_del_node(&sock->recv_list); ++ ++ get_protocol_stack_by_id(sock->stack_id)->conn_num--; + } + + void do_lwip_free_pbuf(struct pbuf *pbuf) +@@ -310,7 +304,8 @@ struct pbuf *do_lwip_udp_get_from_sendring(struct lwip_sock *sock, uint16_t rema + } + + for (int i = 
0; get_protocol_stack_group()->latency_start && i < actual_count; i++) { +- calculate_lstack_latency(&sock->stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_LWIP, 0); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_LWIP, 0); + } + + return pbufs[0]; +@@ -334,7 +329,8 @@ struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t rema + } + + if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_WRITE_LWIP, 0); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_WRITE_LWIP, 0); + } + + sock->send_pre_del = pbuf; +@@ -354,8 +350,9 @@ struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t rema + + void do_lwip_get_from_sendring_over(struct lwip_sock *sock) + { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ stack->stats.write_lwip_cnt++; + sock->send_pre_del = NULL; +- sock->stack->stats.write_lwip_cnt++; + } + + static ssize_t do_app_write(struct lwip_sock *sock, struct pbuf *pbufs[], void *buf, size_t len, uint32_t write_num) +@@ -425,8 +422,9 @@ static inline ssize_t app_buff_write(struct lwip_sock *sock, void *buf, size_t l + } + + for (int i = 0; get_protocol_stack_group()->latency_start && i < write_num; i++) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + if (pbufs[i] != NULL) { +- calculate_lstack_latency(&sock->stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_INTO_RING, 0); ++ calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_INTO_RING, 0); + } + } + +@@ -510,7 +508,6 @@ static ssize_t do_lwip_udp_fill_sendring(struct lwip_sock *sock, const void *buf + ssize_t send_len = 0; + uint32_t write_num = (len + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN; + uint32_t write_avail = gazelle_ring_readable_count(sock->send_ring); +- struct wakeup_poll *wakeup = sock->wakeup; + + if (write_num > rte_ring_get_capacity(sock->send_ring)) { + LSTACK_LOG(ERR, LSTACK, "sock send_ring size is not enough\n"); +@@ -537,14 +534,7 @@ static ssize_t do_lwip_udp_fill_sendring(struct lwip_sock *sock, const void *buf + + send_len = app_buff_write(sock, (char *)buf, len, write_num, addr, addrlen); + +- if (wakeup && wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT) +- && !NETCONN_IS_OUTIDLE(sock)) { +- del_sock_event(sock, EPOLLOUT); +- } +- +- if (wakeup) { +- wakeup->stat.app_write_cnt += write_num; +- } ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + + return send_len; + } +@@ -561,7 +551,8 @@ static ssize_t __do_lwip_tcp_fill_sendring(struct lwip_sock *sock, const void *b + + /* merge data into last pbuf */ + if (sock->remain_len) { +- sock->stack->stats.sock_tx_merge++; ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ stack->stats.sock_tx_merge++; + send_len = merge_data_lastpbuf(sock, (char *)buf, len); + if (send_len >= len) { + send_len = len; +@@ -571,7 +562,6 @@ static ssize_t __do_lwip_tcp_fill_sendring(struct lwip_sock *sock, const void *b + + uint32_t write_num = (len - send_len + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN; + uint32_t write_avail = gazelle_ring_readable_count(sock->send_ring); +- struct wakeup_poll *wakeup = sock->wakeup; + + while (!netconn_is_nonblocking(sock->conn) && (write_avail < write_num)) { + if (sock->errevent > 0) { +@@ -597,14 +587,7 @@ static ssize_t 
__do_lwip_tcp_fill_sendring(struct lwip_sock *sock, const void *b + } + send_len += app_buff_write(sock, (char *)buf + send_len, len - send_len, write_num, addr, addrlen); + +- if (wakeup) { +- wakeup->stat.app_write_cnt += write_num; +- } +- +- if (wakeup && wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT) +- && !NETCONN_IS_OUTIDLE(sock)) { +- del_sock_event(sock, EPOLLOUT); +- } ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + + END: + if (send_len == 0) { +@@ -644,9 +627,7 @@ bool do_lwip_replenish_sendring(struct protocol_stack *stack, struct lwip_sock * + + replenish_again = replenish_send_idlembuf(stack, sock); + +- if (NETCONN_IS_OUTIDLE(sock)) { +- add_sock_event(sock, EPOLLOUT); +- } ++ API_EVENT(sock->conn, NETCONN_EVT_SENDPLUS, 0); + + return replenish_again; + } +@@ -726,13 +707,14 @@ ssize_t do_lwip_read_from_lwip(struct lwip_sock *sock, int32_t flags, u8_t apifl + LSTACK_LOG(ERR, LSTACK, "Code shouldn't get here!\n"); + } + ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + for (uint32_t i = 0; get_protocol_stack_group()->latency_start && i < read_count; i++) { + if (pbufs[i] != NULL) { +- calculate_lstack_latency(&sock->stack->latency, pbufs[i], GAZELLE_LATENCY_READ_LWIP, 0); ++ calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_READ_LWIP, 0); + } + } ++ stack->stats.read_lwip_cnt += read_count; + +- sock->stack->stats.read_lwip_cnt += read_count; + return recv_len; + } + +@@ -790,7 +772,8 @@ static inline void notice_stack_tcp_send(struct lwip_sock *sock, int32_t fd, int + { + // 2: call_num >= 2, don't need add new rpc send + if (__atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) < 2) { +- while (rpc_call_tcp_send(&sock->stack->rpc_queue, fd, len, flags) < 0) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ while (rpc_call_tcp_send(&stack->rpc_queue, fd, len, flags) < 0) { + usleep(1000); // 1000: wait 1ms to exec again + } + __sync_fetch_and_add(&sock->call_num, 1); +@@ -800,7 +783,8 @@ static inline void notice_stack_tcp_send(struct lwip_sock *sock, int32_t fd, int + static inline void notice_stack_udp_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags) + { + __sync_fetch_and_add(&sock->call_num, 1); +- while (rpc_call_udp_send(&sock->stack->rpc_queue, fd, len, flags) < 0) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ while (rpc_call_udp_send(&stack->rpc_queue, fd, len, flags) < 0) { + usleep(1000); // 1000: wait 1ms to exec again + } + } +@@ -828,9 +812,9 @@ ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t f + } + + sock = lwip_get_socket(fd); +- if (unlikely(sock->already_bind_numa == 0 && sock->stack)) { +- thread_bind_stack(sock->stack); +- sock->already_bind_numa = 1; ++ if (unlikely(sock->affinity_numa == 0)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = 1; + } + + #if GAZELLE_SAME_NODE +@@ -838,7 +822,7 @@ ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t f + return gazelle_same_node_ring_send(sock, buf, len, flags); + } + #endif /* GAZELLE_SAME_NODE */ +- if (sock->errevent > 0 || sock->stack == NULL) { ++ if (sock->errevent > 0) { + GAZELLE_RETURN(ENOTCONN); + } + +@@ -920,9 +904,10 @@ static struct pbuf *pbuf_free_partial(struct pbuf *pbuf, uint16_t free_len) + + static bool recv_break_for_err(struct lwip_sock *sock) + { +- bool break_wait = (sock->errevent > 0) && (!NETCONN_IS_DATAIN(sock)); + errno = err_to_errno(netconn_err(sock->conn)); +- 
return break_wait; ++ unsigned pending = sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0) | ++ sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0); ++ return pending; + } + + /* +@@ -931,8 +916,7 @@ static bool recv_break_for_err(struct lwip_sock *sock) + */ + static int recv_ring_get_one(struct lwip_sock *sock, bool noblock, struct pbuf **pbuf) + { +- int32_t expect = 1; // only get one pbuf +- int ret = 0; ++ int32_t expect; + uint64_t time_stamp = sys_now_us(); + + if (sock->recv_lastdata != NULL) { +@@ -941,45 +925,24 @@ static int recv_ring_get_one(struct lwip_sock *sock, bool noblock, struct pbuf * + return 0; + } + +- if (noblock) { +- if (gazelle_ring_read(sock->recv_ring, (void **)pbuf, expect) != expect) { ++ expect = gazelle_ring_read(sock->recv_ring, (void **)pbuf, 1); ++ if (expect == 0) { ++ if (netconn_is_nonblocking(sock->conn)) { + GAZELLE_RETURN(EAGAIN); + } +- goto END; +- } +- +- if (sock->recv_block == NULL) { +- sock->recv_block = poll_construct_wakeup(); +- if (sock->recv_block == NULL) { +- GAZELLE_RETURN(ENOMEM); +- } +- sock->recv_block->type = WAKEUP_BLOCK; +- } +- +- do { +- __atomic_store_n(&sock->recv_block->in_wait, true, __ATOMIC_RELEASE); +- if (gazelle_ring_read(sock->recv_ring, (void **)pbuf, expect) == expect) { +- break; +- } +- if (recv_break_for_err(sock)) { +- sock->recv_block = NULL; +- return -1; +- } +- ret = lstack_block_wait(sock->recv_block, sock->conn->recv_timeout); +- if (ret != 0) { +- if (errno == ETIMEDOUT) { +- errno = EAGAIN; ++ sock_event_wait(sock, true); ++ expect = gazelle_ring_read(sock->recv_ring, (void **)pbuf, 1); ++ if (expect == 0) { ++ if (recv_break_for_err(sock)) { ++ return -1; + } +- sock->recv_block = NULL; +- return ret; ++ GAZELLE_RETURN(EAGAIN); + } +- } while (1); +- __atomic_store_n(&sock->recv_block->in_wait, false, __ATOMIC_RELEASE); +- sock->recv_block = NULL; ++ } + +-END: + if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&sock->stack->latency, *pbuf, GAZELLE_LATENCY_READ_APP_CALL, time_stamp); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_lstack_latency(&stack->latency, *pbuf, GAZELLE_LATENCY_READ_APP_CALL, time_stamp); + } + + return 0; +@@ -1044,12 +1007,9 @@ static ssize_t recv_ring_tcp_read(struct lwip_sock *sock, void *buf, size_t len, + if (pbuf->tot_len > copy_len) { + sock->recv_lastdata = pbuf_free_partial(pbuf, copy_len); + } else { +- if (sock->wakeup) { +- sock->wakeup->stat.app_read_cnt += 1; +- } +- + if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); + } + + gazelle_ring_read_over(sock->recv_ring); +@@ -1088,15 +1048,12 @@ static ssize_t recv_ring_udp_read(struct lwip_sock *sock, void *buf, size_t len, + lwip_sock_make_addr(sock->conn, &(pbuf->addr), pbuf->port, addr, addrlen); + } + ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + if (copy_len < pbuf->tot_len) { +- sock->stack->stats.sock_rx_drop++; +- } +- +- if (sock->wakeup) { +- sock->wakeup->stat.app_read_cnt++; ++ stack->stats.sock_rx_drop++; + } + if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); ++ calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); + 
} + + return copy_len; +@@ -1113,9 +1070,9 @@ ssize_t do_lwip_read_from_stack(int32_t fd, void *buf, size_t len, int32_t flags + return -1; + } + +- if (unlikely(sock->already_bind_numa == 0 && sock->stack)) { +- thread_bind_stack(sock->stack); +- sock->already_bind_numa = 1; ++ if (unlikely(sock->affinity_numa == 0)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = 1; + } + + #if GAZELLE_SAME_NODE +@@ -1129,16 +1086,9 @@ ssize_t do_lwip_read_from_stack(int32_t fd, void *buf, size_t len, int32_t flags + recvd = recv_ring_tcp_read(sock, buf, len, noblock); + } + +- /* rte_ring_count reduce lock */ +- if (sock->wakeup && sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLIN) +- && (!NETCONN_IS_DATAIN(sock))) { +- del_sock_event(sock, EPOLLIN); +- } ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, recvd); + + if (recvd < 0) { +- if (sock->wakeup) { +- sock->wakeup->stat.read_null++; +- } + return -1; + } + return recvd; +@@ -1148,8 +1098,9 @@ void do_lwip_add_recvlist(int32_t fd) + { + struct lwip_sock *sock = lwip_get_socket(fd); + +- if (sock && sock->stack && list_node_null(&sock->recv_list)) { +- list_add_node(&sock->recv_list, &sock->stack->recv_list); ++ if (sock && list_node_null(&sock->recv_list)) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ list_add_node(&sock->recv_list, &stack->recv_list); + } + } + +@@ -1176,7 +1127,8 @@ void do_lwip_read_recvlist(struct protocol_stack *stack, uint32_t max_num) + } + + if (get_protocol_stack_group()->latency_start) { +- calculate_sock_latency(&sock->stack->latency, sock, GAZELLE_LATENCY_RECVMBOX_READY); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_sock_latency(&stack->latency, sock, GAZELLE_LATENCY_RECVMBOX_READY); + } + + ssize_t len = 0; +@@ -1186,36 +1138,33 @@ void do_lwip_read_recvlist(struct protocol_stack *stack, uint32_t max_num) + len = lwip_recv(sock->conn->callback_arg.socket, NULL, 0, 0); + } + if (len < 0 && errno != EAGAIN) { +- sock->errevent = 1; +- add_sock_event(sock, EPOLLERR); ++ API_EVENT(sock->conn, NETCONN_EVT_ERROR, 0); + /* = 0: fin */ + } else if (len >= 0) { +- add_sock_event(sock, EPOLLIN); ++ API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); + } + } + } + +-void do_lwip_connected_callback(struct netconn *conn) ++void do_lwip_connected_callback(int fd) + { +- if (conn == NULL) { +- return; +- } +- +- int32_t fd = conn->callback_arg.socket; + struct lwip_sock *sock = lwip_get_socket(fd); + if (POSIX_IS_CLOSED(sock)) { + return; + } + +- if (sock->wakeup != NULL && sock->wakeup->epollfd > 0) { +- posix_api->epoll_ctl_fn(sock->wakeup->epollfd, EPOLL_CTL_DEL, fd, NULL); ++ if (POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { ++ /* delete kernel event */ ++ if (sock->sk_wait != NULL) { ++ posix_api->epoll_ctl_fn(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL); ++ } ++ /* shutdown kernel connect, do_connect() has tried both kernel and lwip. */ ++ posix_api->shutdown_fn(fd, SHUT_RDWR); + } + + POSIX_SET_TYPE(sock, POSIX_LWIP); + +- posix_api->shutdown_fn(fd, SHUT_RDWR); +- +- add_sock_event(sock, EPOLLOUT); ++ API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); + } + + static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const struct tcp_pcb *pcb) +@@ -1249,9 +1198,9 @@ static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const s + conn->recv_ring_cnt = (sock->recv_ring == NULL) ? 0 : gazelle_ring_readable_count(sock->recv_ring); + conn->recv_ring_cnt += (sock->recv_lastdata) ? 
1 : 0; + conn->send_ring_cnt = (sock->send_ring == NULL) ? 0 : gazelle_ring_readover_count(sock->send_ring); +- conn->events = sock->events; +- conn->epoll_events = sock->epoll_events; +- conn->eventlist = !list_node_null(&sock->event_list); ++ conn->events = sock->sk_event.pending; ++ conn->epoll_events = sock->sk_event.events; ++ conn->eventlist = !list_node_null(&sock->sk_event.event_node); + } + } + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 3bb1eeb..1e7df33 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -29,7 +29,7 @@ + #include "lstack_ethdev.h" + #include "lstack_lwip.h" + #include "lstack_control_plane.h" +-#include "lstack_epoll.h" ++#include "lstack_wait.h" + #include "lstack_stack_stat.h" + #include "lstack_virtio.h" + #include "lstack_interrupt.h" +@@ -39,9 +39,7 @@ + #include + #endif + +-#define KERNEL_EVENT_10us 10 +- +-static PER_THREAD struct protocol_stack *g_stack_p = NULL; ++PER_THREAD struct protocol_stack *g_stack_p = NULL; + static struct protocol_stack_group g_stack_group = {0}; + + typedef void *(*stack_thread_func)(void *arg); +@@ -81,19 +79,15 @@ struct protocol_stack_group *get_protocol_stack_group(void) + return &g_stack_group; + } + +-struct protocol_stack *get_protocol_stack(void) ++struct protocol_stack *get_protocol_stack_by_id(int stack_id) + { +- return g_stack_p; +-} ++ struct protocol_stack_group *stack_group; + +-struct protocol_stack *get_protocol_stack_by_fd(int fd) +-{ +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock)) { ++ if (stack_id < 0) { + return NULL; + } +- +- return sock->stack; ++ stack_group = get_protocol_stack_group(); ++ return stack_group->stacks[stack_id]; + } + + struct protocol_stack *get_bind_protocol_stack(void) +@@ -153,10 +147,11 @@ int get_min_conn_stack(struct protocol_stack_group *stack_group) + } + #endif /* GAZELLE_TCP_REUSE_IPPORT */ + +-void bind_to_stack_numa(struct protocol_stack *stack) ++void bind_to_stack_numa(int stack_id) + { +- int32_t ret; ++ int ret; + pthread_t tid = pthread_self(); ++ struct protocol_stack *stack = get_protocol_stack_by_id(stack_id); + + if (get_global_cfg_params()->stack_num > 0) { + numa_run_on_node(stack->numa_id); +@@ -170,7 +165,7 @@ void bind_to_stack_numa(struct protocol_stack *stack) + } + } + +-void thread_bind_stack(struct protocol_stack *stack) ++void thread_bind_stack(int stack_id) + { + static PER_THREAD uint16_t stack_sock_num[GAZELLE_MAX_STACK_NUM] = {0}; + static PER_THREAD uint16_t max_sock_stack = 0; +@@ -179,10 +174,10 @@ void thread_bind_stack(struct protocol_stack *stack) + return; + } + +- stack_sock_num[stack->stack_idx]++; +- if (stack_sock_num[stack->stack_idx] > max_sock_stack) { +- max_sock_stack = stack_sock_num[stack->stack_idx]; +- bind_to_stack_numa(stack); ++ stack_sock_num[stack_id]++; ++ if (stack_sock_num[stack_id] > max_sock_stack) { ++ max_sock_stack = stack_sock_num[stack_id]; ++ bind_to_stack_numa(stack_id); + } + } + +@@ -308,11 +303,6 @@ void low_power_idling(struct protocol_stack *stack) + } + } + +-struct thread_params { +- uint16_t queue_id; +- uint16_t idx; +-}; +- + static int32_t create_thread(void *arg, char *thread_name, stack_thread_func func) + { + /* thread may run slow, if arg is temp var maybe have relese */ +@@ -347,48 +337,6 @@ static int32_t create_thread(void *arg, char *thread_name, stack_thread_func fun + return 0; + } + +-static void wakeup_kernel_event(struct protocol_stack *stack) +-{ 
+- if (stack->kernel_event_num <= 0) { +- return; +- } +- +- for (int32_t i = 0; i < stack->kernel_event_num; i++) { +- struct wakeup_poll *wakeup = stack->kernel_events[i].data.ptr; +- if (wakeup->type == WAKEUP_CLOSE) { +- continue; +- } +- +- __atomic_store_n(&wakeup->have_kernel_event, true, __ATOMIC_RELEASE); +- lstack_block_wakeup(wakeup); +- } +- +- return; +-} +- +-static void* gazelle_kernelevent_thread(void *arg) +-{ +- struct thread_params *t_params = (struct thread_params*) arg; +- uint16_t idx = t_params->idx; +- struct protocol_stack *stack = get_protocol_stack_group()->stacks[idx]; +- +- bind_to_stack_numa(stack); +- +- LSTACK_LOG(INFO, LSTACK, "kernelevent_%02hu start\n", idx); +- free(arg); +- sem_post(&g_stack_group.sem_stack_setup); +- +- for (;;) { +- stack->kernel_event_num = posix_api->epoll_wait_fn(stack->epollfd, stack->kernel_events, KERNEL_EPOLL_MAX, -1); +- if (stack->kernel_event_num > 0) { +- wakeup_kernel_event(stack); +- usleep(KERNEL_EVENT_10us); +- } +- } +- +- return NULL; +-} +- + static int32_t init_stack_value(struct protocol_stack *stack, void *arg) + { + struct thread_params *t_params = (struct thread_params*) arg; +@@ -403,16 +351,10 @@ static int32_t init_stack_value(struct protocol_stack *stack, void *arg) + + list_init_head(&stack->recv_list); + list_init_head(&stack->same_node_recv_list); +- list_init_head(&stack->wakeup_list); + + stack_group->stacks[t_params->idx] = stack; + set_stack_idx(t_params->idx); + +- stack->epollfd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); +- if (stack->epollfd < 0) { +- LSTACK_LOG(ERR, LSTACK, "kernel epoll_create failed\n"); +- return -1; +- } + + if (cfg_params->stack_num > 0) { + stack->numa_id = cfg_params->numa_id; +@@ -453,7 +395,7 @@ static int32_t create_affiliate_thread(void *arg) + return -1; + } + memcpy_s(params, sizeof(*params), arg, sizeof(struct thread_params)); +- if (create_thread((void *)params, "gazellekernel", gazelle_kernelevent_thread) != 0) { ++ if (create_thread((void *)params, "gazellekernel", kernel_wait_thread) != 0) { + LSTACK_LOG(ERR, LSTACK, "gazellekernel errno=%d\n", errno); + return -1; + } +@@ -551,7 +493,9 @@ int stack_polling(unsigned wakeup_tick) + do_lwip_read_recvlist(stack, read_connect_number); + + if ((wakeup_tick & 0xf) == 0) { +- wakeup_stack_epoll(stack); ++#if SOCK_WAIT_BATCH_NOTIFY ++ stack->stats.wakeup_events += lwip_wait_foreach_notify(stack->stack_idx); ++#endif /* SOCK_WAIT_BATCH_NOTIFY */ + if (get_global_cfg_params()->send_cache_mode) { + tx_cache_send(stack->queue_id); + } +@@ -595,7 +539,7 @@ static bool stack_local_event_get(uint16_t stack_id) + if (!lockless_queue_empty(&stack->dfx_rpc_queue.queue) || + !lockless_queue_empty(&stack->rpc_queue.queue) || + !list_head_empty(&stack->recv_list) || +- !list_head_empty(&stack->wakeup_list) || ++ !lwip_wait_notify_empty(stack_id) || + tx_cache_count(stack->queue_id)) { + return true; + } +@@ -681,8 +625,6 @@ int stack_group_init(void) + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + stack_group->stack_num = 0; + +- list_init_head(&stack_group->poll_list); +- pthread_spin_init(&stack_group->poll_list_lock, PTHREAD_PROCESS_PRIVATE); + pthread_spin_init(&stack_group->socket_lock, PTHREAD_PROCESS_PRIVATE); + if (sem_init(&stack_group->sem_stack_setup, 0, 0) < 0) { + LSTACK_LOG(ERR, LSTACK, "sem_init failed errno=%d\n", errno); +@@ -783,24 +725,22 @@ OUT2: + return -1; + } + +-static void stack_all_fds_close(struct protocol_stack *stack) ++ ++void stack_exit(void) + { ++ struct 
protocol_stack *stack = get_protocol_stack(); ++ if (stack == NULL) ++ return; ++ ++ /* close all fd */ + for (int i = 3; i < GAZELLE_MAX_CLIENTS + GAZELLE_RESERVED_CLIENTS; i++) { + struct lwip_sock *sock = lwip_get_socket(i); +- if (!POSIX_IS_CLOSED(sock) && sock->stack == stack) { ++ if (!POSIX_IS_CLOSED(sock) && sock->stack_id == stack->stack_idx) { + lwip_close(i); + } + } + } + +-void stack_exit(void) +-{ +- struct protocol_stack *stack = get_protocol_stack(); +- if (stack != NULL) { +- stack_all_fds_close(stack); +- } +-} +- + void stack_wait(void) + { + struct protocol_stack *stack = get_protocol_stack(); +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index b1eb60e..dc9c931 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -65,8 +65,9 @@ void time_stamp_record(int fd, struct pbuf *pbuf) + { + struct lwip_sock *sock = lwip_get_socket(fd); + +- if (get_protocol_stack_group()->latency_start && sock && sock->stack && pbuf) { +- calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_INTO_MBOX, 0); ++ if (get_protocol_stack_group()->latency_start && sock && pbuf) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_INTO_MBOX, 0); + time_stamp_into_recvmbox(sock); + } + } +@@ -209,29 +210,6 @@ static void set_latency_start_flag(bool start) + } + } + +-static void get_wakeup_stat(struct protocol_stack_group *stack_group, struct protocol_stack *stack, +- struct gazelle_wakeup_stat *stat) +-{ +- struct list_node *node, *temp; +- +- pthread_spin_lock(&stack_group->poll_list_lock); +- +- list_for_each_node(node, temp, &stack_group->poll_list) { +- struct wakeup_poll *wakeup = list_entry(node, struct wakeup_poll, poll_list); +- +- if (wakeup->bind_stack == stack) { +- stat->kernel_events += wakeup->stat.kernel_events; +- stat->app_events += wakeup->stat.app_events; +- stat->read_null += wakeup->stat.read_null; +- stat->app_write_cnt += wakeup->stat.app_write_cnt; +- stat->app_write_rpc += wakeup->stat.app_write_rpc; +- stat->app_read_cnt += wakeup->stat.app_read_cnt; +- } +- } +- +- pthread_spin_unlock(&stack_group->poll_list_lock); +-} +- + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info) + { + struct cfg_params *cfg = get_global_cfg_params(); +@@ -244,8 +222,6 @@ void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_inf + + static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_stack *stack) + { +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- + dfx->loglevel = rte_log_get_level(RTE_LOGTYPE_LSTACK); + + lstack_get_low_power_info(&dfx->low_power_info); +@@ -257,7 +233,7 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + return; + } + +- get_wakeup_stat(stack_group, stack, &dfx->data.pkts.wakeup_stat); ++ sock_wait_group_stat(stack->stack_idx, &dfx->data.pkts.wakeup_stat); + + dfx->data.pkts.call_alloc_fail = rpc_stats_get()->call_alloc_fail; + +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 26ae501..a831d3b 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -242,8 +242,8 @@ static void callback_socket(struct rpc_msg *msg) + static void callback_close(struct rpc_msg *msg) + { + int fd = msg->args[MSG_ARG_0].i; +- struct protocol_stack *stack = 
get_protocol_stack_by_fd(fd); + struct lwip_sock *sock = lwip_get_socket(fd); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + + if (sock && __atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) > 0) { + msg->recall_flag = 1; +@@ -261,8 +261,8 @@ static void callback_shutdown(struct rpc_msg *msg) + { + int fd = msg->args[MSG_ARG_0].i; + int how = msg->args[MSG_ARG_1].i; +- struct protocol_stack *stack = get_protocol_stack_by_fd(fd); + struct lwip_sock *sock = lwip_get_socket(fd); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + + if (sock && __atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) > 0) { + msg->recall_flag = 1; +@@ -365,14 +365,14 @@ static void callback_accept(struct rpc_msg *msg) + } + + struct lwip_sock *sock = lwip_get_socket(accept_fd); +- if (sock == NULL || sock->stack == NULL) { ++ if (sock == NULL) { + lwip_close(accept_fd); + LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); + return; + } + + msg->result = accept_fd; +- sock->stack->conn_num++; ++ stack->conn_num++; + if (rte_ring_count(sock->conn->recvmbox->ring)) { + do_lwip_add_recvlist(accept_fd); + } +@@ -751,27 +751,6 @@ int rpc_call_recvlistcnt(rpc_queue *queue) + return rpc_sync_call(queue, msg); + } + +-static void callback_clean_epoll(struct rpc_msg *msg) +-{ +- struct protocol_stack *stack = get_protocol_stack(); +- struct wakeup_poll *wakeup = (struct wakeup_poll *)msg->args[MSG_ARG_0].p; +- +- list_del_node(&wakeup->wakeup_list[stack->stack_idx]); +-} +- +-int rpc_call_clean_epoll(rpc_queue *queue, void *wakeup) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_clean_epoll); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].p = wakeup; +- +- rpc_sync_call(queue, msg); +- return 0; +-} +- + static void callback_arp(struct rpc_msg *msg) + { + struct rte_mbuf *mbuf = (struct rte_mbuf *)msg->args[MSG_ARG_0].p; +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index c67df93..6334f5e 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -26,7 +26,7 @@ + #include "lstack_log.h" + #include "lstack_cfg.h" + #include "same_node.h" +-#include "mbox_ring.h" ++#include "lstack_lwip.h" + + #define KERNEL_EVENT_WAIT_US 10 + #define LWIP_EVENT_WAIT_US 10 +@@ -340,34 +340,26 @@ void sock_wait_kernel_free(struct sock_wait *sk_wait) + static inline bool NETCONN_NEED_ACCEPT(const struct lwip_sock *sock) + { + if (sys_mbox_valid(&sock->conn->acceptmbox)) { +- const struct mbox_ring *mr = &sock->conn->acceptmbox->mring; +- return mr->ops->count(mr) > 0; ++ return !sys_mbox_empty(sock->conn->acceptmbox); + } + return false; + } + + static inline bool NETCONN_NEED_RECV(const struct lwip_sock *sock) + { +- if (sock->lastdata.pbuf != NULL) ++ if (sock->recv_lastdata != NULL) ++ return true; ++ if (gazelle_ring_readable_count(sock->recv_ring) > 0) ++ return true; ++ if (NETCONN_NEED_SAME_NODE(sock)) + return true; +- if (sys_mbox_valid(&sock->conn->recvmbox)) { +- const struct mbox_ring *mr = &sock->conn->recvmbox->mring; +- return mr->ops->recv_count(mr) > 0; +- } + return false; + } + + static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) + { +- if (get_global_cfg_params()->stack_mode_rtc) { +- if (NETCONN_TYPE(sock->conn) == NETCONN_TCP) +- return lwip_tcp_allow_send(sock->conn->pcb.tcp); +- return false; +- } +- if (sys_mbox_valid(&sock->conn->sendmbox)) { +- const struct mbox_ring *mr = &sock->conn->sendmbox->mring; +- return mr->ops->free_count(mr) > 0; +- } ++ if 
(gazelle_ring_readable_count(sock->send_ring) > 0) ++ return true; + return false; + } + +diff --git a/src/lstack/core/same_node.c b/src/lstack/core/same_node.c +index 0fe0fa8..660fefd 100644 +--- a/src/lstack/core/same_node.c ++++ b/src/lstack/core/same_node.c +@@ -34,7 +34,7 @@ void read_same_node_recv_list(struct protocol_stack *stack) + sock = list_entry(node, struct lwip_sock, recv_list); + + if (sock->same_node_rx_ring != NULL && same_node_ring_count(sock)) { +- add_sock_event(sock, EPOLLIN); ++ API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); + } + } + } +@@ -231,7 +231,8 @@ err_t find_same_node_memzone(struct tcp_pcb *pcb, struct lwip_sock *nsock) + + /* rcvlink init in alloc_socket() */ + /* remove from g_rcv_process_list in free_socket */ +- list_add_node(&nsock->recv_list, &nsock->stack->same_node_recv_list); ++ struct protocol_stack *stack = get_protocol_stack_by_id(nsock->stack_id); ++ list_add_node(&nsock->recv_list, &stack->same_node_recv_list); + return 0; + } + +diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h +index 9a5d15a..655e178 100644 +--- a/src/lstack/include/lstack_epoll.h ++++ b/src/lstack/include/lstack_epoll.h +@@ -13,85 +13,6 @@ + #ifndef _GAZELLE_EPOLL_H_ + #define _GAZELLE_EPOLL_H_ + +-#if /* SOCK_EVENT_V2 */ +- +-#include +-#include +-#include +-#include +- +-#include +-#include +- +-#include "common/gazelle_dfx_msg.h" +-#include "common/gazelle_opt.h" +- +-enum wakeup_type { +- WAKEUP_EPOLL = 0, +- WAKEUP_POLL, +- WAKEUP_CLOSE, +- WAKEUP_BLOCK, +-}; +- +-struct protocol_stack; +-struct wakeup_poll { +- /* stack thread read frequently */ +- enum wakeup_type type; +- sem_t wait; +- bool in_wait; +- struct list_node wakeup_list[PROTOCOL_STACK_MAX]; +- bool have_kernel_event; +- char pad __rte_cache_aligned; +- +- struct gazelle_wakeup_stat stat; +- struct protocol_stack *bind_stack; +- struct list_node poll_list; +- +- /* poll */ +- struct pollfd *last_fds; +- nfds_t last_nfds; +- nfds_t last_max_nfds; +- struct epoll_event *events; +- +- /* epoll */ +- int32_t epollfd; /* epoll kernel fd */ +- int32_t stack_fd_cnt[PROTOCOL_STACK_MAX]; +- struct protocol_stack *max_stack; +- struct list_node event_list; +- pthread_spinlock_t event_list_lock; +-}; +- +-void add_sock_event(struct lwip_sock *sock, uint32_t event); +-void add_sock_event_nolock(struct lwip_sock *sock, uint32_t event); +-void del_sock_event(struct lwip_sock *sock, uint32_t event); +-void del_sock_event_nolock(struct lwip_sock *sock, uint32_t event); +- +-void wakeup_stack_epoll(struct protocol_stack *stack); +- +-int32_t lstack_epoll_create(int32_t size); +-int32_t lstack_epoll_create1(int32_t flags); +-int32_t lstack_rtw_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); +-int32_t lstack_rtc_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); +-int32_t lstack_rtw_epoll_wait(int32_t epfd, struct epoll_event *events, int32_t maxevents, int32_t timeout); +-int32_t lstack_rtc_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout); +-int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout); +-int lstack_select(int maxfd, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeval); +- +-int32_t lstack_block_wait(struct wakeup_poll *wakeup, int32_t timeout); +- +-struct wakeup_poll* poll_construct_wakeup(void); +- +-static inline void lstack_block_wakeup(struct wakeup_poll *wakeup) +-{ +- if (wakeup && __atomic_load_n(&wakeup->in_wait, __ATOMIC_ACQUIRE)) { +- 
__atomic_store_n(&wakeup->in_wait, false, __ATOMIC_RELEASE); +- rte_mb(); +- sem_post(&wakeup->wait); +- } +-} +- +-#else /* SOCK_EVENT_V2 */ +- + #include + #include + +@@ -104,6 +25,4 @@ int lstack_epoll_close(int epfd); + void epoll_api_init(posix_api_t *api); + bool sock_event_wait(struct lwip_sock *sock, bool noblocking); + +-#endif /* SOCK_EVENT_V2 */ +- + #endif /* _GAZELLE_EPOLL_H_ */ +diff --git a/src/lstack/include/lstack_lwip.h b/src/lstack/include/lstack_lwip.h +index f2524e4..4cc9db1 100644 +--- a/src/lstack/include/lstack_lwip.h ++++ b/src/lstack/include/lstack_lwip.h +@@ -22,11 +22,7 @@ struct lwip_sock; + struct rpc_msg; + struct protocol_stack; + +- +-#define NETCONN_IS_ACCEPTIN(sock) (((sock)->conn->acceptmbox != NULL) && !sys_mbox_empty((sock)->conn->acceptmbox)) +-#define NETCONN_IS_DATAIN(sock) ((gazelle_ring_readable_count((sock)->recv_ring) || (sock)->recv_lastdata) || NETCONN_NEED_SAME_NODE(sock)) + #define NETCONN_IS_DATAOUT(sock) (gazelle_ring_readover_count((sock)->send_ring) || (sock)->send_pre_del) +-#define NETCONN_IS_OUTIDLE(sock) gazelle_ring_readable_count((sock)->send_ring) + #define NETCONN_IS_UDP(sock) (NETCONNTYPE_GROUP(netconn_type((sock)->conn)) == NETCONN_UDP) + + /* lwip api */ +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 3f6e3d3e..b77d5da 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -14,7 +14,6 @@ + #define __GAZELLE_PROTOCOL_STACK_H__ + + #include +-#include + #include + + #include +@@ -43,41 +42,35 @@ struct protocol_stack { + uint16_t numa_id; + uint16_t cpu_id; + uint32_t stack_idx; ++ ++ struct netif netif; ++ struct lstack_dev_ops dev_ops; ++ + cpu_set_t idle_cpuset; /* idle cpu in numa of stack, app thread bind to it */ +- int32_t epollfd; /* kernel event thread epoll fd */ ++ + volatile enum rte_lcore_state_t state; ++ volatile bool low_power; ++ volatile uint16_t conn_num; + + struct rte_mempool *rxtx_mbuf_pool; +- struct rte_ring *rx_ring; ++ struct rte_ring *rx_ring; + struct rte_ring *tx_ring; + struct rte_ring *reg_ring; +- struct rte_ring *wakeup_ring; + struct reg_ring_msg *reg_buf; + uint32_t reg_head; + +- volatile bool low_power; ++ uint32_t rx_ring_used; ++ uint32_t tx_ring_used; ++ struct rte_mbuf *pkts[NIC_QUEUE_SIZE_MAX]; + + char pad1 __rte_cache_aligned; + rpc_queue dfx_rpc_queue; + rpc_queue rpc_queue; + char pad2 __rte_cache_aligned; + +- /* kernel event thread read/write frequently */ +- struct epoll_event kernel_events[KERNEL_EPOLL_MAX]; +- int32_t kernel_event_num; +- char pad3 __rte_cache_aligned; +- +- struct netif netif; +- struct lstack_dev_ops dev_ops; +- uint32_t rx_ring_used; +- uint32_t tx_ring_used; +- +- struct rte_mbuf *pkts[NIC_QUEUE_SIZE_MAX]; + struct list_node recv_list; + struct list_node same_node_recv_list; /* used for same node processes communication */ +- struct list_node wakeup_list; + +- volatile uint16_t conn_num; + struct stats_ *lwip_stats; + struct gazelle_stack_latency latency; + struct gazelle_stack_stat stats; +@@ -93,8 +86,7 @@ struct protocol_stack_group { + struct rte_mempool *kni_pktmbuf_pool; + struct eth_params *eth_params; + struct protocol_stack *stacks[PROTOCOL_STACK_MAX]; +- struct list_node poll_list; +- pthread_spinlock_t poll_list_lock; ++ + sem_t sem_listen_thread; + struct rte_mempool *total_rxtx_pktmbuf_pool[PROTOCOL_STACK_MAX]; + sem_t sem_stack_setup; +@@ -106,16 +98,26 @@ struct protocol_stack_group { + pthread_spinlock_t socket_lock; + }; + 
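The header changes that follow replace the exported get_protocol_stack() and
get_protocol_stack_by_fd() with an id-based lookup plus an inline per-thread
accessor. As a rough model of that pattern (simplified, hypothetical names;
not code from these patches):

    /* set once by each stack thread at startup */
    static __thread struct protocol_stack *g_self;

    /* hot-path accessor: inline, no call overhead */
    static inline struct protocol_stack *stack_self(void)
    {
        return g_self;
    }

    /* id-based lookup validates the index on every use */
    static inline struct protocol_stack *stack_by_id(int id,
                                                     struct protocol_stack **tbl)
    {
        return (id < 0) ? NULL : tbl[id];
    }

Callers now store sock->stack_id instead of a struct pointer, so a stale
stack pointer can no longer be dereferenced after a stack exits; this is
also why get_protocol_stack_by_fd() disappears, as the fd-to-stack mapping
is carried by the socket itself.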
+-struct protocol_stack *get_protocol_stack(void); +-struct protocol_stack *get_protocol_stack_by_fd(int fd); +-struct protocol_stack *get_bind_protocol_stack(void); ++struct thread_params { ++ uint16_t queue_id; ++ uint16_t idx; ++}; ++ + struct protocol_stack_group *get_protocol_stack_group(void); + ++extern PER_THREAD struct protocol_stack *g_stack_p; ++static inline struct protocol_stack *get_protocol_stack(void) ++{ ++ return g_stack_p; ++} ++struct protocol_stack *get_protocol_stack_by_id(int stack_id); ++struct protocol_stack *get_bind_protocol_stack(void); ++ + #if GAZELLE_TCP_REUSE_IPPORT + int get_min_conn_stack(struct protocol_stack_group *stack_group); + #endif /* GAZELLE_TCP_REUSE_IPPORT */ +-void bind_to_stack_numa(struct protocol_stack *stack); +-void thread_bind_stack(struct protocol_stack *stack); ++void bind_to_stack_numa(int stack_id); ++void thread_bind_stack(int stack_id); + + int stack_group_init(void); + void stack_group_exit(void); +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index c74981f..427a519 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -101,7 +101,6 @@ int rpc_call_udp_send(rpc_queue *queue, int fd, size_t len, int flags); + int rpc_call_replenish(rpc_queue *queue, void *sock); + int rpc_call_recvlistcnt(rpc_queue *queue); + +-int rpc_call_clean_epoll(rpc_queue *queue, void *wakeup); + int rpc_call_arp(rpc_queue *queue, void *mbuf); + + int rpc_call_conntable(rpc_queue *queue, void *conn_table, unsigned max_conn); +-- +2.33.0 + diff --git a/0328-socket-refactor-tcp-and-udp.patch b/0328-socket-refactor-tcp-and-udp.patch new file mode 100644 index 0000000..58114af --- /dev/null +++ b/0328-socket-refactor-tcp-and-udp.patch @@ -0,0 +1,4524 @@ +From 52c4cd904e90bc506ae1323465d99007e0e1a4ba Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 21 Mar 2025 17:02:04 +0800 +Subject: [PATCH] socket: refactor tcp and udp + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockctl.c | 840 +++++++++++ + src/lstack/api/lstack_sockio.c | 1488 ++++++++++++++++++++ + src/lstack/core/lstack_mempool.c | 1014 +++++++++++++ + src/lstack/include/lstack_lockless_queue.h | 81 +- + src/lstack/include/lstack_mempool.h | 325 +++++ + src/lstack/include/lstack_sockctl.h | 25 + + src/lstack/include/lstack_sockio.h | 41 + + src/lstack/include/mbox_ring.h | 583 ++++++++ + 8 files changed, 4394 insertions(+), 3 deletions(-) + create mode 100644 src/lstack/api/lstack_sockctl.c + create mode 100644 src/lstack/api/lstack_sockio.c + create mode 100644 src/lstack/core/lstack_mempool.c + create mode 100644 src/lstack/include/lstack_mempool.h + create mode 100644 src/lstack/include/lstack_sockctl.h + create mode 100644 src/lstack/include/lstack_sockio.h + create mode 100644 src/lstack/include/mbox_ring.h + +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +new file mode 100644 +index 0000000..71310b7 +--- /dev/null ++++ b/src/lstack/api/lstack_sockctl.c +@@ -0,0 +1,840 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. 
++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. ++*/ ++ ++#include ++#include ++#include ++ ++#include "common/gazelle_base_func.h" ++#include "lstack_log.h" ++#include "lstack_cfg.h" ++#include "lstack_thread_rpc.h" ++#include "lstack_protocol_stack.h" ++#include "lstack_epoll.h" ++#include "lstack_sockctl.h" ++#include "lstack_sockio.h" ++ ++ ++static void callback_getpeername(struct rpc_msg *msg) ++{ ++ msg->result = lwip_getpeername(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); ++ } ++} ++ ++static void callback_getsockname(struct rpc_msg *msg) ++{ ++ msg->result = lwip_getsockname(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); ++ } ++} ++ ++static void callback_getsockopt(struct rpc_msg *msg) ++{ ++ msg->result = lwip_getsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, ++ msg->args[MSG_ARG_3].p, msg->args[MSG_ARG_4].p); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), ++ msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); ++ } ++} ++ ++static void callback_setsockopt(struct rpc_msg *msg) ++{ ++ msg->result = lwip_setsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, ++ msg->args[MSG_ARG_3].cp, msg->args[MSG_ARG_4].u); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), ++ msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); ++ } ++} ++ ++static int rpc_call_getpeername(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getpeername); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].p = addr; ++ msg->args[MSG_ARG_2].p = addrlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static int rpc_call_getsockname(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getsockname); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].p = addr; ++ msg->args[MSG_ARG_2].p = addrlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static int rpc_call_getsockopt(int stack_id, int fd, int level, int optname, void *optval, socklen_t *optlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getsockopt); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].i = level; ++ msg->args[MSG_ARG_2].i = optname; ++ msg->args[MSG_ARG_3].p = optval; ++ msg->args[MSG_ARG_4].p = optlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static 
int rpc_call_setsockopt(int stack_id, int fd, int level, int optname, const void *optval, socklen_t optlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_setsockopt); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].i = level; ++ msg->args[MSG_ARG_2].i = optname; ++ msg->args[MSG_ARG_3].cp = optval; ++ msg->args[MSG_ARG_4].u = optlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static int rtw_setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_setsockopt(sock->stack_id, s, level, optname, optval, optlen); ++} ++ ++static int rtw_getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_getsockopt(sock->stack_id, s, level, optname, optval, optlen); ++} ++ ++static int rtw_getpeername(int s, struct sockaddr *name, socklen_t *namelen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_getpeername(sock->stack_id, s, name, namelen); ++} ++ ++static int rtw_getsockname(int s, struct sockaddr *name, socklen_t *namelen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_getsockname(sock->stack_id, s, name, namelen); ++} ++ ++ ++static void callback_socket(struct rpc_msg *msg) ++{ ++ msg->result = lwip_socket(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i); ++ if (msg->result < 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %ld socket failed\n", rte_gettid(), msg->result); ++ } ++} ++ ++static int rpc_call_socket(int stack_id, int domain, int type, int protocol) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_socket); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = domain; ++ msg->args[MSG_ARG_1].i = type; ++ msg->args[MSG_ARG_2].i = protocol; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void callback_close(struct rpc_msg *msg) ++{ ++ int fd = msg->args[MSG_ARG_0].i; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ++ if (sockio_mbox_pending(sock)) { ++ rpc_queue *queue = &get_protocol_stack_by_id(sock->stack_id)->rpc_queue; ++ rpc_async_call(queue, msg, RPC_MSG_FREE | RPC_MSG_RECALL); /* until stack_send recall finish */ ++ return; ++ } ++ ++ msg->result = lwip_close(fd); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); ++ } ++} ++ ++static void callback_shutdown(struct rpc_msg *msg) ++{ ++ int fd = msg->args[MSG_ARG_0].i; ++ int how = msg->args[MSG_ARG_1].i; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ++ if (sockio_mbox_pending(sock)) { ++ rpc_queue *queue = &get_protocol_stack_by_id(sock->stack_id)->rpc_queue; ++ rpc_async_call(queue, msg, RPC_MSG_FREE | RPC_MSG_RECALL); ++ return; ++ } ++ ++ msg->result = lwip_shutdown(fd, how); ++ if (msg->result != 0 && errno != ENOTCONN) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), fd, msg->result); ++ } ++ ++ posix_api->shutdown_fn(fd, how); ++} ++ ++static int rpc_call_close(int stack_id, 
int fd) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_close); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static int rpc_call_shutdown(int stack_id, int fd, int how) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_shutdown); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].i = how; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void callback_bind(struct rpc_msg *msg) ++{ ++ msg->result = lwip_bind(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].cp, msg->args[MSG_ARG_2].u); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); ++ } ++} ++ ++static int rpc_call_bind(int stack_id, int fd, const struct sockaddr *addr, socklen_t addrlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_bind); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].cp = addr; ++ msg->args[MSG_ARG_2].u = addrlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void callback_listen(struct rpc_msg *msg) ++{ ++ int fd = msg->args[MSG_ARG_0].i; ++ int backlog = msg->args[MSG_ARG_1].i; ++ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { ++ msg->result = -1; ++ return; ++ } ++ ++ /* new listen add to stack listen list */ ++ msg->result = lwip_listen(fd, backlog); ++ if (msg->result != 0) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); ++ } ++} ++ ++static int rpc_call_listen(int stack_id, int s, int backlog) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_listen); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = s; ++ msg->args[MSG_ARG_1].i = backlog; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void do_lwip_clone_sockopt(struct lwip_sock *dst_sock, struct lwip_sock *src_sock) ++{ ++ dst_sock->conn->pcb.ip->so_options = src_sock->conn->pcb.ip->so_options; ++ dst_sock->conn->pcb.ip->ttl = src_sock->conn->pcb.ip->ttl; ++ dst_sock->conn->pcb.ip->tos = src_sock->conn->pcb.ip->tos; ++ dst_sock->conn->flags = src_sock->conn->flags; ++ ++ switch (NETCONN_TYPE(src_sock->conn)) { ++ case NETCONN_TCP: ++ dst_sock->conn->pcb.tcp->netif_idx = src_sock->conn->pcb.tcp->netif_idx; ++ dst_sock->conn->pcb.tcp->flags = src_sock->conn->pcb.tcp->flags; ++ dst_sock->conn->pcb.tcp->keep_idle = src_sock->conn->pcb.tcp->keep_idle; ++ dst_sock->conn->pcb.tcp->keep_intvl = src_sock->conn->pcb.tcp->keep_intvl; ++ dst_sock->conn->pcb.tcp->keep_cnt = src_sock->conn->pcb.tcp->keep_cnt; ++ break; ++ case NETCONN_UDP: ++ dst_sock->conn->pcb.udp->flags = src_sock->conn->pcb.udp->flags; ++ dst_sock->conn->pcb.udp->mcast_ifindex = src_sock->conn->pcb.udp->mcast_ifindex; ++ dst_sock->conn->pcb.udp->mcast_ttl = src_sock->conn->pcb.udp->mcast_ttl; ++ break; ++ default: ++ break; ++ } ++} ++static void callback_create_shadow_fd(struct rpc_msg *msg) ++{ ++ int fd = msg->args[MSG_ARG_0].i; ++ struct sockaddr *addr = msg->args[MSG_ARG_1].p; ++ socklen_t addr_len = msg->args[MSG_ARG_2].u; ++ ++ int clone_fd = 0; ++ struct lwip_sock *sock = 
lwip_get_socket(fd); ++ if (sock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "get sock null fd=%d\n", fd); ++ msg->result = -1; ++ return; ++ } ++ ++ int domain = addr->sa_family; ++ int type = NETCONN_TYPE(sock->conn) == NETCONN_UDP ? SOCK_DGRAM : SOCK_STREAM; ++ clone_fd = lwip_socket(domain, type, 0); ++ if (clone_fd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "clone socket failed clone_fd=%d errno=%d\n", clone_fd, errno); ++ msg->result = clone_fd; ++ return; ++ } ++ ++ struct lwip_sock *clone_sock = lwip_get_socket(clone_fd); ++ if (clone_sock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "get sock null fd=%d clone_fd=%d\n", fd, clone_fd); ++ msg->result = -1; ++ return; ++ } ++ ++ do_lwip_clone_sockopt(clone_sock, sock); ++ ++ while (sock->listen_next) { ++ sock = sock->listen_next; ++ } ++ sock->listen_next = clone_sock; ++ ++ int ret = lwip_bind(clone_fd, addr, addr_len); ++ if (ret < 0) { ++ LSTACK_LOG(ERR, LSTACK, "clone bind failed clone_fd=%d errno=%d\n", clone_fd, errno); ++ msg->result = ret; ++ return; ++ } ++ ++ msg->result = clone_fd; ++} ++ ++static int rpc_call_shadow_fd(int stack_id, int fd, const struct sockaddr *addr, socklen_t addrlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_create_shadow_fd); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].cp = addr; ++ msg->args[MSG_ARG_2].u = addrlen; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void callback_accept(struct rpc_msg *msg) ++{ ++ int fd = msg->args[MSG_ARG_0].i; ++ msg->result = -1; ++ struct protocol_stack *stack = get_protocol_stack(); ++ ++ int accept_fd = lwip_accept4(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p, msg->args[MSG_ARG_3].i); ++ if (accept_fd < 0) { ++ stack->stats.accept_fail++; ++ LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); ++ return; ++ } ++ msg->result = accept_fd; ++} ++ ++static int rpc_call_accept(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen, int flags) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_accept); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].p = addr; ++ msg->args[MSG_ARG_2].p = addrlen; ++ msg->args[MSG_ARG_3].i = flags; ++ ++ return rpc_sync_call(queue, msg); ++} ++ ++static void callback_connect(struct rpc_msg *msg) ++{ ++ msg->result = lwip_connect(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].u); ++ if (msg->result < 0) { ++ msg->result = -errno; ++ } ++} ++ ++static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, socklen_t addrlen) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_connect); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->args[MSG_ARG_0].i = fd; ++ msg->args[MSG_ARG_1].cp = addr; ++ msg->args[MSG_ARG_2].u = addrlen; ++ ++ int ret = rpc_sync_call(queue, msg); ++ if (ret < 0) { ++ errno = -ret; ++ ret = -1; ++ } ++ ++ if (ret < 0 && errno == EINPROGRESS) { ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn))) { ++ ret = 0; ++ } ++ } ++ return ret; ++} ++ ++/* for lwip nonblock connected callback */ ++void do_lwip_connected_callback(int fd) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (POSIX_IS_CLOSED(sock)) { ++ return; ++ } ++ ++ if (POSIX_HAS_TYPE(sock, 
POSIX_KERNEL)) { ++ /* delete kernel event */ ++ if (sock->sk_wait != NULL) { ++ posix_api->epoll_ctl_fn(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL); ++ } ++ /* shutdown kernel connect, do_connect() has tried both kernel and lwip. */ ++ posix_api->shutdown_fn(fd, SHUT_RDWR); ++ } ++ ++ POSIX_SET_TYPE(sock, POSIX_LWIP); ++ ++ API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); ++} ++ ++/* when fd is listenfd, listenfd of all protocol stack thread will be closed */ ++static int stack_broadcast_close(int fd) ++{ ++ int ret = 0; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ++ while (sock != NULL) { ++ if (POSIX_IS_CLOSED(sock)) { ++ ret = -1; ++ break; ++ } ++ fd = sock->conn->callback_arg.socket; ++ ret |= rpc_call_close(sock->stack_id, fd); ++ sock = sock->listen_next; ++ } ++ ++ if (ret != 0) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return ret; ++} ++ ++static int stack_broadcast_shutdown(int fd, int how) ++{ ++ int ret = 0; ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ++ while (true) { ++ if (POSIX_IS_CLOSED(sock)) { ++ ret = -1; ++ break; ++ } ++ fd = sock->conn->callback_arg.socket; ++ ret |= rpc_call_shutdown(sock->stack_id, fd, how); ++ sock = sock->listen_next; ++ } ++ ++ if (ret != 0) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return ret; ++} ++ ++/* choice one stack bind */ ++static int stack_single_bind(int fd, const struct sockaddr *name, socklen_t namelen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_bind(sock->stack_id, fd, name, namelen); ++} ++ ++/* bind sync to all protocol stack thread, so that any protocol stack thread can build connect */ ++static int stack_broadcast_bind(int fd, const struct sockaddr *name, socklen_t namelen) ++{ ++ struct protocol_stack *cur_stack; ++ struct protocol_stack *stack = NULL; ++ int ret, clone_fd; ++ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); ++ GAZELLE_RETURN(EBADF); ++ } ++ ++ ret = rpc_call_bind(sock->stack_id, fd, name, namelen); ++ if (ret < 0) { ++ close(fd); ++ return ret; ++ } ++ ++ cur_stack = get_protocol_stack_by_id(sock->stack_id); ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ for (int i = 0; i < stack_group->stack_num; ++i) { ++ stack = stack_group->stacks[i]; ++ if (stack != cur_stack) { ++ clone_fd = rpc_call_shadow_fd(stack->stack_idx, fd, name, namelen); ++ if (clone_fd < 0) { ++ stack_broadcast_close(fd); ++ return clone_fd; ++ } ++ } ++ } ++ return 0; ++} ++ ++static struct lwip_sock *get_min_accept_sock(int fd) ++{ ++ struct lwip_sock *sock; ++ struct lwip_sock *min_sock = NULL; ++ ++ for (sock = lwip_get_socket(fd); sock != NULL; sock = sock->listen_next) { ++ if (!netconn_is_nonblocking(sock->conn)) { ++ /* init all sock sk_wait */ ++ if (unlikely(sock->sk_wait == NULL) || sock->sk_wait->type == WAIT_CLOSE) { ++ sock->sk_wait = poll_construct_wait(0); ++ } ++ if (!(sock->sk_wait->type & WAIT_BLOCK)) { ++ sock->sk_wait->type |= WAIT_BLOCK; ++ } ++ } ++ ++ if (!sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0)) { ++ continue; ++ } ++ ++ if (min_sock == NULL || ++ get_protocol_stack_by_id(min_sock->stack_id)->conn_num > get_protocol_stack_by_id(sock->stack_id)->conn_num) { ++ min_sock = sock; ++ } ++ } ++ ++ return min_sock; ++} ++ ++/* ergodic the protocol stack thread to find the connection, because all threads are listening */ ++static int stack_broadcast_accept4(int fd, 
struct sockaddr *addr, socklen_t *addrlen, int flags) ++{ ++ int ret = -1; ++ struct protocol_stack *stack; ++ struct lwip_sock *min_sock; ++ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { ++ GAZELLE_RETURN(EBADF); ++ } ++ ++ min_sock = get_min_accept_sock(fd); ++ if (min_sock == NULL) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ min_sock = get_min_accept_sock(fd); ++ } ++ } ++ ++ if (!POSIX_IS_CLOSED(min_sock)) { ++ stack = get_protocol_stack_by_id(min_sock->stack_id); ++ ret = rpc_call_accept(stack->stack_idx, min_sock->conn->callback_arg.socket, addr, addrlen, flags); ++ } ++ ++ if (ret < 0) { ++ errno = EAGAIN; ++ } ++ return ret; ++} ++ ++static int stack_broadcast_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ if (get_global_cfg_params()->nonblock_mode) ++ return stack_broadcast_accept4(fd, addr, addrlen, SOCK_NONBLOCK); ++ else ++ return stack_broadcast_accept4(fd, addr, addrlen, 0); ++} ++ ++/* choice one stack listen */ ++static int stack_single_listen(int fd, int backlog) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ return rpc_call_listen(sock->stack_id, fd, backlog); ++} ++ ++/* listen sync to all protocol stack thread, so that any protocol stack thread can build connect */ ++static int stack_broadcast_listen(int fd, int backlog) ++{ ++ typedef union sockaddr_union { ++ struct sockaddr sa; ++ struct sockaddr_in in; ++ struct sockaddr_in6 in6; ++ } sockaddr_t; ++ ++ struct protocol_stack *cur_stack; ++ struct protocol_stack *stack = NULL; ++ sockaddr_t addr; ++ socklen_t addr_len = sizeof(addr); ++ int ret, clone_fd; ++ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); ++ GAZELLE_RETURN(EBADF); ++ } ++ ++ ret = rpc_call_getsockname(sock->stack_id, fd, (struct sockaddr *)&addr, &addr_len); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ cur_stack = get_protocol_stack_by_id(sock->stack_id); ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++#if GAZELLE_TCP_REUSE_IPPORT ++ int min_conn_stk_idx = get_min_conn_stack(stack_group); ++#endif ++ ++ for (int32_t i = 0; i < stack_group->stack_num; ++i) { ++ stack = stack_group->stacks[i]; ++ if (get_global_cfg_params()->seperate_send_recv && stack->is_send_thread) { ++ continue; ++ } ++ if (stack != cur_stack) { ++ clone_fd = rpc_call_shadow_fd(stack->stack_idx, fd, (struct sockaddr *)&addr, addr_len); ++ if (clone_fd < 0) { ++ stack_broadcast_close(fd); ++ return clone_fd; ++ } ++ } else { ++ clone_fd = fd; ++ } ++ ++#if GAZELLE_TCP_REUSE_IPPORT ++ if (min_conn_stk_idx == i) { ++ lwip_get_socket(clone_fd)->conn->is_master_fd = 1; ++ } else { ++ lwip_get_socket(clone_fd)->conn->is_master_fd = 0; ++ } ++#endif /* GAZELLE_TCP_REUSE_IPPORT */ ++ ++ ret = rpc_call_listen(stack->stack_idx, clone_fd, backlog); ++ if (ret < 0) { ++ stack_broadcast_close(fd); ++ return ret; ++ } ++ } ++ return 0; ++} ++ ++ ++static int rtw_socket(int domain, int type, int protocol) ++{ ++ struct protocol_stack *stack = get_bind_protocol_stack(); ++ if (stack == NULL) { ++ GAZELLE_RETURN(EINVAL); ++ } ++ return rpc_call_socket(stack->stack_idx, domain, type, protocol); ++} ++ ++static int rtw_accept(int s, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ return stack_broadcast_accept(s, addr, addrlen); ++} ++ ++static int rtw_accept4(int s, struct sockaddr *addr, 
socklen_t *addrlen, int flags) ++{ ++ return stack_broadcast_accept4(s, addr, addrlen, flags); ++} ++ ++static int rtw_bind(int s, const struct sockaddr *name, socklen_t namelen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ ++ if (NETCONN_TYPE(sock->conn) == NETCONN_UDP && ++ get_global_cfg_params()->listen_shadow) { ++ return stack_broadcast_bind(s, name, namelen); ++ } else { ++ return stack_single_bind(s, name, namelen); ++ } ++} ++ ++static int rtw_listen(int s, int backlog) ++{ ++ if (!get_global_cfg_params()->tuple_filter && ++ !get_global_cfg_params()->listen_shadow) { ++ return stack_single_listen(s, backlog); ++ } else { ++ return stack_broadcast_listen(s, backlog); ++ } ++} ++ ++static int rtw_connect(int s, const struct sockaddr *name, socklen_t namelen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(s); ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ if (stack == NULL || POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ ++ return rpc_call_connect(stack->stack_idx, s, name, namelen); ++} ++ ++static int rtw_close(int s) ++{ ++ return stack_broadcast_close(s); ++} ++ ++static int rtw_shutdown(int fd, int how) ++{ ++ return stack_broadcast_shutdown(fd, how); ++} ++ ++void sockctl_rtw_api_init(posix_api_t *api) ++{ ++ api->close_fn = rtw_close; ++ api->shutdown_fn = rtw_shutdown; ++ api->socket_fn = rtw_socket; ++ api->bind_fn = rtw_bind; ++ api->listen_fn = rtw_listen; ++ api->accept_fn = rtw_accept; ++ api->accept4_fn = rtw_accept4; ++ api->connect_fn = rtw_connect; ++ ++ api->setsockopt_fn = rtw_setsockopt; ++ api->getsockopt_fn = rtw_getsockopt; ++ api->getpeername_fn = rtw_getpeername; ++ api->getsockname_fn = rtw_getsockname; ++} ++ ++static int rtc_connect(int s, const struct sockaddr *name, socklen_t namelen) ++{ ++ int ret; ++ ++ ret = lwip_connect(s, name, namelen); ++ if (ret < 0 && errno == EINPROGRESS) { ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn))) { ++ ret = 0; ++ } ++ } ++ ++ return ret; ++} ++ ++static int rtc_accept4(int s, struct sockaddr *addr, socklen_t *addrlen, int flags) ++{ ++ int ret; ++ struct lwip_sock *sock = lwip_get_socket(s); ++ if (POSIX_IS_CLOSED(sock)) { ++ GAZELLE_RETURN(EBADF); ++ } ++ ++ ret = lwip_accept4(s, addr, addrlen, flags); ++ if (ret < 0 && errno == EWOULDBLOCK) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ ret = lwip_accept4(s, addr, addrlen, flags); ++ } ++ } ++ return ret; ++} ++ ++static int rtc_accept(int s, struct sockaddr *addr, socklen_t *addrlen) ++{ ++ return rtc_accept4(s, addr, addrlen, 0); ++} ++ ++void sockctl_rtc_api_init(posix_api_t *api) ++{ ++ api->close_fn = lwip_close; ++ api->shutdown_fn = lwip_shutdown; ++ api->socket_fn = lwip_socket; ++ api->bind_fn = lwip_bind; ++ api->listen_fn = lwip_listen; ++ api->accept_fn = rtc_accept; ++ api->accept4_fn = rtc_accept4; ++ api->connect_fn = rtc_connect; ++ ++ api->setsockopt_fn = lwip_setsockopt; ++ api->getsockopt_fn = lwip_getsockopt; ++ api->getpeername_fn = lwip_getpeername; ++ api->getsockname_fn = lwip_getsockname; ++} +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +new file mode 100644 +index 0000000..060b3b3 +--- /dev/null ++++ b/src/lstack/api/lstack_sockio.c +@@ -0,0 +1,1488 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. 
++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. ++*/ ++ ++#include ++ ++#include ++#include ++#include ++ ++#include "lstack_thread_rpc.h" ++#include "lstack_log.h" ++#include "lstack_sockio.h" ++#include "lstack_wait.h" ++#include "mbox_ring.h" ++#include "lstack_epoll.h" ++#include "lstack_stack_stat.h" ++ ++ ++/* see lwip ip4_frag() and ip6_frag(), nfb must be a multiple of 8 */ ++#define IP_FRAG_NFB ((GAZELLE_ETH_MTU - PBUF_IP) / 8) ++#define UDP_MSS (IP_FRAG_NFB * 8 - UDP_HLEN) ++ ++#define IP4_UDP_SND_SIZE_MAX (0xFFFF - IP_HLEN - UDP_HLEN) ++#define IP6_UDP_SND_SIZE_MAX (0xFFFF - IP6_HLEN - UDP_HLEN) ++#define UDP_SND_SIZE_MAX(conn) (NETCONNTYPE_ISIPV6(netconn_type(conn)) ? IP6_UDP_SND_SIZE_MAX : IP4_UDP_SND_SIZE_MAX) ++#define UDP_SND_QUEUELEN_MAX ((IP6_UDP_SND_SIZE_MAX + UDP_MSS - 1) / UDP_MSS) ++#define UDP_SND_OUTPUT_NUM (16) ++ ++#define TCP_SND_OUTPUT_NUM OFFLOAD_TX_TSO_MTU_FRAGS ++#define TCP_SND_QUEUELEN_MAX OFFLOAD_TX_TSO_MTU_FRAGS ++#define TCP_SND_SIZE_MAX (TCP_SND_QUEUELEN_MAX * TCP_MSS) ++ ++#define TCP_SND_APPEND_LEN (TCP_MSS >> 1) ++ ++#define RECV_EXTEND_CACHE_MAX 8 ++#define RECV_EXTEND_CACHE_LEN (4 * TCP_MSS) ++ ++struct sockio_ops { ++ ssize_t (*stack_udp_write)(struct lwip_sock *sock, const void *data, size_t len, int flags, ++ const struct sockaddr *to, socklen_t tolen); ++ void (*stack_udp_send)(struct lwip_sock *sock); ++ ++ ssize_t (*stack_udp_readmsg)(struct lwip_sock *sock, struct msghdr *msg, int flags); ++ ++ ssize_t (*stack_tcp_write)(struct lwip_sock *sock, const char *data, size_t len, int flags); ++ void (*stack_tcp_send)(struct lwip_sock *sock); ++ ++ ssize_t (*stack_tcp_read)(struct lwip_sock *sock, char *data, size_t len, int flags, ++ struct sockaddr *from, socklen_t *fromlen); ++ void (*stack_tcp_recvd)(struct lwip_sock *sock, ssize_t recvd, int flags); ++}; ++static struct sockio_ops ioops = {0}; ++ ++ ++static unsigned pbuf_list_count(const struct mbox_ring *mr) ++{ ++ struct pbuf *p = mr->ops->read_tail(mr); ++ return pbuf_clen(p); ++} ++ ++static unsigned netbuf_list_count(const struct mbox_ring *mr) ++{ ++ struct netbuf *nbuf = mr->ops->read_tail(mr); ++ return pbuf_clen(nbuf->p); ++} ++ ++static void netbuf_obj_free(struct mbox_ring *mr, void *obj, bool is_tail) ++{ ++ err_t err; ++ if (unlikely(lwip_netconn_is_err_msg(obj, &err))) ++ return; ++ ++ if (is_tail && (mr->flags & MBOX_FLAG_RECV)) { ++ pbuf_free((struct pbuf *)obj); ++ } else { ++ netbuf_free((struct netbuf *)obj); ++ } ++} ++ ++static void pbuf_obj_free(struct mbox_ring *mr, void *obj, bool is_tail) ++{ ++ err_t err; ++ if (unlikely(lwip_netconn_is_err_msg(obj, &err))) ++ return; ++ pbuf_free((struct pbuf *)obj); ++} ++ ++void sockio_mbox_set_func(struct mbox_ring *mr) ++{ ++ mr->tail_count = pbuf_list_count; ++ if (mr->flags & MBOX_FLAG_TCP) { ++ /* only tcp sendmbox & recvmbox, lwip would free all acceptmbox newconn objs. 
*/ ++ mr->obj_free_fn = pbuf_obj_free; ++ } else if (mr->flags & MBOX_FLAG_UDP) { ++ /* udp sendmbox & recvmbox */ ++ mr->obj_free_fn = netbuf_obj_free; ++ if (mr->flags & MBOX_FLAG_SEND) ++ mr->tail_count = netbuf_list_count; ++ } ++} ++ ++void sockio_peek_recv_free(struct mbox_ring *mr, unsigned n) ++{ ++ void *buf_pkts[RECV_EXTEND_CACHE_MAX]; ++ unsigned num, i; ++ ++ mr->stk_queued_num += n; ++ if (mr->stk_queued_num < (RECV_EXTEND_CACHE_MAX >> 1)) { ++ return; ++ } ++ ++ while (true) { ++ num = mr->ops->dequeue_burst(mr, buf_pkts, RECV_EXTEND_CACHE_MAX); ++ if (num == 0) ++ break; ++ if (mr->flags & MBOX_FLAG_UDP) { ++ for (i = 0; i < num; ++i) { ++ buf_pkts[i] = ((struct netbuf *)buf_pkts[i])->p; ++ } ++ } ++ mem_put_pbuf_list_bulk((struct pbuf **)buf_pkts, num); ++ mr->stk_queued_num -= num; ++ } ++} ++ ++static void sock_mbox_private_free(struct mbox_ring *mr) ++{ ++ struct rpc_msg *msg = (struct rpc_msg *)mr->private_data; ++ if (msg != NULL) { ++ rpc_msg_free(msg); ++ mr->private_data = NULL; ++ } ++} ++ ++static int sock_mbox_private_init(sys_mbox_t mb, rpc_func_t func) ++{ ++ struct rpc_msg *msg = rpc_msg_alloc(get_protocol_stack()->stack_idx, func); ++ if (msg == NULL) ++ return -1; ++ ++ memset_s(msg->args, sizeof(msg->args), 0, sizeof(msg->args)); ++ ++ mb->mring.private_data = msg; ++ mb->mring.private_data_free_fn = sock_mbox_private_free; ++ return 0; ++} ++ ++static inline struct rpc_msg *sock_mbox_private_get(sys_mbox_t mb) ++{ ++ return (struct rpc_msg *)mb->mring.private_data; ++} ++ ++ ++static inline uint16_t write_pbuf(struct pbuf *p, const char *data, uint16_t len, uint8_t optlen) ++{ ++ mem_init_pbuf(p, PBUF_TRANSPORT, len, len, PBUF_POOL); ++ if (optlen > 0) { ++ /* see pbuf_remove_header() */ ++ p->payload = (uint8_t *)p->payload + optlen; ++ } ++ ++ if (get_protocol_stack_group()->latency_start) ++ time_stamp_into_write(&p, 1); ++ ++ pbuf_take(p, data, len); ++ return len; ++} ++ ++static inline void write_pbuf_bulk(struct pbuf *pbuf_pkts[], unsigned n, uint16_t payload_size, ++ const char *data, uint16_t len, uint8_t optlen) ++{ ++ unsigned i; ++ uint16_t copied_total = 0; ++ ++ for (i = 0; i < (n & ~0x3); i += 4) { ++ rte_prefetch0(pbuf_pkts[i + 1]); ++ rte_prefetch0(data + copied_total + payload_size); ++ copied_total += write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ ++ rte_prefetch0(pbuf_pkts[i + 2]); ++ rte_prefetch0(data + copied_total + payload_size); ++ copied_total += write_pbuf(pbuf_pkts[i + 1], data + copied_total, payload_size, optlen); ++ ++ rte_prefetch0(pbuf_pkts[i + 3]); ++ rte_prefetch0(data + copied_total + payload_size); ++ copied_total += write_pbuf(pbuf_pkts[i + 2], data + copied_total, payload_size, optlen); ++ ++ if (payload_size > len - copied_total) ++ payload_size = len - copied_total; ++ copied_total += write_pbuf(pbuf_pkts[i + 3], data + copied_total, payload_size, optlen); ++ } ++ switch (n & 0x3) { ++ case 3: ++ rte_prefetch0(pbuf_pkts[i + 1]); ++ copied_total += write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ ++i; /* fallthrough */ ++ case 2: ++ rte_prefetch0(pbuf_pkts[i + 1]); ++ copied_total += write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ ++i; /* fallthrough */ ++ case 1: ++ payload_size = len - copied_total; ++ write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ /* fallthrough */ ++ } ++} ++ ++static inline void write_pbuf_list(struct pbuf *pbuf_pkts[], unsigned n, uint16_t payload_size, ++ const char *data, uint16_t len, uint8_t 
optlen) ++{ ++ unsigned i; ++ uint16_t copied_total = 0; ++ ++ for (i = 0; i < n - 1; ++i) { ++ rte_prefetch0(pbuf_pkts[i + 1]); ++ rte_prefetch0(data + copied_total + payload_size); ++ write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ pbuf_pkts[i]->next = pbuf_pkts[i + 1]; ++ pbuf_pkts[i]->tot_len = len - copied_total; ++ copied_total += payload_size; ++ } ++ ++ payload_size = len - copied_total; ++ write_pbuf(pbuf_pkts[i], data + copied_total, payload_size, optlen); ++ pbuf_pkts[i]->next = NULL; ++} ++ ++static uint16_t stack_udp_write_one(const struct lwip_sock *sock, struct mbox_ring *mr, ++ const char *data, uint16_t len, int flags, ++ const struct sockaddr *to, socklen_t tolen) ++{ ++ struct pbuf **extcache_list = (struct pbuf **)&sock->conn->recvmbox->mring.st_obj; ++ struct pbuf *p; ++ struct netbuf *nbuf; ++ ++ p = mem_extcache_get_pbuf(sock->stack_id, true, extcache_list); ++ if (p == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "mem_extcache_get_pbuf failed\n"); ++ set_errno(ENOMEM); ++ return 0; ++ } ++ ++ write_pbuf(p, data, len, 0); ++ ++ nbuf = netbuf_create(p); ++ lwip_sendto_netbuf(sock->conn, nbuf, to, tolen); ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_WRITE_INTO_RING, 0); ++ ++ mr->ops->enqueue_burst(mr, (void **)&nbuf, 1); ++ mr->app_free_count -= 1; ++ ++ return len; ++} ++ ++static uint16_t stack_udp_write_bulk(const struct lwip_sock *sock, struct mbox_ring *mr, ++ const char *data, uint16_t len, int flags, ++ const struct sockaddr *to, socklen_t tolen) ++{ ++ struct pbuf *pbuf_pkts[UDP_SND_QUEUELEN_MAX]; ++ unsigned pbuf_num; ++ struct netbuf *nbuf; ++ uint16_t payload_size; ++ uint8_t optlen; ++ uint16_t copied_total = 0; ++ ++ if (NETCONNTYPE_ISIPV6(netconn_type(sock->conn))) { ++ optlen = IP6_FRAG_HLEN; ++ payload_size = UDP_MSS - IP6_FRAG_HLEN; ++ } else { ++ optlen = 0; ++ payload_size = UDP_MSS; ++ } ++ ++ /* step1. udp append data */ ++ nbuf = (struct netbuf *)mr->ops->pop_tail(mr, NULL); ++ if (nbuf != NULL) { ++ copied_total = LWIP_MIN(len, payload_size - nbuf->tail->len); ++ pbuf_append_take(nbuf->p, nbuf->tail, data, copied_total, NULL); ++ len -= copied_total; ++ } ++ ++ /* step2. alloc a batch of pbufs */ ++ if (len > 0) { ++ struct pbuf **extcache_list = (struct pbuf **)&sock->conn->recvmbox->mring.st_obj; ++ pbuf_num = (len + payload_size - 1) / payload_size; ++ pbuf_num = mem_extcache_get_pbuf_bulk(sock->stack_id, pbuf_pkts, pbuf_num, true, extcache_list); ++ if (pbuf_num == 0) { ++ /* drop netbuf */ ++ if (nbuf != NULL) { ++ netbuf_free(nbuf); ++ } ++ LSTACK_LOG(ERR, LSTACK, "mem_extcache_get_pbuf_bulk failed, pbuf_num %u\n", pbuf_num); ++ set_errno(ENOMEM); ++ return 0; ++ } ++ ++ write_pbuf_list(pbuf_pkts, pbuf_num, payload_size, data + copied_total, len, optlen); ++ copied_total += len; ++ ++ if (nbuf == NULL) { ++ nbuf = netbuf_create(pbuf_pkts[0]); ++ lwip_sendto_netbuf(sock->conn, nbuf, to, tolen); ++ } else { ++ pbuf_cat(nbuf->p, pbuf_pkts[0]); ++ } ++ nbuf->tail = pbuf_pkts[pbuf_num - 1]; ++ } ++ ++ /* step3. 
enqueue the new netbuf */ ++ if ((flags & MSG_MORE) == 0) { ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_WRITE_INTO_RING, 0); ++ ++ mr->ops->enqueue_burst(mr, (void **)&nbuf, 1); ++ mr->app_free_count -= 1; ++ } else { ++ mr->ops->push_tail(mr, nbuf); ++ } ++ ++ return copied_total; ++} ++ ++static ssize_t stack_udp_write(struct lwip_sock *sock, const void *data, size_t len, int flags, ++ const struct sockaddr *to, socklen_t tolen) ++{ ++ struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ uint16_t copied_total; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, data=%p, size=%"SZT_F", flags=0x%x)\n", ++ __FUNCTION__, sock, data, len, flags)); ++ ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ set_errno(ENOTCONN); ++ return -1; ++ } ++ ++ if (unlikely(len > UDP_SND_SIZE_MAX(sock->conn))) { ++ LSTACK_LOG(ERR, LSTACK, "Message too long\n"); ++ set_errno(EMSGSIZE); ++ return -1; ++ } ++ ++ if (unlikely(mr->app_free_count < 1)) { ++ mr->app_free_count = mr->ops->free_count(mr); ++ if (unlikely(mr->app_free_count < 1)) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ set_errno(EWOULDBLOCK); ++ return -1; ++ } ++ } ++ ++ if (len <= UDP_MSS && (flags & MSG_MORE) == 0) { ++ copied_total = stack_udp_write_one(sock, mr, data, len, flags, to, tolen); ++ } else { ++ copied_total = stack_udp_write_bulk(sock, mr, data, len, flags, to, tolen); ++ } ++ ++ return copied_total > 0 ? copied_total : -1; ++} ++ ++static ssize_t stack_udp_output(struct netconn *conn, bool *output_again, struct mem_thread *mt) ++{ ++ struct mbox_ring *mr = &conn->sendmbox->mring; ++ err_t err; ++ struct netbuf *nbuf_pkts[UDP_SND_OUTPUT_NUM]; ++ unsigned nbuf_num = 0; ++ unsigned pbuf_num = 0; ++ size_t send_total = 0; ++ size_t send_len; ++ ++ *output_again = false; ++ ++ nbuf_num = mr->ops->dequeue_burst(mr, (void **)nbuf_pkts, UDP_SND_OUTPUT_NUM); ++ if (unlikely(nbuf_num == 0)) { ++ return 0; ++ } ++ if (unlikely(nbuf_num == UDP_SND_OUTPUT_NUM) && ++ mr->ops->count(mr) > 0) { ++ *output_again = true; ++ } ++ ++ for (unsigned i = 0; i < nbuf_num; ++i) { ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(conn=%p, fd=%d, nbuf_pkts[%d]=%p {.p=%p, .tot_len=%u})\n", ++ __FUNCTION__, conn, conn->callback_arg.socket, ++ i, nbuf_pkts[i], nbuf_pkts[i]->p, nbuf_pkts[i]->p->tot_len)); ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(get_protocol_stack()->stack_idx, &(nbuf_pkts[i]->p), 1, GAZELLE_LATENCY_WRITE_LWIP, 0); ++ ++ if (mt != NULL) { ++ pbuf_num += pbuf_clen(nbuf_pkts[i]->p); ++ } ++ ++ /* ip4_frag/ip6_frag would: ++ * 1. split pbuf list and modify tot_len. ++ * 2. free node of pbuf list, except for the pbuf head. ++ */ ++ send_len = nbuf_pkts[i]->p->tot_len; ++ /* This would add header 'UDP_HLEN' ! */ ++ err = netconn_send(conn, nbuf_pkts[i]); ++ if (err != ERR_OK) { ++ LSTACK_LOG(ERR, LSTACK, "netconn_send failed, err %d\n", err); ++ break; ++ } ++ send_total += send_len; ++ } ++ for (unsigned i = 0; i < nbuf_num; ++i) { ++ netbuf_free(nbuf_pkts[i]); ++ } ++ ++ if (mt != NULL) { ++ mem_mbuf_migrate_enqueue(mt, pbuf_num); ++ } ++ ++ return (err == ERR_OK ? 
send_total : -1); ++} ++ ++static void callback_udp_send(struct rpc_msg *msg) ++{ ++ struct protocol_stack *stack = get_protocol_stack(); ++ struct lwip_sock *sock = msg->args[MSG_ARG_0].p; ++ struct mem_thread *mt = msg->args[MSG_ARG_1].p; ++ bool output_again; ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_sock_latency(sock, GAZELLE_LATENCY_WRITE_RPC_MSG); ++ ++ msg->result = stack_udp_output(sock->conn, &output_again, mt); ++ if (output_again) { ++ rpc_async_call(&stack->rpc_queue, msg, RPC_MSG_REUSE | RPC_MSG_RECALL); ++ } ++ ++ return; ++} ++ ++static inline int rpc_call_udp_send(rpc_queue *queue, struct lwip_sock *sock) ++{ ++ struct rpc_msg *msg; ++ ++ if (get_protocol_stack_group()->latency_start) ++ time_stamp_into_rpcmsg(sock); ++ ++ msg = sock_mbox_private_get(sock->conn->sendmbox); ++ msg->args[MSG_ARG_0].p = sock; ++ msg->args[MSG_ARG_1].p = mem_thread_migrate_get(sock->stack_id); ++ ++ rpc_async_call(queue, msg, RPC_MSG_REUSE); ++ return 0; ++} ++ ++static void rtw_stack_udp_send(struct lwip_sock *sock) ++{ ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ rpc_call_udp_send(&stack->rpc_queue, sock); ++} ++ ++static void rtc_stack_udp_send(struct lwip_sock *sock) ++{ ++ bool output_again; ++ do { ++ stack_udp_output(sock->conn, &output_again, NULL); ++ } while (output_again); ++} ++ ++static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, int flags) ++{ ++ struct mbox_ring *mr = &sock->conn->recvmbox->mring; ++ struct pbuf **extcache_list; ++ struct netbuf *nbuf; ++ err_t err = ERR_OK; ++ uint16_t copied_total = 0; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, msg=%p, flags=0x%x)\n", ++ __FUNCTION__, sock, msg, flags)); ++ ++ if (mr->ops->recv_start_burst(mr, (void **)&nbuf, 1) == 0) { ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ err = ERR_CONN; ++ } else { ++ err = ERR_WOULDBLOCK; ++ } ++ goto out; ++ } ++ if (unlikely(lwip_netconn_is_err_msg(nbuf, &err))) { ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, 0); ++ goto out; ++ } ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_READ_APP_CALL, sys_now_us()); ++ ++ /* pass MSG_PEEK so lwip does not free the netbuf internally */ ++ sock->lastdata.netbuf = nbuf; ++ err = lwip_recvfrom_udp_raw(sock, flags | MSG_PEEK, msg, &copied_total, 0); ++ sock->lastdata.netbuf = NULL; ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_READ_LSTACK, 0); ++ ++ if (mr->flags & MBOX_FLAG_PEEK) { ++ extcache_list = NULL; ++ } else { ++ extcache_list = (struct pbuf **)&mr->st_obj; ++ mem_extcache_put_pbuf(nbuf->p, NULL, extcache_list); ++ } ++ ++ mr->app_recvd_len += copied_total; ++ mr->app_queued_num++; ++ if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || ++ mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { ++ if (extcache_list != NULL) { ++ mem_extcache_flush_pbuf(extcache_list); ++ } ++ mr->ops->recv_finish_burst(mr); ++ mr->app_queued_num = 0; ++ mr->app_recvd_len = 0; ++ } ++ ++ if (err == ERR_OK) { ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, copied_total); ++ return copied_total; ++ } ++out: ++ set_errno(err_to_errno(err)); ++ return -1; ++} ++ ++ ++static uint16_t rtw_stack_tcp_write_one(const struct lwip_sock *sock, struct mbox_ring *mr, ++ const char *data, uint16_t len, int flags) ++{ ++ struct pbuf **extcache_list = (struct pbuf **)&sock->conn->recvmbox->mring.st_obj; ++ struct pbuf *p; 
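++ /* Note: the send path takes pbufs from the per-socket "extended cache",
++  * a singly linked free list parked in recvmbox->mring.st_obj and refilled
++  * by the read path (see mem_extcache_put_pbuf() in lstack_mempool.c
++  * below), before falling back to the per-stack mbuf pool. */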
++ ++ p = mem_extcache_get_pbuf(sock->stack_id, true, extcache_list); ++ if (p == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "mem_extcache_get_pbuf failed\n"); ++ set_errno(ENOMEM); ++ return 0; ++ } ++ ++ write_pbuf(p, data, len, 0); ++ if ((flags & MSG_MORE) == 0) { ++ p->tcp_psh = 1; ++ } ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_WRITE_INTO_RING, 0); ++ ++ mr->ops->enqueue_burst(mr, (void **)&p, 1); ++ mr->app_free_count -= 1; ++ ++ return len; ++} ++ ++static uint16_t rtw_stack_tcp_write_bulk(const struct lwip_sock *sock, struct mbox_ring *mr, ++ const char *data, uint16_t len, int flags) ++{ ++ struct pbuf **extcache_list = (struct pbuf **)&sock->conn->recvmbox->mring.st_obj; ++ unsigned pbuf_num; ++ struct pbuf *pbuf_pkts[TCP_SND_QUEUELEN_MAX]; ++ struct pbuf *tail; ++ ++ pbuf_num = (len + TCP_MSS - 1) / TCP_MSS; ++ pbuf_num = mem_extcache_get_pbuf_bulk(sock->stack_id, pbuf_pkts, pbuf_num, true, extcache_list); ++ if (unlikely(pbuf_num == 0)) { ++ LSTACK_LOG(ERR, LSTACK, "mem_extcache_get_pbuf_bulk failed, pbuf_num %u\n", pbuf_num); ++ set_errno(ENOMEM); ++ return 0; ++ } ++ ++ write_pbuf_bulk(pbuf_pkts, pbuf_num, TCP_MSS, data, len, 0); ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, pbuf_pkts, pbuf_num, GAZELLE_LATENCY_WRITE_INTO_RING, 0); ++ ++ tail = pbuf_pkts[pbuf_num - 1]; ++ if ((flags & MSG_MORE) == 0) { ++ tail->tcp_psh = 1; ++ } ++ ++ mr->app_tail_left = TCP_MSS - tail->tot_len; ++ if (mr->app_tail_left > TCP_SND_APPEND_LEN) { ++ pbuf_num--; ++ } else { ++ mr->app_tail_left = 0; ++ tail = NULL; ++ } ++ ++ /* must first enqueue before push_tail !!! */ ++ mr->app_free_count -= pbuf_num; ++ mr->ops->enqueue_burst(mr, (void **)pbuf_pkts, pbuf_num); ++ if (tail != NULL) { ++ mr->ops->push_tail(mr, tail); ++ } ++ ++ return len; ++} ++ ++static inline bool tcp_seg_need_append(uint16_t oversize_left, uint16_t payload_size, uint16_t data_len, int flags) ++{ ++ if (flags & MSG_MORE) { ++ return true; ++ } ++ /* Avoid splitting once write len into 3 segs. 
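++  * E.g. appending a 1400-byte write to 100 bytes of tail room would smear
++  * it as 100 + 1300 over two pbufs; so only append when the write's
++  * remainder after whole payload_size chunks fits the tail room, or when
++  * MSG_MORE promises more data anyway.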
*/ ++ if ((data_len % payload_size) <= oversize_left) ++ return true; ++ return false; ++} ++static uint16_t rtw_stack_tcp_append(struct mbox_ring *mr, const char *data, uint16_t len, int flags) ++{ ++ struct pbuf *p; ++ bool need_append; ++ uint16_t buf_copy_len; ++ ++ if (mr->app_tail_left == 0) { ++ return 0; ++ } ++ ++ buf_copy_len = 0; ++ p = (struct pbuf *)mr->ops->pop_tail(mr, NULL); ++ if (p != NULL) { ++ need_append = tcp_seg_need_append(mr->app_tail_left, TCP_MSS, len, flags); ++ if (need_append) { ++ buf_copy_len = LWIP_MIN(len, mr->app_tail_left); ++ pbuf_append_take(p, p, data, buf_copy_len, NULL); ++ } ++ mr->ops->enqueue_burst(mr, (void **)&p, 1); ++ mr->app_free_count -= 1; ++ } ++ ++ mr->app_tail_left = 0; ++ ++ return buf_copy_len; ++} ++ ++static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, size_t len, int flags) ++{ ++ struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ uint16_t buf_copy_len; ++ uint32_t total_copy_len = (uint32_t)len; ++ uint32_t copied_total = 0; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, data=%p, size=%"SZT_F", flags=0x%x)\n", ++ __FUNCTION__, sock, data, len, flags)); ++ ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ set_errno(ENOTCONN); ++ return -1; ++ } ++ ++ if (unlikely(mr->app_free_count < 2)) { ++ mr->app_free_count = mr->ops->free_count(mr); ++ if (unlikely(mr->app_free_count < 2)) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ set_errno(EWOULDBLOCK); ++ return -1; ++ } ++ } ++ ++ copied_total = rtw_stack_tcp_append(mr, data, LWIP_MIN(TCP_MSS, total_copy_len), flags); ++ if (copied_total == total_copy_len) { ++ return copied_total; ++ } ++ ++ if (total_copy_len <= TCP_MSS) { ++ /* write one pbuf */ ++ copied_total += rtw_stack_tcp_write_one(sock, mr, data + copied_total, total_copy_len, flags); ++ } else { ++ if (total_copy_len > mr->app_free_count * TCP_MSS) { ++ mr->app_free_count = mr->ops->free_count(mr); ++ if (unlikely(mr->app_free_count < 2)) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ set_errno(EWOULDBLOCK); ++ goto out; ++ } ++ if (total_copy_len > mr->app_free_count * TCP_MSS) { ++ total_copy_len = mr->app_free_count * TCP_MSS; ++ } ++ } ++ /* write bulk pbuf */ ++ while (total_copy_len > 0) { ++ buf_copy_len = LWIP_MIN(total_copy_len, TCP_SND_SIZE_MAX); ++ buf_copy_len = rtw_stack_tcp_write_bulk(sock, mr, data + copied_total, buf_copy_len, flags); ++ if (unlikely(buf_copy_len == 0)) { ++ goto out; ++ } ++ copied_total += buf_copy_len; ++ total_copy_len -= buf_copy_len; ++ } ++ } ++ ++out: ++ return copied_total > 0 ? copied_total : -1; ++} ++ ++static struct pbuf *rtw_tcp_output_pop_tail(struct mbox_ring *mr) ++{ ++ void *tail; ++ ++ tail = mr->ops->read_tail(mr); ++ if (tail == NULL) ++ return NULL; ++ ++ if (mr->ops->count(mr) > 0) ++ return NULL; ++ return mr->ops->pop_tail(mr, tail); ++} ++ ++static uint16_t rtw_stack_tcp_output(struct netconn *conn, bool *output_again, struct mem_thread *mt) ++{ ++ struct mbox_ring *mr = &conn->sendmbox->mring; ++ struct pbuf *pbuf_pkts[TCP_SND_OUTPUT_NUM]; ++ uint16_t pbuf_num; ++ ++ *output_again = false; ++ ++ /* must first dequeue before pop_tail !!! 
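++  * The app side enqueues full segs before push_tail() (see
++  * rtw_stack_tcp_write_bulk() above); draining the ring before popping the
++  * staged tail keeps the tail logically behind everything already queued,
++  * so the byte stream cannot be reordered.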
*/ ++ pbuf_num = mr->ops->dequeue_burst(mr, (void **)pbuf_pkts, TCP_SND_OUTPUT_NUM); ++ ++ if (pbuf_num < TCP_SND_OUTPUT_NUM) { ++ if (pbuf_num == 0 || pbuf_pkts[pbuf_num - 1]->len == TCP_MSS) { ++ pbuf_pkts[pbuf_num] = rtw_tcp_output_pop_tail(mr); ++ if (pbuf_pkts[pbuf_num] != NULL) { ++ pbuf_num++; ++ } ++ } ++ ++ if (unlikely(pbuf_num == 0)) { ++ return 0; ++ } ++ } else { ++ if (mr->ops->count(mr) > 0 || mr->ops->read_tail(mr) != NULL) { ++ *output_again = true; ++ } ++ } ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(conn=%p, fd=%d, pbuf_num=%u)\n", ++ __FUNCTION__, conn, conn->callback_arg.socket, pbuf_num)); ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(get_protocol_stack()->stack_idx, pbuf_pkts, pbuf_num, GAZELLE_LATENCY_WRITE_LWIP, 0); ++ ++ if (mt != NULL) { ++ mem_mbuf_migrate_enqueue(mt, pbuf_num); ++ } ++ ++ if (pbuf_num > 1) { ++ lwip_tcp_tso_merge_seg(conn->pcb.tcp, pbuf_pkts, &pbuf_num); ++ } ++ return lwip_tcp_prepare_seg(conn->pcb.tcp, pbuf_pkts, pbuf_num); ++} ++ ++ ++static uint32_t pbuf_copy_and_free(struct pbuf **left_pbuf, struct pbuf **extcache_list, char *data, uint32_t len) ++{ ++ struct pbuf *q, *t; ++ uint16_t buf_copy_len; ++ uint32_t copied_total = 0; ++ ++ q = *left_pbuf; ++ t = NULL; ++ while (copied_total < len && q != NULL) { ++ buf_copy_len = LWIP_MIN(q->len, len - copied_total); ++ ++ if (buf_copy_len > 0) { ++ MEMCPY(data + copied_total, q->payload, buf_copy_len); ++ copied_total += buf_copy_len; ++ ++ if (buf_copy_len < q->len) { ++ pbuf_remove_header(q, buf_copy_len); ++ break; ++ } else { ++ q->tot_len = q->len = 0; ++ } ++ } ++ ++ t = q; ++ q = q->next; ++ } ++ ++ if (t != NULL && extcache_list != NULL) { ++ t->next = NULL; ++ mem_extcache_put_pbuf(*left_pbuf, t, extcache_list); ++ } ++ *left_pbuf = q; ++ ++ return copied_total; ++} ++ ++static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, int flags, ++ struct sockaddr *from, socklen_t *fromlen) ++{ ++ struct mbox_ring *mr = &sock->conn->recvmbox->mring; ++ struct pbuf **extcache_list; ++ err_t err = ERR_OK; ++ struct pbuf *p; ++ ++ uint32_t buf_copy_len; ++ uint32_t total_copy_len = len; ++ uint32_t copied_total = 0; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, data=%p, size=%"SZT_F", flags=0x%x)\n", ++ __FUNCTION__, sock, data, len, flags)); ++ ++ if (mr->flags & MBOX_FLAG_PEEK) { ++ extcache_list = NULL; ++ } else { ++ extcache_list = (struct pbuf **)&mr->st_obj; ++ } ++ ++ if (sock->lastdata.pbuf != NULL) { ++ // TODO: support MSG_PEEK ++ buf_copy_len = pbuf_copy_and_free(&sock->lastdata.pbuf, extcache_list, data, total_copy_len); ++ copied_total += buf_copy_len; ++ total_copy_len -= buf_copy_len; ++ mr->app_recvd_len += buf_copy_len; ++ } ++ ++ while (total_copy_len > 0) { ++ if (mr->ops->recv_start_burst(mr, (void **)&p, 1) == 0) { ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ err = ERR_CONN; ++ } else { ++ err = ERR_WOULDBLOCK; ++ } ++ break; ++ } ++ mr->app_queued_num++; ++ if (unlikely(lwip_netconn_is_err_msg(p, &err))) { ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, copied_total); ++ break; ++ } ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_READ_APP_CALL, sys_now_us()); ++ ++ sock->lastdata.pbuf = p; ++ // TODO: support MSG_PEEK ++ buf_copy_len = pbuf_copy_and_free(&sock->lastdata.pbuf, extcache_list, data + copied_total, total_copy_len); ++ copied_total += buf_copy_len; ++ total_copy_len -= buf_copy_len; ++ 
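++ /* Note: consumption is batched; recv_finish_burst() is deferred until
++  * RECV_EXTEND_CACHE_MAX pbufs or RECV_EXTEND_CACHE_LEN bytes have been
++  * consumed, presumably so the ring's consumer index is published less
++  * often. */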
mr->app_recvd_len += buf_copy_len; ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_READ_LSTACK, 0); ++ ++ if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || ++ mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { ++ if (sock->lastdata.pbuf == NULL) { ++ mr->ops->recv_finish_burst(mr); ++ mr->app_queued_num = 0; ++ } ++ } ++ } ++ ++ if (mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { ++ if (extcache_list != NULL) { ++ mem_extcache_flush_pbuf(extcache_list); ++ } ++ mr->app_recvd_len = 0; ++ } ++ ++ lwip_tcp_recv_from(sock->conn, from, fromlen, copied_total); ++ ++ if (copied_total > 0) { ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, copied_total); ++ return copied_total; ++ } ++ ++ set_errno(err_to_errno(err)); ++ if (err == ERR_CLSD) { ++ return 0; ++ } ++ return -1; ++} ++ ++ ++#define RECVD_UNCOMMITTED(msg) ((msg)->args[MSG_ARG_2].ul) ++#define RECVD_CURR_SEQ(msg) ((msg)->args[MSG_ARG_3].ul) ++#define RECVD_LAST_SEQ(msg) ((msg)->args[MSG_ARG_4].ul) ++ ++static inline bool rpc_commit_tcp_recvd(struct rpc_msg *recvmsg, unsigned long threshold) ++{ ++ if (RECVD_UNCOMMITTED(recvmsg) >= threshold) { ++ __atomic_add_fetch(&RECVD_CURR_SEQ(recvmsg), RECVD_UNCOMMITTED(recvmsg), __ATOMIC_RELEASE); ++ RECVD_UNCOMMITTED(recvmsg) = 0; ++ return true; ++ } ++ return false; ++} ++ ++#if TCP_RECV_AND_UPDATE ++static inline unsigned long rpc_read_tcp_recvd(struct rpc_msg *recvmsg) ++{ ++ unsigned long curr_recvd_seq; ++ unsigned long recvd; ++ ++ curr_recvd_seq = __atomic_load_n(&RECVD_CURR_SEQ(recvmsg), __ATOMIC_ACQUIRE); ++ recvd = curr_recvd_seq - RECVD_LAST_SEQ(recvmsg); ++ if (recvd > 0) { ++ /* update last recvd seq */ ++ RECVD_LAST_SEQ(recvmsg) = curr_recvd_seq; ++ } ++ return recvd; ++} ++ ++static void callback_tcp_recvd(struct rpc_msg *recvmsg) ++{ ++ struct lwip_sock *sock = recvmsg->args[MSG_ARG_0].p; ++ unsigned long recvd; ++ ++ recvd = rpc_read_tcp_recvd(recvmsg); ++ lwip_tcp_recvd(sock->conn, recvd, 0); ++ ++ recvmsg->result = recvd; ++ return; ++} ++#endif /* TCP_RECV_AND_UPDATE */ ++ ++static inline int rpc_call_tcp_recvd(rpc_queue *queue, struct lwip_sock *sock, size_t recvd, int flags) ++{ ++ struct rpc_msg *recvmsg; ++ ++ recvmsg = sock_mbox_private_get(sock->conn->recvmbox); ++ recvmsg->args[MSG_ARG_0].p = sock; ++ recvmsg->result = 0; ++ ++ RECVD_UNCOMMITTED(recvmsg) += recvd; ++ if (rpc_commit_tcp_recvd(recvmsg, TCP_WND_UPDATE_THRESHOLD << 1)) { ++ rpc_async_call(queue, recvmsg, RPC_MSG_REUSE); ++ } ++ ++ return 0; ++} ++ ++static void rtw_stack_tcp_recvd(struct lwip_sock *sock, ssize_t recvd, int flags) ++{ ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ ++ if (recvd <= 0 || flags & MSG_PEEK) { ++ return; ++ } ++ rpc_call_tcp_recvd(&stack->rpc_queue, sock, recvd, flags); ++} ++ ++static void rtc_stack_tcp_recvd(struct lwip_sock *sock, ssize_t recvd, int flags) ++{ ++ if (recvd <= 0 || flags & MSG_PEEK) { ++ return; ++ } ++ lwip_tcp_recvd(sock->conn, recvd, flags); ++} ++ ++static void callback_tcp_send(struct rpc_msg *sendmsg) ++{ ++ struct protocol_stack *stack = get_protocol_stack(); ++ struct lwip_sock *sock = sendmsg->args[MSG_ARG_0].p; ++ struct mem_thread *mt = sendmsg->args[MSG_ARG_1].p; ++ bool output_again; ++ err_t err; ++ ++ if (unlikely(sock->conn->pcb.tcp == NULL)) ++ return; ++ ++ if (get_protocol_stack_group()->latency_start) ++ calculate_sock_latency(sock, GAZELLE_LATENCY_WRITE_RPC_MSG); ++ ++ do { ++ if (!lwip_tcp_allow_send(sock->conn->pcb.tcp)) { ++ 
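++ /* lwip cannot take more data right now; re-queue this same msg
++  * (RPC_MSG_REUSE | RPC_MSG_RECALL) so the queued segs are flushed
++  * on a later stack poll. */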
rpc_async_call(&stack->rpc_queue, sendmsg, RPC_MSG_REUSE | RPC_MSG_RECALL); ++ break; ++ } ++ sendmsg->result += rtw_stack_tcp_output(sock->conn, &output_again, mt); ++ } while (output_again); ++ err = tcp_output(sock->conn->pcb.tcp); ++ if (unlikely(err != ERR_OK)) { ++ LSTACK_LOG(ERR, LSTACK, "tcp_output failed, sock %p, err %u\n", sock, err); ++ } ++ ++#if TCP_RECV_AND_UPDATE ++ struct rpc_msg *recvmsg; ++ if (RECVD_UNCOMMITTED(sendmsg)) { ++ RECVD_UNCOMMITTED(sendmsg) = 0; ++ recvmsg = sock_mbox_private_get(sock->conn->recvmbox); ++ callback_tcp_recvd(recvmsg); ++ } ++#endif /* TCP_RECV_AND_UPDATE */ ++ ++ return; ++} ++ ++static inline int rpc_call_tcp_send(rpc_queue *queue, struct lwip_sock *sock) ++{ ++ struct rpc_msg *sendmsg; ++ ++ if (get_protocol_stack_group()->latency_start) ++ time_stamp_into_rpcmsg(sock); ++ ++ sendmsg = sock_mbox_private_get(sock->conn->sendmbox); ++ sendmsg->result = 0; ++ sendmsg->args[MSG_ARG_0].p = sock; ++ sendmsg->args[MSG_ARG_1].p = mem_thread_migrate_get(sock->stack_id); ++ ++#if TCP_RECV_AND_UPDATE ++ struct rpc_msg *recvmsg; ++ recvmsg = sock_mbox_private_get(sock->conn->recvmbox); ++ RECVD_UNCOMMITTED(sendmsg) = rpc_commit_tcp_recvd(recvmsg, TCP_WND_UPDATE_THRESHOLD); ++#endif /* TCP_RECV_AND_UPDATE */ ++ ++ rpc_async_call(queue, sendmsg, RPC_MSG_REUSE); ++ return 0; ++} ++ ++static void rtw_stack_tcp_send(struct lwip_sock *sock) ++{ ++ struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); ++ rpc_call_tcp_send(&stack->rpc_queue, sock); ++} ++ ++ ++static ssize_t rtc_stack_tcp_write(struct lwip_sock *sock, const char *data, size_t len, int flags) ++{ ++ struct tcp_pcb *pcb = sock->conn->pcb.tcp; ++ err_t err = ERR_OK; ++ int write_flags, write_more; ++ ++ uint16_t buf_copy_len; ++ uint32_t total_copy_len; ++ uint32_t copied_total = 0; ++ ++ write_more = TCP_WRITE_FLAG_MORE; ++ write_flags = NETCONN_COPY | ++ ((flags & MSG_MORE) ? NETCONN_MORE : 0) | ++ ((flags & MSG_DONTWAIT) ? 
NETCONN_DONTBLOCK : 0); ++ ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ set_errno(ENOTCONN); ++ return -1; ++ } ++ ++ total_copy_len = LWIP_MIN((uint32_t)len, (uint32_t)pcb->snd_buf); ++ if (unlikely(total_copy_len == 0)) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ set_errno(EWOULDBLOCK); ++ return -1; ++ } ++ ++ while (total_copy_len > 0) { ++ if (total_copy_len <= TCP_SND_SIZE_MAX) { ++ buf_copy_len = total_copy_len; ++ write_more = 0; ++ } else { ++ buf_copy_len = TCP_SND_SIZE_MAX; ++ } ++ ++ err = tcp_write(pcb, data + copied_total, buf_copy_len, write_flags | write_more); ++ if (err != ERR_OK) { ++ LSTACK_LOG(ERR, LSTACK, "tcp_write failed, errno %d\n", err_to_errno(err)); ++ break; ++ } ++ total_copy_len -= buf_copy_len; ++ copied_total += buf_copy_len; ++ } ++ ++ if (copied_total > 0) { ++ return copied_total; ++ } ++ set_errno(err_to_errno(err)); ++ return -1; ++} ++ ++static void rtc_stack_tcp_send(struct lwip_sock *sock) ++{ ++ tcp_output(sock->conn->pcb.tcp); ++} ++ ++ ++ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, ++ struct sockaddr *from, socklen_t *fromlen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ssize_t recvd; ++ struct iovec vec; ++ struct msghdr msg; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(%d, mem=%p, size=%"SZT_F", flags=0x%x)\n", ++ __FUNCTION__, fd, mem, len, flags)); ++ ++ if (unlikely(mem == NULL || len <= 0)) { ++ set_errno(EINVAL); ++ return -1; ++ } ++ ++ if (unlikely(!sock->affinity_numa)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = true; ++ } ++ ++ switch (NETCONN_TYPE(sock->conn)) { ++ case NETCONN_TCP: ++ /* TODO: support MSG_WAITALL */ ++ recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); ++ if (recvd < 0 && errno == EWOULDBLOCK) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); ++ } ++ } ++#if TCP_RECV_AND_UPDATE ++ if (recvd > 0) { ++ ioops.stack_tcp_recvd(sock, recvd, flags); ++ } ++#endif /* TCP_RECV_AND_UPDATE */ ++ break; ++ case NETCONN_UDP: ++ vec.iov_base = mem; ++ vec.iov_len = len; ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_flags = 0; ++ msg.msg_iov = &vec; ++ msg.msg_iovlen = 1; ++ msg.msg_name = from; ++ msg.msg_namelen = (fromlen ? 
*fromlen : 0); ++ recvd = ioops.stack_udp_readmsg(sock, &msg, flags); ++ if (recvd < 0 && errno == EWOULDBLOCK) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ recvd = ioops.stack_udp_readmsg(sock, &msg, flags); ++ } ++ } ++ if (recvd > 0 && fromlen != NULL) { ++ *fromlen = msg.msg_namelen; ++ } ++ break; ++ default: ++ recvd = -1; ++ break; ++ } ++ ++ return recvd; ++} ++ ++ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ssize_t ret, recvd = 0; ++ ++ ret = lwip_recvmsg_check(NULL, msg, flags); ++ if (unlikely(ret <= 0)) { ++ return ret; ++ } ++ ++ if (unlikely(!sock->affinity_numa)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = true; ++ } ++ ++ switch (NETCONN_TYPE(sock->conn)) { ++ case NETCONN_TCP: ++ for (int i = 0; i < msg->msg_iovlen; ++i) { ++ ret = sockio_recvfrom(fd, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags, NULL, NULL); ++ if (ret <= 0) { ++ if (recvd == 0) ++ recvd = ret; ++ break; ++ } ++ recvd += ret; ++ } ++ break; ++ case NETCONN_UDP: ++ recvd = ioops.stack_udp_readmsg(sock, msg, flags); ++ if (recvd < 0 && errno == EWOULDBLOCK) { ++ if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ recvd = ioops.stack_udp_readmsg(sock, msg, flags); ++ } ++ } ++ break; ++ default: ++ recvd = -1; ++ break; ++ } ++ ++ return recvd; ++} ++ ++ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, ++ const struct sockaddr *to, socklen_t tolen) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ssize_t ret; ++ ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(%d, mem=%p, size=%"SZT_F", flags=0x%x)\n", ++ __FUNCTION__, fd, mem, len, flags)); ++ ++ if (unlikely(mem == NULL || len <= 0)) { ++ set_errno(EINVAL); ++ return -1; ++ } ++ ++ if (unlikely(!sock->affinity_numa)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = true; ++ } ++ ++ switch (NETCONN_TYPE(sock->conn)) { ++ case NETCONN_TCP: ++ ret = ioops.stack_tcp_write(sock, mem, len, flags); ++ if (ret < 0) { ++ if (errno == EWOULDBLOCK) { ++ sock_event_wait(sock, true); ++ } ++ } else { ++ ioops.stack_tcp_send(sock); ++ } ++ break; ++ case NETCONN_UDP: ++ ret = ioops.stack_udp_write(sock, mem, len, flags, to, tolen); ++ if (ret < 0) { ++ if (errno == EWOULDBLOCK) { ++ sock_event_wait(sock, true); ++ } ++ } else { ++ ioops.stack_udp_send(sock); ++ } ++ break; ++ default: ++ ret = -1; ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ ssize_t ret = -1; ++ size_t written = 0; ++ int write_more = MSG_MORE; ++ int i; ++ ++ ret = lwip_sendmsg_check(sock, msg, flags); ++ if (unlikely(ret <= 0)) { ++ return ret; ++ } ++ ++ if (unlikely(!sock->affinity_numa)) { ++ thread_bind_stack(sock->stack_id); ++ sock->affinity_numa = true; ++ } ++ ++ switch (NETCONN_TYPE(sock->conn)) { ++ case NETCONN_TCP: ++ for (i = 0; i < msg->msg_iovlen; ++i) { ++ if (i == msg->msg_iovlen - 1) { ++ write_more = 0; ++ } ++ ret = ioops.stack_tcp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more); ++ if (ret < 0) { ++ if (errno == EWOULDBLOCK) { ++ sock_event_wait(sock, true); ++ } ++ break; ++ } ++ written += ret; ++ } ++ if (written > 0) { ++ ioops.stack_tcp_send(sock); ++ } ++ break; ++ case NETCONN_UDP: ++ for (i = 0; i < msg->msg_iovlen; ++i) { ++ if (i == msg->msg_iovlen - 1) { ++ write_more = 0; ++ } ++ ret = 
ioops.stack_udp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more, NULL, 0); ++ if (ret < 0) { ++ if (errno == EWOULDBLOCK) { ++ sock_event_wait(sock, true); ++ } ++ break; ++ } ++ written += ret; ++ } ++ if (written > 0) { ++ ioops.stack_udp_send(sock); ++ } ++ break; ++ default: ++ written = -1; ++ break; ++ } ++ ++ return written > 0 ? written : ret; ++} ++ ++ssize_t sockio_read(int fd, void *mem, size_t len) ++{ ++ return sockio_recvfrom(fd, mem, len, 0, NULL, NULL); ++} ++ ++ssize_t sockio_write(int fd, const void *mem, size_t len) ++{ ++ return sockio_sendto(fd, mem, len, 0, NULL, 0); ++} ++ ++ssize_t sockio_recv(int fd, void *mem, size_t len, int flags) ++{ ++ return sockio_recvfrom(fd, mem, len, flags, NULL, NULL); ++} ++ ++ssize_t sockio_send(int fd, const void *mem, size_t len, int flags) ++{ ++ return sockio_sendto(fd, mem, len, flags, NULL, 0); ++} ++ ++ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt) ++{ ++ struct msghdr msg; ++ ++ msg.msg_name = NULL; ++ msg.msg_namelen = 0; ++ msg.msg_iov = LWIP_CONST_CAST(struct iovec *, iov); ++ msg.msg_iovlen = iovcnt; ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_flags = 0; ++ ++ return sockio_recvmsg(fd, &msg, 0); ++} ++ ++ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt) ++{ ++ struct msghdr msg; ++ ++ msg.msg_name = NULL; ++ msg.msg_namelen = 0; ++ msg.msg_iov = LWIP_CONST_CAST(struct iovec *, iov); ++ msg.msg_iovlen = iovcnt; ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_flags = 0; ++ ++ return sockio_sendmsg(fd, &msg, 0); ++} ++ ++void sockio_ops_init(void) ++{ ++ struct sockio_ops *ops = &ioops; ++ ++ ops->stack_udp_write = stack_udp_write; ++ ops->stack_udp_readmsg = stack_udp_readmsg; ++ ops->stack_tcp_read = stack_tcp_read; ++ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ ops->stack_udp_send = rtc_stack_udp_send; ++ ops->stack_tcp_write = rtc_stack_tcp_write; ++ ops->stack_tcp_send = rtc_stack_tcp_send; ++ ops->stack_tcp_recvd = rtc_stack_tcp_recvd; ++ } else { ++ ops->stack_udp_send = rtw_stack_udp_send; ++ ops->stack_tcp_write = rtw_stack_tcp_write; ++ ops->stack_tcp_send = rtw_stack_tcp_send; ++ ops->stack_tcp_recvd = rtw_stack_tcp_recvd; ++ } ++} ++ ++static int sockio_mbox_init(struct lwip_sock *sock) ++{ ++ int ret; ++ sys_mbox_t sendmbox = sock->conn->sendmbox; ++ sys_mbox_t recvmbox = sock->conn->recvmbox; ++ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ return 0; ++ } ++ ++ switch (NETCONN_TYPE(sock->conn)) { ++ case NETCONN_TCP: ++ ret = sock_mbox_private_init(sendmbox, callback_tcp_send); ++#if TCP_RECV_AND_UPDATE ++ if (sys_mbox_valid(&recvmbox)) { ++ ret |= sock_mbox_private_init(recvmbox, callback_tcp_recvd); ++ } ++#endif /* TCP_RECV_AND_UPDATE */ ++ break; ++ case NETCONN_UDP: ++ ret = sock_mbox_private_init(sendmbox, callback_udp_send); ++ break; ++ default: ++ ret = 0; ++ } ++ ++ if (ret != 0) { ++ sock_mbox_private_free(&sendmbox->mring); ++ if (sys_mbox_valid(&recvmbox)) { ++ sock_mbox_private_free(&recvmbox->mring); ++ } ++ } ++ return ret; ++} ++ ++bool sockio_mbox_pending(const struct lwip_sock *sock) ++{ ++ struct rpc_msg *msg; ++ ++ if (POSIX_IS_CLOSED(sock)) ++ return false; ++ ++ if (sys_mbox_valid(&sock->conn->sendmbox)) { ++ msg = sock_mbox_private_get(sock->conn->sendmbox); ++ if (msg != NULL && !lockless_queue_node_is_poped(&msg->queue_node)) { ++ return true; ++ } ++ } ++ if (sys_mbox_valid(&sock->conn->recvmbox)) { ++ msg = sock_mbox_private_get(sock->conn->recvmbox); ++ if (msg 
!= NULL && !lockless_queue_node_is_poped(&msg->queue_node)) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++int do_lwip_init_sock(int fd) ++{ ++ int ret; ++ struct protocol_stack *stack = get_protocol_stack(); ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock == NULL) { ++ return -1; ++ } ++ ++ sock->type = 0; ++ sock->listen_next = NULL; ++ sock->stack_id = stack->stack_idx; ++ ++ /* RTC affinity by stack_setup_app_thread() */ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ sock->affinity_numa = true; ++ } else { ++ sock->affinity_numa = false; ++ } ++ ++ sock->sk_wait = NULL; ++ ret = sock_event_init(&sock->sk_event); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "sock_event_init failed\n"); ++ return -1; ++ } ++ ++ ret = sockio_mbox_init(sock); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "sockio_mbox_init failed\n"); ++ return -1; ++ } ++ ++ get_protocol_stack_by_id(sock->stack_id)->conn_num++; ++ return 0; ++} ++ ++void do_lwip_clean_sock(int fd) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (POSIX_IS_CLOSED(sock)) { ++ return; ++ } ++ ++ sock_event_free(&sock->sk_event, sock->sk_wait); ++ sock->sk_wait = NULL; ++ ++ sock->listen_next = NULL; ++ ++ get_protocol_stack_by_id(sock->stack_id)->conn_num--; ++ sock->stack_id = -1; ++} +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +new file mode 100644 +index 0000000..8f01f31 +--- /dev/null ++++ b/src/lstack/core/lstack_mempool.c +@@ -0,0 +1,1014 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. 
++*/ ++ ++#include ++ ++#include ++#include ++ ++#include "lstack_mempool.h" ++#include "lstack_log.h" ++#include "lstack_cfg.h" ++#include "common/dpdk_common.h" ++#include "lstack_dpdk.h" ++#include "lstack_protocol_stack.h" ++ ++#define MEM_THREAD_MANAGER_TIMEOUT 1 ++#define MEM_THREAD_MANAGER_MAX 64 ++ ++struct mem_thread_manager { ++ struct list_node mt_work_list; ++ struct list_node mt_free_list; ++ rte_spinlock_t list_lock; ++}; ++ ++struct mem_thread_group { ++ int tid; ++ pthread_t thread; ++ struct list_node mt_node; ++ struct mem_thread mt_array[PROTOCOL_STACK_MAX]; ++}; ++ ++static struct mem_stack g_mem_stack_group[PROTOCOL_STACK_MAX] = {0}; ++static PER_THREAD struct mem_thread_group *g_mem_thread_group = NULL; ++static struct mem_thread_manager g_mem_thread_manager = {0}; ++ ++static __rte_always_inline ++struct mem_stack *mem_stack_get(int stack_id) ++{ ++ return &g_mem_stack_group[stack_id]; ++} ++ ++struct rte_mempool *mem_get_mbuf_pool(int stack_id) ++{ ++ return g_mem_stack_group[stack_id].mbuf_pool; ++} ++ ++struct rte_mempool *mem_get_rpc_pool(int stack_id) ++{ ++ return g_mem_stack_group[stack_id].rpc_pool; ++} ++ ++static void mem_thread_manager_add_work(struct mem_thread_group *mt_group) ++{ ++ rte_spinlock_lock(&g_mem_thread_manager.list_lock); ++ list_add_node(&mt_group->mt_node, &g_mem_thread_manager.mt_work_list); ++ rte_spinlock_unlock(&g_mem_thread_manager.list_lock); ++} ++ ++static inline bool mem_thread_group_exist(const struct mem_thread_group *mt_group) ++{ ++ if (pthread_tryjoin_np(mt_group->thread, NULL) == 0) ++ return false; ++ return true; ++} ++ ++static void mem_thread_group_free(struct mem_thread_group *mt_group) ++{ ++ struct mem_thread *mt; ++ int stack_id; ++ ++ for (stack_id = 0; stack_id < PROTOCOL_STACK_MAX; stack_id++) { ++ mt = &mt_group->mt_array[stack_id]; ++ mem_thread_cache_free(mt); ++ } ++ free(mt_group); ++ return; ++} ++ ++static int mem_thread_group_init(int stack_id) ++{ ++ struct mem_thread *mt; ++ ++ if (rte_lcore_id() < RTE_MAX_LCORE) { ++ LSTACK_LOG(ERR, LSTACK, "tid %d, lcore_id %u is invalid\n", rte_gettid(), rte_lcore_id()); ++ return -1; ++ } ++ ++ if (g_mem_thread_group == NULL) { ++ g_mem_thread_group = (struct mem_thread_group *)calloc(1, sizeof(struct mem_thread_group)); ++ if (g_mem_thread_group == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "alloc mem_thread_group failed, stack_id %d\n", stack_id); ++ return -1; ++ } ++ g_mem_thread_group->tid = rte_gettid(); ++ g_mem_thread_group->thread = pthread_self(); ++ list_init_node(&g_mem_thread_group->mt_node); ++ mem_thread_manager_add_work(g_mem_thread_group); ++ } ++ ++ mt = &g_mem_thread_group->mt_array[stack_id]; ++ if (mem_thread_cache_init(mt) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "mem_thread_cache_init failed, stack_id %d\n", stack_id); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static inline struct mem_thread *mem_thread_group_get(int stack_id) ++{ ++ struct mem_thread *mt; ++ ++ if (likely(g_mem_thread_group != NULL)) { ++ mt = &g_mem_thread_group->mt_array[stack_id]; ++ if (likely(mt->mbuf_cache != NULL)) ++ return mt; ++ } ++ ++ if (mem_thread_group_init(stack_id) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "mem_thread_group_init failed, stack_id %d\n", stack_id); ++ return NULL; ++ } ++ mt = &g_mem_thread_group->mt_array[stack_id]; ++ return mt; ++} ++ ++static void *mem_thread_manager_thread(void *arg) ++{ ++ struct list_node *node, *next; ++ struct mem_thread_group *mt_group; ++ unsigned count = 0; ++ ++ rte_spinlock_init(&g_mem_thread_manager.list_lock); ++ 
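++ /* Reaper loop: every MEM_THREAD_MANAGER_TIMEOUT second(s), move groups
++  * whose owner thread has exited (mem_thread_group_exist() probes this via
++  * pthread_tryjoin_np()) onto mt_free_list, then return their cached
++  * mbufs/rpc_msgs to the pools in mem_thread_group_free(). */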
list_init_head(&g_mem_thread_manager.mt_work_list); ++ list_init_head(&g_mem_thread_manager.mt_free_list); ++ ++ while (true) { ++ sleep(MEM_THREAD_MANAGER_TIMEOUT); ++ count = 0; /* reset the per-pass scan budget */ ++ ++ rte_spinlock_lock(&g_mem_thread_manager.list_lock); ++ ++ list_for_each_node(node, next, &g_mem_thread_manager.mt_free_list) { ++ mt_group = container_of(node, struct mem_thread_group, mt_node); ++ list_del_node(node); ++ mem_thread_group_free(mt_group); ++ } ++ ++ list_for_each_node(node, next, &g_mem_thread_manager.mt_work_list) { ++ count++; ++ if (count > MEM_THREAD_MANAGER_MAX) { ++ /* move list head after the current node, ++ * and start traversing from this node next time */ ++ list_del_node(&g_mem_thread_manager.mt_work_list); ++ list_add_node(&g_mem_thread_manager.mt_work_list, node); ++ break; ++ } ++ ++ mt_group = container_of(node, struct mem_thread_group, mt_node); ++ if (mem_thread_group_exist(mt_group)) { ++ continue; ++ } ++ list_del_node(node); ++ list_add_node(node, &g_mem_thread_manager.mt_free_list); ++ } ++ ++ rte_spinlock_unlock(&g_mem_thread_manager.list_lock); ++ } ++ ++ return NULL; ++} ++ ++int mem_thread_manager_init(void) ++{ ++ return thread_create("gzmempool", 0, mem_thread_manager_thread, NULL); ++} ++ ++static __rte_always_inline ++struct mem_thread *mem_thread_get(int stack_id) ++{ ++ /* stack thread uses mbufpool_cache instead of buf_cache */ ++ if (get_protocol_stack() != NULL) ++ return NULL; ++ ++#if MEMP_DEBUG ++ if (RTE_PER_LCORE(_lcore_id) < RTE_MAX_LCORE) { ++ LWIP_DEBUGF(MEMP_DEBUG | LWIPGZ_LOG_FATAL, ("tid %d has invalid rte_lcore_id %u !\n", ++ rte_gettid(), RTE_PER_LCORE(_lcore_id))); ++ return NULL; ++ } ++#endif /* MEMP_DEBUG */ ++ ++ return mem_thread_group_get(stack_id); ++} ++ ++struct mem_obj_ops { ++ void (*init)(struct rte_mempool *mp, void *arg, void *obj, unsigned obj_idx); ++ unsigned (*get_stack_id)(const void *obj); ++ struct rte_mempool * (*get_pool)(const void *obj); ++}; ++ ++static __rte_always_inline ++void rpc_obj_init(struct rte_mempool *mp, void *arg, void *obj, unsigned obj_idx) ++{ ++ int stack_id = *(int *)arg; ++ struct rpc_msg *msg = obj; ++ msg->stack_id = stack_id; ++} ++ ++static __rte_always_inline ++unsigned rpc_obj_get_stack_id(const void *obj) ++{ ++ return ((const struct rpc_msg *)obj)->stack_id; ++} ++ ++static __rte_always_inline ++struct rte_mempool *rpc_obj_get_pool(const void *obj) ++{ ++ int stack_id = rpc_obj_get_stack_id(obj); ++ return mem_get_rpc_pool(stack_id); ++} ++ ++static __rte_always_inline ++void mbuf_obj_init(struct rte_mempool *mp, void *arg, void *obj, unsigned obj_idx) ++{ ++ int stack_id = *(int *)arg; ++ struct rte_mbuf *mbuf = obj; ++ struct mbuf_private *priv = mbuf_to_private(mbuf); ++ priv->stack_id = stack_id; ++} ++ ++static __rte_always_inline ++unsigned mbuf_obj_get_stack_id(const void *obj) ++{ ++ return mbuf_to_private((const struct rte_mbuf *)obj)->stack_id; ++} ++ ++static __rte_always_inline ++struct rte_mempool *mbuf_obj_get_pool(const void *obj) ++{ ++ int stack_id = mbuf_obj_get_stack_id(obj); ++ return mem_get_mbuf_pool(stack_id); ++} ++ ++static const struct mem_obj_ops rpc_obj_ops = { ++ .init = rpc_obj_init, ++ .get_stack_id = rpc_obj_get_stack_id, ++ .get_pool = rpc_obj_get_pool, ++}; ++ ++static const struct mem_obj_ops mbuf_obj_ops = { ++ .init = mbuf_obj_init, ++ .get_stack_id = mbuf_obj_get_stack_id, ++ .get_pool = mbuf_obj_get_pool, ++}; ++ ++struct mempool_ops { ++ struct rte_mempool *(*create)(const char *name, unsigned n, ++ unsigned cache_size, unsigned priv_size, unsigned data_room_size, 
int socket_id); ++ void (*put_bulk)(struct rte_mempool *pool, void *const *obj_table, unsigned n); ++ unsigned (*get_bulk)(struct rte_mempool *pool, void **obj_table, unsigned n); ++}; ++ ++static __rte_always_inline ++struct rte_mempool *mempool_create(const char *name, unsigned n, ++ unsigned cache_size, unsigned priv_size, unsigned data_room_size, int socket_id) ++{ ++ struct rte_mempool *pool; ++ ++ LSTACK_LOG(INFO, LSTACK, "name %s, n %u, cache_size %u, priv_size %u, data_room_size %u, socket_id %d, ops_name %s\n", ++ name, n, cache_size, priv_size, data_room_size, socket_id, MEMPOOL_OPS_NAME); ++ ++ pool = rte_mempool_create(name, n, data_room_size, cache_size, priv_size, NULL, NULL, NULL, NULL, socket_id, 0); ++ if (pool != NULL) ++ rte_mempool_set_ops_byname(pool, MEMPOOL_OPS_NAME, NULL); ++ return pool; ++} ++ ++static __rte_always_inline ++void mempool_put_bulk(struct rte_mempool *pool, void *const *obj_table, unsigned n) ++{ ++ rte_mempool_put_bulk(pool, obj_table, n); ++} ++ ++static __rte_always_inline ++unsigned mempool_get_bulk(struct rte_mempool *pool, void **obj_table, unsigned n) ++{ ++ return rte_mempool_get_bulk(pool, obj_table, n) != 0 ? 0 : n; ++} ++ ++static __rte_always_inline ++struct rte_mempool *pkgmbuf_create(const char *name, unsigned n, ++ unsigned cache_size, unsigned priv_size, unsigned data_room_size, int socket_id) ++{ ++ LSTACK_LOG(INFO, LSTACK, "name %s, n %u, cache_size %u, priv_size %u, data_room_size %u, socket_id %d, ops_name %s\n", ++ name, n, cache_size, priv_size, data_room_size, socket_id, MEMPOOL_OPS_NAME); ++ ++ return rte_pktmbuf_pool_create_by_ops(name, n, cache_size, priv_size, data_room_size, socket_id, MEMPOOL_OPS_NAME); ++} ++ ++static __rte_always_inline ++void pkgmbuf_put_bulk(struct rte_mempool *pool, void *const *obj_table, unsigned n) ++{ ++ // rte_pktmbuf_free_bulk((struct rte_mbuf **)obj_table, n); ++ rte_mempool_put_bulk(pool, obj_table, n); ++} ++ ++static __rte_always_inline ++unsigned pkgmbuf_get_bulk(struct rte_mempool *pool, void **obj_table, unsigned n) ++{ ++ return rte_pktmbuf_alloc_bulk(pool, (struct rte_mbuf **)obj_table, n) != 0 ? 
0 : n; ++} ++ ++static const struct mempool_ops mem_mp_ops = { ++ .create = mempool_create, ++ .put_bulk = mempool_put_bulk, ++ .get_bulk = mempool_get_bulk, ++}; ++ ++static const struct mempool_ops mbuf_mp_ops = { ++ .create = pkgmbuf_create, ++ .put_bulk = pkgmbuf_put_bulk, ++ .get_bulk = pkgmbuf_get_bulk, ++}; ++ ++ ++static struct rte_mempool *mbuf_pool_create(int stack_id) ++{ ++ struct cfg_params *cfg_params = get_global_cfg_params(); ++ char name[RTE_MEMPOOL_NAMESIZE]; ++ struct rte_mempool *pool; ++ uint32_t total_conn_mbufs, total_nic_mbufs, total_mbufs; ++ uint16_t private_size; ++ ++ total_conn_mbufs = cfg_params->mbuf_count_per_conn * cfg_params->tcp_conn_count; ++ total_nic_mbufs = cfg_params->rxqueue_size + cfg_params->txqueue_size; ++ ++ total_mbufs = (total_conn_mbufs / cfg_params->num_queue) + total_nic_mbufs + MBUFPOOL_RESERVE_NUM; ++ /* limit mbuf max num based on the dpdk capability */ ++ if (total_mbufs > MBUFPOOL_MAX_NUM) { ++ LSTACK_LOG(ERR, LSTACK, "total_mbufs %u out of the dpdk mbuf_pool range\n", total_mbufs); ++ return NULL; ++ } ++ ++ SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%hu", "mbuf_pool", stack_id); ++ private_size = RTE_ALIGN(sizeof(struct mbuf_private) + 24, RTE_CACHE_LINE_SIZE); ++ ++ pool = mbuf_mp_ops.create(name, total_mbufs, MBUFPOOL_CACHE_NUM, private_size, MBUF_DATA_SIZE, rte_socket_id()); ++ if (pool == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "rte_pktmbuf_pool_create %s failed, rte_errno %d\n", name, rte_errno); ++ return NULL; ++ } ++ ++ return pool; ++} ++ ++static struct rte_mempool *rpc_pool_create(int stack_id) ++{ ++ char name [RTE_MEMPOOL_NAMESIZE]; ++ struct rte_mempool *pool; ++ uint32_t total_bufs = get_global_cfg_params()->rpc_msg_max; ++ ++ SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%hu", "rpc_pool", stack_id); ++ ++ pool = mem_mp_ops.create(name, total_bufs, MEMPOOL_CACHE_NUM, 0, sizeof(struct rpc_msg), rte_socket_id()); ++ if (pool == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "rte_mempool_create %s failed, rte_errno %d\n", name, rte_errno); ++ } ++ ++ return pool; ++} ++ ++void mem_stack_pool_free(int stack_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ ++ if (ms->mbuf_pool != NULL) { ++ rte_mempool_free(ms->mbuf_pool); ++ ms->mbuf_pool = NULL; ++ } ++ if (ms->rpc_pool != NULL) { ++ rte_mempool_free(ms->rpc_pool); ++ ms->rpc_pool = NULL; ++ } ++} ++ ++int mem_stack_pool_init(int stack_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ ++ ms->mbuf_pool = mbuf_pool_create(stack_id); ++ if (ms->mbuf_pool == NULL) { ++ return -1; ++ } ++ ++ ms->rpc_pool = rpc_pool_create(stack_id); ++ if (ms->rpc_pool == NULL) { ++ mem_stack_pool_free(stack_id); ++ return -1; ++ } ++ ++ rte_mempool_obj_iter(ms->mbuf_pool, mbuf_obj_ops.init, &stack_id); ++ rte_mempool_obj_iter(ms->rpc_pool, rpc_obj_ops.init, &stack_id); ++ ++ return 0; ++} ++ ++int mem_stack_mpcache_init(int stack_id, unsigned cpu_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ ++ if (ms->mbuf_pool == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "mem_stack_get stack_id %d failed\n", stack_id); ++ return -1; ++ } ++ ++ RTE_PER_LCORE(_lcore_id) = cpu_id; ++ ms->mbuf_mpcache = rte_mempool_default_cache(ms->mbuf_pool, rte_lcore_id()); ++ ms->migrate_watermark = ms->mbuf_mpcache->size / 8; ++ ++ LSTACK_LOG(INFO, LSTACK, "tid %d, stack_id %d, lcore_id %u, migrate_watermark %u\n", ++ rte_gettid(), stack_id, rte_lcore_id(), ms->migrate_watermark); ++ ++ return 0; ++} ++ ++unsigned mem_stack_mbuf_pool_count(int stack_id) ++{ ++ struct mem_stack *ms = 
mem_stack_get(stack_id); ++ return rte_mempool_avail_count(ms->mbuf_pool); ++} ++ ++void mem_thread_cache_free(struct mem_thread *mt) ++{ ++ void *obj; ++ ++ if (mt->mbuf_migrate_ring != NULL) { ++ while (rte_ring_sc_dequeue(mt->mbuf_migrate_ring, &obj) == 0) { ++ mem_put_mbuf_bulk((struct rte_mbuf **)&obj, 1); ++ } ++ rte_ring_free(mt->mbuf_migrate_ring); ++ mt->mbuf_migrate_ring = NULL; ++ } ++ ++ if (mt->mbuf_cache != NULL) { ++ while (buf_cache_pop_bulk(mt->mbuf_cache, &obj, 1, NULL) > 0) { ++ mem_put_mbuf_bulk((struct rte_mbuf **)&obj, 1); ++ } ++ buf_cache_free(mt->mbuf_cache); ++ mt->mbuf_cache = NULL; ++ } ++ ++ if (mt->rpc_cache != NULL) { ++ while (buf_cache_pop_bulk(mt->rpc_cache, &obj, 1, NULL) > 0) { ++ mem_put_rpc(obj); ++ } ++ buf_cache_free(mt->rpc_cache); ++ mt->rpc_cache = NULL; ++ } ++} ++ ++int mem_thread_cache_init(struct mem_thread *mt) ++{ ++ if (!get_global_cfg_params()->stack_mode_rtc && !dpdk_nic_is_xdp()) { ++ char name [RTE_MEMPOOL_NAMESIZE]; ++ SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%p", "migrate_ring", mt); ++ ++ mt->mbuf_migrate_ring = rte_ring_create(name, BUF_CACHE_DEFAULT_NUM, ++ rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ); ++ if (mt->mbuf_migrate_ring == NULL) { ++ return -1; ++ } ++ } ++ ++ mt->mbuf_cache = buf_cache_create(BUF_CACHE_DEFAULT_NUM); ++ if (mt->mbuf_cache == NULL) { ++ mem_thread_cache_free(mt); ++ return -1; ++ } ++ ++ mt->rpc_cache = buf_cache_create(BUF_CACHE_MIN_NUM); ++ if (mt->rpc_cache == NULL) { ++ mem_thread_cache_free(mt); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++struct mem_thread *mem_thread_migrate_get(int stack_id) ++{ ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ if (mt == NULL || mt->mbuf_migrate_ring == NULL) ++ return NULL; ++ return mt; ++} ++ ++static inline void mem_preinit_pbuf(struct pbuf *p); ++void mem_mbuf_migrate_enqueue(struct mem_thread *mt, unsigned n) ++{ ++ struct mem_stack *ms; ++ struct rte_mempool_cache *mpcache; ++ int stack_id; ++ unsigned num, i; ++ void **obj_table; ++ ++ stack_id = get_protocol_stack()->stack_idx; ++ ms = mem_stack_get(stack_id); ++ mpcache = ms->mbuf_mpcache; ++ ++ mt->stk_migrate_count += n; ++ ++ if (mpcache->len <= ms->migrate_watermark) ++ return; ++ ++ num = LWIP_MIN(mpcache->len - ms->migrate_watermark, ++ mt->stk_migrate_count); ++ obj_table = &mpcache->objs[mpcache->len - num]; ++ ++ for (i = 0; i < num; i++) { ++ rte_pktmbuf_reset(obj_table[i]); ++ mem_preinit_pbuf(mbuf_to_pbuf(obj_table[i])); ++ } ++ num = rte_ring_sp_enqueue_bulk(mt->mbuf_migrate_ring, obj_table, num, NULL); ++ if (num > 0) { ++ mpcache->len -= num; ++ mt->stk_migrate_count -= num; ++ } else { ++ mt->stk_migrate_count = 0; ++ } ++} ++ ++void mem_mbuf_migrate_dequeue(struct mem_thread *mt) ++{ ++ struct buf_cache *cache; ++ unsigned num; ++ void **obj_table; ++ ++ if (mt->mbuf_migrate_ring == NULL) ++ return; ++ ++ cache = mt->mbuf_cache; ++ if (cache->head > (cache->watermark >> 1)) ++ return; ++ ++ num = cache->capacity - cache->head; ++ obj_table = &cache->objs[cache->head]; ++ ++ num = rte_ring_sc_dequeue_burst(mt->mbuf_migrate_ring, obj_table, num, NULL); ++ cache->head += num; ++} ++ ++/* see rte_mempool_cache_flush() */ ++static inline ++void pool_put_with_mpcache(struct rte_mempool *pool, struct rte_mempool_cache* mpcache, void *obj) ++{ ++ if (mpcache->len >= mpcache->flushthresh) { ++ rte_mempool_ops_enqueue_bulk(pool, &mpcache->objs[mpcache->size], ++ mpcache->len - mpcache->size); ++ mpcache->len = mpcache->size; ++ } ++ mpcache->objs[mpcache->len] = obj; ++ 
mpcache->len++; ++} ++ ++static inline ++void pool_put_with_bufcache(struct rte_mempool *pool, struct buf_cache* cache, void *obj) ++{ ++ if (cache->head >= cache->flushthresh) { ++ buf_cache_sub_watermark(cache); ++ rte_mempool_ops_enqueue_bulk(pool, &cache->objs[cache->watermark], ++ cache->head - cache->watermark); ++ cache->head = cache->watermark; ++ } ++ cache->objs[cache->head] = obj; ++ cache->head++; ++} ++ ++static unsigned pool_get_bulk_with_cache(const struct mempool_ops *pool_ops, ++ struct rte_mempool *pool, struct buf_cache *cache, ++ void **obj_table, unsigned n) ++{ ++ unsigned ret; ++ unsigned count = 0; ++ unsigned get_count; ++ ++ ret = buf_cache_pop_bulk(cache, obj_table, n, &count); ++ if (ret > 0) { ++ return n; ++ } ++ ++ /* get from the pool */ ++ ret = pool_ops->get_bulk(pool, obj_table, n); ++ if (unlikely(ret == 0)) { ++ LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u\n", pool->name, n); ++ return 0; ++ } ++ ++ buf_cache_add_watermark(cache); ++ if (count >= cache->watermark) { ++ return n; ++ } ++ ++ /* get from the pool, then enqueue to cache */ ++ get_count = cache->watermark - count; ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(cache=%p, watermark=%u, get_count=%u)\n", ++ __FUNCTION__, cache, cache->watermark, get_count)); ++ ++ ret = pool_ops->get_bulk(pool, &cache->objs[cache->head], get_count); ++ if (unlikely(ret == 0)) { ++ LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u\n", pool->name, get_count); ++ } else { ++ cache->head += get_count; ++ } ++ ++ return n; ++} ++ ++static void pool_put_bulk_with_cache(const struct mempool_ops *pool_ops, ++ struct rte_mempool *pool, struct buf_cache *cache, ++ void *const *obj_table, unsigned n) ++{ ++ unsigned ret; ++ unsigned count; ++ unsigned free_count = 0; ++ unsigned put_count; ++ ++ ret = buf_cache_push_bulk(cache, obj_table, n, &free_count); ++ if (ret > 0) { ++ return; ++ } ++ ++ /* put to the pool */ ++ pool_ops->put_bulk(pool, obj_table, n); ++ ++ buf_cache_sub_watermark(cache); ++ count = buf_cache_get_capacity(cache) - free_count; ++ if (count <= cache->watermark) { ++ return; ++ } ++ ++ /* dequeue from cache, then put to the pool */ ++ put_count = count - cache->watermark; ++ LWIP_DEBUGF(MEMP_DEBUG, ("pool_put_bulk_with_cache(cache=%p, watermark=%u, put_count=%u)\n", ++ cache, cache->watermark, put_count)); ++ ++ pool_ops->put_bulk(pool, &cache->objs[cache->head - put_count], put_count); ++ cache->head -= put_count; ++ ++ return; ++} ++ ++ ++void *mem_get_rpc(int stack_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ unsigned ret; ++ void *obj; ++ ++ if (mt == NULL) { ++ ret = mem_mp_ops.get_bulk(ms->rpc_pool, &obj, 1); ++ } else { ++ ret = pool_get_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, &obj, 1); ++ } ++ ++ LWIP_DEBUGF(MEMP_DEBUG, ("mem_get_rpc(stack_id=%d, obj=%p)\n", stack_id, obj)); ++ ++ return ret == 0 ? 
NULL : obj; ++} ++ ++void mem_put_rpc(void *obj) ++{ ++ unsigned stack_id = rpc_obj_ops.get_stack_id(obj); ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ ++ LWIP_DEBUGF(MEMP_DEBUG, ("mem_put_rpc(stack_id=%d, obj=%p)\n", stack_id, obj)); ++ ++ if (mt == NULL) { ++ mem_mp_ops.put_bulk(ms->rpc_pool, &obj, 1); ++ } else { ++ pool_put_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, &obj, 1); ++ } ++} ++ ++unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned n, bool reserve) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ unsigned ret; ++ ++ if (unlikely(n == 0)) { ++ return 0; ++ } ++ ++ if (reserve) { ++ /* don't use rte_mempool_avail_count, it traverse cpu local cache, ++ * when RTE_MAX_LCORE is too large, it's time-consuming ++ */ ++ if (rte_ring_count(ms->mbuf_pool->pool_data) < MBUFPOOL_RESERVE_NUM + n) { ++ return 0; ++ } ++ } ++ ++ if (mt == NULL) { ++ ret = mbuf_mp_ops.get_bulk(ms->mbuf_pool, (void **)mbuf_table, n); ++ } else { ++ mem_mbuf_migrate_dequeue(mt); ++ ret = pool_get_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, (void **)mbuf_table, n); ++ } ++ ++#if MEMP_DEBUG ++ for (unsigned i = 0; i < ret; ++i) { ++ LWIP_DEBUGF(MEMP_DEBUG, ("mem_get_mbuf_bulk(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); ++ } ++#endif /* MEMP_DEBUG */ ++ ++ return ret; ++} ++ ++static void mem_put_mbuf_bulk_by_pbuf(struct rte_mbuf *const *mbuf_table, unsigned n) ++{ ++ unsigned stack_id = mbuf_obj_ops.get_stack_id(mbuf_table[0]); ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ ++ if (unlikely(n == 0)) { ++ return; ++ } ++ ++#if MEMP_DEBUG ++ for (unsigned i = 0; i < n; ++i) { ++ LWIP_DEBUGF(MEMP_DEBUG, ("mem_put_mbuf_bulk(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); ++ } ++#endif /* MEMP_DEBUG */ ++ ++ if (mt == NULL) { ++ mbuf_mp_ops.put_bulk(ms->mbuf_pool, (void *const *)mbuf_table, n); ++ } else { ++ pool_put_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, (void *const *)mbuf_table, n); ++ } ++ ++} ++ ++void mem_put_mbuf_bulk(struct rte_mbuf *const *mbuf_table, unsigned n) ++{ ++ unsigned i; ++ for (i = 0; i < n; ++i) { ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ __FUNCTION__, mbuf_obj_ops.get_stack_id(mbuf_table[i]), ++ n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); ++ ++ rte_pktmbuf_free(mbuf_table[i]); ++ } ++} ++ ++ ++unsigned mem_get_pbuf_bulk(int stack_id, struct pbuf **pbuf_table, unsigned n, bool reserve) ++{ ++ struct rte_mbuf **mbuf_table = (struct rte_mbuf **)pbuf_table; ++ unsigned ret, i; ++ ++ ret = mem_get_mbuf_bulk(stack_id, mbuf_table, n, reserve); ++ if (unlikely(ret == 0)) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(stack_id); ++ stack->stats.tx_allocmbuf_fail++; ++ return 0; ++ } ++ ++ for (i = 0; i < (n & ~0x3); i += 4) { ++ pbuf_table[i] = mbuf_to_pbuf(mbuf_table[i]); ++ pbuf_table[i + 1] = mbuf_to_pbuf(mbuf_table[i + 1]); ++ pbuf_table[i + 2] = mbuf_to_pbuf(mbuf_table[i + 2]); ++ pbuf_table[i + 3] = mbuf_to_pbuf(mbuf_table[i + 3]); ++ } ++ switch (n & 0x3) { ++ case 3: ++ pbuf_table[i] = mbuf_to_pbuf(mbuf_table[i]); /* fallthrough */ ++ ++i; ++ case 2: ++ pbuf_table[i] = mbuf_to_pbuf(mbuf_table[i]); /* fallthrough */ ++ ++i; ++ case 1: 
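++ /* remainder of the 4-way unrolled conversion loop above; each case
++  * converts one element and falls through */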
++ pbuf_table[i] = mbuf_to_pbuf(mbuf_table[i]); /* fallthrough */ ++ ++i; ++ } ++ ++ return n; ++} ++ ++void mem_preput_pbuf(struct pbuf *p) ++{ ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++ p->mbuf_refcnt = rte_mbuf_refcnt_read(m); ++ if (p->mbuf_refcnt == 1) { ++ rte_pktmbuf_reset(m); ++ } ++} ++ ++/* ignore buf->ref, and reset to 1 */ ++static __rte_always_inline ++struct rte_mbuf *pbuf_to_mbuf_prefree(struct pbuf *p) ++{ ++ if (unlikely(p == NULL)) ++ return NULL; ++ ++ if (p->next != NULL) ++ p->next = NULL; ++ ++ struct rte_mbuf *m = pbuf_to_mbuf(p); ++#if MEMP_DEBUG ++ if (rte_mbuf_refcnt_read(m) > 1) { ++ LWIP_DEBUGF(MEMP_DEBUG, ("pbuf_to_mbuf_prefree(mbuf=%p, pbuf=%p, refcnt=%u)\n", ++ m, p, rte_mbuf_refcnt_read(m))); ++ } ++#endif /* MEMP_DEBUG */ ++ if (p->mbuf_refcnt != 1) { ++ m = rte_pktmbuf_prefree_seg(m); ++ if (m != NULL) { ++ rte_pktmbuf_reset(m); ++ } ++ } ++ ++ return m; ++} ++ ++void mem_put_pbuf_bulk(struct pbuf *const *pbuf_table, unsigned n) ++{ ++ struct rte_mbuf *mbuf_table[BUF_BULK_MAX_NUM]; ++ unsigned i, copied, batch, bulk_num; ++ ++ copied = 0; ++ while (copied < n) { ++ batch = LWIP_MIN(n - copied, BUF_BULK_MAX_NUM); ++ bulk_num = 0; ++ for (i = 0; i < batch; ++i, ++copied) { ++ mbuf_table[bulk_num] = pbuf_to_mbuf_prefree(pbuf_table[copied]); ++ if (mbuf_table[bulk_num] != NULL) { ++ ++bulk_num; ++ } ++ } ++ mem_put_mbuf_bulk_by_pbuf(mbuf_table, bulk_num); ++ } ++} ++ ++void mem_put_pbuf_list_bulk(struct pbuf *const *pbuf_table, unsigned n) ++{ ++ unsigned stack_id = mbuf_obj_ops.get_stack_id(pbuf_to_mbuf(pbuf_table[0])); ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ struct mem_thread *mt = mem_thread_get(stack_id); ++ ++ struct pbuf *q, *next; ++ struct rte_mbuf *mbuf; ++ ++ for (unsigned i = 0; i < n; ++i) { ++ q = pbuf_table[i]; ++ while (q != NULL) { ++ next = q->next; ++ q->next = NULL; ++ ++ q->ref--; ++ if (q->ref > 0) ++ break; ++ mbuf = pbuf_to_mbuf_prefree(q); ++ if (mbuf == NULL) ++ break; ++ ++ q = next; ++ ++ if (mt == NULL) { ++ pool_put_with_mpcache(ms->mbuf_pool, ms->mbuf_mpcache, mbuf); ++ } else { ++ pool_put_with_bufcache(ms->mbuf_pool, mt->mbuf_cache, mbuf); ++ } ++ ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ __FUNCTION__, stack_id, n, i, mbuf, q)); ++ } ++ } ++} ++ ++struct pbuf *mem_get_pbuf(int stack_id, bool reserve) ++{ ++ int ret; ++ struct rte_mbuf *mbuf; ++ ++ if (stack_id < 0 || stack_id >= PROTOCOL_STACK_MAX) ++ stack_id = get_protocol_stack()->stack_idx; ++ ++ ret = mem_get_mbuf_bulk(stack_id, &mbuf, 1, reserve); ++ if (unlikely(ret == 0)) { ++ struct protocol_stack *stack = get_protocol_stack_by_id(stack_id); ++ stack->stats.tx_allocmbuf_fail++; ++ return NULL; ++ } ++ ++ return mbuf_to_pbuf(mbuf); ++} ++ ++void mem_put_pbuf(struct pbuf *p) ++{ ++ struct rte_mbuf *mbuf = pbuf_to_mbuf_prefree(p); ++ if (mbuf != NULL) { ++ mem_put_mbuf_bulk_by_pbuf(&mbuf, 1); ++ } ++} ++ ++unsigned mem_extcache_get_pbuf_bulk(int stack_id, struct pbuf **pbuf_table, unsigned n, bool reserve, struct pbuf **extcache_list) ++{ ++ unsigned ret; ++ struct pbuf *p; ++ ++ for (int i = 0; i < n; ++i) { ++ p = *extcache_list; ++ if (p != NULL) { ++ *extcache_list = p->next; ++ p->next = NULL; ++ pbuf_table[i] = p; ++ } else { ++ ret = mem_get_pbuf_bulk(stack_id, &pbuf_table[i], n - i, reserve); ++ if (unlikely(ret == 0)) { ++ mem_put_pbuf_bulk(pbuf_table, i); ++ return 0; ++ } ++ break; ++ } ++ } ++ ++ return n; ++} ++ ++struct pbuf *mem_extcache_get_pbuf(int stack_id, bool reserve, struct pbuf 
**extcache_list) ++{ ++ struct pbuf *p; ++ ++ p = *extcache_list; ++ if (p != NULL) { ++ *extcache_list = p->next; ++ p->next = NULL; ++ } else { ++ p = mem_get_pbuf(stack_id, reserve); ++ } ++ ++ return p; ++} ++ ++void mem_extcache_put_pbuf(struct pbuf *h, struct pbuf *t, struct pbuf **extcache_list) ++{ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ pbuf_free(h); ++ return; ++ } ++ ++ if (*extcache_list == NULL) { ++ *extcache_list = h; ++ } else { ++ if (t == NULL) ++ t = pbuf_list_tail(h); ++ t->next = *extcache_list; ++ *extcache_list = h; ++ } ++} ++ ++void mem_extcache_flush_pbuf(struct pbuf **extcache_list) ++{ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ return; ++ } ++ ++ struct pbuf *p = *extcache_list; ++ if (p != NULL) { ++ mem_put_pbuf_list_bulk(&p, 1); ++ *extcache_list = NULL; ++ } ++} ++ ++static inline void mem_preinit_pbuf(struct pbuf *p) ++{ ++ mem_init_pbuf(p, 0, 0, 0, PBUF_POOL_PREINIT); ++} ++ ++void mem_init_pbuf(struct pbuf *p, pbuf_layer layer, uint16_t tot_len, uint16_t len, pbuf_type type) ++{ ++ struct pbuf_custom *pc; ++ struct rte_mbuf *mbuf; ++ void *data; ++ ++ if (p->type_internal == PBUF_POOL_PREINIT) { ++ p->payload = (uint8_t *)p->payload + LWIP_MEM_ALIGN_SIZE((uint16_t)layer); ++ p->type_internal = type; ++ p->len = len; ++ p->tot_len = tot_len; ++ return; ++ } ++ ++ pc = (struct pbuf_custom *)p; ++ mbuf = pbuf_to_mbuf(p); ++ data = rte_pktmbuf_mtod(mbuf, void *); ++ ++ pbuf_alloced_custom(layer, len, type, pc, data, MBUF_PAYLOAD_SIZE); ++ p->tot_len = tot_len; ++ pc->custom_free_function = mem_put_pbuf; ++} +diff --git a/src/lstack/include/lstack_lockless_queue.h b/src/lstack/include/lstack_lockless_queue.h +index b0fc31f..bec2564 100644 +--- a/src/lstack/include/lstack_lockless_queue.h ++++ b/src/lstack/include/lstack_lockless_queue.h +@@ -15,6 +15,8 @@ + + #include + ++/* multi-producers single-consumer queue */ ++ + typedef struct lockless_queue_node { + struct lockless_queue_node *volatile next; + } lockless_queue_node; +@@ -25,6 +27,32 @@ typedef struct lockless_queue { + lockless_queue_node stub __attribute__((__aligned__(64))); + } lockless_queue; + ++ ++static inline void lockless_queue_node_set_poped(lockless_queue_node *node) ++{ ++ node->next = node; ++} ++ ++static inline bool lockless_queue_node_is_poped(lockless_queue_node *node) ++{ ++ return node->next == node; ++} ++ ++static inline bool lockless_queue_node_test_poped(lockless_queue_node *node) ++{ ++ /* ++ * if (node->next == node) { ++ * node->next = NULL; ++ * return 1; ++ * } else { ++ * return 0; ++ * } ++ */ ++ return !!__atomic_compare_exchange_n( ++ (volatile uint64_t *)&node->next, (uint64_t *)&node, (uint64_t)NULL, ++ false, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE); ++} ++ + static inline void lockless_queue_init(lockless_queue *queue) + { + queue->head = &queue->stub; +@@ -57,11 +85,35 @@ static inline int32_t lockless_queue_count(lockless_queue *queue) + return count; + } + ++/* ++ * tail head ++ * \_ stub _/ ++ * ++ * tail old_head head ++ * \_ stub _/ node _/ ++ * ++ * tail head ++ * \_ stub -> node _/ ++ */ + static inline void lockless_queue_mpsc_push(lockless_queue *queue, lockless_queue_node *node) + { ++ lockless_queue_node *old_head; ++ + node->next = NULL; +- lockless_queue_node *old_head = +- (lockless_queue_node *)__atomic_exchange_n((void **)&queue->head, (void*)node, __ATOMIC_ACQ_REL); ++ ++ old_head = (lockless_queue_node *)__atomic_exchange_n((void **)&queue->head, (void*)node, __ATOMIC_ACQ_REL); ++ ++ __atomic_store_n(&old_head->next, node, 
__ATOMIC_RELEASE);
++}
++
++static inline void lockless_queue_mpsc_test_push(lockless_queue *queue, lockless_queue_node *node)
++{
++    lockless_queue_node *old_head;
++
++    if (!lockless_queue_node_test_poped(node))
++        return;
++
++    old_head = (lockless_queue_node *)__atomic_exchange_n((void **)&queue->head, (void*)node, __ATOMIC_ACQ_REL);
+ 
+     __atomic_store_n(&old_head->next, node, __ATOMIC_RELEASE);
+ }
+@@ -70,7 +122,13 @@ static inline lockless_queue_node* lockless_queue_mpsc_pop(lockless_queue* queue
+ {
+     lockless_queue_node *tail = queue->tail;
+     lockless_queue_node *next = tail->next;
++    lockless_queue_node *head;
+ 
++    /*
++     * step1. dequeue stub.
++     *  tail                         head
++     *    \_ stub -> node -> node _/
++     */
+     if (tail == &queue->stub) {
+         if (next == NULL) {
+             return NULL;
+         }
+@@ -80,21 +138,38 @@ static inline lockless_queue_node* lockless_queue_mpsc_pop(lockless_queue* queue
+         next = next->next;
+     }
+ 
++    /*
++     * step2. dequeue tail.
++     *  tail    next-\       head
++     *    \_ node ->  node _/
++     */
+     if (next) {
+         queue->tail = next;
++        lockless_queue_node_set_poped(tail);
+         return tail;
+     }
+ 
+-    lockless_queue_node *head = queue->head;
++    /*
++     * step3. enqueue ing.
++     *  tail      old_head       head
++     *    \_ node _/        node _/
++     */
++    head = queue->head;
+     if (tail != head) {
+         return NULL;
+     }
+ 
++    /*
++     * step4. only one node, enqueue stub and dequeue node.
++     *  tail        head
++     *    \_ node _/
++     */
+     lockless_queue_mpsc_push(queue, &queue->stub);
+ 
+     next = tail->next;
+     if (next) {
+         queue->tail = next;
++        lockless_queue_node_set_poped(tail);
+         return tail;
+     }
+ 
+diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h
+new file mode 100644
+index 0000000..c6adff0
+--- /dev/null
++++ b/src/lstack/include/lstack_mempool.h
+@@ -0,0 +1,325 @@
++/*
++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved.
++* gazelle is licensed under the Mulan PSL v2.
++* You can use this software according to the terms and conditions of the Mulan PSL v2.
++* You may obtain a copy of Mulan PSL v2 at:
++* http://license.coscl.org.cn/MulanPSL2
++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
++* PURPOSE.
++* See the Mulan PSL v2 for more details.
++*/
++
++#ifndef __GAZELLE_MEM_H__
++#define __GAZELLE_MEM_H__
++
++#include 
++
++#include 
++#include 
++#include 
++
++#include 
++#include 
++
++#include "common/dpdk_common.h"
++
++/* fix virtio PMD error: Rx scatter is disabled and RxQ mbuf pool object is too small. */
++#define DEV_VIRTIO_RX_MBUF_SIZE 1530
++/* see hinic_convert_rx_buf_size() */
++#define DEV_HINIC_RX_MBUF_SIZE 0x600
++
++/* IP6_HLEN - IP_HLEN: reserve 20 extra bytes to avoid overflow
++ * if IP4_MSS and IP6_MSS are distinguished. */
++#define MBUF_PAYLOAD_SIZE LWIP_MAX(PBUF_POOL_BUFSIZE, DEV_HINIC_RX_MBUF_SIZE)
++#define MBUF_DATA_SIZE (MBUF_PAYLOAD_SIZE + RTE_PKTMBUF_HEADROOM)
++/* DPDK limits the ring head-tail distance in rte_ring_init.
++ * The max value is RTE_RING_SZ_MASK / HTD_MAX_DEF, where RTE_RING_SZ_MASK is 0x7fffffff and HTD_MAX_DEF is 8.
++ */ ++#define MBUFPOOL_MAX_NUM 0xfffffff ++ ++#define MBUFPOOL_CACHE_NUM LWIP_MIN(NIC_QUEUE_SIZE_MAX >> 1, RTE_MEMPOOL_CACHE_MAX_SIZE) ++#define MBUFPOOL_RESERVE_NUM (NIC_QUEUE_SIZE_MAX + MBUFPOOL_CACHE_NUM) ++ ++#define MEMPOOL_OPS_NAME "ring_mt_rts" ++#define MEMPOOL_CACHE_NUM 32 ++ ++#define BUF_CACHE_MIN_NUM 32 ++#define BUF_CACHE_DEFAULT_NUM 1024 ++#define BUF_CACHE_WATERSTEP_SHIFT 4 /* 1/16 */ ++ ++#define BUF_BULK_MAX_NUM 32 ++ ++struct buf_cache { ++ unsigned size; /* Size of cache. */ ++ unsigned mask; /* Mask (size-1) of cache. */ ++ unsigned capacity; /* Usable size of cache */ ++ ++ unsigned watermark; ++ unsigned waterstep; ++ unsigned flushthresh; ++ ++ unsigned head; ++ unsigned tail; ++ ++ /* new cache line */ ++ char pad0 __rte_cache_aligned; ++ void *objs[0]; ++}; ++ ++static __rte_always_inline ++struct buf_cache *buf_cache_create(unsigned count) ++{ ++ struct buf_cache *cache; ++ unsigned size; ++ ++ size = rte_align32pow2(count); ++ if (size < BUF_CACHE_MIN_NUM) ++ return NULL; ++ ++ cache = (struct buf_cache *)calloc(1, sizeof(struct buf_cache) + sizeof(void *) * size); ++ if (cache == NULL) ++ return NULL; ++ ++ cache->size = size; ++ cache->mask = size - 1; ++ cache->capacity = size - 1; ++ if (cache->capacity > count) ++ cache->capacity = count; ++ ++ cache->head = 0; ++ cache->tail = 0; ++ ++ cache->waterstep = cache->size >> BUF_CACHE_WATERSTEP_SHIFT; ++ if (cache->waterstep < BUF_CACHE_WATERSTEP_SHIFT) ++ cache->waterstep = BUF_CACHE_WATERSTEP_SHIFT; ++ cache->watermark = cache->waterstep; ++ cache->flushthresh = cache->size - cache->waterstep; ++ ++ return cache; ++} ++ ++static __rte_always_inline ++void buf_cache_free(struct buf_cache *cache) ++{ ++ if (cache != NULL) { ++ free(cache); ++ } ++} ++ ++static __rte_always_inline ++unsigned buf_cache_count(const struct buf_cache *cache) ++{ ++ unsigned count = (cache->head - cache->tail) & cache->mask; ++ return (count > cache->capacity) ? 
cache->capacity : count;
++}
++
++static __rte_always_inline
++unsigned buf_cache_free_count(const struct buf_cache *cache)
++{
++    return cache->capacity - buf_cache_count(cache);
++}
++
++static __rte_always_inline
++unsigned buf_cache_get_capacity(const struct buf_cache *cache)
++{
++    return cache->capacity;
++}
++
++static __rte_always_inline
++void buf_cache_add_watermark(struct buf_cache *cache)
++{
++    if (cache->watermark < cache->flushthresh) {
++        cache->watermark += cache->waterstep;
++    }
++}
++
++static __rte_always_inline
++void buf_cache_sub_watermark(struct buf_cache *cache)
++{
++    if (cache->watermark > cache->waterstep) {
++        cache->watermark -= cache->waterstep;
++    }
++}
++
++static __rte_always_inline
++void __buf_cache_copy_objs(void **dst_table, void *const *src_table, unsigned n)
++{
++    unsigned i;
++
++    for (i = 0; i < (n & ~0x3); i += 4) {
++        dst_table[i] = src_table[i];
++        dst_table[i + 1] = src_table[i + 1];
++        dst_table[i + 2] = src_table[i + 2];
++        dst_table[i + 3] = src_table[i + 3];
++    }
++    switch (n & 0x3) {
++    case 3:
++        dst_table[i] = src_table[i]; /* fallthrough */
++        ++i;
++    case 2:
++        dst_table[i] = src_table[i]; /* fallthrough */
++        ++i;
++    case 1:
++        dst_table[i] = src_table[i]; /* fallthrough */
++    }
++}
++
++static __rte_always_inline
++unsigned buf_cache_enqueue_bulk(struct buf_cache *cache, void *const *obj_table, unsigned n, unsigned *free_space)
++{
++    unsigned free_count = buf_cache_free_count(cache);
++    unsigned i, idx;
++
++    if (unlikely(n > free_count)) {
++        if (free_space != NULL) {
++            *free_space = free_count;
++        }
++        return 0;
++    }
++
++    /* reference: __rte_ring_enqueue_elems_64() */
++    idx = cache->head & cache->mask;
++    if (likely(idx + n < cache->size)) {
++        __buf_cache_copy_objs(&cache->objs[idx], obj_table, n);
++    } else {
++        for (i = 0; idx < cache->size; i++, idx++)
++            cache->objs[idx] = obj_table[i];
++        /* Start at the beginning */
++        for (idx = 0; i < n; i++, idx++)
++            cache->objs[idx] = obj_table[i];
++    }
++
++    cache->head += n;
++    return n;
++}
++
++static __rte_always_inline
++unsigned buf_cache_dequeue_bulk(struct buf_cache *cache, void **obj_table, unsigned n, unsigned *available)
++{
++    unsigned count = buf_cache_count(cache);
++    unsigned i, idx;
++
++    if (unlikely(n > count)) {
++        if (available != NULL) {
++            *available = count;
++        }
++        return 0;
++    }
++
++    /* reference: __rte_ring_dequeue_elems_64() */
++    idx = cache->tail & cache->mask;
++    if (likely(idx + n < cache->size)) {
++        __buf_cache_copy_objs(obj_table, &cache->objs[idx], n);
++    } else {
++        for (i = 0; idx < cache->size; i++, idx++)
++            obj_table[i] = cache->objs[idx];
++        /* Start at the beginning */
++        for (idx = 0; i < n; i++, idx++)
++            obj_table[i] = cache->objs[idx];
++    }
++
++    cache->tail += n;
++    return n;
++}
++
++static __rte_always_inline
++unsigned buf_cache_push_bulk(struct buf_cache *cache, void *const *obj_table, unsigned n, unsigned *free_space)
++{
++    unsigned free_count = buf_cache_free_count(cache);
++    unsigned top;
++
++    if (unlikely(n > free_count)) {
++        if (free_space != NULL) {
++            *free_space = free_count;
++        }
++        return 0;
++    }
++
++    top = cache->head;
++    __buf_cache_copy_objs(&cache->objs[top], obj_table, n);
++
++    cache->head += n;
++    return n;
++}
++
++static __rte_always_inline
++unsigned buf_cache_pop_bulk(struct buf_cache *cache, void **obj_table, unsigned n, unsigned *available)
++{
++    unsigned count = buf_cache_count(cache);
++    unsigned top;
++
++    if (unlikely(n > count)) {
++        if (available != NULL) {
++            *available = count;
++        }
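++        /* all-or-nothing bulk semantics, mirroring DPDK's rte_ring bulk
++         * API: if fewer than n objects are cached, take nothing and
++         * report the current count through *available. */
++        return 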
0; ++ } ++ ++ top = cache->head; ++ __buf_cache_copy_objs(obj_table, &cache->objs[top - n], n); ++ ++ cache->head -= n; ++ return n; ++} ++ ++ ++struct mem_stack { ++ struct rte_mempool *rpc_pool; ++ ++ struct rte_mempool *mbuf_pool; ++ struct rte_mempool_cache *mbuf_mpcache; ++ unsigned migrate_watermark; ++}; ++ ++struct mem_thread { ++ struct buf_cache *rpc_cache; ++ ++ struct buf_cache *mbuf_cache; ++ struct rte_ring *mbuf_migrate_ring; ++ ++ char pad0 __rte_cache_aligned; /* new cache line */ ++ ++ unsigned stk_migrate_count; ++} __rte_cache_aligned; ++ ++void mem_stack_pool_free(int stack_id); ++int mem_stack_pool_init(int stack_id); ++int mem_stack_mpcache_init(int stack_id, unsigned cpu_id); ++ ++int mem_thread_manager_init(void); ++void mem_thread_cache_free(struct mem_thread *mt); ++int mem_thread_cache_init(struct mem_thread *mt); ++ ++unsigned mem_stack_mbuf_pool_count(int stack_id); ++struct rte_mempool *mem_get_mbuf_pool(int stack_id); ++struct rte_mempool *mem_get_rpc_pool(int stack_id); ++ ++void *mem_get_rpc(int stack_id); ++void mem_put_rpc(void *obj); ++ ++struct mem_thread *mem_thread_migrate_get(int stack_id); ++void mem_mbuf_migrate_enqueue(struct mem_thread *mt, unsigned n); ++void mem_mbuf_migrate_dequeue(struct mem_thread *mt); ++ ++unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned n, bool reserve); ++void mem_put_mbuf_bulk(struct rte_mbuf *const *mbuf_table, unsigned n); ++ ++unsigned mem_get_pbuf_bulk(int stack_id, struct pbuf **pbuf_table, unsigned n, bool reserve); ++void mem_preput_pbuf(struct pbuf *p); ++void mem_put_pbuf_bulk(struct pbuf *const *pbuf_table, unsigned n); ++void mem_put_pbuf_list_bulk(struct pbuf *const *pbuf_table, unsigned n); ++ ++struct pbuf *mem_get_pbuf(int stack_id, bool reserve); ++void mem_put_pbuf(struct pbuf *p); ++ ++unsigned mem_extcache_get_pbuf_bulk(int stack_id, struct pbuf **pbuf_table, unsigned n, bool reserve, ++ struct pbuf **extcache_list); ++struct pbuf *mem_extcache_get_pbuf(int stack_id, bool reserve, struct pbuf **extcache_list); ++void mem_extcache_put_pbuf(struct pbuf *h, struct pbuf *t, struct pbuf **extcache_list); ++void mem_extcache_flush_pbuf(struct pbuf **extcache_list); ++ ++void mem_init_pbuf(struct pbuf *p, pbuf_layer layer, uint16_t tot_len, uint16_t len, pbuf_type type); ++ ++ ++#endif /* __GAZELLE_MEM_H__ */ +\ No newline at end of file +diff --git a/src/lstack/include/lstack_sockctl.h b/src/lstack/include/lstack_sockctl.h +new file mode 100644 +index 0000000..91445ad +--- /dev/null ++++ b/src/lstack/include/lstack_sockctl.h +@@ -0,0 +1,25 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. 
++*/ ++ ++#ifndef _LSTACK_RTC_API_H_ ++#define _LSTACK_RTC_API_H_ ++ ++#include ++ ++/* don't include lwip/sockets.h, conflict with sys/socket.h */ ++extern int lwip_fcntl(int s, int cmd, int val); ++extern int lwip_ioctl(int s, long cmd, void *argp); ++ ++void sockctl_rtw_api_init(posix_api_t *api); ++void sockctl_rtc_api_init(posix_api_t *api); ++ ++#endif /* __LSTACK_RTC_API_H_ */ +diff --git a/src/lstack/include/lstack_sockio.h b/src/lstack/include/lstack_sockio.h +new file mode 100644 +index 0000000..f4e5e99 +--- /dev/null ++++ b/src/lstack/include/lstack_sockio.h +@@ -0,0 +1,41 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. ++*/ ++ ++#ifndef _LSTACK_SOCKIO_H_ ++#define _LSTACK_SOCKIO_H_ ++ ++#include ++#include ++ ++ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen); ++ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags); ++ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, const struct sockaddr *to, socklen_t tolen); ++ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags); ++ ++ssize_t sockio_read(int fd, void *mem, size_t len); ++ssize_t sockio_write(int fd, const void *mem, size_t len); ++ ++ssize_t sockio_recv(int fd, void *mem, size_t len, int flags); ++ssize_t sockio_send(int fd, const void *mem, size_t len, int flags); ++ ++ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt); ++ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt); ++ ++ ++void sockio_ops_init(void); ++bool sockio_mbox_pending(const struct lwip_sock *sock); ++ ++/* just for lwip */ ++int do_lwip_init_sock(int fd); ++void do_lwip_clean_sock(int fd); ++ ++#endif /* _LSTACK_SOCKIO_H_ */ +diff --git a/src/lstack/include/mbox_ring.h b/src/lstack/include/mbox_ring.h +new file mode 100644 +index 0000000..c48a47b +--- /dev/null ++++ b/src/lstack/include/mbox_ring.h +@@ -0,0 +1,583 @@ ++/* ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. ++*/ ++ ++#ifndef __MBOX_RING_H__ ++#define __MBOX_RING_H__ ++ ++#include ++#include ++ ++#include ++#include ++ ++#include "common/dpdk_common.h" ++#include "lstack_dpdk.h" ++#include "lstack_mempool.h" ++#include "lstack_cfg.h" ++ ++/* Optimize performance of creating ring. 
*/
++static inline
++struct rte_ring *rte_ring_create_fast(const char *name, unsigned size, unsigned flags)
++{
++    ssize_t ring_size;
++    char ring_name[RTE_MEMZONE_NAMESIZE] = {0};
++    struct rte_ring *ring;
++
++    ring_size = rte_ring_get_memsize(size);
++    if (ring_size < 0) {
++        RTE_LOG(ERR, EAL, "rte_ring_get_memsize failed\n");
++        return NULL;
++    }
++
++    /*
++     * rte_ring_create is not used because it calls the
++     * memzone_lookup_thread_unsafe function, which is time-consuming
++     * when there are many rings.
++     */
++    ring = rte_malloc_socket(NULL, ring_size, RTE_CACHE_LINE_SIZE, rte_socket_id());
++    if (ring == NULL) {
++        RTE_LOG(ERR, EAL, "cannot create rte_ring for mbox\n");
++        return NULL;
++    }
++
++    if (snprintf(ring_name, sizeof(ring_name), "%s""%"PRIXPTR, name, (uintptr_t)ring) < 0) {
++        rte_free(ring);
++        RTE_LOG(ERR, EAL, "snprintf failed\n");
++        return NULL;
++    }
++
++    if (rte_ring_init(ring, ring_name, size, flags) != 0) {
++        rte_free(ring);
++        RTE_LOG(ERR, EAL, "cannot init rte_ring for mbox\n");
++        return NULL;
++    }
++
++    return ring;
++}
++
++static inline
++void rte_ring_free_fast(struct rte_ring *ring)
++{
++    rte_free(ring);
++}
++
++
++static inline
++void mbox_ring_common_free(struct mbox_ring *mr)
++{
++    void *obj;
++
++    if (mr->private_data_free_fn != NULL && mr->private_data != NULL) {
++        mr->private_data_free_fn(mr);
++        mr->private_data_free_fn = NULL;
++        mr->private_data = NULL;
++    }
++
++    obj = mr->ops->pop_tail(mr, NULL);
++    if (obj != NULL)
++        mr->obj_free_fn(mr, obj, true);
++    while (true) {
++        if (mr->ops->dequeue_burst(mr, &obj, 1) == 0)
++            break;
++        mr->obj_free_fn(mr, obj, false);
++    }
++}
++
++extern void sockio_mbox_set_func(struct mbox_ring *mr);
++static inline
++void mbox_ring_common_init(struct mbox_ring *mr)
++{
++    mr->stk_queued_num = 0;
++
++    mr->app_free_count = 0;
++    mr->app_queued_num = 0;
++    mr->app_tail_left = 0;
++    mr->app_recvd_len = 0;
++
++    sockio_mbox_set_func(mr);
++}
++
++/* single thread */
++static inline
++int st_ring_create(struct mbox_ring *mr, const char *name, unsigned count)
++{
++    mbox_ring_common_init(mr);
++
++    mr->ops = &g_mbox_rtc_default_ops;
++    mr->st_obj = NULL;
++
++    mr->ring = rte_ring_create_fast(name, count, RING_F_SP_ENQ | RING_F_SC_DEQ);
++    if (mr->ring == NULL) {
++        return -ENOMEM;
++    }
++    return 0;
++}
++
++static inline
++void st_ring_destroy(struct mbox_ring *mr)
++{
++    mbox_ring_common_free(mr);
++
++    mr->ops = NULL;
++    mr->st_obj = NULL;
++    if (mr->ring != NULL) {
++        rte_ring_free_fast(mr->ring);
++        mr->ring = NULL;
++    }
++}
++
++static inline
++unsigned st_ring_get_capacity(const struct mbox_ring *mr)
++{
++    return mr->ring->capacity;
++}
++
++static inline
++unsigned st_ring_count(const struct mbox_ring *mr)
++{
++    // return rte_ring_count(mr->ring);
++    struct rte_ring *r = mr->ring;
++    uint32_t prod_tail = r->prod.tail;
++    uint32_t cons_head = r->cons.head;
++    uint32_t count = (prod_tail - cons_head) & r->mask;
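++    /* open-coded rte_ring_count(): plain (non-acquire) reads of prod.tail
++     * and cons.head, which appears safe in this single-thread ops family;
++     * the clamp below bounds a transiently skewed head/tail snapshot. */
++    return (count > r->capacity) ? 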
r->capacity : count; ++} ++ ++static inline ++unsigned st_ring_free_count(const struct mbox_ring *mr) ++{ ++ return st_ring_get_capacity(mr) - st_ring_count(mr); ++} ++ ++static inline ++unsigned st_ring_enqueue_burst_start(struct mbox_ring *mr, void *const *obj_table, unsigned n) ++{ ++ struct rte_ring *r = mr->ring; ++ uint32_t prod_head, cons_tail; ++ uint32_t free_entries; ++ ++ prod_head = r->prod.head; ++ cons_tail = r->cons.tail; ++ ++ free_entries = r->capacity + cons_tail - prod_head; ++ if (unlikely(free_entries == 0)) ++ return 0; ++ if (n > free_entries) ++ n = free_entries; ++ ++ r->prod.head = prod_head + n; ++ ++ __rte_ring_enqueue_elems(r, prod_head, obj_table, sizeof(void *), n); ++ return n; ++} ++ ++static inline ++void st_ring_enqueue_burst_finish(struct mbox_ring *mr) ++{ ++ mr->ring->prod.tail = mr->ring->prod.head; ++} ++ ++static inline ++unsigned st_ring_dequeue_burst_start(struct mbox_ring *mr, void **obj_table, unsigned n) ++{ ++ struct rte_ring *r = mr->ring; ++ uint32_t cons_head, prod_tail; ++ uint32_t entries; ++ ++ cons_head = r->cons.head; ++ prod_tail = r->prod.tail; ++ ++ entries = prod_tail - cons_head; ++ if (unlikely(entries == 0)) ++ return 0; ++ if (n > entries) ++ n = entries; ++ ++ r->cons.head = cons_head + n; ++ ++ __rte_ring_dequeue_elems(r, cons_head, obj_table, sizeof(void *), n); ++ return n; ++} ++ ++static inline ++void st_ring_dequeue_burst_finish(struct mbox_ring *mr) ++{ ++ mr->ring->cons.tail = mr->ring->cons.head; ++} ++ ++static inline ++unsigned st_ring_enqueue_burst(struct mbox_ring *mr, void *const *obj_table, unsigned n) ++{ ++ n = st_ring_enqueue_burst_start(mr, obj_table, n); ++ st_ring_enqueue_burst_finish(mr); ++ return n; ++} ++ ++static inline ++unsigned st_ring_dequeue_burst(struct mbox_ring *mr, void **obj_table, unsigned n) ++{ ++ n = st_ring_dequeue_burst_start(mr, obj_table, n); ++ st_ring_dequeue_burst_finish(mr); ++ return n; ++} ++ ++static inline ++void *st_ring_read_tail(const struct mbox_ring *mr) ++{ ++ return mr->st_obj; ++} ++ ++static inline ++void st_ring_push_tail(struct mbox_ring *mr, void *obj) ++{ ++ mr->st_obj = obj; ++} ++ ++static inline ++void *st_ring_pop_tail(struct mbox_ring *mr, void *expect) ++{ ++ expect = mr->st_obj; ++ mr->st_obj = NULL; ++ return expect; ++} ++ ++static inline ++void st_ring_ops_init(struct mbox_ring_ops *ops) ++{ ++ ops->create = st_ring_create; ++ ops->destroy = st_ring_destroy; ++ ++ ops->get_capacity = st_ring_get_capacity; ++ ops->count = st_ring_count; ++ ops->free_count = st_ring_free_count; ++ ++ ops->enqueue_burst = st_ring_enqueue_burst; ++ ops->dequeue_burst = st_ring_dequeue_burst; ++ ++ ops->recv_count = st_ring_count; ++ ops->recv_start_burst = st_ring_dequeue_burst_start; ++ ops->recv_finish_burst = st_ring_dequeue_burst_finish; ++ ++ ops->read_tail = st_ring_read_tail; ++ ops->push_tail = st_ring_push_tail; ++ ops->pop_tail = st_ring_pop_tail; ++} ++ ++ ++/* multi thread */ ++static inline ++int mt_ring_create(struct mbox_ring *mr, const char *name, unsigned count) ++{ ++ mbox_ring_common_init(mr); ++ ++ if ((mr->flags & MBOX_FLAG_TCP) && (mr->flags & MBOX_FLAG_SEND)) { ++ mr->ops = &g_mbox_rtw_append_ops; ++ rte_atomic64_init(&mr->mt_obj); ++ } else { ++ mr->ops = &g_mbox_rtw_default_ops; ++ mr->st_obj = NULL; ++ } ++ if ((mr->flags & MBOX_FLAG_RECV) && !dpdk_nic_is_xdp()) { ++ mr->flags |= MBOX_FLAG_PEEK; ++ mr->ops = &g_mbox_rtw_peek_ops; ++ mr->ops->create(mr, name, count); ++ } ++ ++ mr->ring = rte_ring_create_fast(name, count, RING_F_SP_ENQ | 
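++    /* The ring is created single-producer/single-consumer; the mt_ring_*
++     * helpers below add the acquire/release ordering needed when producer
++     * and consumer run on different threads (assumes one app thread and
++     * one stack thread per mbox). */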
RING_F_SC_DEQ); ++ if (mr->ring == NULL) { ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static inline ++void mt_ring_destroy(struct mbox_ring *mr) ++{ ++ if (mr->flags & MBOX_FLAG_PEEK) { ++ mr->ops->destroy(mr); ++ } ++ mbox_ring_common_free(mr); ++ ++ mr->ops = NULL; ++ if ((mr->flags & MBOX_FLAG_TCP) && (mr->flags & MBOX_FLAG_SEND)) { ++ rte_atomic64_clear(&mr->mt_obj); ++ } else { ++ mr->st_obj = NULL; ++ } ++ ++ if (mr->ring != NULL) { ++ rte_ring_free_fast(mr->ring); ++ mr->ring = NULL; ++ } ++} ++ ++static inline ++unsigned mt_ring_get_capacity(const struct mbox_ring *mr) ++{ ++ return mr->ring->capacity; ++} ++ ++static inline ++unsigned mt_ring_count(const struct mbox_ring *mr) ++{ ++ // return rte_ring_count(mr->ring); ++ struct rte_ring *r = mr->ring; ++ uint32_t prod_tail = r->prod.tail; ++ uint32_t cons_head = r->cons.head; ++ uint32_t count = (prod_tail - cons_head) & r->mask; ++ return (count > r->capacity) ? r->capacity : count; ++} ++ ++static inline ++unsigned mt_ring_free_count(const struct mbox_ring *mr) ++{ ++ return mt_ring_get_capacity(mr) - mt_ring_count(mr); ++} ++ ++static inline ++unsigned mt_ring_enqueue_burst_start(struct mbox_ring *mr, void *const *obj_table, unsigned n) ++{ ++ struct rte_ring *r = mr->ring; ++ uint32_t prod_head, cons_tail; ++ uint32_t free_entries; ++ ++ prod_head = r->prod.head; ++ cons_tail = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); ++ ++ free_entries = r->capacity + cons_tail - prod_head; ++ if (unlikely(free_entries == 0)) ++ return 0; ++ if (n > free_entries) ++ n = free_entries; ++ ++ r->prod.head = prod_head + n; ++ ++ __rte_ring_enqueue_elems(r, prod_head, obj_table, sizeof(void *), n); ++ return n; ++} ++ ++static inline ++void mt_ring_enqueue_burst_finish(struct mbox_ring *mr) ++{ ++ __atomic_store_n(&mr->ring->prod.tail, mr->ring->prod.head, __ATOMIC_RELEASE); ++} ++ ++static inline ++unsigned mt_ring_dequeue_burst_start(struct mbox_ring *mr, void ** obj_table, unsigned n) ++{ ++ struct rte_ring *r = mr->ring; ++ uint32_t cons_head, prod_tail; ++ uint32_t entries; ++ ++ cons_head = r->cons.head; ++ prod_tail = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); ++ ++ entries = prod_tail - cons_head; ++ if (unlikely(entries == 0)) ++ return 0; ++ if (n > entries) ++ n = entries; ++ ++ r->cons.head = cons_head + n; ++ ++ __rte_ring_dequeue_elems(r, cons_head, obj_table, sizeof(void *), n); ++ return n; ++} ++ ++static inline ++void mt_ring_dequeue_burst_finish(struct mbox_ring *mr) ++{ ++ __atomic_store_n(&mr->ring->cons.tail, mr->ring->cons.head, __ATOMIC_RELEASE); ++} ++ ++static inline ++unsigned mt_ring_enqueue_burst(struct mbox_ring *mr, void *const *obj_table, unsigned n) ++{ ++ // return rte_ring_sp_enqueue_burst(mr->ring, obj_table, n, NULL); ++ n = mt_ring_enqueue_burst_start(mr, obj_table, n); ++ mt_ring_enqueue_burst_finish(mr); ++ return n; ++} ++ ++static inline ++unsigned mt_ring_dequeue_burst(struct mbox_ring *mr, void **obj_table, unsigned n) ++{ ++ // return rte_ring_sc_dequeue_burst(mr->ring, obj_table, n, NULL); ++ n = mt_ring_dequeue_burst_start(mr, obj_table, n); ++ mt_ring_dequeue_burst_finish(mr); ++ return n; ++} ++ ++static inline ++void *mt_ring_read_tail(const struct mbox_ring *mr) ++{ ++ return (void *)rte_atomic64_read((rte_atomic64_t *)&mr->mt_obj); ++} ++ ++static inline ++void mt_ring_push_tail(struct mbox_ring *mr, void *obj) ++{ ++ rte_atomic64_set(&mr->mt_obj, (uint64_t )obj); ++} ++ ++static inline ++void *mt_ring_pop_tail(struct mbox_ring *mr, void *expect) ++{ ++ if (expect == NULL) { ++ 
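++        /* expect == NULL means "take whatever tail object is pending":
++         * atomically swap in NULL and hand the old value back. A non-NULL
++         * expect only claims the tail if it is still exactly that object
++         * (see the cmpset below), so a tail replaced by another thread in
++         * the meantime is left untouched. */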
expect = (void *)rte_atomic64_exchange((volatile uint64_t *)&mr->mt_obj.cnt, ++ (uint64_t)NULL); ++ return expect; ++ } ++ ++ int ret = rte_atomic64_cmpset((volatile uint64_t *)&mr->mt_obj.cnt, ++ (uint64_t)expect, (uint64_t)NULL); ++ if (ret == 0) /* mt_obj != expect, cmpset failed */ ++ return NULL; ++ return expect; ++} ++ ++static inline ++void mt_ring_ops_init(struct mbox_ring_ops *ops) ++{ ++ ops->create = mt_ring_create; ++ ops->destroy = mt_ring_destroy; ++ ++ ops->get_capacity = mt_ring_get_capacity; ++ ops->count = mt_ring_count; ++ ops->free_count = mt_ring_free_count; ++ ++ ops->enqueue_burst = mt_ring_enqueue_burst; ++ ops->dequeue_burst = mt_ring_dequeue_burst; ++ ++ ops->recv_count = mt_ring_count; ++ ops->recv_start_burst = mt_ring_dequeue_burst_start; ++ ops->recv_finish_burst = mt_ring_dequeue_burst_finish; ++ ++ ops->read_tail = mt_ring_read_tail; ++ ops->push_tail = mt_ring_push_tail; ++ ops->pop_tail = mt_ring_pop_tail; ++} ++ ++/* multi thread & peek */ ++static inline ++int pk_ring_create(struct mbox_ring *mr, const char *name, unsigned count) ++{ ++ return 0; ++} ++ ++static inline ++void pk_ring_destroy(struct mbox_ring *mr) ++{ ++ void *obj; ++ while (mr->ops->recv_start_burst(mr, &obj, 1) > 0) { ++ mr->ops->recv_finish_burst(mr); ++ } ++ return; ++} ++ ++extern void sockio_peek_recv_free(struct mbox_ring *mr, unsigned n); ++static inline ++unsigned pk_ring_enqueue_burst(struct mbox_ring *mr, void *const *obj_table, unsigned n) ++{ ++ n = gazelle_ring_sp_enqueue(mr->ring, obj_table, n); ++ if (mr->flags & MBOX_FLAG_RECV) ++ sockio_peek_recv_free(mr, n); ++ return n; ++} ++ ++static inline ++unsigned pk_ring_dequeue_burst(struct mbox_ring *mr, void **obj_table, unsigned n) ++{ ++ return gazelle_ring_sc_dequeue(mr->ring, obj_table, n); ++} ++ ++static inline ++unsigned pk_ring_peek_start_burst(struct mbox_ring *mr, void **obj_table, unsigned n) ++{ ++ return gazelle_ring_read(mr->ring, obj_table, n); ++} ++static inline ++void pk_ring_peek_finish_burst(struct mbox_ring *mr) ++{ ++ gazelle_ring_read_over(mr->ring); ++} ++ ++static inline ++unsigned pk_ring_get_capacity(const struct mbox_ring *mr) ++{ ++ return mr->ring->capacity; ++} ++static inline ++unsigned pk_ring_count(const struct mbox_ring *mr) ++{ ++ return gazelle_ring_count(mr->ring); ++} ++static inline ++unsigned pk_ring_free_count(const struct mbox_ring *mr) ++{ ++ return gazelle_ring_free_count(mr->ring); ++} ++ ++static inline ++unsigned pk_ring_peek_start_count(const struct mbox_ring *mr) ++{ ++ return gazelle_ring_readable_count(mr->ring); ++} ++static inline ++unsigned pk_ring_peek_finish_count(const struct mbox_ring *mr) ++{ ++ return gazelle_ring_readover_count(mr->ring); ++} ++ ++static inline ++void pk_ring_ops_init(struct mbox_ring_ops *ops) ++{ ++ ops->create = pk_ring_create; ++ ops->destroy = pk_ring_destroy; ++ ++ ops->get_capacity = pk_ring_get_capacity; ++ ops->count = pk_ring_count; ++ ops->free_count = pk_ring_free_count; ++ ++ ops->enqueue_burst = pk_ring_enqueue_burst; ++ ops->dequeue_burst = pk_ring_dequeue_burst; ++ ++ ops->recv_count = pk_ring_peek_start_count; ++ ops->recv_start_burst = pk_ring_peek_start_burst; ++ ops->recv_finish_burst = pk_ring_peek_finish_burst; ++ ++ ops->read_tail = st_ring_read_tail; ++ ops->push_tail = st_ring_push_tail; ++ ops->pop_tail = st_ring_pop_tail; ++} ++ ++static inline ++void mbox_ring_ops_init(void) ++{ ++ st_ring_ops_init(&g_mbox_rtc_default_ops); ++ ++ mt_ring_ops_init(&g_mbox_rtw_append_ops); ++ mt_ring_ops_init(&g_mbox_rtw_default_ops); ++ 
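++    /* ops wiring recap: rtc (run-to-completion) mode uses the
++     * single-thread ops; in rtw mode, mt_ring_create() picks the append
++     * ops only for TCP send mboxes and the peek ops for recv mboxes on
++     * non-XDP NICs. */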
/* rtw udp don't need to append data.*/ ++ g_mbox_rtw_default_ops.read_tail = st_ring_read_tail; ++ g_mbox_rtw_default_ops.pop_tail = st_ring_pop_tail; ++ g_mbox_rtw_default_ops.push_tail = st_ring_push_tail; ++ ++ pk_ring_ops_init(&g_mbox_rtw_peek_ops); ++ ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ g_mbox_default_ops = &g_mbox_rtc_default_ops; ++ } else { ++ g_mbox_default_ops = &g_mbox_rtw_default_ops; ++ } ++} ++ ++#endif /* __MBOX_RING_H__ */ +-- +2.33.0 + diff --git a/0329-socket-adapt-to-tcp-and-udp.patch b/0329-socket-adapt-to-tcp-and-udp.patch new file mode 100644 index 0000000..cc1d8e7 --- /dev/null +++ b/0329-socket-adapt-to-tcp-and-udp.patch @@ -0,0 +1,5305 @@ +From 3413b74afd91914682fc494defe17d057db8e2d2 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 21 Mar 2025 17:05:48 +0800 +Subject: [PATCH] socket: adapt to tcp and udp + +Signed-off-by: Lemmy Huang +--- + src/common/dpdk_common.h | 11 +- + src/common/gazelle_dfx_msg.h | 30 +- + src/common/gazelle_opt.h | 6 - + src/common/gazelle_reg_msg.h | 1 - + src/lstack/api/lstack_rtc_api.c | 46 - + src/lstack/api/lstack_rtw_api.c | 500 ------- + ...lstack_dummy_api.c => lstack_sock_dummy.c} | 2 +- + src/lstack/api/lstack_sockctl.c | 25 +- + src/lstack/api/lstack_wrap.c | 69 +- + src/lstack/core/lstack_cfg.c | 47 +- + src/lstack/core/lstack_control_plane.c | 38 +- + src/lstack/core/lstack_dpdk.c | 174 +-- + src/lstack/core/lstack_lwip.c | 1286 ----------------- + src/lstack/core/lstack_mempool.c | 23 +- + src/lstack/core/lstack_preload.c | 11 +- + src/lstack/core/lstack_protocol_stack.c | 63 +- + src/lstack/core/lstack_stack_stat.c | 122 +- + src/lstack/core/lstack_thread_rpc.c | 821 ++--------- + src/lstack/core/lstack_virtio.c | 3 +- + src/lstack/core/lstack_wait.c | 26 +- + src/lstack/core/same_node.c | 22 +- + src/lstack/include/lstack_cfg.h | 6 +- + src/lstack/include/lstack_dpdk.h | 20 +- + src/lstack/include/lstack_dummy_api.h | 23 - + src/lstack/include/lstack_lwip.h | 62 - + src/lstack/include/lstack_mempool.h | 2 +- + src/lstack/include/lstack_protocol_stack.h | 11 - + src/lstack/include/lstack_rtc_api.h | 25 - + .../{lstack_rtw_api.h => lstack_sock_dummy.h} | 10 +- + src/lstack/include/lstack_stack_stat.h | 16 +- + src/lstack/include/lstack_thread_rpc.h | 63 +- + src/lstack/include/mbox_ring.h | 2 +- + src/lstack/include/same_node.h | 1 - + src/lstack/netif/lstack_ethdev.c | 76 +- + src/lstack/netif/lstack_fault_inject.c | 20 +- + src/lstack/netif/lstack_flow.c | 15 +- + src/lstack/netif/lstack_vdev.c | 11 +- + src/ltran/ltran_dfx.c | 34 +- + src/ltran/ltran_forward.c | 2 +- + 39 files changed, 492 insertions(+), 3233 deletions(-) + delete mode 100644 src/lstack/api/lstack_rtc_api.c + delete mode 100644 src/lstack/api/lstack_rtw_api.c + rename src/lstack/api/{lstack_dummy_api.c => lstack_sock_dummy.c} (98%) + delete mode 100644 src/lstack/core/lstack_lwip.c + delete mode 100644 src/lstack/include/lstack_dummy_api.h + delete mode 100644 src/lstack/include/lstack_lwip.h + delete mode 100644 src/lstack/include/lstack_rtc_api.h + rename src/lstack/include/{lstack_rtw_api.h => lstack_sock_dummy.h} (77%) + +diff --git a/src/common/dpdk_common.h b/src/common/dpdk_common.h +index 8609216..cff193c 100644 +--- a/src/common/dpdk_common.h ++++ b/src/common/dpdk_common.h +@@ -40,8 +40,7 @@ struct latency_timestamp { + struct mbuf_private { + /* struct pbuf_custom must at first */ + struct pbuf_custom pc; +- /* don't use `struct tcp_seg` directly to avoid conflicts by include lwip tcp header */ +- char ts[32]; // 32 > 
sizeof(struct tcp_seg) ++ int stack_id; /* the stack to which buf belongs */ + struct latency_timestamp lt; + }; + +@@ -49,13 +48,9 @@ static __rte_always_inline struct mbuf_private *mbuf_to_private(const struct rte + { + return (struct mbuf_private *)RTE_PTR_ADD(m, sizeof(struct rte_mbuf)); + } +-static __rte_always_inline struct pbuf_custom *mbuf_to_pbuf(const struct rte_mbuf *m) ++static __rte_always_inline struct pbuf *mbuf_to_pbuf(const struct rte_mbuf *m) + { +- return &mbuf_to_private(m)->pc; +-} +-static __rte_always_inline struct rte_mbuf *pbuf_to_mbuf(const struct pbuf *p) +-{ +- return (struct rte_mbuf *)RTE_PTR_SUB(p, sizeof(struct rte_mbuf)); ++ return &mbuf_to_private(m)->pc.pbuf; + } + static __rte_always_inline struct mbuf_private *pbuf_to_private(const struct pbuf *p) + { +diff --git a/src/common/gazelle_dfx_msg.h b/src/common/gazelle_dfx_msg.h +index 1a89e65..2c6462d 100644 +--- a/src/common/gazelle_dfx_msg.h ++++ b/src/common/gazelle_dfx_msg.h +@@ -72,10 +72,9 @@ enum GAZELLE_STAT_MODE { + + enum GAZELLE_LATENCY_TYPE { + GAZELLE_LATENCY_INTO_MBOX, // t0 -> t1 +- GAZELLE_LATENCY_READ_LWIP, // t1 -> t2 +- GAZELLE_LATENCY_READ_APP_CALL, // t2 -> t3 +- GAZELLE_LATENCY_READ_LSTACK, // t3 -> t4 +- GAZELLE_LATENCY_READ_MAX, // t0 -> t4 ++ GAZELLE_LATENCY_READ_APP_CALL, // t1 -> t2 ++ GAZELLE_LATENCY_READ_LSTACK, // t2 -> t3 ++ GAZELLE_LATENCY_READ_MAX, // t0 -> t3 + + GAZELLE_LATENCY_WRITE_INTO_RING, // t0 -> t1 + GAZELLE_LATENCY_WRITE_LWIP, // t1 -> t2 +@@ -123,12 +122,12 @@ struct gazelle_wakeup_stat { + }; + + struct gazelle_stack_aggregate_stats { +- /* 0: RX, 1: TX, 2: APP_TX */ +- uint32_t size_1_64[3]; +- uint32_t size_65_512[3]; +- uint32_t size_513_1460[3]; +- uint32_t size_1461_8192[3]; +- uint32_t size_8193_max[3]; ++ /* 0: RX, 1: TX */ ++ uint32_t size_1_64[2]; ++ uint32_t size_65_512[2]; ++ uint32_t size_513_1460[2]; ++ uint32_t size_1461_8192[2]; ++ uint32_t size_8193_max[2]; + + uint64_t rx_bytes; + uint64_t tx_bytes; +@@ -138,7 +137,6 @@ struct gazelle_stat_pkts { + uint16_t conn_num; + uint32_t mbufpool_avail_cnt; + uint64_t call_msg_cnt; +- uint64_t recv_list_cnt; + uint64_t call_alloc_fail; + struct gazelle_stack_stat stack_stat; + struct gazelle_wakeup_stat wakeup_stat; +@@ -257,15 +255,17 @@ struct gazelle_stat_lstack_proto { + + struct gazelle_stat_lstack_conn_info { + uint32_t state; ++ uint32_t tcp_sub_state; ++ + gz_addr_t rip; + gz_addr_t lip; + uint16_t r_port; + uint16_t l_port; + uint32_t in_send; +- uint32_t recv_cnt; +- uint32_t send_ring_cnt; +- uint32_t recv_ring_cnt; +- uint32_t tcp_sub_state; ++ uint32_t recvmbox_cnt; ++ uint16_t recvmbox_tail; ++ uint32_t sendmbox_cnt; ++ uint16_t sendmbox_tail; + + uint32_t cwn; + uint32_t rcv_wnd; +diff --git a/src/common/gazelle_opt.h b/src/common/gazelle_opt.h +index 4406831..86eb874 100644 +--- a/src/common/gazelle_opt.h ++++ b/src/common/gazelle_opt.h +@@ -56,12 +56,6 @@ + #define STACK_THREAD_DEFAULT 4 + #define STACK_NIC_READ_DEFAULT 128 + +-#define MTU_DEFAULT_DATA_LEN 1460 +-#define VLAN_HEAD_LEN 4 +-#define IPV6_EXTRA_HEAD_LEN 20 +-#define MBUF_MAX_DATA_LEN (MTU_DEFAULT_DATA_LEN - VLAN_HEAD_LEN - IPV6_EXTRA_HEAD_LEN) +- +-#define GAZELLE_UDP_PKGLEN_MAX (65535 - IP_HLEN - UDP_HLEN) + + /* total:33 client, index 32 is invaild client */ + #define GAZELLE_CLIENT_NUM_ALL 33 +diff --git a/src/common/gazelle_reg_msg.h b/src/common/gazelle_reg_msg.h +index ecd1e35..f573390 100644 +--- a/src/common/gazelle_reg_msg.h ++++ b/src/common/gazelle_reg_msg.h +@@ -33,7 +33,6 @@ + #define OPT_VDEV "--vdev" + #define 
VDEV_ARG_IFACE "iface" + +-#define GAZELLE_MAX_NUMA_NODES 8 + #define SOCKET_MEM_STRLEN (GAZELLE_MAX_NUMA_NODES * 10) + + /* types for msg from lstack to ltran */ +diff --git a/src/lstack/api/lstack_rtc_api.c b/src/lstack/api/lstack_rtc_api.c +deleted file mode 100644 +index 4a962e1..0000000 +--- a/src/lstack/api/lstack_rtc_api.c ++++ /dev/null +@@ -1,46 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. +-*/ +- +-#include +-#include +- +-#include "lstack_log.h" +-#include "lstack_rtc_api.h" +- +- +-void rtc_api_init(posix_api_t *api) +-{ +- api->close_fn = lwip_close; +- api->shutdown_fn = lwip_shutdown; +- api->socket_fn = lwip_socket; +- api->accept_fn = lwip_accept; +- api->accept4_fn = lwip_accept4; +- api->bind_fn = lwip_bind; +- api->listen_fn = lwip_listen; +- api->connect_fn = lwip_connect; +- +- api->setsockopt_fn = lwip_setsockopt; +- api->getsockopt_fn = lwip_getsockopt; +- api->getpeername_fn = lwip_getpeername; +- api->getsockname_fn = lwip_getsockname; +- +- api->read_fn = lwip_read; +- api->readv_fn = lwip_readv; +- api->write_fn = lwip_write; +- api->writev_fn = lwip_writev; +- api->recv_fn = lwip_recv; +- api->send_fn = lwip_send; +- api->recvmsg_fn = lwip_recvmsg; +- api->sendmsg_fn = lwip_sendmsg; +- api->recvfrom_fn = lwip_recvfrom; +- api->sendto_fn = lwip_sendto; +-} +diff --git a/src/lstack/api/lstack_rtw_api.c b/src/lstack/api/lstack_rtw_api.c +deleted file mode 100644 +index 7b8dec2..0000000 +--- a/src/lstack/api/lstack_rtw_api.c ++++ /dev/null +@@ -1,500 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. 
+-*/ +- +-#include +-#include +- +-#include "common/gazelle_base_func.h" +-#include "lstack_log.h" +-#include "lstack_cfg.h" +-#include "lstack_thread_rpc.h" +-#include "lstack_protocol_stack.h" +-#include "lstack_lwip.h" +-#include "lstack_rtw_api.h" +-#include "lstack_epoll.h" +-#include "lstack_wait.h" +- +-/* when fd is listenfd, listenfd of all protocol stack thread will be closed */ +-static int stack_broadcast_close(int fd) +-{ +- int ret = 0; +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- +- do { +- if (POSIX_IS_CLOSED(sock)) { +- break; +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- if (stack == NULL || rpc_call_close(&stack->rpc_queue, fd)) { +- ret = -1; +- } +- +- sock = sock->listen_next; +- fd = sock->conn->callback_arg.socket; +- } while (1); +- +- return ret; +-} +- +-static int stack_broadcast_shutdown(int fd, int how) +-{ +- int32_t ret = 0; +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- +- do { +- if (POSIX_IS_CLOSED(sock)) { +- break; +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- if (stack == NULL || rpc_call_shutdown(&stack->rpc_queue, fd, how)) { +- ret = -1; +- } +- +- sock = sock->listen_next; +- fd = sock->conn->callback_arg.socket; +- } while (1); +- +- return ret; +-} +- +-/* choice one stack bind */ +-static int stack_single_bind(int fd, const struct sockaddr *name, socklen_t namelen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_bind(&stack->rpc_queue, fd, name, namelen); +-} +- +-/* bind sync to all protocol stack thread, so that any protocol stack thread can build connect */ +-static int stack_broadcast_bind(int fd, const struct sockaddr *name, socklen_t namelen) +-{ +- struct protocol_stack *cur_stack; +- struct protocol_stack *stack = NULL; +- int ret, clone_fd; +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); +- GAZELLE_RETURN(EBADF); +- } +- +- cur_stack = get_protocol_stack_by_id(sock->stack_id); +- ret = rpc_call_bind(&cur_stack->rpc_queue, fd, name, namelen); +- if (ret < 0) { +- close(fd); +- return ret; +- } +- +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- for (int i = 0; i < stack_group->stack_num; ++i) { +- stack = stack_group->stacks[i]; +- if (stack != cur_stack) { +- clone_fd = rpc_call_shadow_fd(&stack->rpc_queue, fd, name, namelen); +- if (clone_fd < 0) { +- stack_broadcast_close(fd); +- return clone_fd; +- } +- } +- } +- return 0; +-} +- +-static struct lwip_sock *get_min_accept_sock(int fd) +-{ +- struct lwip_sock *sock; +- struct lwip_sock *min_sock = NULL; +- +- for (sock = lwip_get_socket(fd); sock != NULL; sock = sock->listen_next) { +- if (!netconn_is_nonblocking(sock->conn)) { +- /* init all sock sk_wait */ +- if (unlikely(sock->sk_wait == NULL) || sock->sk_wait->type == WAIT_CLOSE) { +- sock->sk_wait = poll_construct_wait(0); +- } +- if (!(sock->sk_wait->type & WAIT_BLOCK)) { +- sock->sk_wait->type |= WAIT_BLOCK; +- } +- } +- +- if (!sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0)) { +- continue; +- } +- +- if (min_sock == NULL || +- get_protocol_stack_by_id(min_sock->stack_id)->conn_num > 
get_protocol_stack_by_id(sock->stack_id)->conn_num) { +- min_sock = sock; +- } +- } +- +- return min_sock; +-} +- +-/* ergodic the protocol stack thread to find the connection, because all threads are listening */ +-static int stack_broadcast_accept4(int fd, struct sockaddr *addr, socklen_t *addrlen, int flags) +-{ +- int ret = -1; +- struct lwip_sock *min_sock = NULL; +- struct lwip_sock *sock = lwip_get_socket(fd); +- struct protocol_stack *stack = NULL; +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- +- min_sock = get_min_accept_sock(fd); +- if (min_sock == NULL) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { +- min_sock = get_min_accept_sock(fd); +- } +- } +- +- if (!POSIX_IS_CLOSED(min_sock)) { +- stack = get_protocol_stack_by_id(min_sock->stack_id); +- ret = rpc_call_accept(&stack->rpc_queue, min_sock->conn->callback_arg.socket, addr, addrlen, flags); +- } +- +- if (ret < 0) { +- errno = EAGAIN; +- } +- return ret; +-} +- +-static int stack_broadcast_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) +-{ +- return stack_broadcast_accept4(fd, addr, addrlen, 0); +-} +- +-/* choice one stack listen */ +-static int stack_single_listen(int fd, int backlog) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_listen(&stack->rpc_queue, fd, backlog); +-} +- +-/* listen sync to all protocol stack thread, so that any protocol stack thread can build connect */ +-static int stack_broadcast_listen(int fd, int backlog) +-{ +- typedef union sockaddr_union { +- struct sockaddr sa; +- struct sockaddr_in in; +- struct sockaddr_in6 in6; +- } sockaddr_t; +- +- struct protocol_stack *cur_stack; +- struct protocol_stack *stack = NULL; +- sockaddr_t addr; +- socklen_t addr_len = sizeof(addr); +- int ret, clone_fd; +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, %d get sock null or stack null\n", rte_gettid(), fd); +- GAZELLE_RETURN(EBADF); +- } +- +- cur_stack = get_protocol_stack_by_id(sock->stack_id); +- ret = rpc_call_getsockname(&cur_stack->rpc_queue, fd, (struct sockaddr *)&addr, &addr_len); +- if (ret != 0) { +- return ret; +- } +- +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +-#if GAZELLE_TCP_REUSE_IPPORT +- int min_conn_stk_idx = get_min_conn_stack(stack_group); +-#endif +- for (int32_t i = 0; i < stack_group->stack_num; ++i) { +- stack = stack_group->stacks[i]; +- if (stack != cur_stack) { +- clone_fd = rpc_call_shadow_fd(&stack->rpc_queue, fd, (struct sockaddr *)&addr, addr_len); +- if (clone_fd < 0) { +- stack_broadcast_close(fd); +- return clone_fd; +- } +- } else { +- clone_fd = fd; +- } +-#if GAZELLE_TCP_REUSE_IPPORT +- if (min_conn_stk_idx == i) { +- lwip_get_socket(clone_fd)->conn->is_master_fd = 1; +- } else { +- lwip_get_socket(clone_fd)->conn->is_master_fd = 0; +- } +-#endif +- ret = rpc_call_listen(&stack->rpc_queue, clone_fd, backlog); +- if (ret < 0) { +- stack_broadcast_close(fd); +- return ret; +- } +- } +- return 0; +-} +- +-static int rtw_socket(int domain, int type, int protocol) +-{ +- struct protocol_stack *stack = get_bind_protocol_stack(); +- if (stack == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- return rpc_call_socket(&stack->rpc_queue, domain, type, protocol); +-} +- +-static int rtw_accept(int s, struct sockaddr *addr, socklen_t *addrlen) +-{ +- return 
stack_broadcast_accept(s, addr, addrlen); +-} +- +-static int rtw_accept4(int s, struct sockaddr *addr, socklen_t *addrlen, int flags) +-{ +- return stack_broadcast_accept4(s, addr, addrlen, flags); +-} +- +-static int rtw_bind(int s, const struct sockaddr *name, socklen_t namelen) +-{ +- struct lwip_sock *sock = lwip_get_socket(s); +- +- if (NETCONN_IS_UDP(sock) && get_global_cfg_params()->listen_shadow) { +- return stack_broadcast_bind(s, name, namelen); +- } else { +- return stack_single_bind(s, name, namelen); +- } +-} +- +-static int rtw_listen(int s, int backlog) +-{ +- if (!get_global_cfg_params()->tuple_filter && +- !get_global_cfg_params()->listen_shadow) { +- return stack_single_listen(s, backlog); +- } else { +- return stack_broadcast_listen(s, backlog); +- } +-} +- +-static int rtw_connect(int s, const struct sockaddr *name, socklen_t namelen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_connect(&stack->rpc_queue, s, name, namelen); +-} +- +-static int rtw_setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_setsockopt(&stack->rpc_queue, s, level, optname, optval, optlen); +-} +- +-static int rtw_getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_getsockopt(&stack->rpc_queue, s, level, optname, optval, optlen); +-} +- +-static int rtw_getpeername(int s, struct sockaddr *name, socklen_t *namelen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_getpeername(&stack->rpc_queue, s, name, namelen); +-} +- +-static int rtw_getsockname(int s, struct sockaddr *name, socklen_t *namelen) +-{ +- struct protocol_stack *stack; +- struct lwip_sock *sock = lwip_get_socket(s); +- if (sock == NULL) { +- GAZELLE_RETURN(EBADF); +- } +- stack = get_protocol_stack_by_id(sock->stack_id); +- return rpc_call_getsockname(&stack->rpc_queue, s, name, namelen); +-} +- +-static ssize_t rtw_read(int s, void *mem, size_t len) +-{ +- return do_lwip_read_from_stack(s, mem, len, 0, NULL, NULL); +-} +- +-static ssize_t rtw_readv(int s, const struct iovec *iov, int iovcnt) +-{ +- struct msghdr msg; +- +- msg.msg_name = NULL; +- msg.msg_namelen = 0; +- msg.msg_iov = LWIP_CONST_CAST(struct iovec *, iov); +- msg.msg_iovlen = iovcnt; +- msg.msg_control = NULL; +- msg.msg_controllen = 0; +- msg.msg_flags = 0; +- return do_lwip_recvmsg_from_stack(s, &msg, 0); +-} +- +-static ssize_t rtw_write(int s, const void *mem, size_t size) +-{ +- return do_lwip_send_to_stack(s, mem, size, 0, NULL, 0); +-} +- +-static ssize_t rtw_writev(int s, const struct iovec *iov, int iovcnt) +-{ +- struct lwip_sock *sock = lwip_get_socket(s); +- struct msghdr msg; +- +- msg.msg_name = NULL; +- msg.msg_namelen = 0; +- msg.msg_iov = LWIP_CONST_CAST(struct iovec *, iov); +- msg.msg_iovlen = iovcnt; +- msg.msg_control = NULL; +- msg.msg_controllen = 0; +- 
msg.msg_flags = 0; +- return do_lwip_sendmsg_to_stack(sock, s, &msg, 0); +-} +- +-static ssize_t rtw_recv(int sockfd, void *buf, size_t len, int flags) +-{ +- return do_lwip_read_from_stack(sockfd, buf, len, flags, NULL, NULL); +-} +- +-static ssize_t rtw_send(int sockfd, const void *buf, size_t len, int flags) +-{ +- return do_lwip_send_to_stack(sockfd, buf, len, flags, NULL, 0); +-} +- +-static ssize_t rtw_recvmsg(int s, struct msghdr *message, int flags) +-{ +- return do_lwip_recvmsg_from_stack(s, message, flags); +-} +- +-static ssize_t rtw_sendmsg(int s, const struct msghdr *message, int flags) +-{ +- struct lwip_sock *sock = lwip_get_socket(s); +- return do_lwip_sendmsg_to_stack(sock, s, message, flags); +-} +- +-static ssize_t rtw_udp_recvfrom(int sockfd, void *buf, size_t len, int flags, +- struct sockaddr *addr, socklen_t *addrlen) +-{ +- struct lwip_sock *sock = lwip_get_socket(sockfd); +- int ret; +- +- while (1) { +- ret = do_lwip_read_from_stack(sockfd, buf, len, flags, addr, addrlen); +- if (ret >= 0) { +- return ret; +- } +- if (ret < 0 && errno != EAGAIN) { +- return -1; +- } +- sock = sock->listen_next; +- if (!POSIX_IS_CLOSED(sock)) { +- sockfd = sock->conn->callback_arg.socket; +- } else { +- if (sock == NULL) { +- errno = EAGAIN; +- return -1; +- } else { +- errno = ENOTCONN; +- return -1; +- } +- } +- } +-} +- +-static inline ssize_t rtw_tcp_recvfrom(int sockfd, void *buf, size_t len, int flags, +- struct sockaddr *addr, socklen_t *addrlen) +-{ +- return do_lwip_read_from_stack(sockfd, buf, len, flags, addr, addrlen); +-} +- +- +-static ssize_t rtw_recvfrom(int sockfd, void *buf, size_t len, int flags, +- struct sockaddr *addr, socklen_t *addrlen) +-{ +- struct lwip_sock *sock = lwip_get_socket(sockfd); +- if (NETCONN_IS_UDP(sock)) { +- return rtw_udp_recvfrom(sockfd, buf, len, flags, addr, addrlen); +- } else { +- return rtw_tcp_recvfrom(sockfd, buf, len, flags, addr, addrlen); +- } +-} +- +-static ssize_t rtw_sendto(int sockfd, const void *buf, size_t len, int flags, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- return do_lwip_send_to_stack(sockfd, buf, len, flags, addr, addrlen); +-} +- +-static int rtw_close(int s) +-{ +- return stack_broadcast_close(s); +-} +- +-static int rtw_shutdown(int fd, int how) +-{ +- return stack_broadcast_shutdown(fd, how); +-} +- +-void rtw_api_init(posix_api_t *api) +-{ +- api->close_fn = rtw_close; +- api->shutdown_fn = rtw_shutdown; +- api->socket_fn = rtw_socket; +- api->accept_fn = rtw_accept; +- api->accept4_fn = rtw_accept4; +- api->bind_fn = rtw_bind; +- api->listen_fn = rtw_listen; +- api->connect_fn = rtw_connect; +- +- api->setsockopt_fn = rtw_setsockopt; +- api->getsockopt_fn = rtw_getsockopt; +- api->getpeername_fn = rtw_getpeername; +- api->getsockname_fn = rtw_getsockname; +- +- api->read_fn = rtw_read; +- api->readv_fn = rtw_readv; +- api->write_fn = rtw_write; +- api->writev_fn = rtw_writev; +- api->recv_fn = rtw_recv; +- api->send_fn = rtw_send; +- api->recvmsg_fn = rtw_recvmsg; +- api->sendmsg_fn = rtw_sendmsg; +- api->recvfrom_fn = rtw_recvfrom; +- api->sendto_fn = rtw_sendto; +-} +diff --git a/src/lstack/api/lstack_dummy_api.c b/src/lstack/api/lstack_sock_dummy.c +similarity index 98% +rename from src/lstack/api/lstack_dummy_api.c +rename to src/lstack/api/lstack_sock_dummy.c +index 004a3aa..ce046e3 100644 +--- a/src/lstack/api/lstack_dummy_api.c ++++ b/src/lstack/api/lstack_sock_dummy.c +@@ -85,7 +85,7 @@ static ssize_t dummy_sendto(int sockfd, const void *buf, size_t len, int flags, + return 
dummy_exit(); + } + +-void dummy_api_init(posix_api_t *api) ++void sock_dummy_api_init(posix_api_t *api) + { + api->socket_fn = dummy_socket; + api->send_fn = dummy_send; +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index 71310b7..7da7473 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -27,9 +27,6 @@ + static void callback_getpeername(struct rpc_msg *msg) + { + msg->result = lwip_getpeername(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); +- } + } + + static void callback_getsockname(struct rpc_msg *msg) +@@ -191,7 +188,7 @@ static void callback_close(struct rpc_msg *msg) + + if (sockio_mbox_pending(sock)) { + rpc_queue *queue = &get_protocol_stack_by_id(sock->stack_id)->rpc_queue; +- rpc_async_call(queue, msg, RPC_MSG_FREE | RPC_MSG_RECALL); /* until stack_send recall finish */ ++ rpc_async_call(queue, msg, RPC_MSG_RECALL); /* until stack_send recall finish */ + return; + } + +@@ -209,7 +206,7 @@ static void callback_shutdown(struct rpc_msg *msg) + + if (sockio_mbox_pending(sock)) { + rpc_queue *queue = &get_protocol_stack_by_id(sock->stack_id)->rpc_queue; +- rpc_async_call(queue, msg, RPC_MSG_FREE | RPC_MSG_RECALL); ++ rpc_async_call(queue, msg, RPC_MSG_RECALL); + return; + } + +@@ -458,12 +455,15 @@ static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, s + /* for lwip nonblock connected callback */ + void do_lwip_connected_callback(int fd) + { ++ bool has_kernel; + struct lwip_sock *sock = lwip_get_socket(fd); + if (POSIX_IS_CLOSED(sock)) { + return; + } + +- if (POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { ++ has_kernel = POSIX_HAS_TYPE(sock, POSIX_KERNEL); ++ POSIX_SET_TYPE(sock, POSIX_LWIP); ++ if (has_kernel) { + /* delete kernel event */ + if (sock->sk_wait != NULL) { + posix_api->epoll_ctl_fn(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL); +@@ -471,10 +471,7 @@ void do_lwip_connected_callback(int fd) + /* shutdown kernel connect, do_connect() has tried both kernel and lwip. 
*/ + posix_api->shutdown_fn(fd, SHUT_RDWR); + } +- +- POSIX_SET_TYPE(sock, POSIX_LWIP); +- +- API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); ++ return; + } + + /* when fd is listenfd, listenfd of all protocol stack thread will be closed */ +@@ -625,10 +622,7 @@ static int stack_broadcast_accept4(int fd, struct sockaddr *addr, socklen_t *add + + static int stack_broadcast_accept(int fd, struct sockaddr *addr, socklen_t *addrlen) + { +- if (get_global_cfg_params()->nonblock_mode) +- return stack_broadcast_accept4(fd, addr, addrlen, SOCK_NONBLOCK); +- else +- return stack_broadcast_accept4(fd, addr, addrlen, 0); ++ return stack_broadcast_accept4(fd, addr, addrlen, 0); + } + + /* choice one stack listen */ +@@ -675,9 +669,6 @@ static int stack_broadcast_listen(int fd, int backlog) + + for (int32_t i = 0; i < stack_group->stack_num; ++i) { + stack = stack_group->stacks[i]; +- if (get_global_cfg_params()->seperate_send_recv && stack->is_send_thread) { +- continue; +- } + if (stack != cur_stack) { + clone_fd = rpc_call_shadow_fd(stack->stack_idx, fd, (struct sockaddr *)&addr, addr_len); + if (clone_fd < 0) { +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 5869d6b..e22937f 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -24,13 +24,13 @@ + #include "common/gazelle_base_func.h" + #include "lstack_log.h" + #include "lstack_cfg.h" +-#include "lstack_lwip.h" + #include "lstack_preload.h" + #include "lstack_unistd.h" + #include "lstack_epoll.h" +-#include "lstack_rtc_api.h" +-#include "lstack_rtw_api.h" +-#include "lstack_dummy_api.h" ++#include "lstack_sockctl.h" ++#include "lstack_sockio.h" ++#include "lstack_sock_dummy.h" ++#include "mbox_ring.h" + + #ifndef SOL_XDP + #define SOL_XDP 283 /* same as define in bits/socket.h */ +@@ -47,17 +47,19 @@ void wrap_api_init(void) + g_wrap_api = &g_wrap_api_value; + + if (get_global_cfg_params()->stack_mode_rtc) { +- rtc_api_init(g_wrap_api); ++ sockctl_rtc_api_init(g_wrap_api); + } else { +- rtw_api_init(g_wrap_api); ++ sockctl_rtw_api_init(g_wrap_api); + } + + epoll_api_init(g_wrap_api); ++ sockio_ops_init(); ++ mbox_ring_ops_init(); + } + + void wrap_api_exit(void) + { +- dummy_api_init(g_wrap_api); ++ sock_dummy_api_init(g_wrap_api); + } + + static inline int32_t do_accept(int32_t s, struct sockaddr *addr, socklen_t *addrlen) +@@ -517,7 +519,7 @@ static inline int32_t do_socket(int32_t domain, int32_t type, int32_t protocol) + + if (get_global_cfg_params()->stack_mode_rtc) { + if (stack_setup_app_thread() != 0) { +- LSTACK_EXIT(1, "stack_setup_app_thread failed\n"); ++ exit(1); + } + } + +@@ -536,30 +538,16 @@ static inline int32_t do_socket(int32_t domain, int32_t type, int32_t protocol) + + static inline ssize_t do_recv(int32_t sockfd, void *buf, size_t len, int32_t flags) + { +- if (buf == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- if (len == 0) { +- return 0; +- } +- + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return g_wrap_api->recv_fn(sockfd, buf, len, flags); ++ return sockio_recv(sockfd, buf, len, flags); + } + return posix_api->recv_fn(sockfd, buf, len, flags); + } + + static inline ssize_t do_read(int32_t s, void *mem, size_t len) + { +- if (mem == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- if (len == 0) { +- return 0; +- } +- + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->read_fn(s, mem, len); ++ return sockio_read(s, mem, len); + } + return posix_api->read_fn(s, mem, len); + } +@@ -567,7 +555,7 @@ static 
inline ssize_t do_read(int32_t s, void *mem, size_t len) + static inline ssize_t do_readv(int32_t s, const struct iovec *iov, int iovcnt) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->readv_fn(s, iov, iovcnt); ++ return sockio_readv(s, iov, iovcnt); + } + return posix_api->readv_fn(s, iov, iovcnt); + } +@@ -575,7 +563,7 @@ static inline ssize_t do_readv(int32_t s, const struct iovec *iov, int iovcnt) + static inline ssize_t do_send(int32_t sockfd, const void *buf, size_t len, int32_t flags) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return g_wrap_api->send_fn(sockfd, buf, len, flags); ++ return sockio_send(sockfd, buf, len, flags); + } + return posix_api->send_fn(sockfd, buf, len, flags); + } +@@ -583,7 +571,7 @@ static inline ssize_t do_send(int32_t sockfd, const void *buf, size_t len, int32 + static inline ssize_t do_write(int32_t s, const void *mem, size_t size) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->write_fn(s, mem, size); ++ return sockio_write(s, mem, size); + } + return posix_api->write_fn(s, mem, size); + } +@@ -591,31 +579,23 @@ static inline ssize_t do_write(int32_t s, const void *mem, size_t size) + static inline ssize_t do_writev(int32_t s, const struct iovec *iov, int iovcnt) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->writev_fn(s, iov, iovcnt); ++ return sockio_writev(s, iov, iovcnt); + } + return posix_api->writev_fn(s, iov, iovcnt); + } + + static inline ssize_t do_recvmsg(int32_t s, struct msghdr *message, int32_t flags) + { +- if (message == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->recvmsg_fn(s, message, flags); ++ return sockio_recvmsg(s, message, flags); + } + return posix_api->recvmsg_fn(s, message, flags); + } + + static inline ssize_t do_sendmsg(int32_t s, const struct msghdr *message, int32_t flags) + { +- if (message == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return g_wrap_api->sendmsg_fn(s, message, flags); ++ return sockio_sendmsg(s, message, flags); + } + return posix_api->sendmsg_fn(s, message, flags); + } +@@ -623,15 +603,8 @@ static inline ssize_t do_sendmsg(int32_t s, const struct msghdr *message, int32_ + static inline ssize_t do_recvfrom(int32_t sockfd, void *buf, size_t len, int32_t flags, + struct sockaddr *addr, socklen_t *addrlen) + { +- if (buf == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- if (len == 0) { +- return 0; +- } +- + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return g_wrap_api->recvfrom_fn(sockfd, buf, len, flags, addr, addrlen); ++ return sockio_recvfrom(sockfd, buf, len, flags, addr, addrlen); + } + return posix_api->recvfrom_fn(sockfd, buf, len, flags, addr, addrlen); + } +@@ -640,7 +613,7 @@ static inline ssize_t do_sendto(int32_t sockfd, const void *buf, size_t len, int + const struct sockaddr *addr, socklen_t addrlen) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return g_wrap_api->sendto_fn(sockfd, buf, len, flags, addr, addrlen); ++ return sockio_sendto(sockfd, buf, len, flags, addr, addrlen); + } + return posix_api->sendto_fn(sockfd, buf, len, flags, addr, addrlen); + } +@@ -688,7 +661,7 @@ static inline int do_epoll_create1(int flags) + + if (get_global_cfg_params()->stack_mode_rtc) { + if (stack_setup_app_thread() != 0) { +- 
LSTACK_EXIT(1, "stack_setup_app_thread failed\n"); ++ exit(1); + } + } + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 04ceb89..432e4db 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -28,9 +29,7 @@ + #include + + #include +-#include +-#include +-#include ++#include + + #include "common/gazelle_reg_msg.h" + #include "common/gazelle_base_func.h" +@@ -65,13 +64,10 @@ static int32_t parse_kni_switch(void); + static int32_t parse_listen_shadow(void); + static int32_t parse_main_thread_affinity(void); + static int32_t parse_unix_prefix(void); +-static int32_t parse_read_connect_number(void); + static int32_t parse_rpc_number(void); + static int32_t parse_nic_read_number(void); + static int32_t parse_tcp_conn_count(void); + static int32_t parse_mbuf_count_per_conn(void); +-static int32_t parse_send_ring_size(void); +-static int32_t parse_recv_ring_size(void); + static int32_t parse_num_process(void); + static int32_t parse_process_numa(void); + static int32_t parse_process_index(void); +@@ -123,8 +119,6 @@ static struct config_vector_t g_config_tbl[] = { + { "mbuf_count_per_conn", parse_mbuf_count_per_conn }, + { "nic_rxqueue_size", parse_nic_rxqueue_size}, + { "nic_txqueue_size", parse_nic_txqueue_size}, +- { "send_ring_size", parse_send_ring_size }, +- { "recv_ring_size", parse_recv_ring_size }, + { "rpc_msg_max", parse_rpc_msg_max }, + { "app_bind_numa", parse_app_bind_numa }, + { "stack_num", parse_stack_num }, +@@ -142,7 +136,6 @@ static struct config_vector_t g_config_tbl[] = { + { "app_exclude_cpus", parse_app_exclude_cpus }, + { "main_thread_affinity", parse_main_thread_affinity }, + { "unix_prefix", parse_unix_prefix }, +- { "read_connect_number", parse_read_connect_number }, + { "rpc_number", parse_rpc_number }, + { "nic_read_number", parse_nic_read_number }, + { "num_process", parse_num_process }, +@@ -1012,22 +1005,6 @@ static int32_t parse_tcp_conn_count(void) + return ret; + } + +-static int32_t parse_send_ring_size(void) +-{ +- int32_t ret; +- /* send ring size default value is 32 */ +- PARSE_ARG(g_config_params.send_ring_size, "send_ring_size", 32, 1, SOCK_SEND_RING_SIZE_MAX, ret); +- return ret; +-} +- +-static int32_t parse_recv_ring_size(void) +-{ +- int32_t ret; +- /* recv ring size default value is 128 */ +- PARSE_ARG(g_config_params.recv_ring_size, "recv_ring_size", 128, 1, SOCK_RECV_RING_SIZE_MAX, ret); +- return ret; +-} +- + static int32_t parse_mbuf_count_per_conn(void) + { + int32_t ret; +@@ -1036,13 +1013,6 @@ static int32_t parse_mbuf_count_per_conn(void) + return ret; + } + +-static int32_t parse_read_connect_number(void) +-{ +- int32_t ret; +- PARSE_ARG(g_config_params.read_connect_number, "read_connect_number", +- STACK_THREAD_DEFAULT, 1, INT32_MAX, ret); +- return ret; +-} + + static int32_t parse_rpc_number(void) + { +@@ -1126,6 +1096,16 @@ static int32_t parse_conf_file(const char *path) + return 0; + } + ++static void lwip_conf_init(void) ++{ ++ const struct cfg_params *cfg = get_global_cfg_params(); ++ ++ struct sys_config sys_conf = { ++ .rtc_mode = cfg->stack_mode_rtc, ++ }; ++ sys_config_init(&sys_conf); ++} ++ + int32_t cfg_init(void) + { + int32_t ret; +@@ -1145,8 +1125,9 @@ int32_t cfg_init(void) + } + + ret = parse_conf_file(config_file); +- + free(config_file); ++ ++ lwip_conf_init(); + return ret; + } + +diff --git a/src/lstack/core/lstack_control_plane.c 
b/src/lstack/core/lstack_control_plane.c
+index bf34693..9d9e012 100644
+--- a/src/lstack/core/lstack_control_plane.c
++++ b/src/lstack/core/lstack_control_plane.c
+@@ -278,6 +278,8 @@ static int32_t proc_memory_init(const struct reg_response_msg *rsp_msg)
+ }
+
+ ret = rte_eal_init(lc_argc, lc_argv);
++ /* rte_eal_init() calls __rte_thread_init(), which sets _lcore_id; reset it to LCORE_ID_ANY. */
++ RTE_PER_LCORE(_lcore_id) = LCORE_ID_ANY;
+ if (ret < 0) {
+ if (rte_errno == EALREADY)
+ LSTACK_PRE_LOG(LSTACK_INFO, "rte_eal_init aleady init ret=%d\n", ret);
+@@ -389,7 +391,7 @@ static int32_t reg_conn(enum GAZELLE_TCP_LIST_STATE table_state, enum reg_ring_t
+ return 0;
+ }
+
+-void thread_register_phase1(struct rpc_msg *msg)
++static void thread_register_phase1(struct rpc_msg *msg)
+ {
+ int32_t ret;
+
+@@ -415,7 +417,7 @@ void thread_register_phase1(struct rpc_msg *msg)
+ msg->result = ret;
+ }
+
+-void thread_register_phase2(struct rpc_msg *msg)
++static void thread_register_phase2(struct rpc_msg *msg)
+ {
+ struct gazelle_stat_lstack_conn *conn = (struct gazelle_stat_lstack_conn *)msg->args[MSG_ARG_0].p;
+
+@@ -427,6 +429,28 @@ void thread_register_phase2(struct rpc_msg *msg)
+ msg->result = ret;
+ }
+
++static int rpc_call_thread_regphase1(int stack_id, void *conn)
++{
++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue;
++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, thread_register_phase1);
++ if (msg == NULL) {
++ return -1;
++ }
++ msg->args[MSG_ARG_0].p = conn;
++ return rpc_sync_call(queue, msg);
++}
++
++static int rpc_call_thread_regphase2(int stack_id, void *conn)
++{
++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue;
++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, thread_register_phase2);
++ if (msg == NULL) {
++ return -1;
++ }
++ msg->args[MSG_ARG_0].p = conn;
++ return rpc_sync_call(queue, msg);
++}
++
+ int32_t client_reg_thrd_ring(void)
+ {
+ int32_t ret;
+@@ -625,10 +649,9 @@ static int32_t thread_register(void)
+ /* register all connected conn before listen conn, avoid creating new conn */
+ struct protocol_stack_group *stack_group = get_protocol_stack_group();
+ for (int32_t i = 0; i < stack_group->stack_num; i++) {
+- conn->conn_num = rpc_call_conntable(&stack_group->stacks[i]->rpc_queue,
+- conn->conn_list, GAZELLE_LSTACK_MAX_CONN);
++ conn->conn_num = rpc_call_conntable(i, conn->conn_list, GAZELLE_LSTACK_MAX_CONN);
+
+- ret = rpc_call_thread_regphase1(&stack_group->stacks[i]->rpc_queue, conn);
++ ret = rpc_call_thread_regphase1(i, conn);
+ if (ret != 0) {
+ LSTACK_LOG(ERR, LSTACK, "thread_register_phase1 failed ret=%d!\n", ret);
+ free(conn);
+@@ -637,10 +660,9 @@ static int32_t thread_register(void)
+ }
+
+ for (int32_t i = 0; i < stack_group->stack_num; i++) {
+- conn->conn_num = rpc_call_conntable(&stack_group->stacks[i]->rpc_queue,
+- conn->conn_list, GAZELLE_LSTACK_MAX_CONN);
++ conn->conn_num = rpc_call_conntable(i, conn->conn_list, GAZELLE_LSTACK_MAX_CONN);
+
+- ret = rpc_call_thread_regphase2(&stack_group->stacks[i]->rpc_queue, conn);
++ ret = rpc_call_thread_regphase2(i, conn);
+ if (ret != 0) {
+ LSTACK_LOG(ERR, LSTACK, "thread_register_phase2 failed ret=%d!\n", ret);
+ free(conn);
+diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c
+index fcb78ca..8f896c9 100644
+--- a/src/lstack/core/lstack_dpdk.c
++++ b/src/lstack/core/lstack_dpdk.c
+@@ -38,17 +38,15 @@
+
+ #include
+ #include
+-#include
+
+ #include "lstack_log.h"
+ #include "common/dpdk_common.h"
+ #include "common/gazelle_base_func.h"
+-#include
"lstack_thread_rpc.h" + #include "lstack_protocol_stack.h" +-#include "lstack_lwip.h" + #include "lstack_cfg.h" + #include "lstack_virtio.h" + #include "lstack_dpdk.h" ++#include "mbox_ring.h" + + struct eth_params { + uint16_t port_id; +@@ -109,6 +107,8 @@ int32_t dpdk_eal_init(void) + struct cfg_params *global_params = get_global_cfg_params(); + + ret = rte_eal_init(global_params->dpdk_argc, global_params->dpdk_argv); ++ /* rte_eal_init() would call __rte_thread_init(), and set _lcore_id. */ ++ RTE_PER_LCORE(_lcore_id) = LCORE_ID_ANY; + if (ret < 0) { + if (rte_errno == EALREADY) { + LSTACK_PRE_LOG(LSTACK_INFO, "rte_eal_init aleady init\n"); +@@ -135,58 +135,6 @@ int32_t dpdk_eal_init(void) + return ret; + } + +-struct rte_mempool *create_pktmbuf_mempool(const char *name, uint32_t nb_mbuf, +- uint32_t mbuf_cache_size, uint16_t queue_id, unsigned numa_id) +-{ +- int32_t ret; +- char pool_name[PATH_MAX]; +- struct rte_mempool *pool; +- +- ret = snprintf_s(pool_name, sizeof(pool_name), PATH_MAX - 1, "%s_%hu", name, queue_id); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, "snprintf_s fail ret=%d \n", ret); +- return NULL; +- } +- /* limit mbuf max num based on the dpdk capability */ +- if (nb_mbuf > MBUF_MAX_NUM) { +- LSTACK_LOG(ERR, LSTACK, "out of the dpdk mbuf quantity range\n"); +- return NULL; +- } +- +- /* time stamp before pbuf_custom as priv_data */ +- uint16_t private_size = sizeof(struct mbuf_private); +- if (xdp_eth_enabled()) { +- /* reserved for xdp metadata, see struct xsk_tx_metadata in /usr/include/linux/if_xdp.h */ +- private_size += 24; +- } +- private_size = RTE_ALIGN(private_size, RTE_CACHE_LINE_SIZE); +- pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, mbuf_cache_size, private_size, MBUF_SZ, numa_id); +- if (pool == NULL) { +- LSTACK_LOG(ERR, LSTACK, "cannot create %s pool rte_err=%d\n", pool_name, rte_errno); +- } +- +- return pool; +-} +- +-static struct rte_mempool* get_pktmbuf_mempool(const char *name, uint16_t queue_id) +-{ +- int32_t ret; +- char pool_name[PATH_MAX]; +- struct rte_mempool *pool; +- +- ret = snprintf_s(pool_name, sizeof(pool_name), PATH_MAX - 1, "%s_%hu", name, queue_id); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, "snprintf_s fail ret=%d\n", ret); +- return NULL; +- } +- pool = rte_mempool_lookup(pool_name); +- if (pool == NULL) { +- LSTACK_LOG(ERR, LSTACK, "look up %s pool rte_err=%d\n", pool_name, rte_errno); +- } +- +- return pool; +-} +- + static struct reg_ring_msg *create_reg_mempool(const char *name, uint16_t queue_id) + { + int ret; +@@ -207,115 +155,56 @@ static struct reg_ring_msg *create_reg_mempool(const char *name, uint16_t queue_ + return reg_buf; + } + +-int32_t pktmbuf_pool_init(struct protocol_stack *stack) +-{ +- stack->rxtx_mbuf_pool = get_pktmbuf_mempool("rxtx_mbuf", stack->queue_id); +- if (stack->rxtx_mbuf_pool == NULL) { +- LSTACK_LOG(ERR, LSTACK, "rxtx_mbuf_pool is NULL\n"); +- return -1; +- } +- +- if (use_ltran()) { +- stack->reg_buf = create_reg_mempool("reg_ring_msg", stack->queue_id); +- if (stack->reg_buf == NULL) { +- LSTACK_LOG(ERR, LSTACK, "rxtx_mbuf_pool is NULL\n"); +- return -1; +- } +- } +- +- return 0; +-} +- +-struct rte_mempool *create_mempool(const char *name, uint32_t count, uint32_t size, +- uint32_t flags, int32_t idx) ++int32_t create_shared_ring(struct protocol_stack *stack) + { +- char pool_name [RTE_MEMPOOL_NAMESIZE]; +- struct rte_mempool *mempool; +- int32_t ret = snprintf_s(pool_name, sizeof(pool_name), RTE_MEMPOOL_NAMESIZE - 1, +- "%s_%d", name, idx); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, 
"snprintf_s fail ret=%d\n", ret); +- return NULL; ++ if (!use_ltran()) { ++ return 0; + } + +- mempool = rte_mempool_create(pool_name, count, size, +- 0, 0, NULL, NULL, NULL, NULL, rte_socket_id(), flags); +- if (mempool == NULL) { +- LSTACK_LOG(ERR, LSTACK, "%s create failed. errno: %d.\n", name, rte_errno); ++ stack->rx_ring = rte_ring_create_fast("RING_RX", VDEV_RX_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); ++ if (stack->rx_ring == NULL) { ++ return -1; + } + +- return mempool; +-} +- +-int32_t create_shared_ring(struct protocol_stack *stack) +-{ +- rpc_queue_init(&stack->rpc_queue, stack->queue_id); +- rpc_queue_init(&stack->dfx_rpc_queue, stack->queue_id); +- +- if (use_ltran()) { +- stack->rx_ring = gazelle_ring_create_fast("RING_RX", VDEV_RX_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); +- if (stack->rx_ring == NULL) { +- return -1; +- } +- +- stack->tx_ring = gazelle_ring_create_fast("RING_TX", VDEV_TX_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); +- if (stack->tx_ring == NULL) { +- return -1; +- } +- +- stack->reg_ring = gazelle_ring_create_fast("SHARED_REG_RING", VDEV_REG_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); +- if (stack->reg_ring == NULL) { +- return -1; +- } ++ stack->tx_ring = rte_ring_create_fast("RING_TX", VDEV_TX_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); ++ if (stack->tx_ring == NULL) { ++ return -1; + } + +- return 0; +-} +- +-int32_t dpdk_alloc_pktmbuf(struct rte_mempool *pool, struct rte_mbuf **mbufs, uint32_t num, bool reserve) +-{ +- if (reserve) { +- /* +- * don't use rte_mempool_avail_count, it traverse cpu local cache, +- * when RTE_MAX_LCORE is too large, it's time-consuming +- */ +- if (rte_ring_count(pool->pool_data) < MBUFPOOL_RESERVE_NUM + num) { +- return -ENOMEM; +- } ++ stack->reg_ring = rte_ring_create_fast("SHARED_REG_RING", VDEV_REG_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ); ++ if (stack->reg_ring == NULL) { ++ return -1; + } + +- int32_t ret = rte_pktmbuf_alloc_bulk(pool, mbufs, num); +- if (ret != 0) { +- LSTACK_LOG(ERR, LSTACK, "rte_pktmbuf_alloc_bulk fail allocNum=%d, ret=%d, info:%s \n", +- num, ret, rte_strerror(-ret)); +- return ret; ++ stack->reg_buf = create_reg_mempool("reg_ring_msg", stack->queue_id); ++ if (stack->reg_buf == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "reg_buf is NULL\n"); ++ return -1; + } + + return 0; + } + +-int32_t fill_mbuf_to_ring(struct rte_mempool *mempool, struct rte_ring *ring, uint32_t mbuf_num) ++int32_t fill_mbuf_to_ring(int stack_id, struct rte_ring *ring, uint32_t mbuf_num) + { + int32_t ret; + uint32_t batch; + uint32_t remain = mbuf_num; +- struct rte_mbuf *free_buf[FREE_RX_QUEUE_SZ]; ++ struct rte_mbuf *free_buf[VDEV_RX_QUEUE_SZ]; + + while (remain > 0) { +- batch = LWIP_MIN(remain, RING_SIZE(FREE_RX_QUEUE_SZ)); ++ batch = LWIP_MIN(remain, RING_SIZE(VDEV_RX_QUEUE_SZ)); + +- ret = dpdk_alloc_pktmbuf(mempool, free_buf, batch, true); ++ ret = mem_get_mbuf_bulk(stack_id, free_buf, batch, true); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "cannot alloc mbuf for ring, count: %u ret=%d\n", batch, ret); + return -1; + } + + ret = gazelle_ring_sp_enqueue(ring, (void **)free_buf, batch); +- if (ret == 0) { ++ if (ret < batch) { ++ mem_put_mbuf_bulk(&free_buf[ret], batch - ret); + LSTACK_LOG(ERR, LSTACK, "cannot enqueue to ring, count: %u\n", batch); +- for (int i = 0; i < batch; i++) { +- rte_pktmbuf_free(free_buf[i]); +- } + return -1; + } + +@@ -592,7 +481,7 @@ static int32_t dpdk_ethdev_setup(const struct eth_params *eth_params, uint16_t i + int32_t ret; + uint16_t numa_id = 0; + struct cfg_params *cfg = get_global_cfg_params(); 
+- struct rte_mempool *rxtx_mbuf_pool = get_protocol_stack_group()->total_rxtx_pktmbuf_pool[idx]; ++ struct rte_mempool *rxtx_mbuf_pool = mem_get_mbuf_pool(idx); + + if (!cfg->use_ltran && cfg->num_process == 1) { + numa_id = (cfg->stack_num > 0) ? cfg->numa_id : numa_node_of_cpu(cfg->cpus[idx]); +@@ -651,7 +540,7 @@ int32_t dpdk_ethdev_start(void) + int32_t dpdk_init_lstack_kni(void) + { + struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- stack_group->kni_pktmbuf_pool = create_pktmbuf_mempool("kni_mbuf", KNI_NB_MBUF, 0, 0, rte_socket_id()); ++ stack_group->kni_pktmbuf_pool = rte_pktmbuf_pool_create("kni_mbuf", KNI_NB_MBUF, 0, 0, MBUF_DATA_SIZE, rte_socket_id()); + if (stack_group->kni_pktmbuf_pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "kni_mbuf is NULL\n"); + return -1; +@@ -1030,7 +919,7 @@ uint32_t dpdk_total_socket_memory(void) + struct cfg_params *cfg = get_global_cfg_params(); + + /* calculate the memory(bytes) of rxtx_mempool */ +- elt_size = sizeof(struct rte_mbuf) + MBUF_SZ + RTE_ALIGN(sizeof(struct mbuf_private), RTE_CACHE_LINE_SIZE); ++ elt_size = sizeof(struct rte_mbuf) + MBUF_DATA_SIZE + RTE_ALIGN(sizeof(struct mbuf_private), RTE_CACHE_LINE_SIZE); + per_pktmbuf_mempool_size = rte_mempool_calc_obj_size(elt_size, 0, NULL); + + /* calculate the memory(bytes) of rpc_mempool, reserved num is (app threads + lstack threads + listen thread) */ +@@ -1038,13 +927,12 @@ uint32_t dpdk_total_socket_memory(void) + per_rpc_mempool_size = rte_mempool_calc_obj_size(elt_size, 0, NULL); + + /* calculate the memory(bytes) of rings, reserved num is GAZELLE_LSTACK_MAX_CONN. */ +- per_conn_ring_size = rte_ring_get_memsize(cfg->send_ring_size) + +- rte_ring_get_memsize(cfg->recv_ring_size) + +- rte_ring_get_memsize(DEFAULT_ACCEPTMBOX_SIZE); ++ per_conn_ring_size = rte_ring_get_memsize(DEFAULT_SENDMBOX_SIZE) + ++ rte_ring_get_memsize(DEFAULT_ACCEPTMBOX_SIZE); + + total_socket_memory = fixed_mem + bytes_to_mb( + (per_pktmbuf_mempool_size * dpdk_pktmbuf_mempool_num()) * cfg->num_queue + +- per_rpc_mempool_size * cfg->rpc_msg_max * (RPC_MEMPOOL_THREAD_NUM + cfg->num_queue + 1) + ++ per_rpc_mempool_size * cfg->rpc_msg_max + + per_conn_ring_size * GAZELLE_LSTACK_MAX_CONN); + + return total_socket_memory; +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +deleted file mode 100644 +index 047dfdf..0000000 +--- a/src/lstack/core/lstack_lwip.c ++++ /dev/null +@@ -1,1286 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. 
+-*/ +- +-#include +-#include +-#include +- +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "common/gazelle_base_func.h" +-#include "lstack_log.h" +-#include "lstack_cfg.h" +-#include "lstack_protocol_stack.h" +-#include "lstack_stack_stat.h" +-#include "lstack_epoll.h" +-#include "lstack_dpdk.h" +-#include "lstack_lwip.h" +- +-static const uint8_t fin_packet = 0; +- +-static void free_ring_pbuf(struct rte_ring *ring) +-{ +- void *pbufs[SOCK_RECV_RING_SIZE]; +- +- do { +- gazelle_ring_read(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); +- gazelle_ring_read_over(ring); +- } while (gazelle_ring_readable_count(ring)); +- +- do { +- uint32_t num = gazelle_ring_sc_dequeue(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); +- +- for (uint32_t i = 0; i < num; i++) { +- pbuf_free(pbufs[i]); +- } +- } while (gazelle_ring_readover_count(ring)); +-} +- +-static void reset_sock_data(struct lwip_sock *sock) +-{ +- /* check null pointer in ring_free func */ +- if (sock->recv_ring) { +- free_ring_pbuf(sock->recv_ring); +- gazelle_ring_free_fast(sock->recv_ring); +- sock->recv_ring = NULL; +- } +- +- if (sock->send_ring) { +- free_ring_pbuf(sock->send_ring); +- gazelle_ring_free_fast(sock->send_ring); +- sock->send_ring = NULL; +- } +- +- if (sock->send_pre_del) { +- pbuf_free(sock->send_pre_del); +- sock->send_pre_del = NULL; +- } +- +- sock->type = 0; +- sock->stack_id = 0; +- sock->affinity_numa = 0; +- sock->sk_wait = NULL; +- sock->listen_next = NULL; +- sock->call_num = 0; +- sock->remain_len = 0; +- +- if (sock->recv_lastdata && sock->recv_lastdata != (void *)&fin_packet) { +- pbuf_free(sock->recv_lastdata); +- } +- sock->recv_lastdata = NULL; +-} +- +-static struct pbuf *init_mbuf_to_pbuf(struct rte_mbuf *mbuf, pbuf_layer layer, uint16_t length, pbuf_type type) +-{ +- struct pbuf_custom *pbuf_custom = mbuf_to_pbuf(mbuf); +- +- void *data = rte_pktmbuf_mtod(mbuf, void *); +- struct pbuf *pbuf = pbuf_alloced_custom(layer, length, type, pbuf_custom, data, MAX_PACKET_SZ); +- if (pbuf) { +- pbuf->allow_append = 1; +- pbuf->addr = *IP_ANY_TYPE; +- pbuf->port = 0; +- pthread_spin_init(&pbuf->pbuf_lock, PTHREAD_PROCESS_SHARED); +- } +- +- return pbuf; +-} +- +-static uint32_t update_replenish_mbuf_cnt(struct protocol_stack *stack, struct lwip_sock *sock) +-{ +- const uint32_t min_alloc_mbuf_num = 4; +- struct rte_ring *ring = sock->send_ring; +- +- uint32_t replenish_cnt = gazelle_ring_free_count(ring); +- if (replenish_cnt <= min_alloc_mbuf_num) { +- return replenish_cnt; +- } +- +- uint32_t resu = replenish_cnt; +- uint32_t tcp_conn_count = get_global_cfg_params()->tcp_conn_count; +- uint16_t send_ring_size = get_global_cfg_params()->send_ring_size; +- uint16_t proportion = stack->conn_num / tcp_conn_count; +- uint32_t replenish_mbuf_cnt_cal = (send_ring_size >> proportion); +- +- if (replenish_mbuf_cnt_cal <= min_alloc_mbuf_num) { +- resu = min_alloc_mbuf_num; +- } else if (replenish_mbuf_cnt_cal < replenish_cnt) { +- resu = replenish_mbuf_cnt_cal; +- } else { +- resu = replenish_cnt + 1; +- } +- +- return resu - 1; +-} +- +-/* true: need replenish again */ +-static bool replenish_send_idlembuf(struct protocol_stack *stack, struct lwip_sock *sock) +-{ +- void *pbuf[SOCK_SEND_RING_SIZE_MAX]; +- struct rte_ring *ring = sock->send_ring; +- +- uint32_t replenish_cnt = update_replenish_mbuf_cnt(stack, sock); +- if (replenish_cnt == 0) { +- return false; +- } +- if (dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, (struct 
rte_mbuf **)pbuf, replenish_cnt, true) != 0) { +- stack->stats.tx_allocmbuf_fail++; +- return true; +- } +- +- uint32_t i = 0; +- for (; i < replenish_cnt - 1; i++) { +- rte_prefetch0(mbuf_to_pbuf((void *)pbuf[i + 1])); +- pbuf[i] = init_mbuf_to_pbuf(pbuf[i], PBUF_TRANSPORT, MBUF_MAX_DATA_LEN, PBUF_RAM); +- } +- pbuf[i] = init_mbuf_to_pbuf((struct rte_mbuf *)pbuf[i], PBUF_TRANSPORT, MBUF_MAX_DATA_LEN, PBUF_RAM); +- +- uint32_t num = gazelle_ring_sp_enqueue(ring, pbuf, replenish_cnt); +- for (uint32_t i = num; i < replenish_cnt; i++) { +- pbuf_free(pbuf[i]); +- } +- +- sem_post(&sock->snd_ring_sem); +- +- return false; +-} +- +-int do_lwip_init_sock(int32_t fd) +-{ +- struct protocol_stack *stack = get_protocol_stack(); +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- return -1; +- } +- +- sock->stack_id = stack->stack_idx; +- sock->sk_wait = NULL; +- if (sock_event_init(&sock->sk_event) != 0) { +- LSTACK_LOG(ERR, LSTACK, "sock_event_init failed\n"); +- return -1; +- } +- +- if (get_global_cfg_params()->stack_mode_rtc) { +- return 0; +- } +- +- if (sock->recv_ring != NULL || sock->send_ring != NULL) { +- LSTACK_LOG(ERR, LSTACK, "socket(%d) not close but open again?\n", fd); +- } +- +- reset_sock_data(sock); +- +- sock->recv_ring = gazelle_ring_create_fast("sock_recv", SOCK_RECV_RING_SIZE, RING_F_SP_ENQ | RING_F_SC_DEQ); +- if (sock->recv_ring == NULL) { +- LSTACK_LOG(ERR, LSTACK, "sock_recv create failed. errno: %d.\n", rte_errno); +- return -1; +- } +- +- sock->send_ring = gazelle_ring_create_fast("sock_send", +- get_global_cfg_params()->send_ring_size, +- RING_F_SP_ENQ | RING_F_SC_DEQ); +- if (sock->send_ring == NULL) { +- gazelle_ring_free_fast(sock->recv_ring); +- LSTACK_LOG(ERR, LSTACK, "sock_send create failed. errno: %d.\n", rte_errno); +- return -1; +- } +- (void)replenish_send_idlembuf(stack, sock); +- +- list_init_node(&sock->recv_list); +- return 0; +-} +- +-void do_lwip_clean_sock(int fd) +-{ +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock)) { +- return; +- } +- +- sock_event_free(&sock->sk_event, sock->sk_wait); +- sock->sk_wait = NULL; +- +- reset_sock_data(sock); +- +- list_del_node(&sock->recv_list); +- +- get_protocol_stack_by_id(sock->stack_id)->conn_num--; +-} +- +-void do_lwip_free_pbuf(struct pbuf *pbuf) +-{ +- if (pbuf == NULL) { +- return; +- } +- +- struct rte_mbuf *mbuf = pbuf_to_mbuf(pbuf); +- +- rte_pktmbuf_free_seg(mbuf); +-} +- +-struct pbuf *do_lwip_alloc_pbuf(pbuf_layer layer, uint16_t length, pbuf_type type) +-{ +- int ret; +- struct rte_mbuf *mbuf; +- struct protocol_stack *stack = get_protocol_stack(); +- +- /* ensure arp packet can be sent */ +- if (layer == PBUF_LINK && length == SIZEOF_ETHARP_HDR) { +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf, 1, false); +- } else { +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf, 1, true); +- } +- if (ret != 0) { +- stack->stats.tx_allocmbuf_fail++; +- return NULL; +- } +- +- return init_mbuf_to_pbuf(mbuf, layer, length, type); +-} +- +-static inline bool pbuf_allow_append(struct pbuf *pbuf, uint16_t remain_size) +-{ +- int ret; +- +- /* Using pthread_spin_trylock to avoid deadlock between app thread and lstack threads */ +- ret = pthread_spin_trylock(&pbuf->pbuf_lock); +- if (ret != 0) { +- return false; +- } +- +- if (pbuf->tot_len > remain_size) { +- pthread_spin_unlock(&pbuf->pbuf_lock); +- return false; +- } +- if (pbuf->allow_append == 1) { +- __sync_fetch_and_sub(&pbuf->allow_append, 1); +- } +- +- pthread_spin_unlock(&pbuf->pbuf_lock); 
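
Note: pbuf_allow_append() above (part of the removed lstack_lwip.c) deliberately uses pthread_spin_trylock() so that the app thread and the lstack thread can never block each other on the shared tail pbuf; on contention the caller just gives up and takes a fresh buffer. A minimal self-contained sketch of that guard follows, with illustrative field names rather than the real struct pbuf layout, and the lock assumed to be initialized elsewhere with pthread_spin_init().

#include <pthread.h>
#include <stdbool.h>

struct tail_buf {
    pthread_spinlock_t lock;
    int allow_append;   /* 1 while the producer may still extend this buffer */
};

/* Claim the tail buffer for appending; never blocks. */
static bool try_claim_for_append(struct tail_buf *tb)
{
    if (pthread_spin_trylock(&tb->lock) != 0)
        return false;               /* contended: fall back, do not wait */

    bool ok = (tb->allow_append == 1);
    if (ok)
        tb->allow_append = 0;       /* claimed: the other thread must stop appending */

    pthread_spin_unlock(&tb->lock);
    return ok;
}
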
+- return true; +-} +- +-struct pbuf *do_lwip_udp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size) +-{ +- int count; +- /* when remain_size is 0, fill_sendring write one pbuf to sendring */ +- if (remain_size == 0) { +- count = 1; +- } else { +- count = (remain_size + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN; +- } +- +- struct pbuf *pbufs[count]; +- +- int actual_count = gazelle_ring_sc_dequeue(sock->send_ring, (void **)&pbufs, count); +- /* it's impossible to enter this branch theoretically */ +- if (unlikely((actual_count != count) || +- ((actual_count != 0) && pbufs[0]->tot_len != remain_size))) { +- LSTACK_LOG(ERR, LSTACK, "udp get pbuf from sendring error, expected: %d, actual: %d\n", +- count, actual_count); +- LSTACK_LOG(ERR, LSTACK, "udp get pbuf size error, expected: %d, actual: %d\n", +- remain_size, actual_count == 0 ? 0 : pbufs[0]->tot_len); +- } +- +- for (int i = 0; get_protocol_stack_group()->latency_start && i < actual_count; i++) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_LWIP, 0); +- } +- +- return pbufs[0]; +-} +- +-struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size) +-{ +- struct pbuf *pbuf = NULL; +- +- if (unlikely(sock->send_pre_del)) { +- if (pbuf_allow_append(sock->send_pre_del, remain_size)) { +- return sock->send_pre_del; +- } else { +- return NULL; +- } +- } +- +- gazelle_ring_sc_dequeue(sock->send_ring, (void **)&pbuf, 1); +- if (pbuf == NULL) { +- return NULL; +- } +- +- if (get_protocol_stack_group()->latency_start) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_WRITE_LWIP, 0); +- } +- +- sock->send_pre_del = pbuf; +- +- if (!gazelle_ring_readover_count(sock->send_ring)) { +- if (!pbuf_allow_append(pbuf, remain_size)) { +- return NULL; +- } +- } else { +- if (pbuf->tot_len > remain_size) { +- return NULL; +- } +- } +- +- return pbuf; +-} +- +-void do_lwip_get_from_sendring_over(struct lwip_sock *sock) +-{ +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- stack->stats.write_lwip_cnt++; +- sock->send_pre_del = NULL; +-} +- +-static ssize_t do_app_write(struct lwip_sock *sock, struct pbuf *pbufs[], void *buf, size_t len, uint32_t write_num) +-{ +- ssize_t send_len = 0; +- uint32_t i = 0; +- +- for (i = 0; i < write_num - 1; i++) { +- rte_prefetch0(pbufs[i + 1]); +- rte_prefetch0(pbufs[i + 1]->payload); +- rte_prefetch0((char *)buf + send_len + MBUF_MAX_DATA_LEN); +- rte_memcpy((char *)pbufs[i]->payload, (char *)buf + send_len, MBUF_MAX_DATA_LEN); +- pbufs[i]->tot_len = pbufs[i]->len = MBUF_MAX_DATA_LEN; +- send_len += MBUF_MAX_DATA_LEN; +- +- /* if udp pkg len > mtu, use pbuf chain to send it */ +- if (NETCONN_IS_UDP(sock) && i > 0) { +- pbuf_cat(pbufs[0], pbufs[i]); +- } +- } +- +- /* reduce the branch in loop */ +- size_t copy_len = len - send_len; +- rte_memcpy((char *)pbufs[i]->payload, (char *)buf + send_len, copy_len); +- pbufs[i]->tot_len = pbufs[i]->len = copy_len; +- send_len += copy_len; +- +- if (NETCONN_IS_UDP(sock) && i > 0) { +- pbuf_cat(pbufs[0], pbufs[i]); +- } +- +- return send_len; +-} +- +-static inline ssize_t app_buff_write(struct lwip_sock *sock, void *buf, size_t len, uint32_t write_num, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- struct pbuf *pbufs[SOCK_SEND_RING_SIZE_MAX]; +- +- (void)gazelle_ring_read(sock->send_ring, (void **)pbufs, 
write_num); +- +- if (get_protocol_stack_group()->latency_start) { +- uint64_t time_stamp = sys_now_us(); +- time_stamp_into_pbuf(write_num, pbufs, time_stamp); +- } +- +- ssize_t send_len = do_app_write(sock, pbufs, buf, len, write_num); +- +- if (addr) { +- if (addr->sa_family == AF_INET) { +- struct sockaddr_in *saddr = (struct sockaddr_in *)addr; +- for (int i = 0; i < write_num; i++) { +- pbufs[i]->addr.u_addr.ip4.addr = saddr->sin_addr.s_addr; +- pbufs[i]->port = lwip_ntohs((saddr)->sin_port); +- IP_SET_TYPE(&pbufs[i]->addr, IPADDR_TYPE_V4); +- } +- } else if (addr->sa_family == AF_INET6) { +- struct sockaddr_in6 *saddr = (struct sockaddr_in6 *)addr; +- for (int i = 0; i < write_num; i++) { +- memcpy_s(pbufs[i]->addr.u_addr.ip6.addr, IPV6_ADDR_LEN, saddr->sin6_addr.s6_addr, IPV6_ADDR_LEN); +- pbufs[i]->port = lwip_ntohs((saddr)->sin6_port); +- IP_SET_TYPE(&pbufs[i]->addr, IPADDR_TYPE_V6); +- } +- } else { +- return 0; +- } +- } +- +- for (int i = 0; get_protocol_stack_group()->latency_start && i < write_num; i++) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- if (pbufs[i] != NULL) { +- calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_WRITE_INTO_RING, 0); +- } +- } +- +- gazelle_ring_read_over(sock->send_ring); +- +- sock->remain_len = MBUF_MAX_DATA_LEN - pbufs[write_num - 1]->len; +- return send_len; +-} +- +-static inline struct pbuf *gazelle_ring_readlast(struct rte_ring *r) +-{ +- struct pbuf *last_pbuf = NULL; +- volatile uint32_t tail = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); +- uint32_t last = r->prod.tail - 1; +- if (last + 1 == tail || last + 1 - tail > r->capacity) { +- return NULL; +- } +- +- __rte_ring_dequeue_elems(r, last, (void **)&last_pbuf, sizeof(void *), 1); +- +- if (pthread_spin_trylock(&last_pbuf->pbuf_lock) != 0) { +- return NULL; +- } +- if (last_pbuf->allow_append != 1) { +- pthread_spin_unlock(&last_pbuf->pbuf_lock); +- return NULL; +- } +- +- return last_pbuf; +-} +- +-static inline void gazelle_ring_lastover(struct pbuf *last_pbuf) +-{ +- pthread_spin_unlock(&last_pbuf->pbuf_lock); +-} +- +-static inline size_t merge_data_lastpbuf(struct lwip_sock *sock, void *buf, size_t len) +-{ +- struct pbuf *last_pbuf = gazelle_ring_readlast(sock->send_ring); +- if (last_pbuf == NULL) { +- sock->remain_len = 0; +- return 0; +- } +- +- size_t send_len = MBUF_MAX_DATA_LEN - last_pbuf->len; +- if (send_len >= len) { +- sock->remain_len = send_len - len; +- send_len = len; +- } else { +- sock->remain_len = 0; +- } +- +- uint16_t offset = last_pbuf->len; +- last_pbuf->tot_len = last_pbuf->len = offset + send_len; +- rte_memcpy((char *)last_pbuf->payload + offset, buf, send_len); +- +- gazelle_ring_lastover(last_pbuf); +- +- return send_len; +-} +- +-int sem_timedwait_nsecs(sem_t *sem) +-{ +- struct timespec ts; +- clock_gettime(CLOCK_REALTIME, &ts); +- long long wait_nsec = ts.tv_nsec + SEND_TIME_WAIT_NS; +- ts.tv_nsec = wait_nsec % SECOND_NSECOND; +- long add = wait_nsec / SECOND_NSECOND; +- ts.tv_sec += add; +- return sem_timedwait(sem, &ts); +-} +- +-static ssize_t do_lwip_udp_fill_sendring(struct lwip_sock *sock, const void *buf, size_t len, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- if (len > GAZELLE_UDP_PKGLEN_MAX) { +- LSTACK_LOG(ERR, LSTACK, "Message too long\n"); +- GAZELLE_RETURN(EMSGSIZE); +- } +- +- ssize_t send_len = 0; +- uint32_t write_num = (len + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN; +- uint32_t write_avail = gazelle_ring_readable_count(sock->send_ring); +- +- if (write_num 
> rte_ring_get_capacity(sock->send_ring)) { +- LSTACK_LOG(ERR, LSTACK, "sock send_ring size is not enough\n"); +- GAZELLE_RETURN(ENOMEM); +- } +- +- /* if udp send 0 packet, set write_num to at least 1 */ +- if (write_num == 0) { +- write_num = 1; +- } +- +- while (!netconn_is_nonblocking(sock->conn) && (write_avail < write_num)) { +- if (sock->errevent > 0) { +- GAZELLE_RETURN(ENOTCONN); +- } +- write_avail = gazelle_ring_readable_count(sock->send_ring); +- } +- +- if (write_avail < write_num) { +- LSTACK_LOG(WARNING, LSTACK, "sock send_ring is already exhausted.\n"); +- sem_timedwait_nsecs(&sock->snd_ring_sem); +- GAZELLE_RETURN(ENOMEM); +- } +- +- send_len = app_buff_write(sock, (char *)buf, len, write_num, addr, addrlen); +- +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- +- return send_len; +-} +- +-static ssize_t __do_lwip_tcp_fill_sendring(struct lwip_sock *sock, const void *buf, size_t len, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- /* refer to the lwip implementation. */ +- if (len == 0) { +- return 0; +- } +- +- ssize_t send_len = 0; +- +- /* merge data into last pbuf */ +- if (sock->remain_len) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- stack->stats.sock_tx_merge++; +- send_len = merge_data_lastpbuf(sock, (char *)buf, len); +- if (send_len >= len) { +- send_len = len; +- goto END; +- } +- } +- +- uint32_t write_num = (len - send_len + MBUF_MAX_DATA_LEN - 1) / MBUF_MAX_DATA_LEN; +- uint32_t write_avail = gazelle_ring_readable_count(sock->send_ring); +- +- while (!netconn_is_nonblocking(sock->conn) && (write_avail < write_num)) { +- if (sock->errevent > 0) { +- GAZELLE_RETURN(ENOTCONN); +- } +- /* wait until (send_ring_size / 4) */ +- if (write_avail > (rte_ring_get_capacity(sock->send_ring) >> 2)) { +- break; +- } +- write_avail = gazelle_ring_readable_count(sock->send_ring); +- } +- +- /* send_ring is full, data attach last pbuf */ +- if (write_avail == 0) { +- sem_timedwait_nsecs(&sock->snd_ring_sem); +- goto END; +- } +- +- /* send_ring have idle */ +- if (write_num > write_avail) { +- write_num = write_avail; +- len = write_num * MBUF_MAX_DATA_LEN; +- } +- send_len += app_buff_write(sock, (char *)buf + send_len, len - send_len, write_num, addr, addrlen); +- +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- +-END: +- if (send_len == 0) { +- errno = EAGAIN; +- return -1; +- } +- +- return send_len; +-} +- +-static inline void notice_stack_tcp_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags); +-static ssize_t do_lwip_tcp_fill_sendring(struct lwip_sock *sock, const void *buf, size_t len, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- ssize_t ret, send_len = 0; +- +- while (true) { +- ret = __do_lwip_tcp_fill_sendring(sock, (char *)buf + send_len, len - send_len, addr, addrlen); +- // send = 0 : tcp peer close connection ? +- if (unlikely(ret <= 0)) { +- break; +- } +- send_len += ret; +- if (send_len == len || netconn_is_nonblocking(sock->conn)) { +- break; +- } +- +- notice_stack_tcp_send(sock, sock->conn->callback_arg.socket, ret, 0); +- } +- +- return send_len == 0 ? 
ret : send_len; +-} +- +-bool do_lwip_replenish_sendring(struct protocol_stack *stack, struct lwip_sock *sock) +-{ +- bool replenish_again = false; +- +- replenish_again = replenish_send_idlembuf(stack, sock); +- +- API_EVENT(sock->conn, NETCONN_EVT_SENDPLUS, 0); +- +- return replenish_again; +-} +- +-static inline void free_recv_ring_readover(struct rte_ring *ring) +-{ +- void *pbufs[SOCK_RECV_RING_SIZE]; +- uint32_t num = gazelle_ring_sc_dequeue(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); +- for (uint32_t i = 0; i < num; i++) { +- pbuf_free(pbufs[i]); +- } +-} +- +-static inline struct pbuf *pbuf_last(struct pbuf *pbuf) +-{ +- while (pbuf->next) { +- pbuf = pbuf->next; +- } +- return pbuf; +-} +- +-ssize_t do_lwip_read_from_lwip(struct lwip_sock *sock, int32_t flags, u8_t apiflags) +-{ +- if (sock->conn->recvmbox == NULL) { +- sock->conn->pending_err = ERR_CONN; +- GAZELLE_RETURN(ENOTCONN); +- } +- +- free_recv_ring_readover(sock->recv_ring); +- +- uint32_t free_count = gazelle_ring_free_count(sock->recv_ring); +- if (free_count == 0) { +- sock->conn->pending_err = ERR_WOULDBLOCK; +- GAZELLE_RETURN(EAGAIN); +- } +- +- uint32_t data_count = rte_ring_count(sock->conn->recvmbox->ring); +- uint32_t read_num = LWIP_MIN(free_count, data_count); +- struct pbuf *pbufs[SOCK_RECV_RING_SIZE]; +- uint32_t read_count = 0; +- ssize_t recv_len = 0; +- +- for (uint32_t i = 0; i < read_num; i++) { +- +- err_t err = ERR_OK; +- if (NETCONN_IS_UDP(sock)) { +- err = netconn_recv_udp_raw_pbuf_flags(sock->conn, &pbufs[i], apiflags); +- } else { +- err = netconn_recv_tcp_pbuf_flags(sock->conn, &pbufs[i], apiflags); +- } +- if (err != ERR_OK) { +- /* fin has been read from recvmbox, put it to recv_ring */ +- if (!NETCONN_IS_UDP(sock) && +- (netconn_is_flag_set(sock->conn, NETCONN_FIN_RX_PENDING) || err == ERR_CLSD)) { +- /* fin has been read, lwip don't need to process fin packet */ +- netconn_clear_flags(sock->conn, NETCONN_FIN_RX_PENDING); +- pbufs[i] = NULL; +- read_count++; +- break; +- } +- +- /* store err to pending_err again, clear it after app read */ +- sock->conn->pending_err = err; +- GAZELLE_RETURN(err_to_errno(err)); +- } +- +- recv_len += pbufs[i]->tot_len; +- lstack_calculate_aggregate(0, pbufs[i]->tot_len); +- read_count++; +- +- /* once we have some data to return, only add more if we don't need to wait */ +- apiflags |= NETCONN_DONTBLOCK | NETCONN_NOFIN; +- } +- +- uint32_t enqueue_num = gazelle_ring_sp_enqueue(sock->recv_ring, (void **)pbufs, read_count); +- if (enqueue_num != read_count) { +- LSTACK_LOG(ERR, LSTACK, "Code shouldn't get here!\n"); +- } +- +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- for (uint32_t i = 0; get_protocol_stack_group()->latency_start && i < read_count; i++) { +- if (pbufs[i] != NULL) { +- calculate_lstack_latency(&stack->latency, pbufs[i], GAZELLE_LATENCY_READ_LWIP, 0); +- } +- } +- stack->stats.read_lwip_cnt += read_count; +- +- return recv_len; +-} +- +-static int32_t check_msg_vaild(const struct msghdr *message) +-{ +- ssize_t buflen = 0; +- +- if (message == NULL || message->msg_iovlen <= 0 || message->msg_iovlen > IOV_MAX) { +- GAZELLE_RETURN(EINVAL); +- } +- +- for (int32_t i = 0; i < message->msg_iovlen; i++) { +- if ((message->msg_iov[i].iov_base == NULL) || ((ssize_t)message->msg_iov[i].iov_len < 0) || +- ((size_t)(ssize_t)message->msg_iov[i].iov_len != message->msg_iov[i].iov_len) || +- ((ssize_t)(buflen + (ssize_t)message->msg_iov[i].iov_len) < 0)) { +- GAZELLE_RETURN(EINVAL); +- } +- buflen = (ssize_t)(buflen + 
(ssize_t)message->msg_iov[i].iov_len); +- } +- +- return 0; +-} +- +-ssize_t do_lwip_recvmsg_from_stack(int32_t s, const struct msghdr *message, int32_t flags) +-{ +- ssize_t buflen = 0; +- +- if (check_msg_vaild(message)) { +- GAZELLE_RETURN(EINVAL); +- } +- +- for (int32_t i = 0; i < message->msg_iovlen; i++) { +- if (message->msg_iov[i].iov_len == 0) { +- continue; +- } +- +- ssize_t recvd_local = do_lwip_read_from_stack(s, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len, +- flags, NULL, NULL); +- if (recvd_local > 0) { +- buflen += recvd_local; +- } +- if (recvd_local < 0 || (recvd_local < (int)message->msg_iov[i].iov_len) || (flags & MSG_PEEK)) { +- if (buflen <= 0) { +- buflen = recvd_local; +- } +- break; +- } +- flags |= MSG_DONTWAIT; +- } +- +- return buflen; +-} +- +-static inline void notice_stack_tcp_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags) +-{ +- // 2: call_num >= 2, don't need add new rpc send +- if (__atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) < 2) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- while (rpc_call_tcp_send(&stack->rpc_queue, fd, len, flags) < 0) { +- usleep(1000); // 1000: wait 1ms to exec again +- } +- __sync_fetch_and_add(&sock->call_num, 1); +- } +-} +- +-static inline void notice_stack_udp_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags) +-{ +- __sync_fetch_and_add(&sock->call_num, 1); +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- while (rpc_call_udp_send(&stack->rpc_queue, fd, len, flags) < 0) { +- usleep(1000); // 1000: wait 1ms to exec again +- } +-} +- +-static inline void notice_stack_send(struct lwip_sock *sock, int32_t fd, int32_t len, int32_t flags) +-{ +- if (NETCONN_IS_UDP(sock)) { +- notice_stack_udp_send(sock, fd, len, flags); +- } else { +- notice_stack_tcp_send(sock, fd, len, flags); +- } +-} +- +-ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t flags, +- const struct sockaddr *addr, socklen_t addrlen) +-{ +- struct lwip_sock *sock; +- ssize_t send = 0; +- +- if (buf == NULL) { +- GAZELLE_RETURN(EINVAL); +- } +- if (addr && addr->sa_family != AF_INET && addr->sa_family != AF_INET6) { +- GAZELLE_RETURN(EINVAL); +- } +- +- sock = lwip_get_socket(fd); +- if (unlikely(sock->affinity_numa == 0)) { +- thread_bind_stack(sock->stack_id); +- sock->affinity_numa = 1; +- } +- +-#if GAZELLE_SAME_NODE +- if (sock->same_node_tx_ring != NULL) { +- return gazelle_same_node_ring_send(sock, buf, len, flags); +- } +-#endif /* GAZELLE_SAME_NODE */ +- if (sock->errevent > 0) { +- GAZELLE_RETURN(ENOTCONN); +- } +- +- if (NETCONN_IS_UDP(sock)) { +- send = do_lwip_udp_fill_sendring(sock, buf, len, addr, addrlen); +- /* send = 0: udp send a empty package */ +- if (send < 0) { +- return send; +- } +- } else { +- send = do_lwip_tcp_fill_sendring(sock, buf, len, addr, addrlen); +- // send = 0 : tcp peer close connection ? 
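
Note: in the removed lstack_lwip.c above, notice_stack_tcp_send() caps the in-flight send RPCs at two per socket via the atomic call_num counter (one being processed plus one pending recall covers all data already in the ring), and retries a full RPC queue every 1 ms instead of dropping the wakeup. A condensed, self-contained sketch of that throttle; enqueue_send_rpc() is a hypothetical stand-in for rpc_call_tcp_send() that returns a negative value when the RPC pool is exhausted.

#include <stdatomic.h>
#include <unistd.h>

extern int enqueue_send_rpc(int fd, int len, int flags);   /* hypothetical stand-in */

static void notify_stack_send(atomic_int *call_num, int fd, int len, int flags)
{
    /* two requests already queued: the stack thread will drain this data anyway */
    if (atomic_load_explicit(call_num, memory_order_acquire) >= 2)
        return;

    while (enqueue_send_rpc(fd, len, flags) < 0)
        usleep(1000);   /* RPC pool exhausted: back off 1 ms and retry */

    atomic_fetch_add(call_num, 1);
}
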
+- if (send <= 0) { +- return send; +- } +- } +- +- notice_stack_send(sock, fd, send, flags); +- return send; +-} +- +-ssize_t do_lwip_sendmsg_to_stack(struct lwip_sock *sock, int32_t s, const struct msghdr *message, int32_t flags) +-{ +- int32_t ret; +- int32_t i; +- ssize_t buflen = 0; +- +- if (check_msg_vaild(message)) { +- GAZELLE_RETURN(EINVAL); +- } +- +- for (i = 0; i < message->msg_iovlen; i++) { +- if (message->msg_iov[i].iov_len == 0) { +- continue; +- } +- +- if (NETCONN_IS_UDP(sock)) { +- ret = do_lwip_udp_fill_sendring(sock, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len, NULL, 0); +- } else { +- ret = do_lwip_tcp_fill_sendring(sock, message->msg_iov[i].iov_base, message->msg_iov[i].iov_len, NULL, 0); +- } +- if (ret <= 0) { +- buflen = (buflen == 0) ? ret : buflen; +- break; +- } +- +- buflen += ret; +- +- if (ret < message->msg_iov[i].iov_len) { +- break; +- } +- } +- +- if (buflen > 0) { +- notice_stack_send(sock, s, buflen, flags); +- } +- return buflen; +-} +- +-static struct pbuf *pbuf_free_partial(struct pbuf *pbuf, uint16_t free_len) +-{ +- uint32_t tot_len = pbuf->tot_len - free_len; +- +- while (free_len && pbuf) { +- if (free_len >= pbuf->len) { +- free_len = free_len - pbuf->len; +- pbuf = pbuf->next; +- } else { +- pbuf_remove_header(pbuf, free_len); +- break; +- } +- } +- +- if (pbuf) { +- pbuf->tot_len = tot_len; +- } +- return pbuf; +-} +- +-static bool recv_break_for_err(struct lwip_sock *sock) +-{ +- errno = err_to_errno(netconn_err(sock->conn)); +- unsigned pending = sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0) | +- sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0); +- return pending; +-} +- +-/* +- * return 0 on success, -1 on error +- * pbuf maybe NULL(tcp fin packet) +- */ +-static int recv_ring_get_one(struct lwip_sock *sock, bool noblock, struct pbuf **pbuf) +-{ +- int32_t expect; +- uint64_t time_stamp = sys_now_us(); +- +- if (sock->recv_lastdata != NULL) { +- *pbuf = sock->recv_lastdata; +- sock->recv_lastdata = NULL; +- return 0; +- } +- +- expect = gazelle_ring_read(sock->recv_ring, (void **)pbuf, 1); +- if (expect == 0) { +- if (netconn_is_nonblocking(sock->conn)) { +- GAZELLE_RETURN(EAGAIN); +- } +- sock_event_wait(sock, true); +- expect = gazelle_ring_read(sock->recv_ring, (void **)pbuf, 1); +- if (expect == 0) { +- if (recv_break_for_err(sock)) { +- return -1; +- } +- GAZELLE_RETURN(EAGAIN); +- } +- } +- +- if (get_protocol_stack_group()->latency_start) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_lstack_latency(&stack->latency, *pbuf, GAZELLE_LATENCY_READ_APP_CALL, time_stamp); +- } +- +- return 0; +-} +- +-/* return true: fin is read to user, false: pend fin */ +-static bool recv_ring_handle_fin(struct lwip_sock *sock, struct pbuf *pbuf, ssize_t recvd) +-{ +- if (pbuf == NULL) { +- if (recvd > 0) { +- /* handle data first, then handle fin */ +- sock->recv_lastdata = (void *)&fin_packet; +- gazelle_ring_read_over(sock->recv_ring); +- return false; +- } +- gazelle_ring_read_over(sock->recv_ring); +- return true; +- } +- /* pending fin */ +- if (pbuf == (void *)&fin_packet) { +- return true; +- } +- +- return false; +-} +- +-static ssize_t recv_ring_tcp_read(struct lwip_sock *sock, void *buf, size_t len, bool noblock) +-{ +- ssize_t recvd = 0; +- size_t recv_left = len; +- uint32_t copy_len; +- struct pbuf *pbuf = NULL; +- +- if (len == 0) { +- return 0; +- } +- +- while (recv_left > 0) { +- if (recv_ring_get_one(sock, noblock | recvd, &pbuf) != 0) { +- 
/* When the buffer is empty, it will be returned directly +- if in non-blocking mode or if data has already been received */ +- break; +- } +- +- if (unlikely((pbuf == NULL) || (pbuf == (void *)&fin_packet))) { +- if (recv_ring_handle_fin(sock, pbuf, recvd)) { +- return 0; +- } else { +- break; /* recvd > 0, pending fin, handle data */ +- } +- } +- +- copy_len = (recv_left > pbuf->tot_len) ? pbuf->tot_len : recv_left; +- if (copy_len > UINT16_MAX) { +- copy_len = UINT16_MAX; /* it's impossible to get here */ +- } +- pbuf_copy_partial(pbuf, (char *)buf + recvd, copy_len, 0); +- +- recvd += copy_len; +- recv_left -= copy_len; +- +- if (pbuf->tot_len > copy_len) { +- sock->recv_lastdata = pbuf_free_partial(pbuf, copy_len); +- } else { +- if (get_protocol_stack_group()->latency_start) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); +- } +- +- gazelle_ring_read_over(sock->recv_ring); +- } +- } +- +- if (recvd > 0) { +- errno = 0; +- } else { +- recvd = -1; +- } +- +- return recvd; +-} +- +-static ssize_t recv_ring_udp_read(struct lwip_sock *sock, void *buf, size_t len, bool noblock, +- struct sockaddr *addr, socklen_t *addrlen) +-{ +- size_t recv_left = len; +- struct pbuf *pbuf = NULL; +- uint32_t copy_len; +- +- sock->recv_lastdata = NULL; +- +- if (recv_ring_get_one(sock, noblock, &pbuf) != 0) { +- /* errno have set */ +- return -1; +- } +- +- copy_len = (recv_left > pbuf->tot_len) ? pbuf->tot_len : recv_left; +- pbuf_copy_partial(pbuf, (char *)buf, copy_len, 0); +- /* drop remaining data if have */ +- gazelle_ring_read_over(sock->recv_ring); +- +- if (pbuf && addr && addrlen) { +- lwip_sock_make_addr(sock->conn, &(pbuf->addr), pbuf->port, addr, addrlen); +- } +- +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- if (copy_len < pbuf->tot_len) { +- stack->stats.sock_rx_drop++; +- } +- if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_READ_LSTACK, 0); +- } +- +- return copy_len; +-} +- +-ssize_t do_lwip_read_from_stack(int32_t fd, void *buf, size_t len, int32_t flags, +- struct sockaddr *addr, socklen_t *addrlen) +-{ +- ssize_t recvd = 0; +- struct lwip_sock *sock = lwip_get_socket(fd); +- bool noblock = (flags & MSG_DONTWAIT) || netconn_is_nonblocking(sock->conn); +- +- if (recv_break_for_err(sock)) { +- return -1; +- } +- +- if (unlikely(sock->affinity_numa == 0)) { +- thread_bind_stack(sock->stack_id); +- sock->affinity_numa = 1; +- } +- +-#if GAZELLE_SAME_NODE +- if (sock->same_node_rx_ring != NULL) { +- recvd = gazelle_same_node_ring_recv(sock, buf, len, flags); +- } else +-#endif /* GAZELLE_SAME_NODE */ +- if (NETCONN_IS_UDP(sock)) { +- recvd = recv_ring_udp_read(sock, buf, len, noblock, addr, addrlen); +- } else { +- recvd = recv_ring_tcp_read(sock, buf, len, noblock); +- } +- +- API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, recvd); +- +- if (recvd < 0) { +- return -1; +- } +- return recvd; +-} +- +-void do_lwip_add_recvlist(int32_t fd) +-{ +- struct lwip_sock *sock = lwip_get_socket(fd); +- +- if (sock && list_node_null(&sock->recv_list)) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- list_add_node(&sock->recv_list, &stack->recv_list); +- } +-} +- +-void do_lwip_read_recvlist(struct protocol_stack *stack, uint32_t max_num) +-{ +- struct list_node *list = &(stack->recv_list); +- struct list_node *node, *temp; +- struct lwip_sock *sock; +- uint32_t 
read_num = 0; +- +- list_for_each_node(node, temp, list) { +- sock = list_entry(node, struct lwip_sock, recv_list); +- +- if (++read_num > max_num) { +- /* list head move to next send */ +- list_del_node(&stack->recv_list); +- list_add_node(&stack->recv_list, &sock->recv_list); +- break; +- } +- +- if (sock->conn == NULL || sock->conn->recvmbox == NULL || rte_ring_count(sock->conn->recvmbox->ring) == 0) { +- list_del_node(&sock->recv_list); +- continue; +- } +- +- if (get_protocol_stack_group()->latency_start) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_sock_latency(&stack->latency, sock, GAZELLE_LATENCY_RECVMBOX_READY); +- } +- +- ssize_t len = 0; +- if (NETCONN_IS_UDP(sock)) { +- len = lwip_recv(sock->conn->callback_arg.socket, NULL, SSIZE_MAX, 0); +- } else { +- len = lwip_recv(sock->conn->callback_arg.socket, NULL, 0, 0); +- } +- if (len < 0 && errno != EAGAIN) { +- API_EVENT(sock->conn, NETCONN_EVT_ERROR, 0); +- /* = 0: fin */ +- } else if (len >= 0) { +- API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); +- } +- } +-} +- +-void do_lwip_connected_callback(int fd) +-{ +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock)) { +- return; +- } +- +- if (POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { +- /* delete kernel event */ +- if (sock->sk_wait != NULL) { +- posix_api->epoll_ctl_fn(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL); +- } +- /* shutdown kernel connect, do_connect() has tried both kernel and lwip. */ +- posix_api->shutdown_fn(fd, SHUT_RDWR); +- } +- +- POSIX_SET_TYPE(sock, POSIX_LWIP); +- +- API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); +-} +- +-static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const struct tcp_pcb *pcb) +-{ +- struct netconn *netconn = (struct netconn *)pcb->callback_arg; +- +- conn->lip = *((gz_addr_t *)&pcb->local_ip); +- conn->rip = *((gz_addr_t *)&pcb->remote_ip); +- conn->l_port = pcb->local_port; +- conn->r_port = pcb->remote_port; +- conn->in_send = pcb->snd_queuelen; +- conn->tcp_sub_state = pcb->state; +- conn->cwn = pcb->cwnd; +- conn->rcv_wnd = pcb->rcv_wnd; +- conn->snd_wnd = pcb->snd_wnd; +- conn->snd_buf = pcb->snd_buf; +- conn->lastack = pcb->lastack; +- conn->snd_nxt = pcb->snd_nxt; +- conn->rcv_nxt = pcb->rcv_nxt; +- conn->keepalive = (ip_get_option(pcb, SOF_KEEPALIVE) != 0); +- conn->keep_idle = pcb->keep_idle; +- conn->keep_intvl = pcb->keep_intvl; +- conn->keep_cnt = pcb->keep_cnt; +- conn->pingpong = tcp_in_pingpong(pcb); +- +- if (netconn != NULL) { +- conn->fd = netconn->callback_arg.socket; +- conn->recv_cnt = (netconn->recvmbox == NULL) ? 0 : rte_ring_count(netconn->recvmbox->ring); +- struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); +- if (!POSIX_IS_CLOSED(sock)) { +- conn->recv_ring_cnt = (sock->recv_ring == NULL) ? 0 : gazelle_ring_readable_count(sock->recv_ring); +- conn->recv_ring_cnt += (sock->recv_lastdata) ? 1 : 0; +- conn->send_ring_cnt = (sock->send_ring == NULL) ? 
0 : gazelle_ring_readover_count(sock->send_ring); +- conn->events = sock->sk_event.pending; +- conn->epoll_events = sock->sk_event.events; +- conn->eventlist = !list_node_null(&sock->sk_event.event_node); +- } +- } +-} +- +-void do_lwip_clone_sockopt(struct lwip_sock *dst_sock, struct lwip_sock *src_sock) +-{ +- dst_sock->conn->pcb.ip->so_options = src_sock->conn->pcb.ip->so_options; +- dst_sock->conn->pcb.ip->ttl = src_sock->conn->pcb.ip->ttl; +- dst_sock->conn->pcb.ip->tos = src_sock->conn->pcb.ip->tos; +- dst_sock->conn->flags = src_sock->conn->flags; +- if (NETCONN_IS_UDP(src_sock)) { +- dst_sock->conn->pcb.udp->flags = src_sock->conn->pcb.udp->flags; +- dst_sock->conn->pcb.udp->mcast_ifindex = src_sock->conn->pcb.udp->mcast_ifindex; +- dst_sock->conn->pcb.udp->mcast_ttl = src_sock->conn->pcb.udp->mcast_ttl; +- } else { +- dst_sock->conn->pcb.tcp->netif_idx = src_sock->conn->pcb.tcp->netif_idx; +- dst_sock->conn->pcb.tcp->flags = src_sock->conn->pcb.tcp->flags; +- dst_sock->conn->pcb.tcp->keep_idle = src_sock->conn->pcb.tcp->keep_idle; +- dst_sock->conn->pcb.tcp->keep_intvl = src_sock->conn->pcb.tcp->keep_intvl; +- dst_sock->conn->pcb.tcp->keep_cnt = src_sock->conn->pcb.tcp->keep_cnt; +- } +-} +- +-uint32_t do_lwip_get_conntable(struct gazelle_stat_lstack_conn_info *conn, +- uint32_t max_num) +-{ +- struct tcp_pcb *pcb = NULL; +- uint32_t conn_num = 0; +- +- if (conn == NULL) { +- return -1; +- } +- +- for (pcb = tcp_active_pcbs; pcb != NULL && conn_num < max_num; pcb = pcb->next) { +- conn[conn_num].state = GAZELLE_ACTIVE_LIST; +- copy_pcb_to_conn(conn + conn_num, pcb); +- conn_num++; +- } +- +- for (pcb = tcp_tw_pcbs; pcb != NULL && conn_num < max_num; pcb = pcb->next) { +- conn[conn_num].state = GAZELLE_TIME_WAIT_LIST; +- copy_pcb_to_conn(conn + conn_num, pcb); +- conn_num++; +- } +- +- for (struct tcp_pcb_listen *pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL && conn_num < max_num; +- pcbl = pcbl->next) { +- conn[conn_num].state = GAZELLE_LISTEN_LIST; +- conn[conn_num].lip = *((gz_addr_t *)&pcbl->local_ip); +- conn[conn_num].l_port = pcbl->local_port; +- conn[conn_num].tcp_sub_state = pcbl->state; +- struct netconn *netconn = (struct netconn *)pcbl->callback_arg; +- conn[conn_num].fd = netconn != NULL ? 
netconn->callback_arg.socket : -1; +- if (netconn != NULL && netconn->acceptmbox != NULL) { +- conn[conn_num].recv_cnt = rte_ring_count(netconn->acceptmbox->ring); +- } +- conn_num++; +- } +- +- return conn_num; +-} +- +-uint32_t do_lwip_get_connnum(void) +-{ +- struct tcp_pcb *pcb = NULL; +- struct tcp_pcb_listen *pcbl = NULL; +- uint32_t conn_num = 0; +- +- for (pcb = tcp_active_pcbs; pcb != NULL; pcb = pcb->next) { +- conn_num++; +- } +- +- for (pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL; pcbl = pcbl->next) { +- conn_num++; +- } +- +- for (pcb = tcp_tw_pcbs; pcb != NULL; pcb = pcb->next) { +- conn_num++; +- } +- +- return conn_num; +-} +- +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 8f01f31..983f2f0 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -332,13 +332,14 @@ static const struct mempool_ops mbuf_mp_ops = { + }; + + +-static struct rte_mempool *mbuf_pool_create(int stack_id) ++static struct rte_mempool *mbuf_pool_create(int stack_id, uint16_t numa_id) + { + struct cfg_params *cfg_params = get_global_cfg_params(); + char name[RTE_MEMPOOL_NAMESIZE]; + struct rte_mempool *pool; + uint32_t total_conn_mbufs, total_nic_mbufs, total_mbufs; + uint16_t private_size; ++ uint16_t xdp_metadata = 0; + + total_conn_mbufs = cfg_params->mbuf_count_per_conn * cfg_params->tcp_conn_count; + total_nic_mbufs = cfg_params->rxqueue_size + cfg_params->txqueue_size; +@@ -351,9 +352,13 @@ static struct rte_mempool *mbuf_pool_create(int stack_id) + } + + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%hu", "mbuf_pool", stack_id); +- private_size = RTE_ALIGN(sizeof(struct mbuf_private) + 24, RTE_CACHE_LINE_SIZE); ++ /* reserved for xdp metadata, see struct xsk_tx_metadata in /usr/include/linux/if_xdp.h */ ++ if (xdp_eth_enabled()) { ++ xdp_metadata = 24; ++ } ++ private_size = RTE_ALIGN(sizeof(struct mbuf_private) + xdp_metadata, RTE_CACHE_LINE_SIZE); + +- pool = mbuf_mp_ops.create(name, total_mbufs, MBUFPOOL_CACHE_NUM, private_size, MBUF_DATA_SIZE, rte_socket_id()); ++ pool = mbuf_mp_ops.create(name, total_mbufs, MBUFPOOL_CACHE_NUM, private_size, MBUF_DATA_SIZE, numa_id); + if (pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "rte_pktmbuf_pool_create %s failed, rte_errno %d\n", name, rte_errno); + return NULL; +@@ -362,7 +367,7 @@ static struct rte_mempool *mbuf_pool_create(int stack_id) + return pool; + } + +-static struct rte_mempool *rpc_pool_create(int stack_id) ++static struct rte_mempool *rpc_pool_create(int stack_id, uint16_t numa_id) + { + char name [RTE_MEMPOOL_NAMESIZE]; + struct rte_mempool *pool; +@@ -370,7 +375,7 @@ static struct rte_mempool *rpc_pool_create(int stack_id) + + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%hu", "rpc_pool", stack_id); + +- pool = mem_mp_ops.create(name, total_bufs, MEMPOOL_CACHE_NUM, 0, sizeof(struct rpc_msg), rte_socket_id()); ++ pool = mem_mp_ops.create(name, total_bufs, MEMPOOL_CACHE_NUM, 0, sizeof(struct rpc_msg), numa_id); + if (pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "rte_mempool_create %s failed, rte_errno %d\n", name, rte_errno); + } +@@ -392,16 +397,16 @@ void mem_stack_pool_free(int stack_id) + } + } + +-int mem_stack_pool_init(int stack_id) ++int mem_stack_pool_init(int stack_id, unsigned numa_id) + { + struct mem_stack *ms = mem_stack_get(stack_id); + +- ms->mbuf_pool = mbuf_pool_create(stack_id); ++ ms->mbuf_pool = mbuf_pool_create(stack_id, numa_id); + if (ms->mbuf_pool == NULL) { + return -1; + } + +- ms->rpc_pool = rpc_pool_create(stack_id); ++ ms->rpc_pool = 
rpc_pool_create(stack_id, numa_id); + if (ms->rpc_pool == NULL) { + mem_stack_pool_free(stack_id); + return -1; +@@ -469,7 +474,7 @@ void mem_thread_cache_free(struct mem_thread *mt) + + int mem_thread_cache_init(struct mem_thread *mt) + { +- if (!get_global_cfg_params()->stack_mode_rtc && !dpdk_nic_is_xdp()) { ++ if (!get_global_cfg_params()->stack_mode_rtc && !xdp_eth_enabled()) { + char name [RTE_MEMPOOL_NAMESIZE]; + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%p", "migrate_ring", mt); + +diff --git a/src/lstack/core/lstack_preload.c b/src/lstack/core/lstack_preload.c +index bdb61e9..2de25f9 100644 +--- a/src/lstack/core/lstack_preload.c ++++ b/src/lstack/core/lstack_preload.c +@@ -94,15 +94,10 @@ enum posix_type select_sock_posix_path(struct lwip_sock *sock) + } + + /* CLOSED means not sockfd, such as file fd or unix fd */ +- if (POSIX_IS_CLOSED(sock) || POSIX_IS_TYPE(sock, POSIX_KERNEL)) { ++ if (POSIX_IS_CLOSED(sock)) { + return POSIX_KERNEL; + } +- +- if (likely(POSIX_IS_TYPE(sock, POSIX_LWIP))) { +- return POSIX_LWIP; +- } +- +- return POSIX_ALL; ++ return sock->type; + } + + enum posix_type select_posix_path(void) +@@ -193,6 +188,6 @@ int preload_info_init(void) + } + + g_preload_info.preload_switch = 1; +- ++ LSTACK_PRE_LOG(LSTACK_INFO, "LD_PRELOAD ok\n"); + return preload_check_bind_proc(); + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 1e7df33..c07d8e7 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -27,13 +27,13 @@ + #include "lstack_cfg.h" + #include "lstack_dpdk.h" + #include "lstack_ethdev.h" +-#include "lstack_lwip.h" + #include "lstack_control_plane.h" + #include "lstack_wait.h" + #include "lstack_stack_stat.h" + #include "lstack_virtio.h" + #include "lstack_interrupt.h" + #include "lstack_protocol_stack.h" ++#include "lstack_mempool.h" + + #if RTE_VERSION < RTE_VERSION_NUM(23, 11, 0, 0) + #include +@@ -349,8 +349,8 @@ static int32_t init_stack_value(struct protocol_stack *stack, void *arg) + stack->stack_idx = t_params->idx; + stack->lwip_stats = &lwip_stats; + +- list_init_head(&stack->recv_list); +- list_init_head(&stack->same_node_recv_list); ++ rpc_queue_init(&stack->rpc_queue, stack->queue_id); ++ rpc_queue_init(&stack->dfx_rpc_queue, stack->queue_id); + + stack_group->stacks[t_params->idx] = stack; + set_stack_idx(t_params->idx); +@@ -367,11 +367,6 @@ static int32_t init_stack_value(struct protocol_stack *stack, void *arg) + } + } + +- if (pktmbuf_pool_init(stack) != 0) { +- LSTACK_LOG(ERR, LSTACK, "pktmbuf_pool_init failed\n"); +- return -1; +- } +- + if (create_shared_ring(stack) != 0) { + LSTACK_LOG(ERR, LSTACK, "create_shared_ring failed\n"); + return -1; +@@ -426,11 +421,14 @@ static struct protocol_stack *stack_thread_init(void *arg) + if (stack_affinity_cpu(stack->cpu_id) != 0) { + goto END; + } +- RTE_PER_LCORE(_lcore_id) = stack->cpu_id; + } else { + stack_affinity_numa(stack->numa_id); + } + ++ if (mem_stack_mpcache_init(stack->stack_idx, stack->cpu_id) < 0) { ++ goto END; ++ } ++ + lwip_init(); + /* Using errno to return lwip_init() result. 
*/ + if (errno != 0) { +@@ -468,7 +466,6 @@ int stack_polling(unsigned wakeup_tick) + bool use_sockmap = cfg->use_sockmap; + bool stack_mode_rtc = cfg->stack_mode_rtc; + uint32_t rpc_number = cfg->rpc_number; +- uint32_t read_connect_number = cfg->read_connect_number; + struct protocol_stack *stack = get_protocol_stack(); + uint32_t timeout; + +@@ -490,8 +487,6 @@ int stack_polling(unsigned wakeup_tick) + return force_quit; + } + +- do_lwip_read_recvlist(stack, read_connect_number); +- + if ((wakeup_tick & 0xf) == 0) { + #if SOCK_WAIT_BATCH_NOTIFY + stack->stats.wakeup_events += lwip_wait_foreach_notify(stack->stack_idx); +@@ -505,10 +500,6 @@ int stack_polling(unsigned wakeup_tick) + /* run to completion mode currently does not support sockmap */ + if (use_sockmap) { + netif_poll(&stack->netif); +- /* reduce traversal times */ +- if ((wakeup_tick & 0xff) == 0) { +- read_same_node_recv_list(stack); +- } + } + #endif /* GAZELLE_SAME_NODE */ + +@@ -538,7 +529,6 @@ static bool stack_local_event_get(uint16_t stack_id) + struct protocol_stack *stack = g_stack_group.stacks[stack_id]; + if (!lockless_queue_empty(&stack->dfx_rpc_queue.queue) || + !lockless_queue_empty(&stack->rpc_queue.queue) || +- !list_head_empty(&stack->recv_list) || + !lwip_wait_notify_empty(stack_id) || + tx_cache_count(stack->queue_id)) { + return true; +@@ -582,9 +572,8 @@ static void* gazelle_stack_thread(void *arg) + + static int stack_group_init_mempool(void) + { ++ int ret; + struct cfg_params *cfg_params = get_global_cfg_params(); +- uint32_t total_mbufs = dpdk_pktmbuf_mempool_num(); +- struct rte_mempool *rxtx_mbuf = NULL; + uint32_t cpu_id = 0; + unsigned numa_id = 0; + int queue_id = 0; +@@ -607,13 +596,12 @@ static int stack_group_init_mempool(void) + return -1; + } + +- rxtx_mbuf = create_pktmbuf_mempool("rxtx_mbuf", total_mbufs, RXTX_CACHE_SZ, queue_id, numa_id); +- if (rxtx_mbuf == NULL) { +- LSTACK_LOG(ERR, LSTACK, "numid=%d, rxtx_mbuf idx=%d, create_pktmbuf_mempool fail\n", numa_id, queue_id); ++ ret = mem_stack_pool_init(queue_id, numa_id); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "mem_stack_pool_init failed, cpuid=%u, numid=%d, queue_id=%d\n", ++ cpu_id, numa_id, queue_id); + return -1; + } +- +- get_protocol_stack_group()->total_rxtx_pktmbuf_pool[queue_id] = rxtx_mbuf; + } + } + +@@ -633,6 +621,11 @@ int stack_group_init(void) + + stack_group->stack_setup_fail = 0; + ++ if (mem_thread_manager_init() != 0) { ++ LSTACK_LOG(ERR, LSTACK, "mem_thread_manager_init failed\n"); ++ return -1; ++ } ++ + if (get_global_cfg_params()->is_primary) { + if (stack_group_init_mempool() != 0) { + LSTACK_LOG(ERR, LSTACK, "stack group init mempool failed\n"); +@@ -640,7 +633,7 @@ int stack_group_init(void) + } + } + +- return 0; ++ return sock_wait_group_init(); + } + + int stack_setup_app_thread(void) +@@ -749,6 +742,24 @@ void stack_wait(void) + } + } + ++static void stack_exit_by_rpc(struct rpc_msg *msg) ++{ ++ stack_exit(); ++} ++ ++static int rpc_call_stack_exit(int stack_id) ++{ ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, stack_exit_by_rpc); ++ if (msg == NULL) { ++ return -1; ++ } ++ ++ msg->flags |= RPC_MSG_EXIT; ++ rpc_async_call(queue, msg, RPC_MSG_FREE | RPC_MSG_EXIT); ++ return 0; ++} ++ + void stack_group_exit(void) + { + int i; +@@ -762,7 +773,7 @@ void stack_group_exit(void) + } + + if (stack != stack_group->stacks[i]) { +- rpc_call_stack_exit(&stack_group->stacks[i]->rpc_queue); ++ rpc_call_stack_exit(i); + } + } + +diff --git 
a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index dc9c931..3b3bd75 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -26,29 +26,15 @@ + #include "common/gazelle_dfx_msg.h" + #include "lstack_thread_rpc.h" + #include "lstack_protocol_stack.h" +-#include "lstack_epoll.h" + #include "lstack_dpdk.h" + #include "lstack_stack_stat.h" + #include "lstack_virtio.h" +-#include "lstack_dump.h" ++#include "lstack_wait.h" ++#include "lstack_mempool.h" + +-void time_stamp_transfer_pbuf(struct pbuf *pbuf_old, struct pbuf *pbuf_new) ++void time_stamp_into_write(struct pbuf *pbufs[], uint32_t num) + { +- if (!get_protocol_stack_group()->latency_start) { +- return; +- } +- struct latency_timestamp *lt_old; +- struct latency_timestamp *lt_new; +- +- lt_old = &pbuf_to_private(pbuf_old)->lt; +- lt_new = &pbuf_to_private(pbuf_new)->lt; +- +- lt_new->stamp = lt_old->stamp; +- lt_new->check = lt_old->check; +- lt_new->type = lt_old->type; +- for (int i = 0; i < GAZELLE_LATENCY_MAX; i++) { +- lt_new->stamp_seg[i] = lt_old->stamp_seg[i]; +- } ++ time_stamp_into_pbuf(num, pbufs, sys_now_us()); + } + + void time_stamp_into_rpcmsg(struct lwip_sock *sock) +@@ -56,28 +42,29 @@ void time_stamp_into_rpcmsg(struct lwip_sock *sock) + sock->stamp.rpc_time_stamp = sys_now_us(); + } + +-void time_stamp_into_recvmbox(struct lwip_sock *sock) ++static void time_stamp_into_recvmbox(struct lwip_sock *sock) + { + sock->stamp.mbox_time_stamp = sys_now_us(); + } + + void time_stamp_record(int fd, struct pbuf *pbuf) + { +- struct lwip_sock *sock = lwip_get_socket(fd); +- +- if (get_protocol_stack_group()->latency_start && sock && pbuf) { +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_INTO_MBOX, 0); +- time_stamp_into_recvmbox(sock); ++ if (get_protocol_stack_group()->latency_start && pbuf) { ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (sock != NULL) { ++ calculate_sock_latency(sock, GAZELLE_LATENCY_RECVMBOX_READY); ++ calculate_lstack_latency(sock->stack_id, &pbuf, 1, GAZELLE_LATENCY_INTO_MBOX, 0); ++ time_stamp_into_recvmbox(sock); ++ } + } + } + +-void calculate_sock_latency(struct gazelle_stack_latency *stack_latency, struct lwip_sock *sock, +- enum GAZELLE_LATENCY_TYPE type) ++void calculate_sock_latency(struct lwip_sock *sock, enum GAZELLE_LATENCY_TYPE type) + { + uint64_t latency; + uint64_t stamp; + struct stack_latency *latency_stat; ++ struct protocol_stack *stack; + + if (type == GAZELLE_LATENCY_WRITE_RPC_MSG) { + stamp = sock->stamp.rpc_time_stamp; +@@ -87,12 +74,13 @@ void calculate_sock_latency(struct gazelle_stack_latency *stack_latency, struct + return; + } + +- if (stamp < stack_latency->start_time) { ++ stack = get_protocol_stack(); ++ if (stamp < stack->latency.start_time) { + return; + } + + latency = sys_now_us() - stamp; +- latency_stat = &stack_latency->latency[type]; ++ latency_stat = &stack->latency.latency[type]; + + latency_stat->latency_total += latency; + latency_stat->latency_max = (latency_stat->latency_max > latency) ? 
latency_stat->latency_max : latency; +@@ -112,47 +100,54 @@ void calculate_latency_stat(struct gazelle_stack_latency *stack_latency, uint64_ + latency_stat->latency_pkts++; + } + +-void calculate_lstack_latency(struct gazelle_stack_latency *stack_latency, const struct pbuf *pbuf, ++void calculate_lstack_latency(int stack_id, struct pbuf *const *pbufs, uint32_t num, + enum GAZELLE_LATENCY_TYPE type, uint64_t time_record) + { + uint64_t latency; + uint16_t lt_type; + struct latency_timestamp *lt; ++ struct gazelle_stack_latency *stack_latency; ++ struct protocol_stack *stack; + +- if (pbuf == NULL || type >= GAZELLE_LATENCY_MAX) { ++ stack = get_protocol_stack_by_id(stack_id); ++ if (stack == NULL) + return; +- } ++ stack_latency = &stack->latency; ++ lt_type = (type / GAZELLE_LATENCY_READ_MAX) ? GAZELLE_LATENCY_WR : GAZELLE_LATENCY_RD; + +- lt = &pbuf_to_private(pbuf)->lt; +- if (lt == NULL) { +- return; +- } ++ for (uint32_t i = 0; i < num; ++i) { ++ if (pbufs[i] == NULL) { ++ continue; ++ } ++ lt = &pbuf_to_private(pbufs[i])->lt; + +- lt_type = (type / GAZELLE_LATENCY_READ_MAX) ? GAZELLE_LATENCY_WR : GAZELLE_LATENCY_RD; +- if (lt->stamp != ~(lt->check) || lt->stamp < stack_latency->start_time || lt_type != lt->type) { +- return; +- } ++ if (lt->stamp != ~(lt->check) || ++ lt->stamp < stack_latency->start_time || ++ lt_type != lt->type) { ++ continue; ++ } + +- if (time_record == 0) { +- lt->stamp_seg[type] = sys_now_us() - lt->stamp; +- } else { +- lt->stamp_seg[type] = time_record > (lt->stamp_seg[type - 1] + lt->stamp) ? +- (time_record - lt->stamp) : lt->stamp_seg[type - 1]; +- } ++ if (time_record == 0) { ++ lt->stamp_seg[type] = sys_now_us() - lt->stamp; ++ } else { ++ lt->stamp_seg[type] = time_record > (lt->stamp_seg[type - 1] + lt->stamp) ? ++ (time_record - lt->stamp) : lt->stamp_seg[type - 1]; ++ } + +- latency = lt->stamp_seg[type]; +- if (((lt_type == GAZELLE_LATENCY_RD && type > GAZELLE_LATENCY_READ_LWIP) || +- (lt_type == GAZELLE_LATENCY_WR && type > GAZELLE_LATENCY_WRITE_INTO_RING)) && +- latency >= lt->stamp_seg[type - 1]) { +- latency -= lt->stamp_seg[type - 1]; +- } ++ latency = lt->stamp_seg[type]; ++ if (((lt_type == GAZELLE_LATENCY_RD && type > GAZELLE_LATENCY_INTO_MBOX) || ++ (lt_type == GAZELLE_LATENCY_WR && type > GAZELLE_LATENCY_WRITE_INTO_RING)) && ++ latency >= lt->stamp_seg[type - 1]) { ++ latency -= lt->stamp_seg[type - 1]; ++ } + +- /* calculate the time of the entire read/write process */ +- if (type == GAZELLE_LATENCY_READ_MAX - 1 || type == GAZELLE_LATENCY_WRITE_MAX - 1) { +- calculate_latency_stat(stack_latency, lt->stamp_seg[type], type + 1); +- } ++ /* calculate the time of the entire read/write process */ ++ if (type == GAZELLE_LATENCY_READ_MAX - 1 || type == GAZELLE_LATENCY_WRITE_MAX - 1) { ++ calculate_latency_stat(stack_latency, lt->stamp_seg[type], type + 1); ++ } + +- calculate_latency_stat(stack_latency, latency, type); ++ calculate_latency_stat(stack_latency, latency, type); ++ } + } + + void lstack_calculate_aggregate(int type, uint32_t len) +@@ -240,13 +235,7 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + int32_t rpc_call_result = rpc_msgcnt(&stack->rpc_queue); + dfx->data.pkts.call_msg_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; + +- if (stack_get_state(stack) == RUNNING) { +- rpc_call_result = rpc_call_mbufpoolsize(&stack->dfx_rpc_queue); +- dfx->data.pkts.mbufpool_avail_cnt = (rpc_call_result < 0) ? 
0 : rpc_call_result; +- +- rpc_call_result = rpc_call_recvlistcnt(&stack->dfx_rpc_queue); +- dfx->data.pkts.recv_list_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; +- } ++ dfx->data.pkts.mbufpool_avail_cnt = mem_stack_mbuf_pool_count(stack->stack_idx); + + dfx->data.pkts.conn_num = stack->conn_num; + } +@@ -323,13 +312,12 @@ static void get_stack_dfx_data(struct gazelle_stack_dfx_data *dfx, struct protoc + break; + case GAZELLE_STAT_LSTACK_SHOW_CONN: + if (stack_get_state(stack) == RUNNING) { +- rpc_call_result = rpc_call_conntable(&stack->dfx_rpc_queue, dfx->data.conn.conn_list, ++ rpc_call_result = rpc_call_conntable(stack->stack_idx, dfx->data.conn.conn_list, + GAZELLE_LSTACK_MAX_CONN); + dfx->data.conn.conn_num = (rpc_call_result < 0) ? 0 : rpc_call_result; +- rpc_call_result = rpc_call_connnum(&stack->dfx_rpc_queue); ++ rpc_call_result = rpc_call_connnum(stack->stack_idx); + dfx->data.conn.total_conn_num = (rpc_call_result < 0) ? 0 : rpc_call_result; + } +- + break; + case GAZELLE_STAT_LSTACK_SHOW_LATENCY: + ret = memcpy_s(&dfx->data.latency, sizeof(dfx->data.latency), &stack->latency, sizeof(stack->latency)); +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index a831d3b..7b3e432 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -9,33 +9,17 @@ + * PURPOSE. + * See the Mulan PSL v2 for more details. + */ +-#include +-#include +-#include + +-#include "lwip/lwipgz_posix_api.h" ++#include ++#include + + #include "lstack_log.h" + #include "lstack_cfg.h" +-#include "lstack_dpdk.h" + #include "lstack_stack_stat.h" + #include "lstack_protocol_stack.h" + #include "lstack_thread_rpc.h" +-#include "lstack_epoll.h" +-#include "lstack_lwip.h" +- +-struct rpc_pool_array { +-#define RPC_POOL_MAX_COUNT 1024 +- struct rpc_msg_pool *array[RPC_POOL_MAX_COUNT]; +- pthread_mutex_t lock; +- int cur_count; +-}; ++#include "lstack_mempool.h" + +-static struct rpc_pool_array g_rpc_pool_array = { +- .lock = PTHREAD_MUTEX_INITIALIZER, +-}; +- +-static PER_THREAD struct rpc_msg_pool *g_rpc_pool = NULL; + static struct rpc_stats g_rpc_stats; + + struct rpc_stats *rpc_stats_get(void) +@@ -44,114 +28,60 @@ struct rpc_stats *rpc_stats_get(void) + } + + __rte_always_inline +-static struct rpc_msg *get_rpc_msg(struct rpc_msg_pool *rpc_pool) +-{ +- int ret; +- struct rpc_msg *msg = NULL; +- ret = rte_mempool_get(rpc_pool->mempool, (void **)&msg); +- if (ret < 0) { +- errno = ENOMEM; +- return NULL; +- } +- return msg; +-} +- +-__rte_always_inline +-static void rpc_msg_init(struct rpc_msg *msg, rpc_func_t func, struct rpc_msg_pool *pool) ++static void rpc_msg_init(struct rpc_msg *msg, rpc_func_t func) + { + msg->func = func; +- msg->rpcpool = pool; +- msg->recall_flag = 0; ++ msg->flags = 0; + pthread_spin_init(&msg->lock, PTHREAD_PROCESS_PRIVATE); +-} +- +-static struct rpc_msg_pool *rpc_msg_pool_init(void) +-{ +- struct rpc_msg_pool *rpc_pool; +- pthread_mutex_lock(&g_rpc_pool_array.lock); +- if (g_rpc_pool_array.cur_count >= RPC_POOL_MAX_COUNT) { +- pthread_mutex_unlock(&g_rpc_pool_array.lock); +- return g_rpc_pool_array.array[rte_gettid() % RPC_POOL_MAX_COUNT]; +- } + +- rpc_pool = calloc(1, sizeof(struct rpc_msg_pool)); +- if (rpc_pool == NULL) { +- LSTACK_LOG(INFO, LSTACK, "g_rpc_pool calloc failed\n"); +- goto END; +- } +- rpc_pool->mempool = +- create_mempool("rpc_pool", get_global_cfg_params()->rpc_msg_max, sizeof(struct rpc_msg), 0, rte_gettid()); +- if (rpc_pool->mempool == NULL) { +- LSTACK_LOG(INFO, LSTACK, 
"rpc_pool create failed, errno is %d\n", errno); +- free(rpc_pool); +- goto END; +- } +- +- g_rpc_pool_array.array[g_rpc_pool_array.cur_count++] = rpc_pool; +- pthread_mutex_unlock(&g_rpc_pool_array.lock); +- return rpc_pool; +-END: +- pthread_mutex_unlock(&g_rpc_pool_array.lock); +- g_rpc_stats.call_alloc_fail++; +- return NULL; ++ lockless_queue_node_set_poped(&msg->queue_node); + } + +- +-static struct rpc_msg *rpc_msg_alloc(rpc_func_t func) ++struct rpc_msg *rpc_msg_alloc(int stack_id, rpc_func_t func) + { + struct rpc_msg *msg; + +- if (unlikely(g_rpc_pool == NULL)) { +- g_rpc_pool = rpc_msg_pool_init(); +- if (g_rpc_pool == NULL) { +- exit(-1); +- } +- } +- +- msg = get_rpc_msg(g_rpc_pool); ++ msg = mem_get_rpc(stack_id); + if (unlikely(msg == NULL)) { + g_rpc_stats.call_alloc_fail++; + return NULL; + } + +- rpc_msg_init(msg, func, g_rpc_pool); ++ rpc_msg_init(msg, func); + return msg; + } + +-__rte_always_inline +-static void rpc_msg_free(struct rpc_msg *msg) ++void rpc_msg_free(struct rpc_msg *msg) + { + pthread_spin_destroy(&msg->lock); +- if (msg->rpcpool != NULL && msg->rpcpool->mempool != NULL) { +- rte_mempool_put(msg->rpcpool->mempool, (void *)msg); +- } else { +- free(msg); +- } ++ mem_put_rpc(msg); + } + +-__rte_always_inline +-static void rpc_call(rpc_queue *queue, struct rpc_msg *msg) ++void rpc_async_call(rpc_queue *queue, struct rpc_msg *msg, int flags) + { +- lockless_queue_mpsc_push(&queue->queue, &msg->queue_node); +- intr_wakeup(queue->queue_id, INTR_REMOTE_EVENT); +-} ++ if (flags & RPC_MSG_RECALL) ++ msg->flags |= flags; /* if RECALL, keep the previous flags. */ ++ else ++ msg->flags = flags & (~RPC_MSG_SYNC); + +-__rte_always_inline +-static void rpc_async_call(rpc_queue *queue, struct rpc_msg *msg) +-{ +- msg->sync_flag = 0; +- rpc_call(queue, msg); ++ if (msg->flags & RPC_MSG_REUSE) ++ lockless_queue_mpsc_test_push(&queue->queue, &msg->queue_node); ++ else ++ lockless_queue_mpsc_push(&queue->queue, &msg->queue_node); ++ ++ intr_wakeup(queue->queue_id, INTR_REMOTE_EVENT); + } + +-__rte_always_inline +-static int rpc_sync_call(rpc_queue *queue, struct rpc_msg *msg) ++int rpc_sync_call(rpc_queue *queue, struct rpc_msg *msg) + { + int ret; + + pthread_spin_trylock(&msg->lock); + +- msg->sync_flag = 1; +- rpc_call(queue, msg); ++ msg->flags = RPC_MSG_SYNC; ++ lockless_queue_mpsc_push(&queue->queue, &msg->queue_node); ++ ++ intr_wakeup(queue->queue_id, INTR_REMOTE_EVENT); + + // waiting stack unlock + pthread_spin_lock(&msg->lock); +@@ -161,36 +91,15 @@ static int rpc_sync_call(rpc_queue *queue, struct rpc_msg *msg) + return ret; + } + +-int rpc_msgcnt(rpc_queue *queue) ++void rpc_queue_init(rpc_queue *queue, uint16_t queue_id) + { +- return lockless_queue_count(&queue->queue); ++ lockless_queue_init(&queue->queue); ++ queue->queue_id = queue_id; + } + +-static struct rpc_msg *rpc_msg_alloc_except(rpc_func_t func) +-{ +- struct rpc_msg *msg = calloc(1, sizeof(struct rpc_msg)); +- if (msg == NULL) { +- return NULL; +- } +- +- rpc_msg_init(msg, func, NULL); +- return msg; +-} +- +-static void stack_exit_by_rpc(struct rpc_msg *msg) +-{ +- stack_exit(); +-} +- +-int rpc_call_stack_exit(rpc_queue *queue) ++int rpc_msgcnt(rpc_queue *queue) + { +- struct rpc_msg *msg = rpc_msg_alloc_except(stack_exit_by_rpc); +- if (msg == NULL) { +- return -1; +- } +- +- rpc_async_call(queue, msg); +- return 0; ++ return lockless_queue_count(&queue->queue); + } + + int rpc_poll_msg(rpc_queue *queue, int max_num) +@@ -203,7 +112,6 @@ int rpc_poll_msg(rpc_queue *queue, int max_num) + if 
(node == NULL) { + break; + } +- + msg = container_of(node, struct rpc_msg, queue_node); + + if (likely(msg->func)) { +@@ -212,17 +120,19 @@ int rpc_poll_msg(rpc_queue *queue, int max_num) + g_rpc_stats.call_null++; + } + +- if (unlikely(msg->func == stack_exit_by_rpc)) { +- force_quit = 1; +- } +- if (msg->recall_flag) { +- msg->recall_flag = 0; ++ if (msg->flags & RPC_MSG_RECALL) { ++ msg->flags &= ~RPC_MSG_RECALL; + continue; + } + +- if (msg->sync_flag) { ++ if (unlikely(msg->flags & RPC_MSG_EXIT)) { ++ force_quit = 1; ++ } ++ ++ if (msg->flags & RPC_MSG_SYNC) { + pthread_spin_unlock(&msg->lock); +- } else { ++ } ++ if (msg->flags & RPC_MSG_FREE) { + rpc_msg_free(msg); + } + } +@@ -231,560 +141,145 @@ int rpc_poll_msg(rpc_queue *queue, int max_num) + } + + +-static void callback_socket(struct rpc_msg *msg) +-{ +- msg->result = lwip_socket(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i); +- if (msg->result < 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, %ld socket failed\n", rte_gettid(), msg->result); +- } +-} +- +-static void callback_close(struct rpc_msg *msg) +-{ +- int fd = msg->args[MSG_ARG_0].i; +- struct lwip_sock *sock = lwip_get_socket(fd); +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- +- if (sock && __atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) > 0) { +- msg->recall_flag = 1; +- rpc_call(&stack->rpc_queue, msg); /* until stack_send recall finish */ +- return; +- } +- +- msg->result = lwip_close(fd); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- +-static void callback_shutdown(struct rpc_msg *msg) +-{ +- int fd = msg->args[MSG_ARG_0].i; +- int how = msg->args[MSG_ARG_1].i; +- struct lwip_sock *sock = lwip_get_socket(fd); +- struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); +- +- if (sock && __atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) > 0) { +- msg->recall_flag = 1; +- rpc_call(&stack->rpc_queue, msg); +- return; +- } +- +- msg->result = lwip_shutdown(fd, how); +- if (msg->result != 0 && errno != ENOTCONN) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), fd, msg->result); +- } +- +- posix_api->shutdown_fn(fd, how); +-} +- +-static void callback_bind(struct rpc_msg *msg) +-{ +- msg->result = lwip_bind(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].cp, msg->args[MSG_ARG_2].u); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- +-static void callback_listen(struct rpc_msg *msg) +-{ +- int fd = msg->args[MSG_ARG_0].i; +- int backlog = msg->args[MSG_ARG_1].i; +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- msg->result = -1; +- return; +- } +- +- /* new listen add to stack listen list */ +- msg->result = lwip_listen(fd, backlog); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d failed %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- +-static void callback_create_shadow_fd(struct rpc_msg *msg) +-{ +- int fd = msg->args[MSG_ARG_0].i; +- struct sockaddr *addr = msg->args[MSG_ARG_1].p; +- socklen_t addr_len = msg->args[MSG_ARG_2].u; +- +- int clone_fd = 0; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "get sock null fd=%d\n", fd); +- msg->result = -1; +- return; +- } +- +- int domain = addr->sa_family; +- int type = NETCONN_IS_UDP(sock) ? 
SOCK_DGRAM : SOCK_STREAM; +- clone_fd = lwip_socket(domain, type, 0); +- if (clone_fd < 0) { +- LSTACK_LOG(ERR, LSTACK, "clone socket failed clone_fd=%d errno=%d\n", clone_fd, errno); +- msg->result = clone_fd; +- return; +- } +- +- struct lwip_sock *clone_sock = lwip_get_socket(clone_fd); +- if (clone_sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "get sock null fd=%d clone_fd=%d\n", fd, clone_fd); +- msg->result = -1; +- return; +- } +- +- do_lwip_clone_sockopt(clone_sock, sock); +- +- while (sock->listen_next) { +- sock = sock->listen_next; +- } +- sock->listen_next = clone_sock; +- +- int ret = lwip_bind(clone_fd, addr, addr_len); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, "clone bind failed clone_fd=%d errno=%d\n", clone_fd, errno); +- msg->result = ret; +- return; +- } +- +- msg->result = clone_fd; +-} +- +-static void callback_accept(struct rpc_msg *msg) ++static void callback_arp(struct rpc_msg *msg) + { +- int fd = msg->args[MSG_ARG_0].i; +- msg->result = -1; ++ struct rte_mbuf *mbuf = (struct rte_mbuf *)msg->args[MSG_ARG_0].p; + struct protocol_stack *stack = get_protocol_stack(); + +- int accept_fd = lwip_accept4(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p, msg->args[MSG_ARG_3].i); +- if (accept_fd < 0) { +- stack->stats.accept_fail++; +- LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); +- return; +- } +- +- struct lwip_sock *sock = lwip_get_socket(accept_fd); +- if (sock == NULL) { +- lwip_close(accept_fd); +- LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); +- return; +- } +- +- msg->result = accept_fd; +- stack->conn_num++; +- if (rte_ring_count(sock->conn->recvmbox->ring)) { +- do_lwip_add_recvlist(accept_fd); +- } +-} +- +-static void callback_connect(struct rpc_msg *msg) +-{ +- msg->result = lwip_connect(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].u); +- if (msg->result < 0) { +- msg->result = -errno; +- } +-} +- +-int rpc_call_socket(rpc_queue *queue, int domain, int type, int protocol) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_socket); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = domain; +- msg->args[MSG_ARG_1].i = type; +- msg->args[MSG_ARG_2].i = protocol; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_close(rpc_queue *queue, int fd) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_close); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_shutdown(rpc_queue *queue, int fd, int how) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_shutdown); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].i = how; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_bind(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_bind); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].cp = addr; +- msg->args[MSG_ARG_2].u = addrlen; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_listen(rpc_queue *queue, int s, int backlog) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_listen); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = s; +- msg->args[MSG_ARG_1].i = backlog; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_shadow_fd(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_create_shadow_fd); +- 
if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].cp = addr; +- msg->args[MSG_ARG_2].u = addrlen; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_accept(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen, int flags) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_accept); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].p = addr; +- msg->args[MSG_ARG_2].p = addrlen; +- msg->args[MSG_ARG_3].i = flags; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_connect(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_connect); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].cp = addr; +- msg->args[MSG_ARG_2].u = addrlen; +- +- int ret = rpc_sync_call(queue, msg); +- if (ret < 0) { +- errno = -ret; +- return -1; +- } +- return ret; +-} +- +-static void callback_getpeername(struct rpc_msg *msg) +-{ +- msg->result = lwip_getpeername(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); +-} +- +-static void callback_getsockname(struct rpc_msg *msg) +-{ +- msg->result = lwip_getsockname(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d fail %ld\n", rte_gettid(), msg->args[MSG_ARG_0].i, msg->result); +- } +-} +- +-static void callback_getsockopt(struct rpc_msg *msg) +-{ +- msg->result = lwip_getsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, +- msg->args[MSG_ARG_3].p, msg->args[MSG_ARG_4].p); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), +- msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); +- } +-} +- +-static void callback_setsockopt(struct rpc_msg *msg) +-{ +- msg->result = lwip_setsockopt(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, +- msg->args[MSG_ARG_3].cp, msg->args[MSG_ARG_4].u); +- if (msg->result != 0) { +- LSTACK_LOG(ERR, LSTACK, "tid %d, fd %d, level %d, optname %d, fail %ld\n", rte_gettid(), +- msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].i, msg->args[MSG_ARG_2].i, msg->result); +- } +-} +- +-int rpc_call_getpeername(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_getpeername); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].p = addr; +- msg->args[MSG_ARG_2].p = addrlen; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_getsockname(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_getsockname); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].p = addr; +- msg->args[MSG_ARG_2].p = addrlen; +- +- return rpc_sync_call(queue, msg); +-} +- +-int rpc_call_getsockopt(rpc_queue *queue, int fd, int level, int optname, void *optval, socklen_t *optlen) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_getsockopt); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].i = level; +- msg->args[MSG_ARG_2].i = optname; +- msg->args[MSG_ARG_3].p = optval; +- msg->args[MSG_ARG_4].p = optlen; +- +- return rpc_sync_call(queue, msg); ++ eth_dev_recv(mbuf, stack); + } + 
+-int rpc_call_setsockopt(rpc_queue *queue, int fd, int level, int optname, const void *optval, socklen_t optlen) ++int rpc_call_arp(int stack_id, void *mbuf) + { +- struct rpc_msg *msg = rpc_msg_alloc(callback_setsockopt); ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_arp); + if (msg == NULL) { + return -1; + } + +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].i = level; +- msg->args[MSG_ARG_2].i = optname; +- msg->args[MSG_ARG_3].cp = optval; +- msg->args[MSG_ARG_4].u = optlen; ++ msg->args[MSG_ARG_0].p = mbuf; + +- return rpc_sync_call(queue, msg); ++ rpc_async_call(queue, msg, RPC_MSG_FREE); ++ return 0; + } + +-static void callback_tcp_send(struct rpc_msg *msg) +-{ +- int fd = msg->args[MSG_ARG_0].i; +- size_t len = UINT16_MAX; /* ignore msg->args[MSG_ARG_1].size; */ +- struct protocol_stack *stack = get_protocol_stack(); +- int ret; +- msg->result = -1; +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (unlikely(POSIX_IS_CLOSED(sock))) { +- return; +- } +- +- if (get_protocol_stack_group()->latency_start) { +- calculate_sock_latency(&stack->latency, sock, GAZELLE_LATENCY_WRITE_RPC_MSG); +- } ++static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const struct tcp_pcb *pcb) ++{ ++ struct netconn *netconn = (struct netconn *)pcb->callback_arg; ++ const struct mbox_ring *mr; ++ ++ conn->lip = *((gz_addr_t *)&pcb->local_ip); ++ conn->rip = *((gz_addr_t *)&pcb->remote_ip); ++ conn->l_port = pcb->local_port; ++ conn->r_port = pcb->remote_port; ++ conn->in_send = pcb->snd_queuelen; ++ conn->tcp_sub_state = pcb->state; ++ conn->cwn = pcb->cwnd; ++ conn->rcv_wnd = pcb->rcv_wnd; ++ conn->snd_wnd = pcb->snd_wnd; ++ conn->snd_buf = pcb->snd_buf; ++ conn->lastack = pcb->lastack; ++ conn->snd_nxt = pcb->snd_nxt; ++ conn->rcv_nxt = pcb->rcv_nxt; ++ conn->keepalive = (ip_get_option(pcb, SOF_KEEPALIVE) != 0); ++ conn->keep_idle = pcb->keep_idle; ++ conn->keep_intvl = pcb->keep_intvl; ++ conn->keep_cnt = pcb->keep_cnt; ++ conn->pingpong = tcp_in_pingpong(pcb); ++ ++ if (netconn != NULL) { ++ if (sys_mbox_valid(&netconn->recvmbox)) { ++ mr = &netconn->recvmbox->mring; ++ conn->recvmbox_cnt = mr->ops->recv_count(mr); ++ conn->recvmbox_tail = mr->tail_count(mr); ++ } ++ if (sys_mbox_valid(&netconn->sendmbox)) { ++ mr = &netconn->sendmbox->mring; ++ conn->sendmbox_cnt = mr->ops->count(mr); ++ conn->sendmbox_tail = mr->tail_count(mr); ++ } + +- ret = lwip_send(fd, sock, len, 0); +- if (unlikely(ret < 0) && (errno == ENOTCONN || errno == ECONNRESET || errno == ECONNABORTED)) { +- __sync_fetch_and_sub(&sock->call_num, 1); +- return; +- } +- msg->result = 0; +- +- ret = do_lwip_replenish_sendring(stack, sock); +- if (ret > 0 || NETCONN_IS_DATAOUT(sock)) { +- if (__atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) == 1) { +- msg->recall_flag = 1; +- rpc_call(&stack->rpc_queue, msg); +- return; ++ conn->fd = netconn->callback_arg.socket; ++ struct lwip_sock *sock = lwip_get_socket(netconn->callback_arg.socket); ++ if (!POSIX_IS_CLOSED(sock)) { ++ struct sock_event *sk_event = &sock->sk_event; ++ conn->events = sk_event->pending; ++ conn->epoll_events = sk_event->events; ++ conn->eventlist = !list_node_null(&sk_event->event_node); + } + } +- +- __sync_fetch_and_sub(&sock->call_num, 1); +- return; + } + +-static void callback_udp_send(struct rpc_msg *msg) ++static uint32_t do_lwip_get_conntable(struct gazelle_stat_lstack_conn_info *conn, ++ uint32_t max_num) + { +- int fd = msg->args[MSG_ARG_0].i; 
+- size_t len = msg->args[MSG_ARG_1].size; +- struct protocol_stack *stack = get_protocol_stack(); +- int ret; +- msg->result = -1; +- +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (unlikely(POSIX_IS_CLOSED(sock))) { +- return; +- } +- +- if (get_protocol_stack_group()->latency_start) { +- calculate_sock_latency(&stack->latency, sock, GAZELLE_LATENCY_WRITE_RPC_MSG); +- } +- +- ret = lwip_send(fd, sock, len, 0); +- if (unlikely(ret < 0) && (errno == ENOTCONN || errno == ECONNRESET || errno == ECONNABORTED)) { +- __sync_fetch_and_sub(&sock->call_num, 1); +- return; +- } +- msg->result = 0; +- +- ret = do_lwip_replenish_sendring(stack, sock); +- if (ret > 0 && (__atomic_load_n(&sock->call_num, __ATOMIC_ACQUIRE) == 1)) { +- rpc_call_replenish(&stack->rpc_queue, sock); +- return; +- } ++ struct tcp_pcb *pcb = NULL; ++ uint32_t conn_num = 0; ++ const struct mbox_ring *mr; + +- __sync_fetch_and_sub(&sock->call_num, 1); +- return; +-} +- +-int rpc_call_udp_send(rpc_queue *queue, int fd, size_t len, int flags) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_udp_send); +- if (msg == NULL) { ++ if (conn == NULL) { + return -1; + } + +- if (get_protocol_stack_group()->latency_start) { +- time_stamp_into_rpcmsg(lwip_get_socket(fd)); ++ for (pcb = tcp_active_pcbs; pcb != NULL && conn_num < max_num; pcb = pcb->next) { ++ conn[conn_num].state = GAZELLE_ACTIVE_LIST; ++ copy_pcb_to_conn(conn + conn_num, pcb); ++ conn_num++; + } + +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].size = len; +- msg->args[MSG_ARG_2].i = flags; +- +- rpc_async_call(queue, msg); +- return 0; +-} +- +-int rpc_call_tcp_send(rpc_queue *queue, int fd, size_t len, int flags) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_tcp_send); +- if (msg == NULL) { +- return -1; ++ for (pcb = tcp_tw_pcbs; pcb != NULL && conn_num < max_num; pcb = pcb->next) { ++ conn[conn_num].state = GAZELLE_TIME_WAIT_LIST; ++ copy_pcb_to_conn(conn + conn_num, pcb); ++ conn_num++; + } + +- if (get_protocol_stack_group()->latency_start) { +- time_stamp_into_rpcmsg(lwip_get_socket(fd)); ++ for (struct tcp_pcb_listen *pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL && conn_num < max_num; ++ pcbl = pcbl->next) { ++ conn[conn_num].state = GAZELLE_LISTEN_LIST; ++ conn[conn_num].lip = *((gz_addr_t *)&pcbl->local_ip); ++ conn[conn_num].l_port = pcbl->local_port; ++ conn[conn_num].tcp_sub_state = pcbl->state; ++ struct netconn *netconn = (struct netconn *)pcbl->callback_arg; ++ conn[conn_num].fd = netconn != NULL ? 
netconn->callback_arg.socket : -1; ++ if (netconn != NULL) { ++ if (sys_mbox_valid(&netconn->acceptmbox)) { ++ mr = &netconn->acceptmbox->mring; ++ conn[conn_num].recvmbox_cnt = mr->ops->count(mr); ++ } ++ } ++ conn_num++; + } + +- msg->args[MSG_ARG_0].i = fd; +- msg->args[MSG_ARG_1].size = len; +- msg->args[MSG_ARG_2].i = flags; +- +- rpc_async_call(queue, msg); +- return 0; ++ return conn_num; + } + +-static void callback_replenish_sendring(struct rpc_msg *msg) +-{ +- struct protocol_stack *stack = get_protocol_stack(); +- struct lwip_sock *sock = (struct lwip_sock *)msg->args[MSG_ARG_0].p; +- +- msg->result = do_lwip_replenish_sendring(stack, sock); +- if (msg->result == true) { +- msg->recall_flag = 1; +- rpc_call(&stack->rpc_queue, msg); +- } +-} +- +-int rpc_call_replenish(rpc_queue *queue, void *sock) ++static void callback_get_conntable(struct rpc_msg *msg) + { +- struct rpc_msg *msg = rpc_msg_alloc(callback_replenish_sendring); +- if (msg == NULL) { +- return -1; +- } +- +- msg->args[MSG_ARG_0].p = sock; ++ struct gazelle_stat_lstack_conn_info *conn = (struct gazelle_stat_lstack_conn_info *)msg->args[MSG_ARG_0].p; ++ unsigned max_num = msg->args[MSG_ARG_1].u; + +- rpc_async_call(queue, msg); +- return 0; ++ msg->result = do_lwip_get_conntable(conn, max_num); + } + +-static void callback_recvlist_count(struct rpc_msg *msg) ++static uint32_t do_lwip_get_connnum(void) + { +- struct protocol_stack *stack = get_protocol_stack(); +- struct list_node *list = &stack->recv_list; +- int count = 0; +- struct list_node *node; +- struct list_node *temp; ++ struct tcp_pcb *pcb = NULL; ++ struct tcp_pcb_listen *pcbl = NULL; ++ uint32_t conn_num = 0; + +- list_for_each_node(node, temp, list) { +- count++; ++ for (pcb = tcp_active_pcbs; pcb != NULL; pcb = pcb->next) { ++ conn_num++; + } +- msg->result = count; +-} + +-int rpc_call_recvlistcnt(rpc_queue *queue) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_recvlist_count); +- if (msg == NULL) { +- return -1; ++ for (pcbl = tcp_listen_pcbs.listen_pcbs; pcbl != NULL; pcbl = pcbl->next) { ++ conn_num++; + } + +- return rpc_sync_call(queue, msg); +-} +- +-static void callback_arp(struct rpc_msg *msg) +-{ +- struct rte_mbuf *mbuf = (struct rte_mbuf *)msg->args[MSG_ARG_0].p; +- struct protocol_stack *stack = get_protocol_stack(); +- +- eth_dev_recv(mbuf, stack); +-} +- +-int rpc_call_arp(rpc_queue *queue, void *mbuf) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_arp); +- if (msg == NULL) { +- return -1; ++ for (pcb = tcp_tw_pcbs; pcb != NULL; pcb = pcb->next) { ++ conn_num++; + } + +- msg->args[MSG_ARG_0].p = mbuf; +- +- rpc_async_call(queue, msg); +- return 0; +-} +- +-static void callback_mempool_size(struct rpc_msg *msg) +-{ +- struct protocol_stack *stack = get_protocol_stack(); +- +- msg->result = rte_mempool_avail_count(stack->rxtx_mbuf_pool); +-} +- +-static void callback_get_conntable(struct rpc_msg *msg) +-{ +- struct gazelle_stat_lstack_conn_info *conn = (struct gazelle_stat_lstack_conn_info *)msg->args[MSG_ARG_0].p; +- unsigned max_num = msg->args[MSG_ARG_1].u; +- +- msg->result = do_lwip_get_conntable(conn, max_num); ++ return conn_num; + } + + static void callback_get_connnum(struct rpc_msg *msg) +@@ -792,9 +287,10 @@ static void callback_get_connnum(struct rpc_msg *msg) + msg->result = do_lwip_get_connnum(); + } + +-int rpc_call_conntable(rpc_queue *queue, void *conn_table, unsigned max_conn) ++int rpc_call_conntable(int stack_id, void *conn_table, unsigned max_conn) + { +- struct rpc_msg *msg = 
rpc_msg_alloc(callback_get_conntable); ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->dfx_rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_get_conntable); + if (msg == NULL) { + return -1; + } +@@ -805,44 +301,13 @@ int rpc_call_conntable(rpc_queue *queue, void *conn_table, unsigned max_conn) + return rpc_sync_call(queue, msg); + } + +-int rpc_call_connnum(rpc_queue *queue) ++int rpc_call_connnum(int stack_id) + { +- struct rpc_msg *msg = rpc_msg_alloc(callback_get_connnum); ++ rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->dfx_rpc_queue; ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_get_connnum); + if (msg == NULL) { + return -1; + } + + return rpc_sync_call(queue, msg); + } +- +-int rpc_call_mbufpoolsize(rpc_queue *queue) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(callback_mempool_size); +- if (msg == NULL) { +- return -1; +- } +- +- return rpc_sync_call(queue, msg); +-} +- +-extern void thread_register_phase1(struct rpc_msg *msg); +-int rpc_call_thread_regphase1(rpc_queue *queue, void *conn) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(thread_register_phase1); +- if (msg == NULL) { +- return -1; +- } +- msg->args[MSG_ARG_0].p = conn; +- return rpc_sync_call(queue, msg); +-} +- +-extern void thread_register_phase2(struct rpc_msg *msg); +-int rpc_call_thread_regphase2(rpc_queue *queue, void *conn) +-{ +- struct rpc_msg *msg = rpc_msg_alloc(thread_register_phase2); +- if (msg == NULL) { +- return -1; +- } +- msg->args[MSG_ARG_0].p = conn; +- return rpc_sync_call(queue, msg); +-} +diff --git a/src/lstack/core/lstack_virtio.c b/src/lstack/core/lstack_virtio.c +index f6855d1..2c2d74e 100644 +--- a/src/lstack/core/lstack_virtio.c ++++ b/src/lstack/core/lstack_virtio.c +@@ -24,6 +24,7 @@ + #include "lstack_port_map.h" + #include "lstack_interrupt.h" + #include "lstack_virtio.h" ++#include "mbox_ring.h" + + #define VIRTIO_USER_NAME "virtio_user" + #define VIRTIO_DPDK_PARA_LEN 256 +@@ -293,7 +294,7 @@ static int virtio_port_init(uint16_t port) + } + + for (uint16_t q = 0; q < rx_queue_num; q++) { +- struct rte_mempool *rxtx_mbuf_pool = get_protocol_stack_group()->total_rxtx_pktmbuf_pool[q % mbuf_total_num]; ++ struct rte_mempool *rxtx_mbuf_pool = mem_get_mbuf_pool(q % mbuf_total_num); + retval = rte_eth_rx_queue_setup(port, q, VIRTIO_TX_RX_RING_SIZE, rte_eth_dev_socket_id(port), + NULL, rxtx_mbuf_pool); + if (retval < 0) { +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 6334f5e..c67df93 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -26,7 +26,7 @@ + #include "lstack_log.h" + #include "lstack_cfg.h" + #include "same_node.h" +-#include "lstack_lwip.h" ++#include "mbox_ring.h" + + #define KERNEL_EVENT_WAIT_US 10 + #define LWIP_EVENT_WAIT_US 10 +@@ -340,26 +340,34 @@ void sock_wait_kernel_free(struct sock_wait *sk_wait) + static inline bool NETCONN_NEED_ACCEPT(const struct lwip_sock *sock) + { + if (sys_mbox_valid(&sock->conn->acceptmbox)) { +- return !sys_mbox_empty(sock->conn->acceptmbox); ++ const struct mbox_ring *mr = &sock->conn->acceptmbox->mring; ++ return mr->ops->count(mr) > 0; + } + return false; + } + + static inline bool NETCONN_NEED_RECV(const struct lwip_sock *sock) + { +- if (sock->recv_lastdata != NULL) +- return true; +- if (gazelle_ring_readable_count(sock->recv_ring) > 0) +- return true; +- if (NETCONN_NEED_SAME_NODE(sock)) ++ if (sock->lastdata.pbuf != NULL) + return true; ++ if (sys_mbox_valid(&sock->conn->recvmbox)) { ++ const struct mbox_ring *mr = 
&sock->conn->recvmbox->mring; ++ return mr->ops->recv_count(mr) > 0; ++ } + return false; + } + + static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) + { +- if (gazelle_ring_readable_count(sock->send_ring) > 0) +- return true; ++ if (get_global_cfg_params()->stack_mode_rtc) { ++ if (NETCONN_TYPE(sock->conn) == NETCONN_TCP) ++ return lwip_tcp_allow_send(sock->conn->pcb.tcp); ++ return false; ++ } ++ if (sys_mbox_valid(&sock->conn->sendmbox)) { ++ const struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ return mr->ops->free_count(mr) > 0; ++ } + return false; + } + +diff --git a/src/lstack/core/same_node.c b/src/lstack/core/same_node.c +index 660fefd..eb3610e 100644 +--- a/src/lstack/core/same_node.c ++++ b/src/lstack/core/same_node.c +@@ -20,25 +20,10 @@ + #include "lstack_protocol_stack.h" + #include "lstack_stack_stat.h" + #include "same_node.h" +-#include "lstack_epoll.h" +-#include "lstack_lwip.h" ++#include "lstack_wait.h" + +-#if GAZELLE_SAME_NODE +-void read_same_node_recv_list(struct protocol_stack *stack) +-{ +- struct list_node *list = &(stack->same_node_recv_list); +- struct list_node *node, *temp; +- struct lwip_sock *sock; +- +- list_for_each_node(node, temp, list) { +- sock = list_entry(node, struct lwip_sock, recv_list); +- +- if (sock->same_node_rx_ring != NULL && same_node_ring_count(sock)) { +- API_EVENT(sock->conn, NETCONN_EVT_RCVPLUS, 0); +- } +- } +-} + ++#if GAZELLE_SAME_NODE + /* process on same node use ring to recv data */ + ssize_t gazelle_same_node_ring_recv(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags) + { +@@ -231,8 +216,7 @@ err_t find_same_node_memzone(struct tcp_pcb *pcb, struct lwip_sock *nsock) + + /* rcvlink init in alloc_socket() */ + /* remove from g_rcv_process_list in free_socket */ +- struct protocol_stack *stack = get_protocol_stack_by_id(nsock->stack_id); +- list_add_node(&nsock->recv_list, &stack->same_node_recv_list); ++ API_EVENT(nsock->conn, NETCONN_EVT_RCVPLUS, 0); + return 0; + } + +diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h +index d59407b..3e671b5 100644 +--- a/src/lstack/include/lstack_cfg.h ++++ b/src/lstack/include/lstack_cfg.h +@@ -128,15 +128,12 @@ struct cfg_params { + bool listen_shadow; // true:listen in all stack thread. false:listen in one stack thread. 
+ bool stack_interrupt; + +- uint32_t read_connect_number; + uint32_t nic_read_number; + uint32_t rpc_number; + uint32_t rpc_msg_max; + }; + + struct { // socket +- uint16_t send_ring_size; +- uint16_t recv_ring_size; + uint32_t tcp_conn_count; + uint32_t mbuf_count_per_conn; + }; +@@ -167,7 +164,8 @@ static inline uint8_t use_ltran(void) + + static inline bool xdp_eth_enabled(void) + { +- if (strlen(get_global_cfg_params()->xdp_eth_name)) { ++ /* strlen > 0 */ ++ if (get_global_cfg_params()->xdp_eth_name[0] != '\0') { + return true; + } + return false; +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index 6251be7..f70477e 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -13,6 +13,7 @@ + #ifndef _GAZELLE_DPDK_H_ + #define _GAZELLE_DPDK_H_ + ++#include + #include + #include + #include +@@ -20,21 +21,10 @@ + #include "common/gazelle_opt.h" + #include "common/gazelle_dfx_msg.h" + +-#define RXTX_CACHE_SZ (VDEV_RX_QUEUE_SZ) +- + #define KNI_NB_MBUF (DEFAULT_RING_SIZE << 4) + +-#define MAX_PACKET_SZ 1538 +- + #define RING_SIZE(x) ((x) - 1) + +-#define MBUF_SZ (MAX_PACKET_SZ + RTE_PKTMBUF_HEADROOM) +- +-/* DPDK limit ring head-tail distance in rte_ring_init. +- * Max value is RTE_RING_SZ_MASK / HTD_MAX_DEF, RTE_RING_SZ_MASK is 0x7fffffff, HTD_MAX_DEF is 8. +- */ +-#define MBUF_MAX_NUM 0xfffffff +- + struct protocol_stack; + + int32_t dpdk_eal_init(void); +@@ -47,13 +37,7 @@ int init_dpdk_ethdev(void); + int thread_affinity_default(void); + + int32_t create_shared_ring(struct protocol_stack *stack); +-int32_t fill_mbuf_to_ring(struct rte_mempool *mempool, struct rte_ring *ring, uint32_t mbuf_num); +-int32_t pktmbuf_pool_init(struct protocol_stack *stack); +-struct rte_mempool *create_mempool(const char *name, uint32_t count, uint32_t size, +- uint32_t flags, int32_t idx); +-struct rte_mempool *create_pktmbuf_mempool(const char *name, uint32_t nb_mbuf, +- uint32_t mbuf_cache_size, uint16_t queue_id, unsigned numa_id); +-int32_t dpdk_alloc_pktmbuf(struct rte_mempool *pool, struct rte_mbuf **mbufs, uint32_t num, bool reserve); ++int32_t fill_mbuf_to_ring(int stack_id, struct rte_ring *ring, uint32_t mbuf_num); + + #if RTE_VERSION < RTE_VERSION_NUM(23, 11, 0, 0) + void dpdk_skip_nic_init(void); +diff --git a/src/lstack/include/lstack_dummy_api.h b/src/lstack/include/lstack_dummy_api.h +deleted file mode 100644 +index 48bce31..0000000 +--- a/src/lstack/include/lstack_dummy_api.h ++++ /dev/null +@@ -1,23 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. 
+-*/ +- +-#ifndef _LSTACK_DUMMY_API_H_ +-#define _LSTACK_DUMMY_API_H_ +- +-int dummy_socket(int domain, int type, int protocol); +-ssize_t dummy_write(int s, const void *mem, size_t size); +-ssize_t dummy_writev(int s, const struct iovec *iov, int iovcnt); +-ssize_t dummy_sendmsg(int s, const struct msghdr *message, int flags); +-ssize_t dummy_send(int sockfd, const void *buf, size_t len, int flags); +-ssize_t dummy_sendto(int sockfd, const void *buf, size_t len, int flags, +- const struct sockaddr *addr, socklen_t addrlen); +-#endif /* __LSTACK_DUMMY_API_H_ */ +diff --git a/src/lstack/include/lstack_lwip.h b/src/lstack/include/lstack_lwip.h +deleted file mode 100644 +index 4cc9db1..0000000 +--- a/src/lstack/include/lstack_lwip.h ++++ /dev/null +@@ -1,62 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. +-*/ +- +-#ifndef __GAZELLE_LWIP_H__ +-#define __GAZELLE_LWIP_H__ +-#include +- +-#include "common/gazelle_dfx_msg.h" +-#include "common/dpdk_common.h" +-#include "same_node.h" +- +-struct lwip_sock; +-struct rpc_msg; +-struct protocol_stack; +- +-#define NETCONN_IS_DATAOUT(sock) (gazelle_ring_readover_count((sock)->send_ring) || (sock)->send_pre_del) +-#define NETCONN_IS_UDP(sock) (NETCONNTYPE_GROUP(netconn_type((sock)->conn)) == NETCONN_UDP) +- +-/* lwip api */ +-struct pbuf *do_lwip_tcp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); +-struct pbuf *do_lwip_udp_get_from_sendring(struct lwip_sock *sock, uint16_t remain_size); +-void do_lwip_get_from_sendring_over(struct lwip_sock *sock); +-ssize_t do_lwip_read_from_lwip(struct lwip_sock *sock, int32_t flags, uint8_t apiflags); +- +-/* lwip api */ +-void do_lwip_free_pbuf(struct pbuf *pbuf); +-struct pbuf *do_lwip_alloc_pbuf(pbuf_layer layer, uint16_t length, pbuf_type type); +- +-/* lwip api */ +-void do_lwip_add_recvlist(int32_t fd); +-/* stack api */ +-void do_lwip_read_recvlist(struct protocol_stack *stack, uint32_t max_num); +- +- +-/* app api */ +-ssize_t do_lwip_sendmsg_to_stack(struct lwip_sock *sock, int32_t s, +- const struct msghdr *message, int32_t flags); +-ssize_t do_lwip_recvmsg_from_stack(int32_t s, const struct msghdr *message, int32_t flags); +- +-ssize_t do_lwip_send_to_stack(int32_t fd, const void *buf, size_t len, int32_t flags, +- const struct sockaddr *addr, socklen_t addrlen); +-ssize_t do_lwip_read_from_stack(int32_t fd, void *buf, size_t len, int32_t flags, +- struct sockaddr *addr, socklen_t *addrlen); +- +-/* stack api */ +-bool do_lwip_replenish_sendring(struct protocol_stack *stack, struct lwip_sock *sock); +- +-void do_lwip_clone_sockopt(struct lwip_sock *dst_sock, struct lwip_sock *src_sock); +- +-uint32_t do_lwip_get_conntable(struct gazelle_stat_lstack_conn_info *conn, uint32_t max_num); +-uint32_t do_lwip_get_connnum(void); +- +-#endif +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index c6adff0..a602610 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -284,7 +284,7 @@ struct mem_thread 
{ + } __rte_cache_aligned; + + void mem_stack_pool_free(int stack_id); +-int mem_stack_pool_init(int stack_id); ++int mem_stack_pool_init(int stack_id, unsigned numa_id); + int mem_stack_mpcache_init(int stack_id, unsigned cpu_id); + + int mem_thread_manager_init(void); +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index b77d5da..8653fb0 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -29,12 +29,6 @@ + #include "lstack_ethdev.h" + #include "lstack_tx_cache.h" + +-#define SOCK_RECV_RING_SIZE (get_global_cfg_params()->recv_ring_size) +-#define SOCK_RECV_RING_SIZE_MAX (2048) +-#define SOCK_SEND_RING_SIZE_MAX (2048) +- +-#define MBUFPOOL_RESERVE_NUM (2 * get_global_cfg_params()->rxqueue_size + 1024) +- + struct protocol_stack { + uint32_t tid; + uint16_t queue_id; +@@ -52,7 +46,6 @@ struct protocol_stack { + volatile bool low_power; + volatile uint16_t conn_num; + +- struct rte_mempool *rxtx_mbuf_pool; + struct rte_ring *rx_ring; + struct rte_ring *tx_ring; + struct rte_ring *reg_ring; +@@ -68,9 +61,6 @@ struct protocol_stack { + rpc_queue rpc_queue; + char pad2 __rte_cache_aligned; + +- struct list_node recv_list; +- struct list_node same_node_recv_list; /* used for same node processes communication */ +- + struct stats_ *lwip_stats; + struct gazelle_stack_latency latency; + struct gazelle_stack_stat stats; +@@ -88,7 +78,6 @@ struct protocol_stack_group { + struct protocol_stack *stacks[PROTOCOL_STACK_MAX]; + + sem_t sem_listen_thread; +- struct rte_mempool *total_rxtx_pktmbuf_pool[PROTOCOL_STACK_MAX]; + sem_t sem_stack_setup; + bool stack_setup_fail; + +diff --git a/src/lstack/include/lstack_rtc_api.h b/src/lstack/include/lstack_rtc_api.h +deleted file mode 100644 +index b4b7e1c..0000000 +--- a/src/lstack/include/lstack_rtc_api.h ++++ /dev/null +@@ -1,25 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. +-*/ +- +-#ifndef _LSTACK_RTC_API_H_ +-#define _LSTACK_RTC_API_H_ +- +-#include +- +-/* don't include lwip/sockets.h, conflict with sys/socket.h */ +-extern int lwip_fcntl(int s, int cmd, int val); +-extern int lwip_ioctl(int s, long cmd, void *argp); +- +-void dummy_api_init(posix_api_t *api); +-void rtc_api_init(posix_api_t *api); +- +-#endif /* __LSTACK_RTC_API_H_ */ +diff --git a/src/lstack/include/lstack_rtw_api.h b/src/lstack/include/lstack_sock_dummy.h +similarity index 77% +rename from src/lstack/include/lstack_rtw_api.h +rename to src/lstack/include/lstack_sock_dummy.h +index 437901a..99491d1 100644 +--- a/src/lstack/include/lstack_rtw_api.h ++++ b/src/lstack/include/lstack_sock_dummy.h +@@ -10,11 +10,9 @@ + * See the Mulan PSL v2 for more details. 
+ */ + +-#ifndef _LSTACK_RTW_API_H_ +-#define _LSTACK_RTW_API_H_ ++#ifndef _LSTACK_SOCK_DUMMY_H_ ++#define _LSTACK_SOCK_DUMMY_H_ + +-#include ++void sock_dummy_api_init(posix_api_t *api); + +-void rtw_api_init(posix_api_t *api); +- +-#endif /* _LSTACK_RTW_API_H_ */ ++#endif /* _LSTACK_SOCK_DUMMY_H_ */ +diff --git a/src/lstack/include/lstack_stack_stat.h b/src/lstack/include/lstack_stack_stat.h +index ef33365..e6e6abd 100644 +--- a/src/lstack/include/lstack_stack_stat.h ++++ b/src/lstack/include/lstack_stack_stat.h +@@ -13,30 +13,26 @@ + #ifndef GAZELLE_STACK_STAT_H + #define GAZELLE_STACK_STAT_H + ++#include ++#include ++ + struct gazelle_stack_latency; +-struct pbuf; + struct rpc_msg; + struct gazelle_stat_low_power_info; +-struct wakeup_poll; +-struct protocol_stack; + enum GAZELLE_LATENCY_TYPE; + enum GAZELLE_STAT_MODE; + struct gazelle_stat_msg_request; +-struct lwip_sock; + +-void calculate_lstack_latency(struct gazelle_stack_latency *stack_latency, const struct pbuf *pbuf, ++void calculate_lstack_latency(int stack_id, struct pbuf *const *pbuf, uint32_t num, + enum GAZELLE_LATENCY_TYPE type, uint64_t time_record); +-void calculate_sock_latency(struct gazelle_stack_latency *stack_latency, struct lwip_sock *sock, +- enum GAZELLE_LATENCY_TYPE type); ++void calculate_sock_latency(struct lwip_sock *sock, enum GAZELLE_LATENCY_TYPE type); + void stack_stat_init(void); + int handle_stack_cmd(int fd, struct gazelle_stat_msg_request *msg); + int handle_dpdk_cmd(int fd, enum GAZELLE_STAT_MODE stat_mode); + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info); +-void unregister_wakeup(struct protocol_stack *stack, struct wakeup_poll *wakeup); + void lstack_calculate_aggregate(int type, uint32_t len); +-void time_stamp_transfer_pbuf(struct pbuf *pbuf_old, struct pbuf *pbuf_new); ++void time_stamp_into_write(struct pbuf *pbufs[], uint32_t num); + void time_stamp_into_rpcmsg(struct lwip_sock *sock); +-void time_stamp_into_recvmbox(struct lwip_sock *sock); + void time_stamp_record(int fd, struct pbuf *pbuf); + + #endif /* GAZELLE_STACK_STAT_H */ +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 427a519..32dde53 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -26,8 +26,6 @@ + #define MSG_ARG_4 (4) + #define RPM_MSG_ARG_SIZE (5) + +-#define RPC_MEMPOOL_THREAD_NUM 64 +- + typedef struct rpc_queue rpc_queue; + struct rpc_queue { + struct lockless_queue queue; +@@ -38,6 +36,7 @@ struct rpc_stats { + uint16_t call_null; + uint64_t call_alloc_fail; + }; ++struct rpc_stats *rpc_stats_get(void); + + union rpc_msg_arg { + int i; +@@ -52,62 +51,34 @@ union rpc_msg_arg { + struct rpc_msg; + typedef void (*rpc_func_t)(struct rpc_msg *msg); + struct rpc_msg { +- int8_t sync_flag : 1; +- int8_t recall_flag : 1; ++#define RPC_MSG_SYNC 0x01 ++#define RPC_MSG_FREE 0x02 ++#define RPC_MSG_EXIT 0x04 ++#define RPC_MSG_RECALL 0x08 ++#define RPC_MSG_REUSE 0x10 ++ int flags; ++ int stack_id; /* the stack to which buf belongs */ + + long result; /* func return val */ + rpc_func_t func; /* msg handle func hook */ + union rpc_msg_arg args[RPM_MSG_ARG_SIZE]; /* resolve by type */ + +- struct rpc_msg_pool { +- struct rte_mempool *mempool; +- } *rpcpool; +- + pthread_spinlock_t lock; /* msg handler unlock notice sender msg process done */ + lockless_queue_node queue_node; + }; + +-static inline void rpc_queue_init(rpc_queue *queue, uint16_t queue_id) +-{ +- lockless_queue_init(&queue->queue); +- 
queue->queue_id = queue_id; +-} +-struct rpc_stats *rpc_stats_get(void); ++struct rpc_msg *rpc_msg_alloc(int stack_id, rpc_func_t func); ++void rpc_msg_free(struct rpc_msg *msg); ++ ++void rpc_queue_init(rpc_queue *queue, uint16_t queue_id); + int rpc_msgcnt(rpc_queue *queue); + int rpc_poll_msg(rpc_queue *queue, int max_num); + +-int rpc_call_stack_exit(rpc_queue *queue); +- +-/* #include will conflict with lwip/sockets.h */ +-struct sockaddr; +- +-int rpc_call_close(rpc_queue *queue, int fd); +-int rpc_call_shutdown(rpc_queue *queue, int fd, int how); +-int rpc_call_socket(rpc_queue *queue, int domain, int type, int protocol); +-int rpc_call_bind(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen); +-int rpc_call_listen(rpc_queue *queue, int s, int backlog); +-int rpc_call_shadow_fd(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen); +-int rpc_call_accept(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen, int flags); +-int rpc_call_connect(rpc_queue *queue, int fd, const struct sockaddr *addr, socklen_t addrlen); +- +-int rpc_call_getpeername(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen); +-int rpc_call_getsockname(rpc_queue *queue, int fd, struct sockaddr *addr, socklen_t *addrlen); +-int rpc_call_getsockopt(rpc_queue *queue, int fd, int level, int optname, void *optval, socklen_t *optlen); +-int rpc_call_setsockopt(rpc_queue *queue, int fd, int level, int optname, const void *optval, socklen_t optlen); +- +-int rpc_call_tcp_send(rpc_queue *queue, int fd, size_t len, int flags); +-int rpc_call_udp_send(rpc_queue *queue, int fd, size_t len, int flags); +- +-int rpc_call_replenish(rpc_queue *queue, void *sock); +-int rpc_call_recvlistcnt(rpc_queue *queue); +- +-int rpc_call_arp(rpc_queue *queue, void *mbuf); +- +-int rpc_call_conntable(rpc_queue *queue, void *conn_table, unsigned max_conn); +-int rpc_call_connnum(rpc_queue *queue); +-int rpc_call_mbufpoolsize(rpc_queue *queue); ++int rpc_sync_call(rpc_queue *queue, struct rpc_msg *msg); ++void rpc_async_call(rpc_queue *queue, struct rpc_msg *msg, int flags); + +-int rpc_call_thread_regphase1(rpc_queue *queue, void *conn); +-int rpc_call_thread_regphase2(rpc_queue *queue, void *conn); ++int rpc_call_conntable(int stack_id, void *conn_table, unsigned max_conn); ++int rpc_call_connnum(int stack_id); ++int rpc_call_arp(int stack_id, void *mbuf); + + #endif +diff --git a/src/lstack/include/mbox_ring.h b/src/lstack/include/mbox_ring.h +index c48a47b..7ffdcf4 100644 +--- a/src/lstack/include/mbox_ring.h ++++ b/src/lstack/include/mbox_ring.h +@@ -285,7 +285,7 @@ int mt_ring_create(struct mbox_ring *mr, const char *name, unsigned count) + mr->ops = &g_mbox_rtw_default_ops; + mr->st_obj = NULL; + } +- if ((mr->flags & MBOX_FLAG_RECV) && !dpdk_nic_is_xdp()) { ++ if ((mr->flags & MBOX_FLAG_RECV) && !xdp_eth_enabled()) { + mr->flags |= MBOX_FLAG_PEEK; + mr->ops = &g_mbox_rtw_peek_ops; + mr->ops->create(mr, name, count); +diff --git a/src/lstack/include/same_node.h b/src/lstack/include/same_node.h +index 90a5b76..a7ae64d 100644 +--- a/src/lstack/include/same_node.h ++++ b/src/lstack/include/same_node.h +@@ -19,7 +19,6 @@ + + unsigned same_node_ring_count(const struct lwip_sock *sock); + +-void read_same_node_recv_list(struct protocol_stack *stack); + ssize_t gazelle_same_node_ring_recv(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags); + ssize_t gazelle_same_node_ring_send(struct lwip_sock *sock, const void *buf, size_t len, int32_t flags); + +diff 
--git a/src/lstack/netif/lstack_ethdev.c b/src/lstack/netif/lstack_ethdev.c +index a370714..f2ec977 100644 +--- a/src/lstack/netif/lstack_ethdev.c ++++ b/src/lstack/netif/lstack_ethdev.c +@@ -31,16 +31,13 @@ + #include "lstack_stack_stat.h" + #include "lstack_log.h" + #include "lstack_dpdk.h" +-#include "lstack_lwip.h" + #include "lstack_protocol_stack.h" + #include "lstack_thread_rpc.h" + #include "lstack_flow.h" + #include "lstack_tx_cache.h" + #include "lstack_virtio.h" + #include "lstack_ethdev.h" +- +-/* FRAME_MTU + 14byte header */ +-#define MBUF_MAX_LEN 1514 ++#include "lstack_mempool.h" + + /* any protocol stack thread receives arp packet and sync it to other threads, + * so that it can have the arp table */ +@@ -62,23 +59,23 @@ static void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cu + continue; + } + +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf_copy, 1, true); +- if (ret != 0) { ++ ret = mem_get_mbuf_bulk(stack->stack_idx, &mbuf_copy, 1, true); ++ if (ret == 0) { + stack->stats.rx_allocmbuf_fail++; + return; + } + copy_mbuf(mbuf_copy, mbuf); + +- ret = rpc_call_arp(&stack->rpc_queue, mbuf_copy); ++ ret = rpc_call_arp(stack->stack_idx, mbuf_copy); + if (ret != 0) { +- rte_pktmbuf_free(mbuf_copy); ++ mem_put_mbuf_bulk(&mbuf_copy, 1); + return; + } + } + #if RTE_VERSION < RTE_VERSION_NUM(23, 11, 0, 0) + if (get_global_cfg_params()->kni_switch) { +- ret = dpdk_alloc_pktmbuf(cur_stack->rxtx_mbuf_pool, &mbuf_copy, 1, true); +- if (ret != 0) { ++ ret = mem_get_mbuf_bulk(cur_stack->stack_idx, &mbuf_copy, 1, true); ++ if (ret == 0) { + cur_stack->stats.rx_allocmbuf_fail++; + return; + } +@@ -87,8 +84,8 @@ static void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cu + } + #endif + if (get_global_cfg_params()->flow_bifurcation) { +- ret = dpdk_alloc_pktmbuf(cur_stack->rxtx_mbuf_pool, &mbuf_copy, 1, true); +- if (ret != 0) { ++ ret = mem_get_mbuf_bulk(cur_stack->stack_idx, &mbuf_copy, 1, true); ++ if (ret == 0) { + cur_stack->stats.rx_allocmbuf_fail++; + return; + } +@@ -100,12 +97,10 @@ static void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cu + + void eth_dev_recv(struct rte_mbuf *mbuf, struct protocol_stack *stack) + { +- int32_t ret; +- void *payload = NULL; ++ int ret; + struct pbuf *next = NULL; + struct pbuf *prev = NULL; + struct pbuf *head = NULL; +- struct pbuf_custom *pc = NULL; + struct rte_mbuf *m = mbuf; + uint16_t len, pkt_len; + struct rte_mbuf *next_m = NULL; +@@ -114,14 +109,9 @@ void eth_dev_recv(struct rte_mbuf *mbuf, struct protocol_stack *stack) + + while (m != NULL) { + len = (uint16_t)rte_pktmbuf_data_len(m); +- payload = rte_pktmbuf_mtod(m, void *); +- pc = mbuf_to_pbuf(m); +- next = pbuf_alloced_custom(PBUF_RAW, (uint16_t)len, PBUF_RAM, pc, payload, (uint16_t)len); +- if (next == NULL) { +- stack->stats.rx_allocmbuf_fail++; +- break; +- } +- next->tot_len = pkt_len; ++ next = mbuf_to_pbuf(m); ++ mem_init_pbuf(next, PBUF_RAW, pkt_len, len, PBUF_POOL); ++ + pkt_len -= len; + + if (head == NULL) { +@@ -135,6 +125,8 @@ void eth_dev_recv(struct rte_mbuf *mbuf, struct protocol_stack *stack) + next_m = m->next; + m->next = NULL; + m = next_m; ++ ++ mem_preput_pbuf(next); + } + + if (head != NULL) { +@@ -291,15 +283,16 @@ static err_t eth_dev_output(struct netif *netif, struct pbuf *pbuf) + struct protocol_stack *stack = get_protocol_stack(); + struct rte_mbuf *pre_mbuf = NULL; + struct rte_mbuf *first_mbuf = NULL; ++ struct rte_mbuf *mbuf; + void *buf_addr; + + while (likely(pbuf != NULL)) { +- 
struct rte_mbuf *mbuf = pbuf_to_mbuf(pbuf); +- ++ mbuf = pbuf_to_mbuf(pbuf); ++ // rte_mbuf_refcnt_set(mbuf, pbuf->ref + 1); ++ rte_mbuf_refcnt_update(mbuf, 1); + mbuf->data_len = pbuf->len; + mbuf->pkt_len = pbuf->tot_len; + mbuf->next = NULL; +- buf_addr = rte_pktmbuf_mtod(mbuf, void *); + + /* + * |rte_mbuf | mbuf_private | data_off | data | +@@ -307,6 +300,7 @@ static err_t eth_dev_output(struct netif *netif, struct pbuf *pbuf) + * buf_addr payload + * m->buf_addr pointer pbuf->payload + */ ++ buf_addr = rte_pktmbuf_mtod(mbuf, void *); + mbuf->data_off += (uint8_t *)pbuf->payload - (uint8_t *)buf_addr; + + if (first_mbuf == NULL) { +@@ -316,26 +310,19 @@ static err_t eth_dev_output(struct netif *netif, struct pbuf *pbuf) + first_mbuf->nb_segs++; + pre_mbuf->next = mbuf; + } +- +- if (likely(first_mbuf->pkt_len > MBUF_MAX_LEN)) { +- mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_SEG; +- mbuf->tso_segsz = MBUF_MAX_DATA_LEN; +- } +- + pre_mbuf = mbuf; +- rte_mbuf_refcnt_update(mbuf, 1); + +- if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&stack->latency, pbuf, GAZELLE_LATENCY_WRITE_LSTACK, 0); +- } ++ if (get_protocol_stack_group()->latency_start) ++ calculate_lstack_latency(stack->stack_idx, &pbuf, 1, GAZELLE_LATENCY_WRITE_LSTACK, 0); ++ + pbuf = pbuf->next; + } + + uint32_t sent_pkts = stack->dev_ops.tx_xmit(stack, &first_mbuf, 1); + stack->stats.tx += sent_pkts; + if (sent_pkts < 1) { ++ mem_put_mbuf_bulk(&first_mbuf, 1); + stack->stats.tx_drop++; +- rte_pktmbuf_free(first_mbuf); + return ERR_MEM; + } + +@@ -349,7 +336,7 @@ static err_t eth_dev_init(struct netif *netif) + netif->name[0] = 'e'; + netif->name[1] = 't'; + netif->flags |= NETIF_FLAG_BROADCAST | NETIF_FLAG_ETHARP | NETIF_FLAG_IGMP | NETIF_FLAG_MLD6; +- netif->mtu = FRAME_MTU; ++ netif->mtu = GAZELLE_IP_MTU; + netif->output = etharp_output; + netif->linkoutput = eth_dev_output; + netif->output_ip6 = ethip6_output; +@@ -368,18 +355,11 @@ static err_t eth_dev_init(struct netif *netif) + RTE_ETH_RX_OFFLOAD_UDP_CKSUM | + RTE_ETH_RX_OFFLOAD_IPV4_CKSUM); + netif_set_txol_flags(netif, RTE_ETH_TX_OFFLOAD_TCP_CKSUM | RTE_ETH_TX_OFFLOAD_TCP_TSO); +- /* 16: see kernel MAX_SKB_FRAGS define in skbuff.h */ +- netif_set_max_pbuf_frags(netif, 16); ++ netif_set_max_pbuf_frags(netif, OFFLOAD_TX_TSO_4K_FRAGS); + } else { + netif_set_rxol_flags(netif, get_protocol_stack_group()->rx_offload); + netif_set_txol_flags(netif, get_protocol_stack_group()->tx_offload); +- /* 40: dpdk pmd support 40 max segs */ +- netif_set_max_pbuf_frags(netif, 40); +- } +- netif_set_min_tso_seglen(netif, 256); +- +- if (get_global_cfg_params()->stack_mode_rtc) { +- netif_set_rtc_mode(netif); ++ netif_set_max_pbuf_frags(netif, OFFLOAD_TX_TSO_MTU_FRAGS); + } + + return ERR_OK; +@@ -400,7 +380,7 @@ int32_t ethdev_init(struct protocol_stack *stack) + + if (use_ltran()) { + stack->rx_ring_used = 0; +- int32_t ret = fill_mbuf_to_ring(stack->rxtx_mbuf_pool, stack->rx_ring, RING_SIZE(VDEV_RX_QUEUE_SZ)); ++ int32_t ret = fill_mbuf_to_ring(stack->stack_idx, stack->rx_ring, RING_SIZE(VDEV_RX_QUEUE_SZ)); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "fill mbuf to rx_ring failed ret=%d\n", ret); + return ret; +diff --git a/src/lstack/netif/lstack_fault_inject.c b/src/lstack/netif/lstack_fault_inject.c +index 9fae745..59eb8bd 100644 +--- a/src/lstack/netif/lstack_fault_inject.c ++++ b/src/lstack/netif/lstack_fault_inject.c +@@ -161,10 +161,8 @@ static int32_t inject_packet_loss_random(struct protocol_stack *stack, struct rt + if (rand_num > boundary) { + return 
nr_pkts; + } +- +- for (int32_t i = 0; i < nr_pkts; ++i) { +- rte_pktmbuf_free(pkts[i]); +- } ++ ++ mem_put_mbuf_bulk(pkts, nr_pkts); + return nr_pkts; + } + +@@ -184,7 +182,7 @@ static int32_t inject_packet_duplicate_random(struct protocol_stack *stack, stru + return nr_pkts; + } + +- struct rte_mempool *mp = stack->rxtx_mbuf_pool; ++ struct rte_mempool *mp = mem_get_mbuf_pool(stack->stack_idx); + struct rte_mbuf *mbuf_clone = NULL; + int32_t ret = 0; + +@@ -192,14 +190,14 @@ static int32_t inject_packet_duplicate_random(struct protocol_stack *stack, stru + int32_t count = count_max; + while (count--) { + mbuf_clone = rte_pktmbuf_clone(pkts[i], mp); +- rte_pktmbuf_free(pkts[i]); ++ mem_put_mbuf_bulk(&pkts[i], 1); + if (mbuf_clone == NULL) { + LSTACK_LOG(ERR, LSTACK, "fault inject mbuf_clone fail.\n"); + return 0; + } + ret = vdev_tx_xmit(stack, &mbuf_clone, 1); + if (ret < 1) { +- rte_pktmbuf_free(mbuf_clone); ++ mem_put_mbuf_bulk(&mbuf_clone, 1); + return ret; + } + } +@@ -215,7 +213,7 @@ static int32_t send_reorder_array(struct protocol_stack *stack) + for (int32_t i = 0; i < g_reorder[idx].cur_cnt; ++i) { + ret = vdev_tx_xmit(stack, g_reorder[idx].array + i, 1); + if (ret < 1) { +- rte_pktmbuf_free(*(g_reorder[idx].array + i)); ++ mem_put_mbuf_bulk(g_reorder[idx].array + i, 1); + } + } + g_reorder[idx].cur_cnt = 0; +@@ -239,7 +237,7 @@ static int32_t inject_packet_reorder_random(struct protocol_stack *stack, struct + return nr_pkts; + } + +- struct rte_mempool *mp = stack->rxtx_mbuf_pool; ++ struct rte_mempool *mp = mem_get_mbuf_pool(stack->stack_idx); + struct rte_mbuf *mbuf_clone = NULL; + int32_t idx = stack->stack_idx; + for (int32_t i = 0; i < nr_pkts; ++i) { +@@ -251,8 +249,8 @@ static int32_t inject_packet_reorder_random(struct protocol_stack *stack, struct + } + *(g_reorder[idx].array + g_reorder[idx].cur_cnt++) = mbuf_clone; + /* func rte_pktmbuf_clone will add refcnt of mbuf, so following operation will free mbuf double */ +- rte_pktmbuf_free(pkts[i]); +- rte_pktmbuf_free(pkts[i]); ++ mem_put_mbuf_bulk(&pkts[i], 1); ++ mem_put_mbuf_bulk(&pkts[i], 1); + } else { + send_reorder_array(stack); + } +diff --git a/src/lstack/netif/lstack_flow.c b/src/lstack/netif/lstack_flow.c +index ec09e45..7f2d9bd 100644 +--- a/src/lstack/netif/lstack_flow.c ++++ b/src/lstack/netif/lstack_flow.c +@@ -29,6 +29,7 @@ + #include "lstack_cfg.h" + #include "lstack_protocol_stack.h" + #include "lstack_flow.h" ++#include "lstack_mempool.h" + + #define MAX_PATTERN_NUM 4 + #define MAX_ACTION_NUM 2 +@@ -435,7 +436,7 @@ static void transfer_tcp_to_thread(struct rte_mbuf *mbuf, uint16_t stk_idx) + struct protocol_stack *stack = get_protocol_stack_group()->stacks[stk_idx]; + int ret = -1; + while (ret != 0) { +- ret = rpc_call_arp(&stack->rpc_queue, mbuf); ++ ret = rpc_call_arp(stack->stack_idx, mbuf); + printf("transfer_tcp_to_thread, ret : %d \n", ret); + } + } +@@ -449,17 +450,15 @@ static void parse_arp_and_transefer(char* buf) + int32_t ret; + for (int32_t i = 0; i < stack_group->stack_num; i++) { + stack = stack_group->stacks[i]; +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf_copy, 1, false); +- while (ret != 0) { +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf_copy, 1, false); ++ while (mem_get_mbuf_bulk(stack->stack_idx, &mbuf_copy, 1, false) == 0) { + stack->stats.rx_allocmbuf_fail++; + } + copy_mbuf(mbuf_copy, mbuf); + +- ret = rpc_call_arp(&stack->rpc_queue, mbuf_copy); ++ ret = rpc_call_arp(stack->stack_idx, mbuf_copy); + + while (ret != 0) { +- rpc_call_arp(&stack->rpc_queue, 
mbuf_copy); ++ rpc_call_arp(stack->stack_idx, mbuf_copy); + } + } + } +@@ -478,9 +477,7 @@ static void parse_tcp_and_transefer(char* buf) + struct rte_mbuf *mbuf_copy = NULL; + struct protocol_stack *stack = stack_group->stacks[stk_index]; + +- int32_t ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf_copy, 1, false); +- while (ret != 0) { +- ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, &mbuf_copy, 1, false); ++ while (mem_get_mbuf_bulk(stack->stack_idx, &mbuf_copy, 1, false) == 0) { + stack->stats.rx_allocmbuf_fail++; + } + +diff --git a/src/lstack/netif/lstack_vdev.c b/src/lstack/netif/lstack_vdev.c +index 14d8cc6..425f8dd 100644 +--- a/src/lstack/netif/lstack_vdev.c ++++ b/src/lstack/netif/lstack_vdev.c +@@ -32,12 +32,11 @@ + #include "common/dpdk_common.h" + #include "lstack_protocol_stack.h" + #include "common/gazelle_reg_msg.h" +-#include "lstack_lwip.h" + #include "lstack_flow.h" + #include "lstack_vdev.h" + #include "lstack_port_map.h" + #include "lstack_virtio.h" +- ++#include "lstack_mempool.h" + #include "lstack_interrupt.h" + + /* INUSE_TX_PKTS_WATERMARK < VDEV_RX_QUEUE_SZ; +@@ -64,8 +63,8 @@ static uint32_t ltran_rx_poll(struct protocol_stack *stack, struct rte_mbuf **pk + stack->rx_ring_used += rcvd_pkts; + if (unlikely(stack->rx_ring_used >= USED_RX_PKTS_WATERMARK)) { + uint32_t free_cnt = LWIP_MIN(stack->rx_ring_used, RING_SIZE(VDEV_RX_QUEUE_SZ)); +- int32_t ret = dpdk_alloc_pktmbuf(stack->rxtx_mbuf_pool, (struct rte_mbuf **)free_buf, free_cnt, true); +- if (likely(ret == 0)) { ++ int ret = mem_get_mbuf_bulk(stack->stack_idx, (struct rte_mbuf **)free_buf, free_cnt, true); ++ if (likely(ret > 0)) { + nr_pkts = gazelle_ring_sp_enqueue(stack->rx_ring, (void **)free_buf, free_cnt); + stack->rx_ring_used -= nr_pkts; + } else { +@@ -167,9 +166,7 @@ static uint32_t ltran_tx_xmit(struct protocol_stack *stack, struct rte_mbuf **pk + do { + if (unlikely(stack->tx_ring_used >= INUSE_TX_PKTS_WATERMARK)) { + uint32_t free_pkts = gazelle_ring_sc_dequeue(stack->tx_ring, (void **)free_buf, stack->tx_ring_used); +- for (uint32_t i = 0; i < free_pkts; i++) { +- rte_pktmbuf_free(free_buf[i]); +- } ++ mem_put_mbuf_bulk(free_buf, free_pkts); + stack->tx_ring_used -= free_pkts; + } + +diff --git a/src/ltran/ltran_dfx.c b/src/ltran/ltran_dfx.c +index f6d1148..1722460 100644 +--- a/src/ltran/ltran_dfx.c ++++ b/src/ltran/ltran_dfx.c +@@ -740,7 +740,6 @@ static void show_lstack_stats(struct gazelle_stack_dfx_data *lstack_stat) + printf("app_write: %-18"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_write_cnt); + printf("write_lwip: %-17"PRIu64" ", lstack_stat->data.pkts.stack_stat.write_lwip_cnt); + printf("app_write_rpc: %-14"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.app_write_rpc); +- printf("recv_list: %-18"PRIu64" ", lstack_stat->data.pkts.recv_list_cnt); + printf("conn_num: %-19hu ", lstack_stat->data.pkts.conn_num); + + printf("kernel_events: %-14"PRIu64"\n", lstack_stat->data.pkts.wakeup_stat.kernel_events); +@@ -902,11 +901,10 @@ static void gazelle_print_lstack_stat_latency(void *buf, const struct gazelle_st + printf("Recv:\n"); + + printf("range: t0--->t1\n%s", res[GAZELLE_LATENCY_INTO_MBOX].latency_stat_result); +- printf("range: t1--->t2\n%s", res[GAZELLE_LATENCY_READ_LWIP].latency_stat_result); +- printf("range: t2--->t3\n%s", res[GAZELLE_LATENCY_READ_APP_CALL].latency_stat_result); +- printf("range: t3--->t4\n%s", res[GAZELLE_LATENCY_READ_LSTACK].latency_stat_result); +- printf("range: t0--->t4\n%s", res[GAZELLE_LATENCY_READ_MAX].latency_stat_result); +- printf("t0: 
read from nic t1: into recvmbox t2: into recvring t3: app read start t4: app read end\n"); ++ printf("range: t1--->t2\n%s", res[GAZELLE_LATENCY_READ_APP_CALL].latency_stat_result); ++ printf("range: t2--->t3\n%s", res[GAZELLE_LATENCY_READ_LSTACK].latency_stat_result); ++ printf("range: t0--->t3\n%s", res[GAZELLE_LATENCY_READ_MAX].latency_stat_result); ++ printf("t0: read from nic t1: into recvmbox t2: app read start t3: app read end\n"); + + printf("Send:\n"); + printf("range: t0--->t1\n%s", res[GAZELLE_LATENCY_WRITE_INTO_RING].latency_stat_result); +@@ -1271,7 +1269,7 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + printf("Active Internet connections (servers and established)\n"); + do { + printf("\n------ stack tid: %6u ------time=%s\n", stat->tid, sys_local_time_str); +- printf("No. Proto lwip_recv recv_ring in_send send_ring cwn rcv_wnd snd_wnd snd_buf snd_nxt" ++ printf("No. Proto recvmbox recvtail sendmbox sendtail in_send cwn rcv_wnd snd_wnd snd_buf snd_nxt" + " lastack rcv_nxt events epoll_ev evlist fd Local Address" + " Foreign Address State" + " keep-alive keep-alive(idle,intvl,cnt) pingpong\n"); +@@ -1292,10 +1290,12 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + + sprintf_s(str_laddr, sizeof(str_laddr), "%s:%hu", str_ip, conn_info->l_port); + sprintf_s(str_raddr, sizeof(str_raddr), "%s:%hu", str_rip, conn_info->r_port); +- printf("%-6utcp %-10u%-10u%-8u%-10u%-9d%-9d%-10d%-10d%-15u%-15u%-15u%-10x%-10x%-7d%-7d" +- "%-52s %-52s %s %-5d %s %d\n", +- i, conn_info->recv_cnt, conn_info->recv_ring_cnt, conn_info->in_send, +- conn_info->send_ring_cnt, conn_info->cwn, conn_info->rcv_wnd, conn_info->snd_wnd, ++ printf("%-6utcp %-9u%-9u%-9u%-9u%-8u%-9d%-9d%-10d%-10d%-15u%-15u%-15u%-10x%-10x%-7d%-7d" ++ "%-52s %-52s %s %-5d %s %d\n", i, ++ conn_info->recvmbox_cnt, conn_info->recvmbox_tail, ++ conn_info->sendmbox_cnt, conn_info->sendmbox_tail, ++ conn_info->in_send, ++ conn_info->cwn, conn_info->rcv_wnd, conn_info->snd_wnd, + conn_info->snd_buf, conn_info->snd_nxt, conn_info->lastack, conn_info->rcv_nxt, conn_info->events, + conn_info->epoll_events, conn_info->eventlist, conn_info->fd, + str_laddr, str_raddr, tcp_state_to_str(conn_info->tcp_sub_state), +@@ -1304,14 +1304,14 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + inet_ntop(domain, lip, str_ip, sizeof(str_ip)); + sprintf_s(str_laddr, sizeof(str_laddr), "%s:%hu", str_ip, conn_info->l_port); + sprintf_s(str_raddr, sizeof(str_raddr), "%s:*", domain == AF_INET ? 
"0.0.0.0" : "::0"); +- printf("%-6utcp %-147u%-7d%-52s %-52s LISTEN\n", i, conn_info->recv_cnt, ++ printf("%-6utcp %-147u%-7d%-52s %-52s LISTEN\n", i, conn_info->recvmbox_cnt, + conn_info->fd, str_laddr, str_raddr); + } else { + printf("Got unknow tcp conn::%s:%5hu, state:%u\n", + inet_ntop(domain, lip, str_ip, sizeof(str_ip)), conn_info->l_port, conn_info->state); + } +- unread_pkts += conn_info->recv_ring_cnt + conn_info->recv_cnt; +- unsend_pkts += conn_info->send_ring_cnt + conn_info->in_send; ++ unread_pkts += conn_info->recvmbox_cnt; ++ unsend_pkts += conn_info->sendmbox_cnt + conn_info->in_send; + } + if (conn->conn_num > 0) { + printf("Total unread pkts:%u unsend pkts:%u\n", unread_pkts, unsend_pkts); +@@ -1479,12 +1479,6 @@ static void gazelle_print_lstack_aggregate(void *buf, const struct gazelle_stat_ + printf("tx_size_1461_8192 byte: %u\n", stats->size_1461_8192[1]); + printf("tx_size_8193_max byte: %u\n", stats->size_8193_max[1]); + +- printf("app_tx_szie_1_64: %u\n", stats->size_1_64[2]); +- printf("app_tx_size_65_512: %u\n", stats->size_65_512[2]); +- printf("app_tx_size_513_1460 byte: %u\n", stats->size_513_1460[2]); +- printf("app_tx_size_1461_8192 byte: %u\n", stats->size_1461_8192[2]); +- printf("app_tx_size_8193_max byte: %u\n", stats->size_8193_max[2]); +- + if ((dfx->eof != 0) || (ret != GAZELLE_OK)) { + break; + } +diff --git a/src/ltran/ltran_forward.c b/src/ltran/ltran_forward.c +index aef5e46..b62421a 100644 +--- a/src/ltran/ltran_forward.c ++++ b/src/ltran/ltran_forward.c +@@ -292,7 +292,7 @@ static uint32_t get_vlan_offset(const struct rte_mbuf *m) + uint32_t offset = 0; + struct rte_ether_hdr *ethh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); + u16_t type = ethh->ether_type; +- if (type == PP_HTONS(RTE_ETHER_TYPE_VLAN)) { ++ if (type == htons(RTE_ETHER_TYPE_VLAN)) { + offset += sizeof(struct rte_vlan_hdr); + } + return offset; +-- +2.33.0 + diff --git a/0330-cfg-add-mem_cache_max-and-change-default-rpc_msg_max.patch b/0330-cfg-add-mem_cache_max-and-change-default-rpc_msg_max.patch new file mode 100644 index 0000000..e399bcd --- /dev/null +++ b/0330-cfg-add-mem_cache_max-and-change-default-rpc_msg_max.patch @@ -0,0 +1,157 @@ +From 9469121fdc2e20ebda9b062bb7876ae8dee921d7 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Mon, 24 Mar 2025 17:16:40 +0800 +Subject: [PATCH] cfg: add mem_cache_max and change default rpc_msg_max + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_cfg.c | 14 +++++++++++++- + src/lstack/core/lstack_mempool.c | 8 ++++---- + src/lstack/include/lstack_cfg.h | 7 ++++--- + src/lstack/include/lstack_mempool.h | 2 +- + src/lstack/lstack.conf | 9 --------- + 5 files changed, 22 insertions(+), 18 deletions(-) + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 432e4db..78f9198 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -36,6 +36,7 @@ + #include "lstack_log.h" + #include "lstack_dpdk.h" + #include "lstack_cfg.h" ++#include "lstack_mempool.h" + + #define DEFAULT_CONF_FILE "/etc/gazelle/lstack.conf" + #define LSTACK_CONF_ENV "LSTACK_CONF_PATH" +@@ -82,6 +83,7 @@ static int32_t parse_nic_txqueue_size(void); + static int32_t parse_stack_thread_mode(void); + static int32_t parse_nic_vlan_mode(void); + static int32_t parse_rpc_msg_max(void); ++static int32_t parse_mem_cache_num(void); + static int32_t parse_send_cache_mode(void); + static int32_t parse_flow_bifurcation(void); + static int32_t parse_stack_interrupt(void); +@@ -120,6 +122,7 @@ static struct config_vector_t 
g_config_tbl[] = { + { "nic_rxqueue_size", parse_nic_rxqueue_size}, + { "nic_txqueue_size", parse_nic_txqueue_size}, + { "rpc_msg_max", parse_rpc_msg_max }, ++ { "mem_cache_num", parse_mem_cache_num }, + { "app_bind_numa", parse_app_bind_numa }, + { "stack_num", parse_stack_num }, + { "num_cpus", parse_stack_cpu_number }, +@@ -1424,7 +1427,16 @@ static int32_t parse_nic_vlan_mode(void) + static int32_t parse_rpc_msg_max(void) + { + int32_t ret; +- PARSE_ARG(g_config_params.rpc_msg_max, "rpc_msg_max", 4096, 1, 8192, ret); ++ PARSE_ARG(g_config_params.rpc_msg_max, "rpc_msg_max", ++ 4096, GAZELLE_RESERVED_CLIENTS, MEMP_NUM_SYS_MBOX + GAZELLE_RESERVED_CLIENTS, ret); ++ return ret; ++} ++ ++static int32_t parse_mem_cache_num(void) ++{ ++ int32_t ret; ++ PARSE_ARG(g_config_params.mem_cache_num, "mem_cache_num", ++ MEMPOOL_CACHE_NUM, BUF_CACHE_MIN_NUM, BUF_CACHE_MAX_NUM, ret); + return ret; + } + +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 983f2f0..72c7d67 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -332,7 +332,7 @@ static const struct mempool_ops mbuf_mp_ops = { + }; + + +-static struct rte_mempool *mbuf_pool_create(int stack_id, uint16_t numa_id) ++static struct rte_mempool *mbuf_pool_create(int stack_id, unsigned numa_id) + { + struct cfg_params *cfg_params = get_global_cfg_params(); + char name[RTE_MEMPOOL_NAMESIZE]; +@@ -367,7 +367,7 @@ static struct rte_mempool *mbuf_pool_create(int stack_id, uint16_t numa_id) + return pool; + } + +-static struct rte_mempool *rpc_pool_create(int stack_id, uint16_t numa_id) ++static struct rte_mempool *rpc_pool_create(int stack_id, unsigned numa_id) + { + char name [RTE_MEMPOOL_NAMESIZE]; + struct rte_mempool *pool; +@@ -478,14 +478,14 @@ int mem_thread_cache_init(struct mem_thread *mt) + char name [RTE_MEMPOOL_NAMESIZE]; + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%p", "migrate_ring", mt); + +- mt->mbuf_migrate_ring = rte_ring_create(name, BUF_CACHE_DEFAULT_NUM, ++ mt->mbuf_migrate_ring = rte_ring_create(name, BUF_CACHE_MAX_NUM, + rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ); + if (mt->mbuf_migrate_ring == NULL) { + return -1; + } + } + +- mt->mbuf_cache = buf_cache_create(BUF_CACHE_DEFAULT_NUM); ++ mt->mbuf_cache = buf_cache_create(get_global_cfg_params()->mem_cache_num); + if (mt->mbuf_cache == NULL) { + mem_thread_cache_free(mt); + return -1; +diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h +index 3e671b5..b4cdd07 100644 +--- a/src/lstack/include/lstack_cfg.h ++++ b/src/lstack/include/lstack_cfg.h +@@ -130,12 +130,13 @@ struct cfg_params { + + uint32_t nic_read_number; + uint32_t rpc_number; +- uint32_t rpc_msg_max; + }; + + struct { // socket +- uint32_t tcp_conn_count; +- uint32_t mbuf_count_per_conn; ++ uint16_t tcp_conn_count; ++ uint16_t mbuf_count_per_conn; ++ uint16_t rpc_msg_max; ++ uint16_t mem_cache_num; + }; + + struct { // deprecated +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index a602610..848509c 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -45,7 +45,7 @@ + #define MEMPOOL_CACHE_NUM 32 + + #define BUF_CACHE_MIN_NUM 32 +-#define BUF_CACHE_DEFAULT_NUM 1024 ++#define BUF_CACHE_MAX_NUM 1024 + #define BUF_CACHE_WATERSTEP_SHIFT 4 /* 1/16 */ + + #define BUF_BULK_MAX_NUM 32 +diff --git a/src/lstack/lstack.conf b/src/lstack/lstack.conf +index 490bdfc..4084ce3 100644 +--- a/src/lstack/lstack.conf ++++ 
b/src/lstack/lstack.conf +@@ -23,16 +23,7 @@ low_power_mode=0 + tcp_conn_count = 1500 + mbuf_count_per_conn = 170 + +-# send ring size, default is 32, max is 2048 +-# if udp pktlen exceeds 45952(32 * 1436)B, send_ring_size must be at least 64. +-send_ring_size = 32 +- +-#recv ring size, default is 128, max is 2048 +-recv_ring_size = 128 +- + #protocol stack thread per loop params +-#read data form protocol stack into recv_ring +-read_connect_number = 4 + #process rpc msg number + rpc_number = 4 + #read nic pkts number +-- +2.33.0 + diff --git a/0331-cfg-add-mem_async_mode.patch b/0331-cfg-add-mem_async_mode.patch new file mode 100644 index 0000000..13a8fc9 --- /dev/null +++ b/0331-cfg-add-mem_async_mode.patch @@ -0,0 +1,98 @@ +From 9da8d8a3e4a37049793db6fac45bb98b60a612a6 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 25 Mar 2025 15:31:55 +0800 +Subject: [PATCH] cfg: add mem_async_mode + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_cfg.c | 11 +++++++++++ + src/lstack/core/lstack_mempool.c | 2 +- + src/lstack/include/lstack_cfg.h | 1 + + src/lstack/include/mbox_ring.h | 10 ++++++---- + 4 files changed, 19 insertions(+), 5 deletions(-) + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 78f9198..ddb419d 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -84,6 +84,7 @@ static int32_t parse_stack_thread_mode(void); + static int32_t parse_nic_vlan_mode(void); + static int32_t parse_rpc_msg_max(void); + static int32_t parse_mem_cache_num(void); ++static int32_t parse_mem_async_mode(void); + static int32_t parse_send_cache_mode(void); + static int32_t parse_flow_bifurcation(void); + static int32_t parse_stack_interrupt(void); +@@ -151,6 +152,7 @@ static struct config_vector_t g_config_tbl[] = { + { "use_sockmap", parse_use_sockmap }, + { "udp_enable", parse_udp_enable }, + { "stack_thread_mode", parse_stack_thread_mode }, ++ { "mem_async_mode", parse_mem_async_mode }, + { "nic_vlan_mode", parse_nic_vlan_mode }, + { "send_cache_mode", parse_send_cache_mode }, + { "flow_bifurcation", parse_flow_bifurcation}, +@@ -1440,6 +1442,15 @@ static int32_t parse_mem_cache_num(void) + return ret; + } + ++static int32_t parse_mem_async_mode(void) ++{ ++ // TODO ++ g_config_params.mem_async_mode = 1; ++ if (g_config_params.stack_mode_rtc || xdp_eth_enabled()) ++ g_config_params.mem_async_mode = 0; ++ return 0; ++} ++ + static int32_t parse_send_cache_mode(void) + { + int32_t ret; +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 72c7d67..eeae797 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -474,7 +474,7 @@ void mem_thread_cache_free(struct mem_thread *mt) + + int mem_thread_cache_init(struct mem_thread *mt) + { +- if (!get_global_cfg_params()->stack_mode_rtc && !xdp_eth_enabled()) { ++ if (get_global_cfg_params()->mem_async_mode) { + char name [RTE_MEMPOOL_NAMESIZE]; + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%p", "migrate_ring", mt); + +diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h +index b4cdd07..e1639a1 100644 +--- a/src/lstack/include/lstack_cfg.h ++++ b/src/lstack/include/lstack_cfg.h +@@ -137,6 +137,7 @@ struct cfg_params { + uint16_t mbuf_count_per_conn; + uint16_t rpc_msg_max; + uint16_t mem_cache_num; ++ bool mem_async_mode; + }; + + struct { // deprecated +diff --git a/src/lstack/include/mbox_ring.h b/src/lstack/include/mbox_ring.h +index 7ffdcf4..f6acdef 100644 +--- 
a/src/lstack/include/mbox_ring.h ++++ b/src/lstack/include/mbox_ring.h +@@ -285,10 +285,12 @@ int mt_ring_create(struct mbox_ring *mr, const char *name, unsigned count) + mr->ops = &g_mbox_rtw_default_ops; + mr->st_obj = NULL; + } +- if ((mr->flags & MBOX_FLAG_RECV) && !xdp_eth_enabled()) { +- mr->flags |= MBOX_FLAG_PEEK; +- mr->ops = &g_mbox_rtw_peek_ops; +- mr->ops->create(mr, name, count); ++ if (mr->flags & MBOX_FLAG_RECV) { ++ if (get_global_cfg_params()->mem_async_mode) { ++ mr->flags |= MBOX_FLAG_PEEK; ++ mr->ops = &g_mbox_rtw_peek_ops; ++ mr->ops->create(mr, name, count); ++ } + } + + mr->ring = rte_ring_create_fast(name, count, RING_F_SP_ENQ | RING_F_SC_DEQ); +-- +2.33.0 + diff --git a/0332-mempool-add-mem_thread_cache_flush.patch b/0332-mempool-add-mem_thread_cache_flush.patch new file mode 100644 index 0000000..f51c8a7 --- /dev/null +++ b/0332-mempool-add-mem_thread_cache_flush.patch @@ -0,0 +1,617 @@ +From 8ff71cad286a1a1952f427ce48a815322569189a Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Thu, 27 Mar 2025 15:20:02 +0800 +Subject: [PATCH] mempool: add mem_thread_cache_flush fix + PBUF_POOL_PREINIT + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_unistd.c | 20 +-- + src/lstack/core/lstack_mempool.c | 259 ++++++++++++++++++++++++---- + src/lstack/include/lstack_mempool.h | 19 +- + src/lstack/include/lstack_unistd.h | 3 + + 4 files changed, 250 insertions(+), 51 deletions(-) + +diff --git a/src/lstack/api/lstack_unistd.c b/src/lstack/api/lstack_unistd.c +index e3b9b1f..431e0a2 100644 +--- a/src/lstack/api/lstack_unistd.c ++++ b/src/lstack/api/lstack_unistd.c +@@ -58,7 +58,7 @@ bool sig_need_dump(int sig) + return true; + } + +-static void pthread_block_sig(int sig) ++void pthread_block_sig(int sig) + { + sigset_t mask; + +@@ -67,6 +67,15 @@ static void pthread_block_sig(int sig) + pthread_sigmask(SIG_BLOCK, &mask, NULL); + } + ++void pthread_unblock_sig(int sig) ++{ ++ sigset_t mask; ++ ++ sigemptyset(&mask); ++ sigaddset(&mask, sig); ++ pthread_sigmask(SIG_UNBLOCK, &mask, NULL); ++} ++ + static void lstack_sigaction_default_handler(int sig, siginfo_t *info, void *context) + { + static bool skip_process_exit = false; +@@ -118,15 +127,6 @@ static void lstack_sig_default_handler(int sig) + lstack_sigaction_default_handler(sig, NULL, NULL); + } + +-static void pthread_unblock_sig(int sig) +-{ +- sigset_t mask; +- +- sigemptyset(&mask); +- sigaddset(&mask, sig); +- pthread_sigmask(SIG_UNBLOCK, &mask, NULL); +-} +- + int lstack_signal_init(void) + { + unsigned int i; +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index eeae797..7e2a706 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -21,14 +21,18 @@ + #include "common/dpdk_common.h" + #include "lstack_dpdk.h" + #include "lstack_protocol_stack.h" ++#include "lstack_unistd.h" + +-#define MEM_THREAD_MANAGER_TIMEOUT 1 +-#define MEM_THREAD_MANAGER_MAX 64 ++#define MEM_THREAD_FLUSH_SIG (SIGRTMIN + 11) ++#define MEM_THREAD_MANAGER_FLUSH_MS 100 ++#define MEM_THREAD_MANAGER_FREE_S 2 ++#define MEM_THREAD_MANAGER_FREE_MAX 64 + + struct mem_thread_manager { + struct list_node mt_work_list; + struct list_node mt_free_list; + rte_spinlock_t list_lock; ++ uint32_t flush_time; + }; + + struct mem_thread_group { +@@ -36,6 +40,9 @@ struct mem_thread_group { + pthread_t thread; + struct list_node mt_node; + struct mem_thread mt_array[PROTOCOL_STACK_MAX]; ++ ++ bool used_flag; ++ uint32_t used_time; + }; + + static struct mem_stack 
g_mem_stack_group[PROTOCOL_STACK_MAX] = {0};
+@@ -58,11 +65,73 @@ struct rte_mempool *mem_get_rpc_pool(int stack_id)
+ return g_mem_stack_group[stack_id].rpc_pool;
+ }
+
+-static void mem_thread_manager_add_work(struct mem_thread_group *mt_group)
++static inline bool mem_thread_group_in_used(const struct mem_thread_group *mt_group, uint32_t timeout)
+ {
+- rte_spinlock_lock(&g_mem_thread_manager.list_lock);
+- list_add_node(&mt_group->mt_node, &g_mem_thread_manager.mt_work_list);
+- rte_spinlock_unlock(&g_mem_thread_manager.list_lock);
++ return mt_group->used_flag ||
++ (sys_now() - mt_group->used_time < timeout);
++}
++
++static inline void mem_thread_group_used(void)
++{
++ g_mem_thread_group->used_flag = true;
++ g_mem_thread_group->used_time = sys_now();
++}
++
++static inline void mem_thread_group_done(void)
++{
++ g_mem_thread_group->used_flag = false;
++}
++
++static void mem_thread_cache_flush(struct mem_thread *mt);
++static unsigned mem_thread_cache_count(const struct mem_thread *mt);
++static void mem_thread_group_action_flush(int signum)
++{
++ struct mem_thread *mt;
++ int stack_id;
++
++ if (g_mem_thread_group == NULL)
++ return;
++ if (mem_thread_group_in_used(g_mem_thread_group, MEM_THREAD_MANAGER_FLUSH_MS))
++ return;
++
++ for (stack_id = 0; stack_id < PROTOCOL_STACK_MAX; stack_id++) {
++ mt = &g_mem_thread_group->mt_array[stack_id];
++ mem_thread_cache_flush(mt);
++ }
++}
++
++static int mem_thread_group_register_flush(void)
++{
++ sighandler_t handler;
++ handler = signal(MEM_THREAD_FLUSH_SIG, mem_thread_group_action_flush);
++ if (handler == SIG_ERR) {
++ LSTACK_LOG(ERR, LSTACK, "signal failed\n");
++ return -1;
++ }
++ pthread_unblock_sig(MEM_THREAD_FLUSH_SIG);
++ return 0;
++}
++
++static inline void mem_thread_group_notify_flush(const struct mem_thread_group *mt_group, uint32_t timeout)
++{
++ const struct mem_thread *mt;
++ int stack_id;
++ unsigned count = 0;
++
++ if (mem_thread_group_in_used(mt_group, timeout))
++ return;
++
++ for (stack_id = 0; stack_id < PROTOCOL_STACK_MAX; stack_id++) {
++ mt = &mt_group->mt_array[stack_id];
++ count += mem_thread_cache_count(mt);
++ }
++ if (count == 0) {
++ return;
++ }
++
++ if (pthread_kill(mt_group->thread, MEM_THREAD_FLUSH_SIG) != 0) {
++ LSTACK_LOG(ERR, LSTACK, "pthread_kill tid %d failed\n", mt_group->tid);
++ }
+ }
+
+ static inline bool mem_thread_group_exist(const struct mem_thread_group *mt_group)
+@@ -72,6 +141,13 @@ static inline bool mem_thread_group_exist(const struct mem_thread_grou
+ return true;
+ }
+
++static void mem_thread_manager_add_work(struct mem_thread_group *mt_group)
++{
++ rte_spinlock_lock(&g_mem_thread_manager.list_lock);
++ list_add_node(&mt_group->mt_node, &g_mem_thread_manager.mt_work_list);
++ rte_spinlock_unlock(&g_mem_thread_manager.list_lock);
++}
++
+ static void mem_thread_group_free(struct mem_thread_group *mt_group)
+ {
+ struct mem_thread *mt;
+@@ -100,6 +176,8 @@ static int mem_thread_group_init(int stack_id)
+ LSTACK_LOG(ERR, LSTACK, "alloc mem_thread_group failed, stack_id %d\n", stack_id);
+ return -1;
+ }
++ mem_thread_group_register_flush();
++
+ g_mem_thread_group->tid = rte_gettid();
+ g_mem_thread_group->thread = pthread_self();
+ list_init_node(&g_mem_thread_group->mt_node);
+@@ -107,7 +185,7 @@
+ }
+
+ mt = &g_mem_thread_group->mt_array[stack_id];
+- if (mem_thread_cache_init(mt) != 0) {
++ if (mem_thread_cache_init(mt, stack_id) != 0) {
+ LSTACK_LOG(ERR, LSTACK, "mem_thread_cache_init failed, stack_id %d\n", 
stack_id); + return -1; + } +@@ -133,6 +211,31 @@ static inline struct mem_thread *mem_thread_group_get(int stack_id) + return mt; + } + ++static void mem_thread_manager_flush_all(void) ++{ ++ struct list_node *node, *next; ++ struct mem_thread_group *mt_group; ++ uint32_t now = sys_now(); ++ ++ rte_spinlock_lock(&g_mem_thread_manager.list_lock); ++ ++ if (now - g_mem_thread_manager.flush_time < MEM_THREAD_MANAGER_FLUSH_MS) { ++ rte_spinlock_unlock(&g_mem_thread_manager.list_lock); ++ return; ++ } ++ g_mem_thread_manager.flush_time = now; ++ ++ list_for_each_node(node, next, &g_mem_thread_manager.mt_work_list) { ++ mt_group = container_of(node, struct mem_thread_group, mt_node); ++ /* skip myself */ ++ if (mt_group == g_mem_thread_group) ++ continue; ++ mem_thread_group_notify_flush(mt_group, MEM_THREAD_MANAGER_FLUSH_MS); ++ } ++ ++ rte_spinlock_unlock(&g_mem_thread_manager.list_lock); ++} ++ + static void *mem_thread_manager_thread(void *arg) + { + struct list_node *node, *next; +@@ -142,9 +245,10 @@ static void *mem_thread_manager_thread(void *arg) + rte_spinlock_init(&g_mem_thread_manager.list_lock); + list_init_head(&g_mem_thread_manager.mt_work_list); + list_init_head(&g_mem_thread_manager.mt_free_list); ++ g_mem_thread_manager.flush_time = sys_now(); + + while(true) { +- sleep(MEM_THREAD_MANAGER_TIMEOUT); ++ sleep(MEM_THREAD_MANAGER_FREE_S); + + rte_spinlock_lock(&g_mem_thread_manager.list_lock); + +@@ -156,7 +260,7 @@ static void *mem_thread_manager_thread(void *arg) + + list_for_each_node(node, next, &g_mem_thread_manager.mt_work_list) { + count++; +- if (count > MEM_THREAD_MANAGER_MAX) { ++ if (count > MEM_THREAD_MANAGER_FREE_MAX) { + /* move list head after the current node, + * and start traversing from this node next time */ + list_del_node(&g_mem_thread_manager.mt_work_list); +@@ -166,6 +270,7 @@ static void *mem_thread_manager_thread(void *arg) + + mt_group = container_of(node, struct mem_thread_group, mt_node); + if (mem_thread_group_exist(mt_group)) { ++ mem_thread_group_notify_flush(mt_group, MEM_THREAD_MANAGER_FREE_S * MS_PER_S); + continue; + } + list_del_node(node); +@@ -183,8 +288,7 @@ int mem_thread_manager_init(void) + return thread_create("gzmempool", 0, mem_thread_manager_thread, NULL); + } + +-static __rte_always_inline +-struct mem_thread *mem_thread_get(int stack_id) ++static inline struct mem_thread *mem_thread_get(int stack_id) + { + /* stack thread uses mbufpool_cache instead of buf_cache */ + if (get_protocol_stack() != NULL) +@@ -443,42 +547,97 @@ unsigned mem_stack_mbuf_pool_count(int stack_id) + return rte_mempool_avail_count(ms->mbuf_pool); + } + +-void mem_thread_cache_free(struct mem_thread *mt) ++static void mem_thread_cache_flush(struct mem_thread *mt) + { +- void *obj; ++ struct mem_stack *ms = mem_stack_get(mt->stack_id); ++ void *obj_table[BUF_BULK_MAX_NUM]; ++ unsigned num; + + if (mt->mbuf_migrate_ring != NULL) { +- while (rte_ring_sc_dequeue(mt->mbuf_migrate_ring, &obj) == 0) { +- mem_put_mbuf_bulk((struct rte_mbuf **)&obj, 1); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(mem_thread=%p, stack_id=%d, mbuf_migrate_ring count=%u)\n", ++ __FUNCTION__, mt, mt->stack_id, rte_ring_count(mt->mbuf_migrate_ring))); ++ ++ while (true) { ++ num = rte_ring_sc_dequeue_burst(mt->mbuf_migrate_ring, obj_table, BUF_BULK_MAX_NUM, NULL); ++ if (num == 0) ++ break; ++ mbuf_mp_ops.put_bulk(ms->mbuf_pool, obj_table, num); + } +- rte_ring_free(mt->mbuf_migrate_ring); +- mt->mbuf_migrate_ring = NULL; + } + + if (mt->mbuf_cache != NULL) { +- while (buf_cache_pop_bulk(mt->mbuf_cache, 
&obj, 1, NULL) > 0) { +- mem_put_mbuf_bulk((struct rte_mbuf **)&obj, 1); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(mem_thread=%p, stack_id=%d, mbuf_cache count=%u)\n", ++ __FUNCTION__, mt, mt->stack_id, buf_cache_count(mt->mbuf_cache))); ++ ++ while (true) { ++ num = LWIP_MIN(buf_cache_count(mt->mbuf_cache), BUF_BULK_MAX_NUM); ++ num = buf_cache_pop_bulk(mt->mbuf_cache, obj_table, num, NULL); ++ if (num == 0) ++ break; ++ mbuf_mp_ops.put_bulk(ms->mbuf_pool, obj_table, num); + } +- buf_cache_free(mt->mbuf_cache); +- mt->mbuf_cache = NULL; ++ buf_cache_reset_watermark(mt->mbuf_cache); + } + + if (mt->rpc_cache != NULL) { +- while (buf_cache_pop_bulk(mt->rpc_cache, &obj, 1, NULL) > 0) { +- mem_put_rpc(obj); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(mem_thread=%p, stack_id=%d, rpc_cache count=%u)\n", ++ __FUNCTION__, mt, mt->stack_id, buf_cache_count(mt->rpc_cache))); ++ ++ while (true) { ++ num = LWIP_MIN(buf_cache_count(mt->rpc_cache), BUF_BULK_MAX_NUM); ++ num = buf_cache_pop_bulk(mt->rpc_cache, obj_table, num, NULL); ++ if (num == 0) ++ break; ++ mem_mp_ops.put_bulk(ms->rpc_pool, obj_table, num); + } ++ buf_cache_reset_watermark(mt->rpc_cache); ++ } ++} ++ ++static unsigned mem_thread_cache_count(const struct mem_thread *mt) ++{ ++ unsigned count = 0; ++ ++ if (mt->mbuf_migrate_ring != NULL) { ++ count += rte_ring_count(mt->mbuf_migrate_ring); ++ } ++ if (mt->mbuf_cache != NULL) { ++ count += buf_cache_count(mt->mbuf_cache); ++ } ++ if (mt->rpc_cache != NULL) { ++ count += buf_cache_count(mt->rpc_cache); ++ } ++ return count; ++} ++ ++void mem_thread_cache_free(struct mem_thread *mt) ++{ ++ mem_thread_cache_flush(mt); ++ ++ if (mt->mbuf_migrate_ring != NULL) { ++ rte_ring_free(mt->mbuf_migrate_ring); ++ mt->mbuf_migrate_ring = NULL; ++ } ++ if (mt->mbuf_cache != NULL) { ++ buf_cache_free(mt->mbuf_cache); ++ mt->mbuf_cache = NULL; ++ } ++ if (mt->rpc_cache != NULL) { + buf_cache_free(mt->rpc_cache); + mt->rpc_cache = NULL; + } + } + +-int mem_thread_cache_init(struct mem_thread *mt) ++int mem_thread_cache_init(struct mem_thread *mt, int stack_id) + { ++ mt->stack_id = stack_id; ++ + if (get_global_cfg_params()->mem_async_mode) { + char name [RTE_MEMPOOL_NAMESIZE]; + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%p", "migrate_ring", mt); + +- mt->mbuf_migrate_ring = rte_ring_create(name, BUF_CACHE_MAX_NUM, ++ mt->mbuf_migrate_ring = rte_ring_create(name, ++ LWIP_MAX(get_global_cfg_params()->mem_cache_num, MIGRATE_RING_MIN_NUM), + rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ); + if (mt->mbuf_migrate_ring == NULL) { + return -1; +@@ -522,9 +681,17 @@ void mem_mbuf_migrate_enqueue(struct mem_thread *mt, unsigned n) + mpcache = ms->mbuf_mpcache; + + mt->stk_migrate_count += n; ++ if (mt->stk_migrate_count < BUF_CACHE_WATERSTEP_MIN) ++ return; ++ if (mpcache->len < ms->migrate_watermark) ++ return; + +- if (mpcache->len <= ms->migrate_watermark) ++ /* no sufficient mbuf */ ++ if (rte_ring_count(ms->mbuf_pool->pool_data) < MBUFPOOL_RESERVE_NUM) { ++ mem_thread_manager_flush_all(); ++ mt->stk_migrate_count = 0; + return; ++ } + + num = LWIP_MIN(mpcache->len - ms->migrate_watermark, + mt->stk_migrate_count); +@@ -654,8 +821,8 @@ static void pool_put_bulk_with_cache(const struct mempool_ops *pool_ops, + + /* dequeue from cache, then put to the pool */ + put_count = count - cache->watermark; +- LWIP_DEBUGF(MEMP_DEBUG, ("pool_put_bulk_with_cache(cache=%p, watermark=%u, put_count=%u)\n", +- cache, cache->watermark, put_count)); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(cache=%p, watermark=%u, put_count=%u)\n", ++ __FUNCTION__, 
cache, cache->watermark, put_count)); + + pool_ops->put_bulk(pool, &cache->objs[cache->head - put_count], put_count); + cache->head -= put_count; +@@ -674,10 +841,12 @@ void *mem_get_rpc(int stack_id) + if (mt == NULL) { + ret = mem_mp_ops.get_bulk(ms->rpc_pool, &obj, 1); + } else { ++ mem_thread_group_used(); + ret = pool_get_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, &obj, 1); ++ mem_thread_group_done(); + } + +- LWIP_DEBUGF(MEMP_DEBUG, ("mem_get_rpc(stack_id=%d, obj=%p)\n", stack_id, obj)); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, obj=%p)\n", __FUNCTION__, stack_id, obj)); + + return ret == 0 ? NULL : obj; + } +@@ -688,12 +857,14 @@ void mem_put_rpc(void *obj) + struct mem_stack *ms = mem_stack_get(stack_id); + struct mem_thread *mt = mem_thread_get(stack_id); + +- LWIP_DEBUGF(MEMP_DEBUG, ("mem_put_rpc(stack_id=%d, obj=%p)\n", stack_id, obj)); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, obj=%p)\n", __FUNCTION__, stack_id, obj)); + + if (mt == NULL) { + mem_mp_ops.put_bulk(ms->rpc_pool, &obj, 1); + } else { ++ mem_thread_group_used(); + pool_put_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, &obj, 1); ++ mem_thread_group_done(); + } + } + +@@ -712,6 +883,7 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned + * when RTE_MAX_LCORE is too large, it's time-consuming + */ + if (rte_ring_count(ms->mbuf_pool->pool_data) < MBUFPOOL_RESERVE_NUM + n) { ++ mem_thread_manager_flush_all(); + return 0; + } + } +@@ -719,14 +891,16 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned + if (mt == NULL) { + ret = mbuf_mp_ops.get_bulk(ms->mbuf_pool, (void **)mbuf_table, n); + } else { ++ mem_thread_group_used(); + mem_mbuf_migrate_dequeue(mt); + ret = pool_get_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, (void **)mbuf_table, n); ++ mem_thread_group_done(); + } + + #if MEMP_DEBUG + for (unsigned i = 0; i < ret; ++i) { +- LWIP_DEBUGF(MEMP_DEBUG, ("mem_get_mbuf_bulk(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", +- stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ __FUNCTION__, stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); + } + #endif /* MEMP_DEBUG */ + +@@ -745,15 +919,17 @@ static void mem_put_mbuf_bulk_by_pbuf(struct rte_mbuf *const *mbuf_table, unsign + + #if MEMP_DEBUG + for (unsigned i = 0; i < n; ++i) { +- LWIP_DEBUGF(MEMP_DEBUG, ("mem_put_mbuf_bulk(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", +- stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, n=%u, mbuf_table[%u]=%p, pbuf=%p)\n", ++ __FUNCTION__, stack_id, n, i, mbuf_table[i], mbuf_to_pbuf(mbuf_table[i]))); + } + #endif /* MEMP_DEBUG */ + + if (mt == NULL) { + mbuf_mp_ops.put_bulk(ms->mbuf_pool, (void *const *)mbuf_table, n); + } else { ++ mem_thread_group_used(); + pool_put_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, (void *const *)mbuf_table, n); ++ mem_thread_group_done(); + } + + } +@@ -826,8 +1002,8 @@ struct rte_mbuf *pbuf_to_mbuf_prefree(struct pbuf *p) + struct rte_mbuf *m = pbuf_to_mbuf(p); + #if MEMP_DEBUG + if (rte_mbuf_refcnt_read(m) > 1) { +- LWIP_DEBUGF(MEMP_DEBUG, ("pbuf_to_mbuf_prefree(mbuf=%p, pbuf=%p, refcnt=%u)\n", +- m, p, rte_mbuf_refcnt_read(m))); ++ LWIP_DEBUGF(MEMP_DEBUG, ("%s(mbuf=%p, pbuf=%p, refcnt=%u)\n", ++ __FUNCTION__, m, p, rte_mbuf_refcnt_read(m))); + } + #endif /* MEMP_DEBUG */ + if (p->mbuf_refcnt != 1) { +@@ 
-868,6 +1044,9 @@ void mem_put_pbuf_list_bulk(struct pbuf *const *pbuf_table, unsigned n)
+ struct pbuf *q, *next;
+ struct rte_mbuf *mbuf;
+
++ if (mt != NULL)
++ mem_thread_group_used();
++
+ for (unsigned i = 0; i < n; ++i) {
+ q = pbuf_table[i];
+ while (q != NULL) {
+@@ -893,6 +1072,10 @@ void mem_put_pbuf_list_bulk(struct pbuf *const *pbuf_table, unsigned n)
+ __FUNCTION__, stack_id, n, i, mbuf, q));
+ }
+ }
++
++ if (mt != NULL)
++ mem_thread_group_done();
++ return;
+ }
+
+ struct pbuf *mem_get_pbuf(int stack_id, bool reserve)
+@@ -1001,7 +1184,9 @@ void mem_init_pbuf(struct pbuf *p, pbuf_layer layer, uint16_t tot_len, uint16_t
+ struct rte_mbuf *mbuf;
+ void *data;
+
+- if (p->type_internal == PBUF_POOL_PREINIT) {
++ /* A PBUF_POOL_PREINIT pbuf may be given back to the mbuf_pool and allocated to NIC rx,
++ * so ignore PBUF_POOL_PREINIT at this point. */
++ if (layer == PBUF_TRANSPORT && p->type_internal == PBUF_POOL_PREINIT) {
+ p->payload = (uint8_t *)p->payload + LWIP_MEM_ALIGN_SIZE((uint16_t)layer);
+ p->type_internal = type;
+ p->len = len;
+diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h
+index 848509c..e636fda 100644
+--- a/src/lstack/include/lstack_mempool.h
++++ b/src/lstack/include/lstack_mempool.h
+@@ -44,12 +44,15 @@
+ #define MEMPOOL_OPS_NAME "ring_mt_rts"
+ #define MEMPOOL_CACHE_NUM 32
+
+-#define BUF_CACHE_MIN_NUM 32
++#define BUF_CACHE_MIN_NUM 16
+ #define BUF_CACHE_MAX_NUM 1024
+ #define BUF_CACHE_WATERSTEP_SHIFT 4 /* 1/16 */
++#define BUF_CACHE_WATERSTEP_MIN 4
+
+ #define BUF_BULK_MAX_NUM 32
+
++#define MIGRATE_RING_MIN_NUM (BUF_CACHE_MIN_NUM << 1)
++
+ struct buf_cache {
+ unsigned size; /* Size of cache. */
+ unsigned mask; /* Mask (size-1) of cache. */
+@@ -91,8 +94,8 @@ struct buf_cache *buf_cache_create(unsigned count)
+ cache->tail = 0;
+
+ cache->waterstep = cache->size >> BUF_CACHE_WATERSTEP_SHIFT;
+- if (cache->waterstep < BUF_CACHE_WATERSTEP_SHIFT)
+- cache->waterstep = BUF_CACHE_WATERSTEP_SHIFT;
++ if (cache->waterstep < BUF_CACHE_WATERSTEP_MIN)
++ cache->waterstep = BUF_CACHE_WATERSTEP_MIN;
+ cache->watermark = cache->waterstep;
+ cache->flushthresh = cache->size - cache->waterstep;
+
+@@ -142,6 +145,12 @@ void buf_cache_sub_watermark(struct buf_cache *cache)
+ }
+ }
+
++static __rte_always_inline
++void buf_cache_reset_watermark(struct buf_cache *cache)
++{
++ cache->watermark = cache->waterstep;
++}
++
+ static __rte_always_inline
+ void __buf_cache_copy_objs(void ** dst_table, void *const *src_table, unsigned n)
+ {
+@@ -273,6 +282,8 @@ struct mem_stack {
+ };
+
+ struct mem_thread {
++ int stack_id;
++
+ struct buf_cache *rpc_cache;
+
+ struct buf_cache *mbuf_cache;
+@@ -289,7 +300,7 @@ int mem_stack_mpcache_init(int stack_id, unsigned cpu_id);
+
+ int mem_thread_manager_init(void);
+ void mem_thread_cache_free(struct mem_thread *mt);
+-int mem_thread_cache_init(struct mem_thread *mt);
++int mem_thread_cache_init(struct mem_thread *mt, int stack_id);
+
+ unsigned mem_stack_mbuf_pool_count(int stack_id);
+ struct rte_mempool *mem_get_mbuf_pool(int stack_id);
+diff --git a/src/lstack/include/lstack_unistd.h b/src/lstack/include/lstack_unistd.h
+index 3bcee5a..397cd19 100644
+--- a/src/lstack/include/lstack_unistd.h
++++ b/src/lstack/include/lstack_unistd.h
+@@ -16,6 +16,9 @@
+ #include 
+ #include 
+
++void pthread_block_sig(int sig);
++void pthread_unblock_sig(int sig);
++
+ int lstack_signal_init(void);
+ int lstack_sigaction(int sig_num, const struct sigaction *action, struct sigaction *old_action);
+ pid_t lstack_fork(void);
+-- +2.33.0 + diff --git a/0333-dfx-support-sk_wait-stat.patch b/0333-dfx-support-sk_wait-stat.patch new file mode 100644 index 0000000..ebd305d --- /dev/null +++ b/0333-dfx-support-sk_wait-stat.patch @@ -0,0 +1,526 @@ +From 4884431653c978adfc03053e1803dd93b58ca3a0 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 1 Apr 2025 20:50:08 +0800 +Subject: [PATCH] dfx: support sk_wait stat + +Signed-off-by: Lemmy Huang +--- + src/common/gazelle_dfx_msg.h | 27 ++++++++-------- + src/lstack/api/lstack_sockctl.c | 7 ++-- + src/lstack/api/lstack_sockio.c | 45 +++++++++++++++++--------- + src/lstack/core/lstack_mempool.c | 18 +++++++---- + src/lstack/core/lstack_stack_stat.c | 12 +++---- + src/lstack/core/lstack_thread_rpc.c | 16 ++++----- + src/lstack/core/lstack_wait.c | 15 ++++++--- + src/lstack/include/lstack_mempool.h | 3 +- + src/lstack/include/lstack_thread_rpc.h | 9 ++---- + src/lstack/include/lstack_wait.h | 4 ++- + src/ltran/ltran_dfx.c | 31 ++++++++---------- + 11 files changed, 105 insertions(+), 82 deletions(-) + +diff --git a/src/common/gazelle_dfx_msg.h b/src/common/gazelle_dfx_msg.h +index 2c6462d..2f1066d 100644 +--- a/src/common/gazelle_dfx_msg.h ++++ b/src/common/gazelle_dfx_msg.h +@@ -94,31 +94,33 @@ enum GAZELLE_TCP_LIST_STATE { + }; + + struct gazelle_stack_stat { ++ uint32_t conn_num; ++ uint32_t mbuf_pool_cnt; + uint64_t wakeup_events; +- uint64_t write_lwip_cnt; +- uint64_t send_pkts_fail; +- uint64_t read_lwip_drop; +- uint64_t read_lwip_cnt; + uint64_t rx_allocmbuf_fail; + uint64_t tx_allocmbuf_fail; +- uint64_t call_null; + uint64_t rx_drop; + uint64_t rx; + uint64_t tx_drop; + uint64_t tx; + uint64_t tx_prepare_fail; +- uint64_t accept_fail; +- uint64_t sock_rx_drop; +- uint64_t sock_tx_merge; + }; + + struct gazelle_wakeup_stat { ++ uint64_t kernel_events; + uint64_t app_events; +- uint64_t app_write_rpc; ++ uint64_t accept_fail; + uint64_t app_write_cnt; + uint64_t app_read_cnt; + uint64_t read_null; +- uint64_t kernel_events; ++ uint64_t sock_rx_drop; ++ uint64_t sock_tx_merge; ++}; ++ ++struct gazelle_rpc_stat { ++ uint32_t rpc_pool_cnt; ++ uint64_t call_alloc_fail; ++ uint64_t call_msg_cnt; + }; + + struct gazelle_stack_aggregate_stats { +@@ -134,12 +136,9 @@ struct gazelle_stack_aggregate_stats { + }; + + struct gazelle_stat_pkts { +- uint16_t conn_num; +- uint32_t mbufpool_avail_cnt; +- uint64_t call_msg_cnt; +- uint64_t call_alloc_fail; + struct gazelle_stack_stat stack_stat; + struct gazelle_wakeup_stat wakeup_stat; ++ struct gazelle_rpc_stat rpc_stat; + struct gazelle_stack_aggregate_stats aggregate_stats; + }; + +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index 7da7473..7ff7163 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -388,13 +388,16 @@ static int rpc_call_shadow_fd(int stack_id, int fd, const struct sockaddr *addr, + + static void callback_accept(struct rpc_msg *msg) + { ++ struct lwip_sock *sock; + int fd = msg->args[MSG_ARG_0].i; + msg->result = -1; +- struct protocol_stack *stack = get_protocol_stack(); + + int accept_fd = lwip_accept4(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p, msg->args[MSG_ARG_3].i); + if (accept_fd < 0) { +- stack->stats.accept_fail++; ++ sock = lwip_get_socket(fd); ++ if (!POSIX_IS_CLOSED(sock)) { ++ SOCK_WAIT_STAT(sock->sk_wait, accept_fail, 1); ++ } + LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd); + return; + } +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 060b3b3..89a6acb 100644 +--- 
a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -49,7 +49,7 @@ struct sockio_ops { + const struct sockaddr *to, socklen_t tolen); + void (*stack_udp_send)(struct lwip_sock *sock); + +- ssize_t (*stack_udp_readmsg)(struct lwip_sock *sock, struct msghdr *msg, int flags); ++ ssize_t (*stack_udp_readmsg)(struct lwip_sock *sock, struct msghdr *msg, size_t len, int flags); + + ssize_t (*stack_tcp_write)(struct lwip_sock *sock, const char *data, size_t len, int flags); + void (*stack_tcp_send)(struct lwip_sock *sock); +@@ -260,6 +260,8 @@ static uint16_t stack_udp_write_one(const struct lwip_sock *sock, struct mbox_ri + mr->ops->enqueue_burst(mr, (void **)&nbuf, 1); + mr->app_free_count -= 1; + ++ SOCK_WAIT_STAT(sock->sk_wait, app_write_cnt, 1); ++ + return len; + } + +@@ -268,7 +270,7 @@ static uint16_t stack_udp_write_bulk(const struct lwip_sock *sock, struct mbox_r + const struct sockaddr *to, socklen_t tolen) + { + struct pbuf *pbuf_pkts[UDP_SND_QUEUELEN_MAX]; +- unsigned pbuf_num; ++ unsigned pbuf_num = 0; + struct netbuf *nbuf; + uint16_t payload_size; + uint8_t optlen; +@@ -328,6 +330,8 @@ static uint16_t stack_udp_write_bulk(const struct lwip_sock *sock, struct mbox_r + mr->ops->push_tail(mr, nbuf); + } + ++ SOCK_WAIT_STAT(sock->sk_wait, app_write_cnt, pbuf_num); ++ + return copied_total; + } + +@@ -473,7 +477,7 @@ static void rtc_stack_udp_send(struct lwip_sock *sock) + } while (output_again); + } + +-static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, int flags) ++static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, size_t len, int flags) + { + struct mbox_ring *mr = &sock->conn->recvmbox->mring; + struct pbuf **extcache_list; +@@ -505,6 +509,8 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, int + err = lwip_recvfrom_udp_raw(sock, flags | MSG_PEEK, msg, &copied_total, 0); + sock->lastdata.netbuf = NULL; + ++ SOCK_WAIT_STAT(sock->sk_wait, app_read_cnt, 1); ++ SOCK_WAIT_STAT(sock->sk_wait, sock_rx_drop, copied_total < len ? 1 : 0); + if (get_protocol_stack_group()->latency_start) + calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_READ_LSTACK, 0); + +@@ -532,6 +538,8 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, int + return copied_total; + } + out: ++ SOCK_WAIT_STAT(sock->sk_wait, read_null, 1); ++ + set_errno(err_to_errno(err)); + return -1; + } +@@ -561,6 +569,8 @@ static uint16_t rtw_stack_tcp_write_one(const struct lwip_sock *sock, struct mbo + mr->ops->enqueue_burst(mr, (void **)&p, 1); + mr->app_free_count -= 1; + ++ SOCK_WAIT_STAT(sock->sk_wait, app_write_cnt, 1); ++ + return len; + } + +@@ -582,6 +592,7 @@ static uint16_t rtw_stack_tcp_write_bulk(const struct lwip_sock *sock, struct mb + + write_pbuf_bulk(pbuf_pkts, pbuf_num, TCP_MSS, data, len, 0); + ++ SOCK_WAIT_STAT(sock->sk_wait, app_write_cnt, pbuf_num); + if (get_protocol_stack_group()->latency_start) + calculate_lstack_latency(sock->stack_id, pbuf_pkts, pbuf_num, GAZELLE_LATENCY_WRITE_INTO_RING, 0); + +@@ -670,6 +681,7 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + } + + copied_total = rtw_stack_tcp_append(mr, data, LWIP_MIN(TCP_MSS, total_copy_len), flags); ++ SOCK_WAIT_STAT(sock->sk_wait, sock_tx_merge, copied_total > 0 ? 
1 : 0); + if (copied_total == total_copy_len) { + return copied_total; + } +@@ -843,6 +855,7 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + break; + } + ++ SOCK_WAIT_STAT(sock->sk_wait, app_read_cnt, 1); + if (get_protocol_stack_group()->latency_start) + calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_READ_APP_CALL, sys_now_us()); + +@@ -879,6 +892,8 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + return copied_total; + } + ++ SOCK_WAIT_STAT(sock->sk_wait, read_null, 1); ++ + set_errno(err_to_errno(err)); + if (err == ERR_CLSD) { + return 0; +@@ -1133,10 +1148,10 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + msg.msg_iovlen = 1; + msg.msg_name = from; + msg.msg_namelen = (fromlen ? *fromlen : 0); +- recvd = ioops.stack_udp_readmsg(sock, &msg, flags); ++ recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { + if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { +- recvd = ioops.stack_udp_readmsg(sock, &msg, flags); ++ recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + } + } + if (recvd > 0 && fromlen != NULL) { +@@ -1154,11 +1169,11 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + { + struct lwip_sock *sock = lwip_get_socket(fd); +- ssize_t ret, recvd = 0; ++ ssize_t len, recvd = 0; + +- ret = lwip_recvmsg_check(NULL, msg, flags); +- if (unlikely(ret <= 0)) { +- return ret; ++ len = lwip_recvmsg_check(NULL, msg, flags); ++ if (unlikely(len <= 0)) { ++ return len; + } + + if (unlikely(!sock->affinity_numa)) { +@@ -1169,20 +1184,20 @@ ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + switch (NETCONN_TYPE(sock->conn)) { + case NETCONN_TCP: + for (int i = 0; i < msg->msg_iovlen; ++i) { +- ret = sockio_recvfrom(fd, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags, NULL, NULL); +- if (ret <= 0) { ++ len = sockio_recvfrom(fd, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags, NULL, NULL); ++ if (len <= 0) { + if (recvd == 0) +- recvd = ret; ++ recvd = len; + break; + } +- recvd += ret; ++ recvd += len; + } + break; + case NETCONN_UDP: +- recvd = ioops.stack_udp_readmsg(sock, msg, flags); ++ recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { + if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { +- recvd = ioops.stack_udp_readmsg(sock, msg, flags); ++ recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + } + } + break; +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 7e2a706..bb47830 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -65,6 +65,18 @@ struct rte_mempool *mem_get_rpc_pool(int stack_id) + return g_mem_stack_group[stack_id].rpc_pool; + } + ++unsigned mem_stack_mbuf_pool_count(int stack_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ return rte_mempool_avail_count(ms->mbuf_pool); ++} ++ ++unsigned mem_stack_rpc_pool_count(int stack_id) ++{ ++ struct mem_stack *ms = mem_stack_get(stack_id); ++ return rte_mempool_avail_count(ms->rpc_pool); ++} ++ + static inline bool mem_thread_group_in_used(const struct mem_thread_group *mt_grooup, uint32_t timeout) + { + return mt_grooup->used_flag || +@@ -541,12 +553,6 @@ int mem_stack_mpcache_init(int stack_id, unsigned cpu_id) + return 0; + } + 
+-unsigned mem_stack_mbuf_pool_count(int stack_id) +-{ +- struct mem_stack *ms = mem_stack_get(stack_id); +- return rte_mempool_avail_count(ms->mbuf_pool); +-} +- + static void mem_thread_cache_flush(struct mem_thread *mt) + { + struct mem_stack *ms = mem_stack_get(mt->stack_id); +diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c +index 3b3bd75..c84b7b1 100644 +--- a/src/lstack/core/lstack_stack_stat.c ++++ b/src/lstack/core/lstack_stack_stat.c +@@ -221,6 +221,8 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + + lstack_get_low_power_info(&dfx->low_power_info); + ++ stack->stats.conn_num = stack->conn_num; ++ stack->stats.mbuf_pool_cnt = mem_stack_mbuf_pool_count(stack->stack_idx); + int32_t ret = memcpy_s(&dfx->data.pkts.stack_stat, sizeof(struct gazelle_stack_stat), + &stack->stats, sizeof(struct gazelle_stack_stat)); + if (ret != EOK) { +@@ -229,15 +231,9 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + } + + sock_wait_group_stat(stack->stack_idx, &dfx->data.pkts.wakeup_stat); ++ rpc_get_stat(&stack->rpc_queue, &dfx->data.pkts.rpc_stat); + +- dfx->data.pkts.call_alloc_fail = rpc_stats_get()->call_alloc_fail; +- +- int32_t rpc_call_result = rpc_msgcnt(&stack->rpc_queue); +- dfx->data.pkts.call_msg_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; +- +- dfx->data.pkts.mbufpool_avail_cnt = mem_stack_mbuf_pool_count(stack->stack_idx); +- +- dfx->data.pkts.conn_num = stack->conn_num; ++ return; + } + + static void get_stack_dfx_data_proto(struct gazelle_stack_dfx_data *dfx, struct protocol_stack *stack, +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 7b3e432..84e5814 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -10,6 +10,8 @@ + * See the Mulan PSL v2 for more details. 
+ */ + ++#include ++ + #include + #include + +@@ -20,11 +22,13 @@ + #include "lstack_thread_rpc.h" + #include "lstack_mempool.h" + +-static struct rpc_stats g_rpc_stats; ++static struct gazelle_rpc_stat g_rpc_stats = {0}; + +-struct rpc_stats *rpc_stats_get(void) ++void rpc_get_stat(rpc_queue *queue, struct gazelle_rpc_stat *stat) + { +- return &g_rpc_stats; ++ g_rpc_stats.rpc_pool_cnt = mem_stack_rpc_pool_count(queue->queue_id); ++ g_rpc_stats.call_msg_cnt = rpc_msgcnt(queue); ++ memcpy_s(stat, sizeof(struct gazelle_rpc_stat), &g_rpc_stats, sizeof(struct gazelle_rpc_stat)); + } + + __rte_always_inline +@@ -114,11 +118,7 @@ int rpc_poll_msg(rpc_queue *queue, int max_num) + } + msg = container_of(node, struct rpc_msg, queue_node); + +- if (likely(msg->func)) { +- msg->func(msg); +- } else { +- g_rpc_stats.call_null++; +- } ++ msg->func(msg); + + if (msg->flags & RPC_MSG_RECALL) { + msg->flags &= ~RPC_MSG_RECALL; +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index c67df93..381290c 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -129,10 +129,17 @@ void sock_wait_group_stat(int stack_id, struct gazelle_wakeup_stat *stat) + list_for_each_node(node, next, &g_wait_group.group_list) { + sk_wait = list_entry(node, struct sock_wait, group_node); + +- if (sk_wait->affinity.bind_stack_id == stack_id) { +- memcpy_s(stat, sizeof(struct gazelle_wakeup_stat), +- &sk_wait->stat, sizeof(struct gazelle_wakeup_stat)); +- } ++ if (sk_wait->affinity.bind_stack_id != stack_id) ++ continue; ++ ++ stat->kernel_events += sk_wait->stat.kernel_events ; ++ stat->app_events += sk_wait->stat.app_events ; ++ stat->accept_fail += sk_wait->stat.accept_fail ; ++ stat->app_write_cnt += sk_wait->stat.app_write_cnt ; ++ stat->app_read_cnt += sk_wait->stat.app_read_cnt ; ++ stat->read_null += sk_wait->stat.read_null ; ++ stat->sock_rx_drop += sk_wait->stat.sock_rx_drop ; ++ stat->sock_tx_merge += sk_wait->stat.sock_tx_merge ; + } + + rte_spinlock_unlock(&g_wait_group.group_list_lock); +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index e636fda..bd170bb 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -302,9 +302,10 @@ int mem_thread_manager_init(void); + void mem_thread_cache_free(struct mem_thread *mt); + int mem_thread_cache_init(struct mem_thread *mt, int stack_id); + +-unsigned mem_stack_mbuf_pool_count(int stack_id); + struct rte_mempool *mem_get_mbuf_pool(int stack_id); + struct rte_mempool *mem_get_rpc_pool(int stack_id); ++unsigned mem_stack_mbuf_pool_count(int stack_id); ++unsigned mem_stack_rpc_pool_count(int stack_id); + + void *mem_get_rpc(int stack_id); + void mem_put_rpc(void *obj); +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 32dde53..013ff29 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -16,6 +16,7 @@ + #include + #include + ++#include "common/gazelle_dfx_msg.h" + #include "lstack_lockless_queue.h" + #include "lstack_interrupt.h" + +@@ -32,12 +33,6 @@ struct rpc_queue { + uint16_t queue_id; + }; + +-struct rpc_stats { +- uint16_t call_null; +- uint64_t call_alloc_fail; +-}; +-struct rpc_stats *rpc_stats_get(void); +- + union rpc_msg_arg { + int i; + unsigned int u; +@@ -67,6 +62,8 @@ struct rpc_msg { + lockless_queue_node queue_node; + }; + ++void rpc_get_stat(rpc_queue *queue, struct gazelle_rpc_stat *stat); ++ + struct rpc_msg 
*rpc_msg_alloc(int stack_id, rpc_func_t func);
+ void rpc_msg_free(struct rpc_msg *msg);
+
+diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h
+index ed154b4..306c6a4 100644
+--- a/src/lstack/include/lstack_wait.h
++++ b/src/lstack/include/lstack_wait.h
+@@ -73,7 +73,6 @@ struct sock_wait {
+
+ /* dfx stat */
+ struct list_node group_node;
+- struct gazelle_wakeup_stat stat;
+
+ /* epoll kernel fd */
+ int epfd;
+@@ -83,6 +82,9 @@ struct sock_wait {
+ unsigned kernel_nfds;
+ struct wait_affinity affinity;
+
++#define SOCK_WAIT_STAT(sk_wait, name, count) do { if ((sk_wait) != NULL && (sk_wait)->type != WAIT_CLOSE) { (sk_wait)->stat.name += (count); } } while (0)
++ struct gazelle_wakeup_stat stat;
++
+ char pad0 __rte_cache_aligned; /* new cache line */
+
+ #if SOCK_WAIT_BATCH_NOTIFY
+diff --git a/src/ltran/ltran_dfx.c b/src/ltran/ltran_dfx.c
+index 1722460..01ff2c3 100644
+--- a/src/ltran/ltran_dfx.c
++++ b/src/ltran/ltran_dfx.c
+@@ -734,26 +734,23 @@ static void show_lstack_stats(struct gazelle_stack_dfx_data *lstack_stat)
+ printf("tx_pkts: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.tx);
+ printf("tx_drop: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.tx_drop);
+ printf("tx_allocmbuf_fail: %-10"PRIu64"\n", lstack_stat->data.pkts.stack_stat.tx_allocmbuf_fail);
++
++ printf("mbuf_pool_freecnt: %-10"PRIu32" ", lstack_stat->data.pkts.stack_stat.mbuf_pool_cnt);
++ printf("conn_num: %-19hu ", lstack_stat->data.pkts.stack_stat.conn_num);
++ printf("wakeup_events: %-14"PRIu64" \n", lstack_stat->data.pkts.stack_stat.wakeup_events);
++
++ printf("rpc_pool_cnt: %-15"PRIu32" ", lstack_stat->data.pkts.rpc_stat.rpc_pool_cnt);
++ printf("call_alloc_fail: %-12"PRIu64" ", lstack_stat->data.pkts.rpc_stat.call_alloc_fail);
++ printf("call_msg: %-19"PRIu64" \n", lstack_stat->data.pkts.rpc_stat.call_msg_cnt);
++
++ printf("kernel_events: %-14"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.kernel_events);
++ printf("app_events: %-17"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.app_events);
+ printf("app_read: %-19"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_read_cnt);
+- printf("read_lwip: %-18"PRIu64" ", lstack_stat->data.pkts.stack_stat.read_lwip_cnt);
+- printf("read_lwip_drop: %-13"PRIu64" \n", lstack_stat->data.pkts.stack_stat.read_lwip_drop);
+ printf("app_write: %-18"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_write_cnt);
+- printf("write_lwip: %-17"PRIu64" ", lstack_stat->data.pkts.stack_stat.write_lwip_cnt);
+- printf("app_write_rpc: %-14"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.app_write_rpc);
+- printf("conn_num: %-19hu ", lstack_stat->data.pkts.conn_num);
+-
+- printf("kernel_events: %-14"PRIu64"\n", lstack_stat->data.pkts.wakeup_stat.kernel_events);
+- printf("wakeup_events: %-14"PRIu64" ", lstack_stat->data.pkts.stack_stat.wakeup_events);
+- printf("app_events: %-17"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_events);
+ printf("read_null: %-18"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.read_null);
+- printf("call_msg: %-19"PRIu64" ", lstack_stat->data.pkts.call_msg_cnt);
+- printf("call_alloc_fail: %-12"PRIu64" ", lstack_stat->data.pkts.call_alloc_fail);
+- printf("call_null: %-18"PRIu64" \n", lstack_stat->data.pkts.stack_stat.call_null);
+- printf("send_pkts_fail: %-13"PRIu64" ", lstack_stat->data.pkts.stack_stat.send_pkts_fail);
+- printf("mbuf_pool_freecnt: %-10"PRIu32" \n", lstack_stat->data.pkts.mbufpool_avail_cnt);
+- printf("accpet_fail: %-16"PRIu64" ", lstack_stat->data.pkts.stack_stat.accept_fail);
+- printf("sock_rx_drop: 
%-15"PRIu64" ", lstack_stat->data.pkts.stack_stat.sock_rx_drop); +- printf("sock_tx_merge: %-16"PRIu64" \n", lstack_stat->data.pkts.stack_stat.sock_tx_merge); ++ printf("sock_rx_drop: %-15"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.sock_rx_drop); ++ printf("sock_tx_merge: %-14"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.sock_tx_merge); ++ printf("accpet_fail: %-16"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.accept_fail); + } + + static void gazelle_print_lstack_stat_detail(struct gazelle_stack_dfx_data *lstack_stat, +-- +2.33.0 + diff --git a/0334-mempool-fix-copy_mbuf_private.patch b/0334-mempool-fix-copy_mbuf_private.patch new file mode 100644 index 0000000..869e6d8 --- /dev/null +++ b/0334-mempool-fix-copy_mbuf_private.patch @@ -0,0 +1,75 @@ +From b189e23541622329d3df7cf95ecf20b2394e3539 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 6 Apr 2025 16:02:40 +0800 +Subject: [PATCH] mempool: fix copy_mbuf_private + +Signed-off-by: Lemmy Huang +--- + src/common/dpdk_common.h | 18 +++++++++++------- + src/lstack/core/lstack_dpdk.c | 2 +- + 2 files changed, 12 insertions(+), 8 deletions(-) + +diff --git a/src/common/dpdk_common.h b/src/common/dpdk_common.h +index cff193c..fca4aa3 100644 +--- a/src/common/dpdk_common.h ++++ b/src/common/dpdk_common.h +@@ -38,9 +38,11 @@ struct latency_timestamp { + uint16_t type; // latency type + }; + struct mbuf_private { +- /* struct pbuf_custom must at first */ ++ /* struct pbuf_custom must at first. do not copy in copy_mbuf_private() !!! */ + struct pbuf_custom pc; +- int stack_id; /* the stack to which buf belongs */ ++ /* the stack to which buf belongs. do not copy in copy_mbuf_private() !!! */ ++ int stack_id; ++ + struct latency_timestamp lt; + }; + +@@ -57,11 +59,16 @@ static __rte_always_inline struct mbuf_private *pbuf_to_private(const struct pbu + return mbuf_to_private(pbuf_to_mbuf(p)); + } + ++static __rte_always_inline void copy_mbuf_private(struct mbuf_private *dst, const struct mbuf_private *src) ++{ ++ rte_memcpy(&dst->lt, &src->lt, sizeof(struct latency_timestamp)); ++} ++ + /* NOTE!!! magic code, even the order. + * I wrote it carefully, and check the assembly. for example, there is 24 ins in A72, + * and if there is no cache miss, it only take less than 20 cycle(store pipe is the bottleneck). + */ +-static __rte_always_inline void copy_mbuf(struct rte_mbuf *dst, struct rte_mbuf *src) ++static __rte_always_inline void copy_mbuf(struct rte_mbuf *dst, const struct rte_mbuf *src) + { + /* In the direction of tx, data is copied from lstack to ltran. It is necessary to judge whether + the length of data transmitted from lstack has been tampered with to prevent overflow +@@ -79,10 +86,7 @@ static __rte_always_inline void copy_mbuf(struct rte_mbuf *dst, struct rte_mbuf + uint8_t *src_data = rte_pktmbuf_mtod(src, void*); + rte_memcpy(dst_data, src_data, data_len); + +- // copy private date. 
+- dst_data = (uint8_t *)mbuf_to_private(dst); +- src_data = (uint8_t *)mbuf_to_private(src); +- rte_memcpy(dst_data, src_data, sizeof(struct mbuf_private)); ++ copy_mbuf_private(mbuf_to_private(dst), mbuf_to_private(src)); + } + + static __rte_always_inline void time_stamp_into_mbuf(uint32_t rx_count, struct rte_mbuf *buf[], uint64_t time_stamp) +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 8f896c9..baba571 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -196,7 +196,7 @@ int32_t fill_mbuf_to_ring(int stack_id, struct rte_ring *ring, uint32_t mbuf_num + batch = LWIP_MIN(remain, RING_SIZE(VDEV_RX_QUEUE_SZ)); + + ret = mem_get_mbuf_bulk(stack_id, free_buf, batch, true); +- if (ret != 0) { ++ if (ret == 0) { + LSTACK_LOG(ERR, LSTACK, "cannot alloc mbuf for ring, count: %u ret=%d\n", batch, ret); + return -1; + } +-- +2.33.0 + diff --git a/0335-socket-fix-connect-blocking.patch b/0335-socket-fix-connect-blocking.patch new file mode 100644 index 0000000..eb9d21e --- /dev/null +++ b/0335-socket-fix-connect-blocking.patch @@ -0,0 +1,198 @@ +From 3d17332feaa4104d614fb0ba72d29e56b4e87cf2 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Wed, 16 Apr 2025 09:56:46 +0800 +Subject: [PATCH] socket: fix connect blocking + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 17 +++++++++++------ + src/lstack/api/lstack_sockctl.c | 8 ++++---- + src/lstack/api/lstack_sockio.c | 14 +++++++------- + src/lstack/core/lstack_wait.c | 2 +- + src/lstack/include/lstack_epoll.h | 2 +- + 5 files changed, 24 insertions(+), 19 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 3940f43..ed6d2f0 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -814,12 +814,12 @@ void epoll_api_init(posix_api_t *api) + api->select_fn = lstack_select; + } + +-bool sock_event_wait(struct lwip_sock *sock, bool noblocking) ++bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking) + { + bool rtc_mode = get_global_cfg_params()->stack_mode_rtc; + uint32_t start; + int timeout; +- unsigned pending; ++ unsigned pending = 0; + + if (!rtc_mode && noblocking) + return false; +@@ -829,6 +829,7 @@ bool sock_event_wait(struct lwip_sock *sock, bool noblocking) + } + if (!(sock->sk_wait->type & WAIT_BLOCK)) { + sock->sk_wait->type |= WAIT_BLOCK; ++ rte_wmb(); + } + + if (rtc_mode) { +@@ -840,13 +841,17 @@ bool sock_event_wait(struct lwip_sock *sock, bool noblocking) + timeout = sock->conn->recv_timeout == 0 ? 
-1 : sock->conn->recv_timeout; + start = sys_now(); + do { +- pending = sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_RCVPLUS, 0) | ++ pending = sock_event_hold_pending(sock, WAIT_BLOCK, evt, 0) | + sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0); +- if (pending) { +- return true; ++ if (pending != 0) { ++ break; + } + timeout = sock->sk_wait->timedwait_fn(sock->sk_wait, timeout, start); + } while (timeout != 0); + +- return false; ++ if (evt == NETCONN_EVT_SENDPLUS) { ++ /* remove WAIT_BLOCK type */ ++ sock->sk_wait->type &= ~WAIT_BLOCK; ++ } ++ return pending != 0; + } +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index 7ff7163..d1ef36a 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -448,7 +448,7 @@ static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, s + + if (ret < 0 && errno == EINPROGRESS) { + struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn))) { ++ if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, netconn_is_nonblocking(sock->conn))) { + ret = 0; + } + } +@@ -607,7 +607,7 @@ static int stack_broadcast_accept4(int fd, struct sockaddr *addr, socklen_t *add + + min_sock = get_min_accept_sock(fd); + if (min_sock == NULL) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { + min_sock = get_min_accept_sock(fd); + } + } +@@ -786,7 +786,7 @@ static int rtc_connect(int s, const struct sockaddr *name, socklen_t namelen) + ret = lwip_connect(s, name, namelen); + if (ret < 0 && errno == EINPROGRESS) { + struct lwip_sock *sock = lwip_get_socket(s); +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn))) { ++ if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, netconn_is_nonblocking(sock->conn))) { + ret = 0; + } + } +@@ -804,7 +804,7 @@ static int rtc_accept4(int s, struct sockaddr *addr, socklen_t *addrlen, int fla + + ret = lwip_accept4(s, addr, addrlen, flags); + if (ret < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { + ret = lwip_accept4(s, addr, addrlen, flags); + } + } +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 89a6acb..438f1ce 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -1128,7 +1128,7 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + /* TODO: support MSG_WAITALL */ + recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { + recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); + } + } +@@ -1150,7 +1150,7 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + msg.msg_namelen = (fromlen ? 
*fromlen : 0); + recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { + recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + } + } +@@ -1196,7 +1196,7 @@ ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + case NETCONN_UDP: + recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { + recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + } + } +@@ -1233,7 +1233,7 @@ ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, + ret = ioops.stack_tcp_write(sock, mem, len, flags); + if (ret < 0) { + if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, true); ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); + } + } else { + ioops.stack_tcp_send(sock); +@@ -1243,7 +1243,7 @@ ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, + ret = ioops.stack_udp_write(sock, mem, len, flags, to, tolen); + if (ret < 0) { + if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, true); ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); + } + } else { + ioops.stack_udp_send(sock); +@@ -1284,7 +1284,7 @@ ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + ret = ioops.stack_tcp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more); + if (ret < 0) { + if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, true); ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); + } + break; + } +@@ -1302,7 +1302,7 @@ ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + ret = ioops.stack_udp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more, NULL, 0); + if (ret < 0) { + if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, true); ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); + } + break; + } +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 381290c..866e293 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -421,7 +421,7 @@ unsigned sock_event_hold_pending(const struct lwip_sock *sock, + } + break; + case NETCONN_EVT_SENDPLUS: +- if (sock->sk_event.events & EPOLLOUT) { ++ if (sock->sk_event.events & EPOLLOUT || type & WAIT_BLOCK) { + if (len > 0 || + NETCONN_ALLOW_SEND(sock)) { + event = EPOLLOUT; +diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h +index 655e178..99a7162 100644 +--- a/src/lstack/include/lstack_epoll.h ++++ b/src/lstack/include/lstack_epoll.h +@@ -23,6 +23,6 @@ void poll_destruct_wait(void); + + int lstack_epoll_close(int epfd); + void epoll_api_init(posix_api_t *api); +-bool sock_event_wait(struct lwip_sock *sock, bool noblocking); ++bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking); + + #endif /* _GAZELLE_EPOLL_H_ */ +-- +2.33.0 + diff --git a/0336-socket-fix-stack_tcp_read-do-not-recv_finish_burst.patch b/0336-socket-fix-stack_tcp_read-do-not-recv_finish_burst.patch new file mode 100644 index 0000000..9e7972c --- /dev/null +++ b/0336-socket-fix-stack_tcp_read-do-not-recv_finish_burst.patch @@ -0,0 +1,32 @@ +From 
062a2006981cd9918a3b698241fe1cd64b99fbdb Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Thu, 17 Apr 2025 14:52:26 +0800 +Subject: [PATCH] socket: fix stack_tcp_read do not recv_finish_burst + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 438f1ce..0ebd9e2 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -838,6 +838,14 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + copied_total += buf_copy_len; + total_copy_len -= buf_copy_len; + mr->app_recvd_len += buf_copy_len; ++ ++ if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || ++ mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { ++ if (sock->lastdata.pbuf == NULL) { ++ mr->ops->recv_finish_burst(mr); ++ mr->app_queued_num = 0; ++ } ++ } + } + + while (total_copy_len > 0) { +-- +2.33.0 + diff --git a/0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch b/0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch new file mode 100644 index 0000000..d9127dc --- /dev/null +++ b/0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch @@ -0,0 +1,167 @@ +From b928e5c8027272c75f46395aa2bef25962d844f3 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 18 Apr 2025 14:41:20 +0800 +Subject: [PATCH] tcp: add GAZELLE_TCP_ASYNC_RECVD fix mbuf OOM caused by + untimely sockio_peek_recv_free + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 68 ++++++++++++++-------------------- + 1 file changed, 27 insertions(+), 41 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 0ebd9e2..46c8992 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -910,47 +910,34 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + } + + +-#define RECVD_UNCOMMITTED(msg) ((msg)->args[MSG_ARG_2].ul) +-#define RECVD_CURR_SEQ(msg) ((msg)->args[MSG_ARG_3].ul) +-#define RECVD_LAST_SEQ(msg) ((msg)->args[MSG_ARG_4].ul) +- +-static inline bool rpc_commit_tcp_recvd(struct rpc_msg *recvmsg, unsigned long threshold) ++#if GAZELLE_TCP_ASYNC_RECVD ++#define RECVD_UNSUBMITED(msg) ((msg)->args[MSG_ARG_2].ul) ++static inline bool rpc_submit_tcp_recvd(struct rpc_msg *recvmsg, size_t threshold, size_t recvd) + { +- if (RECVD_UNCOMMITTED(recvmsg) >= threshold) { +- __atomic_add_fetch(&RECVD_CURR_SEQ(recvmsg), RECVD_UNCOMMITTED(recvmsg), __ATOMIC_RELEASE); +- RECVD_UNCOMMITTED(recvmsg) = 0; ++ RECVD_UNSUBMITED(recvmsg) += recvd; ++ if (RECVD_UNSUBMITED(recvmsg) >= threshold) { ++ RECVD_UNSUBMITED(recvmsg) = 0; + return true; + } + return false; + } + +-#if TCP_RECV_AND_UPDATE +-static inline unsigned long rpc_read_tcp_recvd(struct rpc_msg *recvmsg) +-{ +- unsigned long curr_recvd_seq; +- unsigned long recvd; +- +- curr_recvd_seq = __atomic_load_n(&RECVD_CURR_SEQ(recvmsg), __ATOMIC_ACQUIRE); +- recvd = curr_recvd_seq - RECVD_LAST_SEQ(recvmsg); +- if (recvd > 0) { +- /* update last recvd seq */ +- RECVD_LAST_SEQ(recvmsg) = curr_recvd_seq; +- } +- return recvd; +-} +- + static void callback_tcp_recvd(struct rpc_msg *recvmsg) + { + struct lwip_sock *sock = recvmsg->args[MSG_ARG_0].p; +- unsigned long recvd; ++ struct mbox_ring *mr; ++ u32_t recvd; + +- recvd = rpc_read_tcp_recvd(recvmsg); +- lwip_tcp_recvd(sock->conn, recvd, 0); ++ mr = &sock->conn->recvmbox->mring; ++ if (mr->flags & MBOX_FLAG_PEEK) { ++ sockio_peek_recv_free(mr, 0); ++ } + ++ recvd = lwip_netconn_get_recvd(sock->conn, 0, 0); 
++ lwip_netconn_update_recvd(sock->conn, recvd); + recvmsg->result = recvd; + return; + } +-#endif /* TCP_RECV_AND_UPDATE */ + + static inline int rpc_call_tcp_recvd(rpc_queue *queue, struct lwip_sock *sock, size_t recvd, int flags) + { +@@ -960,22 +947,23 @@ static inline int rpc_call_tcp_recvd(rpc_queue *queue, struct lwip_sock *sock, s + recvmsg->args[MSG_ARG_0].p = sock; + recvmsg->result = 0; + +- RECVD_UNCOMMITTED(recvmsg) += recvd; +- if (rpc_commit_tcp_recvd(recvmsg, TCP_WND_UPDATE_THRESHOLD << 1)) { ++ if (rpc_submit_tcp_recvd(recvmsg, TCP_WND >> 1, recvd)) { + rpc_async_call(queue, recvmsg, RPC_MSG_REUSE); + } +- + return 0; + } ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + + static void rtw_stack_tcp_recvd(struct lwip_sock *sock, ssize_t recvd, int flags) + { ++#if GAZELLE_TCP_ASYNC_RECVD + struct protocol_stack *stack = get_protocol_stack_by_id(sock->stack_id); + + if (recvd <= 0 || flags & MSG_PEEK) { + return; + } + rpc_call_tcp_recvd(&stack->rpc_queue, sock, recvd, flags); ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + } + + static void rtc_stack_tcp_recvd(struct lwip_sock *sock, ssize_t recvd, int flags) +@@ -1012,14 +1000,14 @@ static void callback_tcp_send(struct rpc_msg *sendmsg) + LSTACK_LOG(ERR, LSTACK, "tcp_output failed, sock %p, err %u\n", sock, err); + } + +-#if TCP_RECV_AND_UPDATE ++#if GAZELLE_TCP_ASYNC_RECVD + struct rpc_msg *recvmsg; +- if (RECVD_UNCOMMITTED(sendmsg)) { +- RECVD_UNCOMMITTED(sendmsg) = 0; ++ if (RECVD_UNSUBMITED(sendmsg)) { ++ RECVD_UNSUBMITED(sendmsg) = 0; + recvmsg = sock_mbox_private_get(sock->conn->recvmbox); + callback_tcp_recvd(recvmsg); + } +-#endif /* TCP_RECV_AND_UPDATE */ ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + + return; + } +@@ -1036,11 +1024,11 @@ static inline int rpc_call_tcp_send(rpc_queue *queue, struct lwip_sock *sock) + sendmsg->args[MSG_ARG_0].p = sock; + sendmsg->args[MSG_ARG_1].p = mem_thread_migrate_get(sock->stack_id); + +-#if TCP_RECV_AND_UPDATE ++#if GAZELLE_TCP_ASYNC_RECVD + struct rpc_msg *recvmsg; + recvmsg = sock_mbox_private_get(sock->conn->recvmbox); +- RECVD_UNCOMMITTED(sendmsg) = rpc_commit_tcp_recvd(recvmsg, TCP_WND_UPDATE_THRESHOLD); +-#endif /* TCP_RECV_AND_UPDATE */ ++ RECVD_UNSUBMITED(sendmsg) = rpc_submit_tcp_recvd(recvmsg, TCP_WND >> 2, 0); ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + + rpc_async_call(queue, sendmsg, RPC_MSG_REUSE); + return 0; +@@ -1140,11 +1128,9 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); + } + } +-#if TCP_RECV_AND_UPDATE + if (recvd > 0) { + ioops.stack_tcp_recvd(sock, recvd, flags); + } +-#endif /* TCP_RECV_AND_UPDATE */ + break; + case NETCONN_UDP: + vec.iov_base = mem; +@@ -1412,11 +1398,11 @@ static int sockio_mbox_init(struct lwip_sock *sock) + switch (NETCONN_TYPE(sock->conn)) { + case NETCONN_TCP: + ret = sock_mbox_private_init(sendmbox, callback_tcp_send); +-#if TCP_RECV_AND_UPDATE ++#if GAZELLE_TCP_ASYNC_RECVD + if (sys_mbox_valid(&recvmbox)) { + ret |= sock_mbox_private_init(recvmbox, callback_tcp_recvd); + } +-#endif /* TCP_RECV_AND_UPDATE */ ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ + break; + case NETCONN_UDP: + ret = sock_mbox_private_init(sendmbox, callback_udp_send); +-- +2.33.0 + diff --git a/0338-socket-fix-tcp-closed.patch b/0338-socket-fix-tcp-closed.patch new file mode 100644 index 0000000..41f9dcd --- /dev/null +++ b/0338-socket-fix-tcp-closed.patch @@ -0,0 +1,95 @@ +From 933f0e59f77fe450ab255c07c9f4b994de4be11f Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sat, 19 Apr 2025 21:20:52 
+0800 +Subject: [PATCH] socket: fix tcp closed + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 48 ++++++++++++++-------------------- + 1 file changed, 19 insertions(+), 29 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 46c8992..0bd31ab 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -817,7 +817,7 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + struct mbox_ring *mr = &sock->conn->recvmbox->mring; + struct pbuf **extcache_list; + err_t err = ERR_OK; +- struct pbuf *p; ++ struct pbuf *p = NULL; + + uint32_t buf_copy_len; + uint32_t total_copy_len = len; +@@ -832,50 +832,40 @@ static ssize_t stack_tcp_read(struct lwip_sock *sock, char *data, size_t len, in + extcache_list = (struct pbuf **)&mr->st_obj; + } + +- if (sock->lastdata.pbuf != NULL) { +- // TODO: support MSG_PEEK +- buf_copy_len = pbuf_copy_and_free(&sock->lastdata.pbuf, extcache_list, data, total_copy_len); +- copied_total += buf_copy_len; +- total_copy_len -= buf_copy_len; +- mr->app_recvd_len += buf_copy_len; +- +- if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || +- mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { +- if (sock->lastdata.pbuf == NULL) { +- mr->ops->recv_finish_burst(mr); +- mr->app_queued_num = 0; +- } +- } +- } +- + while (total_copy_len > 0) { +- if (mr->ops->recv_start_burst(mr, (void **)&p, 1) == 0) { +- if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { +- err = ERR_CONN; +- } else { +- err = ERR_WOULDBLOCK; ++ if (sock->lastdata.pbuf == NULL) { ++ if (mr->ops->recv_start_burst(mr, (void **)&sock->lastdata.pbuf, 1) == 0) { ++ if (unlikely(sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0))) { ++ err = ERR_CONN; ++ } else { ++ err = ERR_WOULDBLOCK; ++ } ++ break; + } +- break; ++ mr->app_queued_num++; ++ SOCK_WAIT_STAT(sock->sk_wait, app_read_cnt, 1); + } +- mr->app_queued_num++; +- if (unlikely(lwip_netconn_is_err_msg(p, &err))) { ++ ++ if (unlikely(lwip_netconn_is_err_msg(sock->lastdata.pbuf, &err))) { + API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, copied_total); + break; + } + +- SOCK_WAIT_STAT(sock->sk_wait, app_read_cnt, 1); +- if (get_protocol_stack_group()->latency_start) ++ if (get_protocol_stack_group()->latency_start) { ++ p = sock->lastdata.pbuf; + calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_READ_APP_CALL, sys_now_us()); ++ } + +- sock->lastdata.pbuf = p; + // TODO: support MSG_PEEK + buf_copy_len = pbuf_copy_and_free(&sock->lastdata.pbuf, extcache_list, data + copied_total, total_copy_len); + copied_total += buf_copy_len; + total_copy_len -= buf_copy_len; + mr->app_recvd_len += buf_copy_len; + +- if (get_protocol_stack_group()->latency_start) ++ if (get_protocol_stack_group()->latency_start) { + calculate_lstack_latency(sock->stack_id, &p, 1, GAZELLE_LATENCY_READ_LSTACK, 0); ++ p = NULL; ++ } + + if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || + mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { +-- +2.33.0 + diff --git a/0339-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch b/0339-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch new file mode 100644 index 0000000..27c3a6e --- /dev/null +++ b/0339-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch @@ -0,0 +1,139 @@ +From 56f8d17ddf0a8668d87d2060f97f6c177bc5ce3d Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 20 Apr 2025 12:54:06 +0800 +Subject: [PATCH] socket: fix sk_wait cannot be interrupted by signals + 
+Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 19 +++++++++++++------ + src/lstack/core/lstack_wait.c | 8 ++++---- + src/lstack/include/lstack_wait.h | 2 +- + 3 files changed, 18 insertions(+), 11 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index ed6d2f0..19cc9d6 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -40,13 +40,16 @@ static int rtc_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint3 + + if (timeout > 0 && timeout <= (int)(sys_now() - start)) { + timeout = 0; ++ } else if (timeout < 0) { ++ errno = 0; + } + return timeout; + } + + static int rtw_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint32_t start) + { +- return sys_mutex_timedlock_internal(&sk_wait->mutex, timeout); ++ /* when sem interrupted by signals, errno = EINTR */ ++ return sys_sem_wait_internal(&sk_wait->sem, timeout); + } + + static void rtc_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +@@ -83,7 +86,7 @@ static void rtw_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event + } + rte_spinlock_unlock(&sk_wait->epcb.lock); + +- sys_mutex_unlock_internal(&sk_wait->mutex); ++ sys_sem_signal_internal(&sk_wait->sem); + } + + static void rtw_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) +@@ -112,7 +115,7 @@ static void rtw_poll_notify_event(struct sock_wait *sk_wait, struct sock_event * + return; + } + #endif /* SOCK_WAIT_BATCH_NOTIFY */ +- sys_mutex_unlock_internal(&sk_wait->mutex); ++ sys_sem_signal_internal(&sk_wait->sem); + } + static void rtw_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) + { +@@ -433,7 +436,7 @@ int lstack_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int t + } + + timeout = sk_wait->timedwait_fn(sk_wait, timeout, start); +- } while (timeout != 0); ++ } while (timeout > 0 || (timeout < 0 && errno == 0)); + + sk_wait->stat.app_events += lwip_num; + sk_wait->stat.kernel_events += kernel_num; +@@ -706,7 +709,7 @@ int lstack_poll(struct pollfd *fds, nfds_t nfds, int timeout) + } + + timeout = sk_wait->timedwait_fn(sk_wait, timeout, start); +- } while (timeout != 0); ++ } while (timeout > 0 || (timeout < 0 && errno == 0)); + + sk_wait->stat.app_events += lwip_num; + sk_wait->stat.kernel_events += kernel_num; +@@ -847,7 +850,11 @@ bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocki + break; + } + timeout = sock->sk_wait->timedwait_fn(sock->sk_wait, timeout, start); +- } while (timeout != 0); ++ } while (timeout > 0 || (timeout < 0 && errno == 0)); ++ ++ if (errno == ETIMEDOUT) { ++ errno = EAGAIN; ++ } + + if (evt == NETCONN_EVT_SENDPLUS) { + /* remove WAIT_BLOCK type */ +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 866e293..816f340 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -208,7 +208,7 @@ void* kernel_wait_thread(void *arg) + if (sk_wait->type == WAIT_CLOSE) + continue; + rte_atomic16_set(&sk_wait->kernel_pending, true); +- sys_mutex_unlock_internal(&sk_wait->mutex); ++ sys_sem_signal_internal(&sk_wait->sem); + } + usleep(KERNEL_EVENT_WAIT_US); + } +@@ -285,7 +285,7 @@ int sock_wait_common_init(struct sock_wait *sk_wait) + { + sk_wait->lwip_nfds = 0; + sk_wait->kernel_nfds = 0; +- sys_mutex_new_internal(&sk_wait->mutex); ++ sys_sem_new_internal(&sk_wait->sem, 0); + + #if SOCK_WAIT_BATCH_NOTIFY + for (int i = 0; i < 
PROTOCOL_STACK_MAX; ++i) { +@@ -318,7 +318,7 @@ void sock_wait_common_free(struct sock_wait *sk_wait) + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + + sock_wait_group_del(sk_wait); +- sys_mutex_free_internal(&sk_wait->mutex); ++ sys_sem_free_internal(&sk_wait->sem); + } + + int sock_wait_kernel_init(struct sock_wait *sk_wait, int epfd, int stack_num) +@@ -543,7 +543,7 @@ unsigned lwip_wait_foreach_notify(int stack_id) + + sock_wait_foreach_event(sk_wait, stack_id); + +- sys_mutex_unlock_internal(&sk_wait->mutex); ++ sys_sem_signal_internal(&sk_wait->sem); + count++; + } + return count; +diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h +index 306c6a4..a1d35dd 100644 +--- a/src/lstack/include/lstack_wait.h ++++ b/src/lstack/include/lstack_wait.h +@@ -98,7 +98,7 @@ struct sock_wait { + /* kernel event flag */ + rte_atomic16_t kernel_pending; + /* run-to-wakeup blocking lock */ +- struct sys_mutex mutex; ++ struct sys_sem sem; /* Do not use mutex, as it cannot be interrupted by signals */ + + union { + struct epoll_cb epcb; +-- +2.33.0 + diff --git a/0340-mempool-modify-mbuf-num-and-rpc_msg-num.patch b/0340-mempool-modify-mbuf-num-and-rpc_msg-num.patch new file mode 100644 index 0000000..f560ef0 --- /dev/null +++ b/0340-mempool-modify-mbuf-num-and-rpc_msg-num.patch @@ -0,0 +1,98 @@ +From 243f24d1a2123537bc4b280064f32d38af450826 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 20 Apr 2025 20:06:24 +0800 +Subject: [PATCH] mempool: modify mbuf num and rpc_msg num + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_dpdk.c | 5 +++-- + src/lstack/core/lstack_mempool.c | 24 +++++++++++++----------- + src/lstack/include/lstack_mempool.h | 2 +- + 3 files changed, 17 insertions(+), 14 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index baba571..ddcd5ae 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -901,8 +901,9 @@ uint32_t dpdk_pktmbuf_mempool_num(void) + { + struct cfg_params *cfg = get_global_cfg_params(); + +- return (MBUFPOOL_RESERVE_NUM + cfg->rxqueue_size + cfg->txqueue_size + +- (cfg->tcp_conn_count * cfg->mbuf_count_per_conn) / cfg->num_queue); ++ return (MBUFPOOL_RESERVE_NUM + MBUFPOOL_CACHE_NUM + ++ cfg->rxqueue_size + cfg->txqueue_size + ++ (cfg->tcp_conn_count * cfg->mbuf_count_per_conn) / cfg->num_queue); + } + + uint32_t dpdk_total_socket_memory(void) +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index bb47830..b03c3ef 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -450,20 +450,15 @@ static const struct mempool_ops mbuf_mp_ops = { + + static struct rte_mempool *mbuf_pool_create(int stack_id, unsigned numa_id) + { +- struct cfg_params *cfg_params = get_global_cfg_params(); + char name[RTE_MEMPOOL_NAMESIZE]; + struct rte_mempool *pool; +- uint32_t total_conn_mbufs, total_nic_mbufs, total_mbufs; ++ uint32_t total_bufs; + uint16_t private_size; + uint16_t xdp_metadata = 0; + +- total_conn_mbufs = cfg_params->mbuf_count_per_conn * cfg_params->tcp_conn_count; +- total_nic_mbufs = cfg_params->rxqueue_size + cfg_params->txqueue_size; +- +- total_mbufs = (total_conn_mbufs / cfg_params->num_queue) + total_nic_mbufs + MBUFPOOL_RESERVE_NUM; +- /* limit mbuf max num based on the dpdk capability */ +- if (total_mbufs > MBUFPOOL_MAX_NUM) { +- LSTACK_LOG(ERR, LSTACK, "total_mbufs %u out of the dpdk mbuf_pool range\n", total_mbufs); ++ total_bufs = dpdk_pktmbuf_mempool_num(); ++ if (total_bufs > 
MEMPOOL_MAX_NUM) { ++ LSTACK_LOG(ERR, LSTACK, "total_bufs %u out of the dpdk mempool range\n", total_bufs); + return NULL; + } + +@@ -474,7 +469,7 @@ static struct rte_mempool *mbuf_pool_create(int stack_id, unsigned numa_id) + } + private_size = RTE_ALIGN(sizeof(struct mbuf_private) + xdp_metadata, RTE_CACHE_LINE_SIZE); + +- pool = mbuf_mp_ops.create(name, total_mbufs, MBUFPOOL_CACHE_NUM, private_size, MBUF_DATA_SIZE, numa_id); ++ pool = mbuf_mp_ops.create(name, total_bufs, MBUFPOOL_CACHE_NUM, private_size, MBUF_DATA_SIZE, numa_id); + if (pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "rte_pktmbuf_pool_create %s failed, rte_errno %d\n", name, rte_errno); + return NULL; +@@ -487,7 +482,14 @@ static struct rte_mempool *rpc_pool_create(int stack_id, unsigned numa_id) + { + char name [RTE_MEMPOOL_NAMESIZE]; + struct rte_mempool *pool; +- uint32_t total_bufs = get_global_cfg_params()->rpc_msg_max; ++ uint32_t total_bufs; ++ ++ total_bufs = MEMPOOL_CACHE_NUM + BUF_CACHE_MIN_NUM + ++ (get_global_cfg_params()->rpc_msg_max / get_global_cfg_params()->num_queue); ++ if (total_bufs > MEMPOOL_MAX_NUM) { ++ LSTACK_LOG(ERR, LSTACK, "total_bufs %u out of the dpdk mempool range\n", total_bufs); ++ return NULL; ++ } + + SYS_FORMAT_NAME(name, RTE_MEMPOOL_NAMESIZE, "%s_%hu", "rpc_pool", stack_id); + +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index bd170bb..131029e 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -36,7 +36,7 @@ + /* DPDK limit ring head-tail distance in rte_ring_init. + * Max value is RTE_RING_SZ_MASK / HTD_MAX_DEF, RTE_RING_SZ_MASK is 0x7fffffff, HTD_MAX_DEF is 8. + */ +-#define MBUFPOOL_MAX_NUM 0xfffffff ++#define MEMPOOL_MAX_NUM 0xfffffff + + #define MBUFPOOL_CACHE_NUM LWIP_MIN(NIC_QUEUE_SIZE_MAX >> 1, RTE_MEMPOOL_CACHE_MAX_SIZE) + #define MBUFPOOL_RESERVE_NUM (NIC_QUEUE_SIZE_MAX + MBUFPOOL_CACHE_NUM) +-- +2.33.0 + diff --git a/0341-mempool-fix-mbox_ring-free-not-call-recv_finish_burs.patch b/0341-mempool-fix-mbox_ring-free-not-call-recv_finish_burs.patch new file mode 100644 index 0000000..8695395 --- /dev/null +++ b/0341-mempool-fix-mbox_ring-free-not-call-recv_finish_burs.patch @@ -0,0 +1,125 @@ +From 532f3450a0b4212e03636c05593c05a50e95914d Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Mon, 21 Apr 2025 14:42:02 +0800 +Subject: [PATCH] mempool: fix mbox_ring free not call recv_finish_burst + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 16 ++++++++++++++-- + src/lstack/include/lstack_lockless_queue.h | 8 ++++---- + src/lstack/include/lstack_sockio.h | 2 +- + src/lstack/include/mbox_ring.h | 8 +++++--- + 4 files changed, 24 insertions(+), 10 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 0bd31ab..66e84d2 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -1410,9 +1410,11 @@ static int sockio_mbox_init(struct lwip_sock *sock) + return ret; + } + +-bool sockio_mbox_pending(const struct lwip_sock *sock) ++bool sockio_mbox_pending(struct lwip_sock *sock) + { +- struct rpc_msg *msg; ++ const struct rpc_msg *msg; ++ const struct mbox_ring *mr; ++ err_t err; + + if (POSIX_IS_CLOSED(sock)) + return false; +@@ -1428,6 +1430,16 @@ bool sockio_mbox_pending(const struct lwip_sock *sock) + if (msg != NULL && !lockless_queue_node_is_poped(&msg->queue_node)) { + return true; + } ++ ++ /* PEEK lastdata is only used to mark the last read location and not for releasing. 
++ * all peek bufs should free after pk_ring_dequeue_burst. */ ++ mr = &sock->conn->recvmbox->mring; ++ if (mr->flags & MBOX_FLAG_PEEK && mr->flags & MBOX_FLAG_TCP) { ++ if (sock->lastdata.pbuf != NULL && ++ !lwip_netconn_is_err_msg(sock->lastdata.pbuf, &err)) { ++ sock->lastdata.pbuf = NULL; ++ } ++ } + } + + return false; +diff --git a/src/lstack/include/lstack_lockless_queue.h b/src/lstack/include/lstack_lockless_queue.h +index bec2564..4f9b37b 100644 +--- a/src/lstack/include/lstack_lockless_queue.h ++++ b/src/lstack/include/lstack_lockless_queue.h +@@ -33,7 +33,7 @@ static inline void lockless_queue_node_set_poped(lockless_queue_node *node) + node->next = node; + } + +-static inline bool lockless_queue_node_is_poped(lockless_queue_node *node) ++static inline bool lockless_queue_node_is_poped(const lockless_queue_node *node) + { + return node->next == node; + } +@@ -60,18 +60,18 @@ static inline void lockless_queue_init(lockless_queue *queue) + queue->stub.next = NULL; + } + +-static inline bool lockless_queue_empty(lockless_queue *queue) ++static inline bool lockless_queue_empty(const lockless_queue *queue) + { + return (queue->head == queue->tail) && (queue->tail == &queue->stub); + } + +-static inline int32_t lockless_queue_count(lockless_queue *queue) ++static inline int32_t lockless_queue_count(const lockless_queue *queue) + { + if (lockless_queue_empty(queue)) { + return 0; + } + +- lockless_queue_node *tail = queue->tail; ++ const lockless_queue_node *tail = queue->tail; + if (tail == &queue->stub) { + tail = queue->stub.next; + } +diff --git a/src/lstack/include/lstack_sockio.h b/src/lstack/include/lstack_sockio.h +index f4e5e99..265d620 100644 +--- a/src/lstack/include/lstack_sockio.h ++++ b/src/lstack/include/lstack_sockio.h +@@ -32,7 +32,7 @@ ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt); + + + void sockio_ops_init(void); +-bool sockio_mbox_pending(const struct lwip_sock *sock); ++bool sockio_mbox_pending(struct lwip_sock *sock); + + /* just for lwip */ + int do_lwip_init_sock(int fd); +diff --git a/src/lstack/include/mbox_ring.h b/src/lstack/include/mbox_ring.h +index f6acdef..2e3ac6a 100644 +--- a/src/lstack/include/mbox_ring.h ++++ b/src/lstack/include/mbox_ring.h +@@ -84,6 +84,9 @@ void mbox_ring_common_free(struct mbox_ring *mr) + obj = mr->ops->pop_tail(mr, NULL); + if (obj != NULL) + mr->obj_free_fn(mr, obj, true); ++ ++ if (mr->flags & MBOX_FLAG_RECV) ++ mr->ops->recv_finish_burst(mr); + while (true) { + if (mr->ops->dequeue_burst(mr, &obj, 1) == 0) + break; +@@ -479,9 +482,8 @@ static inline + void pk_ring_destroy(struct mbox_ring *mr) + { + void *obj; +- while (mr->ops->recv_start_burst(mr, &obj, 1) > 0) { +- mr->ops->recv_finish_burst(mr); +- } ++ while (mr->ops->recv_start_burst(mr, &obj, 1) > 0) { } ++ mr->ops->recv_finish_burst(mr); + return; + } + +-- +2.33.0 + diff --git a/0342-mempool-mem_get_rpc-add-reserve-limit.patch b/0342-mempool-mem_get_rpc-add-reserve-limit.patch new file mode 100644 index 0000000..49c96f4 --- /dev/null +++ b/0342-mempool-mem_get_rpc-add-reserve-limit.patch @@ -0,0 +1,363 @@ +From 5ce675c15ae4312d115686ce6f6ac7fb322b6f10 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 22 Apr 2025 15:03:00 +0800 +Subject: [PATCH] mempool: mem_get_rpc add reserve limit + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockctl.c | 24 ++++++++-------- + src/lstack/api/lstack_sockio.c | 2 +- + src/lstack/core/lstack_control_plane.c | 4 +-- + src/lstack/core/lstack_mempool.c | 37 +++++++++++++++++-------- + 
src/lstack/core/lstack_protocol_stack.c | 2 +- + src/lstack/core/lstack_thread_rpc.c | 10 +++---- + src/lstack/include/lstack_mempool.h | 3 +- + src/lstack/include/lstack_thread_rpc.h | 2 +- + 8 files changed, 49 insertions(+), 35 deletions(-) + +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index d1ef36a..f53b4cd 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -60,7 +60,7 @@ static void callback_setsockopt(struct rpc_msg *msg) + static int rpc_call_getpeername(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getpeername); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_getpeername); + if (msg == NULL) { + return -1; + } +@@ -75,7 +75,7 @@ static int rpc_call_getpeername(int stack_id, int fd, struct sockaddr *addr, soc + static int rpc_call_getsockname(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getsockname); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_getsockname); + if (msg == NULL) { + return -1; + } +@@ -90,7 +90,7 @@ static int rpc_call_getsockname(int stack_id, int fd, struct sockaddr *addr, soc + static int rpc_call_getsockopt(int stack_id, int fd, int level, int optname, void *optval, socklen_t *optlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_getsockopt); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_getsockopt); + if (msg == NULL) { + return -1; + } +@@ -107,7 +107,7 @@ static int rpc_call_getsockopt(int stack_id, int fd, int level, int optname, voi + static int rpc_call_setsockopt(int stack_id, int fd, int level, int optname, const void *optval, socklen_t optlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_setsockopt); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_setsockopt); + if (msg == NULL) { + return -1; + } +@@ -169,7 +169,7 @@ static void callback_socket(struct rpc_msg *msg) + static int rpc_call_socket(int stack_id, int domain, int type, int protocol) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_socket); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, true, callback_socket); + if (msg == NULL) { + return -1; + } +@@ -221,7 +221,7 @@ static void callback_shutdown(struct rpc_msg *msg) + static int rpc_call_close(int stack_id, int fd) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_close); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_close); + if (msg == NULL) { + return -1; + } +@@ -234,7 +234,7 @@ static int rpc_call_close(int stack_id, int fd) + static int rpc_call_shutdown(int stack_id, int fd, int how) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_shutdown); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_shutdown); + if (msg == NULL) { + return -1; + } +@@ -256,7 +256,7 @@ static void callback_bind(struct rpc_msg *msg) + static int rpc_call_bind(int stack_id, int 
fd, const struct sockaddr *addr, socklen_t addrlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_bind); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_bind); + if (msg == NULL) { + return -1; + } +@@ -289,7 +289,7 @@ static void callback_listen(struct rpc_msg *msg) + static int rpc_call_listen(int stack_id, int s, int backlog) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_listen); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_listen); + if (msg == NULL) { + return -1; + } +@@ -374,7 +374,7 @@ static void callback_create_shadow_fd(struct rpc_msg *msg) + static int rpc_call_shadow_fd(int stack_id, int fd, const struct sockaddr *addr, socklen_t addrlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_create_shadow_fd); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_create_shadow_fd); + if (msg == NULL) { + return -1; + } +@@ -407,7 +407,7 @@ static void callback_accept(struct rpc_msg *msg) + static int rpc_call_accept(int stack_id, int fd, struct sockaddr *addr, socklen_t *addrlen, int flags) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_accept); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_accept); + if (msg == NULL) { + return -1; + } +@@ -431,7 +431,7 @@ static void callback_connect(struct rpc_msg *msg) + static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, socklen_t addrlen) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_connect); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_connect); + if (msg == NULL) { + return -1; + } +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 66e84d2..9eafc22 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -143,7 +143,7 @@ static void sock_mbox_private_free(struct mbox_ring *mr) + + static int sock_mbox_private_init(sys_mbox_t mb, rpc_func_t func) + { +- struct rpc_msg *msg = rpc_msg_alloc(get_protocol_stack()->stack_idx, func); ++ struct rpc_msg *msg = rpc_msg_alloc(get_protocol_stack()->stack_idx, true, func); + if (msg == NULL) + return -1; + +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index 9d9e012..6ad0192 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -432,7 +432,7 @@ static void thread_register_phase2(struct rpc_msg *msg) + static int rpc_call_thread_regphase1(int stack_id, void *conn) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, thread_register_phase1); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, thread_register_phase1); + if (msg == NULL) { + return -1; + } +@@ -443,7 +443,7 @@ static int rpc_call_thread_regphase1(int stack_id, void *conn) + static int rpc_call_thread_regphase2(int stack_id, void *conn) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, thread_register_phase2); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, thread_register_phase2); + if (msg == NULL) { + 
return -1;
+     }
+diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c
+index b03c3ef..00b299a 100644
+--- a/src/lstack/core/lstack_mempool.c
++++ b/src/lstack/core/lstack_mempool.c
+@@ -67,16 +67,24 @@ struct rte_mempool *mem_get_rpc_pool(int stack_id)
+ 
+ unsigned mem_stack_mbuf_pool_count(int stack_id)
+ {
+-    struct mem_stack *ms = mem_stack_get(stack_id);
++    const struct mem_stack *ms = mem_stack_get(stack_id);
+     return rte_mempool_avail_count(ms->mbuf_pool);
+ }
+ 
+ unsigned mem_stack_rpc_pool_count(int stack_id)
+ {
+-    struct mem_stack *ms = mem_stack_get(stack_id);
++    const struct mem_stack *ms = mem_stack_get(stack_id);
+     return rte_mempool_avail_count(ms->rpc_pool);
+ }
+ 
++static inline unsigned mem_stack_pool_ring_count(const struct rte_mempool *pool)
++{
++    /* don't use rte_mempool_avail_count, it traverses the cpu local cache,
++     * when RTE_MAX_LCORE is too large, it's time-consuming
++     */
++    return rte_ring_count(pool->pool_data);
++}
++
+ static inline bool mem_thread_group_in_used(const struct mem_thread_group *mt_grooup, uint32_t timeout)
+ {
+     return mt_grooup->used_flag ||
+@@ -484,8 +492,7 @@ static struct rte_mempool *rpc_pool_create(int stack_id, unsigned numa_id)
+     struct rte_mempool *pool;
+     uint32_t total_bufs;
+ 
+-    total_bufs = MEMPOOL_CACHE_NUM + BUF_CACHE_MIN_NUM +
+-        (get_global_cfg_params()->rpc_msg_max / get_global_cfg_params()->num_queue);
++    total_bufs = RPCPOOL_RESERVE_NUM + MEMP_NUM_SYS_MBOX;
+     if (total_bufs > MEMPOOL_MAX_NUM) {
+         LSTACK_LOG(ERR, LSTACK, "total_bufs %u out of the dpdk mempool range\n", total_bufs);
+         return NULL;
+@@ -695,7 +702,7 @@ void mem_mbuf_migrate_enqueue(struct mem_thread *mt, unsigned n)
+         return;
+ 
+     /* no sufficient mbuf */
+-    if (rte_ring_count(ms->mbuf_pool->pool_data) < MBUFPOOL_RESERVE_NUM) {
++    if (mem_stack_pool_ring_count(ms->mbuf_pool) < MBUFPOOL_RESERVE_NUM) {
+         mem_thread_manager_flush_all();
+         mt->stk_migrate_count = 0;
+         return;
+@@ -780,7 +787,8 @@ static unsigned pool_get_bulk_with_cache(const struct mempool_ops *pool_ops,
+     /* get from the pool */
+     ret = pool_ops->get_bulk(pool, obj_table, n);
+     if (unlikely(ret == 0)) {
+-        LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u\n", pool->name, n);
++        LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u, count %u\n",
++            pool->name, n, mem_stack_pool_ring_count(pool));
+         return 0;
+     }
+ 
+@@ -796,7 +804,8 @@ static unsigned pool_get_bulk_with_cache(const struct mempool_ops *pool_ops,
+ 
+     ret = pool_ops->get_bulk(pool, &cache->objs[cache->head], get_count);
+     if (unlikely(ret == 0)) {
+-        LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u\n", pool->name, get_count);
++        LSTACK_LOG(ERR, LSTACK, "pool %s get_bulk failed, n %u, count %u\n",
++            pool->name, get_count, mem_stack_pool_ring_count(pool));
+     } else {
+         cache->head += get_count;
+     }
+@@ -839,13 +848,20 @@ static void pool_put_bulk_with_cache(const struct mempool_ops *pool_ops,
+ }
+ 
+ 
+-void *mem_get_rpc(int stack_id)
++void *mem_get_rpc(int stack_id, bool reserve)
+ {
+     struct mem_stack *ms = mem_stack_get(stack_id);
+     struct mem_thread *mt = mem_thread_get(stack_id);
+     unsigned ret;
+     void *obj;
+ 
++    if (reserve) {
++        if (mem_stack_pool_ring_count(ms->rpc_pool) < RPCPOOL_RESERVE_NUM) {
++            mem_thread_manager_flush_all();
++            return NULL;
++        }
++    }
++
+     if (mt == NULL) {
+         ret = mem_mp_ops.get_bulk(ms->rpc_pool, &obj, 1);
+     } else {
+@@ -887,10 +903,7 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned
+     }
+ 
+     if (reserve) {
+-        /* don't use
rte_mempool_avail_count, it traverse cpu local cache, +- * when RTE_MAX_LCORE is too large, it's time-consuming +- */ +- if (rte_ring_count(ms->mbuf_pool->pool_data) < MBUFPOOL_RESERVE_NUM + n) { ++ if (mem_stack_pool_ring_count(ms->mbuf_pool) < MBUFPOOL_RESERVE_NUM + n) { + mem_thread_manager_flush_all(); + return 0; + } +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index c07d8e7..d375ecc 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -750,7 +750,7 @@ static void stack_exit_by_rpc(struct rpc_msg *msg) + static int rpc_call_stack_exit(int stack_id) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, stack_exit_by_rpc); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, stack_exit_by_rpc); + if (msg == NULL) { + return -1; + } +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 84e5814..6bf83d0 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -41,11 +41,11 @@ static void rpc_msg_init(struct rpc_msg *msg, rpc_func_t func) + lockless_queue_node_set_poped(&msg->queue_node); + } + +-struct rpc_msg *rpc_msg_alloc(int stack_id, rpc_func_t func) ++struct rpc_msg *rpc_msg_alloc(int stack_id, bool reserve, rpc_func_t func) + { + struct rpc_msg *msg; + +- msg = mem_get_rpc(stack_id); ++ msg = mem_get_rpc(stack_id, reserve); + if (unlikely(msg == NULL)) { + g_rpc_stats.call_alloc_fail++; + return NULL; +@@ -152,7 +152,7 @@ static void callback_arp(struct rpc_msg *msg) + int rpc_call_arp(int stack_id, void *mbuf) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_arp); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_arp); + if (msg == NULL) { + return -1; + } +@@ -290,7 +290,7 @@ static void callback_get_connnum(struct rpc_msg *msg) + int rpc_call_conntable(int stack_id, void *conn_table, unsigned max_conn) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->dfx_rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_get_conntable); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_get_conntable); + if (msg == NULL) { + return -1; + } +@@ -304,7 +304,7 @@ int rpc_call_conntable(int stack_id, void *conn_table, unsigned max_conn) + int rpc_call_connnum(int stack_id) + { + rpc_queue *queue = &get_protocol_stack_by_id(stack_id)->dfx_rpc_queue; +- struct rpc_msg *msg = rpc_msg_alloc(stack_id, callback_get_connnum); ++ struct rpc_msg *msg = rpc_msg_alloc(stack_id, false, callback_get_connnum); + if (msg == NULL) { + return -1; + } +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index 131029e..6a31503 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -40,6 +40,7 @@ + + #define MBUFPOOL_CACHE_NUM LWIP_MIN(NIC_QUEUE_SIZE_MAX >> 1, RTE_MEMPOOL_CACHE_MAX_SIZE) + #define MBUFPOOL_RESERVE_NUM (NIC_QUEUE_SIZE_MAX + MBUFPOOL_CACHE_NUM) ++#define RPCPOOL_RESERVE_NUM 512 + + #define MEMPOOL_OPS_NAME "ring_mt_rts" + #define MEMPOOL_CACHE_NUM 32 +@@ -307,7 +308,7 @@ struct rte_mempool *mem_get_rpc_pool(int stack_id); + unsigned mem_stack_mbuf_pool_count(int stack_id); + unsigned mem_stack_rpc_pool_count(int stack_id); + +-void *mem_get_rpc(int stack_id); ++void *mem_get_rpc(int stack_id, bool reserve); + void mem_put_rpc(void *obj); 
+ + struct mem_thread *mem_thread_migrate_get(int stack_id); +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 013ff29..2d577ae 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -64,7 +64,7 @@ struct rpc_msg { + + void rpc_get_stat(rpc_queue *queue, struct gazelle_rpc_stat *stat); + +-struct rpc_msg *rpc_msg_alloc(int stack_id, rpc_func_t func); ++struct rpc_msg *rpc_msg_alloc(int stack_id, bool reserve, rpc_func_t func); + void rpc_msg_free(struct rpc_msg *msg); + + void rpc_queue_init(rpc_queue *queue, uint16_t queue_id); +-- +2.33.0 + diff --git a/0343-mempool-fix-pthread_tryjoin_np-coredump-when-mysqld-.patch b/0343-mempool-fix-pthread_tryjoin_np-coredump-when-mysqld-.patch new file mode 100644 index 0000000..073073b --- /dev/null +++ b/0343-mempool-fix-pthread_tryjoin_np-coredump-when-mysqld-.patch @@ -0,0 +1,59 @@ +From 00b47db8233ba31eabf0b8c4b3b015607afdcf78 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Wed, 23 Apr 2025 09:26:45 +0800 +Subject: [PATCH] mempool: fix pthread_tryjoin_np coredump when mysqld shutdown + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_mempool.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 00b299a..4dd7574 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -23,6 +23,8 @@ + #include "lstack_protocol_stack.h" + #include "lstack_unistd.h" + ++#define MEM_THREAD_TASK_PATH "/proc/%d/task/%d/stat" ++#define MEM_THREAD_MAX_PATH 32 /* cat /proc/sys/kernel/pid_max */ + #define MEM_THREAD_FLUSH_SIG (SIGRTMIN + 11) + #define MEM_THREAD_MANAGER_FLUSH_MS 100 + #define MEM_THREAD_MANAGER_FREE_S 2 +@@ -36,6 +38,7 @@ struct mem_thread_manager { + }; + + struct mem_thread_group { ++ char task_path[MEM_THREAD_MAX_PATH]; + int tid; + pthread_t thread; + struct list_node mt_node; +@@ -156,8 +159,13 @@ static inline void mem_thread_group_notify_flush(const struct mem_thread_group * + + static inline bool mem_thread_group_exist(const struct mem_thread_group *mt_group) + { +- if (pthread_tryjoin_np(mt_group->thread, NULL) == 0) +- return false; ++ if (access(mt_group->task_path, R_OK) != 0) { ++ if (errno == ENOENT) { ++ return false; ++ } ++ LSTACK_LOG(ERR, LSTACK, "mem_thread_group_exist access %s failed, errno %d\n", ++ mt_group->task_path, errno); ++ } + return true; + } + +@@ -200,6 +208,8 @@ static int mem_thread_group_init(int stack_id) + + g_mem_thread_group->tid = rte_gettid(); + g_mem_thread_group->thread = pthread_self(); ++ SYS_FORMAT_NAME(g_mem_thread_group->task_path, sizeof(g_mem_thread_group->task_path), ++ MEM_THREAD_TASK_PATH, getpid(), g_mem_thread_group->tid); + list_init_node(&g_mem_thread_group->mt_node); + mem_thread_manager_add_work(g_mem_thread_group); + } +-- +2.33.0 + diff --git a/0344-mempool-stop-using-cache-when-too-many-threads.patch b/0344-mempool-stop-using-cache-when-too-many-threads.patch new file mode 100644 index 0000000..ae5f15d --- /dev/null +++ b/0344-mempool-stop-using-cache-when-too-many-threads.patch @@ -0,0 +1,158 @@ +From ce877b0e0171bc878a4b0a19f328634381bfa375 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Wed, 23 Apr 2025 14:20:08 +0800 +Subject: [PATCH] mempool: stop using cache when too many threads + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_mempool.c | 53 ++++++++++++++++++++++---------- + 1 file changed, 36 insertions(+), 17 
deletions(-) + +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 4dd7574..70ac8c6 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -35,6 +35,7 @@ struct mem_thread_manager { + struct list_node mt_free_list; + rte_spinlock_t list_lock; + uint32_t flush_time; ++ unsigned thread_num; + }; + + struct mem_thread_group { +@@ -173,6 +174,7 @@ static void mem_thread_manager_add_work(struct mem_thread_group *mt_group) + { + rte_spinlock_lock(&g_mem_thread_manager.list_lock); + list_add_node(&mt_group->mt_node, &g_mem_thread_manager.mt_work_list); ++ g_mem_thread_manager.thread_num++; + rte_spinlock_unlock(&g_mem_thread_manager.list_lock); + } + +@@ -305,6 +307,7 @@ static void *mem_thread_manager_thread(void *arg) + } + list_del_node(node); + list_add_node(node, &g_mem_thread_manager.mt_free_list); ++ g_mem_thread_manager.thread_num--; + } + + rte_spinlock_unlock(&g_mem_thread_manager.list_lock); +@@ -502,7 +505,7 @@ static struct rte_mempool *rpc_pool_create(int stack_id, unsigned numa_id) + struct rte_mempool *pool; + uint32_t total_bufs; + +- total_bufs = RPCPOOL_RESERVE_NUM + MEMP_NUM_SYS_MBOX; ++ total_bufs = LWIP_MAX(RPCPOOL_RESERVE_NUM, MBUFPOOL_RESERVE_NUM) + MEMP_NUM_SYS_MBOX; + if (total_bufs > MEMPOOL_MAX_NUM) { + LSTACK_LOG(ERR, LSTACK, "total_bufs %u out of the dpdk mempool range\n", total_bufs); + return NULL; +@@ -783,7 +786,7 @@ void pool_put_with_bufcache(struct rte_mempool *pool, struct buf_cache* cache, v + + static unsigned pool_get_bulk_with_cache(const struct mempool_ops *pool_ops, + struct rte_mempool *pool, struct buf_cache *cache, +- void **obj_table, unsigned n) ++ void **obj_table, unsigned n, unsigned pool_count) + { + unsigned ret; + unsigned count = 0; +@@ -802,6 +805,13 @@ static unsigned pool_get_bulk_with_cache(const struct mempool_ops *pool_ops, + return 0; + } + ++ /* Stop using cache when too many threads. */ ++ ret = MBUFPOOL_RESERVE_NUM + BUF_CACHE_MIN_NUM * g_mem_thread_manager.thread_num; ++ if (unlikely(ret > pool_count)) { ++ buf_cache_reset_watermark(cache); ++ return n; ++ } ++ + buf_cache_add_watermark(cache); + if (count >= cache->watermark) { + return n; +@@ -862,27 +872,32 @@ void *mem_get_rpc(int stack_id, bool reserve) + { + struct mem_stack *ms = mem_stack_get(stack_id); + struct mem_thread *mt = mem_thread_get(stack_id); +- unsigned ret; ++ unsigned ret = 0; ++ unsigned pool_count; + void *obj; + +- if (reserve) { +- if (mem_stack_pool_ring_count(ms->rpc_pool) < RPCPOOL_RESERVE_NUM) { +- mem_thread_manager_flush_all(); +- return NULL; +- } ++ pool_count = mem_stack_pool_ring_count(ms->rpc_pool); ++ if (reserve && pool_count < RPCPOOL_RESERVE_NUM) { ++ goto out; + } + + if (mt == NULL) { + ret = mem_mp_ops.get_bulk(ms->rpc_pool, &obj, 1); + } else { + mem_thread_group_used(); +- ret = pool_get_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, &obj, 1); ++ ret = pool_get_bulk_with_cache(&mem_mp_ops, ms->rpc_pool, mt->rpc_cache, ++ &obj, 1, pool_count); + mem_thread_group_done(); + } + + LWIP_DEBUGF(MEMP_DEBUG, ("%s(stack_id=%d, obj=%p)\n", __FUNCTION__, stack_id, obj)); + +- return ret == 0 ? 
NULL : obj; ++out: ++ if (unlikely(ret == 0)) { ++ mem_thread_manager_flush_all(); ++ return NULL; ++ } ++ return obj; + } + + void mem_put_rpc(void *obj) +@@ -906,17 +921,16 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned + { + struct mem_stack *ms = mem_stack_get(stack_id); + struct mem_thread *mt = mem_thread_get(stack_id); +- unsigned ret; ++ unsigned ret = 0; ++ unsigned pool_count; + + if (unlikely(n == 0)) { + return 0; + } + +- if (reserve) { +- if (mem_stack_pool_ring_count(ms->mbuf_pool) < MBUFPOOL_RESERVE_NUM + n) { +- mem_thread_manager_flush_all(); +- return 0; +- } ++ pool_count = mem_stack_pool_ring_count(ms->mbuf_pool); ++ if (reserve && pool_count < MBUFPOOL_RESERVE_NUM + n) { ++ goto out; + } + + if (mt == NULL) { +@@ -924,7 +938,8 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned + } else { + mem_thread_group_used(); + mem_mbuf_migrate_dequeue(mt); +- ret = pool_get_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, (void **)mbuf_table, n); ++ ret = pool_get_bulk_with_cache(&mbuf_mp_ops, ms->mbuf_pool, mt->mbuf_cache, ++ (void **)mbuf_table, n, pool_count); + mem_thread_group_done(); + } + +@@ -935,6 +950,10 @@ unsigned mem_get_mbuf_bulk(int stack_id, struct rte_mbuf **mbuf_table, unsigned + } + #endif /* MEMP_DEBUG */ + ++out: ++ if (unlikely(ret == 0)) { ++ mem_thread_manager_flush_all(); ++ } + return ret; + } + +-- +2.33.0 + diff --git a/0345-sk_wait-fix-lwip_tcp_allow_send-coredump.patch b/0345-sk_wait-fix-lwip_tcp_allow_send-coredump.patch new file mode 100644 index 0000000..95cbae8 --- /dev/null +++ b/0345-sk_wait-fix-lwip_tcp_allow_send-coredump.patch @@ -0,0 +1,45 @@ +From f0e0a9cdff414f1089173ea86e697ea3118623eb Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Wed, 23 Apr 2025 16:34:08 +0800 +Subject: [PATCH] sk_wait: fix lwip_tcp_allow_send coredump + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_wait.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 816f340..d9db582 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -366,15 +366,22 @@ static inline bool NETCONN_NEED_RECV(const struct lwip_sock *sock) + + static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) + { ++ if (sock->conn->pcb.tcp == NULL) { ++ return false; ++ } ++ + if (get_global_cfg_params()->stack_mode_rtc) { + if (NETCONN_TYPE(sock->conn) == NETCONN_TCP) + return lwip_tcp_allow_send(sock->conn->pcb.tcp); +- return false; +- } +- if (sys_mbox_valid(&sock->conn->sendmbox)) { +- const struct mbox_ring *mr = &sock->conn->sendmbox->mring; +- return mr->ops->free_count(mr) > 0; ++ else /* if UDP */ ++ return true; ++ } else { /* if RTW */ ++ if (sys_mbox_valid(&sock->conn->sendmbox)) { ++ const struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ return mr->ops->free_count(mr) > 0; ++ } + } ++ + return false; + } + +-- +2.33.0 + diff --git a/0346-mbox-fix-mbox_ring_common_free-coredump-when-rte_rin.patch b/0346-mbox-fix-mbox_ring_common_free-coredump-when-rte_rin.patch new file mode 100644 index 0000000..fe8c61b --- /dev/null +++ b/0346-mbox-fix-mbox_ring_common_free-coredump-when-rte_rin.patch @@ -0,0 +1,52 @@ +From c9285237af5d95fc4014963ab9fae2327a0bd265 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 25 Apr 2025 15:09:27 +0800 +Subject: [PATCH] mbox: fix mbox_ring_common_free coredump when rte_ring_create + failed + +Signed-off-by: Lemmy 
Huang +--- + src/lstack/include/mbox_ring.h | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/src/lstack/include/mbox_ring.h b/src/lstack/include/mbox_ring.h +index 2e3ac6a..893c636 100644 +--- a/src/lstack/include/mbox_ring.h ++++ b/src/lstack/include/mbox_ring.h +@@ -85,12 +85,14 @@ void mbox_ring_common_free(struct mbox_ring *mr) + if (obj != NULL) + mr->obj_free_fn(mr, obj, true); + +- if (mr->flags & MBOX_FLAG_RECV) +- mr->ops->recv_finish_burst(mr); +- while (true) { +- if (mr->ops->dequeue_burst(mr, &obj, 1) == 0) +- break; +- mr->obj_free_fn(mr, obj, false); ++ if (mr->ring != NULL) { ++ if (mr->flags & MBOX_FLAG_RECV) ++ mr->ops->recv_finish_burst(mr); ++ while (true) { ++ if (mr->ops->dequeue_burst(mr, &obj, 1) == 0) ++ break; ++ mr->obj_free_fn(mr, obj, false); ++ } + } + } + +@@ -482,8 +484,10 @@ static inline + void pk_ring_destroy(struct mbox_ring *mr) + { + void *obj; +- while (mr->ops->recv_start_burst(mr, &obj, 1) > 0) { } +- mr->ops->recv_finish_burst(mr); ++ if (mr->ring != NULL) { ++ while (mr->ops->recv_start_burst(mr, &obj, 1) > 0) { } ++ mr->ops->recv_finish_burst(mr); ++ } + return; + } + +-- +2.33.0 + diff --git a/0347-sk_wait-fix-sock_wait_common_free.patch b/0347-sk_wait-fix-sock_wait_common_free.patch new file mode 100644 index 0000000..976d7c2 --- /dev/null +++ b/0347-sk_wait-fix-sock_wait_common_free.patch @@ -0,0 +1,62 @@ +From bd6149bf56dd26a67f835be5b4bc4d36ca8bca96 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Thu, 24 Apr 2025 10:03:25 +0800 +Subject: [PATCH] sk_wait: fix sock_wait_common_free socket: simplify calling + free_count in rtw_stack_tcp_write + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 13 +++---------- + src/lstack/core/lstack_wait.c | 5 +++-- + 2 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 9eafc22..aff9d6d 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -671,7 +671,8 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + return -1; + } + +- if (unlikely(mr->app_free_count < 2)) { ++ if (unlikely(mr->app_free_count < 2) || ++ total_copy_len > mr->app_free_count * TCP_MSS) { + mr->app_free_count = mr->ops->free_count(mr); + if (unlikely(mr->app_free_count < 2)) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +@@ -691,15 +692,7 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + copied_total += rtw_stack_tcp_write_one(sock, mr, data + copied_total, total_copy_len, flags); + } else { + if (total_copy_len > mr->app_free_count * TCP_MSS) { +- mr->app_free_count = mr->ops->free_count(mr); +- if (unlikely(mr->app_free_count < 2)) { +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- set_errno(EWOULDBLOCK); +- goto out; +- } +- if (total_copy_len > mr->app_free_count * TCP_MSS) { +- total_copy_len = mr->app_free_count * TCP_MSS; +- } ++ total_copy_len = mr->app_free_count * TCP_MSS; + } + /* write bulk pbuf */ + while (total_copy_len > 0) { +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index d9db582..d0e8d82 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -307,8 +307,9 @@ void sock_wait_common_free(struct sock_wait *sk_wait) + do { + wait_stack = false; + for (int i = 0; i < PROTOCOL_STACK_MAX; ++i) { +- rte_mb(); +- if (!list_node_null(&sk_wait->stk_notify_node[i])) { ++ rte_rmb(); ++ if 
(!list_node_null(&sk_wait->stk_notify_node[i]) || ++ !list_head_empty(&sk_wait->stk_event_list[i])) { + wait_stack = true; + usleep(LWIP_EVENT_WAIT_US); + break; +-- +2.33.0 + diff --git a/0348-socket-fix-stack_udp_readmsg-return-len.patch b/0348-socket-fix-stack_udp_readmsg-return-len.patch new file mode 100644 index 0000000..4c82756 --- /dev/null +++ b/0348-socket-fix-stack_udp_readmsg-return-len.patch @@ -0,0 +1,60 @@ +From 329cbd6b63b4954c01caf9023007557f5efe7b27 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 25 Apr 2025 20:37:39 +0800 +Subject: [PATCH] socket: fix stack_udp_readmsg return len + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index aff9d6d..4c524ab 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -483,7 +483,7 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, siz + struct pbuf **extcache_list; + struct netbuf *nbuf; + err_t err = ERR_OK; +- uint16_t copied_total = 0; ++ uint16_t total_len = 0; + + LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(sock=%p, msg=%p, size=%"SZT_F", flags=0x%x)\n", + __FUNCTION__, sock, msg, len, flags)); +@@ -506,11 +506,11 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, siz + + /* let not free inside by MSG_PEEK */ + sock->lastdata.netbuf = nbuf; +- err = lwip_recvfrom_udp_raw(sock, flags | MSG_PEEK, msg, &copied_total, 0); ++ err = lwip_recvfrom_udp_raw(sock, flags | MSG_PEEK, msg, &total_len, 0); + sock->lastdata.netbuf = NULL; + + SOCK_WAIT_STAT(sock->sk_wait, app_read_cnt, 1); +- SOCK_WAIT_STAT(sock->sk_wait, sock_rx_drop, copied_total < len ? 1 : 0); ++ SOCK_WAIT_STAT(sock->sk_wait, sock_rx_drop, total_len > len ? 
1 : 0); + if (get_protocol_stack_group()->latency_start) + calculate_lstack_latency(sock->stack_id, &nbuf->p, 1, GAZELLE_LATENCY_READ_LSTACK, 0); + +@@ -521,7 +521,7 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, siz + mem_extcache_put_pbuf(nbuf->p, NULL, extcache_list); + } + +- mr->app_recvd_len += copied_total; ++ mr->app_recvd_len += total_len; + mr->app_queued_num++; + if (mr->app_queued_num >= RECV_EXTEND_CACHE_MAX || + mr->app_recvd_len >= RECV_EXTEND_CACHE_LEN) { +@@ -534,8 +534,8 @@ static ssize_t stack_udp_readmsg(struct lwip_sock *sock, struct msghdr *msg, siz + } + + if (err == ERR_OK) { +- API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, copied_total); +- return copied_total; ++ API_EVENT(sock->conn, NETCONN_EVT_RCVMINUS, total_len); ++ return LWIP_MIN(len, total_len); + } + out: + SOCK_WAIT_STAT(sock->sk_wait, read_null, 1); +-- +2.33.0 + diff --git a/0349-sk_wait-fix-lwip_wait_foreach_notify-coredump-at-sta.patch b/0349-sk_wait-fix-lwip_wait_foreach_notify-coredump-at-sta.patch new file mode 100644 index 0000000..dc3473c --- /dev/null +++ b/0349-sk_wait-fix-lwip_wait_foreach_notify-coredump-at-sta.patch @@ -0,0 +1,61 @@ +From 1ff5ffd3634c61c6d6062b51d418b41b3a6a86d0 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 25 Apr 2025 21:40:16 +0800 +Subject: [PATCH] sk_wait: fix lwip_wait_foreach_notify coredump at startup + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_protocol_stack.c | 1 + + src/lstack/core/lstack_wait.c | 3 +-- + src/lstack/include/lstack_wait.h | 2 ++ + 3 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index d375ecc..8ff6981 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -425,6 +425,7 @@ static struct protocol_stack *stack_thread_init(void *arg) + stack_affinity_numa(stack->numa_id); + } + ++ lwip_wait_init(stack->stack_idx); + if (mem_stack_mpcache_init(stack->stack_idx, stack->cpu_id) < 0) { + goto END; + } +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index d0e8d82..42ebf05 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -72,7 +72,7 @@ static inline struct lwip_wait *lwip_wait_get(int stack_id) + return &g_wait_group.lwaits[stack_id]; + } + +-static int lwip_wait_init(int stack_id) ++int lwip_wait_init(int stack_id) + { + struct lwip_wait *lwait = lwip_wait_get(stack_id); + LWIP_UNUSED_ARG(lwait); +@@ -191,7 +191,6 @@ void* kernel_wait_thread(void *arg) + free(arg); + sem_post(&get_protocol_stack_group()->sem_stack_setup); + +- lwip_wait_init(stack_id); + kernel_wait_init(stack_id); + kwait = kernel_wait_get(stack_id); + +diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h +index a1d35dd..58da126 100644 +--- a/src/lstack/include/lstack_wait.h ++++ b/src/lstack/include/lstack_wait.h +@@ -110,6 +110,8 @@ struct sock_wait { + int sock_wait_group_init(void); + void sock_wait_group_stat(int stack_id, struct gazelle_wakeup_stat *stat); + ++int lwip_wait_init(int stack_id); ++ + void* kernel_wait_thread(void *arg); + int kernel_wait_ctl(struct sock_wait *sk_wait, int new_stack_id, int old_stack_id); + +-- +2.33.0 + diff --git a/0350-epoll-fix-do_lwip_connected_callback.patch b/0350-epoll-fix-do_lwip_connected_callback.patch new file mode 100644 index 0000000..ed88faf --- /dev/null +++ b/0350-epoll-fix-do_lwip_connected_callback.patch @@ -0,0 +1,212 @@ +From 
b872ae2d9b0aca063e14aa0648d13e49af494a75 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 22 Apr 2025 15:41:53 +0800 +Subject: [PATCH] epoll: fix do_lwip_connected_callback + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 46 +++++++++++++++++++------------ + src/lstack/api/lstack_sockctl.c | 22 --------------- + src/lstack/api/lstack_wrap.c | 25 +++++++++++++++++ + src/lstack/core/lstack_wait.c | 3 +- + src/lstack/include/lstack_epoll.h | 3 ++ + 5 files changed, 58 insertions(+), 41 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 19cc9d6..1a10e24 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -219,7 +219,6 @@ static int epoll_close_internal(int epfd) + sk_wait->type = WAIT_CLOSE; + epoll_cb_free(&sk_wait->epcb); + +- posix_api->close_fn(sk_wait->epfd); + sock_wait_kernel_free(sk_wait); + sock_wait_common_free(sk_wait); + +@@ -259,6 +258,25 @@ int lstack_epoll_close(int epfd) + return posix_api->close_fn(epfd); + } + ++int epoll_ctl_kernel_event(int epfd, int op, int fd, struct epoll_event *event, ++ struct sock_wait *sk_wait) ++{ ++ int ret; ++ ++ ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn failed, fd=%d epfd=%d op=%d\n", fd, epfd, op); ++ return ret; ++ } ++ if (op == EPOLL_CTL_ADD) { ++ sk_wait->kernel_nfds++; ++ } else if (op == EPOLL_CTL_DEL) { ++ sk_wait->kernel_nfds--; ++ } ++ ++ return ret; ++} ++ + int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) + { + int ret; +@@ -278,22 +296,13 @@ int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) + __FUNCTION__, epfd, op, fd, event)); + + enum posix_type sk_type = select_sock_posix_path(sock); +- /* has POSIX_LWIP */ +- if (sk_type != POSIX_LWIP) { +- ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); +- if (ret != 0) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn failed, fd=%d epfd=%d op=%d\n", fd, epfd, op); ++ if (sk_type & POSIX_KERNEL) { /* has POSIX_KERNEL */ ++ ret = epoll_ctl_kernel_event(epfd, op, fd, event, sk_wait); ++ if (ret != 0 || ++ sk_type == POSIX_KERNEL) { /* is POSIX_KERNEL */ + return ret; + } +- if (op == EPOLL_CTL_ADD) { +- sk_wait->kernel_nfds++; +- } else if (op == EPOLL_CTL_DEL) { +- sk_wait->kernel_nfds--; +- } + } +- /* is POSIX_KERNEL */ +- if (sk_type == POSIX_KERNEL) +- return ret; + + for (; sock != NULL; sock = sock->listen_next) { + sk_event = &sock->sk_event; +@@ -412,7 +421,8 @@ int lstack_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int t + maxevents = POLL_MAX_EVENTS; + } + /* avoid the starvation of poll events from both kernel and lwip */ +- lwip_maxevents = (maxevents >> 1) + 1; ++ lwip_maxevents = sk_wait->kernel_nfds > 0 ? 
++ (maxevents >> 1) + 1 : maxevents; + + start = sys_now(); + +@@ -555,7 +565,7 @@ struct sock_wait *poll_construct_wait(int nfds) + return g_sk_wait; + } + +-static bool poll_ctl_kernel_event(int epfd, int fds_id, ++static int poll_ctl_kernel_event(int epfd, int fds_id, + const struct pollfd *new_fds, const struct pollfd *old_fds) + { + int ret; +@@ -575,11 +585,11 @@ static bool poll_ctl_kernel_event(int epfd, int fds_id, + ret |= posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, new_fds->fd, &epevent); + } + +- if (ret != 0) { ++ if (ret != 0 && errno != EINTR && errno != ENOENT) { + LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d, new_fd %d, old_fd %d\n", + errno, new_fds->fd, old_fds->fd); + } +- return true; ++ return ret; + } + + static int poll_wait_kernel_event(int epfd, struct pollfd *fds, int maxevents) +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index f53b4cd..856b24e 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -455,28 +455,6 @@ static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, s + return ret; + } + +-/* for lwip nonblock connected callback */ +-void do_lwip_connected_callback(int fd) +-{ +- bool has_kernel; +- struct lwip_sock *sock = lwip_get_socket(fd); +- if (POSIX_IS_CLOSED(sock)) { +- return; +- } +- +- has_kernel = POSIX_HAS_TYPE(sock, POSIX_KERNEL); +- POSIX_SET_TYPE(sock, POSIX_LWIP); +- if (has_kernel) { +- /* delete kernel event */ +- if (sock->sk_wait != NULL) { +- posix_api->epoll_ctl_fn(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL); +- } +- /* shutdown kernel connect, do_connect() has tried both kernel and lwip. */ +- posix_api->shutdown_fn(fd, SHUT_RDWR); +- } +- return; +-} +- + /* when fd is listenfd, listenfd of all protocol stack thread will be closed */ + static int stack_broadcast_close(int fd) + { +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index e22937f..8339750 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -378,6 +378,31 @@ static int32_t do_connect(int32_t s, const struct sockaddr *addr, socklen_t addr + return ret; + } + ++/* for lwip nonblock connected callback */ ++void do_lwip_connected_callback(int fd) ++{ ++ struct lwip_sock *sock = lwip_get_socket(fd); ++ if (POSIX_IS_CLOSED(sock)) { ++ return; ++ } ++ ++ if (POSIX_HAS_TYPE(sock, POSIX_KERNEL)) { ++ POSIX_SET_TYPE(sock, POSIX_LWIP); ++ /* delete kernel event */ ++ if (sock->sk_wait != NULL) { ++ if (sock->sk_wait->type & WAIT_EPOLL) { ++ epoll_ctl_kernel_event(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL, sock->sk_wait); ++ } ++ } ++ /* shutdown kernel connect, do_connect() has tried both kernel and lwip. 
*/ ++ posix_api->shutdown_fn(fd, SHUT_RDWR); ++ } else { ++ POSIX_SET_TYPE(sock, POSIX_LWIP); ++ } ++ ++ return; ++} ++ + static inline int32_t do_listen(int32_t s, int32_t backlog) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_KERNEL) { +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 42ebf05..7070d3f 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -392,7 +392,8 @@ static unsigned sock_event_lose_pending(const struct lwip_sock *sock, enum netco + switch (evt) { + case NETCONN_EVT_RCVMINUS: + if (sock->sk_event.events & EPOLLIN) { +- if (!NETCONN_NEED_RECV(sock) && ++ if (!sock->errevent && ++ !NETCONN_NEED_RECV(sock) && + !NETCONN_NEED_ACCEPT(sock)) { + event = EPOLLIN; + } +diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h +index 99a7162..c6d2eb3 100644 +--- a/src/lstack/include/lstack_epoll.h ++++ b/src/lstack/include/lstack_epoll.h +@@ -23,6 +23,9 @@ void poll_destruct_wait(void); + + int lstack_epoll_close(int epfd); + void epoll_api_init(posix_api_t *api); ++int epoll_ctl_kernel_event(int epfd, int op, int fd, struct epoll_event *event, ++ struct sock_wait *sk_wait); ++ + bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking); + + #endif /* _GAZELLE_EPOLL_H_ */ +-- +2.33.0 + diff --git a/0351-sockio-fix-rtw_stack_tcp_write-wrong-copied_total-af.patch b/0351-sockio-fix-rtw_stack_tcp_write-wrong-copied_total-af.patch new file mode 100644 index 0000000..09dc616 --- /dev/null +++ b/0351-sockio-fix-rtw_stack_tcp_write-wrong-copied_total-af.patch @@ -0,0 +1,36 @@ +From 0f38dd2b397cc84bfaf1eeac358ad2a916998d73 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Thu, 26 Jun 2025 10:30:45 +0800 +Subject: [PATCH] sockio: fix rtw_stack_tcp_write wrong copied_total after + append + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 4c524ab..8640931 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -683,13 +683,15 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + + copied_total = rtw_stack_tcp_append(mr, data, LWIP_MIN(TCP_MSS, total_copy_len), flags); + SOCK_WAIT_STAT(sock->sk_wait, sock_tx_merge, copied_total > 0 ? 
1 : 0); +- if (copied_total == total_copy_len) { +- return copied_total; ++ total_copy_len -= copied_total; ++ if (total_copy_len == 0) { ++ goto out; + } + + if (total_copy_len <= TCP_MSS) { + /* write one pbuf */ + copied_total += rtw_stack_tcp_write_one(sock, mr, data + copied_total, total_copy_len, flags); ++ total_copy_len -= copied_total; + } else { + if (total_copy_len > mr->app_free_count * TCP_MSS) { + total_copy_len = mr->app_free_count * TCP_MSS; +-- +2.33.0 + diff --git a/0352-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch b/0352-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch new file mode 100644 index 0000000..7abee40 --- /dev/null +++ b/0352-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch @@ -0,0 +1,349 @@ +From ad2d26c2a9316829724adbf5ed7e201ff85ae7a6 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sat, 28 Jun 2025 14:59:54 +0800 +Subject: [PATCH] sk_event: fix rtw epoll wrong event notify and remove + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 69 +++++++++++++++++++------------- + src/lstack/api/lstack_sockio.c | 2 +- + src/lstack/core/lstack_wait.c | 53 ++++++++++++++---------- + src/lstack/include/lstack_wait.h | 9 +++-- + 4 files changed, 80 insertions(+), 53 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 1a10e24..d39002b 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -53,46 +53,64 @@ static int rtw_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint3 + } + + static void rtc_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id) ++ enum netconn_evt evt, int stack_id) + { +- sk_event->pending |= pending; +- if (list_node_null(&sk_event->event_node)) { +- list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ sk_event->pending |= sock_event_hold_pending(sk_event->sock, sk_wait->type, evt, 0); ++ if (likely(sk_event->pending != 0)) { ++ if (list_node_null(&sk_event->event_node)) { ++ list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ } + } + } + +-static void rtc_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++static void rtc_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, enum netconn_evt evt) + { +- sk_event->pending &= ~pending; ++ sk_event->pending &= ~sock_event_lose_pending(sk_event->sock, evt, 0); + if (sk_event->pending == 0) { + list_del_node(&sk_event->event_node); + } + } + + static void rtw_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id) ++ enum netconn_evt evt, int stack_id) + { ++ /* call sock_event_hold_pending in lock to avoid unnecessary events: ++ * stack: mbox_enqueue -> hold_pending -> lock -> notify_event -> unlock ++ * recv: mbox_dequeue -> lose_pending -> lock -> remove_event -> unlock ++ */ ++ + #if SOCK_WAIT_BATCH_NOTIFY + if (likely(stack_id >= 0)) { +- lwip_wait_add_notify(sk_wait, sk_event, pending, stack_id); ++ lwip_wait_add_notify(sk_wait, sk_event, evt, stack_id); + return; + } + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + + rte_spinlock_lock(&sk_wait->epcb.lock); +- sk_event->pending |= pending; +- if (list_node_null(&sk_event->event_node)) { +- list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ sk_event->pending |= sock_event_hold_pending(sk_event->sock, sk_wait->type, evt, 0); ++ if (likely(sk_event->pending != 0)) { ++ if 
(list_node_null(&sk_event->event_node)) { ++ list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); ++ } + } + rte_spinlock_unlock(&sk_wait->epcb.lock); + + sys_sem_signal_internal(&sk_wait->sem); + } + +-static void rtw_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++static void rtw_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, enum netconn_evt evt) + { ++ /* call sock_event_hold_pending in lock to avoid wrong remove: ++ * stack: mbox_enqueue -> hold_pending -> lock -> notify_event -> unlock ++ * recv: lose_pending -> lock -> remove_event -> unlock ++ */ ++ ++ if (sock_event_lose_pending(sk_event->sock, evt, 0) == 0) { ++ return; ++ } ++ + rte_spinlock_lock(&sk_wait->epcb.lock); +- sk_event->pending &= ~pending; ++ sk_event->pending &= ~sock_event_lose_pending(sk_event->sock, evt, 0); + if (sk_event->pending == 0) { + list_del_node(&sk_event->event_node); + } +@@ -100,24 +118,24 @@ static void rtw_epoll_remove_event(struct sock_wait *sk_wait, struct sock_event + } + + static void rtc_poll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id) ++ enum netconn_evt evt, int stack_id) + { + } +-static void rtc_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++static void rtc_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, enum netconn_evt evt) + { + } + static void rtw_poll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id) ++ enum netconn_evt evt, int stack_id) + { + #if SOCK_WAIT_BATCH_NOTIFY + if (likely(stack_id >= 0)) { +- lwip_wait_add_notify(sk_wait, NULL, 0, stack_id); ++ lwip_wait_add_notify(sk_wait, NULL, evt, stack_id); + return; + } + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + sys_sem_signal_internal(&sk_wait->sem); + } +-static void rtw_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending) ++static void rtw_poll_remove_event(struct sock_wait *sk_wait, struct sock_event *sk_event, enum netconn_evt evt) + { + } + +@@ -284,7 +302,6 @@ int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) + struct lwip_sock *sock = lwip_get_socket(fd); + struct sock_wait *sk_wait = epsock->sk_wait; + struct sock_event *sk_event; +- unsigned pending; + + if (epfd < 0 || fd < 0 || epfd == fd || \ + (event == NULL && op != EPOLL_CTL_DEL)) { +@@ -316,10 +333,9 @@ int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) + sk_event->events = event->events | EPOLLERR | EPOLLHUP; + sk_event->ep_data = event->data; + +- pending = sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_RCVPLUS, 0) | +- sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_SENDPLUS, 0) | +- sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_ERROR, 0); +- sk_wait->notify_fn(sk_wait, sk_event, pending, -1); ++ sk_wait->notify_fn(sk_wait, sk_event, NETCONN_EVT_RCVPLUS, -1); ++ sk_wait->notify_fn(sk_wait, sk_event, NETCONN_EVT_SENDPLUS, -1); ++ sk_wait->notify_fn(sk_wait, sk_event, NETCONN_EVT_ERROR, -1); + + sk_wait->lwip_nfds++; + sk_wait->affinity.stack_nfds[sock->stack_id]++; +@@ -327,10 +343,9 @@ int lstack_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) + case EPOLL_CTL_DEL: + sk_event->events = 0; + +- pending = sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_RCVMINUS, 0) | +- sock_event_hold_pending(sock, WAIT_EPOLL, NETCONN_EVT_SENDMINUS, 0) | +- sock_event_hold_pending(sock, 
WAIT_EPOLL, NETCONN_EVT_ERROR, 0); +- sk_wait->remove_fn(sk_wait, sk_event, pending); ++ sk_wait->remove_fn(sk_wait, sk_event, NETCONN_EVT_RCVMINUS); ++ sk_wait->remove_fn(sk_wait, sk_event, NETCONN_EVT_SENDMINUS); ++ sk_wait->remove_fn(sk_wait, sk_event, NETCONN_EVT_ERROR); + + sk_wait->lwip_nfds--; + sk_wait->affinity.stack_nfds[sock->stack_id]--; +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 8640931..418311e 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -1461,7 +1461,7 @@ int do_lwip_init_sock(int fd) + } + + sock->sk_wait = NULL; +- ret = sock_event_init(&sock->sk_event); ++ ret = sock_event_init(&sock->sk_event, sock); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "sock_event_init failed\n"); + return -1; +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 7070d3f..4714742 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -256,10 +256,11 @@ void affinity_bind_stack(struct sock_wait *sk_wait, struct wait_affinity *affini + } + } + +-int sock_event_init(struct sock_event *sk_event) ++int sock_event_init(struct sock_event *sk_event, struct lwip_sock *sock) + { + memset_s(sk_event, sizeof(struct sock_event), 0, sizeof(struct sock_event)); + ++ sk_event->sock = sock; + list_init_node(&sk_event->event_node); + #if SOCK_WAIT_BATCH_NOTIFY + list_init_node(&sk_event->stk_event_node); +@@ -278,6 +279,7 @@ void sock_event_free(struct sock_event *sk_event, struct sock_wait *sk_wait) + list_del_node(&sk_event->stk_event_node); + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + } ++ sk_event->sock = NULL; + } + + int sock_wait_common_init(struct sock_wait *sk_wait) +@@ -385,7 +387,7 @@ static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) + return false; + } + +-static unsigned sock_event_lose_pending(const struct lwip_sock *sock, enum netconn_evt evt, unsigned len) ++unsigned sock_event_lose_pending(const struct lwip_sock *sock, enum netconn_evt evt, unsigned len) + { + unsigned event = 0; + +@@ -421,8 +423,7 @@ unsigned sock_event_hold_pending(const struct lwip_sock *sock, + switch (evt) { + case NETCONN_EVT_RCVPLUS: + if (sock->sk_event.events & EPOLLIN || type & WAIT_BLOCK) { +- if (len > 0 || +- NETCONN_NEED_RECV(sock) || ++ if (NETCONN_NEED_RECV(sock) || + NETCONN_NEED_ACCEPT(sock)) { + event = EPOLLIN; + } +@@ -430,8 +431,7 @@ unsigned sock_event_hold_pending(const struct lwip_sock *sock, + break; + case NETCONN_EVT_SENDPLUS: + if (sock->sk_event.events & EPOLLOUT || type & WAIT_BLOCK) { +- if (len > 0 || +- NETCONN_ALLOW_SEND(sock)) { ++ if (NETCONN_ALLOW_SEND(sock)) { + event = EPOLLOUT; + } + } +@@ -460,11 +460,7 @@ void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, uns + sock->sk_wait = NULL; + return; + } +- +- unsigned pending = sock_event_lose_pending(sock, evt, 0); +- if (pending) { +- sock->sk_wait->remove_fn(sock->sk_wait, &sock->sk_event, pending); +- } ++ sock->sk_wait->remove_fn(sock->sk_wait, &sock->sk_event, evt); + } + + void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len) +@@ -479,22 +475,18 @@ void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, uns + sock->sk_wait = NULL; + return; + } +- +- unsigned pending = sock_event_hold_pending(sock, sock->sk_wait->type, evt, len); +- if (pending) { +- sock->sk_wait->notify_fn(sock->sk_wait, &sock->sk_event, pending, sock->stack_id); +- } ++ sock->sk_wait->notify_fn(sock->sk_wait, &sock->sk_event, evt, 
sock->stack_id); + } + + #if SOCK_WAIT_BATCH_NOTIFY + /* Only allow stack call */ + void lwip_wait_add_notify(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id) ++ enum netconn_evt evt, int stack_id) + { + struct lwip_wait *lwait = lwip_wait_get(stack_id); + + if (sk_event != NULL) { +- sk_event->stk_pending |= pending; ++ sk_event->stk_evts |= evt; + if (list_node_null(&sk_event->stk_event_node)) { + list_add_node(&sk_event->stk_event_node, &sk_wait->stk_event_list[stack_id]); + } +@@ -505,6 +497,23 @@ void lwip_wait_add_notify(struct sock_wait *sk_wait, struct sock_event *sk_event + } + } + ++static unsigned sock_event_get_pending(struct sock_event *sk_event, enum netconn_evt evts) ++{ ++ unsigned pending = 0; ++ ++ if (evts & NETCONN_EVT_SENDPLUS) { ++ pending |= sock_event_hold_pending(sk_event->sock, WAIT_EPOLL, NETCONN_EVT_SENDPLUS, 0); ++ } ++ if (evts & NETCONN_EVT_RCVPLUS) { ++ pending |= sock_event_hold_pending(sk_event->sock, WAIT_EPOLL, NETCONN_EVT_RCVPLUS, 0); ++ } ++ if (evts & NETCONN_EVT_ERROR) { ++ pending |= sock_event_hold_pending(sk_event->sock, WAIT_EPOLL, NETCONN_EVT_ERROR, 0); ++ } ++ sk_event->stk_evts = 0; ++ return pending; ++} ++ + static inline + unsigned sock_wait_foreach_event(struct sock_wait *sk_wait, int stack_id) + { +@@ -523,12 +532,14 @@ unsigned sock_wait_foreach_event(struct sock_wait *sk_wait, int stack_id) + sk_event = container_of(node, struct sock_event, stk_event_node); + + /* see rtw_epoll_notify_event() */ +- sk_event->pending |= sk_event->stk_pending; ++ sk_event->pending |= sock_event_get_pending(sk_event, sk_event->stk_evts); ++ if (unlikely(sk_event->pending == 0)) { ++ continue; ++ } ++ + if (list_node_null(&sk_event->event_node)) { + list_add_node(&sk_event->event_node, &sk_wait->epcb.event_list); + } +- +- sk_event->stk_pending = 0; + count++; + } + +diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h +index 58da126..8ed2e93 100644 +--- a/src/lstack/include/lstack_wait.h ++++ b/src/lstack/include/lstack_wait.h +@@ -67,9 +67,9 @@ struct sock_wait { + int (*timedwait_fn)(struct sock_wait *sk_wait, int timeout, uint32_t start); + /* trigger event */ + void (*notify_fn)(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id); ++ enum netconn_evt evt, int stack_id); + /* remove event */ +- void (*remove_fn)(struct sock_wait *sk_wait, struct sock_event *sk_event, unsigned pending); ++ void (*remove_fn)(struct sock_wait *sk_wait, struct sock_event *sk_event, enum netconn_evt evt); + + /* dfx stat */ + struct list_node group_node; +@@ -117,17 +117,18 @@ int kernel_wait_ctl(struct sock_wait *sk_wait, int new_stack_id, int old_stack_i + + #if SOCK_WAIT_BATCH_NOTIFY + void lwip_wait_add_notify(struct sock_wait *sk_wait, struct sock_event *sk_event, +- unsigned pending, int stack_id); ++ enum netconn_evt evt, int stack_id); + unsigned lwip_wait_foreach_notify(int stack_id); + bool lwip_wait_notify_empty(int stack_id); + #endif /* SOCK_WAIT_BATCH_NOTIFY */ + ++unsigned sock_event_lose_pending(const struct lwip_sock *sock, enum netconn_evt evt, unsigned len); + unsigned sock_event_hold_pending(const struct lwip_sock *sock, + enum sock_wait_type type, enum netconn_evt evt, unsigned len); + void sock_event_notify_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); + void sock_event_remove_pending(struct lwip_sock *sock, enum netconn_evt evt, unsigned len); + +-int sock_event_init(struct sock_event *sk_event); ++int 
sock_event_init(struct sock_event *sk_event, struct lwip_sock *sock); + void sock_event_free(struct sock_event *sk_event, struct sock_wait *sk_wait); + + int sock_wait_common_init(struct sock_wait *sk_wait); +-- +2.33.0 + diff --git a/0353-poll-fix-do_lwip_connected_callback-not-delete-poll-.patch b/0353-poll-fix-do_lwip_connected_callback-not-delete-poll-.patch new file mode 100644 index 0000000..2912a3d --- /dev/null +++ b/0353-poll-fix-do_lwip_connected_callback-not-delete-poll-.patch @@ -0,0 +1,158 @@ +From 4931694a535c2371645ab58669e80a6a204a1dd1 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Wed, 2 Jul 2025 09:36:12 +0800 +Subject: [PATCH] poll: fix do_lwip_connected_callback not delete poll kernel + fd + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 73 +++++++++++++++++++------------ + src/lstack/api/lstack_wrap.c | 2 + + src/lstack/include/lstack_epoll.h | 1 + + 3 files changed, 49 insertions(+), 27 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index d39002b..949e10b 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -580,43 +580,62 @@ struct sock_wait *poll_construct_wait(int nfds) + return g_sk_wait; + } + +-static int poll_ctl_kernel_event(int epfd, int fds_id, +- const struct pollfd *new_fds, const struct pollfd *old_fds) +-{ +- int ret; ++struct poll_kernel_data { ++ union { ++ struct { ++ uint32_t fd; ++ uint16_t fds_id; ++ }; ++ void *ptr; ++ }; ++}; ++int poll_ctl_kernel_event(int epfd, int fds_id, int old_fd, const struct pollfd *new_fds) ++{ ++ int ret = 0; + struct epoll_event epevent; ++ struct poll_kernel_data pdata; + +- LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(epfd=%d, old_fd=%d, new_fd=%d)\n", +- __FUNCTION__, epfd, old_fds->fd, new_fds->fd)); +- +- epevent.data.fd = fds_id; +- epevent.events = new_fds->events; ++ RTE_BUILD_BUG_ON(sizeof(struct poll_kernel_data) > sizeof(void *)); + + /* EPOLL_CTL_MOD may not be any events, but why? */ +- if (old_fds->fd == 0) { +- ret = posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, new_fds->fd, &epevent); +- } else { +- ret = posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_DEL, old_fds->fd, NULL); ++ if (old_fd > 0) { ++ ret = posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_DEL, old_fd, NULL); ++ } ++ if (new_fds != NULL && new_fds->fd > 0) { ++ pdata.fd = new_fds->fd; ++ pdata.fds_id = fds_id; ++ epevent.data.ptr = pdata.ptr; ++ epevent.events = new_fds->events; + ret |= posix_api->epoll_ctl_fn(epfd, EPOLL_CTL_ADD, new_fds->fd, &epevent); + } + ++ LWIP_DEBUGF(SOCKETS_DEBUG, ("%s(epfd=%d, fds_id %d, old_fd=%d, new_fd=%d)\n", ++ __FUNCTION__, epfd, fds_id, old_fd, new_fds ? new_fds->fd : -1)); ++ + if (ret != 0 && errno != EINTR && errno != ENOENT) { +- LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d, new_fd %d, old_fd %d\n", +- errno, new_fds->fd, old_fds->fd); ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl failed, errno %d, fds_id %d, old_fd %d, new_fd %d\n", ++ errno, fds_id, old_fd, new_fds ? 
new_fds->fd : -1); + } + return ret; + } + +-static int poll_wait_kernel_event(int epfd, struct pollfd *fds, int maxevents) ++static int poll_wait_kernel_event(int epfd, const struct poll_cb *pcb, struct pollfd *fds, int maxevents) + { + struct epoll_event epevents[POLL_MAX_EVENTS]; +- int num = 0; +- int i, fds_id; +- +- num = posix_api->epoll_wait_fn(epfd, epevents, maxevents, 0); +- for (i = 0; i < num; ++i) { +- fds_id = epevents[i].data.fd; +- fds[fds_id].revents = epevents[i].events; ++ struct poll_kernel_data pdata; ++ int ret, i, num = 0; ++ ++ ret = posix_api->epoll_wait_fn(epfd, epevents, maxevents, 0); ++ for (i = 0; i < ret; ++i) { ++ pdata.ptr = epevents[i].data.ptr; ++ if (pdata.fd != fds[pdata.fds_id].fd) { ++ poll_ctl_kernel_event(epfd, pdata.fds_id, pdata.fd, NULL); ++ continue; ++ } ++ /* may be already counted by poll_scan_lwip_event() */ ++ if (fds[pdata.fds_id].revents == 0) ++ num++; ++ fds[pdata.fds_id].revents |= epevents[i].events; + } + + return num; +@@ -635,14 +654,14 @@ static void poll_prepare_wait(struct sock_wait *sk_wait, struct pollfd *fds, nfd + 0, sizeof(sk_wait->affinity.stack_nfds)); + + for (i = 0; i < nfds; ++i) { ++ fds[i].revents = 0; + fd = fds[i].fd; + sock = lwip_get_socket(fd); + sk_type = select_sock_posix_path(sock); + + if (sk_type & POSIX_KERNEL) { +- poll_ctl_kernel_event(sk_wait->epfd, i, &fds[i], +- &pcb->kernel_fds[sk_wait->kernel_nfds]); +- pcb->kernel_fds[sk_wait->kernel_nfds] = fds[i]; ++ poll_ctl_kernel_event(sk_wait->epfd, i, pcb->kernel_fds[i].fd, &fds[i]); ++ pcb->kernel_fds[i] = fds[i]; + sk_wait->kernel_nfds++; + } + +@@ -723,7 +742,7 @@ int lstack_poll(struct pollfd *fds, nfds_t nfds, int timeout) + } + + if (sk_wait->kernel_nfds > 0 && rte_atomic16_read(&sk_wait->kernel_pending)) { +- kernel_num = poll_wait_kernel_event(sk_wait->epfd, fds, sk_wait->kernel_nfds); ++ kernel_num = poll_wait_kernel_event(sk_wait->epfd, &sk_wait->pcb, fds, sk_wait->kernel_nfds); + if (kernel_num == 0 && errno != EINTR && errno != EAGAIN) { + rte_atomic16_set(&sk_wait->kernel_pending, false); + } +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 8339750..6523dc2 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -392,6 +392,8 @@ void do_lwip_connected_callback(int fd) + if (sock->sk_wait != NULL) { + if (sock->sk_wait->type & WAIT_EPOLL) { + epoll_ctl_kernel_event(sock->sk_wait->epfd, EPOLL_CTL_DEL, fd, NULL, sock->sk_wait); ++ } else if (sock->sk_wait->type & WAIT_POLL) { ++ poll_ctl_kernel_event(sock->sk_wait->epfd, 0, fd, NULL); + } + } + /* shutdown kernel connect, do_connect() has tried both kernel and lwip. 
*/ +diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h +index c6d2eb3..94e872d 100644 +--- a/src/lstack/include/lstack_epoll.h ++++ b/src/lstack/include/lstack_epoll.h +@@ -25,6 +25,7 @@ int lstack_epoll_close(int epfd); + void epoll_api_init(posix_api_t *api); + int epoll_ctl_kernel_event(int epfd, int op, int fd, struct epoll_event *event, + struct sock_wait *sk_wait); ++int poll_ctl_kernel_event(int epfd, int fds_id, int old_fd, const struct pollfd *new_fds); + + bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking); + +-- +2.33.0 + diff --git a/0354-sockctl-fix-rtw-broadcast-close-and-shutdown.patch b/0354-sockctl-fix-rtw-broadcast-close-and-shutdown.patch new file mode 100644 index 0000000..ed0d99b --- /dev/null +++ b/0354-sockctl-fix-rtw-broadcast-close-and-shutdown.patch @@ -0,0 +1,53 @@ +From 930c68e37f09c0ad3f5e56ae5407ad397afc48cb Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 08:58:03 +0800 +Subject: [PATCH] sockctl: fix rtw broadcast close and shutdown + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockctl.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index 856b24e..a224303 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -461,7 +461,7 @@ static int stack_broadcast_close(int fd) + int ret = 0; + struct lwip_sock *sock = lwip_get_socket(fd); + +- while (sock != NULL) { ++ do { + if (POSIX_IS_CLOSED(sock)) { + ret = -1; + break; +@@ -469,7 +469,7 @@ static int stack_broadcast_close(int fd) + fd = sock->conn->callback_arg.socket; + ret |= rpc_call_close(sock->stack_id, fd); + sock = sock->listen_next; +- } ++ } while (sock != NULL); + + if (ret != 0) { + GAZELLE_RETURN(EBADF); +@@ -482,7 +482,7 @@ static int stack_broadcast_shutdown(int fd, int how) + int ret = 0; + struct lwip_sock *sock = lwip_get_socket(fd); + +- while (true) { ++ do { + if (POSIX_IS_CLOSED(sock)) { + ret = -1; + break; +@@ -490,7 +490,7 @@ static int stack_broadcast_shutdown(int fd, int how) + fd = sock->conn->callback_arg.socket; + ret |= rpc_call_shutdown(sock->stack_id, fd, how); + sock = sock->listen_next; +- } ++ } while (sock != NULL); + + if (ret != 0) { + GAZELLE_RETURN(EBADF); +-- +2.33.0 + diff --git a/0355-mempool-increase-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch b/0355-mempool-increase-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch new file mode 100644 index 0000000..ed3c4f5 --- /dev/null +++ b/0355-mempool-increase-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch @@ -0,0 +1,26 @@ +From f807fda71067c85f436a294746afc4a56aebe8fa Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 08:58:37 +0800 +Subject: [PATCH] mempool: increase MEM_THREAD_MANAGER_FREE_S to avoid poll + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_mempool.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 70ac8c6..3f447f0 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -27,7 +27,7 @@ + #define MEM_THREAD_MAX_PATH 32 /* cat /proc/sys/kernel/pid_max */ + #define MEM_THREAD_FLUSH_SIG (SIGRTMIN + 11) + #define MEM_THREAD_MANAGER_FLUSH_MS 100 +-#define MEM_THREAD_MANAGER_FREE_S 2 ++#define MEM_THREAD_MANAGER_FREE_S 20 + #define MEM_THREAD_MANAGER_FREE_MAX 64 + + struct mem_thread_manager { +-- +2.33.0 + diff --git 
a/0356-sockio-fix-callback_tcp_send-output-too-many-at-once.patch b/0356-sockio-fix-callback_tcp_send-output-too-many-at-once.patch new file mode 100644 index 0000000..ee77df0 --- /dev/null +++ b/0356-sockio-fix-callback_tcp_send-output-too-many-at-once.patch @@ -0,0 +1,76 @@ +From 0199d6397ae4f6c6fe76ed2d741f3b0e72546af2 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 08:59:39 +0800 +Subject: [PATCH] sockio: fix callback_tcp_send output too many at once + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 33 ++++++++++++++++++++++----------- + 1 file changed, 22 insertions(+), 11 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 418311e..05e82d7 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -964,36 +964,47 @@ static void callback_tcp_send(struct rpc_msg *sendmsg) + struct protocol_stack *stack = get_protocol_stack(); + struct lwip_sock *sock = sendmsg->args[MSG_ARG_0].p; + struct mem_thread *mt = sendmsg->args[MSG_ARG_1].p; ++ const struct tcp_pcb *pcb = sock->conn->pcb.tcp; ++ const struct mbox_ring *mr = &sock->conn->sendmbox->mring; + bool output_again; + err_t err; + +- if (unlikely(sock->conn->pcb.tcp == NULL)) ++ if (unlikely(pcb == NULL)) + return; + ++#if GAZELLE_TCP_ASYNC_RECVD ++ struct rpc_msg *recvmsg; ++ if (RECVD_UNSUBMITED(sendmsg)) { ++ RECVD_UNSUBMITED(sendmsg) = 0; ++ recvmsg = sock_mbox_private_get(sock->conn->recvmbox); ++ callback_tcp_recvd(recvmsg); ++ } ++#endif /* GAZELLE_TCP_ASYNC_RECVD */ ++ ++ /* Defer if LWIP_MIN(snd_wnd, cwnd) limits output. */ ++ if (pcb->unsent != NULL && pcb->unacked != NULL) { ++ rpc_async_call(&stack->rpc_queue, sendmsg, RPC_MSG_REUSE | RPC_MSG_RECALL); ++ return; ++ } ++ + if (get_protocol_stack_group()->latency_start) + calculate_sock_latency(sock, GAZELLE_LATENCY_WRITE_RPC_MSG); + ++ sendmsg->result = 0; + do { +- if (!lwip_tcp_allow_send(sock->conn->pcb.tcp)) { ++ /* Do not output too many bufs at once. 
*/ ++ if (sendmsg->result >= mr->ops->get_capacity(mr) || !lwip_tcp_allow_send(sock->conn->pcb.tcp)) { + rpc_async_call(&stack->rpc_queue, sendmsg, RPC_MSG_REUSE | RPC_MSG_RECALL); + break; + } + sendmsg->result += rtw_stack_tcp_output(sock->conn, &output_again, mt); + } while (output_again); ++ + err = tcp_output(sock->conn->pcb.tcp); + if (unlikely(err != ERR_OK)) { + LSTACK_LOG(ERR, LSTACK, "tcp_output failed, sock %p, err %u\n", sock, err); + } + +-#if GAZELLE_TCP_ASYNC_RECVD +- struct rpc_msg *recvmsg; +- if (RECVD_UNSUBMITED(sendmsg)) { +- RECVD_UNSUBMITED(sendmsg) = 0; +- recvmsg = sock_mbox_private_get(sock->conn->recvmbox); +- callback_tcp_recvd(recvmsg); +- } +-#endif /* GAZELLE_TCP_ASYNC_RECVD */ +- + return; + } + +-- +2.33.0 + diff --git a/0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch b/0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch new file mode 100644 index 0000000..d6d8e13 --- /dev/null +++ b/0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch @@ -0,0 +1,336 @@ +From 635e86630da1ebfe5955b1598f8adb5a9d547a2f Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 09:00:42 +0800 +Subject: [PATCH] sockio: fix sendmbox full return EWOULDBLOCK + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 34 ++++++++----- + src/lstack/api/lstack_sockctl.c | 8 +-- + src/lstack/api/lstack_sockio.c | 83 +++++++++++++++++++------------ + src/lstack/include/lstack_epoll.h | 2 +- + 4 files changed, 76 insertions(+), 51 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index 949e10b..a07968a 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -439,7 +439,7 @@ int lstack_epoll_wait(int epfd, struct epoll_event* events, int maxevents, int t + lwip_maxevents = sk_wait->kernel_nfds > 0 ? + (maxevents >> 1) + 1 : maxevents; + +- start = sys_now(); ++ start = timeout <= 0 ? 0 : sys_now(); + + /* RTC try to recv polling. */ + sk_wait->timedwait_fn(sk_wait, 0, start); +@@ -732,7 +732,7 @@ int lstack_poll(struct pollfd *fds, nfds_t nfds, int timeout) + return posix_api->poll_fn(fds, nfds, timeout); + } + +- start = sys_now(); ++ start = timeout <= 0 ? 0 : sys_now(); + + /* RTC try to recv polling. */ + sk_wait->timedwait_fn(sk_wait, 0, start); +@@ -861,14 +861,17 @@ void epoll_api_init(posix_api_t *api) + api->select_fn = lstack_select; + } + +-bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking) ++bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool nonblocking) + { + bool rtc_mode = get_global_cfg_params()->stack_mode_rtc; +- uint32_t start; +- int timeout; + unsigned pending = 0; ++ uint32_t start; ++ int timeout = -1; ++ int old_errno; ++ ++ nonblocking |= netconn_is_nonblocking(sock->conn); + +- if (!rtc_mode && noblocking) ++ if (!rtc_mode && nonblocking) + return false; + + if (unlikely(sock->sk_wait == NULL) || sock->sk_wait->type == WAIT_CLOSE) { +@@ -879,14 +882,21 @@ bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocki + rte_wmb(); + } + ++ old_errno = errno; + if (rtc_mode) { + /* RTC try to recv polling. */ + sock->sk_wait->timedwait_fn(sock->sk_wait, 0, 0); +- return true; ++ if (nonblocking) { ++ errno = old_errno; ++ return false; ++ } + } + +- timeout = sock->conn->recv_timeout == 0 ? -1 : sock->conn->recv_timeout; +- start = sys_now(); ++ if (evt == NETCONN_EVT_RCVPLUS && sock->conn->recv_timeout > 0) { ++ timeout = sock->conn->recv_timeout; ++ } ++ ++ start = timeout <= 0 ? 
0 : sys_now(); + do { + pending = sock_event_hold_pending(sock, WAIT_BLOCK, evt, 0) | + sock_event_hold_pending(sock, WAIT_BLOCK, NETCONN_EVT_ERROR, 0); +@@ -896,13 +906,11 @@ bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocki + timeout = sock->sk_wait->timedwait_fn(sock->sk_wait, timeout, start); + } while (timeout > 0 || (timeout < 0 && errno == 0)); + +- if (errno == ETIMEDOUT) { +- errno = EAGAIN; +- } +- + if (evt == NETCONN_EVT_SENDPLUS) { + /* remove WAIT_BLOCK type */ + sock->sk_wait->type &= ~WAIT_BLOCK; + } ++ ++ errno = old_errno; + return pending != 0; + } +diff --git a/src/lstack/api/lstack_sockctl.c b/src/lstack/api/lstack_sockctl.c +index a224303..906d228 100644 +--- a/src/lstack/api/lstack_sockctl.c ++++ b/src/lstack/api/lstack_sockctl.c +@@ -448,7 +448,7 @@ static int rpc_call_connect(int stack_id, int fd, const struct sockaddr *addr, s + + if (ret < 0 && errno == EINPROGRESS) { + struct lwip_sock *sock = lwip_get_socket(fd); +- if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, netconn_is_nonblocking(sock->conn))) { ++ if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, false)) { + ret = 0; + } + } +@@ -585,7 +585,7 @@ static int stack_broadcast_accept4(int fd, struct sockaddr *addr, socklen_t *add + + min_sock = get_min_accept_sock(fd); + if (min_sock == NULL) { +- if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, flags & SOCK_NONBLOCK)) { + min_sock = get_min_accept_sock(fd); + } + } +@@ -764,7 +764,7 @@ static int rtc_connect(int s, const struct sockaddr *name, socklen_t namelen) + ret = lwip_connect(s, name, namelen); + if (ret < 0 && errno == EINPROGRESS) { + struct lwip_sock *sock = lwip_get_socket(s); +- if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, netconn_is_nonblocking(sock->conn))) { ++ if (sock_event_wait(sock, NETCONN_EVT_SENDPLUS, false)) { + ret = 0; + } + } +@@ -782,7 +782,7 @@ static int rtc_accept4(int s, struct sockaddr *addr, socklen_t *addrlen, int fla + + ret = lwip_accept4(s, addr, addrlen, flags); + if (ret < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & SOCK_NONBLOCK))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, flags & SOCK_NONBLOCK)) { + ret = lwip_accept4(s, addr, addrlen, flags); + } + } +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 05e82d7..b5b203f 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -359,7 +359,7 @@ static ssize_t stack_udp_write(struct lwip_sock *sock, const void *data, size_t + mr->app_free_count = mr->ops->free_count(mr); + if (unlikely(mr->app_free_count < 1)) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- set_errno(EWOULDBLOCK); ++ set_errno(ENOBUFS); + return -1; + } + } +@@ -370,7 +370,10 @@ static ssize_t stack_udp_write(struct lwip_sock *sock, const void *data, size_t + copied_total = stack_udp_write_bulk(sock, mr, data, len, flags, to, tolen); + } + +- return copied_total > 0 ? 
copied_total : -1; ++ if (likely(copied_total > 0)) ++ return copied_total; ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ return -1; + } + + static ssize_t stack_udp_output(struct netconn *conn, bool *output_again, struct mem_thread *mt) +@@ -676,7 +679,7 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + mr->app_free_count = mr->ops->free_count(mr); + if (unlikely(mr->app_free_count < 2)) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- set_errno(EWOULDBLOCK); ++ set_errno(ENOBUFS); + return -1; + } + } +@@ -709,6 +712,8 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + } + + out: ++ if (unlikely(total_copy_len > 0)) ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + return copied_total > 0 ? copied_total : -1; + } + +@@ -1058,11 +1063,6 @@ static ssize_t rtc_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + } + + total_copy_len = LWIP_MIN((uint32_t)len, (uint32_t)pcb->snd_buf); +- if (unlikely(total_copy_len == 0)) { +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); +- set_errno(EWOULDBLOCK); +- return -1; +- } + + while (total_copy_len > 0) { + if (total_copy_len <= TCP_SND_SIZE_MAX) { +@@ -1074,14 +1074,23 @@ static ssize_t rtc_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + + err = tcp_write(pcb, data + copied_total, buf_copy_len, write_flags | write_more); + if (err != ERR_OK) { +- LSTACK_LOG(ERR, LSTACK, "tcp_write failed, errno %d\n", err_to_errno(err)); + break; + } + total_copy_len -= buf_copy_len; + copied_total += buf_copy_len; + } + +- if (copied_total > 0) { ++ /* if OK or memory error, check available space */ ++ if (err == ERR_OK || err == ERR_MEM) { ++ if (!lwip_tcp_allow_send(sock->conn->pcb.tcp)) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ err = ERR_BUF; ++ } else if (total_copy_len > 0) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ } ++ } ++ ++ if (likely(copied_total > 0)) { + return copied_total; + } + set_errno(err_to_errno(err)); +@@ -1120,7 +1129,7 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + /* TODO: support MSG_WAITALL */ + recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, flags & MSG_DONTWAIT)) { + recvd = ioops.stack_tcp_read(sock, mem, len, flags, from, fromlen); + } + } +@@ -1140,7 +1149,7 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + msg.msg_namelen = (fromlen ? 
*fromlen : 0); + recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, flags & MSG_DONTWAIT)) { + recvd = ioops.stack_udp_readmsg(sock, &msg, len, flags); + } + } +@@ -1186,7 +1195,7 @@ ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + case NETCONN_UDP: + recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + if (recvd < 0 && errno == EWOULDBLOCK) { +- if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, netconn_is_nonblocking(sock->conn) || (flags & MSG_DONTWAIT))) { ++ if (sock_event_wait(sock, NETCONN_EVT_RCVPLUS, flags & MSG_DONTWAIT)) { + recvd = ioops.stack_udp_readmsg(sock, msg, len, flags); + } + } +@@ -1220,22 +1229,24 @@ ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, + + switch (NETCONN_TYPE(sock->conn)) { + case NETCONN_TCP: +- ret = ioops.stack_tcp_write(sock, mem, len, flags); +- if (ret < 0) { +- if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); +- } +- } else { ++ do { ++ ret = ioops.stack_tcp_write(sock, mem, len, flags); ++ if (likely(ret > 0) || errno != ENOBUFS) ++ break; ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); ++ } while (true); ++ if (ret > 0) { + ioops.stack_tcp_send(sock); + } + break; + case NETCONN_UDP: +- ret = ioops.stack_udp_write(sock, mem, len, flags, to, tolen); +- if (ret < 0) { +- if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); +- } +- } else { ++ do { ++ ret = ioops.stack_udp_write(sock, mem, len, flags, to, tolen); ++ if (likely(ret > 0) || errno != ENOBUFS) ++ break; ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); ++ } while (true); ++ if (ret > 0) { + ioops.stack_udp_send(sock); + } + break; +@@ -1271,11 +1282,15 @@ ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + if (i == msg->msg_iovlen - 1) { + write_more = 0; + } +- ret = ioops.stack_tcp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more); ++ do { ++ ret = ioops.stack_tcp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more); ++ if (likely(ret > 0) || errno != ENOBUFS) ++ break; ++ if (written > 0) ++ ioops.stack_tcp_send(sock); ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); ++ } while (true); + if (ret < 0) { +- if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); +- } + break; + } + written += ret; +@@ -1289,11 +1304,13 @@ ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + if (i == msg->msg_iovlen - 1) { + write_more = 0; + } +- ret = ioops.stack_udp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more, NULL, 0); ++ do { ++ ret = ioops.stack_udp_write(sock, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len, flags | write_more, NULL, 0); ++ if (likely(ret > 0) || errno != ENOBUFS) ++ break; ++ sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); ++ } while (true); + if (ret < 0) { +- if (errno == EWOULDBLOCK) { +- sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true); +- } + break; + } + written += ret; +diff --git a/src/lstack/include/lstack_epoll.h b/src/lstack/include/lstack_epoll.h +index 94e872d..844ef9d 100644 +--- a/src/lstack/include/lstack_epoll.h ++++ b/src/lstack/include/lstack_epoll.h +@@ -27,6 +27,6 @@ int epoll_ctl_kernel_event(int epfd, int op, int fd, struct epoll_event *event, + struct sock_wait 
*sk_wait); + int poll_ctl_kernel_event(int epfd, int fds_id, int old_fd, const struct pollfd *new_fds); + +-bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool noblocking); ++bool sock_event_wait(struct lwip_sock *sock, enum netconn_evt evt, bool nonblocking); + + #endif /* _GAZELLE_EPOLL_H_ */ +-- +2.33.0 + diff --git a/0358-sk_wait-igonre-mem_thread-flush-signal.patch b/0358-sk_wait-igonre-mem_thread-flush-signal.patch new file mode 100644 index 0000000..e80b167 --- /dev/null +++ b/0358-sk_wait-igonre-mem_thread-flush-signal.patch @@ -0,0 +1,112 @@ +From f388e26bb66ee37fcfdb0adc357dbb4c98a03b64 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Fri, 4 Jul 2025 20:24:36 +0800 +Subject: [PATCH] sk_wait: ignore mem_thread flush signal + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_epoll.c | 14 ++++++++++++-- + src/lstack/core/lstack_mempool.c | 15 ++++++++++++++- + src/lstack/include/lstack_mempool.h | 1 + + 3 files changed, 27 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index a07968a..0889f13 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -28,6 +28,7 @@ + #include "lstack_cfg.h" + #include "lstack_log.h" + #include "lstack_protocol_stack.h" ++#include "lstack_mempool.h" + + #define POLL_MAX_EVENTS 32 + +@@ -36,20 +37,29 @@ static PER_THREAD struct sock_wait *g_sk_wait = NULL; + + static int rtc_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint32_t start) + { ++ mem_thread_ignore_flush_intr(); ++ + stack_polling(0); + + if (timeout > 0 && timeout <= (int)(sys_now() - start)) { + timeout = 0; + } else if (timeout < 0) { +- errno = 0; ++ if (errno != EINTR || mem_thread_ignore_flush_intr()) { ++ errno = 0; ++ } + } + return timeout; + } + + static int rtw_sock_wait_timedwait(struct sock_wait *sk_wait, int timeout, uint32_t start) + { ++ int ret; + /* when sem interrupted by signals, errno = EINTR */ +- return sys_sem_wait_internal(&sk_wait->sem, timeout); ++ mem_thread_ignore_flush_intr(); ++ do { ++ ret = sys_sem_wait_internal(&sk_wait->sem, timeout); ++ } while (ret < 0 && errno == EINTR && mem_thread_ignore_flush_intr()); ++ return ret; + } + + static void rtc_epoll_notify_event(struct sock_wait *sk_wait, struct sock_event *sk_event, +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index 3f447f0..ef8e7df 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -45,8 +45,9 @@ struct mem_thread_group { + struct list_node mt_node; + struct mem_thread mt_array[PROTOCOL_STACK_MAX]; + +- bool used_flag; + uint32_t used_time; ++ bool used_flag; ++ bool siged_flag; + }; + + static struct mem_stack g_mem_stack_group[PROTOCOL_STACK_MAX] = {0}; +@@ -106,6 +107,15 @@ static inline void mem_thread_group_done(void) + g_mem_thread_group->used_flag = false; + } + ++bool mem_thread_ignore_flush_intr(void) ++{ ++ if (likely(g_mem_thread_group != NULL) && g_mem_thread_group->siged_flag) { ++ g_mem_thread_group->siged_flag = false; ++ return true; ++ } ++ return false; ++} ++ + static void mem_thread_cache_flush(struct mem_thread *mt); + static unsigned mem_thread_cache_count(const struct mem_thread *mt); + static void mem_thread_group_action_flush(int signum) +@@ -115,6 +125,9 @@ static void mem_thread_group_action_flush(int signum) + + if (g_mem_thread_group == NULL) + return; ++ ++ g_mem_thread_group->siged_flag = true; ++ + if (mem_thread_group_in_used(g_mem_thread_group, 
MEM_THREAD_MANAGER_FLUSH_MS)) + return; + +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index 6a31503..946edd6 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -300,6 +300,7 @@ int mem_stack_pool_init(int stack_id, unsigned numa_id); + int mem_stack_mpcache_init(int stack_id, unsigned cpu_id); + + int mem_thread_manager_init(void); ++bool mem_thread_ignore_flush_intr(void); + void mem_thread_cache_free(struct mem_thread *mt); + int mem_thread_cache_init(struct mem_thread *mt, int stack_id); + +-- +2.33.0 + diff --git a/0359-fix-20.03-LTS-build-failed.patch b/0359-fix-20.03-LTS-build-failed.patch new file mode 100644 index 0000000..540e401 --- /dev/null +++ b/0359-fix-20.03-LTS-build-failed.patch @@ -0,0 +1,25 @@ +From 39557d189a97a30a63f76b16eb2daee9c52cbc47 Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Sun, 6 Jul 2025 17:00:49 +0800 +Subject: [PATCH] fix 20.03-LTS build failed + +Signed-off-by: Lemmy Huang +--- + src/lstack/core/lstack_mempool.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/lstack/core/lstack_mempool.c b/src/lstack/core/lstack_mempool.c +index ef8e7df..4388722 100644 +--- a/src/lstack/core/lstack_mempool.c ++++ b/src/lstack/core/lstack_mempool.c +@@ -13,6 +13,7 @@ + #include + + #include ++#include + #include + + #include "lstack_mempool.h" +-- +2.33.0 + diff --git a/0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch b/0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch new file mode 100644 index 0000000..0ab7cf9 --- /dev/null +++ b/0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch @@ -0,0 +1,108 @@ +From 25e83b84417d5b609ea0d639a9a87d6602b101bf Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Mon, 7 Jul 2025 14:27:16 +0800 +Subject: [PATCH] sockio: fix tcp_write not remove EPOLLOUT + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 20 +++++++++++--------- + src/lstack/core/lstack_wait.c | 2 +- + src/lstack/include/lstack_wait.h | 2 ++ + 3 files changed, 14 insertions(+), 10 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index b5b203f..424cc82 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -355,9 +355,9 @@ static ssize_t stack_udp_write(struct lwip_sock *sock, const void *data, size_t + return -1; + } + +- if (unlikely(mr->app_free_count < 1)) { ++ if (unlikely(mr->app_free_count < SOCK_SENDMBOX_ALLOW_WRITE_SIZE)) { + mr->app_free_count = mr->ops->free_count(mr); +- if (unlikely(mr->app_free_count < 1)) { ++ if (unlikely(mr->app_free_count < SOCK_SENDMBOX_ALLOW_WRITE_SIZE)) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + set_errno(ENOBUFS); + return -1; +@@ -370,9 +370,12 @@ static ssize_t stack_udp_write(struct lwip_sock *sock, const void *data, size_t + copied_total = stack_udp_write_bulk(sock, mr, data, len, flags, to, tolen); + } + ++ if (mr->app_free_count < SOCK_SENDMBOX_ALLOW_NOTIFY_SIZE) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ } ++ + if (likely(copied_total > 0)) + return copied_total; +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + return -1; + } + +@@ -674,10 +677,10 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + return -1; + } + +- if (unlikely(mr->app_free_count < 2) || ++ if (unlikely(mr->app_free_count < SOCK_SENDMBOX_ALLOW_WRITE_SIZE) || + total_copy_len > mr->app_free_count * TCP_MSS) { + mr->app_free_count = mr->ops->free_count(mr); +- if (unlikely(mr->app_free_count < 
2)) { ++ if (unlikely(mr->app_free_count < SOCK_SENDMBOX_ALLOW_WRITE_SIZE)) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + set_errno(ENOBUFS); + return -1; +@@ -712,8 +715,9 @@ static ssize_t rtw_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + } + + out: +- if (unlikely(total_copy_len > 0)) ++ if (mr->app_free_count < SOCK_SENDMBOX_ALLOW_NOTIFY_SIZE) { + API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); ++ } + return copied_total > 0 ? copied_total : -1; + } + +@@ -1082,11 +1086,9 @@ static ssize_t rtc_stack_tcp_write(struct lwip_sock *sock, const char *data, siz + + /* if OK or memory error, check available space */ + if (err == ERR_OK || err == ERR_MEM) { ++ API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + if (!lwip_tcp_allow_send(sock->conn->pcb.tcp)) { +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + err = ERR_BUF; +- } else if (total_copy_len > 0) { +- API_EVENT(sock->conn, NETCONN_EVT_SENDMINUS, 0); + } + } + +diff --git a/src/lstack/core/lstack_wait.c b/src/lstack/core/lstack_wait.c +index 4714742..855f2ed 100644 +--- a/src/lstack/core/lstack_wait.c ++++ b/src/lstack/core/lstack_wait.c +@@ -380,7 +380,7 @@ static inline bool NETCONN_ALLOW_SEND(const struct lwip_sock *sock) + } else { /* if RTW */ + if (sys_mbox_valid(&sock->conn->sendmbox)) { + const struct mbox_ring *mr = &sock->conn->sendmbox->mring; +- return mr->ops->free_count(mr) > 0; ++ return mr->ops->free_count(mr) >= SOCK_SENDMBOX_ALLOW_NOTIFY_SIZE; + } + } + +diff --git a/src/lstack/include/lstack_wait.h b/src/lstack/include/lstack_wait.h +index 8ed2e93..dca22b6 100644 +--- a/src/lstack/include/lstack_wait.h ++++ b/src/lstack/include/lstack_wait.h +@@ -30,6 +30,8 @@ + #include "lstack_protocol_stack.h" + #include "lstack_cfg.h" + ++#define SOCK_SENDMBOX_ALLOW_WRITE_SIZE 4 ++#define SOCK_SENDMBOX_ALLOW_NOTIFY_SIZE 16 + + #define NETCONN_TYPE(conn) NETCONNTYPE_GROUP(netconn_type((conn))) + +-- +2.33.0 + diff --git a/0361-sockio-fix-rpc_send-coredump-after-RPC_MSG_EXIT.patch b/0361-sockio-fix-rpc_send-coredump-after-RPC_MSG_EXIT.patch new file mode 100644 index 0000000..09e1faa --- /dev/null +++ b/0361-sockio-fix-rpc_send-coredump-after-RPC_MSG_EXIT.patch @@ -0,0 +1,335 @@ +From ebbd5e477c3cdb5a561f79a09983b5beabb74c0e Mon Sep 17 00:00:00 2001 +From: Lemmy Huang +Date: Tue, 8 Jul 2025 14:04:29 +0800 +Subject: [PATCH] sockio: fix rpc_send coredump after RPC_MSG_EXIT + +Signed-off-by: Lemmy Huang +--- + src/lstack/api/lstack_sockio.c | 64 ++++++++++++++++++++++------- + src/lstack/api/lstack_wrap.c | 22 +++++----- + src/lstack/include/lstack_mempool.h | 6 +-- + src/lstack/include/lstack_sockio.h | 17 +------- + 4 files changed, 65 insertions(+), 44 deletions(-) + +diff --git a/src/lstack/api/lstack_sockio.c b/src/lstack/api/lstack_sockio.c +index 424cc82..ce22134 100644 +--- a/src/lstack/api/lstack_sockio.c ++++ b/src/lstack/api/lstack_sockio.c +@@ -443,6 +443,11 @@ static void callback_udp_send(struct rpc_msg *msg) + struct mem_thread *mt = msg->args[MSG_ARG_1].p; + bool output_again; + ++ if (unlikely(POSIX_IS_CLOSED(sock))) { ++ msg->result = -1; ++ return; ++ } ++ + if (get_protocol_stack_group()->latency_start) + calculate_sock_latency(sock, GAZELLE_LATENCY_WRITE_RPC_MSG); + +@@ -922,6 +927,11 @@ static void callback_tcp_recvd(struct rpc_msg *recvmsg) + struct mbox_ring *mr; + u32_t recvd; + ++ if (unlikely(POSIX_IS_CLOSED(sock))) { ++ recvmsg->result = -1; ++ return; ++ } ++ + mr = &sock->conn->recvmbox->mring; + if (mr->flags & MBOX_FLAG_PEEK) { + sockio_peek_recv_free(mr, 0); +@@ -973,13 
+983,21 @@ static void callback_tcp_send(struct rpc_msg *sendmsg) + struct protocol_stack *stack = get_protocol_stack(); + struct lwip_sock *sock = sendmsg->args[MSG_ARG_0].p; + struct mem_thread *mt = sendmsg->args[MSG_ARG_1].p; +- const struct tcp_pcb *pcb = sock->conn->pcb.tcp; +- const struct mbox_ring *mr = &sock->conn->sendmbox->mring; ++ const struct tcp_pcb *pcb; ++ const struct mbox_ring *mr; + bool output_again; + err_t err; + +- if (unlikely(pcb == NULL)) ++ if (unlikely(POSIX_IS_CLOSED(sock))) { ++ sendmsg->result = -1; ++ return; ++ } ++ mr = &sock->conn->sendmbox->mring; ++ pcb = sock->conn->pcb.tcp; ++ if (unlikely(pcb == NULL)) { ++ sendmsg->result = -1; + return; ++ } + + #if GAZELLE_TCP_ASYNC_RECVD + struct rpc_msg *recvmsg; +@@ -1105,7 +1123,7 @@ static void rtc_stack_tcp_send(struct lwip_sock *sock) + } + + +-ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, ++static ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + struct sockaddr *from, socklen_t *fromlen) + { + struct lwip_sock *sock = lwip_get_socket(fd); +@@ -1167,7 +1185,7 @@ ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, + return recvd; + } + +-ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) ++static ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + { + struct lwip_sock *sock = lwip_get_socket(fd); + ssize_t len, recvd = 0; +@@ -1210,7 +1228,7 @@ ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags) + return recvd; + } + +-ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, ++static ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, + const struct sockaddr *to, socklen_t tolen) + { + struct lwip_sock *sock = lwip_get_socket(fd); +@@ -1260,7 +1278,7 @@ ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, + return ret; + } + +-ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) ++static ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + { + struct lwip_sock *sock = lwip_get_socket(fd); + ssize_t ret = -1; +@@ -1329,27 +1347,27 @@ ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags) + return written > 0 ? 
written : ret; + } + +-ssize_t sockio_read(int fd, void *mem, size_t len) ++static ssize_t sockio_read(int fd, void *mem, size_t len) + { + return sockio_recvfrom(fd, mem, len, 0, NULL, NULL); + } + +-ssize_t sockio_write(int fd, const void *mem, size_t len) ++static ssize_t sockio_write(int fd, const void *mem, size_t len) + { + return sockio_sendto(fd, mem, len, 0, NULL, 0); + } + +-ssize_t sockio_recv(int fd, void *mem, size_t len, int flags) ++static ssize_t sockio_recv(int fd, void *mem, size_t len, int flags) + { + return sockio_recvfrom(fd, mem, len, flags, NULL, NULL); + } + +-ssize_t sockio_send(int fd, const void *mem, size_t len, int flags) ++static ssize_t sockio_send(int fd, const void *mem, size_t len, int flags) + { + return sockio_sendto(fd, mem, len, flags, NULL, 0); + } + +-ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt) ++static ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt) + { + struct msghdr msg; + +@@ -1364,7 +1382,7 @@ ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt) + return sockio_recvmsg(fd, &msg, 0); + } + +-ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt) ++static ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt) + { + struct msghdr msg; + +@@ -1379,7 +1397,7 @@ ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt) + return sockio_sendmsg(fd, &msg, 0); + } + +-void sockio_ops_init(void) ++static void sockio_ops_init(void) + { + struct sockio_ops *ops = &ioops; + +@@ -1400,6 +1418,24 @@ void sockio_ops_init(void) + } + } + ++void sockio_api_init(posix_api_t *api) ++{ ++ sockio_ops_init(); ++ ++ api->recvfrom_fn = sockio_recvfrom; ++ api->recvmsg_fn = sockio_recvmsg; ++ api->sendto_fn = sockio_sendto; ++ api->sendmsg_fn = sockio_sendmsg; ++ ++ api->read_fn = sockio_read; ++ api->write_fn = sockio_write; ++ api->recv_fn = sockio_recv; ++ api->send_fn = sockio_send; ++ ++ api->readv_fn = sockio_readv; ++ api->writev_fn = sockio_writev; ++} ++ + static int sockio_mbox_init(struct lwip_sock *sock) + { + int ret; +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 6523dc2..95279da 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -53,7 +53,7 @@ void wrap_api_init(void) + } + + epoll_api_init(g_wrap_api); +- sockio_ops_init(); ++ sockio_api_init(g_wrap_api); + mbox_ring_ops_init(); + } + +@@ -566,7 +566,7 @@ static inline int32_t do_socket(int32_t domain, int32_t type, int32_t protocol) + static inline ssize_t do_recv(int32_t sockfd, void *buf, size_t len, int32_t flags) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return sockio_recv(sockfd, buf, len, flags); ++ return g_wrap_api->recv_fn(sockfd, buf, len, flags); + } + return posix_api->recv_fn(sockfd, buf, len, flags); + } +@@ -574,7 +574,7 @@ static inline ssize_t do_recv(int32_t sockfd, void *buf, size_t len, int32_t fla + static inline ssize_t do_read(int32_t s, void *mem, size_t len) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_read(s, mem, len); ++ return g_wrap_api->read_fn(s, mem, len); + } + return posix_api->read_fn(s, mem, len); + } +@@ -582,7 +582,7 @@ static inline ssize_t do_read(int32_t s, void *mem, size_t len) + static inline ssize_t do_readv(int32_t s, const struct iovec *iov, int iovcnt) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_readv(s, iov, iovcnt); ++ return g_wrap_api->readv_fn(s, iov, iovcnt); + } + return 
posix_api->readv_fn(s, iov, iovcnt); + } +@@ -590,7 +590,7 @@ static inline ssize_t do_readv(int32_t s, const struct iovec *iov, int iovcnt) + static inline ssize_t do_send(int32_t sockfd, const void *buf, size_t len, int32_t flags) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return sockio_send(sockfd, buf, len, flags); ++ return g_wrap_api->send_fn(sockfd, buf, len, flags); + } + return posix_api->send_fn(sockfd, buf, len, flags); + } +@@ -598,7 +598,7 @@ static inline ssize_t do_send(int32_t sockfd, const void *buf, size_t len, int32 + static inline ssize_t do_write(int32_t s, const void *mem, size_t size) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_write(s, mem, size); ++ return g_wrap_api->write_fn(s, mem, size); + } + return posix_api->write_fn(s, mem, size); + } +@@ -606,7 +606,7 @@ static inline ssize_t do_write(int32_t s, const void *mem, size_t size) + static inline ssize_t do_writev(int32_t s, const struct iovec *iov, int iovcnt) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_writev(s, iov, iovcnt); ++ return g_wrap_api->writev_fn(s, iov, iovcnt); + } + return posix_api->writev_fn(s, iov, iovcnt); + } +@@ -614,7 +614,7 @@ static inline ssize_t do_writev(int32_t s, const struct iovec *iov, int iovcnt) + static inline ssize_t do_recvmsg(int32_t s, struct msghdr *message, int32_t flags) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_recvmsg(s, message, flags); ++ return g_wrap_api->recvmsg_fn(s, message, flags); + } + return posix_api->recvmsg_fn(s, message, flags); + } +@@ -622,7 +622,7 @@ static inline ssize_t do_recvmsg(int32_t s, struct msghdr *message, int32_t flag + static inline ssize_t do_sendmsg(int32_t s, const struct msghdr *message, int32_t flags) + { + if (select_sock_posix_path(lwip_get_socket(s)) == POSIX_LWIP) { +- return sockio_sendmsg(s, message, flags); ++ return g_wrap_api->sendmsg_fn(s, message, flags); + } + return posix_api->sendmsg_fn(s, message, flags); + } +@@ -631,7 +631,7 @@ static inline ssize_t do_recvfrom(int32_t sockfd, void *buf, size_t len, int32_t + struct sockaddr *addr, socklen_t *addrlen) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return sockio_recvfrom(sockfd, buf, len, flags, addr, addrlen); ++ return g_wrap_api->recvfrom_fn(sockfd, buf, len, flags, addr, addrlen); + } + return posix_api->recvfrom_fn(sockfd, buf, len, flags, addr, addrlen); + } +@@ -640,7 +640,7 @@ static inline ssize_t do_sendto(int32_t sockfd, const void *buf, size_t len, int + const struct sockaddr *addr, socklen_t addrlen) + { + if (select_sock_posix_path(lwip_get_socket(sockfd)) == POSIX_LWIP) { +- return sockio_sendto(sockfd, buf, len, flags, addr, addrlen); ++ return g_wrap_api->sendto_fn(sockfd, buf, len, flags, addr, addrlen); + } + return posix_api->sendto_fn(sockfd, buf, len, flags, addr, addrlen); + } +diff --git a/src/lstack/include/lstack_mempool.h b/src/lstack/include/lstack_mempool.h +index 946edd6..5e489e4 100644 +--- a/src/lstack/include/lstack_mempool.h ++++ b/src/lstack/include/lstack_mempool.h +@@ -10,8 +10,8 @@ + * See the Mulan PSL v2 for more details. 
+ */ + +-#ifndef __GAZELLE_MEM_H__ +-#define __GAZELLE_MEM_H__ ++#ifndef __GAZELLE_MEMPOOL_H__ ++#define __GAZELLE_MEMPOOL_H__ + + #include + +@@ -336,4 +336,4 @@ void mem_extcache_flush_pbuf(struct pbuf **extcache_list); + void mem_init_pbuf(struct pbuf *p, pbuf_layer layer, uint16_t tot_len, uint16_t len, pbuf_type type); + + +-#endif /* __GAZELLE_MEM_H__ */ +\ No newline at end of file ++#endif /* __GAZELLE_MEMPOOL_H__ */ +\ No newline at end of file +diff --git a/src/lstack/include/lstack_sockio.h b/src/lstack/include/lstack_sockio.h +index 265d620..2e4a8b1 100644 +--- a/src/lstack/include/lstack_sockio.h ++++ b/src/lstack/include/lstack_sockio.h +@@ -16,22 +16,7 @@ + #include + #include + +-ssize_t sockio_recvfrom(int fd, void *mem, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen); +-ssize_t sockio_recvmsg(int fd, struct msghdr *msg, int flags); +-ssize_t sockio_sendto(int fd, const void *mem, size_t len, int flags, const struct sockaddr *to, socklen_t tolen); +-ssize_t sockio_sendmsg(int fd, const struct msghdr *msg, int flags); +- +-ssize_t sockio_read(int fd, void *mem, size_t len); +-ssize_t sockio_write(int fd, const void *mem, size_t len); +- +-ssize_t sockio_recv(int fd, void *mem, size_t len, int flags); +-ssize_t sockio_send(int fd, const void *mem, size_t len, int flags); +- +-ssize_t sockio_readv(int fd, const struct iovec *iov, int iovcnt); +-ssize_t sockio_writev(int fd, const struct iovec *iov, int iovcnt); +- +- +-void sockio_ops_init(void); ++void sockio_api_init(posix_api_t *api); + bool sockio_mbox_pending(struct lwip_sock *sock); + + /* just for lwip */ +-- +2.33.0 + diff --git a/gazelle.spec b/gazelle.spec index 90f57a6..b31e477 100644 --- a/gazelle.spec +++ b/gazelle.spec @@ -2,7 +2,7 @@ Name: gazelle Version: 1.0.2 -Release: 84 +Release: 85 Summary: gazelle is a high performance user-mode stack License: MulanPSL-2.0 URL: https://gitee.com/openeuler/gazelle @@ -338,6 +338,46 @@ Patch9318: 0318-RTC-mode-fix-gazellectl-can-t-print-connenct-info.patch Patch9319: 0319-Connect-fix-benchmark_dws-connect-failed.patch Patch9320: 0320-Protocal-fixing-deathlock-between-protocol-threads-a.patch Patch9321: 0321-update-gazelle-max-numa-nodes-8.patch +Patch9322: 0322-RTC-fixing-program-stuck-while-gazelle-exit-in-multi.patch +Patch9323: 0323-cleancode-add-GAZELLE_SAME_NODE.patch +Patch9324: 0324-cleancode-remove-gazelle_light_ring.patch +Patch9325: 0325-cleancode-remove-get_stack_tid-DPDK_PKT_BURST_SIZE-P.patch +Patch9326: 0326-socket-refactor-sock_event.patch +Patch9327: 0327-socket-adapt-to-sock_event.patch +Patch9328: 0328-socket-refactor-tcp-and-udp.patch +Patch9329: 0329-socket-adapt-to-tcp-and-udp.patch +Patch9330: 0330-cfg-add-mem_cache_max-and-change-default-rpc_msg_max.patch +Patch9331: 0331-cfg-add-mem_async_mode.patch +Patch9332: 0332-mempool-add-mem_thread_cache_flush.patch +Patch9333: 0333-dfx-support-sk_wait-stat.patch +Patch9334: 0334-mempool-fix-copy_mbuf_private.patch +Patch9335: 0335-socket-fix-connect-blocking.patch +Patch9336: 0336-socket-fix-stack_tcp_read-do-not-recv_finish_burst.patch +Patch9337: 0337-tcp-add-GAZELLE_TCP_ASYNC_RECVD.patch +Patch9338: 0338-socket-fix-tcp-closed.patch +Patch9339: 0339-socket-fix-sk_wait-cannot-be-interrupted-by-signals.patch +Patch9340: 0340-mempool-modify-mbuf-num-and-rpc_msg-num.patch +Patch9341: 0341-mempool-fix-mbox_ring-free-not-call-recv_finish_burs.patch +Patch9342: 0342-mempool-mem_get_rpc-add-reserve-limit.patch +Patch9343: 0343-mempool-fix-pthread_tryjoin_np-coredump-when-mysqld-.patch +Patch9344: 
0344-mempool-stop-using-cache-when-too-many-threads.patch +Patch9345: 0345-sk_wait-fix-lwip_tcp_allow_send-coredump.patch +Patch9346: 0346-mbox-fix-mbox_ring_common_free-coredump-when-rte_rin.patch +Patch9347: 0347-sk_wait-fix-sock_wait_common_free.patch +Patch9348: 0348-socket-fix-stack_udp_readmsg-return-len.patch +Patch9349: 0349-sk_wait-fix-lwip_wait_foreach_notify-coredump-at-sta.patch +Patch9350: 0350-epoll-fix-do_lwip_connected_callback.patch +Patch9351: 0351-sockio-fix-rtw_stack_tcp_write-wrong-copied_total-af.patch +Patch9352: 0352-sk_event-fix-rtw-epoll-wrong-event-notify-and-remove.patch +Patch9353: 0353-poll-fix-do_lwip_connected_callback-not-delete-poll-.patch +Patch9354: 0354-sockctl-fix-rtw-broadcast-close-and-shutdown.patch +Patch9355: 0355-mempool-increase-MEM_THREAD_MANAGER_FREE_S-to-avoid-.patch +Patch9356: 0356-sockio-fix-callback_tcp_send-output-too-many-at-once.patch +Patch9357: 0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch +Patch9358: 0358-sk_wait-igonre-mem_thread-flush-signal.patch +Patch9359: 0359-fix-20.03-LTS-build-failed.patch +Patch9360: 0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch +Patch9361: 0361-sockio-fix-rpc_send-coredump-after-RPC_MSG_EXIT.patch %description %{name} is a high performance user-mode stack. @@ -379,6 +419,48 @@ install -Dpm 0640 %{_builddir}/%{name}-%{version}/src/ltran/ltran.conf %{b %config(noreplace) %{conf_path}/ltran.conf %changelog +* Wed Jul 09 2025 yinbin6 - 1.0.2-85 +- sockio: fix rpc_send coredump after RPC_MSG_EXIT +- sockio: fix tcp_write not remove EPOLLOUT +- fix 20.03-LTS build failed +- sk_wait: ignore mem_thread flush signal +- sockio: fix sendmbox full return EWOULDBLOCK +- sockio: fix callback_tcp_send output too many at once +- mempool: increase MEM_THREAD_MANAGER_FREE_S to avoid poll +- sockctl: fix rtw broadcast close and shutdown +- poll: fix do_lwip_connected_callback not delete poll kernel fd +- sk_event: fix rtw epoll wrong event notify and remove +- sockio: fix rtw_stack_tcp_write wrong copied_total after append +- epoll: fix do_lwip_connected_callback +- sk_wait: fix lwip_wait_foreach_notify coredump at startup +- socket: fix stack_udp_readmsg return len +- sk_wait: fix sock_wait_common_free; socket: simplify calling free_count in rtw_stack_tcp_write +- mbox: fix mbox_ring_common_free coredump when rte_ring_create failed +- sk_wait: fix lwip_tcp_allow_send coredump +- mempool: stop using cache when too many threads +- mempool: fix pthread_tryjoin_np coredump when mysqld shutdown +- mempool: mem_get_rpc add reserve limit +- mempool: fix mbox_ring free not call recv_finish_burst +- mempool: modify mbuf num and rpc_msg num +- socket: fix sk_wait cannot be interrupted by signals +- socket: fix tcp closed +- tcp: add GAZELLE_TCP_ASYNC_RECVD to fix mbuf OOM caused by untimely sockio_peek_recv_free +- socket: fix stack_tcp_read do not recv_finish_burst +- socket: fix connect blocking +- mempool: fix copy_mbuf_private +- dfx: support sk_wait stat +- mempool: add mem_thread_cache_flush to fix PBUF_POOL_PREINIT +- cfg: add mem_async_mode +- cfg: add mem_cache_max and change default rpc_msg_max +- socket: adapt to tcp and udp +- socket: refactor tcp and udp +- socket: adapt to sock_event +- socket: refactor sock_event +- cleancode: remove get_stack_tid DPDK_PKT_BURST_SIZE PACKET_READ_SIZE +- cleancode: remove gazelle_light_ring +- cleancode: add GAZELLE_SAME_NODE +- RTC: fixing program stuck while gazelle exit in multi-threads environment + * Tue Mar 04 2025 yinbin6 - 1.0.2-84 - update gazelle max numa nodes 8 - 
Protocal: fixing deathlock between protocol threads and app thread -- Gitee
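
Appendix: the common shape of the fixes in 0357-sockio-fix-sendmbox-full-return-EWOULDBLOCK.patch and 0360-sockio-fix-tcp_write-not-remove-EPOLLOUT.patch is that the internal write path reports ENOBUFS when the sendmbox lacks space, and the blocking send path waits for a SENDPLUS event and retries instead of surfacing EWOULDBLOCK to the caller. The C sketch below is a minimal standalone illustration of that retry loop; it is not code from the patches, and mock_stack_write() and mock_wait_sendable() are hypothetical stand-ins for ioops.stack_tcp_write() and sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true).

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/types.h>

static int g_full_rounds = 2; /* pretend the sendmbox frees space after two waits */

static ssize_t mock_stack_write(const void *buf, size_t len)
{
    (void)buf;
    if (g_full_rounds > 0) {
        g_full_rounds--;
        errno = ENOBUFS; /* sendmbox full: internal retry hint, never user-visible */
        return -1;
    }
    return (ssize_t)len; /* space available: whole buffer queued */
}

static void mock_wait_sendable(void)
{
    /* stand-in for sock_event_wait(sock, NETCONN_EVT_SENDPLUS, true) */
    fprintf(stderr, "sendmbox full, waiting for SENDPLUS...\n");
}

static ssize_t send_with_retry(const void *buf, size_t len)
{
    ssize_t ret;

    do {
        ret = mock_stack_write(buf, len);
        if (ret > 0 || errno != ENOBUFS)
            break; /* success, or a genuine error such as EPIPE */
        mock_wait_sendable();
    } while (true);

    return ret;
}

int main(void)
{
    char data[128] = {0};
    printf("sent %zd bytes\n", send_with_retry(data, sizeof(data)));
    return 0;
}

This sketch only models the blocking-socket case: the caller either gets a byte count back or a genuine socket error, and never observes the transient full-ring state. The real paths in lstack_sockio.c additionally refresh app_free_count, apply the SOCK_SENDMBOX_ALLOW_WRITE_SIZE/SOCK_SENDMBOX_ALLOW_NOTIFY_SIZE thresholds, and handle nonblocking sockets, all of which the sketch deliberately omits.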