From 840680656e73a941b5c583fcdbd25e03824064b4 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 00:15:51 +0800 Subject: [PATCH 1/4] anolis: net/smc: Support rq flow control in smc-r link layer anolis inclusion from anolis-5.10.134-12 commit 70922b9ed51e2681b3d6159414247819f5ab187d category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I82GZ7 CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/70922b9ed51e2681b3d6159414247819f5ab187d -------------------------------- ANBZ: #254 This patch supports rq flow control in smc-r link layer. QPs communicating without rq flow control, in the previous version, may result in RNR (receiver not ready) error, which means when sq sends a message to the remote qp, but the remote qp's rq has no rq entities to receive the message. In RNR situation, the rdma transport layer may retransmit the messages again and again until the rq has any entities, which may lower the performance, especially in heavy traffic. Using credits to do rq flow control can avoid the occurrence of RNR. The test of redis-benchmark shows more than 3X rps improvement in SET and more than 7X rps improvement in GET. 
Test command: redis-server --save "" --appendonly no --protected-mode no --io-threads 7 --io-threads-do-reads yes redis-benchmark -h 192.168.26.36 -q -t set,get -P 1 --threads 7 -n 2000000 -c 500 -d 10 Before: SET: 173325.25 requests per second, p50=2.703 msec GET: 81383.52 requests per second, p50=5.575 msec After: SET: 554323.69 requests per second, p50=0.959 msec GET: 604741.19 requests per second, p50=0.855 msec Signed-off-by: Guangguan Wang Acked-by: Tony Lu Signed-off-by: Yingyu Zeng --- net/smc/af_smc.c | 12 ++++++ net/smc/smc_cdc.c | 12 +++++- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 ++ net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 ++++++++- net/smc/smc_ib.c | 6 ++- net/smc/smc_llc.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 5 +++ net/smc/smc_wr.c | 31 +++++++++++++--- net/smc/smc_wr.h | 54 ++++++++++++++++++++++++++- 11 files changed, 223 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 41cbc7c89c9d..9b2c61f93539 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -517,6 +517,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } static void smc_switch_to_fallback(struct smc_sock *smc) @@ -809,6 +816,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGRMB; goto connect_abort; diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 94503f36b9a6..e674ee16cda4 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -101,25 +101,30 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -430,6 +435,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ + if (cdc->credits) + smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 
696cc11f2303..145ce7997e64 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5ee5b2ce29a6..76fe3d5eb27a 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -699,9 +699,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? (u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c579d1d5995a..d152d134685f 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -53,6 +53,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -178,7 +179,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 9364d0f35cce..f10bbb80f8d1 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -18,7 +18,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. 
# of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. + */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -72,6 +77,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -110,6 +117,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index f1ffbd414602..e865f107a9ea 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -365,10 +365,12 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. 
*/ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = 1, }, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0ef15f8fba90..e45cd3e7e5c2 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -67,7 +67,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -135,6 +136,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; @@ -145,6 +152,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -584,6 +592,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, sizeof(*announce_credits)); + 
announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -862,6 +910,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1620,6 +1675,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; default: smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; @@ -1688,6 +1747,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, llc_type); break; @@ -1774,6 +1837,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -1809,6 +1893,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; } @@ -1840,6 +1925,7 @@ void 
smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -1954,6 +2040,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index cc00a2ec4e92..7694ff573e34 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,6 +20,8 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -34,6 +36,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, }; #define smc_link_downing(state) \ @@ -77,6 +80,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 5a81f8c9ebf9..45fc4f469d4e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -115,7 +115,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + if (wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void 
smc_wr_tx_tasklet_fn(unsigned long data) @@ -158,11 +159,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -241,7 +247,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } @@ -405,6 +411,12 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } @@ -447,6 +459,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -481,7 +495,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -592,7 +606,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -600,7 +614,7 @@ int smc_wr_alloc_link_mem(struct 
smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -619,7 +633,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]), GFP_KERNEL); if (!link->wr_rx_sges) @@ -708,6 +722,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index cb58e60078f5..dabc810a36af 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. 
+ */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From c64f017133db593f24b350b4fd64421c9817c263 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 10:45:52 +0800 Subject: [PATCH 2/4] anolis: net/smc: compress frequency of credits announcement by cdc msg anolis inclusion from anolis-5.10.134-12 commit d82aa96b148c88da4019fae4a9e346117b4141ae category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I82GZ7 CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/d82aa96b148c88da4019fae4a9e346117b4141ae -------------------------------- ANBZ: #1742 When in heavy traffic, credits taken by cdc msg may be few and wakeup happens frequently when credits update in recv side, which may use more cpu. Setting an announcement watermark, which is 10% of local rq credits, can compress the announcement frequency and ensures the credits taken by a cdc msg are more than 10% of local rq credits, reducing the wakeup frequency in the recv side. 
In netty benchmark, shows 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Signed-off-by: Yingyu Zeng --- net/smc/smc_cdc.c | 3 ++- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 11 +++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index e674ee16cda4..3156be1e5eb3 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,7 +111,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); + if (smc_wr_rx_credits_need_announce_frequent(link)) + saved_credits = (u8)smc_wr_rx_get_credits(link); cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index f10bbb80f8d1..c0f00f7198c3 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -122,6 +122,7 @@ struct smc_link { u8 credits_enable; /* credits enable flag, set when negotiation */ u8 local_cr_watermark_high; /* local rq credits watermark */ u8 peer_cr_watermark_low; /* peer rq credits watermark */ + u8 credits_update_limit; /* credits update limit for cdc msg */ struct work_struct credits_announce_work; /* work for credits announcement */ unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 45fc4f469d4e..ff305dc7af39 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -727,6 +727,11 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->flags = 0; lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); lnk->peer_cr_watermark_low = 0; 
+ + /* if credits accumlated less than 10% of wr_rx_cnt(at least 5), + * will not be announced by cdc msg. + */ + lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index dabc810a36af..119f1aa0076e 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -133,6 +133,17 @@ static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; } +static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) +{ + /* announce when local rq credits accumulated more than credits_update_limit, or + * peer rq credits is empty. As peer credits empty and local credits is less than + * credits_update_limit, may results in credits deadlock. + */ + return link->credits_enable && + (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || + !atomic_read(&link->peer_rq_credits)); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { -- Gitee From 095a8bba6e0a0210e13e660389139031e4e516ce Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 2 Sep 2021 13:19:26 +0800 Subject: [PATCH 3/4] anolis: net/smc: don't call ib_req_notify_cq in the send routine anolis inclusion from anolis-5.10.134-14 commit 7e390c306a2d09b5c4e1a9b594d7d0d8169bbc7c category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I82GZ7 CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/7e390c306a2d09b5c4e1a9b594d7d0d8169bbc7c -------------------------------- ANBZ: #1742 We can just call ib_req_notify_cq() when the link got ready, and rearm it after poll_cq(). Which is enough to make sure we won't miss any events. Simple sockperf test show about 20% gain in throughput test with small messages. 
Test command: client: smc_run sockperf tp -i $SERVER -m 14 -t 30 --tcp server: smc_run sockperf sr --tcp Without this: Summary: BandWidth is 6.504 MBps (52.034 Mbps) With this: Summary: BandWidth is 7.846 MBps (62.771 Mbps) Signed-off-by: Dust Li Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Link: https://gitee.com/anolis/cloud-kernel/pulls/906 Link: https://gitee.com/anolis/cloud-kernel/pulls/1281 Signed-off-by: Yingyu Zeng --- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e865f107a9ea..41e8ff98576f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -126,6 +126,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ff305dc7af39..ba47f261bb06 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -262,8 +262,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 9fb93d8701854e8686490477c5c5d0e6c965db81 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 15:24:44 +0800 Subject: [PATCH 4/4] anolis: net/smc: remove redundant ib_req_notify_cq anolis inclusion from anolis-5.10.134-12 commit 933a6aabd5dc8e5bf5762de1f788ee20b2c8897c category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I82GZ7 CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/933a6aabd5dc8e5bf5762de1f788ee20b2c8897c 
-------------------------------- ANBZ: #1742 Solicited flag is only used by RCQ. As SCQ and RCQ are combined into one CQ, we can not notify cq with solicited flag. And immediately after the solicited notify cq, another notify with next complete flag is performed, the state machine of CQ will also immediately switch from the Arm_Sol state to the Armed state, which is the same as the result of direct notify with next complete flag. So the code of notify CQ with solicited is redundant and meaningless. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Signed-off-by: Yingyu Zeng --- net/smc/smc_ib.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 41e8ff98576f..4ec4c57a80b8 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -122,11 +122,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, - IB_CQ_SOLICITED_MASK); - if (rc) - goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) -- Gitee