From 21428bdcb4a6e3bd9b1295b807cd3b50cd33d5b8 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 12 Jan 2022 00:15:51 +0800 Subject: [PATCH 1/4] anolis: net/smc: Support rq flow control in smc-r link layer anolis inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I78AFL CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/70922b9ed51e2681b3d6159414247819f5ab187d -------------------------------- ANBZ: #254 This patch supports rq flow control in smc-r link layer. QPs communicating without rq flow control, in the previous version, may result in RNR (receive not ready) error, which means when sq sends a message to the remote qp, but the remote qp's rq has no rq entities to receive the message. In the RNR situation, the rdma transport layer may retransmit the messages again and again until the rq has any entities, which may lower the performance, especially in heavy traffic. Using credits to do rq flow control can avoid the occurrence of RNR. The test of redis-benchmark shows more than 3X rps improvement in SET and more than 7X rps improvement in GET.
Test command: redis-server --save "" --appendonly no --protected-mode no --io-threads 7 --io-threads-do-reads yes redis-benchmark -h 192.168.26.36 -q -t set,get -P 1 --threads 7 -n 2000000 -c 500 -d 10 Before: SET: 173325.25 requests per second, p50=2.703 msec GET: 81383.52 requests per second, p50=5.575 msec After: SET: 554323.69 requests per second, p50=0.959 msec GET: 604741.19 requests per second, p50=0.855 msec Signed-off-by: Guangguan Wang Acked-by: Tony Lu Signed-off-by: Gengbiao Shen Signed-off-by: Litao Jiao --- net/smc/af_smc.c | 12 ++++++ net/smc/smc_cdc.c | 12 +++++- net/smc/smc_cdc.h | 3 +- net/smc/smc_clc.c | 3 ++ net/smc/smc_clc.h | 3 +- net/smc/smc_core.h | 17 ++++++++- net/smc/smc_ib.c | 6 ++- net/smc/smc_llc.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 5 +++ net/smc/smc_wr.c | 31 +++++++++++++--- net/smc/smc_wr.h | 54 ++++++++++++++++++++++++++- 11 files changed, 223 insertions(+), 15 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6a4a19f57a39..8737660acf43 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -620,6 +620,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } /* must be called under rcu read lock */ @@ -1047,6 +1054,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + /* reg sendbufs if they were vzalloced */ if (smc->conn.sndbuf_desc->is_vm) { if (smcr_lgr_reg_sndbufs(link, smc->conn.sndbuf_desc)) { diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 80e30439e337..fc564773fc18 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -111,7 +111,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; if (unlikely(!READ_ONCE(conn->sndbuf_desc))) @@ -121,18 +123,21 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) || smc_link_usable(conn->lnk)) wake_up(&conn->cdc_pend_tx_wq); } @@ -452,6 +457,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ + if (cdc->credits) + 
smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303..145ce7997e64 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index c9450ab0e23b..2b68e88986d0 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -699,9 +699,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? 
(u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_comp; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 8992949900e9..3491099d3c9b 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -53,6 +53,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGBUF 0x09990003 /* reg rdma bufs failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -178,7 +179,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index ab410fe62fac..e9af83d79ab2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -18,7 +18,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. 
+ */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -72,6 +77,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -110,6 +117,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1413665f3115..d72c767df0a1 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -365,10 +365,12 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. 
*/ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = 1, .max_inline_data = 0, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 97d10b933512..c376cd4d3c2c 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -67,7 +67,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -135,6 +136,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; @@ -145,6 +152,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -587,6 +595,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, 
sizeof(*announce_credits)); + announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -866,6 +914,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
+ link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1624,6 +1679,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; default: smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; @@ -1692,6 +1751,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, llc_type); break; @@ -1778,6 +1841,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -1813,6 +1897,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; } @@ -1844,6 +1929,7 @@ void 
smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -1958,6 +2044,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index cc00a2ec4e92..7694ff573e34 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,6 +20,8 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -34,6 +36,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, }; #define smc_link_downing(state) \ @@ -77,6 +80,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index d6a4d8b9d446..e6004922d9a0 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -115,7 +115,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + if (wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void 
smc_wr_tx_tasklet_fn(unsigned long data) @@ -158,11 +159,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -241,7 +247,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } @@ -405,6 +411,12 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } @@ -447,6 +459,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -481,7 +495,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -595,7 +609,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -603,7 +617,7 @@ int smc_wr_alloc_link_mem(struct 
smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -622,7 +636,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]), GFP_KERNEL); if (!link->wr_rx_sges) @@ -711,6 +725,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index cb58e60078f5..dabc810a36af 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. 
+ */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } -- Gitee From c372f25f39a6e7298076799f771d67002427b904 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Fri, 6 May 2022 10:45:52 +0800 Subject: [PATCH 2/4] anolis: net/smc: compress frequency of credits announcement by cdc msg anolis inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I78AFL CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/d82aa96b148c88da4019fae4a9e346117b4141ae -------------------------------- ANBZ: #1742 When in heavy traffic, credits taken by cdc msg may be few and wakeup happens frequently when credits update in the recv side, which may use more cpu. Setting an announcement watermark, which is 10% of local rq credits, can compress the announcement frequency, so the credits taken by cdc msg are more than 10% of local rq credits, reducing the wakeup frequency in the recv side.
In the netty benchmark, this shows a 28% improvement in throughput: Before: throughput cpu sys usr thread-480 connect-48 len-8: 1653807.614 124.755 69.0489 55.7061 After: throughput cpu sys usr thread-480 connect-48 len-8: 2113879.617 132.117 67.9467 64.1707 Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Signed-off-by: Gengbiao Shen Signed-off-by: Litao Jiao --- net/smc/smc_cdc.c | 3 ++- net/smc/smc_core.h | 1 + net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 11 +++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index fc564773fc18..f47c0c7f8f59 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -124,7 +124,8 @@ int smc_cdc_msg_send(struct smc_connection *conn, conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; smc_host_msg_to_cdc(cdc_msg, conn, &cfed); - saved_credits = (u8)smc_wr_rx_get_credits(link); + if (smc_wr_rx_credits_need_announce_frequent(link)) + saved_credits = (u8)smc_wr_rx_get_credits(link); cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e9af83d79ab2..b82bac5b1447 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -122,6 +122,7 @@ struct smc_link { u8 credits_enable; /* credits enable flag, set when negotiation */ u8 local_cr_watermark_high; /* local rq credits watermark */ u8 peer_cr_watermark_low; /* peer rq credits watermark */ + u8 credits_update_limit; /* credits update limit for cdc msg */ struct work_struct credits_announce_work; /* work for credits announcement */ unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index e6004922d9a0..a22f65b33487 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -730,6 +730,11 @@ int smc_wr_create_link(struct smc_link *lnk) lnk->flags = 0; lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U);
lnk->peer_cr_watermark_low = 0; + + /* if credits accumlated less than 10% of wr_rx_cnt(at least 5), + * will not be announced by cdc msg. + */ + lnk->credits_update_limit = max(lnk->wr_rx_cnt / 10, 5U); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index dabc810a36af..119f1aa0076e 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -133,6 +133,17 @@ static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; } +static inline int smc_wr_rx_credits_need_announce_frequent(struct smc_link *link) +{ + /* announce when local rq credits accumulated more than credits_update_limit, or + * peer rq credits is empty. As peer credits empty and local credits is less than + * credits_update_limit, may results in credits deadlock. + */ + return link->credits_enable && + (atomic_read(&link->local_rq_credits) >= link->credits_update_limit || + !atomic_read(&link->peer_rq_credits)); +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { -- Gitee From d74680da64b3f52838f9464fae24846bdfa8d91f Mon Sep 17 00:00:00 2001 From: Dust Li Date: Thu, 2 Sep 2021 13:19:26 +0800 Subject: [PATCH 3/4] anolis: net/smc: don't call ib_req_notify_cq in the send routine anolis inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I78AFL CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/7e390c306a2d09b5c4e1a9b594d7d0d8169bbc7c -------------------------------- ANBZ: #1742 We can just call ib_req_notify_cq() when the link got ready, and rearm it after poll_cq(). Which is enough to make sure we won't miss any events. Simple sockperf test show about 20% gain in throughput test with small messages. 
Test command: client: smc_run sockperf tp -i $SERVER -m 14 -t 30 --tcp server: smc_run sockperf sr --tcp Without this: Summary: BandWidth is 6.504 MBps (52.034 Mbps) With this: Summary: BandWidth is 7.846 MBps (62.771 Mbps) Signed-off-by: Dust Li Acked-by: Dust Li Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Link: https://gitee.com/anolis/cloud-kernel/pulls/906 Link: https://gitee.com/anolis/cloud-kernel/pulls/1281 Signed-off-by: Gengbiao Shen Signed-off-by: Litao Jiao --- net/smc/smc_ib.c | 6 ++++++ net/smc/smc_wr.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d72c767df0a1..1ae3f0e08c2f 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -126,6 +126,12 @@ int smc_ib_ready_link(struct smc_link *lnk) IB_CQ_SOLICITED_MASK); if (rc) goto out; + + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); if (rc) goto out; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a22f65b33487..26969cc7d305 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -262,8 +262,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) struct smc_wr_tx_pend *pend; int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { -- Gitee From 27b5109e55dd7aaec9620e16835270654d0aae26 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 25 May 2022 15:24:44 +0800 Subject: [PATCH 4/4] anolis: net/smc: remove redundant ib_req_notify_cq anolis inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I78AFL CVE: NA Reference: https://gitee.com/anolis/cloud-kernel/commit/933a6aabd5dc8e5bf5762de1f788ee20b2c8897c -------------------------------- ANBZ: #1742 Solicited flag 
is only used by RCQ. As SCQ and RCQ are combined into one CQ, we can not notify cq with solicited flag. And immediately after the solicited notify cq, another notify with next complete flag is performed, the state machine of CQ will also immediately switch from the Arm_Sol state to the Armed state, which is the same as the result of direct notify with next complete flag. So the code of notify CQ with solicited is redundant and meaningless. Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Acked-by: Tony Lu Link: https://gitee.com/anolis/cloud-kernel/pulls/577 Signed-off-by: Gengbiao Shen Signed-off-by: Litao Jiao --- net/smc/smc_ib.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 1ae3f0e08c2f..b93e4cb015c2 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -122,11 +122,6 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, - IB_CQ_SOLICITED_MASK); - if (rc) - goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) -- Gitee