diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c53f8c61c9e488c5945520f4dadce3caf3a994c7
--- /dev/null
+++ b/Documentation/networking/smc-sysctl.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+SMC Sysctl
+==========
+
+/proc/sys/net/smc/* Variables
+==============================
+
+autocorking_size - INTEGER
+	Setting SMC auto corking size:
+	SMC auto corking is like TCP auto corking from the application's
+	point of view. When applications do consecutive small
+	write()/sendmsg() system calls, we try to coalesce these small writes
+	as much as possible, to lower the total number of CDC messages and
+	RDMA Writes sent.
+	autocorking_size limits the maximum corked bytes that can be sent to
+	the underlying device in one single send. If set to 0, SMC auto
+	corking is disabled.
+	Applications can still use TCP_CORK for optimal behavior when they
+	know how/when to uncork their sockets.
+
+	Default: 64K
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 9aa530d497da8f738e2f2fd6f2d1b94bfd65c29c..4005895fe296b4d2fffc9c13873acdab16092196 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -360,6 +360,7 @@ struct ucred {
 #define SOL_KCM		281
 #define SOL_TLS		282
 #define SOL_XDP		283
+#define SOL_SMC		286
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2f87377e9af70b8b714f842170e0ba3ac9ffddf8..66177c5e27c9d9018af9d69dcc5d65fa602cd8d6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -394,6 +394,7 @@ struct tcp_sock {
 	bool	is_mptcp;
 #endif
 #if IS_ENABLED(CONFIG_SMC)
+	bool	(*smc_hs_congested)(const struct sock *sk);
 	bool	syn_smc;	/* SYN includes SMC */
 #endif
 
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index cfbdcd4194ecbcb921660e25d52cc031f747591d..135cfa9f42c449f6a094d71ed59f739454ce620f 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -19,15 +19,17 @@ struct netns_smc {
 	/* protect fback_rsn */
 	struct mutex			mutex_fback_rsn;
 	struct smc_stats_rsn		*fback_rsn;
+	int				limit_smc_hs;	/* constraint on handshake */
 	struct smc_convert		smc_conv;
 #ifdef CONFIG_SYSCTL
 	struct ctl_table_header		*smc_hdr;
 #endif
+	unsigned int			sysctl_autocorking_size;
 	int				sysctl_wmem_default;
 	int				sysctl_rmem_default;
 	int				sysctl_tcp2smc;
 	int				sysctl_allow_different_subnet;
-	int				sysctl_autocorking;
+	int				sysctl_keep_first_contact_clcsock;
+	int				sysctl_disable_multiple_link;
 };
-
 #endif
diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index b69bd17f6a5279a76817cd8e0633220c5eba0fda..759bcb2ff03effa84d434dd6bdafa0a3564fa2a2 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -59,6 +59,9 @@ enum {
 	SMC_NETLINK_DUMP_SEID,
 	SMC_NETLINK_ENABLE_SEID,
 	SMC_NETLINK_DISABLE_SEID,
+	SMC_NETLINK_DUMP_HS_LIMITATION,
+	SMC_NETLINK_ENABLE_HS_LIMITATION,
+	SMC_NETLINK_DISABLE_HS_LIMITATION,
 	SMC_NETLINK_ADD_TCP2SMC_WLIST,
 	SMC_NETLINK_DEL_TCP2SMC_WLIST,
 	SMC_NETLINK_GET_TCP2SMC_WLIST,
@@ -285,4 +288,16 @@ enum {
 	__SMC_NLA_SEID_TABLE_MAX,
 	SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1
 };
+
+/* SMC_NETLINK_HS_LIMITATION attributes */
+enum {
+	SMC_NLA_HS_LIMITATION_UNSPEC,
+	SMC_NLA_HS_LIMITATION_ENABLED,	/* u8 */
+	__SMC_NLA_HS_LIMITATION_MAX,
+	SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1
+};
+
+/* SMC socket options */
+#define SMC_LIMIT_HS		1	/* constraint on smc handshake */
+
 #endif /* _UAPI_LINUX_SMC_H */
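The header changes above are everything userspace needs for the new per-socket handshake limitation: SOL_SMC as a new option level and SMC_LIMIT_HS as its first option. A minimal usage sketch follows; it is illustrative only and not part of the patch, the constants simply mirror the values introduced above, and error handling is omitted.

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef AF_SMC
#define AF_SMC		43	/* from include/linux/socket.h */
#endif
#ifndef SOL_SMC
#define SOL_SMC		286	/* added to include/linux/socket.h above */
#endif
#ifndef SMC_LIMIT_HS
#define SMC_LIMIT_HS	1	/* added to include/uapi/linux/smc.h above */
#endif

/* Listen on an AF_SMC socket with the handshake limitation enabled, so
 * that incoming connections degrade to plain TCP while smc_hs_wq is
 * congested instead of piling up SMC handshakes.
 */
static int smc_listen_with_hs_limit(unsigned short port)
{
	struct sockaddr_in addr = {
		.sin_family	= AF_INET,
		.sin_port	= htons(port),
		.sin_addr	= { .s_addr = htonl(INADDR_ANY) },
	};
	int one = 1;
	int fd = socket(AF_SMC, SOCK_STREAM, 0 /* SMCPROTO_SMC */);

	setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &one, sizeof(one));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	return listen(fd, 128) ? -1 : fd;
}

The same behaviour can also be toggled per net namespace through the new SMC_NETLINK_{DUMP,ENABLE,DISABLE}_HS_LIMITATION generic netlink commands; sockets created afterwards inherit net->smc.limit_smc_hs as their default, as the __smc_create() change below shows.

diff --git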
a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1ba6c15c993cbe72c97d57cb5005a406431b0958..a689adf859c5b5a49528543457696b78c94f8137 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6647,7 +6647,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok; + ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && + tcp_sk(sk)->smc_hs_congested(sk)); #endif } diff --git a/net/smc/Makefile b/net/smc/Makefile index 72b3c934e4730b395ff4ef07765a2515c2eeafaf..bd6f807ff803083dd2fb7f9a2b4538f5a76aafaf 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,5 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o smc_sysctl.o smc_proc.o smc_conv.o +smc-y += smc_tracepoint.o smc_proc.o smc_conv.o +smc-$(CONFIG_SYSCTL) += smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1f6120e742f578dcac0bc32a98eb6df02edf96cf..b59fe3958a2748820e3c67186c5a6b5c039e7979 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -52,6 +52,7 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" +#include "smc_sysctl.h" #include "smc_proc.h" #include "smc_conv.h" @@ -62,16 +63,51 @@ static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group * creation on client */ -struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ +static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ struct workqueue_struct *smc_close_wq; /* wq for close work */ static void smc_tcp_listen_work(struct work_struct *); static void smc_connect_work(struct work_struct *); -static void smc_clcsock_state_change(struct sock *clcsk); -static void smc_clcsock_data_ready(struct sock *clcsk); -static void smc_clcsock_write_space(struct sock *clcsk); -static void smc_clcsock_error_report(struct sock *clcsk); + +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + void *hdr; + + if (cb_ctx->pos[0]) + goto out; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_HS_LIMITATION); + if (!hdr) + return -ENOMEM; + + if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED, + sock_net(skb->sk)->smc.limit_smc_hs)) + goto err; + + genlmsg_end(skb, hdr); + cb_ctx->pos[0] = 1; +out: + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = true; + return 0; +} + +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info) +{ + sock_net(skb->sk)->smc.limit_smc_hs = false; + return 0; +} static void smc_set_keepalive(struct sock *sk, int val) { @@ -80,6 +116,61 @@ static void smc_set_keepalive(struct sock *sk, int val) smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); } +static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct smc_sock *smc; + struct sock *child; + + smc = 
smc_clcsock_user_data(sk); + + if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) > + sk->sk_max_ack_backlog) + goto drop; + + if (sk_acceptq_is_full(&smc->sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + /* passthrough to original syn recv sock fct */ + child = smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash, + own_req); + /* child must not inherit smc or its ops */ + if (child) { + rcu_assign_sk_user_data(child, NULL); + + /* v4-mapped sockets don't inherit parent ops. Don't restore. */ + if (inet_csk(child)->icsk_af_ops == inet_csk(sk)->icsk_af_ops) + inet_csk(child)->icsk_af_ops = smc->ori_af_ops; + } + return child; + +drop: + dst_release(dst); + tcp_listendrop(sk); + return NULL; +} + +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -117,12 +208,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -135,17 +241,34 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto6); +static void smc_fback_restore_callbacks(struct smc_sock *smc) +{ + struct sock *clcsk = smc->clcsock->sk; + + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = NULL; + + smc_clcsock_restore_cb(&clcsk->sk_state_change, &smc->clcsk_state_change); + smc_clcsock_restore_cb(&clcsk->sk_data_ready, &smc->clcsk_data_ready); + smc_clcsock_restore_cb(&clcsk->sk_write_space, &smc->clcsk_write_space); + smc_clcsock_restore_cb(&clcsk->sk_error_report, &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + static void smc_restore_fallback_changes(struct smc_sock *smc) { if (smc->clcsock->file) { /* non-accepted sockets have no file yet */ smc->clcsock->file->private_data = smc->sk.sk_socket; smc->clcsock->file = NULL; + smc_fback_restore_callbacks(smc); } } @@ -264,6 +387,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_sndbuf = net->smc.sysctl_wmem_default; sk->sk_rcvbuf = net->smc.sysctl_rmem_default; smc = smc_sk(sk); + smc->keep_clcsock = 0; INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); @@ -273,6 +397,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); + smc_init_saved_callbacks(smc); return 
sk; } @@ -408,6 +533,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, static int smcr_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -447,20 +573,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (!net->smc.sysctl_disable_multiple_link) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -549,6 +677,13 @@ static void smc_link_save_peer_info(struct smc_link *link, memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; + link->credits_enable = clc->r0.init_credits ? 1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, clc->r0.init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. 
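	/* Worked example (illustration only, not part of the patch): if the
	 * peer announced init_credits = 48, the watermark below computes to
	 * max(48 * 2 / 3, 1) = 32, i.e. per the comment above a credit
	 * announcement becomes necessary once fewer than 32 peer rq credits
	 * remain.
	 */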
+ link->peer_cr_watermark_low = max(clc->r0.init_credits * 2 / 3, 1); + } } static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc, @@ -584,10 +719,141 @@ static void smc_stat_fallback(struct smc_sock *smc) mutex_unlock(&net->smc.mutex_fback_rsn); } -static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +/* must be called under rcu read lock */ +static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key) +{ + struct socket_wq *wq; + __poll_t flags; + + wq = rcu_dereference(smc->sk.sk_wq); + if (!skwq_has_sleeper(wq)) + return; + + /* wake up smc sk->sk_wq */ + if (!key) { + /* sk_state_change */ + wake_up_interruptible_all(&wq->wait); + } else { + flags = key_to_poll(key); + if (flags & (EPOLLIN | EPOLLOUT)) + /* sk_data_ready or sk_write_space */ + wake_up_interruptible_sync_poll(&wq->wait, flags); + else if (flags & EPOLLERR) + /* sk_error_report */ + wake_up_interruptible_poll(&wq->wait, flags); + } +} + +static int smc_fback_mark_woken(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) +{ + struct smc_mark_woken *mark = + container_of(wait, struct smc_mark_woken, wait_entry); + + mark->woken = true; + mark->key = key; + return 0; +} + +static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk, + void (*clcsock_callback)(struct sock *sk)) +{ + struct smc_mark_woken mark = { .woken = false }; + struct socket_wq *wq; + + init_waitqueue_func_entry(&mark.wait_entry, + smc_fback_mark_woken); + rcu_read_lock(); + wq = rcu_dereference(clcsk->sk_wq); + if (!wq) + goto out; + add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + clcsock_callback(clcsk); + remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); + + if (mark.woken) + smc_fback_wakeup_waitqueue(smc, mark.key); +out: + rcu_read_unlock(); +} + +static void smc_fback_state_change(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_state_change); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_data_ready(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_data_ready); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_write_space(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_write_space); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_error_report(struct sock *clcsk) +{ + struct smc_sock *smc; + + read_lock_bh(&clcsk->sk_callback_lock); + smc = smc_clcsock_user_data(clcsk); + if (smc) + smc_fback_forward_wakeup(smc, clcsk, + smc->clcsk_error_report); + read_unlock_bh(&clcsk->sk_callback_lock); +} + +static void smc_fback_replace_callbacks(struct smc_sock *smc) { struct sock *clcsk = smc->clcsock->sk; + write_lock_bh(&clcsk->sk_callback_lock); + clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + + smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change, + &smc->clcsk_state_change); + smc_clcsock_replace_cb(&clcsk->sk_data_ready, smc_fback_data_ready, + &smc->clcsk_data_ready); + smc_clcsock_replace_cb(&clcsk->sk_write_space, smc_fback_write_space, + &smc->clcsk_write_space); + smc_clcsock_replace_cb(&clcsk->sk_error_report, 
smc_fback_error_report, + &smc->clcsk_error_report); + + write_unlock_bh(&clcsk->sk_callback_lock); +} + +static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) +{ + int rc = 0; + + mutex_lock(&smc->clcsock_release_lock); + if (!smc->clcsock) { + rc = -EBADF; + goto out; + } + smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); @@ -598,25 +864,30 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; - smc->clcsk_state_change = clcsk->sk_state_change; - smc->clcsk_data_ready = clcsk->sk_data_ready; - smc->clcsk_write_space = clcsk->sk_write_space; - smc->clcsk_error_report = clcsk->sk_error_report; - - clcsk->sk_state_change = smc_clcsock_state_change; - clcsk->sk_data_ready = smc_clcsock_data_ready; - clcsk->sk_write_space = smc_clcsock_write_space; - clcsk->sk_error_report = smc_clcsock_error_report; - - smc->clcsock->sk->sk_user_data = - (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + /* There might be some wait entries remaining + * in smc sk->sk_wq and they should be woken up + * as clcsock's wait queue is woken up. + */ + smc_fback_replace_callbacks(smc); } +out: + mutex_unlock(&smc->clcsock_release_lock); + return rc; } /* fall back during connect */ static int smc_connect_fallback(struct smc_sock *smc, int reason_code) { - smc_switch_to_fallback(smc, reason_code); + struct net *net = sock_net(&smc->sk); + int rc = 0; + + rc = smc_switch_to_fallback(smc, reason_code); + if (rc) { /* fallback fails */ + this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt); + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return rc; + } smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; if (smc->sk.sk_state == SMC_INIT) @@ -651,10 +922,16 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code, static void smc_conn_abort(struct smc_sock *smc, int local_first) { - if (local_first) - smc_lgr_cleanup_early(&smc->conn); - else - smc_conn_free(&smc->conn); + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + bool lgr_valid = false; + + if (smc_conn_lgr_valid(conn)) + lgr_valid = true; + + smc_conn_free(conn); + if (local_first && lgr_valid) + smc_lgr_cleanup_early(lgr); } /* check if there is a rdma device available for this connection. 
*/ @@ -962,6 +1239,11 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } } else { + if (smc_llc_announce_credits(link, SMC_LLC_RESP, true)) { + reason_code = SMC_CLC_DECL_CREDITSERR; + goto connect_abort; + } + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) { reason_code = SMC_CLC_DECL_ERR_REGRMB; goto connect_abort; @@ -1179,8 +1461,14 @@ static int __smc_connect(struct smc_sock *smc) /* perform CLC handshake */ rc = smc_connect_clc(smc, aclc2, ini); - if (rc) + if (rc) { + /* -EAGAIN on timeout, see tcp_recvmsg() */ + if (rc == -EAGAIN) { + rc = -ETIMEDOUT; + smc->sk.sk_err = ETIMEDOUT; + } goto vlan_cleanup; + } /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); @@ -1239,6 +1527,8 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_state = SMC_CLOSED; if (rc == -EPIPE || rc == -EAGAIN) smc->sk.sk_err = EPIPE; + else if (rc == -ECONNREFUSED) + smc->sk.sk_err = ECONNREFUSED; else if (signal_pending(current)) smc->sk.sk_err = -sock_intr_errno(timeo); sock_put(&smc->sk); /* passive closing */ @@ -1358,6 +1648,19 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) * function; switch it back to the original sk_data_ready function */ new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready; + + /* if new clcsock has also inherited the fallback-specific callback + * functions, switch them back to the original ones. + */ + if (lsmc->use_fallback) { + if (lsmc->clcsk_state_change) + new_clcsock->sk->sk_state_change = lsmc->clcsk_state_change; + if (lsmc->clcsk_write_space) + new_clcsock->sk->sk_write_space = lsmc->clcsk_write_space; + if (lsmc->clcsk_error_report) + new_clcsock->sk->sk_error_report = lsmc->clcsk_error_report; + } + (*new_smc)->clcsock = new_clcsock; out: return rc; @@ -1442,6 +1745,7 @@ void smc_close_non_accepted(struct sock *sk) static int smcr_serv_conf_first_link(struct smc_sock *smc) { struct smc_link *link = smc->conn.lnk; + struct net *net = sock_net(&smc->sk); struct smc_llc_qentry *qentry; int rc; @@ -1475,8 +1779,10 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); + if (!net->smc.sysctl_disable_multiple_link) { + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + } return 0; } @@ -1486,6 +1792,9 @@ static void smc_listen_out(struct smc_sock *new_smc) struct smc_sock *lsmc = new_smc->listen_smc; struct sock *newsmcsk = &new_smc->sk; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_dec(&lsmc->queued_smc_hs); + if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -1531,11 +1840,12 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ smc_conn_abort(new_smc, local_first); - if (reason_code < 0) { /* error, no fallback possible */ + if (reason_code < 0 || + smc_switch_to_fallback(new_smc, reason_code)) { + /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_switch_to_fallback(new_smc, reason_code); if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { if (smc_clc_send_decline(new_smc, reason_code, version) < 0) { smc_listen_out_err(new_smc); @@ -1980,8 +2290,11 @@ static void smc_listen_work(struct work_struct *work) /* check if peer is smc capable 
*/ if (!tcp_sk(newclcsock->sk)->syn_smc) { - smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); - smc_listen_out_connected(new_smc); + rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC); + if (rc) + smc_listen_out_err(new_smc); + else + smc_listen_out_connected(new_smc); return; } @@ -2090,6 +2403,9 @@ static void smc_tcp_listen_work(struct work_struct *work) if (!new_smc) continue; + if (tcp_sk(new_smc->clcsock->sk)->syn_smc) + atomic_inc(&lsmc->queued_smc_hs); + new_smc->listen_smc = lsmc; new_smc->use_fallback = lsmc->use_fallback; new_smc->fallback_rsn = lsmc->fallback_rsn; @@ -2108,124 +2424,22 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */ } -static void smc_wake_up_waitqueue(struct smc_sock *smc, void *key) +static void smc_clcsock_data_ready(struct sock *listen_clcsock) { - struct socket_wq *wq; - __poll_t flags; - - rcu_read_lock(); - wq = rcu_dereference(smc->sk.sk_wq); - if (skwq_has_sleeper(wq)) { - if (!key) { - /* sk_state_change */ - wake_up_interruptible_all(&wq->wait); - } else { - flags = key_to_poll(key); - if (flags & (EPOLLIN | EPOLLOUT)) - /* sk_data_ready or sk_write_space */ - wake_up_interruptible_sync_poll(&wq->wait, flags); - else if (flags & EPOLLERR) - /* sk_error_report */ - wake_up_interruptible_poll(&wq->wait, flags); - } - } - rcu_read_unlock(); -} - -static int smc_mark_clcwq_woken(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *key) -{ - struct smc_mark_wake_up *mark; - - mark = container_of(wait, struct smc_mark_wake_up, - wait_entry); - mark->woken = true; - mark->key = key; - return 0; -} - -static void smc_forward_wake_up(struct smc_sock *smc, struct sock *clcsk, - void (*clcsk_callback)(struct sock *sk)) -{ - struct smc_mark_wake_up mark = { .woken = false }; - struct socket_wq *wq; - - rcu_read_lock(); - /* ensure that clcsk->sk_wq still exists */ - wq = rcu_dereference(clcsk->sk_wq); - if (!wq) { - rcu_read_unlock(); - return; - } - - init_waitqueue_func_entry(&mark.wait_entry, - smc_mark_clcwq_woken); - add_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - clcsk_callback(clcsk); - remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry); - rcu_read_unlock(); - - if (mark.woken) - smc_wake_up_waitqueue(smc, mark.key); -} - -static void smc_clcsock_state_change(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_state_change); -} - -static void smc_clcsock_data_ready(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; + struct smc_sock *lsmc; - if (!smc->use_fallback) { - /* listening situation */ - smc->clcsk_data_ready(clcsk); - if (smc->sk.sk_state == SMC_LISTEN) { - sock_hold(&smc->sk); /* sock_put in smc_tcp_listen_work() */ - if (!queue_work(smc_tcp_ls_wq, &smc->tcp_listen_work)) - sock_put(&smc->sk); - } - } else { - /* fallback situation */ - smc_forward_wake_up(smc, clcsk, smc->clcsk_data_ready); + read_lock_bh(&listen_clcsock->sk_callback_lock); + lsmc = smc_clcsock_user_data(listen_clcsock); + if (!lsmc) + goto out; + lsmc->clcsk_data_ready(listen_clcsock); + if (lsmc->sk.sk_state == SMC_LISTEN) { + sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */ + if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work)) + sock_put(&lsmc->sk); } -} - -static void 
smc_clcsock_write_space(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_write_space); -} - -static void smc_clcsock_error_report(struct sock *clcsk) -{ - struct smc_sock *smc; - - smc = (struct smc_sock *) - ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); - if (!smc) - return; - - smc_forward_wake_up(smc, clcsk, smc->clcsk_error_report); +out: + read_unlock_bh(&listen_clcsock->sk_callback_lock); } static int smc_listen(struct socket *sock, int backlog) @@ -2257,13 +2471,31 @@ static int smc_listen(struct socket *sock, int backlog) /* save original sk_data_ready function and establish * smc-specific sk_data_ready function */ - smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready; - smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); smc->clcsock->sk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY); + smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready, + smc_clcsock_data_ready, &smc->clcsk_data_ready); + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); + + /* save original ops */ + smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops; + + smc->af_ops = *smc->ori_af_ops; + smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock; + + inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + rc = kernel_listen(smc->clcsock, backlog); if (rc) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); + smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); goto out; } sk->sk_max_ack_backlog = backlog; @@ -2374,7 +2606,9 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_FASTOPEN) { if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + if (rc) + goto out; } else { rc = -EINVAL; goto out; @@ -2522,8 +2756,10 @@ static int smc_shutdown(struct socket *sock, int how) if (smc->use_fallback) { rc = kernel_sock_shutdown(smc->clcsock, how); sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sock_put(sk); + } goto out; } switch (how) { @@ -2542,7 +2778,7 @@ static int smc_shutdown(struct socket *sock, int how) /* nothing more to do because peer is not involved */ break; } - if (do_shutdown && smc->clcsock) + if (do_shutdown && smc->clcsock && !smc->keep_clcsock) rc1 = kernel_sock_shutdown(smc->clcsock, how); /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; @@ -2552,6 +2788,71 @@ static int smc_shutdown(struct socket *sock, int how) return rc ? 
rc : rc1; } +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) { + rc = -EINVAL; + break; + } + if (copy_from_sockptr(&val, optval, sizeof(int))) { + rc = -EFAULT; + break; + } + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -2561,6 +2862,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); @@ -2598,11 +2901,33 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, case TCP_FASTOPEN_NO_COOKIE: /* option not supported by SMC */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { - smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); + rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); } else { rc = -EINVAL; } break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (val) { + SMC_STAT_INC(smc, ndly_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { + if (!val) { + SMC_STAT_INC(smc, cork_cnt); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); + } + } + break; case TCP_DEFER_ACCEPT: smc->sockopt_defer_accept = val; break; @@ -2621,6 +2946,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int rc; + if (level == SOL_SMC) + return __smc_getsockopt(sock, level, optname, optval, optlen); + smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { @@ -2628,10 +2956,12 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return -EBADF; } /* socket options apply to the CLC socket */ - if (unlikely(!smc->clcsock->ops->getsockopt)) + if (unlikely(!smc->clcsock->ops->getsockopt)) { + mutex_unlock(&smc->clcsock_release_lock); return -EOPNOTSUPP; + } rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, - optval, optlen); + optval, optlen); mutex_unlock(&smc->clcsock_release_lock); return rc; } @@ -2735,8 +3065,10 @@ static ssize_t smc_sendpage(struct socket *sock, struct page *page, rc = kernel_sendpage(smc->clcsock, page, offset, size, flags); } else { + lock_sock(sk); + rc = smc_tx_sendpage(smc, page, offset, size, flags); + release_sock(sk); SMC_STAT_INC(smc, sendpage_cnt); - rc = sock_no_sendpage(sock, page, offset, size, flags); } out: 
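The TCP_CORK and TCP_NODELAY cases added above are what keep explicit corking usable on top of auto corking: clearing TCP_CORK (or setting TCP_NODELAY) immediately pushes whatever has been coalesced via smc_tx_pending(). A small application-side sketch of the pattern the sysctl documentation refers to (illustrative only, assuming fd is a connected AF_SMC socket; error handling omitted):

#include <stddef.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Coalesce a small header and its payload into one corked batch, then
 * uncork so the data is pushed right away instead of waiting for
 * autocorking_size to fill up or the delayed tx worker to run.
 */
static void send_corked_pair(int fd, const void *hdr, size_t hdr_len,
			     const void *body, size_t body_len)
{
	int on = 1, off = 0;

	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	send(fd, hdr, hdr_len, 0);
	send(fd, body, body_len, 0);
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}

TCP_NODELAY behaves symmetrically: setting it takes the branch above that flushes pending tx data and cancels the delayed tx work.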
@@ -2845,6 +3177,9 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; + /* default behavior from limit_smc_hs in every net namespace */ + smc->limit_smc_hs = net->smc.limit_smc_hs; + rc = 0; if (!clcsock) { rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, @@ -2935,23 +3270,17 @@ unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) { - if (net != &init_net) { - net->smc.sysctl_wmem_default = - init_net.smc.sysctl_rmem_default; - net->smc.sysctl_rmem_default = - init_net.smc.sysctl_rmem_default; - net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 0; - net->smc.sysctl_autocorking = 1; - } + int rc; + rc = smc_sysctl_net_init(net); + if (rc) + return rc; return smc_pnet_net_init(net); } static void __net_exit smc_net_exit(struct net *net) { - net->smc.sysctl_tcp2smc = 0; - net->smc.sysctl_allow_different_subnet = 0; + smc_sysctl_net_exit(net); smc_pnet_net_exit(net); } @@ -2980,8 +3309,6 @@ static struct pernet_operations smc_net_stat_ops = { static int __init smc_init(void) { int rc, i; - int max_rshare, max_wshare; - unsigned long limit; rc = register_pernet_subsys(&smc_net_ops); if (rc) @@ -3057,51 +3384,39 @@ static int __init smc_init(void) INIT_HLIST_HEAD(&smc_v6_hashinfo.ht[i]); } - rc = smc_proc_init(); + rc = smc_ib_register_client(); if (rc) { - pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } - rc = smc_conv_init(); + rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { - pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); - goto out_proc; + pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + goto out_ib; } - rc = smc_ib_register_client(); + rc = smc_proc_init(); if (rc) { - pr_err("%s: ib_register fails with %d\n", __func__, rc); - goto out_conv; + pr_err("%s: smc_proc_init fails with %d\n", __func__, rc); + goto out_ulp; } - rc = tcp_register_ulp(&smc_ulp_ops); + rc = smc_conv_init(); if (rc) { - pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_conv; + pr_err("%s: smc_conv_init fails with %d\n", __func__, rc); + goto out_proc; } - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL * 1024 * 1024, limit); - max_rshare = min(6UL * 1024 * 1024, limit); - - init_net.smc.sysctl_wmem_default = 256 * 1024; - init_net.smc.sysctl_rmem_default = 384 * 1024; - init_net.smc.sysctl_tcp2smc = 0; - init_net.smc.sysctl_allow_different_subnet = 0; - init_net.smc.sysctl_autocorking = 1; - -#ifdef CONFIG_SYSCTL - smc_sysctl_init(); -#endif - static_branch_enable(&tcp_have_smc); return 0; -out_conv: - smc_conv_exit(); out_proc: smc_proc_exit(); +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); +out_ib: + smc_ib_unregister_client(); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -3145,9 +3460,6 @@ static void __exit smc_exit(void) smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); -#ifdef CONFIG_SYSCTL - smc_sysctl_exit(); -#endif rcu_barrier(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index e56a776b5034020376fc504271f1507a648d2b9a..05864aeb790994bfae73215e3ba711798e1d633e 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -14,6 +14,7 @@ #include #include #include /* __aligned */ +#include #include #include "smc_ib.h" @@ -24,6 +25,7 @@ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ +#define 
SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; @@ -135,7 +137,7 @@ enum smc_urg_state { SMC_URG_READ = 3, /* data was already read */ }; -struct smc_mark_wake_up { +struct smc_mark_woken { bool woken; void *key; wait_queue_entry_t wait_entry; @@ -188,7 +190,6 @@ struct smc_connection { */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ atomic_t tx_pushing; /* nr_threads trying tx push */ - struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -208,6 +209,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ @@ -241,21 +246,27 @@ struct smc_sock { /* smc sock container */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); - /* original data_ready fct. **/ + /* original data_ready fct. */ void (*clcsk_write_space)(struct sock *sk); /* original write_space fct. */ void (*clcsk_error_report)(struct sock *sk); /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + bool keep_clcsock; struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ + atomic_t queued_smc_hs; /* queued smc handshakes */ + struct inet_connection_sock_af_ops af_ops; + const struct inet_connection_sock_af_ops *ori_af_ops; + /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value @@ -280,7 +291,41 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) return (struct smc_sock *)sk; } -extern struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ +static inline void smc_init_saved_callbacks(struct smc_sock *smc) +{ + smc->clcsk_state_change = NULL; + smc->clcsk_data_ready = NULL; + smc->clcsk_write_space = NULL; + smc->clcsk_error_report = NULL; +} + +static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) +{ + return (struct smc_sock *) + ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); +} + +/* save target_cb in saved_cb, and replace target_cb with new_cb */ +static inline void smc_clcsock_replace_cb(void (**target_cb)(struct sock *), + void (*new_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + /* only save once */ + if (!*saved_cb) + *saved_cb = *target_cb; + *target_cb = new_cb; +} + +/* restore target_cb to saved_cb, and reset saved_cb to NULL */ +static inline void smc_clcsock_restore_cb(void (**target_cb)(struct sock *), + void (**saved_cb)(struct sock *)) +{ + if (!*saved_cb) + return; + *target_cb = *saved_cb; + *saved_cb = NULL; +} + extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ @@ -330,9 +375,9 @@ void smc_fill_gid_list(struct smc_link_group *lgr, struct smc_gidlist *gidlist, struct smc_ib_device *known_dev, u8 *known_gid); -#ifdef 
CONFIG_SYSCTL -int smc_sysctl_init(void); -void smc_sysctl_exit(void); -#endif +/* smc handshake limitation interface for netlink */ +int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 2b453894ed3809557fe407c4a9c6c87611cc9fbb..c469a0c67c3c1d09229e9452e397becfe28734b4 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -34,6 +34,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { + atomic_inc(&link->cdc_comp_cnt); diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); @@ -49,10 +50,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR complete, push them to prevent - * no one trying to push when corked. + /* If user owns the sock_lock, mark the connection need sending. + * User context will later try to send when it release sock_lock + * in smc_release_cb() */ - smc_tx_sndbuf_nonempty(conn); + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) wake_up(&conn->cdc_pend_tx_wq); } @@ -106,25 +112,31 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_cdc_tx_pend *pend) { struct smc_link *link = conn->lnk; + struct smc_cdc_msg *cdc_msg = (struct smc_cdc_msg *)wr_buf; union smc_host_cursor cfed; + u8 saved_credits = 0; int rc; smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; - smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed); + smc_host_msg_to_cdc(cdc_msg, conn, &cfed); + saved_credits = (u8)smc_wr_rx_get_credits(link); + cdc_msg->credits = saved_credits; atomic_inc(&conn->cdc_pend_tx_wr); smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); - if (!rc) { + if (likely(!rc)) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + atomic_inc(&link->cdc_send_cnt); } else { conn->tx_cdc_seq--; conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_wr_rx_put_credits(link, saved_credits); atomic_dec(&conn->cdc_pend_tx_wr); } @@ -202,7 +214,8 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) { int rc; - if (!conn->lgr || (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) + if (!smc_conn_lgr_valid(conn) || + (conn->lgr->is_smcd && conn->lgr->peer_shutdown)) return -EPIPE; if (conn->lgr->is_smcd) { @@ -354,8 +367,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { @@ -435,6 +452,9 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, 
void *buf) if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ + if (cdc->credits) + smc_wr_tx_put_credits(link, cdc->credits, true); + /* lookup connection */ lgr = smc_get_lgr(link); read_lock_bh(&lgr->conns_lock); diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 696cc11f2303b95318f6750479bb8abffde3ca24..145ce7997e64207427b297d7f5c9bdd957d03704 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -47,7 +47,8 @@ struct smc_cdc_msg { union smc_cdc_cursor cons; /* piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; - u8 reserved[18]; + u8 credits; /* credits synced by every cdc msg */ + u8 reserved[17]; }; /* SMC-D cursor format */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 6be95a2a7b2515216eef177b45bb02af8c4fc208..bd07837d21d995ef5c5d3b7cf51e79439d9c4947 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -191,7 +191,8 @@ static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, flags, SMC_NETLINK_DUMP_UEID); if (!hdr) return -ENOMEM; - snprintf(ueid_str, sizeof(ueid_str), "%s", ueid); + memcpy(ueid_str, ueid, SMC_MAX_EID_LEN); + ueid_str[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) { genlmsg_cancel(skb, hdr); return -EMSGSIZE; @@ -252,7 +253,8 @@ int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb) goto end; smc_ism_get_system_eid(&seid); - snprintf(seid_str, sizeof(seid_str), "%s", seid); + memcpy(seid_str, seid, SMC_MAX_EID_LEN); + seid_str[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str)) goto err; read_lock(&smc_clc_eid_table.lock); @@ -774,7 +776,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? SMC_FIRST_CONTACT_MASK : 0; - if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) && + if ((!smc_conn_lgr_valid(&smc->conn) || !smc->conn.lgr->is_smcd) && smc_ib_is_valid_local_systemid()) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); @@ -1038,9 +1040,12 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, switch (clc->hdr.type) { case SMC_CLC_ACCEPT: clc->r0.qp_mtu = link->path_mtu; + clc->r0.init_credits = (u8)link->wr_rx_cnt; break; case SMC_CLC_CONFIRM: clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + clc->r0.init_credits = + link->credits_enable ? 
(u8)link->wr_rx_cnt : 0; break; } clc->r0.rmbe_size = conn->rmbe_size_short; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 83f02f131fc090b1691ab11aab2823e4b3d24f59..eb4bba54d6df77f3fc036959dc7077c0e9de478f 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -63,6 +63,7 @@ #define SMC_CLC_DECL_ERR_RTOK 0x09990001 /* rtoken handling failed */ #define SMC_CLC_DECL_ERR_RDYLNK 0x09990002 /* ib ready link failed */ #define SMC_CLC_DECL_ERR_REGRMB 0x09990003 /* reg rmb failed */ +#define SMC_CLC_DECL_CREDITSERR 0x09990004 /* announce credits failed */ #define SMC_FIRST_CONTACT_MASK 0b10 /* first contact bit within typev2 */ @@ -190,7 +191,7 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ u8 qp_mtu : 4, rmbe_size : 4; #endif - u8 reserved; + u8 init_credits; /* QP rq init credits for rq flowctrl */ __be64 rmb_dma_addr; /* RMB virtual address */ u8 reserved2; u8 psn[3]; /* packet sequence number */ diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 292e4d904ab6e4afbba2cef421bce27cae5af364..038bcafe9a9e9eb21efacbdf814365d9aadc85b9 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -28,10 +28,12 @@ void smc_clcsock_release(struct smc_sock *smc) if (smc->listen_smc && current_work() != &smc->smc_listen_work) cancel_work_sync(&smc->smc_listen_work); mutex_lock(&smc->clcsock_release_lock); + /* don't release clcsock for eRDMA */ if (smc->clcsock) { tcp = smc->clcsock; smc->clcsock = NULL; - sock_release(tcp); + if (!smc->keep_clcsock) + sock_release(tcp); } mutex_unlock(&smc->clcsock_release_lock); } @@ -57,6 +59,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) if (!smc_tx_prepared_sends(&smc->conn)) return; + /* Send out corked data remaining in sndbuf */ + smc_tx_pending(&smc->conn); + smc->wait_close_tx_prepared = 1; add_wait_queue(sk_sleep(sk), &wait); while (!signal_pending(current) && timeout) { @@ -211,8 +216,11 @@ int smc_close_active(struct smc_sock *smc) sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); /* wake up accept */ if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; + write_lock_bh(&smc->clcsock->sk->sk_callback_lock); + smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready, + &smc->clcsk_data_ready); smc->clcsock->sk->sk_user_data = NULL; + write_unlock_bh(&smc->clcsock->sk->sk_callback_lock); rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); } smc_close_cleanup_listen(sk); @@ -233,7 +241,8 @@ int smc_close_active(struct smc_sock *smc) /* actively shutdown clcsock before peer close it, * prevent peer from entering TIME_WAIT state. */ - if (smc->clcsock && smc->clcsock->sk) { + if (smc->clcsock && smc->clcsock->sk && + !smc->keep_clcsock) { rc1 = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); rc = rc ? 
rc : rc1; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 278fbc18c950abda871e72aceb895a9615b83ce0..9ccf9a432c3c06f90c0184892aa9a786768976c4 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -155,7 +155,6 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) if (!conn->lnk) return SMC_CLC_DECL_NOACTLINK; atomic_inc(&conn->lnk->conn_cnt); - smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ return 0; } @@ -187,7 +186,6 @@ static int smc_lgr_register_conn(struct smc_connection *conn, bool first) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); - smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ conn->lgr->conns_num++; return 0; } @@ -213,7 +211,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - if (!lgr) + if (!smc_conn_lgr_valid(conn)) return; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { @@ -625,15 +623,13 @@ int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void smc_lgr_cleanup_early(struct smc_connection *conn) +void smc_lgr_cleanup_early(struct smc_link_group *lgr) { - struct smc_link_group *lgr = conn->lgr; spinlock_t *lgr_lock; if (!lgr) return; - smc_conn_free(conn); smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); /* do not use this link group for new connections */ @@ -776,6 +772,18 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); + + atomic_set(&lnk->total_send_cnt, 0); + atomic_set(&lnk->total_comp_cnt, 0); + atomic_set(&lnk->reg_send_cnt, 0); + atomic_set(&lnk->reg_comp_cnt, 0); + atomic_set(&lnk->cdc_send_cnt, 0); + atomic_set(&lnk->cdc_comp_cnt, 0); + atomic_set(&lnk->llc_send_cnt, 0); + atomic_set(&lnk->llc_comp_cnt, 0); + atomic_set(&lnk->rdma_write_cnt, 0); + atomic_set(&lnk->bad_comp_cnt, 0); + smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); if (!lnk->smcibdev->initialized) { @@ -826,6 +834,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->state = SMC_LNK_UNUSED; if (!atomic_dec_return(&smcibdev->lnk_cnt)) wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold above */ return rc; } @@ -917,6 +926,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; smcr_link_iw_extension(&lnk->iw_conn_param, smc->clcsock->sk); + lnk->clcsock = smc->clcsock; rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) { @@ -1019,11 +1029,11 @@ void smc_switch_link_and_count(struct smc_connection *conn, struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); - /* put old link, hold in smcr_lgr_conn_assign_link() */ + /* link_hold in smc_conn_create() */ smcr_link_put(conn->lnk); conn->lnk = to_lnk; atomic_inc(&conn->lnk->conn_cnt); - /* hold new link, put in smc_conn_free() */ + /* link_put in smc_conn_free() */ smcr_link_hold(conn->lnk); } @@ -1158,17 +1168,16 @@ void smc_conn_free(struct smc_connection *conn) struct smc_link_group *lgr = conn->lgr; if (!lgr || conn->freed) - /* smc connection wasn't registered to a link group - * or has already been freed before. - * - * Judge these to ensure that lgr/link refcnt will be - * put only once if connection has been registered to - * a link group successfully. + /* Connection has never been registered in a + * link group, or has already been freed. 
*/ return; conn->freed = 1; - if (conn->killed) + if (!smc_conn_lgr_valid(conn)) + /* Connection has already unregistered from + * link group. + */ goto lgr_put; if (lgr->is_smcd) { @@ -1189,8 +1198,8 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); lgr_put: if (!lgr->is_smcd) - smcr_link_put(conn->lnk); /* link_hold in smcr_lgr_conn_assign_link() */ - smc_lgr_put(lgr); /* lgr_hold in smc_lgr_register_conn() */ + smcr_link_put(conn->lnk); /* link_hold in smc_conn_create() */ + smc_lgr_put(lgr); /* lgr_hold in smc_conn_create() */ } /* unregister a link from a buf_desc */ @@ -1246,20 +1255,28 @@ static void smcr_rtoken_clear_link(struct smc_link *lnk) } } -void __smcr_link_clear(struct smc_link *lnk) +static void __smcr_link_clear(struct smc_link *lnk) { + struct smc_link_group *lgr = lnk->lgr; + struct smc_ib_device *smcibdev; + smc_wr_free_link_mem(lnk); - smc_lgr_put(lnk->lgr); /* lgr_hold in smcr_link_init() */ + smc_ibdev_cnt_dec(lnk); + if (lnk->clcsock) + sock_release(lnk->clcsock); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); + smc_lgr_put(lgr); /* lgr_hold in smcr_link_init() */ } /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_clear(struct smc_link *lnk, bool log) { - struct smc_ib_device *smcibdev; - - if (lnk->clearing || !lnk->lgr || + if (!lnk->lgr || lnk->clearing || lnk->state == SMC_LNK_UNUSED) return; lnk->clearing = 1; @@ -1271,11 +1288,6 @@ void smcr_link_clear(struct smc_link *lnk, bool log) smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); - smc_ibdev_cnt_dec(lnk); - put_device(&lnk->smcibdev->ibdev->dev); - smcibdev = lnk->smcibdev; - if (!atomic_dec_return(&smcibdev->lnk_cnt)) - wake_up(&smcibdev->lnks_deleted); smcr_link_put(lnk); /* theoretically last link_put */ } @@ -1359,6 +1371,14 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) static void __smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); + if (lgr->is_smcd) { + if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) + wake_up(&lgr->smcd->lgrs_deleted); + } else { + smc_wr_free_lgr_mem(lgr); + if (!atomic_dec_return(&lgr_cnt)) + wake_up(&lgrs_deleted); + } kfree(lgr); } @@ -1381,12 +1401,6 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); - if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) - wake_up(&lgr->smcd->lgrs_deleted); - } else { - smc_wr_free_lgr_mem(lgr); - if (!atomic_dec_return(&lgr_cnt)) - wake_up(&lgrs_deleted); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1551,16 +1565,11 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd) /* Called when an SMCR device is removed or the smc module is unloaded. * If smcibdev is given, all SMCR link groups using this device are terminated. * If smcibdev is NULL, all SMCR link groups are terminated. - * - * We must wait here for QPs been destroyed before we destroy the CQs, - * or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus - * smc_sock cannot be released. 
*/ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); - LIST_HEAD(lgr_linkdown_list); int i; spin_lock_bh(&smc_lgr_list.lock); @@ -1572,7 +1581,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].smcibdev == smcibdev) - list_move_tail(&lgr->list, &lgr_linkdown_list); + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -1584,16 +1593,6 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) __smc_lgr_terminate(lgr, false); } - list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) { - for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { - if (lgr->lnk[i].smcibdev == smcibdev) { - mutex_lock(&lgr->llc_conf_mutex); - smcr_link_down_cond(&lgr->lnk[i]); - mutex_unlock(&lgr->llc_conf_mutex); - } - } - } - if (smcibdev) { if (atomic_read(&smcibdev->lnk_cnt)) wait_event(smcibdev->lnks_deleted, @@ -1674,6 +1673,9 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) lgr->type == SMC_LGR_ASYMMETRIC_PEER || !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + continue; /* trigger local add link processing */ link = smc_llc_usable_link(lgr); @@ -1941,6 +1943,9 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) create: if (ini->first_contact_local) { + /* keep this clcsock for QP reuse */ + if (net->smc.sysctl_keep_first_contact_clcsock) + smc->keep_clcsock = 1; rc = smc_lgr_create(smc, ini); if (rc) goto out; @@ -1949,14 +1954,14 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); if (rc) { - spin_lock_bh(lgr_lock); - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(lgr_lock); - __smc_lgr_terminate(lgr, true); + smc_lgr_cleanup_early(lgr); goto out; } } + smc_lgr_hold(conn->lgr); /* lgr_put in smc_conn_free() */ + if (!conn->lgr->is_smcd) + smcr_link_hold(conn->lnk); /* link_put in smc_conn_free() */ + conn->freed = 0; conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; @@ -2342,14 +2347,16 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - if (!conn->lgr || conn->lgr->is_smcd || !smc_link_active(conn->lnk)) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd || + !smc_link_active(conn->lnk)) return; smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } @@ -2358,7 +2365,7 @@ void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&conn->lgr->lnk[i])) @@ -2372,7 +2379,7 @@ void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { int i; - if (!conn->lgr || conn->lgr->is_smcd) + if (!smc_conn_lgr_valid(conn) || conn->lgr->is_smcd) return; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if 
(!smc_link_active(&conn->lgr->lnk[i])) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0c2dc09cfccf588b22c2332ac28891e6b57023e2..3d8954ca0af1443da48be114873f36d22de85609 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -21,7 +21,12 @@ #include "smc.h" #include "smc_ib.h" -#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_RMBS_PER_LGR_MAX 32 /* max. # of RMBs per link group. Correspondingly, + * SMC_WR_BUF_CNT should not be less than 2 * + * SMC_RMBS_PER_LGR_MAX, since every connection at + * least has two rq/sq credits in average, otherwise + * may result in waiting for credits in sending process. + */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -80,6 +85,8 @@ struct smc_rdma_wr { /* work requests per message #define SMC_LGR_ID_SIZE 4 +#define SMC_LINKFLAG_ANNOUNCE_PENDING 0 + struct smc_link { struct iw_ext_conn_param iw_conn_param; struct smc_ib_device *smcibdev; /* ib-device */ @@ -87,6 +94,8 @@ struct smc_link { struct ib_pd *roce_pd; /* IB protection domain, * unique for every RoCE QP */ + struct smc_ib_cq *smcibcq_recv; /* cq for recv */ + struct smc_ib_cq *smcibcq_send; /* cq for send */ struct ib_qp *roce_qp; /* IB queue pair */ struct ib_qp_attr qp_attr; /* IB queue pair attributes */ @@ -124,6 +133,14 @@ struct smc_link { atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ + atomic_t peer_rq_credits; /* credits for peer rq flowctrl */ + atomic_t local_rq_credits; /* credits for local rq flowctrl */ + u8 credits_enable; /* credits enable flag, set when negotiation */ + u8 local_cr_watermark_high; /* local rq credits watermark */ + u8 peer_cr_watermark_low; /* peer rq credits watermark */ + struct work_struct credits_announce_work; /* work for credits announcement */ + unsigned long flags; /* link flags, SMC_LINKFLAG_ANNOUNCE_PENDING .etc */ + u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ u8 sgid_index; /* gid index for vlan id */ u32 peer_qpn; /* QP number of peer */ @@ -150,6 +167,18 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + + struct socket *clcsock; /* keep for eRDMA */ + atomic_t total_send_cnt; + atomic_t total_comp_cnt; + atomic_t cdc_send_cnt; + atomic_t cdc_comp_cnt; + atomic_t llc_send_cnt; + atomic_t llc_comp_cnt; + atomic_t reg_send_cnt; + atomic_t reg_comp_cnt; + atomic_t rdma_write_cnt; + atomic_t bad_comp_cnt; }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -413,6 +442,11 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +static inline bool smc_conn_lgr_valid(struct smc_connection *conn) +{ + return conn->lgr && conn->alert_token_local; +} + /* * Returns true if the specified link is usable. 
* @@ -489,7 +523,7 @@ static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_sock; struct smc_clc_msg_accept_confirm; -void smc_lgr_cleanup_early(struct smc_connection *conn); +void smc_lgr_cleanup_early(struct smc_link_group *lgr); void smc_lgr_terminate_sched(struct smc_link_group *lgr); void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 1fa7c7cf93325c417574b997703d7ee259443be6..bbe00b50b6662379241672f7f4bd91db00a3c65f 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -89,7 +89,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; - else if (smc->conn.lgr && smc->conn.lgr->is_smcd) + else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; @@ -148,7 +148,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, goto errout; } - if (smc->conn.lgr && !smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_diag_lgrinfo linfo = { @@ -168,7 +168,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } - if (smc->conn.lgr && smc->conn.lgr->is_smcd && + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 8e2b1af1d291a7485c9ee6c52ea9c53e4591aec4..c98e871b54c45a066559e0984cb93286cfd27476 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -131,12 +131,12 @@ int smc_ib_ready_link(struct smc_link *lnk) if (rc) goto out; smc_wr_remember_qp_attr(lnk); - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + rc = ib_req_notify_cq(lnk->smcibcq_recv->ib_cq, IB_CQ_SOLICITED_MASK); if (rc) goto out; - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_send, + rc = ib_req_notify_cq(lnk->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); if (rc) goto out; @@ -630,6 +630,36 @@ int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static struct smc_ib_cq *smc_ib_get_least_used_cq(struct smc_ib_device *smcibdev, + bool is_send) +{ + struct smc_ib_cq *smcibcq, *cq; + int min, i; + + if (is_send) + smcibcq = smcibdev->smcibcq_send; + else + smcibcq = smcibdev->smcibcq_recv; + + cq = smcibcq; + min = cq->load; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibcq[i].load < min) { + cq = &smcibcq[i]; + min = cq->load; + } + } + + cq->load++; + return cq; +} + +static void smc_ib_put_cq(struct smc_ib_cq *smcibcq) +{ + smcibcq->load--; +} + static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) { struct smc_link *lnk = (struct smc_link *)priv; @@ -653,27 +683,38 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) void smc_ib_destroy_queue_pair(struct smc_link *lnk) { - if (lnk->roce_qp) + if (lnk->roce_qp) { ib_destroy_qp(lnk->roce_qp); + smc_ib_put_cq(lnk->smcibcq_send); + smc_ib_put_cq(lnk->smcibcq_recv); + } lnk->roce_qp = NULL; + lnk->smcibcq_send = NULL; + lnk->smcibcq_recv = NULL; } /* create a queue pair within the protection domain for a link */ int 
smc_ib_create_queue_pair(struct smc_link *lnk) { + struct smc_ib_cq *smcibcq_send = smc_ib_get_least_used_cq(lnk->smcibdev, + true); + struct smc_ib_cq *smcibcq_recv = smc_ib_get_least_used_cq(lnk->smcibdev, + false); int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, - .send_cq = lnk->smcibdev->roce_cq_send, - .recv_cq = lnk->smcibdev->roce_cq_recv, + .send_cq = smcibcq_send->ib_cq, + .recv_cq = smcibcq_recv->ib_cq, .srq = NULL, .cap = { /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND + * there are max. 2 RDMA_WRITE per 1 WR_SEND. + * RDMA_WRITE consumes send queue entities, + * without recv queue entities. */ .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_recv_wr = SMC_WR_BUF_CNT, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, }, @@ -690,10 +731,13 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); - if (IS_ERR(lnk->roce_qp)) + if (IS_ERR(lnk->roce_qp)) { lnk->roce_qp = NULL; - else + } else { + lnk->smcibcq_send = smcibcq_send; + lnk->smcibcq_recv = smcibcq_recv; smc_wr_remember_qp_attr(lnk); + } return rc; } @@ -810,11 +854,34 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk, buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } +static void smc_ib_cleanup_cq(struct smc_ib_device *smcibdev) +{ + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + if (smcibdev->smcibcq_send[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_send[i].ib_cq); + + if (smcibdev->smcibcq_recv[i].ib_cq) + ib_destroy_cq(smcibdev->smcibcq_recv[i].ib_cq); + } + + kfree(smcibdev->smcibcq_send); + kfree(smcibdev->smcibcq_recv); +} + +static void cq_event_handler(struct ib_event *event, void *data) +{ + pr_warn("smc: event %u (%s) data %p\n", + event->event, ib_event_msg(event->event), data); +} + long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { - struct ib_cq_init_attr cqattr = { - .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + struct ib_cq_init_attr cqattr = { .cqe = SMC_MAX_CQE }; int cqe_size_order, smc_order; + struct smc_ib_cq *smcibcq; + int i, num_cq_peer; long rc; mutex_lock(&smcibdev->mutex); @@ -826,28 +893,53 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) smc_order = MAX_ORDER - cqe_size_order - 1; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; - smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, - smc_wr_tx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); - if (IS_ERR(smcibdev->roce_cq_send)) { - smcibdev->roce_cq_send = NULL; - goto out; + num_cq_peer = min_t(int, smcibdev->ibdev->num_comp_vectors, + num_online_cpus()); + smcibdev->num_cq_peer = num_cq_peer; + smcibdev->smcibcq_send = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_send) { + rc = -ENOMEM; + goto err; } - smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, - smc_wr_rx_cq_handler, NULL, - smcibdev, &cqattr); - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); - if (IS_ERR(smcibdev->roce_cq_recv)) { - smcibdev->roce_cq_recv = NULL; + smcibdev->smcibcq_recv = kcalloc(num_cq_peer, sizeof(*smcibcq), + GFP_KERNEL); + if (!smcibdev->smcibcq_recv) { + rc = -ENOMEM; goto err; } + + /* initialize CQs */ + for (i = 0; i < num_cq_peer; i++) { + /* initialize send CQ */ + smcibcq = 
&smcibdev->smcibcq_send[i]; + smcibcq->smcibdev = smcibdev; + smcibcq->is_send = 1; + cqattr.comp_vector = i; + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, cq_event_handler, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; + + /* initialize recv CQ */ + smcibcq = &smcibdev->smcibcq_recv[i]; + smcibcq->smcibdev = smcibdev; + cqattr.comp_vector = num_cq_peer - 1 - i; /* reverse to spread snd/rcv */ + smcibcq->ib_cq = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, cq_event_handler, + smcibcq, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibcq->ib_cq); + if (IS_ERR(smcibcq->ib_cq)) + goto err; + } smc_wr_add_dev(smcibdev); smcibdev->initialized = 1; goto out; err: - ib_destroy_cq(smcibdev->roce_cq_send); + smc_ib_cleanup_cq(smcibdev); out: mutex_unlock(&smcibdev->mutex); return rc; @@ -859,8 +951,7 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) if (!smcibdev->initialized) goto out; smcibdev->initialized = 0; - ib_destroy_cq(smcibdev->roce_cq_recv); - ib_destroy_cq(smcibdev->roce_cq_send); + smc_ib_cleanup_cq(smcibdev); smc_wr_remove_dev(smcibdev); out: mutex_unlock(&smcibdev->mutex); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5d8b49c57f507b62ab23a8067000757d0bfd0265..1af83b5a2e7e0505a0edc0510a3bb13e5d918de5 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,15 +32,22 @@ struct smc_ib_devices { /* list of smc ib devices definition */ extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ extern struct smc_lgr_list smc_lgr_list; /* list of linkgroups */ +struct smc_ib_cq { /* ib_cq wrapper for smc */ + struct smc_ib_device *smcibdev; /* parent ib device */ + struct ib_cq *ib_cq; /* real ib_cq for link */ + struct tasklet_struct tasklet; /* tasklet for wr */ + bool is_send; /* send for recv cq */ + int load; /* load of current cq */ +}; + struct smc_ib_device { /* ib-device infos for smc */ struct list_head list; struct ib_device *ibdev; struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. 
port attrs */ struct ib_event_handler event_handler; /* global ib_event handler */ - struct ib_cq *roce_cq_send; /* send completion queue */ - struct ib_cq *roce_cq_recv; /* recv completion queue */ - struct tasklet_struct send_tasklet; /* called by send cq handler */ - struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + int num_cq_peer; /* num of snd/rcv cq peer */ + struct smc_ib_cq *smcibcq_send; /* send cqs */ + struct smc_ib_cq *smcibcq_recv; /* recv cqs */ char mac[SMC_MAX_PORTS][ETH_ALEN]; /* mac address per port*/ u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN]; diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 1d8dafa1a35e57d012595e8e31b70f19b33809e6..d323b81f6d0410329ad1a5ff5bd53cb9558b44e5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -75,7 +75,8 @@ struct smc_llc_msg_add_link { /* type 0x02 */ reserved3 : 4; #endif u8 initial_psn[3]; - u8 reserved[8]; + u8 init_credits; /* QP rq init credits for rq flowctrl */ + u8 reserved[7]; }; struct smc_llc_msg_add_link_cont_rt { @@ -170,6 +171,12 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_announce_credits { /* type 0x0A */ + struct smc_llc_hdr hd; + u8 credits; + u8 reserved[39]; +}; + struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ struct smc_llc_hdr hd; u8 num_rkeys; @@ -189,6 +196,7 @@ union smc_llc_msg { struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; + struct smc_llc_msg_announce_credits announce_credits; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -381,6 +389,8 @@ static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, enum ib_wc_status wc_status) { /* future work: handle wc_status error for recovery and failover */ + if (!wc_status) + atomic_inc(&link->llc_comp_cnt); } /** @@ -748,6 +758,46 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } +/* send credits announce request or response */ +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force) +{ + struct smc_llc_msg_announce_credits *announce_credits; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + u8 saved_credits = 0; + + if (!link->credits_enable || + (!force && !smc_wr_rx_credits_need_announce(link))) + return 0; + + saved_credits = (u8)smc_wr_rx_get_credits(link); + if (!saved_credits) + /* maybe synced by cdc msg */ + return 0; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) { + smc_wr_rx_put_credits(link, saved_credits); + return rc; + } + + announce_credits = (struct smc_llc_msg_announce_credits *)wr_buf; + memset(announce_credits, 0, sizeof(*announce_credits)); + announce_credits->hd.common.type = SMC_LLC_ANNOUNCE_CREDITS; + announce_credits->hd.length = sizeof(struct smc_llc_msg_announce_credits); + if (reqresp == SMC_LLC_RESP) + announce_credits->hd.flags |= SMC_LLC_FLAG_RESP; + announce_credits->credits = saved_credits; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + if (rc) + smc_wr_rx_put_credits(link, saved_credits); + + return rc; +} + /* schedule an llc send on link, may wait for buffers */ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { @@ -1010,6 +1060,13 @@ static void smc_llc_save_add_link_info(struct smc_link *link, memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); link->peer_psn = ntoh24(add_llc->initial_psn); link->peer_mtu = add_llc->qp_mtu; + link->credits_enable = add_llc->init_credits ? 
1 : 0; + if (link->credits_enable) { + atomic_set(&link->peer_rq_credits, add_llc->init_credits); + // set peer rq credits watermark, if less than init_credits * 2/3, + // then credit announcement is needed. + link->peer_cr_watermark_low = max(add_llc->init_credits * 2 / 3, 1); + } } /* as an SMC client, process an add link request */ @@ -1030,6 +1087,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) rc = -ENOMEM; goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out_reject; ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1059,6 +1119,8 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; lnk_new->iw_conn_param = link->iw_conn_param; + lnk_new->clcsock = link->clcsock; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; @@ -1155,6 +1217,9 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) @@ -1400,6 +1465,9 @@ int smc_llc_srv_add_link(struct smc_link *link, rc = -ENOMEM; goto out; } + if (lgr->type == SMC_LGR_SINGLE && + lgr->net->smc.sysctl_disable_multiple_link) + goto out; /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; @@ -1430,6 +1498,7 @@ int smc_llc_srv_add_link(struct smc_link *link, } lgr->lnk[lnk_idx].iw_conn_param = link->iw_conn_param; + lgr->lnk[lnk_idx].clcsock = link->clcsock; rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) goto out; @@ -1930,6 +1999,10 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, llc->announce_credits.credits, true); + break; case SMC_LLC_REQ_ADD_LINK: /* handle response here, smc_llc_flow_stop() cannot be called * in tasklet context @@ -2015,6 +2088,10 @@ static void smc_llc_rx_response(struct smc_link *link, case SMC_LLC_CONFIRM_RKEY_CONT: /* not used because max links is 3 */ break; + case SMC_LLC_ANNOUNCE_CREDITS: + if (smc_link_active(link)) + smc_wr_tx_put_credits(link, qentry->msg.announce_credits.credits, true); + break; default: smc_llc_protocol_violation(link->lgr, qentry->msg.raw.hdr.common.type); @@ -2108,6 +2185,27 @@ static void smc_llc_testlink_work(struct work_struct *work) schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } +static void smc_llc_announce_credits_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, + struct smc_link, credits_announce_work); + int rc, retry = 0, agains = 0; + +again: + do { + rc = smc_llc_announce_credits(link, SMC_LLC_RESP, false); + } while ((rc == -EBUSY) && smc_link_sendable(link) && + (retry++ < SMC_LLC_ANNOUNCE_CR_MAX_RETRY)); + + if (smc_wr_rx_credits_need_announce(link) && + smc_link_sendable(link) && agains <= 5 && !rc) { + agains++; + goto again; + } + + clear_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); +} + void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { struct net *net = sock_net(smc->clcsock->sk); @@ -2143,6 +2241,7 @@ int smc_llc_link_init(struct smc_link *link) { init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, 
smc_llc_testlink_work); + INIT_WORK(&link->credits_announce_work, smc_llc_announce_credits_work); return 0; } @@ -2174,6 +2273,7 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); + cancel_work_sync(&link->credits_announce_work); } /* register a new rtoken at the remote peer (for all links) */ @@ -2288,6 +2388,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ANNOUNCE_CREDITS + }, /* V2 types */ { .handler = smc_llc_rx_handler, diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4404e52b3346fbc51855667fb4832f32a9987ef9..f8a14643faf4ff080c51d535920651bc2a9551ea 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -20,6 +20,8 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_ANNOUNCE_CR_MAX_RETRY (1) + enum smc_llc_reqresp { SMC_LLC_REQ, SMC_LLC_RESP @@ -35,6 +37,7 @@ enum smc_llc_msg_type { SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + SMC_LLC_ANNOUNCE_CREDITS = 0X0A, /* V2 types */ SMC_LLC_CONFIRM_LINK_V2 = 0x21, SMC_LLC_ADD_LINK_V2 = 0x22, @@ -86,6 +89,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, enum smc_llc_reqresp reqresp, bool orderly, u32 reason); +int smc_llc_announce_credits(struct smc_link *link, + enum smc_llc_reqresp reqresp, bool force); void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); void smc_llc_lgr_clear(struct smc_link_group *lgr); diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index f2007aa124cfa99af36d48c659e33192013cfc53..52dba083b70e600296164ab6403d028545660497 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -112,6 +112,21 @@ static const struct genl_ops smc_gen_nl_ops[] = { .flags = GENL_ADMIN_PERM, .doit = smc_nl_disable_seid, }, + { + .cmd = SMC_NETLINK_DUMP_HS_LIMITATION, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_hs_limitation, + }, + { + .cmd = SMC_NETLINK_ENABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_hs_limitation, + }, + { + .cmd = SMC_NETLINK_DISABLE_HS_LIMITATION, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_hs_limitation, + }, { .cmd = SMC_NETLINK_ADD_TCP2SMC_WLIST, /* can be retrieved by unprivileged users */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2bba184ca093350795c43b769c50df7a1541c89f..1ed4bbccaf314d0dc2069972175549d3aab346d1 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -868,6 +868,9 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; + return 0; } diff --git a/net/smc/smc_proc.c b/net/smc/smc_proc.c index 19d8cc82a7ac683b2cf0def4e3dc5bbc9c6f5d27..d9c11b31c4ab9cf3daacd540bedc4ab05e55df9b 100644 --- a/net/smc/smc_proc.c +++ b/net/smc/smc_proc.c @@ -154,9 +154,11 @@ static void _conn_show(struct seq_file *seq, struct smc_sock *smc, int protocol) seq_printf(seq, CONN_LGR_FM, lgr->role == SMC_CLNT ? 
'C' : 'S', lnk->ibname, lnk->ibport, lnk->roce_qp->qp_num, - lnk->peer_qpn, lnk->wr_tx_cnt, lnk->wr_rx_cnt); + lnk->peer_qpn, smc->conn.tx_cnt, smc->conn.tx_bytes, + smc->conn.tx_corked_cnt, smc->conn.tx_corked_bytes); } else { - seq_puts(seq, "- - - - - - - -\n"); + seq_puts(seq, "- - - - - - -" + " - - -\n"); } } @@ -170,7 +172,7 @@ static int smc_conn_show(struct seq_file *seq, void *v) seq_printf(seq, sp->protocol == SMCPROTO_SMC ? CONN4_HDR : CONN6_HDR, "sl", "local_addr", "remote_addr", "is_fb", "fb_rsn", "sock", "clc_sock", "st", "inode", "lgr_id", "lgr_role", "dev", "port", - "l_qp", "r_qp", "tx_cnt", "rx_cnt"); + "l_qp", "r_qp", "tx_P", "tx_B", "cork_P", "cork_B"); goto out; } @@ -234,6 +236,63 @@ static struct smc_proc_entry smc_proc[] = { #endif }; +extern struct smc_lgr_list smc_lgr_list; +static int proc_show_links(struct seq_file *seq, void *v) +{ + struct smc_link_group *lgr, *lg; + struct smc_link *lnk; + int i = 0, j = 0; + + seq_printf(seq, "%-9s%-6s%-6s%-5s%-7s%-6s%-7s%-7s%-7s%-4s%-4s%-6s%-6s%-6s%-6s%-6s%-7s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s%-16s\n", + "grp", "type", "role", "idx", "gconn", "conn", "state", "qpn_l", "qpn_r", + "tx", "rx", "cr-e", "cr-l", "cr-r", "cr_h", "cr_l", "flags", "total_send", + "total_comp", "cdc_send", "cdc_comp", "llc_send", "llc_comp", "reg_send", + "reg_comp", "bad_comp", "rdma_write"); + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!smc_link_usable(lnk)) + continue; + for (j = 0; j < SMC_LGR_ID_SIZE; j++) + seq_printf(seq, "%02X", lgr->id[j]); + seq_printf(seq, " %-6s%-6s%-5d%-7d%-6d%-7d%-7d%-7d%-4d%-4d%-6u%-6d%-6d%-6u%-6u%-7lu%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u%-16u\n", + lgr->is_smcd ? "D" : "R", lgr->role == SMC_CLNT ? "C" : "S", i, + lgr->conns_num, atomic_read(&lnk->conn_cnt), lnk->state, + lnk->roce_qp ? 
lnk->roce_qp->qp_num : 0, lnk->peer_qpn, + lnk->wr_tx_cnt, lnk->wr_rx_cnt, lnk->credits_enable, + atomic_read(&lnk->local_rq_credits), + atomic_read(&lnk->peer_rq_credits), lnk->local_cr_watermark_high, + lnk->peer_cr_watermark_low, lnk->flags, + atomic_read(&lnk->total_send_cnt), + atomic_read(&lnk->total_comp_cnt), + atomic_read(&lnk->cdc_send_cnt), + atomic_read(&lnk->cdc_comp_cnt), + atomic_read(&lnk->llc_send_cnt), + atomic_read(&lnk->llc_comp_cnt), + atomic_read(&lnk->reg_send_cnt), + atomic_read(&lnk->reg_comp_cnt), + atomic_read(&lnk->bad_comp_cnt), + atomic_read(&lnk->rdma_write_cnt)); + } + } + spin_unlock_bh(&smc_lgr_list.lock); + return 0; +} + +static int proc_open_links(struct inode *inode, struct file *file) +{ + single_open(file, proc_show_links, NULL); + return 0; +} + +static struct proc_ops link_file_ops = { +.proc_open = proc_open_links, +.proc_read = seq_read, +.proc_release = single_release, +}; + static int __net_init smc_proc_dir_init(struct net *net) { int i, rc = -ENOMEM; @@ -250,6 +309,9 @@ static int __net_init smc_proc_dir_init(struct net *net) goto err_entry; } + if (!proc_create("links", 0444, net->proc_net_smc, &link_file_ops)) + goto err_entry; + return 0; err_entry: @@ -265,6 +327,8 @@ static void __net_exit smc_proc_dir_exit(struct net *net) { int i; + remove_proc_entry("links", net->proc_net_smc); + for (i = 0; i < ARRAY_SIZE(smc_proc); i++) remove_proc_entry(smc_proc[i].name, net->proc_net_smc); diff --git a/net/smc/smc_proc.h b/net/smc/smc_proc.h index ec59ca03e1633f352e4479d4790aecd778c7fc23..faa5eaaee5113769ca8cbb2f2796e80d714d5269 100644 --- a/net/smc/smc_proc.h +++ b/net/smc/smc_proc.h @@ -9,12 +9,14 @@ #include #include "smc.h" -#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") -#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-19s%-11s%-11s%-9s%-6s%-6s%-7s%-9s%-6s\n") +#define CONN4_HDR ("%4s:%-15s%-15s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") +#define CONN6_HDR ("%4s:%-39s%-39s%-7s%-10s%-19s%-19s%-6s%-17s%-11s" \ + "%-11s%-13s%-6s%-6s%-7s%-9s%-9s%-9s%-9s\n") #define CONN4_ADDR_FM ("%4d:%08X:%04X %08X:%04X") #define CONN6_ADDR_FM ("%4d:%08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X") -#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") -#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8X %-8X\n") +#define CONN_SK_FM (" %c %-8X %pK %pK %2d %-16lu ") +#define CONN_LGR_FM (" %c %-8s %d %-4X %-4X %-8llu %-8llu %-8llu %-8llu\n") struct smc_proc_private { struct seq_net_private p; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 7f4e0912dd973cca765e52a8333f40d4a0c8d7af..78f9895d649e3c85074f02be0d18bfa5adc4ea4d 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -1,17 +1,34 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ -#include #include #include -#include #include +#include "smc.h" +#include "smc_sysctl.h" #include "smc_core.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { .procname = "wmem_default", .data = &init_net.smc.sysctl_wmem_default, @@ -45,8 +62,26 @@ static struct ctl_table smc_table[] = { .extra2 = SYSCTL_ONE, }, { - .procname = "autocorking", - .data = &init_net.smc.sysctl_autocorking, + .procname = "limit_handshake", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(init_net.smc.limit_smc_hs), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "keep_first_contact_clcsock", + .data = &init_net.smc.sysctl_keep_first_contact_clcsock, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "disable_multiple_link", + .data = &init_net.smc.sysctl_disable_multiple_link, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -56,7 +91,7 @@ static struct ctl_table smc_table[] = { { } }; -static __net_init int smc_sysctl_init_net(struct net *net) +int __net_init smc_sysctl_net_init(struct net *net) { struct ctl_table *table; @@ -76,6 +111,14 @@ static __net_init int smc_sysctl_init_net(struct net *net) if (!net->smc.smc_hdr) goto err_reg; + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_wmem_default = 256 * 1024; + net->smc.sysctl_rmem_default = 384 * 1024; + net->smc.sysctl_tcp2smc = 0; + net->smc.sysctl_allow_different_subnet = 1; + net->smc.sysctl_keep_first_contact_clcsock = 1; + net->smc.sysctl_disable_multiple_link = 1; + return 0; err_reg: @@ -85,22 +128,12 @@ static __net_init int smc_sysctl_init_net(struct net *net) return -ENOMEM; } -static __net_exit void smc_sysctl_exit_net(struct net *net) -{ - unregister_net_sysctl_table(net->smc.smc_hdr); -} - -static struct pernet_operations smc_sysctl_ops __net_initdata = { - .init = smc_sysctl_init_net, - .exit = smc_sysctl_exit_net, -}; - -int __init smc_sysctl_init(void) +void __net_exit smc_sysctl_net_exit(struct net *net) { - return register_pernet_subsys(&smc_sysctl_ops); -} + struct ctl_table *table; -void smc_sysctl_exit(void) -{ - unregister_pernet_subsys(&smc_sysctl_ops); + table = net->smc.smc_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->smc.smc_hdr); + if (!net_eq(net, &init_net)) + kfree(table); } diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 0000000000000000000000000000000000000000..0becc11bd2f4c0871acb93a725a960069c63d102 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. 
+ * + * Author: Tony Lu + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int __net_init smc_sysctl_net_init(struct net *net); +void __net_exit smc_sysctl_net_exit(struct net *net); + +#else + +static inline int smc_sysctl_net_init(struct net *net) +{ + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; +} + +static inline void smc_sysctl_net_exit(struct net *net) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 68d62ac63dec1eb6e62a337d5bb10dc3bcc3c74f..20217edfb9e3f811fb6edc3dcd11f453df4cf784 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -124,37 +124,56 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) return rc; } -/* Strategy: Nagle algorithm - * 1. The first message should never cork - * 2. If we have any inflight messages, wait for the first - * message back - * 3. The total corked message should not exceed min(64k, sendbuf/2) +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 */ -static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +static bool smc_should_autocork(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - int prepared_send; + int corking_size; - /* First request && no more message should always pass */ - if (atomic_read(&conn->cdc_pend_tx_wr) == 0 && - !(msg->msg_flags & MSG_MORE)) - return false; + corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, + sock_net(&smc->sk)->smc.sysctl_autocorking_size); - /* If We have enough data in the send queue that have not been - * pushed, send immediately. - * Note, here we only care about the prepared_sends, but not - * sendbuf_space because sendbuf_space has nothing to do with - * corked data size. - */ - prepared_send = smc_tx_prepared_sends(conn); - if (prepared_send > min(64 * 1024, conn->sndbuf_desc->len >> 1)) + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) return false; + return true; +} - if (!sock_net(&smc->sk)->smc.sysctl_autocorking) - return false; +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; - /* All the other conditions should cork */ - return true; + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; } /* sndbuf producer: main API called by socket layer. 
@@ -203,20 +222,12 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; - /* If our send queue is full but peer have RMBE space, - * we should send them out before wait - */ - if (!atomic_read(&conn->sndbuf_space) && - atomic_read(&conn->peer_rmbe_space) > 0) - smc_tx_sndbuf_nonempty(conn); - if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { + if (send_done) + return send_done; rc = smc_tx_wait(smc, msg->msg_flags); - if (rc) { - if (send_done) - return send_done; + if (rc) goto out_err; - } continue; } @@ -269,18 +280,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - if (smc_tx_should_cork(smc, msg)) { - /* for a corked socket defer the RDMA writes if there - * is still sufficient sndbuf_space available - */ - conn->tx_corked_bytes += copylen; - ++conn->tx_corked_cnt; - } else { + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion + */ + if (!smc_tx_should_cork(smc, msg)) { conn->tx_bytes += copylen; ++conn->tx_cnt; - if (delayed_work_pending(&conn->tx_work)) - cancel_delayed_work(&conn->tx_work); smc_tx_sndbuf_nonempty(conn); + } else { + conn->tx_corked_bytes += copylen; + ++conn->tx_corked_cnt; } trace_smc_tx_sendmsg(smc, copylen); @@ -296,6 +305,22 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) return rc; } +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags) +{ + struct msghdr msg = {.msg_flags = flags}; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size); + rc = smc_tx_sendmsg(smc, &msg, size); + kunmap(page); + return rc; +} + /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ @@ -333,9 +358,17 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; + /* rtoken might be deleted if peer freed connection */ + if (!rdma_wr->rkey || + (rdma_wr->remote_addr == (conn->tx_off + peer_rmbe_offset))) { + pr_warn_ratelimited("smc: unexpected sends during connection termination flow\n"); + return -EINVAL; + } rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); + else + atomic_inc(&link->rdma_write_cnt); return rc; } @@ -612,17 +645,10 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc = 0; struct smc_sock *smc = container_of(conn, struct smc_sock, conn); - - /* Only let one to push to prevent wasting of CPU and CDC slot */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); + int rc = 0; /* No data in the send queue */ if (unlikely(smc_tx_prepared_sends(conn) <= 0)) @@ -646,16 +672,34 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } out: + return rc; +} + +int 
smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed when we are pushing. - * If so, we need to try push again to prevent those data in the - * send queue may never been pushed out + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. */ if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) goto again; @@ -664,26 +708,36 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) } /* Wakeup sndbuf consumers from process context - * since there is more data to transmit + * since there is more data to transmit. The caller + * must hold sock lock. */ -void smc_tx_work(struct work_struct *work) +void smc_tx_pending(struct smc_connection *conn) { - struct smc_connection *conn = container_of(to_delayed_work(work), - struct smc_connection, - tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; - lock_sock(&smc->sk); if (smc->sk.sk_err) - goto out; + return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} -out: +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit in locked + * sock. 
+ */ +void smc_tx_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(to_delayed_work(work), + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_tx_pending(conn); release_sock(&smc->sk); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 07e6ad76224a0cd1b2fe5bb91d7acaa4eb977534..34b578498b1f1cd78a75fc9bed698e9bd1080dae 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -27,9 +27,12 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, + size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ca179e2c86b7ccb9e39bcbc2cdc3605241fe51b2..c36b7c3e1b4c74b9122b06e2d2efd49ca2d9fe5f 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -81,12 +81,17 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) u32 pnd_snd_idx; link = wc->qp->qp_context; + atomic_inc(&link->total_comp_cnt); if (wc->opcode == IB_WC_REG_MR) { - if (wc->status) + if (wc->status) { link->wr_reg_state = FAILED; - else + pr_warn("smc: reg mr comp failed\n"); + atomic_inc(&link->bad_comp_cnt); + } else { link->wr_reg_state = CONFIRMED; + atomic_inc(&link->reg_comp_cnt); + } smc_wr_wakeup_reg_wait(link); return; } @@ -94,8 +99,10 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); if (pnd_snd_idx == link->wr_tx_cnt) { if (link->lgr->smc_version != SMC_V2 || - link->wr_tx_v2_pend->wr_id != wc->wr_id) + link->wr_tx_v2_pend->wr_id != wc->wr_id) { + pr_warn("smc: find pending index failed\n"); return; + } link->wr_tx_v2_pend->wc_status = wc->status; memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ @@ -114,11 +121,14 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_pends[pnd_snd_idx])); memset(&link->wr_tx_bufs[pnd_snd_idx], 0, sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) { + pr_warn("smc: clear pending index bitmap failed\n"); return; + } } if (wc->status) { + atomic_inc(&link->bad_comp_cnt); if (link->lgr->smc_version == SMC_V2) { memset(link->wr_tx_v2_pend, 0, sizeof(*link->wr_tx_v2_pend)); @@ -130,40 +140,44 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); - wake_up(&link->wr_tx_wait); + if (wq_has_sleeper(&link->wr_tx_wait)) + wake_up(&link->wr_tx_wait); } static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i = 0, rc; - int polled = 0; + int i, rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - 
ib_req_notify_cq(dev->roce_cq_send, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) - break; + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); for (i = 0; i < rc; i++) smc_wr_tx_process_cqe(&wc[i]); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ + break; } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. + */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->send_tasklet); + tasklet_schedule(&smcibcq->tasklet); } /*---------------------------- request submission ---------------------------*/ @@ -173,11 +187,16 @@ static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) *idx = link->wr_tx_cnt; if (!smc_link_sendable(link)) return -ENOLINK; + + if (!smc_wr_tx_get_credit(link)) + return -EBUSY; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { if (!test_and_set_bit(*idx, link->wr_tx_mask)) return 0; } *idx = link->wr_tx_cnt; + smc_wr_tx_put_credits(link, 1, false); return -EBUSY; } @@ -283,7 +302,7 @@ int smc_wr_tx_put_slot(struct smc_link *link, memset(&link->wr_tx_bufs[idx], 0, sizeof(link->wr_tx_bufs[idx])); test_and_clear_bit(idx, link->wr_tx_mask); - wake_up(&link->wr_tx_wait); + smc_wr_tx_put_credits(link, 1, true); return 1; } else if (link->lgr->smc_version == SMC_V2 && pend->idx == link->wr_tx_cnt) { @@ -311,6 +330,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); + } else { + atomic_inc(&link->total_send_cnt); } return rc; } @@ -321,12 +342,14 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, int rc; link->wr_tx_v2_ib->sg_list[0].length = len; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); if (rc) { smc_wr_tx_put_slot(link, priv); smcr_link_down_cond_sched(link); + } else { + atomic_inc(&link->total_send_cnt); } return rc; } @@ -365,7 +388,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { int rc; - ib_req_notify_cq(link->smcibdev->roce_cq_send, + ib_req_notify_cq(link->smcibcq_send->ib_cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); link->wr_reg_state = POSTED; link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; @@ -374,6 +397,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL); if (rc) return rc; + atomic_inc(&link->reg_send_cnt); + atomic_inc(&link->total_send_cnt); atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, @@ -469,39 +494,49 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) break; } } + + if (smc_wr_rx_credits_need_announce(link) && + !test_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags)) { + set_bit(SMC_LINKFLAG_ANNOUNCE_PENDING, &link->flags); + schedule_work(&link->credits_announce_work); + } } } static void smc_wr_rx_tasklet_fn(struct 
tasklet_struct *t) { - struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); + struct smc_ib_cq *smcibcq = from_tasklet(smcibcq, t, tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int polled = 0; int rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); - rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_recv, - IB_CQ_SOLICITED_MASK - | IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) + rc = ib_poll_cq(smcibcq->ib_cq, SMC_WR_MAX_POLL_CQE, wc); + if (rc > 0) + smc_wr_rx_process_cqes(&wc[0], rc); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. + */ break; - smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - if (polled == 1) + + /* With IB_CQ_REPORT_MISSED_EVENTS, if ib_req_notify_cq() returns 0, + * then it is safe to wait for the next event; else we must poll the + * CQ again to make sure we won't miss any event. + */ + if (ib_req_notify_cq(smcibcq->ib_cq, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0) goto again; } void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) { - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + struct smc_ib_cq *smcibcq = (struct smc_ib_cq *)cq_context; - tasklet_schedule(&dev->recv_tasklet); + tasklet_schedule(&smcibcq->tasklet); } int smc_wr_rx_post_init(struct smc_link *link) @@ -511,6 +546,8 @@ int smc_wr_rx_post_init(struct smc_link *link) for (i = 0; i < link->wr_rx_cnt; i++) rc = smc_wr_rx_post(link); + // credits have already been announced to peer + atomic_set(&link->local_rq_credits, 0); return rc; } @@ -545,7 +582,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT, lnk->qp_attr.cap.max_recv_wr); } @@ -734,7 +771,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -742,7 +779,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) @@ -761,7 +798,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) @@ -828,14 +865,24 @@ int smc_wr_alloc_link_mem(struct smc_link *link) void smc_wr_remove_dev(struct smc_ib_device *smcibdev) { - tasklet_kill(&smcibdev->recv_tasklet); - tasklet_kill(&smcibdev->send_tasklet); + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + tasklet_kill(&smcibdev->smcibcq_send[i].tasklet); + tasklet_kill(&smcibdev->smcibcq_recv[i].tasklet); + } } void smc_wr_add_dev(struct smc_ib_device *smcibdev) { - tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn); - tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); + int i; + + for (i = 0; i < smcibdev->num_cq_peer; i++) { + 
tasklet_setup(&smcibdev->smcibcq_send[i].tasklet, + smc_wr_tx_tasklet_fn); + tasklet_setup(&smcibdev->smcibcq_recv[i].tasklet, + smc_wr_rx_tasklet_fn); + } } int smc_wr_create_link(struct smc_link *lnk) @@ -884,6 +931,11 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + atomic_set(&lnk->peer_rq_credits, 0); + atomic_set(&lnk->local_rq_credits, 0); + lnk->flags = 0; + lnk->local_cr_watermark_high = max(lnk->wr_rx_cnt / 3, 1U); + lnk->peer_cr_watermark_low = 0; return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 47512ccce5ef874fe4ab60873e8bc1eef9ef27a0..8cf276215c91e80f472c3cbd01c397d3806f6343 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,12 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ +#define SMC_WR_BUF_CNT 64 /* # of ctrl buffers per link, SMC_WR_BUF_CNT + * should not be less than 2 * SMC_RMBS_PER_LGR_MAX, + * since every connection at least has two rq/sq + * credits in average, otherwise may result in + * waiting for credits in sending process. + */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) @@ -83,6 +88,51 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk) wake_up(&lnk->wr_reg_wait); } +// get one tx credit, and peer rq credits dec +static inline int smc_wr_tx_get_credit(struct smc_link *link) +{ + return !link->credits_enable || atomic_dec_if_positive(&link->peer_rq_credits) >= 0; +} + +// put tx credits, when some failures occurred after tx credits got +// or receive announce credits msgs +static inline void smc_wr_tx_put_credits(struct smc_link *link, int credits, bool wakeup) +{ + if (link->credits_enable && credits) { + atomic_add(credits, &link->peer_rq_credits); + if (wakeup && wq_has_sleeper(&link->wr_tx_wait)) + wake_up_nr(&link->wr_tx_wait, credits); + } +} + +// to check whether peer rq credits is lower than watermark. +static inline int smc_wr_tx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->peer_rq_credits) <= link->peer_cr_watermark_low; +} + +// get local rq credits and set credits to zero. +// may called when announcing credits +static inline int smc_wr_rx_get_credits(struct smc_link *link) +{ + return link->credits_enable ? atomic_fetch_and(0, &link->local_rq_credits) : 0; +} + +// called when post_recv a rqe +static inline void smc_wr_rx_put_credits(struct smc_link *link, int credits) +{ + if (link->credits_enable && credits) + atomic_add(credits, &link->local_rq_credits); +} + +// to check whether local rq credits is higher than watermark. 
+static inline int smc_wr_rx_credits_need_announce(struct smc_link *link) +{ + return link->credits_enable && + atomic_read(&link->local_rq_credits) >= link->local_cr_watermark_high; +} + /* post a new receive work request to fill a completed old work request entry */ static inline int smc_wr_rx_post(struct smc_link *link) { @@ -95,6 +145,8 @@ static inline int smc_wr_rx_post(struct smc_link *link) index = do_div(temp_wr_id, link->wr_rx_cnt); link->wr_rx_ibs[index].wr_id = wr_id; rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL); + if (!rc) + smc_wr_rx_put_credits(link, 1); return rc; } @@ -125,10 +177,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); -void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, - smc_wr_tx_filter filter, - smc_wr_tx_dismisser dismisser, - unsigned long data); void smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
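Note on the per-vector CQ spreading: the smc_ib.c hunks above replace the single device-wide send/recv CQ pair with one send and one recv CQ per completion vector, and each new QP is attached to the least-loaded CQ of each kind (smc_ib_get_least_used_cq(), released again in smc_ib_put_cq()). Below is a small standalone C sketch of that selection policy only; struct toy_cq, pick_least_loaded() and put_cq() are illustrative names, not the kernel interfaces.

    /* Simplified model of spreading QPs across per-vector CQs by load. */
    #include <stdio.h>

    struct toy_cq {
    	int vector;	/* completion vector this CQ is bound to */
    	int load;	/* number of QPs currently using this CQ */
    };

    static struct toy_cq *pick_least_loaded(struct toy_cq *cqs, int n)
    {
    	struct toy_cq *best = &cqs[0];
    	int i;

    	for (i = 1; i < n; i++)
    		if (cqs[i].load < best->load)
    			best = &cqs[i];
    	best->load++;		/* new QP now accounted on this CQ */
    	return best;
    }

    static void put_cq(struct toy_cq *cq)
    {
    	cq->load--;		/* QP destroyed, release the slot */
    }

    int main(void)
    {
    	struct toy_cq cqs[4] = { {0, 2}, {1, 0}, {2, 1}, {3, 5} };
    	struct toy_cq *cq = pick_least_loaded(cqs, 4);

    	printf("picked vector %d (load now %d)\n", cq->vector, cq->load);
    	put_cq(cq);
    	return 0;
    }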
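Note on the RQ credit scheme: the LLC and WR hunks introduce receive-queue credits — a send consumes one credit for the peer's RQ, reposting a local receive buffer earns a credit, and once local credits cross a watermark they are announced back to the peer (piggybacked or via SMC_LLC_ANNOUNCE_CREDITS). The sketch below models only the credit accounting with C11 atomics under illustrative names; it is not the kernel code.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_link {
    	atomic_int peer_rq_credits;   /* how many sends the peer can still absorb */
    	atomic_int local_rq_credits;  /* credits owed to the peer (reposted RQ slots) */
    	int local_watermark_high;     /* announce once local credits reach this */
    };

    static bool tx_get_credit(struct toy_link *l)
    {
    	int c = atomic_load(&l->peer_rq_credits);

    	while (c > 0)	/* decrement only if still positive */
    		if (atomic_compare_exchange_weak(&l->peer_rq_credits, &c, c - 1))
    			return true;
    	return false;	/* no credit: sender must back off */
    }

    static void rx_reposted(struct toy_link *l)
    {
    	atomic_fetch_add(&l->local_rq_credits, 1);
    }

    static int rx_take_credits_for_announce(struct toy_link *l)
    {
    	if (atomic_load(&l->local_rq_credits) < l->local_watermark_high)
    		return 0;
    	return atomic_exchange(&l->local_rq_credits, 0); /* value to announce */
    }

    int main(void)
    {
    	struct toy_link l = { .local_watermark_high = 2 };

    	atomic_store(&l.peer_rq_credits, 1);
    	printf("send allowed: %d\n", tx_get_credit(&l));   /* 1 */
    	printf("send allowed: %d\n", tx_get_credit(&l));   /* 0, out of credits */

    	rx_reposted(&l);
    	rx_reposted(&l);
    	printf("credits to announce: %d\n", rx_take_credits_for_announce(&l));
    	return 0;
    }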
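Note on the autocorking decision in smc_tx.c: a write is deferred when a CDC message is already in flight and the queued-but-unsent bytes are still below min(sndbuf/2, sysctl autocorking_size), or when the application corked the socket itself (MSG_MORE / TCP_CORK) and send-buffer space remains. A minimal userspace model of that decision, using illustrative field names rather than the real struct smc_connection, is:

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_conn {
    	unsigned int sndbuf_len;      /* size of the send buffer */
    	unsigned int prepared_bytes;  /* queued but not yet sent */
    	int cdc_pending;              /* CDC messages in flight */
    	int sndbuf_space;             /* free bytes left in the send buffer */
    };

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
    	return a < b ? a : b;
    }

    static bool should_autocork(const struct toy_conn *c, unsigned int autocorking_size)
    {
    	unsigned int limit = min_u(c->sndbuf_len / 2, autocorking_size);

    	/* first message (nothing in flight) or enough already queued: push now */
    	if (c->cdc_pending == 0 || c->prepared_bytes > limit)
    		return false;
    	return true;
    }

    static bool should_cork(const struct toy_conn *c, bool msg_more, bool tcp_cork,
    			unsigned int autocorking_size)
    {
    	if (should_autocork(c, autocorking_size))
    		return true;
    	/* explicitly corked socket: defer as long as sndbuf space is left */
    	if ((msg_more || tcp_cork) && c->sndbuf_space > 0)
    		return true;
    	return false;
    }

    int main(void)
    {
    	struct toy_conn c = { .sndbuf_len = 256 * 1024, .prepared_bytes = 4096,
    			      .cdc_pending = 1, .sndbuf_space = 100000 };

    	printf("cork: %d\n", should_cork(&c, false, false, 64 * 1024)); /* 1 */
    	c.cdc_pending = 0;
    	printf("cork: %d\n", should_cork(&c, false, false, 64 * 1024)); /* 0 */
    	return 0;
    }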
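Note on the tx_pushing counter in smc_tx_sndbuf_nonempty(): exactly one context does the push while concurrent callers only bump the counter, and the pusher loops until nobody arrived during its pass, so queued data cannot be left behind. A standalone sketch of that pattern with C11 atomics (names illustrative, single translation unit):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int tx_pushing;

    static void push_once(void)
    {
    	printf("pushing send buffer\n");
    }

    static void sndbuf_nonempty(void)
    {
    	/* someone is already pushing; they will notice us and loop */
    	if (atomic_fetch_add(&tx_pushing, 1) + 1 > 1)
    		return;

    again:
    	atomic_store(&tx_pushing, 1);	/* collapse queued requests into one pass */
    	push_once();
    	/* another caller showed up meanwhile: counter is above 1, go again */
    	if (atomic_fetch_sub(&tx_pushing, 1) - 1 != 0)
    		goto again;
    }

    int main(void)
    {
    	sndbuf_nonempty();
    	return 0;
    }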
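Note on the reworked CQ tasklets in smc_wr.c: instead of the old polled==1 bookkeeping, the loop drains the CQ in batches and stops only when re-arming the notification (ib_req_notify_cq() with IB_CQ_REPORT_MISSED_EVENTS) reports that nothing was missed. The sketch below shows just that control flow; poll_cq() and rearm_reports_missed() are stand-ins for the real verbs calls, and the fixed "outstanding" counter fakes a completion source.

    #include <stdbool.h>
    #include <stdio.h>

    #define BATCH 10

    static int outstanding = 23;	/* pretend completions waiting on the CQ */

    static int poll_cq(int *wc, int max)
    {
    	int n = outstanding < max ? outstanding : max;

    	for (int i = 0; i < n; i++)
    		wc[i] = i;
    	outstanding -= n;
    	return n;
    }

    static bool rearm_reports_missed(void)
    {
    	return outstanding > 0;	/* completions arrived while we were not armed */
    }

    static void cq_tasklet(void)
    {
    again:
    	for (;;) {
    		int wc[BATCH];
    		int n = poll_cq(wc, BATCH);

    		for (int i = 0; i < n; i++)
    			(void)wc[i];	/* process completion */
    		if (n < BATCH)		/* short batch: CQ drained for now */
    			break;
    	}
    	if (rearm_reports_missed())
    		goto again;
    }

    int main(void)
    {
    	cq_tasklet();
    	printf("outstanding after drain: %d\n", outstanding);
    	return 0;
    }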