From f994c63b8ee956fca8b7d8b966066413b2aee8a1 Mon Sep 17 00:00:00 2001 From: Zhou Juan Date: Fri, 2 Jun 2023 11:37:11 +0800 Subject: [PATCH] Backport bugfix for hns 1.Fix the owner bit error of sq in new io 2.Fix incorrect post-send with direct wqe of wr-list 3.Add a judgment to the congestion control algorithm Signed-off-by: Juan Zhou (cherry picked from commit 092143ba858a7aba0630fadd416faa2a4e7eaf06) (cherry picked from commit 9004055930b51f6f9c5eb68bd29452a71c2e0b1d) --- ...-the-owner-bit-error-of-sq-in-new-io.patch | 69 +++++++++++++++++++ ...rect-post-send-with-direct-wqe-of-wr.patch | 49 +++++++++++++ ...gment-to-the-congestion-control-algo.patch | 41 +++++++++++ rdma-core.spec | 11 ++- 4 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 0047-libhns-Fix-the-owner-bit-error-of-sq-in-new-io.patch create mode 100644 0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch create mode 100644 0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch diff --git a/0047-libhns-Fix-the-owner-bit-error-of-sq-in-new-io.patch b/0047-libhns-Fix-the-owner-bit-error-of-sq-in-new-io.patch new file mode 100644 index 0000000..e0b7b38 --- /dev/null +++ b/0047-libhns-Fix-the-owner-bit-error-of-sq-in-new-io.patch @@ -0,0 +1,69 @@ +From a86a120c35b1112bcef6c3821c2e5e1910e615e9 Mon Sep 17 00:00:00 2001 +From: Luoyouming +Date: Fri, 2 Jun 2023 10:33:14 +0800 +Subject: [PATCH 2/4] libhns: Fix the owner bit error of sq in new io + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I7A5Y5 + +--------------------------------------------------------------- + +The code does not use the head position of sq to set the owner bit, +but uses the head after adding 1 to cause an owner bit error. When +the wqe queue has not been flipped, the hardware has flipped based +on the owner bit judgment, resulting in failure to obtain wqe, +unable to send, and unable to generate cqe. 
This patch will set the +owner bit ahead of time before the head value increases. + +Fixes: 36446a56eea5 ("libhns: Extended QP supports the new post send mechanism") +Signed-off-by: Luoyouming +--- + providers/hns/hns_roce_u_hw_v2.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index 616d1ea..cde4801 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -2215,6 +2215,9 @@ init_rc_wqe(struct hns_roce_qp *qp, uint64_t wr_id, unsigned int opcode) + + qp->sq.wrid[wqe_idx] = wr_id; + qp->cur_wqe = wqe; ++ ++ enable_wqe(qp, wqe, qp->sq.head); ++ + qp->sq.head++; + + return wqe; +@@ -2236,9 +2239,6 @@ static void wr_set_sge_rc(struct ibv_qp_ex *ibv_qp, uint32_t lkey, + wqe->msg_len = htole32(length); + hr_reg_write(wqe, RCWQE_LEN0, length); + hr_reg_write(wqe, RCWQE_SGE_NUM, !!length); +- /* ignore ex sge start index */ +- +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void set_sgl_rc(struct hns_roce_v2_wqe_data_seg *dseg, +@@ -2541,6 +2541,9 @@ init_ud_wqe(struct hns_roce_qp *qp, uint64_t wr_id, unsigned int opcode) + + qp->sq.wrid[wqe_idx] = wr_id; + qp->cur_wqe = wqe; ++ ++ enable_wqe(qp, wqe, qp->sq.head); ++ + qp->sq.head++; + + return wqe; +@@ -2610,7 +2613,6 @@ static void wr_set_sge_ud(struct ibv_qp_ex *ibv_qp, uint32_t lkey, + dseg->len = htole32(length); + + qp->sge_info.start_idx++; +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_set_sge_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_sge, +-- +2.25.1 + diff --git a/0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch b/0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch new file mode 100644 index 0000000..680c583 --- /dev/null +++ b/0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch @@ -0,0 +1,49 @@ +From cfea6efe6decfa8c209ad9a85e1290674370725e Mon Sep 17 00:00:00 2001 +From: Junxian Huang +Date: Fri, 2 Jun 2023 10:33:15 
+0800 +Subject: [PATCH 3/4] libhns: Fix incorrect post-send with direct wqe of + wr-list in user space + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I7A2SA + +--------------------------------------------------------------- + +Currently, direct wqe is not supported for wr-list. RoCE driver excludes +direct wqe for wr-list by judging whether the number of wr is 1. + +For a wr-list where the second wr is a length-error atomic wr, the +post-send driver handles the first wr and adds 1 to the wr number counter +first. While handling the second wr, the driver finds out a length error +and terminates the wr handle process, leaving the counter at 1. This +causes the driver to mistakenly judge that there is only 1 wr and thus enter +the direct wqe process, carrying the current length-error atomic wqe. + +This patch fixes the error by adding a check of whether the current wr +is a bad wr. If so, the normal doorbell process is used instead of direct wqe +even though the wr number is 1. 
+ +Fixes: 159933c37450 ("libhns: Add support for direct wqe") +Signed-off-by: Junxian Huang +--- + providers/hns/hns_roce_u_hw_v2.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index cde4801..bb26c59 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -1480,7 +1480,8 @@ out: + + udma_to_device_barrier(); + +- if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)) ++ if (nreq == 1 && !ret && ++ (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)) + hns_roce_write_dwqe(qp, wqe); + else + hns_roce_update_sq_db(ctx, qp); +-- +2.25.1 + diff --git a/0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch b/0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch new file mode 100644 index 0000000..027b393 --- /dev/null +++ b/0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch @@ -0,0 +1,41 @@ +From 8fbf781e3b3630c25a361f7c5e3642350dcd21c9 Mon Sep 17 00:00:00 2001 +From: Chengchang Tang +Date: Fri, 2 Jun 2023 10:33:16 +0800 +Subject: [PATCH 4/4] libhns: Add a judgment to the congestion control + algorithm + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I7A7HI + +--------------------------------------------------------------- + +The congestion control algorithm is used only when the comp_mask flag +HNSDV_QP_INIT_ATTR_MASK_QP_CONGEST_TYPE is set. + +A check on comp_mask is added to prevent invalid parameter errors caused +by unconfigured congestion control algorithm types. 
+ +Fixes: 7623f24781f1 ("libhns: Support congestion control algorithm configuration") +Signed-off-by: Chengchang Tang +--- + providers/hns/hns_roce_u_verbs.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c +index 6c6120c..fa27fc1 100644 +--- a/providers/hns/hns_roce_u_verbs.c ++++ b/providers/hns/hns_roce_u_verbs.c +@@ -988,6 +988,9 @@ static int check_qp_congest_type(struct hns_roce_context *ctx, + { + struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device); + ++ if (!(hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CONGEST_TYPE)) ++ return 0; ++ + if (!check_comp_mask(hns_attr->congest_type, hr_dev->congest_type)) { + verbs_err(&ctx->ibv_ctx, "unsupported congest type 0x%x.\n", + hns_attr->congest_type); +-- +2.25.1 + diff --git a/rdma-core.spec b/rdma-core.spec index f07b051..e76f282 100644 --- a/rdma-core.spec +++ b/rdma-core.spec @@ -1,6 +1,6 @@ Name: rdma-core Version: 41.0 -Release: 11 +Release: 12 Summary: RDMA core userspace libraries and daemons License: GPLv2 or BSD Url: https://github.com/linux-rdma/rdma-core @@ -52,6 +52,9 @@ Patch42: 0043-libhns-Add-support-for-SVE-Direct-WQE.patch Patch43: 0044-libhns-Fix-the-sge-num-problem-of-atomic-op.patch Patch44: 0045-libhns-Fix-sge-tail_len-overflow.patch Patch45: 0046-libhns-Disable-local-invalidate-operation.patch +Patch46: 0047-libhns-Fix-the-owner-bit-error-of-sq-in-new-io.patch +Patch47: 0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch +Patch48: 0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0) BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel @@ -299,6 +302,12 @@ fi %{_mandir}/* %changelog +* Fri Jun 2 2023 Juan Zhou - 41.0-12 +- Type: bugfix +- ID: NA +- SUG: NA +- DESC: Backport bugfix for hns + * Thu May 11 2023 Juan Zhou - 41.0-11 - Type: bugfix - ID: NA -- Gitee