From 00a59846eb47f5ec2764d4eedc1a18bc2bf369cb Mon Sep 17 00:00:00 2001 From: Zhongyuan Zhou Date: Mon, 15 Jan 2024 14:35:39 +0800 Subject: [PATCH] High-priority bugfixes to be incorporated from SP3 Three patches are included: libhns: Bugfix for wrong timing of modifying ibv_qp state to err Fix owner bit when SQ wraps around in new IO Fix missing DB when compiler does not support SVE Merge these patches to enhance robustness of the SP2 Signed-off-by: Chengchang Tang Signed-off-by: Junxian Huang Signed-off-by: Ran Zhou --- ...r-wrong-timing-of-modifying-ibv_qp-s.patch | 43 +++++++++ ...r-bit-when-SQ-wraps-around-in-new-IO.patch | 95 +++++++++++++++++++ ...ng-DB-when-compiler-does-not-support.patch | 84 ++++++++++++++++ rdma-core.spec | 11 ++- 4 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 0052-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch create mode 100644 0053-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch create mode 100644 0054-libhns-Fix-missing-DB-when-compiler-does-not-support.patch diff --git a/0052-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch b/0052-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch new file mode 100644 index 0000000..b83e99f --- /dev/null +++ b/0052-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch @@ -0,0 +1,43 @@ +From ef63fff534db1e8c7d4537c543a9dc8b9773923d Mon Sep 17 00:00:00 2001 +From: Yangyang Li +Date: Fri, 1 Dec 2023 10:43:23 +0800 +Subject: [PATCH 52/54] libhns: Bugfix for wrong timing of modifying ibv_qp + state to err + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8L4YU + +-------------------------------------------------------------------------- + +Currently the QPC state in HW is modified inside the critical section of +spinlock but the ibv_qp state is modified outside. There will be a short +period when QPC state has been modified to err with ibv_qp state still +remaining RTS. 
WQEs during this period will still be post-send by RTS-state +ibv_qp but then dropped by err-state HW with no flush CQEs generated. + +To fix this problem, the QPC state in HW and ibv_qp state should be both +modified to err inside the critical section of spinlock. + +Fixes: f1a80cc3dfe2 ("libhns: Bugfix for flush cqe in case multi-process") +Signed-off-by: Yangyang Li +--- + providers/hns/hns_roce_u_hw_v2.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index ee2fffe..78bb7e0 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -1742,6 +1742,8 @@ static int hns_roce_u_v2_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, + sizeof(resp_ex)); + + if (flag) { ++ if (!ret) ++ qp->state = IBV_QPS_ERR; + hns_roce_spin_unlock(&hr_qp->sq.hr_lock); + hns_roce_spin_unlock(&hr_qp->rq.hr_lock); + } +-- +2.25.1 + diff --git a/0053-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch b/0053-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch new file mode 100644 index 0000000..77e18df --- /dev/null +++ b/0053-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch @@ -0,0 +1,95 @@ +From 32842498c7b507a8f27ae404cf5e6dc5caf55192 Mon Sep 17 00:00:00 2001 +From: Chengchang Tang +Date: Thu, 7 Dec 2023 09:48:02 +0800 +Subject: [PATCH 53/54] libhns: Fix owner bit when SQ wraps around in new IO + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8MF9Q + +-------------------------------------------------------------------------- + +The owner bit has been written in init_rc_wqe() or init_ud_wqe() +with the correct value. And it will be overwritten by some subsequent +operations. When the SQ wraps around, the overwritten value will be +an incorrect value. + +For example, driver will assign the owner bit in the second step, +and overwrite it in the third step. 
+ +```c +ibv_wr_start(); +ibv_wr_rdma_write(); +if (inline) + ibv_wr_set_inline_data_list(); +else + ibv_wr_set_sge_list(); +ibv_wr_complete(); +``` + +This patch removes the redundant owner bit assignment operations +in new IO. + +Fixes: 36446a56eea5 ("libhns: Extended QP supports the new post send mechanism") +Fixes: 163d62ca6196 ("libhns: Fix the owner bit error of sq in new io") +Signed-off-by: Chengchang Tang +--- + providers/hns/hns_roce_u_hw_v2.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index 78bb7e0..695d565 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -2314,8 +2314,6 @@ static void wr_set_sge_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_sge, + + wqe->msg_len = htole32(qp->sge_info.total_len); + hr_reg_write(wqe, RCWQE_SGE_NUM, qp->sge_info.valid_num); +- +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_send_rc(struct ibv_qp_ex *ibv_qp) +@@ -2507,7 +2505,6 @@ static void wr_set_inline_data_rc(struct ibv_qp_ex *ibv_qp, void *addr, + + qp->sge_info.total_len = length; + set_inline_data_list_rc(qp, wqe, 1, &buff); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_set_inline_data_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_buf, +@@ -2525,7 +2522,6 @@ static void wr_set_inline_data_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_buf, + qp->sge_info.total_len += buf_list[i].length; + + set_inline_data_list_rc(qp, wqe, num_buf, buf_list); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static struct hns_roce_ud_sq_wqe * +@@ -2662,7 +2658,6 @@ static void wr_set_sge_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_sge, + hr_reg_write(wqe, UDWQE_SGE_NUM, cnt); + + qp->sge_info.start_idx += cnt; +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void set_inline_data_list_ud(struct hns_roce_qp *qp, +@@ -2728,7 +2723,6 @@ static void wr_set_inline_data_ud(struct ibv_qp_ex *ibv_qp, void *addr, + + qp->sge_info.total_len = 
length; + set_inline_data_list_ud(qp, wqe, 1, &buff); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_set_inline_data_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_buf, +@@ -2746,7 +2740,6 @@ static void wr_set_inline_data_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_buf, + qp->sge_info.total_len += buf_list[i].length; + + set_inline_data_list_ud(qp, wqe, num_buf, buf_list); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_start(struct ibv_qp_ex *ibv_qp) +-- +2.25.1 + diff --git a/0054-libhns-Fix-missing-DB-when-compiler-does-not-support.patch b/0054-libhns-Fix-missing-DB-when-compiler-does-not-support.patch new file mode 100644 index 0000000..7bcf80c --- /dev/null +++ b/0054-libhns-Fix-missing-DB-when-compiler-does-not-support.patch @@ -0,0 +1,84 @@ +From 72c68907fbeba58b306c512f1bd8e1e52b46d0f0 Mon Sep 17 00:00:00 2001 +From: Chengchang Tang +Date: Fri, 8 Dec 2023 09:49:42 +0800 +Subject: [PATCH 54/54] libhns: Fix missing DB when compiler does not support + SVE + +driver inclusion +category: bugfix +bugzilla: https://gitee.com/openeuler/kernel/issues/I8MPTX + +---------------------------------------------------------------------- + +Currently, if compiler does not support SVE, hns_roce_sve_write512() will +be an empty function, which means that this doorbell will be missed when +HNS_ROCE_QP_CAP_SVE_DIRECT_WQE is set in qp flag. + +This patch ensures that driver will at least generate the DB regardless +of whether SVE DWQE is supported or not. 
+ +Fixes: 7b1f5c5654c2 ("libhns: Add support for SVE Direct WQE function") +Signed-off-by: Chengchang Tang +Signed-off-by: Ran Zhou +--- + providers/hns/hns_roce_u_hw_v2.c | 33 +++++++++++++------------------- + 1 file changed, 13 insertions(+), 20 deletions(-) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index 695d565..a76e67c 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -318,26 +318,22 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx, + hns_roce_write64(ctx, qp->sq.db_reg, (__le32 *)&sq_db); + } + +-static void hns_roce_write512(uint64_t *dest, uint64_t *val) ++static void hns_roce_qp_write512(struct hns_roce_qp *qp, uint64_t *val) + { +- mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); +-} ++ uint64_t *dest = qp->sq.db_reg; + + #if defined(HNS_SVE) +-static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val) +-{ +- asm volatile( +- "ldr z0, [%0]\n" +- "str z0, [%1]\n" +- ::"r" (val), "r"(dest):"cc", "memory" +- ); +-} +-#else +-static void hns_roce_sve_write512(uint64_t *dest, uint64_t *val) +-{ +- return; +-} ++ if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) { ++ asm volatile( ++ "ldr z0, [%0]\n" ++ "str z0, [%1]\n" ++ ::"r" (val), "r"(dest):"cc", "memory" ++ ); ++ return; ++ } + #endif ++ mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe)); ++} + + static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) + { +@@ -355,10 +351,7 @@ static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe) + hr_reg_write(rc_sq_wqe, RCWQE_DB_SL_H, qp->sl >> HNS_ROCE_SL_SHIFT); + hr_reg_write(rc_sq_wqe, RCWQE_WQE_IDX, qp->sq.head); + +- if (qp->flags & HNS_ROCE_QP_CAP_SVE_DIRECT_WQE) +- hns_roce_sve_write512(qp->sq.db_reg, wqe); +- else +- hns_roce_write512(qp->sq.db_reg, wqe); ++ hns_roce_qp_write512(qp, wqe); + } + + static void update_cq_db(struct hns_roce_context *ctx, struct hns_roce_cq *cq) +-- +2.25.1 + diff --git 
a/rdma-core.spec b/rdma-core.spec index a516049..ea951db 100644 --- a/rdma-core.spec +++ b/rdma-core.spec @@ -1,6 +1,6 @@ Name: rdma-core Version: 41.0 -Release: 14 +Release: 15 Summary: RDMA core userspace libraries and daemons License: GPLv2 or BSD Url: https://github.com/linux-rdma/rdma-core @@ -57,6 +57,9 @@ Patch47: 0048-libhns-Fix-incorrect-post-send-with-direct-wqe-of-wr.patch Patch48: 0049-libhns-Add-a-judgment-to-the-congestion-control-algo.patch Patch49: 0050-libhns-Support-user-to-choose-using-UD-sl-or-pktype-.patch Patch50: 0051-libhns-Get-dmac-from-kernel-driver.patch +Patch51: 0052-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch +Patch52: 0053-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch +Patch53: 0054-libhns-Fix-missing-DB-when-compiler-does-not-support.patch BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0) BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel @@ -304,6 +307,12 @@ fi %{_mandir}/* %changelog +* Mon Jan 15 2024 Ran Zhou - 41.0-15 +- Type: bugfix +- ID: NA +- SUG: NA +- DESC: Backport bugfix from SP3 + * Thu Jan 11 2024 Ran Zhou - 41.0-14 - Type: bugfix - ID: NA -- Gitee