From 794f3792a7267d0586bfac7d67507a27a5e61305 Mon Sep 17 00:00:00 2001
From: Ran Zhou
Date: Thu, 7 Dec 2023 10:58:28 +0800
Subject: [PATCH] Bugfix for lock and owner bit

Correct the returned error code, check the return values of the
pthread spinlock/mutex initialization and add the matching destroy
calls, remove a repeated initialization of a pthread spinlock, and
fix the owner bit when the SQ wraps around.

Signed-off-by: Wenpeng Liang
Signed-off-by: Chengchang Tang
---
 ...ns-Fix-possible-overflow-in-cq-clean.patch |  12 +-
 ...-parent-domain-unsupported-comp-mask.patch |  72 ++++++
 ...ad_spin_destroy-pthread_mutex_destro.patch | 216 ++++++++++++++++++
 ...-repeated-initialization-of-a-spinlo.patch |  38 +++
 ...r-bit-when-SQ-wraps-around-in-new-IO.patch |  95 ++++++++
 rdma-core.spec                                |  12 +-
 6 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 0077-libhns-Fix-parent-domain-unsupported-comp-mask.patch
 create mode 100644 0078-libhns-Add-pthread_spin_destroy-pthread_mutex_destro.patch
 create mode 100644 0079-libhns-Removes-a-repeated-initialization-of-a-spinlo.patch
 create mode 100644 0080-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch

diff --git a/0074-libhns-Fix-possible-overflow-in-cq-clean.patch b/0074-libhns-Fix-possible-overflow-in-cq-clean.patch
index b50e61f..65c1556 100644
--- a/0074-libhns-Fix-possible-overflow-in-cq-clean.patch
+++ b/0074-libhns-Fix-possible-overflow-in-cq-clean.patch
@@ -40,14 +40,14 @@ index b48cabd..fc938de 100644
 -	for (prod_index = cq->cons_index; get_sw_cqe_v2(cq, prod_index);
 -	     ++prod_index)
 -		if (prod_index > cq->cons_index + cq->verbs_cq.cq.cqe)
-+ struct hns_roce_context *ctx = to_hr_ctx(cq->verbs_cq.cq.context);
++	struct hns_roce_context *ctx = to_hr_ctx(cq->verbs_cq.cq.context);
 +	uint64_t cons_index = cq->cons_index;
 +	uint64_t prod_index = cq->cons_index;
-+ struct hns_roce_v2_cqe *cqe, *dest;
-+ uint16_t wqe_index;
-+ uint8_t owner_bit;
-+ bool is_recv_cqe;
-+ int nfreed = 0;
++	struct hns_roce_v2_cqe *cqe, *dest;
++	uint16_t wqe_index;
++	uint8_t owner_bit;
++	bool is_recv_cqe;
++	int nfreed = 0;
 +
 +	for (; get_sw_cqe_v2(cq, prod_index); ++prod_index)
 +		if (prod_index > cons_index + cq->verbs_cq.cq.cqe)
diff --git a/0077-libhns-Fix-parent-domain-unsupported-comp-mask.patch b/0077-libhns-Fix-parent-domain-unsupported-comp-mask.patch
new file mode 100644
index 0000000..faa9771
--- /dev/null
+++ b/0077-libhns-Fix-parent-domain-unsupported-comp-mask.patch
@@ -0,0 +1,72 @@
+From f912bc59d0b56b560a0097b5f738ba160a3e23eb Mon Sep 17 00:00:00 2001
+From: Chengchang Tang
+Date: Thu, 7 Dec 2023 09:47:59 +0800
+Subject: [PATCH 77/80] libhns: Fix parent domain unsupported comp mask
+
+driver inclusion
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8MF28
+
+Hns does not support any comp mask for parent domain. The driver
+returns EINVAL if any comp_mask is set.
+
+This patch replaces the inappropriate return value EINVAL with
+EOPNOTSUPP.
+
+The error is found by testcase test_mem_align_ud_traffic.
+
+ERROR: test_mem_align_ud_traffic (tests.test_parent_domain.ParentDomain
+TrafficTest)
+
+----------------------------------------------------------------------
+
+Traceback (most recent call last):
+  File "./tests/test_parent_domain.py", line 183, in test_mem_align_ud
+_traffic
+    self.create_players(parent_domain_ud_res,
+  File "./tests/test_parent_domain.py", line 156, in create_players
+    self.client = resource(**self.dev_info, **resource_arg)
+  File "./tests/test_parent_domain.py", line 90, in __init__
+    super().__init__(**kwargs)
+  File "./tests/base.py", line 617, in __init__
+    super(RoCETrafficResources, self).__init__(dev_name, ib_port,
+gid_index, **kwargs)
+  File "./tests/base.py", line 503, in __init__
+    super(TrafficResources, self).__init__(dev_name=dev_name,
+  File "./tests/base.py", line 477, in __init__
+    self.create_pd()
+  File "./tests/test_parent_domain.py", line 95, in create_pd
+    create_parent_domain_with_allocators(self)
+  File "./tests/test_parent_domain.py", line 69, in create_parent_
+domain_with_allocators
+    raise ex
+  File "./tests/test_parent_domain.py", line 65, in create_parent_
+domain_with_allocators
+    res.pd = ParentDomain(res.ctx, attr=pd_attr)
+  File "pd.pyx", line 261, in pyverbs.pd.ParentDomain.__init__
+pyverbs.pyverbs_error.PyverbsRDMAError: Failed to allocate Parent
+Domain.Errno: 22, Invalid argument
+
+Fixes: cfe548d4c78e ("libhns: Add support for the thread
+domain and the parent domain")
+Signed-off-by: Chengchang Tang
+---
+ providers/hns/hns_roce_u_verbs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
+index e597e93..fae6126 100644
+--- a/providers/hns/hns_roce_u_verbs.c
++++ b/providers/hns/hns_roce_u_verbs.c
+@@ -238,7 +238,7 @@ struct ibv_pd *hns_roce_u_alloc_pad(struct ibv_context *context,
+ 		return NULL;
+ 
+ 	if (attr->comp_mask) {
+-		errno = EINVAL;
++		errno = EOPNOTSUPP;
+ 		return NULL;
+ 	}
+ 
+-- 
+2.25.1
+
diff --git a/0078-libhns-Add-pthread_spin_destroy-pthread_mutex_destro.patch b/0078-libhns-Add-pthread_spin_destroy-pthread_mutex_destro.patch
new file mode 100644
index 0000000..5636e79
--- /dev/null
+++ b/0078-libhns-Add-pthread_spin_destroy-pthread_mutex_destro.patch
@@ -0,0 +1,216 @@
+From 3a1432cfdb7c696d3acf97025e6d74bbf3e520dc Mon Sep 17 00:00:00 2001
+From: Wenpeng Liang
+Date: Thu, 7 Dec 2023 09:48:00 +0800
+Subject: [PATCH 78/80] libhns: Add
+ pthread_spin_destroy()/pthread_mutex_destroy()
+
+driver inclusion
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8MF59
+
+--------------------------------------------------------------------------
+
+The functions pthread_spin_destroy()/pthread_mutex_destroy()
+correspond to pthread_spin_init()/pthread_mutex_init(). The
+driver should call pthread_spin_destroy()/pthread_mutex_destroy()
+to clean up resources before exiting.
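+
+As a side note, below is a minimal standalone sketch of this
+init/unwind pairing (the names are illustrative only, not the
+driver's actual code): every lock that was successfully initialized
+is destroyed again when a later step fails, and all of them are
+destroyed on teardown.
+
+```c
+#include <pthread.h>
+
+struct ctx_locks {
+	pthread_spinlock_t uar_lock;
+	pthread_mutex_t qp_table_mutex;
+};
+
+static int ctx_locks_init(struct ctx_locks *c)
+{
+	int ret;
+
+	ret = pthread_spin_init(&c->uar_lock, PTHREAD_PROCESS_PRIVATE);
+	if (ret)
+		return ret;
+
+	ret = pthread_mutex_init(&c->qp_table_mutex, NULL);
+	if (ret)
+		goto destroy_uar_lock;
+
+	return 0;
+
+destroy_uar_lock:
+	/* Undo every init that already succeeded. */
+	pthread_spin_destroy(&c->uar_lock);
+	return ret;
+}
+
+static void ctx_locks_destroy(struct ctx_locks *c)
+{
+	/* Destroy in reverse order of initialization. */
+	pthread_mutex_destroy(&c->qp_table_mutex);
+	pthread_spin_destroy(&c->uar_lock);
+}
+```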
+ +Signed-off-by: Wenpeng Liang +--- + providers/hns/hns_roce_u.c | 61 +++++++++++++++++++++++++++----- + providers/hns/hns_roce_u_hw_v2.c | 1 + + providers/hns/hns_roce_u_verbs.c | 17 ++++++--- + 3 files changed, 67 insertions(+), 12 deletions(-) + +diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c +index f30486f..dfcd798 100644 +--- a/providers/hns/hns_roce_u.c ++++ b/providers/hns/hns_roce_u.c +@@ -346,6 +346,47 @@ static int query_dev_attr(struct hns_roce_context *context, + return 0; + } + ++static int hns_roce_init_context_lock(struct hns_roce_context *context) ++{ ++ int ret; ++ ++ ret = pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); ++ if (ret) ++ return ret; ++ ++ ret = pthread_mutex_init(&context->qp_table_mutex, NULL); ++ if (ret) ++ goto destroy_uar_lock; ++ ++ ret = pthread_mutex_init(&context->srq_table_mutex, NULL); ++ if (ret) ++ goto destroy_qp_mutex; ++ ++ ret = pthread_mutex_init(&context->db_list_mutex, NULL); ++ if (ret) ++ goto destroy_srq_mutex; ++ ++ return 0; ++ ++destroy_srq_mutex: ++ pthread_mutex_destroy(&context->srq_table_mutex); ++ ++destroy_qp_mutex: ++ pthread_mutex_destroy(&context->qp_table_mutex); ++ ++destroy_uar_lock: ++ pthread_spin_destroy(&context->uar_lock); ++ return ret; ++} ++ ++static void hns_roce_destroy_context_lock(struct hns_roce_context *context) ++{ ++ pthread_spin_destroy(&context->uar_lock); ++ pthread_mutex_destroy(&context->qp_table_mutex); ++ pthread_mutex_destroy(&context->srq_table_mutex); ++ pthread_mutex_destroy(&context->db_list_mutex); ++} ++ + static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +@@ -365,7 +406,10 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev, + ucontext_set_cmd(&cmd, ctx_attr); + if (ibv_cmd_get_context(&context->ibv_ctx, &cmd.ibv_cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) +- goto err_free; ++ goto err_ibv_cmd; ++ ++ if (hns_roce_init_context_lock(context)) ++ goto err_ibv_cmd; + + hr_dev->congest_type = resp.congest_type; + +@@ -383,23 +427,23 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev, + context->qp_table_shift = calc_table_shift(resp.qp_tab_size, + HNS_ROCE_QP_TABLE_BITS); + context->qp_table_mask = (1 << context->qp_table_shift) - 1; +- pthread_mutex_init(&context->qp_table_mutex, NULL); ++ + for (i = 0; i < HNS_ROCE_QP_TABLE_SIZE; ++i) + context->qp_table[i].refcnt = 0; + + context->srq_table_shift = calc_table_shift(resp.srq_tab_size, + HNS_ROCE_SRQ_TABLE_BITS); + context->srq_table_mask = (1 << context->srq_table_shift) - 1; +- pthread_mutex_init(&context->srq_table_mutex, NULL); ++ + for (i = 0; i < HNS_ROCE_SRQ_TABLE_SIZE; ++i) + context->srq_table[i].refcnt = 0; + + if (query_dev_attr(context, hr_dev, &resp)) +- goto err_free; ++ goto err_query_dev; + + if (init_dca_context(context, cmd_fd, + &resp, ctx_attr, hr_dev->page_size)) +- goto err_free; ++ goto err_query_dev; + + if (init_reset_context(context, cmd_fd, &resp, hr_dev->page_size)) + goto reset_free; +@@ -407,8 +451,6 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev, + if (hns_roce_mmap(hr_dev, context, cmd_fd)) + goto uar_free; + +- pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE); +- + verbs_set_ops(&context->ibv_ctx, &hns_common_ops); + verbs_set_ops(&context->ibv_ctx, &hr_dev->u_hw->hw_ops); + +@@ -419,7 +461,9 @@ uar_free: + munmap(context->reset_state, hr_dev->page_size); + reset_free: + uninit_dca_context(context); 
-err_free:
++err_query_dev:
++	hns_roce_destroy_context_lock(context);
++err_ibv_cmd:
+ 	verbs_uninit_context(&context->ibv_ctx);
+ 	free(context);
+ 	return NULL;
+@@ -434,6 +478,7 @@ static void hns_roce_free_context(struct ibv_context *ibctx)
+ 	if (context->reset_state)
+ 		munmap(context->reset_state, hr_dev->page_size);
+ 	uninit_dca_context(context);
++	hns_roce_destroy_context_lock(context);
+ 	verbs_uninit_context(&context->ibv_ctx);
+ 	free(context);
+ }
+diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
+index 68d7110..b2a8858 100644
+--- a/providers/hns/hns_roce_u_hw_v2.c
++++ b/providers/hns/hns_roce_u_hw_v2.c
+@@ -2049,6 +2049,7 @@ static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp)
+ 	hns_roce_unlock_cqs(to_hr_cq(ibqp->send_cq), to_hr_cq(ibqp->recv_cq));
+ 
+ 	hns_roce_free_qp_buf(qp, ctx);
++	hns_roce_qp_spinlock_destroy(qp);
+ 
+ 	free(qp);
+ 
+diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
+index fae6126..ba3fef6 100644
+--- a/providers/hns/hns_roce_u_verbs.c
++++ b/providers/hns/hns_roce_u_verbs.c
+@@ -803,6 +803,7 @@ int hns_roce_u_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr)
+ 
+ int hns_roce_u_destroy_cq(struct ibv_cq *cq)
+ {
++	struct hns_roce_cq *hr_cq = to_hr_cq(cq);
+ 	int ret;
+ 
+ 	ret = ibv_cmd_destroy_cq(cq);
+@@ -811,10 +812,13 @@ int hns_roce_u_destroy_cq(struct ibv_cq *cq)
+ 
+ 	hns_roce_uninit_cq_swc(to_hr_cq(cq));
+ 
+-	hns_roce_free_db(to_hr_ctx(cq->context), to_hr_cq(cq)->db,
++	hns_roce_free_db(to_hr_ctx(cq->context), hr_cq->db,
+ 			 HNS_ROCE_CQ_TYPE_DB);
+-	hns_roce_free_buf(&to_hr_cq(cq)->buf);
+-	free(to_hr_cq(cq));
++	hns_roce_free_buf(&hr_cq->buf);
++
++	hns_roce_spinlock_destroy(&hr_cq->hr_lock);
++
++	free(hr_cq);
+ 
+ 	return ret;
+ }
+@@ -1071,7 +1075,7 @@ static struct ibv_srq *create_srq(struct ibv_context *context,
+ 
+ 	set_srq_param(context, srq, init_attr);
+ 	if (alloc_srq_buf(srq))
+-		goto err_free_srq;
++		goto err_destroy_lock;
+ 
+ 	srq->rdb = hns_roce_alloc_db(hr_ctx, HNS_ROCE_SRQ_TYPE_DB);
+ 	if (!srq->rdb)
+@@ -1102,6 +1106,9 @@ err_srq_db:
+ err_srq_buf:
+ 	free_srq_buf(srq);
+ 
++err_destroy_lock:
++	hns_roce_spinlock_destroy(&srq->hr_lock);
++
+ err_free_srq:
+ 	free(srq);
+ 
+@@ -1191,6 +1198,8 @@ int hns_roce_u_destroy_srq(struct ibv_srq *ibv_srq)
+ 
+ 	hns_roce_free_db(ctx, srq->rdb, HNS_ROCE_SRQ_TYPE_DB);
+ 	free_srq_buf(srq);
++
++	hns_roce_spinlock_destroy(&srq->hr_lock);
+ 	free(srq);
+ 
+ 	return 0;
+-- 
+2.25.1
+
diff --git a/0079-libhns-Removes-a-repeated-initialization-of-a-spinlo.patch b/0079-libhns-Removes-a-repeated-initialization-of-a-spinlo.patch
new file mode 100644
index 0000000..41e7dc0
--- /dev/null
+++ b/0079-libhns-Removes-a-repeated-initialization-of-a-spinlo.patch
@@ -0,0 +1,38 @@
+From 8759b0e6ec4e73994743c1ae0d0ecc186688b6d6 Mon Sep 17 00:00:00 2001
+From: Ran Zhou
+Date: Thu, 7 Dec 2023 09:48:01 +0800
+Subject: [PATCH 79/80] libhns: Removes a repeated initialization of a spinlock
+
+driver inclusion
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8MF83
+
+--------------------------------------------------------------------------
+
+The pthread_spin_init() of the QP locks is already done in create_qp().
+Remove the spinlock init here to avoid initializing the locks twice.
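+
+For background, a small hypothetical sketch (illustrative only, not
+driver code): POSIX leaves the result of calling pthread_spin_init()
+on an already-initialized lock undefined, so each lock must be
+initialized exactly once.
+
+```c
+#include <pthread.h>
+
+int main(void)
+{
+	pthread_spinlock_t lock;
+
+	/* Done once, when the QP is created. */
+	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
+
+	/*
+	 * Re-initializing the same live lock (the call this patch
+	 * removes) would be undefined behaviour per POSIX:
+	 *
+	 *	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
+	 */
+
+	pthread_spin_destroy(&lock);
+	return 0;
+}
+```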
+
+Signed-off-by: Wenpeng Liang
+Signed-off-by: Ran Zhou
+---
+ providers/hns/hns_roce_u_verbs.c | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
+index ba3fef6..c404948 100644
+--- a/providers/hns/hns_roce_u_verbs.c
++++ b/providers/hns/hns_roce_u_verbs.c
+@@ -1933,10 +1933,6 @@ static int hns_roce_alloc_qp_buf(struct ibv_qp_init_attr_ex *attr,
+ {
+ 	int ret;
+ 
+-	if (pthread_spin_init(&qp->sq.hr_lock.lock, PTHREAD_PROCESS_PRIVATE) ||
+-	    pthread_spin_init(&qp->rq.hr_lock.lock, PTHREAD_PROCESS_PRIVATE))
+-		return -ENOMEM;
+-
+ 	ret = qp_alloc_wqe(attr, hns_attr, qp, ctx);
+ 	if (ret)
+ 		return ret;
+-- 
+2.25.1
+
diff --git a/0080-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch b/0080-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch
new file mode 100644
index 0000000..c7769b3
--- /dev/null
+++ b/0080-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch
@@ -0,0 +1,95 @@
+From 96d30f16bc03167c7c52e663785192382688f542 Mon Sep 17 00:00:00 2001
+From: Chengchang Tang
+Date: Thu, 7 Dec 2023 09:48:02 +0800
+Subject: [PATCH 80/80] libhns: Fix owner bit when SQ wraps around in new IO
+
+driver inclusion
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/rdma-core/issues/I8MF9Q
+
+--------------------------------------------------------------------------
+
+The owner bit has already been written with the correct value in
+init_rc_wqe() or init_ud_wqe(), but it is then overwritten by some
+subsequent operations. When the SQ wraps around, the overwritten
+value is incorrect.
+
+For example, the driver assigns the owner bit in the second step
+below and overwrites it in the third step.
+
+```c
+ibv_wr_start();
+ibv_wr_rdma_write();
+if (inline)
+	ibv_wr_set_inline_data_list();
+else
+	ibv_wr_set_sge_list();
+ibv_wr_complete();
+```
+
+This patch removes the redundant owner bit assignment operations
+in new IO.
+
+ +Fixes: 36446a56eea5 ("libhns: Extended QP supports the new post send mechanism") +Fixes: 163d62ca6196 ("libhns: Fix the owner bit error of sq in new io") +Signed-off-by: Chengchang Tang +--- + providers/hns/hns_roce_u_hw_v2.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c +index b2a8858..acbc854 100644 +--- a/providers/hns/hns_roce_u_hw_v2.c ++++ b/providers/hns/hns_roce_u_hw_v2.c +@@ -2544,8 +2544,6 @@ static void wr_set_sge_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_sge, + + wqe->msg_len = htole32(qp->sge_info.total_len); + hr_reg_write(wqe, RCWQE_SGE_NUM, qp->sge_info.valid_num); +- +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_send_rc(struct ibv_qp_ex *ibv_qp) +@@ -2737,7 +2735,6 @@ static void wr_set_inline_data_rc(struct ibv_qp_ex *ibv_qp, void *addr, + + qp->sge_info.total_len = length; + set_inline_data_list_rc(qp, wqe, 1, &buff); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_set_inline_data_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_buf, +@@ -2755,7 +2752,6 @@ static void wr_set_inline_data_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_buf, + qp->sge_info.total_len += buf_list[i].length; + + set_inline_data_list_rc(qp, wqe, num_buf, buf_list); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static struct hns_roce_ud_sq_wqe * +@@ -2892,7 +2888,6 @@ static void wr_set_sge_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_sge, + hr_reg_write(wqe, UDWQE_SGE_NUM, cnt); + + qp->sge_info.start_idx += cnt; +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void set_inline_data_list_ud(struct hns_roce_qp *qp, +@@ -2958,7 +2953,6 @@ static void wr_set_inline_data_ud(struct ibv_qp_ex *ibv_qp, void *addr, + + qp->sge_info.total_len = length; + set_inline_data_list_ud(qp, wqe, 1, &buff); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_set_inline_data_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_buf, +@@ -2976,7 +2970,6 @@ static void wr_set_inline_data_list_ud(struct ibv_qp_ex *ibv_qp, size_t num_buf, + qp->sge_info.total_len += buf_list[i].length; + + set_inline_data_list_ud(qp, wqe, num_buf, buf_list); +- enable_wqe(qp, wqe, qp->sq.head); + } + + static void wr_start(struct ibv_qp_ex *ibv_qp) +-- +2.25.1 + diff --git a/rdma-core.spec b/rdma-core.spec index 50e12d1..d1e9126 100644 --- a/rdma-core.spec +++ b/rdma-core.spec @@ -1,6 +1,6 @@ Name: rdma-core Version: 41.0 -Release: 22 +Release: 23 Summary: RDMA core userspace libraries and daemons License: GPLv2 or BSD Url: https://github.com/linux-rdma/rdma-core @@ -82,6 +82,10 @@ patch73: 0073-libhns-Fix-uninitialized-qp-attr-when-flush-cqe.patch patch74: 0074-libhns-Fix-possible-overflow-in-cq-clean.patch patch75: 0075-libhns-Fix-unnecessary-dca-memory-detach.patch patch76: 0076-libhns-Bugfix-for-wrong-timing-of-modifying-ibv_qp-s.patch +patch77: 0077-libhns-Fix-parent-domain-unsupported-comp-mask.patch +patch78: 0078-libhns-Add-pthread_spin_destroy-pthread_mutex_destro.patch +patch79: 0079-libhns-Removes-a-repeated-initialization-of-a-spinlo.patch +patch80: 0080-libhns-Fix-owner-bit-when-SQ-wraps-around-in-new-IO.patch BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0) BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel @@ -329,6 +333,12 @@ fi %{_mandir}/* %changelog +* Thu Dec 7 2023 Ran Zhou - 41.0-23 +- Type: bugfix +- ID: NA +- SUG: NA +- DESC: Bugfix for lock and owner bit + * Fri Dec 1 2023 Ran Zhou - 41.0-22 - Type: bugfix - ID: NA -- Gitee