diff --git a/0058-libhns-Support-flexible-WQE-buffer-page-size.patch b/0058-libhns-Support-flexible-WQE-buffer-page-size.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f4f4dfac2cb6ece43d8e7b2dd0607a94a567fe50
--- /dev/null
+++ b/0058-libhns-Support-flexible-WQE-buffer-page-size.patch
@@ -0,0 +1,231 @@
+From d628c51d25b972a7d26e53ea400b3a0679d51f91 Mon Sep 17 00:00:00 2001
+From: Chengchang Tang
+Date: Mon, 23 Oct 2023 21:13:03 +0800
+Subject: [PATCH] libhns: Support flexible WQE buffer page size
+
+driver inclusion
+category: feature
+bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
+
+--------------------------------------------------------------------------
+
+Currently, the driver always allocates the user-space WQE buffer in 4K
+pages, even on a system with a 64K page size, so HW reads WQEs at a 4K
+granularity. Since up to 1024 bytes of inline data are supported, HW
+has to switch pages every 4 WQEs when SQ inline is used. Each page
+switch costs about 400ns, i.e. an average delay of 100ns per packet.
+
+To improve performance, allow the user-mode driver to allocate the WQE
+buffer with a larger page size, reducing the latency introduced by HW
+page switching. The user-mode driver may use any page size between 4K
+and the system page size. During ibv_create_qp(), it dynamically
+selects an appropriate page size based on ibv_qp_cap, improving
+performance while limiting memory consumption.
+
+This feature must be used together with a kernel-mode driver that
+supports it. To remain compatible with older kernel-mode drivers that
+do not support this feature, the user-mode driver falls back to a
+fixed 4K page size for the WQE buffer.
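+
+For example (hypothetical sizes, for illustration only): for a QP with
+a 32K SQ buffer, an 8K extended SGE buffer and a 16K RQ buffer on a
+64K-page system, the driver selects a 32K WQE buffer page size: the
+largest of the three regions rounded up to a power of two, clamped
+between 4K and the system page size. On a 4K-page system the same QP
+keeps the fixed 4K page size.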
+
+Signed-off-by: Chengchang Tang
+---
+ kernel-headers/rdma/hns-abi.h    |  5 ++-
+ providers/hns/hns_roce_u.c       |  2 +-
+ providers/hns/hns_roce_u.h       |  1 +
+ providers/hns/hns_roce_u_verbs.c | 65 ++++++++++++++++++++++++++------
+ 4 files changed, 59 insertions(+), 14 deletions(-)
+
+diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
+index cab941f..157dc9d 100644
+--- a/kernel-headers/rdma/hns-abi.h
++++ b/kernel-headers/rdma/hns-abi.h
+@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
+ 	__u8    log_sq_bb_count;
+ 	__u8    log_sq_stride;
+ 	__u8    sq_no_prefetch;
+-	__u8    reserved[5];
++	__u8    reserved[4];
++	__u8    pageshift;
+ 	__aligned_u64 sdb_addr;
+ 	__aligned_u64 comp_mask;
+ 	__aligned_u64 create_flags;
+@@ -122,6 +123,7 @@ enum {
+ 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
+ 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+ 	HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
++	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
+ };
+ 
+ enum {
+@@ -129,6 +131,7 @@ enum {
+ 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
+ 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+ 	HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
++	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
+ };
+ 
+ struct hns_roce_ib_alloc_ucontext_resp {
+diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
+index 0660081..02ad880 100644
+--- a/providers/hns/hns_roce_u.c
++++ b/providers/hns/hns_roce_u.c
+@@ -267,7 +267,7 @@ static void ucontext_set_cmd(struct hns_roce_alloc_ucontext *cmd,
+ 			     struct hnsdv_context_attr *attr)
+ {
+ 	cmd->config |= HNS_ROCE_EXSGE_FLAGS | HNS_ROCE_RQ_INLINE_FLAGS |
+-		       HNS_ROCE_CQE_INLINE_FLAGS;
++		       HNS_ROCE_CQE_INLINE_FLAGS | HNS_ROCE_UCTX_DYN_QP_PGSZ;
+ 
+ 	if (!attr || !(attr->flags & HNSDV_CONTEXT_FLAGS_DCA))
+ 		return;
+diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
+index 5501d8e..ae9ae51 100644
+--- a/providers/hns/hns_roce_u.h
++++ b/providers/hns/hns_roce_u.h
+@@ -409,6 +409,7 @@ struct hns_roce_qp {
+ 	uint8_t sl;
+ 	uint8_t tc_mode;
+ 	uint8_t priority;
++	uint8_t pageshift;
+ 	unsigned int qkey;
+ 	enum ibv_mtu path_mtu;
+ 
+diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
+index 7b58dd0..f76341c 100644
+--- a/providers/hns/hns_roce_u_verbs.c
++++ b/providers/hns/hns_roce_u_verbs.c
+@@ -1327,31 +1327,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
+ 	}
+ }
+ 
++static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
++					   struct hns_roce_context *ctx,
++					   struct hns_roce_qp *qp, bool dca_en)
++{
++	uint32_t ext_sge_size;
++	uint32_t sq_size;
++	uint32_t rq_size;
++	uint8_t pg_shift;
++
++	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ) || dca_en) {
++		qp->pageshift = HNS_HW_PAGE_SHIFT;
++		return;
++	}
++
++	/*
++	 * The larger the page size used, the better the performance, but
++	 * more memory may be wasted. Therefore, use the least common
++	 * multiple (aligned to a power of 2) of the SQ WQE buffer size,
++	 * the RQ WQE buffer size and the ext_sge buffer size as the page
++	 * size. Since the kernel cannot guarantee an allocation of
++	 * contiguous memory larger than the system page, the page size
++	 * must not exceed the system page size.
++	 */
++	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
++	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
++	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
++
++	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
++			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
++	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
++	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
++	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
++}
++
+ static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
+-			     struct hns_roce_qp *qp)
++			     struct hns_roce_context *ctx,
++			     struct hns_roce_qp *qp, bool dca_en)
+ {
+ 	struct hns_roce_wq *sq = &qp->sq;
+ 	struct hns_roce_wq *rq = &qp->rq;
++	unsigned int page_size;
+ 	unsigned int size;
+ 
+ 	qp->buf_size = 0;
++	get_best_multi_region_pg_shift(hr_dev, ctx, qp, dca_en);
++	page_size = 1 << qp->pageshift;
+ 
+ 	/* SQ WQE */
+ 	sq->offset = 0;
+-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
++	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
+ 	qp->buf_size += size;
+ 
+ 	/* extend SGE WQE in SQ */
+ 	qp->ex_sge.offset = qp->buf_size;
+ 	if (qp->ex_sge.sge_cnt > 0) {
+-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
+-					      qp->ex_sge.sge_shift);
++		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
++			     page_size);
+ 		qp->buf_size += size;
+ 	}
+ 
+ 	/* RQ WQE */
+ 	rq->offset = qp->buf_size;
+-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
++	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
+ 	qp->buf_size += size;
+ 
+ 	if (qp->buf_size < 1)
+@@ -1375,7 +1413,7 @@ static inline bool check_qp_support_dca(struct hns_roce_dca_ctx *dca_ctx,
+ 	if (hns_attr &&
+ 	    (hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) &&
+ 	    (hns_attr->create_flags & HNSDV_QP_CREATE_ENABLE_DCA_MODE))
+-		return true;
++		return dca_ctx->max_size > 0;
+ 
+ 	return false;
+ }
+@@ -1396,9 +1434,12 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
+ 			struct hns_roce_qp *qp, struct hns_roce_context *ctx)
+ {
+ 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
++	bool dca_en = check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr);
++	int ret;
+ 
+-	if (calc_qp_buff_size(hr_dev, qp))
+-		return -EINVAL;
++	ret = calc_qp_buff_size(hr_dev, ctx, qp, dca_en);
++	if (ret)
++		return ret;
+ 
+ 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
+ 	if (!qp->sq.wrid)
+@@ -1416,19 +1457,18 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
+ 		goto err_alloc;
+ 	}
+ 
+-	if (check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr) &&
+-	    ctx->dca_ctx.max_size > 0) {
++	if (dca_en) {
+ 		/* when DCA is enabled, use a buffer list to store page addr */
+ 		qp->buf.buf = NULL;
+ 		qp->dca_wqe.max_cnt = hr_hw_page_count(qp->buf_size);
+-		qp->dca_wqe.shift = HNS_HW_PAGE_SHIFT;
++		qp->dca_wqe.shift = qp->pageshift;
+ 		qp->dca_wqe.bufs = calloc(qp->dca_wqe.max_cnt, sizeof(void *));
+ 		if (!qp->dca_wqe.bufs)
+ 			goto err_alloc;
+ 		verbs_debug(&ctx->ibv_ctx, "alloc DCA buf.\n");
+ 	} else {
+ 		if (hns_roce_alloc_buf(&qp->buf, qp->buf_size,
+-				       HNS_HW_PAGE_SIZE))
++				       1 << qp->pageshift))
+ 			goto err_alloc;
+ 	}
+ 
+@@ -1642,6 +1682,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
+ 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
+ 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
+ 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
++	cmd_ex.pageshift = qp->pageshift;
+ 
+ 	if (cmd_flag->congest_type_flags) {
+ 		cmd_ex.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
+-- 
+2.25.1
+
diff --git a/rdma-core.spec b/rdma-core.spec
index 044f63e2210b7061027ee197bfd22a15cf4c1738..e0f9da802b09914c69cc724d8db79aa042919c0b 100644
--- a/rdma-core.spec
+++ b/rdma-core.spec
@@ -1,6 +1,6 @@
 Name: rdma-core
 Version: 41.0
-Release: 15
+Release: 16
 Summary: RDMA core userspace libraries and daemons
 License: GPLv2 or BSD
 Url: https://github.com/linux-rdma/rdma-core
@@ -63,6 +63,7 @@
 Patch53: 0054-libhns-return-error-when-post-send-in-reset-state.patch
 Patch54: 0055-libhns-separate-the-initialization-steps-of-lock.patch
 Patch55: 0056-libhns-assign-doorbell-to-zero-when-allocate-it.patch
 patch56: 0057-libhns-Fix-missing-reset-notification.patch
+patch57: 0058-libhns-Support-flexible-WQE-buffer-page-size.patch
 BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0)
 BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel
@@ -310,6 +311,12 @@ fi
 %{_mandir}/*
 
 %changelog
+* Tue Oct 24 2023 Ran Zhou - 41.0-16
+- Type: requirement
+- ID: NA
+- SUG: NA
+- DESC: Support flexible WQE buffer page size
+
 * Tue Sep 26 2023 Juan Zhou - 41.0-15
 - Type: requirement
 - ID: NA