diff --git a/0058-libhns-Support-flexible-WQE-buffer-page-size.patch b/0058-libhns-Support-flexible-WQE-buffer-page-size.patch
new file mode 100644
index 0000000000000000000000000000000000000000..f4f4dfac2cb6ece43d8e7b2dd0607a94a567fe50
--- /dev/null
+++ b/0058-libhns-Support-flexible-WQE-buffer-page-size.patch
@@ -0,0 +1,231 @@
+From d628c51d25b972a7d26e53ea400b3a0679d51f91 Mon Sep 17 00:00:00 2001
+From: Chengchang Tang
+Date: Mon, 23 Oct 2023 21:13:03 +0800
+Subject: [PATCH] libhns: Support flexible WQE buffer page size
+
+driver inclusion
+category: feature
+bugzilla: https://gitee.com/openeuler/kernel/issues/I87LTM
+
+--------------------------------------------------------------------------
+
+Currently, the driver always allocates the user-space WQE buffer in 4K
+pages, even on a system with a 64K page size, so HW reads WQEs at a 4K
+granularity. Since up to 1024 bytes of inline data are supported, HW
+has to switch pages every 4 WQEs when SQ inline is used. Each page
+switch costs about 400ns, i.e. an average delay of 100ns per packet.
+
+To improve performance, allow the user-mode driver to allocate the WQE
+buffer with a larger page size, reducing the latency introduced by HW
+page switching. The user-mode driver may use any page size between 4K
+and the system page size. During ibv_create_qp(), it dynamically
+selects an appropriate page size based on ibv_qp_cap, improving
+performance while limiting memory consumption.
+
+This feature must be used together with a kernel-mode driver that
+supports it. To remain compatible with older kernel-mode drivers that
+do not support this feature, the user-mode driver falls back to a
+fixed 4K page size for the WQE buffer.
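+
+For example (hypothetical sizes, for illustration only): for a QP with
+a 32K SQ buffer, an 8K extended SGE buffer and a 16K RQ buffer on a
+64K-page system, the driver selects a 32K WQE buffer page size: the
+largest of the three regions rounded up to a power of two, clamped
+between 4K and the system page size. On a 4K-page system the same QP
+keeps the fixed 4K page size.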
+
+Signed-off-by: Chengchang Tang
+---
+ kernel-headers/rdma/hns-abi.h    |  5 ++-
+ providers/hns/hns_roce_u.c       |  2 +-
+ providers/hns/hns_roce_u.h       |  1 +
+ providers/hns/hns_roce_u_verbs.c | 65 ++++++++++++++++++++++++++------
+ 4 files changed, 59 insertions(+), 14 deletions(-)
+
+diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h
+index cab941f..157dc9d 100644
+--- a/kernel-headers/rdma/hns-abi.h
++++ b/kernel-headers/rdma/hns-abi.h
+@@ -81,7 +81,8 @@ struct hns_roce_ib_create_qp {
+ 	__u8    log_sq_bb_count;
+ 	__u8    log_sq_stride;
+ 	__u8    sq_no_prefetch;
+-	__u8    reserved[5];
++	__u8    reserved[4];
++	__u8    pageshift;
+ 	__aligned_u64 sdb_addr;
+ 	__aligned_u64 comp_mask;
+ 	__aligned_u64 create_flags;
+@@ -122,6 +123,7 @@ enum {
+ 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
+ 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+ 	HNS_ROCE_UCTX_CONFIG_DCA = 1 << 3,
++	HNS_ROCE_UCTX_DYN_QP_PGSZ = 1 << 4,
+ };
+ 
+ enum {
+@@ -129,6 +131,7 @@ enum {
+ 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
+ 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+ 	HNS_ROCE_UCTX_RSP_DCA_FLAGS = HNS_ROCE_UCTX_CONFIG_DCA,
++	HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ = HNS_ROCE_UCTX_DYN_QP_PGSZ,
+ };
+ 
+ struct hns_roce_ib_alloc_ucontext_resp {
+diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
+index 0660081..02ad880 100644
+--- a/providers/hns/hns_roce_u.c
++++ b/providers/hns/hns_roce_u.c
+@@ -267,7 +267,7 @@ static void ucontext_set_cmd(struct hns_roce_alloc_ucontext *cmd,
+ 			     struct hnsdv_context_attr *attr)
+ {
+ 	cmd->config |= HNS_ROCE_EXSGE_FLAGS | HNS_ROCE_RQ_INLINE_FLAGS |
+-		       HNS_ROCE_CQE_INLINE_FLAGS;
++		       HNS_ROCE_CQE_INLINE_FLAGS | HNS_ROCE_UCTX_DYN_QP_PGSZ;
+ 
+ 	if (!attr || !(attr->flags & HNSDV_CONTEXT_FLAGS_DCA))
+ 		return;
+diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
+index 5501d8e..ae9ae51 100644
+--- a/providers/hns/hns_roce_u.h
++++ b/providers/hns/hns_roce_u.h
+@@ -409,6 +409,7 @@ struct hns_roce_qp {
+ 	uint8_t sl;
+ 	uint8_t tc_mode;
+ 	uint8_t priority;
++	uint8_t pageshift;
+ 	unsigned int qkey;
+ 	enum ibv_mtu path_mtu;
+ 
+diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
+index 7b58dd0..f76341c 100644
+--- a/providers/hns/hns_roce_u_verbs.c
++++ b/providers/hns/hns_roce_u_verbs.c
+@@ -1327,31 +1327,69 @@ static void free_recv_rinl_buf(struct hns_roce_rinl_buf *rinl_buf)
+ 	}
+ }
+ 
++static void get_best_multi_region_pg_shift(struct hns_roce_device *hr_dev,
++					   struct hns_roce_context *ctx,
++					   struct hns_roce_qp *qp, bool dca_en)
++{
++	uint32_t ext_sge_size;
++	uint32_t sq_size;
++	uint32_t rq_size;
++	uint8_t pg_shift;
++
++	if (!(ctx->config & HNS_ROCE_UCTX_RSP_DYN_QP_PGSZ) || dca_en) {
++		qp->pageshift = HNS_HW_PAGE_SHIFT;
++		return;
++	}
++
++	/*
++	 * The larger the page size used, the better the performance, but
++	 * more memory may be wasted. Therefore, use the least common
++	 * multiple (aligned to a power of 2) of the SQ WQE buffer size,
++	 * the RQ WQE buffer size and the ext_sge buffer size as the page
++	 * size. Since the kernel cannot guarantee an allocation of
++	 * contiguous memory larger than the system page, the page size
++	 * must not exceed the system page size.
++	 */
++	sq_size = qp->sq.wqe_cnt << qp->sq.wqe_shift;
++	ext_sge_size = qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift;
++	rq_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
++
++	pg_shift = max_t(uint8_t, sq_size ? hr_ilog32(sq_size) : 0,
++			 ext_sge_size ? hr_ilog32(ext_sge_size) : 0);
++	pg_shift = max_t(uint8_t, pg_shift, rq_size ? hr_ilog32(rq_size) : 0);
++	pg_shift = max_t(uint8_t, pg_shift, HNS_HW_PAGE_SHIFT);
++	qp->pageshift = min_t(uint8_t, pg_shift, hr_ilog32(hr_dev->page_size));
++}
++
+ static int calc_qp_buff_size(struct hns_roce_device *hr_dev,
+-			     struct hns_roce_qp *qp)
++			     struct hns_roce_context *ctx,
++			     struct hns_roce_qp *qp, bool dca_en)
+ {
+ 	struct hns_roce_wq *sq = &qp->sq;
+ 	struct hns_roce_wq *rq = &qp->rq;
++	unsigned int page_size;
+ 	unsigned int size;
+ 
+ 	qp->buf_size = 0;
++	get_best_multi_region_pg_shift(hr_dev, ctx, qp, dca_en);
++	page_size = 1 << qp->pageshift;
+ 
+ 	/* SQ WQE */
+ 	sq->offset = 0;
+-	size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift);
++	size = align(sq->wqe_cnt << sq->wqe_shift, page_size);
+ 	qp->buf_size += size;
+ 
+ 	/* extend SGE WQE in SQ */
+ 	qp->ex_sge.offset = qp->buf_size;
+ 	if (qp->ex_sge.sge_cnt > 0) {
+-		size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt,
+-					      qp->ex_sge.sge_shift);
++		size = align(qp->ex_sge.sge_cnt << qp->ex_sge.sge_shift,
++			     page_size);
+ 		qp->buf_size += size;
+ 	}
+ 
+ 	/* RQ WQE */
+ 	rq->offset = qp->buf_size;
+-	size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift);
++	size = align(rq->wqe_cnt << rq->wqe_shift, page_size);
+ 	qp->buf_size += size;
+ 
+ 	if (qp->buf_size < 1)
+@@ -1375,7 +1413,7 @@ static inline bool check_qp_support_dca(struct hns_roce_dca_ctx *dca_ctx,
+ 	if (hns_attr &&
+ 	    (hns_attr->comp_mask & HNSDV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS) &&
+ 	    (hns_attr->create_flags & HNSDV_QP_CREATE_ENABLE_DCA_MODE))
+-		return true;
++		return dca_ctx->max_size > 0;
+ 
+ 	return false;
+ }
+@@ -1396,9 +1434,12 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
+ 			struct hns_roce_qp *qp, struct hns_roce_context *ctx)
+ {
+ 	struct hns_roce_device *hr_dev = to_hr_dev(ctx->ibv_ctx.context.device);
++	bool dca_en = check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr);
++	int ret;
+ 
+-	if (calc_qp_buff_size(hr_dev, qp))
+-		return -EINVAL;
++	ret = calc_qp_buff_size(hr_dev, ctx, qp, dca_en);
++	if (ret)
++		return ret;
+ 
+ 	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
+ 	if (!qp->sq.wrid)
+@@ -1416,19 +1457,18 @@ static int qp_alloc_wqe(struct ibv_qp_init_attr_ex *attr,
+ 		goto err_alloc;
+ 	}
+ 
+-	if (check_qp_support_dca(&ctx->dca_ctx, attr, hns_attr) &&
+-	    ctx->dca_ctx.max_size > 0) {
++	if (dca_en) {
+ 		/* when DCA is enabled, use a buffer list to store page addr */
+ 		qp->buf.buf = NULL;
+ 		qp->dca_wqe.max_cnt = hr_hw_page_count(qp->buf_size);
+-		qp->dca_wqe.shift = HNS_HW_PAGE_SHIFT;
++		qp->dca_wqe.shift = qp->pageshift;
+ 		qp->dca_wqe.bufs = calloc(qp->dca_wqe.max_cnt, sizeof(void *));
+ 		if (!qp->dca_wqe.bufs)
+ 			goto err_alloc;
+ 		verbs_debug(&ctx->ibv_ctx, "alloc DCA buf.\n");
+ 	} else {
+ 		if (hns_roce_alloc_buf(&qp->buf, qp->buf_size,
+-				       HNS_HW_PAGE_SIZE))
++				       1 << qp->pageshift))
+ 			goto err_alloc;
+ 	}
+ 
+@@ -1642,6 +1682,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
+ 	cmd_ex.buf_addr = (uintptr_t)qp->buf.buf;
+ 	cmd_ex.log_sq_stride = qp->sq.wqe_shift;
+ 	cmd_ex.log_sq_bb_count = hr_ilog32(qp->sq.wqe_cnt);
++	cmd_ex.pageshift = qp->pageshift;
+ 
+ 	if (cmd_flag->congest_type_flags) {
+ 		cmd_ex.comp_mask |= HNS_ROCE_CREATE_QP_MASK_CONGEST_TYPE;
+-- 
+2.25.1
+
diff --git a/rdma-core.spec b/rdma-core.spec
index 044f63e2210b7061027ee197bfd22a15cf4c1738..e0f9da802b09914c69cc724d8db79aa042919c0b 100644
--- a/rdma-core.spec
+++ b/rdma-core.spec
@@ -1,6 +1,6 @@
 Name: rdma-core
 Version: 41.0
-Release: 15
+Release: 16
 Summary: RDMA core userspace libraries and daemons
 License: GPLv2 or BSD
 Url: https://github.com/linux-rdma/rdma-core
@@ -63,6 +63,7 @@
 Patch53: 0054-libhns-return-error-when-post-send-in-reset-state.patch
 Patch54: 0055-libhns-separate-the-initialization-steps-of-lock.patch
 Patch55: 0056-libhns-assign-doorbell-to-zero-when-allocate-it.patch
 patch56: 0057-libhns-Fix-missing-reset-notification.patch
+patch57: 0058-libhns-Support-flexible-WQE-buffer-page-size.patch
 BuildRequires: binutils cmake >= 2.8.11 gcc libudev-devel pkgconfig pkgconfig(libnl-3.0)
 BuildRequires: pkgconfig(libnl-route-3.0) valgrind-devel systemd systemd-devel
@@ -310,6 +311,12 @@ fi
 %{_mandir}/*
 
 %changelog
+* Tue Oct 24 2023 Ran Zhou - 41.0-16
+- Type: requirement
+- ID: NA
+- SUG: NA
+- DESC: Support flexible WQE buffer page size
+
 * Tue Sep 26 2023 Juan Zhou - 41.0-15
 - Type: requirement
 - ID: NA