代码拉取完成,页面将自动刷新
同步操作将从 src-openEuler/rdma-core 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
From 64c66455fef1c908cc8f06a2b71aa2fd71806218 Mon Sep 17 00:00:00 2001
From: Yixing Liu <liuyixing1@huawei.com>
Date: Wed, 15 Dec 2021 16:42:30 +0800
Subject: [PATCH 7/8] libhns: Add support for direct wqe
The current write wqe mechanism is to write to DDR first, and then notify
the hardware through doorbell to read the data. Direct wqe is a mechanism
to fill wqe directly into the hardware. In the case of light load, the wqe
will be filled into pcie bar space of the hardware, this will reduce one
memory access operation and therefore reduce the latency. SIMD instructions
allows cpu to write the 512 bits at one time to device memory, thus it can
be used for posting direct wqe.
The process of post send of HIP08/09:
+-----------+
| post send |
+-----+-----+
|
+-----+-----+
| write WQE |
+-----+-----+
|
| udma_to_device_barrier()
|
+-----+-----+ Y +-----------+ N
| HIP09 ? +------+ multi WR ?+-------------+
+-----+-----+ +-----+-----+ |
| N | Y |
+-----+-----+ +-----+-----+ +--------+--------+
| ring DB | | ring DB | |direct WQE (ST4) |
+-----------+ +-----------+ +-----------------+
Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
---
providers/hns/hns_roce_u.h | 5 +++-
providers/hns/hns_roce_u_hw_v2.c | 43 ++++++++++++++++++++++++++------
providers/hns/hns_roce_u_hw_v2.h | 31 +++++++++++++----------
providers/hns/hns_roce_u_verbs.c | 26 +++++++++++++++++--
util/mmio.h | 27 +++++++++++++++++++-
5 files changed, 107 insertions(+), 25 deletions(-)
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index b3f48113..37711363 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -80,6 +80,8 @@
#define INVALID_SGE_LENGTH 0x80000000
+#define HNS_ROCE_DWQE_PAGE_SIZE 65536
+
#define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF
#define HNS_ROCE_ADDRESS_SHIFT 32
@@ -279,13 +281,14 @@ struct hns_roce_qp {
struct hns_roce_sge_ex ex_sge;
unsigned int next_sge;
int port_num;
- int sl;
+ uint8_t sl;
unsigned int qkey;
enum ibv_mtu path_mtu;
struct hns_roce_rinl_buf rq_rinl_buf;
unsigned long flags;
int refcnt; /* specially used for XRC */
+ void *dwqe_page;
};
struct hns_roce_av {
diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
index 1169b64b..f102fd61 100644
--- a/providers/hns/hns_roce_u_hw_v2.c
+++ b/providers/hns/hns_roce_u_hw_v2.c
@@ -33,6 +33,7 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
+#include <sys/mman.h>
#include "hns_roce_u.h"
#include "hns_roce_u_db.h"
#include "hns_roce_u_hw_v2.h"
@@ -297,20 +298,40 @@ static void hns_roce_update_rq_db(struct hns_roce_context *ctx,
}
static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
- unsigned int qpn, unsigned int sl,
- unsigned int sq_head)
+ struct hns_roce_qp *qp)
{
struct hns_roce_db sq_db = {};
- sq_db.byte_4 = htole32(qpn);
+ sq_db.byte_4 = htole32(qp->verbs_qp.qp.qp_num);
roce_set_field(sq_db.byte_4, DB_BYTE_4_CMD_M, DB_BYTE_4_CMD_S,
HNS_ROCE_V2_SQ_DB);
- sq_db.parameter = htole32(sq_head);
- roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, sl);
+ sq_db.parameter = htole32(qp->sq.head);
+ roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, qp->sl);
hns_roce_write64(ctx->uar + ROCEE_VF_DB_CFG0_OFFSET, (__le32 *)&sq_db);
}
+static void hns_roce_write512(uint64_t *dest, uint64_t *val)
+{
+ mmio_memcpy_x64(dest, val, sizeof(struct hns_roce_rc_sq_wqe));
+}
+
+static void hns_roce_write_dwqe(struct hns_roce_qp *qp, void *wqe)
+{
+ struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe;
+
+ /* All kinds of DirectWQE have the same header field layout */
+ roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_FLAG_S, 1);
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_L_M,
+ RC_SQ_WQE_BYTE_4_DB_SL_L_S, qp->sl);
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_DB_SL_H_M,
+ RC_SQ_WQE_BYTE_4_DB_SL_H_S, qp->sl >> HNS_ROCE_SL_SHIFT);
+ roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_WQE_INDEX_M,
+ RC_SQ_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head);
+
+ hns_roce_write512(qp->dwqe_page, wqe);
+}
+
static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx,
struct hns_roce_cq *cq)
{
@@ -339,8 +360,7 @@ static struct hns_roce_qp *hns_roce_v2_find_qp(struct hns_roce_context *ctx,
return NULL;
}
-static void hns_roce_v2_clear_qp(struct hns_roce_context *ctx,
- struct hns_roce_qp *qp)
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp)
{
uint32_t qpn = qp->verbs_qp.qp.qp_num;
uint32_t tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
@@ -1196,6 +1216,7 @@ int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
break;
case IBV_QPT_UD:
ret = set_ud_wqe(wqe, qp, wr, nreq, &sge_info);
+ qp->sl = to_hr_ah(wr->wr.ud.ah)->av.sl;
break;
default:
ret = EINVAL;
@@ -1214,7 +1235,10 @@ out:
udma_to_device_barrier();
- hns_roce_update_sq_db(ctx, ibvqp->qp_num, qp->sl, qp->sq.head);
+ if (nreq == 1 && (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE))
+ hns_roce_write_dwqe(qp, wqe);
+ else
+ hns_roce_update_sq_db(ctx, qp);
if (qp->flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
*(qp->sdb) = qp->sq.head & 0xffff;
@@ -1506,6 +1530,9 @@ static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp)
if (ret)
return ret;
+ if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE)
+ munmap(qp->dwqe_page, HNS_ROCE_DWQE_PAGE_SIZE);
+
hns_roce_v2_clear_qp(ctx, qp);
hns_roce_lock_cqs(ibqp);
diff --git a/providers/hns/hns_roce_u_hw_v2.h b/providers/hns/hns_roce_u_hw_v2.h
index c13d82e3..af72cd70 100644
--- a/providers/hns/hns_roce_u_hw_v2.h
+++ b/providers/hns/hns_roce_u_hw_v2.h
@@ -40,6 +40,8 @@
#define HNS_ROCE_CMDSN_MASK 0x3
+#define HNS_ROCE_SL_SHIFT 2
+
/* V2 REG DEFINITION */
#define ROCEE_VF_DB_CFG0_OFFSET 0x0230
@@ -133,6 +135,8 @@ struct hns_roce_db {
#define DB_BYTE_4_CMD_S 24
#define DB_BYTE_4_CMD_M GENMASK(27, 24)
+#define DB_BYTE_4_FLAG_S 31
+
#define DB_PARAM_SRQ_PRODUCER_COUNTER_S 0
#define DB_PARAM_SRQ_PRODUCER_COUNTER_M GENMASK(15, 0)
@@ -216,8 +220,16 @@ struct hns_roce_rc_sq_wqe {
};
#define RC_SQ_WQE_BYTE_4_OPCODE_S 0
-#define RC_SQ_WQE_BYTE_4_OPCODE_M \
- (((1UL << 5) - 1) << RC_SQ_WQE_BYTE_4_OPCODE_S)
+#define RC_SQ_WQE_BYTE_4_OPCODE_M GENMASK(4, 0)
+
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_S 5
+#define RC_SQ_WQE_BYTE_4_DB_SL_L_M GENMASK(6, 5)
+
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_S 13
+#define RC_SQ_WQE_BYTE_4_DB_SL_H_M GENMASK(14, 13)
+
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_S 15
+#define RC_SQ_WQE_BYTE_4_WQE_INDEX_M GENMASK(30, 15)
#define RC_SQ_WQE_BYTE_4_OWNER_S 7
@@ -239,6 +251,8 @@ struct hns_roce_rc_sq_wqe {
#define RC_SQ_WQE_BYTE_4_RDMA_WRITE_S 22
+#define RC_SQ_WQE_BYTE_4_FLAG_S 31
+
#define RC_SQ_WQE_BYTE_16_XRC_SRQN_S 0
#define RC_SQ_WQE_BYTE_16_XRC_SRQN_M \
(((1UL << 24) - 1) << RC_SQ_WQE_BYTE_16_XRC_SRQN_S)
@@ -311,23 +325,12 @@ struct hns_roce_ud_sq_wqe {
#define UD_SQ_WQE_OPCODE_S 0
#define UD_SQ_WQE_OPCODE_M GENMASK(4, 0)
-#define UD_SQ_WQE_DB_SL_L_S 5
-#define UD_SQ_WQE_DB_SL_L_M GENMASK(6, 5)
-
-#define UD_SQ_WQE_DB_SL_H_S 13
-#define UD_SQ_WQE_DB_SL_H_M GENMASK(14, 13)
-
-#define UD_SQ_WQE_INDEX_S 15
-#define UD_SQ_WQE_INDEX_M GENMASK(30, 15)
-
#define UD_SQ_WQE_OWNER_S 7
#define UD_SQ_WQE_CQE_S 8
#define UD_SQ_WQE_SE_S 11
-#define UD_SQ_WQE_FLAG_S 31
-
#define UD_SQ_WQE_PD_S 0
#define UD_SQ_WQE_PD_M GENMASK(23, 0)
@@ -376,4 +379,6 @@ struct hns_roce_ud_sq_wqe {
#define MAX_SERVICE_LEVEL 0x7
+void hns_roce_v2_clear_qp(struct hns_roce_context *ctx, struct hns_roce_qp *qp);
+
#endif /* _HNS_ROCE_U_HW_V2_H */
diff --git a/providers/hns/hns_roce_u_verbs.c b/providers/hns/hns_roce_u_verbs.c
index 125858d2..fc902815 100644
--- a/providers/hns/hns_roce_u_verbs.c
+++ b/providers/hns/hns_roce_u_verbs.c
@@ -1076,7 +1076,8 @@ static int hns_roce_store_qp(struct hns_roce_context *ctx,
static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
struct hns_roce_qp *qp,
- struct hns_roce_context *ctx)
+ struct hns_roce_context *ctx,
+ uint64_t *dwqe_mmap_key)
{
struct hns_roce_create_qp_ex_resp resp_ex = {};
struct hns_roce_create_qp_ex cmd_ex = {};
@@ -1093,6 +1094,7 @@ static int qp_exec_create_cmd(struct ibv_qp_init_attr_ex *attr,
&resp_ex.ibv_resp, sizeof(resp_ex));
qp->flags = resp_ex.drv_payload.cap_flags;
+ *dwqe_mmap_key = resp_ex.drv_payload.dwqe_mmap_key;
return ret;
}
@@ -1144,11 +1146,23 @@ static int hns_roce_alloc_qp_buf(struct ibv_qp_init_attr_ex *attr,
return ret;
}
+static int mmap_dwqe(struct ibv_context *ibv_ctx, struct hns_roce_qp *qp,
+ uint64_t dwqe_mmap_key)
+{
+ qp->dwqe_page = mmap(NULL, HNS_ROCE_DWQE_PAGE_SIZE, PROT_WRITE,
+ MAP_SHARED, ibv_ctx->cmd_fd, dwqe_mmap_key);
+ if (qp->dwqe_page == MAP_FAILED)
+ return -EINVAL;
+
+ return 0;
+}
+
static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
struct ibv_qp_init_attr_ex *attr)
{
struct hns_roce_context *context = to_hr_ctx(ibv_ctx);
struct hns_roce_qp *qp;
+ uint64_t dwqe_mmap_key;
int ret;
ret = verify_qp_create_attr(context, attr);
@@ -1167,7 +1181,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
if (ret)
goto err_buf;
- ret = qp_exec_create_cmd(attr, qp, context);
+ ret = qp_exec_create_cmd(attr, qp, context, &dwqe_mmap_key);
if (ret)
goto err_cmd;
@@ -1175,10 +1189,18 @@ static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx,
if (ret)
goto err_store;
+ if (qp->flags & HNS_ROCE_QP_CAP_DIRECT_WQE) {
+ ret = mmap_dwqe(ibv_ctx, qp, dwqe_mmap_key);
+ if (ret)
+ goto err_dwqe;
+ }
+
qp_setup_config(attr, qp, context);
return &qp->verbs_qp.qp;
+err_dwqe:
+ hns_roce_v2_clear_qp(context, qp);
err_store:
ibv_cmd_destroy_qp(&qp->verbs_qp.qp);
err_cmd:
diff --git a/util/mmio.h b/util/mmio.h
index 101af9dd..01d1455e 100644
--- a/util/mmio.h
+++ b/util/mmio.h
@@ -210,8 +210,33 @@ static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
{
s390_mmio_write(dest, src, bytecnt);
}
-#else
+#elif defined(__aarch64__) || defined(__arm__)
+#include <arm_neon.h>
+
+static inline void _mmio_memcpy_x64_64b(void *dest, const void *src)
+{
+ vst4q_u64(dest, vld4q_u64(src));
+}
+
+static inline void _mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
+{
+ do {
+ _mmio_memcpy_x64_64b(dest, src);
+ bytecnt -= sizeof(uint64x2x4_t);
+ src += sizeof(uint64x2x4_t);
+ } while (bytecnt > 0);
+}
+
+#define mmio_memcpy_x64(dest, src, bytecount) \
+ ({ \
+ if (__builtin_constant_p((bytecount) == 64)) \
+ _mmio_memcpy_x64_64b((dest), (src)); \
+ else \
+ _mmio_memcpy_x64((dest), (src), (bytecount)); \
+ })
+
+#else
/* Transfer is some multiple of 64 bytes */
static inline void mmio_memcpy_x64(void *dest, const void *src, size_t bytecnt)
{
--
2.33.0
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。