diff --git a/1001-providers-erdma-Add-userspace-verbs-related-header-f.patch b/1001-providers-erdma-Add-userspace-verbs-related-header-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..aee39d0d7a2556c4eb2ffa3afe60437d827e82ad --- /dev/null +++ b/1001-providers-erdma-Add-userspace-verbs-related-header-f.patch @@ -0,0 +1,430 @@ +From b8fdbf0cd29d6630ad53e78ec4724bb85f19611d Mon Sep 17 00:00:00 2001 +Message-Id: +From: Cheng Xu +Date: Thu, 1 Dec 2022 15:38:34 +0800 +Subject: [PATCH 1/4] providers/erdma: Add userspace verbs related header + files. + +Add the userspace verbs implementation related header files: 'erdma_hw.h' +for hardware interface definitions, 'erdma_verbs.h' for verbs related +definitions and 'erdma_db.h' for doorbell records related definitions. + +Signed-off-by: Cheng Xu +--- + providers/erdma/erdma_db.h | 17 +++ + providers/erdma/erdma_hw.h | 218 ++++++++++++++++++++++++++++++++++ + providers/erdma/erdma_verbs.h | 153 ++++++++++++++++++++++++ + 3 files changed, 388 insertions(+) + create mode 100644 providers/erdma/erdma_db.h + create mode 100644 providers/erdma/erdma_hw.h + create mode 100644 providers/erdma/erdma_verbs.h + +diff --git a/providers/erdma/erdma_db.h b/providers/erdma/erdma_db.h +new file mode 100644 +index 000000000000..c302cb7ab154 +--- /dev/null ++++ b/providers/erdma/erdma_db.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file */ ++/* ++ * Authors: Cheng Xu ++ * Copyright (c) 2020-2021, Alibaba Group. ++ */ ++ ++#ifndef __ERDMA_DB_H__ ++#define __ERDMA_DB_H__ ++ ++#include ++ ++#include "erdma.h" ++ ++uint64_t *erdma_alloc_dbrecords(struct erdma_context *ctx); ++void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t *dbrecords); ++ ++#endif +diff --git a/providers/erdma/erdma_hw.h b/providers/erdma/erdma_hw.h +new file mode 100644 +index 000000000000..7f567076a549 +--- /dev/null ++++ b/providers/erdma/erdma_hw.h +@@ -0,0 +1,218 @@ ++/* SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file */ ++/* ++ * Authors: Cheng Xu ++ * Copyright (c) 2020-2021, Alibaba Group. ++ */ ++ ++#ifndef __ERDMA_HW_H__ ++#define __ERDMA_HW_H__ ++ ++#include ++ ++#define ERDMA_SDB_PAGE 0 ++#define ERDMA_SDB_ENTRY 1 ++#define ERDMA_SDB_SHARED 2 ++ ++#define ERDMA_NSDB_PER_ENTRY 2 ++#define ERDMA_SDB_ALLOC_QPN_MASK 0x1f ++#define ERDMA_RDB_ALLOC_QPN_MASK 0x7f ++ ++#define ERDMA_SQDB_SIZE 128 ++#define ERDMA_CQDB_SIZE 8 ++#define ERDMA_RQDB_SIZE 8 ++#define ERDMA_RQDB_SPACE_SIZE 32 ++ ++/* WQE related. 
*/ ++#define EQE_SIZE 16 ++#define EQE_SHIFT 4 ++#define RQE_SIZE 32 ++#define RQE_SHIFT 5 ++#define CQE_SIZE 32 ++#define CQE_SHIFT 5 ++#define SQEBB_SIZE 32 ++#define SQEBB_SHIFT 5 ++#define SQEBB_MASK (~(SQEBB_SIZE - 1)) ++#define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK) ++#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT) ++ ++#define MAX_WQEBB_PER_SQE 4 ++ ++enum erdma_opcode { ++ ERDMA_OP_WRITE = 0, ++ ERDMA_OP_READ = 1, ++ ERDMA_OP_SEND = 2, ++ ERDMA_OP_SEND_WITH_IMM = 3, ++ ++ ERDMA_OP_RECEIVE = 4, ++ ERDMA_OP_RECV_IMM = 5, ++ ERDMA_OP_RECV_INV = 6, ++ ++ ERDMA_OP_REQ_ERR = 7, ++ ERDNA_OP_READ_RESPONSE = 8, ++ ERDMA_OP_WRITE_WITH_IMM = 9, ++ ++ ERDMA_OP_RECV_ERR = 10, ++ ++ ERDMA_OP_INVALIDATE = 11, ++ ERDMA_OP_RSP_SEND_IMM = 12, ++ ERDMA_OP_SEND_WITH_INV = 13, ++ ++ ERDMA_OP_REG_MR = 14, ++ ERDMA_OP_LOCAL_INV = 15, ++ ERDMA_OP_READ_WITH_INV = 16, ++ ERDMA_OP_ATOMIC_CAS = 17, ++ ERDMA_OP_ATOMIC_FAD = 18, ++ ERDMA_NUM_OPCODES = 19, ++ ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 ++}; ++ ++/* ++ * Inline data are kept within the work request itself occupying ++ * the space of sge[1] .. sge[n]. Therefore, inline data cannot be ++ * supported if ERDMA_MAX_SGE is below 2 elements. ++ */ ++#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE)) ++ ++enum erdma_wc_status { ++ ERDMA_WC_SUCCESS = 0, ++ ERDMA_WC_GENERAL_ERR = 1, ++ ERDMA_WC_RECV_WQE_FORMAT_ERR = 2, ++ ERDMA_WC_RECV_STAG_INVALID_ERR = 3, ++ ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4, ++ ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5, ++ ERDMA_WC_RECV_PDID_ERR = 6, ++ ERDMA_WC_RECV_WARRPING_ERR = 7, ++ ERDMA_WC_SEND_WQE_FORMAT_ERR = 8, ++ ERDMA_WC_SEND_WQE_ORD_EXCEED = 9, ++ ERDMA_WC_SEND_STAG_INVALID_ERR = 10, ++ ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11, ++ ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12, ++ ERDMA_WC_SEND_PDID_ERR = 13, ++ ERDMA_WC_SEND_WARRPING_ERR = 14, ++ ERDMA_WC_FLUSH_ERR = 15, ++ ERDMA_WC_RETRY_EXC_ERR = 16, ++ ERDMA_NUM_WC_STATUS ++}; ++ ++enum erdma_vendor_err { ++ ERDMA_WC_VENDOR_NO_ERR = 0, ++ ERDMA_WC_VENDOR_INVALID_RQE = 1, ++ ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2, ++ ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3, ++ ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4, ++ ERDMA_WC_VENDOR_RQE_INVALID_PD = 5, ++ ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6, ++ ERDMA_WC_VENDOR_INVALID_SQE = 0x20, ++ ERDMA_WC_VENDOR_ZERO_ORD = 0x21, ++ ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30, ++ ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31, ++ ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32, ++ ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33, ++ ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34 ++}; ++ ++/* Doorbell related. 
*/ ++#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56) ++#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32) ++#define ERDMA_CQDB_ARM_MASK BIT_ULL(31) ++#define ERDMA_CQDB_SOL_MASK BIT_ULL(30) ++#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28) ++#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0) ++ ++#define ERDMA_CQE_QTYPE_SQ 0 ++#define ERDMA_CQE_QTYPE_RQ 1 ++#define ERDMA_CQE_QTYPE_CMDQ 2 ++ ++/* CQE hdr */ ++#define ERDMA_CQE_HDR_OWNER_MASK BIT(31) ++#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16) ++#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8) ++#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0) ++ ++struct erdma_cqe { ++ __be32 hdr; ++ __be32 qe_idx; ++ __be32 qpn; ++ __le32 imm_data; ++ __be32 size; ++ __be32 rsvd[3]; ++}; ++ ++struct erdma_sge { ++ __aligned_le64 laddr; ++ __le32 length; ++ __le32 lkey; ++}; ++ ++/* Receive Queue Element */ ++struct erdma_rqe { ++ __le16 qe_idx; ++ __le16 rsvd; ++ __le32 qpn; ++ __le32 rsvd2; ++ __le32 rsvd3; ++ __le64 to; ++ __le32 length; ++ __le32 stag; ++}; ++ ++/* SQE */ ++#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56) ++#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) ++#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32) ++#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27) ++#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26) ++#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25) ++#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24) ++#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23) ++#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22) ++#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) ++ ++struct erdma_write_sqe { ++ __le64 hdr; ++ __be32 imm_data; ++ __le32 length; ++ ++ __le32 sink_stag; ++ /* avoid sink_to not 8-byte aligned. */ ++ __le32 sink_to_low; ++ __le32 sink_to_high; ++ ++ __le32 rsvd; ++ ++ struct erdma_sge sgl[0]; ++}; ++ ++struct erdma_send_sqe { ++ __le64 hdr; ++ union { ++ __be32 imm_data; ++ __le32 invalid_stag; ++ }; ++ __le32 length; ++ struct erdma_sge sgl[0]; ++}; ++ ++struct erdma_readreq_sqe { ++ __le64 hdr; ++ __le32 invalid_stag; ++ __le32 length; ++ __le32 sink_stag; ++ /* avoid sink_to not 8-byte aligned. */ ++ __le32 sink_to_low; ++ __le32 sink_to_high; ++ __le32 rsvd0; ++ struct erdma_sge sgl; ++}; ++ ++struct erdma_atomic_sqe { ++ __le64 hdr; ++ __le64 rsvd; ++ __le64 fetchadd_swap_data; ++ __le64 cmp_data; ++ ++ struct erdma_sge remote; ++ struct erdma_sge sgl; ++}; ++ ++#endif +diff --git a/providers/erdma/erdma_verbs.h b/providers/erdma/erdma_verbs.h +new file mode 100644 +index 000000000000..d9820c5a20d5 +--- /dev/null ++++ b/providers/erdma/erdma_verbs.h +@@ -0,0 +1,153 @@ ++/* SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file */ ++/* ++ * Authors: Cheng Xu ++ * Copyright (c) 2020-2021, Alibaba Group. 
++ */ ++ ++#ifndef __ERDMA_VERBS_H__ ++#define __ERDMA_VERBS_H__ ++ ++#include ++#include ++#include ++ ++#include "erdma.h" ++#include "erdma_hw.h" ++ ++#define ERDMA_MAX_SEND_SGE 6 ++#define ERDMA_MAX_RECV_SGE 1 ++ ++struct erdma_queue { ++ void *qbuf; ++ void *db; ++ ++ uint16_t rsvd0; ++ uint16_t depth; ++ uint32_t size; ++ ++ uint16_t pi; ++ uint16_t ci; ++ ++ uint32_t rsvd1; ++ uint64_t *wr_tbl; ++ ++ void *db_record; ++}; ++ ++struct erdma_qp { ++ struct verbs_qp verbs_qp; ++ struct erdma_device *erdma_dev; ++ ++ uint32_t id; /* qpn */ ++ ++ pthread_spinlock_t sq_lock; ++ pthread_spinlock_t rq_lock; ++ ++ int sq_sig_all; ++ int disable_dwqe; ++ pthread_spinlock_t *sdb_lock; ++ ++ struct erdma_queue sq; ++ struct erdma_queue rq; ++ ++ void *qbuf; ++ size_t qbuf_size; ++ uint64_t *db_records; ++ ++ void *cur_sqe; ++ uint16_t cur_pi; ++ uint16_t sq_pi_rb; ++ uint32_t sgl_off; /* SGL offset in wqebb. */ ++ __le32 *length_field; /* The total length field in SQE header */ ++ int err; ++}; ++ ++struct erdma_cq { ++ struct ibv_cq base_cq; ++ struct erdma_device *erdma_dev; ++ uint32_t id; ++ ++ uint32_t event_stats; ++ ++ uint32_t depth; ++ uint32_t ci; ++ struct erdma_cqe *queue; ++ ++ void *db; ++ uint16_t db_offset; ++ ++ void *db_record; ++ uint32_t cmdsn; ++ uint32_t comp_vector; ++ uint32_t db_index; ++ ++ pthread_spinlock_t lock; ++}; ++ ++static inline struct erdma_cq *to_ecq(struct ibv_cq *base) ++{ ++ return container_of(base, struct erdma_cq, base_cq); ++} ++ ++static inline struct erdma_qp *to_eqp(struct ibv_qp *ibqp) ++{ ++ return container_of(ibqp, struct erdma_qp, verbs_qp.qp); ++} ++ ++static inline void *get_sq_wqebb(struct erdma_qp *qp, uint16_t idx) ++{ ++ idx &= (qp->sq.depth - 1); ++ return qp->sq.qbuf + (idx << SQEBB_SHIFT); ++} ++ ++static inline void __kick_sq_db(struct erdma_qp *qp, uint16_t pi) ++{ ++ uint64_t db_data; ++ ++ db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, qp->id) | ++ FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); ++ ++ *(__le64 *)qp->sq.db_record = htole64(db_data); ++ udma_to_device_barrier(); ++ mmio_write64_le(qp->sq.db, htole64(db_data)); ++} ++ ++struct ibv_pd *erdma_alloc_pd(struct ibv_context *ctx); ++int erdma_free_pd(struct ibv_pd *pd); ++ ++int erdma_query_device(struct ibv_context *ctx, ++ const struct ibv_query_device_ex_input *input, ++ struct ibv_device_attr_ex *attr, size_t attr_size); ++int erdma_query_port(struct ibv_context *ctx, uint8_t port, ++ struct ibv_port_attr *attr); ++ ++struct ibv_mr *erdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len, ++ uint64_t hca_va, int access); ++int erdma_dereg_mr(struct verbs_mr *vmr); ++ ++struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, ++ struct ibv_qp_init_attr *attr); ++struct ibv_qp *erdma_create_qp_ex(struct ibv_context *context, ++ struct ibv_qp_init_attr_ex *attr); ++int erdma_modify_qp(struct ibv_qp *base_qp, struct ibv_qp_attr *attr, ++ int attr_mask); ++int erdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, ++ struct ibv_qp_init_attr *init_attr); ++int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr, ++ struct ibv_send_wr **bad_wr); ++int erdma_post_recv(struct ibv_qp *base_qp, struct ibv_recv_wr *wr, ++ struct ibv_recv_wr **bad_wr); ++int erdma_destroy_qp(struct ibv_qp *base_qp); ++ ++void erdma_free_context(struct ibv_context *ibv_ctx); ++ ++struct ibv_cq *erdma_create_cq(struct ibv_context *ctx, int num_cqe, ++ struct ibv_comp_channel *channel, ++ int comp_vector); ++int erdma_destroy_cq(struct ibv_cq *base_cq); ++int 
erdma_notify_cq(struct ibv_cq *ibcq, int solicited); ++void erdma_cq_event(struct ibv_cq *ibcq); ++int erdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc); ++ ++ ++#endif +-- +2.37.0 + diff --git a/1002-providers-erdma-Add-userspace-verbs-implementation.patch b/1002-providers-erdma-Add-userspace-verbs-implementation.patch new file mode 100644 index 0000000000000000000000000000000000000000..527852507ad335fc378fbfc165fedd76d15b4aff --- /dev/null +++ b/1002-providers-erdma-Add-userspace-verbs-implementation.patch @@ -0,0 +1,1656 @@ +From 08f8e55dde433e81ce241003835cdb29dd2708cf Mon Sep 17 00:00:00 2001 +Message-Id: <08f8e55dde433e81ce241003835cdb29dd2708cf.1669880730.git.chengyou@linux.alibaba.com> +In-Reply-To: +References: +From: Cheng Xu +Date: Thu, 1 Dec 2022 15:39:02 +0800 +Subject: [PATCH 2/4] providers/erdma: Add userspace verbs implementation + +Implementation of the erdma's 'struct verbs_context_ops' interface. +Due to doorbells may be drop by hardware in some situations, such as +hardware hot-upgrade, driver will keep the latest doorbell value of each +QP and CQ. So we introduce the doorbell records to store the latest +doorbell values also. + +Signed-off-by: Cheng Xu +--- + providers/erdma/erdma_db.c | 115 +++ + providers/erdma/erdma_verbs.c | 1504 +++++++++++++++++++++++++++++++++ + 2 files changed, 1619 insertions(+) + create mode 100644 providers/erdma/erdma_db.c + create mode 100644 providers/erdma/erdma_verbs.c + +diff --git a/providers/erdma/erdma_db.c b/providers/erdma/erdma_db.c +new file mode 100644 +index 000000000000..382262da5f48 +--- /dev/null ++++ b/providers/erdma/erdma_db.c +@@ -0,0 +1,115 @@ ++// SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file ++ ++// Authors: Cheng Xu ++// Copyright (c) 2020-2021, Alibaba Group. ++ ++// Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved. 
++ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++ ++#include "erdma.h" ++#include "erdma_db.h" ++ ++#define ERDMA_DBRECORDS_SIZE 16 ++ ++struct erdma_dbrecord_page { ++ struct erdma_dbrecord_page *prev, *next; ++ void *page_buf; ++ int cnt; ++ int used; ++ unsigned long free[0]; ++}; ++ ++uint64_t *erdma_alloc_dbrecords(struct erdma_context *ctx) ++{ ++ int bits_perlong = (8 * sizeof(unsigned long)); ++ struct erdma_dbrecord_page *page = NULL; ++ int dbrecords_per_page, nlongs = 0; ++ uint64_t *db_records = NULL; ++ int i, j, rv; ++ ++ pthread_mutex_lock(&ctx->dbrecord_pages_mutex); ++ ++ for (page = ctx->dbrecord_pages; page; page = page->next) ++ if (page->used < page->cnt) ++ goto found; ++ ++ dbrecords_per_page = ctx->page_size / ERDMA_DBRECORDS_SIZE; ++ nlongs = align(dbrecords_per_page, bits_perlong) / bits_perlong; ++ page = malloc(sizeof(*page) + nlongs * sizeof(unsigned long)); ++ if (!page) ++ goto out; ++ ++ rv = posix_memalign(&page->page_buf, ctx->page_size, ctx->page_size); ++ if (rv) { ++ free(page); ++ goto out; ++ } ++ ++ page->cnt = dbrecords_per_page; ++ page->used = 0; ++ for (i = 0; i < nlongs; i++) ++ page->free[i] = ~0UL; ++ ++ page->prev = NULL; ++ page->next = ctx->dbrecord_pages; ++ ctx->dbrecord_pages = page; ++ if (page->next) ++ page->next->prev = page; ++ ++found: ++ ++page->used; ++ ++ for (i = 0; !page->free[i]; ++i) ++ ; /* nothing */ ++ ++ j = ffsl(page->free[i]) - 1; ++ page->free[i] &= ~(1UL << j); ++ ++ db_records = ++ page->page_buf + (i * bits_perlong + j) * ERDMA_DBRECORDS_SIZE; ++ ++out: ++ pthread_mutex_unlock(&ctx->dbrecord_pages_mutex); ++ ++ return db_records; ++} ++ ++void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t *dbrecords) ++{ ++ struct erdma_dbrecord_page *page; ++ int page_mask = ~(ctx->page_size - 1); ++ int idx; ++ ++ pthread_mutex_lock(&ctx->dbrecord_pages_mutex); ++ for (page = ctx->dbrecord_pages; page; page = page->next) ++ if (((uintptr_t)dbrecords & page_mask) == ++ (uintptr_t)page->page_buf) ++ break; ++ ++ if (!page) ++ goto out; ++ ++ idx = ((void *)dbrecords - page->page_buf) / ERDMA_DBRECORDS_SIZE; ++ page->free[idx / (8 * sizeof(unsigned long))] |= ++ 1UL << (idx % (8 * sizeof(unsigned long))); ++ ++ if (!--page->used) { ++ if (page->prev) ++ page->prev->next = page->next; ++ else ++ ctx->dbrecord_pages = page->next; ++ if (page->next) ++ page->next->prev = page->prev; ++ ++ free(page->page_buf); ++ free(page); ++ } ++ ++out: ++ pthread_mutex_unlock(&ctx->dbrecord_pages_mutex); ++} +diff --git a/providers/erdma/erdma_verbs.c b/providers/erdma/erdma_verbs.c +new file mode 100644 +index 000000000000..d665e0401b60 +--- /dev/null ++++ b/providers/erdma/erdma_verbs.c +@@ -0,0 +1,1504 @@ ++// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause ++ ++// Authors: Cheng Xu ++// Copyright (c) 2020-2021, Alibaba Group. 
++// Authors: Bernard Metzler ++// Copyright (c) 2008-2019, IBM Corporation ++ ++#include ++#include ++#ifdef HAVE_AVX_SUPPORT ++#include ++#else ++#define _mm256_store_si256(a, b) fprintf(stderr, "not supported") ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "erdma.h" ++#include "erdma_abi.h" ++#include "erdma_db.h" ++#include "erdma_hw.h" ++#include "erdma_verbs.h" ++ ++int erdma_query_device(struct ibv_context *ctx, ++ const struct ibv_query_device_ex_input *input, ++ struct ibv_device_attr_ex *attr, size_t attr_size) ++{ ++ struct ib_uverbs_ex_query_device_resp resp; ++ unsigned int major, minor, sub_minor; ++ size_t resp_size = sizeof(resp); ++ uint64_t raw_fw_ver; ++ int rv; ++ ++ rv = ibv_cmd_query_device_any(ctx, input, attr, attr_size, &resp, ++ &resp_size); ++ if (rv) ++ return rv; ++ ++ raw_fw_ver = resp.base.fw_ver; ++ major = (raw_fw_ver >> 32) & 0xffff; ++ minor = (raw_fw_ver >> 16) & 0xffff; ++ sub_minor = raw_fw_ver & 0xffff; ++ ++ snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver), ++ "%d.%d.%d", major, minor, sub_minor); ++ ++ return 0; ++} ++ ++int erdma_query_port(struct ibv_context *ctx, uint8_t port, ++ struct ibv_port_attr *attr) ++{ ++ struct ibv_query_port cmd = {}; ++ ++ return ibv_cmd_query_port(ctx, port, attr, &cmd, sizeof(cmd)); ++} ++ ++int erdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, ++ struct ibv_qp_init_attr *init_attr) ++{ ++ struct ibv_query_qp cmd = {}; ++ ++ return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, ++ sizeof(cmd)); ++} ++ ++struct ibv_pd *erdma_alloc_pd(struct ibv_context *ctx) ++{ ++ struct ib_uverbs_alloc_pd_resp resp; ++ struct ibv_alloc_pd cmd = {}; ++ struct ibv_pd *pd; ++ ++ pd = calloc(1, sizeof(*pd)); ++ if (!pd) ++ return NULL; ++ ++ if (ibv_cmd_alloc_pd(ctx, pd, &cmd, sizeof(cmd), &resp, sizeof(resp))) { ++ free(pd); ++ return NULL; ++ } ++ ++ return pd; ++} ++ ++int erdma_free_pd(struct ibv_pd *pd) ++{ ++ int rv; ++ ++ rv = ibv_cmd_dealloc_pd(pd); ++ if (rv) ++ return rv; ++ ++ free(pd); ++ return 0; ++} ++ ++struct ibv_mr *erdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len, ++ uint64_t hca_va, int access) ++{ ++ struct ib_uverbs_reg_mr_resp resp; ++ struct ibv_reg_mr cmd; ++ struct verbs_mr *vmr; ++ int ret; ++ ++ vmr = calloc(1, sizeof(*vmr)); ++ if (!vmr) ++ return NULL; ++ ++ ret = ibv_cmd_reg_mr(pd, addr, len, hca_va, access, vmr, &cmd, ++ sizeof(cmd), &resp, sizeof(resp)); ++ if (ret) { ++ free(vmr); ++ return NULL; ++ } ++ ++ return &vmr->ibv_mr; ++} ++ ++int erdma_dereg_mr(struct verbs_mr *vmr) ++{ ++ int ret; ++ ++ ret = ibv_cmd_dereg_mr(vmr); ++ if (ret) ++ return ret; ++ ++ free(vmr); ++ return 0; ++} ++ ++int erdma_notify_cq(struct ibv_cq *ibcq, int solicited) ++{ ++ struct erdma_cq *cq = to_ecq(ibcq); ++ uint64_t db_data; ++ int ret; ++ ++ ret = pthread_spin_lock(&cq->lock); ++ if (ret) ++ return ret; ++ ++ db_data = FIELD_PREP(ERDMA_CQDB_IDX_MASK, cq->db_index) | ++ FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->id) | ++ FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | ++ FIELD_PREP(ERDMA_CQDB_SOL_MASK, solicited) | ++ FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->cmdsn) | ++ FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->ci); ++ ++ *(__le64 *)cq->db_record = htole64(db_data); ++ cq->db_index++; ++ udma_to_device_barrier(); ++ mmio_write64_le(cq->db, htole64(db_data)); ++ ++ pthread_spin_unlock(&cq->lock); ++ ++ return ret; ++} ++ ++struct ibv_cq *erdma_create_cq(struct ibv_context *ctx, int num_cqe, ++ struct ibv_comp_channel *channel, ++ 
int comp_vector) ++{ ++ struct erdma_context *ectx = to_ectx(ctx); ++ struct erdma_cmd_create_cq_resp resp = {}; ++ struct erdma_cmd_create_cq cmd = {}; ++ uint64_t *db_records = NULL; ++ struct erdma_cq *cq; ++ size_t cq_size; ++ int rv; ++ ++ cq = calloc(1, sizeof(*cq)); ++ if (!cq) ++ return NULL; ++ ++ if (num_cqe < 64) ++ num_cqe = 64; ++ ++ num_cqe = roundup_pow_of_two(num_cqe); ++ cq_size = align(num_cqe * sizeof(struct erdma_cqe), ERDMA_PAGE_SIZE); ++ ++ rv = posix_memalign((void **)&cq->queue, ERDMA_PAGE_SIZE, cq_size); ++ if (rv) { ++ errno = rv; ++ free(cq); ++ return NULL; ++ } ++ ++ rv = ibv_dontfork_range(cq->queue, cq_size); ++ if (rv) { ++ errno = rv; ++ free(cq->queue); ++ cq->queue = NULL; ++ goto error_alloc; ++ } ++ ++ memset(cq->queue, 0, cq_size); ++ ++ db_records = erdma_alloc_dbrecords(ectx); ++ if (!db_records) { ++ errno = ENOMEM; ++ goto error_alloc; ++ } ++ ++ cmd.db_record_va = (uintptr_t)db_records; ++ cmd.qbuf_va = (uintptr_t)cq->queue; ++ cmd.qbuf_len = cq_size; ++ ++ rv = ibv_cmd_create_cq(ctx, num_cqe, channel, comp_vector, &cq->base_cq, ++ &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, ++ sizeof(resp)); ++ if (rv) { ++ errno = EIO; ++ goto error_alloc; ++ } ++ ++ pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); ++ ++ *db_records = 0; ++ cq->db_record = db_records; ++ ++ cq->id = resp.cq_id; ++ cq->depth = resp.num_cqe; ++ ++ cq->db = ectx->cdb; ++ cq->db_offset = (cq->id & (ERDMA_PAGE_SIZE / ERDMA_CQDB_SIZE - 1)) * ++ ERDMA_CQDB_SIZE; ++ cq->db += cq->db_offset; ++ ++ cq->comp_vector = comp_vector; ++ ++ return &cq->base_cq; ++ ++error_alloc: ++ if (db_records) ++ erdma_dealloc_dbrecords(ectx, db_records); ++ ++ if (cq->queue) { ++ ibv_dofork_range(cq->queue, cq_size); ++ free(cq->queue); ++ } ++ ++ free(cq); ++ ++ return NULL; ++} ++ ++int erdma_destroy_cq(struct ibv_cq *base_cq) ++{ ++ struct erdma_context *ctx = to_ectx(base_cq->context); ++ struct erdma_cq *cq = to_ecq(base_cq); ++ int rv; ++ ++ pthread_spin_lock(&cq->lock); ++ rv = ibv_cmd_destroy_cq(base_cq); ++ if (rv) { ++ pthread_spin_unlock(&cq->lock); ++ errno = EIO; ++ return rv; ++ } ++ pthread_spin_destroy(&cq->lock); ++ ++ if (cq->db_record) ++ erdma_dealloc_dbrecords(ctx, cq->db_record); ++ ++ if (cq->queue) { ++ ibv_dofork_range(cq->queue, cq->depth << CQE_SHIFT); ++ free(cq->queue); ++ } ++ ++ free(cq); ++ ++ return 0; ++} ++ ++static inline void kick_hw_sqe(struct erdma_qp *qp, uint16_t pi, ++ uint32_t wqebb_cnt) ++{ ++ uint16_t idx = pi & (qp->sq.depth - 1); ++ void *sqe = get_sq_wqebb(qp, idx); ++ uint32_t i; ++ ++ pthread_spin_lock(qp->sdb_lock); ++ ++ *(__le64 *)qp->sq.db_record = htole64(*(uint64_t *)sqe); ++ ++ udma_to_device_barrier(); ++ ++ for (i = 0; i < wqebb_cnt; i++) { ++ _mm256_store_si256(qp->sq.db + (i << 5), ++ _mm256_load_si256(sqe)); ++ sqe = get_sq_wqebb(qp, idx + i + 1); ++ } ++ ++ mmio_flush_writes(); ++ ++ pthread_spin_unlock(qp->sdb_lock); ++} ++ ++enum { ++ ERDMA_SUPPORTED_SEND_OPS_FLAGS_RC = ++ IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_INV | ++ IBV_QP_EX_WITH_SEND_WITH_IMM | IBV_QP_EX_WITH_RDMA_WRITE | ++ IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | IBV_QP_EX_WITH_RDMA_READ | ++ IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP | ++ IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD, ++}; ++ ++static void erdma_wr_start(struct ibv_qp_ex *ibqp) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ ++ if (ibqp->qp_base.state == IBV_QPS_ERR) { ++ qp->err = -EIO; ++ return; ++ } ++ ++ pthread_spin_lock(&qp->sq_lock); ++ ++ qp->err = 0; ++ qp->sq_pi_rb = qp->sq.pi; ++} ++ ++static void 
erdma_wr_abort(struct ibv_qp_ex *ibqp) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ ++ /* Rolling back */ ++ qp->sq.pi = qp->sq_pi_rb; ++ ++ pthread_spin_unlock(&qp->sq_lock); ++} ++ ++static int erdma_wr_complete(struct ibv_qp_ex *ibqp) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ int err = qp->err; ++ ++ if (unlikely(err)) { ++ qp->sq.pi = qp->sq_pi_rb; ++ goto out; ++ } ++ ++ __kick_sq_db(qp, qp->sq.pi); /* normal doorbell. */ ++ ++out: ++ pthread_spin_unlock(&qp->sq_lock); ++ ++ return err; ++} ++ ++static void *erdma_init_sqe(struct erdma_qp *qp, uint64_t wr_id, ++ unsigned int wr_flags, enum erdma_opcode opcode) ++{ ++ uint16_t sq_pi = qp->sq.pi; ++ uint64_t sqe_hdr; ++ void *sqe; ++ ++ if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) { ++ qp->err = ENOMEM; ++ qp->cur_sqe = NULL; ++ return NULL; ++ } ++ ++ sqe = get_sq_wqebb(qp, sq_pi); ++ ++ /* Clear the first 8Byte of the wqe hdr. */ ++ *(uint64_t *)sqe = 0; ++ qp->sq.wr_tbl[sq_pi & (qp->sq.depth - 1)] = wr_id; ++ ++ sqe_hdr = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, qp->id); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, ++ wr_flags & IBV_SEND_SIGNALED ? 1 : 0); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, qp->sq_sig_all); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, ++ wr_flags & IBV_SEND_SOLICITED ? 1 : 0); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, ++ wr_flags & IBV_SEND_FENCE ? 1 : 0); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, ++ wr_flags & IBV_SEND_INLINE ? 1 : 0); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); ++ ++ *(__le64 *)sqe = htole64(sqe_hdr); ++ qp->cur_sqe = sqe; ++ ++ return sqe; ++} ++ ++static void erdma_wr_send(struct ibv_qp_ex *ibqp) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ ++ if (!erdma_init_sqe(qp, ibqp->wr_id, ibqp->wr_flags, ERDMA_OP_SEND)) ++ return; ++ ++ qp->cur_pi = qp->sq.pi; ++ qp->sgl_off = 16; ++} ++ ++static void erdma_wr_rdma_write(struct ibv_qp_ex *ibqp, uint32_t rkey, ++ uint64_t raddr) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ struct erdma_write_sqe *sqe; ++ ++ sqe = erdma_init_sqe(qp, ibqp->wr_id, ibqp->wr_flags, ERDMA_OP_WRITE); ++ if (!sqe) ++ return; ++ ++ qp->cur_pi = qp->sq.pi + 1; ++ qp->sgl_off = 0; ++ sqe->sink_to_low = htole32(raddr & 0xffffffff); ++ sqe->sink_to_high = htole32((raddr >> 32) & 0xffffffff); ++ sqe->sink_stag = htole32(rkey); ++} ++ ++static void erdma_wr_set_sge(struct ibv_qp_ex *ibqp, uint32_t lkey, ++ uint64_t addr, uint32_t length) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ uint64_t *sqe_hdr = qp->cur_sqe; ++ struct erdma_sge *sge; ++ uint32_t wqebb_cnt; ++ ++ sge = get_sq_wqebb(qp, qp->cur_pi) + qp->sgl_off; ++ sge->laddr = addr; ++ sge->lkey = lkey; ++ sge->length = length; ++ ++ if (qp->length_field) ++ *qp->length_field = htole32(length); ++ ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, 1); ++ ++ /* ++ * This is the last SQE in SQE, ++ * so anyhow we need to move the cursor to next sqebb. 
++ */ ++ qp->cur_pi++; ++ wqebb_cnt = qp->cur_pi - qp->sq.pi; ++ ++ assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, qp->cur_pi); ++ ++ qp->sq.pi = qp->cur_pi; ++} ++ ++static void erdma_wr_set_sge_list(struct ibv_qp_ex *ibqp, size_t num_sge, ++ const struct ibv_sge *sg_list) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ uint64_t *sqe_hdr = qp->cur_sqe; ++ uint32_t wqebb_cnt, sqebb_half; ++ struct erdma_sge *sge; ++ uint32_t i, bytes = 0; ++ ++ sge = get_sq_wqebb(qp, qp->cur_pi) + qp->sgl_off; ++ ++ if (num_sge > ERDMA_MAX_SEND_SGE) { ++ qp->err = EINVAL; ++ return; ++ } ++ ++ sqebb_half = !!qp->sgl_off; ++ ++ for (i = 0; i < num_sge; i++) { ++ sge->laddr = sg_list[i].addr; ++ sge->lkey = sg_list[i].lkey; ++ sge->length = sg_list[i].length; ++ bytes += sge->length; ++ ++ if (sqebb_half) { ++ qp->cur_pi++; ++ sge = get_sq_wqebb(qp, qp->cur_pi); ++ } else { ++ *((uint32_t *)sge + 7) = qp->id; ++ sge++; ++ } ++ ++ sqebb_half = !sqebb_half; ++ } ++ ++ qp->cur_pi += sqebb_half; ++ ++ if (qp->length_field) ++ *qp->length_field = htole32(bytes); ++ ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, num_sge); ++ ++ wqebb_cnt = qp->cur_pi - qp->sq.pi; ++ assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, qp->cur_pi); ++ ++ qp->sq.pi = qp->cur_pi; ++} ++ ++static size_t free_space_to_tail(struct erdma_qp *qp, uint16_t cur_idx) ++{ ++ cur_idx &= qp->sq.depth - 1; ++ ++ return (qp->sq.depth - cur_idx) << SQEBB_SHIFT; ++} ++ ++static void erdma_wr_set_inline_data(struct ibv_qp_ex *ibqp, void *addr, ++ size_t length) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ uint64_t *sqe_hdr = qp->cur_sqe; ++ uint32_t wqebb_cnt; ++ size_t space; ++ void *data; ++ ++ if (length > ERDMA_MAX_INLINE) { ++ qp->err = EINVAL; ++ return; ++ } ++ ++ data = get_sq_wqebb(qp, qp->cur_pi) + qp->sgl_off; ++ ++ if (qp->length_field) ++ *qp->length_field = htole32(length); ++ ++ space = free_space_to_tail(qp, qp->cur_pi) - qp->sgl_off; ++ ++ if (space >= length) { ++ memcpy(data, addr, length); ++ } else { ++ memcpy(data, addr, length - space); ++ memcpy(qp->sq.qbuf, addr + length - space, space); ++ } ++ ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, length); ++ ++ /* ++ * This is the last SQE in SQE, ++ * so anyhow we need to move the cursor to next free sqebb. 
++ */ ++ qp->cur_pi += SQEBB_COUNT(qp->sgl_off + length); ++ wqebb_cnt = qp->cur_pi - qp->sq.pi; ++ ++ assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, qp->cur_pi); ++ ++ qp->sq.pi = qp->cur_pi; ++} ++ ++static void erdma_wr_set_inline_data_list(struct ibv_qp_ex *ibqp, ++ size_t num_buf, ++ const struct ibv_data_buf *buf_list) ++{ ++ struct erdma_qp *qp = to_eqp(&ibqp->qp_base); ++ size_t space, total_size = 0, i; ++ uint64_t *sqe_hdr = qp->cur_sqe; ++ uint32_t wqebb_cnt; ++ void *data; ++ ++ data = get_sq_wqebb(qp, qp->cur_pi) + qp->sgl_off; ++ ++ space = free_space_to_tail(qp, qp->cur_pi) - qp->sgl_off; ++ for (i = 0; i < num_buf; i++) ++ total_size += buf_list[i].length; ++ ++ if (total_size > ERDMA_MAX_INLINE) { ++ qp->err = EINVAL; ++ return; ++ } ++ ++ if (qp->length_field) ++ *qp->length_field = htole32(total_size); ++ ++ if (space >= total_size) { ++ for (i = 0; i < num_buf; i++) { ++ memcpy(data, buf_list[i].addr, buf_list[i].length); ++ data += buf_list[i].length; ++ } ++ } else { ++ for (i = 0; i < num_buf; i++) { ++ if (space > buf_list[i].length) { ++ memcpy(data, buf_list[i].addr, ++ buf_list[i].length); ++ data += buf_list[i].length; ++ space -= buf_list[i].length; ++ } else { ++ if (space != 0) ++ memcpy(data, buf_list[i].addr, space); ++ ++ data = get_sq_wqebb(qp, 0); ++ memcpy(data, buf_list[i].addr + space, ++ buf_list[i].length - space); ++ data += buf_list[i].length - space; ++ space = free_space_to_tail(qp, 0) - ++ buf_list[i].length - space; ++ } ++ } ++ } ++ ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, total_size); ++ ++ /* ++ * This is the last SQE in SQE, ++ * so anyhow we need to move the cursor to next free sqebb. 
++ */ ++ qp->cur_pi += SQEBB_COUNT(qp->sgl_off + total_size); ++ wqebb_cnt = qp->cur_pi - qp->sq.pi; ++ ++ assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); ++ *sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, qp->cur_pi); ++ ++ qp->sq.pi = qp->cur_pi; ++} ++ ++static int erdma_fill_wr_ops(struct erdma_qp *eqp, ++ const struct ibv_qp_init_attr_ex *attr) ++{ ++ struct ibv_qp_ex *ibqp_ex = &eqp->verbs_qp.qp_ex; ++ ++ ibqp_ex->wr_start = erdma_wr_start; ++ ibqp_ex->wr_complete = erdma_wr_complete; ++ ibqp_ex->wr_abort = erdma_wr_abort; ++ ++ if (attr->send_ops_flags & ~ERDMA_SUPPORTED_SEND_OPS_FLAGS_RC) ++ return -EOPNOTSUPP; ++ ++ ibqp_ex->wr_send = erdma_wr_send; ++ // ibqp->wr_send_imm = erdma_send_wr_send_imm; ++ // ibqp->wr_send_inv = erdma_send_wr_send_inv; ++ ibqp_ex->wr_rdma_write = erdma_wr_rdma_write; ++ // ibqp->wr_rdma_write_imm = erdma_send_wr_rdma_write_imm; ++ // ibqp->wr_rdma_read = erdma_send_wr_rdma_read; ++ // ibqp->wr_atomic_cmp_swp = erdma_send_wr_atomic_cmp_swp; ++ // ibqp->wr_atomic_fetch_add = erdma_send_wr_atomic_fetch_add; ++ // ibqp->wr_bind_mw = erdma_send_wr_bind_mw; ++ // ibqp->wr_local_inv = erdma_send_wr_local_inv; ++ ibqp_ex->wr_set_sge = erdma_wr_set_sge; ++ ibqp_ex->wr_set_sge_list = erdma_wr_set_sge_list; ++ ibqp_ex->wr_set_inline_data = erdma_wr_set_inline_data; ++ ibqp_ex->wr_set_inline_data_list = erdma_wr_set_inline_data_list; ++ ++ return 0; ++} ++ ++static void __erdma_alloc_dbs(struct erdma_qp *qp, struct erdma_context *ctx) ++{ ++ uint32_t qpn = qp->id; ++ ++ if (ctx->sdb_type == ERDMA_SDB_PAGE) { ++ qp->disable_dwqe = 0; ++ qp->sq.db = ctx->sdb + (qpn & 31) * 128; ++ qp->sdb_lock = &ctx->sdb_lock[qpn & 31].lock; ++ } else if (ctx->sdb_type == ERDMA_SDB_ENTRY) { ++ qp->disable_dwqe = 0; ++ ++ qp->sq.db = ctx->sdb + ctx->sdb_entid * 256; ++ qp->sdb_lock = &ctx->sdb_lock[0].lock; ++ } else { ++ qp->disable_dwqe = 1; ++ qp->sq.db = ctx->sdb + (qpn & 31) * 128; ++ qp->sdb_lock = &ctx->sdb_lock[0].lock; ++ } ++ ++#ifndef HAVE_AVX_SUPPORT ++ qp->disable_dwqe = 1; ++#endif ++ ++ /* qpn[6:0] as the index in this rq db page. 
*/ ++ qp->rq.db = ctx->rdb + ++ (qpn & ERDMA_RDB_ALLOC_QPN_MASK) * ERDMA_RQDB_SPACE_SIZE; ++} ++ ++static int erdma_check_qp_attr(struct erdma_context *ctx, ++ struct ibv_qp_init_attr_ex *attr) ++ ++{ ++ if (!check_comp_mask(attr->comp_mask, ++ IBV_QP_INIT_ATTR_PD | ++ IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) ++ return -EOPNOTSUPP; ++ ++ if (attr->qp_type != IBV_QPT_RC || ++ !(attr->comp_mask & IBV_QP_INIT_ATTR_PD)) ++ return -EINVAL; ++ ++ if (!attr->recv_cq || !attr->send_cq) ++ return -EINVAL; ++ ++ if (attr->srq) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int erdma_store_qp(struct erdma_context *ctx, struct erdma_qp *qp) ++{ ++ uint32_t tbl_idx, tbl_off; ++ int rv = 0; ++ ++ pthread_mutex_lock(&ctx->qp_table_mutex); ++ tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT; ++ tbl_off = qp->id & ERDMA_QP_TABLE_MASK; ++ ++ if (ctx->qp_table[tbl_idx].refcnt == 0) { ++ ctx->qp_table[tbl_idx].table = ++ calloc(ERDMA_QP_TABLE_SIZE, sizeof(struct erdma_qp *)); ++ if (!ctx->qp_table[tbl_idx].table) { ++ rv = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ /* exist qp */ ++ if (ctx->qp_table[tbl_idx].table[tbl_off]) { ++ rv = -EBUSY; ++ goto out; ++ } ++ ++ ctx->qp_table[tbl_idx].table[tbl_off] = qp; ++ ctx->qp_table[tbl_idx].refcnt++; ++ ++out: ++ pthread_mutex_unlock(&ctx->qp_table_mutex); ++ ++ return rv; ++} ++ ++static void erdma_clear_qp(struct erdma_context *ctx, struct erdma_qp *qp) ++{ ++ uint32_t tbl_idx, tbl_off; ++ ++ pthread_mutex_lock(&ctx->qp_table_mutex); ++ tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT; ++ tbl_off = qp->id & ERDMA_QP_TABLE_MASK; ++ ++ ctx->qp_table[tbl_idx].table[tbl_off] = NULL; ++ ctx->qp_table[tbl_idx].refcnt--; ++ ++ if (ctx->qp_table[tbl_idx].refcnt == 0) { ++ free(ctx->qp_table[tbl_idx].table); ++ ctx->qp_table[tbl_idx].table = NULL; ++ } ++ ++ pthread_mutex_unlock(&ctx->qp_table_mutex); ++} ++ ++static int erdma_alloc_qp_buf_and_db(struct erdma_context *ctx, ++ struct erdma_qp *qp, ++ struct ibv_qp_init_attr_ex *attr) ++{ ++ size_t queue_size; ++ uint32_t nwqebb; ++ int rv; ++ ++ nwqebb = roundup_pow_of_two(attr->cap.max_send_wr * MAX_WQEBB_PER_SQE); ++ queue_size = align(nwqebb << SQEBB_SHIFT, ctx->page_size); ++ nwqebb = roundup_pow_of_two(attr->cap.max_recv_wr); ++ queue_size += align(nwqebb << RQE_SHIFT, ctx->page_size); ++ ++ qp->qbuf_size = queue_size; ++ rv = posix_memalign(&qp->qbuf, ctx->page_size, queue_size); ++ if (rv) { ++ errno = ENOMEM; ++ return -1; ++ } ++ ++ rv = ibv_dontfork_range(qp->qbuf, queue_size); ++ if (rv) { ++ errno = rv; ++ goto err_dontfork; ++ } ++ ++ /* doorbell record allocation. 
*/ ++ qp->db_records = erdma_alloc_dbrecords(ctx); ++ if (!qp->db_records) { ++ errno = ENOMEM; ++ goto err_dbrec; ++ } ++ ++ *qp->db_records = 0; ++ *(qp->db_records + 1) = 0; ++ qp->sq.db_record = qp->db_records; ++ qp->rq.db_record = qp->db_records + 1; ++ ++ pthread_spin_init(&qp->sq_lock, PTHREAD_PROCESS_PRIVATE); ++ pthread_spin_init(&qp->rq_lock, PTHREAD_PROCESS_PRIVATE); ++ ++ return 0; ++ ++err_dbrec: ++ ibv_dofork_range(qp->qbuf, queue_size); ++ ++err_dontfork: ++ free(qp->qbuf); ++ ++ return -1; ++} ++ ++static void erdma_free_qp_buf_and_db(struct erdma_context *ctx, ++ struct erdma_qp *qp) ++{ ++ pthread_spin_destroy(&qp->sq_lock); ++ pthread_spin_destroy(&qp->rq_lock); ++ ++ if (qp->db_records) ++ erdma_dealloc_dbrecords(ctx, qp->db_records); ++ ++ ibv_dofork_range(qp->qbuf, qp->qbuf_size); ++ ++ if (qp->qbuf) ++ free(qp->qbuf); ++} ++ ++static int erdma_alloc_wrid_tbl(struct erdma_qp *qp) ++{ ++ qp->rq.wr_tbl = calloc(qp->rq.depth, sizeof(uint64_t)); ++ if (!qp->rq.wr_tbl) ++ return -ENOMEM; ++ ++ qp->sq.wr_tbl = calloc(qp->sq.depth, sizeof(uint64_t)); ++ if (!qp->sq.wr_tbl) { ++ free(qp->rq.wr_tbl); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static void erdma_free_wrid_tbl(struct erdma_qp *qp) ++{ ++ if (qp->sq.wr_tbl) ++ free(qp->sq.wr_tbl); ++ ++ if (qp->rq.wr_tbl) ++ free(qp->rq.wr_tbl); ++} ++ ++static struct ibv_qp *create_qp(struct ibv_context *ibv_ctx, ++ struct ibv_qp_init_attr_ex *attr) ++{ ++ struct erdma_context *ctx = to_ectx(ibv_ctx); ++ struct erdma_cmd_create_qp_resp resp = {}; ++ struct erdma_cmd_create_qp cmd = {}; ++ struct erdma_qp *qp; ++ int rv; ++ ++ rv = erdma_check_qp_attr(ctx, attr); ++ if (rv) { ++ errno = -rv; ++ return NULL; ++ } ++ ++ qp = calloc(1, sizeof(*qp)); ++ if (!qp) ++ return NULL; ++ ++ rv = erdma_alloc_qp_buf_and_db(ctx, qp, attr); ++ if (rv) ++ goto err; ++ ++ cmd.db_record_va = (uintptr_t)qp->db_records; ++ cmd.qbuf_va = (uintptr_t)qp->qbuf; ++ cmd.qbuf_len = (__u32)qp->qbuf_size; ++ ++ rv = ibv_cmd_create_qp_ex(ibv_ctx, &qp->verbs_qp, attr, &cmd.ibv_cmd, ++ sizeof(cmd), &resp.ibv_resp, sizeof(resp)); ++ if (rv) ++ goto err_cmd; ++ ++ qp->id = resp.qp_id; ++ qp->sq.qbuf = qp->qbuf; ++ qp->rq.qbuf = qp->qbuf + resp.rq_offset; ++ qp->sq.depth = resp.num_sqe; ++ qp->rq.depth = resp.num_rqe; ++ qp->sq_sig_all = attr->sq_sig_all; ++ qp->sq.size = resp.num_sqe * SQEBB_SIZE; ++ qp->rq.size = resp.num_rqe * sizeof(struct erdma_rqe); ++ ++ /* doorbell allocation. 
*/ ++ __erdma_alloc_dbs(qp, ctx); ++ ++ if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { ++ rv = erdma_fill_wr_ops(qp, attr); ++ if (rv) { ++ errno = -rv; ++ goto err_send_ops; ++ } ++ } ++ ++ rv = erdma_alloc_wrid_tbl(qp); ++ if (rv) ++ goto err_wrid_tbl; ++ ++ rv = erdma_store_qp(ctx, qp); ++ if (rv) { ++ errno = -rv; ++ goto err_store; ++ } ++ ++ return &qp->verbs_qp.qp; ++ ++err_store: ++ erdma_free_wrid_tbl(qp); ++err_wrid_tbl: ++err_send_ops: ++ ibv_cmd_destroy_qp(&qp->verbs_qp.qp); ++err_cmd: ++ erdma_free_qp_buf_and_db(ctx, qp); ++err: ++ free(qp); ++ ++ return NULL; ++} ++ ++struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) ++{ ++ struct ibv_qp_init_attr_ex attrx = {}; ++ struct ibv_qp *qp; ++ ++ memcpy(&attrx, attr, sizeof(*attr)); ++ attrx.comp_mask = IBV_QP_INIT_ATTR_PD; ++ attrx.pd = pd; ++ ++ qp = create_qp(pd->context, &attrx); ++ if (qp) ++ memcpy(attr, &attrx, sizeof(*attr)); ++ ++ return qp; ++} ++ ++struct ibv_qp *erdma_create_qp_ex(struct ibv_context *context, ++ struct ibv_qp_init_attr_ex *attr) ++{ ++ return create_qp(context, attr); ++} ++ ++int erdma_destroy_qp(struct ibv_qp *ibqp) ++{ ++ struct ibv_context *base_ctx = ibqp->pd->context; ++ struct erdma_context *ctx = to_ectx(base_ctx); ++ struct erdma_qp *qp = to_eqp(ibqp); ++ int rv; ++ ++ erdma_clear_qp(ctx, qp); ++ ++ rv = ibv_cmd_destroy_qp(ibqp); ++ if (rv) ++ return rv; ++ ++ erdma_free_wrid_tbl(qp); ++ erdma_free_qp_buf_and_db(ctx, qp); ++ ++ free(qp); ++ ++ return 0; ++} ++ ++int erdma_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, ++ int attr_mask) ++{ ++ struct erdma_qp *qp = to_eqp(ibqp); ++ struct ibv_modify_qp cmd = {}; ++ int rv; ++ ++ pthread_spin_lock(&qp->sq_lock); ++ pthread_spin_lock(&qp->rq_lock); ++ ++ rv = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); ++ ++ pthread_spin_unlock(&qp->rq_lock); ++ pthread_spin_unlock(&qp->sq_lock); ++ ++ return rv; ++} ++ ++static int erdma_push_one_sqe(struct erdma_qp *qp, struct ibv_send_wr *wr, ++ uint16_t *sq_pi, int use_direct) ++{ ++ uint16_t tmp_pi = *sq_pi; ++ void *sqe; ++ uint64_t sqe_hdr; ++ struct erdma_write_sqe *write_sqe; ++ struct erdma_send_sqe *send_sqe; ++ struct erdma_readreq_sqe *read_sqe; ++ struct erdma_atomic_sqe *atomic_sqe; ++ uint32_t wqe_size = 0; ++ __le32 *length_field = NULL; ++ struct erdma_sge *sgl_base = NULL; ++ uint32_t i, bytes = 0; ++ uint32_t sgl_off, sgl_idx, wqebb_cnt, opcode; ++ ++ sqe = get_sq_wqebb(qp, tmp_pi); ++ /* Clear the first 8Byte of the wqe hdr. */ ++ *(uint64_t *)sqe = 0; ++ ++ qp->sq.wr_tbl[tmp_pi & (qp->sq.depth - 1)] = wr->wr_id; ++ ++ sqe_hdr = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, qp->id) | ++ FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, ++ wr->send_flags & IBV_SEND_SIGNALED ? 1 : 0) | ++ FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, qp->sq_sig_all) | ++ FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, ++ wr->send_flags & IBV_SEND_SOLICITED ? 1 : 0) | ++ FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, ++ wr->send_flags & IBV_SEND_FENCE ? 1 : 0) | ++ FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, ++ wr->send_flags & IBV_SEND_INLINE ? 
1 : 0) | ++ FIELD_PREP(ERDMA_SQE_HDR_DWQE_MASK, use_direct); ++ ++ switch (wr->opcode) { ++ case IBV_WR_RDMA_WRITE: ++ case IBV_WR_RDMA_WRITE_WITH_IMM: ++ if (wr->opcode == IBV_WR_RDMA_WRITE) ++ opcode = ERDMA_OP_WRITE; ++ else ++ opcode = ERDMA_OP_WRITE_WITH_IMM; ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); ++ write_sqe = sqe; ++ write_sqe->imm_data = wr->imm_data; ++ write_sqe->sink_stag = htole32(wr->wr.rdma.rkey); ++ write_sqe->sink_to_low = ++ htole32(wr->wr.rdma.remote_addr & 0xFFFFFFFF); ++ write_sqe->sink_to_high = ++ htole32((wr->wr.rdma.remote_addr >> 32) & 0xFFFFFFFF); ++ ++ length_field = &write_sqe->length; ++ /* sgl is at the start of next wqebb. */ ++ sgl_base = get_sq_wqebb(qp, tmp_pi + 1); ++ sgl_off = 0; ++ sgl_idx = tmp_pi + 1; ++ wqe_size = sizeof(struct erdma_write_sqe); ++ ++ break; ++ case IBV_WR_SEND: ++ case IBV_WR_SEND_WITH_IMM: ++ case IBV_WR_SEND_WITH_INV: ++ if (wr->opcode == IBV_WR_SEND) ++ opcode = ERDMA_OP_SEND; ++ else if (wr->opcode == IBV_WR_SEND_WITH_IMM) ++ opcode = ERDMA_OP_SEND_WITH_IMM; ++ else ++ opcode = ERDMA_OP_SEND_WITH_INV; ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); ++ send_sqe = sqe; ++ if (wr->opcode == IBV_WR_SEND_WITH_INV) ++ send_sqe->invalid_stag = htole32(wr->invalidate_rkey); ++ else ++ send_sqe->imm_data = wr->imm_data; ++ ++ length_field = &send_sqe->length; ++ /* sgl is in the half of current wqebb (offset 16Byte) */ ++ sgl_base = sqe; ++ sgl_off = 16; ++ sgl_idx = tmp_pi; ++ wqe_size = sizeof(struct erdma_send_sqe); ++ ++ break; ++ case IBV_WR_RDMA_READ: ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_READ); ++ read_sqe = sqe; ++ ++ read_sqe->sink_to_low = htole32(wr->sg_list->addr & 0xFFFFFFFF); ++ read_sqe->sink_to_high = ++ htole32((wr->sg_list->addr >> 32) & 0xFFFFFFFF); ++ read_sqe->sink_stag = htole32(wr->sg_list->lkey); ++ read_sqe->length = htole32(wr->sg_list->length); ++ ++ sgl_base = get_sq_wqebb(qp, tmp_pi + 1); ++ ++ sgl_base->laddr = htole64(wr->wr.rdma.remote_addr); ++ sgl_base->length = htole32(wr->sg_list->length); ++ sgl_base->lkey = htole32(wr->wr.rdma.rkey); ++ ++ wqe_size = sizeof(struct erdma_readreq_sqe); ++ ++ goto out; ++ case IBV_WR_ATOMIC_CMP_AND_SWP: ++ case IBV_WR_ATOMIC_FETCH_AND_ADD: ++ atomic_sqe = (struct erdma_atomic_sqe *)sqe; ++ ++ if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) { ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ++ ERDMA_OP_ATOMIC_CAS); ++ atomic_sqe->fetchadd_swap_data = wr->wr.atomic.swap; ++ atomic_sqe->cmp_data = wr->wr.atomic.compare_add; ++ } else { ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ++ ERDMA_OP_ATOMIC_FAD); ++ atomic_sqe->fetchadd_swap_data = ++ wr->wr.atomic.compare_add; ++ } ++ ++ sgl_base = (struct erdma_sge *)get_sq_wqebb(qp, tmp_pi + 1); ++ /* remote SGL fields */ ++ sgl_base->laddr = wr->wr.atomic.remote_addr; ++ sgl_base->lkey = wr->wr.atomic.rkey; ++ ++ /* local SGL fields */ ++ sgl_base++; ++ sgl_base->laddr = wr->sg_list[0].addr; ++ sgl_base->length = wr->sg_list[0].length; ++ sgl_base->lkey = wr->sg_list[0].lkey; ++ wqe_size = sizeof(struct erdma_atomic_sqe); ++ goto out; ++ default: ++ return -EINVAL; ++ } ++ ++ if (wr->send_flags & IBV_SEND_INLINE) { ++ char *data = (char *)sgl_base; ++ uint32_t remain_size; ++ uint32_t copy_size; ++ uint32_t data_off; ++ ++ i = 0; ++ bytes = 0; ++ ++ /* Allow more than ERDMA_MAX_SGE, since content copied here */ ++ while (i < wr->num_sge) { ++ bytes += wr->sg_list[i].length; ++ if (bytes > (int)ERDMA_MAX_INLINE) ++ return -EINVAL; ++ ++ remain_size = 
wr->sg_list[i].length; ++ data_off = 0; ++ ++ while (1) { ++ copy_size = ++ min(remain_size, SQEBB_SIZE - sgl_off); ++ memcpy(data + sgl_off, ++ (void *)(uintptr_t)wr->sg_list[i].addr + ++ data_off, ++ copy_size); ++ remain_size -= copy_size; ++ ++ /* Update sgl_offset. */ ++ sgl_idx += ++ ((sgl_off + copy_size) >> SQEBB_SHIFT); ++ sgl_off = (sgl_off + copy_size) & ++ (SQEBB_SIZE - 1); ++ data_off += copy_size; ++ data = get_sq_wqebb(qp, sgl_idx); ++ ++ if (!remain_size) ++ break; ++ }; ++ ++ i++; ++ } ++ ++ *length_field = htole32(bytes); ++ wqe_size += bytes; ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, bytes); ++ } else { ++ char *sgl = (char *)sgl_base; ++ ++ if (wr->num_sge > ERDMA_MAX_SEND_SGE) ++ return -EINVAL; ++ ++ i = 0; ++ bytes = 0; ++ ++ while (i < wr->num_sge) { ++ bytes += wr->sg_list[i].length; ++ memcpy(sgl + sgl_off, &wr->sg_list[i], ++ sizeof(struct ibv_sge)); ++ ++ if (sgl_off == 0) ++ *(uint32_t *)(sgl + 28) = qp->id; ++ ++ sgl_idx += (sgl_off == sizeof(struct ibv_sge) ? 1 : 0); ++ sgl = get_sq_wqebb(qp, sgl_idx); ++ sgl_off = sizeof(struct ibv_sge) - sgl_off; ++ ++ i++; ++ } ++ ++ *length_field = htole32(bytes); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, wr->num_sge); ++ wqe_size += wr->num_sge * sizeof(struct ibv_sge); ++ } ++ ++out: ++ wqebb_cnt = SQEBB_COUNT(wqe_size); ++ assert(wqebb_cnt <= MAX_WQEBB_PER_SQE); ++ sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); ++ sqe_hdr |= ++ FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, tmp_pi + wqebb_cnt); ++ ++ *(__le64 *)sqe = htole64(sqe_hdr); ++ *sq_pi = tmp_pi + wqebb_cnt; ++ ++ if (use_direct) ++ kick_hw_sqe(qp, tmp_pi, wqebb_cnt); ++ ++ return 0; ++} ++ ++int erdma_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, ++ struct ibv_send_wr **bad_wr) ++{ ++ struct erdma_qp *qp = to_eqp(ibqp); ++ uint16_t sq_pi; ++ int new_sqe = 0, rv = 0; ++ int use_dwqe = 0, first_wr = 1, dsqe = 0; ++ ++ *bad_wr = NULL; ++ ++ if (ibqp->state == IBV_QPS_ERR) { ++ *bad_wr = wr; ++ return -EIO; ++ } ++ ++ pthread_spin_lock(&qp->sq_lock); ++ sq_pi = qp->sq.pi; ++ ++ while (wr) { ++ if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) { ++ rv = -ENOMEM; ++ *bad_wr = wr; ++ break; ++ } ++ ++ use_dwqe = (!qp->disable_dwqe && first_wr && sq_pi == qp->sq.ci) ? 1 : 0; ++ if (use_dwqe) ++ dsqe++; ++ ++ rv = erdma_push_one_sqe(qp, wr, &sq_pi, use_dwqe); ++ if (rv) { ++ *bad_wr = wr; ++ break; ++ } ++ ++ new_sqe++; ++ first_wr = 0; ++ wr = wr->next; ++ } ++ ++ if (new_sqe) ++ qp->sq.pi = sq_pi; ++ ++ if (new_sqe - dsqe) ++ __kick_sq_db(qp, sq_pi); /* normal doorbell. 
*/ ++ ++ pthread_spin_unlock(&qp->sq_lock); ++ ++ return rv; ++} ++ ++static int push_recv_wqe(struct erdma_qp *qp, struct ibv_recv_wr *wr) ++{ ++ uint16_t rq_pi = qp->rq.pi; ++ uint16_t idx = rq_pi & (qp->rq.depth - 1); ++ struct erdma_rqe *rqe = (struct erdma_rqe *)qp->rq.qbuf + idx; ++ ++ if ((uint16_t)(rq_pi - qp->rq.ci) == qp->rq.depth) ++ return -ENOMEM; ++ ++ rqe->qe_idx = htole16(rq_pi + 1); ++ rqe->qpn = htole32(qp->id); ++ qp->rq.wr_tbl[idx] = wr->wr_id; ++ ++ if (wr->num_sge == 0) { ++ rqe->length = 0; ++ } else if (wr->num_sge == 1) { ++ rqe->stag = htole32(wr->sg_list[0].lkey); ++ rqe->to = htole64(wr->sg_list[0].addr); ++ rqe->length = htole32(wr->sg_list[0].length); ++ } else { ++ return -EINVAL; ++ } ++ ++ *(__le64 *)qp->rq.db_record = *(__le64 *)rqe; ++ udma_to_device_barrier(); ++ mmio_write64_le(qp->rq.db, *(__le64 *)rqe); ++ ++ qp->rq.pi = rq_pi + 1; ++ ++ return 0; ++} ++ ++int erdma_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, ++ struct ibv_recv_wr **bad_wr) ++{ ++ struct erdma_qp *qp = to_eqp(ibqp); ++ int ret = 0; ++ ++ if (ibqp->state == IBV_QPS_ERR) { ++ *bad_wr = wr; ++ return -EIO; ++ } ++ ++ pthread_spin_lock(&qp->rq_lock); ++ ++ while (wr) { ++ ret = push_recv_wqe(qp, wr); ++ if (ret) { ++ *bad_wr = wr; ++ break; ++ } ++ ++ wr = wr->next; ++ } ++ ++ pthread_spin_unlock(&qp->rq_lock); ++ ++ return ret; ++} ++ ++void erdma_cq_event(struct ibv_cq *ibcq) ++{ ++ struct erdma_cq *cq = to_ecq(ibcq); ++ ++ cq->cmdsn++; ++} ++ ++static void *get_next_valid_cqe(struct erdma_cq *cq) ++{ ++ struct erdma_cqe *cqe = cq->queue + (cq->ci & (cq->depth - 1)); ++ uint32_t owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, be32toh(cqe->hdr)); ++ ++ return owner ^ !!(cq->ci & cq->depth) ? cqe : NULL; ++} ++ ++static const enum ibv_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { ++ [ERDMA_OP_WRITE] = IBV_WC_RDMA_WRITE, ++ [ERDMA_OP_READ] = IBV_WC_RDMA_READ, ++ [ERDMA_OP_SEND] = IBV_WC_SEND, ++ [ERDMA_OP_SEND_WITH_IMM] = IBV_WC_SEND, ++ [ERDMA_OP_RECEIVE] = IBV_WC_RECV, ++ [ERDMA_OP_RECV_IMM] = IBV_WC_RECV_RDMA_WITH_IMM, ++ [ERDMA_OP_RECV_INV] = IBV_WC_RECV, ++ [ERDMA_OP_WRITE_WITH_IMM] = IBV_WC_RDMA_WRITE, ++ [ERDMA_OP_INVALIDATE] = IBV_WC_LOCAL_INV, ++ [ERDMA_OP_RSP_SEND_IMM] = IBV_WC_RECV, ++ [ERDMA_OP_SEND_WITH_INV] = IBV_WC_SEND, ++ [ERDMA_OP_READ_WITH_INV] = IBV_WC_RDMA_READ, ++ [ERDMA_OP_ATOMIC_CAS] = IBV_WC_COMP_SWAP, ++ [ERDMA_OP_ATOMIC_FAD] = IBV_WC_FETCH_ADD, ++}; ++ ++static const struct { ++ enum erdma_wc_status erdma; ++ enum ibv_wc_status base; ++ enum erdma_vendor_err vendor; ++} map_cqe_status[ERDMA_NUM_WC_STATUS] = { ++ { ERDMA_WC_SUCCESS, IBV_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, ++ { ERDMA_WC_GENERAL_ERR, IBV_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, ++ { ERDMA_WC_RECV_WQE_FORMAT_ERR, IBV_WC_GENERAL_ERR, ++ ERDMA_WC_VENDOR_INVALID_RQE }, ++ { ERDMA_WC_RECV_STAG_INVALID_ERR, IBV_WC_REM_ACCESS_ERR, ++ ERDMA_WC_VENDOR_RQE_INVALID_STAG }, ++ { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IBV_WC_REM_ACCESS_ERR, ++ ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, ++ { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IBV_WC_REM_ACCESS_ERR, ++ ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, ++ { ERDMA_WC_RECV_PDID_ERR, IBV_WC_REM_ACCESS_ERR, ++ ERDMA_WC_VENDOR_RQE_INVALID_PD }, ++ { ERDMA_WC_RECV_WARRPING_ERR, IBV_WC_REM_ACCESS_ERR, ++ ERDMA_WC_VENDOR_RQE_WRAP_ERR }, ++ { ERDMA_WC_SEND_WQE_FORMAT_ERR, IBV_WC_LOC_QP_OP_ERR, ++ ERDMA_WC_VENDOR_INVALID_SQE }, ++ { ERDMA_WC_SEND_WQE_ORD_EXCEED, IBV_WC_GENERAL_ERR, ++ ERDMA_WC_VENDOR_ZERO_ORD }, ++ { ERDMA_WC_SEND_STAG_INVALID_ERR, IBV_WC_LOC_ACCESS_ERR, ++ 
ERDMA_WC_VENDOR_SQE_INVALID_STAG }, ++ { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IBV_WC_LOC_ACCESS_ERR, ++ ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, ++ { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IBV_WC_LOC_ACCESS_ERR, ++ ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, ++ { ERDMA_WC_SEND_PDID_ERR, IBV_WC_LOC_ACCESS_ERR, ++ ERDMA_WC_VENDOR_SQE_INVALID_PD }, ++ { ERDMA_WC_SEND_WARRPING_ERR, IBV_WC_LOC_ACCESS_ERR, ++ ERDMA_WC_VENDOR_SQE_WARP_ERR }, ++ { ERDMA_WC_FLUSH_ERR, IBV_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, ++ { ERDMA_WC_RETRY_EXC_ERR, IBV_WC_RETRY_EXC_ERR, ++ ERDMA_WC_VENDOR_NO_ERR }, ++}; ++ ++#define ERDMA_POLLCQ_NO_QP (-1) ++#define ERDMA_POLLCQ_DUP_COMP (-2) ++#define ERDMA_POLLCQ_WRONG_IDX (-3) ++ ++static int __erdma_poll_one_cqe(struct erdma_context *ctx, struct erdma_cq *cq, ++ struct ibv_wc *wc) ++{ ++ uint32_t cqe_hdr, opcode, syndrome, qpn; ++ uint16_t depth, wqe_idx, old_ci, new_ci; ++ uint64_t *sqe_hdr, *qeidx2wrid; ++ uint32_t tbl_idx, tbl_off; ++ struct erdma_cqe *cqe; ++ struct erdma_qp *qp; ++ ++ cqe = get_next_valid_cqe(cq); ++ if (!cqe) ++ return -EAGAIN; ++ ++ cq->ci++; ++ udma_from_device_barrier(); ++ ++ cqe_hdr = be32toh(cqe->hdr); ++ syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); ++ opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); ++ qpn = be32toh(cqe->qpn); ++ wqe_idx = be32toh(cqe->qe_idx); ++ ++ tbl_idx = qpn >> ERDMA_QP_TABLE_SHIFT; ++ tbl_off = qpn & ERDMA_QP_TABLE_MASK; ++ ++ if (!ctx->qp_table[tbl_idx].table || ++ !ctx->qp_table[tbl_idx].table[tbl_off]) ++ return ERDMA_POLLCQ_NO_QP; ++ ++ qp = ctx->qp_table[tbl_idx].table[tbl_off]; ++ ++ if (FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr) == ++ ERDMA_CQE_QTYPE_SQ) { ++ qeidx2wrid = qp->sq.wr_tbl; ++ depth = qp->sq.depth; ++ sqe_hdr = get_sq_wqebb(qp, wqe_idx); ++ old_ci = qp->sq.ci; ++ new_ci = wqe_idx + ++ FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *sqe_hdr) + 1; ++ ++ if ((uint16_t)(new_ci - old_ci) > depth) ++ return ERDMA_POLLCQ_WRONG_IDX; ++ else if (new_ci == old_ci) ++ return ERDMA_POLLCQ_DUP_COMP; ++ ++ qp->sq.ci = new_ci; ++ } else { ++ qeidx2wrid = qp->rq.wr_tbl; ++ depth = qp->rq.depth; ++ qp->rq.ci++; ++ } ++ ++ wc->wr_id = qeidx2wrid[wqe_idx & (depth - 1)]; ++ wc->byte_len = be32toh(cqe->size); ++ wc->wc_flags = 0; ++ ++ wc->opcode = wc_mapping_table[opcode]; ++ if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { ++ wc->imm_data = htobe32(le32toh(cqe->imm_data)); ++ wc->wc_flags |= IBV_WC_WITH_IMM; ++ } ++ ++ if (syndrome >= ERDMA_NUM_WC_STATUS) ++ syndrome = ERDMA_WC_GENERAL_ERR; ++ ++ wc->status = map_cqe_status[syndrome].base; ++ wc->vendor_err = map_cqe_status[syndrome].vendor; ++ wc->qp_num = qpn; ++ ++ return 0; ++} ++ ++int erdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) ++{ ++ struct erdma_context *ctx = to_ectx(ibcq->context); ++ struct erdma_cq *cq = to_ecq(ibcq); ++ int ret, npolled = 0; ++ ++ pthread_spin_lock(&cq->lock); ++ ++ while (npolled < num_entries) { ++ ret = __erdma_poll_one_cqe(ctx, cq, wc + npolled); ++ if (ret == -EAGAIN) /* CQ is empty, break the loop. */ ++ break; ++ else if (ret) /* We handle the polling error silently. 
*/ ++ continue; ++ npolled++; ++ } ++ ++ pthread_spin_unlock(&cq->lock); ++ ++ return npolled; ++} ++ ++void erdma_free_context(struct ibv_context *ibv_ctx) ++{ ++ struct erdma_context *ctx = to_ectx(ibv_ctx); ++ int i; ++ ++ munmap(aligned_address(ctx->sdb, ctx->page_size), ctx->page_size); ++ munmap(aligned_address(ctx->rdb, ctx->page_size), ctx->page_size); ++ munmap(aligned_address(ctx->cdb, ctx->page_size), ctx->page_size); ++ ++ pthread_mutex_lock(&ctx->qp_table_mutex); ++ for (i = 0; i < ERDMA_QP_TABLE_SIZE; ++i) { ++ if (ctx->qp_table[i].refcnt) ++ free(ctx->qp_table[i].table); ++ } ++ ++ pthread_mutex_unlock(&ctx->qp_table_mutex); ++ pthread_mutex_destroy(&ctx->qp_table_mutex); ++ ++ verbs_uninit_context(&ctx->ibv_ctx); ++ free(ctx); ++} +-- +2.37.0 + diff --git a/1003-providers-erdma-Add-the-main-module-of-erdma.patch b/1003-providers-erdma-Add-the-main-module-of-erdma.patch new file mode 100644 index 0000000000000000000000000000000000000000..41f02e3b882875b5c045180468ab5938c703c938 --- /dev/null +++ b/1003-providers-erdma-Add-the-main-module-of-erdma.patch @@ -0,0 +1,326 @@ +From ccacbfd79f17b3955acff68135e029433b2072d6 Mon Sep 17 00:00:00 2001 +Message-Id: +In-Reply-To: +References: +From: Cheng Xu +Date: Thu, 1 Dec 2022 15:40:48 +0800 +Subject: [PATCH 3/4] providers/erdma: Add the main module of erdma + +Add the definitions of erdma provider driver, and add the application +interface to core, so that core can recognize erdma provider. + +Signed-off-by: Cheng Xu +--- + libibverbs/verbs.h | 8 +- + providers/erdma/erdma.c | 149 ++++++++++++++++++++++++++++++++++++ + providers/erdma/erdma.h | 79 +++++++++++++++++++ + providers/erdma/erdma_abi.h | 21 +++++ + 4 files changed, 254 insertions(+), 3 deletions(-) + create mode 100644 providers/erdma/erdma.c + create mode 100644 providers/erdma/erdma.h + create mode 100644 providers/erdma/erdma_abi.h + +diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h +index 36b4142517c0..b2f2b4b07be6 100644 +--- a/libibverbs/verbs.h ++++ b/libibverbs/verbs.h +@@ -2190,7 +2190,7 @@ struct ibv_device **ibv_get_device_list(int *num_devices); + */ + #ifdef RDMA_STATIC_PROVIDERS + #define _RDMA_STATIC_PREFIX_(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, \ +- _12, _13, _14, _15, _16, _17, ...) \ ++ _12, _13, _14, _15, _16, _17, _18, ...) 
\ + &verbs_provider_##_1, &verbs_provider_##_2, &verbs_provider_##_3, \ + &verbs_provider_##_4, &verbs_provider_##_5, \ + &verbs_provider_##_6, &verbs_provider_##_7, \ +@@ -2198,16 +2198,18 @@ struct ibv_device **ibv_get_device_list(int *num_devices); + &verbs_provider_##_10, &verbs_provider_##_11, \ + &verbs_provider_##_12, &verbs_provider_##_13, \ + &verbs_provider_##_14, &verbs_provider_##_15, \ +- &verbs_provider_##_16, &verbs_provider_##_17 ++ &verbs_provider_##_16, &verbs_provider_##_17, \ ++ &verbs_provider_##_18 + #define _RDMA_STATIC_PREFIX(arg) \ + _RDMA_STATIC_PREFIX_(arg, none, none, none, none, none, none, none, \ + none, none, none, none, none, none, none, none, \ +- none) ++ none, none) + + struct verbs_devices_ops; + extern const struct verbs_device_ops verbs_provider_bnxt_re; + extern const struct verbs_device_ops verbs_provider_cxgb4; + extern const struct verbs_device_ops verbs_provider_efa; ++extern const struct verbs_device_ops verbs_provider_erdma; + extern const struct verbs_device_ops verbs_provider_hfi1verbs; + extern const struct verbs_device_ops verbs_provider_hns; + extern const struct verbs_device_ops verbs_provider_ipathverbs; +diff --git a/providers/erdma/erdma.c b/providers/erdma/erdma.c +new file mode 100644 +index 000000000000..440314599cf1 +--- /dev/null ++++ b/providers/erdma/erdma.c +@@ -0,0 +1,149 @@ ++// SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file ++ ++// Authors: Cheng Xu ++// Copyright (c) 2020-2021, Alibaba Group. ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "erdma.h" ++#include "erdma_abi.h" ++#include "erdma_hw.h" ++#include "erdma_verbs.h" ++ ++static const struct verbs_context_ops erdma_context_ops = { ++ .alloc_pd = erdma_alloc_pd, ++ .cq_event = erdma_cq_event, ++ .create_cq = erdma_create_cq, ++ .create_qp = erdma_create_qp, ++ .create_qp_ex = erdma_create_qp_ex, ++ .dealloc_pd = erdma_free_pd, ++ .dereg_mr = erdma_dereg_mr, ++ .destroy_cq = erdma_destroy_cq, ++ .destroy_qp = erdma_destroy_qp, ++ .free_context = erdma_free_context, ++ .modify_qp = erdma_modify_qp, ++ .poll_cq = erdma_poll_cq, ++ .post_recv = erdma_post_recv, ++ .post_send = erdma_post_send, ++ .query_device_ex = erdma_query_device, ++ .query_port = erdma_query_port, ++ .query_qp = erdma_query_qp, ++ .reg_mr = erdma_reg_mr, ++ .req_notify_cq = erdma_notify_cq, ++}; ++ ++static struct verbs_context *erdma_alloc_context(struct ibv_device *device, ++ int cmd_fd, void *private_data) ++{ ++ struct erdma_cmd_alloc_context_resp resp = {}; ++ struct ibv_get_context cmd = {}; ++ struct erdma_context *ctx; ++ int i; ++ ++ ctx = verbs_init_and_alloc_context(device, cmd_fd, ctx, ibv_ctx, ++ RDMA_DRIVER_ERDMA); ++ if (!ctx) ++ return NULL; ++ ++ pthread_mutex_init(&ctx->qp_table_mutex, NULL); ++ for (i = 0; i < ERDMA_QP_TABLE_SIZE; ++i) ++ ctx->qp_table[i].refcnt = 0; ++ ++ if (ibv_cmd_get_context(&ctx->ibv_ctx, &cmd, sizeof(cmd), ++ &resp.ibv_resp, sizeof(resp))) ++ goto err_out; ++ ++ verbs_set_ops(&ctx->ibv_ctx, &erdma_context_ops); ++ ctx->page_size = to_edev(device)->page_size; ++ ctx->dev_id = resp.dev_id; ++ ++ ctx->sdb_type = resp.sdb_type; ++ ctx->sdb_entid = resp.sdb_entid; ++ ++ ctx->sdb = mmap(NULL, ctx->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, ++ resp.sdb); ++ if (ctx->sdb == MAP_FAILED) ++ goto err_out; ++ ++ ctx->sdb += resp.sdb_off; ++ ++ ctx->rdb = mmap(NULL, ctx->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, ++ resp.rdb); ++ if (ctx->rdb == MAP_FAILED) ++ goto err_rdb_map; ++ ++ 
ctx->rdb += resp.rdb_off; ++ ++ ctx->cdb = mmap(NULL, ctx->page_size, PROT_WRITE, MAP_SHARED, cmd_fd, ++ resp.cdb); ++ if (ctx->cdb == MAP_FAILED) ++ goto err_cdb_map; ++ ++ ctx->cdb += resp.cdb_off; ++ ++ ctx->dbrecord_pages = NULL; ++ pthread_mutex_init(&ctx->dbrecord_pages_mutex, NULL); ++ ++ for (i = 0; i < 32; i++) ++ pthread_spin_init(&ctx->sdb_lock[i].lock, ++ PTHREAD_PROCESS_PRIVATE); ++ ++ return &ctx->ibv_ctx; ++ ++err_cdb_map: ++ munmap(aligned_address(ctx->rdb, ctx->page_size), ctx->page_size); ++err_rdb_map: ++ munmap(aligned_address(ctx->sdb, ctx->page_size), ctx->page_size); ++err_out: ++ verbs_uninit_context(&ctx->ibv_ctx); ++ free(ctx); ++ ++ return NULL; ++} ++ ++static struct verbs_device * ++erdma_device_alloc(struct verbs_sysfs_dev *sysfs_dev) ++{ ++ struct erdma_device *dev; ++ ++ dev = calloc(1, sizeof(*dev)); ++ if (!dev) ++ return NULL; ++ ++ dev->page_size = sysconf(_SC_PAGESIZE); ++ ++ return &dev->ibv_dev; ++} ++ ++static void erdma_device_free(struct verbs_device *vdev) ++{ ++ struct erdma_device *dev = ++ container_of(vdev, struct erdma_device, ibv_dev); ++ ++ free(dev); ++} ++ ++static const struct verbs_match_ent match_table[] = { ++ VERBS_DRIVER_ID(RDMA_DRIVER_ERDMA), ++ VERBS_PCI_MATCH(PCI_VENDOR_ID_ALIBABA, 0x107f, NULL), ++ {}, ++}; ++ ++static const struct verbs_device_ops erdma_dev_ops = { ++ .name = "erdma", ++ .match_min_abi_version = 0, ++ .match_max_abi_version = ERDMA_ABI_VERSION, ++ .match_table = match_table, ++ .alloc_device = erdma_device_alloc, ++ .uninit_device = erdma_device_free, ++ .alloc_context = erdma_alloc_context, ++}; ++ ++PROVIDER_DRIVER(erdma, erdma_dev_ops); +diff --git a/providers/erdma/erdma.h b/providers/erdma/erdma.h +new file mode 100644 +index 000000000000..41bca6e7915f +--- /dev/null ++++ b/providers/erdma/erdma.h +@@ -0,0 +1,79 @@ ++/* SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file */ ++/* ++ * Authors: Cheng Xu ++ * Copyright (c) 2020-2021, Alibaba Group. 
++ */
++
++#ifndef __ERDMA_H__
++#define __ERDMA_H__
++
++#include 
++#include 
++#include 
++
++#include 
++#include 
++#include 
++
++#ifndef PCI_VENDOR_ID_ALIBABA
++#define PCI_VENDOR_ID_ALIBABA 0x1ded
++#endif
++
++#define ERDMA_PAGE_SIZE 4096
++#define ERDMA_PAGE_SHIFT 12
++#define ERDMA_SIZE_TO_NPAGE(size) \
++	(((size) + ERDMA_PAGE_SIZE - 1) >> ERDMA_PAGE_SHIFT)
++
++struct erdma_device {
++	struct verbs_device ibv_dev;
++	uint32_t page_size;
++};
++
++struct erdma_aligned_lock_t {
++	pthread_spinlock_t lock;
++} __attribute__((__packed__)) __attribute__((__aligned__(64)));
++
++#define ERDMA_QP_TABLE_SIZE 4096
++#define ERDMA_QP_TABLE_SHIFT 12
++#define ERDMA_QP_TABLE_MASK 0xFFF
++
++struct erdma_context {
++	struct verbs_context ibv_ctx;
++	uint32_t dev_id;
++
++	struct {
++		struct erdma_qp **table;
++		int refcnt;
++	} qp_table[ERDMA_QP_TABLE_SIZE];
++	pthread_mutex_t qp_table_mutex;
++
++	uint8_t sdb_type;
++	uint32_t sdb_entid;
++
++	void *sdb;
++	void *rdb;
++	void *cdb;
++
++	uint32_t page_size;
++	pthread_mutex_t dbrecord_pages_mutex;
++	struct erdma_dbrecord_page *dbrecord_pages;
++
++	struct erdma_aligned_lock_t sdb_lock[32];
++};
++
++static inline struct erdma_context *to_ectx(struct ibv_context *base)
++{
++	return container_of(base, struct erdma_context, ibv_ctx.context);
++}
++
++static inline struct erdma_device *to_edev(struct ibv_device *ibv_dev)
++{
++	return container_of(ibv_dev, struct erdma_device, ibv_dev.device);
++}
++
++static inline void *aligned_address(void *addr, uint32_t align)
++{
++	return addr - ((uintptr_t)addr & (align - 1));
++}
++
++#endif
+diff --git a/providers/erdma/erdma_abi.h b/providers/erdma/erdma_abi.h
+new file mode 100644
+index 000000000000..184804711d9b
+--- /dev/null
++++ b/providers/erdma/erdma_abi.h
+@@ -0,0 +1,21 @@
++/* SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file */
++/*
++ * Authors: Cheng Xu 
++ * Copyright (c) 2020-2021, Alibaba Group.
++ */
++
++#ifndef __ERDMA_ABI_H__
++#define __ERDMA_ABI_H__
++
++#include 
++#include 
++#include 
++
++DECLARE_DRV_CMD(erdma_cmd_alloc_context, IB_USER_VERBS_CMD_GET_CONTEXT, empty,
++		erdma_uresp_alloc_ctx);
++DECLARE_DRV_CMD(erdma_cmd_create_cq, IB_USER_VERBS_CMD_CREATE_CQ,
++		erdma_ureq_create_cq, erdma_uresp_create_cq);
++DECLARE_DRV_CMD(erdma_cmd_create_qp, IB_USER_VERBS_CMD_CREATE_QP,
++		erdma_ureq_create_qp, erdma_uresp_create_qp);
++
++#endif
+--
+2.37.0
+
diff --git a/1004-providers-erdma-Add-to-the-build-environment.patch b/1004-providers-erdma-Add-to-the-build-environment.patch
new file mode 100644
index 0000000000000000000000000000000000000000..faab71ed342ffe05c7c53ec94073918d3f991b04
--- /dev/null
+++ b/1004-providers-erdma-Add-to-the-build-environment.patch
@@ -0,0 +1,245 @@
+From 500d8631e9598875063ba5daf3c668928984979c Mon Sep 17 00:00:00 2001
+Message-Id: <500d8631e9598875063ba5daf3c668928984979c.1669880730.git.chengyou@linux.alibaba.com>
+In-Reply-To: 
+References: 
+From: Cheng Xu 
+Date: Thu, 1 Dec 2022 15:42:21 +0800
+Subject: [PATCH 4/4] providers/erdma: Add to the build environment
+
+Make the build system able to build the provider, and add erdma to the redhat
+package and debian package build environments.
+ +Signed-off-by: Cheng Xu +--- + CMakeLists.txt | 1 + + MAINTAINERS | 5 +++ + README.md | 1 + + debian/control | 1 + + debian/copyright | 4 ++ + kernel-headers/CMakeLists.txt | 2 + + kernel-headers/rdma/erdma-abi.h | 52 +++++++++++++++++++++++ + kernel-headers/rdma/ib_user_ioctl_verbs.h | 1 + + libibverbs/examples/devinfo.c | 7 ++- + providers/erdma/CMakeLists.txt | 12 ++++++ + redhat/rdma-core.spec | 3 ++ + 11 files changed, 87 insertions(+), 2 deletions(-) + create mode 100644 kernel-headers/rdma/erdma-abi.h + create mode 100644 providers/erdma/CMakeLists.txt + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 1aecae49a507..bdf24c7fc4cd 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -668,6 +668,7 @@ add_subdirectory(providers/bnxt_re) + add_subdirectory(providers/cxgb4) # NO SPARSE + add_subdirectory(providers/efa) + add_subdirectory(providers/efa/man) ++add_subdirectory(providers/erdma) + add_subdirectory(providers/hns) + add_subdirectory(providers/irdma) + add_subdirectory(providers/mlx4) +diff --git a/MAINTAINERS b/MAINTAINERS +index 9fec12406f81..bbeddabbd1ba 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -61,6 +61,11 @@ M: Gal Pressman + S: Supported + F: providers/efa/ + ++ERDMA USERSPACE PROVIDER (for erdma.ko) ++M: Cheng Xu ++S: Supported ++F: providers/erdma/ ++ + HF1 USERSPACE PROVIDER (for hf1.ko) + M: Mike Marciniszyn + M: Dennis Dalessandro +diff --git a/README.md b/README.md +index 18c3b014bdbd..ebb941e2132a 100644 +--- a/README.md ++++ b/README.md +@@ -15,6 +15,7 @@ under the providers/ directory. Support for the following Kernel RDMA drivers + is included: + + - efa.ko ++ - erdma.ko + - iw_cxgb4.ko + - hfi1.ko + - hns-roce.ko +diff --git a/debian/control b/debian/control +index d9fec3a5fbdc..38dc6f710dbc 100644 +--- a/debian/control ++++ b/debian/control +@@ -93,6 +93,7 @@ Description: User space provider drivers for libibverbs + - bnxt_re: Broadcom NetXtreme-E RoCE HCAs + - cxgb4: Chelsio T4 iWARP HCAs + - efa: Amazon Elastic Fabric Adapter ++ - erdma: Alibaba Elastic RDMA (iWarp) Adapter + - hfi1verbs: Intel Omni-Path HFI + - hns: HiSilicon Hip06 SoC + - ipathverbs: QLogic InfiniPath HCAs +diff --git a/debian/copyright b/debian/copyright +index d58aa779a569..ead62adb437a 100644 +--- a/debian/copyright ++++ b/debian/copyright +@@ -156,6 +156,10 @@ Files: providers/efa/* + Copyright: 2019 Amazon.com, Inc. or its affiliates. + License: BSD-2-clause or GPL-2 + ++Files: providers/erdma/* ++Copyright: 2020-2021, Alibaba Group ++License: BSD-MIT or GPL-2 ++ + Files: providers/hfi1verbs/* + Copyright: 2005 PathScale, Inc. + 2006-2009 QLogic Corporation +diff --git a/kernel-headers/CMakeLists.txt b/kernel-headers/CMakeLists.txt +index d9621ee2b940..f30ff2292ace 100644 +--- a/kernel-headers/CMakeLists.txt ++++ b/kernel-headers/CMakeLists.txt +@@ -2,6 +2,7 @@ publish_internal_headers(rdma + rdma/bnxt_re-abi.h + rdma/cxgb4-abi.h + rdma/efa-abi.h ++ rdma/erdma-abi.h + rdma/hns-abi.h + rdma/ib_user_ioctl_cmds.h + rdma/ib_user_ioctl_verbs.h +@@ -64,6 +65,7 @@ rdma_kernel_provider_abi( + rdma/bnxt_re-abi.h + rdma/cxgb4-abi.h + rdma/efa-abi.h ++ rdma/erdma-abi.h + rdma/hns-abi.h + rdma/ib_user_verbs.h + rdma/irdma-abi.h +diff --git a/kernel-headers/rdma/erdma-abi.h b/kernel-headers/rdma/erdma-abi.h +new file mode 100644 +index 000000000000..455046415983 +--- /dev/null ++++ b/kernel-headers/rdma/erdma-abi.h +@@ -0,0 +1,52 @@ ++/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ ++/* ++ * Copyright (c) 2020-2022, Alibaba Group. 
++ */ ++ ++#ifndef __ERDMA_USER_H__ ++#define __ERDMA_USER_H__ ++ ++#include ++ ++#define ERDMA_ABI_VERSION 1 ++ ++struct erdma_ureq_create_cq { ++ __aligned_u64 db_record_va; ++ __aligned_u64 qbuf_va; ++ __u32 qbuf_len; ++ __u32 rsvd0; ++}; ++ ++struct erdma_uresp_create_cq { ++ __u32 cq_id; ++ __u32 num_cqe; ++}; ++ ++struct erdma_ureq_create_qp { ++ __aligned_u64 db_record_va; ++ __aligned_u64 qbuf_va; ++ __u32 qbuf_len; ++ __u32 rsvd0; ++}; ++ ++struct erdma_uresp_create_qp { ++ __u32 qp_id; ++ __u32 num_sqe; ++ __u32 num_rqe; ++ __u32 rq_offset; ++}; ++ ++struct erdma_uresp_alloc_ctx { ++ __u32 dev_id; ++ __u32 pad; ++ __u32 sdb_type; ++ __u32 sdb_entid; ++ __aligned_u64 sdb; ++ __aligned_u64 rdb; ++ __aligned_u64 cdb; ++ __u32 sdb_off; ++ __u32 rdb_off; ++ __u32 cdb_off; ++}; ++ ++#endif +diff --git a/kernel-headers/rdma/ib_user_ioctl_verbs.h b/kernel-headers/rdma/ib_user_ioctl_verbs.h +index 3072e5d6b692..7dd56210226f 100644 +--- a/kernel-headers/rdma/ib_user_ioctl_verbs.h ++++ b/kernel-headers/rdma/ib_user_ioctl_verbs.h +@@ -250,6 +250,7 @@ enum rdma_driver_id { + RDMA_DRIVER_QIB, + RDMA_DRIVER_EFA, + RDMA_DRIVER_SIW, ++ RDMA_DRIVER_ERDMA, + }; + + enum ib_uverbs_gid_type { +diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c +index 5db568b08040..f7680a96a2a1 100644 +--- a/libibverbs/examples/devinfo.c ++++ b/libibverbs/examples/devinfo.c +@@ -521,8 +521,11 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) + } + + printf("hca_id:\t%s\n", ibv_get_device_name(ib_dev)); +- printf("\ttransport:\t\t\t%s (%d)\n", +- transport_str(ib_dev->transport_type), ib_dev->transport_type); ++ if (device_attr.orig_attr.vendor_id != 0x1ded) ++ printf("\ttransport:\t\t\t%s (%d)\n", ++ transport_str(ib_dev->transport_type), ib_dev->transport_type); ++ else ++ printf("\ttransport:\t\t\t%s\n", "eRDMA"); + if (strlen(device_attr.orig_attr.fw_ver)) + printf("\tfw_ver:\t\t\t\t%s\n", device_attr.orig_attr.fw_ver); + printf("\tnode_guid:\t\t\t%s\n", guid_str(device_attr.orig_attr.node_guid, buf)); +diff --git a/providers/erdma/CMakeLists.txt b/providers/erdma/CMakeLists.txt +new file mode 100644 +index 000000000000..eeb40f79a08e +--- /dev/null ++++ b/providers/erdma/CMakeLists.txt +@@ -0,0 +1,12 @@ ++rdma_provider(erdma ++ erdma.c ++ erdma_db.c ++ erdma_verbs.c ++) ++ ++if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") ++ set(ERDMA_PRIV_FLAGS "-msse4.1 -msse4.2 -mavx2 -Werror") ++ add_definitions(-DHAVE_AVX_SUPPORT) ++endif() ++ ++set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ERDMA_PRIV_FLAGS}") +diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec +index 55b2d608b29b..32e9bf77e8ca 100644 +--- a/redhat/rdma-core.spec ++++ b/redhat/rdma-core.spec +@@ -148,6 +148,8 @@ Provides: libcxgb4 = %{version}-%{release} + Obsoletes: libcxgb4 < %{version}-%{release} + Provides: libefa = %{version}-%{release} + Obsoletes: libefa < %{version}-%{release} ++Provides: liberdma = %{version}-%{release} ++Obsoletes: liberdma < %{version}-%{release} + Provides: libhfi1 = %{version}-%{release} + Obsoletes: libhfi1 < %{version}-%{release} + Provides: libipathverbs = %{version}-%{release} +@@ -176,6 +178,7 @@ Device-specific plug-in ibverbs userspace drivers are included: + + - libcxgb4: Chelsio T4 iWARP HCA + - libefa: Amazon Elastic Fabric Adapter ++- liberdma: Alibaba Elastic RDMA (iWarp) Adapter + - libhfi1: Intel Omni-Path HFI + - libhns: HiSilicon Hip06 SoC + - libipathverbs: QLogic InfiniPath HCA +-- +2.37.0 + diff --git a/rdma-core.spec b/rdma-core.spec index 
ecc8ed71e99220c118431f65fabad5f8cb5c00b2..ab2c93085ef1949a6cae7a930ca421c5b0d5a7c7 100644
--- a/rdma-core.spec
+++ b/rdma-core.spec
@@ -1,4 +1,4 @@
-%define anolis_release .0.2
+%define anolis_release .0.3
 Name: rdma-core
 Version: 37.2
 Release: 1%{anolis_release}%{?dist}
@@ -22,6 +22,12 @@ Patch3: udev-keep-NAME_KERNEL-as-default-interface-naming-co.patch
 Patch300: 0001-ibacm-acm.c-load-plugin-while-it-is-soft-link.patch
 Patch301: 0001-tests-Fix-comparing-qp_state-for-iWARP-providers.patch
+# ERDMA provider support
+Patch1000: 1001-providers-erdma-Add-userspace-verbs-related-header-f.patch
+Patch1001: 1002-providers-erdma-Add-userspace-verbs-implementation.patch
+Patch1002: 1003-providers-erdma-Add-the-main-module-of-erdma.patch
+Patch1003: 1004-providers-erdma-Add-to-the-build-environment.patch
+
 # Do not build static libs by default.
 %define with_static %{?_with_static: 1} %{?!_with_static: 0}
@@ -259,6 +265,10 @@ easy, object-oriented access to IB verbs.
 %patch3 -p1
 %patch300 -p1
 %patch301 -p1
+%patch1000 -p1
+%patch1001 -p1
+%patch1002 -p1
+%patch1003 -p1
 %build
@@ -644,6 +654,9 @@ fi
 %endif
 %changelog
+* Thu Dec 16 2022 Cheng Xu - 37.2-1.0.3
+- Backport ERDMA support
+
 * Wed Nov 16 2022 Liwei Ge - 37.2-1.0.2
 - Fix build error
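
Packaging note (not part of the patches or the spec above): after installing the
rebuilt rdma-core packages on a host that exposes an ERDMA device, the backported
provider can be checked with ibv_devinfo, or with a minimal libibverbs program such
as the sketch below. The file name check_erdma.c and the printed fields are
illustrative; the program only uses the standard verbs enumeration API, and opening
a device is what exercises the provider's alloc_context path added in patch 1003.

/* check_erdma.c - enumerate RDMA devices and query their attributes. */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **dev_list;
	int num_devices, i;

	/* List all RDMA devices known to libibverbs. */
	dev_list = ibv_get_device_list(&num_devices);
	if (!dev_list) {
		perror("ibv_get_device_list");
		return 1;
	}

	for (i = 0; i < num_devices; i++) {
		const char *name = ibv_get_device_name(dev_list[i]);
		struct ibv_context *ctx;
		struct ibv_device_attr attr;

		/* Opening the device goes through the provider's
		 * alloc_context hook (erdma_alloc_context for erdma). */
		ctx = ibv_open_device(dev_list[i]);
		if (!ctx) {
			fprintf(stderr, "failed to open %s\n", name);
			continue;
		}

		if (!ibv_query_device(ctx, &attr))
			printf("%s: vendor 0x%x, max_qp %d\n",
			       name, attr.vendor_id, attr.max_qp);

		ibv_close_device(ctx);
	}

	ibv_free_device_list(dev_list);
	return 0;
}

Build with "cc check_erdma.c -o check_erdma -libverbs"; on a machine without an
ERDMA device the program still runs and simply lists whatever devices are present.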