From f2c8cd805401ce0f030f7fe6377e809e96a47b80 Mon Sep 17 00:00:00 2001 From: Lyontang Date: Thu, 26 Dec 2024 10:45:37 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=9B=B4=E6=96=B0spdk-23.01.patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spdk-23.01.patch | 534689 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 534689 insertions(+) create mode 100644 spdk-23.01.patch diff --git a/spdk-23.01.patch b/spdk-23.01.patch new file mode 100644 index 0000000..547da9c --- /dev/null +++ b/spdk-23.01.patch @@ -0,0 +1,534689 @@ +diff --git a/CONFIG b/CONFIG +index 0f23513..f49af84 100644 +--- a/CONFIG ++++ b/CONFIG +@@ -1,220 +1,223 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +-# Copyright (c) 2022 Dell Inc, or its subsidiaries. +-# +- +-# configure options: __CONFIGURE_OPTIONS__ +- +-# Installation prefix +-CONFIG_PREFIX="/usr/local" +- +-# Target architecture +-CONFIG_ARCH=native +- +-# Destination directory for the libraries +-CONFIG_LIBDIR= +- +-# Prefix for cross compilation +-CONFIG_CROSS_PREFIX= +- +-# Build with debug logging. Turn off for performance testing and normal usage +-CONFIG_DEBUG=n +- +-# Treat warnings as errors (fail the build on any warning). +-CONFIG_WERROR=n +- +-# Build with link-time optimization. +-CONFIG_LTO=n +- +-# Generate profile guided optimization data. +-CONFIG_PGO_CAPTURE=n +- +-# Use profile guided optimization data. +-CONFIG_PGO_USE=n +- +-# Build with code coverage instrumentation. +-CONFIG_COVERAGE=n +- +-# Build with Address Sanitizer enabled +-CONFIG_ASAN=n +- +-# Build with Undefined Behavior Sanitizer enabled +-CONFIG_UBSAN=n +- +-# Build with LLVM fuzzing enabled +-CONFIG_FUZZER=n +-CONFIG_FUZZER_LIB= +- +-# Build with Thread Sanitizer enabled +-CONFIG_TSAN=n +- +-# Build functional tests +-CONFIG_TESTS=y +- +-# Build unit tests +-CONFIG_UNIT_TESTS=y +- +-# Build examples +-CONFIG_EXAMPLES=y +- +-# Build apps +-CONFIG_APPS=y +- +-# Build with Control-flow Enforcement Technology (CET) +-CONFIG_CET=n +- +-# Directory that contains the desired SPDK environment library. +-# By default, this is implemented using DPDK. +-CONFIG_ENV= +- +-# This directory should contain 'include' and 'lib' directories for your DPDK +-# installation. +-CONFIG_DPDK_DIR= +-# Automatically set via pkg-config when bare --with-dpdk is set +-CONFIG_DPDK_LIB_DIR= +-CONFIG_DPDK_INC_DIR= +-CONFIG_DPDK_PKG_CONFIG=n +- +-# This directory should contain 'include' and 'lib' directories for WPDK. +-CONFIG_WPDK_DIR= +- +-# Build SPDK FIO plugin. Requires CONFIG_FIO_SOURCE_DIR set to a valid +-# fio source code directory. +-CONFIG_FIO_PLUGIN=n +- +-# This directory should contain the source code directory for fio +-# which is required for building the SPDK FIO plugin. +-CONFIG_FIO_SOURCE_DIR=/usr/src/fio +- +-# Enable RDMA support for the NVMf target. +-# Requires ibverbs development libraries. +-CONFIG_RDMA=n +-CONFIG_RDMA_SEND_WITH_INVAL=n +-CONFIG_RDMA_SET_ACK_TIMEOUT=n +-CONFIG_RDMA_SET_TOS=n +-CONFIG_RDMA_PROV=verbs +- +-# Enable NVMe Character Devices. +-CONFIG_NVME_CUSE=n +- +-# Enable FC support for the NVMf target. 
+-# Requires FC low level driver (from FC vendor) +-CONFIG_FC=n +-CONFIG_FC_PATH= +- +-# Build Ceph RBD support in bdev modules +-# Requires librbd development libraries +-CONFIG_RBD=n +- +-# Build DAOS support in bdev modules +-# Requires daos development libraries +-CONFIG_DAOS=n +-CONFIG_DAOS_DIR= +- +-# Build UBLK support +-CONFIG_UBLK=n +- +-# Build vhost library. +-CONFIG_VHOST=y +- +-# Build vhost initiator (Virtio) driver. +-CONFIG_VIRTIO=y +- +-# Build custom vfio-user transport for NVMf target and NVMe initiator. +-CONFIG_VFIO_USER=n +-CONFIG_VFIO_USER_DIR= +- +-# Build with PMDK backends +-CONFIG_PMDK=n +-CONFIG_PMDK_DIR= +- +-# Build with xNVMe +-CONFIG_XNVME=n +- +-# Enable the dependencies for building the DPDK accel compress module +-CONFIG_DPDK_COMPRESSDEV=n +- +-# Enable the dependencies for building the compress vbdev, includes the reduce library +-CONFIG_VBDEV_COMPRESS=n +- +-# Enable mlx5_pci dpdk compress PMD, enabled automatically if CONFIG_VBDEV_COMPRESS=y and libmlx5 exists +-CONFIG_VBDEV_COMPRESS_MLX5=n +- +-# Enable mlx5_pci dpdk crypto PMD, enabled automatically if CONFIG_CRYPTO=y and libmlx5 exists +-CONFIG_CRYPTO_MLX5=n +- +-# Requires libiscsi development libraries. +-CONFIG_ISCSI_INITIATOR=n +- +-# Enable the dependencies for building the crypto vbdev +-CONFIG_CRYPTO=n +- +-# Build spdk shared libraries in addition to the static ones. +-CONFIG_SHARED=n +- +-# Build with VTune support. +-CONFIG_VTUNE=n +-CONFIG_VTUNE_DIR= +- +-# Build Intel IPSEC_MB library +-CONFIG_IPSEC_MB=n +- +-# Enable OCF module +-CONFIG_OCF=n +-CONFIG_OCF_PATH= +-CONFIG_CUSTOMOCF=n +- +-# Build ISA-L library +-CONFIG_ISAL=y +- +-# Build ISA-L-crypto library +-CONFIG_ISAL_CRYPTO=y +- +-# Build with IO_URING support +-CONFIG_URING=n +- +-# Build IO_URING bdev with ZNS support +-CONFIG_URING_ZNS=n +- +-# Path to custom built IO_URING library +-CONFIG_URING_PATH= +- +-# Path to custom built OPENSSL library +-CONFIG_OPENSSL_PATH= +- +-# Build with FUSE support +-CONFIG_FUSE=n +- +-# Build with RAID5f support +-CONFIG_RAID5F=n +- +-# Build with IDXD support +-# In this mode, SPDK fully controls the DSA device. +-CONFIG_IDXD=n +- +-# Build with USDT support +-CONFIG_USDT=n +- +-# Build with IDXD kernel support. +-# In this mode, SPDK shares the DSA device with the kernel. +-CONFIG_IDXD_KERNEL=n +- +-# arc4random is available in stdlib.h +-CONFIG_HAVE_ARC4RANDOM=n +- +-# Is DPDK using libbsd? +-CONFIG_HAVE_LIBBSD=n +- +-# Is DPDK using libarchive? +-CONFIG_HAVE_LIBARCHIVE=n +- +-# Path to IPSEC_MB used by DPDK +-CONFIG_IPSEC_MB_DIR= +- +-# Generate Storage Management Agent's protobuf interface +-CONFIG_SMA=n +- +-# Build with Avahi support +-CONFIG_AVAHI=n ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# Copyright (c) 2022 Dell Inc, or its subsidiaries. ++# ++ ++# configure options: __CONFIGURE_OPTIONS__ ++ ++# Installation prefix ++CONFIG_PREFIX="/usr/local" ++ ++# Target architecture ++CONFIG_ARCH=native ++ ++# Destination directory for the libraries ++CONFIG_LIBDIR= ++ ++# Prefix for cross compilation ++CONFIG_CROSS_PREFIX= ++ ++# Build with debug logging. Turn off for performance testing and normal usage ++CONFIG_DEBUG=n ++ ++# Treat warnings as errors (fail the build on any warning). ++CONFIG_WERROR=n ++ ++# Build with link-time optimization. ++CONFIG_LTO=n ++ ++# Generate profile guided optimization data. 
++CONFIG_PGO_CAPTURE=n ++ ++# Use profile guided optimization data. ++CONFIG_PGO_USE=n ++ ++# Build with code coverage instrumentation. ++CONFIG_COVERAGE=n ++ ++# Build with Address Sanitizer enabled ++CONFIG_ASAN=n ++ ++# Build with Undefined Behavior Sanitizer enabled ++CONFIG_UBSAN=n ++ ++# Build with LLVM fuzzing enabled ++CONFIG_FUZZER=n ++CONFIG_FUZZER_LIB= ++ ++# Build with Thread Sanitizer enabled ++CONFIG_TSAN=n ++ ++# Build functional tests ++CONFIG_TESTS=y ++ ++# Build unit tests ++CONFIG_UNIT_TESTS=y ++ ++# Build examples ++CONFIG_EXAMPLES=n ++ ++# Build apps ++CONFIG_APPS=y ++ ++# Build with Control-flow Enforcement Technology (CET) ++CONFIG_CET=n ++ ++# Directory that contains the desired SPDK environment library. ++# By default, this is implemented using DPDK. ++CONFIG_ENV= ++ ++# This directory should contain 'include' and 'lib' directories for your DPDK ++# installation. ++CONFIG_DPDK_DIR= ++# Automatically set via pkg-config when bare --with-dpdk is set ++CONFIG_DPDK_LIB_DIR= ++CONFIG_DPDK_INC_DIR= ++CONFIG_DPDK_PKG_CONFIG=n ++ ++# This directory should contain 'include' and 'lib' directories for WPDK. ++CONFIG_WPDK_DIR= ++ ++# Build SPDK FIO plugin. Requires CONFIG_FIO_SOURCE_DIR set to a valid ++# fio source code directory. ++CONFIG_FIO_PLUGIN=n ++ ++# This directory should contain the source code directory for fio ++# which is required for building the SPDK FIO plugin. ++CONFIG_FIO_SOURCE_DIR=/usr/src/fio ++ ++# Enable RDMA support for the NVMf target. ++# Requires ibverbs development libraries. ++CONFIG_RDMA=n ++CONFIG_RDMA_SEND_WITH_INVAL=n ++CONFIG_RDMA_SET_ACK_TIMEOUT=n ++CONFIG_RDMA_SET_TOS=n ++CONFIG_RDMA_PROV=verbs ++ ++# Enable NVMe Character Devices. ++CONFIG_NVME_CUSE=n ++ ++# Enable FC support for the NVMf target. ++# Requires FC low level driver (from FC vendor) ++CONFIG_FC=n ++CONFIG_FC_PATH= ++ ++# Build Ceph RBD support in bdev modules ++# Requires librbd development libraries ++CONFIG_RBD=n ++ ++# Build DAOS support in bdev modules ++# Requires daos development libraries ++CONFIG_DAOS=n ++CONFIG_DAOS_DIR= ++ ++# Build UBLK support ++CONFIG_UBLK=n ++ ++# Build vhost library. ++CONFIG_VHOST=y ++ ++# Build ssam library. ++CONFIG_SSAM=y ++ ++# Build vhost initiator (Virtio) driver. ++CONFIG_VIRTIO=y ++ ++# Build custom vfio-user transport for NVMf target and NVMe initiator. ++CONFIG_VFIO_USER=n ++CONFIG_VFIO_USER_DIR= ++ ++# Build with PMDK backends ++CONFIG_PMDK=n ++CONFIG_PMDK_DIR= ++ ++# Build with xNVMe ++CONFIG_XNVME=n ++ ++# Enable the dependencies for building the DPDK accel compress module ++CONFIG_DPDK_COMPRESSDEV=n ++ ++# Enable the dependencies for building the compress vbdev, includes the reduce library ++CONFIG_VBDEV_COMPRESS=n ++ ++# Enable mlx5_pci dpdk compress PMD, enabled automatically if CONFIG_VBDEV_COMPRESS=y and libmlx5 exists ++CONFIG_VBDEV_COMPRESS_MLX5=n ++ ++# Enable mlx5_pci dpdk crypto PMD, enabled automatically if CONFIG_CRYPTO=y and libmlx5 exists ++CONFIG_CRYPTO_MLX5=n ++ ++# Requires libiscsi development libraries. ++CONFIG_ISCSI_INITIATOR=n ++ ++# Enable the dependencies for building the crypto vbdev ++CONFIG_CRYPTO=n ++ ++# Build spdk shared libraries in addition to the static ones. ++CONFIG_SHARED=n ++ ++# Build with VTune support. 
++CONFIG_VTUNE=n ++CONFIG_VTUNE_DIR= ++ ++# Build Intel IPSEC_MB library ++CONFIG_IPSEC_MB=n ++ ++# Enable OCF module ++CONFIG_OCF=n ++CONFIG_OCF_PATH= ++CONFIG_CUSTOMOCF=n ++ ++# Build ISA-L library ++CONFIG_ISAL=y ++ ++# Build ISA-L-crypto library ++CONFIG_ISAL_CRYPTO=y ++ ++# Build with IO_URING support ++CONFIG_URING=n ++ ++# Build IO_URING bdev with ZNS support ++CONFIG_URING_ZNS=n ++ ++# Path to custom built IO_URING library ++CONFIG_URING_PATH= ++ ++# Path to custom built OPENSSL library ++CONFIG_OPENSSL_PATH= ++ ++# Build with FUSE support ++CONFIG_FUSE=n ++ ++# Build with RAID5f support ++CONFIG_RAID5F=n ++ ++# Build with IDXD support ++# In this mode, SPDK fully controls the DSA device. ++CONFIG_IDXD=n ++ ++# Build with USDT support ++CONFIG_USDT=n ++ ++# Build with IDXD kernel support. ++# In this mode, SPDK shares the DSA device with the kernel. ++CONFIG_IDXD_KERNEL=n ++ ++# arc4random is available in stdlib.h ++CONFIG_HAVE_ARC4RANDOM=n ++ ++# Is DPDK using libbsd? ++CONFIG_HAVE_LIBBSD=n ++ ++# Is DPDK using libarchive? ++CONFIG_HAVE_LIBARCHIVE=n ++ ++# Path to IPSEC_MB used by DPDK ++CONFIG_IPSEC_MB_DIR= ++ ++# Generate Storage Management Agent's protobuf interface ++CONFIG_SMA=n ++ ++# Build with Avahi support ++CONFIG_AVAHI=n +diff --git a/app/Makefile b/app/Makefile +index 4d02c60..3c4ca1c 100644 +--- a/app/Makefile ++++ b/app/Makefile +@@ -1,29 +1,30 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y += trace +-DIRS-y += trace_record +-DIRS-y += nvmf_tgt +-DIRS-y += iscsi_tgt +-DIRS-y += spdk_tgt +-DIRS-y += spdk_lspci +-ifneq ($(OS),Windows) +-# TODO - currently disabled on Windows due to lack of support for curses +-DIRS-y += spdk_top +-endif +-ifeq ($(OS),Linux) +-DIRS-$(CONFIG_VHOST) += vhost +-DIRS-y += spdk_dd +-endif +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y += trace ++DIRS-y += trace_record ++DIRS-y += nvmf_tgt ++DIRS-y += iscsi_tgt ++DIRS-y += spdk_tgt ++DIRS-y += spdk_lspci ++DIRS-y += ssam ++ifneq ($(OS),Windows) ++# TODO - currently disabled on Windows due to lack of support for curses ++DIRS-y += spdk_top ++endif ++ifeq ($(OS),Linux) ++DIRS-$(CONFIG_VHOST) += vhost ++DIRS-y += spdk_dd ++endif ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/app/ssam/Makefile b/app/ssam/Makefile +new file mode 100644 +index 0000000..2b9ae3f +--- /dev/null ++++ b/app/ssam/Makefile +@@ -0,0 +1,58 @@ ++# ++# BSD LICENSE ++# ++# Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions ++# are met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in ++# the documentation and/or other materials provided with the ++# distribution. 
++# * Neither the name of Intel Corporation nor the names of its ++# contributors may be used to endorse or promote products derived ++# from this software without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk ++ ++APP = ssam ++ ++C_SRCS := ssam.c ++ ++SYS_LIBS += -ldpak_ssam -lcap ++SPDK_LIB_LIST = $(ALL_MODULES_LIST) event_ssam event ssam ssam_adapter ++ ++ifeq ($(OS),Linux) ++SPDK_LIB_LIST += event_nbd ++endif ++ ++ifeq ($(SPDK_ROOT_DIR)/lib/env_dpdk,$(CONFIG_ENV)) ++SPDK_LIB_LIST += env_dpdk_rpc ++endif ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.app.mk ++ ++install: $(APP) ++ $(INSTALL_APP) ++ ++uninstall: ++ $(UNINSTALL_APP) +diff --git a/app/ssam/ssam.c b/app/ssam/ssam.c +new file mode 100644 +index 0000000..3872330 +--- /dev/null ++++ b/app/ssam/ssam.c +@@ -0,0 +1,103 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include "spdk/ssam.h" ++#include "spdk/string.h" ++ ++#define IOVA_MODE_PA "pa" ++ ++static bool g_start_flag = false; ++ ++bool spdk_ssam_is_starting(void) ++{ ++ return g_start_flag; ++} ++ ++static void ++ssam_started(void *ctx) ++{ ++ ssam_poller_start(); ++ SPDK_NOTICELOG("hot restart %d\n", spdk_ssam_get_hot_restart()); ++ spdk_ssam_set_hot_restart(false); ++ g_start_flag = false; ++ SPDK_NOTICELOG("%s server started.\n", SSAM_SERVER_NAME); ++} ++ ++int ++main(int argc, char *argv[]) ++{ ++ struct spdk_app_opts opts = {}; ++ int rc; ++ int shm_id; ++ ++ spdk_app_opts_init(&opts, sizeof(opts)); ++ opts.name = SSAM_SERVER_NAME; ++ opts.iova_mode = IOVA_MODE_PA; ++ opts.num_entries = 0; ++ g_start_flag = true; ++ ++ rc = spdk_ssam_user_config_init(); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam user config init failed: %s \n", spdk_strerror(-rc)); ++ exit(rc); ++ } ++ ++ shm_id = shm_open(SSAM_SHM, O_RDWR, SSAM_SHM_PERMIT); ++ if (shm_id < 0) { ++ SPDK_NOTICELOG("ssam share memory hasn't been created.\n"); ++ g_start_flag = false; ++ } else { ++ ssam_set_shm_created(true); ++ SPDK_NOTICELOG("ssam share memory has been created.\n"); ++ } ++ ++ rc = ssam_rc_preinit(); ++ if (rc < 0) { ++ exit(rc); ++ } ++ ++ rc = spdk_app_parse_args(argc, argv, &opts, NULL, NULL, NULL, NULL); ++ if (rc != SPDK_APP_PARSE_ARGS_SUCCESS) { ++ SPDK_ERRLOG("spdk app parse args fail: %d \n", rc); ++ exit(rc); ++ } ++ spdk_ssam_set_hot_restart(opts.hot_restart); ++ ++ /* Blocks until the application is exiting */ ++ rc = spdk_app_start(&opts, ssam_started, NULL); ++ spdk_ssam_exit(); ++ ++ spdk_app_fini(); ++ SPDK_NOTICELOG("%s server exited.\n", SSAM_SERVER_NAME); ++ ++ return rc; ++} +diff --git a/configure b/configure +index 5935ff5..c2b81a1 100644 +--- a/configure ++++ b/configure +@@ -1,1336 +1,1362 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation +-# All rights reserved. +-# Copyright (c) 2022 Dell Inc, or its subsidiaries. +-# +- +-set -e +- +-trap 'echo -e "\n\nConfiguration failed\n\n" >&2' ERR +- +-rootdir=$(readlink -f $(dirname $0)) +-source "$rootdir/scripts/common.sh" +- +-function usage() { +- echo "'configure' configures SPDK to compile on supported platforms." +- echo "" +- echo "Usage: ./configure [OPTION]..." +- echo "" +- echo "Defaults for the options are specified in brackets." +- echo "" +- echo "General:" +- echo " -h, --help Display this help and exit" +- echo "" +- echo " --prefix=path Configure installation prefix (default: /usr/local)" +- echo " --target-arch=arch Target build architecture. Must be a valid GNU arch. 
Default: native" +- echo "" +- echo " --cross-prefix=prefix Prefix for cross compilation (default: none)" +- echo " example: aarch64-linux-gnu" +- echo " --libdir=path Configure installation path for the libraries (default: \$prefix/lib)" +- echo "" +- echo " --enable-debug Configure for debug builds" +- echo " --enable-werror Treat compiler warnings as errors" +- echo " --enable-asan Enable address sanitizer" +- echo " --enable-ubsan Enable undefined behavior sanitizer" +- echo " --enable-coverage Enable code coverage tracking" +- echo " --enable-lto Enable link-time optimization" +- echo " --enable-pgo-capture Enable generation of profile guided optimization data" +- echo " --enable-pgo-use Use previously captured profile guided optimization data" +- echo " --enable-cet Enable Intel Control-flow Enforcement Technology (CET)" +- echo " --disable-tests Disable building of functional tests" +- echo " --disable-unit-tests Disable building of unit tests" +- echo " --disable-examples Disable building of examples" +- echo " --disable-apps Disable building of apps" +- echo "" +- echo "Specifying Dependencies:" +- echo "--with-DEPENDENCY[=path] Use the given dependency. Optionally, provide the" +- echo " path." +- echo "--without-DEPENDENCY Do not link to the given dependency. This may" +- echo " disable features and components." +- echo "" +- echo "Valid dependencies are listed below." +- echo " --with-dpdk[=DIR] Build against a custom dpdk version. By default, the dpdk" +- echo " --without-dpdk submodule in spdk tree will be used." +- echo " example: /usr/share/dpdk/x86_64-default-linuxapp-gcc" +- echo " --with-env=DIR Use an alternate environment implementation instead of DPDK." +- echo " Implies --without-dpdk." +- echo " --with-idxd Build the IDXD library and accel framework plug-in module." +- echo " --without-idxd Disabled while experimental. Only built for x86 when enabled." +- echo " --with-crypto Build isa-l-crypto and vbdev crypto module. No path required." +- echo " --without-crypto Disable isa-l-crypto and vbdev crypto module." +- echo " --with-fio[=DIR] Build fio_plugin." +- echo " --without-fio default: /usr/src/fio" +- echo " --with-xnvme Build xNVMe bdev module." +- echo " --without-xnvme No path required." +- echo " --with-vhost Build vhost target. Enabled by default." +- echo " --without-vhost No path required." +- echo " --with-virtio Build vhost initiator and virtio-pci bdev modules." +- echo " --without-virtio No path required." +- echo " --with-vfio-user[=DIR] Build custom vfio-user transport for NVMf target and vfio-user target." +- echo " vfio-user initiator is always built-in in Linux." +- echo " example: /usr/src/libvfio-user" +- echo " --without-vfio-user No path required." +- echo " --with-pmdk[=DIR] Build persistent memory bdev. (Deprecated) +- example: /usr/share/pmdk" +- echo " --without-pmdk No path required." +- echo " --with-vbdev-compress Build vbdev compression module and dependencies." +- echo " --without-vbdev-compress No path required." +- echo " --with-dpdk-compressdev Build accel DPDK compression module and dependencies." +- echo " --without-dpdk-compressdev No path required." +- echo " --with-rbd Build Ceph RBD bdev module." +- echo " --without-rbd No path required." +- echo " --with-ublk Build ublk library." +- echo " --without-ublk No path required." +- echo " --with-rdma[=DIR] Build RDMA transport for NVMf target and initiator." +- echo " --without-rdma Accepts optional RDMA provider name. Can be \"verbs\" or \"mlx5_dv\"." 
+- echo " If no provider specified, \"verbs\" provider is used by default." +- echo " --with-fc[=DIR] Build FC transport for NVMf target." +- echo " --without-fc If an argument is provided, it is considered a directory containing" +- echo " libufc.a and fc_lld.h. Otherwise the regular system paths will" +- echo " be searched." +- echo " --with-daos[=DIR] Build DAOS bdev module." +- echo " --without-daos No path required." +- echo " --with-shared Build spdk shared libraries." +- echo " --without-shared No path required." +- echo " --with-iscsi-initiator Build with iscsi bdev module." +- echo " --without-iscsi-initiator No path required." +- echo " --with-vtune=DIR Required to profile I/O under Intel VTune Amplifier XE. (Deprecated)" +- echo " --without-vtune example: /opt/intel/vtune_amplifier_xe_version" +- echo " --with-ocf[=DIR] Build OCF library and bdev module. (Deprecated)" +- echo " --without-ocf If argument is directory, interpret it as root of OCF repo" +- echo " If argument is file, interpret it as compiled OCF lib" +- echo " If no argument is specified, OCF git submodule is used by default" +- echo " example: /usr/src/ocf/" +- echo " --with-uring[=DIR] Build I/O uring bdev or socket module." +- echo " --without-uring If an argument is provided, it is considered a directory containing" +- echo " liburing.a and io_uring.h. Otherwise the regular system paths will" +- echo " be searched." +- echo " --without-uring-zns Build I/O uring module without ZNS (zoned namespaces) support." +- echo " --with-openssl[=DIR] Build OPENSSL with custom path. Otherwise the regular system paths will" +- echo " be searched." +- echo " --with-fuse Build FUSE components for mounting a blobfs filesystem." +- echo " --without-fuse No path required." +- echo " --with-nvme-cuse Build NVMe driver with support for CUSE-based character devices." +- echo " --without-nvme-cuse No path required." +- echo " --with-raid5f Build with bdev_raid module RAID5f support." +- echo " --without-raid5f No path required." +- echo " --with-wpdk=DIR Build using WPDK to provide support for Windows (experimental)." +- echo " --without-wpdk The argument must be a directory containing lib and include." +- echo " --with-usdt Build with userspace DTrace probes enabled." +- echo " --without-usdt No path required." +- echo " --with-fuzzer Build with LLVM fuzzing enabled." +- echo " Path to clang_rt.fuzzer_no_main library required." +- echo " Requires setting CC and CXX to clang." +- echo " (Typically /usr/lib/llvm-VER/lib/clang/VER/lib/linux/libclang_rt.fuzzer_no_main-ARCH.a)" +- echo " --with-sma Generate Storage Management Agent's protobuf interface" +- echo " --without-sma No path required." +- echo " --with-avahi Build with Avahi mDNS discovery client service enabled in bdev-nvme module." +- echo " --without-avahi No path required." +- echo "" +- echo "Environment variables:" +- echo "" +- echo "CC C compiler" +- echo "CFLAGS C compiler flags" +- echo "CXX C++ compiler" +- echo "CXXFLAGS C++ compiler flags" +- echo "LD Linker" +- echo "LDFLAGS Linker flags" +- echo "DESTDIR Destination for 'make install'" +- echo "" +-} +- +-# Load default values +-# Convert config to sourceable configuration file +-sed -r 's/CONFIG_([[:alnum:]_]+)=(.*)/CONFIG[\1]=\2/g' $rootdir/CONFIG > $rootdir/CONFIG.sh +-declare -A CONFIG +-source $rootdir/CONFIG.sh +-rm $rootdir/CONFIG.sh +- +-# Try to expand literal ~ that might have been passed as an option via --long-opt=~/dir. 
+-set -- "${@//\~/~}" +- +-for i in "$@"; do +- case "$i" in +- --cross-prefix=*) +- CONFIG[CROSS_PREFIX]="${i#*=}" +- ;; +- --enable-lto) +- CONFIG[LTO]=y +- ;; +- --disable-lto) +- CONFIG[LTO]=n +- ;; +- esac +-done +- +-# Detect the compiler toolchain +-$rootdir/scripts/detect_cc.sh --cc="$CC" --cxx="$CXX" --lto="${CONFIG[LTO]}" --ld="$LD" --cross-prefix="${CONFIG[CROSS_PREFIX]}" > $rootdir/mk/cc.mk +- +-CC=$(grep "DEFAULT_CC=" "$rootdir/mk/cc.mk" | sed s/DEFAULT_CC=//) +-CC_TYPE=$(grep "CC_TYPE=" "$rootdir/mk/cc.mk" | cut -d "=" -f 2) +- +-arch=$($CC -dumpmachine) +-sys_name=$(uname -s) +- +-if [[ $arch == *mingw* ]] || [[ $arch == *windows* ]]; then +- sys_name=Windows +-fi +- +-if [[ $sys_name != "Linux" ]]; then +- # Vhost, rte_vhost library and virtio are only supported on Linux. +- CONFIG[VHOST]="n" +- CONFIG[VIRTIO]="n" +- echo "Notice: Vhost, rte_vhost library and virtio are only supported on Linux. Turning off default feature." +-fi +- +-function check_dir() { +- arg="$1" +- dir="${arg#*=}" +- if [ ! -d "$dir" ]; then +- echo "$arg: directory not found" +- exit 1 +- fi +-} +- +-# On x86_64 'clang -dumpmachine' produces x86_64-pc-linux-gnu +-# whereas the dpdk might be built with gcc and its libs lie in +-# x86_64-linux-gnu. Let's find the right libdir for dpdk libs. +-function find_dpdk_arch_libdir() { +- local dpdk_libdir="$1/lib" +- +- # Use libdir with 'lib' or 'lib64' +- if [[ ! -d "$dpdk_libdir" ]]; then +- dpdk_libdir+="64" +- fi +- +- # Checking first what we have with $arch, then clang +- # variant of arch. +- arches=("$arch" "$(echo $arch | sed 's/-pc//g')") +- for a in "${arches[@]}"; do +- local libdir_arch="$dpdk_libdir/$a" +- if [[ -d "$libdir_arch" ]]; then +- echo "$libdir_arch" +- return +- fi +- done +- +- # Fallback to the libdir without arch component +- echo "$dpdk_libdir" +-} +- +-function check_IPSec_mb() { +- local mode=$1 +- local dpdk_libdir=$2 +- local dpdk_incdir=$3 +- local have_ipsec_mb=n +- +- if [[ $mode = "pkg-config" ]]; then +- local dpdk_libs +- +- # Request libdpdk pkg-config settings to figure out if the IPSec_MB is used +- # as a dependency. +- # Due to some reason pkg-config shows -lIPSec_MB only with --static option +- dpdk_libs=$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --libs --static libdpdk) +- if echo "$dpdk_libs" | grep "\-lIPSec_MB" > /dev/null 2>&1; then +- have_ipsec_mb=y +- fi +- elif [[ $mode = "build-config" ]]; then +- # Use dpdk build config header to check if the IPSec_MB was used. +- if grep -F "define RTE_CRYPTO_IPSEC_MB 1" "$dpdk_incdir/rte_build_config.h" > /dev/null 2>&1; then +- have_ipsec_mb=y +- fi +- else +- echo "ERROR: Invalid IPSec_MB checking mode $mode." +- echo "ERROR: Only \"pkg-config\" and \"build-config\" available." +- exit 1 +- fi +- if [[ $have_ipsec_mb = "n" ]]; then +- CONFIG[IPSEC_MB]=n +- return +- fi +- +- # Since we don't know the library path where the IPSec_MB is located +- # let's find it out with the ldd utility. This can be a standard location +- # or a custom build. 
+- local librte_crypto_ipsec_mb="$dpdk_libdir/librte_crypto_ipsec_mb.so" +- if [[ -f "$librte_crypto_ipsec_mb" ]]; then +- local ipsec_mb_libdir +- +- ipsec_mb_libdir=$(ldd "$librte_crypto_ipsec_mb" | grep "libIPSec_MB.so" \ +- | sed -e 's/\s*libIPSec_MB.so.*=>\s//' -e 's/\/libIPSec_MB.so.*$//') +- if [[ -d $ipsec_mb_libdir ]]; then +- CONFIG[IPSEC_MB]=y +- CONFIG[IPSEC_MB_DIR]="$ipsec_mb_libdir" +- elif [[ $ipsec_mb_libdir = "not found" ]]; then +- # ldconfig cache is broken, old build with refs to non-existing libs, etc. +- echo "ERROR: Invalid IPSec_MB installation. Library is not found and/or ldconfig cache is broken!" +- exit 1 +- else +- # Failed to check for IPSec_MB lib path. Let's just assume it is lives +- # in one of the standard locations (/usr/lib, etc.). +- CONFIG[IPSEC_MB]=y +- fi +- else +- # pkg-config says there is IPSec_mb and dpdk lib does not have it. Let's just +- # assume it is installed in the system in one of the standard locations. +- CONFIG[IPSEC_MB]=y +- fi +-} +- +-for i in "$@"; do +- case "$i" in +- -h | --help) +- usage +- exit 0 +- ;; +- --cross-prefix=*) ;& +- --enable-lto) ;& +- --disable-lto) +- # Options handled before detecting CC. +- ;; +- --prefix=*) +- CONFIG[PREFIX]="${i#*=}" +- ;; +- --target-arch=*) +- CONFIG[ARCH]="${i#*=}" +- ;; +- --libdir=*) +- CONFIG[LIBDIR]="${i#*=}" +- ;; +- --enable-debug) +- CONFIG[DEBUG]=y +- ;; +- --disable-debug) +- CONFIG[DEBUG]=n +- ;; +- --enable-asan) +- CONFIG[ASAN]=y +- ;; +- --disable-asan) +- CONFIG[ASAN]=n +- ;; +- --enable-ubsan) +- CONFIG[UBSAN]=y +- ;; +- --disable-ubsan) +- CONFIG[UBSAN]=n +- ;; +- --enable-tsan) +- CONFIG[TSAN]=y +- ;; +- --disable-tsan) +- CONFIG[TSAN]=n +- ;; +- --enable-coverage) +- CONFIG[COVERAGE]=y +- ;; +- --disable-coverage) +- CONFIG[COVERAGE]=n +- ;; +- --enable-pgo-capture) +- CONFIG[PGO_CAPTURE]=y +- ;; +- --disable-pgo-capture) +- CONFIG[PGO_CAPTURE]=n +- ;; +- --enable-pgo-use) +- CONFIG[PGO_USE]=y +- ;; +- --disable-pgo-use) +- CONFIG[PGO_USE]=n +- ;; +- --enable-tests) +- CONFIG[TESTS]=y +- ;; +- --disable-tests) +- CONFIG[TESTS]=n +- ;; +- --enable-unit-tests) +- CONFIG[UNIT_TESTS]=y +- ;; +- --disable-unit-tests) +- CONFIG[UNIT_TESTS]=n +- ;; +- --enable-examples) +- CONFIG[EXAMPLES]=y +- ;; +- --disable-examples) +- CONFIG[EXAMPLES]=n +- ;; +- --enable-apps) +- CONFIG[APPS]=y +- ;; +- --disable-apps) +- CONFIG[APPS]=N +- ;; +- --enable-werror) +- CONFIG[WERROR]=y +- ;; +- --disable-werror) +- CONFIG[WERROR]=n +- ;; +- --enable-cet) +- CONFIG[CET]=y +- ;; +- --disable-cet) +- CONFIG[CET]=n +- ;; +- --with-dpdk) +- # Can we use pkg-config? +- if command -v "pkg-config" > /dev/null 2>&1 && pkg-config --exists libdpdk; then +- dpdk_libdir=$(pkg-config --variable=libdir libdpdk) +- dpdk_libdir=$(readlink -f $dpdk_libdir) +- dpdk_incdir=$(pkg-config --variable=includedir libdpdk) +- echo "Using DPDK lib dir $dpdk_libdir" +- CONFIG[DPDK_LIB_DIR]=$dpdk_libdir +- CONFIG[DPDK_INC_DIR]=$dpdk_incdir +- CONFIG[DPDK_PKG_CONFIG]=y +- CFLAGS="${CFLAGS:+$CFLAGS }$(pkg-config --cflags libdpdk)" +- check_IPSec_mb "pkg-config" "$dpdk_libdir" "$dpdk_incdir" +- else +- echo "libdpdk.pc not found, aborting" +- exit 1 +- fi +- ;; +- --with-dpdk=*) +- check_dir "$i" +- dpdk_dir=$(readlink -f ${i#*=}) +- dpdk_libdir=$(find_dpdk_arch_libdir $dpdk_dir) +- dpdk_incdir="$dpdk_dir/include" +- +- # Can we use pkg-config? 
+- if command -v "pkg-config" > /dev/null 2>&1 && PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --exists libdpdk; then +- echo "Using $dpdk_libdir/pkgconfig for additional libs..." +- sysroot_dir=$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=pc_sysrootdir libdpdk) +- dpdk_libdir=$(PKG_CONFIG_SYSROOT_DIR='' PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=libdir libdpdk) +- dpdk_libdir=$(readlink -f "${sysroot_dir}$dpdk_libdir") +- if ! echo $dpdk_libdir | grep $dpdk_dir > /dev/null 2>&1; then +- echo "ERROR: pkg-config reported DPDK libdir $dpdk_libdir is out of the directory specified with --with-dpdk=" +- echo "ERROR: do you have another DPDK installed in the system?" +- exit 1 +- fi +- CFLAGS="${CFLAGS:+$CFLAGS }$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --cflags libdpdk)" +- dpdk_incdir="${sysroot_dir}$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=includedir libdpdk)" +- check_IPSec_mb "pkg-config" "$dpdk_libdir" "$dpdk_incdir" +- else +- echo "Using $dpdk_incdir/rte_build_config.h for additional libs..." +- +- check_IPSec_mb "build-config" "$dpdk_libdir" "$dpdk_incdir" +- fi +- echo "DPDK libraries: $dpdk_libdir" +- echo "DPDK includes: $dpdk_incdir" +- CONFIG[DPDK_DIR]=$dpdk_dir +- CONFIG[DPDK_LIB_DIR]="$dpdk_libdir" +- CONFIG[DPDK_INC_DIR]="$dpdk_incdir" +- CONFIG[DPDK_PKG_CONFIG]=n +- ;; +- --without-dpdk) +- CONFIG[DPDK_DIR]= +- ;; +- --with-wpdk=*) +- check_dir "$i" +- CONFIG[WPDK_DIR]=$(readlink -f ${i#*=}) +- ;; +- --without-wpdk) +- CONFIG[WPDK_DIR]= +- ;; +- --with-env=*) +- CONFIG[ENV]="${i#*=}" +- ;; +- --with-ublk) +- CONFIG[UBLK]=y +- ;; +- --without-ublk) +- CONFIG[UBLK]=n +- ;; +- --with-rbd) +- CONFIG[RBD]=y +- ;; +- --without-rbd) +- CONFIG[RBD]=n +- ;; +- --with-rdma=*) +- CONFIG[RDMA]=y +- CONFIG[RDMA_PROV]=${i#*=} +- ;; +- --with-rdma) +- CONFIG[RDMA]=y +- CONFIG[RDMA_PROV]="verbs" +- ;; +- --without-rdma) +- CONFIG[RDMA]=n +- ;; +- --with-fc=*) +- CONFIG[FC]=y +- CONFIG[FC_PATH]=$(readlink -f ${i#*=}) +- ;; +- --with-fc) +- CONFIG[FC]=y +- CONFIG[FC_PATH]= +- ;; +- --without-fc) +- CONFIG[FC]=n +- CONFIG[FC_PATH]= +- ;; +- --with-daos) +- CONFIG[DAOS]=y +- CONFIG[DAOS_DIR]="" +- ;; +- --with-daos=*) +- CONFIG[DAOS]=y +- check_dir "$i" +- CONFIG[DAOS_DIR]=$(readlink -f ${i#*=}) +- ;; +- --without-daos) +- CONFIG[DAOS]=n +- ;; +- --with-shared) +- CONFIG[SHARED]=y +- ;; +- --without-shared) +- CONFIG[SHARED]=n +- ;; +- --with-iscsi-initiator) +- CONFIG[ISCSI_INITIATOR]=y +- ;; +- --without-iscsi-initiator) +- CONFIG[ISCSI_INITIATOR]=n +- ;; +- --with-crypto) +- CONFIG[CRYPTO]=y +- ;; +- --without-crypto) +- CONFIG[CRYPTO]=n +- ;; +- --with-vhost) +- CONFIG[VHOST]=y +- ;; +- --without-vhost) +- CONFIG[VHOST]=n +- ;; +- --with-virtio) +- CONFIG[VIRTIO]=y +- ;; +- --without-virtio) +- CONFIG[VIRTIO]=n +- ;; +- --with-vfio-user) +- CONFIG[VFIO_USER]=y +- CONFIG[VFIO_USER_DIR]="" +- ;; +- --with-vfio-user=*) +- CONFIG[VFIO_USER]=y +- check_dir "$i" +- CONFIG[VFIO_USER_DIR]=$(readlink -f ${i#*=}) +- ;; +- --without-vfio-user) +- CONFIG[VFIO_USER]=n +- ;; +- --with-pmdk) +- CONFIG[PMDK]=y +- CONFIG[PMDK_DIR]="" +- ;; +- --with-pmdk=*) +- CONFIG[PMDK]=y +- check_dir "$i" +- CONFIG[PMDK_DIR]=$(readlink -f ${i#*=}) +- ;; +- --without-pmdk) +- CONFIG[PMDK]=n +- ;; +- --with-vbdev-compress) +- CONFIG[VBDEV_COMPRESS]=y +- ;; +- --without-vbdev-compress) +- CONFIG[VBDEV_COMPRESS]=n +- ;; +- --with-dpdk-compressdev) +- 
CONFIG[DPDK_COMPRESSDEV]=y +- ;; +- --without-dpdk-compressdev) +- CONFIG[DPDK_COMPRESSDEV]=n +- ;; +- --with-xnvme) +- CONFIG[XNVME]=y +- ;; +- --without-xnvme) +- CONFIG[XNVME]=n +- ;; +- --with-fio) ;& +- --with-fio=*) +- if [[ ${i#*=} != "$i" ]]; then +- CONFIG[FIO_SOURCE_DIR]=${i#*=} +- fi +- check_dir "--with-fio=${CONFIG[FIO_SOURCE_DIR]}" +- CONFIG[FIO_SOURCE_DIR]=$(readlink -f "${CONFIG[FIO_SOURCE_DIR]}") +- CONFIG[FIO_PLUGIN]=y +- ;; +- --without-fio) +- CONFIG[FIO_PLUGIN]=n +- ;; +- --with-vtune=*) +- check_dir "$i" +- CONFIG[VTUNE_DIR]="${i#*=}" +- CONFIG[VTUNE]=y +- ;; +- --without-vtune) +- CONFIG[VTUNE_DIR]= +- CONFIG[VTUNE]=n +- ;; +- --with-ocf) +- CONFIG[OCF]=y +- CONFIG[OCF_PATH]=$(readlink -f "$rootdir/ocf") +- ;; +- --with-ocf=*) +- CONFIG[OCF]=y +- CONFIG[OCF_PATH]=$(readlink -f ${i#*=}) +- ;; +- --without-ocf) +- CONFIG[OCF]=n +- CONFIG[OCF_PATH]= +- ;; +- --with-uring=*) +- CONFIG[URING]=y +- CONFIG[URING_PATH]=$(readlink -f ${i#*=}) +- ;; +- --with-uring) +- CONFIG[URING]=y +- CONFIG[URING_ZNS]=y +- CONFIG[URING_PATH]= +- ;; +- --without-uring) +- CONFIG[URING]=n +- CONFIG[URING_PATH]= +- ;; +- --without-uring-zns) +- CONFIG[URING_ZNS]=n +- ;; +- --with-openssl=*) +- check_dir "$i" +- CONFIG[OPENSSL_PATH]=$(readlink -f ${i#*=}) +- ;; +- --with-fuse) +- CONFIG[FUSE]=y +- ;; +- --without-fuse) +- CONFIG[FUSE]=n +- ;; +- --with-nvme-cuse) +- CONFIG[NVME_CUSE]=y +- ;; +- --without-nvme-cuse) +- CONFIG[NVME_CUSE]=n +- ;; +- --with-raid5f) +- CONFIG[RAID5F]=y +- ;; +- --without-raid5f) +- CONFIG[RAID5F]=n +- ;; +- --with-idxd) +- CONFIG[IDXD]=y +- CONFIG[IDXD_KERNEL]=n +- ;; +- --without-idxd) +- CONFIG[IDXD]=n +- ;; +- --with-usdt) +- CONFIG[USDT]=y +- ;; +- --without-usdt) +- CONFIG[USDT]=n +- ;; +- --with-fuzzer) +- echo "Must specify fuzzer library path with --with-fuzzer" +- usage +- exit 1 +- ;; +- --with-fuzzer=*) +- CONFIG[FUZZER]=y +- CONFIG[FUZZER_LIB]=$(readlink -f ${i#*=}) +- ;; +- --without-fuzzer) +- CONFIG[FUZZER]=n +- CONFIG[FUZZER_LIB]= +- ;; +- --with-sma) +- CONFIG[SMA]=y +- ;; +- --without-sma) +- CONFIG[SMA]=n +- ;; +- --with-avahi) +- CONFIG[AVAHI]=y +- ;; +- --without-avahi) +- CONFIG[AVAHI]=n +- ;; +- --) +- break +- ;; +- *) +- echo "Unrecognized option $i" +- usage +- exit 1 +- ;; +- esac +-done +- +-if [[ $arch == x86_64* ]]; then +- BUILD_CMD=($CC -o /dev/null -x c $CPPFLAGS $CFLAGS $LDFLAGS "-march=native") +-else +- BUILD_CMD=($CC -o /dev/null -x c $CPPFLAGS $CFLAGS $LDFLAGS) +-fi +-BUILD_CMD+=(-I/usr/local/include -L/usr/local/lib) +- +-if [[ "${CONFIG[VFIO_USER]}" = "y" ]]; then +- if ! echo -e '#include ' \ +- | "${BUILD_CMD[@]}" -E - 2> /dev/null; then +- echo "ERROR: --with-vfio-user requires json-c-devel" +- echo "Please install then re-run this script" +- exit 1 +- fi +- if ! echo -e '#include ' \ +- | "${BUILD_CMD[@]}" -E - 2> /dev/null; then +- echo "ERROR: --with-vfio-user requires libcmocka-devel" +- echo "Please install then re-run this script" +- exit 1 +- fi +-fi +- +-# IDXD uses Intel specific instructions. +-if [[ "${CONFIG[IDXD]}" = "y" ]]; then +- if [ $(uname -s) == "FreeBSD" ]; then +- intel="hw.model: Intel" +- cpu_vendor=$(sysctl -a | grep hw.model | cut -c 1-15) +- else +- intel="GenuineIntel" +- cpu_vendor=$(grep -i 'vendor' /proc/cpuinfo --max-count=1) +- fi +- if [[ "$cpu_vendor" != *"$intel"* ]]; then +- echo "ERROR: IDXD cannot be used due to CPU incompatibility." 
+- exit 1 +- fi +- if [ -e /usr/include/accel-config/libaccel_config.h ]; then +- CONFIG[IDXD_KERNEL]=y +- fi +- +-fi +- +-if [ -z "${CONFIG[ENV]}" ]; then +- CONFIG[ENV]=$rootdir/lib/env_dpdk +- echo "Using default SPDK env in ${CONFIG[ENV]}" +- if [[ -z "${CONFIG[DPDK_DIR]}" && "${CONFIG[DPDK_PKG_CONFIG]}" == n ]]; then +- if [ ! -f "$rootdir"/dpdk/config/meson.build ]; then +- echo "DPDK not found; please specify --with-dpdk= or run:" +- echo +- echo " git submodule update --init" +- exit 1 +- else +- CONFIG[DPDK_DIR]="${rootdir}/dpdk/build" +- # Default ipsec libs +- if [[ "${CONFIG[CRYPTO]}" = "y" ]] && [[ $arch = x86_64* ]]; then +- CONFIG[IPSEC_MB]=y +- CONFIG[IPSEC_MB_DIR]="${rootdir}/intel-ipsec-mb/lib" +- fi +- echo "Using default DPDK in ${CONFIG[DPDK_DIR]}" +- fi +- fi +-else +- if [[ -n "${CONFIG[DPDK_DIR]}" || "${CONFIG[DPDK_PKG_CONFIG]}" == y ]]; then +- echo "--with-env and --with-dpdk are mutually exclusive." +- exit 1 +- fi +- +- if [ "${CONFIG[VHOST]}" = "y" ]; then +- echo "Vhost is only supported when using the default DPDK environment. Disabling it." +- fi +- # Always disable vhost, but only print the error message if the user explicitly turned it on. +- CONFIG[VHOST]="n" +- if [ "${CONFIG[VIRTIO]}" = "y" ]; then +- echo "Virtio is only supported when using the default DPDK environment. Disabling it." +- fi +- # Always disable virtio, but only print the error message if the user explicitly turned it on. +- CONFIG[VIRTIO]="n" +-fi +- +-if [[ "${CONFIG[DPDK_PKG_CONFIG]}" == y ]]; then +- if [[ "${CONFIG[SHARED]}" == n ]]; then +- # dpdk-devel doesn't provide static libs +- echo "Build against packaged DPDK requested, enabling shared libraries" +- CONFIG[SHARED]=y +- fi +-fi +- +-if [[ $sys_name == "Windows" ]]; then +- if [ -z "${CONFIG[WPDK_DIR]}" ]; then +- if [ ! -f "$rootdir"/wpdk/Makefile ]; then +- echo "WPDK not found; please specify --with-wpdk=. See https://wpdk.github.io." +- exit 1 +- else +- CONFIG[WPDK_DIR]="${rootdir}/wpdk/build" +- echo "Using default WPDK in ${CONFIG[WPDK_DIR]}" +- fi +- fi +-else +- if [ -n "${CONFIG[WPDK_DIR]}" ]; then +- echo "ERROR: --with-wpdk is only supported for Windows" +- exit 1 +- fi +-fi +- +-if [ "${CONFIG[VTUNE]}" = "y" ]; then +- echo "WARNING: VTune support is deprecated." +- if [ -z "${CONFIG[VTUNE_DIR]}" ]; then +- echo "When VTune is enabled, you must specify the VTune directory using --with-vtune=path" +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[ASAN]}" = "y" && "${CONFIG[TSAN]}" = "y" ]]; then +- echo "ERROR: ASAN and TSAN cannot be enabled at the same time." +- exit 1 +-fi +- +-if [[ "${CONFIG[FIO_PLUGIN]}" = "y" && "${CONFIG[EXAMPLES]}" = "n" ]]; then +- echo "ERROR: --with-fio and --disable-examples are mutually exclusive." +- exit 1 +-fi +- +-if [[ $sys_name == "FreeBSD" ]]; then +- # FreeBSD doesn't support all configurations +- if [[ "${CONFIG[COVERAGE]}" == "y" ]]; then +- echo "ERROR: CONFIG_COVERAGE not available on FreeBSD" +- exit 1 +- fi +-fi +- +-if [[ $sys_name == "Linux" ]]; then +- if pkg-config libbsd; then +- CONFIG[HAVE_LIBBSD]=y +- fi +-fi +- +-if pkg-config libarchive; then +- CONFIG[HAVE_LIBARCHIVE]=y +-fi +- +-if [[ $sys_name != "Linux" ]]; then +- if [[ "${CONFIG[VHOST]}" == "y" ]]; then +- echo "Vhost is only supported on Linux." +- exit 1 +- fi +- if [[ "${CONFIG[VIRTIO]}" == "y" ]]; then +- echo "Virtio is only supported on Linux." +- exit 1 +- fi +-fi +- +-if [ "${CONFIG[RDMA]}" = "y" ]; then +- if [[ ! "${CONFIG[RDMA_PROV]}" == "verbs" ]] && [[ ! 
"${CONFIG[RDMA_PROV]}" == "mlx5_dv" ]]; then +- echo "Invalid RDMA provider specified, must be \"verbs\" or \"mlx5_dv\"" +- exit 1 +- fi +- +- if ! echo -e '#include \n#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -libverbs -lrdmacm - 2> /dev/null; then +- echo "--with-rdma requires libverbs and librdmacm." +- echo "Please install then re-run this script." +- exit 1 +- fi +- +- if echo -e '#include \n' \ +- 'int main(void) { return !!IBV_WR_SEND_WITH_INV; }\n' \ +- | "${BUILD_CMD[@]}" -c - 2> /dev/null; then +- CONFIG[RDMA_SEND_WITH_INVAL]="y" +- else +- CONFIG[RDMA_SEND_WITH_INVAL]="n" +- echo " +-******************************************************************************* +-WARNING: The Infiniband Verbs opcode Send With Invalidate is either not +-supported or is not functional with the current version of libibverbs installed +-on this system. Please upgrade to at least version 1.1. +- +-Beginning with Linux kernel 4.14, the kernel NVMe-oF initiator leverages Send +-With Invalidate RDMA operations to improve performance. Failing to use the +-Send With Invalidate operation on the NVMe-oF target side results in full +-functionality, but greatly reduced performance. The SPDK NVMe-oF target will +-be unable to leverage that operation using the currently installed version +-of libibverbs, so Linux kernel NVMe-oF initiators based on kernels greater +-than or equal to 4.14 will see significantly reduced performance. +-*******************************************************************************" +- fi +- +- if echo -e '#include \n' \ +- 'int main(void) { return !!RDMA_OPTION_ID_ACK_TIMEOUT; }\n' \ +- | "${BUILD_CMD[@]}" -c - 2> /dev/null; then +- CONFIG[RDMA_SET_ACK_TIMEOUT]="y" +- else +- CONFIG[RDMA_SET_ACK_TIMEOUT]="n" +- echo "RDMA_OPTION_ID_ACK_TIMEOUT is not supported" +- fi +- +- if echo -e '#include \n' \ +- 'int main(void) { return !!RDMA_OPTION_ID_TOS; }\n' \ +- | "${BUILD_CMD[@]}" -c - 2> /dev/null; then +- CONFIG[RDMA_SET_TOS]="y" +- else +- CONFIG[RDMA_SET_TOS]="n" +- echo "RDMA_OPTION_ID_TOS is not supported" +- fi +- +- if [ "${CONFIG[RDMA_PROV]}" == "mlx5_dv" ]; then +- MLX5_DV_BUILD_BUILD_CMD=" +- #include \n +- #include \n +- int main(void) { return rdma_establish(NULL) ||\n +- !!IBV_QP_INIT_ATTR_SEND_OPS_FLAGS || !!MLX5_OPCODE_RDMA_WRITE" +- if [ "${CONFIG[CRYPTO]}" = "y" ]; then +- MLX5_DV_BUILD_BUILD_CMD+="|| !!MLX5DV_CRYPTO_ENGINES_CAP_AES_XTS_SINGLE_BLOCK" +- fi +- MLX5_DV_BUILD_BUILD_CMD+=";}" +- if ! echo -e $MLX5_DV_BUILD_BUILD_CMD | "${BUILD_CMD[@]}" -lmlx5 -I${rootdir}/include -c -; then +- echo "mlx5_dv provider is not supported" +- exit 1 +- fi +- fi +- +- echo "Using '${CONFIG[RDMA_PROV]}' RDMA provider" +-fi +- +-if [[ "${CONFIG[FC]}" = "y" ]]; then +- if [[ -n "${CONFIG[FC_PATH]}" ]]; then +- check_dir "${CONFIG[FC_PATH]}" +- fi +-fi +- +-if [[ "${CONFIG[PMDK]}" = "y" ]]; then +- echo "WARNING: PMDK - bdev_pmem is deprecated." +- echo "WARNING: PMDK - ACCEL_FLAG_PERSISTENT in accel_sw module is deprecated." +- if ! echo -e '#include \nint main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lpmemblk - 2> /dev/null; then +- echo "--with-pmdk requires libpmemblk." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-function dpdk_version() { +- # Check DPDK version to determine if mlx5_pci driver is supported +- local dpdk_ver="none" +- if [[ "${CONFIG[DPDK_DIR]}" == "$rootdir/dpdk/build" ]]; then +- # DPDK_DIR points at our submodule so ./build may not exist yet. 
Use +- # absolute path to lookup the version. +- dpdk_ver=$(< "$rootdir/dpdk/VERSION") +- elif [[ -f "${CONFIG[DPDK_DIR]}"/../VERSION ]]; then +- dpdk_ver=$(< "${CONFIG[DPDK_DIR]}"/../VERSION) +- fi +- echo $dpdk_ver +-} +- +-function mlx5_build() { +- # Check if libmlx5 exists to enable mlx5_pci compress/crypto PMD +- if ! echo -e '#include \n' \ +- '#include \n' \ +- '#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lmlx5 -libverbs -I${rootdir}/include -c - 2> /dev/null; then +- return 1 +- fi +- return 0 +-} +- +-if [[ "${CONFIG[VBDEV_COMPRESS]}" = "y" ]]; then +- echo "WARNING: PMDK - Persistent device support with bdev_compress is deprecated." +- if ! echo -e '#include \nint main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lpmem - 2> /dev/null; then +- echo "--with-vbdev-compress requires libpmem." +- echo "Please install then re-run this script." +- exit 1 +- fi +- # Try to enable mlx5 compress +- CONFIG[VBDEV_COMPRESS_MLX5]="y" +- +- # Check if libmlx5 exists to enable mlx5_pci compress PMD +- if ! mlx5_build; then +- echo "libmlx5 is not found, so disabling DPDK mlx5_pci compress PMD" +- CONFIG[VBDEV_COMPRESS_MLX5]="n" +- else +- if [[ "${CONFIG[DPDK_PKG_CONFIG]}" = "y" ]]; then +- # Check if librte_compress_mlx5 exists in DPDK package +- if [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_compress_mlx5.so ]; then +- echo "librte_compress_mlx5 is not found, so disabling DPDK mlx5_pci compress PMD" +- CONFIG[VBDEV_COMPRESS_MLX5]="n" +- fi +- else +- # Check DPDK version to determine if mlx5_pci driver is supported +- dpdk_ver=$(dpdk_version) +- if [[ $dpdk_ver = "none" ]]; then +- echo "Cannot get DPDK version, so disabling DPDK mlx5_pci compress PMD" +- CONFIG[VBDEV_COMPRESS_MLX5]="n" +- elif [[ -n $dpdk_ver ]] && lt "$dpdk_ver" 21.02.0; then +- # mlx5_pci for compress is supported by DPDK >- 21.02.0 +- echo "DPDK version ${dpdk_ver} doesn't support mlx5_pci compress PMD" +- CONFIG[VBDEV_COMPRESS_MLX5]="n" +- elif [[ -n ${CONFIG[DPDK_LIB_DIR]} ]] && [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_compress_mlx5.so ]; then +- # This is only checked when --with-dpdk or --with-dpdk=* is used +- echo "librte_compress_mlx5 is not found, so disabling DPDK mlx5_pci compress PMD" +- CONFIG[VBDEV_COMPRESS_MLX5]="n" +- fi +- fi +- fi +-fi +- +-if [[ "${CONFIG[CRYPTO]}" = "y" ]]; then +- # Try to enable mlx5 crypto +- CONFIG[CRYPTO_MLX5]="y" +- +- # Check if libmlx5 exists to enable mlx5_pci compress PMD +- if ! mlx5_build; then +- echo "libmlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" +- CONFIG[CRYPTO_MLX5]="n" +- else +- if [[ "${CONFIG[DPDK_PKG_CONFIG]}" = "y" ]]; then +- # Check if librte_crypto_mlx5 exists in DPDK package +- if [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_crypto_mlx5.so ]; then +- echo "librte_crypto_mlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" +- CONFIG[CRYPTO_MLX5]="n" +- fi +- else +- # Check DPDK version to determine if mlx5_pci driver is supported +- dpdk_ver=$(dpdk_version) +- if [[ $dpdk_ver = "none" ]]; then +- echo "Cannot get DPDK version, so disabling DPDK mlx5_pci crypto PMD" +- CONFIG[CRYPTO_MLX5]="n" +- elif [[ -n $dpdk_ver ]] && lt "$dpdk_ver" 21.11.0; then +- # mlx5_pci for crypto is supported by DPDK >- 21.11.0 +- echo "DPDK version ${dpdk_ver} doesn't support mlx5_pci crypto PMD" +- CONFIG[CRYPTO_MLX5]="n" +- elif [[ -n ${CONFIG[DPDK_LIB_DIR]} ]] && [ ! 
-f "${CONFIG[DPDK_LIB_DIR]}"/librte_crypto_mlx5.so ]; then +- # This is only checked when --with-dpdk or --with-dpdk=* is used +- echo "librte_crypto_mlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" +- CONFIG[CRYPTO_MLX5]="n" +- fi +- fi +- fi +-fi +- +-if [[ "${CONFIG[NVME_CUSE]}" = "y" ]]; then +- if ! echo -e '#define FUSE_USE_VERSION 31\n#include \n#include \n#include \nint main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lfuse3 -D_FILE_OFFSET_BITS=64 - 2> /dev/null; then +- echo "--with-cuse requires libfuse3." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[RBD]}" = "y" ]]; then +- if ! echo -e '#include \n#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lrados -lrbd - 2> /dev/null; then +- echo "--with-rbd requires librados and librbd." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[UBLK]}" = "y" ]]; then +- if ! echo -e '#include \n#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -luring - 2> /dev/null; then +- echo "--with-ublk requires liburing and ublk_drv." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[ISCSI_INITIATOR]}" = "y" ]]; then +- # Fedora installs libiscsi to /usr/lib64/iscsi for some reason. +- if ! echo -e '#include \n#include \n' \ +- '#if LIBISCSI_API_VERSION < 20150621\n' \ +- '#error\n' \ +- '#endif\n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -L/usr/lib64/iscsi -liscsi - 2> /dev/null; then +- echo "--with-iscsi-initiator requires libiscsi with" +- echo "LIBISCSI_API_VERSION >= 20150621." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[DAOS]}" = "y" ]]; then +- daos_build_cmd=("${BUILD_CMD[@]}") +- if [[ -n "${CONFIG[DAOS_DIR]}" ]]; then +- daos_build_cmd+=(-I"${CONFIG[DAOS_DIR]}"/include -L"${CONFIG[DAOS_DIR]}"/lib64) +- fi +- if ! echo -e '#include \n#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${daos_build_cmd[@]}" -lgurt -ldaos -ldaos_common -ldfs - 2> /dev/null; then +- echo "--with-daos requires libdaos, libdaos_common, libdfs and libgurt" +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[ASAN]}" = "y" ]]; then +- if ! echo -e 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -fsanitize=address - 2> /dev/null; then +- echo "--enable-asan requires libasan." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[UBSAN]}" = "y" ]]; then +- if ! echo -e 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -fsanitize=undefined - 2> /dev/null; then +- echo "--enable-ubsan requires libubsan." +- echo "Please install then re-run this script." +- echo "If installed, please check that the GCC version is at least 6.4" +- echo "and synchronize CC accordingly." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[TSAN]}" = "y" ]]; then +- if ! echo -e 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -fsanitize=thread - 2> /dev/null; then +- echo "--enable-tsan requires libtsan." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if echo -e '#include \nint main(void) { arc4random(); return 0; }\n' \ +- | "${BUILD_CMD[@]}" - 2> /dev/null; then +- CONFIG[HAVE_ARC4RANDOM]="y" +-fi +- +-if [[ "${CONFIG[OCF]}" = "y" ]]; then +- echo "WARNING: OCF - bdev_ocf is deprecated." 
+- # If OCF_PATH is a file, assume it is a library and use it to compile with +- if [ -f ${CONFIG[OCF_PATH]} ]; then +- CONFIG[CUSTOMOCF]=y +- else +- CONFIG[CUSTOMOCF]=n +- fi +-fi +- +-if [[ "${CONFIG[PGO_CAPTURE]}" = "y" && "${CONFIG[PGO_USE]}" = "y" ]]; then +- echo "ERROR: --enable-pgo-capture and --enable-pgo-use are mutually exclusive." +- exit 1 +-elif [[ "${CONFIG[PGO_USE]}" = "y" ]]; then +- if [[ "$CC_TYPE" = "clang" ]]; then +- # For clang we need to run an extra step on gathered profiling data. +- echo "Generating suitable profile data" +- llvm-profdata merge -output=build/pgo/default.profdata build/pgo +- fi +-fi +- +-if [[ "${CONFIG[URING]}" = "y" || "${CONFIG[XNVME]}" = "y" ]]; then +- if [[ -n "${CONFIG[URING_PATH]}" ]]; then +- check_dir "${CONFIG[URING_PATH]}" +- elif ! echo -e '#include \nint main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -luring - 2> /dev/null; then +- echo "--with-uring requires liburing." +- echo "Please build and install then re-run this script." +- exit 1 +- fi +- # Support for Zoned devices is enabled by default for Uring bdev. Check appropriate support in kernel. +- if [[ "${CONFIG[URING_ZNS]}" = "y" ]]; then +- if ! echo -e '#include\nint main(void) { return BLK_ZONE_REP_CAPACITY; }\n' \ +- | "${BUILD_CMD[@]}" -c - 2> /dev/null; then +- echo "Disabling Zoned NS support in Uring! Requires blkzoned.h from kernel >= linux-5.9." +- CONFIG[URING_ZNS]=n +- fi +- fi +-fi +- +-if [[ "${CONFIG[FUSE]}" = "y" ]]; then +- if [[ ! -d /usr/include/fuse3 ]] && [[ ! -d /usr/local/include/fuse3 ]]; then +- echo "--with-fuse requires libfuse3." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [ "${CONFIG[CET]}" = "y" ]; then +- if ! echo -e 'int main(void) { return 0; }\n' | "${BUILD_CMD[@]}" -fcf-protection - 2> /dev/null; then +- echo "--enable-cet requires compiler/linker that supports CET." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[FUZZER]}" = "y" && "$CC_TYPE" != "clang" ]]; then +- echo "--with-fuzzer requires setting CC and CXX to clang." +- exit 1 +-fi +- +-if [[ $arch == x86_64* ]] || [[ $arch == aarch64* ]]; then +- CONFIG[ISAL]=y +- # make sure the submodule is initialized +- if [ ! -f "$rootdir"/isa-l/autogen.sh ]; then +- echo "ISA-L is required but was not found, please init the submodule with:" +- echo " git submodule update --init" +- echo "and then re-run this script." +- exit 1 +- fi +- # for x86 only, check the nasm version for ISA-L and IPSEC +- if [[ $arch == x86_64* ]]; then +- ver=$(nasm -v 2> /dev/null | awk '{print $3}' | awk -Fr '{print $1}') +- if lt "$ver" 2.14; then +- CONFIG[ISAL]=n +- # IPSEC has nasm requirement and DPDK crypto relies on IPSEC +- CONFIG[IPSEC_MB]=n +- echo "WARNING: ISA-L & DPDK crypto cannot be used as nasm ver must be 2.14 or newer." +- fi +- fi +- # check gas version on aarch64 +- if [[ $arch == aarch64* ]]; then +- ver=$(as --version 2> /dev/null | awk 'NR==1{print $7}') +- if lt "$ver" 2.24; then +- # ISA-L, compression & crypto require gas version 2.24 or newer. +- CONFIG[ISAL]=n +- echo "Notice: ISA-L, compression & crypto require GAS version 2.24 or newer. Turning off default ISA-L and crypto features." +- elif lt "$ver" 2.34; then +- #For gas v2.24~v2.34, sve2 instructions are not supported. To workaround it, sve2 optimization should be disabled +- ISAL_CRYPTO_OPTS+=("--disable-sve2") +- fi +- fi +-else +- # for PPC +- CONFIG[ISAL]=n +- echo "WARNING: ISA-L cannot be used due to architecture incompatibility." 
+-fi +- +-# now either configure ISA-L or disable unavailable features +-if [[ "${CONFIG[ISAL]}" = "y" ]]; then +- cd $rootdir/isa-l +- ISAL_LOG=$rootdir/isa-l/spdk-isal.log +- if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then +- ISAL_OPTS=("--host=${CONFIG[CROSS_PREFIX]}") +- else +- ISAL_OPTS=() +- fi +- echo -n "Configuring ISA-L (logfile: $ISAL_LOG)..." +- ./autogen.sh &> $ISAL_LOG +- ./configure CFLAGS="-fPIC -g -O2" "${ISAL_OPTS[@]}" --enable-shared=no >> $ISAL_LOG 2>&1 +- echo "done." +- cd $rootdir +-else +- echo "Without ISA-L, there is no software support for crypto or compression," +- echo "so these features will be disabled." +- CONFIG[CRYPTO]=n +- CONFIG[VBDEV_COMPRESS]=n +- CONFIG[DPDK_COMPRESSDEV]=n +-fi +- +-# ISA-L-crypto complements ISA-L functionality, it is only enabled together with ISA-L +-if [[ "${CONFIG[ISAL]}" = "y" ]]; then +- if [ ! -f "$rootdir"/isa-l-crypto/autogen.sh ]; then +- echo "ISA-L-crypto is required but was not found, please init the submodule with:" +- echo " git submodule update --init" +- echo "and then re-run this script." +- exit 1 +- fi +- +- cd $rootdir/isa-l-crypto +- ISAL_CRYPTO_LOG=$rootdir/isa-l-crypto/spdk-isal-crypto.log +- if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then +- ISAL_CRYPTO_OPTS+=("--host=${CONFIG[CROSS_PREFIX]}") +- fi +- ISAL_CRYPTO_OPTS+=("--enable-shared=no") +- echo -n "Configuring ISA-L-crypto (logfile: $ISAL_CRYPTO_LOG)..." +- ./autogen.sh &> $ISAL_CRYPTO_LOG +- ./configure CFLAGS="-fPIC -g -O2" "${ISAL_CRYPTO_OPTS[@]}" >> $ISAL_CRYPTO_LOG 2>&1 +- echo "done." +- cd $rootdir +- CONFIG[ISAL_CRYPTO]=y +-else +- CONFIG[ISAL_CRYPTO]=n +-fi +- +-if [[ "${CONFIG[SMA]}" = "y" ]]; then +- if ! python3 -c 'import grpc; import grpc_tools' 2> /dev/null; then +- echo "--with-sma requires grpcio and grpcio-tools python packages." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-if [[ "${CONFIG[AVAHI]}" = "y" ]]; then +- if ! echo -e '#include \n#include \n' \ +- 'int main(void) { return 0; }\n' \ +- | "${BUILD_CMD[@]}" -lavahi-client -lavahi-common - 2> /dev/null; then +- echo "--with-avahi requires libavahi-client and libavahi-common." +- echo "Please install then re-run this script." +- exit 1 +- fi +-fi +- +-# For ARM Neoverse-N1 platform, debug build needs gcc version newer than 8.4 +-if [[ "${CONFIG[DEBUG]}" = "y" && $arch = aarch64* && "$CC_TYPE" = "gcc" ]]; then +- GCC_VERSION=$($CC -dumpfullversion) +- PART_NUM=$(grep -i -m 1 "CPU part" /proc/cpuinfo | awk '{print $4}') +- +- if [[ "$(printf '%s\n' "8.4.0" "$GCC_VERSION" | sort -V | head -n1)" != "8.4.0" ]]; then +- if [[ $PART_NUM = 0xd0c ]]; then +- echo "WARNING: For ARM Neoverse-N1 platform, debug build needs GCC version newer than 8.4." +- echo " Will work around this by using armv8.2-a+crypto as target architecture for now." +- CONFIG[ARCH]=armv8.2-a+crypto +- elif [[ $PART_NUM = 0x0b2 ]]; then +- echo "WARNING: For ARM octeontx2 platform, debug build needs GCC version newer than 8.4." +- echo " Will work around this by using armv8.2-a+crypto as target architecture for now." +- CONFIG[ARCH]=armv8.2-a+crypto +- fi +- fi +-fi +- +-# We are now ready to generate final configuration. But first do sanity +-# check to see if all keys in CONFIG array have its reflection in CONFIG file. +-if (($(grep -cE "^\s*CONFIG_[[:alnum:]_]+=" "$rootdir/CONFIG") != ${#CONFIG[@]})); then +- echo "" +- echo "BUG: Some configuration options are not present in CONFIG file. Please update this file." 
+- echo "Missing options in CONFIG (+) file and in current config (-): " +- diff -u --label "CONFIG file" --label "CONFIG[@]" \ +- <(sed -r -e '/^[[:space:]]*$/d; /^[[:space:]]*#.*/d; s/(CONFIG_[[:alnum:]_]+)=.*/\1/g' CONFIG | sort) \ +- <(printf "CONFIG_%s\n" "${!CONFIG[@]}" | sort) +- exit 1 +-fi +- +-echo -n "Creating mk/config.mk..." +-cp -f $rootdir/CONFIG $rootdir/mk/config.mk +-ARGS=$(echo "$@" | sed 's/ /\\ /g') +-sed -i.bak -r "s#__CONFIGURE_OPTIONS__#${ARGS}#g" $rootdir/mk/config.mk +-for key in "${!CONFIG[@]}"; do +- sed -i.bak -r "s#[[:space:]]*CONFIG_${key}=.*#CONFIG_${key}\?=${CONFIG[$key]}#g" $rootdir/mk/config.mk +-done +-# On FreeBSD sed -i 'SUFFIX' - SUFFIX is mandatory. So no way but to delete the backed file. +-rm -f $rootdir/mk/config.mk.bak +-echo "done." +- +-# Environment variables +-echo -n "Creating mk/cc.flags.mk..." +-rm -f $rootdir/mk/cc.flags.mk +-[ -n "$CFLAGS" ] && echo "CFLAGS?=$CFLAGS" > $rootdir/mk/cc.flags.mk +-[ -n "$CXXFLAGS" ] && echo "CXXFLAGS?=$CXXFLAGS" >> $rootdir/mk/cc.flags.mk +-[ -n "$LDFLAGS" ] && echo "LDFLAGS?=$LDFLAGS" >> $rootdir/mk/cc.flags.mk +-[ -n "$DESTDIR" ] && echo "DESTDIR?=$DESTDIR" >> $rootdir/mk/cc.flags.mk +-echo "done." +- +-# Create .sh with build config for easy sourcing|lookup during the tests. +-for conf in "${!CONFIG[@]}"; do +- echo "CONFIG_$conf=${CONFIG[$conf]}" +-done > "$rootdir/test/common/build_config.sh" +- +-if [[ $sys_name == "FreeBSD" ]]; then +- echo "Type 'gmake' to build." +-else +- echo "Type 'make' to build." +-fi +- +-exit 0 ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation ++# All rights reserved. ++# Copyright (c) 2022 Dell Inc, or its subsidiaries. ++# ++ ++set -e ++ ++trap 'echo -e "\n\nConfiguration failed\n\n" >&2' ERR ++ ++rootdir=$(readlink -f $(dirname $0)) ++source "$rootdir/scripts/common.sh" ++ ++function usage() { ++ echo "'configure' configures SPDK to compile on supported platforms." ++ echo "" ++ echo "Usage: ./configure [OPTION]..." ++ echo "" ++ echo "Defaults for the options are specified in brackets." ++ echo "" ++ echo "General:" ++ echo " -h, --help Display this help and exit" ++ echo "" ++ echo " --prefix=path Configure installation prefix (default: /usr/local)" ++ echo " --target-arch=arch Target build architecture. Must be a valid GNU arch. Default: native" ++ echo "" ++ echo " --cross-prefix=prefix Prefix for cross compilation (default: none)" ++ echo " example: aarch64-linux-gnu" ++ echo " --libdir=path Configure installation path for the libraries (default: \$prefix/lib)" ++ echo "" ++ echo " --enable-debug Configure for debug builds" ++ echo " --enable-werror Treat compiler warnings as errors" ++ echo " --enable-asan Enable address sanitizer" ++ echo " --enable-ubsan Enable undefined behavior sanitizer" ++ echo " --enable-coverage Enable code coverage tracking" ++ echo " --enable-lto Enable link-time optimization" ++ echo " --enable-pgo-capture Enable generation of profile guided optimization data" ++ echo " --enable-pgo-use Use previously captured profile guided optimization data" ++ echo " --enable-cet Enable Intel Control-flow Enforcement Technology (CET)" ++ echo " --disable-tests Disable building of functional tests" ++ echo " --disable-unit-tests Disable building of unit tests" ++ echo " --disable-examples Disable building of examples" ++ echo " --disable-apps Disable building of apps" ++ echo "" ++ echo "Specifying Dependencies:" ++ echo "--with-DEPENDENCY[=path] Use the given dependency. 
Optionally, provide the" ++ echo " path." ++ echo "--without-DEPENDENCY Do not link to the given dependency. This may" ++ echo " disable features and components." ++ echo "" ++ echo "Valid dependencies are listed below." ++ echo " --with-dpdk[=DIR] Build against a custom dpdk version. By default, the dpdk" ++ echo " --without-dpdk submodule in spdk tree will be used." ++ echo " example: /usr/share/dpdk/x86_64-default-linuxapp-gcc" ++ echo " --with-env=DIR Use an alternate environment implementation instead of DPDK." ++ echo " Implies --without-dpdk." ++ echo " --with-idxd Build the IDXD library and accel framework plug-in module." ++ echo " --without-idxd Disabled while experimental. Only built for x86 when enabled." ++ echo " --with-crypto Build isa-l-crypto and vbdev crypto module. No path required." ++ echo " --without-crypto Disable isa-l-crypto and vbdev crypto module." ++ echo " --with-fio[=DIR] Build fio_plugin." ++ echo " --without-fio default: /usr/src/fio" ++ echo " --with-xnvme Build xNVMe bdev module." ++ echo " --without-xnvme No path required." ++ echo " --with-vhost Build vhost target. Enabled by default." ++ echo " --without-vhost No path required." ++ echo " --with-virtio Build vhost initiator and virtio-pci bdev modules." ++ echo " --without-virtio No path required." ++ echo " --with-vfio-user[=DIR] Build custom vfio-user transport for NVMf target and vfio-user target." ++ echo " vfio-user initiator is always built-in in Linux." ++ echo " example: /usr/src/libvfio-user" ++ echo " --without-vfio-user No path required." ++ echo " --with-pmdk[=DIR] Build persistent memory bdev. (Deprecated) ++ example: /usr/share/pmdk" ++ echo " --without-pmdk No path required." ++ echo " --with-vbdev-compress Build vbdev compression module and dependencies." ++ echo " --without-vbdev-compress No path required." ++ echo " --with-dpdk-compressdev Build accel DPDK compression module and dependencies." ++ echo " --without-dpdk-compressdev No path required." ++ echo " --with-rbd Build Ceph RBD bdev module." ++ echo " --without-rbd No path required." ++ echo " --with-ublk Build ublk library." ++ echo " --without-ublk No path required." ++ echo " --with-rdma[=DIR] Build RDMA transport for NVMf target and initiator." ++ echo " --without-rdma Accepts optional RDMA provider name. Can be \"verbs\" or \"mlx5_dv\"." ++ echo " If no provider specified, \"verbs\" provider is used by default." ++ echo " --with-fc[=DIR] Build FC transport for NVMf target." ++ echo " --without-fc If an argument is provided, it is considered a directory containing" ++ echo " libufc.a and fc_lld.h. Otherwise the regular system paths will" ++ echo " be searched." ++ echo " --with-daos[=DIR] Build DAOS bdev module." ++ echo " --without-daos No path required." ++ echo " --with-shared Build spdk shared libraries." ++ echo " --without-shared No path required." ++ echo " --with-iscsi-initiator Build with iscsi bdev module." ++ echo " --without-iscsi-initiator No path required." ++ echo " --with-vtune=DIR Required to profile I/O under Intel VTune Amplifier XE. (Deprecated)" ++ echo " --without-vtune example: /opt/intel/vtune_amplifier_xe_version" ++ echo " --with-ocf[=DIR] Build OCF library and bdev module. 
(Deprecated)" ++ echo " --without-ocf If argument is directory, interpret it as root of OCF repo" ++ echo " If argument is file, interpret it as compiled OCF lib" ++ echo " If no argument is specified, OCF git submodule is used by default" ++ echo " example: /usr/src/ocf/" ++ echo " --with-uring[=DIR] Build I/O uring bdev or socket module." ++ echo " --without-uring If an argument is provided, it is considered a directory containing" ++ echo " liburing.a and io_uring.h. Otherwise the regular system paths will" ++ echo " be searched." ++ echo " --without-uring-zns Build I/O uring module without ZNS (zoned namespaces) support." ++ echo " --with-openssl[=DIR] Build OPENSSL with custom path. Otherwise the regular system paths will" ++ echo " be searched." ++ echo " --with-fuse Build FUSE components for mounting a blobfs filesystem." ++ echo " --without-fuse No path required." ++ echo " --with-nvme-cuse Build NVMe driver with support for CUSE-based character devices." ++ echo " --without-nvme-cuse No path required." ++ echo " --with-raid5f Build with bdev_raid module RAID5f support." ++ echo " --without-raid5f No path required." ++ echo " --with-wpdk=DIR Build using WPDK to provide support for Windows (experimental)." ++ echo " --without-wpdk The argument must be a directory containing lib and include." ++ echo " --with-usdt Build with userspace DTrace probes enabled." ++ echo " --without-usdt No path required." ++ echo " --with-fuzzer Build with LLVM fuzzing enabled." ++ echo " Path to clang_rt.fuzzer_no_main library required." ++ echo " Requires setting CC and CXX to clang." ++ echo " (Typically /usr/lib/llvm-VER/lib/clang/VER/lib/linux/libclang_rt.fuzzer_no_main-ARCH.a)" ++ echo " --with-sma Generate Storage Management Agent's protobuf interface" ++ echo " --without-sma No path required." ++ echo " --with-avahi Build with Avahi mDNS discovery client service enabled in bdev-nvme module." ++ echo " --without-avahi No path required." ++ echo " --with-ssam Support to build ssam for DPU storage accel." ++ echo " --without-ssam No path required." ++ echo "" ++ echo "Environment variables:" ++ echo "" ++ echo "CC C compiler" ++ echo "CFLAGS C compiler flags" ++ echo "CXX C++ compiler" ++ echo "CXXFLAGS C++ compiler flags" ++ echo "LD Linker" ++ echo "LDFLAGS Linker flags" ++ echo "DESTDIR Destination for 'make install'" ++ echo "" ++} ++ ++# Load default values ++# Convert config to sourceable configuration file ++sed -r 's/CONFIG_([[:alnum:]_]+)=(.*)/CONFIG[\1]=\2/g' $rootdir/CONFIG > $rootdir/CONFIG.sh ++declare -A CONFIG ++source $rootdir/CONFIG.sh ++rm $rootdir/CONFIG.sh ++ ++# Try to expand literal ~ that might have been passed as an option via --long-opt=~/dir. ++set -- "${@//\~/~}" ++ ++for i in "$@"; do ++ case "$i" in ++ --cross-prefix=*) ++ CONFIG[CROSS_PREFIX]="${i#*=}" ++ ;; ++ --enable-lto) ++ CONFIG[LTO]=y ++ ;; ++ --disable-lto) ++ CONFIG[LTO]=n ++ ;; ++ esac ++done ++ ++# Detect the compiler toolchain ++$rootdir/scripts/detect_cc.sh --cc="$CC" --cxx="$CXX" --lto="${CONFIG[LTO]}" --ld="$LD" --cross-prefix="${CONFIG[CROSS_PREFIX]}" > $rootdir/mk/cc.mk ++ ++CC=$(grep "DEFAULT_CC=" "$rootdir/mk/cc.mk" | sed s/DEFAULT_CC=//) ++CC_TYPE=$(grep "CC_TYPE=" "$rootdir/mk/cc.mk" | cut -d "=" -f 2) ++ ++arch=$($CC -dumpmachine) ++sys_name=$(uname -s) ++ ++if [[ $arch == *mingw* ]] || [[ $arch == *windows* ]]; then ++ sys_name=Windows ++fi ++ ++if [[ $sys_name != "Linux" ]]; then ++ # Vhost, rte_vhost library and virtio are only supported on Linux. 
++ CONFIG[VHOST]="n" ++ CONFIG[VIRTIO]="n" ++ echo "Notice: Vhost, rte_vhost library and virtio are only supported on Linux. Turning off default feature." ++fi ++ ++function check_dir() { ++ arg="$1" ++ dir="${arg#*=}" ++ if [ ! -d "$dir" ]; then ++ echo "$arg: directory not found" ++ exit 1 ++ fi ++} ++ ++# On x86_64 'clang -dumpmachine' produces x86_64-pc-linux-gnu ++# whereas the dpdk might be built with gcc and its libs lie in ++# x86_64-linux-gnu. Let's find the right libdir for dpdk libs. ++function find_dpdk_arch_libdir() { ++ local dpdk_libdir="$1/lib" ++ ++ # Use libdir with 'lib' or 'lib64' ++ if [[ ! -d "$dpdk_libdir" ]]; then ++ dpdk_libdir+="64" ++ fi ++ ++ # Checking first what we have with $arch, then clang ++ # variant of arch. ++ arches=("$arch" "$(echo $arch | sed 's/-pc//g')") ++ for a in "${arches[@]}"; do ++ local libdir_arch="$dpdk_libdir/$a" ++ if [[ -d "$libdir_arch" ]]; then ++ echo "$libdir_arch" ++ return ++ fi ++ done ++ ++ # Fallback to the libdir without arch component ++ echo "$dpdk_libdir" ++} ++ ++function check_IPSec_mb() { ++ local mode=$1 ++ local dpdk_libdir=$2 ++ local dpdk_incdir=$3 ++ local have_ipsec_mb=n ++ ++ if [[ $mode = "pkg-config" ]]; then ++ local dpdk_libs ++ ++ # Request libdpdk pkg-config settings to figure out if the IPSec_MB is used ++ # as a dependency. ++ # Due to some reason pkg-config shows -lIPSec_MB only with --static option ++ dpdk_libs=$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --libs --static libdpdk) ++ if echo "$dpdk_libs" | grep "\-lIPSec_MB" > /dev/null 2>&1; then ++ have_ipsec_mb=y ++ fi ++ elif [[ $mode = "build-config" ]]; then ++ # Use dpdk build config header to check if the IPSec_MB was used. ++ if grep -F "define RTE_CRYPTO_IPSEC_MB 1" "$dpdk_incdir/rte_build_config.h" > /dev/null 2>&1; then ++ have_ipsec_mb=y ++ fi ++ else ++ echo "ERROR: Invalid IPSec_MB checking mode $mode." ++ echo "ERROR: Only \"pkg-config\" and \"build-config\" available." ++ exit 1 ++ fi ++ if [[ $have_ipsec_mb = "n" ]]; then ++ CONFIG[IPSEC_MB]=n ++ return ++ fi ++ ++ # Since we don't know the library path where the IPSec_MB is located ++ # let's find it out with the ldd utility. This can be a standard location ++ # or a custom build. ++ local librte_crypto_ipsec_mb="$dpdk_libdir/librte_crypto_ipsec_mb.so" ++ if [[ -f "$librte_crypto_ipsec_mb" ]]; then ++ local ipsec_mb_libdir ++ ++ ipsec_mb_libdir=$(ldd "$librte_crypto_ipsec_mb" | grep "libIPSec_MB.so" \ ++ | sed -e 's/\s*libIPSec_MB.so.*=>\s//' -e 's/\/libIPSec_MB.so.*$//') ++ if [[ -d $ipsec_mb_libdir ]]; then ++ CONFIG[IPSEC_MB]=y ++ CONFIG[IPSEC_MB_DIR]="$ipsec_mb_libdir" ++ elif [[ $ipsec_mb_libdir = "not found" ]]; then ++ # ldconfig cache is broken, old build with refs to non-existing libs, etc. ++ echo "ERROR: Invalid IPSec_MB installation. Library is not found and/or ldconfig cache is broken!" ++ exit 1 ++ else ++ # Failed to check for IPSec_MB lib path. Let's just assume it is lives ++ # in one of the standard locations (/usr/lib, etc.). ++ CONFIG[IPSEC_MB]=y ++ fi ++ else ++ # pkg-config says there is IPSec_mb and dpdk lib does not have it. Let's just ++ # assume it is installed in the system in one of the standard locations. ++ CONFIG[IPSEC_MB]=y ++ fi ++} ++ ++for i in "$@"; do ++ case "$i" in ++ -h | --help) ++ usage ++ exit 0 ++ ;; ++ --cross-prefix=*) ;& ++ --enable-lto) ;& ++ --disable-lto) ++ # Options handled before detecting CC. 
++ ;; ++ --prefix=*) ++ CONFIG[PREFIX]="${i#*=}" ++ ;; ++ --target-arch=*) ++ CONFIG[ARCH]="${i#*=}" ++ ;; ++ --libdir=*) ++ CONFIG[LIBDIR]="${i#*=}" ++ ;; ++ --enable-debug) ++ CONFIG[DEBUG]=y ++ ;; ++ --disable-debug) ++ CONFIG[DEBUG]=n ++ ;; ++ --enable-asan) ++ CONFIG[ASAN]=y ++ ;; ++ --disable-asan) ++ CONFIG[ASAN]=n ++ ;; ++ --enable-ubsan) ++ CONFIG[UBSAN]=y ++ ;; ++ --disable-ubsan) ++ CONFIG[UBSAN]=n ++ ;; ++ --enable-tsan) ++ CONFIG[TSAN]=y ++ ;; ++ --disable-tsan) ++ CONFIG[TSAN]=n ++ ;; ++ --enable-coverage) ++ CONFIG[COVERAGE]=y ++ ;; ++ --disable-coverage) ++ CONFIG[COVERAGE]=n ++ ;; ++ --enable-pgo-capture) ++ CONFIG[PGO_CAPTURE]=y ++ ;; ++ --disable-pgo-capture) ++ CONFIG[PGO_CAPTURE]=n ++ ;; ++ --enable-pgo-use) ++ CONFIG[PGO_USE]=y ++ ;; ++ --disable-pgo-use) ++ CONFIG[PGO_USE]=n ++ ;; ++ --enable-tests) ++ CONFIG[TESTS]=y ++ ;; ++ --disable-tests) ++ CONFIG[TESTS]=n ++ ;; ++ --enable-unit-tests) ++ CONFIG[UNIT_TESTS]=y ++ ;; ++ --disable-unit-tests) ++ CONFIG[UNIT_TESTS]=n ++ ;; ++ --enable-examples) ++ CONFIG[EXAMPLES]=y ++ ;; ++ --disable-examples) ++ CONFIG[EXAMPLES]=n ++ ;; ++ --enable-apps) ++ CONFIG[APPS]=y ++ ;; ++ --disable-apps) ++ CONFIG[APPS]=N ++ ;; ++ --enable-werror) ++ CONFIG[WERROR]=y ++ ;; ++ --disable-werror) ++ CONFIG[WERROR]=n ++ ;; ++ --enable-cet) ++ CONFIG[CET]=y ++ ;; ++ --disable-cet) ++ CONFIG[CET]=n ++ ;; ++ --with-dpdk) ++ # Can we use pkg-config? ++ if command -v "pkg-config" > /dev/null 2>&1 && pkg-config --exists libdpdk; then ++ dpdk_libdir=$(pkg-config --variable=libdir libdpdk) ++ dpdk_libdir=$(readlink -f $dpdk_libdir) ++ dpdk_incdir=$(pkg-config --variable=includedir libdpdk) ++ echo "Using DPDK lib dir $dpdk_libdir" ++ CONFIG[DPDK_LIB_DIR]=$dpdk_libdir ++ CONFIG[DPDK_INC_DIR]=$dpdk_incdir ++ CONFIG[DPDK_PKG_CONFIG]=y ++ CFLAGS="${CFLAGS:+$CFLAGS }$(pkg-config --cflags libdpdk)" ++ check_IPSec_mb "pkg-config" "$dpdk_libdir" "$dpdk_incdir" ++ else ++ echo "libdpdk.pc not found, aborting" ++ exit 1 ++ fi ++ ;; ++ --with-dpdk=*) ++ check_dir "$i" ++ dpdk_dir=$(readlink -f ${i#*=}) ++ dpdk_libdir=$(find_dpdk_arch_libdir $dpdk_dir) ++ dpdk_incdir="$dpdk_dir/include" ++ ++ # Can we use pkg-config? ++ if command -v "pkg-config" > /dev/null 2>&1 && PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --exists libdpdk; then ++ echo "Using $dpdk_libdir/pkgconfig for additional libs..." ++ sysroot_dir=$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=pc_sysrootdir libdpdk) ++ dpdk_libdir=$(PKG_CONFIG_SYSROOT_DIR='' PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=libdir libdpdk) ++ dpdk_libdir=$(readlink -f "${sysroot_dir}$dpdk_libdir") ++ if ! echo $dpdk_libdir | grep $dpdk_dir > /dev/null 2>&1; then ++ echo "ERROR: pkg-config reported DPDK libdir $dpdk_libdir is out of the directory specified with --with-dpdk=" ++ echo "ERROR: do you have another DPDK installed in the system?" ++ exit 1 ++ fi ++ CFLAGS="${CFLAGS:+$CFLAGS }$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --cflags libdpdk)" ++ dpdk_incdir="${sysroot_dir}$(PKG_CONFIG_PATH="$PKG_CONFIG_PATH:$dpdk_libdir/pkgconfig" pkg-config --variable=includedir libdpdk)" ++ check_IPSec_mb "pkg-config" "$dpdk_libdir" "$dpdk_incdir" ++ else ++ echo "Using $dpdk_incdir/rte_build_config.h for additional libs..." 
++ ++ check_IPSec_mb "build-config" "$dpdk_libdir" "$dpdk_incdir" ++ fi ++ echo "DPDK libraries: $dpdk_libdir" ++ echo "DPDK includes: $dpdk_incdir" ++ CONFIG[DPDK_DIR]=$dpdk_dir ++ CONFIG[DPDK_LIB_DIR]="$dpdk_libdir" ++ CONFIG[DPDK_INC_DIR]="$dpdk_incdir" ++ CONFIG[DPDK_PKG_CONFIG]=n ++ ;; ++ --without-dpdk) ++ CONFIG[DPDK_DIR]= ++ ;; ++ --with-wpdk=*) ++ check_dir "$i" ++ CONFIG[WPDK_DIR]=$(readlink -f ${i#*=}) ++ ;; ++ --without-wpdk) ++ CONFIG[WPDK_DIR]= ++ ;; ++ --with-env=*) ++ CONFIG[ENV]="${i#*=}" ++ ;; ++ --with-ublk) ++ CONFIG[UBLK]=y ++ ;; ++ --without-ublk) ++ CONFIG[UBLK]=n ++ ;; ++ --with-rbd) ++ CONFIG[RBD]=y ++ ;; ++ --without-rbd) ++ CONFIG[RBD]=n ++ ;; ++ --with-rdma=*) ++ CONFIG[RDMA]=y ++ CONFIG[RDMA_PROV]=${i#*=} ++ ;; ++ --with-rdma) ++ CONFIG[RDMA]=y ++ CONFIG[RDMA_PROV]="verbs" ++ ;; ++ --without-rdma) ++ CONFIG[RDMA]=n ++ ;; ++ --with-fc=*) ++ CONFIG[FC]=y ++ CONFIG[FC_PATH]=$(readlink -f ${i#*=}) ++ ;; ++ --with-fc) ++ CONFIG[FC]=y ++ CONFIG[FC_PATH]= ++ ;; ++ --without-fc) ++ CONFIG[FC]=n ++ CONFIG[FC_PATH]= ++ ;; ++ --with-daos) ++ CONFIG[DAOS]=y ++ CONFIG[DAOS_DIR]="" ++ ;; ++ --with-daos=*) ++ CONFIG[DAOS]=y ++ check_dir "$i" ++ CONFIG[DAOS_DIR]=$(readlink -f ${i#*=}) ++ ;; ++ --without-daos) ++ CONFIG[DAOS]=n ++ ;; ++ --with-shared) ++ CONFIG[SHARED]=y ++ ;; ++ --without-shared) ++ CONFIG[SHARED]=n ++ ;; ++ --with-iscsi-initiator) ++ CONFIG[ISCSI_INITIATOR]=y ++ ;; ++ --without-iscsi-initiator) ++ CONFIG[ISCSI_INITIATOR]=n ++ ;; ++ --with-crypto) ++ CONFIG[CRYPTO]=y ++ ;; ++ --without-crypto) ++ CONFIG[CRYPTO]=n ++ ;; ++ --with-vhost) ++ CONFIG[VHOST]=y ++ ;; ++ --without-vhost) ++ CONFIG[VHOST]=n ++ ;; ++ --with-virtio) ++ CONFIG[VIRTIO]=y ++ ;; ++ --without-virtio) ++ CONFIG[VIRTIO]=n ++ ;; ++ --with-vfio-user) ++ CONFIG[VFIO_USER]=y ++ CONFIG[VFIO_USER_DIR]="" ++ ;; ++ --with-vfio-user=*) ++ CONFIG[VFIO_USER]=y ++ check_dir "$i" ++ CONFIG[VFIO_USER_DIR]=$(readlink -f ${i#*=}) ++ ;; ++ --without-vfio-user) ++ CONFIG[VFIO_USER]=n ++ ;; ++ --with-pmdk) ++ CONFIG[PMDK]=y ++ CONFIG[PMDK_DIR]="" ++ ;; ++ --with-pmdk=*) ++ CONFIG[PMDK]=y ++ check_dir "$i" ++ CONFIG[PMDK_DIR]=$(readlink -f ${i#*=}) ++ ;; ++ --without-pmdk) ++ CONFIG[PMDK]=n ++ ;; ++ --with-vbdev-compress) ++ CONFIG[VBDEV_COMPRESS]=y ++ ;; ++ --without-vbdev-compress) ++ CONFIG[VBDEV_COMPRESS]=n ++ ;; ++ --with-dpdk-compressdev) ++ CONFIG[DPDK_COMPRESSDEV]=y ++ ;; ++ --without-dpdk-compressdev) ++ CONFIG[DPDK_COMPRESSDEV]=n ++ ;; ++ --with-xnvme) ++ CONFIG[XNVME]=y ++ ;; ++ --without-xnvme) ++ CONFIG[XNVME]=n ++ ;; ++ --with-fio) ;& ++ --with-fio=*) ++ if [[ ${i#*=} != "$i" ]]; then ++ CONFIG[FIO_SOURCE_DIR]=${i#*=} ++ fi ++ check_dir "--with-fio=${CONFIG[FIO_SOURCE_DIR]}" ++ CONFIG[FIO_SOURCE_DIR]=$(readlink -f "${CONFIG[FIO_SOURCE_DIR]}") ++ CONFIG[FIO_PLUGIN]=y ++ ;; ++ --without-fio) ++ CONFIG[FIO_PLUGIN]=n ++ ;; ++ --with-vtune=*) ++ check_dir "$i" ++ CONFIG[VTUNE_DIR]="${i#*=}" ++ CONFIG[VTUNE]=y ++ ;; ++ --without-vtune) ++ CONFIG[VTUNE_DIR]= ++ CONFIG[VTUNE]=n ++ ;; ++ --with-ocf) ++ CONFIG[OCF]=y ++ CONFIG[OCF_PATH]=$(readlink -f "$rootdir/ocf") ++ ;; ++ --with-ocf=*) ++ CONFIG[OCF]=y ++ CONFIG[OCF_PATH]=$(readlink -f ${i#*=}) ++ ;; ++ --without-ocf) ++ CONFIG[OCF]=n ++ CONFIG[OCF_PATH]= ++ ;; ++ --with-uring=*) ++ CONFIG[URING]=y ++ CONFIG[URING_PATH]=$(readlink -f ${i#*=}) ++ ;; ++ --with-uring) ++ CONFIG[URING]=y ++ CONFIG[URING_ZNS]=y ++ CONFIG[URING_PATH]= ++ ;; ++ --without-uring) ++ CONFIG[URING]=n ++ CONFIG[URING_PATH]= ++ ;; ++ --without-uring-zns) ++ CONFIG[URING_ZNS]=n ++ ;; ++ 
--with-openssl=*) ++ check_dir "$i" ++ CONFIG[OPENSSL_PATH]=$(readlink -f ${i#*=}) ++ ;; ++ --with-fuse) ++ CONFIG[FUSE]=y ++ ;; ++ --without-fuse) ++ CONFIG[FUSE]=n ++ ;; ++ --with-ssam) ++ CONFIG[SSAM]=y ++ ;; ++ --without-ssam) ++ CONFIG[SSAM]=n ++ ;; ++ --with-ssam-only) ++ CONFIG[SSAM_ONLY]=y ++ ;; ++ --with-nvme-cuse) ++ CONFIG[NVME_CUSE]=y ++ ;; ++ --without-nvme-cuse) ++ CONFIG[NVME_CUSE]=n ++ ;; ++ --with-raid5f) ++ CONFIG[RAID5F]=y ++ ;; ++ --without-raid5f) ++ CONFIG[RAID5F]=n ++ ;; ++ --with-idxd) ++ CONFIG[IDXD]=y ++ CONFIG[IDXD_KERNEL]=n ++ ;; ++ --without-idxd) ++ CONFIG[IDXD]=n ++ ;; ++ --with-usdt) ++ CONFIG[USDT]=y ++ ;; ++ --without-usdt) ++ CONFIG[USDT]=n ++ ;; ++ --with-fuzzer) ++ echo "Must specify fuzzer library path with --with-fuzzer" ++ usage ++ exit 1 ++ ;; ++ --with-fuzzer=*) ++ CONFIG[FUZZER]=y ++ CONFIG[FUZZER_LIB]=$(readlink -f ${i#*=}) ++ ;; ++ --without-fuzzer) ++ CONFIG[FUZZER]=n ++ CONFIG[FUZZER_LIB]= ++ ;; ++ --with-sma) ++ CONFIG[SMA]=y ++ ;; ++ --without-sma) ++ CONFIG[SMA]=n ++ ;; ++ --with-avahi) ++ CONFIG[AVAHI]=y ++ ;; ++ --without-avahi) ++ CONFIG[AVAHI]=n ++ ;; ++ --) ++ break ++ ;; ++ *) ++ echo "Unrecognized option $i" ++ usage ++ exit 1 ++ ;; ++ esac ++done ++ ++if [[ $arch == x86_64* ]]; then ++ BUILD_CMD=($CC -o /dev/null -x c $CPPFLAGS $CFLAGS $LDFLAGS "-march=native") ++else ++ BUILD_CMD=($CC -o /dev/null -x c $CPPFLAGS $CFLAGS $LDFLAGS) ++fi ++BUILD_CMD+=(-I/usr/local/include -L/usr/local/lib) ++ ++if [[ "${CONFIG[VFIO_USER]}" = "y" ]]; then ++ if ! echo -e '#include ' \ ++ | "${BUILD_CMD[@]}" -E - 2> /dev/null; then ++ echo "ERROR: --with-vfio-user requires json-c-devel" ++ echo "Please install then re-run this script" ++ exit 1 ++ fi ++ if ! echo -e '#include ' \ ++ | "${BUILD_CMD[@]}" -E - 2> /dev/null; then ++ echo "ERROR: --with-vfio-user requires libcmocka-devel" ++ echo "Please install then re-run this script" ++ exit 1 ++ fi ++fi ++ ++# IDXD uses Intel specific instructions. ++if [[ "${CONFIG[IDXD]}" = "y" ]]; then ++ if [ $(uname -s) == "FreeBSD" ]; then ++ intel="hw.model: Intel" ++ cpu_vendor=$(sysctl -a | grep hw.model | cut -c 1-15) ++ else ++ intel="GenuineIntel" ++ cpu_vendor=$(grep -i 'vendor' /proc/cpuinfo --max-count=1) ++ fi ++ if [[ "$cpu_vendor" != *"$intel"* ]]; then ++ echo "ERROR: IDXD cannot be used due to CPU incompatibility." ++ exit 1 ++ fi ++ if [ -e /usr/include/accel-config/libaccel_config.h ]; then ++ CONFIG[IDXD_KERNEL]=y ++ fi ++ ++fi ++ ++if [ -z "${CONFIG[ENV]}" ]; then ++ CONFIG[ENV]=$rootdir/lib/env_dpdk ++ echo "Using default SPDK env in ${CONFIG[ENV]}" ++ if [[ -z "${CONFIG[DPDK_DIR]}" && "${CONFIG[DPDK_PKG_CONFIG]}" == n ]]; then ++ if [ ! -f "$rootdir"/dpdk/config/meson.build ]; then ++ echo "DPDK not found; please specify --with-dpdk= or run:" ++ echo ++ echo " git submodule update --init" ++ exit 1 ++ else ++ CONFIG[DPDK_DIR]="${rootdir}/dpdk/build" ++ # Default ipsec libs ++ if [[ "${CONFIG[CRYPTO]}" = "y" ]] && [[ $arch = x86_64* ]]; then ++ CONFIG[IPSEC_MB]=y ++ CONFIG[IPSEC_MB_DIR]="${rootdir}/intel-ipsec-mb/lib" ++ fi ++ echo "Using default DPDK in ${CONFIG[DPDK_DIR]}" ++ fi ++ fi ++else ++ if [[ -n "${CONFIG[DPDK_DIR]}" || "${CONFIG[DPDK_PKG_CONFIG]}" == y ]]; then ++ echo "--with-env and --with-dpdk are mutually exclusive." ++ exit 1 ++ fi ++ ++ if [ "${CONFIG[VHOST]}" = "y" ]; then ++ echo "Vhost is only supported when using the default DPDK environment. Disabling it." ++ fi ++ # Always disable vhost, but only print the error message if the user explicitly turned it on. 
++ CONFIG[VHOST]="n" ++ if [ "${CONFIG[VIRTIO]}" = "y" ]; then ++ echo "Virtio is only supported when using the default DPDK environment. Disabling it." ++ fi ++ # Always disable virtio, but only print the error message if the user explicitly turned it on. ++ CONFIG[VIRTIO]="n" ++fi ++ ++if [[ "${CONFIG[DPDK_PKG_CONFIG]}" == y ]]; then ++ if [[ "${CONFIG[SHARED]}" == n ]]; then ++ # dpdk-devel doesn't provide static libs ++ echo "Build against packaged DPDK requested, enabling shared libraries" ++ CONFIG[SHARED]=y ++ fi ++fi ++ ++if [[ $sys_name == "Windows" ]]; then ++ if [ -z "${CONFIG[WPDK_DIR]}" ]; then ++ if [ ! -f "$rootdir"/wpdk/Makefile ]; then ++ echo "WPDK not found; please specify --with-wpdk=. See https://wpdk.github.io." ++ exit 1 ++ else ++ CONFIG[WPDK_DIR]="${rootdir}/wpdk/build" ++ echo "Using default WPDK in ${CONFIG[WPDK_DIR]}" ++ fi ++ fi ++else ++ if [ -n "${CONFIG[WPDK_DIR]}" ]; then ++ echo "ERROR: --with-wpdk is only supported for Windows" ++ exit 1 ++ fi ++fi ++ ++if [ "${CONFIG[VTUNE]}" = "y" ]; then ++ echo "WARNING: VTune support is deprecated." ++ if [ -z "${CONFIG[VTUNE_DIR]}" ]; then ++ echo "When VTune is enabled, you must specify the VTune directory using --with-vtune=path" ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[ASAN]}" = "y" && "${CONFIG[TSAN]}" = "y" ]]; then ++ echo "ERROR: ASAN and TSAN cannot be enabled at the same time." ++ exit 1 ++fi ++ ++if [[ "${CONFIG[FIO_PLUGIN]}" = "y" && "${CONFIG[EXAMPLES]}" = "n" ]]; then ++ echo "ERROR: --with-fio and --disable-examples are mutually exclusive." ++ exit 1 ++fi ++ ++if [[ $sys_name == "FreeBSD" ]]; then ++ # FreeBSD doesn't support all configurations ++ if [[ "${CONFIG[COVERAGE]}" == "y" ]]; then ++ echo "ERROR: CONFIG_COVERAGE not available on FreeBSD" ++ exit 1 ++ fi ++fi ++ ++if [[ $sys_name == "Linux" ]]; then ++ if pkg-config libbsd; then ++ CONFIG[HAVE_LIBBSD]=y ++ fi ++fi ++ ++if pkg-config libarchive; then ++ CONFIG[HAVE_LIBARCHIVE]=y ++fi ++ ++if [[ $sys_name != "Linux" ]]; then ++ if [[ "${CONFIG[VHOST]}" == "y" ]]; then ++ echo "Vhost is only supported on Linux." ++ exit 1 ++ fi ++ if [[ "${CONFIG[VIRTIO]}" == "y" ]]; then ++ echo "Virtio is only supported on Linux." ++ exit 1 ++ fi ++fi ++ ++if [ "${CONFIG[RDMA]}" = "y" ]; then ++ if [[ ! "${CONFIG[RDMA_PROV]}" == "verbs" ]] && [[ ! "${CONFIG[RDMA_PROV]}" == "mlx5_dv" ]]; then ++ echo "Invalid RDMA provider specified, must be \"verbs\" or \"mlx5_dv\"" ++ exit 1 ++ fi ++ ++ if ! echo -e '#include \n#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -libverbs -lrdmacm - 2> /dev/null; then ++ echo "--with-rdma requires libverbs and librdmacm." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++ ++ if echo -e '#include \n' \ ++ 'int main(void) { return !!IBV_WR_SEND_WITH_INV; }\n' \ ++ | "${BUILD_CMD[@]}" -c - 2> /dev/null; then ++ CONFIG[RDMA_SEND_WITH_INVAL]="y" ++ else ++ CONFIG[RDMA_SEND_WITH_INVAL]="n" ++ echo " ++******************************************************************************* ++WARNING: The Infiniband Verbs opcode Send With Invalidate is either not ++supported or is not functional with the current version of libibverbs installed ++on this system. Please upgrade to at least version 1.1. ++ ++Beginning with Linux kernel 4.14, the kernel NVMe-oF initiator leverages Send ++With Invalidate RDMA operations to improve performance. Failing to use the ++Send With Invalidate operation on the NVMe-oF target side results in full ++functionality, but greatly reduced performance. 
The SPDK NVMe-oF target will ++be unable to leverage that operation using the currently installed version ++of libibverbs, so Linux kernel NVMe-oF initiators based on kernels greater ++than or equal to 4.14 will see significantly reduced performance. ++*******************************************************************************" ++ fi ++ ++ if echo -e '#include \n' \ ++ 'int main(void) { return !!RDMA_OPTION_ID_ACK_TIMEOUT; }\n' \ ++ | "${BUILD_CMD[@]}" -c - 2> /dev/null; then ++ CONFIG[RDMA_SET_ACK_TIMEOUT]="y" ++ else ++ CONFIG[RDMA_SET_ACK_TIMEOUT]="n" ++ echo "RDMA_OPTION_ID_ACK_TIMEOUT is not supported" ++ fi ++ ++ if echo -e '#include \n' \ ++ 'int main(void) { return !!RDMA_OPTION_ID_TOS; }\n' \ ++ | "${BUILD_CMD[@]}" -c - 2> /dev/null; then ++ CONFIG[RDMA_SET_TOS]="y" ++ else ++ CONFIG[RDMA_SET_TOS]="n" ++ echo "RDMA_OPTION_ID_TOS is not supported" ++ fi ++ ++ if [ "${CONFIG[RDMA_PROV]}" == "mlx5_dv" ]; then ++ MLX5_DV_BUILD_BUILD_CMD=" ++ #include \n ++ #include \n ++ int main(void) { return rdma_establish(NULL) ||\n ++ !!IBV_QP_INIT_ATTR_SEND_OPS_FLAGS || !!MLX5_OPCODE_RDMA_WRITE" ++ if [ "${CONFIG[CRYPTO]}" = "y" ]; then ++ MLX5_DV_BUILD_BUILD_CMD+="|| !!MLX5DV_CRYPTO_ENGINES_CAP_AES_XTS_SINGLE_BLOCK" ++ fi ++ MLX5_DV_BUILD_BUILD_CMD+=";}" ++ if ! echo -e $MLX5_DV_BUILD_BUILD_CMD | "${BUILD_CMD[@]}" -lmlx5 -I${rootdir}/include -c -; then ++ echo "mlx5_dv provider is not supported" ++ exit 1 ++ fi ++ fi ++ ++ echo "Using '${CONFIG[RDMA_PROV]}' RDMA provider" ++fi ++ ++if [[ "${CONFIG[FC]}" = "y" ]]; then ++ if [[ -n "${CONFIG[FC_PATH]}" ]]; then ++ check_dir "${CONFIG[FC_PATH]}" ++ fi ++fi ++ ++if [[ "${CONFIG[PMDK]}" = "y" ]]; then ++ echo "WARNING: PMDK - bdev_pmem is deprecated." ++ echo "WARNING: PMDK - ACCEL_FLAG_PERSISTENT in accel_sw module is deprecated." ++ if ! echo -e '#include \nint main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lpmemblk - 2> /dev/null; then ++ echo "--with-pmdk requires libpmemblk." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++function dpdk_version() { ++ # Check DPDK version to determine if mlx5_pci driver is supported ++ local dpdk_ver="none" ++ if [[ "${CONFIG[DPDK_DIR]}" == "$rootdir/dpdk/build" ]]; then ++ # DPDK_DIR points at our submodule so ./build may not exist yet. Use ++ # absolute path to lookup the version. ++ dpdk_ver=$(< "$rootdir/dpdk/VERSION") ++ elif [[ -f "${CONFIG[DPDK_DIR]}"/../VERSION ]]; then ++ dpdk_ver=$(< "${CONFIG[DPDK_DIR]}"/../VERSION) ++ fi ++ echo $dpdk_ver ++} ++ ++function mlx5_build() { ++ # Check if libmlx5 exists to enable mlx5_pci compress/crypto PMD ++ if ! echo -e '#include \n' \ ++ '#include \n' \ ++ '#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lmlx5 -libverbs -I${rootdir}/include -c - 2> /dev/null; then ++ return 1 ++ fi ++ return 0 ++} ++ ++if [[ "${CONFIG[VBDEV_COMPRESS]}" = "y" ]]; then ++ echo "WARNING: PMDK - Persistent device support with bdev_compress is deprecated." ++ if ! echo -e '#include \nint main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lpmem - 2> /dev/null; then ++ echo "--with-vbdev-compress requires libpmem." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++ # Try to enable mlx5 compress ++ CONFIG[VBDEV_COMPRESS_MLX5]="y" ++ ++ # Check if libmlx5 exists to enable mlx5_pci compress PMD ++ if ! 
mlx5_build; then ++ echo "libmlx5 is not found, so disabling DPDK mlx5_pci compress PMD" ++ CONFIG[VBDEV_COMPRESS_MLX5]="n" ++ else ++ if [[ "${CONFIG[DPDK_PKG_CONFIG]}" = "y" ]]; then ++ # Check if librte_compress_mlx5 exists in DPDK package ++ if [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_compress_mlx5.so ]; then ++ echo "librte_compress_mlx5 is not found, so disabling DPDK mlx5_pci compress PMD" ++ CONFIG[VBDEV_COMPRESS_MLX5]="n" ++ fi ++ else ++ # Check DPDK version to determine if mlx5_pci driver is supported ++ dpdk_ver=$(dpdk_version) ++ if [[ $dpdk_ver = "none" ]]; then ++ echo "Cannot get DPDK version, so disabling DPDK mlx5_pci compress PMD" ++ CONFIG[VBDEV_COMPRESS_MLX5]="n" ++ elif [[ -n $dpdk_ver ]] && lt "$dpdk_ver" 21.02.0; then ++ # mlx5_pci for compress is supported by DPDK >- 21.02.0 ++ echo "DPDK version ${dpdk_ver} doesn't support mlx5_pci compress PMD" ++ CONFIG[VBDEV_COMPRESS_MLX5]="n" ++ elif [[ -n ${CONFIG[DPDK_LIB_DIR]} ]] && [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_compress_mlx5.so ]; then ++ # This is only checked when --with-dpdk or --with-dpdk=* is used ++ echo "librte_compress_mlx5 is not found, so disabling DPDK mlx5_pci compress PMD" ++ CONFIG[VBDEV_COMPRESS_MLX5]="n" ++ fi ++ fi ++ fi ++fi ++ ++if [[ "${CONFIG[CRYPTO]}" = "y" ]]; then ++ # Try to enable mlx5 crypto ++ CONFIG[CRYPTO_MLX5]="y" ++ ++ # Check if libmlx5 exists to enable mlx5_pci compress PMD ++ if ! mlx5_build; then ++ echo "libmlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" ++ CONFIG[CRYPTO_MLX5]="n" ++ else ++ if [[ "${CONFIG[DPDK_PKG_CONFIG]}" = "y" ]]; then ++ # Check if librte_crypto_mlx5 exists in DPDK package ++ if [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_crypto_mlx5.so ]; then ++ echo "librte_crypto_mlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" ++ CONFIG[CRYPTO_MLX5]="n" ++ fi ++ else ++ # Check DPDK version to determine if mlx5_pci driver is supported ++ dpdk_ver=$(dpdk_version) ++ if [[ $dpdk_ver = "none" ]]; then ++ echo "Cannot get DPDK version, so disabling DPDK mlx5_pci crypto PMD" ++ CONFIG[CRYPTO_MLX5]="n" ++ elif [[ -n $dpdk_ver ]] && lt "$dpdk_ver" 21.11.0; then ++ # mlx5_pci for crypto is supported by DPDK >- 21.11.0 ++ echo "DPDK version ${dpdk_ver} doesn't support mlx5_pci crypto PMD" ++ CONFIG[CRYPTO_MLX5]="n" ++ elif [[ -n ${CONFIG[DPDK_LIB_DIR]} ]] && [ ! -f "${CONFIG[DPDK_LIB_DIR]}"/librte_crypto_mlx5.so ]; then ++ # This is only checked when --with-dpdk or --with-dpdk=* is used ++ echo "librte_crypto_mlx5 is not found, so disabling DPDK mlx5_pci crypto PMD" ++ CONFIG[CRYPTO_MLX5]="n" ++ fi ++ fi ++ fi ++fi ++ ++if [[ "${CONFIG[NVME_CUSE]}" = "y" ]]; then ++ if ! echo -e '#define FUSE_USE_VERSION 31\n#include \n#include \n#include \nint main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lfuse3 -D_FILE_OFFSET_BITS=64 - 2> /dev/null; then ++ echo "--with-cuse requires libfuse3." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[RBD]}" = "y" ]]; then ++ if ! echo -e '#include \n#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lrados -lrbd - 2> /dev/null; then ++ echo "--with-rbd requires librados and librbd." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[UBLK]}" = "y" ]]; then ++ if ! echo -e '#include \n#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -luring - 2> /dev/null; then ++ echo "--with-ublk requires liburing and ublk_drv." ++ echo "Please install then re-run this script." 
++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[ISCSI_INITIATOR]}" = "y" ]]; then ++ # Fedora installs libiscsi to /usr/lib64/iscsi for some reason. ++ if ! echo -e '#include \n#include \n' \ ++ '#if LIBISCSI_API_VERSION < 20150621\n' \ ++ '#error\n' \ ++ '#endif\n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -L/usr/lib64/iscsi -liscsi - 2> /dev/null; then ++ echo "--with-iscsi-initiator requires libiscsi with" ++ echo "LIBISCSI_API_VERSION >= 20150621." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[DAOS]}" = "y" ]]; then ++ daos_build_cmd=("${BUILD_CMD[@]}") ++ if [[ -n "${CONFIG[DAOS_DIR]}" ]]; then ++ daos_build_cmd+=(-I"${CONFIG[DAOS_DIR]}"/include -L"${CONFIG[DAOS_DIR]}"/lib64) ++ fi ++ if ! echo -e '#include \n#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${daos_build_cmd[@]}" -lgurt -ldaos -ldaos_common -ldfs - 2> /dev/null; then ++ echo "--with-daos requires libdaos, libdaos_common, libdfs and libgurt" ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[ASAN]}" = "y" ]]; then ++ if ! echo -e 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -fsanitize=address - 2> /dev/null; then ++ echo "--enable-asan requires libasan." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[UBSAN]}" = "y" ]]; then ++ if ! echo -e 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -fsanitize=undefined - 2> /dev/null; then ++ echo "--enable-ubsan requires libubsan." ++ echo "Please install then re-run this script." ++ echo "If installed, please check that the GCC version is at least 6.4" ++ echo "and synchronize CC accordingly." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[TSAN]}" = "y" ]]; then ++ if ! echo -e 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -fsanitize=thread - 2> /dev/null; then ++ echo "--enable-tsan requires libtsan." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if echo -e '#include \nint main(void) { arc4random(); return 0; }\n' \ ++ | "${BUILD_CMD[@]}" - 2> /dev/null; then ++ CONFIG[HAVE_ARC4RANDOM]="y" ++fi ++ ++if [[ "${CONFIG[OCF]}" = "y" ]]; then ++ echo "WARNING: OCF - bdev_ocf is deprecated." ++ # If OCF_PATH is a file, assume it is a library and use it to compile with ++ if [ -f ${CONFIG[OCF_PATH]} ]; then ++ CONFIG[CUSTOMOCF]=y ++ else ++ CONFIG[CUSTOMOCF]=n ++ fi ++fi ++ ++if [[ "${CONFIG[PGO_CAPTURE]}" = "y" && "${CONFIG[PGO_USE]}" = "y" ]]; then ++ echo "ERROR: --enable-pgo-capture and --enable-pgo-use are mutually exclusive." ++ exit 1 ++elif [[ "${CONFIG[PGO_USE]}" = "y" ]]; then ++ if [[ "$CC_TYPE" = "clang" ]]; then ++ # For clang we need to run an extra step on gathered profiling data. ++ echo "Generating suitable profile data" ++ llvm-profdata merge -output=build/pgo/default.profdata build/pgo ++ fi ++fi ++ ++if [[ "${CONFIG[URING]}" = "y" || "${CONFIG[XNVME]}" = "y" ]]; then ++ if [[ -n "${CONFIG[URING_PATH]}" ]]; then ++ check_dir "${CONFIG[URING_PATH]}" ++ elif ! echo -e '#include \nint main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -luring - 2> /dev/null; then ++ echo "--with-uring requires liburing." ++ echo "Please build and install then re-run this script." ++ exit 1 ++ fi ++ # Support for Zoned devices is enabled by default for Uring bdev. Check appropriate support in kernel. ++ if [[ "${CONFIG[URING_ZNS]}" = "y" ]]; then ++ if ! 
echo -e '#include\nint main(void) { return BLK_ZONE_REP_CAPACITY; }\n' \ ++ | "${BUILD_CMD[@]}" -c - 2> /dev/null; then ++ echo "Disabling Zoned NS support in Uring! Requires blkzoned.h from kernel >= linux-5.9." ++ CONFIG[URING_ZNS]=n ++ fi ++ fi ++fi ++ ++if [[ "${CONFIG[FUSE]}" = "y" ]]; then ++ if [[ ! -d /usr/include/fuse3 ]] && [[ ! -d /usr/local/include/fuse3 ]]; then ++ echo "--with-fuse requires libfuse3." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[SSAM]}" = "y" ]]; then ++ if [[ ! -e /usr/lib64/libdpak_ssam.so ]]; then ++ echo "--with-ssam requires libdpak_ssam." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[SSAM_ONLY]}" = "y" ]]; then ++ if [[ "${CONFIG[SSAM]}" = "n" ]]; then ++ echo "--with-ssam-only requires --with-ssam." ++ exit 1 ++ fi ++fi ++ ++if [ "${CONFIG[CET]}" = "y" ]; then ++ if ! echo -e 'int main(void) { return 0; }\n' | "${BUILD_CMD[@]}" -fcf-protection - 2> /dev/null; then ++ echo "--enable-cet requires compiler/linker that supports CET." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[FUZZER]}" = "y" && "$CC_TYPE" != "clang" ]]; then ++ echo "--with-fuzzer requires setting CC and CXX to clang." ++ exit 1 ++fi ++ ++if [[ $arch == x86_64* ]] || [[ $arch == aarch64* ]]; then ++ CONFIG[ISAL]=y ++ # make sure the submodule is initialized ++ if [ ! -f "$rootdir"/isa-l/autogen.sh ]; then ++ echo "ISA-L is required but was not found, please init the submodule with:" ++ echo " git submodule update --init" ++ echo "and then re-run this script." ++ exit 1 ++ fi ++ # for x86 only, check the nasm version for ISA-L and IPSEC ++ if [[ $arch == x86_64* ]]; then ++ ver=$(nasm -v 2> /dev/null | awk '{print $3}' | awk -Fr '{print $1}') ++ if lt "$ver" 2.14; then ++ CONFIG[ISAL]=n ++ # IPSEC has nasm requirement and DPDK crypto relies on IPSEC ++ CONFIG[IPSEC_MB]=n ++ echo "WARNING: ISA-L & DPDK crypto cannot be used as nasm ver must be 2.14 or newer." ++ fi ++ fi ++ # check gas version on aarch64 ++ if [[ $arch == aarch64* ]]; then ++ ver=$(as --version 2> /dev/null | awk 'NR==1{print $7}') ++ if lt "$ver" 2.24; then ++ # ISA-L, compression & crypto require gas version 2.24 or newer. ++ CONFIG[ISAL]=n ++ echo "Notice: ISA-L, compression & crypto require GAS version 2.24 or newer. Turning off default ISA-L and crypto features." ++ elif lt "$ver" 2.34; then ++ #For gas v2.24~v2.34, sve2 instructions are not supported. To workaround it, sve2 optimization should be disabled ++ ISAL_CRYPTO_OPTS+=("--disable-sve2") ++ fi ++ fi ++else ++ # for PPC ++ CONFIG[ISAL]=n ++ echo "WARNING: ISA-L cannot be used due to architecture incompatibility." ++fi ++ ++# now either configure ISA-L or disable unavailable features ++if [[ "${CONFIG[ISAL]}" = "y" ]]; then ++ cd $rootdir/isa-l ++ ISAL_LOG=$rootdir/isa-l/spdk-isal.log ++ if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then ++ ISAL_OPTS=("--host=${CONFIG[CROSS_PREFIX]}") ++ else ++ ISAL_OPTS=() ++ fi ++ echo -n "Configuring ISA-L (logfile: $ISAL_LOG)..." ++ ./autogen.sh &> $ISAL_LOG ++ ./configure CFLAGS="-fPIC -g -O2" "${ISAL_OPTS[@]}" --enable-shared=no >> $ISAL_LOG 2>&1 ++ echo "done." ++ cd $rootdir ++else ++ echo "Without ISA-L, there is no software support for crypto or compression," ++ echo "so these features will be disabled." 
++ CONFIG[CRYPTO]=n ++ CONFIG[VBDEV_COMPRESS]=n ++ CONFIG[DPDK_COMPRESSDEV]=n ++fi ++ ++# ISA-L-crypto complements ISA-L functionality, it is only enabled together with ISA-L ++if [[ "${CONFIG[ISAL]}" = "y" ]]; then ++ if [ ! -f "$rootdir"/isa-l-crypto/autogen.sh ]; then ++ echo "ISA-L-crypto is required but was not found, please init the submodule with:" ++ echo " git submodule update --init" ++ echo "and then re-run this script." ++ exit 1 ++ fi ++ ++ cd $rootdir/isa-l-crypto ++ ISAL_CRYPTO_LOG=$rootdir/isa-l-crypto/spdk-isal-crypto.log ++ if [[ -n "${CONFIG[CROSS_PREFIX]}" ]]; then ++ ISAL_CRYPTO_OPTS+=("--host=${CONFIG[CROSS_PREFIX]}") ++ fi ++ ISAL_CRYPTO_OPTS+=("--enable-shared=no") ++ echo -n "Configuring ISA-L-crypto (logfile: $ISAL_CRYPTO_LOG)..." ++ ./autogen.sh &> $ISAL_CRYPTO_LOG ++ ./configure CFLAGS="-fPIC -g -O2" "${ISAL_CRYPTO_OPTS[@]}" >> $ISAL_CRYPTO_LOG 2>&1 ++ echo "done." ++ cd $rootdir ++ CONFIG[ISAL_CRYPTO]=y ++else ++ CONFIG[ISAL_CRYPTO]=n ++fi ++ ++if [[ "${CONFIG[SMA]}" = "y" ]]; then ++ if ! python3 -c 'import grpc; import grpc_tools' 2> /dev/null; then ++ echo "--with-sma requires grpcio and grpcio-tools python packages." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++if [[ "${CONFIG[AVAHI]}" = "y" ]]; then ++ if ! echo -e '#include \n#include \n' \ ++ 'int main(void) { return 0; }\n' \ ++ | "${BUILD_CMD[@]}" -lavahi-client -lavahi-common - 2> /dev/null; then ++ echo "--with-avahi requires libavahi-client and libavahi-common." ++ echo "Please install then re-run this script." ++ exit 1 ++ fi ++fi ++ ++# For ARM Neoverse-N1 platform, debug build needs gcc version newer than 8.4 ++if [[ "${CONFIG[DEBUG]}" = "y" && $arch = aarch64* && "$CC_TYPE" = "gcc" ]]; then ++ GCC_VERSION=$($CC -dumpfullversion) ++ PART_NUM=$(grep -i -m 1 "CPU part" /proc/cpuinfo | awk '{print $4}') ++ ++ if [[ "$(printf '%s\n' "8.4.0" "$GCC_VERSION" | sort -V | head -n1)" != "8.4.0" ]]; then ++ if [[ $PART_NUM = 0xd0c ]]; then ++ echo "WARNING: For ARM Neoverse-N1 platform, debug build needs GCC version newer than 8.4." ++ echo " Will work around this by using armv8.2-a+crypto as target architecture for now." ++ CONFIG[ARCH]=armv8.2-a+crypto ++ elif [[ $PART_NUM = 0x0b2 ]]; then ++ echo "WARNING: For ARM octeontx2 platform, debug build needs GCC version newer than 8.4." ++ echo " Will work around this by using armv8.2-a+crypto as target architecture for now." ++ CONFIG[ARCH]=armv8.2-a+crypto ++ fi ++ fi ++fi ++ ++# We are now ready to generate final configuration. But first do sanity ++# check to see if all keys in CONFIG array have its reflection in CONFIG file. ++if (($(grep -cE "^\s*CONFIG_[[:alnum:]_]+=" "$rootdir/CONFIG") != ${#CONFIG[@]})); then ++ echo "" ++ echo "BUG: Some configuration options are not present in CONFIG file. Please update this file." ++ echo "Missing options in CONFIG (+) file and in current config (-): " ++ diff -u --label "CONFIG file" --label "CONFIG[@]" \ ++ <(sed -r -e '/^[[:space:]]*$/d; /^[[:space:]]*#.*/d; s/(CONFIG_[[:alnum:]_]+)=.*/\1/g' CONFIG | sort) \ ++ <(printf "CONFIG_%s\n" "${!CONFIG[@]}" | sort) ++ exit 1 ++fi ++ ++echo -n "Creating mk/config.mk..." ++cp -f $rootdir/CONFIG $rootdir/mk/config.mk ++ARGS=$(echo "$@" | sed 's/ /\\ /g') ++sed -i.bak -r "s#__CONFIGURE_OPTIONS__#${ARGS}#g" $rootdir/mk/config.mk ++for key in "${!CONFIG[@]}"; do ++ sed -i.bak -r "s#[[:space:]]*CONFIG_${key}=.*#CONFIG_${key}\?=${CONFIG[$key]}#g" $rootdir/mk/config.mk ++done ++# On FreeBSD sed -i 'SUFFIX' - SUFFIX is mandatory. 
So no way but to delete the backed file. ++rm -f $rootdir/mk/config.mk.bak ++echo "done." ++ ++# Environment variables ++echo -n "Creating mk/cc.flags.mk..." ++rm -f $rootdir/mk/cc.flags.mk ++[ -n "$CFLAGS" ] && echo "CFLAGS?=$CFLAGS" > $rootdir/mk/cc.flags.mk ++[ -n "$CXXFLAGS" ] && echo "CXXFLAGS?=$CXXFLAGS" >> $rootdir/mk/cc.flags.mk ++[ -n "$LDFLAGS" ] && echo "LDFLAGS?=$LDFLAGS" >> $rootdir/mk/cc.flags.mk ++[ -n "$DESTDIR" ] && echo "DESTDIR?=$DESTDIR" >> $rootdir/mk/cc.flags.mk ++echo "done." ++ ++# Create .sh with build config for easy sourcing|lookup during the tests. ++for conf in "${!CONFIG[@]}"; do ++ echo "CONFIG_$conf=${CONFIG[$conf]}" ++done > "$rootdir/test/common/build_config.sh" ++ ++if [[ $sys_name == "FreeBSD" ]]; then ++ echo "Type 'gmake' to build." ++else ++ echo "Type 'make' to build." ++fi ++ ++exit 0 +diff --git a/examples/bdev/fio_plugin/fio_plugin.c b/examples/bdev/fio_plugin/fio_plugin.c +index 92690be..402206d 100644 +--- a/examples/bdev/fio_plugin/fio_plugin.c ++++ b/examples/bdev/fio_plugin/fio_plugin.c +@@ -1,1496 +1,1496 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/bdev_zone.h" +-#include "spdk/accel.h" +-#include "spdk/env.h" +-#include "spdk/init.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/queue.h" +-#include "spdk/util.h" +-#include "spdk/rpc.h" +- +-#include "spdk_internal/event.h" +- +-#include "config-host.h" +-#include "fio.h" +-#include "optgroup.h" +- +-#ifdef for_each_rw_ddir +-#define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26) +-#else +-#define FIO_HAS_ZBD (0) +-#endif +- +-/* FreeBSD is missing CLOCK_MONOTONIC_RAW, +- * so alternative is provided. */ +-#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +-#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC +-#endif +- +-struct spdk_fio_options { +- void *pad; +- char *conf; +- char *json_conf; +- char *env_context; +- char *log_flags; +- unsigned mem_mb; +- int mem_single_seg; +- int initial_zone_reset; +- int zone_append; +- char *rpc_listen_addr; +-}; +- +-struct spdk_fio_request { +- struct io_u *io; +- struct thread_data *td; +-}; +- +-struct spdk_fio_target { +- struct spdk_bdev *bdev; +- struct spdk_bdev_desc *desc; +- struct spdk_io_channel *ch; +- bool zone_append_enabled; +- +- TAILQ_ENTRY(spdk_fio_target) link; +-}; +- +-struct spdk_fio_thread { +- struct thread_data *td; /* fio thread context */ +- struct spdk_thread *thread; /* spdk thread context */ +- +- TAILQ_HEAD(, spdk_fio_target) targets; +- bool failed; /* true if the thread failed to initialize */ +- +- struct io_u **iocq; /* io completion queue */ +- unsigned int iocq_count; /* number of iocq entries filled by last getevents */ +- unsigned int iocq_size; /* number of iocq entries allocated */ +- +- TAILQ_ENTRY(spdk_fio_thread) link; +-}; +- +-struct spdk_fio_zone_cb_arg { +- struct spdk_fio_target *target; +- struct spdk_bdev_zone_info *spdk_zones; +- int completed; +- uint64_t offset_blocks; +- struct zbd_zone *fio_zones; +- unsigned int nr_zones; +-}; +- +-/* On App Thread (oat) context used for making sync calls from async calls. 
*/ +-struct spdk_fio_oat_ctx { +- union { +- struct spdk_fio_setup_args { +- struct thread_data *td; +- } sa; +- struct spdk_fio_bdev_get_zoned_model_args { +- struct fio_file *f; +- enum zbd_zoned_model *model; +- } zma; +- struct spdk_fio_bdev_get_max_open_zones_args { +- struct fio_file *f; +- unsigned int *max_open_zones; +- } moza; +- } u; +- pthread_mutex_t mutex; +- pthread_cond_t cond; +- int ret; +-}; +- +-static bool g_spdk_env_initialized = false; +-static const char *g_json_config_file = NULL; +-static const char *g_rpc_listen_addr = SPDK_DEFAULT_RPC_ADDR; +- +-static int spdk_fio_init(struct thread_data *td); +-static void spdk_fio_cleanup(struct thread_data *td); +-static size_t spdk_fio_poll_thread(struct spdk_fio_thread *fio_thread); +-static int spdk_fio_handle_options(struct thread_data *td, struct fio_file *f, +- struct spdk_bdev *bdev); +-static int spdk_fio_handle_options_per_target(struct thread_data *td, struct fio_file *f); +-static void spdk_fio_setup_oat(void *ctx); +- +-static pthread_t g_init_thread_id = 0; +-static pthread_mutex_t g_init_mtx = PTHREAD_MUTEX_INITIALIZER; +-static pthread_cond_t g_init_cond; +-static bool g_poll_loop = true; +-static TAILQ_HEAD(, spdk_fio_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); +- +-/* Default polling timeout (ns) */ +-#define SPDK_FIO_POLLING_TIMEOUT 1000000000ULL +- +-static __thread bool g_internal_thread = false; +- +-/* Run msg_fn on app thread ("oat") and wait for it to call spdk_fio_wake_oat_waiter() */ +-static void +-spdk_fio_sync_run_oat(void (*msg_fn)(void *), struct spdk_fio_oat_ctx *ctx) +-{ +- assert(spdk_get_thread() != spdk_thread_get_app_thread()); +- +- pthread_mutex_init(&ctx->mutex, NULL); +- pthread_cond_init(&ctx->cond, NULL); +- pthread_mutex_lock(&ctx->mutex); +- +- spdk_thread_send_msg(spdk_thread_get_app_thread(), msg_fn, ctx); +- +- /* Wake up the poll loop in spdk_init_thread_poll() */ +- pthread_mutex_lock(&g_init_mtx); +- pthread_cond_signal(&g_init_cond); +- pthread_mutex_unlock(&g_init_mtx); +- +- /* Wait for msg_fn() to call spdk_fio_wake_oat_waiter() */ +- pthread_cond_wait(&ctx->cond, &ctx->mutex); +- pthread_mutex_unlock(&ctx->mutex); +- +- pthread_mutex_destroy(&ctx->mutex); +- pthread_cond_destroy(&ctx->cond); +-} +- +-static void +-spdk_fio_wake_oat_waiter(struct spdk_fio_oat_ctx *ctx) +-{ +- pthread_mutex_lock(&ctx->mutex); +- pthread_cond_signal(&ctx->cond); +- pthread_mutex_unlock(&ctx->mutex); +-} +- +-static int +-spdk_fio_schedule_thread(struct spdk_thread *thread) +-{ +- struct spdk_fio_thread *fio_thread; +- +- if (g_internal_thread) { +- /* Do nothing. 
*/ +- return 0; +- } +- +- fio_thread = spdk_thread_get_ctx(thread); +- +- pthread_mutex_lock(&g_init_mtx); +- TAILQ_INSERT_TAIL(&g_threads, fio_thread, link); +- pthread_mutex_unlock(&g_init_mtx); +- +- return 0; +-} +- +-static int +-spdk_fio_init_thread(struct thread_data *td) +-{ +- struct spdk_fio_thread *fio_thread; +- struct spdk_thread *thread; +- +- g_internal_thread = true; +- thread = spdk_thread_create("fio_thread", NULL); +- g_internal_thread = false; +- if (!thread) { +- SPDK_ERRLOG("failed to allocate thread\n"); +- return -1; +- } +- +- fio_thread = spdk_thread_get_ctx(thread); +- fio_thread->td = td; +- fio_thread->thread = thread; +- td->io_ops_data = fio_thread; +- +- spdk_set_thread(thread); +- +- fio_thread->iocq_size = td->o.iodepth; +- fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *)); +- assert(fio_thread->iocq != NULL); +- +- TAILQ_INIT(&fio_thread->targets); +- +- return 0; +-} +- +-static void +-spdk_fio_bdev_close_targets(void *arg) +-{ +- struct spdk_fio_thread *fio_thread = arg; +- struct spdk_fio_target *target, *tmp; +- +- TAILQ_FOREACH_SAFE(target, &fio_thread->targets, link, tmp) { +- TAILQ_REMOVE(&fio_thread->targets, target, link); +- spdk_put_io_channel(target->ch); +- spdk_bdev_close(target->desc); +- free(target); +- } +-} +- +-static void +-spdk_fio_cleanup_thread(struct spdk_fio_thread *fio_thread) +-{ +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_close_targets, fio_thread); +- +- pthread_mutex_lock(&g_init_mtx); +- TAILQ_INSERT_TAIL(&g_threads, fio_thread, link); +- pthread_mutex_unlock(&g_init_mtx); +-} +- +-static void +-spdk_fio_calc_timeout(struct spdk_fio_thread *fio_thread, struct timespec *ts) +-{ +- uint64_t timeout, now; +- +- if (spdk_thread_has_active_pollers(fio_thread->thread)) { +- return; +- } +- +- timeout = spdk_thread_next_poller_expiration(fio_thread->thread); +- now = spdk_get_ticks(); +- +- if (timeout == 0) { +- timeout = now + (SPDK_FIO_POLLING_TIMEOUT * spdk_get_ticks_hz()) / SPDK_SEC_TO_NSEC; +- } +- +- if (timeout > now) { +- timeout = ((timeout - now) * SPDK_SEC_TO_NSEC) / spdk_get_ticks_hz() + +- ts->tv_sec * SPDK_SEC_TO_NSEC + ts->tv_nsec; +- +- ts->tv_sec = timeout / SPDK_SEC_TO_NSEC; +- ts->tv_nsec = timeout % SPDK_SEC_TO_NSEC; +- } +-} +- +-static void +-spdk_fio_bdev_init_done(int rc, void *cb_arg) +-{ +- *(bool *)cb_arg = true; +- +- if (spdk_rpc_initialize(g_rpc_listen_addr) == 0) { +- spdk_rpc_set_state(SPDK_RPC_RUNTIME); +- } +-} +- +-static void +-spdk_fio_bdev_init_start(void *arg) +-{ +- bool *done = arg; +- +- spdk_subsystem_init_from_json_config(g_json_config_file, SPDK_DEFAULT_RPC_ADDR, +- spdk_fio_bdev_init_done, done, true); +-} +- +-static void +-spdk_fio_bdev_fini_done(void *cb_arg) +-{ +- *(bool *)cb_arg = true; +- +- spdk_rpc_finish(); +-} +- +-static void +-spdk_fio_bdev_fini_start(void *arg) +-{ +- bool *done = arg; +- +- spdk_subsystem_fini(spdk_fio_bdev_fini_done, done); +-} +- +-static void * +-spdk_init_thread_poll(void *arg) +-{ +- struct spdk_fio_options *eo = arg; +- struct spdk_fio_thread *fio_thread; +- struct spdk_fio_thread *thread, *tmp; +- struct spdk_env_opts opts; +- bool done; +- int rc; +- struct timespec ts; +- struct thread_data td = {}; +- +- /* Create a dummy thread data for use on the initialization thread. 
*/ +- td.o.iodepth = 32; +- td.eo = eo; +- +- /* Parse the SPDK configuration file */ +- eo = arg; +- +- if (eo->conf && eo->json_conf) { +- SPDK_ERRLOG("Cannot provide two types of configuration files\n"); +- rc = EINVAL; +- goto err_exit; +- } else if (eo->conf && strlen(eo->conf)) { +- g_json_config_file = eo->conf; +- } else if (eo->json_conf && strlen(eo->json_conf)) { +- g_json_config_file = eo->json_conf; +- } else { +- SPDK_ERRLOG("No configuration file provided\n"); +- rc = EINVAL; +- goto err_exit; +- } +- +- /* Initialize the RPC listen address */ +- if (eo->rpc_listen_addr) { +- g_rpc_listen_addr = eo->rpc_listen_addr; +- } +- +- /* Initialize the environment library */ +- spdk_env_opts_init(&opts); +- opts.name = "fio"; +- +- if (eo->mem_mb) { +- opts.mem_size = eo->mem_mb; +- } +- opts.hugepage_single_segments = eo->mem_single_seg; +- if (eo->env_context) { +- opts.env_context = eo->env_context; +- } +- +- if (spdk_env_init(&opts) < 0) { +- SPDK_ERRLOG("Unable to initialize SPDK env\n"); +- rc = EINVAL; +- goto err_exit; +- } +- spdk_unaffinitize_thread(); +- +- if (eo->log_flags) { +- char *tok = strtok(eo->log_flags, ","); +- do { +- rc = spdk_log_set_flag(tok); +- if (rc < 0) { +- SPDK_ERRLOG("unknown spdk log flag %s\n", tok); +- rc = EINVAL; +- goto err_exit; +- } +- } while ((tok = strtok(NULL, ",")) != NULL); +-#ifdef DEBUG +- spdk_log_set_print_level(SPDK_LOG_DEBUG); +-#endif +- } +- +- spdk_thread_lib_init(spdk_fio_schedule_thread, sizeof(struct spdk_fio_thread)); +- +- /* Create an SPDK thread temporarily */ +- rc = spdk_fio_init_thread(&td); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to create initialization thread\n"); +- goto err_exit; +- } +- +- fio_thread = td.io_ops_data; +- +- /* Initialize the bdev layer */ +- done = false; +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_init_start, &done); +- +- do { +- spdk_fio_poll_thread(fio_thread); +- } while (!done); +- +- /* +- * Continue polling until there are no more events. +- * This handles any final events posted by pollers. +- */ +- while (spdk_fio_poll_thread(fio_thread) > 0) {}; +- +- /* Set condition variable */ +- pthread_mutex_lock(&g_init_mtx); +- pthread_cond_signal(&g_init_cond); +- +- pthread_mutex_unlock(&g_init_mtx); +- +- while (g_poll_loop) { +- spdk_fio_poll_thread(fio_thread); +- +- pthread_mutex_lock(&g_init_mtx); +- if (!TAILQ_EMPTY(&g_threads)) { +- TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { +- if (spdk_thread_is_exited(thread->thread)) { +- TAILQ_REMOVE(&g_threads, thread, link); +- free(thread->iocq); +- spdk_thread_destroy(thread->thread); +- } else { +- spdk_fio_poll_thread(thread); +- } +- } +- +- /* If there are exiting threads to poll, don't sleep. */ +- pthread_mutex_unlock(&g_init_mtx); +- continue; +- } +- +- /* Figure out how long to sleep. 
*/ +- clock_gettime(CLOCK_MONOTONIC, &ts); +- spdk_fio_calc_timeout(fio_thread, &ts); +- +- rc = pthread_cond_timedwait(&g_init_cond, &g_init_mtx, &ts); +- pthread_mutex_unlock(&g_init_mtx); +- +- if (rc != 0 && rc != ETIMEDOUT) { +- break; +- } +- } +- +- spdk_fio_cleanup_thread(fio_thread); +- +- /* Finalize the bdev layer */ +- done = false; +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_fini_start, &done); +- +- do { +- spdk_fio_poll_thread(fio_thread); +- +- TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { +- spdk_fio_poll_thread(thread); +- } +- } while (!done); +- +- /* Now exit all the threads */ +- TAILQ_FOREACH(thread, &g_threads, link) { +- spdk_set_thread(thread->thread); +- spdk_thread_exit(thread->thread); +- spdk_set_thread(NULL); +- } +- +- /* And wait for them to gracefully exit */ +- while (!TAILQ_EMPTY(&g_threads)) { +- TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { +- if (spdk_thread_is_exited(thread->thread)) { +- TAILQ_REMOVE(&g_threads, thread, link); +- free(thread->iocq); +- spdk_thread_destroy(thread->thread); +- } else { +- spdk_thread_poll(thread->thread, 0, 0); +- } +- } +- } +- +- pthread_exit(NULL); +- +-err_exit: +- exit(rc); +- return NULL; +-} +- +-static int +-spdk_fio_init_env(struct thread_data *td) +-{ +- pthread_condattr_t attr; +- int rc = -1; +- +- if (pthread_condattr_init(&attr)) { +- SPDK_ERRLOG("Unable to initialize condition variable\n"); +- return -1; +- } +- +- if (pthread_condattr_setclock(&attr, CLOCK_MONOTONIC)) { +- SPDK_ERRLOG("Unable to initialize condition variable\n"); +- goto out; +- } +- +- if (pthread_cond_init(&g_init_cond, &attr)) { +- SPDK_ERRLOG("Unable to initialize condition variable\n"); +- goto out; +- } +- +- /* +- * Spawn a thread to handle initialization operations and to poll things +- * like the admin queues periodically. +- */ +- rc = pthread_create(&g_init_thread_id, NULL, &spdk_init_thread_poll, td->eo); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to spawn thread to poll admin queue. It won't be polled.\n"); +- } +- +- /* Wait for background thread to advance past the initialization */ +- pthread_mutex_lock(&g_init_mtx); +- pthread_cond_wait(&g_init_cond, &g_init_mtx); +- pthread_mutex_unlock(&g_init_mtx); +-out: +- pthread_condattr_destroy(&attr); +- return rc; +-} +- +-static bool +-fio_redirected_to_dev_null(void) +-{ +- char path[PATH_MAX] = ""; +- ssize_t ret; +- +- ret = readlink("/proc/self/fd/1", path, sizeof(path)); +- +- if (ret == -1 || strcmp(path, "/dev/null") != 0) { +- return false; +- } +- +- ret = readlink("/proc/self/fd/2", path, sizeof(path)); +- +- if (ret == -1 || strcmp(path, "/dev/null") != 0) { +- return false; +- } +- +- return true; +-} +- +-static int +-spdk_fio_init_spdk_env(struct thread_data *td) +-{ +- static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER; +- +- pthread_mutex_lock(&setup_lock); +- if (!g_spdk_env_initialized) { +- if (spdk_fio_init_env(td)) { +- pthread_mutex_unlock(&setup_lock); +- SPDK_ERRLOG("failed to initialize\n"); +- return -1; +- } +- +- g_spdk_env_initialized = true; +- } +- pthread_mutex_unlock(&setup_lock); +- +- return 0; +-} +- +-/* Called for each thread to fill in the 'real_file_size' member for +- * each file associated with this thread. This is called prior to +- * the init operation (spdk_fio_init()) below. This call will occur +- * on the initial start up thread if 'create_serialize' is true, or +- * on the thread actually associated with 'thread_data' if 'create_serialize' +- * is false. 
+- */ +-static int +-spdk_fio_setup(struct thread_data *td) +-{ +- struct spdk_fio_oat_ctx ctx = { 0 }; +- +- /* +- * If we're running in a daemonized FIO instance, it's possible +- * fd 1/2 were re-used for something important by FIO. Newer fio +- * versions are careful to redirect those to /dev/null, but if we're +- * not, we'll abort early, so we don't accidentally write messages to +- * an important file, etc. +- */ +- if (is_backend && !fio_redirected_to_dev_null()) { +- char buf[1024]; +- snprintf(buf, sizeof(buf), +- "SPDK FIO plugin is in daemon mode, but stdout/stderr " +- "aren't redirected to /dev/null. Aborting."); +- fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf)); +- return -1; +- } +- +- if (!td->o.use_thread) { +- SPDK_ERRLOG("must set thread=1 when using spdk plugin\n"); +- return -1; +- } +- +- if (spdk_fio_init_spdk_env(td) != 0) { +- return -1; +- } +- +- ctx.u.sa.td = td; +- spdk_fio_sync_run_oat(spdk_fio_setup_oat, &ctx); +- return ctx.ret; +-} +- +-static void +-spdk_fio_setup_oat(void *_ctx) +-{ +- struct spdk_fio_oat_ctx *ctx = _ctx; +- struct thread_data *td = ctx->u.sa.td; +- unsigned int i; +- struct fio_file *f; +- +- if (td->o.nr_files == 1 && strcmp(td->files[0]->file_name, "*") == 0) { +- struct spdk_bdev *bdev; +- +- /* add all available bdevs as fio targets */ +- for (bdev = spdk_bdev_first_leaf(); bdev; bdev = spdk_bdev_next_leaf(bdev)) { +- add_file(td, spdk_bdev_get_name(bdev), 0, 1); +- } +- } +- +- for_each_file(td, f, i) { +- struct spdk_bdev *bdev; +- +- if (strcmp(f->file_name, "*") == 0) { +- continue; +- } +- +- bdev = spdk_bdev_get_by_name(f->file_name); +- if (!bdev) { +- SPDK_ERRLOG("Unable to find bdev with name %s\n", f->file_name); +- ctx->ret = -1; +- goto out; +- } +- +- f->real_file_size = spdk_bdev_get_num_blocks(bdev) * +- spdk_bdev_get_block_size(bdev); +- f->filetype = FIO_TYPE_BLOCK; +- fio_file_set_size_known(f); +- +- ctx->ret = spdk_fio_handle_options(td, f, bdev); +- if (ctx->ret) { +- goto out; +- } +- } +- +- ctx->ret = 0; +-out: +- spdk_fio_wake_oat_waiter(ctx); +-} +- +-static void +-fio_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- SPDK_WARNLOG("Unsupported bdev event: type %d\n", type); +-} +- +-static void +-spdk_fio_bdev_open(void *arg) +-{ +- struct thread_data *td = arg; +- struct spdk_fio_thread *fio_thread; +- unsigned int i; +- struct fio_file *f; +- int rc; +- +- fio_thread = td->io_ops_data; +- +- for_each_file(td, f, i) { +- struct spdk_fio_target *target; +- +- if (strcmp(f->file_name, "*") == 0) { +- continue; +- } +- +- target = calloc(1, sizeof(*target)); +- if (!target) { +- SPDK_ERRLOG("Unable to allocate memory for I/O target.\n"); +- fio_thread->failed = true; +- return; +- } +- +- rc = spdk_bdev_open_ext(f->file_name, true, fio_bdev_event_cb, NULL, +- &target->desc); +- if (rc) { +- SPDK_ERRLOG("Unable to open bdev %s\n", f->file_name); +- free(target); +- fio_thread->failed = true; +- return; +- } +- +- target->bdev = spdk_bdev_desc_get_bdev(target->desc); +- +- target->ch = spdk_bdev_get_io_channel(target->desc); +- if (!target->ch) { +- SPDK_ERRLOG("Unable to get I/O channel for bdev.\n"); +- spdk_bdev_close(target->desc); +- free(target); +- fio_thread->failed = true; +- return; +- } +- +- f->engine_data = target; +- +- rc = spdk_fio_handle_options_per_target(td, f); +- if (rc) { +- SPDK_ERRLOG("Failed to handle options for: %s\n", f->file_name); +- f->engine_data = NULL; +- spdk_put_io_channel(target->ch); +- spdk_bdev_close(target->desc); +- 
free(target); +- fio_thread->failed = true; +- return; +- } +- +- TAILQ_INSERT_TAIL(&fio_thread->targets, target, link); +- } +-} +- +-/* Called for each thread, on that thread, shortly after the thread +- * starts. +- * +- * Also called by spdk_fio_report_zones(), since we need an I/O channel +- * in order to get the zone report. (fio calls the .report_zones callback +- * before it calls the .init callback.) +- * Therefore, if fio was run with --zonemode=zbd, the thread will already +- * be initialized by the time that fio calls the .init callback. +- */ +-static int +-spdk_fio_init(struct thread_data *td) +-{ +- struct spdk_fio_thread *fio_thread; +- int rc; +- +- if (spdk_fio_init_spdk_env(td) != 0) { +- return -1; +- } +- +- /* If thread has already been initialized, do nothing. */ +- if (td->io_ops_data) { +- return 0; +- } +- +- rc = spdk_fio_init_thread(td); +- if (rc) { +- return rc; +- } +- +- fio_thread = td->io_ops_data; +- assert(fio_thread); +- fio_thread->failed = false; +- +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_open, td); +- +- while (spdk_fio_poll_thread(fio_thread) > 0) {} +- +- if (fio_thread->failed) { +- return -1; +- } +- +- return 0; +-} +- +-static void +-spdk_fio_cleanup(struct thread_data *td) +-{ +- struct spdk_fio_thread *fio_thread = td->io_ops_data; +- +- spdk_fio_cleanup_thread(fio_thread); +- td->io_ops_data = NULL; +-} +- +-static int +-spdk_fio_open(struct thread_data *td, struct fio_file *f) +-{ +- +- return 0; +-} +- +-static int +-spdk_fio_close(struct thread_data *td, struct fio_file *f) +-{ +- return 0; +-} +- +-static int +-spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem) +-{ +- td->orig_buffer = spdk_dma_zmalloc(total_mem, 0x1000, NULL); +- return td->orig_buffer == NULL; +-} +- +-static void +-spdk_fio_iomem_free(struct thread_data *td) +-{ +- spdk_dma_free(td->orig_buffer); +-} +- +-static int +-spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u) +-{ +- struct spdk_fio_request *fio_req; +- +- io_u->engine_data = NULL; +- +- fio_req = calloc(1, sizeof(*fio_req)); +- if (fio_req == NULL) { +- return 1; +- } +- fio_req->io = io_u; +- fio_req->td = td; +- +- io_u->engine_data = fio_req; +- +- return 0; +-} +- +-static void +-spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u) +-{ +- struct spdk_fio_request *fio_req = io_u->engine_data; +- +- if (fio_req) { +- assert(fio_req->io == io_u); +- free(fio_req); +- io_u->engine_data = NULL; +- } +-} +- +-static void +-spdk_fio_completion_cb(struct spdk_bdev_io *bdev_io, +- bool success, +- void *cb_arg) +-{ +- struct spdk_fio_request *fio_req = cb_arg; +- struct thread_data *td = fio_req->td; +- struct spdk_fio_thread *fio_thread = td->io_ops_data; +- +- assert(fio_thread->iocq_count < fio_thread->iocq_size); +- fio_req->io->error = success ? 
0 : EIO; +- fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io; +- +- spdk_bdev_free_io(bdev_io); +-} +- +-#if FIO_IOOPS_VERSION >= 24 +-typedef enum fio_q_status fio_q_status_t; +-#else +-typedef int fio_q_status_t; +-#endif +- +-static uint64_t +-spdk_fio_zone_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *zone_start, +- uint64_t num_bytes, uint64_t *num_blocks) +-{ +- uint32_t block_size = spdk_bdev_get_block_size(bdev); +- *zone_start = spdk_bdev_get_zone_id(bdev, offset_bytes / block_size); +- *num_blocks = num_bytes / block_size; +- return (offset_bytes % block_size) | (num_bytes % block_size); +-} +- +-static fio_q_status_t +-spdk_fio_queue(struct thread_data *td, struct io_u *io_u) +-{ +- int rc = 1; +- struct spdk_fio_request *fio_req = io_u->engine_data; +- struct spdk_fio_target *target = io_u->file->engine_data; +- +- assert(fio_req->td == td); +- +- if (!target) { +- SPDK_ERRLOG("Unable to look up correct I/O target.\n"); +- fio_req->io->error = ENODEV; +- return FIO_Q_COMPLETED; +- } +- +- switch (io_u->ddir) { +- case DDIR_READ: +- rc = spdk_bdev_read(target->desc, target->ch, +- io_u->buf, io_u->offset, io_u->xfer_buflen, +- spdk_fio_completion_cb, fio_req); +- break; +- case DDIR_WRITE: +- if (!target->zone_append_enabled) { +- rc = spdk_bdev_write(target->desc, target->ch, +- io_u->buf, io_u->offset, io_u->xfer_buflen, +- spdk_fio_completion_cb, fio_req); +- } else { +- uint64_t zone_start, num_blocks; +- if (spdk_fio_zone_bytes_to_blocks(target->bdev, io_u->offset, &zone_start, +- io_u->xfer_buflen, &num_blocks) != 0) { +- rc = -EINVAL; +- break; +- } +- rc = spdk_bdev_zone_append(target->desc, target->ch, io_u->buf, +- zone_start, num_blocks, spdk_fio_completion_cb, +- fio_req); +- } +- break; +- case DDIR_TRIM: +- rc = spdk_bdev_unmap(target->desc, target->ch, +- io_u->offset, io_u->xfer_buflen, +- spdk_fio_completion_cb, fio_req); +- break; +- case DDIR_SYNC: +- rc = spdk_bdev_flush(target->desc, target->ch, +- io_u->offset, io_u->xfer_buflen, +- spdk_fio_completion_cb, fio_req); +- break; +- default: +- assert(false); +- break; +- } +- +- if (rc == -ENOMEM) { +- return FIO_Q_BUSY; +- } +- +- if (rc != 0) { +- fio_req->io->error = abs(rc); +- return FIO_Q_COMPLETED; +- } +- +- return FIO_Q_QUEUED; +-} +- +-static struct io_u * +-spdk_fio_event(struct thread_data *td, int event) +-{ +- struct spdk_fio_thread *fio_thread = td->io_ops_data; +- +- assert(event >= 0); +- assert((unsigned)event < fio_thread->iocq_count); +- return fio_thread->iocq[event]; +-} +- +-static size_t +-spdk_fio_poll_thread(struct spdk_fio_thread *fio_thread) +-{ +- return spdk_thread_poll(fio_thread->thread, 0, 0); +-} +- +-static int +-spdk_fio_getevents(struct thread_data *td, unsigned int min, +- unsigned int max, const struct timespec *t) +-{ +- struct spdk_fio_thread *fio_thread = td->io_ops_data; +- struct timespec t0, t1; +- uint64_t timeout = 0; +- +- if (t) { +- timeout = t->tv_sec * SPDK_SEC_TO_NSEC + t->tv_nsec; +- clock_gettime(CLOCK_MONOTONIC_RAW, &t0); +- } +- +- fio_thread->iocq_count = 0; +- +- for (;;) { +- spdk_fio_poll_thread(fio_thread); +- +- if (fio_thread->iocq_count >= min) { +- return fio_thread->iocq_count; +- } +- +- if (t) { +- clock_gettime(CLOCK_MONOTONIC_RAW, &t1); +- uint64_t elapse = ((t1.tv_sec - t0.tv_sec) * SPDK_SEC_TO_NSEC) +- + t1.tv_nsec - t0.tv_nsec; +- if (elapse > timeout) { +- break; +- } +- } +- } +- +- return fio_thread->iocq_count; +-} +- +-static int +-spdk_fio_invalidate(struct thread_data *td, struct fio_file *f) +-{ 
+- /* TODO: This should probably send a flush to the device, but for now just return successful. */ +- return 0; +-} +- +-#if FIO_HAS_ZBD +-/* Runs on app thread (oat) */ +-static void +-spdk_fio_get_zoned_model_oat(void *arg) +-{ +- struct spdk_fio_oat_ctx *ctx = arg; +- struct fio_file *f = ctx->u.zma.f; +- enum zbd_zoned_model *model = ctx->u.zma.model; +- struct spdk_bdev *bdev; +- +- if (f->filetype != FIO_TYPE_BLOCK) { +- SPDK_ERRLOG("Unsupported filetype: %d\n", f->filetype); +- ctx->ret = -EINVAL; +- goto out; +- } +- +- bdev = spdk_bdev_get_by_name(f->file_name); +- if (!bdev) { +- SPDK_ERRLOG("Cannot get zoned model, no bdev with name: %s\n", f->file_name); +- ctx->ret = -ENODEV; +- goto out; +- } +- +- if (spdk_bdev_is_zoned(bdev)) { +- *model = ZBD_HOST_MANAGED; +- } else { +- *model = ZBD_NONE; +- } +- +- ctx->ret = 0; +-out: +- spdk_fio_wake_oat_waiter(ctx); +-} +- +-static int +-spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model) +-{ +- struct spdk_fio_oat_ctx ctx = { 0 }; +- +- ctx.u.zma.f = f; +- ctx.u.zma.model = model; +- +- spdk_fio_sync_run_oat(spdk_fio_get_zoned_model_oat, &ctx); +- +- return ctx.ret; +-} +- +- +-static void +-spdk_fio_bdev_get_zone_info_done(struct spdk_bdev_io *bdev_io, bool success, void *arg) +-{ +- struct spdk_fio_zone_cb_arg *cb_arg = arg; +- unsigned int i; +- int handled_zones = 0; +- +- if (!success) { +- spdk_bdev_free_io(bdev_io); +- cb_arg->completed = -EIO; +- return; +- } +- +- for (i = 0; i < cb_arg->nr_zones; i++) { +- struct spdk_bdev_zone_info *zone_src = &cb_arg->spdk_zones[handled_zones]; +- struct zbd_zone *zone_dest = &cb_arg->fio_zones[handled_zones]; +- uint32_t block_size = spdk_bdev_get_block_size(cb_arg->target->bdev); +- +- switch (zone_src->type) { +- case SPDK_BDEV_ZONE_TYPE_SEQWR: +- zone_dest->type = ZBD_ZONE_TYPE_SWR; +- break; +- case SPDK_BDEV_ZONE_TYPE_SEQWP: +- zone_dest->type = ZBD_ZONE_TYPE_SWP; +- break; +- case SPDK_BDEV_ZONE_TYPE_CNV: +- zone_dest->type = ZBD_ZONE_TYPE_CNV; +- break; +- default: +- spdk_bdev_free_io(bdev_io); +- cb_arg->completed = -EIO; +- return; +- } +- +- zone_dest->len = spdk_bdev_get_zone_size(cb_arg->target->bdev) * block_size; +- zone_dest->capacity = zone_src->capacity * block_size; +- zone_dest->start = zone_src->zone_id * block_size; +- zone_dest->wp = zone_src->write_pointer * block_size; +- +- switch (zone_src->state) { +- case SPDK_BDEV_ZONE_STATE_EMPTY: +- zone_dest->cond = ZBD_ZONE_COND_EMPTY; +- break; +- case SPDK_BDEV_ZONE_STATE_IMP_OPEN: +- zone_dest->cond = ZBD_ZONE_COND_IMP_OPEN; +- break; +- case SPDK_BDEV_ZONE_STATE_EXP_OPEN: +- zone_dest->cond = ZBD_ZONE_COND_EXP_OPEN; +- break; +- case SPDK_BDEV_ZONE_STATE_FULL: +- zone_dest->cond = ZBD_ZONE_COND_FULL; +- break; +- case SPDK_BDEV_ZONE_STATE_CLOSED: +- zone_dest->cond = ZBD_ZONE_COND_CLOSED; +- break; +- case SPDK_BDEV_ZONE_STATE_READ_ONLY: +- zone_dest->cond = ZBD_ZONE_COND_READONLY; +- break; +- case SPDK_BDEV_ZONE_STATE_OFFLINE: +- zone_dest->cond = ZBD_ZONE_COND_OFFLINE; +- break; +- case SPDK_BDEV_ZONE_STATE_NOT_WP: +- zone_dest->cond = ZBD_ZONE_COND_NOT_WP; +- /* Set WP to end of zone for zone types w/o WP (e.g. Conv. 
zones in SMR) */ +- zone_dest->wp = zone_dest->start + zone_dest->capacity; +- break; +- default: +- spdk_bdev_free_io(bdev_io); +- cb_arg->completed = -EIO; +- return; +- } +- handled_zones++; +- } +- +- spdk_bdev_free_io(bdev_io); +- cb_arg->completed = handled_zones; +-} +- +-static void +-spdk_fio_bdev_get_zone_info(void *arg) +-{ +- struct spdk_fio_zone_cb_arg *cb_arg = arg; +- struct spdk_fio_target *target = cb_arg->target; +- int rc; +- +- rc = spdk_bdev_get_zone_info(target->desc, target->ch, cb_arg->offset_blocks, +- cb_arg->nr_zones, cb_arg->spdk_zones, +- spdk_fio_bdev_get_zone_info_done, cb_arg); +- if (rc < 0) { +- cb_arg->completed = rc; +- } +-} +- +-static int +-spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset, +- struct zbd_zone *zones, unsigned int nr_zones) +-{ +- struct spdk_fio_target *target; +- struct spdk_fio_thread *fio_thread; +- struct spdk_fio_zone_cb_arg cb_arg; +- uint32_t block_size; +- int rc; +- +- if (nr_zones == 0) { +- return 0; +- } +- +- /* spdk_fio_report_zones() is only called before the bdev I/O channels have been created. +- * Since we need an I/O channel for report_zones(), call spdk_fio_init() to initialize +- * the thread early. +- * spdk_fio_report_zones() might be called several times by fio, if e.g. the zone report +- * for all zones does not fit in the buffer that fio has allocated for the zone report. +- * It is safe to call spdk_fio_init(), even if the thread has already been initialized. +- */ +- rc = spdk_fio_init(td); +- if (rc) { +- return rc; +- } +- fio_thread = td->io_ops_data; +- target = f->engine_data; +- +- assert(fio_thread); +- assert(target); +- +- block_size = spdk_bdev_get_block_size(target->bdev); +- +- cb_arg.target = target; +- cb_arg.completed = 0; +- cb_arg.offset_blocks = offset / block_size; +- cb_arg.fio_zones = zones; +- cb_arg.nr_zones = spdk_min(nr_zones, spdk_bdev_get_num_zones(target->bdev)); +- +- cb_arg.spdk_zones = calloc(1, sizeof(*cb_arg.spdk_zones) * cb_arg.nr_zones); +- if (!cb_arg.spdk_zones) { +- SPDK_ERRLOG("Could not allocate memory for zone report!\n"); +- rc = -ENOMEM; +- goto cleanup_thread; +- } +- +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_get_zone_info, &cb_arg); +- do { +- spdk_fio_poll_thread(fio_thread); +- } while (!cb_arg.completed); +- +- /* Free cb_arg.spdk_zones. The report in fio format is stored in cb_arg.fio_zones/zones. */ +- free(cb_arg.spdk_zones); +- +- rc = cb_arg.completed; +- if (rc < 0) { +- SPDK_ERRLOG("Failed to get zone info: %d\n", rc); +- goto cleanup_thread; +- } +- +- /* Return the amount of zones successfully copied. 
*/ +- return rc; +- +-cleanup_thread: +- spdk_fio_cleanup(td); +- +- return rc; +-} +- +-static void +-spdk_fio_bdev_zone_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *arg) +-{ +- struct spdk_fio_zone_cb_arg *cb_arg = arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- cb_arg->completed = -EIO; +- } else { +- cb_arg->completed = 1; +- } +-} +- +-static void +-spdk_fio_bdev_zone_reset(void *arg) +-{ +- struct spdk_fio_zone_cb_arg *cb_arg = arg; +- struct spdk_fio_target *target = cb_arg->target; +- int rc; +- +- rc = spdk_bdev_zone_management(target->desc, target->ch, cb_arg->offset_blocks, +- SPDK_BDEV_ZONE_RESET, +- spdk_fio_bdev_zone_reset_done, cb_arg); +- if (rc < 0) { +- cb_arg->completed = rc; +- } +-} +- +-static int +-spdk_fio_reset_zones(struct spdk_fio_thread *fio_thread, struct spdk_fio_target *target, +- uint64_t offset, uint64_t length) +-{ +- uint64_t zone_size_bytes; +- uint32_t block_size; +- int rc; +- +- assert(fio_thread); +- assert(target); +- +- block_size = spdk_bdev_get_block_size(target->bdev); +- zone_size_bytes = spdk_bdev_get_zone_size(target->bdev) * block_size; +- +- for (uint64_t cur = offset; cur < offset + length; cur += zone_size_bytes) { +- struct spdk_fio_zone_cb_arg cb_arg = { +- .target = target, +- .completed = 0, +- .offset_blocks = cur / block_size, +- }; +- +- spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_zone_reset, &cb_arg); +- do { +- spdk_fio_poll_thread(fio_thread); +- } while (!cb_arg.completed); +- +- rc = cb_arg.completed; +- if (rc < 0) { +- SPDK_ERRLOG("Failed to reset zone: %d\n", rc); +- return rc; +- } +- } +- +- return 0; +-} +- +-static int +-spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length) +-{ +- return spdk_fio_reset_zones(td->io_ops_data, f->engine_data, offset, length); +-} +-#endif +- +-#if FIO_IOOPS_VERSION >= 30 +-static void +-spdk_fio_get_max_open_zones_oat(void *_ctx) +-{ +- struct spdk_fio_oat_ctx *ctx = _ctx; +- struct fio_file *f = ctx->u.moza.f; +- struct spdk_bdev *bdev; +- +- bdev = spdk_bdev_get_by_name(f->file_name); +- if (!bdev) { +- SPDK_ERRLOG("Cannot get max open zones, no bdev with name: %s\n", f->file_name); +- ctx->ret = -ENODEV; +- } else { +- *ctx->u.moza.max_open_zones = spdk_bdev_get_max_open_zones(bdev); +- ctx->ret = 0; +- } +- +- spdk_fio_wake_oat_waiter(ctx); +-} +- +-static int +-spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f, +- unsigned int *max_open_zones) +-{ +- struct spdk_fio_oat_ctx ctx = { 0 }; +- +- ctx.u.moza.f = f; +- ctx.u.moza.max_open_zones = max_open_zones; +- +- spdk_fio_sync_run_oat(spdk_fio_get_max_open_zones_oat, &ctx); +- +- return ctx.ret; +-} +-#endif +- +-static int +-spdk_fio_handle_options(struct thread_data *td, struct fio_file *f, struct spdk_bdev *bdev) +-{ +- struct spdk_fio_options *fio_options = td->eo; +- +- if (fio_options->initial_zone_reset && spdk_bdev_is_zoned(bdev)) { +-#if FIO_HAS_ZBD +- int rc = spdk_fio_init(td); +- if (rc) { +- return rc; +- } +- /* offset used to indicate conventional zones that need to be skipped (reset not allowed) */ +- rc = spdk_fio_reset_zones(td->io_ops_data, f->engine_data, td->o.start_offset, +- f->real_file_size - td->o.start_offset); +- if (rc) { +- spdk_fio_cleanup(td); +- return rc; +- } +-#else +- SPDK_ERRLOG("fio version is too old to support zoned block devices\n"); +-#endif +- } +- +- return 0; +-} +- +-static int +-spdk_fio_handle_options_per_target(struct thread_data *td, struct fio_file *f) +-{ +- struct 
spdk_fio_target *target = f->engine_data; +- struct spdk_fio_options *fio_options = td->eo; +- +- if (fio_options->zone_append && spdk_bdev_is_zoned(target->bdev)) { +- if (spdk_bdev_io_type_supported(target->bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) { +- SPDK_DEBUGLOG(fio_bdev, "Using zone appends instead of writes on: '%s'\n", +- f->file_name); +- target->zone_append_enabled = true; +- } else { +- SPDK_WARNLOG("Falling back to writes on: '%s' - bdev lacks zone append cmd\n", +- f->file_name); +- } +- } +- +- return 0; +-} +- +-static struct fio_option options[] = { +- { +- .name = "spdk_conf", +- .lname = "SPDK configuration file", +- .type = FIO_OPT_STR_STORE, +- .off1 = offsetof(struct spdk_fio_options, conf), +- .help = "A SPDK JSON configuration file", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "spdk_json_conf", +- .lname = "SPDK JSON configuration file", +- .type = FIO_OPT_STR_STORE, +- .off1 = offsetof(struct spdk_fio_options, json_conf), +- .help = "A SPDK JSON configuration file", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "spdk_mem", +- .lname = "SPDK memory in MB", +- .type = FIO_OPT_INT, +- .off1 = offsetof(struct spdk_fio_options, mem_mb), +- .help = "Amount of memory in MB to allocate for SPDK", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "spdk_single_seg", +- .lname = "SPDK switch to create just a single hugetlbfs file", +- .type = FIO_OPT_BOOL, +- .off1 = offsetof(struct spdk_fio_options, mem_single_seg), +- .help = "If set to 1, SPDK will use just a single hugetlbfs file", +- .def = "0", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "log_flags", +- .lname = "log flags", +- .type = FIO_OPT_STR_STORE, +- .off1 = offsetof(struct spdk_fio_options, log_flags), +- .help = "SPDK log flags to enable", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "initial_zone_reset", +- .lname = "Reset Zones on initialization", +- .type = FIO_OPT_INT, +- .off1 = offsetof(struct spdk_fio_options, initial_zone_reset), +- .def = "0", +- .help = "Reset Zones on initialization (0=disable, 1=Reset All Zones)", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "zone_append", +- .lname = "Use zone append instead of write", +- .type = FIO_OPT_INT, +- .off1 = offsetof(struct spdk_fio_options, zone_append), +- .def = "0", +- .help = "Use zone append instead of write (1=zone append, 0=write)", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "env_context", +- .lname = "Environment context options", +- .type = FIO_OPT_STR_STORE, +- .off1 = offsetof(struct spdk_fio_options, env_context), +- .help = "Opaque context for use of the env implementation", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = "spdk_rpc_listen_addr", +- .lname = "SPDK RPC listen address", +- .type = FIO_OPT_STR_STORE, +- .off1 = offsetof(struct spdk_fio_options, rpc_listen_addr), +- .help = "The address to listen the RPC operations", +- .category = FIO_OPT_C_ENGINE, +- .group = FIO_OPT_G_INVALID, +- }, +- { +- .name = NULL, +- }, +-}; +- +-/* FIO imports this structure using dlsym */ +-struct ioengine_ops ioengine = { +- .name = "spdk_bdev", +- .version = FIO_IOOPS_VERSION, +- .flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN, +- .setup = spdk_fio_setup, +- .init = spdk_fio_init, +- /* .prep = unused, */ +- .queue 
= spdk_fio_queue, +- /* .commit = unused, */ +- .getevents = spdk_fio_getevents, +- .event = spdk_fio_event, +- /* .errdetails = unused, */ +- /* .cancel = unused, */ +- .cleanup = spdk_fio_cleanup, +- .open_file = spdk_fio_open, +- .close_file = spdk_fio_close, +- .invalidate = spdk_fio_invalidate, +- /* .unlink_file = unused, */ +- /* .get_file_size = unused, */ +- /* .terminate = unused, */ +- .iomem_alloc = spdk_fio_iomem_alloc, +- .iomem_free = spdk_fio_iomem_free, +- .io_u_init = spdk_fio_io_u_init, +- .io_u_free = spdk_fio_io_u_free, +-#if FIO_HAS_ZBD +- .get_zoned_model = spdk_fio_get_zoned_model, +- .report_zones = spdk_fio_report_zones, +- .reset_wp = spdk_fio_reset_wp, +-#endif +-#if FIO_IOOPS_VERSION >= 30 +- .get_max_open_zones = spdk_fio_get_max_open_zones, +-#endif +- .option_struct_size = sizeof(struct spdk_fio_options), +- .options = options, +-}; +- +-static void fio_init +-spdk_fio_register(void) +-{ +- register_ioengine(&ioengine); +-} +- +-static void +-spdk_fio_finish_env(void) +-{ +- pthread_mutex_lock(&g_init_mtx); +- g_poll_loop = false; +- pthread_cond_signal(&g_init_cond); +- pthread_mutex_unlock(&g_init_mtx); +- pthread_join(g_init_thread_id, NULL); +- +- spdk_thread_lib_fini(); +- spdk_env_fini(); +-} +- +-static void fio_exit +-spdk_fio_unregister(void) +-{ +- if (g_spdk_env_initialized) { +- spdk_fio_finish_env(); +- g_spdk_env_initialized = false; +- } +- unregister_ioengine(&ioengine); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(fio_bdev) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/bdev_zone.h" ++#include "spdk/accel.h" ++#include "spdk/env.h" ++#include "spdk/init.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/queue.h" ++#include "spdk/util.h" ++#include "spdk/rpc.h" ++ ++#include "spdk_internal/event.h" ++ ++#include "config-host.h" ++#include "fio.h" ++#include "optgroup.h" ++ ++#ifdef for_each_rw_ddir ++#define FIO_HAS_ZBD (FIO_IOOPS_VERSION >= 26) ++#else ++#define FIO_HAS_ZBD (0) ++#endif ++ ++/* FreeBSD is missing CLOCK_MONOTONIC_RAW, ++ * so alternative is provided. 
*/ ++#ifndef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ ++#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC ++#endif ++ ++struct spdk_fio_options { ++ void *pad; ++ char *conf; ++ char *json_conf; ++ char *env_context; ++ char *log_flags; ++ unsigned mem_mb; ++ int mem_single_seg; ++ int initial_zone_reset; ++ int zone_append; ++ char *rpc_listen_addr; ++}; ++ ++struct spdk_fio_request { ++ struct io_u *io; ++ struct thread_data *td; ++}; ++ ++struct spdk_fio_target { ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_desc *desc; ++ struct spdk_io_channel *ch; ++ bool zone_append_enabled; ++ ++ TAILQ_ENTRY(spdk_fio_target) link; ++}; ++ ++struct spdk_fio_thread { ++ struct thread_data *td; /* fio thread context */ ++ struct spdk_thread *thread; /* spdk thread context */ ++ ++ TAILQ_HEAD(, spdk_fio_target) targets; ++ bool failed; /* true if the thread failed to initialize */ ++ ++ struct io_u **iocq; /* io completion queue */ ++ unsigned int iocq_count; /* number of iocq entries filled by last getevents */ ++ unsigned int iocq_size; /* number of iocq entries allocated */ ++ ++ TAILQ_ENTRY(spdk_fio_thread) link; ++}; ++ ++struct spdk_fio_zone_cb_arg { ++ struct spdk_fio_target *target; ++ struct spdk_bdev_zone_info *spdk_zones; ++ int completed; ++ uint64_t offset_blocks; ++ struct zbd_zone *fio_zones; ++ unsigned int nr_zones; ++}; ++ ++/* On App Thread (oat) context used for making sync calls from async calls. */ ++struct spdk_fio_oat_ctx { ++ union { ++ struct spdk_fio_setup_args { ++ struct thread_data *td; ++ } sa; ++ struct spdk_fio_bdev_get_zoned_model_args { ++ struct fio_file *f; ++ enum zbd_zoned_model *model; ++ } zma; ++ struct spdk_fio_bdev_get_max_open_zones_args { ++ struct fio_file *f; ++ unsigned int *max_open_zones; ++ } moza; ++ } u; ++ pthread_mutex_t mutex; ++ pthread_cond_t cond; ++ int ret; ++}; ++ ++static bool g_spdk_env_initialized = false; ++static const char *g_json_config_file = NULL; ++static const char *g_rpc_listen_addr = SPDK_DEFAULT_RPC_ADDR; ++ ++static int spdk_fio_init(struct thread_data *td); ++static void spdk_fio_cleanup(struct thread_data *td); ++static size_t spdk_fio_poll_thread(struct spdk_fio_thread *fio_thread); ++static int spdk_fio_handle_options(struct thread_data *td, struct fio_file *f, ++ struct spdk_bdev *bdev); ++static int spdk_fio_handle_options_per_target(struct thread_data *td, struct fio_file *f); ++static void spdk_fio_setup_oat(void *ctx); ++ ++static pthread_t g_init_thread_id = 0; ++static pthread_mutex_t g_init_mtx = PTHREAD_MUTEX_INITIALIZER; ++static pthread_cond_t g_init_cond; ++static bool g_poll_loop = true; ++static TAILQ_HEAD(, spdk_fio_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); ++ ++/* Default polling timeout (ns) */ ++#define SPDK_FIO_POLLING_TIMEOUT 1000000000ULL ++ ++static __thread bool g_internal_thread = false; ++ ++/* Run msg_fn on app thread ("oat") and wait for it to call spdk_fio_wake_oat_waiter() */ ++static void ++spdk_fio_sync_run_oat(void (*msg_fn)(void *), struct spdk_fio_oat_ctx *ctx) ++{ ++ assert(spdk_get_thread() != spdk_thread_get_app_thread()); ++ ++ pthread_mutex_init(&ctx->mutex, NULL); ++ pthread_cond_init(&ctx->cond, NULL); ++ pthread_mutex_lock(&ctx->mutex); ++ ++ spdk_thread_send_msg(spdk_thread_get_app_thread(), msg_fn, ctx); ++ ++ /* Wake up the poll loop in spdk_init_thread_poll() */ ++ pthread_mutex_lock(&g_init_mtx); ++ pthread_cond_signal(&g_init_cond); ++ pthread_mutex_unlock(&g_init_mtx); ++ ++ /* Wait for msg_fn() to call spdk_fio_wake_oat_waiter() */ ++ 
pthread_cond_wait(&ctx->cond, &ctx->mutex); ++ pthread_mutex_unlock(&ctx->mutex); ++ ++ pthread_mutex_destroy(&ctx->mutex); ++ pthread_cond_destroy(&ctx->cond); ++} ++ ++static void ++spdk_fio_wake_oat_waiter(struct spdk_fio_oat_ctx *ctx) ++{ ++ pthread_mutex_lock(&ctx->mutex); ++ pthread_cond_signal(&ctx->cond); ++ pthread_mutex_unlock(&ctx->mutex); ++} ++ ++static int ++spdk_fio_schedule_thread(struct spdk_thread *thread) ++{ ++ struct spdk_fio_thread *fio_thread; ++ ++ if (g_internal_thread) { ++ /* Do nothing. */ ++ return 0; ++ } ++ ++ fio_thread = spdk_thread_get_ctx(thread); ++ ++ pthread_mutex_lock(&g_init_mtx); ++ TAILQ_INSERT_TAIL(&g_threads, fio_thread, link); ++ pthread_mutex_unlock(&g_init_mtx); ++ ++ return 0; ++} ++ ++static int ++spdk_fio_init_thread(struct thread_data *td) ++{ ++ struct spdk_fio_thread *fio_thread; ++ struct spdk_thread *thread; ++ ++ g_internal_thread = true; ++ thread = spdk_thread_create("fio_thread", NULL); ++ g_internal_thread = false; ++ if (!thread) { ++ SPDK_ERRLOG("failed to allocate thread\n"); ++ return -1; ++ } ++ ++ fio_thread = spdk_thread_get_ctx(thread); ++ fio_thread->td = td; ++ fio_thread->thread = thread; ++ td->io_ops_data = fio_thread; ++ ++ spdk_set_thread(thread); ++ ++ fio_thread->iocq_size = td->o.iodepth; ++ fio_thread->iocq = calloc(fio_thread->iocq_size, sizeof(struct io_u *)); ++ assert(fio_thread->iocq != NULL); ++ ++ TAILQ_INIT(&fio_thread->targets); ++ ++ return 0; ++} ++ ++static void ++spdk_fio_bdev_close_targets(void *arg) ++{ ++ struct spdk_fio_thread *fio_thread = arg; ++ struct spdk_fio_target *target, *tmp; ++ ++ TAILQ_FOREACH_SAFE(target, &fio_thread->targets, link, tmp) { ++ TAILQ_REMOVE(&fio_thread->targets, target, link); ++ spdk_put_io_channel(target->ch); ++ spdk_bdev_close(target->desc); ++ free(target); ++ } ++} ++ ++static void ++spdk_fio_cleanup_thread(struct spdk_fio_thread *fio_thread) ++{ ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_close_targets, fio_thread); ++ ++ pthread_mutex_lock(&g_init_mtx); ++ TAILQ_INSERT_TAIL(&g_threads, fio_thread, link); ++ pthread_mutex_unlock(&g_init_mtx); ++} ++ ++static void ++spdk_fio_calc_timeout(struct spdk_fio_thread *fio_thread, struct timespec *ts) ++{ ++ uint64_t timeout, now; ++ ++ if (spdk_thread_has_active_pollers(fio_thread->thread)) { ++ return; ++ } ++ ++ timeout = spdk_thread_next_poller_expiration(fio_thread->thread); ++ now = spdk_get_ticks(); ++ ++ if (timeout == 0) { ++ timeout = now + (SPDK_FIO_POLLING_TIMEOUT * spdk_get_ticks_hz()) / SPDK_SEC_TO_NSEC; ++ } ++ ++ if (timeout > now) { ++ timeout = ((timeout - now) * SPDK_SEC_TO_NSEC) / spdk_get_ticks_hz() + ++ ts->tv_sec * SPDK_SEC_TO_NSEC + ts->tv_nsec; ++ ++ ts->tv_sec = timeout / SPDK_SEC_TO_NSEC; ++ ts->tv_nsec = timeout % SPDK_SEC_TO_NSEC; ++ } ++} ++ ++static void ++spdk_fio_bdev_init_done(int rc, void *cb_arg) ++{ ++ *(bool *)cb_arg = true; ++ ++ if (spdk_rpc_initialize(g_rpc_listen_addr, RPC_SELECT_INTERVAL) == 0) { ++ spdk_rpc_set_state(SPDK_RPC_RUNTIME); ++ } ++} ++ ++static void ++spdk_fio_bdev_init_start(void *arg) ++{ ++ bool *done = arg; ++ ++ spdk_subsystem_init_from_json_config(g_json_config_file, SPDK_DEFAULT_RPC_ADDR, ++ spdk_fio_bdev_init_done, done, true); ++} ++ ++static void ++spdk_fio_bdev_fini_done(void *cb_arg) ++{ ++ *(bool *)cb_arg = true; ++ ++ spdk_rpc_finish(); ++} ++ ++static void ++spdk_fio_bdev_fini_start(void *arg) ++{ ++ bool *done = arg; ++ ++ spdk_subsystem_fini(spdk_fio_bdev_fini_done, done); ++} ++ ++static void * ++spdk_init_thread_poll(void *arg) ++{ 
++ struct spdk_fio_options *eo = arg; ++ struct spdk_fio_thread *fio_thread; ++ struct spdk_fio_thread *thread, *tmp; ++ struct spdk_env_opts opts; ++ bool done; ++ int rc; ++ struct timespec ts; ++ struct thread_data td = {}; ++ ++ /* Create a dummy thread data for use on the initialization thread. */ ++ td.o.iodepth = 32; ++ td.eo = eo; ++ ++ /* Parse the SPDK configuration file */ ++ eo = arg; ++ ++ if (eo->conf && eo->json_conf) { ++ SPDK_ERRLOG("Cannot provide two types of configuration files\n"); ++ rc = EINVAL; ++ goto err_exit; ++ } else if (eo->conf && strlen(eo->conf)) { ++ g_json_config_file = eo->conf; ++ } else if (eo->json_conf && strlen(eo->json_conf)) { ++ g_json_config_file = eo->json_conf; ++ } else { ++ SPDK_ERRLOG("No configuration file provided\n"); ++ rc = EINVAL; ++ goto err_exit; ++ } ++ ++ /* Initialize the RPC listen address */ ++ if (eo->rpc_listen_addr) { ++ g_rpc_listen_addr = eo->rpc_listen_addr; ++ } ++ ++ /* Initialize the environment library */ ++ spdk_env_opts_init(&opts); ++ opts.name = "fio"; ++ ++ if (eo->mem_mb) { ++ opts.mem_size = eo->mem_mb; ++ } ++ opts.hugepage_single_segments = eo->mem_single_seg; ++ if (eo->env_context) { ++ opts.env_context = eo->env_context; ++ } ++ ++ if (spdk_env_init(&opts) < 0) { ++ SPDK_ERRLOG("Unable to initialize SPDK env\n"); ++ rc = EINVAL; ++ goto err_exit; ++ } ++ spdk_unaffinitize_thread(); ++ ++ if (eo->log_flags) { ++ char *tok = strtok(eo->log_flags, ","); ++ do { ++ rc = spdk_log_set_flag(tok); ++ if (rc < 0) { ++ SPDK_ERRLOG("unknown spdk log flag %s\n", tok); ++ rc = EINVAL; ++ goto err_exit; ++ } ++ } while ((tok = strtok(NULL, ",")) != NULL); ++#ifdef DEBUG ++ spdk_log_set_print_level(SPDK_LOG_DEBUG); ++#endif ++ } ++ ++ spdk_thread_lib_init(spdk_fio_schedule_thread, sizeof(struct spdk_fio_thread)); ++ ++ /* Create an SPDK thread temporarily */ ++ rc = spdk_fio_init_thread(&td); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to create initialization thread\n"); ++ goto err_exit; ++ } ++ ++ fio_thread = td.io_ops_data; ++ ++ /* Initialize the bdev layer */ ++ done = false; ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_init_start, &done); ++ ++ do { ++ spdk_fio_poll_thread(fio_thread); ++ } while (!done); ++ ++ /* ++ * Continue polling until there are no more events. ++ * This handles any final events posted by pollers. ++ */ ++ while (spdk_fio_poll_thread(fio_thread) > 0) {}; ++ ++ /* Set condition variable */ ++ pthread_mutex_lock(&g_init_mtx); ++ pthread_cond_signal(&g_init_cond); ++ ++ pthread_mutex_unlock(&g_init_mtx); ++ ++ while (g_poll_loop) { ++ spdk_fio_poll_thread(fio_thread); ++ ++ pthread_mutex_lock(&g_init_mtx); ++ if (!TAILQ_EMPTY(&g_threads)) { ++ TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { ++ if (spdk_thread_is_exited(thread->thread)) { ++ TAILQ_REMOVE(&g_threads, thread, link); ++ free(thread->iocq); ++ spdk_thread_destroy(thread->thread); ++ } else { ++ spdk_fio_poll_thread(thread); ++ } ++ } ++ ++ /* If there are exiting threads to poll, don't sleep. */ ++ pthread_mutex_unlock(&g_init_mtx); ++ continue; ++ } ++ ++ /* Figure out how long to sleep. 
*/ ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ spdk_fio_calc_timeout(fio_thread, &ts); ++ ++ rc = pthread_cond_timedwait(&g_init_cond, &g_init_mtx, &ts); ++ pthread_mutex_unlock(&g_init_mtx); ++ ++ if (rc != 0 && rc != ETIMEDOUT) { ++ break; ++ } ++ } ++ ++ spdk_fio_cleanup_thread(fio_thread); ++ ++ /* Finalize the bdev layer */ ++ done = false; ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_fini_start, &done); ++ ++ do { ++ spdk_fio_poll_thread(fio_thread); ++ ++ TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { ++ spdk_fio_poll_thread(thread); ++ } ++ } while (!done); ++ ++ /* Now exit all the threads */ ++ TAILQ_FOREACH(thread, &g_threads, link) { ++ spdk_set_thread(thread->thread); ++ spdk_thread_exit(thread->thread); ++ spdk_set_thread(NULL); ++ } ++ ++ /* And wait for them to gracefully exit */ ++ while (!TAILQ_EMPTY(&g_threads)) { ++ TAILQ_FOREACH_SAFE(thread, &g_threads, link, tmp) { ++ if (spdk_thread_is_exited(thread->thread)) { ++ TAILQ_REMOVE(&g_threads, thread, link); ++ free(thread->iocq); ++ spdk_thread_destroy(thread->thread); ++ } else { ++ spdk_thread_poll(thread->thread, 0, 0); ++ } ++ } ++ } ++ ++ pthread_exit(NULL); ++ ++err_exit: ++ exit(rc); ++ return NULL; ++} ++ ++static int ++spdk_fio_init_env(struct thread_data *td) ++{ ++ pthread_condattr_t attr; ++ int rc = -1; ++ ++ if (pthread_condattr_init(&attr)) { ++ SPDK_ERRLOG("Unable to initialize condition variable\n"); ++ return -1; ++ } ++ ++ if (pthread_condattr_setclock(&attr, CLOCK_MONOTONIC)) { ++ SPDK_ERRLOG("Unable to initialize condition variable\n"); ++ goto out; ++ } ++ ++ if (pthread_cond_init(&g_init_cond, &attr)) { ++ SPDK_ERRLOG("Unable to initialize condition variable\n"); ++ goto out; ++ } ++ ++ /* ++ * Spawn a thread to handle initialization operations and to poll things ++ * like the admin queues periodically. ++ */ ++ rc = pthread_create(&g_init_thread_id, NULL, &spdk_init_thread_poll, td->eo); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to spawn thread to poll admin queue. It won't be polled.\n"); ++ } ++ ++ /* Wait for background thread to advance past the initialization */ ++ pthread_mutex_lock(&g_init_mtx); ++ pthread_cond_wait(&g_init_cond, &g_init_mtx); ++ pthread_mutex_unlock(&g_init_mtx); ++out: ++ pthread_condattr_destroy(&attr); ++ return rc; ++} ++ ++static bool ++fio_redirected_to_dev_null(void) ++{ ++ char path[PATH_MAX] = ""; ++ ssize_t ret; ++ ++ ret = readlink("/proc/self/fd/1", path, sizeof(path)); ++ ++ if (ret == -1 || strcmp(path, "/dev/null") != 0) { ++ return false; ++ } ++ ++ ret = readlink("/proc/self/fd/2", path, sizeof(path)); ++ ++ if (ret == -1 || strcmp(path, "/dev/null") != 0) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static int ++spdk_fio_init_spdk_env(struct thread_data *td) ++{ ++ static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++ pthread_mutex_lock(&setup_lock); ++ if (!g_spdk_env_initialized) { ++ if (spdk_fio_init_env(td)) { ++ pthread_mutex_unlock(&setup_lock); ++ SPDK_ERRLOG("failed to initialize\n"); ++ return -1; ++ } ++ ++ g_spdk_env_initialized = true; ++ } ++ pthread_mutex_unlock(&setup_lock); ++ ++ return 0; ++} ++ ++/* Called for each thread to fill in the 'real_file_size' member for ++ * each file associated with this thread. This is called prior to ++ * the init operation (spdk_fio_init()) below. This call will occur ++ * on the initial start up thread if 'create_serialize' is true, or ++ * on the thread actually associated with 'thread_data' if 'create_serialize' ++ * is false. 
++ */ ++static int ++spdk_fio_setup(struct thread_data *td) ++{ ++ struct spdk_fio_oat_ctx ctx = { 0 }; ++ ++ /* ++ * If we're running in a daemonized FIO instance, it's possible ++ * fd 1/2 were re-used for something important by FIO. Newer fio ++ * versions are careful to redirect those to /dev/null, but if we're ++ * not, we'll abort early, so we don't accidentally write messages to ++ * an important file, etc. ++ */ ++ if (is_backend && !fio_redirected_to_dev_null()) { ++ char buf[1024]; ++ snprintf(buf, sizeof(buf), ++ "SPDK FIO plugin is in daemon mode, but stdout/stderr " ++ "aren't redirected to /dev/null. Aborting."); ++ fio_server_text_output(FIO_LOG_ERR, buf, sizeof(buf)); ++ return -1; ++ } ++ ++ if (!td->o.use_thread) { ++ SPDK_ERRLOG("must set thread=1 when using spdk plugin\n"); ++ return -1; ++ } ++ ++ if (spdk_fio_init_spdk_env(td) != 0) { ++ return -1; ++ } ++ ++ ctx.u.sa.td = td; ++ spdk_fio_sync_run_oat(spdk_fio_setup_oat, &ctx); ++ return ctx.ret; ++} ++ ++static void ++spdk_fio_setup_oat(void *_ctx) ++{ ++ struct spdk_fio_oat_ctx *ctx = _ctx; ++ struct thread_data *td = ctx->u.sa.td; ++ unsigned int i; ++ struct fio_file *f; ++ ++ if (td->o.nr_files == 1 && strcmp(td->files[0]->file_name, "*") == 0) { ++ struct spdk_bdev *bdev; ++ ++ /* add all available bdevs as fio targets */ ++ for (bdev = spdk_bdev_first_leaf(); bdev; bdev = spdk_bdev_next_leaf(bdev)) { ++ add_file(td, spdk_bdev_get_name(bdev), 0, 1); ++ } ++ } ++ ++ for_each_file(td, f, i) { ++ struct spdk_bdev *bdev; ++ ++ if (strcmp(f->file_name, "*") == 0) { ++ continue; ++ } ++ ++ bdev = spdk_bdev_get_by_name(f->file_name); ++ if (!bdev) { ++ SPDK_ERRLOG("Unable to find bdev with name %s\n", f->file_name); ++ ctx->ret = -1; ++ goto out; ++ } ++ ++ f->real_file_size = spdk_bdev_get_num_blocks(bdev) * ++ spdk_bdev_get_block_size(bdev); ++ f->filetype = FIO_TYPE_BLOCK; ++ fio_file_set_size_known(f); ++ ++ ctx->ret = spdk_fio_handle_options(td, f, bdev); ++ if (ctx->ret) { ++ goto out; ++ } ++ } ++ ++ ctx->ret = 0; ++out: ++ spdk_fio_wake_oat_waiter(ctx); ++} ++ ++static void ++fio_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ SPDK_WARNLOG("Unsupported bdev event: type %d\n", type); ++} ++ ++static void ++spdk_fio_bdev_open(void *arg) ++{ ++ struct thread_data *td = arg; ++ struct spdk_fio_thread *fio_thread; ++ unsigned int i; ++ struct fio_file *f; ++ int rc; ++ ++ fio_thread = td->io_ops_data; ++ ++ for_each_file(td, f, i) { ++ struct spdk_fio_target *target; ++ ++ if (strcmp(f->file_name, "*") == 0) { ++ continue; ++ } ++ ++ target = calloc(1, sizeof(*target)); ++ if (!target) { ++ SPDK_ERRLOG("Unable to allocate memory for I/O target.\n"); ++ fio_thread->failed = true; ++ return; ++ } ++ ++ rc = spdk_bdev_open_ext(f->file_name, true, fio_bdev_event_cb, NULL, ++ &target->desc); ++ if (rc) { ++ SPDK_ERRLOG("Unable to open bdev %s\n", f->file_name); ++ free(target); ++ fio_thread->failed = true; ++ return; ++ } ++ ++ target->bdev = spdk_bdev_desc_get_bdev(target->desc); ++ ++ target->ch = spdk_bdev_get_io_channel(target->desc); ++ if (!target->ch) { ++ SPDK_ERRLOG("Unable to get I/O channel for bdev.\n"); ++ spdk_bdev_close(target->desc); ++ free(target); ++ fio_thread->failed = true; ++ return; ++ } ++ ++ f->engine_data = target; ++ ++ rc = spdk_fio_handle_options_per_target(td, f); ++ if (rc) { ++ SPDK_ERRLOG("Failed to handle options for: %s\n", f->file_name); ++ f->engine_data = NULL; ++ spdk_put_io_channel(target->ch); ++ spdk_bdev_close(target->desc); ++ 
free(target); ++ fio_thread->failed = true; ++ return; ++ } ++ ++ TAILQ_INSERT_TAIL(&fio_thread->targets, target, link); ++ } ++} ++ ++/* Called for each thread, on that thread, shortly after the thread ++ * starts. ++ * ++ * Also called by spdk_fio_report_zones(), since we need an I/O channel ++ * in order to get the zone report. (fio calls the .report_zones callback ++ * before it calls the .init callback.) ++ * Therefore, if fio was run with --zonemode=zbd, the thread will already ++ * be initialized by the time that fio calls the .init callback. ++ */ ++static int ++spdk_fio_init(struct thread_data *td) ++{ ++ struct spdk_fio_thread *fio_thread; ++ int rc; ++ ++ if (spdk_fio_init_spdk_env(td) != 0) { ++ return -1; ++ } ++ ++ /* If thread has already been initialized, do nothing. */ ++ if (td->io_ops_data) { ++ return 0; ++ } ++ ++ rc = spdk_fio_init_thread(td); ++ if (rc) { ++ return rc; ++ } ++ ++ fio_thread = td->io_ops_data; ++ assert(fio_thread); ++ fio_thread->failed = false; ++ ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_open, td); ++ ++ while (spdk_fio_poll_thread(fio_thread) > 0) {} ++ ++ if (fio_thread->failed) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void ++spdk_fio_cleanup(struct thread_data *td) ++{ ++ struct spdk_fio_thread *fio_thread = td->io_ops_data; ++ ++ spdk_fio_cleanup_thread(fio_thread); ++ td->io_ops_data = NULL; ++} ++ ++static int ++spdk_fio_open(struct thread_data *td, struct fio_file *f) ++{ ++ ++ return 0; ++} ++ ++static int ++spdk_fio_close(struct thread_data *td, struct fio_file *f) ++{ ++ return 0; ++} ++ ++static int ++spdk_fio_iomem_alloc(struct thread_data *td, size_t total_mem) ++{ ++ td->orig_buffer = spdk_dma_zmalloc(total_mem, 0x1000, NULL); ++ return td->orig_buffer == NULL; ++} ++ ++static void ++spdk_fio_iomem_free(struct thread_data *td) ++{ ++ spdk_dma_free(td->orig_buffer); ++} ++ ++static int ++spdk_fio_io_u_init(struct thread_data *td, struct io_u *io_u) ++{ ++ struct spdk_fio_request *fio_req; ++ ++ io_u->engine_data = NULL; ++ ++ fio_req = calloc(1, sizeof(*fio_req)); ++ if (fio_req == NULL) { ++ return 1; ++ } ++ fio_req->io = io_u; ++ fio_req->td = td; ++ ++ io_u->engine_data = fio_req; ++ ++ return 0; ++} ++ ++static void ++spdk_fio_io_u_free(struct thread_data *td, struct io_u *io_u) ++{ ++ struct spdk_fio_request *fio_req = io_u->engine_data; ++ ++ if (fio_req) { ++ assert(fio_req->io == io_u); ++ free(fio_req); ++ io_u->engine_data = NULL; ++ } ++} ++ ++static void ++spdk_fio_completion_cb(struct spdk_bdev_io *bdev_io, ++ bool success, ++ void *cb_arg) ++{ ++ struct spdk_fio_request *fio_req = cb_arg; ++ struct thread_data *td = fio_req->td; ++ struct spdk_fio_thread *fio_thread = td->io_ops_data; ++ ++ assert(fio_thread->iocq_count < fio_thread->iocq_size); ++ fio_req->io->error = success ? 
0 : EIO; ++ fio_thread->iocq[fio_thread->iocq_count++] = fio_req->io; ++ ++ spdk_bdev_free_io(bdev_io); ++} ++ ++#if FIO_IOOPS_VERSION >= 24 ++typedef enum fio_q_status fio_q_status_t; ++#else ++typedef int fio_q_status_t; ++#endif ++ ++static uint64_t ++spdk_fio_zone_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *zone_start, ++ uint64_t num_bytes, uint64_t *num_blocks) ++{ ++ uint32_t block_size = spdk_bdev_get_block_size(bdev); ++ *zone_start = spdk_bdev_get_zone_id(bdev, offset_bytes / block_size); ++ *num_blocks = num_bytes / block_size; ++ return (offset_bytes % block_size) | (num_bytes % block_size); ++} ++ ++static fio_q_status_t ++spdk_fio_queue(struct thread_data *td, struct io_u *io_u) ++{ ++ int rc = 1; ++ struct spdk_fio_request *fio_req = io_u->engine_data; ++ struct spdk_fio_target *target = io_u->file->engine_data; ++ ++ assert(fio_req->td == td); ++ ++ if (!target) { ++ SPDK_ERRLOG("Unable to look up correct I/O target.\n"); ++ fio_req->io->error = ENODEV; ++ return FIO_Q_COMPLETED; ++ } ++ ++ switch (io_u->ddir) { ++ case DDIR_READ: ++ rc = spdk_bdev_read(target->desc, target->ch, ++ io_u->buf, io_u->offset, io_u->xfer_buflen, ++ spdk_fio_completion_cb, fio_req); ++ break; ++ case DDIR_WRITE: ++ if (!target->zone_append_enabled) { ++ rc = spdk_bdev_write(target->desc, target->ch, ++ io_u->buf, io_u->offset, io_u->xfer_buflen, ++ spdk_fio_completion_cb, fio_req); ++ } else { ++ uint64_t zone_start, num_blocks; ++ if (spdk_fio_zone_bytes_to_blocks(target->bdev, io_u->offset, &zone_start, ++ io_u->xfer_buflen, &num_blocks) != 0) { ++ rc = -EINVAL; ++ break; ++ } ++ rc = spdk_bdev_zone_append(target->desc, target->ch, io_u->buf, ++ zone_start, num_blocks, spdk_fio_completion_cb, ++ fio_req); ++ } ++ break; ++ case DDIR_TRIM: ++ rc = spdk_bdev_unmap(target->desc, target->ch, ++ io_u->offset, io_u->xfer_buflen, ++ spdk_fio_completion_cb, fio_req); ++ break; ++ case DDIR_SYNC: ++ rc = spdk_bdev_flush(target->desc, target->ch, ++ io_u->offset, io_u->xfer_buflen, ++ spdk_fio_completion_cb, fio_req); ++ break; ++ default: ++ assert(false); ++ break; ++ } ++ ++ if (rc == -ENOMEM) { ++ return FIO_Q_BUSY; ++ } ++ ++ if (rc != 0) { ++ fio_req->io->error = abs(rc); ++ return FIO_Q_COMPLETED; ++ } ++ ++ return FIO_Q_QUEUED; ++} ++ ++static struct io_u * ++spdk_fio_event(struct thread_data *td, int event) ++{ ++ struct spdk_fio_thread *fio_thread = td->io_ops_data; ++ ++ assert(event >= 0); ++ assert((unsigned)event < fio_thread->iocq_count); ++ return fio_thread->iocq[event]; ++} ++ ++static size_t ++spdk_fio_poll_thread(struct spdk_fio_thread *fio_thread) ++{ ++ return spdk_thread_poll(fio_thread->thread, 0, 0); ++} ++ ++static int ++spdk_fio_getevents(struct thread_data *td, unsigned int min, ++ unsigned int max, const struct timespec *t) ++{ ++ struct spdk_fio_thread *fio_thread = td->io_ops_data; ++ struct timespec t0, t1; ++ uint64_t timeout = 0; ++ ++ if (t) { ++ timeout = t->tv_sec * SPDK_SEC_TO_NSEC + t->tv_nsec; ++ clock_gettime(CLOCK_MONOTONIC_RAW, &t0); ++ } ++ ++ fio_thread->iocq_count = 0; ++ ++ for (;;) { ++ spdk_fio_poll_thread(fio_thread); ++ ++ if (fio_thread->iocq_count >= min) { ++ return fio_thread->iocq_count; ++ } ++ ++ if (t) { ++ clock_gettime(CLOCK_MONOTONIC_RAW, &t1); ++ uint64_t elapse = ((t1.tv_sec - t0.tv_sec) * SPDK_SEC_TO_NSEC) ++ + t1.tv_nsec - t0.tv_nsec; ++ if (elapse > timeout) { ++ break; ++ } ++ } ++ } ++ ++ return fio_thread->iocq_count; ++} ++ ++static int ++spdk_fio_invalidate(struct thread_data *td, struct fio_file *f) ++{ 
++ /* TODO: This should probably send a flush to the device, but for now just return successful. */ ++ return 0; ++} ++ ++#if FIO_HAS_ZBD ++/* Runs on app thread (oat) */ ++static void ++spdk_fio_get_zoned_model_oat(void *arg) ++{ ++ struct spdk_fio_oat_ctx *ctx = arg; ++ struct fio_file *f = ctx->u.zma.f; ++ enum zbd_zoned_model *model = ctx->u.zma.model; ++ struct spdk_bdev *bdev; ++ ++ if (f->filetype != FIO_TYPE_BLOCK) { ++ SPDK_ERRLOG("Unsupported filetype: %d\n", f->filetype); ++ ctx->ret = -EINVAL; ++ goto out; ++ } ++ ++ bdev = spdk_bdev_get_by_name(f->file_name); ++ if (!bdev) { ++ SPDK_ERRLOG("Cannot get zoned model, no bdev with name: %s\n", f->file_name); ++ ctx->ret = -ENODEV; ++ goto out; ++ } ++ ++ if (spdk_bdev_is_zoned(bdev)) { ++ *model = ZBD_HOST_MANAGED; ++ } else { ++ *model = ZBD_NONE; ++ } ++ ++ ctx->ret = 0; ++out: ++ spdk_fio_wake_oat_waiter(ctx); ++} ++ ++static int ++spdk_fio_get_zoned_model(struct thread_data *td, struct fio_file *f, enum zbd_zoned_model *model) ++{ ++ struct spdk_fio_oat_ctx ctx = { 0 }; ++ ++ ctx.u.zma.f = f; ++ ctx.u.zma.model = model; ++ ++ spdk_fio_sync_run_oat(spdk_fio_get_zoned_model_oat, &ctx); ++ ++ return ctx.ret; ++} ++ ++ ++static void ++spdk_fio_bdev_get_zone_info_done(struct spdk_bdev_io *bdev_io, bool success, void *arg) ++{ ++ struct spdk_fio_zone_cb_arg *cb_arg = arg; ++ unsigned int i; ++ int handled_zones = 0; ++ ++ if (!success) { ++ spdk_bdev_free_io(bdev_io); ++ cb_arg->completed = -EIO; ++ return; ++ } ++ ++ for (i = 0; i < cb_arg->nr_zones; i++) { ++ struct spdk_bdev_zone_info *zone_src = &cb_arg->spdk_zones[handled_zones]; ++ struct zbd_zone *zone_dest = &cb_arg->fio_zones[handled_zones]; ++ uint32_t block_size = spdk_bdev_get_block_size(cb_arg->target->bdev); ++ ++ switch (zone_src->type) { ++ case SPDK_BDEV_ZONE_TYPE_SEQWR: ++ zone_dest->type = ZBD_ZONE_TYPE_SWR; ++ break; ++ case SPDK_BDEV_ZONE_TYPE_SEQWP: ++ zone_dest->type = ZBD_ZONE_TYPE_SWP; ++ break; ++ case SPDK_BDEV_ZONE_TYPE_CNV: ++ zone_dest->type = ZBD_ZONE_TYPE_CNV; ++ break; ++ default: ++ spdk_bdev_free_io(bdev_io); ++ cb_arg->completed = -EIO; ++ return; ++ } ++ ++ zone_dest->len = spdk_bdev_get_zone_size(cb_arg->target->bdev) * block_size; ++ zone_dest->capacity = zone_src->capacity * block_size; ++ zone_dest->start = zone_src->zone_id * block_size; ++ zone_dest->wp = zone_src->write_pointer * block_size; ++ ++ switch (zone_src->state) { ++ case SPDK_BDEV_ZONE_STATE_EMPTY: ++ zone_dest->cond = ZBD_ZONE_COND_EMPTY; ++ break; ++ case SPDK_BDEV_ZONE_STATE_IMP_OPEN: ++ zone_dest->cond = ZBD_ZONE_COND_IMP_OPEN; ++ break; ++ case SPDK_BDEV_ZONE_STATE_EXP_OPEN: ++ zone_dest->cond = ZBD_ZONE_COND_EXP_OPEN; ++ break; ++ case SPDK_BDEV_ZONE_STATE_FULL: ++ zone_dest->cond = ZBD_ZONE_COND_FULL; ++ break; ++ case SPDK_BDEV_ZONE_STATE_CLOSED: ++ zone_dest->cond = ZBD_ZONE_COND_CLOSED; ++ break; ++ case SPDK_BDEV_ZONE_STATE_READ_ONLY: ++ zone_dest->cond = ZBD_ZONE_COND_READONLY; ++ break; ++ case SPDK_BDEV_ZONE_STATE_OFFLINE: ++ zone_dest->cond = ZBD_ZONE_COND_OFFLINE; ++ break; ++ case SPDK_BDEV_ZONE_STATE_NOT_WP: ++ zone_dest->cond = ZBD_ZONE_COND_NOT_WP; ++ /* Set WP to end of zone for zone types w/o WP (e.g. Conv. 
zones in SMR) */ ++ zone_dest->wp = zone_dest->start + zone_dest->capacity; ++ break; ++ default: ++ spdk_bdev_free_io(bdev_io); ++ cb_arg->completed = -EIO; ++ return; ++ } ++ handled_zones++; ++ } ++ ++ spdk_bdev_free_io(bdev_io); ++ cb_arg->completed = handled_zones; ++} ++ ++static void ++spdk_fio_bdev_get_zone_info(void *arg) ++{ ++ struct spdk_fio_zone_cb_arg *cb_arg = arg; ++ struct spdk_fio_target *target = cb_arg->target; ++ int rc; ++ ++ rc = spdk_bdev_get_zone_info(target->desc, target->ch, cb_arg->offset_blocks, ++ cb_arg->nr_zones, cb_arg->spdk_zones, ++ spdk_fio_bdev_get_zone_info_done, cb_arg); ++ if (rc < 0) { ++ cb_arg->completed = rc; ++ } ++} ++ ++static int ++spdk_fio_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset, ++ struct zbd_zone *zones, unsigned int nr_zones) ++{ ++ struct spdk_fio_target *target; ++ struct spdk_fio_thread *fio_thread; ++ struct spdk_fio_zone_cb_arg cb_arg; ++ uint32_t block_size; ++ int rc; ++ ++ if (nr_zones == 0) { ++ return 0; ++ } ++ ++ /* spdk_fio_report_zones() is only called before the bdev I/O channels have been created. ++ * Since we need an I/O channel for report_zones(), call spdk_fio_init() to initialize ++ * the thread early. ++ * spdk_fio_report_zones() might be called several times by fio, if e.g. the zone report ++ * for all zones does not fit in the buffer that fio has allocated for the zone report. ++ * It is safe to call spdk_fio_init(), even if the thread has already been initialized. ++ */ ++ rc = spdk_fio_init(td); ++ if (rc) { ++ return rc; ++ } ++ fio_thread = td->io_ops_data; ++ target = f->engine_data; ++ ++ assert(fio_thread); ++ assert(target); ++ ++ block_size = spdk_bdev_get_block_size(target->bdev); ++ ++ cb_arg.target = target; ++ cb_arg.completed = 0; ++ cb_arg.offset_blocks = offset / block_size; ++ cb_arg.fio_zones = zones; ++ cb_arg.nr_zones = spdk_min(nr_zones, spdk_bdev_get_num_zones(target->bdev)); ++ ++ cb_arg.spdk_zones = calloc(1, sizeof(*cb_arg.spdk_zones) * cb_arg.nr_zones); ++ if (!cb_arg.spdk_zones) { ++ SPDK_ERRLOG("Could not allocate memory for zone report!\n"); ++ rc = -ENOMEM; ++ goto cleanup_thread; ++ } ++ ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_get_zone_info, &cb_arg); ++ do { ++ spdk_fio_poll_thread(fio_thread); ++ } while (!cb_arg.completed); ++ ++ /* Free cb_arg.spdk_zones. The report in fio format is stored in cb_arg.fio_zones/zones. */ ++ free(cb_arg.spdk_zones); ++ ++ rc = cb_arg.completed; ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to get zone info: %d\n", rc); ++ goto cleanup_thread; ++ } ++ ++ /* Return the amount of zones successfully copied. 
*/ ++ return rc; ++ ++cleanup_thread: ++ spdk_fio_cleanup(td); ++ ++ return rc; ++} ++ ++static void ++spdk_fio_bdev_zone_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *arg) ++{ ++ struct spdk_fio_zone_cb_arg *cb_arg = arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ cb_arg->completed = -EIO; ++ } else { ++ cb_arg->completed = 1; ++ } ++} ++ ++static void ++spdk_fio_bdev_zone_reset(void *arg) ++{ ++ struct spdk_fio_zone_cb_arg *cb_arg = arg; ++ struct spdk_fio_target *target = cb_arg->target; ++ int rc; ++ ++ rc = spdk_bdev_zone_management(target->desc, target->ch, cb_arg->offset_blocks, ++ SPDK_BDEV_ZONE_RESET, ++ spdk_fio_bdev_zone_reset_done, cb_arg); ++ if (rc < 0) { ++ cb_arg->completed = rc; ++ } ++} ++ ++static int ++spdk_fio_reset_zones(struct spdk_fio_thread *fio_thread, struct spdk_fio_target *target, ++ uint64_t offset, uint64_t length) ++{ ++ uint64_t zone_size_bytes; ++ uint32_t block_size; ++ int rc; ++ ++ assert(fio_thread); ++ assert(target); ++ ++ block_size = spdk_bdev_get_block_size(target->bdev); ++ zone_size_bytes = spdk_bdev_get_zone_size(target->bdev) * block_size; ++ ++ for (uint64_t cur = offset; cur < offset + length; cur += zone_size_bytes) { ++ struct spdk_fio_zone_cb_arg cb_arg = { ++ .target = target, ++ .completed = 0, ++ .offset_blocks = cur / block_size, ++ }; ++ ++ spdk_thread_send_msg(fio_thread->thread, spdk_fio_bdev_zone_reset, &cb_arg); ++ do { ++ spdk_fio_poll_thread(fio_thread); ++ } while (!cb_arg.completed); ++ ++ rc = cb_arg.completed; ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to reset zone: %d\n", rc); ++ return rc; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++spdk_fio_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset, uint64_t length) ++{ ++ return spdk_fio_reset_zones(td->io_ops_data, f->engine_data, offset, length); ++} ++#endif ++ ++#if FIO_IOOPS_VERSION >= 30 ++static void ++spdk_fio_get_max_open_zones_oat(void *_ctx) ++{ ++ struct spdk_fio_oat_ctx *ctx = _ctx; ++ struct fio_file *f = ctx->u.moza.f; ++ struct spdk_bdev *bdev; ++ ++ bdev = spdk_bdev_get_by_name(f->file_name); ++ if (!bdev) { ++ SPDK_ERRLOG("Cannot get max open zones, no bdev with name: %s\n", f->file_name); ++ ctx->ret = -ENODEV; ++ } else { ++ *ctx->u.moza.max_open_zones = spdk_bdev_get_max_open_zones(bdev); ++ ctx->ret = 0; ++ } ++ ++ spdk_fio_wake_oat_waiter(ctx); ++} ++ ++static int ++spdk_fio_get_max_open_zones(struct thread_data *td, struct fio_file *f, ++ unsigned int *max_open_zones) ++{ ++ struct spdk_fio_oat_ctx ctx = { 0 }; ++ ++ ctx.u.moza.f = f; ++ ctx.u.moza.max_open_zones = max_open_zones; ++ ++ spdk_fio_sync_run_oat(spdk_fio_get_max_open_zones_oat, &ctx); ++ ++ return ctx.ret; ++} ++#endif ++ ++static int ++spdk_fio_handle_options(struct thread_data *td, struct fio_file *f, struct spdk_bdev *bdev) ++{ ++ struct spdk_fio_options *fio_options = td->eo; ++ ++ if (fio_options->initial_zone_reset && spdk_bdev_is_zoned(bdev)) { ++#if FIO_HAS_ZBD ++ int rc = spdk_fio_init(td); ++ if (rc) { ++ return rc; ++ } ++ /* offset used to indicate conventional zones that need to be skipped (reset not allowed) */ ++ rc = spdk_fio_reset_zones(td->io_ops_data, f->engine_data, td->o.start_offset, ++ f->real_file_size - td->o.start_offset); ++ if (rc) { ++ spdk_fio_cleanup(td); ++ return rc; ++ } ++#else ++ SPDK_ERRLOG("fio version is too old to support zoned block devices\n"); ++#endif ++ } ++ ++ return 0; ++} ++ ++static int ++spdk_fio_handle_options_per_target(struct thread_data *td, struct fio_file *f) ++{ ++ struct 
spdk_fio_target *target = f->engine_data; ++ struct spdk_fio_options *fio_options = td->eo; ++ ++ if (fio_options->zone_append && spdk_bdev_is_zoned(target->bdev)) { ++ if (spdk_bdev_io_type_supported(target->bdev, SPDK_BDEV_IO_TYPE_ZONE_APPEND)) { ++ SPDK_DEBUGLOG(fio_bdev, "Using zone appends instead of writes on: '%s'\n", ++ f->file_name); ++ target->zone_append_enabled = true; ++ } else { ++ SPDK_WARNLOG("Falling back to writes on: '%s' - bdev lacks zone append cmd\n", ++ f->file_name); ++ } ++ } ++ ++ return 0; ++} ++ ++static struct fio_option options[] = { ++ { ++ .name = "spdk_conf", ++ .lname = "SPDK configuration file", ++ .type = FIO_OPT_STR_STORE, ++ .off1 = offsetof(struct spdk_fio_options, conf), ++ .help = "A SPDK JSON configuration file", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "spdk_json_conf", ++ .lname = "SPDK JSON configuration file", ++ .type = FIO_OPT_STR_STORE, ++ .off1 = offsetof(struct spdk_fio_options, json_conf), ++ .help = "A SPDK JSON configuration file", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "spdk_mem", ++ .lname = "SPDK memory in MB", ++ .type = FIO_OPT_INT, ++ .off1 = offsetof(struct spdk_fio_options, mem_mb), ++ .help = "Amount of memory in MB to allocate for SPDK", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "spdk_single_seg", ++ .lname = "SPDK switch to create just a single hugetlbfs file", ++ .type = FIO_OPT_BOOL, ++ .off1 = offsetof(struct spdk_fio_options, mem_single_seg), ++ .help = "If set to 1, SPDK will use just a single hugetlbfs file", ++ .def = "0", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "log_flags", ++ .lname = "log flags", ++ .type = FIO_OPT_STR_STORE, ++ .off1 = offsetof(struct spdk_fio_options, log_flags), ++ .help = "SPDK log flags to enable", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "initial_zone_reset", ++ .lname = "Reset Zones on initialization", ++ .type = FIO_OPT_INT, ++ .off1 = offsetof(struct spdk_fio_options, initial_zone_reset), ++ .def = "0", ++ .help = "Reset Zones on initialization (0=disable, 1=Reset All Zones)", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "zone_append", ++ .lname = "Use zone append instead of write", ++ .type = FIO_OPT_INT, ++ .off1 = offsetof(struct spdk_fio_options, zone_append), ++ .def = "0", ++ .help = "Use zone append instead of write (1=zone append, 0=write)", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "env_context", ++ .lname = "Environment context options", ++ .type = FIO_OPT_STR_STORE, ++ .off1 = offsetof(struct spdk_fio_options, env_context), ++ .help = "Opaque context for use of the env implementation", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = "spdk_rpc_listen_addr", ++ .lname = "SPDK RPC listen address", ++ .type = FIO_OPT_STR_STORE, ++ .off1 = offsetof(struct spdk_fio_options, rpc_listen_addr), ++ .help = "The address to listen the RPC operations", ++ .category = FIO_OPT_C_ENGINE, ++ .group = FIO_OPT_G_INVALID, ++ }, ++ { ++ .name = NULL, ++ }, ++}; ++ ++/* FIO imports this structure using dlsym */ ++struct ioengine_ops ioengine = { ++ .name = "spdk_bdev", ++ .version = FIO_IOOPS_VERSION, ++ .flags = FIO_RAWIO | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_MEMALIGN, ++ .setup = spdk_fio_setup, ++ .init = spdk_fio_init, ++ /* .prep = unused, */ ++ .queue 
= spdk_fio_queue, ++ /* .commit = unused, */ ++ .getevents = spdk_fio_getevents, ++ .event = spdk_fio_event, ++ /* .errdetails = unused, */ ++ /* .cancel = unused, */ ++ .cleanup = spdk_fio_cleanup, ++ .open_file = spdk_fio_open, ++ .close_file = spdk_fio_close, ++ .invalidate = spdk_fio_invalidate, ++ /* .unlink_file = unused, */ ++ /* .get_file_size = unused, */ ++ /* .terminate = unused, */ ++ .iomem_alloc = spdk_fio_iomem_alloc, ++ .iomem_free = spdk_fio_iomem_free, ++ .io_u_init = spdk_fio_io_u_init, ++ .io_u_free = spdk_fio_io_u_free, ++#if FIO_HAS_ZBD ++ .get_zoned_model = spdk_fio_get_zoned_model, ++ .report_zones = spdk_fio_report_zones, ++ .reset_wp = spdk_fio_reset_wp, ++#endif ++#if FIO_IOOPS_VERSION >= 30 ++ .get_max_open_zones = spdk_fio_get_max_open_zones, ++#endif ++ .option_struct_size = sizeof(struct spdk_fio_options), ++ .options = options, ++}; ++ ++static void fio_init ++spdk_fio_register(void) ++{ ++ register_ioengine(&ioengine); ++} ++ ++static void ++spdk_fio_finish_env(void) ++{ ++ pthread_mutex_lock(&g_init_mtx); ++ g_poll_loop = false; ++ pthread_cond_signal(&g_init_cond); ++ pthread_mutex_unlock(&g_init_mtx); ++ pthread_join(g_init_thread_id, NULL); ++ ++ spdk_thread_lib_fini(); ++ spdk_env_fini(); ++} ++ ++static void fio_exit ++spdk_fio_unregister(void) ++{ ++ if (g_spdk_env_initialized) { ++ spdk_fio_finish_env(); ++ g_spdk_env_initialized = false; ++ } ++ unregister_ioengine(&ioengine); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(fio_bdev) +diff --git a/examples/nvmf/nvmf/nvmf.c b/examples/nvmf/nvmf/nvmf.c +index 035170a..d684806 100644 +--- a/examples/nvmf/nvmf/nvmf.c ++++ b/examples/nvmf/nvmf/nvmf.c +@@ -1,881 +1,881 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/env.h" +-#include "spdk/event.h" +-#include "spdk/init.h" +-#include "spdk/string.h" +-#include "spdk/thread.h" +-#include "spdk/bdev.h" +-#include "spdk/rpc.h" +-#include "spdk/nvmf.h" +-#include "spdk/likely.h" +- +-#include "spdk_internal/event.h" +- +-#define NVMF_DEFAULT_SUBSYSTEMS 32 +- +-static const char *g_rpc_addr = SPDK_DEFAULT_RPC_ADDR; +- +-enum nvmf_target_state { +- NVMF_INIT_SUBSYSTEM = 0, +- NVMF_INIT_TARGET, +- NVMF_INIT_POLL_GROUPS, +- NVMF_INIT_START_SUBSYSTEMS, +- NVMF_RUNNING, +- NVMF_FINI_STOP_SUBSYSTEMS, +- NVMF_FINI_POLL_GROUPS, +- NVMF_FINI_TARGET, +- NVMF_FINI_SUBSYSTEM, +-}; +- +-struct nvmf_lw_thread { +- TAILQ_ENTRY(nvmf_lw_thread) link; +- bool resched; +-}; +- +-struct nvmf_reactor { +- uint32_t core; +- +- struct spdk_ring *threads; +- TAILQ_ENTRY(nvmf_reactor) link; +-}; +- +-struct nvmf_target_poll_group { +- struct spdk_nvmf_poll_group *group; +- struct spdk_thread *thread; +- +- TAILQ_ENTRY(nvmf_target_poll_group) link; +-}; +- +-struct nvmf_target { +- struct spdk_nvmf_tgt *tgt; +- +- int max_subsystems; +-}; +- +-TAILQ_HEAD(, nvmf_reactor) g_reactors = TAILQ_HEAD_INITIALIZER(g_reactors); +-TAILQ_HEAD(, nvmf_target_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups); +-static uint32_t g_num_poll_groups = 0; +- +-static struct nvmf_reactor *g_main_reactor = NULL; +-static struct nvmf_reactor *g_next_reactor = NULL; +-static struct spdk_thread *g_init_thread = NULL; +-static struct spdk_thread *g_fini_thread = NULL; +-static struct nvmf_target g_nvmf_tgt = { +- .max_subsystems = NVMF_DEFAULT_SUBSYSTEMS, +-}; +- +-static struct nvmf_target_poll_group *g_next_pg = NULL; +-static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER; +-static bool g_reactors_exit = false; +-static enum nvmf_target_state g_target_state; +-static bool g_intr_received = false; +- +-static uint32_t g_migrate_pg_period_us = 0; +-static struct spdk_poller *g_migrate_pg_poller = NULL; +- +-static void nvmf_target_advance_state(void); +-static int nvmf_schedule_spdk_thread(struct spdk_thread *thread); +- +-static void +-usage(char *program_name) +-{ +- printf("%s options", program_name); +- printf("\n"); +- printf("\t[-g period of round robin poll group migration (us) (default: 0 (disabled))]\n"); +- printf("\t[-h show this usage]\n"); +- printf("\t[-i shared memory ID (optional)]\n"); +- printf("\t[-m core mask for DPDK]\n"); +- printf("\t[-n max subsystems for target(default: 32)]\n"); +- printf("\t[-r RPC listen address (default /var/tmp/spdk.sock)]\n"); +- printf("\t[-s memory size in MB for DPDK (default: 0MB)]\n"); +- printf("\t[-u disable PCI access]\n"); +-} +- +-static int +-parse_args(int argc, char **argv, struct spdk_env_opts *opts) +-{ +- int op; +- long int value; +- +- while ((op = getopt(argc, argv, "g:i:m:n:p:r:s:u:h")) != -1) { +- switch (op) { +- case 'g': +- value = spdk_strtol(optarg, 10); +- if (value < 0) { +- fprintf(stderr, "converting a string to integer failed\n"); +- return -EINVAL; +- } +- g_migrate_pg_period_us = value; +- break; +- case 'i': +- value = spdk_strtol(optarg, 10); +- if (value < 0) { +- fprintf(stderr, "converting a string to integer failed\n"); +- return -EINVAL; +- } +- opts->shm_id = value; +- break; +- case 'm': +- opts->core_mask = optarg; +- break; +- case 'n': +- g_nvmf_tgt.max_subsystems = spdk_strtol(optarg, 10); +- if (g_nvmf_tgt.max_subsystems < 0) { +- fprintf(stderr, "converting a string to integer failed\n"); +- return -EINVAL; +- } +- break; +- case 'r': +- 
g_rpc_addr = optarg; +- break; +- case 's': +- value = spdk_strtol(optarg, 10); +- if (value < 0) { +- fprintf(stderr, "converting a string to integer failed\n"); +- return -EINVAL; +- } +- opts->mem_size = value; +- break; +- case 'u': +- opts->no_pci = true; +- break; +- case 'h': +- usage(argv[0]); +- exit(EXIT_SUCCESS); +- default: +- usage(argv[0]); +- return 1; +- } +- } +- +- return 0; +-} +- +-static int +-nvmf_reactor_run(void *arg) +-{ +- struct nvmf_reactor *nvmf_reactor = arg; +- struct nvmf_lw_thread *lw_thread; +- struct spdk_thread *thread; +- +- /* run all the lightweight threads in this nvmf_reactor by FIFO. */ +- do { +- if (spdk_ring_dequeue(nvmf_reactor->threads, (void **)&lw_thread, 1)) { +- thread = spdk_thread_get_from_ctx(lw_thread); +- +- spdk_thread_poll(thread, 0, 0); +- +- if (spdk_unlikely(spdk_thread_is_exited(thread) && +- spdk_thread_is_idle(thread))) { +- spdk_thread_destroy(thread); +- } else if (spdk_unlikely(lw_thread->resched)) { +- lw_thread->resched = false; +- nvmf_schedule_spdk_thread(thread); +- } else { +- spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); +- } +- } +- } while (!g_reactors_exit); +- +- /* free all the lightweight threads */ +- while (spdk_ring_dequeue(nvmf_reactor->threads, (void **)&lw_thread, 1)) { +- thread = spdk_thread_get_from_ctx(lw_thread); +- spdk_set_thread(thread); +- +- if (spdk_thread_is_exited(thread)) { +- spdk_thread_destroy(thread); +- } else { +- /* This thread is not exited yet, and may need to communicate with other threads +- * to be exited. So mark it as exiting, and check again after traversing other threads. +- */ +- spdk_thread_exit(thread); +- spdk_thread_poll(thread, 0, 0); +- spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); +- } +- } +- +- return 0; +-} +- +-static int +-nvmf_schedule_spdk_thread(struct spdk_thread *thread) +-{ +- struct nvmf_reactor *nvmf_reactor; +- struct nvmf_lw_thread *lw_thread; +- struct spdk_cpuset *cpumask; +- uint32_t i; +- +- /* Lightweight threads may have a requested cpumask. +- * This is a request only - the scheduler does not have to honor it. +- * For this scheduler implementation, each reactor is pinned to +- * a particular core so honoring the request is reasonably easy. 
+- */ +- cpumask = spdk_thread_get_cpumask(thread); +- +- lw_thread = spdk_thread_get_ctx(thread); +- assert(lw_thread != NULL); +- memset(lw_thread, 0, sizeof(*lw_thread)); +- +- /* assign lightweight threads to nvmf reactor(core) +- * Here we use the mutex.The way the actual SPDK event framework +- * solves this is by using internal rings for messages between reactors +- */ +- pthread_mutex_lock(&g_mutex); +- for (i = 0; i < spdk_env_get_core_count(); i++) { +- if (g_next_reactor == NULL) { +- g_next_reactor = TAILQ_FIRST(&g_reactors); +- } +- nvmf_reactor = g_next_reactor; +- g_next_reactor = TAILQ_NEXT(g_next_reactor, link); +- +- /* each spdk_thread has the core affinity */ +- if (spdk_cpuset_get_cpu(cpumask, nvmf_reactor->core)) { +- spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); +- break; +- } +- } +- pthread_mutex_unlock(&g_mutex); +- +- if (i == spdk_env_get_core_count()) { +- fprintf(stderr, "failed to schedule spdk thread\n"); +- return -1; +- } +- return 0; +-} +- +-static void +-nvmf_request_spdk_thread_reschedule(struct spdk_thread *thread) +-{ +- struct nvmf_lw_thread *lw_thread; +- +- assert(thread == spdk_get_thread()); +- +- lw_thread = spdk_thread_get_ctx(thread); +- +- assert(lw_thread != NULL); +- +- lw_thread->resched = true; +-} +- +-static int +-nvmf_reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op) +-{ +- switch (op) { +- case SPDK_THREAD_OP_NEW: +- return nvmf_schedule_spdk_thread(thread); +- case SPDK_THREAD_OP_RESCHED: +- nvmf_request_spdk_thread_reschedule(thread); +- return 0; +- default: +- return -ENOTSUP; +- } +-} +- +-static bool +-nvmf_reactor_thread_op_supported(enum spdk_thread_op op) +-{ +- switch (op) { +- case SPDK_THREAD_OP_NEW: +- case SPDK_THREAD_OP_RESCHED: +- return true; +- default: +- return false; +- } +-} +- +-static int +-nvmf_init_threads(void) +-{ +- int rc; +- uint32_t i; +- char thread_name[32]; +- struct nvmf_reactor *nvmf_reactor; +- struct spdk_cpuset cpumask; +- uint32_t main_core = spdk_env_get_current_core(); +- +- /* Whenever SPDK creates a new lightweight thread it will call +- * nvmf_schedule_spdk_thread asking for the application to begin +- * polling it via spdk_thread_poll(). Each lightweight thread in +- * SPDK optionally allocates extra memory to be used by the application +- * framework. The size of the extra memory allocated is the second parameter. +- */ +- spdk_thread_lib_init_ext(nvmf_reactor_thread_op, nvmf_reactor_thread_op_supported, +- sizeof(struct nvmf_lw_thread), SPDK_DEFAULT_MSG_MEMPOOL_SIZE); +- +- /* Spawn one system thread per CPU core. The system thread is called a reactor. +- * SPDK will spawn lightweight threads that must be mapped to reactors in +- * nvmf_schedule_spdk_thread. Using a single system thread per CPU core is a +- * choice unique to this application. SPDK itself does not require this specific +- * threading model. For example, another viable threading model would be +- * dynamically scheduling the lightweight threads onto a thread pool using a +- * work queue. 
+- */ +- SPDK_ENV_FOREACH_CORE(i) { +- nvmf_reactor = calloc(1, sizeof(struct nvmf_reactor)); +- if (!nvmf_reactor) { +- fprintf(stderr, "failed to alloc nvmf reactor\n"); +- rc = -ENOMEM; +- goto err_exit; +- } +- +- nvmf_reactor->core = i; +- +- nvmf_reactor->threads = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 1024, SPDK_ENV_SOCKET_ID_ANY); +- if (!nvmf_reactor->threads) { +- fprintf(stderr, "failed to alloc ring\n"); +- free(nvmf_reactor); +- rc = -ENOMEM; +- goto err_exit; +- } +- +- TAILQ_INSERT_TAIL(&g_reactors, nvmf_reactor, link); +- +- if (i == main_core) { +- g_main_reactor = nvmf_reactor; +- g_next_reactor = g_main_reactor; +- } else { +- rc = spdk_env_thread_launch_pinned(i, +- nvmf_reactor_run, +- nvmf_reactor); +- if (rc) { +- fprintf(stderr, "failed to pin reactor launch\n"); +- goto err_exit; +- } +- } +- } +- +- /* Spawn a lightweight thread only on the current core to manage this application. */ +- spdk_cpuset_zero(&cpumask); +- spdk_cpuset_set_cpu(&cpumask, main_core, true); +- snprintf(thread_name, sizeof(thread_name), "nvmf_main_thread"); +- g_init_thread = spdk_thread_create(thread_name, &cpumask); +- if (!g_init_thread) { +- fprintf(stderr, "failed to create spdk thread\n"); +- return -1; +- } +- +- fprintf(stdout, "nvmf threads initialize successfully\n"); +- return 0; +- +-err_exit: +- return rc; +-} +- +-static void +-nvmf_destroy_threads(void) +-{ +- struct nvmf_reactor *nvmf_reactor, *tmp; +- +- TAILQ_FOREACH_SAFE(nvmf_reactor, &g_reactors, link, tmp) { +- spdk_ring_free(nvmf_reactor->threads); +- free(nvmf_reactor); +- } +- +- pthread_mutex_destroy(&g_mutex); +- spdk_thread_lib_fini(); +- fprintf(stdout, "nvmf threads destroy successfully\n"); +-} +- +-static void +-nvmf_tgt_destroy_done(void *ctx, int status) +-{ +- fprintf(stdout, "destroyed the nvmf target service\n"); +- +- g_target_state = NVMF_FINI_SUBSYSTEM; +- nvmf_target_advance_state(); +-} +- +-static void +-nvmf_destroy_nvmf_tgt(void) +-{ +- if (g_nvmf_tgt.tgt) { +- spdk_nvmf_tgt_destroy(g_nvmf_tgt.tgt, nvmf_tgt_destroy_done, NULL); +- } else { +- g_target_state = NVMF_FINI_SUBSYSTEM; +- } +-} +- +-static void +-nvmf_create_nvmf_tgt(void) +-{ +- struct spdk_nvmf_subsystem *subsystem; +- struct spdk_nvmf_target_opts tgt_opts = {}; +- +- tgt_opts.max_subsystems = g_nvmf_tgt.max_subsystems; +- snprintf(tgt_opts.name, sizeof(tgt_opts.name), "%s", "nvmf_example"); +- /* Construct the default NVMe-oF target +- * An NVMe-oF target is a collection of subsystems, namespace, and poll +- * groups, and defines the scope of the NVMe-oF discovery service. +- */ +- g_nvmf_tgt.tgt = spdk_nvmf_tgt_create(&tgt_opts); +- if (g_nvmf_tgt.tgt == NULL) { +- fprintf(stderr, "spdk_nvmf_tgt_create() failed\n"); +- goto error; +- } +- +- /* Create and add discovery subsystem to the NVMe-oF target. +- * NVMe-oF defines a discovery mechanism that a host uses to determine +- * the NVM subsystems that expose namespaces that the host may access. +- * It provides a host with following capabilities: +- * 1,The ability to discover a list of NVM subsystems with namespaces +- * that are accessible to the host. +- * 2,The ability to discover multiple paths to an NVM subsystem. +- * 3,The ability to discover controllers that are statically configured. 
+- */ +- subsystem = spdk_nvmf_subsystem_create(g_nvmf_tgt.tgt, SPDK_NVMF_DISCOVERY_NQN, +- SPDK_NVMF_SUBTYPE_DISCOVERY, 0); +- if (subsystem == NULL) { +- fprintf(stderr, "failed to create discovery nvmf library subsystem\n"); +- goto error; +- } +- +- /* Allow any host to access the discovery subsystem */ +- spdk_nvmf_subsystem_set_allow_any_host(subsystem, true); +- +- fprintf(stdout, "created a nvmf target service\n"); +- +- g_target_state = NVMF_INIT_POLL_GROUPS; +- return; +- +-error: +- g_target_state = NVMF_FINI_TARGET; +-} +- +-static void +-nvmf_tgt_subsystem_stop_next(struct spdk_nvmf_subsystem *subsystem, +- void *cb_arg, int status) +-{ +- int rc; +- +- subsystem = spdk_nvmf_subsystem_get_next(subsystem); +- if (subsystem) { +- rc = spdk_nvmf_subsystem_stop(subsystem, +- nvmf_tgt_subsystem_stop_next, +- cb_arg); +- if (rc) { +- nvmf_tgt_subsystem_stop_next(subsystem, cb_arg, 0); +- fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); +- } +- return; +- } +- +- fprintf(stdout, "all subsystems of target stopped\n"); +- +- g_target_state = NVMF_FINI_POLL_GROUPS; +- nvmf_target_advance_state(); +-} +- +-static void +-nvmf_tgt_stop_subsystems(struct nvmf_target *nvmf_tgt) +-{ +- struct spdk_nvmf_subsystem *subsystem; +- int rc; +- +- subsystem = spdk_nvmf_subsystem_get_first(nvmf_tgt->tgt); +- if (spdk_likely(subsystem)) { +- rc = spdk_nvmf_subsystem_stop(subsystem, +- nvmf_tgt_subsystem_stop_next, +- NULL); +- if (rc) { +- nvmf_tgt_subsystem_stop_next(subsystem, NULL, 0); +- fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); +- } +- } else { +- g_target_state = NVMF_FINI_POLL_GROUPS; +- } +-} +- +-static void +-nvmf_tgt_subsystem_start_next(struct spdk_nvmf_subsystem *subsystem, +- void *cb_arg, int status) +-{ +- int rc; +- +- subsystem = spdk_nvmf_subsystem_get_next(subsystem); +- if (subsystem) { +- rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_start_next, +- cb_arg); +- if (rc) { +- g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; +- fprintf(stderr, "Unable to start NVMe-oF subsystem. shutting down app.\n"); +- nvmf_target_advance_state(); +- } +- return; +- } +- +- fprintf(stdout, "all subsystems of target started\n"); +- +- g_target_state = NVMF_RUNNING; +- nvmf_target_advance_state(); +-} +- +-static void +-nvmf_tgt_start_subsystems(struct nvmf_target *nvmf_tgt) +-{ +- struct spdk_nvmf_subsystem *subsystem; +- int rc; +- +- /* Subsystem is the NVM subsystem which is a combine of namespaces +- * except the discovery subsystem which is used for discovery service. +- * It also controls the hosts that means the subsystem determines whether +- * the host can access this subsystem. +- */ +- subsystem = spdk_nvmf_subsystem_get_first(nvmf_tgt->tgt); +- if (spdk_likely(subsystem)) { +- /* In SPDK there are three states in subsystem: Inactive, Active, Paused. +- * Start subsystem means make it from inactive to active that means +- * subsystem start to work or it can be accessed. +- */ +- rc = spdk_nvmf_subsystem_start(subsystem, +- nvmf_tgt_subsystem_start_next, +- NULL); +- if (rc) { +- fprintf(stderr, "Unable to start NVMe-oF subsystem. 
shutting down app.\n"); +- g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; +- } +- } else { +- g_target_state = NVMF_RUNNING; +- } +-} +- +-static void +-nvmf_tgt_create_poll_groups_done(void *ctx) +-{ +- struct nvmf_target_poll_group *pg = ctx; +- +- if (!g_next_pg) { +- g_next_pg = pg; +- } +- +- TAILQ_INSERT_TAIL(&g_poll_groups, pg, link); +- +- assert(g_num_poll_groups < spdk_env_get_core_count()); +- +- if (++g_num_poll_groups == spdk_env_get_core_count()) { +- fprintf(stdout, "create targets's poll groups done\n"); +- +- g_target_state = NVMF_INIT_START_SUBSYSTEMS; +- nvmf_target_advance_state(); +- } +-} +- +-static void +-nvmf_tgt_create_poll_group(void *ctx) +-{ +- struct nvmf_target_poll_group *pg; +- +- pg = calloc(1, sizeof(struct nvmf_target_poll_group)); +- if (!pg) { +- fprintf(stderr, "failed to allocate poll group\n"); +- assert(false); +- return; +- } +- +- pg->thread = spdk_get_thread(); +- pg->group = spdk_nvmf_poll_group_create(g_nvmf_tgt.tgt); +- if (!pg->group) { +- fprintf(stderr, "failed to create poll group of the target\n"); +- free(pg); +- assert(false); +- return; +- } +- +- spdk_thread_send_msg(g_init_thread, nvmf_tgt_create_poll_groups_done, pg); +-} +- +-/* Create a lightweight thread per poll group instead of assuming a pool of lightweight +- * threads already exist at start up time. A poll group is a collection of unrelated NVMe-oF +- * connections. Each poll group is only accessed from the associated lightweight thread. +- */ +-static void +-nvmf_poll_groups_create(void) +-{ +- struct spdk_cpuset tmp_cpumask = {}; +- uint32_t i; +- char thread_name[32]; +- struct spdk_thread *thread; +- +- assert(g_init_thread != NULL); +- +- SPDK_ENV_FOREACH_CORE(i) { +- spdk_cpuset_zero(&tmp_cpumask); +- spdk_cpuset_set_cpu(&tmp_cpumask, i, true); +- snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", i); +- +- thread = spdk_thread_create(thread_name, &tmp_cpumask); +- assert(thread != NULL); +- +- spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL); +- } +-} +- +-static void +-_nvmf_tgt_destroy_poll_groups_done(void *ctx) +-{ +- assert(g_num_poll_groups > 0); +- +- if (--g_num_poll_groups == 0) { +- fprintf(stdout, "destroy targets's poll groups done\n"); +- +- g_target_state = NVMF_FINI_TARGET; +- nvmf_target_advance_state(); +- } +-} +- +-static void +-nvmf_tgt_destroy_poll_groups_done(void *cb_arg, int status) +-{ +- struct nvmf_target_poll_group *pg = cb_arg; +- +- free(pg); +- +- spdk_thread_send_msg(g_fini_thread, _nvmf_tgt_destroy_poll_groups_done, NULL); +- +- spdk_thread_exit(spdk_get_thread()); +-} +- +-static void +-nvmf_tgt_destroy_poll_group(void *ctx) +-{ +- struct nvmf_target_poll_group *pg = ctx; +- +- spdk_nvmf_poll_group_destroy(pg->group, nvmf_tgt_destroy_poll_groups_done, pg); +-} +- +-static void +-nvmf_poll_groups_destroy(void) +-{ +- struct nvmf_target_poll_group *pg, *tmp; +- +- g_fini_thread = spdk_get_thread(); +- assert(g_fini_thread != NULL); +- +- TAILQ_FOREACH_SAFE(pg, &g_poll_groups, link, tmp) { +- TAILQ_REMOVE(&g_poll_groups, pg, link); +- spdk_thread_send_msg(pg->thread, nvmf_tgt_destroy_poll_group, pg); +- } +-} +- +-static void +-nvmf_subsystem_fini_done(void *cb_arg) +-{ +- fprintf(stdout, "bdev subsystem finish successfully\n"); +- spdk_rpc_finish(); +- g_reactors_exit = true; +-} +- +-static void +-nvmf_subsystem_init_done(int rc, void *cb_arg) +-{ +- fprintf(stdout, "bdev subsystem init successfully\n"); +- +- rc = spdk_rpc_initialize(g_rpc_addr); +- if (rc) { +- spdk_app_stop(rc); +- return; +- } +- +- 
spdk_rpc_set_state(SPDK_RPC_RUNTIME); +- +- g_target_state = NVMF_INIT_TARGET; +- nvmf_target_advance_state(); +-} +- +-static void +-migrate_poll_group_by_rr(void *ctx) +-{ +- uint32_t current_core, next_core; +- struct spdk_cpuset cpumask = {}; +- +- current_core = spdk_env_get_current_core(); +- next_core = spdk_env_get_next_core(current_core); +- if (next_core == UINT32_MAX) { +- next_core = spdk_env_get_first_core(); +- } +- +- spdk_cpuset_set_cpu(&cpumask, next_core, true); +- +- spdk_thread_set_cpumask(&cpumask); +-} +- +-static int +-migrate_poll_groups_by_rr(void *ctx) +-{ +- struct nvmf_target_poll_group *pg; +- +- TAILQ_FOREACH(pg, &g_poll_groups, link) { +- spdk_thread_send_msg(pg->thread, migrate_poll_group_by_rr, NULL); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-nvmf_target_advance_state(void) +-{ +- enum nvmf_target_state prev_state; +- +- do { +- prev_state = g_target_state; +- +- switch (g_target_state) { +- case NVMF_INIT_SUBSYSTEM: +- /* initialize the bdev layer */ +- spdk_subsystem_init(nvmf_subsystem_init_done, NULL); +- return; +- case NVMF_INIT_TARGET: +- nvmf_create_nvmf_tgt(); +- break; +- case NVMF_INIT_POLL_GROUPS: +- nvmf_poll_groups_create(); +- break; +- case NVMF_INIT_START_SUBSYSTEMS: +- nvmf_tgt_start_subsystems(&g_nvmf_tgt); +- break; +- case NVMF_RUNNING: +- fprintf(stdout, "nvmf target is running\n"); +- if (g_migrate_pg_period_us != 0) { +- g_migrate_pg_poller = SPDK_POLLER_REGISTER(migrate_poll_groups_by_rr, NULL, +- g_migrate_pg_period_us); +- } +- break; +- case NVMF_FINI_STOP_SUBSYSTEMS: +- spdk_poller_unregister(&g_migrate_pg_poller); +- nvmf_tgt_stop_subsystems(&g_nvmf_tgt); +- break; +- case NVMF_FINI_POLL_GROUPS: +- nvmf_poll_groups_destroy(); +- break; +- case NVMF_FINI_TARGET: +- nvmf_destroy_nvmf_tgt(); +- break; +- case NVMF_FINI_SUBSYSTEM: +- spdk_subsystem_fini(nvmf_subsystem_fini_done, NULL); +- break; +- } +- } while (g_target_state != prev_state); +-} +- +-static void +-nvmf_target_app_start(void *arg) +-{ +- g_target_state = NVMF_INIT_SUBSYSTEM; +- nvmf_target_advance_state(); +-} +- +-static void +-_nvmf_shutdown_cb(void *ctx) +-{ +- /* Still in initialization state, defer shutdown operation */ +- if (g_target_state < NVMF_RUNNING) { +- spdk_thread_send_msg(spdk_get_thread(), _nvmf_shutdown_cb, NULL); +- return; +- } else if (g_target_state > NVMF_RUNNING) { +- /* Already in Shutdown status, ignore the signal */ +- return; +- } +- +- g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; +- nvmf_target_advance_state(); +-} +- +-static void +-nvmf_shutdown_cb(int signo) +-{ +- if (!g_intr_received) { +- g_intr_received = true; +- spdk_thread_send_msg(g_init_thread, _nvmf_shutdown_cb, NULL); +- } +-} +- +-static int +-nvmf_setup_signal_handlers(void) +-{ +- struct sigaction sigact; +- sigset_t sigmask; +- int signals[] = {SIGINT, SIGTERM}; +- int num_signals = sizeof(signals) / sizeof(int); +- int rc, i; +- +- rc = sigemptyset(&sigmask); +- if (rc) { +- fprintf(stderr, "errno:%d--failed to empty signal set\n", errno); +- return rc; +- } +- memset(&sigact, 0, sizeof(sigact)); +- rc = sigemptyset(&sigact.sa_mask); +- if (rc) { +- fprintf(stderr, "errno:%d--failed to empty signal set\n", errno); +- return rc; +- } +- +- /* Install the same handler for SIGINT and SIGTERM */ +- sigact.sa_handler = nvmf_shutdown_cb; +- +- for (i = 0; i < num_signals; i++) { +- rc = sigaction(signals[i], &sigact, NULL); +- if (rc < 0) { +- fprintf(stderr, "errno:%d--sigaction() failed\n", errno); +- return rc; +- } +- rc = sigaddset(&sigmask, 
signals[i]); +- if (rc) { +- fprintf(stderr, "errno:%d--failed to add set\n", errno); +- return rc; +- } +- } +- +- pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); +- +- return 0; +-} +- +-int +-main(int argc, char **argv) +-{ +- int rc; +- struct spdk_env_opts opts; +- +- spdk_env_opts_init(&opts); +- opts.name = "nvmf-example"; +- +- rc = parse_args(argc, argv, &opts); +- if (rc != 0) { +- return rc; +- } +- +- if (spdk_env_init(&opts) < 0) { +- fprintf(stderr, "unable to initialize SPDK env\n"); +- return -EINVAL; +- } +- +- /* Initialize the threads */ +- rc = nvmf_init_threads(); +- assert(rc == 0); +- +- /* Send a message to the thread assigned to the main reactor +- * that continues initialization. This is how we bootstrap the +- * program so that all code from here on is running on an SPDK thread. +- */ +- assert(g_init_thread != NULL); +- +- rc = nvmf_setup_signal_handlers(); +- assert(rc == 0); +- +- spdk_thread_send_msg(g_init_thread, nvmf_target_app_start, NULL); +- +- nvmf_reactor_run(g_main_reactor); +- +- spdk_env_thread_wait_all(); +- nvmf_destroy_threads(); +- +- spdk_env_fini(); +- +- return rc; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/env.h" ++#include "spdk/event.h" ++#include "spdk/init.h" ++#include "spdk/string.h" ++#include "spdk/thread.h" ++#include "spdk/bdev.h" ++#include "spdk/rpc.h" ++#include "spdk/nvmf.h" ++#include "spdk/likely.h" ++ ++#include "spdk_internal/event.h" ++ ++#define NVMF_DEFAULT_SUBSYSTEMS 32 ++ ++static const char *g_rpc_addr = SPDK_DEFAULT_RPC_ADDR; ++ ++enum nvmf_target_state { ++ NVMF_INIT_SUBSYSTEM = 0, ++ NVMF_INIT_TARGET, ++ NVMF_INIT_POLL_GROUPS, ++ NVMF_INIT_START_SUBSYSTEMS, ++ NVMF_RUNNING, ++ NVMF_FINI_STOP_SUBSYSTEMS, ++ NVMF_FINI_POLL_GROUPS, ++ NVMF_FINI_TARGET, ++ NVMF_FINI_SUBSYSTEM, ++}; ++ ++struct nvmf_lw_thread { ++ TAILQ_ENTRY(nvmf_lw_thread) link; ++ bool resched; ++}; ++ ++struct nvmf_reactor { ++ uint32_t core; ++ ++ struct spdk_ring *threads; ++ TAILQ_ENTRY(nvmf_reactor) link; ++}; ++ ++struct nvmf_target_poll_group { ++ struct spdk_nvmf_poll_group *group; ++ struct spdk_thread *thread; ++ ++ TAILQ_ENTRY(nvmf_target_poll_group) link; ++}; ++ ++struct nvmf_target { ++ struct spdk_nvmf_tgt *tgt; ++ ++ int max_subsystems; ++}; ++ ++TAILQ_HEAD(, nvmf_reactor) g_reactors = TAILQ_HEAD_INITIALIZER(g_reactors); ++TAILQ_HEAD(, nvmf_target_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups); ++static uint32_t g_num_poll_groups = 0; ++ ++static struct nvmf_reactor *g_main_reactor = NULL; ++static struct nvmf_reactor *g_next_reactor = NULL; ++static struct spdk_thread *g_init_thread = NULL; ++static struct spdk_thread *g_fini_thread = NULL; ++static struct nvmf_target g_nvmf_tgt = { ++ .max_subsystems = NVMF_DEFAULT_SUBSYSTEMS, ++}; ++ ++static struct nvmf_target_poll_group *g_next_pg = NULL; ++static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER; ++static bool g_reactors_exit = false; ++static enum nvmf_target_state g_target_state; ++static bool g_intr_received = false; ++ ++static uint32_t g_migrate_pg_period_us = 0; ++static struct spdk_poller *g_migrate_pg_poller = NULL; ++ ++static void nvmf_target_advance_state(void); ++static int nvmf_schedule_spdk_thread(struct spdk_thread *thread); ++ ++static void ++usage(char *program_name) ++{ ++ printf("%s options", program_name); ++ printf("\n"); ++ printf("\t[-g period of round robin poll group migration (us) (default: 0 
(disabled))]\n"); ++ printf("\t[-h show this usage]\n"); ++ printf("\t[-i shared memory ID (optional)]\n"); ++ printf("\t[-m core mask for DPDK]\n"); ++ printf("\t[-n max subsystems for target(default: 32)]\n"); ++ printf("\t[-r RPC listen address (default /var/tmp/spdk.sock)]\n"); ++ printf("\t[-s memory size in MB for DPDK (default: 0MB)]\n"); ++ printf("\t[-u disable PCI access]\n"); ++} ++ ++static int ++parse_args(int argc, char **argv, struct spdk_env_opts *opts) ++{ ++ int op; ++ long int value; ++ ++ while ((op = getopt(argc, argv, "g:i:m:n:p:r:s:u:h")) != -1) { ++ switch (op) { ++ case 'g': ++ value = spdk_strtol(optarg, 10); ++ if (value < 0) { ++ fprintf(stderr, "converting a string to integer failed\n"); ++ return -EINVAL; ++ } ++ g_migrate_pg_period_us = value; ++ break; ++ case 'i': ++ value = spdk_strtol(optarg, 10); ++ if (value < 0) { ++ fprintf(stderr, "converting a string to integer failed\n"); ++ return -EINVAL; ++ } ++ opts->shm_id = value; ++ break; ++ case 'm': ++ opts->core_mask = optarg; ++ break; ++ case 'n': ++ g_nvmf_tgt.max_subsystems = spdk_strtol(optarg, 10); ++ if (g_nvmf_tgt.max_subsystems < 0) { ++ fprintf(stderr, "converting a string to integer failed\n"); ++ return -EINVAL; ++ } ++ break; ++ case 'r': ++ g_rpc_addr = optarg; ++ break; ++ case 's': ++ value = spdk_strtol(optarg, 10); ++ if (value < 0) { ++ fprintf(stderr, "converting a string to integer failed\n"); ++ return -EINVAL; ++ } ++ opts->mem_size = value; ++ break; ++ case 'u': ++ opts->no_pci = true; ++ break; ++ case 'h': ++ usage(argv[0]); ++ exit(EXIT_SUCCESS); ++ default: ++ usage(argv[0]); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++nvmf_reactor_run(void *arg) ++{ ++ struct nvmf_reactor *nvmf_reactor = arg; ++ struct nvmf_lw_thread *lw_thread; ++ struct spdk_thread *thread; ++ ++ /* run all the lightweight threads in this nvmf_reactor by FIFO. */ ++ do { ++ if (spdk_ring_dequeue(nvmf_reactor->threads, (void **)&lw_thread, 1)) { ++ thread = spdk_thread_get_from_ctx(lw_thread); ++ ++ spdk_thread_poll(thread, 0, 0); ++ ++ if (spdk_unlikely(spdk_thread_is_exited(thread) && ++ spdk_thread_is_idle(thread))) { ++ spdk_thread_destroy(thread); ++ } else if (spdk_unlikely(lw_thread->resched)) { ++ lw_thread->resched = false; ++ nvmf_schedule_spdk_thread(thread); ++ } else { ++ spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); ++ } ++ } ++ } while (!g_reactors_exit); ++ ++ /* free all the lightweight threads */ ++ while (spdk_ring_dequeue(nvmf_reactor->threads, (void **)&lw_thread, 1)) { ++ thread = spdk_thread_get_from_ctx(lw_thread); ++ spdk_set_thread(thread); ++ ++ if (spdk_thread_is_exited(thread)) { ++ spdk_thread_destroy(thread); ++ } else { ++ /* This thread is not exited yet, and may need to communicate with other threads ++ * to be exited. So mark it as exiting, and check again after traversing other threads. ++ */ ++ spdk_thread_exit(thread); ++ spdk_thread_poll(thread, 0, 0); ++ spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++nvmf_schedule_spdk_thread(struct spdk_thread *thread) ++{ ++ struct nvmf_reactor *nvmf_reactor; ++ struct nvmf_lw_thread *lw_thread; ++ struct spdk_cpuset *cpumask; ++ uint32_t i; ++ ++ /* Lightweight threads may have a requested cpumask. ++ * This is a request only - the scheduler does not have to honor it. ++ * For this scheduler implementation, each reactor is pinned to ++ * a particular core so honoring the request is reasonably easy. 
++ */ ++ cpumask = spdk_thread_get_cpumask(thread); ++ ++ lw_thread = spdk_thread_get_ctx(thread); ++ assert(lw_thread != NULL); ++ memset(lw_thread, 0, sizeof(*lw_thread)); ++ ++ /* assign lightweight threads to nvmf reactor(core) ++ * Here we use the mutex.The way the actual SPDK event framework ++ * solves this is by using internal rings for messages between reactors ++ */ ++ pthread_mutex_lock(&g_mutex); ++ for (i = 0; i < spdk_env_get_core_count(); i++) { ++ if (g_next_reactor == NULL) { ++ g_next_reactor = TAILQ_FIRST(&g_reactors); ++ } ++ nvmf_reactor = g_next_reactor; ++ g_next_reactor = TAILQ_NEXT(g_next_reactor, link); ++ ++ /* each spdk_thread has the core affinity */ ++ if (spdk_cpuset_get_cpu(cpumask, nvmf_reactor->core)) { ++ spdk_ring_enqueue(nvmf_reactor->threads, (void **)&lw_thread, 1, NULL); ++ break; ++ } ++ } ++ pthread_mutex_unlock(&g_mutex); ++ ++ if (i == spdk_env_get_core_count()) { ++ fprintf(stderr, "failed to schedule spdk thread\n"); ++ return -1; ++ } ++ return 0; ++} ++ ++static void ++nvmf_request_spdk_thread_reschedule(struct spdk_thread *thread) ++{ ++ struct nvmf_lw_thread *lw_thread; ++ ++ assert(thread == spdk_get_thread()); ++ ++ lw_thread = spdk_thread_get_ctx(thread); ++ ++ assert(lw_thread != NULL); ++ ++ lw_thread->resched = true; ++} ++ ++static int ++nvmf_reactor_thread_op(struct spdk_thread *thread, enum spdk_thread_op op) ++{ ++ switch (op) { ++ case SPDK_THREAD_OP_NEW: ++ return nvmf_schedule_spdk_thread(thread); ++ case SPDK_THREAD_OP_RESCHED: ++ nvmf_request_spdk_thread_reschedule(thread); ++ return 0; ++ default: ++ return -ENOTSUP; ++ } ++} ++ ++static bool ++nvmf_reactor_thread_op_supported(enum spdk_thread_op op) ++{ ++ switch (op) { ++ case SPDK_THREAD_OP_NEW: ++ case SPDK_THREAD_OP_RESCHED: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static int ++nvmf_init_threads(void) ++{ ++ int rc; ++ uint32_t i; ++ char thread_name[32]; ++ struct nvmf_reactor *nvmf_reactor; ++ struct spdk_cpuset cpumask; ++ uint32_t main_core = spdk_env_get_current_core(); ++ ++ /* Whenever SPDK creates a new lightweight thread it will call ++ * nvmf_schedule_spdk_thread asking for the application to begin ++ * polling it via spdk_thread_poll(). Each lightweight thread in ++ * SPDK optionally allocates extra memory to be used by the application ++ * framework. The size of the extra memory allocated is the second parameter. ++ */ ++ spdk_thread_lib_init_ext(nvmf_reactor_thread_op, nvmf_reactor_thread_op_supported, ++ sizeof(struct nvmf_lw_thread), SPDK_DEFAULT_MSG_MEMPOOL_SIZE); ++ ++ /* Spawn one system thread per CPU core. The system thread is called a reactor. ++ * SPDK will spawn lightweight threads that must be mapped to reactors in ++ * nvmf_schedule_spdk_thread. Using a single system thread per CPU core is a ++ * choice unique to this application. SPDK itself does not require this specific ++ * threading model. For example, another viable threading model would be ++ * dynamically scheduling the lightweight threads onto a thread pool using a ++ * work queue. 
++ */ ++ SPDK_ENV_FOREACH_CORE(i) { ++ nvmf_reactor = calloc(1, sizeof(struct nvmf_reactor)); ++ if (!nvmf_reactor) { ++ fprintf(stderr, "failed to alloc nvmf reactor\n"); ++ rc = -ENOMEM; ++ goto err_exit; ++ } ++ ++ nvmf_reactor->core = i; ++ ++ nvmf_reactor->threads = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 1024, SPDK_ENV_SOCKET_ID_ANY); ++ if (!nvmf_reactor->threads) { ++ fprintf(stderr, "failed to alloc ring\n"); ++ free(nvmf_reactor); ++ rc = -ENOMEM; ++ goto err_exit; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_reactors, nvmf_reactor, link); ++ ++ if (i == main_core) { ++ g_main_reactor = nvmf_reactor; ++ g_next_reactor = g_main_reactor; ++ } else { ++ rc = spdk_env_thread_launch_pinned(i, ++ nvmf_reactor_run, ++ nvmf_reactor); ++ if (rc) { ++ fprintf(stderr, "failed to pin reactor launch\n"); ++ goto err_exit; ++ } ++ } ++ } ++ ++ /* Spawn a lightweight thread only on the current core to manage this application. */ ++ spdk_cpuset_zero(&cpumask); ++ spdk_cpuset_set_cpu(&cpumask, main_core, true); ++ snprintf(thread_name, sizeof(thread_name), "nvmf_main_thread"); ++ g_init_thread = spdk_thread_create(thread_name, &cpumask); ++ if (!g_init_thread) { ++ fprintf(stderr, "failed to create spdk thread\n"); ++ return -1; ++ } ++ ++ fprintf(stdout, "nvmf threads initialize successfully\n"); ++ return 0; ++ ++err_exit: ++ return rc; ++} ++ ++static void ++nvmf_destroy_threads(void) ++{ ++ struct nvmf_reactor *nvmf_reactor, *tmp; ++ ++ TAILQ_FOREACH_SAFE(nvmf_reactor, &g_reactors, link, tmp) { ++ spdk_ring_free(nvmf_reactor->threads); ++ free(nvmf_reactor); ++ } ++ ++ pthread_mutex_destroy(&g_mutex); ++ spdk_thread_lib_fini(); ++ fprintf(stdout, "nvmf threads destroy successfully\n"); ++} ++ ++static void ++nvmf_tgt_destroy_done(void *ctx, int status) ++{ ++ fprintf(stdout, "destroyed the nvmf target service\n"); ++ ++ g_target_state = NVMF_FINI_SUBSYSTEM; ++ nvmf_target_advance_state(); ++} ++ ++static void ++nvmf_destroy_nvmf_tgt(void) ++{ ++ if (g_nvmf_tgt.tgt) { ++ spdk_nvmf_tgt_destroy(g_nvmf_tgt.tgt, nvmf_tgt_destroy_done, NULL); ++ } else { ++ g_target_state = NVMF_FINI_SUBSYSTEM; ++ } ++} ++ ++static void ++nvmf_create_nvmf_tgt(void) ++{ ++ struct spdk_nvmf_subsystem *subsystem; ++ struct spdk_nvmf_target_opts tgt_opts = {}; ++ ++ tgt_opts.max_subsystems = g_nvmf_tgt.max_subsystems; ++ snprintf(tgt_opts.name, sizeof(tgt_opts.name), "%s", "nvmf_example"); ++ /* Construct the default NVMe-oF target ++ * An NVMe-oF target is a collection of subsystems, namespace, and poll ++ * groups, and defines the scope of the NVMe-oF discovery service. ++ */ ++ g_nvmf_tgt.tgt = spdk_nvmf_tgt_create(&tgt_opts); ++ if (g_nvmf_tgt.tgt == NULL) { ++ fprintf(stderr, "spdk_nvmf_tgt_create() failed\n"); ++ goto error; ++ } ++ ++ /* Create and add discovery subsystem to the NVMe-oF target. ++ * NVMe-oF defines a discovery mechanism that a host uses to determine ++ * the NVM subsystems that expose namespaces that the host may access. ++ * It provides a host with following capabilities: ++ * 1,The ability to discover a list of NVM subsystems with namespaces ++ * that are accessible to the host. ++ * 2,The ability to discover multiple paths to an NVM subsystem. ++ * 3,The ability to discover controllers that are statically configured. 
++ */ ++ subsystem = spdk_nvmf_subsystem_create(g_nvmf_tgt.tgt, SPDK_NVMF_DISCOVERY_NQN, ++ SPDK_NVMF_SUBTYPE_DISCOVERY, 0); ++ if (subsystem == NULL) { ++ fprintf(stderr, "failed to create discovery nvmf library subsystem\n"); ++ goto error; ++ } ++ ++ /* Allow any host to access the discovery subsystem */ ++ spdk_nvmf_subsystem_set_allow_any_host(subsystem, true); ++ ++ fprintf(stdout, "created a nvmf target service\n"); ++ ++ g_target_state = NVMF_INIT_POLL_GROUPS; ++ return; ++ ++error: ++ g_target_state = NVMF_FINI_TARGET; ++} ++ ++static void ++nvmf_tgt_subsystem_stop_next(struct spdk_nvmf_subsystem *subsystem, ++ void *cb_arg, int status) ++{ ++ int rc; ++ ++ subsystem = spdk_nvmf_subsystem_get_next(subsystem); ++ if (subsystem) { ++ rc = spdk_nvmf_subsystem_stop(subsystem, ++ nvmf_tgt_subsystem_stop_next, ++ cb_arg); ++ if (rc) { ++ nvmf_tgt_subsystem_stop_next(subsystem, cb_arg, 0); ++ fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); ++ } ++ return; ++ } ++ ++ fprintf(stdout, "all subsystems of target stopped\n"); ++ ++ g_target_state = NVMF_FINI_POLL_GROUPS; ++ nvmf_target_advance_state(); ++} ++ ++static void ++nvmf_tgt_stop_subsystems(struct nvmf_target *nvmf_tgt) ++{ ++ struct spdk_nvmf_subsystem *subsystem; ++ int rc; ++ ++ subsystem = spdk_nvmf_subsystem_get_first(nvmf_tgt->tgt); ++ if (spdk_likely(subsystem)) { ++ rc = spdk_nvmf_subsystem_stop(subsystem, ++ nvmf_tgt_subsystem_stop_next, ++ NULL); ++ if (rc) { ++ nvmf_tgt_subsystem_stop_next(subsystem, NULL, 0); ++ fprintf(stderr, "Unable to stop NVMe-oF subsystem. Trying others.\n"); ++ } ++ } else { ++ g_target_state = NVMF_FINI_POLL_GROUPS; ++ } ++} ++ ++static void ++nvmf_tgt_subsystem_start_next(struct spdk_nvmf_subsystem *subsystem, ++ void *cb_arg, int status) ++{ ++ int rc; ++ ++ subsystem = spdk_nvmf_subsystem_get_next(subsystem); ++ if (subsystem) { ++ rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_start_next, ++ cb_arg); ++ if (rc) { ++ g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; ++ fprintf(stderr, "Unable to start NVMe-oF subsystem. shutting down app.\n"); ++ nvmf_target_advance_state(); ++ } ++ return; ++ } ++ ++ fprintf(stdout, "all subsystems of target started\n"); ++ ++ g_target_state = NVMF_RUNNING; ++ nvmf_target_advance_state(); ++} ++ ++static void ++nvmf_tgt_start_subsystems(struct nvmf_target *nvmf_tgt) ++{ ++ struct spdk_nvmf_subsystem *subsystem; ++ int rc; ++ ++ /* Subsystem is the NVM subsystem which is a combine of namespaces ++ * except the discovery subsystem which is used for discovery service. ++ * It also controls the hosts that means the subsystem determines whether ++ * the host can access this subsystem. ++ */ ++ subsystem = spdk_nvmf_subsystem_get_first(nvmf_tgt->tgt); ++ if (spdk_likely(subsystem)) { ++ /* In SPDK there are three states in subsystem: Inactive, Active, Paused. ++ * Start subsystem means make it from inactive to active that means ++ * subsystem start to work or it can be accessed. ++ */ ++ rc = spdk_nvmf_subsystem_start(subsystem, ++ nvmf_tgt_subsystem_start_next, ++ NULL); ++ if (rc) { ++ fprintf(stderr, "Unable to start NVMe-oF subsystem. 
shutting down app.\n"); ++ g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; ++ } ++ } else { ++ g_target_state = NVMF_RUNNING; ++ } ++} ++ ++static void ++nvmf_tgt_create_poll_groups_done(void *ctx) ++{ ++ struct nvmf_target_poll_group *pg = ctx; ++ ++ if (!g_next_pg) { ++ g_next_pg = pg; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_poll_groups, pg, link); ++ ++ assert(g_num_poll_groups < spdk_env_get_core_count()); ++ ++ if (++g_num_poll_groups == spdk_env_get_core_count()) { ++ fprintf(stdout, "create targets's poll groups done\n"); ++ ++ g_target_state = NVMF_INIT_START_SUBSYSTEMS; ++ nvmf_target_advance_state(); ++ } ++} ++ ++static void ++nvmf_tgt_create_poll_group(void *ctx) ++{ ++ struct nvmf_target_poll_group *pg; ++ ++ pg = calloc(1, sizeof(struct nvmf_target_poll_group)); ++ if (!pg) { ++ fprintf(stderr, "failed to allocate poll group\n"); ++ assert(false); ++ return; ++ } ++ ++ pg->thread = spdk_get_thread(); ++ pg->group = spdk_nvmf_poll_group_create(g_nvmf_tgt.tgt); ++ if (!pg->group) { ++ fprintf(stderr, "failed to create poll group of the target\n"); ++ free(pg); ++ assert(false); ++ return; ++ } ++ ++ spdk_thread_send_msg(g_init_thread, nvmf_tgt_create_poll_groups_done, pg); ++} ++ ++/* Create a lightweight thread per poll group instead of assuming a pool of lightweight ++ * threads already exist at start up time. A poll group is a collection of unrelated NVMe-oF ++ * connections. Each poll group is only accessed from the associated lightweight thread. ++ */ ++static void ++nvmf_poll_groups_create(void) ++{ ++ struct spdk_cpuset tmp_cpumask = {}; ++ uint32_t i; ++ char thread_name[32]; ++ struct spdk_thread *thread; ++ ++ assert(g_init_thread != NULL); ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ spdk_cpuset_zero(&tmp_cpumask); ++ spdk_cpuset_set_cpu(&tmp_cpumask, i, true); ++ snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", i); ++ ++ thread = spdk_thread_create(thread_name, &tmp_cpumask); ++ assert(thread != NULL); ++ ++ spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL); ++ } ++} ++ ++static void ++_nvmf_tgt_destroy_poll_groups_done(void *ctx) ++{ ++ assert(g_num_poll_groups > 0); ++ ++ if (--g_num_poll_groups == 0) { ++ fprintf(stdout, "destroy targets's poll groups done\n"); ++ ++ g_target_state = NVMF_FINI_TARGET; ++ nvmf_target_advance_state(); ++ } ++} ++ ++static void ++nvmf_tgt_destroy_poll_groups_done(void *cb_arg, int status) ++{ ++ struct nvmf_target_poll_group *pg = cb_arg; ++ ++ free(pg); ++ ++ spdk_thread_send_msg(g_fini_thread, _nvmf_tgt_destroy_poll_groups_done, NULL); ++ ++ spdk_thread_exit(spdk_get_thread()); ++} ++ ++static void ++nvmf_tgt_destroy_poll_group(void *ctx) ++{ ++ struct nvmf_target_poll_group *pg = ctx; ++ ++ spdk_nvmf_poll_group_destroy(pg->group, nvmf_tgt_destroy_poll_groups_done, pg); ++} ++ ++static void ++nvmf_poll_groups_destroy(void) ++{ ++ struct nvmf_target_poll_group *pg, *tmp; ++ ++ g_fini_thread = spdk_get_thread(); ++ assert(g_fini_thread != NULL); ++ ++ TAILQ_FOREACH_SAFE(pg, &g_poll_groups, link, tmp) { ++ TAILQ_REMOVE(&g_poll_groups, pg, link); ++ spdk_thread_send_msg(pg->thread, nvmf_tgt_destroy_poll_group, pg); ++ } ++} ++ ++static void ++nvmf_subsystem_fini_done(void *cb_arg) ++{ ++ fprintf(stdout, "bdev subsystem finish successfully\n"); ++ spdk_rpc_finish(); ++ g_reactors_exit = true; ++} ++ ++static void ++nvmf_subsystem_init_done(int rc, void *cb_arg) ++{ ++ fprintf(stdout, "bdev subsystem init successfully\n"); ++ ++ rc = spdk_rpc_initialize(g_rpc_addr, RPC_SELECT_INTERVAL); ++ if (rc) { ++ spdk_app_stop(rc); ++ 
return; ++ } ++ ++ spdk_rpc_set_state(SPDK_RPC_RUNTIME); ++ ++ g_target_state = NVMF_INIT_TARGET; ++ nvmf_target_advance_state(); ++} ++ ++static void ++migrate_poll_group_by_rr(void *ctx) ++{ ++ uint32_t current_core, next_core; ++ struct spdk_cpuset cpumask = {}; ++ ++ current_core = spdk_env_get_current_core(); ++ next_core = spdk_env_get_next_core(current_core); ++ if (next_core == UINT32_MAX) { ++ next_core = spdk_env_get_first_core(); ++ } ++ ++ spdk_cpuset_set_cpu(&cpumask, next_core, true); ++ ++ spdk_thread_set_cpumask(&cpumask); ++} ++ ++static int ++migrate_poll_groups_by_rr(void *ctx) ++{ ++ struct nvmf_target_poll_group *pg; ++ ++ TAILQ_FOREACH(pg, &g_poll_groups, link) { ++ spdk_thread_send_msg(pg->thread, migrate_poll_group_by_rr, NULL); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++nvmf_target_advance_state(void) ++{ ++ enum nvmf_target_state prev_state; ++ ++ do { ++ prev_state = g_target_state; ++ ++ switch (g_target_state) { ++ case NVMF_INIT_SUBSYSTEM: ++ /* initialize the bdev layer */ ++ spdk_subsystem_init(nvmf_subsystem_init_done, NULL); ++ return; ++ case NVMF_INIT_TARGET: ++ nvmf_create_nvmf_tgt(); ++ break; ++ case NVMF_INIT_POLL_GROUPS: ++ nvmf_poll_groups_create(); ++ break; ++ case NVMF_INIT_START_SUBSYSTEMS: ++ nvmf_tgt_start_subsystems(&g_nvmf_tgt); ++ break; ++ case NVMF_RUNNING: ++ fprintf(stdout, "nvmf target is running\n"); ++ if (g_migrate_pg_period_us != 0) { ++ g_migrate_pg_poller = SPDK_POLLER_REGISTER(migrate_poll_groups_by_rr, NULL, ++ g_migrate_pg_period_us); ++ } ++ break; ++ case NVMF_FINI_STOP_SUBSYSTEMS: ++ spdk_poller_unregister(&g_migrate_pg_poller); ++ nvmf_tgt_stop_subsystems(&g_nvmf_tgt); ++ break; ++ case NVMF_FINI_POLL_GROUPS: ++ nvmf_poll_groups_destroy(); ++ break; ++ case NVMF_FINI_TARGET: ++ nvmf_destroy_nvmf_tgt(); ++ break; ++ case NVMF_FINI_SUBSYSTEM: ++ spdk_subsystem_fini(nvmf_subsystem_fini_done, NULL); ++ break; ++ } ++ } while (g_target_state != prev_state); ++} ++ ++static void ++nvmf_target_app_start(void *arg) ++{ ++ g_target_state = NVMF_INIT_SUBSYSTEM; ++ nvmf_target_advance_state(); ++} ++ ++static void ++_nvmf_shutdown_cb(void *ctx) ++{ ++ /* Still in initialization state, defer shutdown operation */ ++ if (g_target_state < NVMF_RUNNING) { ++ spdk_thread_send_msg(spdk_get_thread(), _nvmf_shutdown_cb, NULL); ++ return; ++ } else if (g_target_state > NVMF_RUNNING) { ++ /* Already in Shutdown status, ignore the signal */ ++ return; ++ } ++ ++ g_target_state = NVMF_FINI_STOP_SUBSYSTEMS; ++ nvmf_target_advance_state(); ++} ++ ++static void ++nvmf_shutdown_cb(int signo) ++{ ++ if (!g_intr_received) { ++ g_intr_received = true; ++ spdk_thread_send_msg(g_init_thread, _nvmf_shutdown_cb, NULL); ++ } ++} ++ ++static int ++nvmf_setup_signal_handlers(void) ++{ ++ struct sigaction sigact; ++ sigset_t sigmask; ++ int signals[] = {SIGINT, SIGTERM}; ++ int num_signals = sizeof(signals) / sizeof(int); ++ int rc, i; ++ ++ rc = sigemptyset(&sigmask); ++ if (rc) { ++ fprintf(stderr, "errno:%d--failed to empty signal set\n", errno); ++ return rc; ++ } ++ memset(&sigact, 0, sizeof(sigact)); ++ rc = sigemptyset(&sigact.sa_mask); ++ if (rc) { ++ fprintf(stderr, "errno:%d--failed to empty signal set\n", errno); ++ return rc; ++ } ++ ++ /* Install the same handler for SIGINT and SIGTERM */ ++ sigact.sa_handler = nvmf_shutdown_cb; ++ ++ for (i = 0; i < num_signals; i++) { ++ rc = sigaction(signals[i], &sigact, NULL); ++ if (rc < 0) { ++ fprintf(stderr, "errno:%d--sigaction() failed\n", errno); ++ return rc; ++ } ++ rc = 
sigaddset(&sigmask, signals[i]); ++ if (rc) { ++ fprintf(stderr, "errno:%d--failed to add set\n", errno); ++ return rc; ++ } ++ } ++ ++ pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); ++ ++ return 0; ++} ++ ++int ++main(int argc, char **argv) ++{ ++ int rc; ++ struct spdk_env_opts opts; ++ ++ spdk_env_opts_init(&opts); ++ opts.name = "nvmf-example"; ++ ++ rc = parse_args(argc, argv, &opts); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ if (spdk_env_init(&opts) < 0) { ++ fprintf(stderr, "unable to initialize SPDK env\n"); ++ return -EINVAL; ++ } ++ ++ /* Initialize the threads */ ++ rc = nvmf_init_threads(); ++ assert(rc == 0); ++ ++ /* Send a message to the thread assigned to the main reactor ++ * that continues initialization. This is how we bootstrap the ++ * program so that all code from here on is running on an SPDK thread. ++ */ ++ assert(g_init_thread != NULL); ++ ++ rc = nvmf_setup_signal_handlers(); ++ assert(rc == 0); ++ ++ spdk_thread_send_msg(g_init_thread, nvmf_target_app_start, NULL); ++ ++ nvmf_reactor_run(g_main_reactor); ++ ++ spdk_env_thread_wait_all(); ++ nvmf_destroy_threads(); ++ ++ spdk_env_fini(); ++ ++ return rc; ++} +diff --git a/include/spdk/env.h b/include/spdk/env.h +index bac976c..6844f13 100644 +--- a/include/spdk/env.h ++++ b/include/spdk/env.h +@@ -1,1440 +1,1441 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2015 Intel Corporation. +- * Copyright (c) NetApp, Inc. +- * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. +- * All rights reserved. +- */ +- +-/** \file +- * Encapsulated third-party dependencies +- */ +- +-#ifndef SPDK_ENV_H +-#define SPDK_ENV_H +- +-#include "spdk/stdinc.h" +-#include "spdk/queue.h" +-#include "spdk/pci_ids.h" +- +-#ifdef __cplusplus +-extern "C" { +-#endif +- +-#define SPDK_ENV_SOCKET_ID_ANY (-1) +-#define SPDK_ENV_LCORE_ID_ANY (UINT32_MAX) +- +-/** +- * Memory is dma-safe. +- */ +-#define SPDK_MALLOC_DMA 0x01 +- +-/** +- * Memory is sharable across process boundaries. +- */ +-#define SPDK_MALLOC_SHARE 0x02 +- +-#define SPDK_MAX_MEMZONE_NAME_LEN 32 +-#define SPDK_MAX_MEMPOOL_NAME_LEN 29 +- +-/** +- * Memzone flags +- */ +-#define SPDK_MEMZONE_NO_IOVA_CONTIG 0x00100000 /**< no iova contiguity */ +- +-/** +- * \brief Environment initialization options +- */ +-struct spdk_env_opts { +- const char *name; +- const char *core_mask; +- int shm_id; +- int mem_channel; +- int main_core; +- int mem_size; +- bool no_pci; +- bool hugepage_single_segments; +- bool unlink_hugepage; +- size_t num_pci_addr; +- const char *hugedir; +- struct spdk_pci_addr *pci_blocked; +- struct spdk_pci_addr *pci_allowed; +- const char *iova_mode; +- uint64_t base_virtaddr; +- +- /** Opaque context for use of the env implementation. */ +- void *env_context; +- const char *vf_token; +-}; +- +-/** +- * Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer +- * with the given size, alignment and socket id. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical +- * addresses. A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. 
+- * \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE). +- * At least one flag must be specified. +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags); +- +-/** +- * Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer +- * with the given size, alignment and socket id. Also, the buffer will be zeroed. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical +- * addresses. A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE). +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags); +- +-/** +- * Resize a dma/sharable memory buffer with the given new size and alignment. +- * Existing contents are preserved. +- * +- * \param buf Buffer to resize. +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * +- * \return a pointer to the resized memory buffer. +- */ +-void *spdk_realloc(void *buf, size_t size, size_t align); +- +-/** +- * Free buffer memory that was previously allocated with spdk_malloc() or spdk_zmalloc(). +- * +- * \param buf Buffer to free. +- */ +-void spdk_free(void *buf); +- +-/** +- * Initialize the default value of opts. +- * +- * \param opts Data structure where SPDK will initialize the default options. +- */ +-void spdk_env_opts_init(struct spdk_env_opts *opts); +- +-/** +- * Initialize or reinitialize the environment library. +- * For initialization, this must be called prior to using any other functions +- * in this library. For reinitialization, the parameter `opts` must be set to +- * NULL and this must be called after the environment library was finished by +- * spdk_env_fini() within the same process. +- * +- * \param opts Environment initialization options. +- * \return 0 on success, or negative errno on failure. +- */ +-int spdk_env_init(const struct spdk_env_opts *opts); +- +-/** +- * Release any resources of the environment library that were allocated with +- * spdk_env_init(). After this call, no SPDK env function calls may be made. +- * It is expected that common usage of this function is to call it just before +- * terminating the process or before reinitializing the environment library +- * within the same process. +- */ +-void spdk_env_fini(void); +- +-/** +- * Allocate a pinned memory buffer with the given size and alignment. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. 
+- * \param phys_addr A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr); +- +-/** +- * Allocate a pinned, memory buffer with the given size, alignment and socket id. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id); +- +-/** +- * Allocate a pinned memory buffer with the given size and alignment. The buffer +- * will be zeroed. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr); +- +-/** +- * Allocate a pinned memory buffer with the given size, alignment and socket id. +- * The buffer will be zeroed. +- * +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * +- * \return a pointer to the allocated memory buffer. +- */ +-void *spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id); +- +-/** +- * Resize the allocated and pinned memory buffer with the given new size and +- * alignment. Existing contents are preserved. +- * +- * \param buf Buffer to resize. +- * \param size Size in bytes. +- * \param align If non-zero, the allocated buffer is aligned to a multiple of +- * align. In this case, it must be a power of two. The returned buffer is always +- * aligned to at least cache line size. +- * \param phys_addr A pointer to the variable to hold the physical address of +- * the allocated buffer is passed. If NULL, the physical address is not returned. +- * +- * \return a pointer to the resized memory buffer. +- */ +-void *spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr); +- +-/** +- * Free a memory buffer previously allocated, for example from spdk_dma_zmalloc(). +- * This call is never made from the performance path. +- * +- * \param buf Buffer to free. 
+- */ +-void spdk_dma_free(void *buf); +- +-/** +- * Reserve a named, process shared memory zone with the given size, socket_id +- * and flags. Unless `SPDK_MEMZONE_NO_IOVA_CONTIG` flag is provided, the returned +- * memory will be IOVA contiguous. +- * +- * \param name Name to set for this memory zone. +- * \param len Length in bytes. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * \param flags Flags to set for this memory zone. +- * +- * \return a pointer to the allocated memory address on success, or NULL on failure. +- */ +-void *spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags); +- +-/** +- * Reserve a named, process shared memory zone with the given size, socket_id, +- * flags and alignment. Unless `SPDK_MEMZONE_NO_IOVA_CONTIG` flag is provided, +- * the returned memory will be IOVA contiguous. +- * +- * \param name Name to set for this memory zone. +- * \param len Length in bytes. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * \param flags Flags to set for this memory zone. +- * \param align Alignment for resulting memzone. Must be a power of 2. +- * +- * \return a pointer to the allocated memory address on success, or NULL on failure. +- */ +-void *spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, +- unsigned flags, unsigned align); +- +-/** +- * Lookup the memory zone identified by the given name. +- * +- * \param name Name of the memory zone. +- * +- * \return a pointer to the reserved memory address on success, or NULL on failure. +- */ +-void *spdk_memzone_lookup(const char *name); +- +-/** +- * Free the memory zone identified by the given name. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_memzone_free(const char *name); +- +-/** +- * Dump debug information about all memzones. +- * +- * \param f File to write debug information to. +- */ +-void spdk_memzone_dump(FILE *f); +- +-struct spdk_mempool; +- +-#define SPDK_MEMPOOL_DEFAULT_CACHE_SIZE SIZE_MAX +- +-/** +- * Create a thread-safe memory pool. +- * +- * \param name Name for the memory pool. +- * \param count Count of elements. +- * \param ele_size Element size in bytes. +- * \param cache_size How many elements may be cached in per-core caches. Use +- * SPDK_MEMPOOL_DEFAULT_CACHE_SIZE for a reasonable default, or 0 for no per-core cache. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * +- * \return a pointer to the created memory pool. +- */ +-struct spdk_mempool *spdk_mempool_create(const char *name, size_t count, +- size_t ele_size, size_t cache_size, int socket_id); +- +-/** +- * An object callback function for memory pool. +- * +- * Used by spdk_mempool_create_ctor(). +- */ +-typedef void (spdk_mempool_obj_cb_t)(struct spdk_mempool *mp, +- void *opaque, void *obj, unsigned obj_idx); +- +-/** +- * A memory chunk callback function for memory pool. +- * +- * Used by spdk_mempool_mem_iter(). +- */ +-typedef void (spdk_mempool_mem_cb_t)(struct spdk_mempool *mp, void *opaque, void *addr, +- uint64_t iova, size_t len, unsigned mem_idx); +- +-/** +- * Create a thread-safe memory pool with user provided initialization function +- * and argument. +- * +- * \param name Name for the memory pool. +- * \param count Count of elements. +- * \param ele_size Element size in bytes. +- * \param cache_size How many elements may be cached in per-core caches. 
Use +- * SPDK_MEMPOOL_DEFAULT_CACHE_SIZE for a reasonable default, or 0 for no per-core cache. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * \param obj_init User provided object callback initialization function. +- * \param obj_init_arg User provided callback initialization function argument. +- * +- * \return a pointer to the created memory pool. +- */ +-struct spdk_mempool *spdk_mempool_create_ctor(const char *name, size_t count, +- size_t ele_size, size_t cache_size, int socket_id, +- spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg); +- +-/** +- * Get the name of a memory pool. +- * +- * \param mp Memory pool to query. +- * +- * \return the name of the memory pool. +- */ +-char *spdk_mempool_get_name(struct spdk_mempool *mp); +- +-/** +- * Free a memory pool. +- */ +-void spdk_mempool_free(struct spdk_mempool *mp); +- +-/** +- * Get an element from a memory pool. If no elements remain, return NULL. +- * +- * \param mp Memory pool to query. +- * +- * \return a pointer to the element. +- */ +-void *spdk_mempool_get(struct spdk_mempool *mp); +- +-/** +- * Get multiple elements from a memory pool. +- * +- * \param mp Memory pool to get multiple elements from. +- * \param ele_arr Array of the elements to fill. +- * \param count Count of elements to get. +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count); +- +-/** +- * Put an element back into the memory pool. +- * +- * \param mp Memory pool to put element back into. +- * \param ele Element to put. +- */ +-void spdk_mempool_put(struct spdk_mempool *mp, void *ele); +- +-/** +- * Put multiple elements back into the memory pool. +- * +- * \param mp Memory pool to put multiple elements back into. +- * \param ele_arr Array of the elements to put. +- * \param count Count of elements to put. +- */ +-void spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count); +- +-/** +- * Get the number of entries in the memory pool. +- * +- * \param pool Memory pool to query. +- * +- * \return the number of entries in the memory pool. +- */ +-size_t spdk_mempool_count(const struct spdk_mempool *pool); +- +-/** +- * Iterate through all elements of the pool and call a function on each one. +- * +- * \param mp Memory pool to iterate on. +- * \param obj_cb Function to call on each element. +- * \param obj_cb_arg Opaque pointer passed to the callback function. +- * +- * \return Number of elements iterated. +- */ +-uint32_t spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb, +- void *obj_cb_arg); +- +-/** +- * Iterate through all memory chunks of the pool and call a function on each one. +- * +- * \param mp Memory pool to iterate on. +- * \param mem_cb Function to call on each memory chunk. +- * \param mem_cb_arg Opaque pointer passed to the callback function. +- * +- * \return Number of memory chunks iterated. +- */ +-uint32_t spdk_mempool_mem_iter(struct spdk_mempool *mp, spdk_mempool_mem_cb_t mem_cb, +- void *mem_cb_arg); +- +-/** +- * Lookup the memory pool identified by the given name. +- * +- * \param name Name of the memory pool. +- * +- * \return a pointer to the memory pool on success, or NULL on failure. +- */ +-struct spdk_mempool *spdk_mempool_lookup(const char *name); +- +-/** +- * Get the number of dedicated CPU cores utilized by this env abstraction. +- * +- * \return the number of dedicated CPU cores. 
+- */ +-uint32_t spdk_env_get_core_count(void); +- +-/** +- * Get the CPU core index of the current thread. +- * +- * This will only function when called from threads set up by +- * this environment abstraction. For any other threads \c SPDK_ENV_LCORE_ID_ANY +- * will be returned. +- * +- * \return the CPU core index of the current thread. +- */ +-uint32_t spdk_env_get_current_core(void); +- +-/** +- * Get the index of the first dedicated CPU core for this application. +- * +- * \return the index of the first dedicated CPU core. +- */ +-uint32_t spdk_env_get_first_core(void); +- +-/** +- * Get the index of the last dedicated CPU core for this application. +- * +- * \return the index of the last dedicated CPU core. +- */ +-uint32_t spdk_env_get_last_core(void); +- +-/** +- * Get the index of the next dedicated CPU core for this application. +- * +- * If there is no next core, return UINT32_MAX. +- * +- * \param prev_core Index of previous core. +- * +- * \return the index of the next dedicated CPU core. +- */ +-uint32_t spdk_env_get_next_core(uint32_t prev_core); +- +-#define SPDK_ENV_FOREACH_CORE(i) \ +- for (i = spdk_env_get_first_core(); \ +- i < UINT32_MAX; \ +- i = spdk_env_get_next_core(i)) +- +-/** +- * Get the socket ID for the given core. +- * +- * \param core CPU core to query. +- * +- * \return the socket ID for the given core. +- */ +-uint32_t spdk_env_get_socket_id(uint32_t core); +- +-typedef int (*thread_start_fn)(void *); +- +-/** +- * Launch a thread pinned to the given core. Only a single pinned thread may be +- * launched per core. Subsequent attempts to launch pinned threads on that core +- * will fail. +- * +- * \param core The core to pin the thread to. +- * \param fn Entry point on the new thread. +- * \param arg Argument passed to thread_start_fn +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg); +- +-/** +- * Wait for all threads to exit before returning. +- */ +-void spdk_env_thread_wait_all(void); +- +-/** +- * Check whether the calling process is primary process. +- * +- * \return true if the calling process is primary process, or false otherwise. +- */ +-bool spdk_process_is_primary(void); +- +-/** +- * Get a monotonic timestamp counter. +- * +- * \return the monotonic timestamp counter. +- */ +-uint64_t spdk_get_ticks(void); +- +-/** +- * Get the tick rate of spdk_get_ticks() per second. +- * +- * \return the tick rate of spdk_get_ticks() per second. +- */ +-uint64_t spdk_get_ticks_hz(void); +- +-/** +- * Delay the given number of microseconds. +- * +- * \param us Number of microseconds. +- */ +-void spdk_delay_us(unsigned int us); +- +-/** +- * Pause CPU execution for a short while +- */ +-void spdk_pause(void); +- +-struct spdk_ring; +- +-enum spdk_ring_type { +- SPDK_RING_TYPE_SP_SC, /* Single-producer, single-consumer */ +- SPDK_RING_TYPE_MP_SC, /* Multi-producer, single-consumer */ +- SPDK_RING_TYPE_MP_MC, /* Multi-producer, multi-consumer */ +-}; +- +-/** +- * Create a ring. +- * +- * \param type Type for the ring. (SPDK_RING_TYPE_SP_SC or SPDK_RING_TYPE_MP_SC). +- * \param count Size of the ring in elements. +- * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY +- * for any socket. +- * +- * \return a pointer to the created ring. +- */ +-struct spdk_ring *spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id); +- +-/** +- * Free the ring. +- * +- * \param ring Ring to free. 
+- */ +-void spdk_ring_free(struct spdk_ring *ring); +- +-/** +- * Get the number of objects in the ring. +- * +- * \param ring the ring. +- * +- * \return the number of objects in the ring. +- */ +-size_t spdk_ring_count(struct spdk_ring *ring); +- +-/** +- * Queue the array of objects (with length count) on the ring. +- * +- * \param ring A pointer to the ring. +- * \param objs A pointer to the array to be queued. +- * \param count Length count of the array of objects. +- * \param free_space If non-NULL, amount of free space after the enqueue has finished. +- * +- * \return the number of objects enqueued. +- */ +-size_t spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count, +- size_t *free_space); +- +-/** +- * Dequeue count objects from the ring into the array objs. +- * +- * \param ring A pointer to the ring. +- * \param objs A pointer to the array to be dequeued. +- * \param count Maximum number of elements to be dequeued. +- * +- * \return the number of objects dequeued which is less than 'count'. +- */ +-size_t spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count); +- +-/** +- * Reports whether the SPDK application is using the IOMMU for DMA +- * +- * \return True if we are using the IOMMU, false otherwise. +- */ +-bool spdk_iommu_is_enabled(void); +- +-#define SPDK_VTOPHYS_ERROR (0xFFFFFFFFFFFFFFFFULL) +- +-/** +- * Get the physical address of a buffer. +- * +- * \param buf A pointer to a buffer. +- * \param size Contains the size of the memory region pointed to by vaddr. +- * If vaddr is successfully translated, then this is updated with the size of +- * the memory region for which the translation is valid. +- * +- * \return the physical address of this buffer on success, or SPDK_VTOPHYS_ERROR +- * on failure. +- */ +-uint64_t spdk_vtophys(const void *buf, uint64_t *size); +- +-struct spdk_pci_addr { +- uint32_t domain; +- uint8_t bus; +- uint8_t dev; +- uint8_t func; +-}; +- +-struct spdk_pci_id { +- uint32_t class_id; /**< Class ID or SPDK_PCI_CLASS_ANY_ID. */ +- uint16_t vendor_id; /**< Vendor ID or SPDK_PCI_ANY_ID. */ +- uint16_t device_id; /**< Device ID or SPDK_PCI_ANY_ID. */ +- uint16_t subvendor_id; /**< Subsystem vendor ID or SPDK_PCI_ANY_ID. */ +- uint16_t subdevice_id; /**< Subsystem device ID or SPDK_PCI_ANY_ID. 
*/ +-}; +- +-/** Device needs PCI BAR mapping (done with either IGB_UIO or VFIO) */ +-#define SPDK_PCI_DRIVER_NEED_MAPPING 0x0001 +-/** Device needs PCI BAR mapping with enabled write combining (wc) */ +-#define SPDK_PCI_DRIVER_WC_ACTIVATE 0x0002 +- +-void spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags); +- +-struct spdk_pci_device { +- struct spdk_pci_device *parent; +- void *dev_handle; +- struct spdk_pci_addr addr; +- struct spdk_pci_id id; +- int socket_id; +- const char *type; +- +- int (*map_bar)(struct spdk_pci_device *dev, uint32_t bar, +- void **mapped_addr, uint64_t *phys_addr, uint64_t *size); +- int (*unmap_bar)(struct spdk_pci_device *dev, uint32_t bar, +- void *addr); +- int (*cfg_read)(struct spdk_pci_device *dev, void *value, +- uint32_t len, uint32_t offset); +- int (*cfg_write)(struct spdk_pci_device *dev, void *value, +- uint32_t len, uint32_t offset); +- +- struct _spdk_pci_device_internal { +- struct spdk_pci_driver *driver; +- bool attached; +- /* optional fd for exclusive access to this device on this process */ +- int claim_fd; +- bool pending_removal; +- /* The device was successfully removed on a DPDK interrupt thread, +- * but to prevent data races we couldn't remove it from the global +- * device list right away. It'll be removed as soon as possible +- * on a regular thread when any public pci function is called. +- */ +- bool removed; +- TAILQ_ENTRY(spdk_pci_device) tailq; +- } internal; +-}; +- +-/** +- * Callback for device attach handling. +- * +- * \param enum_ctx Opaque value. +- * \param dev PCI device. +- * +- * \return -1 if an error occurred, +- * 0 if device attached successfully, +- * 1 if device not attached. +- */ +-typedef int (*spdk_pci_enum_cb)(void *enum_ctx, struct spdk_pci_device *dev); +- +-#define SPDK_PCI_DEVICE(vend, dev) \ +- .class_id = SPDK_PCI_CLASS_ANY_ID, \ +- .vendor_id = (vend), \ +- .device_id = (dev), \ +- .subvendor_id = SPDK_PCI_ANY_ID, \ +- .subdevice_id = SPDK_PCI_ANY_ID +- +-#define SPDK_PCI_DRIVER_REGISTER(name, id_table, flags) \ +-__attribute__((constructor)) static void _spdk_pci_driver_register_##name(void) \ +-{ \ +- spdk_pci_driver_register(#name, id_table, flags); \ +-} +- +-/** +- * Get the VMD PCI driver object. +- * +- * \return PCI driver. +- */ +-struct spdk_pci_driver *spdk_pci_vmd_get_driver(void); +- +-/** +- * Get the I/OAT PCI driver object. +- * +- * \return PCI driver. +- */ +-struct spdk_pci_driver *spdk_pci_ioat_get_driver(void); +- +-/** +- * Get the IDXD PCI driver object. +- * +- * \return PCI driver. +- */ +-struct spdk_pci_driver *spdk_pci_idxd_get_driver(void); +- +-/** +- * Get the Virtio PCI driver object. +- * +- * \return PCI driver. +- */ +-struct spdk_pci_driver *spdk_pci_virtio_get_driver(void); +- +-/** +- * Get PCI driver by name (e.g. "nvme", "vmd", "ioat"). +- */ +-struct spdk_pci_driver *spdk_pci_get_driver(const char *name); +- +-/** +- * Get the NVMe PCI driver object. +- * +- * \return PCI driver. +- */ +-struct spdk_pci_driver *spdk_pci_nvme_get_driver(void); +- +-/** +- * Enumerate all PCI devices supported by the provided driver and try to +- * attach those that weren't attached yet. The provided callback will be +- * called for each such device and its return code will decide whether that +- * device is attached or not. Attached devices have to be manually detached +- * with spdk_pci_device_detach() to be attach-able again. 
+- * +- * During enumeration all registered pci devices with exposed access to +- * userspace are getting probed internally unless not explicitly specified +- * on denylist. Because of that it becomes not possible to either use such +- * devices with another application or unbind the driver (e.g. vfio). +- * +- * 2s asynchronous delay is introduced to avoid race conditions between +- * user space software initialization and in-kernel device handling for +- * newly inserted devices. Subsequent enumerate call after the delay +- * shall allow for a successful device attachment. +- * +- * \param driver Driver for a specific device type. +- * \param enum_cb Callback to be called for each non-attached PCI device. +- * \param enum_ctx Additional context passed to the callback function. +- * +- * \return -1 if an internal error occurred or the provided callback returned -1, +- * 0 otherwise +- */ +-int spdk_pci_enumerate(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx); +- +-/** +- * Call the provided function pointer for every enumerated PCI device. +- * +- * \param ctx Context parameter to pass to fn. +- * \param fn Function to call for each PCI device +- */ +-void spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev)); +- +-/** +- * Map a PCI BAR in the current process. +- * +- * \param dev PCI device. +- * \param bar BAR number. +- * \param mapped_addr A variable to store the virtual address of the mapping. +- * \param phys_addr A variable to store the physical address of the mapping. +- * \param size A variable to store the size of the bar (in bytes). +- * +- * \return 0 on success. +- */ +-int spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, +- void **mapped_addr, uint64_t *phys_addr, uint64_t *size); +- +-/** +- * Unmap a PCI BAR from the current process. This happens automatically when +- * the PCI device is detached. +- * +- * \param dev PCI device. +- * \param bar BAR number. +- * \param mapped_addr Virtual address of the bar. +- * +- * \return 0 on success. +- */ +-int spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, +- void *mapped_addr); +- +-/** +- * Enable PCI device interrupts. (Experimental) +- * +- * \param dev PCI device. +- * +- * \return 0 on success, negative value on error. +- */ +-int spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev); +- +-/** +- * Disable PCI device interrupts. (Experimental) +- * +- * \param dev PCI device. +- * +- * \return 0 on success, negative value on error. +- */ +-int spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev); +- +-/** +- * Get an event file descriptor assosiated with a PCI device interrupt. +- * (Experimental) +- * +- * \param dev PCI device. +- * +- * \return Event file descriptor on success, negative value on error. +- */ +-int spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev); +- +-/** +- * Get the domain of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return PCI device domain. +- */ +-uint32_t spdk_pci_device_get_domain(struct spdk_pci_device *dev); +- +-/** +- * Get the bus number of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return PCI bus number. +- */ +-uint8_t spdk_pci_device_get_bus(struct spdk_pci_device *dev); +- +-/** +- * Get the device number within the PCI bus the device is on. +- * +- * \param dev PCI device. +- * +- * \return PCI device number. 
+- */ +-uint8_t spdk_pci_device_get_dev(struct spdk_pci_device *dev); +- +-/** +- * Get the particular function number represented by struct spdk_pci_device. +- * +- * \param dev PCI device. +- * +- * \return PCI function number. +- */ +-uint8_t spdk_pci_device_get_func(struct spdk_pci_device *dev); +- +-/** +- * Get the full DomainBDF address of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return PCI address. +- */ +-struct spdk_pci_addr spdk_pci_device_get_addr(struct spdk_pci_device *dev); +- +-/** +- * Get the vendor ID of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return vendor ID. +- */ +-uint16_t spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev); +- +-/** +- * Get the device ID of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return device ID. +- */ +-uint16_t spdk_pci_device_get_device_id(struct spdk_pci_device *dev); +- +-/** +- * Get the subvendor ID of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return subvendor ID. +- */ +-uint16_t spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev); +- +-/** +- * Get the subdevice ID of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return subdevice ID. +- */ +-uint16_t spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev); +- +-/** +- * Get the PCI ID of a PCI device. +- * +- * \param dev PCI device. +- * +- * \return PCI ID. +- */ +-struct spdk_pci_id spdk_pci_device_get_id(struct spdk_pci_device *dev); +- +-/** +- * Get the NUMA node the PCI device is on. +- * +- * \param dev PCI device. +- * +- * \return NUMA node index (>= 0). +- */ +-int spdk_pci_device_get_socket_id(struct spdk_pci_device *dev); +- +-/** +- * Serialize the PCIe Device Serial Number into the provided buffer. +- * The buffer will contain a 16-character-long serial number followed by +- * a NULL terminator. +- * +- * \param dev PCI device. +- * \param sn Buffer to store the serial number in. +- * \param len Length of buffer. Must be at least 17. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len); +- +-/** +- * Claim a PCI device for exclusive SPDK userspace access. +- * +- * Uses F_SETLK on a shared memory file with the PCI address embedded in its name. +- * As long as this file remains open with the lock acquired, other processes will +- * not be able to successfully call this function on the same PCI device. +- * +- * The device can be un-claimed by the owning process with spdk_pci_device_unclaim(). +- * It will be also unclaimed automatically when detached. +- * +- * \param dev PCI device to claim. +- * +- * \return -EACCES if the device has already been claimed, +- * negative errno on unexpected errors, +- * 0 on success. +- */ +-int spdk_pci_device_claim(struct spdk_pci_device *dev); +- +-/** +- * Undo spdk_pci_device_claim(). +- * +- * \param dev PCI device to unclaim. +- */ +-void spdk_pci_device_unclaim(struct spdk_pci_device *dev); +- +-/** +- * Release all resources associated with the given device and detach it. As long +- * as the PCI device is physically available, it will attachable again. +- * +- * \param device PCI device. +- */ +-void spdk_pci_device_detach(struct spdk_pci_device *device); +- +-/** +- * Attach a PCI device. This will bypass all blocked list rules and explicitly +- * attach a device at the provided address. The return code of the provided +- * callback will decide whether that device is attached or not. 
Attached +- * devices have to be manually detached with spdk_pci_device_detach() to be +- * attach-able again. +- * +- * \param driver Driver for a specific device type. The device will only be +- * attached if it's supported by this driver. +- * \param enum_cb Callback to be called for the PCI device once it's found. +- * \param enum_ctx Additional context passed to the callback function. +- * \param pci_address Address of the device to attach. +- * +- * \return -1 if a device at the provided PCI address couldn't be found, +- * -1 if an internal error happened or the provided callback returned non-zero, +- * 0 otherwise +- */ +-int spdk_pci_device_attach(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, +- void *enum_ctx, struct spdk_pci_addr *pci_address); +- +-/** +- * Allow the specified PCI device to be probed by the calling process. +- * +- * When using spdk_pci_enumerate(), only devices with allowed PCI addresses will +- * be probed. By default, this is all PCI addresses, but the pci_allowed +- * and pci_blocked environment options can override this behavior. +- * This API enables the caller to allow a new PCI address that may have previously +- * been blocked. +- * +- * \param pci_addr PCI address to allow +- * \return 0 if successful +- * \return -ENOMEM if environment-specific data structures cannot be allocated +- * \return -EINVAL if specified PCI address is not valid +- */ +-int spdk_pci_device_allow(struct spdk_pci_addr *pci_addr); +- +-/** +- * Read \c len bytes from the PCI configuration space. +- * +- * \param dev PCI device. +- * \param buf A buffer to copy the data into. +- * \param len Number of bytes to read. +- * \param offset Offset (in bytes) in the PCI config space to start reading from. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *buf, uint32_t len, +- uint32_t offset); +- +-/** +- * Write \c len bytes into the PCI configuration space. +- * +- * \param dev PCI device. +- * \param buf A buffer to copy the data from. +- * \param len Number of bytes to write. +- * \param offset Offset (in bytes) in the PCI config space to start writing to. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *buf, uint32_t len, +- uint32_t offset); +- +-/** +- * Read 1 byte from the PCI configuration space. +- * +- * \param dev PCI device. +- * \param value A buffer to copy the data into. +- * \param offset Offset (in bytes) in the PCI config space to start reading from. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset); +- +-/** +- * Write 1 byte into the PCI configuration space. +- * +- * \param dev PCI device. +- * \param value A value to write. +- * \param offset Offset (in bytes) in the PCI config space to start writing to. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset); +- +-/** +- * Read 2 bytes from the PCI configuration space. +- * +- * \param dev PCI device. +- * \param value A buffer to copy the data into. +- * \param offset Offset (in bytes) in the PCI config space to start reading from. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset); +- +-/** +- * Write 2 bytes into the PCI configuration space. +- * +- * \param dev PCI device. 
+- * \param value A value to write. +- * \param offset Offset (in bytes) in the PCI config space to start writing to. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset); +- +-/** +- * Read 4 bytes from the PCI configuration space. +- * +- * \param dev PCI device. +- * \param value A buffer to copy the data into. +- * \param offset Offset (in bytes) in the PCI config space to start reading from. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset); +- +-/** +- * Write 4 bytes into the PCI configuration space. +- * +- * \param dev PCI device. +- * \param value A value to write. +- * \param offset Offset (in bytes) in the PCI config space to start writing to. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset); +- +-/** +- * Check if device was requested to be removed from the process. This can be +- * caused either by physical device hotremoval or OS-triggered removal. In the +- * latter case, the device may continue to function properly even if this +- * function returns \c true . The upper-layer driver may check this function +- * periodically and eventually detach the device. +- * +- * \param dev PCI device. +- * +- * \return if device was requested to be removed +- */ +-bool spdk_pci_device_is_removed(struct spdk_pci_device *dev); +- +-/** +- * Compare two PCI addresses. +- * +- * \param a1 PCI address 1. +- * \param a2 PCI address 2. +- * +- * \return 0 if a1 == a2, less than 0 if a1 < a2, greater than 0 if a1 > a2 +- */ +-int spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2); +- +-/** +- * Convert a string representation of a PCI address into a struct spdk_pci_addr. +- * +- * \param addr PCI address output on success. +- * \param bdf PCI address in domain:bus:device.function format or +- * domain.bus.device.function format. +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf); +- +-/** +- * Convert a struct spdk_pci_addr to a string. +- * +- * \param bdf String into which a string will be output in the format +- * domain:bus:device.function. The string must be at least 14 characters in size. +- * \param sz Size of bdf in bytes. Must be at least 14. +- * \param addr PCI address. +- * +- * \return 0 on success, or a negated errno on failure. +- */ +-int spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr); +- +-/** +- * Hook a custom PCI device into the PCI layer. The device will be attachable, +- * enumerable, and will call provided callbacks on each PCI resource access +- * request. +- * +- * \param drv driver that will be able to attach the device +- * \param dev fully initialized PCI device struct +- * +- * \return 0 on success, negative errno otherwise. +- */ +-int spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev); +- +-/** +- * Un-hook a custom PCI device from the PCI layer. The device must not be attached. +- * +- * \param dev fully initialized PCI device struct +- */ +-void spdk_pci_unhook_device(struct spdk_pci_device *dev); +- +-/** +- * Return the type of the PCI device. 
+- * +- * \param dev PCI device +- * +- * \return string representing the type of the device +- */ +-const char *spdk_pci_device_get_type(const struct spdk_pci_device *dev); +- +-struct spdk_pci_device_provider { +- const char *name; +- +- /** +- * Callback executed to attach a PCI device on a given address. +- * +- * \param addr address of the device. +- * +- * \return 0 if the device was attached successfully, negative errno otherwise. +- */ +- int (*attach_cb)(const struct spdk_pci_addr *addr); +- +- /** +- * Callback executed to detach a given PCI device. The provider to detach the device is +- * selected based on the type of the device and the name of the provider (i.e. dev->type == +- * provider->name). +- * +- * \param dev PCI device to detach. +- */ +- void (*detach_cb)(struct spdk_pci_device *dev); +- +- TAILQ_ENTRY(spdk_pci_device_provider) tailq; +-}; +- +-/** +- * Register a PCI device provdier. +- * +- * \param provider PCI device provider. +- */ +-void spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider); +- +-#define SPDK_PCI_REGISTER_DEVICE_PROVIDER(name, provider) \ +- static void __attribute__((constructor)) _spdk_pci_register_device_provider_##name(void) \ +- { \ +- spdk_pci_register_device_provider(provider); \ +- } +- +-/** +- * Remove any CPU affinity from the current thread. +- */ +-void spdk_unaffinitize_thread(void); +- +-/** +- * Call a function with CPU affinity unset. +- * +- * This can be used to run a function that creates other threads without inheriting the calling +- * thread's CPU affinity. +- * +- * \param cb Function to call +- * \param arg Parameter to the function cb(). +- * +- * \return the return value of cb(). +- */ +-void *spdk_call_unaffinitized(void *cb(void *arg), void *arg); +- +-/** +- * Page-granularity memory address translation table. +- */ +-struct spdk_mem_map; +- +-enum spdk_mem_map_notify_action { +- SPDK_MEM_MAP_NOTIFY_REGISTER, +- SPDK_MEM_MAP_NOTIFY_UNREGISTER, +-}; +- +-typedef int (*spdk_mem_map_notify_cb)(void *cb_ctx, struct spdk_mem_map *map, +- enum spdk_mem_map_notify_action action, +- void *vaddr, size_t size); +- +-typedef int (*spdk_mem_map_contiguous_translations)(uint64_t addr_1, uint64_t addr_2); +- +-/** +- * A function table to be implemented by each memory map. +- */ +-struct spdk_mem_map_ops { +- spdk_mem_map_notify_cb notify_cb; +- spdk_mem_map_contiguous_translations are_contiguous; +-}; +- +-/** +- * Allocate a virtual memory address translation map. +- * +- * \param default_translation Default translation for the map. +- * \param ops Table of callback functions for map operations. +- * \param cb_ctx Argument passed to the callback function. +- * +- * \return a pointer to the allocated virtual memory address translation map. +- */ +-struct spdk_mem_map *spdk_mem_map_alloc(uint64_t default_translation, +- const struct spdk_mem_map_ops *ops, void *cb_ctx); +- +-/** +- * Free a memory map previously allocated by spdk_mem_map_alloc(). +- * +- * \param pmap Memory map to free. +- */ +-void spdk_mem_map_free(struct spdk_mem_map **pmap); +- +-/** +- * Register an address translation for a range of virtual memory. +- * +- * \param map Memory map. +- * \param vaddr Virtual address of the region to register - must be 2 MB aligned. +- * \param size Size of the region in bytes - must be multiple of 2 MB in the +- * current implementation. +- * \param translation Translation to store in the map for this address range. +- * +- * \sa spdk_mem_map_clear_translation(). 
+- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, +- uint64_t translation); +- +-/** +- * Unregister an address translation. +- * +- * \param map Memory map. +- * \param vaddr Virtual address of the region to unregister - must be 2 MB aligned. +- * \param size Size of the region in bytes - must be multiple of 2 MB in the +- * current implementation. +- * +- * \sa spdk_mem_map_set_translation(). +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size); +- +-/** +- * Look up the translation of a virtual address in a memory map. +- * +- * \param map Memory map. +- * \param vaddr Virtual address. +- * \param size Contains the size of the memory region pointed to by vaddr. +- * If vaddr is successfully translated, then this is updated with the size of +- * the memory region for which the translation is valid. +- * +- * \return the translation of vaddr stored in the map, or default_translation +- * as specified in spdk_mem_map_alloc() if vaddr is not present in the map. +- */ +-uint64_t spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size); +- +-/** +- * Register the specified memory region for address translation. +- * +- * The memory region must map to pinned huge pages (2MB or greater). +- * +- * \param vaddr Virtual address to register. +- * \param len Length in bytes of the vaddr. +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_mem_register(void *vaddr, size_t len); +- +-/** +- * Unregister the specified memory region from vtophys address translation. +- * +- * The caller must ensure all in-flight DMA operations to this memory region +- * are completed or cancelled before calling this function. +- * +- * \param vaddr Virtual address to unregister. +- * \param len Length in bytes of the vaddr. +- * +- * \return 0 on success, negative errno on failure. +- */ +-int spdk_mem_unregister(void *vaddr, size_t len); +- +-/** +- * Reserve the address space specified in all memory maps. +- * +- * This pre-allocates the necessary space in the memory maps such that +- * future calls to spdk_mem_register() on that region require no +- * internal memory allocations. +- * +- * \param vaddr Virtual address to reserve +- * \param len Length in bytes of vaddr +- * +- * \return 0 on success, negated errno on failure. +- */ +-int spdk_mem_reserve(void *vaddr, size_t len); +- +-/** +- * Get the address's file descriptor and offset, it works with spdk memory allocation APIs +- * +- * \param vaddr Virtual address to get +- * \param offset Virtual address's map offset to the file descriptor +- * +- * \return negative errno on failure, otherwise return the file descriptor +- */ +-int spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset); +- +-enum spdk_pci_event_type { +- SPDK_UEVENT_ADD = 0, +- SPDK_UEVENT_REMOVE = 1, +-}; +- +-struct spdk_pci_event { +- enum spdk_pci_event_type action; +- struct spdk_pci_addr traddr; +-}; +- +-typedef void (*spdk_pci_error_handler)(const void *failure_addr, void *ctx); +- +-/** +- * Begin listening for PCI bus events. This is used to detect hot-insert and +- * hot-remove events. Once the system is listening, events may be retrieved +- * by calling spdk_pci_get_event() periodically. 
+- * +- * \return negative errno on failure, otherwise, return a file descriptor +- * that may be later passed to spdk_pci_get_event(). +- */ +-int spdk_pci_event_listen(void); +- +-/** +- * Get the next PCI bus event. +- * +- * \param fd A file descriptor returned by spdk_pci_event_listen() +- * \param event An event on the PCI bus +- * +- * \return Negative errno on failure. 0 for no event. A positive number +- * when an event has been returned +- */ +-int spdk_pci_get_event(int fd, struct spdk_pci_event *event); +- +-/** +- * Register a signal handler to handle bus errors on the PCI bus +- * +- * \param sighandler Signal bus handler of the PCI bus +- * \param ctx The arg pass to the registered signal bus handler. +- * +- * \return negative errno on failure, otherwise it means successful +- */ +-int spdk_pci_register_error_handler(spdk_pci_error_handler sighandler, void *ctx); +- +-/** +- * Register a signal handler to handle bus errors on the PCI bus +- * +- * \param sighandler Signal bus handler of the PCI bus +- */ +-void spdk_pci_unregister_error_handler(spdk_pci_error_handler sighandler); +- +-#ifdef __cplusplus +-} +-#endif +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2015 Intel Corporation. ++ * Copyright (c) NetApp, Inc. ++ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. ++ * All rights reserved. ++ */ ++ ++/** \file ++ * Encapsulated third-party dependencies ++ */ ++ ++#ifndef SPDK_ENV_H ++#define SPDK_ENV_H ++ ++#include "spdk/stdinc.h" ++#include "spdk/queue.h" ++#include "spdk/pci_ids.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define SPDK_ENV_SOCKET_ID_ANY (-1) ++#define SPDK_ENV_LCORE_ID_ANY (UINT32_MAX) ++ ++/** ++ * Memory is dma-safe. ++ */ ++#define SPDK_MALLOC_DMA 0x01 ++ ++/** ++ * Memory is sharable across process boundaries. ++ */ ++#define SPDK_MALLOC_SHARE 0x02 ++ ++#define SPDK_MAX_MEMZONE_NAME_LEN 32 ++#define SPDK_MAX_MEMPOOL_NAME_LEN 29 ++ ++/** ++ * Memzone flags ++ */ ++#define SPDK_MEMZONE_NO_IOVA_CONTIG 0x00100000 /**< no iova contiguity */ ++ ++/** ++ * \brief Environment initialization options ++ */ ++struct spdk_env_opts { ++ const char *name; ++ const char *core_mask; ++ int shm_id; ++ int mem_channel; ++ int main_core; ++ int mem_size; ++ bool no_pci; ++ bool hugepage_single_segments; ++ bool unlink_hugepage; ++ size_t num_pci_addr; ++ const char *hugedir; ++ struct spdk_pci_addr *pci_blocked; ++ struct spdk_pci_addr *pci_allowed; ++ const char *iova_mode; ++ uint64_t base_virtaddr; ++ ++ /** Opaque context for use of the env implementation. */ ++ void *env_context; ++ const char *vf_token; ++ bool hot_restart; ++}; ++ ++/** ++ * Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer ++ * with the given size, alignment and socket id. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical ++ * addresses. A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE). ++ * At least one flag must be specified. 
++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags); ++ ++/** ++ * Allocate dma/sharable memory based on a given dma_flg. It is a memory buffer ++ * with the given size, alignment and socket id. Also, the buffer will be zeroed. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr **Deprecated**. Please use spdk_vtophys() for retrieving physical ++ * addresses. A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * \param flags Combination of SPDK_MALLOC flags (\ref SPDK_MALLOC_DMA, \ref SPDK_MALLOC_SHARE). ++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags); ++ ++/** ++ * Resize a dma/sharable memory buffer with the given new size and alignment. ++ * Existing contents are preserved. ++ * ++ * \param buf Buffer to resize. ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * ++ * \return a pointer to the resized memory buffer. ++ */ ++void *spdk_realloc(void *buf, size_t size, size_t align); ++ ++/** ++ * Free buffer memory that was previously allocated with spdk_malloc() or spdk_zmalloc(). ++ * ++ * \param buf Buffer to free. ++ */ ++void spdk_free(void *buf); ++ ++/** ++ * Initialize the default value of opts. ++ * ++ * \param opts Data structure where SPDK will initialize the default options. ++ */ ++void spdk_env_opts_init(struct spdk_env_opts *opts); ++ ++/** ++ * Initialize or reinitialize the environment library. ++ * For initialization, this must be called prior to using any other functions ++ * in this library. For reinitialization, the parameter `opts` must be set to ++ * NULL and this must be called after the environment library was finished by ++ * spdk_env_fini() within the same process. ++ * ++ * \param opts Environment initialization options. ++ * \return 0 on success, or negative errno on failure. ++ */ ++int spdk_env_init(const struct spdk_env_opts *opts); ++ ++/** ++ * Release any resources of the environment library that were allocated with ++ * spdk_env_init(). After this call, no SPDK env function calls may be made. ++ * It is expected that common usage of this function is to call it just before ++ * terminating the process or before reinitializing the environment library ++ * within the same process. ++ */ ++void spdk_env_fini(void); ++ ++/** ++ * Allocate a pinned memory buffer with the given size and alignment. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. 
++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr); ++ ++/** ++ * Allocate a pinned, memory buffer with the given size, alignment and socket id. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id); ++ ++/** ++ * Allocate a pinned memory buffer with the given size and alignment. The buffer ++ * will be zeroed. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr); ++ ++/** ++ * Allocate a pinned memory buffer with the given size, alignment and socket id. ++ * The buffer will be zeroed. ++ * ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * ++ * \return a pointer to the allocated memory buffer. ++ */ ++void *spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id); ++ ++/** ++ * Resize the allocated and pinned memory buffer with the given new size and ++ * alignment. Existing contents are preserved. ++ * ++ * \param buf Buffer to resize. ++ * \param size Size in bytes. ++ * \param align If non-zero, the allocated buffer is aligned to a multiple of ++ * align. In this case, it must be a power of two. The returned buffer is always ++ * aligned to at least cache line size. ++ * \param phys_addr A pointer to the variable to hold the physical address of ++ * the allocated buffer is passed. If NULL, the physical address is not returned. ++ * ++ * \return a pointer to the resized memory buffer. ++ */ ++void *spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr); ++ ++/** ++ * Free a memory buffer previously allocated, for example from spdk_dma_zmalloc(). ++ * This call is never made from the performance path. ++ * ++ * \param buf Buffer to free. ++ */ ++void spdk_dma_free(void *buf); ++ ++/** ++ * Reserve a named, process shared memory zone with the given size, socket_id ++ * and flags. Unless `SPDK_MEMZONE_NO_IOVA_CONTIG` flag is provided, the returned ++ * memory will be IOVA contiguous. 
++ * ++ * \param name Name to set for this memory zone. ++ * \param len Length in bytes. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * \param flags Flags to set for this memory zone. ++ * ++ * \return a pointer to the allocated memory address on success, or NULL on failure. ++ */ ++void *spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags); ++ ++/** ++ * Reserve a named, process shared memory zone with the given size, socket_id, ++ * flags and alignment. Unless `SPDK_MEMZONE_NO_IOVA_CONTIG` flag is provided, ++ * the returned memory will be IOVA contiguous. ++ * ++ * \param name Name to set for this memory zone. ++ * \param len Length in bytes. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * \param flags Flags to set for this memory zone. ++ * \param align Alignment for resulting memzone. Must be a power of 2. ++ * ++ * \return a pointer to the allocated memory address on success, or NULL on failure. ++ */ ++void *spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, ++ unsigned flags, unsigned align); ++ ++/** ++ * Lookup the memory zone identified by the given name. ++ * ++ * \param name Name of the memory zone. ++ * ++ * \return a pointer to the reserved memory address on success, or NULL on failure. ++ */ ++void *spdk_memzone_lookup(const char *name); ++ ++/** ++ * Free the memory zone identified by the given name. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_memzone_free(const char *name); ++ ++/** ++ * Dump debug information about all memzones. ++ * ++ * \param f File to write debug information to. ++ */ ++void spdk_memzone_dump(FILE *f); ++ ++struct spdk_mempool; ++ ++#define SPDK_MEMPOOL_DEFAULT_CACHE_SIZE SIZE_MAX ++ ++/** ++ * Create a thread-safe memory pool. ++ * ++ * \param name Name for the memory pool. ++ * \param count Count of elements. ++ * \param ele_size Element size in bytes. ++ * \param cache_size How many elements may be cached in per-core caches. Use ++ * SPDK_MEMPOOL_DEFAULT_CACHE_SIZE for a reasonable default, or 0 for no per-core cache. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * ++ * \return a pointer to the created memory pool. ++ */ ++struct spdk_mempool *spdk_mempool_create(const char *name, size_t count, ++ size_t ele_size, size_t cache_size, int socket_id); ++ ++/** ++ * An object callback function for memory pool. ++ * ++ * Used by spdk_mempool_create_ctor(). ++ */ ++typedef void (spdk_mempool_obj_cb_t)(struct spdk_mempool *mp, ++ void *opaque, void *obj, unsigned obj_idx); ++ ++/** ++ * A memory chunk callback function for memory pool. ++ * ++ * Used by spdk_mempool_mem_iter(). ++ */ ++typedef void (spdk_mempool_mem_cb_t)(struct spdk_mempool *mp, void *opaque, void *addr, ++ uint64_t iova, size_t len, unsigned mem_idx); ++ ++/** ++ * Create a thread-safe memory pool with user provided initialization function ++ * and argument. ++ * ++ * \param name Name for the memory pool. ++ * \param count Count of elements. ++ * \param ele_size Element size in bytes. ++ * \param cache_size How many elements may be cached in per-core caches. Use ++ * SPDK_MEMPOOL_DEFAULT_CACHE_SIZE for a reasonable default, or 0 for no per-core cache. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * \param obj_init User provided object callback initialization function. 
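/*
 * Illustrative sketch: reserving, looking up and freeing a named,
 * process-shared memory zone with the memzone API above. The zone name and
 * size are arbitrary.
 */
#include "spdk/env.h"

static void
memzone_example(void)
{
    void *zone = spdk_memzone_reserve("example_zone", 2 * 1024 * 1024,
                                      SPDK_ENV_SOCKET_ID_ANY, 0);
    if (zone == NULL) {
        return;
    }

    /* any thread (or secondary process) can find the zone again by name */
    void *found = spdk_memzone_lookup("example_zone");
    (void)found;    /* same address as `zone` in this process */

    spdk_memzone_free("example_zone");
}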
++ * \param obj_init_arg User provided callback initialization function argument. ++ * ++ * \return a pointer to the created memory pool. ++ */ ++struct spdk_mempool *spdk_mempool_create_ctor(const char *name, size_t count, ++ size_t ele_size, size_t cache_size, int socket_id, ++ spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg); ++ ++/** ++ * Get the name of a memory pool. ++ * ++ * \param mp Memory pool to query. ++ * ++ * \return the name of the memory pool. ++ */ ++char *spdk_mempool_get_name(struct spdk_mempool *mp); ++ ++/** ++ * Free a memory pool. ++ */ ++void spdk_mempool_free(struct spdk_mempool *mp); ++ ++/** ++ * Get an element from a memory pool. If no elements remain, return NULL. ++ * ++ * \param mp Memory pool to query. ++ * ++ * \return a pointer to the element. ++ */ ++void *spdk_mempool_get(struct spdk_mempool *mp); ++ ++/** ++ * Get multiple elements from a memory pool. ++ * ++ * \param mp Memory pool to get multiple elements from. ++ * \param ele_arr Array of the elements to fill. ++ * \param count Count of elements to get. ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count); ++ ++/** ++ * Put an element back into the memory pool. ++ * ++ * \param mp Memory pool to put element back into. ++ * \param ele Element to put. ++ */ ++void spdk_mempool_put(struct spdk_mempool *mp, void *ele); ++ ++/** ++ * Put multiple elements back into the memory pool. ++ * ++ * \param mp Memory pool to put multiple elements back into. ++ * \param ele_arr Array of the elements to put. ++ * \param count Count of elements to put. ++ */ ++void spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count); ++ ++/** ++ * Get the number of entries in the memory pool. ++ * ++ * \param pool Memory pool to query. ++ * ++ * \return the number of entries in the memory pool. ++ */ ++size_t spdk_mempool_count(const struct spdk_mempool *pool); ++ ++/** ++ * Iterate through all elements of the pool and call a function on each one. ++ * ++ * \param mp Memory pool to iterate on. ++ * \param obj_cb Function to call on each element. ++ * \param obj_cb_arg Opaque pointer passed to the callback function. ++ * ++ * \return Number of elements iterated. ++ */ ++uint32_t spdk_mempool_obj_iter(struct spdk_mempool *mp, spdk_mempool_obj_cb_t obj_cb, ++ void *obj_cb_arg); ++ ++/** ++ * Iterate through all memory chunks of the pool and call a function on each one. ++ * ++ * \param mp Memory pool to iterate on. ++ * \param mem_cb Function to call on each memory chunk. ++ * \param mem_cb_arg Opaque pointer passed to the callback function. ++ * ++ * \return Number of memory chunks iterated. ++ */ ++uint32_t spdk_mempool_mem_iter(struct spdk_mempool *mp, spdk_mempool_mem_cb_t mem_cb, ++ void *mem_cb_arg); ++ ++/** ++ * Lookup the memory pool identified by the given name. ++ * ++ * \param name Name of the memory pool. ++ * ++ * \return a pointer to the memory pool on success, or NULL on failure. ++ */ ++struct spdk_mempool *spdk_mempool_lookup(const char *name); ++ ++/** ++ * Get the number of dedicated CPU cores utilized by this env abstraction. ++ * ++ * \return the number of dedicated CPU cores. ++ */ ++uint32_t spdk_env_get_core_count(void); ++ ++/** ++ * Get the CPU core index of the current thread. ++ * ++ * This will only function when called from threads set up by ++ * this environment abstraction. For any other threads \c SPDK_ENV_LCORE_ID_ANY ++ * will be returned. 
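/*
 * Illustrative sketch: creating and draining a small thread-safe pool with
 * the mempool API documented above. Element size and count are arbitrary.
 */
#include "spdk/env.h"

static void
mempool_example(void)
{
    struct spdk_mempool *mp;
    void *ele;

    mp = spdk_mempool_create("example_pool", 1024, 256,
                             SPDK_MEMPOOL_DEFAULT_CACHE_SIZE,
                             SPDK_ENV_SOCKET_ID_ANY);
    if (mp == NULL) {
        return;
    }

    ele = spdk_mempool_get(mp);           /* NULL once the pool is empty */
    if (ele != NULL) {
        spdk_mempool_put(mp, ele);        /* always return what was taken */
    }

    spdk_mempool_free(mp);
}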
++ * ++ * \return the CPU core index of the current thread. ++ */ ++uint32_t spdk_env_get_current_core(void); ++ ++/** ++ * Get the index of the first dedicated CPU core for this application. ++ * ++ * \return the index of the first dedicated CPU core. ++ */ ++uint32_t spdk_env_get_first_core(void); ++ ++/** ++ * Get the index of the last dedicated CPU core for this application. ++ * ++ * \return the index of the last dedicated CPU core. ++ */ ++uint32_t spdk_env_get_last_core(void); ++ ++/** ++ * Get the index of the next dedicated CPU core for this application. ++ * ++ * If there is no next core, return UINT32_MAX. ++ * ++ * \param prev_core Index of previous core. ++ * ++ * \return the index of the next dedicated CPU core. ++ */ ++uint32_t spdk_env_get_next_core(uint32_t prev_core); ++ ++#define SPDK_ENV_FOREACH_CORE(i) \ ++ for (i = spdk_env_get_first_core(); \ ++ i < UINT32_MAX; \ ++ i = spdk_env_get_next_core(i)) ++ ++/** ++ * Get the socket ID for the given core. ++ * ++ * \param core CPU core to query. ++ * ++ * \return the socket ID for the given core. ++ */ ++uint32_t spdk_env_get_socket_id(uint32_t core); ++ ++typedef int (*thread_start_fn)(void *); ++ ++/** ++ * Launch a thread pinned to the given core. Only a single pinned thread may be ++ * launched per core. Subsequent attempts to launch pinned threads on that core ++ * will fail. ++ * ++ * \param core The core to pin the thread to. ++ * \param fn Entry point on the new thread. ++ * \param arg Argument passed to thread_start_fn ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg); ++ ++/** ++ * Wait for all threads to exit before returning. ++ */ ++void spdk_env_thread_wait_all(void); ++ ++/** ++ * Check whether the calling process is primary process. ++ * ++ * \return true if the calling process is primary process, or false otherwise. ++ */ ++bool spdk_process_is_primary(void); ++ ++/** ++ * Get a monotonic timestamp counter. ++ * ++ * \return the monotonic timestamp counter. ++ */ ++uint64_t spdk_get_ticks(void); ++ ++/** ++ * Get the tick rate of spdk_get_ticks() per second. ++ * ++ * \return the tick rate of spdk_get_ticks() per second. ++ */ ++uint64_t spdk_get_ticks_hz(void); ++ ++/** ++ * Delay the given number of microseconds. ++ * ++ * \param us Number of microseconds. ++ */ ++void spdk_delay_us(unsigned int us); ++ ++/** ++ * Pause CPU execution for a short while ++ */ ++void spdk_pause(void); ++ ++struct spdk_ring; ++ ++enum spdk_ring_type { ++ SPDK_RING_TYPE_SP_SC, /* Single-producer, single-consumer */ ++ SPDK_RING_TYPE_MP_SC, /* Multi-producer, single-consumer */ ++ SPDK_RING_TYPE_MP_MC, /* Multi-producer, multi-consumer */ ++}; ++ ++/** ++ * Create a ring. ++ * ++ * \param type Type for the ring. (SPDK_RING_TYPE_SP_SC or SPDK_RING_TYPE_MP_SC). ++ * \param count Size of the ring in elements. ++ * \param socket_id Socket ID to allocate memory on, or SPDK_ENV_SOCKET_ID_ANY ++ * for any socket. ++ * ++ * \return a pointer to the created ring. ++ */ ++struct spdk_ring *spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id); ++ ++/** ++ * Free the ring. ++ * ++ * \param ring Ring to free. ++ */ ++void spdk_ring_free(struct spdk_ring *ring); ++ ++/** ++ * Get the number of objects in the ring. ++ * ++ * \param ring the ring. ++ * ++ * \return the number of objects in the ring. ++ */ ++size_t spdk_ring_count(struct spdk_ring *ring); ++ ++/** ++ * Queue the array of objects (with length count) on the ring. 
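/*
 * Illustrative sketch: one pinned worker per dedicated core, using
 * SPDK_ENV_FOREACH_CORE() and the thread-launch helpers above. The worker
 * body is a placeholder.
 */
#include "spdk/env.h"

static int
worker_fn(void *arg)
{
    (void)arg;
    /* per-core work loop would run here, on the core chosen below */
    return 0;
}

static void
launch_workers(void)
{
    uint32_t core;

    SPDK_ENV_FOREACH_CORE(core) {
        if (core == spdk_env_get_current_core()) {
            continue;    /* the calling core keeps running this thread */
        }
        spdk_env_thread_launch_pinned(core, worker_fn, NULL);
    }
    spdk_env_thread_wait_all();
}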
++ * ++ * \param ring A pointer to the ring. ++ * \param objs A pointer to the array to be queued. ++ * \param count Length count of the array of objects. ++ * \param free_space If non-NULL, amount of free space after the enqueue has finished. ++ * ++ * \return the number of objects enqueued. ++ */ ++size_t spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count, ++ size_t *free_space); ++ ++/** ++ * Dequeue count objects from the ring into the array objs. ++ * ++ * \param ring A pointer to the ring. ++ * \param objs A pointer to the array to be dequeued. ++ * \param count Maximum number of elements to be dequeued. ++ * ++ * \return the number of objects dequeued which is less than 'count'. ++ */ ++size_t spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count); ++ ++/** ++ * Reports whether the SPDK application is using the IOMMU for DMA ++ * ++ * \return True if we are using the IOMMU, false otherwise. ++ */ ++bool spdk_iommu_is_enabled(void); ++ ++#define SPDK_VTOPHYS_ERROR (0xFFFFFFFFFFFFFFFFULL) ++ ++/** ++ * Get the physical address of a buffer. ++ * ++ * \param buf A pointer to a buffer. ++ * \param size Contains the size of the memory region pointed to by vaddr. ++ * If vaddr is successfully translated, then this is updated with the size of ++ * the memory region for which the translation is valid. ++ * ++ * \return the physical address of this buffer on success, or SPDK_VTOPHYS_ERROR ++ * on failure. ++ */ ++uint64_t spdk_vtophys(const void *buf, uint64_t *size); ++ ++struct spdk_pci_addr { ++ uint32_t domain; ++ uint8_t bus; ++ uint8_t dev; ++ uint8_t func; ++}; ++ ++struct spdk_pci_id { ++ uint32_t class_id; /**< Class ID or SPDK_PCI_CLASS_ANY_ID. */ ++ uint16_t vendor_id; /**< Vendor ID or SPDK_PCI_ANY_ID. */ ++ uint16_t device_id; /**< Device ID or SPDK_PCI_ANY_ID. */ ++ uint16_t subvendor_id; /**< Subsystem vendor ID or SPDK_PCI_ANY_ID. */ ++ uint16_t subdevice_id; /**< Subsystem device ID or SPDK_PCI_ANY_ID. */ ++}; ++ ++/** Device needs PCI BAR mapping (done with either IGB_UIO or VFIO) */ ++#define SPDK_PCI_DRIVER_NEED_MAPPING 0x0001 ++/** Device needs PCI BAR mapping with enabled write combining (wc) */ ++#define SPDK_PCI_DRIVER_WC_ACTIVATE 0x0002 ++ ++void spdk_pci_driver_register(const char *name, struct spdk_pci_id *id_table, uint32_t flags); ++ ++struct spdk_pci_device { ++ struct spdk_pci_device *parent; ++ void *dev_handle; ++ struct spdk_pci_addr addr; ++ struct spdk_pci_id id; ++ int socket_id; ++ const char *type; ++ ++ int (*map_bar)(struct spdk_pci_device *dev, uint32_t bar, ++ void **mapped_addr, uint64_t *phys_addr, uint64_t *size); ++ int (*unmap_bar)(struct spdk_pci_device *dev, uint32_t bar, ++ void *addr); ++ int (*cfg_read)(struct spdk_pci_device *dev, void *value, ++ uint32_t len, uint32_t offset); ++ int (*cfg_write)(struct spdk_pci_device *dev, void *value, ++ uint32_t len, uint32_t offset); ++ ++ struct _spdk_pci_device_internal { ++ struct spdk_pci_driver *driver; ++ bool attached; ++ /* optional fd for exclusive access to this device on this process */ ++ int claim_fd; ++ bool pending_removal; ++ /* The device was successfully removed on a DPDK interrupt thread, ++ * but to prevent data races we couldn't remove it from the global ++ * device list right away. It'll be removed as soon as possible ++ * on a regular thread when any public pci function is called. ++ */ ++ bool removed; ++ TAILQ_ENTRY(spdk_pci_device) tailq; ++ } internal; ++}; ++ ++/** ++ * Callback for device attach handling. 
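/*
 * Illustrative sketch: a single-producer/single-consumer ring carrying one
 * pointer, using the ring API above. The queued object is a dummy value.
 */
#include "spdk/env.h"

static void
ring_example(void)
{
    struct spdk_ring *ring;
    void *obj = (void *)0x1;
    void *out[1];
    size_t n;

    ring = spdk_ring_create(SPDK_RING_TYPE_SP_SC, 64, SPDK_ENV_SOCKET_ID_ANY);
    if (ring == NULL) {
        return;
    }

    n = spdk_ring_enqueue(ring, &obj, 1, NULL);    /* returns objects queued */
    if (n == 1) {
        n = spdk_ring_dequeue(ring, out, 1);       /* out[0] == obj */
    }

    spdk_ring_free(ring);
}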
++ * ++ * \param enum_ctx Opaque value. ++ * \param dev PCI device. ++ * ++ * \return -1 if an error occurred, ++ * 0 if device attached successfully, ++ * 1 if device not attached. ++ */ ++typedef int (*spdk_pci_enum_cb)(void *enum_ctx, struct spdk_pci_device *dev); ++ ++#define SPDK_PCI_DEVICE(vend, dev) \ ++ .class_id = SPDK_PCI_CLASS_ANY_ID, \ ++ .vendor_id = (vend), \ ++ .device_id = (dev), \ ++ .subvendor_id = SPDK_PCI_ANY_ID, \ ++ .subdevice_id = SPDK_PCI_ANY_ID ++ ++#define SPDK_PCI_DRIVER_REGISTER(name, id_table, flags) \ ++__attribute__((constructor)) static void _spdk_pci_driver_register_##name(void) \ ++{ \ ++ spdk_pci_driver_register(#name, id_table, flags); \ ++} ++ ++/** ++ * Get the VMD PCI driver object. ++ * ++ * \return PCI driver. ++ */ ++struct spdk_pci_driver *spdk_pci_vmd_get_driver(void); ++ ++/** ++ * Get the I/OAT PCI driver object. ++ * ++ * \return PCI driver. ++ */ ++struct spdk_pci_driver *spdk_pci_ioat_get_driver(void); ++ ++/** ++ * Get the IDXD PCI driver object. ++ * ++ * \return PCI driver. ++ */ ++struct spdk_pci_driver *spdk_pci_idxd_get_driver(void); ++ ++/** ++ * Get the Virtio PCI driver object. ++ * ++ * \return PCI driver. ++ */ ++struct spdk_pci_driver *spdk_pci_virtio_get_driver(void); ++ ++/** ++ * Get PCI driver by name (e.g. "nvme", "vmd", "ioat"). ++ */ ++struct spdk_pci_driver *spdk_pci_get_driver(const char *name); ++ ++/** ++ * Get the NVMe PCI driver object. ++ * ++ * \return PCI driver. ++ */ ++struct spdk_pci_driver *spdk_pci_nvme_get_driver(void); ++ ++/** ++ * Enumerate all PCI devices supported by the provided driver and try to ++ * attach those that weren't attached yet. The provided callback will be ++ * called for each such device and its return code will decide whether that ++ * device is attached or not. Attached devices have to be manually detached ++ * with spdk_pci_device_detach() to be attach-able again. ++ * ++ * During enumeration all registered pci devices with exposed access to ++ * userspace are getting probed internally unless not explicitly specified ++ * on denylist. Because of that it becomes not possible to either use such ++ * devices with another application or unbind the driver (e.g. vfio). ++ * ++ * 2s asynchronous delay is introduced to avoid race conditions between ++ * user space software initialization and in-kernel device handling for ++ * newly inserted devices. Subsequent enumerate call after the delay ++ * shall allow for a successful device attachment. ++ * ++ * \param driver Driver for a specific device type. ++ * \param enum_cb Callback to be called for each non-attached PCI device. ++ * \param enum_ctx Additional context passed to the callback function. ++ * ++ * \return -1 if an internal error occurred or the provided callback returned -1, ++ * 0 otherwise ++ */ ++int spdk_pci_enumerate(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, void *enum_ctx); ++ ++/** ++ * Call the provided function pointer for every enumerated PCI device. ++ * ++ * \param ctx Context parameter to pass to fn. ++ * \param fn Function to call for each PCI device ++ */ ++void spdk_pci_for_each_device(void *ctx, void (*fn)(void *ctx, struct spdk_pci_device *dev)); ++ ++/** ++ * Map a PCI BAR in the current process. ++ * ++ * \param dev PCI device. ++ * \param bar BAR number. ++ * \param mapped_addr A variable to store the virtual address of the mapping. ++ * \param phys_addr A variable to store the physical address of the mapping. ++ * \param size A variable to store the size of the bar (in bytes). 
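/*
 * Illustrative sketch: enumerating NVMe PCI functions with the driver getter
 * and callback contract described above. Returning 1 from the callback
 * reports the device without keeping it attached.
 */
#include "spdk/stdinc.h"
#include "spdk/env.h"

static int
report_cb(void *enum_ctx, struct spdk_pci_device *dev)
{
    (void)enum_ctx;
    printf("found %04x:%02x:%02x.%x\n", dev->addr.domain, dev->addr.bus,
           dev->addr.dev, dev->addr.func);
    return 1;    /* do not attach */
}

static void
enumerate_nvme(void)
{
    if (spdk_pci_enumerate(spdk_pci_nvme_get_driver(), report_cb, NULL) != 0) {
        /* internal error, or a callback returned -1 */
    }
}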
++ * ++ * \return 0 on success. ++ */ ++int spdk_pci_device_map_bar(struct spdk_pci_device *dev, uint32_t bar, ++ void **mapped_addr, uint64_t *phys_addr, uint64_t *size); ++ ++/** ++ * Unmap a PCI BAR from the current process. This happens automatically when ++ * the PCI device is detached. ++ * ++ * \param dev PCI device. ++ * \param bar BAR number. ++ * \param mapped_addr Virtual address of the bar. ++ * ++ * \return 0 on success. ++ */ ++int spdk_pci_device_unmap_bar(struct spdk_pci_device *dev, uint32_t bar, ++ void *mapped_addr); ++ ++/** ++ * Enable PCI device interrupts. (Experimental) ++ * ++ * \param dev PCI device. ++ * ++ * \return 0 on success, negative value on error. ++ */ ++int spdk_pci_device_enable_interrupt(struct spdk_pci_device *dev); ++ ++/** ++ * Disable PCI device interrupts. (Experimental) ++ * ++ * \param dev PCI device. ++ * ++ * \return 0 on success, negative value on error. ++ */ ++int spdk_pci_device_disable_interrupt(struct spdk_pci_device *dev); ++ ++/** ++ * Get an event file descriptor assosiated with a PCI device interrupt. ++ * (Experimental) ++ * ++ * \param dev PCI device. ++ * ++ * \return Event file descriptor on success, negative value on error. ++ */ ++int spdk_pci_device_get_interrupt_efd(struct spdk_pci_device *dev); ++ ++/** ++ * Get the domain of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI device domain. ++ */ ++uint32_t spdk_pci_device_get_domain(struct spdk_pci_device *dev); ++ ++/** ++ * Get the bus number of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI bus number. ++ */ ++uint8_t spdk_pci_device_get_bus(struct spdk_pci_device *dev); ++ ++/** ++ * Get the device number within the PCI bus the device is on. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI device number. ++ */ ++uint8_t spdk_pci_device_get_dev(struct spdk_pci_device *dev); ++ ++/** ++ * Get the particular function number represented by struct spdk_pci_device. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI function number. ++ */ ++uint8_t spdk_pci_device_get_func(struct spdk_pci_device *dev); ++ ++/** ++ * Get the full DomainBDF address of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI address. ++ */ ++struct spdk_pci_addr spdk_pci_device_get_addr(struct spdk_pci_device *dev); ++ ++/** ++ * Get the vendor ID of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return vendor ID. ++ */ ++uint16_t spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev); ++ ++/** ++ * Get the device ID of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return device ID. ++ */ ++uint16_t spdk_pci_device_get_device_id(struct spdk_pci_device *dev); ++ ++/** ++ * Get the subvendor ID of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return subvendor ID. ++ */ ++uint16_t spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev); ++ ++/** ++ * Get the subdevice ID of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return subdevice ID. ++ */ ++uint16_t spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev); ++ ++/** ++ * Get the PCI ID of a PCI device. ++ * ++ * \param dev PCI device. ++ * ++ * \return PCI ID. ++ */ ++struct spdk_pci_id spdk_pci_device_get_id(struct spdk_pci_device *dev); ++ ++/** ++ * Get the NUMA node the PCI device is on. ++ * ++ * \param dev PCI device. ++ * ++ * \return NUMA node index (>= 0). ++ */ ++int spdk_pci_device_get_socket_id(struct spdk_pci_device *dev); ++ ++/** ++ * Serialize the PCIe Device Serial Number into the provided buffer. 
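/*
 * Illustrative sketch: mapping and unmapping BAR 0 of an already-attached
 * device with spdk_pci_device_map_bar()/unmap_bar() declared above.
 */
#include "spdk/env.h"

static int
map_bar0(struct spdk_pci_device *dev)
{
    void *bar = NULL;
    uint64_t phys = 0, size = 0;

    if (spdk_pci_device_map_bar(dev, 0, &bar, &phys, &size) != 0) {
        return -1;
    }

    /* ... access the `size`-byte register space at `bar` ... */

    return spdk_pci_device_unmap_bar(dev, 0, bar);
}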
++ * The buffer will contain a 16-character-long serial number followed by ++ * a NULL terminator. ++ * ++ * \param dev PCI device. ++ * \param sn Buffer to store the serial number in. ++ * \param len Length of buffer. Must be at least 17. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len); ++ ++/** ++ * Claim a PCI device for exclusive SPDK userspace access. ++ * ++ * Uses F_SETLK on a shared memory file with the PCI address embedded in its name. ++ * As long as this file remains open with the lock acquired, other processes will ++ * not be able to successfully call this function on the same PCI device. ++ * ++ * The device can be un-claimed by the owning process with spdk_pci_device_unclaim(). ++ * It will be also unclaimed automatically when detached. ++ * ++ * \param dev PCI device to claim. ++ * ++ * \return -EACCES if the device has already been claimed, ++ * negative errno on unexpected errors, ++ * 0 on success. ++ */ ++int spdk_pci_device_claim(struct spdk_pci_device *dev); ++ ++/** ++ * Undo spdk_pci_device_claim(). ++ * ++ * \param dev PCI device to unclaim. ++ */ ++void spdk_pci_device_unclaim(struct spdk_pci_device *dev); ++ ++/** ++ * Release all resources associated with the given device and detach it. As long ++ * as the PCI device is physically available, it will attachable again. ++ * ++ * \param device PCI device. ++ */ ++void spdk_pci_device_detach(struct spdk_pci_device *device); ++ ++/** ++ * Attach a PCI device. This will bypass all blocked list rules and explicitly ++ * attach a device at the provided address. The return code of the provided ++ * callback will decide whether that device is attached or not. Attached ++ * devices have to be manually detached with spdk_pci_device_detach() to be ++ * attach-able again. ++ * ++ * \param driver Driver for a specific device type. The device will only be ++ * attached if it's supported by this driver. ++ * \param enum_cb Callback to be called for the PCI device once it's found. ++ * \param enum_ctx Additional context passed to the callback function. ++ * \param pci_address Address of the device to attach. ++ * ++ * \return -1 if a device at the provided PCI address couldn't be found, ++ * -1 if an internal error happened or the provided callback returned non-zero, ++ * 0 otherwise ++ */ ++int spdk_pci_device_attach(struct spdk_pci_driver *driver, spdk_pci_enum_cb enum_cb, ++ void *enum_ctx, struct spdk_pci_addr *pci_address); ++ ++/** ++ * Allow the specified PCI device to be probed by the calling process. ++ * ++ * When using spdk_pci_enumerate(), only devices with allowed PCI addresses will ++ * be probed. By default, this is all PCI addresses, but the pci_allowed ++ * and pci_blocked environment options can override this behavior. ++ * This API enables the caller to allow a new PCI address that may have previously ++ * been blocked. ++ * ++ * \param pci_addr PCI address to allow ++ * \return 0 if successful ++ * \return -ENOMEM if environment-specific data structures cannot be allocated ++ * \return -EINVAL if specified PCI address is not valid ++ */ ++int spdk_pci_device_allow(struct spdk_pci_addr *pci_addr); ++ ++/** ++ * Read \c len bytes from the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param buf A buffer to copy the data into. ++ * \param len Number of bytes to read. ++ * \param offset Offset (in bytes) in the PCI config space to start reading from. ++ * ++ * \return 0 on success, -1 on failure. 
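/*
 * Illustrative sketch: taking exclusive ownership of a device before driving
 * it, following the spdk_pci_device_claim() contract above.
 */
#include "spdk/stdinc.h"
#include "spdk/env.h"

static int
use_device_exclusively(struct spdk_pci_device *dev)
{
    int rc = spdk_pci_device_claim(dev);

    if (rc == -EACCES) {
        return rc;    /* another process already owns this device */
    } else if (rc != 0) {
        return rc;    /* unexpected error */
    }

    /* ... program the device ... */

    spdk_pci_device_unclaim(dev);
    return 0;
}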
++ */ ++int spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *buf, uint32_t len, ++ uint32_t offset); ++ ++/** ++ * Write \c len bytes into the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param buf A buffer to copy the data from. ++ * \param len Number of bytes to write. ++ * \param offset Offset (in bytes) in the PCI config space to start writing to. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *buf, uint32_t len, ++ uint32_t offset); ++ ++/** ++ * Read 1 byte from the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A buffer to copy the data into. ++ * \param offset Offset (in bytes) in the PCI config space to start reading from. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset); ++ ++/** ++ * Write 1 byte into the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A value to write. ++ * \param offset Offset (in bytes) in the PCI config space to start writing to. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset); ++ ++/** ++ * Read 2 bytes from the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A buffer to copy the data into. ++ * \param offset Offset (in bytes) in the PCI config space to start reading from. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset); ++ ++/** ++ * Write 2 bytes into the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A value to write. ++ * \param offset Offset (in bytes) in the PCI config space to start writing to. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset); ++ ++/** ++ * Read 4 bytes from the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A buffer to copy the data into. ++ * \param offset Offset (in bytes) in the PCI config space to start reading from. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset); ++ ++/** ++ * Write 4 bytes into the PCI configuration space. ++ * ++ * \param dev PCI device. ++ * \param value A value to write. ++ * \param offset Offset (in bytes) in the PCI config space to start writing to. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset); ++ ++/** ++ * Check if device was requested to be removed from the process. This can be ++ * caused either by physical device hotremoval or OS-triggered removal. In the ++ * latter case, the device may continue to function properly even if this ++ * function returns \c true . The upper-layer driver may check this function ++ * periodically and eventually detach the device. ++ * ++ * \param dev PCI device. ++ * ++ * \return if device was requested to be removed ++ */ ++bool spdk_pci_device_is_removed(struct spdk_pci_device *dev); ++ ++/** ++ * Compare two PCI addresses. ++ * ++ * \param a1 PCI address 1. ++ * \param a2 PCI address 2. 
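/*
 * Illustrative sketch: reading the vendor and device IDs directly from PCI
 * configuration space with the sized accessors above (offsets 0x0 and 0x2
 * per the PCI specification).
 */
#include "spdk/env.h"

static int
read_ids_from_cfg(struct spdk_pci_device *dev, uint16_t *vendor, uint16_t *device)
{
    if (spdk_pci_device_cfg_read16(dev, vendor, 0x0) != 0) {
        return -1;
    }
    return spdk_pci_device_cfg_read16(dev, device, 0x2);
}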
++ * ++ * \return 0 if a1 == a2, less than 0 if a1 < a2, greater than 0 if a1 > a2 ++ */ ++int spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2); ++ ++/** ++ * Convert a string representation of a PCI address into a struct spdk_pci_addr. ++ * ++ * \param addr PCI address output on success. ++ * \param bdf PCI address in domain:bus:device.function format or ++ * domain.bus.device.function format. ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf); ++ ++/** ++ * Convert a struct spdk_pci_addr to a string. ++ * ++ * \param bdf String into which a string will be output in the format ++ * domain:bus:device.function. The string must be at least 14 characters in size. ++ * \param sz Size of bdf in bytes. Must be at least 14. ++ * \param addr PCI address. ++ * ++ * \return 0 on success, or a negated errno on failure. ++ */ ++int spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr); ++ ++/** ++ * Hook a custom PCI device into the PCI layer. The device will be attachable, ++ * enumerable, and will call provided callbacks on each PCI resource access ++ * request. ++ * ++ * \param drv driver that will be able to attach the device ++ * \param dev fully initialized PCI device struct ++ * ++ * \return 0 on success, negative errno otherwise. ++ */ ++int spdk_pci_hook_device(struct spdk_pci_driver *drv, struct spdk_pci_device *dev); ++ ++/** ++ * Un-hook a custom PCI device from the PCI layer. The device must not be attached. ++ * ++ * \param dev fully initialized PCI device struct ++ */ ++void spdk_pci_unhook_device(struct spdk_pci_device *dev); ++ ++/** ++ * Return the type of the PCI device. ++ * ++ * \param dev PCI device ++ * ++ * \return string representing the type of the device ++ */ ++const char *spdk_pci_device_get_type(const struct spdk_pci_device *dev); ++ ++struct spdk_pci_device_provider { ++ const char *name; ++ ++ /** ++ * Callback executed to attach a PCI device on a given address. ++ * ++ * \param addr address of the device. ++ * ++ * \return 0 if the device was attached successfully, negative errno otherwise. ++ */ ++ int (*attach_cb)(const struct spdk_pci_addr *addr); ++ ++ /** ++ * Callback executed to detach a given PCI device. The provider to detach the device is ++ * selected based on the type of the device and the name of the provider (i.e. dev->type == ++ * provider->name). ++ * ++ * \param dev PCI device to detach. ++ */ ++ void (*detach_cb)(struct spdk_pci_device *dev); ++ ++ TAILQ_ENTRY(spdk_pci_device_provider) tailq; ++}; ++ ++/** ++ * Register a PCI device provdier. ++ * ++ * \param provider PCI device provider. ++ */ ++void spdk_pci_register_device_provider(struct spdk_pci_device_provider *provider); ++ ++#define SPDK_PCI_REGISTER_DEVICE_PROVIDER(name, provider) \ ++ static void __attribute__((constructor)) _spdk_pci_register_device_provider_##name(void) \ ++ { \ ++ spdk_pci_register_device_provider(provider); \ ++ } ++ ++/** ++ * Remove any CPU affinity from the current thread. ++ */ ++void spdk_unaffinitize_thread(void); ++ ++/** ++ * Call a function with CPU affinity unset. ++ * ++ * This can be used to run a function that creates other threads without inheriting the calling ++ * thread's CPU affinity. ++ * ++ * \param cb Function to call ++ * \param arg Parameter to the function cb(). ++ * ++ * \return the return value of cb(). 
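/*
 * Illustrative sketch: round-tripping a BDF string through
 * spdk_pci_addr_parse() and spdk_pci_addr_fmt() as documented above. The
 * address used here is arbitrary.
 */
#include "spdk/env.h"

static void
addr_roundtrip(void)
{
    struct spdk_pci_addr addr;
    char bdf[14];    /* doc above requires at least 14 bytes */

    if (spdk_pci_addr_parse(&addr, "0000:5e:00.0") == 0) {
        spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr);
        /* bdf now holds "0000:5e:00.0" again */
    }
}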
++ */ ++void *spdk_call_unaffinitized(void *cb(void *arg), void *arg); ++ ++/** ++ * Page-granularity memory address translation table. ++ */ ++struct spdk_mem_map; ++ ++enum spdk_mem_map_notify_action { ++ SPDK_MEM_MAP_NOTIFY_REGISTER, ++ SPDK_MEM_MAP_NOTIFY_UNREGISTER, ++}; ++ ++typedef int (*spdk_mem_map_notify_cb)(void *cb_ctx, struct spdk_mem_map *map, ++ enum spdk_mem_map_notify_action action, ++ void *vaddr, size_t size); ++ ++typedef int (*spdk_mem_map_contiguous_translations)(uint64_t addr_1, uint64_t addr_2); ++ ++/** ++ * A function table to be implemented by each memory map. ++ */ ++struct spdk_mem_map_ops { ++ spdk_mem_map_notify_cb notify_cb; ++ spdk_mem_map_contiguous_translations are_contiguous; ++}; ++ ++/** ++ * Allocate a virtual memory address translation map. ++ * ++ * \param default_translation Default translation for the map. ++ * \param ops Table of callback functions for map operations. ++ * \param cb_ctx Argument passed to the callback function. ++ * ++ * \return a pointer to the allocated virtual memory address translation map. ++ */ ++struct spdk_mem_map *spdk_mem_map_alloc(uint64_t default_translation, ++ const struct spdk_mem_map_ops *ops, void *cb_ctx); ++ ++/** ++ * Free a memory map previously allocated by spdk_mem_map_alloc(). ++ * ++ * \param pmap Memory map to free. ++ */ ++void spdk_mem_map_free(struct spdk_mem_map **pmap); ++ ++/** ++ * Register an address translation for a range of virtual memory. ++ * ++ * \param map Memory map. ++ * \param vaddr Virtual address of the region to register - must be 2 MB aligned. ++ * \param size Size of the region in bytes - must be multiple of 2 MB in the ++ * current implementation. ++ * \param translation Translation to store in the map for this address range. ++ * ++ * \sa spdk_mem_map_clear_translation(). ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, ++ uint64_t translation); ++ ++/** ++ * Unregister an address translation. ++ * ++ * \param map Memory map. ++ * \param vaddr Virtual address of the region to unregister - must be 2 MB aligned. ++ * \param size Size of the region in bytes - must be multiple of 2 MB in the ++ * current implementation. ++ * ++ * \sa spdk_mem_map_set_translation(). ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size); ++ ++/** ++ * Look up the translation of a virtual address in a memory map. ++ * ++ * \param map Memory map. ++ * \param vaddr Virtual address. ++ * \param size Contains the size of the memory region pointed to by vaddr. ++ * If vaddr is successfully translated, then this is updated with the size of ++ * the memory region for which the translation is valid. ++ * ++ * \return the translation of vaddr stored in the map, or default_translation ++ * as specified in spdk_mem_map_alloc() if vaddr is not present in the map. ++ */ ++uint64_t spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size); ++ ++/** ++ * Register the specified memory region for address translation. ++ * ++ * The memory region must map to pinned huge pages (2MB or greater). ++ * ++ * \param vaddr Virtual address to register. ++ * \param len Length in bytes of the vaddr. ++ * ++ * \return 0 on success, negative errno on failure. 
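/*
 * Illustrative sketch: a translation map that stores an opaque cookie for a
 * 2 MB-aligned region, built on the spdk_mem_map API above. The notify
 * callback simply accepts every (un)registration.
 */
#include "spdk/env.h"

static int
example_notify(void *cb_ctx, struct spdk_mem_map *map,
               enum spdk_mem_map_notify_action action, void *vaddr, size_t size)
{
    (void)cb_ctx; (void)map; (void)action; (void)vaddr; (void)size;
    return 0;
}

static const struct spdk_mem_map_ops g_example_map_ops = {
    .notify_cb = example_notify,
    .are_contiguous = NULL,
};

static void
mem_map_example(void *region_2mb)
{
    struct spdk_mem_map *map = spdk_mem_map_alloc(0, &g_example_map_ops, NULL);
    uint64_t vaddr = (uint64_t)(uintptr_t)region_2mb;
    uint64_t size = 0x200000;

    if (map == NULL) {
        return;
    }
    spdk_mem_map_set_translation(map, vaddr, 0x200000, 0x1234);
    /* returns 0x1234 inside the region, 0 (the default) everywhere else */
    (void)spdk_mem_map_translate(map, vaddr, &size);
    spdk_mem_map_free(&map);
}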
++ */ ++int spdk_mem_register(void *vaddr, size_t len); ++ ++/** ++ * Unregister the specified memory region from vtophys address translation. ++ * ++ * The caller must ensure all in-flight DMA operations to this memory region ++ * are completed or cancelled before calling this function. ++ * ++ * \param vaddr Virtual address to unregister. ++ * \param len Length in bytes of the vaddr. ++ * ++ * \return 0 on success, negative errno on failure. ++ */ ++int spdk_mem_unregister(void *vaddr, size_t len); ++ ++/** ++ * Reserve the address space specified in all memory maps. ++ * ++ * This pre-allocates the necessary space in the memory maps such that ++ * future calls to spdk_mem_register() on that region require no ++ * internal memory allocations. ++ * ++ * \param vaddr Virtual address to reserve ++ * \param len Length in bytes of vaddr ++ * ++ * \return 0 on success, negated errno on failure. ++ */ ++int spdk_mem_reserve(void *vaddr, size_t len); ++ ++/** ++ * Get the address's file descriptor and offset, it works with spdk memory allocation APIs ++ * ++ * \param vaddr Virtual address to get ++ * \param offset Virtual address's map offset to the file descriptor ++ * ++ * \return negative errno on failure, otherwise return the file descriptor ++ */ ++int spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset); ++ ++enum spdk_pci_event_type { ++ SPDK_UEVENT_ADD = 0, ++ SPDK_UEVENT_REMOVE = 1, ++}; ++ ++struct spdk_pci_event { ++ enum spdk_pci_event_type action; ++ struct spdk_pci_addr traddr; ++}; ++ ++typedef void (*spdk_pci_error_handler)(const void *failure_addr, void *ctx); ++ ++/** ++ * Begin listening for PCI bus events. This is used to detect hot-insert and ++ * hot-remove events. Once the system is listening, events may be retrieved ++ * by calling spdk_pci_get_event() periodically. ++ * ++ * \return negative errno on failure, otherwise, return a file descriptor ++ * that may be later passed to spdk_pci_get_event(). ++ */ ++int spdk_pci_event_listen(void); ++ ++/** ++ * Get the next PCI bus event. ++ * ++ * \param fd A file descriptor returned by spdk_pci_event_listen() ++ * \param event An event on the PCI bus ++ * ++ * \return Negative errno on failure. 0 for no event. A positive number ++ * when an event has been returned ++ */ ++int spdk_pci_get_event(int fd, struct spdk_pci_event *event); ++ ++/** ++ * Register a signal handler to handle bus errors on the PCI bus ++ * ++ * \param sighandler Signal bus handler of the PCI bus ++ * \param ctx The arg pass to the registered signal bus handler. ++ * ++ * \return negative errno on failure, otherwise it means successful ++ */ ++int spdk_pci_register_error_handler(spdk_pci_error_handler sighandler, void *ctx); ++ ++/** ++ * Register a signal handler to handle bus errors on the PCI bus ++ * ++ * \param sighandler Signal bus handler of the PCI bus ++ */ ++void spdk_pci_unregister_error_handler(spdk_pci_error_handler sighandler); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/include/spdk/event.h b/include/spdk/event.h +index be8c3ee..2143c30 100644 +--- a/include/spdk/event.h ++++ b/include/spdk/event.h +@@ -1,333 +1,336 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- */ +- +-/** +- * \file +- * Event framework public API. +- * +- * See @ref event_components for an overview of the SPDK event framework API. 
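/*
 * Illustrative sketch: draining pending PCI hot-insert/hot-remove events with
 * spdk_pci_event_listen()/spdk_pci_get_event() declared above.
 */
#include "spdk/stdinc.h"
#include "spdk/env.h"

static void
poll_pci_events(void)
{
    struct spdk_pci_event event;
    int fd = spdk_pci_event_listen();

    if (fd < 0) {
        return;
    }

    while (spdk_pci_get_event(fd, &event) > 0) {
        if (event.action == SPDK_UEVENT_ADD) {
            /* event.traddr identifies the newly inserted device */
        }
    }

    close(fd);
}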
+- */ +- +-#ifndef SPDK_EVENT_H +-#define SPDK_EVENT_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/cpuset.h" +-#include "spdk/init.h" +-#include "spdk/queue.h" +-#include "spdk/log.h" +-#include "spdk/thread.h" +-#include "spdk/assert.h" +- +-#ifdef __cplusplus +-extern "C" { +-#endif +- +-/** +- * Event handler function. +- * +- * \param arg1 Argument 1. +- * \param arg2 Argument 2. +- */ +-typedef void (*spdk_event_fn)(void *arg1, void *arg2); +- +-/** +- * \brief An event is a function that is passed to and called on an lcore. +- */ +-struct spdk_event; +- +-/** +- * \brief A poller is a function that is repeatedly called on an lcore. +- */ +-struct spdk_poller; +- +-/** +- * Callback function for customized shutdown handling of application. +- */ +-typedef void (*spdk_app_shutdown_cb)(void); +- +-/** +- * Signal handler function. +- * +- * \param signal Signal number. +- */ +-typedef void (*spdk_sighandler_t)(int signal); +- +-/** +- * \brief Event framework initialization options +- */ +-struct spdk_app_opts { +- const char *name; +- const char *json_config_file; +- bool json_config_ignore_errors; +- +- /* Hole at bytes 17-23. */ +- uint8_t reserved17[7]; +- +- const char *rpc_addr; /* Can be UNIX domain socket path or IP address + TCP port */ +- const char *reactor_mask; +- const char *tpoint_group_mask; +- +- int shm_id; +- +- /* Hole at bytes 52-55. */ +- uint8_t reserved52[4]; +- +- spdk_app_shutdown_cb shutdown_cb; +- +- bool enable_coredump; +- +- /* Hole at bytes 65-67. */ +- uint8_t reserved65[3]; +- +- int mem_channel; +- int main_core; +- int mem_size; +- bool no_pci; +- bool hugepage_single_segments; +- bool unlink_hugepage; +- +- /* Hole at bytes 83-85. */ +- uint8_t reserved83[5]; +- +- const char *hugedir; +- enum spdk_log_level print_level; +- +- /* Hole at bytes 100-103. */ +- uint8_t reserved100[4]; +- +- size_t num_pci_addr; +- struct spdk_pci_addr *pci_blocked; +- struct spdk_pci_addr *pci_allowed; +- const char *iova_mode; +- +- /* Wait for the associated RPC before initializing subsystems +- * when this flag is enabled. +- */ +- bool delay_subsystem_init; +- +- /* Hole at bytes 137-143. */ +- uint8_t reserved137[7]; +- +- /* Number of trace entries allocated for each core */ +- uint64_t num_entries; +- +- /** Opaque context for use of the env implementation. */ +- void *env_context; +- +- /** +- * for passing user-provided log call +- */ +- logfunc *log; +- +- uint64_t base_virtaddr; +- +- /** +- * The size of spdk_app_opts according to the caller of this library is used for ABI +- * compatibility. The library uses this field to know how many fields in this +- * structure are valid. And the library will populate any remaining fields with default values. +- * After that, new added fields should be put after opts_size. +- */ +- size_t opts_size; +- +- /** +- * Disable default signal handlers. +- * If set to `true`, the shutdown process is not started implicitly by +- * process signals, hence the application is responsible for calling +- * spdk_app_start_shutdown(). +- * +- * Default is `false`. +- */ +- bool disable_signal_handlers; +- +- /* Hole at bytes 185-191. */ +- uint8_t reserved185[7]; +- +- /** +- * The allocated size for the message pool used by the threading library. +- * +- * Default is `SPDK_DEFAULT_MSG_MEMPOOL_SIZE`. +- */ +- size_t msg_mempool_size; +- +- /* +- * If non-NULL, a string array of allowed RPC methods. +- */ +- const char **rpc_allowlist; +- +- /** +- * Used to pass vf_token to vfio_pci driver through DPDK. 
+- * The vf_token is an UUID that shared between SR-IOV PF and VF. +- */ +- const char *vf_token; +-} __attribute__((packed)); +-SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size"); +- +-/** +- * Initialize the default value of opts +- * +- * \param opts Data structure where SPDK will initialize the default options. +- * \param opts_size Must be set to sizeof(struct spdk_app_opts). +- */ +-void spdk_app_opts_init(struct spdk_app_opts *opts, size_t opts_size); +- +-/** +- * Start the framework. +- * +- * Before calling this function, opts must be initialized by +- * spdk_app_opts_init(). Once started, the framework will call start_fn on +- * an spdk_thread running on the current system thread with the +- * argument provided. +- * +- * If opts->delay_subsystem_init is set +- * (e.g. through --wait-for-rpc flag in spdk_app_parse_args()) +- * this function will only start a limited RPC server accepting +- * only a few RPC commands - mostly related to pre-initialization. +- * With this option, the framework won't be started and start_fn +- * won't be called until the user sends an `rpc_framework_start_init` +- * RPC command, which marks the pre-initialization complete and +- * allows start_fn to be finally called. +- * +- * This call will block until spdk_app_stop() is called. If an error +- * condition occurs during the initialization code within spdk_app_start(), +- * this function will immediately return before invoking start_fn. +- * +- * \param opts_user Initialization options used for this application. It should not be +- * NULL. And the opts_size value inside the opts structure should not be zero. +- * \param start_fn Entry point that will execute on an internally created thread +- * once the framework has been started. +- * \param ctx Argument passed to function start_fn. +- * +- * \return 0 on success or non-zero on failure. +- */ +-int spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn, +- void *ctx); +- +-/** +- * Perform final shutdown operations on an application using the event framework. +- */ +-void spdk_app_fini(void); +- +-/** +- * Start shutting down the framework. +- * +- * Typically this function is not called directly, and the shutdown process is +- * started implicitly by a process signal. But in applications that are using +- * SPDK for a subset of its process threads, this function can be called in lieu +- * of a signal. +- */ +-void spdk_app_start_shutdown(void); +- +-/** +- * Stop the framework. +- * +- * This does not wait for all threads to exit. Instead, it kicks off the shutdown +- * process and returns. Once the shutdown process is complete, spdk_app_start() +- * will return. +- * +- * \param rc The rc value specified here will be returned to caller of spdk_app_start(). +- */ +-void spdk_app_stop(int rc); +- +-/** +- * Return the shared memory id for this application. +- * +- * \return shared memory id. +- */ +-int spdk_app_get_shm_id(void); +- +-/** +- * Convert a string containing a CPU core mask into a bitmask +- * +- * \param mask String containing a CPU core mask. +- * \param cpumask Bitmask of CPU cores. +- * +- * \return 0 on success, -1 on failure. +- */ +-int spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask); +- +-/** +- * Get the mask of the CPU cores active for this application +- * +- * \return the bitmask of the active CPU cores. 
+- */ +-const struct spdk_cpuset *spdk_app_get_core_mask(void); +- +-#define SPDK_APP_GETOPT_STRING "c:de:ghi:m:n:p:r:s:uvA:B:L:RW:" +- +-enum spdk_app_parse_args_rvals { +- SPDK_APP_PARSE_ARGS_HELP = 0, +- SPDK_APP_PARSE_ARGS_SUCCESS = 1, +- SPDK_APP_PARSE_ARGS_FAIL = 2 +-}; +-typedef enum spdk_app_parse_args_rvals spdk_app_parse_args_rvals_t; +- +-/** +- * Helper function for parsing arguments and printing usage messages. +- * +- * \param argc Count of arguments in argv parameter array. +- * \param argv Array of command line arguments. +- * \param opts Default options for the application. +- * \param getopt_str String representing the app-specific command line parameters. +- * Characters in this string must not conflict with characters in SPDK_APP_GETOPT_STRING. +- * \param app_long_opts Array of full-name parameters. Can be NULL. +- * \param parse Function pointer to call if an argument in getopt_str is found. +- * \param usage Function pointer to print usage messages for app-specific command +- * line parameters. +- *\return SPDK_APP_PARSE_ARGS_FAIL on failure, SPDK_APP_PARSE_ARGS_SUCCESS on +- * success, SPDK_APP_PARSE_ARGS_HELP if '-h' passed as an option. +- */ +-spdk_app_parse_args_rvals_t spdk_app_parse_args(int argc, char **argv, +- struct spdk_app_opts *opts, const char *getopt_str, +- const struct option *app_long_opts, int (*parse)(int ch, char *arg), +- void (*usage)(void)); +- +-/** +- * Print usage strings for common SPDK command line options. +- * +- * May only be called after spdk_app_parse_args(). +- */ +-void spdk_app_usage(void); +- +-/** +- * Allocate an event to be passed to spdk_event_call(). +- * +- * \param lcore Lcore to run this event. +- * \param fn Function used to execute event. +- * \param arg1 Argument passed to function fn. +- * \param arg2 Argument passed to function fn. +- * +- * \return a pointer to the allocated event. +- */ +-struct spdk_event *spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, +- void *arg1, void *arg2); +- +-/** +- * Pass the given event to the associated lcore and call the function. +- * +- * \param event Event to execute. +- */ +-void spdk_event_call(struct spdk_event *event); +- +-/** +- * Enable or disable monitoring of context switches. +- * +- * \param enabled True to enable, false to disable. +- */ +-void spdk_framework_enable_context_switch_monitor(bool enabled); +- +-/** +- * Return whether context switch monitoring is enabled. +- * +- * \return true if enabled or false otherwise. +- */ +-bool spdk_framework_context_switch_monitor_enabled(void); +- +-#ifdef __cplusplus +-} +-#endif +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. ++ */ ++ ++/** ++ * \file ++ * Event framework public API. ++ * ++ * See @ref event_components for an overview of the SPDK event framework API. ++ */ ++ ++#ifndef SPDK_EVENT_H ++#define SPDK_EVENT_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/cpuset.h" ++#include "spdk/init.h" ++#include "spdk/queue.h" ++#include "spdk/log.h" ++#include "spdk/thread.h" ++#include "spdk/assert.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++/** ++ * Event handler function. ++ * ++ * \param arg1 Argument 1. ++ * \param arg2 Argument 2. ++ */ ++typedef void (*spdk_event_fn)(void *arg1, void *arg2); ++ ++/** ++ * \brief An event is a function that is passed to and called on an lcore. 
++ */ ++struct spdk_event; ++ ++/** ++ * \brief A poller is a function that is repeatedly called on an lcore. ++ */ ++struct spdk_poller; ++ ++/** ++ * Callback function for customized shutdown handling of application. ++ */ ++typedef void (*spdk_app_shutdown_cb)(void); ++ ++/** ++ * Signal handler function. ++ * ++ * \param signal Signal number. ++ */ ++typedef void (*spdk_sighandler_t)(int signal); ++ ++/** ++ * \brief Event framework initialization options ++ */ ++struct spdk_app_opts { ++ const char *name; ++ const char *json_config_file; ++ bool json_config_ignore_errors; ++ ++ /* Hole at bytes 17-23. */ ++ uint8_t reserved17[7]; ++ ++ const char *rpc_addr; /* Can be UNIX domain socket path or IP address + TCP port */ ++ const char *reactor_mask; ++ const char *tpoint_group_mask; ++ ++ int shm_id; ++ ++ /* Hole at bytes 52-55. */ ++ uint8_t reserved52[4]; ++ ++ spdk_app_shutdown_cb shutdown_cb; ++ ++ bool enable_coredump; ++ ++ /* Hole at bytes 65-67. */ ++ uint8_t reserved65[3]; ++ ++ int mem_channel; ++ int main_core; ++ int mem_size; ++ bool no_pci; ++ bool hugepage_single_segments; ++ bool unlink_hugepage; ++ ++ /* Hole at bytes 83-85. */ ++ uint8_t reserved83[5]; ++ ++ const char *hugedir; ++ enum spdk_log_level print_level; ++ ++ /* Hole at bytes 100-103. */ ++ uint8_t reserved100[4]; ++ ++ size_t num_pci_addr; ++ struct spdk_pci_addr *pci_blocked; ++ struct spdk_pci_addr *pci_allowed; ++ const char *iova_mode; ++ ++ /* Wait for the associated RPC before initializing subsystems ++ * when this flag is enabled. ++ */ ++ bool delay_subsystem_init; ++ ++ /* Hole at bytes 137-143. */ ++ uint8_t reserved137[7]; ++ ++ /* Number of trace entries allocated for each core */ ++ uint64_t num_entries; ++ ++ /** Opaque context for use of the env implementation. */ ++ void *env_context; ++ ++ /** ++ * for passing user-provided log call ++ */ ++ logfunc *log; ++ ++ uint64_t base_virtaddr; ++ ++ /** ++ * The size of spdk_app_opts according to the caller of this library is used for ABI ++ * compatibility. The library uses this field to know how many fields in this ++ * structure are valid. And the library will populate any remaining fields with default values. ++ * After that, new added fields should be put after opts_size. ++ */ ++ size_t opts_size; ++ ++ /** ++ * Disable default signal handlers. ++ * If set to `true`, the shutdown process is not started implicitly by ++ * process signals, hence the application is responsible for calling ++ * spdk_app_start_shutdown(). ++ * ++ * Default is `false`. ++ */ ++ bool disable_signal_handlers; ++ ++ /* Hole at bytes 185-190. */ ++ uint8_t reserved185[6]; ++ ++ bool hot_restart; ++ /** ++ * The allocated size for the message pool used by the threading library. ++ * ++ * Default is `SPDK_DEFAULT_MSG_MEMPOOL_SIZE`. ++ */ ++ size_t msg_mempool_size; ++ ++ /* ++ * If non-NULL, a string array of allowed RPC methods. ++ */ ++ const char **rpc_allowlist; ++ ++ /** ++ * Used to pass vf_token to vfio_pci driver through DPDK. ++ * The vf_token is an UUID that shared between SR-IOV PF and VF. ++ */ ++ const char *vf_token; ++} __attribute__((packed)); ++SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size"); ++ ++/** ++ * Initialize the default value of opts ++ * ++ * \param opts Data structure where SPDK will initialize the default options. ++ * \param opts_size Must be set to sizeof(struct spdk_app_opts). ++ */ ++void spdk_app_opts_init(struct spdk_app_opts *opts, size_t opts_size); ++ ++/** ++ * Start the framework. 
++ * ++ * Before calling this function, opts must be initialized by ++ * spdk_app_opts_init(). Once started, the framework will call start_fn on ++ * an spdk_thread running on the current system thread with the ++ * argument provided. ++ * ++ * If opts->delay_subsystem_init is set ++ * (e.g. through --wait-for-rpc flag in spdk_app_parse_args()) ++ * this function will only start a limited RPC server accepting ++ * only a few RPC commands - mostly related to pre-initialization. ++ * With this option, the framework won't be started and start_fn ++ * won't be called until the user sends an `rpc_framework_start_init` ++ * RPC command, which marks the pre-initialization complete and ++ * allows start_fn to be finally called. ++ * ++ * This call will block until spdk_app_stop() is called. If an error ++ * condition occurs during the initialization code within spdk_app_start(), ++ * this function will immediately return before invoking start_fn. ++ * ++ * \param opts_user Initialization options used for this application. It should not be ++ * NULL. And the opts_size value inside the opts structure should not be zero. ++ * \param start_fn Entry point that will execute on an internally created thread ++ * once the framework has been started. ++ * \param ctx Argument passed to function start_fn. ++ * ++ * \return 0 on success or non-zero on failure. ++ */ ++int spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn, ++ void *ctx); ++ ++/** ++ * Perform final shutdown operations on an application using the event framework. ++ */ ++void spdk_app_fini(void); ++ ++/** ++ * Start shutting down the framework. ++ * ++ * Typically this function is not called directly, and the shutdown process is ++ * started implicitly by a process signal. But in applications that are using ++ * SPDK for a subset of its process threads, this function can be called in lieu ++ * of a signal. ++ */ ++void spdk_app_start_shutdown(void); ++ ++/** ++ * Stop the framework. ++ * ++ * This does not wait for all threads to exit. Instead, it kicks off the shutdown ++ * process and returns. Once the shutdown process is complete, spdk_app_start() ++ * will return. ++ * ++ * \param rc The rc value specified here will be returned to caller of spdk_app_start(). ++ */ ++void spdk_app_stop(int rc); ++ ++/** ++ * Return the shared memory id for this application. ++ * ++ * \return shared memory id. ++ */ ++int spdk_app_get_shm_id(void); ++ ++bool spdk_get_shutdown_sig_received(void); ++ ++/** ++ * Convert a string containing a CPU core mask into a bitmask ++ * ++ * \param mask String containing a CPU core mask. ++ * \param cpumask Bitmask of CPU cores. ++ * ++ * \return 0 on success, -1 on failure. ++ */ ++int spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask); ++ ++/** ++ * Get the mask of the CPU cores active for this application ++ * ++ * \return the bitmask of the active CPU cores. ++ */ ++const struct spdk_cpuset *spdk_app_get_core_mask(void); ++ ++#define SPDK_APP_GETOPT_STRING "c:de:ghi:m:n:p:r:s:uvA:B:L:RW:" ++ ++enum spdk_app_parse_args_rvals { ++ SPDK_APP_PARSE_ARGS_HELP = 0, ++ SPDK_APP_PARSE_ARGS_SUCCESS = 1, ++ SPDK_APP_PARSE_ARGS_FAIL = 2 ++}; ++typedef enum spdk_app_parse_args_rvals spdk_app_parse_args_rvals_t; ++ ++/** ++ * Helper function for parsing arguments and printing usage messages. ++ * ++ * \param argc Count of arguments in argv parameter array. ++ * \param argv Array of command line arguments. ++ * \param opts Default options for the application. 
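/*
 * Illustrative sketch: a minimal application built on spdk_app_start()/
 * spdk_app_stop() as documented above. The start routine uses the spdk_msg_fn
 * signature from spdk/thread.h; the opts fields set here (`name`,
 * `reactor_mask`) are the ones shown in struct spdk_app_opts.
 */
#include "spdk/stdinc.h"
#include "spdk/event.h"

static void
start_fn(void *ctx)
{
    (void)ctx;
    /* subsystems are initialized; do the real work, then shut down */
    spdk_app_stop(0);
}

int
main(int argc, char **argv)
{
    struct spdk_app_opts opts = { 0 };
    int rc;

    (void)argc; (void)argv;
    spdk_app_opts_init(&opts, sizeof(opts));
    opts.name = "example_app";
    opts.reactor_mask = "0x1";

    rc = spdk_app_start(&opts, start_fn, NULL);    /* blocks until spdk_app_stop() */
    spdk_app_fini();
    return rc;
}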
++ * \param getopt_str String representing the app-specific command line parameters. ++ * Characters in this string must not conflict with characters in SPDK_APP_GETOPT_STRING. ++ * \param app_long_opts Array of full-name parameters. Can be NULL. ++ * \param parse Function pointer to call if an argument in getopt_str is found. ++ * \param usage Function pointer to print usage messages for app-specific command ++ * line parameters. ++ *\return SPDK_APP_PARSE_ARGS_FAIL on failure, SPDK_APP_PARSE_ARGS_SUCCESS on ++ * success, SPDK_APP_PARSE_ARGS_HELP if '-h' passed as an option. ++ */ ++spdk_app_parse_args_rvals_t spdk_app_parse_args(int argc, char **argv, ++ struct spdk_app_opts *opts, const char *getopt_str, ++ const struct option *app_long_opts, int (*parse)(int ch, char *arg), ++ void (*usage)(void)); ++ ++/** ++ * Print usage strings for common SPDK command line options. ++ * ++ * May only be called after spdk_app_parse_args(). ++ */ ++void spdk_app_usage(void); ++ ++/** ++ * Allocate an event to be passed to spdk_event_call(). ++ * ++ * \param lcore Lcore to run this event. ++ * \param fn Function used to execute event. ++ * \param arg1 Argument passed to function fn. ++ * \param arg2 Argument passed to function fn. ++ * ++ * \return a pointer to the allocated event. ++ */ ++struct spdk_event *spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, ++ void *arg1, void *arg2); ++ ++/** ++ * Pass the given event to the associated lcore and call the function. ++ * ++ * \param event Event to execute. ++ */ ++void spdk_event_call(struct spdk_event *event); ++ ++/** ++ * Enable or disable monitoring of context switches. ++ * ++ * \param enabled True to enable, false to disable. ++ */ ++void spdk_framework_enable_context_switch_monitor(bool enabled); ++ ++/** ++ * Return whether context switch monitoring is enabled. ++ * ++ * \return true if enabled or false otherwise. ++ */ ++bool spdk_framework_context_switch_monitor_enabled(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/include/spdk/init.h b/include/spdk/init.h +index 3bba865..2650a91 100644 +--- a/include/spdk/init.h ++++ b/include/spdk/init.h +@@ -1,79 +1,84 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. All rights reserved. +- */ +- +-/** +- * \file +- * SPDK Initialization Helper +- */ +- +-#ifndef SPDK_INIT_H +-#define SPDK_INIT_H +- +-#include "spdk/stdinc.h" +-#include "spdk/queue.h" +- +-#ifdef __cplusplus +-extern "C" { +-#endif +- +-#define SPDK_DEFAULT_RPC_ADDR "/var/tmp/spdk.sock" +- +-/** +- * Create the SPDK JSON-RPC server and listen at the provided address. The RPC server is optional and is +- * independent of subsystem initialization. The RPC server can be started and stopped at any time. +- * +- * \param listen_addr Path to a unix domain socket to listen on +- * +- * \return Negated errno on failure. 0 on success. +- */ +-int spdk_rpc_initialize(const char *listen_addr); +- +-/** +- * Shut down the SPDK JSON-RPC target +- */ +-void spdk_rpc_finish(void); +- +-typedef void (*spdk_subsystem_init_fn)(int rc, void *ctx); +- +-/** +- * Begin the initialization process for all SPDK subsystems. SPDK is divided into subsystems at a macro-level +- * and each subsystem automatically registers itself with this library at start up using a C +- * constructor. Further, each subsystem can declare other subsystems that it depends on. +- * Calling this function will correctly initialize all subsystems that are present, in the +- * required order. 
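/*
 * Illustrative sketch: sending a function call to another lcore with
 * spdk_event_allocate()/spdk_event_call() above. The handler matches the
 * spdk_event_fn signature.
 */
#include "spdk/stdinc.h"
#include "spdk/event.h"

static char g_hello_msg[] = "hello";

static void
hello_event(void *arg1, void *arg2)
{
    (void)arg2;
    printf("%s\n", (char *)arg1);
}

static void
send_hello(uint32_t target_lcore)
{
    struct spdk_event *ev = spdk_event_allocate(target_lcore, hello_event,
                                                g_hello_msg, NULL);
    if (ev != NULL) {
        spdk_event_call(ev);
    }
}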
+- * +- * \param cb_fn Function called when the process is complete. +- * \param cb_arg User context passed to cb_fn. +- */ +-void spdk_subsystem_init(spdk_subsystem_init_fn cb_fn, void *cb_arg); +- +-/** +- * Like spdk_subsystem_init, but additionally configure each subsystem using the provided JSON config +- * file. This will automatically start a JSON RPC server and then stop it. +- * +- * \param json_config_file Path to a JSON config file. +- * \param rpc_addr Path to a unix domain socket to send configuration RPCs to. +- * \param cb_fn Function called when the process is complete. +- * \param cb_arg User context passed to cb_fn. +- * \param stop_on_error Whether to stop initialization if one of the JSON RPCs fails. +- */ +-void spdk_subsystem_init_from_json_config(const char *json_config_file, const char *rpc_addr, +- spdk_subsystem_init_fn cb_fn, void *cb_arg, +- bool stop_on_error); +- +-typedef void (*spdk_subsystem_fini_fn)(void *ctx); +- +-/** +- * Tear down all of the subsystems in the correct order. +- * +- * \param cb_fn Function called when the process is complete. +- * \param cb_arg User context passed to cb_fn +- */ +-void spdk_subsystem_fini(spdk_subsystem_fini_fn cb_fn, void *cb_arg); +- +-#ifdef __cplusplus +-} +-#endif +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. All rights reserved. ++ */ ++ ++/** ++ * \file ++ * SPDK Initialization Helper ++ */ ++ ++#ifndef SPDK_INIT_H ++#define SPDK_INIT_H ++ ++#include "spdk/stdinc.h" ++#include "spdk/queue.h" ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define SPDK_DEFAULT_RPC_ADDR "/var/tmp/spdk.sock" ++#define RPC_SELECT_INTERVAL 4000 /* 4ms */ ++ ++/** ++ * Create the SPDK JSON-RPC server and listen at the provided address. The RPC server is optional and is ++ * independent of subsystem initialization. The RPC server can be started and stopped at any time. ++ * ++ * \param listen_addr Path to a unix domain socket to listen on ++ * ++ * \return Negated errno on failure. 0 on success. ++ */ ++int spdk_rpc_initialize(const char *listen_addr, int internval); ++ ++/** ++ * Shut down the SPDK JSON-RPC target ++ */ ++void spdk_rpc_finish(void); ++ ++typedef void (*spdk_subsystem_init_fn)(int rc, void *ctx); ++ ++/** ++ * Begin the initialization process for all SPDK subsystems. SPDK is divided into subsystems at a macro-level ++ * and each subsystem automatically registers itself with this library at start up using a C ++ * constructor. Further, each subsystem can declare other subsystems that it depends on. ++ * Calling this function will correctly initialize all subsystems that are present, in the ++ * required order. ++ * ++ * \param cb_fn Function called when the process is complete. ++ * \param cb_arg User context passed to cb_fn. ++ */ ++void spdk_subsystem_init(spdk_subsystem_init_fn cb_fn, void *cb_arg); ++ ++/** ++ * Like spdk_subsystem_init, but additionally configure each subsystem using the provided JSON config ++ * file. This will automatically start a JSON RPC server and then stop it. ++ * ++ * \param json_config_file Path to a JSON config file. ++ * \param rpc_addr Path to a unix domain socket to send configuration RPCs to. ++ * \param cb_fn Function called when the process is complete. ++ * \param cb_arg User context passed to cb_fn. ++ * \param stop_on_error Whether to stop initialization if one of the JSON RPCs fails. 
++ */ ++void spdk_subsystem_init_from_json_config(const char *json_config_file, const char *rpc_addr, ++ spdk_subsystem_init_fn cb_fn, void *cb_arg, ++ bool stop_on_error); ++ ++typedef void (*spdk_subsystem_fini_fn)(void *ctx); ++ ++/** ++ * Tear down all of the subsystems in the correct order. ++ * ++ * \param cb_fn Function called when the process is complete. ++ * \param cb_arg User context passed to cb_fn ++ */ ++void spdk_subsystem_fini(spdk_subsystem_fini_fn cb_fn, void *cb_arg); ++ ++void spdk_ssam_set_hot_restart(bool value); ++ ++bool spdk_ssam_get_hot_restart(void); ++ ++#ifdef __cplusplus ++} ++#endif ++ ++#endif +diff --git a/include/spdk/ssam.h b/include/spdk/ssam.h +new file mode 100644 +index 0000000..2736719 +--- /dev/null ++++ b/include/spdk/ssam.h +@@ -0,0 +1,240 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef SSAM_H ++#define SSAM_H ++ ++#include ++ ++#include "spdk/stdinc.h" ++#include "spdk/cpuset.h" ++#include "spdk/json.h" ++#include "spdk/thread.h" ++#include "spdk/event.h" ++ ++#include "dpak_ssam.h" ++ ++#ifdef DEBUG ++#define ASSERT(f) assert(f) ++#else ++#define ASSERT(f) ((void)0) ++#endif ++ ++#define SPDK_INVALID_TID UINT16_MAX ++#define SPDK_SESSION_TYPE_MAX_LEN 64 ++ ++#define SPDK_SESSION_TYPE_BLK "blk" ++#define SPDK_SESSION_TYPE_SCSI "scsi" ++ ++#define SSAM_SHM "ssam_shm" ++#define SSAM_SHM_PERMIT 0640 ++#define SSAM_STORAGE_READY_FILE "/proc/sdi_storage/storage_ready" ++ ++enum virtio_type ++{ ++ VIRTIO_TYPE_UNKNOWN, ++ VIRTIO_TYPE_BLK, ++ VIRTIO_TYPE_SCSI, ++}; ++ ++/** ++ * ssam subsystem init callback ++ * ++ * \param rc The preceding processing result, ++ * 0 on success, negative errno on error. 
++ */ ++typedef void (*spdk_ssam_init_cb)(int rc); ++ ++/** ++ * ssam subsystem fini callback ++ */ ++typedef void (*spdk_ssam_fini_cb)(void); ++ ++/** ++ * ssam dump config json ++ */ ++void spdk_ssam_config_json(struct spdk_json_write_ctx *w); ++ ++/** ++ * Check if ssam support the global vf id. ++ * ++ * \param gfunc_id ssam global vf id. ++ * ++ * \return -EINVAL indicate gfunc_id invalid, -ENODEV indicate no such vf or ++ * 0 indicate gfunc_id valid. ++ */ ++int spdk_ssam_check_gfunc_id(uint16_t gfunc_id); ++ ++/** ++ * Find a ssam session by global vf id. ++ * ++ * \param gfunc_id ssam global vf id. ++ * ++ * \return ssam session or NULL indicate not find. ++ */ ++struct spdk_ssam_session *spdk_ssam_session_find(uint16_t gfunc_id); ++ ++/** ++ * Get gfunc id by controller name. ++ * ++ * \param name controller name. ++ * ++ * \return gfunc id or SPDK_INVALID_GFUNC_ID gfunc id not find. ++ */ ++uint16_t spdk_ssam_get_gfunc_id_by_name(char *name); ++ ++/** ++ * Get the next ssam device. If there's no more devices to iterate ++ * through, NULL will be returned. ++ * ++ * \param smdev ssam device. If NULL, this function will return the ++ * very first device. ++ * ++ * \return smdev ssam device or NULL indicate no more devices ++ */ ++struct spdk_ssam_dev *spdk_ssam_dev_next(const struct spdk_ssam_dev *smdev); ++ ++/** ++ * Lock the global ssam mutex synchronizing all the ssam device accesses. ++ */ ++void spdk_ssam_lock(void); ++ ++/** ++ * Lock the global ssam mutex synchronizing all the ssam device accesses. ++ * ++ * \return 0 if the mutex could be locked immediately, negative errno otherwise. ++ */ ++int spdk_ssam_trylock(void); ++ ++/** ++ * Unlock the global ssam mutex. ++ */ ++void spdk_ssam_unlock(void); ++ ++/** ++ * \param smsession ssam session. ++ * \param arg user-provided parameter. ++ * ++ * \return 0 on success, negative if failed ++ */ ++typedef int (*spdk_ssam_session_fn)(struct spdk_ssam_session *smsession, void **arg); ++ ++/** ++ * \param smsession ssam session. ++ * \param arg user-provided parameter. ++ */ ++typedef void (*spdk_ssam_session_cpl_fn)(struct spdk_ssam_session *smsession, void **arg); ++ ++/** ++ * \param arg user-provided parameter. ++ * \param rsp spdk_ssam_session_fn call back response value, 0 success, negative if failed. ++ */ ++typedef void (*spdk_ssam_session_rsp_fn)(void *arg, int rsp); ++ ++struct spdk_ssam_session_reg_info { ++ char type_name[SPDK_SESSION_TYPE_MAX_LEN]; ++ spdk_ssam_session_rsp_fn rsp_fn; ++ void *rsp_ctx; ++ uint16_t gfunc_id; ++ uint16_t tid; ++ uint16_t queues; ++ const struct spdk_ssam_session_backend *backend; ++ uint32_t session_ctx_size; ++ char *name; ++ char *dbdf; ++}; ++ ++/** ++ * Construct a ssam blk device. This will create a ssam ++ * blk device and then create a session. Creating the smdev will ++ * start an I/O poller and hog a CPU. If already exist a ssam ++ * blk device, then it will only create a session to this device. ++ * All sessions in the same device share one I/O poller and one CPU. ++ * ssam blk device is tightly associated with given SPDK bdev. ++ * Given bdev can not be changed, unless it has been hotremoved. This ++ * would result in all I/O failing with virtio VIRTIO_BLK_S_IOERR ++ * error code. ++ * ++ * This function is thread-safe. ++ * ++ * \param info session register information. ++ * \param dev_name bdev name to associate with this vhost device ++ * \param readonly if set, all writes to the device will fail with ++ * VIRTIO_BLK_S_IOERR error code. ++ * \param serial means volume id. 
++ * ++ * \return 0 on success, negative errno on error. ++ */ ++int spdk_ssam_blk_construct(struct spdk_ssam_session_reg_info *info, ++ const char *dev_name, bool readonly, char *serial); ++ ++/** ++ * ssam user config init. ++ */ ++int spdk_ssam_user_config_init(void); ++ ++/** ++ * ssam get tid which has minimum device. ++ */ ++uint16_t spdk_ssam_get_tid(void); ++ ++void spdk_ssam_exit(void); ++ ++void spdk_ssam_subsystem_fini(spdk_ssam_fini_cb fini_cb); ++ ++void spdk_ssam_subsystem_init(spdk_ssam_init_cb init_cb); ++ ++int spdk_ssam_scsi_construct(struct spdk_ssam_session_reg_info *info); ++ ++int spdk_ssam_scsi_dev_add_tgt(struct spdk_ssam_session *smsession, int target_num, ++ const char *bdev_name); ++ ++int spdk_ssam_scsi_dev_remove_tgt(struct spdk_ssam_session *smsession, ++ unsigned scsi_tgt_num, spdk_ssam_session_rsp_fn cb_fn, void *cb_arg); ++ ++void ssam_set_shm_created(bool shm_created); ++ ++bool ssam_get_shm_created(void); ++ ++void ssam_poller_start(void); ++ ++void ssam_deinit_device_pcie_list(void); ++ ++int ssam_init_device_pcie_list(void); ++ ++bool spdk_ssam_is_starting(void); ++ ++void ssam_dump_device_pcie_list(struct spdk_json_write_ctx *w); ++ ++uint32_t ssam_get_device_pcie_list_size(void); ++ ++#endif /* SSAM_H */ +\ No newline at end of file +diff --git a/lib/Makefile b/lib/Makefile +index 5cf00b8..c297bc1 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -1,43 +1,45 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +-include $(SPDK_ROOT_DIR)/mk/spdk.lib_deps.mk +- +-DIRS-y += bdev blob blobfs conf dma accel event json jsonrpc \ +- log lvol rpc sock thread trace util nvme vmd nvmf scsi \ +- ioat ut_mock iscsi notify init trace_parser +-ifeq ($(OS),Linux) +-DIRS-y += nbd ftl vfio_user +-ifeq ($(CONFIG_UBLK),y) +-DIRS-y += ublk +-endif +-endif +- +-DIRS-$(CONFIG_OCF) += env_ocf +-DIRS-$(CONFIG_IDXD) += idxd +-DIRS-$(CONFIG_VHOST) += vhost +-DIRS-$(CONFIG_VIRTIO) += virtio +-DIRS-$(CONFIG_VBDEV_COMPRESS) += reduce +-DIRS-$(CONFIG_RDMA) += rdma +-DIRS-$(CONFIG_VFIO_USER) += vfu_tgt +- +-ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) +-DIRS-y += mlx5 +-endif +-# If CONFIG_ENV is pointing at a directory in lib, build it. +-# Out-of-tree env implementations must be built separately by the user. +-ENV_NAME := $(notdir $(CONFIG_ENV)) +-ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME)) +-DIRS-y += $(ENV_NAME) +-endif +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++include $(SPDK_ROOT_DIR)/mk/spdk.lib_deps.mk ++ ++DIRS-y += bdev blob blobfs conf dma accel event json jsonrpc \ ++ log lvol rpc sock thread trace util nvme vmd nvmf scsi \ ++ ioat ut_mock iscsi notify init trace_parser ++ifeq ($(OS),Linux) ++DIRS-y += nbd ftl vfio_user ++ifeq ($(CONFIG_UBLK),y) ++DIRS-y += ublk ++endif ++endif ++ ++DIRS-$(CONFIG_OCF) += env_ocf ++DIRS-$(CONFIG_IDXD) += idxd ++DIRS-$(CONFIG_VHOST) += vhost ++DIRS-$(CONFIG_SSAM) += ssam ++DIRS-$(CONFIG_SSAM) += ssam_adapter ++DIRS-$(CONFIG_VIRTIO) += virtio ++DIRS-$(CONFIG_VBDEV_COMPRESS) += reduce ++DIRS-$(CONFIG_RDMA) += rdma ++DIRS-$(CONFIG_VFIO_USER) += vfu_tgt ++ ++ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) ++DIRS-y += mlx5 ++endif ++# If CONFIG_ENV is pointing at a directory in lib, build it. ++# Out-of-tree env implementations must be built separately by the user. ++ENV_NAME := $(notdir $(CONFIG_ENV)) ++ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME)) ++DIRS-y += $(ENV_NAME) ++endif ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/lib/bdev/bdev.c b/lib/bdev/bdev.c +index c7bd1a3..f51628a 100644 +--- a/lib/bdev/bdev.c ++++ b/lib/bdev/bdev.c +@@ -1,8643 +1,8662 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +- +-#include "spdk/config.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/likely.h" +-#include "spdk/queue.h" +-#include "spdk/nvme_spec.h" +-#include "spdk/scsi_spec.h" +-#include "spdk/notify.h" +-#include "spdk/util.h" +-#include "spdk/trace.h" +-#include "spdk/dma.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +- +-#include "bdev_internal.h" +-#include "spdk_internal/trace_defs.h" +- +-#ifdef SPDK_CONFIG_VTUNE +-#include "ittnotify.h" +-#include "ittnotify_types.h" +-int __itt_init_ittlib(const char *, __itt_group_id); +-#endif +- +-#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) +-#define SPDK_BDEV_IO_CACHE_SIZE 256 +-#define SPDK_BDEV_AUTO_EXAMINE true +-#define BUF_SMALL_POOL_SIZE 8191 +-#define BUF_LARGE_POOL_SIZE 1023 +-#define BUF_SMALL_CACHE_SIZE 128 +-#define BUF_LARGE_CACHE_SIZE 16 +-#define NOMEM_THRESHOLD_COUNT 8 +- +-#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 +-#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 +-#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 +-#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 +-#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) +-#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX +-#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 +- +-/* The maximum number of children requests for a UNMAP or WRITE ZEROES command +- * when splitting into children requests at a time. +- */ +-#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) +-#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 +- +-/* The maximum number of children requests for a COPY command +- * when splitting into children requests at a time. 
+- */ +-#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) +- +-SPDK_LOG_DEPRECATION_REGISTER(bdev_register_examine_thread, +- "bdev register and examine on non-app thread", "SPDK 23.05", 0); +- +-SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); +- +-static const char *qos_rpc_type[] = {"rw_ios_per_sec", +- "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" +- }; +- +-TAILQ_HEAD(spdk_bdev_list, spdk_bdev); +- +-RB_HEAD(bdev_name_tree, spdk_bdev_name); +- +-static int +-bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) +-{ +- return strcmp(name1->name, name2->name); +-} +- +-RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); +- +-struct spdk_bdev_mgr { +- struct spdk_mempool *bdev_io_pool; +- +- void *zero_buffer; +- +- TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; +- +- struct spdk_bdev_list bdevs; +- struct bdev_name_tree bdev_names; +- +- bool init_complete; +- bool module_init_complete; +- +- struct spdk_spinlock spinlock; +- +-#ifdef SPDK_CONFIG_VTUNE +- __itt_domain *domain; +-#endif +-}; +- +-static struct spdk_bdev_mgr g_bdev_mgr = { +- .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), +- .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), +- .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), +- .init_complete = false, +- .module_init_complete = false, +-}; +- +-static void +-__attribute__((constructor)) +-_bdev_init(void) +-{ +- spdk_spin_init(&g_bdev_mgr.spinlock); +-} +- +-typedef void (*lock_range_cb)(void *ctx, int status); +- +-typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); +- +-struct lba_range { +- uint64_t offset; +- uint64_t length; +- void *locked_ctx; +- struct spdk_bdev_channel *owner_ch; +- TAILQ_ENTRY(lba_range) tailq; +-}; +- +-static struct spdk_bdev_opts g_bdev_opts = { +- .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, +- .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, +- .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, +- .small_buf_pool_size = BUF_SMALL_POOL_SIZE, +- .large_buf_pool_size = BUF_LARGE_POOL_SIZE, +-}; +- +-static spdk_bdev_init_cb g_init_cb_fn = NULL; +-static void *g_init_cb_arg = NULL; +- +-static spdk_bdev_fini_cb g_fini_cb_fn = NULL; +-static void *g_fini_cb_arg = NULL; +-static struct spdk_thread *g_fini_thread = NULL; +- +-struct spdk_bdev_qos_limit { +- /** IOs or bytes allowed per second (i.e., 1s). */ +- uint64_t limit; +- +- /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). +- * For remaining bytes, allowed to run negative if an I/O is submitted when +- * some bytes are remaining, but the I/O is bigger than that amount. The +- * excess will be deducted from the next timeslice. +- */ +- int64_t remaining_this_timeslice; +- +- /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ +- uint32_t min_per_timeslice; +- +- /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ +- uint32_t max_per_timeslice; +- +- /** Function to check whether to queue the IO. */ +- bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); +- +- /** Function to update for the submitted IO. */ +- void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); +-}; +- +-struct spdk_bdev_qos { +- /** Types of structure of rate limits. */ +- struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; +- +- /** The channel that all I/O are funneled through. 
*/ +- struct spdk_bdev_channel *ch; +- +- /** The thread on which the poller is running. */ +- struct spdk_thread *thread; +- +- /** Queue of I/O waiting to be issued. */ +- bdev_io_tailq_t queued; +- +- /** Size of a timeslice in tsc ticks. */ +- uint64_t timeslice_size; +- +- /** Timestamp of start of last timeslice. */ +- uint64_t last_timeslice; +- +- /** Poller that processes queued I/O commands each time slice. */ +- struct spdk_poller *poller; +-}; +- +-struct spdk_bdev_mgmt_channel { +- /* +- * Each thread keeps a cache of bdev_io - this allows +- * bdev threads which are *not* DPDK threads to still +- * benefit from a per-thread bdev_io cache. Without +- * this, non-DPDK threads fetching from the mempool +- * incur a cmpxchg on get and put. +- */ +- bdev_io_stailq_t per_thread_cache; +- uint32_t per_thread_cache_count; +- uint32_t bdev_io_cache_size; +- +- struct spdk_iobuf_channel iobuf; +- +- TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; +- TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; +-}; +- +-/* +- * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device +- * will queue here their IO that awaits retry. It makes it possible to retry sending +- * IO to one bdev after IO from other bdev completes. +- */ +-struct spdk_bdev_shared_resource { +- /* The bdev management channel */ +- struct spdk_bdev_mgmt_channel *mgmt_ch; +- +- /* +- * Count of I/O submitted to bdev module and waiting for completion. +- * Incremented before submit_request() is called on an spdk_bdev_io. +- */ +- uint64_t io_outstanding; +- +- /* +- * Queue of IO awaiting retry because of a previous NOMEM status returned +- * on this channel. +- */ +- bdev_io_tailq_t nomem_io; +- +- /* +- * Threshold which io_outstanding must drop to before retrying nomem_io. +- */ +- uint64_t nomem_threshold; +- +- /* I/O channel allocated by a bdev module */ +- struct spdk_io_channel *shared_ch; +- +- /* Refcount of bdev channels using this resource */ +- uint32_t ref; +- +- TAILQ_ENTRY(spdk_bdev_shared_resource) link; +-}; +- +-#define BDEV_CH_RESET_IN_PROGRESS (1 << 0) +-#define BDEV_CH_QOS_ENABLED (1 << 1) +- +-struct spdk_bdev_channel { +- struct spdk_bdev *bdev; +- +- /* The channel for the underlying device */ +- struct spdk_io_channel *channel; +- +- /* Per io_device per thread data */ +- struct spdk_bdev_shared_resource *shared_resource; +- +- struct spdk_bdev_io_stat *stat; +- +- /* +- * Count of I/O submitted to the underlying dev module through this channel +- * and waiting for completion. +- */ +- uint64_t io_outstanding; +- +- /* +- * List of all submitted I/Os including I/O that are generated via splitting. +- */ +- bdev_io_tailq_t io_submitted; +- +- /* +- * List of spdk_bdev_io that are currently queued because they write to a locked +- * LBA range. 
+- */ +- bdev_io_tailq_t io_locked; +- +- uint32_t flags; +- +- struct spdk_histogram_data *histogram; +- +-#ifdef SPDK_CONFIG_VTUNE +- uint64_t start_tsc; +- uint64_t interval_tsc; +- __itt_string_handle *handle; +- struct spdk_bdev_io_stat *prev_stat; +-#endif +- +- bdev_io_tailq_t queued_resets; +- +- lba_range_tailq_t locked_ranges; +-}; +- +-struct media_event_entry { +- struct spdk_bdev_media_event event; +- TAILQ_ENTRY(media_event_entry) tailq; +-}; +- +-#define MEDIA_EVENT_POOL_SIZE 64 +- +-struct spdk_bdev_desc { +- struct spdk_bdev *bdev; +- struct spdk_thread *thread; +- struct { +- spdk_bdev_event_cb_t event_fn; +- void *ctx; +- } callback; +- bool closed; +- bool write; +- bool memory_domains_supported; +- struct spdk_spinlock spinlock; +- uint32_t refs; +- TAILQ_HEAD(, media_event_entry) pending_media_events; +- TAILQ_HEAD(, media_event_entry) free_media_events; +- struct media_event_entry *media_events_buffer; +- TAILQ_ENTRY(spdk_bdev_desc) link; +- +- uint64_t timeout_in_sec; +- spdk_bdev_io_timeout_cb cb_fn; +- void *cb_arg; +- struct spdk_poller *io_timeout_poller; +-}; +- +-struct spdk_bdev_iostat_ctx { +- struct spdk_bdev_io_stat *stat; +- spdk_bdev_get_device_stat_cb cb; +- void *cb_arg; +-}; +- +-struct set_qos_limit_ctx { +- void (*cb_fn)(void *cb_arg, int status); +- void *cb_arg; +- struct spdk_bdev *bdev; +-}; +- +-struct spdk_bdev_channel_iter { +- spdk_bdev_for_each_channel_msg fn; +- spdk_bdev_for_each_channel_done cpl; +- struct spdk_io_channel_iter *i; +- void *ctx; +-}; +- +-struct spdk_bdev_io_error_stat { +- uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; +-}; +- +-#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) +-#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) +-#define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) +-#define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) +- +-static inline void bdev_io_complete(void *ctx); +- +-static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +-static void bdev_write_zero_buffer_next(void *_bdev_io); +- +-static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx); +-static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); +- +-static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, +- uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts, bool copy_opts); +-static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts, bool copy_opts); +- +-static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, +- uint64_t offset, uint64_t length, +- lock_range_cb cb_fn, void *cb_arg); +- +-static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, +- uint64_t offset, uint64_t length, +- lock_range_cb cb_fn, void *cb_arg); +- +-static inline void bdev_io_complete(void *ctx); +- +-static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); +-static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel 
*ch, struct spdk_bdev_io *bio_to_abort); +- +-void +-spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) +-{ +- if (!opts) { +- SPDK_ERRLOG("opts should not be NULL\n"); +- return; +- } +- +- if (!opts_size) { +- SPDK_ERRLOG("opts_size should not be zero value\n"); +- return; +- } +- +- opts->opts_size = opts_size; +- +-#define SET_FIELD(field) \ +- if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ +- opts->field = g_bdev_opts.field; \ +- } \ +- +- SET_FIELD(bdev_io_pool_size); +- SET_FIELD(bdev_io_cache_size); +- SET_FIELD(bdev_auto_examine); +- SET_FIELD(small_buf_pool_size); +- SET_FIELD(large_buf_pool_size); +- +- /* Do not remove this statement, you should always update this statement when you adding a new field, +- * and do not forget to add the SET_FIELD statement for your added field. */ +- SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); +- +-#undef SET_FIELD +-} +- +-SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", +- "v23.05", 0); +-SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", +- "v23.05", 0); +-int +-spdk_bdev_set_opts(struct spdk_bdev_opts *opts) +-{ +- struct spdk_iobuf_opts iobuf_opts; +- uint32_t min_pool_size; +- int rc; +- +- if (!opts) { +- SPDK_ERRLOG("opts cannot be NULL\n"); +- return -1; +- } +- +- if (!opts->opts_size) { +- SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); +- return -1; +- } +- +- /* +- * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem +- * initialization. A second mgmt_ch will be created on the same thread when the application starts +- * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
+- */ +- min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); +- if (opts->bdev_io_pool_size < min_pool_size) { +- SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 +- " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, +- spdk_thread_get_count()); +- SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); +- return -1; +- } +- +- if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { +- SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); +- } +- if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { +- SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); +- } +- +-#define SET_FIELD(field) \ +- if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ +- g_bdev_opts.field = opts->field; \ +- } \ +- +- SET_FIELD(bdev_io_pool_size); +- SET_FIELD(bdev_io_cache_size); +- SET_FIELD(bdev_auto_examine); +- SET_FIELD(small_buf_pool_size); +- SET_FIELD(large_buf_pool_size); +- +- spdk_iobuf_get_opts(&iobuf_opts); +- iobuf_opts.small_pool_count = opts->small_buf_pool_size; +- iobuf_opts.large_pool_count = opts->large_buf_pool_size; +- +- rc = spdk_iobuf_set_opts(&iobuf_opts); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to set iobuf opts\n"); +- return -1; +- } +- +- g_bdev_opts.opts_size = opts->opts_size; +- +-#undef SET_FIELD +- +- return 0; +-} +- +-static struct spdk_bdev * +-bdev_get_by_name(const char *bdev_name) +-{ +- struct spdk_bdev_name find; +- struct spdk_bdev_name *res; +- +- find.name = (char *)bdev_name; +- res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); +- if (res != NULL) { +- return res->bdev; +- } +- +- return NULL; +-} +- +-struct spdk_bdev * +-spdk_bdev_get_by_name(const char *bdev_name) +-{ +- struct spdk_bdev *bdev; +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- bdev = bdev_get_by_name(bdev_name); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- return bdev; +-} +- +-struct bdev_io_status_string { +- enum spdk_bdev_io_status status; +- const char *str; +-}; +- +-static const struct bdev_io_status_string bdev_io_status_strings[] = { +- { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, +- { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, +- { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, +- { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, +- { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, +- { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, +- { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, +- { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, +- { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, +- { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, +-}; +- +-static const char * +-bdev_io_status_get_string(enum spdk_bdev_io_status status) +-{ +- uint32_t i; +- +- for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { +- if (bdev_io_status_strings[i].status == status) { +- return bdev_io_status_strings[i].str; +- } +- } +- +- return "reserved"; +-} +- +-struct spdk_bdev_wait_for_examine_ctx { +- struct spdk_poller *poller; +- spdk_bdev_wait_for_examine_cb cb_fn; +- void *cb_arg; +-}; +- +-static bool bdev_module_all_actions_completed(void); +- +-static int +-bdev_wait_for_examine_cb(void *arg) +-{ +- struct spdk_bdev_wait_for_examine_ctx *ctx = arg; +- +- if (!bdev_module_all_actions_completed()) { +- return SPDK_POLLER_IDLE; +- } +- +- spdk_poller_unregister(&ctx->poller); +- ctx->cb_fn(ctx->cb_arg); +- free(ctx); +- +- return SPDK_POLLER_BUSY; +-} +- +-int +-spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void 
*cb_arg) +-{ +- struct spdk_bdev_wait_for_examine_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- return -ENOMEM; +- } +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); +- +- return 0; +-} +- +-struct spdk_bdev_examine_item { +- char *name; +- TAILQ_ENTRY(spdk_bdev_examine_item) link; +-}; +- +-TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); +- +-struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( +- g_bdev_examine_allowlist); +- +-static inline bool +-bdev_examine_allowlist_check(const char *name) +-{ +- struct spdk_bdev_examine_item *item; +- TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { +- if (strcmp(name, item->name) == 0) { +- return true; +- } +- } +- return false; +-} +- +-static inline void +-bdev_examine_allowlist_free(void) +-{ +- struct spdk_bdev_examine_item *item; +- while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { +- item = TAILQ_FIRST(&g_bdev_examine_allowlist); +- TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); +- free(item->name); +- free(item); +- } +-} +- +-static inline bool +-bdev_in_examine_allowlist(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_alias *tmp; +- if (bdev_examine_allowlist_check(bdev->name)) { +- return true; +- } +- TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { +- if (bdev_examine_allowlist_check(tmp->alias.name)) { +- return true; +- } +- } +- return false; +-} +- +-static inline bool +-bdev_ok_to_examine(struct spdk_bdev *bdev) +-{ +- if (g_bdev_opts.bdev_auto_examine) { +- return true; +- } else { +- return bdev_in_examine_allowlist(bdev); +- } +-} +- +-static void +-bdev_examine(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_module *module; +- uint32_t action; +- +- if (!bdev_ok_to_examine(bdev)) { +- return; +- } +- +- TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (module->examine_config) { +- spdk_spin_lock(&module->internal.spinlock); +- action = module->internal.action_in_progress; +- module->internal.action_in_progress++; +- spdk_spin_unlock(&module->internal.spinlock); +- module->examine_config(bdev); +- if (action != module->internal.action_in_progress) { +- SPDK_ERRLOG("examine_config for module %s did not call " +- "spdk_bdev_module_examine_done()\n", module->name); +- } +- } +- } +- +- if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { +- module = bdev->internal.claim.v1.module; +- if (module->examine_disk) { +- spdk_spin_lock(&module->internal.spinlock); +- module->internal.action_in_progress++; +- spdk_spin_unlock(&module->internal.spinlock); +- module->examine_disk(bdev); +- } +- return; +- } +- +- TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (module->examine_disk) { +- spdk_spin_lock(&module->internal.spinlock); +- module->internal.action_in_progress++; +- spdk_spin_unlock(&module->internal.spinlock); +- module->examine_disk(bdev); +- } +- } +-} +- +-int +-spdk_bdev_examine(const char *name) +-{ +- struct spdk_bdev *bdev; +- struct spdk_bdev_examine_item *item; +- +- if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { +- SPDK_LOG_DEPRECATED(bdev_register_examine_thread); +- } +- +- if (g_bdev_opts.bdev_auto_examine) { +- SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); +- return -EINVAL; +- } +- +- if (bdev_examine_allowlist_check(name)) { +- SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); +- return -EEXIST; +- } +- +- item = calloc(1, sizeof(*item)); +- if 
(!item) { +- return -ENOMEM; +- } +- item->name = strdup(name); +- if (!item->name) { +- free(item); +- return -ENOMEM; +- } +- TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); +- +- bdev = spdk_bdev_get_by_name(name); +- if (bdev) { +- bdev_examine(bdev); +- } +- return 0; +-} +- +-static inline void +-bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) +-{ +- struct spdk_bdev_examine_item *item; +- TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_examine"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", item->name); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- } +-} +- +-struct spdk_bdev * +-spdk_bdev_first(void) +-{ +- struct spdk_bdev *bdev; +- +- bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); +- if (bdev) { +- SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); +- } +- +- return bdev; +-} +- +-struct spdk_bdev * +-spdk_bdev_next(struct spdk_bdev *prev) +-{ +- struct spdk_bdev *bdev; +- +- bdev = TAILQ_NEXT(prev, internal.link); +- if (bdev) { +- SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); +- } +- +- return bdev; +-} +- +-static struct spdk_bdev * +-_bdev_next_leaf(struct spdk_bdev *bdev) +-{ +- while (bdev != NULL) { +- if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { +- return bdev; +- } else { +- bdev = TAILQ_NEXT(bdev, internal.link); +- } +- } +- +- return bdev; +-} +- +-struct spdk_bdev * +-spdk_bdev_first_leaf(void) +-{ +- struct spdk_bdev *bdev; +- +- bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); +- +- if (bdev) { +- SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); +- } +- +- return bdev; +-} +- +-struct spdk_bdev * +-spdk_bdev_next_leaf(struct spdk_bdev *prev) +-{ +- struct spdk_bdev *bdev; +- +- bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); +- +- if (bdev) { +- SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); +- } +- +- return bdev; +-} +- +-static inline bool +-bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) +-{ +- return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; +-} +- +-void +-spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +-{ +- struct iovec *iovs; +- +- if (bdev_io->u.bdev.iovs == NULL) { +- bdev_io->u.bdev.iovs = &bdev_io->iov; +- bdev_io->u.bdev.iovcnt = 1; +- } +- +- iovs = bdev_io->u.bdev.iovs; +- +- assert(iovs != NULL); +- assert(bdev_io->u.bdev.iovcnt >= 1); +- +- iovs[0].iov_base = buf; +- iovs[0].iov_len = len; +-} +- +-void +-spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +-{ +- assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); +- bdev_io->u.bdev.md_buf = md_buf; +-} +- +-static bool +-_is_buf_allocated(const struct iovec *iovs) +-{ +- if (iovs == NULL) { +- return false; +- } +- +- return iovs[0].iov_base != NULL; +-} +- +-static bool +-_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) +-{ +- int i; +- uintptr_t iov_base; +- +- if (spdk_likely(alignment == 1)) { +- return true; +- } +- +- for (i = 0; i < iovcnt; i++) { +- iov_base = (uintptr_t)iovs[i].iov_base; +- if ((iov_base & (alignment - 1)) != 0) { +- return false; +- } +- } +- +- return true; +-} +- +-static void +-bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) +-{ +- struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); +- void 
*buf; +- +- if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { +- buf = bdev_io->internal.buf; +- bdev_io->internal.buf = NULL; +- bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); +- bdev_io->internal.get_aux_buf_cb = NULL; +- } else { +- assert(bdev_io->internal.get_buf_cb != NULL); +- bdev_io->internal.get_buf_cb(ch, bdev_io, status); +- bdev_io->internal.get_buf_cb = NULL; +- } +-} +- +-static void +-_bdev_io_pull_buffer_cpl(void *ctx, int rc) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- if (rc) { +- SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- bdev_io_get_buf_complete(bdev_io, !rc); +-} +- +-static void +-_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) +-{ +- int rc = 0; +- +- /* save original md_buf */ +- bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; +- bdev_io->internal.orig_md_iov.iov_len = len; +- bdev_io->internal.bounce_md_iov.iov_base = md_buf; +- bdev_io->internal.bounce_md_iov.iov_len = len; +- /* set bounce md_buf */ +- bdev_io->u.bdev.md_buf = md_buf; +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- if (bdev_io_use_memory_domain(bdev_io)) { +- rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, +- bdev_io->internal.ext_opts->memory_domain_ctx, +- &bdev_io->internal.orig_md_iov, 1, +- &bdev_io->internal.bounce_md_iov, 1, +- bdev_io->internal.data_transfer_cpl, +- bdev_io); +- if (rc == 0) { +- /* Continue to submit IO in completion callback */ +- return; +- } +- SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", +- spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); +- } else { +- memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); +- } +- } +- +- assert(bdev_io->internal.data_transfer_cpl); +- bdev_io->internal.data_transfer_cpl(bdev_io, rc); +-} +- +-static void +-_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- uint64_t md_len; +- void *buf; +- +- if (spdk_bdev_is_md_separate(bdev)) { +- buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; +- md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; +- +- assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); +- +- if (bdev_io->u.bdev.md_buf != NULL) { +- _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); +- return; +- } else { +- spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); +- } +- } +- +- bdev_io_get_buf_complete(bdev_io, true); +-} +- +-static void +-_bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- if (rc) { +- SPDK_ERRLOG("Failed to get data buffer\n"); +- assert(bdev_io->internal.data_transfer_cpl); +- bdev_io->internal.data_transfer_cpl(bdev_io, rc); +- return; +- } +- +- _bdev_io_set_md_buf(bdev_io); +-} +- +-static void +-_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, +- bdev_copy_bounce_buffer_cpl cpl_cb) +-{ +- int rc = 0; +- +- bdev_io->internal.data_transfer_cpl = cpl_cb; +- /* save original iovec */ +- bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; +- bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; +- /* set bounce iov */ +- bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; +- bdev_io->u.bdev.iovcnt = 1; +- /* set bounce buffer for this operation */ +- bdev_io->u.bdev.iovs[0].iov_base = buf; +- bdev_io->u.bdev.iovs[0].iov_len = len; +- /* if 
this is write path, copy data from original buffer to bounce buffer */ +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- if (bdev_io_use_memory_domain(bdev_io)) { +- rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, +- bdev_io->internal.ext_opts->memory_domain_ctx, +- bdev_io->internal.orig_iovs, +- (uint32_t) bdev_io->internal.orig_iovcnt, +- bdev_io->u.bdev.iovs, 1, +- _bdev_io_pull_bounce_data_buf_done, +- bdev_io); +- if (rc == 0) { +- /* Continue to submit IO in completion callback */ +- return; +- } +- SPDK_ERRLOG("Failed to pull data from memory domain %s\n", +- spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); +- } else { +- spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); +- } +- } +- +- _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); +-} +- +-static void +-_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- bool buf_allocated; +- uint64_t alignment; +- void *aligned_buf; +- +- bdev_io->internal.buf = buf; +- +- if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { +- bdev_io_get_buf_complete(bdev_io, true); +- return; +- } +- +- alignment = spdk_bdev_get_buf_align(bdev); +- buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); +- aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); +- +- if (buf_allocated) { +- _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); +- /* Continue in completion callback */ +- return; +- } else { +- spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); +- } +- +- _bdev_io_set_md_buf(bdev_io); +-} +- +-static inline uint64_t +-bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- uint64_t md_len, alignment; +- +- md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; +- +- /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ +- alignment = spdk_bdev_get_buf_align(bdev) - 1; +- +- return len + alignment + md_len; +-} +- +-static void +-_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) +-{ +- struct spdk_bdev_mgmt_channel *ch; +- +- ch = bdev_io->internal.ch->shared_resource->mgmt_ch; +- spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); +-} +- +-static void +-bdev_io_put_buf(struct spdk_bdev_io *bdev_io) +-{ +- assert(bdev_io->internal.buf != NULL); +- _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); +- bdev_io->internal.buf = NULL; +-} +- +-void +-spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) +-{ +- uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; +- +- assert(buf != NULL); +- _bdev_io_put_buf(bdev_io, buf, len); +-} +- +-static void +-bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) +-{ +- struct spdk_bdev *bdev = bdev_ch->bdev; +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- struct spdk_bdev_io *bdev_io; +- +- if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { +- /* +- * Allow some more I/O to complete before retrying the nomem_io queue. +- * Some drivers (such as nvme) cannot immediately take a new I/O in +- * the context of a completion, because the resources for the I/O are +- * not released until control returns to the bdev poller. 
Also, we +- * may require several small I/O to complete before a larger I/O +- * (that requires splitting) can be submitted. +- */ +- return; +- } +- +- while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { +- bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); +- TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); +- bdev_io->internal.ch->io_outstanding++; +- shared_resource->io_outstanding++; +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; +- bdev_io->internal.error.nvme.cdw0 = 0; +- bdev_io->num_retries++; +- bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); +- if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { +- break; +- } +- } +-} +- +-static inline void +-_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, +- struct spdk_bdev_shared_resource *shared_resource) +-{ +- assert(bdev_ch->io_outstanding > 0); +- assert(shared_resource->io_outstanding > 0); +- bdev_ch->io_outstanding--; +- shared_resource->io_outstanding--; +-} +- +-static inline bool +-_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- +- if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { +- TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); +- /* +- * Wait for some of the outstanding I/O to complete before we +- * retry any of the nomem_io. Normally we will wait for +- * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue +- * depth channels we will instead wait for half to complete. +- */ +- shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, +- (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); +- return true; +- } +- +- if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { +- bdev_ch_retry_io(bdev_ch); +- } +- +- return false; +-} +- +-static void +-_bdev_io_complete_push_bounce_done(void *ctx, int rc) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- +- if (rc) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- /* We want to free the bounce buffer here since we know we're done with it (as opposed +- * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
+- */ +- bdev_io_put_buf(bdev_io); +- +- /* Continue with IO completion flow */ +- _bdev_io_decrement_outstanding(bdev_ch, shared_resource); +- if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { +- return; +- } +- +- bdev_io_complete(bdev_io); +-} +- +-static inline void +-_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) +-{ +- int rc = 0; +- +- /* do the same for metadata buffer */ +- if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { +- assert(spdk_bdev_is_md_separate(bdev_io->bdev)); +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && +- bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- if (bdev_io_use_memory_domain(bdev_io)) { +- /* If memory domain is used then we need to call async push function */ +- rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, +- bdev_io->internal.ext_opts->memory_domain_ctx, +- &bdev_io->internal.orig_md_iov, +- (uint32_t)bdev_io->internal.orig_iovcnt, +- &bdev_io->internal.bounce_md_iov, 1, +- bdev_io->internal.data_transfer_cpl, +- bdev_io); +- if (rc == 0) { +- /* Continue IO completion in async callback */ +- return; +- } +- SPDK_ERRLOG("Failed to push md to memory domain %s\n", +- spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); +- } else { +- memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, +- bdev_io->internal.orig_md_iov.iov_len); +- } +- } +- } +- +- assert(bdev_io->internal.data_transfer_cpl); +- bdev_io->internal.data_transfer_cpl(bdev_io, rc); +-} +- +-static void +-_bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- assert(bdev_io->internal.data_transfer_cpl); +- +- if (rc) { +- bdev_io->internal.data_transfer_cpl(bdev_io, rc); +- return; +- } +- +- /* set original buffer for this io */ +- bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; +- bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; +- /* disable bouncing buffer for this io */ +- bdev_io->internal.orig_iovcnt = 0; +- bdev_io->internal.orig_iovs = NULL; +- +- _bdev_io_push_bounce_md_buffer(bdev_io); +-} +- +-static inline void +-_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) +-{ +- int rc = 0; +- +- bdev_io->internal.data_transfer_cpl = cpl_cb; +- +- /* if this is read path, copy data from bounce buffer to original buffer */ +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && +- bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- if (bdev_io_use_memory_domain(bdev_io)) { +- /* If memory domain is used then we need to call async push function */ +- rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, +- bdev_io->internal.ext_opts->memory_domain_ctx, +- bdev_io->internal.orig_iovs, +- (uint32_t)bdev_io->internal.orig_iovcnt, +- &bdev_io->internal.bounce_iov, 1, +- _bdev_io_push_bounce_data_buffer_done, +- bdev_io); +- if (rc == 0) { +- /* Continue IO completion in async callback */ +- return; +- } +- SPDK_ERRLOG("Failed to push data to memory domain %s\n", +- spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); +- } else { +- spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, +- bdev_io->internal.orig_iovcnt, +- bdev_io->internal.bounce_iov.iov_base, +- bdev_io->internal.bounce_iov.iov_len); +- } +- } +- +- _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); +-} +- +-static void +-bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) +-{ +- struct spdk_bdev_io *bdev_io; +- +- bdev_io 
= SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); +- _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); +-} +- +-static void +-bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) +-{ +- struct spdk_bdev_mgmt_channel *mgmt_ch; +- uint64_t max_len; +- void *buf; +- +- assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); +- mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; +- max_len = bdev_io_get_max_buf_len(bdev_io, len); +- +- if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { +- SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); +- bdev_io_get_buf_complete(bdev_io, false); +- return; +- } +- +- bdev_io->internal.buf_len = len; +- buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, +- bdev_io_get_iobuf_cb); +- if (buf != NULL) { +- _bdev_io_set_buf(bdev_io, buf, len); +- } +-} +- +-void +-spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- uint64_t alignment; +- +- assert(cb != NULL); +- bdev_io->internal.get_buf_cb = cb; +- +- alignment = spdk_bdev_get_buf_align(bdev); +- +- if (_is_buf_allocated(bdev_io->u.bdev.iovs) && +- _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { +- /* Buffer already present and aligned */ +- cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); +- return; +- } +- +- bdev_io_get_buf(bdev_io, len); +-} +- +-static void +-_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- if (!success) { +- SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); +- bdev_io_complete(bdev_io); +- } else { +- bdev_io_submit(bdev_io); +- } +-} +- +-static void +-_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, +- uint64_t len) +-{ +- assert(cb != NULL); +- bdev_io->internal.get_buf_cb = cb; +- +- bdev_io_get_buf(bdev_io, len); +-} +- +-void +-spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) +-{ +- uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; +- +- assert(cb != NULL); +- assert(bdev_io->internal.get_aux_buf_cb == NULL); +- bdev_io->internal.get_aux_buf_cb = cb; +- bdev_io_get_buf(bdev_io, len); +-} +- +-static int +-bdev_module_get_max_ctx_size(void) +-{ +- struct spdk_bdev_module *bdev_module; +- int max_bdev_module_size = 0; +- +- TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { +- max_bdev_module_size = bdev_module->get_ctx_size(); +- } +- } +- +- return max_bdev_module_size; +-} +- +-static void +-bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- int i; +- struct spdk_bdev_qos *qos = bdev->internal.qos; +- uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; +- +- if (!qos) { +- return; +- } +- +- spdk_bdev_get_qos_rate_limits(bdev, limits); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", bdev->name); +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (limits[i] > 0) { +- spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); +- } +- } +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-void +-spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 
+-{ +- struct spdk_bdev_module *bdev_module; +- struct spdk_bdev *bdev; +- +- assert(w != NULL); +- +- spdk_json_write_array_begin(w); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_set_options"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); +- spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); +- spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- +- bdev_examine_allowlist_config_json(w); +- +- TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (bdev_module->config_json) { +- bdev_module->config_json(w); +- } +- } +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- +- TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { +- if (bdev->fn_table->write_config_json) { +- bdev->fn_table->write_config_json(bdev, w); +- } +- +- bdev_qos_config_json(bdev, w); +- } +- +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- /* This has to be last RPC in array to make sure all bdevs finished examine */ +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); +- spdk_json_write_object_end(w); +- +- spdk_json_write_array_end(w); +-} +- +-static void +-bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) +-{ +- struct spdk_bdev_mgmt_channel *ch = ctx_buf; +- struct spdk_bdev_io *bdev_io; +- +- spdk_iobuf_channel_fini(&ch->iobuf); +- +- while (!STAILQ_EMPTY(&ch->per_thread_cache)) { +- bdev_io = STAILQ_FIRST(&ch->per_thread_cache); +- STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); +- ch->per_thread_cache_count--; +- spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); +- } +- +- assert(ch->per_thread_cache_count == 0); +-} +- +-static int +-bdev_mgmt_channel_create(void *io_device, void *ctx_buf) +-{ +- struct spdk_bdev_mgmt_channel *ch = ctx_buf; +- struct spdk_bdev_io *bdev_io; +- uint32_t i; +- int rc; +- +- rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); +- return -1; +- } +- +- STAILQ_INIT(&ch->per_thread_cache); +- ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; +- +- /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ +- ch->per_thread_cache_count = 0; +- for (i = 0; i < ch->bdev_io_cache_size; i++) { +- bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); +- if (bdev_io == NULL) { +- SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); +- assert(false); +- bdev_mgmt_channel_destroy(io_device, ctx_buf); +- return -1; +- } +- ch->per_thread_cache_count++; +- STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); +- } +- +- TAILQ_INIT(&ch->shared_resources); +- TAILQ_INIT(&ch->io_wait_queue); +- +- return 0; +-} +- +-static void +-bdev_init_complete(int rc) +-{ +- spdk_bdev_init_cb cb_fn = g_init_cb_fn; +- void *cb_arg = g_init_cb_arg; +- struct spdk_bdev_module *m; +- +- g_bdev_mgr.init_complete = true; +- g_init_cb_fn = NULL; +- g_init_cb_arg = NULL; +- +- /* +- * For modules that need to know when subsystem init is complete, +- * inform them now. 
+- */ +- if (rc == 0) { +- TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (m->init_complete) { +- m->init_complete(); +- } +- } +- } +- +- cb_fn(cb_arg, rc); +-} +- +-static bool +-bdev_module_all_actions_completed(void) +-{ +- struct spdk_bdev_module *m; +- +- TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (m->internal.action_in_progress > 0) { +- return false; +- } +- } +- return true; +-} +- +-static void +-bdev_module_action_complete(void) +-{ +- /* +- * Don't finish bdev subsystem initialization if +- * module pre-initialization is still in progress, or +- * the subsystem been already initialized. +- */ +- if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { +- return; +- } +- +- /* +- * Check all bdev modules for inits/examinations in progress. If any +- * exist, return immediately since we cannot finish bdev subsystem +- * initialization until all are completed. +- */ +- if (!bdev_module_all_actions_completed()) { +- return; +- } +- +- /* +- * Modules already finished initialization - now that all +- * the bdev modules have finished their asynchronous I/O +- * processing, the entire bdev layer can be marked as complete. +- */ +- bdev_init_complete(0); +-} +- +-static void +-bdev_module_action_done(struct spdk_bdev_module *module) +-{ +- spdk_spin_lock(&module->internal.spinlock); +- assert(module->internal.action_in_progress > 0); +- module->internal.action_in_progress--; +- spdk_spin_unlock(&module->internal.spinlock); +- bdev_module_action_complete(); +-} +- +-void +-spdk_bdev_module_init_done(struct spdk_bdev_module *module) +-{ +- assert(module->async_init); +- bdev_module_action_done(module); +-} +- +-void +-spdk_bdev_module_examine_done(struct spdk_bdev_module *module) +-{ +- bdev_module_action_done(module); +-} +- +-/** The last initialized bdev module */ +-static struct spdk_bdev_module *g_resume_bdev_module = NULL; +- +-static void +-bdev_init_failed(void *cb_arg) +-{ +- struct spdk_bdev_module *module = cb_arg; +- +- spdk_spin_lock(&module->internal.spinlock); +- assert(module->internal.action_in_progress > 0); +- module->internal.action_in_progress--; +- spdk_spin_unlock(&module->internal.spinlock); +- bdev_init_complete(-1); +-} +- +-static int +-bdev_modules_init(void) +-{ +- struct spdk_bdev_module *module; +- int rc = 0; +- +- TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- g_resume_bdev_module = module; +- if (module->async_init) { +- spdk_spin_lock(&module->internal.spinlock); +- module->internal.action_in_progress = 1; +- spdk_spin_unlock(&module->internal.spinlock); +- } +- rc = module->module_init(); +- if (rc != 0) { +- /* Bump action_in_progress to prevent other modules from completion of modules_init +- * Send message to defer application shutdown until resources are cleaned up */ +- spdk_spin_lock(&module->internal.spinlock); +- module->internal.action_in_progress = 1; +- spdk_spin_unlock(&module->internal.spinlock); +- spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); +- return rc; +- } +- } +- +- g_resume_bdev_module = NULL; +- return 0; +-} +- +-void +-spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) +-{ +- int rc = 0; +- char mempool_name[32]; +- +- assert(cb_fn != NULL); +- +- g_init_cb_fn = cb_fn; +- g_init_cb_arg = cb_arg; +- +- spdk_notify_type_register("bdev_register"); +- spdk_notify_type_register("bdev_unregister"); +- +- snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); +- +- rc = spdk_iobuf_register_module("bdev"); +- if (rc 
!= 0) { +- SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); +- bdev_init_complete(-1); +- return; +- } +- +- g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, +- g_bdev_opts.bdev_io_pool_size, +- sizeof(struct spdk_bdev_io) + +- bdev_module_get_max_ctx_size(), +- 0, +- SPDK_ENV_SOCKET_ID_ANY); +- +- if (g_bdev_mgr.bdev_io_pool == NULL) { +- SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); +- bdev_init_complete(-1); +- return; +- } +- +- g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, +- NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!g_bdev_mgr.zero_buffer) { +- SPDK_ERRLOG("create bdev zero buffer failed\n"); +- bdev_init_complete(-1); +- return; +- } +- +-#ifdef SPDK_CONFIG_VTUNE +- SPDK_LOG_DEPRECATED(vtune_support); +- g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); +-#endif +- +- spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, +- bdev_mgmt_channel_destroy, +- sizeof(struct spdk_bdev_mgmt_channel), +- "bdev_mgr"); +- +- rc = bdev_modules_init(); +- g_bdev_mgr.module_init_complete = true; +- if (rc != 0) { +- SPDK_ERRLOG("bdev modules init failed\n"); +- return; +- } +- +- bdev_module_action_complete(); +-} +- +-static void +-bdev_mgr_unregister_cb(void *io_device) +-{ +- spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; +- +- if (g_bdev_mgr.bdev_io_pool) { +- if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { +- SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", +- spdk_mempool_count(g_bdev_mgr.bdev_io_pool), +- g_bdev_opts.bdev_io_pool_size); +- } +- +- spdk_mempool_free(g_bdev_mgr.bdev_io_pool); +- } +- +- spdk_free(g_bdev_mgr.zero_buffer); +- +- bdev_examine_allowlist_free(); +- +- cb_fn(g_fini_cb_arg); +- g_fini_cb_fn = NULL; +- g_fini_cb_arg = NULL; +- g_bdev_mgr.init_complete = false; +- g_bdev_mgr.module_init_complete = false; +-} +- +-static void +-bdev_module_fini_iter(void *arg) +-{ +- struct spdk_bdev_module *bdev_module; +- +- /* FIXME: Handling initialization failures is broken now, +- * so we won't even try cleaning up after successfully +- * initialized modules. if module_init_complete is false, +- * just call spdk_bdev_mgr_unregister_cb +- */ +- if (!g_bdev_mgr.module_init_complete) { +- bdev_mgr_unregister_cb(NULL); +- return; +- } +- +- /* Start iterating from the last touched module */ +- if (!g_resume_bdev_module) { +- bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); +- } else { +- bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, +- internal.tailq); +- } +- +- while (bdev_module) { +- if (bdev_module->async_fini) { +- /* Save our place so we can resume later. We must +- * save the variable here, before calling module_fini() +- * below, because in some cases the module may immediately +- * call spdk_bdev_module_fini_done() and re-enter +- * this function to continue iterating. 
*/ +- g_resume_bdev_module = bdev_module; +- } +- +- if (bdev_module->module_fini) { +- bdev_module->module_fini(); +- } +- +- if (bdev_module->async_fini) { +- return; +- } +- +- bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, +- internal.tailq); +- } +- +- g_resume_bdev_module = NULL; +- spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); +-} +- +-void +-spdk_bdev_module_fini_done(void) +-{ +- if (spdk_get_thread() != g_fini_thread) { +- spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); +- } else { +- bdev_module_fini_iter(NULL); +- } +-} +- +-static void +-bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) +-{ +- struct spdk_bdev *bdev = cb_arg; +- +- if (bdeverrno && bdev) { +- SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", +- bdev->name); +- +- /* +- * Since the call to spdk_bdev_unregister() failed, we have no way to free this +- * bdev; try to continue by manually removing this bdev from the list and continue +- * with the next bdev in the list. +- */ +- TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); +- } +- +- if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { +- SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); +- /* +- * Bdev module finish need to be deferred as we might be in the middle of some context +- * (like bdev part free) that will use this bdev (or private bdev driver ctx data) +- * after returning. +- */ +- spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); +- return; +- } +- +- /* +- * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem +- * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity +- * to detect clean shutdown as opposed to run-time hot removal of the underlying +- * base bdevs. +- * +- * Also, walk the list in the reverse order. +- */ +- for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); +- bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { +- SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", +- bdev->name, bdev->internal.claim.v1.module->name); +- spdk_spin_unlock(&bdev->internal.spinlock); +- continue; +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); +- spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); +- return; +- } +- +- /* +- * If any bdev fails to unclaim underlying bdev properly, we may face the +- * case of bdev list consisting of claimed bdevs only (if claims are managed +- * correctly, this would mean there's a loop in the claims graph which is +- * clearly impossible). Warn and unregister last bdev on the list then. +- */ +- for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); +- bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { +- SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); +- spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); +- return; +- } +-} +- +-static void +-bdev_module_fini_start_iter(void *arg) +-{ +- struct spdk_bdev_module *bdev_module; +- +- if (!g_resume_bdev_module) { +- bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); +- } else { +- bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); +- } +- +- while (bdev_module) { +- if (bdev_module->async_fini_start) { +- /* Save our place so we can resume later. 
We must +- * save the variable here, before calling fini_start() +- * below, because in some cases the module may immediately +- * call spdk_bdev_module_fini_start_done() and re-enter +- * this function to continue iterating. */ +- g_resume_bdev_module = bdev_module; +- } +- +- if (bdev_module->fini_start) { +- bdev_module->fini_start(); +- } +- +- if (bdev_module->async_fini_start) { +- return; +- } +- +- bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); +- } +- +- g_resume_bdev_module = NULL; +- +- bdev_finish_unregister_bdevs_iter(NULL, 0); +-} +- +-void +-spdk_bdev_module_fini_start_done(void) +-{ +- if (spdk_get_thread() != g_fini_thread) { +- spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); +- } else { +- bdev_module_fini_start_iter(NULL); +- } +-} +- +-static void +-bdev_finish_wait_for_examine_done(void *cb_arg) +-{ +- bdev_module_fini_start_iter(NULL); +-} +- +-void +-spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) +-{ +- int rc; +- +- assert(cb_fn != NULL); +- +- g_fini_thread = spdk_get_thread(); +- +- g_fini_cb_fn = cb_fn; +- g_fini_cb_arg = cb_arg; +- +- rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); +- if (rc != 0) { +- SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); +- bdev_finish_wait_for_examine_done(NULL); +- } +-} +- +-struct spdk_bdev_io * +-bdev_channel_get_io(struct spdk_bdev_channel *channel) +-{ +- struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; +- struct spdk_bdev_io *bdev_io; +- +- if (ch->per_thread_cache_count > 0) { +- bdev_io = STAILQ_FIRST(&ch->per_thread_cache); +- STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); +- ch->per_thread_cache_count--; +- } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { +- /* +- * Don't try to look for bdev_ios in the global pool if there are +- * waiters on bdev_ios - we don't want this caller to jump the line. +- */ +- bdev_io = NULL; +- } else { +- bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); +- } +- +- return bdev_io; +-} +- +-void +-spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev_mgmt_channel *ch; +- +- assert(bdev_io != NULL); +- assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); +- +- ch = bdev_io->internal.ch->shared_resource->mgmt_ch; +- +- if (bdev_io->internal.buf != NULL) { +- bdev_io_put_buf(bdev_io); +- } +- +- if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { +- ch->per_thread_cache_count++; +- STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); +- while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { +- struct spdk_bdev_io_wait_entry *entry; +- +- entry = TAILQ_FIRST(&ch->io_wait_queue); +- TAILQ_REMOVE(&ch->io_wait_queue, entry, link); +- entry->cb_fn(entry->cb_arg); +- } +- } else { +- /* We should never have a full cache with entries on the io wait queue. 
*/ +- assert(TAILQ_EMPTY(&ch->io_wait_queue)); +- spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); +- } +-} +- +-static bool +-bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) +-{ +- assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); +- +- switch (limit) { +- case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: +- return true; +- case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: +- case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: +- case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: +- return false; +- case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: +- default: +- return false; +- } +-} +- +-static bool +-bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return true; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- if (bdev_io->u.bdev.zcopy.start) { +- return true; +- } else { +- return false; +- } +- default: +- return false; +- } +-} +- +-static bool +-bdev_is_read_io(struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- /* Bit 1 (0x2) set for read operation */ +- if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { +- return true; +- } else { +- return false; +- } +- case SPDK_BDEV_IO_TYPE_READ: +- return true; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- /* Populate to read from disk */ +- if (bdev_io->u.bdev.zcopy.populate) { +- return true; +- } else { +- return false; +- } +- default: +- return false; +- } +-} +- +-static uint64_t +-bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- return bdev_io->u.nvme_passthru.nbytes; +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return bdev_io->u.bdev.num_blocks * bdev->blocklen; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- /* Track the data in the start phase only */ +- if (bdev_io->u.bdev.zcopy.start) { +- return bdev_io->u.bdev.num_blocks * bdev->blocklen; +- } else { +- return 0; +- } +- default: +- return 0; +- } +-} +- +-static bool +-bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { +- return true; +- } else { +- return false; +- } +-} +- +-static bool +-bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- if (bdev_is_read_io(io) == false) { +- return false; +- } +- +- return bdev_qos_rw_queue_io(limit, io); +-} +- +-static bool +-bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- if (bdev_is_read_io(io) == true) { +- return false; +- } +- +- return bdev_qos_rw_queue_io(limit, io); +-} +- +-static void +-bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- limit->remaining_this_timeslice--; +-} +- +-static void +-bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); +-} +- +-static void +-bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- if (bdev_is_read_io(io) == false) { +- return; +- } +- +- return bdev_qos_rw_bps_update_quota(limit, io); +-} +- +-static void +-bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) +-{ +- if 
(bdev_is_read_io(io) == true) { +- return; +- } +- +- return bdev_qos_rw_bps_update_quota(limit, io); +-} +- +-static void +-bdev_qos_set_ops(struct spdk_bdev_qos *qos) +-{ +- int i; +- +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { +- qos->rate_limits[i].queue_io = NULL; +- qos->rate_limits[i].update_quota = NULL; +- continue; +- } +- +- switch (i) { +- case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: +- qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; +- qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; +- break; +- case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: +- qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; +- qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; +- break; +- case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: +- qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; +- qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; +- break; +- case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: +- qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; +- qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; +- break; +- default: +- break; +- } +- } +-} +- +-static void +-_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, +- struct spdk_bdev_io *bdev_io, +- enum spdk_bdev_io_status status) +-{ +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- +- bdev_io->internal.in_submit_request = true; +- bdev_ch->io_outstanding++; +- shared_resource->io_outstanding++; +- spdk_bdev_io_complete(bdev_io, status); +- bdev_io->internal.in_submit_request = false; +-} +- +-static inline void +-bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_io_channel *ch = bdev_ch->channel; +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- +- if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { +- struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; +- struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; +- +- if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || +- bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { +- _bdev_io_complete_in_submit(bdev_ch, bdev_io, +- SPDK_BDEV_IO_STATUS_SUCCESS); +- return; +- } +- } +- +- if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && +- bdev_io->bdev->split_on_write_unit && +- bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { +- SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", +- bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); +- _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { +- bdev_ch->io_outstanding++; +- shared_resource->io_outstanding++; +- bdev_io->internal.in_submit_request = true; +- bdev->fn_table->submit_request(ch, bdev_io); +- bdev_io->internal.in_submit_request = false; +- } else { +- TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); +- } +-} +- +-static bool +-bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) +-{ +- int i; +- +- if (bdev_qos_io_to_limit(bdev_io) == true) { +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (!qos->rate_limits[i].queue_io) { +- continue; +- } +- +- if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], +- bdev_io) == true) { +- return true; +- } +- } +- for (i = 0; i < 
SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (!qos->rate_limits[i].update_quota) { +- continue; +- } +- +- qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); +- } +- } +- +- return false; +-} +- +-static inline void +-_bdev_io_do_submit(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct spdk_bdev_channel *ch = bdev_io->internal.ch; +- +- bdev_io_do_submit(ch, bdev_io); +-} +- +-static int +-bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) +-{ +- struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; +- int submitted_ios = 0; +- +- TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { +- if (!bdev_qos_queue_io(qos, bdev_io)) { +- TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); +- +- if (bdev_io->internal.io_submit_ch) { +- /* Send back the IO to the original thread for the actual processing. */ +- bdev_io->internal.ch = bdev_io->internal.io_submit_ch; +- bdev_io->internal.io_submit_ch = NULL; +- spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), +- _bdev_io_do_submit, bdev_io); +- } else { +- bdev_io_do_submit(ch, bdev_io); +- } +- +- submitted_ios++; +- } +- } +- +- return submitted_ios; +-} +- +-static void +-bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) +-{ +- int rc; +- +- bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; +- bdev_io->internal.waitq_entry.cb_fn = cb_fn; +- bdev_io->internal.waitq_entry.cb_arg = bdev_io; +- rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), +- &bdev_io->internal.waitq_entry); +- if (rc != 0) { +- SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- } +-} +- +-static bool +-bdev_rw_should_split(struct spdk_bdev_io *bdev_io) +-{ +- uint32_t io_boundary; +- struct spdk_bdev *bdev = bdev_io->bdev; +- uint32_t max_size = bdev->max_segment_size; +- int max_segs = bdev->max_num_segments; +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { +- io_boundary = bdev->write_unit_size; +- } else if (bdev->split_on_optimal_io_boundary) { +- io_boundary = bdev->optimal_io_boundary; +- } else { +- io_boundary = 0; +- } +- +- if (spdk_likely(!io_boundary && !max_segs && !max_size)) { +- return false; +- } +- +- if (io_boundary) { +- uint64_t start_stripe, end_stripe; +- +- start_stripe = bdev_io->u.bdev.offset_blocks; +- end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; +- /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ +- if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { +- start_stripe >>= spdk_u32log2(io_boundary); +- end_stripe >>= spdk_u32log2(io_boundary); +- } else { +- start_stripe /= io_boundary; +- end_stripe /= io_boundary; +- } +- +- if (start_stripe != end_stripe) { +- return true; +- } +- } +- +- if (max_segs) { +- if (bdev_io->u.bdev.iovcnt > max_segs) { +- return true; +- } +- } +- +- if (max_size) { +- for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { +- if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { +- return true; +- } +- } +- } +- +- return false; +-} +- +-static bool +-bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) +-{ +- uint32_t num_unmap_segments; +- +- if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { +- return false; +- } +- num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); +- if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { +- return true; +- } +- +- return false; +-} +- +-static bool +-bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) +-{ +- if (!bdev_io->bdev->max_write_zeroes) { +- return false; +- } +- +- if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { +- return true; +- } +- +- return false; +-} +- +-static bool +-bdev_copy_should_split(struct spdk_bdev_io *bdev_io) +-{ +- if (bdev_io->bdev->max_copy != 0 && +- bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { +- return true; +- } +- +- return false; +-} +- +-static bool +-bdev_io_should_split(struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return bdev_rw_should_split(bdev_io); +- case SPDK_BDEV_IO_TYPE_UNMAP: +- return bdev_unmap_should_split(bdev_io); +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- return bdev_write_zeroes_should_split(bdev_io); +- case SPDK_BDEV_IO_TYPE_COPY: +- return bdev_copy_should_split(bdev_io); +- default: +- return false; +- } +-} +- +-static uint32_t +-_to_next_boundary(uint64_t offset, uint32_t boundary) +-{ +- return (boundary - (offset % boundary)); +-} +- +-static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +- +-static void _bdev_rw_split(void *_bdev_io); +- +-static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); +- +-static void +-_bdev_unmap_split(void *_bdev_io) +-{ +- return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); +-} +- +-static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); +- +-static void +-_bdev_write_zeroes_split(void *_bdev_io) +-{ +- return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); +-} +- +-static void bdev_copy_split(struct spdk_bdev_io *bdev_io); +- +-static void +-_bdev_copy_split(void *_bdev_io) +-{ +- return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); +-} +- +-static int +-bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) +-{ +- int rc; +- uint64_t current_offset, current_remaining, current_src_offset; +- spdk_bdev_io_wait_cb io_wait_fn; +- +- current_offset = *offset; +- current_remaining = *remaining; +- +- bdev_io->u.bdev.split_outstanding++; +- +- io_wait_fn = _bdev_rw_split; +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- iov, iovcnt, md_buf, current_offset, +- num_blocks, +- bdev_io_split_done, bdev_io, +- bdev_io->internal.ext_opts, true); +- 
break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- iov, iovcnt, md_buf, current_offset, +- num_blocks, +- bdev_io_split_done, bdev_io, +- bdev_io->internal.ext_opts, true); +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- io_wait_fn = _bdev_unmap_split; +- rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- current_offset, num_blocks, +- bdev_io_split_done, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- io_wait_fn = _bdev_write_zeroes_split; +- rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- current_offset, num_blocks, +- bdev_io_split_done, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- io_wait_fn = _bdev_copy_split; +- current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + +- (current_offset - bdev_io->u.bdev.offset_blocks); +- rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- current_offset, current_src_offset, num_blocks, +- bdev_io_split_done, bdev_io); +- break; +- default: +- assert(false); +- rc = -EINVAL; +- break; +- } +- +- if (rc == 0) { +- current_offset += num_blocks; +- current_remaining -= num_blocks; +- bdev_io->u.bdev.split_current_offset_blocks = current_offset; +- bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; +- *offset = current_offset; +- *remaining = current_remaining; +- } else { +- bdev_io->u.bdev.split_outstanding--; +- if (rc == -ENOMEM) { +- if (bdev_io->u.bdev.split_outstanding == 0) { +- /* No I/O is outstanding. Hence we should wait here. */ +- bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); +- } +- } else { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- if (bdev_io->u.bdev.split_outstanding == 0) { +- spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); +- TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- } +- } +- } +- +- return rc; +-} +- +-static void +-_bdev_rw_split(void *_bdev_io) +-{ +- struct iovec *parent_iov, *iov; +- struct spdk_bdev_io *bdev_io = _bdev_io; +- struct spdk_bdev *bdev = bdev_io->bdev; +- uint64_t parent_offset, current_offset, remaining; +- uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; +- uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; +- uint32_t iovcnt, iov_len, child_iovsize; +- uint32_t blocklen = bdev->blocklen; +- uint32_t io_boundary; +- uint32_t max_segment_size = bdev->max_segment_size; +- uint32_t max_child_iovcnt = bdev->max_num_segments; +- void *md_buf = NULL; +- int rc; +- +- max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; +- max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : +- SPDK_BDEV_IO_NUM_CHILD_IOV; +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { +- io_boundary = bdev->write_unit_size; +- } else if (bdev->split_on_optimal_io_boundary) { +- io_boundary = bdev->optimal_io_boundary; +- } else { +- io_boundary = UINT32_MAX; +- } +- +- remaining = bdev_io->u.bdev.split_remaining_num_blocks; +- current_offset = bdev_io->u.bdev.split_current_offset_blocks; +- parent_offset = bdev_io->u.bdev.offset_blocks; +- parent_iov_offset = (current_offset - parent_offset) * blocklen; +- parent_iovcnt = bdev_io->u.bdev.iovcnt; +- +- for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { +- parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; +- if (parent_iov_offset < parent_iov->iov_len) { +- break; +- } +- parent_iov_offset -= parent_iov->iov_len; +- } +- +- child_iovcnt = 0; +- while (remaining > 0 && parent_iovpos < parent_iovcnt && +- child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { +- to_next_boundary = _to_next_boundary(current_offset, io_boundary); +- to_next_boundary = spdk_min(remaining, to_next_boundary); +- to_next_boundary_bytes = to_next_boundary * blocklen; +- +- iov = &bdev_io->child_iov[child_iovcnt]; +- iovcnt = 0; +- +- if (bdev_io->u.bdev.md_buf) { +- md_buf = (char *)bdev_io->u.bdev.md_buf + +- (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); +- } +- +- child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); +- while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && +- iovcnt < child_iovsize) { +- parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; +- iov_len = parent_iov->iov_len - parent_iov_offset; +- +- iov_len = spdk_min(iov_len, max_segment_size); +- iov_len = spdk_min(iov_len, to_next_boundary_bytes); +- to_next_boundary_bytes -= iov_len; +- +- bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; +- bdev_io->child_iov[child_iovcnt].iov_len = iov_len; +- +- if (iov_len < parent_iov->iov_len - parent_iov_offset) { +- parent_iov_offset += iov_len; +- } else { +- parent_iovpos++; +- parent_iov_offset = 0; +- } +- child_iovcnt++; +- iovcnt++; +- } +- +- if (to_next_boundary_bytes > 0) { +- /* We had to stop this child I/O early because we ran out of +- * child_iov space or were limited by max_num_segments. +- * Ensure the iovs to be aligned with block size and +- * then adjust to_next_boundary before starting the +- * child I/O. +- */ +- assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || +- iovcnt == child_iovsize); +- to_last_block_bytes = to_next_boundary_bytes % blocklen; +- if (to_last_block_bytes != 0) { +- uint32_t child_iovpos = child_iovcnt - 1; +- /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV +- * so the loop will naturally end +- */ +- +- to_last_block_bytes = blocklen - to_last_block_bytes; +- to_next_boundary_bytes += to_last_block_bytes; +- while (to_last_block_bytes > 0 && iovcnt > 0) { +- iov_len = spdk_min(to_last_block_bytes, +- bdev_io->child_iov[child_iovpos].iov_len); +- bdev_io->child_iov[child_iovpos].iov_len -= iov_len; +- if (bdev_io->child_iov[child_iovpos].iov_len == 0) { +- child_iovpos--; +- if (--iovcnt == 0) { +- /* If the child IO is less than a block size just return. +- * If the first child IO of any split round is less than +- * a block size, an error exit. 
+- */ +- if (bdev_io->u.bdev.split_outstanding == 0) { +- SPDK_ERRLOG("The first child io was less than a block size\n"); +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); +- TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- } +- +- return; +- } +- } +- +- to_last_block_bytes -= iov_len; +- +- if (parent_iov_offset == 0) { +- parent_iovpos--; +- parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; +- } +- parent_iov_offset -= iov_len; +- } +- +- assert(to_last_block_bytes == 0); +- } +- to_next_boundary -= to_next_boundary_bytes / blocklen; +- } +- +- rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, +- ¤t_offset, &remaining); +- if (spdk_unlikely(rc)) { +- return; +- } +- } +-} +- +-static void +-bdev_unmap_split(struct spdk_bdev_io *bdev_io) +-{ +- uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; +- uint32_t num_children_reqs = 0; +- int rc; +- +- offset = bdev_io->u.bdev.split_current_offset_blocks; +- remaining = bdev_io->u.bdev.split_remaining_num_blocks; +- max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; +- +- while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { +- unmap_blocks = spdk_min(remaining, max_unmap_blocks); +- +- rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, +- &offset, &remaining); +- if (spdk_likely(rc == 0)) { +- num_children_reqs++; +- } else { +- return; +- } +- } +-} +- +-static void +-bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) +-{ +- uint64_t offset, write_zeroes_blocks, remaining; +- uint32_t num_children_reqs = 0; +- int rc; +- +- offset = bdev_io->u.bdev.split_current_offset_blocks; +- remaining = bdev_io->u.bdev.split_remaining_num_blocks; +- +- while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { +- write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); +- +- rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, +- &offset, &remaining); +- if (spdk_likely(rc == 0)) { +- num_children_reqs++; +- } else { +- return; +- } +- } +-} +- +-static void +-bdev_copy_split(struct spdk_bdev_io *bdev_io) +-{ +- uint64_t offset, copy_blocks, remaining; +- uint32_t num_children_reqs = 0; +- int rc; +- +- offset = bdev_io->u.bdev.split_current_offset_blocks; +- remaining = bdev_io->u.bdev.split_remaining_num_blocks; +- +- assert(bdev_io->bdev->max_copy != 0); +- while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { +- copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); +- +- rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, +- &offset, &remaining); +- if (spdk_likely(rc == 0)) { +- num_children_reqs++; +- } else { +- return; +- } +- } +-} +- +-static void +-parent_bdev_io_complete(void *ctx, int rc) +-{ +- struct spdk_bdev_io *parent_io = ctx; +- +- if (rc) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- +- parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, +- parent_io->internal.caller_ctx); +-} +- +-static void +-bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *parent_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- parent_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; +- /* If any child I/O failed, stop further splitting process. */ +- parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; +- parent_io->u.bdev.split_remaining_num_blocks = 0; +- } +- parent_io->u.bdev.split_outstanding--; +- if (parent_io->u.bdev.split_outstanding != 0) { +- return; +- } +- +- /* +- * Parent I/O finishes when all blocks are consumed. +- */ +- if (parent_io->u.bdev.split_remaining_num_blocks == 0) { +- assert(parent_io->internal.cb != bdev_io_split_done); +- spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); +- TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); +- +- if (parent_io->internal.orig_iovcnt != 0) { +- _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); +- /* bdev IO will be completed in the callback */ +- } else { +- parent_bdev_io_complete(parent_io, 0); +- } +- return; +- } +- +- /* +- * Continue with the splitting process. This function will complete the parent I/O if the +- * splitting is done. +- */ +- switch (parent_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- _bdev_rw_split(parent_io); +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- bdev_unmap_split(parent_io); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- bdev_write_zeroes_split(parent_io); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- bdev_copy_split(parent_io); +- break; +- default: +- assert(false); +- break; +- } +-} +- +-static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success); +- +-static void +-bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; +- bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; +- bdev_io->u.bdev.split_outstanding = 0; +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { +- _bdev_rw_split(bdev_io); +- } else { +- assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); +- spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- } +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- bdev_unmap_split(bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- bdev_write_zeroes_split(bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- bdev_copy_split(bdev_io); +- break; +- default: +- assert(false); +- break; +- } +-} +- +-static void +-bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- _bdev_rw_split(bdev_io); +-} +- +-/* Explicitly mark this inline, since it's used as a function pointer and otherwise won't +- * be inlined, at least on some compilers. 
+- */ +-static inline void +-_bdev_io_submit(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- +- if (spdk_likely(bdev_ch->flags == 0)) { +- bdev_io_do_submit(bdev_ch, bdev_io); +- return; +- } +- +- if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { +- _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); +- } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { +- if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && +- bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { +- _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } else { +- TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); +- bdev_qos_io_submit(bdev_ch, bdev->internal.qos); +- } +- } else { +- SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); +- _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); +- +-bool +-bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) +-{ +- if (range1->length == 0 || range2->length == 0) { +- return false; +- } +- +- if (range1->offset + range1->length <= range2->offset) { +- return false; +- } +- +- if (range2->offset + range2->length <= range1->offset) { +- return false; +- } +- +- return true; +-} +- +-static bool +-bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) +-{ +- struct spdk_bdev_channel *ch = bdev_io->internal.ch; +- struct lba_range r; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- /* Don't try to decode the NVMe command - just assume worst-case and that +- * it overlaps a locked range. +- */ +- return true; +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- case SPDK_BDEV_IO_TYPE_COPY: +- r.offset = bdev_io->u.bdev.offset_blocks; +- r.length = bdev_io->u.bdev.num_blocks; +- if (!bdev_lba_range_overlapped(range, &r)) { +- /* This I/O doesn't overlap the specified LBA range. */ +- return false; +- } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { +- /* This I/O overlaps, but the I/O is on the same channel that locked this +- * range, and the caller_ctx is the same as the locked_ctx. This means +- * that this I/O is associated with the lock, and is allowed to execute. 
+- */ +- return false; +- } else { +- return true; +- } +- default: +- return false; +- } +-} +- +-void +-bdev_io_submit(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); +- struct spdk_bdev_channel *ch = bdev_io->internal.ch; +- +- assert(thread != NULL); +- assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); +- +- if (!TAILQ_EMPTY(&ch->locked_ranges)) { +- struct lba_range *range; +- +- TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { +- if (bdev_io_range_is_locked(bdev_io, range)) { +- TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); +- return; +- } +- } +- } +- +- TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); +- +- bdev_io->internal.submit_tsc = spdk_get_ticks(); +- spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, +- (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, +- bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, +- spdk_bdev_get_name(bdev)); +- +- if (bdev_io_should_split(bdev_io)) { +- bdev_io_split(NULL, bdev_io); +- return; +- } +- +- if (ch->flags & BDEV_CH_QOS_ENABLED) { +- if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { +- _bdev_io_submit(bdev_io); +- } else { +- bdev_io->internal.io_submit_ch = ch; +- bdev_io->internal.ch = bdev->internal.qos->ch; +- spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); +- } +- } else { +- _bdev_io_submit(bdev_io); +- } +-} +- +-static inline void +-_bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) +-{ +- struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; +- +- /* Zero part we don't copy */ +- memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); +- memcpy(opts_copy, opts, opts->size); +- opts_copy->size = sizeof(*opts_copy); +- opts_copy->metadata = bdev_io->u.bdev.md_buf; +- /* Save pointer to the copied ext_opts which will be used by bdev modules */ +- bdev_io->u.bdev.ext_opts = opts_copy; +-} +- +-static inline void +-_bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) +-{ +- /* bdev doesn't support memory domains, thereby buffers in this IO request can't +- * be accessed directly. It is needed to allocate buffers before issuing IO operation. +- * For write operation we need to pull buffers from memory domain before submitting IO. 
+- * Once read operation completes, we need to use memory_domain push functionality to +- * update data in original memory domain IO buffer +- * This IO request will go through a regular IO flow, so clear memory domains pointers in +- * the copied ext_opts */ +- bdev_io->internal.ext_opts_copy.memory_domain = NULL; +- bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; +- _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +-} +- +-static inline void +-_bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, +- struct spdk_bdev_ext_io_opts *opts, bool copy_opts) +-{ +- if (opts) { +- bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; +- assert(opts->size <= sizeof(*opts)); +- /* +- * copy if size is smaller than opts struct to avoid having to check size +- * on every access to bdev_io->u.bdev.ext_opts +- */ +- if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { +- _bdev_io_copy_ext_opts(bdev_io, opts); +- if (use_pull_push) { +- _bdev_io_ext_use_bounce_buffer(bdev_io); +- return; +- } +- } +- } +- bdev_io_submit(bdev_io); +-} +- +-static void +-bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- struct spdk_io_channel *ch = bdev_ch->channel; +- +- assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); +- +- bdev_io->internal.in_submit_request = true; +- bdev->fn_table->submit_request(ch, bdev_io); +- bdev_io->internal.in_submit_request = false; +-} +- +-void +-bdev_io_init(struct spdk_bdev_io *bdev_io, +- struct spdk_bdev *bdev, void *cb_arg, +- spdk_bdev_io_completion_cb cb) +-{ +- bdev_io->bdev = bdev; +- bdev_io->internal.caller_ctx = cb_arg; +- bdev_io->internal.cb = cb; +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; +- bdev_io->internal.in_submit_request = false; +- bdev_io->internal.buf = NULL; +- bdev_io->internal.io_submit_ch = NULL; +- bdev_io->internal.orig_iovs = NULL; +- bdev_io->internal.orig_iovcnt = 0; +- bdev_io->internal.orig_md_iov.iov_base = NULL; +- bdev_io->internal.error.nvme.cdw0 = 0; +- bdev_io->num_retries = 0; +- bdev_io->internal.get_buf_cb = NULL; +- bdev_io->internal.get_aux_buf_cb = NULL; +- bdev_io->internal.ext_opts = NULL; +- bdev_io->internal.data_transfer_cpl = NULL; +-} +- +-static bool +-bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +-{ +- return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); +-} +- +-bool +-spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +-{ +- bool supported; +- +- supported = bdev_io_type_supported(bdev, io_type); +- +- if (!supported) { +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ +- supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); +- break; +- default: +- break; +- } +- } +- +- return supported; +-} +- +-uint64_t +-spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) +-{ +- return bdev_io->internal.submit_tsc; +-} +- +-int +-spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- if (bdev->fn_table->dump_info_json) { +- return bdev->fn_table->dump_info_json(bdev->ctxt, w); +- } +- +- return 0; +-} +- +-static void +-bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) +-{ +- uint32_t max_per_timeslice = 0; +- int i; +- +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { +- qos->rate_limits[i].max_per_timeslice = 0; +- continue; +- } +- +- max_per_timeslice = qos->rate_limits[i].limit * +- SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; +- +- qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, +- qos->rate_limits[i].min_per_timeslice); +- +- qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; +- } +- +- bdev_qos_set_ops(qos); +-} +- +-static int +-bdev_channel_poll_qos(void *arg) +-{ +- struct spdk_bdev_qos *qos = arg; +- uint64_t now = spdk_get_ticks(); +- int i; +- +- if (now < (qos->last_timeslice + qos->timeslice_size)) { +- /* We received our callback earlier than expected - return +- * immediately and wait to do accounting until at least one +- * timeslice has actually expired. This should never happen +- * with a well-behaved timer implementation. +- */ +- return SPDK_POLLER_IDLE; +- } +- +- /* Reset for next round of rate limiting */ +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- /* We may have allowed the IOs or bytes to slightly overrun in the last +- * timeslice. remaining_this_timeslice is signed, so if it's negative +- * here, we'll account for the overrun so that the next timeslice will +- * be appropriately reduced. 
+- */ +- if (qos->rate_limits[i].remaining_this_timeslice > 0) { +- qos->rate_limits[i].remaining_this_timeslice = 0; +- } +- } +- +- while (now >= (qos->last_timeslice + qos->timeslice_size)) { +- qos->last_timeslice += qos->timeslice_size; +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- qos->rate_limits[i].remaining_this_timeslice += +- qos->rate_limits[i].max_per_timeslice; +- } +- } +- +- return bdev_qos_io_submit(qos->ch, qos); +-} +- +-static void +-bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) +-{ +- struct spdk_bdev_shared_resource *shared_resource; +- struct lba_range *range; +- +- bdev_free_io_stat(ch->stat); +-#ifdef SPDK_CONFIG_VTUNE +- bdev_free_io_stat(ch->prev_stat); +-#endif +- +- while (!TAILQ_EMPTY(&ch->locked_ranges)) { +- range = TAILQ_FIRST(&ch->locked_ranges); +- TAILQ_REMOVE(&ch->locked_ranges, range, tailq); +- free(range); +- } +- +- spdk_put_io_channel(ch->channel); +- +- shared_resource = ch->shared_resource; +- +- assert(TAILQ_EMPTY(&ch->io_locked)); +- assert(TAILQ_EMPTY(&ch->io_submitted)); +- assert(ch->io_outstanding == 0); +- assert(shared_resource->ref > 0); +- shared_resource->ref--; +- if (shared_resource->ref == 0) { +- assert(shared_resource->io_outstanding == 0); +- TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); +- spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); +- free(shared_resource); +- } +-} +- +-static void +-bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) +-{ +- struct spdk_bdev_qos *qos = bdev->internal.qos; +- int i; +- +- assert(spdk_spin_held(&bdev->internal.spinlock)); +- +- /* Rate limiting on this bdev enabled */ +- if (qos) { +- if (qos->ch == NULL) { +- struct spdk_io_channel *io_ch; +- +- SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, +- bdev->name, spdk_get_thread()); +- +- /* No qos channel has been selected, so set one up */ +- +- /* Take another reference to ch */ +- io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); +- assert(io_ch != NULL); +- qos->ch = ch; +- +- qos->thread = spdk_io_channel_get_thread(io_ch); +- +- TAILQ_INIT(&qos->queued); +- +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (bdev_qos_is_iops_rate_limit(i) == true) { +- qos->rate_limits[i].min_per_timeslice = +- SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; +- } else { +- qos->rate_limits[i].min_per_timeslice = +- SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; +- } +- +- if (qos->rate_limits[i].limit == 0) { +- qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; +- } +- } +- bdev_qos_update_max_quota_per_timeslice(qos); +- qos->timeslice_size = +- SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; +- qos->last_timeslice = spdk_get_ticks(); +- qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, +- qos, +- SPDK_BDEV_QOS_TIMESLICE_IN_USEC); +- } +- +- ch->flags |= BDEV_CH_QOS_ENABLED; +- } +-} +- +-struct poll_timeout_ctx { +- struct spdk_bdev_desc *desc; +- uint64_t timeout_in_sec; +- spdk_bdev_io_timeout_cb cb_fn; +- void *cb_arg; +-}; +- +-static void +-bdev_desc_free(struct spdk_bdev_desc *desc) +-{ +- spdk_spin_destroy(&desc->spinlock); +- free(desc->media_events_buffer); +- free(desc); +-} +- +-static void +-bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct poll_timeout_ctx *ctx = _ctx; +- struct spdk_bdev_desc *desc = ctx->desc; +- +- free(ctx); +- +- spdk_spin_lock(&desc->spinlock); +- desc->refs--; +- if 
(desc->closed == true && desc->refs == 0) { +- spdk_spin_unlock(&desc->spinlock); +- bdev_desc_free(desc); +- return; +- } +- spdk_spin_unlock(&desc->spinlock); +-} +- +-static void +-bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct poll_timeout_ctx *ctx = _ctx; +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); +- struct spdk_bdev_desc *desc = ctx->desc; +- struct spdk_bdev_io *bdev_io; +- uint64_t now; +- +- spdk_spin_lock(&desc->spinlock); +- if (desc->closed == true) { +- spdk_spin_unlock(&desc->spinlock); +- spdk_bdev_for_each_channel_continue(i, -1); +- return; +- } +- spdk_spin_unlock(&desc->spinlock); +- +- now = spdk_get_ticks(); +- TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { +- /* Exclude any I/O that are generated via splitting. */ +- if (bdev_io->internal.cb == bdev_io_split_done) { +- continue; +- } +- +- /* Once we find an I/O that has not timed out, we can immediately +- * exit the loop. +- */ +- if (now < (bdev_io->internal.submit_tsc + +- ctx->timeout_in_sec * spdk_get_ticks_hz())) { +- goto end; +- } +- +- if (bdev_io->internal.desc == desc) { +- ctx->cb_fn(ctx->cb_arg, bdev_io); +- } +- } +- +-end: +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static int +-bdev_poll_timeout_io(void *arg) +-{ +- struct spdk_bdev_desc *desc = arg; +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct poll_timeout_ctx *ctx; +- +- ctx = calloc(1, sizeof(struct poll_timeout_ctx)); +- if (!ctx) { +- SPDK_ERRLOG("failed to allocate memory\n"); +- return SPDK_POLLER_BUSY; +- } +- ctx->desc = desc; +- ctx->cb_arg = desc->cb_arg; +- ctx->cb_fn = desc->cb_fn; +- ctx->timeout_in_sec = desc->timeout_in_sec; +- +- /* Take a ref on the descriptor in case it gets closed while we are checking +- * all of the channels. 
+- */ +- spdk_spin_lock(&desc->spinlock); +- desc->refs++; +- spdk_spin_unlock(&desc->spinlock); +- +- spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, +- bdev_channel_poll_timeout_io_done); +- +- return SPDK_POLLER_BUSY; +-} +- +-int +-spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, +- spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) +-{ +- assert(desc->thread == spdk_get_thread()); +- +- spdk_poller_unregister(&desc->io_timeout_poller); +- +- if (timeout_in_sec) { +- assert(cb_fn != NULL); +- desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, +- desc, +- SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / +- 1000); +- if (desc->io_timeout_poller == NULL) { +- SPDK_ERRLOG("can not register the desc timeout IO poller\n"); +- return -1; +- } +- } +- +- desc->cb_fn = cb_fn; +- desc->cb_arg = cb_arg; +- desc->timeout_in_sec = timeout_in_sec; +- +- return 0; +-} +- +-static int +-bdev_channel_create(void *io_device, void *ctx_buf) +-{ +- struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); +- struct spdk_bdev_channel *ch = ctx_buf; +- struct spdk_io_channel *mgmt_io_ch; +- struct spdk_bdev_mgmt_channel *mgmt_ch; +- struct spdk_bdev_shared_resource *shared_resource; +- struct lba_range *range; +- +- ch->bdev = bdev; +- ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); +- if (!ch->channel) { +- return -1; +- } +- +- spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, +- spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); +- +- assert(ch->histogram == NULL); +- if (bdev->internal.histogram_enabled) { +- ch->histogram = spdk_histogram_data_alloc(); +- if (ch->histogram == NULL) { +- SPDK_ERRLOG("Could not allocate histogram\n"); +- } +- } +- +- mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); +- if (!mgmt_io_ch) { +- spdk_put_io_channel(ch->channel); +- return -1; +- } +- +- mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); +- TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { +- if (shared_resource->shared_ch == ch->channel) { +- spdk_put_io_channel(mgmt_io_ch); +- shared_resource->ref++; +- break; +- } +- } +- +- if (shared_resource == NULL) { +- shared_resource = calloc(1, sizeof(*shared_resource)); +- if (shared_resource == NULL) { +- spdk_put_io_channel(ch->channel); +- spdk_put_io_channel(mgmt_io_ch); +- return -1; +- } +- +- shared_resource->mgmt_ch = mgmt_ch; +- shared_resource->io_outstanding = 0; +- TAILQ_INIT(&shared_resource->nomem_io); +- shared_resource->nomem_threshold = 0; +- shared_resource->shared_ch = ch->channel; +- shared_resource->ref = 1; +- TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); +- } +- +- ch->io_outstanding = 0; +- TAILQ_INIT(&ch->queued_resets); +- TAILQ_INIT(&ch->locked_ranges); +- ch->flags = 0; +- ch->shared_resource = shared_resource; +- +- TAILQ_INIT(&ch->io_submitted); +- TAILQ_INIT(&ch->io_locked); +- +- ch->stat = bdev_alloc_io_stat(false); +- if (ch->stat == NULL) { +- bdev_channel_destroy_resource(ch); +- return -1; +- } +- +- ch->stat->ticks_rate = spdk_get_ticks_hz(); +- +-#ifdef SPDK_CONFIG_VTUNE +- { +- char *name; +- __itt_init_ittlib(NULL, 0); +- name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); +- if (!name) { +- bdev_channel_destroy_resource(ch); +- return -1; +- } +- ch->handle = __itt_string_handle_create(name); +- free(name); +- ch->start_tsc = spdk_get_ticks(); +- ch->interval_tsc = spdk_get_ticks_hz() / 100; +- ch->prev_stat = bdev_alloc_io_stat(false); +- if (ch->prev_stat == NULL) { +- 
bdev_channel_destroy_resource(ch); +- return -1; +- } +- } +-#endif +- +- spdk_spin_lock(&bdev->internal.spinlock); +- bdev_enable_qos(bdev, ch); +- +- TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { +- struct lba_range *new_range; +- +- new_range = calloc(1, sizeof(*new_range)); +- if (new_range == NULL) { +- spdk_spin_unlock(&bdev->internal.spinlock); +- bdev_channel_destroy_resource(ch); +- return -1; +- } +- new_range->length = range->length; +- new_range->offset = range->offset; +- new_range->locked_ctx = range->locked_ctx; +- TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); +- } +- +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- return 0; +-} +- +-static int +-bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, +- void *cb_ctx) +-{ +- struct spdk_bdev_channel *bdev_ch = cb_ctx; +- struct spdk_bdev_io *bdev_io; +- uint64_t buf_len; +- +- bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); +- if (bdev_io->internal.ch == bdev_ch) { +- buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); +- spdk_iobuf_entry_abort(ch, entry, buf_len); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); +- } +- +- return 0; +-} +- +-/* +- * Abort I/O that are waiting on a data buffer. +- */ +-static void +-bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) +-{ +- spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, +- bdev_abort_all_buf_io_cb, ch); +- spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, +- bdev_abort_all_buf_io_cb, ch); +-} +- +-/* +- * Abort I/O that are queued waiting for submission. These types of I/O are +- * linked using the spdk_bdev_io link TAILQ_ENTRY. +- */ +-static void +-bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) +-{ +- struct spdk_bdev_io *bdev_io, *tmp; +- +- TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { +- if (bdev_io->internal.ch == ch) { +- TAILQ_REMOVE(queue, bdev_io, internal.link); +- /* +- * spdk_bdev_io_complete() assumes that the completed I/O had +- * been submitted to the bdev module. Since in this case it +- * hadn't, bump io_outstanding to account for the decrement +- * that spdk_bdev_io_complete() will do. 
+- */ +- if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { +- ch->io_outstanding++; +- ch->shared_resource->io_outstanding++; +- } +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); +- } +- } +-} +- +-static bool +-bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) +-{ +- struct spdk_bdev_io *bdev_io; +- +- TAILQ_FOREACH(bdev_io, queue, internal.link) { +- if (bdev_io == bio_to_abort) { +- TAILQ_REMOVE(queue, bio_to_abort, internal.link); +- spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); +- return true; +- } +- } +- +- return false; +-} +- +-static int +-bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) +-{ +- struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; +- uint64_t buf_len; +- +- bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); +- if (bdev_io == bio_to_abort) { +- buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); +- spdk_iobuf_entry_abort(ch, entry, buf_len); +- spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); +- return 1; +- } +- +- return 0; +-} +- +-static bool +-bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) +-{ +- int rc; +- +- rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, +- bdev_abort_buf_io_cb, bio_to_abort); +- if (rc == 1) { +- return true; +- } +- +- rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, +- bdev_abort_buf_io_cb, bio_to_abort); +- return rc == 1; +-} +- +-static void +-bdev_qos_channel_destroy(void *cb_arg) +-{ +- struct spdk_bdev_qos *qos = cb_arg; +- +- spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); +- spdk_poller_unregister(&qos->poller); +- +- SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); +- +- free(qos); +-} +- +-static int +-bdev_qos_destroy(struct spdk_bdev *bdev) +-{ +- int i; +- +- /* +- * Cleanly shutting down the QoS poller is tricky, because +- * during the asynchronous operation the user could open +- * a new descriptor and create a new channel, spawning +- * a new QoS poller. +- * +- * The strategy is to create a new QoS structure here and swap it +- * in. The shutdown path then continues to refer to the old one +- * until it completes and then releases it. +- */ +- struct spdk_bdev_qos *new_qos, *old_qos; +- +- old_qos = bdev->internal.qos; +- +- new_qos = calloc(1, sizeof(*new_qos)); +- if (!new_qos) { +- SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); +- return -ENOMEM; +- } +- +- /* Copy the old QoS data into the newly allocated structure */ +- memcpy(new_qos, old_qos, sizeof(*new_qos)); +- +- /* Zero out the key parts of the QoS structure */ +- new_qos->ch = NULL; +- new_qos->thread = NULL; +- new_qos->poller = NULL; +- TAILQ_INIT(&new_qos->queued); +- /* +- * The limit member of spdk_bdev_qos_limit structure is not zeroed. +- * It will be used later for the new QoS structure. +- */ +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- new_qos->rate_limits[i].remaining_this_timeslice = 0; +- new_qos->rate_limits[i].min_per_timeslice = 0; +- new_qos->rate_limits[i].max_per_timeslice = 0; +- } +- +- bdev->internal.qos = new_qos; +- +- if (old_qos->thread == NULL) { +- free(old_qos); +- } else { +- spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); +- } +- +- /* It is safe to continue with destroying the bdev even though the QoS channel hasn't +- * been destroyed yet. 
The destruction path will end up waiting for the final +- * channel to be put before it releases resources. */ +- +- return 0; +-} +- +-void +-spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) +-{ +- total->bytes_read += add->bytes_read; +- total->num_read_ops += add->num_read_ops; +- total->bytes_written += add->bytes_written; +- total->num_write_ops += add->num_write_ops; +- total->bytes_unmapped += add->bytes_unmapped; +- total->num_unmap_ops += add->num_unmap_ops; +- total->bytes_copied += add->bytes_copied; +- total->num_copy_ops += add->num_copy_ops; +- total->read_latency_ticks += add->read_latency_ticks; +- total->write_latency_ticks += add->write_latency_ticks; +- total->unmap_latency_ticks += add->unmap_latency_ticks; +- total->copy_latency_ticks += add->copy_latency_ticks; +- if (total->max_read_latency_ticks < add->max_read_latency_ticks) { +- total->max_read_latency_ticks = add->max_read_latency_ticks; +- } +- if (total->min_read_latency_ticks > add->min_read_latency_ticks) { +- total->min_read_latency_ticks = add->min_read_latency_ticks; +- } +- if (total->max_write_latency_ticks < add->max_write_latency_ticks) { +- total->max_write_latency_ticks = add->max_write_latency_ticks; +- } +- if (total->min_write_latency_ticks > add->min_write_latency_ticks) { +- total->min_write_latency_ticks = add->min_write_latency_ticks; +- } +- if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { +- total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; +- } +- if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { +- total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; +- } +- if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { +- total->max_copy_latency_ticks = add->max_copy_latency_ticks; +- } +- if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { +- total->min_copy_latency_ticks = add->min_copy_latency_ticks; +- } +-} +- +-static void +-bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) +-{ +- memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); +- +- if (to_stat->io_error != NULL && from_stat->io_error != NULL) { +- memcpy(to_stat->io_error, from_stat->io_error, +- sizeof(struct spdk_bdev_io_error_stat)); +- } +-} +- +-void +-spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) +-{ +- stat->max_read_latency_ticks = 0; +- stat->min_read_latency_ticks = UINT64_MAX; +- stat->max_write_latency_ticks = 0; +- stat->min_write_latency_ticks = UINT64_MAX; +- stat->max_unmap_latency_ticks = 0; +- stat->min_unmap_latency_ticks = UINT64_MAX; +- stat->max_copy_latency_ticks = 0; +- stat->min_copy_latency_ticks = UINT64_MAX; +- +- if (mode != SPDK_BDEV_RESET_STAT_ALL) { +- return; +- } +- +- stat->bytes_read = 0; +- stat->num_read_ops = 0; +- stat->bytes_written = 0; +- stat->num_write_ops = 0; +- stat->bytes_unmapped = 0; +- stat->num_unmap_ops = 0; +- stat->bytes_copied = 0; +- stat->num_copy_ops = 0; +- stat->read_latency_ticks = 0; +- stat->write_latency_ticks = 0; +- stat->unmap_latency_ticks = 0; +- stat->copy_latency_ticks = 0; +- +- if (stat->io_error != NULL) { +- memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); +- } +-} +- +-struct spdk_bdev_io_stat * +-bdev_alloc_io_stat(bool io_error_stat) +-{ +- struct spdk_bdev_io_stat *stat; +- +- stat = malloc(sizeof(struct spdk_bdev_io_stat)); +- if (stat == NULL) { +- return NULL; +- } +- +- if (io_error_stat) { +- 
stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); +- if (stat->io_error == NULL) { +- free(stat); +- return NULL; +- } +- } else { +- stat->io_error = NULL; +- } +- +- spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); +- +- return stat; +-} +- +-void +-bdev_free_io_stat(struct spdk_bdev_io_stat *stat) +-{ +- if (stat != NULL) { +- free(stat->io_error); +- free(stat); +- } +-} +- +-void +-spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) +-{ +- int i; +- +- spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); +- spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); +- spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); +- spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); +- spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); +- spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); +- spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); +- spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); +- spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); +- spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); +- spdk_json_write_named_uint64(w, "min_read_latency_ticks", +- stat->min_read_latency_ticks != UINT64_MAX ? +- stat->min_read_latency_ticks : 0); +- spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); +- spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); +- spdk_json_write_named_uint64(w, "min_write_latency_ticks", +- stat->min_write_latency_ticks != UINT64_MAX ? +- stat->min_write_latency_ticks : 0); +- spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); +- spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); +- spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", +- stat->min_unmap_latency_ticks != UINT64_MAX ? +- stat->min_unmap_latency_ticks : 0); +- spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); +- spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); +- spdk_json_write_named_uint64(w, "min_copy_latency_ticks", +- stat->min_copy_latency_ticks != UINT64_MAX ? 
+- stat->min_copy_latency_ticks : 0); +- +- if (stat->io_error != NULL) { +- spdk_json_write_named_object_begin(w, "io_error"); +- for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { +- if (stat->io_error->error_status[i] != 0) { +- spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), +- stat->io_error->error_status[i]); +- } +- } +- spdk_json_write_object_end(w); +- } +-} +- +-static void +-bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) +-{ +- struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; +- struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; +- +- bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); +- bdev_abort_all_buf_io(mgmt_ch, ch); +- bdev_abort_all_buf_io(mgmt_ch, ch); +-} +- +-static void +-bdev_channel_destroy(void *io_device, void *ctx_buf) +-{ +- struct spdk_bdev_channel *ch = ctx_buf; +- +- SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, +- spdk_get_thread()); +- +- spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, +- spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); +- +- /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ +- spdk_spin_lock(&ch->bdev->internal.spinlock); +- spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); +- spdk_spin_unlock(&ch->bdev->internal.spinlock); +- +- bdev_abort_all_queued_io(&ch->queued_resets, ch); +- +- bdev_channel_abort_queued_ios(ch); +- +- if (ch->histogram) { +- spdk_histogram_data_free(ch->histogram); +- } +- +- bdev_channel_destroy_resource(ch); +-} +- +-/* +- * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer +- * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
+- */ +-static int +-bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) +-{ +- struct spdk_bdev_name *tmp; +- +- bdev_name->name = strdup(name); +- if (bdev_name->name == NULL) { +- SPDK_ERRLOG("Unable to allocate bdev name\n"); +- return -ENOMEM; +- } +- +- bdev_name->bdev = bdev; +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- if (tmp != NULL) { +- SPDK_ERRLOG("Bdev name %s already exists\n", name); +- free(bdev_name->name); +- return -EEXIST; +- } +- +- return 0; +-} +- +-static void +-bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) +-{ +- RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); +- free(bdev_name->name); +-} +- +-static void +-bdev_name_del(struct spdk_bdev_name *bdev_name) +-{ +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- bdev_name_del_unsafe(bdev_name); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +-} +- +-int +-spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) +-{ +- struct spdk_bdev_alias *tmp; +- int ret; +- +- if (alias == NULL) { +- SPDK_ERRLOG("Empty alias passed\n"); +- return -EINVAL; +- } +- +- tmp = calloc(1, sizeof(*tmp)); +- if (tmp == NULL) { +- SPDK_ERRLOG("Unable to allocate alias\n"); +- return -ENOMEM; +- } +- +- ret = bdev_name_add(&tmp->alias, bdev, alias); +- if (ret != 0) { +- free(tmp); +- return ret; +- } +- +- TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); +- +- return 0; +-} +- +-static int +-bdev_alias_del(struct spdk_bdev *bdev, const char *alias, +- void (*alias_del_fn)(struct spdk_bdev_name *n)) +-{ +- struct spdk_bdev_alias *tmp; +- +- TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { +- if (strcmp(alias, tmp->alias.name) == 0) { +- TAILQ_REMOVE(&bdev->aliases, tmp, tailq); +- alias_del_fn(&tmp->alias); +- free(tmp); +- return 0; +- } +- } +- +- return -ENOENT; +-} +- +-int +-spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) +-{ +- int rc; +- +- rc = bdev_alias_del(bdev, alias, bdev_name_del); +- if (rc == -ENOENT) { +- SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); +- } +- +- return rc; +-} +- +-void +-spdk_bdev_alias_del_all(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_alias *p, *tmp; +- +- TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { +- TAILQ_REMOVE(&bdev->aliases, p, tailq); +- bdev_name_del(&p->alias); +- free(p); +- } +-} +- +-struct spdk_io_channel * +-spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) +-{ +- return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); +-} +- +-void * +-spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- void *ctx = NULL; +- +- if (bdev->fn_table->get_module_ctx) { +- ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); +- } +- +- return ctx; +-} +- +-const char * +-spdk_bdev_get_module_name(const struct spdk_bdev *bdev) +-{ +- return bdev->module->name; +-} +- +-const char * +-spdk_bdev_get_name(const struct spdk_bdev *bdev) +-{ +- return bdev->name; +-} +- +-const char * +-spdk_bdev_get_product_name(const struct spdk_bdev *bdev) +-{ +- return bdev->product_name; +-} +- +-const struct spdk_bdev_aliases_list * +-spdk_bdev_get_aliases(const struct spdk_bdev *bdev) +-{ +- return &bdev->aliases; +-} +- +-uint32_t +-spdk_bdev_get_block_size(const struct spdk_bdev *bdev) +-{ +- return bdev->blocklen; +-} +- +-uint32_t +-spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) +-{ +- return bdev->write_unit_size; +-} +- 
+-uint64_t +-spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) +-{ +- return bdev->blockcnt; +-} +- +-const char * +-spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) +-{ +- return qos_rpc_type[type]; +-} +- +-void +-spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +-{ +- int i; +- +- memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.qos) { +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (bdev->internal.qos->rate_limits[i].limit != +- SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { +- limits[i] = bdev->internal.qos->rate_limits[i].limit; +- if (bdev_qos_is_iops_rate_limit(i) == false) { +- /* Change from Byte to Megabyte which is user visible. */ +- limits[i] = limits[i] / 1024 / 1024; +- } +- } +- } +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +-} +- +-size_t +-spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) +-{ +- return 1 << bdev->required_alignment; +-} +- +-uint32_t +-spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) +-{ +- return bdev->optimal_io_boundary; +-} +- +-bool +-spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) +-{ +- return bdev->write_cache; +-} +- +-const struct spdk_uuid * +-spdk_bdev_get_uuid(const struct spdk_bdev *bdev) +-{ +- return &bdev->uuid; +-} +- +-uint16_t +-spdk_bdev_get_acwu(const struct spdk_bdev *bdev) +-{ +- return bdev->acwu; +-} +- +-uint32_t +-spdk_bdev_get_md_size(const struct spdk_bdev *bdev) +-{ +- return bdev->md_len; +-} +- +-bool +-spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) +-{ +- return (bdev->md_len != 0) && bdev->md_interleave; +-} +- +-bool +-spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) +-{ +- return (bdev->md_len != 0) && !bdev->md_interleave; +-} +- +-bool +-spdk_bdev_is_zoned(const struct spdk_bdev *bdev) +-{ +- return bdev->zoned; +-} +- +-uint32_t +-spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) +-{ +- if (spdk_bdev_is_md_interleaved(bdev)) { +- return bdev->blocklen - bdev->md_len; +- } else { +- return bdev->blocklen; +- } +-} +- +-uint32_t +-spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) +-{ +- return bdev->phys_blocklen; +-} +- +-static uint32_t +-_bdev_get_block_size_with_md(const struct spdk_bdev *bdev) +-{ +- if (!spdk_bdev_is_md_interleaved(bdev)) { +- return bdev->blocklen + bdev->md_len; +- } else { +- return bdev->blocklen; +- } +-} +- +-/* We have to use the typedef in the function declaration to appease astyle. 
*/ +-typedef enum spdk_dif_type spdk_dif_type_t; +- +-spdk_dif_type_t +-spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) +-{ +- if (bdev->md_len != 0) { +- return bdev->dif_type; +- } else { +- return SPDK_DIF_DISABLE; +- } +-} +- +-bool +-spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) +-{ +- if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { +- return bdev->dif_is_head_of_md; +- } else { +- return false; +- } +-} +- +-bool +-spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, +- enum spdk_dif_check_type check_type) +-{ +- if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { +- return false; +- } +- +- switch (check_type) { +- case SPDK_DIF_CHECK_TYPE_REFTAG: +- return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; +- case SPDK_DIF_CHECK_TYPE_APPTAG: +- return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; +- case SPDK_DIF_CHECK_TYPE_GUARD: +- return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; +- default: +- return false; +- } +-} +- +-uint32_t +-spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) +-{ +- return bdev->max_copy; +-} +- +-uint64_t +-spdk_bdev_get_qd(const struct spdk_bdev *bdev) +-{ +- return bdev->internal.measured_queue_depth; +-} +- +-uint64_t +-spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) +-{ +- return bdev->internal.period; +-} +- +-uint64_t +-spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) +-{ +- return bdev->internal.weighted_io_time; +-} +- +-uint64_t +-spdk_bdev_get_io_time(const struct spdk_bdev *bdev) +-{ +- return bdev->internal.io_time; +-} +- +-static void bdev_update_qd_sampling_period(void *ctx); +- +-static void +-_calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; +- +- if (bdev->internal.measured_queue_depth) { +- bdev->internal.io_time += bdev->internal.period; +- bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; +- } +- +- bdev->internal.qd_poll_in_progress = false; +- +- bdev_update_qd_sampling_period(bdev); +-} +- +-static void +-_calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); +- +- bdev->internal.temporary_queue_depth += ch->io_outstanding; +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static int +-bdev_calculate_measured_queue_depth(void *ctx) +-{ +- struct spdk_bdev *bdev = ctx; +- +- bdev->internal.qd_poll_in_progress = true; +- bdev->internal.temporary_queue_depth = 0; +- spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_update_qd_sampling_period(void *ctx) +-{ +- struct spdk_bdev *bdev = ctx; +- +- if (bdev->internal.period == bdev->internal.new_period) { +- return; +- } +- +- if (bdev->internal.qd_poll_in_progress) { +- return; +- } +- +- bdev->internal.period = bdev->internal.new_period; +- +- spdk_poller_unregister(&bdev->internal.qd_poller); +- if (bdev->internal.period != 0) { +- bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, +- bdev, bdev->internal.period); +- } else { +- spdk_bdev_close(bdev->internal.qd_desc); +- bdev->internal.qd_desc = NULL; +- } +-} +- +-static void +-_tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +- SPDK_NOTICELOG("Unexpected event type: 
%d\n", type); +-} +- +-void +-spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) +-{ +- int rc; +- +- if (bdev->internal.new_period == period) { +- return; +- } +- +- bdev->internal.new_period = period; +- +- if (bdev->internal.qd_desc != NULL) { +- assert(bdev->internal.period != 0); +- +- spdk_thread_send_msg(bdev->internal.qd_desc->thread, +- bdev_update_qd_sampling_period, bdev); +- return; +- } +- +- assert(bdev->internal.period == 0); +- +- rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, +- NULL, &bdev->internal.qd_desc); +- if (rc != 0) { +- return; +- } +- +- bdev->internal.period = period; +- bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, +- bdev, period); +-} +- +-struct bdev_get_current_qd_ctx { +- uint64_t current_qd; +- spdk_bdev_get_current_qd_cb cb_fn; +- void *cb_arg; +-}; +- +-static void +-bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct bdev_get_current_qd_ctx *ctx = _ctx; +- +- ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); +- +- free(ctx); +-} +- +-static void +-bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct bdev_get_current_qd_ctx *ctx = _ctx; +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); +- +- ctx->current_qd += bdev_ch->io_outstanding; +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-void +-spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, +- void *cb_arg) +-{ +- struct bdev_get_current_qd_ctx *ctx; +- +- assert(cb_fn != NULL); +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- cb_fn(bdev, 0, cb_arg, -ENOMEM); +- return; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); +-} +- +-static void +-_resize_notify(void *arg) +-{ +- struct spdk_bdev_desc *desc = arg; +- +- spdk_spin_lock(&desc->spinlock); +- desc->refs--; +- if (!desc->closed) { +- spdk_spin_unlock(&desc->spinlock); +- desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, +- desc->bdev, +- desc->callback.ctx); +- return; +- } else if (0 == desc->refs) { +- /* This descriptor was closed after this resize_notify message was sent. +- * spdk_bdev_close() could not free the descriptor since this message was +- * in flight, so we free it now using bdev_desc_free(). +- */ +- spdk_spin_unlock(&desc->spinlock); +- bdev_desc_free(desc); +- return; +- } +- spdk_spin_unlock(&desc->spinlock); +-} +- +-int +-spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) +-{ +- struct spdk_bdev_desc *desc; +- int ret; +- +- if (size == bdev->blockcnt) { +- return 0; +- } +- +- spdk_spin_lock(&bdev->internal.spinlock); +- +- /* bdev has open descriptors */ +- if (!TAILQ_EMPTY(&bdev->internal.open_descs) && +- bdev->blockcnt > size) { +- ret = -EBUSY; +- } else { +- bdev->blockcnt = size; +- TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { +- spdk_spin_lock(&desc->spinlock); +- if (!desc->closed) { +- desc->refs++; +- spdk_thread_send_msg(desc->thread, _resize_notify, desc); +- } +- spdk_spin_unlock(&desc->spinlock); +- } +- ret = 0; +- } +- +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- return ret; +-} +- +-/* +- * Convert I/O offset and length from bytes to blocks. +- * +- * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 
+- */ +-static uint64_t +-bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, +- uint64_t num_bytes, uint64_t *num_blocks) +-{ +- uint32_t block_size = bdev->blocklen; +- uint8_t shift_cnt; +- +- /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ +- if (spdk_likely(spdk_u32_is_pow2(block_size))) { +- shift_cnt = spdk_u32log2(block_size); +- *offset_blocks = offset_bytes >> shift_cnt; +- *num_blocks = num_bytes >> shift_cnt; +- return (offset_bytes - (*offset_blocks << shift_cnt)) | +- (num_bytes - (*num_blocks << shift_cnt)); +- } else { +- *offset_blocks = offset_bytes / block_size; +- *num_blocks = num_bytes / block_size; +- return (offset_bytes % block_size) | (num_bytes % block_size); +- } +-} +- +-static bool +-bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) +-{ +- /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there +- * has been an overflow and hence the offset has been wrapped around */ +- if (offset_blocks + num_blocks < offset_blocks) { +- return false; +- } +- +- /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ +- if (offset_blocks + num_blocks > bdev->blockcnt) { +- return false; +- } +- +- return true; +-} +- +-static void +-bdev_seek_complete_cb(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); +-} +- +-static int +-bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, enum spdk_bdev_io_type io_type, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); +- +- /* Check if offset_blocks is valid looking at the validity of one block */ +- if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = io_type; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- if (!spdk_bdev_io_type_supported(bdev, io_type)) { +- /* In case bdev doesn't support seek to next data/hole offset, +- * it is assumed that only data and no holes are present */ +- if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { +- bdev_io->u.bdev.seek.offset = offset_blocks; +- } else { +- bdev_io->u.bdev.seek.offset = UINT64_MAX; +- } +- +- spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); +- return 0; +- } +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); +-} +- +-int +-spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); +-} +- +-uint64_t +-spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) +-{ +- return 
bdev_io->u.bdev.seek.offset; +-} +- +-static int +-bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, +- void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_READ; +- bdev_io->u.bdev.iovs = &bdev_io->iov; +- bdev_io->u.bdev.iovs[0].iov_base = buf; +- bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; +- bdev_io->u.bdev.iovcnt = 1; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.ext_opts = NULL; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, uint64_t offset, uint64_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- nbytes, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct iovec iov = { +- .iov_base = buf, +- }; +- +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(&iov)) { +- return -EINVAL; +- } +- +- return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, +- cb, cb_arg); +-} +- +-int +-spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset, uint64_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- nbytes, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-static int +-bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, +- uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts, bool copy_opts) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if 
(!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_READ; +- bdev_io->u.bdev.iovs = iov; +- bdev_io->u.bdev.iovcnt = iovcnt; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->internal.ext_opts = opts; +- bdev_io->u.bdev.ext_opts = opts; +- +- _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); +- +- return 0; +-} +- +-int +-spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, +- num_blocks, cb, cb_arg, NULL, false); +-} +- +-int +-spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(iov)) { +- return -EINVAL; +- } +- +- return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, +- num_blocks, cb, cb_arg, NULL, false); +-} +- +-static inline bool +-_bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) +-{ +- /* +- * We check if opts size is at least of size when we first introduced +- * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members +- * are not checked internal. +- */ +- return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + +- sizeof(opts->metadata) && +- opts->size <= sizeof(*opts) && +- /* When memory domain is used, the user must provide data buffers */ +- (!opts->memory_domain || (iov && iov[0].iov_base)); +-} +- +-int +-spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts) +-{ +- void *md = NULL; +- +- if (opts) { +- if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { +- return -EINVAL; +- } +- md = opts->metadata; +- } +- +- if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md && !_is_buf_allocated(iov)) { +- return -EINVAL; +- } +- +- return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, +- num_blocks, cb, cb_arg, opts, false); +-} +- +-static int +-bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; +- bdev_io->u.bdev.iovs = &bdev_io->iov; +- bdev_io->u.bdev.iovs[0].iov_base = buf; +- 
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; +- bdev_io->u.bdev.iovcnt = 1; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.ext_opts = NULL; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, uint64_t offset, uint64_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- nbytes, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, +- cb, cb_arg); +-} +- +-int +-spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct iovec iov = { +- .iov_base = buf, +- }; +- +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(&iov)) { +- return -EINVAL; +- } +- +- return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, +- cb, cb_arg); +-} +- +-static int +-bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts, bool copy_opts) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; +- bdev_io->u.bdev.iovs = iov; +- bdev_io->u.bdev.iovcnt = iovcnt; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->internal.ext_opts = opts; +- bdev_io->u.bdev.ext_opts = opts; +- +- _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); +- +- return 0; +-} +- +-int +-spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset, uint64_t len, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- len, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- 
spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, +- num_blocks, cb, cb_arg, NULL, false); +-} +- +-int +-spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(iov)) { +- return -EINVAL; +- } +- +- return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, +- num_blocks, cb, cb_arg, NULL, false); +-} +- +-int +-spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg, +- struct spdk_bdev_ext_io_opts *opts) +-{ +- void *md = NULL; +- +- if (opts) { +- if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { +- return -EINVAL; +- } +- md = opts->metadata; +- } +- +- if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md && !_is_buf_allocated(iov)) { +- return -EINVAL; +- } +- +- return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, +- num_blocks, cb, cb_arg, opts, false); +-} +- +-static void +-bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *parent_io = cb_arg; +- struct spdk_bdev *bdev = parent_io->bdev; +- uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; +- int i, rc = 0; +- +- if (!success) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); +- spdk_bdev_free_io(bdev_io); +- return; +- } +- +- for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { +- rc = memcmp(read_buf, +- parent_io->u.bdev.iovs[i].iov_base, +- parent_io->u.bdev.iovs[i].iov_len); +- if (rc) { +- break; +- } +- read_buf += parent_io->u.bdev.iovs[i].iov_len; +- } +- +- if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { +- rc = memcmp(bdev_io->u.bdev.md_buf, +- parent_io->u.bdev.md_buf, +- spdk_bdev_get_md_size(bdev)); +- } +- +- spdk_bdev_free_io(bdev_io); +- +- if (rc == 0) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); +- } else { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; +- parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); +- } +-} +- +-static void +-bdev_compare_do_read(void *_bdev_io) +-{ +- struct spdk_bdev_io *bdev_io = _bdev_io; +- int rc; +- +- rc = spdk_bdev_read_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, +- bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, +- bdev_compare_do_read_done, bdev_io); +- +- if (rc == -ENOMEM) { +- bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); +- } else if (rc != 0) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- } +-} +- +-static int +-bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev 
*bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; +- bdev_io->u.bdev.iovs = iov; +- bdev_io->u.bdev.iovcnt = iovcnt; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { +- bdev_io_submit(bdev_io); +- return 0; +- } +- +- bdev_compare_do_read(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, +- num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, void *md_buf, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(iov)) { +- return -EINVAL; +- } +- +- return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, +- num_blocks, cb, cb_arg); +-} +- +-static int +-bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; +- bdev_io->u.bdev.iovs = &bdev_io->iov; +- bdev_io->u.bdev.iovs[0].iov_base = buf; +- bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; +- bdev_io->u.bdev.iovcnt = 1; +- bdev_io->u.bdev.md_buf = md_buf; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { +- bdev_io_submit(bdev_io); +- return 0; +- } +- +- bdev_compare_do_read(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, +- cb, cb_arg); +-} +- +-int +-spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- 
struct iovec iov = { +- .iov_base = buf, +- }; +- +- if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { +- return -EINVAL; +- } +- +- if (md_buf && !_is_buf_allocated(&iov)) { +- return -EINVAL; +- } +- +- return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, +- cb, cb_arg); +-} +- +-static void +-bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- if (unlock_status) { +- SPDK_ERRLOG("LBA range unlock failed\n"); +- } +- +- bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : +- false, bdev_io->internal.caller_ctx); +-} +- +-static void +-bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) +-{ +- bdev_io->internal.status = status; +- +- bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), +- bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, +- bdev_comparev_and_writev_blocks_unlocked, bdev_io); +-} +- +-static void +-bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *parent_io = cb_arg; +- +- if (!success) { +- SPDK_ERRLOG("Compare and write operation failed\n"); +- } +- +- spdk_bdev_free_io(bdev_io); +- +- bdev_comparev_and_writev_blocks_unlock(parent_io, +- success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void +-bdev_compare_and_write_do_write(void *_bdev_io) +-{ +- struct spdk_bdev_io *bdev_io = _bdev_io; +- int rc; +- +- rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, +- bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, +- bdev_compare_and_write_do_write_done, bdev_io); +- +- +- if (rc == -ENOMEM) { +- bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); +- } else if (rc != 0) { +- bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *parent_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); +- return; +- } +- +- bdev_compare_and_write_do_write(parent_io); +-} +- +-static void +-bdev_compare_and_write_do_compare(void *_bdev_io) +-{ +- struct spdk_bdev_io *bdev_io = _bdev_io; +- int rc; +- +- rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, +- bdev_compare_and_write_do_compare_done, bdev_io); +- +- if (rc == -ENOMEM) { +- bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); +- } else if (rc != 0) { +- bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); +- } +-} +- +-static void +-bdev_comparev_and_writev_blocks_locked(void *ctx, int status) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- if (status) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- return; +- } +- +- bdev_compare_and_write_do_compare(bdev_io); +-} +- +-int +-spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel 
*ch, +- struct iovec *compare_iov, int compare_iovcnt, +- struct iovec *write_iov, int write_iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- if (num_blocks > bdev->acwu) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; +- bdev_io->u.bdev.iovs = compare_iov; +- bdev_io->u.bdev.iovcnt = compare_iovcnt; +- bdev_io->u.bdev.fused_iovs = write_iov; +- bdev_io->u.bdev.fused_iovcnt = write_iovcnt; +- bdev_io->u.bdev.md_buf = NULL; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { +- bdev_io_submit(bdev_io); +- return 0; +- } +- +- return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, +- bdev_comparev_and_writev_blocks_locked, bdev_io); +-} +- +-int +-spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, uint64_t num_blocks, +- bool populate, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.iovs = iov; +- bdev_io->u.bdev.iovcnt = iovcnt; +- bdev_io->u.bdev.md_buf = NULL; +- bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; +- bdev_io->u.bdev.zcopy.commit = 0; +- bdev_io->u.bdev.zcopy.start = 1; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- bdev_io_submit(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { +- return -EINVAL; +- } +- +- bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; +- bdev_io->u.bdev.zcopy.start = 0; +- bdev_io->internal.caller_ctx = cb_arg; +- bdev_io->internal.cb = cb; +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; +- +- bdev_io_submit(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset, uint64_t len, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- len, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && +- !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { +- bdev_io_submit(bdev_io); +- return 0; +- } +- +- assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); +- assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); +- bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; +- bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; +- bdev_write_zero_buffer_next(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset, uint64_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- nbytes, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- if (num_blocks == 0) { +- SPDK_ERRLOG("Can't unmap 0 bytes\n"); +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; +- +- bdev_io->u.bdev.iovs = &bdev_io->iov; +- bdev_io->u.bdev.iovs[0].iov_base = NULL; +- bdev_io->u.bdev.iovs[0].iov_len = 0; +- bdev_io->u.bdev.iovcnt = 1; +- 
+- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- bdev_io->u.bdev.ext_opts = NULL; +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset, uint64_t length, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- uint64_t offset_blocks, num_blocks; +- +- if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, +- length, &num_blocks) != 0) { +- return -EINVAL; +- } +- +- return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +-} +- +-int +-spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; +- bdev_io->u.bdev.iovs = NULL; +- bdev_io->u.bdev.iovcnt = 0; +- bdev_io->u.bdev.offset_blocks = offset_blocks; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-static int bdev_reset_poll_for_outstanding_io(void *ctx); +- +-static void +-bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_channel *ch = _ctx; +- struct spdk_bdev_io *bdev_io; +- +- bdev_io = TAILQ_FIRST(&ch->queued_resets); +- +- if (status == -EBUSY) { +- if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { +- bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, +- ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); +- } else { +- /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, +- * start the reset. */ +- TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); +- bdev_io_submit_reset(bdev_io); +- } +- } else { +- TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); +- SPDK_DEBUGLOG(bdev, +- "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", +- ch->bdev->name); +- /* Mark the completion status as a SUCCESS and complete the reset. */ +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } +-} +- +-static void +-bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); +- int status = 0; +- +- if (cur_ch->io_outstanding > 0) { +- /* If a channel has outstanding IO, set status to -EBUSY code. This will stop +- * further iteration over the rest of the channels and pass non-zero status +- * to the callback function. 
*/ +- status = -EBUSY; +- } +- spdk_bdev_for_each_channel_continue(i, status); +-} +- +-static int +-bdev_reset_poll_for_outstanding_io(void *ctx) +-{ +- struct spdk_bdev_channel *ch = ctx; +- struct spdk_bdev_io *bdev_io; +- +- bdev_io = TAILQ_FIRST(&ch->queued_resets); +- +- spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); +- spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, +- bdev_reset_check_outstanding_io_done); +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_channel *ch = _ctx; +- struct spdk_bdev_io *bdev_io; +- +- bdev_io = TAILQ_FIRST(&ch->queued_resets); +- +- if (bdev->reset_io_drain_timeout == 0) { +- TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); +- +- bdev_io_submit_reset(bdev_io); +- return; +- } +- +- bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + +- (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); +- +- /* In case bdev->reset_io_drain_timeout is not equal to zero, +- * submit the reset to the underlying module only if outstanding I/O +- * remain after reset_io_drain_timeout seconds have passed. */ +- spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, +- bdev_reset_check_outstanding_io_done); +-} +- +-static void +-bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx) +-{ +- struct spdk_bdev_channel *channel; +- struct spdk_bdev_mgmt_channel *mgmt_channel; +- struct spdk_bdev_shared_resource *shared_resource; +- bdev_io_tailq_t tmp_queued; +- +- TAILQ_INIT(&tmp_queued); +- +- channel = __io_ch_to_bdev_ch(ch); +- shared_resource = channel->shared_resource; +- mgmt_channel = shared_resource->mgmt_ch; +- +- channel->flags |= BDEV_CH_RESET_IN_PROGRESS; +- +- if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { +- /* The QoS object is always valid and readable while +- * the channel flag is set, so the lock here should not +- * be necessary. We're not in the fast path though, so +- * just take it anyway. */ +- spdk_spin_lock(&channel->bdev->internal.spinlock); +- if (channel->bdev->internal.qos->ch == channel) { +- TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); +- } +- spdk_spin_unlock(&channel->bdev->internal.spinlock); +- } +- +- bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); +- bdev_abort_all_buf_io(mgmt_channel, channel); +- bdev_abort_all_buf_io(mgmt_channel, channel); +- bdev_abort_all_queued_io(&tmp_queued, channel); +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_start_reset(void *ctx) +-{ +- struct spdk_bdev_channel *ch = ctx; +- +- spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, +- bdev_reset_freeze_channel_done); +-} +- +-static void +-bdev_channel_start_reset(struct spdk_bdev_channel *ch) +-{ +- struct spdk_bdev *bdev = ch->bdev; +- +- assert(!TAILQ_EMPTY(&ch->queued_resets)); +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.reset_in_progress == NULL) { +- bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); +- /* +- * Take a channel reference for the target bdev for the life of this +- * reset. This guards against the channel getting destroyed while +- * spdk_bdev_for_each_channel() calls related to this reset IO are in +- * progress. We will release the reference when this reset is +- * completed. 
+- */ +- bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); +- bdev_start_reset(ch); +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +-} +- +-int +-spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->internal.submit_tsc = spdk_get_ticks(); +- bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; +- bdev_io->u.reset.ch_ref = NULL; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- spdk_spin_lock(&bdev->internal.spinlock); +- TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, +- internal.ch_link); +- +- bdev_channel_start_reset(channel); +- +- return 0; +-} +- +-void +-spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, +- struct spdk_bdev_io_stat *stat) +-{ +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- bdev_get_io_stat(stat, channel->stat); +-} +- +-static void +-bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; +- +- bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, +- bdev_iostat_ctx->cb_arg, 0); +- free(bdev_iostat_ctx); +-} +- +-static void +-bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx) +-{ +- struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-void +-spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, +- spdk_bdev_get_device_stat_cb cb, void *cb_arg) +-{ +- struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; +- +- assert(bdev != NULL); +- assert(stat != NULL); +- assert(cb != NULL); +- +- bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); +- if (bdev_iostat_ctx == NULL) { +- SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); +- cb(bdev, stat, cb_arg, -ENOMEM); +- return; +- } +- +- bdev_iostat_ctx->stat = stat; +- bdev_iostat_ctx->cb = cb; +- bdev_iostat_ctx->cb_arg = cb_arg; +- +- /* Start with the statistics from previously deleted channels. */ +- spdk_spin_lock(&bdev->internal.spinlock); +- bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- /* Then iterate and add the statistics from each existing channel. 
*/ +- spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, +- bdev_get_device_stat_done); +-} +- +-struct bdev_iostat_reset_ctx { +- enum spdk_bdev_reset_stat_mode mode; +- bdev_reset_device_stat_cb cb; +- void *cb_arg; +-}; +- +-static void +-bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct bdev_iostat_reset_ctx *ctx = _ctx; +- +- ctx->cb(bdev, ctx->cb_arg, 0); +- +- free(ctx); +-} +- +-static void +-bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx) +-{ +- struct bdev_iostat_reset_ctx *ctx = _ctx; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- spdk_bdev_reset_io_stat(channel->stat, ctx->mode); +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-void +-bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, +- bdev_reset_device_stat_cb cb, void *cb_arg) +-{ +- struct bdev_iostat_reset_ctx *ctx; +- +- assert(bdev != NULL); +- assert(cb != NULL); +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); +- cb(bdev, cb_arg, -ENOMEM); +- return; +- } +- +- ctx->mode = mode; +- ctx->cb = cb; +- ctx->cb_arg = cb_arg; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- spdk_bdev_reset_io_stat(bdev->internal.stat, mode); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- spdk_bdev_for_each_channel(bdev, +- bdev_reset_each_channel_stat, +- ctx, +- bdev_reset_device_stat_done); +-} +- +-int +-spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; +- bdev_io->u.nvme_passthru.cmd = *cmd; +- bdev_io->u.nvme_passthru.buf = buf; +- bdev_io->u.nvme_passthru.nbytes = nbytes; +- bdev_io->u.nvme_passthru.md_buf = NULL; +- bdev_io->u.nvme_passthru.md_len = 0; +- +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- /* +- * Do not try to parse the NVMe command - we could maybe use bits in the opcode +- * to easily determine if the command is a read or write, but for now just +- * do not allow io_passthru with a read-only descriptor. 
+- */ +- return -EBADF; +- } +- +- if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; +- bdev_io->u.nvme_passthru.cmd = *cmd; +- bdev_io->u.nvme_passthru.buf = buf; +- bdev_io->u.nvme_passthru.nbytes = nbytes; +- bdev_io->u.nvme_passthru.md_buf = NULL; +- bdev_io->u.nvme_passthru.md_len = 0; +- +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-int +-spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- +- if (!desc->write) { +- /* +- * Do not try to parse the NVMe command - we could maybe use bits in the opcode +- * to easily determine if the command is a read or write, but for now just +- * do not allow io_passthru with a read-only descriptor. +- */ +- return -EBADF; +- } +- +- if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; +- bdev_io->u.nvme_passthru.cmd = *cmd; +- bdev_io->u.nvme_passthru.buf = buf; +- bdev_io->u.nvme_passthru.nbytes = nbytes; +- bdev_io->u.nvme_passthru.md_buf = md_buf; +- bdev_io->u.nvme_passthru.md_len = md_len; +- +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-static void bdev_abort_retry(void *ctx); +-static void bdev_abort(struct spdk_bdev_io *parent_io); +- +-static void +-bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_channel *channel = bdev_io->internal.ch; +- struct spdk_bdev_io *parent_io = cb_arg; +- struct spdk_bdev_io *bio_to_abort, *tmp_io; +- +- bio_to_abort = bdev_io->u.abort.bio_to_abort; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- /* Check if the target I/O completed in the meantime. */ +- TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { +- if (tmp_io == bio_to_abort) { +- break; +- } +- } +- +- /* If the target I/O still exists, set the parent to failed. */ +- if (tmp_io != NULL) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- } +- +- parent_io->u.bdev.split_outstanding--; +- if (parent_io->u.bdev.split_outstanding == 0) { +- if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { +- bdev_abort_retry(parent_io); +- } else { +- bdev_io_complete(parent_io); +- } +- } +-} +- +-static int +-bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, +- struct spdk_bdev_io *bio_to_abort, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- +- if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || +- bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { +- /* TODO: Abort reset or abort request. 
*/ +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (bdev_io == NULL) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { +- bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; +- +- /* Parent abort request is not submitted directly, but to manage its +- * execution add it to the submitted list here. +- */ +- bdev_io->internal.submit_tsc = spdk_get_ticks(); +- TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); +- +- bdev_abort(bdev_io); +- +- return 0; +- } +- +- bdev_io->u.abort.bio_to_abort = bio_to_abort; +- +- /* Submit the abort request to the underlying bdev module. */ +- bdev_io_submit(bdev_io); +- +- return 0; +-} +- +-static uint32_t +-_bdev_abort(struct spdk_bdev_io *parent_io) +-{ +- struct spdk_bdev_desc *desc = parent_io->internal.desc; +- struct spdk_bdev_channel *channel = parent_io->internal.ch; +- void *bio_cb_arg; +- struct spdk_bdev_io *bio_to_abort; +- uint32_t matched_ios; +- int rc; +- +- bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; +- +- /* matched_ios is returned and will be kept by the caller. +- * +- * This function will be used for two cases, 1) the same cb_arg is used for +- * multiple I/Os, 2) a single large I/O is split into smaller ones. +- * Incrementing split_outstanding directly here may confuse readers especially +- * for the 1st case. +- * +- * Completion of I/O abort is processed after stack unwinding. Hence this trick +- * works as expected. +- */ +- matched_ios = 0; +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { +- if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { +- continue; +- } +- +- if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { +- /* Any I/O which was submitted after this abort command should be excluded. */ +- continue; +- } +- +- rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); +- if (rc != 0) { +- if (rc == -ENOMEM) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; +- } else { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- break; +- } +- matched_ios++; +- } +- +- return matched_ios; +-} +- +-static void +-bdev_abort_retry(void *ctx) +-{ +- struct spdk_bdev_io *parent_io = ctx; +- uint32_t matched_ios; +- +- matched_ios = _bdev_abort(parent_io); +- +- if (matched_ios == 0) { +- if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { +- bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); +- } else { +- /* For retry, the case that no target I/O was found is success +- * because it means target I/Os completed in the meantime. +- */ +- bdev_io_complete(parent_io); +- } +- return; +- } +- +- /* Use split_outstanding to manage the progress of aborting I/Os. */ +- parent_io->u.bdev.split_outstanding = matched_ios; +-} +- +-static void +-bdev_abort(struct spdk_bdev_io *parent_io) +-{ +- uint32_t matched_ios; +- +- matched_ios = _bdev_abort(parent_io); +- +- if (matched_ios == 0) { +- if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { +- bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); +- } else { +- /* The case the no target I/O was found is failure. 
*/ +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- bdev_io_complete(parent_io); +- } +- return; +- } +- +- /* Use split_outstanding to manage the progress of aborting I/Os. */ +- parent_io->u.bdev.split_outstanding = matched_ios; +-} +- +-int +-spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- void *bio_cb_arg, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- struct spdk_bdev_io *bdev_io; +- +- if (bio_cb_arg == NULL) { +- return -EINVAL; +- } +- +- if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { +- return -ENOTSUP; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (bdev_io == NULL) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->internal.submit_tsc = spdk_get_ticks(); +- bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; +- +- /* Parent abort request is not submitted directly, but to manage its execution, +- * add it to the submitted list here. +- */ +- TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); +- +- bdev_abort(bdev_io); +- +- return 0; +-} +- +-int +-spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, +- struct spdk_bdev_io_wait_entry *entry) +-{ +- struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); +- struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; +- +- if (bdev != entry->bdev) { +- SPDK_ERRLOG("bdevs do not match\n"); +- return -EINVAL; +- } +- +- if (mgmt_ch->per_thread_cache_count > 0) { +- SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); +- return -EINVAL; +- } +- +- TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); +- return 0; +-} +- +-static inline void +-bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) +-{ +- enum spdk_bdev_io_status io_status = bdev_io->internal.status; +- struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; +- uint64_t num_blocks = bdev_io->u.bdev.num_blocks; +- uint32_t blocklen = bdev_io->bdev->blocklen; +- +- if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- io_stat->bytes_read += num_blocks * blocklen; +- io_stat->num_read_ops++; +- io_stat->read_latency_ticks += tsc_diff; +- if (io_stat->max_read_latency_ticks < tsc_diff) { +- io_stat->max_read_latency_ticks = tsc_diff; +- } +- if (io_stat->min_read_latency_ticks > tsc_diff) { +- io_stat->min_read_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- io_stat->bytes_written += num_blocks * blocklen; +- io_stat->num_write_ops++; +- io_stat->write_latency_ticks += tsc_diff; +- if (io_stat->max_write_latency_ticks < tsc_diff) { +- io_stat->max_write_latency_ticks = tsc_diff; +- } +- if (io_stat->min_write_latency_ticks > tsc_diff) { +- io_stat->min_write_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- io_stat->bytes_unmapped += num_blocks * blocklen; +- io_stat->num_unmap_ops++; +- io_stat->unmap_latency_ticks += tsc_diff; +- if (io_stat->max_unmap_latency_ticks < tsc_diff) { +- io_stat->max_unmap_latency_ticks = tsc_diff; +- } +- if (io_stat->min_unmap_latency_ticks > tsc_diff) { +- io_stat->min_unmap_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- /* 
Track the data in the start phase only */ +- if (bdev_io->u.bdev.zcopy.start) { +- if (bdev_io->u.bdev.zcopy.populate) { +- io_stat->bytes_read += num_blocks * blocklen; +- io_stat->num_read_ops++; +- io_stat->read_latency_ticks += tsc_diff; +- if (io_stat->max_read_latency_ticks < tsc_diff) { +- io_stat->max_read_latency_ticks = tsc_diff; +- } +- if (io_stat->min_read_latency_ticks > tsc_diff) { +- io_stat->min_read_latency_ticks = tsc_diff; +- } +- } else { +- io_stat->bytes_written += num_blocks * blocklen; +- io_stat->num_write_ops++; +- io_stat->write_latency_ticks += tsc_diff; +- if (io_stat->max_write_latency_ticks < tsc_diff) { +- io_stat->max_write_latency_ticks = tsc_diff; +- } +- if (io_stat->min_write_latency_ticks > tsc_diff) { +- io_stat->min_write_latency_ticks = tsc_diff; +- } +- } +- } +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- io_stat->bytes_copied += num_blocks * blocklen; +- io_stat->num_copy_ops++; +- bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; +- if (io_stat->max_copy_latency_ticks < tsc_diff) { +- io_stat->max_copy_latency_ticks = tsc_diff; +- } +- if (io_stat->min_copy_latency_ticks > tsc_diff) { +- io_stat->min_copy_latency_ticks = tsc_diff; +- } +- break; +- default: +- break; +- } +- } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { +- io_stat = bdev_io->bdev->internal.stat; +- assert(io_stat->io_error != NULL); +- +- spdk_spin_lock(&bdev_io->bdev->internal.spinlock); +- io_stat->io_error->error_status[-io_status - 1]++; +- spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); +- } +- +-#ifdef SPDK_CONFIG_VTUNE +- uint64_t now_tsc = spdk_get_ticks(); +- if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { +- uint64_t data[5]; +- struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; +- +- data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; +- data[1] = io_stat->bytes_read - prev_stat->bytes_read; +- data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; +- data[3] = io_stat->bytes_written - prev_stat->bytes_written; +- data[4] = bdev_io->bdev->fn_table->get_spin_time ? +- bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; +- +- __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, +- __itt_metadata_u64, 5, data); +- +- memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); +- bdev_io->internal.ch->start_tsc = now_tsc; +- } +-#endif +-} +- +-static inline void +-bdev_io_complete(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- uint64_t tsc, tsc_diff; +- +- if (spdk_unlikely(bdev_io->internal.in_submit_request)) { +- /* +- * Defer completion to avoid potential infinite recursion if the +- * user's completion callback issues a new I/O. 
+- */ +- spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), +- bdev_io_complete, bdev_io); +- return; +- } +- +- tsc = spdk_get_ticks(); +- tsc_diff = tsc - bdev_io->internal.submit_tsc; +- spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, +- bdev_io->internal.caller_ctx); +- +- TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); +- +- if (bdev_io->internal.ch->histogram) { +- spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); +- } +- +- bdev_io_update_io_stat(bdev_io, tsc_diff); +- +- assert(bdev_io->internal.cb != NULL); +- assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); +- +- bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, +- bdev_io->internal.caller_ctx); +-} +- +-static void bdev_destroy_cb(void *io_device); +- +-static void +-bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_io *bdev_io = _ctx; +- +- if (bdev_io->u.reset.ch_ref != NULL) { +- spdk_put_io_channel(bdev_io->u.reset.ch_ref); +- bdev_io->u.reset.ch_ref = NULL; +- } +- +- bdev_io_complete(bdev_io); +- +- if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && +- TAILQ_EMPTY(&bdev->internal.open_descs)) { +- spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); +- } +-} +- +-static void +-bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_io *bdev_io = _ctx; +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct spdk_bdev_io *queued_reset; +- +- ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; +- while (!TAILQ_EMPTY(&ch->queued_resets)) { +- queued_reset = TAILQ_FIRST(&ch->queued_resets); +- TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); +- spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); +- } +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-void +-spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +-{ +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; +- struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; +- +- bdev_io->internal.status = status; +- +- if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { +- bool unlock_channels = false; +- +- if (status == SPDK_BDEV_IO_STATUS_NOMEM) { +- SPDK_ERRLOG("NOMEM returned for reset\n"); +- } +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev_io == bdev->internal.reset_in_progress) { +- bdev->internal.reset_in_progress = NULL; +- unlock_channels = true; +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- if (unlock_channels) { +- spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, +- bdev_reset_complete); +- return; +- } +- } else { +- if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { +- _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); +- /* bdev IO will be completed in the callback */ +- return; +- } +- +- _bdev_io_decrement_outstanding(bdev_ch, shared_resource); +- if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { +- return; +- } +- } +- +- bdev_io_complete(bdev_io); +-} +- +-void +-spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, +- enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) +-{ +- if (sc == SPDK_SCSI_STATUS_GOOD) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } else { +- bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_SCSI_ERROR; +- bdev_io->internal.error.scsi.sc = sc; +- bdev_io->internal.error.scsi.sk = sk; +- bdev_io->internal.error.scsi.asc = asc; +- bdev_io->internal.error.scsi.ascq = ascq; +- } +- +- spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +-} +- +-void +-spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, +- int *sc, int *sk, int *asc, int *ascq) +-{ +- assert(sc != NULL); +- assert(sk != NULL); +- assert(asc != NULL); +- assert(ascq != NULL); +- +- switch (bdev_io->internal.status) { +- case SPDK_BDEV_IO_STATUS_SUCCESS: +- *sc = SPDK_SCSI_STATUS_GOOD; +- *sk = SPDK_SCSI_SENSE_NO_SENSE; +- *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; +- *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; +- break; +- case SPDK_BDEV_IO_STATUS_NVME_ERROR: +- spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); +- break; +- case SPDK_BDEV_IO_STATUS_SCSI_ERROR: +- *sc = bdev_io->internal.error.scsi.sc; +- *sk = bdev_io->internal.error.scsi.sk; +- *asc = bdev_io->internal.error.scsi.asc; +- *ascq = bdev_io->internal.error.scsi.ascq; +- break; +- default: +- *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; +- *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; +- *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; +- *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; +- break; +- } +-} +- +-void +-spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) +-{ +- if (aio_result == 0) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } else { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; +- } +- +- bdev_io->internal.error.aio_result = aio_result; +- +- spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +-} +- +-void +-spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) +-{ +- assert(aio_result != NULL); +- +- if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { +- *aio_result = bdev_io->internal.error.aio_result; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- *aio_result = 0; +- } else { +- *aio_result = -EIO; +- } +-} +- +-void +-spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) +-{ +- if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; +- } else { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; +- } +- +- bdev_io->internal.error.nvme.cdw0 = cdw0; +- bdev_io->internal.error.nvme.sct = sct; +- bdev_io->internal.error.nvme.sc = sc; +- +- spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +-} +- +-void +-spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) +-{ +- assert(sct != NULL); +- assert(sc != NULL); +- assert(cdw0 != NULL); +- +- if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { +- *sct = SPDK_NVME_SCT_GENERIC; +- *sc = SPDK_NVME_SC_SUCCESS; +- if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- *cdw0 = 0; +- } else { +- *cdw0 = 1U; +- } +- return; +- } +- +- if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { +- *sct = bdev_io->internal.error.nvme.sct; +- *sc = bdev_io->internal.error.nvme.sc; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- *sct = SPDK_NVME_SCT_GENERIC; +- *sc = SPDK_NVME_SC_SUCCESS; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { +- *sct = 
SPDK_NVME_SCT_GENERIC; +- *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; +- } else { +- *sct = SPDK_NVME_SCT_GENERIC; +- *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; +- } +- +- *cdw0 = bdev_io->internal.error.nvme.cdw0; +-} +- +-void +-spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, +- int *first_sct, int *first_sc, int *second_sct, int *second_sc) +-{ +- assert(first_sct != NULL); +- assert(first_sc != NULL); +- assert(second_sct != NULL); +- assert(second_sc != NULL); +- assert(cdw0 != NULL); +- +- if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { +- if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && +- bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { +- *first_sct = bdev_io->internal.error.nvme.sct; +- *first_sc = bdev_io->internal.error.nvme.sc; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; +- } else { +- *first_sct = SPDK_NVME_SCT_GENERIC; +- *first_sc = SPDK_NVME_SC_SUCCESS; +- *second_sct = bdev_io->internal.error.nvme.sct; +- *second_sc = bdev_io->internal.error.nvme.sc; +- } +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { +- *first_sct = SPDK_NVME_SCT_GENERIC; +- *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { +- *first_sct = SPDK_NVME_SCT_GENERIC; +- *first_sc = SPDK_NVME_SC_SUCCESS; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_SUCCESS; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { +- *first_sct = SPDK_NVME_SCT_GENERIC; +- *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; +- } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { +- *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; +- *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; +- } else { +- *first_sct = SPDK_NVME_SCT_GENERIC; +- *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; +- *second_sct = SPDK_NVME_SCT_GENERIC; +- *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; +- } +- +- *cdw0 = bdev_io->internal.error.nvme.cdw0; +-} +- +-struct spdk_thread * +-spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) +-{ +- return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); +-} +- +-struct spdk_io_channel * +-spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) +-{ +- return bdev_io->internal.ch->channel; +-} +- +-static int +-bdev_register(struct spdk_bdev *bdev) +-{ +- char *bdev_name; +- char uuid[SPDK_UUID_STRING_LEN]; +- int ret; +- +- assert(bdev->module != NULL); +- +- if (!bdev->name) { +- SPDK_ERRLOG("Bdev name is NULL\n"); +- return -EINVAL; +- } +- +- if (!strlen(bdev->name)) { +- SPDK_ERRLOG("Bdev name must not be an empty string\n"); +- return -EINVAL; +- } +- +- /* Users often register their own I/O devices using the bdev name. In +- * order to avoid conflicts, prepend bdev_. 
*/ +- bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); +- if (!bdev_name) { +- SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); +- return -ENOMEM; +- } +- +- bdev->internal.stat = bdev_alloc_io_stat(true); +- if (!bdev->internal.stat) { +- SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); +- free(bdev_name); +- return -ENOMEM; +- } +- +- bdev->internal.status = SPDK_BDEV_STATUS_READY; +- bdev->internal.measured_queue_depth = UINT64_MAX; +- bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; +- memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); +- bdev->internal.qd_poller = NULL; +- bdev->internal.qos = NULL; +- +- TAILQ_INIT(&bdev->internal.open_descs); +- TAILQ_INIT(&bdev->internal.locked_ranges); +- TAILQ_INIT(&bdev->internal.pending_locked_ranges); +- TAILQ_INIT(&bdev->aliases); +- +- ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); +- if (ret != 0) { +- bdev_free_io_stat(bdev->internal.stat); +- free(bdev_name); +- return ret; +- } +- +- /* UUID may be specified by the user or defined by bdev itself. +- * Otherwise it will be generated here, so this field will never be empty. */ +- if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { +- spdk_uuid_generate(&bdev->uuid); +- } +- +- /* Add the UUID alias only if it's different than the name */ +- spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); +- if (strcmp(bdev->name, uuid) != 0) { +- ret = spdk_bdev_alias_add(bdev, uuid); +- if (ret != 0) { +- SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); +- bdev_name_del(&bdev->internal.bdev_name); +- bdev_free_io_stat(bdev->internal.stat); +- free(bdev_name); +- return ret; +- } +- } +- +- if (spdk_bdev_get_buf_align(bdev) > 1) { +- if (bdev->split_on_optimal_io_boundary) { +- bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, +- SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); +- } else { +- bdev->split_on_optimal_io_boundary = true; +- bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; +- } +- } +- +- /* If the user didn't specify a write unit size, set it to one. 
*/ +- if (bdev->write_unit_size == 0) { +- bdev->write_unit_size = 1; +- } +- +- /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ +- if (bdev->acwu == 0) { +- bdev->acwu = bdev->write_unit_size; +- } +- +- if (bdev->phys_blocklen == 0) { +- bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); +- } +- +- bdev->internal.reset_in_progress = NULL; +- bdev->internal.qd_poll_in_progress = false; +- bdev->internal.period = 0; +- bdev->internal.new_period = 0; +- +- spdk_io_device_register(__bdev_to_io_dev(bdev), +- bdev_channel_create, bdev_channel_destroy, +- sizeof(struct spdk_bdev_channel), +- bdev_name); +- +- free(bdev_name); +- +- spdk_spin_init(&bdev->internal.spinlock); +- +- SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); +- TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); +- +- return 0; +-} +- +-static void +-bdev_destroy_cb(void *io_device) +-{ +- int rc; +- struct spdk_bdev *bdev; +- spdk_bdev_unregister_cb cb_fn; +- void *cb_arg; +- +- bdev = __bdev_from_io_dev(io_device); +- +- if (bdev->internal.unregister_td != spdk_get_thread()) { +- spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); +- return; +- } +- +- cb_fn = bdev->internal.unregister_cb; +- cb_arg = bdev->internal.unregister_ctx; +- +- spdk_spin_destroy(&bdev->internal.spinlock); +- free(bdev->internal.qos); +- bdev_free_io_stat(bdev->internal.stat); +- +- rc = bdev->fn_table->destruct(bdev->ctxt); +- if (rc < 0) { +- SPDK_ERRLOG("destruct failed\n"); +- } +- if (rc <= 0 && cb_fn != NULL) { +- cb_fn(cb_arg, rc); +- } +-} +- +-void +-spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) +-{ +- if (bdev->internal.unregister_cb != NULL) { +- bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); +- } +-} +- +-static void +-_remove_notify(void *arg) +-{ +- struct spdk_bdev_desc *desc = arg; +- +- spdk_spin_lock(&desc->spinlock); +- desc->refs--; +- +- if (!desc->closed) { +- spdk_spin_unlock(&desc->spinlock); +- desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); +- return; +- } else if (0 == desc->refs) { +- /* This descriptor was closed after this remove_notify message was sent. +- * spdk_bdev_close() could not free the descriptor since this message was +- * in flight, so we free it now using bdev_desc_free(). +- */ +- spdk_spin_unlock(&desc->spinlock); +- bdev_desc_free(desc); +- return; +- } +- spdk_spin_unlock(&desc->spinlock); +-} +- +-/* returns: 0 - bdev removed and ready to be destructed. +- * -EBUSY - bdev can't be destructed yet. */ +-static int +-bdev_unregister_unsafe(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_desc *desc, *tmp; +- int rc = 0; +- char uuid[SPDK_UUID_STRING_LEN]; +- +- assert(spdk_spin_held(&g_bdev_mgr.spinlock)); +- assert(spdk_spin_held(&bdev->internal.spinlock)); +- +- /* Notify each descriptor about hotremoval */ +- TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { +- rc = -EBUSY; +- spdk_spin_lock(&desc->spinlock); +- /* +- * Defer invocation of the event_cb to a separate message that will +- * run later on its thread. This ensures this context unwinds and +- * we don't recursively unregister this bdev again if the event_cb +- * immediately closes its descriptor. 
+- */ +- desc->refs++; +- spdk_thread_send_msg(desc->thread, _remove_notify, desc); +- spdk_spin_unlock(&desc->spinlock); +- } +- +- /* If there are no descriptors, proceed removing the bdev */ +- if (rc == 0) { +- TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); +- SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); +- +- /* Delete the name and the UUID alias */ +- spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); +- bdev_name_del_unsafe(&bdev->internal.bdev_name); +- bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); +- +- spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); +- +- if (bdev->internal.reset_in_progress != NULL) { +- /* If reset is in progress, let the completion callback for reset +- * unregister the bdev. +- */ +- rc = -EBUSY; +- } +- } +- +- return rc; +-} +- +-static void +-bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); +- +- bdev_channel_abort_queued_ios(bdev_ch); +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- int rc; +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- spdk_spin_lock(&bdev->internal.spinlock); +- /* +- * Set the status to REMOVING after completing to abort channels. Otherwise, +- * the last spdk_bdev_close() may call spdk_io_device_unregister() while +- * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() +- * may fail. +- */ +- bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; +- rc = bdev_unregister_unsafe(bdev); +- spdk_spin_unlock(&bdev->internal.spinlock); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- if (rc == 0) { +- spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); +- } +-} +- +-void +-spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +-{ +- struct spdk_thread *thread; +- +- SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); +- +- thread = spdk_get_thread(); +- if (!thread) { +- /* The user called this from a non-SPDK thread. 
*/ +- if (cb_fn != NULL) { +- cb_fn(cb_arg, -ENOTSUP); +- } +- return; +- } +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || +- bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- if (cb_fn) { +- cb_fn(cb_arg, -EBUSY); +- } +- return; +- } +- +- spdk_spin_lock(&bdev->internal.spinlock); +- bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; +- bdev->internal.unregister_cb = cb_fn; +- bdev->internal.unregister_ctx = cb_arg; +- bdev->internal.unregister_td = thread; +- spdk_spin_unlock(&bdev->internal.spinlock); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- spdk_bdev_set_qd_sampling_period(bdev, 0); +- +- spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, +- bdev_unregister); +-} +- +-int +-spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, +- spdk_bdev_unregister_cb cb_fn, void *cb_arg) +-{ +- struct spdk_bdev_desc *desc; +- struct spdk_bdev *bdev; +- int rc; +- +- rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); +- return rc; +- } +- +- bdev = spdk_bdev_desc_get_bdev(desc); +- +- if (bdev->module != module) { +- spdk_bdev_close(desc); +- SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", +- bdev_name); +- return -ENODEV; +- } +- +- spdk_bdev_unregister(bdev, cb_fn, cb_arg); +- +- spdk_bdev_close(desc); +- +- return 0; +-} +- +-static int +-bdev_start_qos(struct spdk_bdev *bdev) +-{ +- struct set_qos_limit_ctx *ctx; +- +- /* Enable QoS */ +- if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); +- return -ENOMEM; +- } +- ctx->bdev = bdev; +- spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); +- } +- +- return 0; +-} +- +-static int +-bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) +-{ +- struct spdk_thread *thread; +- int rc = 0; +- +- thread = spdk_get_thread(); +- if (!thread) { +- SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); +- return -ENOTSUP; +- } +- +- SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, +- spdk_get_thread()); +- +- desc->bdev = bdev; +- desc->thread = thread; +- desc->write = write; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || +- bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { +- spdk_spin_unlock(&bdev->internal.spinlock); +- return -ENODEV; +- } +- +- if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { +- SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", +- bdev->name, bdev->internal.claim.v1.module->name); +- spdk_spin_unlock(&bdev->internal.spinlock); +- return -EPERM; +- } +- +- rc = bdev_start_qos(bdev); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); +- spdk_spin_unlock(&bdev->internal.spinlock); +- return rc; +- } +- +- TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); +- +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- return 0; +-} +- +-static int +-bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, +- struct spdk_bdev_desc **_desc) +-{ +- struct spdk_bdev_desc *desc; +- unsigned int event_id; +- +- desc = calloc(1, sizeof(*desc)); +- if (desc == 
NULL) { +- SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); +- return -ENOMEM; +- } +- +- TAILQ_INIT(&desc->pending_media_events); +- TAILQ_INIT(&desc->free_media_events); +- +- desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; +- desc->callback.event_fn = event_cb; +- desc->callback.ctx = event_ctx; +- spdk_spin_init(&desc->spinlock); +- +- if (bdev->media_events) { +- desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, +- sizeof(*desc->media_events_buffer)); +- if (desc->media_events_buffer == NULL) { +- SPDK_ERRLOG("Failed to initialize media event pool\n"); +- bdev_desc_free(desc); +- return -ENOMEM; +- } +- +- for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { +- TAILQ_INSERT_TAIL(&desc->free_media_events, +- &desc->media_events_buffer[event_id], tailq); +- } +- } +- +- *_desc = desc; +- +- return 0; +-} +- +-int +-spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, +- void *event_ctx, struct spdk_bdev_desc **_desc) +-{ +- struct spdk_bdev_desc *desc; +- struct spdk_bdev *bdev; +- int rc; +- +- if (event_cb == NULL) { +- SPDK_ERRLOG("Missing event callback function\n"); +- return -EINVAL; +- } +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- +- bdev = bdev_get_by_name(bdev_name); +- +- if (bdev == NULL) { +- SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- return -ENODEV; +- } +- +- rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); +- if (rc != 0) { +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- return rc; +- } +- +- rc = bdev_open(bdev, write, desc); +- if (rc != 0) { +- bdev_desc_free(desc); +- desc = NULL; +- } +- +- *_desc = desc; +- +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- return rc; +-} +- +-static void +-bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) +-{ +- int rc; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- spdk_spin_lock(&desc->spinlock); +- +- TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); +- +- desc->closed = true; +- +- if (0 == desc->refs) { +- spdk_spin_unlock(&desc->spinlock); +- bdev_desc_free(desc); +- } else { +- spdk_spin_unlock(&desc->spinlock); +- } +- +- /* If no more descriptors, kill QoS channel */ +- if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { +- SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", +- bdev->name, spdk_get_thread()); +- +- if (bdev_qos_destroy(bdev)) { +- /* There isn't anything we can do to recover here. Just let the +- * old QoS poller keep running. The QoS handling won't change +- * cores when the user allocates a new channel, but it won't break. */ +- SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); +- } +- } +- +- if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { +- rc = bdev_unregister_unsafe(bdev); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- if (rc == 0) { +- spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); +- } +- } else { +- spdk_spin_unlock(&bdev->internal.spinlock); +- } +-} +- +-void +-spdk_bdev_close(struct spdk_bdev_desc *desc) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- +- SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, +- spdk_get_thread()); +- +- assert(desc->thread == spdk_get_thread()); +- +- spdk_poller_unregister(&desc->io_timeout_poller); +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- +- bdev_close(bdev, desc); +- +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +-} +- +-static void +-bdev_register_finished(void *arg) +-{ +- struct spdk_bdev_desc *desc = arg; +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- +- spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- +- bdev_close(bdev, desc); +- +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +-} +- +-int +-spdk_bdev_register(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_desc *desc; +- int rc; +- +- if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { +- SPDK_LOG_DEPRECATED(bdev_register_examine_thread); +- } +- +- rc = bdev_register(bdev); +- if (rc != 0) { +- return rc; +- } +- +- /* A descriptor is opened to prevent bdev deletion during examination */ +- rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- spdk_bdev_unregister(bdev, NULL, NULL); +- return rc; +- } +- +- rc = bdev_open(bdev, false, desc); +- if (rc != 0) { +- bdev_desc_free(desc); +- spdk_bdev_unregister(bdev, NULL, NULL); +- return rc; +- } +- +- /* Examine configuration before initializing I/O */ +- bdev_examine(bdev); +- +- rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); +- if (rc != 0) { +- bdev_close(bdev, desc); +- spdk_bdev_unregister(bdev, NULL, NULL); +- } +- +- return rc; +-} +- +-int +-spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, +- struct spdk_bdev_module *module) +-{ +- spdk_spin_lock(&bdev->internal.spinlock); +- +- if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { +- SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, +- bdev->internal.claim.v1.module->name); +- spdk_spin_unlock(&bdev->internal.spinlock); +- return -EPERM; +- } +- +- if (desc && !desc->write) { +- desc->write = true; +- } +- +- bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; +- bdev->internal.claim.v1.module = module; +- +- spdk_spin_unlock(&bdev->internal.spinlock); +- return 0; +-} +- +-void +-spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) +-{ +- spdk_spin_lock(&bdev->internal.spinlock); +- +- assert(bdev->internal.claim.v1.module != NULL); +- assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); +- bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; +- bdev->internal.claim.v1.module = NULL; +- +- spdk_spin_unlock(&bdev->internal.spinlock); +-} +- +-struct spdk_bdev * +-spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) +-{ +- assert(desc != NULL); +- return desc->bdev; +-} +- +-int +-spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) +-{ +- struct spdk_bdev *bdev, *tmp; +- struct spdk_bdev_desc *desc; +- int rc = 0; +- +- assert(fn != NULL); +- +- 
spdk_spin_lock(&g_bdev_mgr.spinlock); +- bdev = spdk_bdev_first(); +- while (bdev != NULL) { +- rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- break; +- } +- rc = bdev_open(bdev, false, desc); +- if (rc != 0) { +- bdev_desc_free(desc); +- if (rc == -ENODEV) { +- /* Ignore the error and move to the next bdev. */ +- rc = 0; +- bdev = spdk_bdev_next(bdev); +- continue; +- } +- break; +- } +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- rc = fn(ctx, bdev); +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- tmp = spdk_bdev_next(bdev); +- bdev_close(bdev, desc); +- if (rc != 0) { +- break; +- } +- bdev = tmp; +- } +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- return rc; +-} +- +-int +-spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) +-{ +- struct spdk_bdev *bdev, *tmp; +- struct spdk_bdev_desc *desc; +- int rc = 0; +- +- assert(fn != NULL); +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- bdev = spdk_bdev_first_leaf(); +- while (bdev != NULL) { +- rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- break; +- } +- rc = bdev_open(bdev, false, desc); +- if (rc != 0) { +- bdev_desc_free(desc); +- if (rc == -ENODEV) { +- /* Ignore the error and move to the next bdev. */ +- rc = 0; +- bdev = spdk_bdev_next_leaf(bdev); +- continue; +- } +- break; +- } +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- rc = fn(ctx, bdev); +- +- spdk_spin_lock(&g_bdev_mgr.spinlock); +- tmp = spdk_bdev_next_leaf(bdev); +- bdev_close(bdev, desc); +- if (rc != 0) { +- break; +- } +- bdev = tmp; +- } +- spdk_spin_unlock(&g_bdev_mgr.spinlock); +- +- return rc; +-} +- +-void +-spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) +-{ +- struct iovec *iovs; +- int iovcnt; +- +- if (bdev_io == NULL) { +- return; +- } +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- iovs = bdev_io->u.bdev.iovs; +- iovcnt = bdev_io->u.bdev.iovcnt; +- break; +- default: +- iovs = NULL; +- iovcnt = 0; +- break; +- } +- +- if (iovp) { +- *iovp = iovs; +- } +- if (iovcntp) { +- *iovcntp = iovcnt; +- } +-} +- +-void * +-spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) +-{ +- if (bdev_io == NULL) { +- return NULL; +- } +- +- if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { +- return NULL; +- } +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || +- bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- return bdev_io->u.bdev.md_buf; +- } +- +- return NULL; +-} +- +-void * +-spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) +-{ +- if (bdev_io == NULL) { +- assert(false); +- return NULL; +- } +- +- return bdev_io->internal.caller_ctx; +-} +- +-void +-spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) +-{ +- +- if (spdk_bdev_module_list_find(bdev_module->name)) { +- SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); +- assert(false); +- } +- +- spdk_spin_init(&bdev_module->internal.spinlock); +- +- /* +- * Modules with examine callbacks must be initialized first, so they are +- * ready to handle examine callbacks from later modules that will +- * register physical bdevs. 
+- */ +- if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { +- TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); +- } else { +- TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); +- } +-} +- +-struct spdk_bdev_module * +-spdk_bdev_module_list_find(const char *name) +-{ +- struct spdk_bdev_module *bdev_module; +- +- TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { +- if (strcmp(name, bdev_module->name) == 0) { +- break; +- } +- } +- +- return bdev_module; +-} +- +-static void +-bdev_write_zero_buffer_next(void *_bdev_io) +-{ +- struct spdk_bdev_io *bdev_io = _bdev_io; +- uint64_t num_bytes, num_blocks; +- void *md_buf = NULL; +- int rc; +- +- num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * +- bdev_io->u.bdev.split_remaining_num_blocks, +- ZERO_BUFFER_SIZE); +- num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); +- num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; +- +- if (spdk_bdev_is_md_separate(bdev_io->bdev)) { +- md_buf = (char *)g_bdev_mgr.zero_buffer + +- spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; +- } +- +- rc = bdev_write_blocks_with_md(bdev_io->internal.desc, +- spdk_io_channel_from_ctx(bdev_io->internal.ch), +- g_bdev_mgr.zero_buffer, md_buf, +- bdev_io->u.bdev.split_current_offset_blocks, num_blocks, +- bdev_write_zero_buffer_done, bdev_io); +- if (rc == 0) { +- bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; +- bdev_io->u.bdev.split_current_offset_blocks += num_blocks; +- } else if (rc == -ENOMEM) { +- bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); +- } else { +- bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); +- } +-} +- +-static void +-bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *parent_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; +- parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); +- return; +- } +- +- if (parent_io->u.bdev.split_remaining_num_blocks == 0) { +- parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; +- parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); +- return; +- } +- +- bdev_write_zero_buffer_next(parent_io); +-} +- +-static void +-bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) +-{ +- spdk_spin_lock(&ctx->bdev->internal.spinlock); +- ctx->bdev->internal.qos_mod_in_progress = false; +- spdk_spin_unlock(&ctx->bdev->internal.spinlock); +- +- if (ctx->cb_fn) { +- ctx->cb_fn(ctx->cb_arg, status); +- } +- free(ctx); +-} +- +-static void +-bdev_disable_qos_done(void *cb_arg) +-{ +- struct set_qos_limit_ctx *ctx = cb_arg; +- struct spdk_bdev *bdev = ctx->bdev; +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_qos *qos; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- qos = bdev->internal.qos; +- bdev->internal.qos = NULL; +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- while (!TAILQ_EMPTY(&qos->queued)) { +- /* Send queued I/O back to their original thread for resubmission. */ +- bdev_io = TAILQ_FIRST(&qos->queued); +- TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); +- +- if (bdev_io->internal.io_submit_ch) { +- /* +- * Channel was changed when sending it to the QoS thread - change it back +- * before sending it back to the original thread. 
+- */ +- bdev_io->internal.ch = bdev_io->internal.io_submit_ch; +- bdev_io->internal.io_submit_ch = NULL; +- } +- +- spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), +- _bdev_io_submit, bdev_io); +- } +- +- if (qos->thread != NULL) { +- spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); +- spdk_poller_unregister(&qos->poller); +- } +- +- free(qos); +- +- bdev_set_qos_limit_done(ctx, 0); +-} +- +-static void +-bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct set_qos_limit_ctx *ctx = _ctx; +- struct spdk_thread *thread; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- thread = bdev->internal.qos->thread; +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- if (thread != NULL) { +- spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); +- } else { +- bdev_disable_qos_done(ctx); +- } +-} +- +-static void +-bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx) +-{ +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); +- +- bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_update_qos_rate_limit_msg(void *cb_arg) +-{ +- struct set_qos_limit_ctx *ctx = cb_arg; +- struct spdk_bdev *bdev = ctx->bdev; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- bdev_set_qos_limit_done(ctx, 0); +-} +- +-static void +-bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, void *_ctx) +-{ +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); +- +- spdk_spin_lock(&bdev->internal.spinlock); +- bdev_enable_qos(bdev, bdev_ch); +- spdk_spin_unlock(&bdev->internal.spinlock); +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct set_qos_limit_ctx *ctx = _ctx; +- +- bdev_set_qos_limit_done(ctx, status); +-} +- +-static void +-bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +-{ +- int i; +- +- assert(bdev->internal.qos != NULL); +- +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { +- bdev->internal.qos->rate_limits[i].limit = limits[i]; +- +- if (limits[i] == 0) { +- bdev->internal.qos->rate_limits[i].limit = +- SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; +- } +- } +- } +-} +- +-void +-spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, +- void (*cb_fn)(void *cb_arg, int status), void *cb_arg) +-{ +- struct set_qos_limit_ctx *ctx; +- uint32_t limit_set_complement; +- uint64_t min_limit_per_sec; +- int i; +- bool disable_rate_limit = true; +- +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { +- continue; +- } +- +- if (limits[i] > 0) { +- disable_rate_limit = false; +- } +- +- if (bdev_qos_is_iops_rate_limit(i) == true) { +- min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; +- } else { +- /* Change from megabyte to byte rate limit */ +- limits[i] = limits[i] * 1024 * 1024; +- min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; +- } +- +- limit_set_complement = limits[i] % min_limit_per_sec; +- if (limit_set_complement) { +- SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", +- limits[i], min_limit_per_sec); +- limits[i] += min_limit_per_sec - limit_set_complement; 
+- SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); +- } +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- cb_fn(cb_arg, -ENOMEM); +- return; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- ctx->bdev = bdev; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.qos_mod_in_progress) { +- spdk_spin_unlock(&bdev->internal.spinlock); +- free(ctx); +- cb_fn(cb_arg, -EAGAIN); +- return; +- } +- bdev->internal.qos_mod_in_progress = true; +- +- if (disable_rate_limit == true && bdev->internal.qos) { +- for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { +- if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && +- (bdev->internal.qos->rate_limits[i].limit > 0 && +- bdev->internal.qos->rate_limits[i].limit != +- SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { +- disable_rate_limit = false; +- break; +- } +- } +- } +- +- if (disable_rate_limit == false) { +- if (bdev->internal.qos == NULL) { +- bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); +- if (!bdev->internal.qos) { +- spdk_spin_unlock(&bdev->internal.spinlock); +- SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); +- bdev_set_qos_limit_done(ctx, -ENOMEM); +- return; +- } +- } +- +- if (bdev->internal.qos->thread == NULL) { +- /* Enabling */ +- bdev_set_qos_rate_limits(bdev, limits); +- +- spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, +- bdev_enable_qos_done); +- } else { +- /* Updating */ +- bdev_set_qos_rate_limits(bdev, limits); +- +- spdk_thread_send_msg(bdev->internal.qos->thread, +- bdev_update_qos_rate_limit_msg, ctx); +- } +- } else { +- if (bdev->internal.qos != NULL) { +- bdev_set_qos_rate_limits(bdev, limits); +- +- /* Disabling */ +- spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, +- bdev_disable_qos_msg_done); +- } else { +- spdk_spin_unlock(&bdev->internal.spinlock); +- bdev_set_qos_limit_done(ctx, 0); +- return; +- } +- } +- +- spdk_spin_unlock(&bdev->internal.spinlock); +-} +- +-struct spdk_bdev_histogram_ctx { +- spdk_bdev_histogram_status_cb cb_fn; +- void *cb_arg; +- struct spdk_bdev *bdev; +- int status; +-}; +- +-static void +-bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_histogram_ctx *ctx = _ctx; +- +- spdk_spin_lock(&ctx->bdev->internal.spinlock); +- ctx->bdev->internal.histogram_in_progress = false; +- spdk_spin_unlock(&ctx->bdev->internal.spinlock); +- ctx->cb_fn(ctx->cb_arg, ctx->status); +- free(ctx); +-} +- +-static void +-bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- +- if (ch->histogram != NULL) { +- spdk_histogram_data_free(ch->histogram); +- ch->histogram = NULL; +- } +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_histogram_ctx *ctx = _ctx; +- +- if (status != 0) { +- ctx->status = status; +- ctx->bdev->internal.histogram_enabled = false; +- spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, +- bdev_histogram_disable_channel_cb); +- } else { +- spdk_spin_lock(&ctx->bdev->internal.spinlock); +- ctx->bdev->internal.histogram_in_progress = false; +- spdk_spin_unlock(&ctx->bdev->internal.spinlock); +- ctx->cb_fn(ctx->cb_arg, ctx->status); +- free(ctx); +- } +-} +- +-static void +-bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev 
*bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- int status = 0; +- +- if (ch->histogram == NULL) { +- ch->histogram = spdk_histogram_data_alloc(); +- if (ch->histogram == NULL) { +- status = -ENOMEM; +- } +- } +- +- spdk_bdev_for_each_channel_continue(i, status); +-} +- +-void +-spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, +- void *cb_arg, bool enable) +-{ +- struct spdk_bdev_histogram_ctx *ctx; +- +- ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); +- if (ctx == NULL) { +- cb_fn(cb_arg, -ENOMEM); +- return; +- } +- +- ctx->bdev = bdev; +- ctx->status = 0; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev->internal.histogram_in_progress) { +- spdk_spin_unlock(&bdev->internal.spinlock); +- free(ctx); +- cb_fn(cb_arg, -EAGAIN); +- return; +- } +- +- bdev->internal.histogram_in_progress = true; +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- bdev->internal.histogram_enabled = enable; +- +- if (enable) { +- /* Allocate histogram for each channel */ +- spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, +- bdev_histogram_enable_channel_cb); +- } else { +- spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, +- bdev_histogram_disable_channel_cb); +- } +-} +- +-struct spdk_bdev_histogram_data_ctx { +- spdk_bdev_histogram_data_cb cb_fn; +- void *cb_arg; +- struct spdk_bdev *bdev; +- /** merged histogram data from all channels */ +- struct spdk_histogram_data *histogram; +-}; +- +-static void +-bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_histogram_data_ctx *ctx = _ctx; +- +- ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); +- free(ctx); +-} +- +-static void +-bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct spdk_bdev_histogram_data_ctx *ctx = _ctx; +- int status = 0; +- +- if (ch->histogram == NULL) { +- status = -EFAULT; +- } else { +- spdk_histogram_data_merge(ctx->histogram, ch->histogram); +- } +- +- spdk_bdev_for_each_channel_continue(i, status); +-} +- +-void +-spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, +- spdk_bdev_histogram_data_cb cb_fn, +- void *cb_arg) +-{ +- struct spdk_bdev_histogram_data_ctx *ctx; +- +- ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); +- if (ctx == NULL) { +- cb_fn(cb_arg, -ENOMEM, NULL); +- return; +- } +- +- ctx->bdev = bdev; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- ctx->histogram = histogram; +- +- spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, +- bdev_histogram_get_channel_cb); +-} +- +-void +-spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, +- void *cb_arg) +-{ +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); +- int status = 0; +- +- assert(cb_fn != NULL); +- +- if (bdev_ch->histogram == NULL) { +- status = -EFAULT; +- } +- cb_fn(cb_arg, status, bdev_ch->histogram); +-} +- +-size_t +-spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, +- size_t max_events) +-{ +- struct media_event_entry *entry; +- size_t num_events = 0; +- +- for (; num_events < max_events; ++num_events) { +- entry = TAILQ_FIRST(&desc->pending_media_events); +- if (entry == NULL) { +- 
break; +- } +- +- events[num_events] = entry->event; +- TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); +- TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); +- } +- +- return num_events; +-} +- +-int +-spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, +- size_t num_events) +-{ +- struct spdk_bdev_desc *desc; +- struct media_event_entry *entry; +- size_t event_id; +- int rc = 0; +- +- assert(bdev->media_events); +- +- spdk_spin_lock(&bdev->internal.spinlock); +- TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { +- if (desc->write) { +- break; +- } +- } +- +- if (desc == NULL || desc->media_events_buffer == NULL) { +- rc = -ENODEV; +- goto out; +- } +- +- for (event_id = 0; event_id < num_events; ++event_id) { +- entry = TAILQ_FIRST(&desc->free_media_events); +- if (entry == NULL) { +- break; +- } +- +- TAILQ_REMOVE(&desc->free_media_events, entry, tailq); +- TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); +- entry->event = events[event_id]; +- } +- +- rc = event_id; +-out: +- spdk_spin_unlock(&bdev->internal.spinlock); +- return rc; +-} +- +-void +-spdk_bdev_notify_media_management(struct spdk_bdev *bdev) +-{ +- struct spdk_bdev_desc *desc; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { +- if (!TAILQ_EMPTY(&desc->pending_media_events)) { +- desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, +- desc->callback.ctx); +- } +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +-} +- +-struct locked_lba_range_ctx { +- struct lba_range range; +- struct spdk_bdev *bdev; +- struct lba_range *current_range; +- struct lba_range *owner_range; +- struct spdk_poller *poller; +- lock_range_cb cb_fn; +- void *cb_arg; +-}; +- +-static void +-bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct locked_lba_range_ctx *ctx = _ctx; +- +- ctx->cb_fn(ctx->cb_arg, -ENOMEM); +- free(ctx); +-} +- +-static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, +- struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); +- +-static void +-bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct locked_lba_range_ctx *ctx = _ctx; +- +- if (status == -ENOMEM) { +- /* One of the channels could not allocate a range object. +- * So we have to go back and clean up any ranges that were +- * allocated successfully before we return error status to +- * the caller. We can reuse the unlock function to do that +- * clean up. +- */ +- spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, +- bdev_lock_error_cleanup_cb); +- return; +- } +- +- /* All channels have locked this range and no I/O overlapping the range +- * are outstanding! Set the owner_ch for the range object for the +- * locking channel, so that this channel will know that it is allowed +- * to write to this range. +- */ +- ctx->owner_range->owner_ch = ctx->range.owner_ch; +- ctx->cb_fn(ctx->cb_arg, status); +- +- /* Don't free the ctx here. Its range is in the bdev's global list of +- * locked ranges still, and will be removed and freed when this range +- * is later unlocked. 
+- */ +-} +- +-static int +-bdev_lock_lba_range_check_io(void *_i) +-{ +- struct spdk_bdev_channel_iter *i = _i; +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct locked_lba_range_ctx *ctx = i->ctx; +- struct lba_range *range = ctx->current_range; +- struct spdk_bdev_io *bdev_io; +- +- spdk_poller_unregister(&ctx->poller); +- +- /* The range is now in the locked_ranges, so no new IO can be submitted to this +- * range. But we need to wait until any outstanding IO overlapping with this range +- * are completed. +- */ +- TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { +- if (bdev_io_range_is_locked(bdev_io, range)) { +- ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); +- return SPDK_POLLER_BUSY; +- } +- } +- +- spdk_bdev_for_each_channel_continue(i, 0); +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct locked_lba_range_ctx *ctx = _ctx; +- struct lba_range *range; +- +- TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { +- if (range->length == ctx->range.length && +- range->offset == ctx->range.offset && +- range->locked_ctx == ctx->range.locked_ctx) { +- /* This range already exists on this channel, so don't add +- * it again. This can happen when a new channel is created +- * while the for_each_channel operation is in progress. +- * Do not check for outstanding I/O in that case, since the +- * range was locked before any I/O could be submitted to the +- * new channel. +- */ +- spdk_bdev_for_each_channel_continue(i, 0); +- return; +- } +- } +- +- range = calloc(1, sizeof(*range)); +- if (range == NULL) { +- spdk_bdev_for_each_channel_continue(i, -ENOMEM); +- return; +- } +- +- range->length = ctx->range.length; +- range->offset = ctx->range.offset; +- range->locked_ctx = ctx->range.locked_ctx; +- ctx->current_range = range; +- if (ctx->range.owner_ch == ch) { +- /* This is the range object for the channel that will hold +- * the lock. Store it in the ctx object so that we can easily +- * set its owner_ch after the lock is finally acquired. +- */ +- ctx->owner_range = range; +- } +- TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); +- bdev_lock_lba_range_check_io(i); +-} +- +-static void +-bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) +-{ +- assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); +- +- /* We will add a copy of this range to each channel now. 
*/ +- spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, +- bdev_lock_lba_range_cb); +-} +- +-static bool +-bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) +-{ +- struct lba_range *r; +- +- TAILQ_FOREACH(r, tailq, tailq) { +- if (bdev_lba_range_overlapped(range, r)) { +- return true; +- } +- } +- return false; +-} +- +-static int +-bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, +- uint64_t offset, uint64_t length, +- lock_range_cb cb_fn, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct locked_lba_range_ctx *ctx; +- +- if (cb_arg == NULL) { +- SPDK_ERRLOG("cb_arg must not be NULL\n"); +- return -EINVAL; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- return -ENOMEM; +- } +- +- ctx->range.offset = offset; +- ctx->range.length = length; +- ctx->range.owner_ch = ch; +- ctx->range.locked_ctx = cb_arg; +- ctx->bdev = bdev; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { +- /* There is an active lock overlapping with this range. +- * Put it on the pending list until this range no +- * longer overlaps with another. +- */ +- TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); +- } else { +- TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); +- bdev_lock_lba_range_ctx(bdev, ctx); +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +- return 0; +-} +- +-static void +-bdev_lock_lba_range_ctx_msg(void *_ctx) +-{ +- struct locked_lba_range_ctx *ctx = _ctx; +- +- bdev_lock_lba_range_ctx(ctx->bdev, ctx); +-} +- +-static void +-bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct locked_lba_range_ctx *ctx = _ctx; +- struct locked_lba_range_ctx *pending_ctx; +- struct lba_range *range, *tmp; +- +- spdk_spin_lock(&bdev->internal.spinlock); +- /* Check if there are any pending locked ranges that overlap with this range +- * that was just unlocked. If there are, check that it doesn't overlap with any +- * other locked ranges before calling bdev_lock_lba_range_ctx which will start +- * the lock process. 
+- */ +- TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { +- if (bdev_lba_range_overlapped(range, &ctx->range) && +- !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { +- TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); +- pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); +- TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); +- spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), +- bdev_lock_lba_range_ctx_msg, pending_ctx); +- } +- } +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- ctx->cb_fn(ctx->cb_arg, status); +- free(ctx); +-} +- +-static void +-bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *_ch, void *_ctx) +-{ +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct locked_lba_range_ctx *ctx = _ctx; +- TAILQ_HEAD(, spdk_bdev_io) io_locked; +- struct spdk_bdev_io *bdev_io; +- struct lba_range *range; +- +- TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { +- if (ctx->range.offset == range->offset && +- ctx->range.length == range->length && +- ctx->range.locked_ctx == range->locked_ctx) { +- TAILQ_REMOVE(&ch->locked_ranges, range, tailq); +- free(range); +- break; +- } +- } +- +- /* Note: we should almost always be able to assert that the range specified +- * was found. But there are some very rare corner cases where a new channel +- * gets created simultaneously with a range unlock, where this function +- * would execute on that new channel and wouldn't have the range. +- * We also use this to clean up range allocations when a later allocation +- * fails in the locking path. +- * So we can't actually assert() here. +- */ +- +- /* Swap the locked IO into a temporary list, and then try to submit them again. +- * We could hyper-optimize this to only resubmit locked I/O that overlap +- * with the range that was just unlocked, but this isn't a performance path so +- * we go for simplicity here. +- */ +- TAILQ_INIT(&io_locked); +- TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); +- while (!TAILQ_EMPTY(&io_locked)) { +- bdev_io = TAILQ_FIRST(&io_locked); +- TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); +- bdev_io_submit(bdev_io); +- } +- +- spdk_bdev_for_each_channel_continue(i, 0); +-} +- +-static int +-bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, +- uint64_t offset, uint64_t length, +- lock_range_cb cb_fn, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); +- struct locked_lba_range_ctx *ctx; +- struct lba_range *range; +- bool range_found = false; +- +- /* Let's make sure the specified channel actually has a lock on +- * the specified range. Note that the range must match exactly. +- */ +- TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { +- if (range->offset == offset && range->length == length && +- range->owner_ch == ch && range->locked_ctx == cb_arg) { +- range_found = true; +- break; +- } +- } +- +- if (!range_found) { +- return -EINVAL; +- } +- +- spdk_spin_lock(&bdev->internal.spinlock); +- /* We confirmed that this channel has locked the specified range. To +- * start the unlock the process, we find the range in the bdev's locked_ranges +- * and remove it. This ensures new channels don't inherit the locked range. 
+- * Then we will send a message to each channel (including the one specified +- * here) to remove the range from its per-channel list. +- */ +- TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { +- if (range->offset == offset && range->length == length && +- range->locked_ctx == cb_arg) { +- break; +- } +- } +- if (range == NULL) { +- assert(false); +- spdk_spin_unlock(&bdev->internal.spinlock); +- return -EINVAL; +- } +- TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); +- ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); +- spdk_spin_unlock(&bdev->internal.spinlock); +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, +- bdev_unlock_lba_range_cb); +- return 0; +-} +- +-int +-spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, +- int array_size) +-{ +- if (!bdev) { +- return -EINVAL; +- } +- +- if (bdev->fn_table->get_memory_domains) { +- return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); +- } +- +- return 0; +-} +- +-struct spdk_bdev_for_each_io_ctx { +- void *ctx; +- spdk_bdev_io_fn fn; +- spdk_bdev_for_each_io_cb cb; +-}; +- +-static void +-bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, +- struct spdk_io_channel *io_ch, void *_ctx) +-{ +- struct spdk_bdev_for_each_io_ctx *ctx = _ctx; +- struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); +- struct spdk_bdev_io *bdev_io; +- int rc = 0; +- +- TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { +- rc = ctx->fn(ctx->ctx, bdev_io); +- if (rc != 0) { +- break; +- } +- } +- +- spdk_bdev_for_each_channel_continue(i, rc); +-} +- +-static void +-bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) +-{ +- struct spdk_bdev_for_each_io_ctx *ctx = _ctx; +- +- ctx->cb(ctx->ctx, status); +- +- free(ctx); +-} +- +-void +-spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, +- spdk_bdev_for_each_io_cb cb) +-{ +- struct spdk_bdev_for_each_io_ctx *ctx; +- +- assert(fn != NULL && cb != NULL); +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to allocate context.\n"); +- cb(_ctx, -ENOMEM); +- return; +- } +- +- ctx->ctx = _ctx; +- ctx->fn = fn; +- ctx->cb = cb; +- +- spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, +- bdev_for_each_io_done); +-} +- +-void +-spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) +-{ +- spdk_for_each_channel_continue(iter->i, status); +-} +- +-static struct spdk_bdev * +-io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) +-{ +- void *io_device = spdk_io_channel_iter_get_io_device(i); +- +- return __bdev_from_io_dev(io_device); +-} +- +-static void +-bdev_each_channel_msg(struct spdk_io_channel_iter *i) +-{ +- struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); +- struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); +- struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); +- +- iter->i = i; +- iter->fn(iter, bdev, ch, iter->ctx); +-} +- +-static void +-bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) +-{ +- struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); +- struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); +- +- iter->i = i; +- iter->cpl(bdev, iter->ctx, status); +- +- free(iter); +-} +- +-void +-spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, +- void 
*ctx, spdk_bdev_for_each_channel_done cpl) +-{ +- struct spdk_bdev_channel_iter *iter; +- +- assert(bdev != NULL && fn != NULL && ctx != NULL); +- +- iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); +- if (iter == NULL) { +- SPDK_ERRLOG("Unable to allocate iterator\n"); +- assert(false); +- return; +- } +- +- iter->fn = fn; +- iter->cpl = cpl; +- iter->ctx = ctx; +- +- spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, +- iter, bdev_each_channel_cpl); +-} +- +-int +-spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, +- uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, +- spdk_bdev_io_completion_cb cb, void *cb_arg) +-{ +- struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); +- +- if (!desc->write) { +- return -EBADF; +- } +- +- if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { +- SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); +- return -ENOTSUP; +- } +- +- if (num_blocks == 0) { +- SPDK_ERRLOG("Can't copy 0 blocks\n"); +- return -EINVAL; +- } +- +- if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || +- !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { +- SPDK_DEBUGLOG(bdev, +- "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", +- dst_offset_blocks, src_offset_blocks, num_blocks); +- return -EINVAL; +- } +- +- bdev_io = bdev_channel_get_io(channel); +- if (!bdev_io) { +- return -ENOMEM; +- } +- +- bdev_io->internal.ch = channel; +- bdev_io->internal.desc = desc; +- bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; +- +- bdev_io->u.bdev.offset_blocks = dst_offset_blocks; +- bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; +- bdev_io->u.bdev.num_blocks = num_blocks; +- bdev_io->u.bdev.ext_opts = NULL; +- bdev_io_init(bdev_io, bdev, cb_arg, cb); +- +- bdev_io_submit(bdev_io); +- return 0; +-} +- +-SPDK_LOG_REGISTER_COMPONENT(bdev) +- +-SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) +-{ +- struct spdk_trace_tpoint_opts opts[] = { +- { +- "BDEV_IO_START", TRACE_BDEV_IO_START, +- OWNER_BDEV, OBJECT_BDEV_IO, 1, +- { +- { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, +- { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, +- { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, +- { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, +- { "name", SPDK_TRACE_ARG_TYPE_STR, 40} +- } +- }, +- { +- "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, +- OWNER_BDEV, OBJECT_BDEV_IO, 0, +- {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} +- }, +- { +- "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, +- OWNER_BDEV, OBJECT_NONE, 1, +- { +- { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, +- { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} +- } +- }, +- { +- "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, +- OWNER_BDEV, OBJECT_NONE, 0, +- { +- { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, +- { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} +- } +- }, +- }; +- +- +- spdk_trace_register_owner(OWNER_BDEV, 'b'); +- spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); +- spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); +- spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); +- spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++ ++#include "spdk/config.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/likely.h" ++#include "spdk/queue.h" ++#include "spdk/nvme_spec.h" ++#include "spdk/scsi_spec.h" ++#include "spdk/notify.h" ++#include "spdk/util.h" ++#include "spdk/trace.h" ++#include "spdk/dma.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/event.h" ++ ++#include "bdev_internal.h" ++#include "spdk_internal/trace_defs.h" ++ ++#ifdef SPDK_CONFIG_VTUNE ++#include "ittnotify.h" ++#include "ittnotify_types.h" ++int __itt_init_ittlib(const char *, __itt_group_id); ++#endif ++ ++#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) ++#define SPDK_BDEV_IO_CACHE_SIZE 256 ++#define SPDK_BDEV_AUTO_EXAMINE true ++#define BUF_SMALL_POOL_SIZE 8191 ++#define BUF_LARGE_POOL_SIZE 1023 ++#define BUF_SMALL_CACHE_SIZE 128 ++#define BUF_LARGE_CACHE_SIZE 16 ++#define NOMEM_THRESHOLD_COUNT 8 ++ ++#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 ++#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 ++#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 ++#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 ++#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) ++#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX ++#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 ++ ++/* The maximum number of children requests for a UNMAP or WRITE ZEROES command ++ * when splitting into children requests at a time. ++ */ ++#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) ++#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 ++ ++/* The maximum number of children requests for a COPY command ++ * when splitting into children requests at a time. ++ */ ++#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) ++ ++SPDK_LOG_DEPRECATION_REGISTER(bdev_register_examine_thread, ++ "bdev register and examine on non-app thread", "SPDK 23.05", 0); ++ ++SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); ++ ++static const char *qos_rpc_type[] = {"rw_ios_per_sec", ++ "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" ++ }; ++ ++TAILQ_HEAD(spdk_bdev_list, spdk_bdev); ++ ++RB_HEAD(bdev_name_tree, spdk_bdev_name); ++ ++static int ++bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) ++{ ++ return strcmp(name1->name, name2->name); ++} ++ ++RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); ++ ++struct spdk_bdev_mgr { ++ struct spdk_mempool *bdev_io_pool; ++ ++ void *zero_buffer; ++ ++ TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; ++ ++ struct spdk_bdev_list bdevs; ++ struct bdev_name_tree bdev_names; ++ ++ bool init_complete; ++ bool module_init_complete; ++ ++ struct spdk_spinlock spinlock; ++ ++#ifdef SPDK_CONFIG_VTUNE ++ __itt_domain *domain; ++#endif ++}; ++ ++static struct spdk_bdev_mgr g_bdev_mgr = { ++ .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), ++ .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), ++ .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), ++ .init_complete = false, ++ .module_init_complete = false, ++}; ++ ++static void ++__attribute__((constructor)) ++_bdev_init(void) ++{ ++ spdk_spin_init(&g_bdev_mgr.spinlock); ++} ++ ++typedef void (*lock_range_cb)(void *ctx, int status); ++ ++typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); ++ ++struct lba_range { ++ uint64_t offset; ++ uint64_t length; ++ void *locked_ctx; ++ struct spdk_bdev_channel *owner_ch; ++ TAILQ_ENTRY(lba_range) 
tailq; ++}; ++ ++static struct spdk_bdev_opts g_bdev_opts = { ++ .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, ++ .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, ++ .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, ++ .small_buf_pool_size = BUF_SMALL_POOL_SIZE, ++ .large_buf_pool_size = BUF_LARGE_POOL_SIZE, ++}; ++ ++static spdk_bdev_init_cb g_init_cb_fn = NULL; ++static void *g_init_cb_arg = NULL; ++ ++static spdk_bdev_fini_cb g_fini_cb_fn = NULL; ++static void *g_fini_cb_arg = NULL; ++static struct spdk_thread *g_fini_thread = NULL; ++ ++struct spdk_bdev_qos_limit { ++ /** IOs or bytes allowed per second (i.e., 1s). */ ++ uint64_t limit; ++ ++ /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). ++ * For remaining bytes, allowed to run negative if an I/O is submitted when ++ * some bytes are remaining, but the I/O is bigger than that amount. The ++ * excess will be deducted from the next timeslice. ++ */ ++ int64_t remaining_this_timeslice; ++ ++ /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ ++ uint32_t min_per_timeslice; ++ ++ /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ ++ uint32_t max_per_timeslice; ++ ++ /** Function to check whether to queue the IO. */ ++ bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); ++ ++ /** Function to update for the submitted IO. */ ++ void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); ++}; ++ ++struct spdk_bdev_qos { ++ /** Types of structure of rate limits. */ ++ struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; ++ ++ /** The channel that all I/O are funneled through. */ ++ struct spdk_bdev_channel *ch; ++ ++ /** The thread on which the poller is running. */ ++ struct spdk_thread *thread; ++ ++ /** Queue of I/O waiting to be issued. */ ++ bdev_io_tailq_t queued; ++ ++ /** Size of a timeslice in tsc ticks. */ ++ uint64_t timeslice_size; ++ ++ /** Timestamp of start of last timeslice. */ ++ uint64_t last_timeslice; ++ ++ /** Poller that processes queued I/O commands each time slice. */ ++ struct spdk_poller *poller; ++}; ++ ++struct spdk_bdev_mgmt_channel { ++ /* ++ * Each thread keeps a cache of bdev_io - this allows ++ * bdev threads which are *not* DPDK threads to still ++ * benefit from a per-thread bdev_io cache. Without ++ * this, non-DPDK threads fetching from the mempool ++ * incur a cmpxchg on get and put. ++ */ ++ bdev_io_stailq_t per_thread_cache; ++ uint32_t per_thread_cache_count; ++ uint32_t bdev_io_cache_size; ++ ++ struct spdk_iobuf_channel iobuf; ++ ++ TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; ++ TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; ++}; ++ ++/* ++ * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device ++ * will queue here their IO that awaits retry. It makes it possible to retry sending ++ * IO to one bdev after IO from other bdev completes. ++ */ ++struct spdk_bdev_shared_resource { ++ /* The bdev management channel */ ++ struct spdk_bdev_mgmt_channel *mgmt_ch; ++ ++ /* ++ * Count of I/O submitted to bdev module and waiting for completion. ++ * Incremented before submit_request() is called on an spdk_bdev_io. ++ */ ++ uint64_t io_outstanding; ++ ++ /* ++ * Queue of IO awaiting retry because of a previous NOMEM status returned ++ * on this channel. ++ */ ++ bdev_io_tailq_t nomem_io; ++ ++ /* ++ * Threshold which io_outstanding must drop to before retrying nomem_io. 
++ */ ++ uint64_t nomem_threshold; ++ ++ /* I/O channel allocated by a bdev module */ ++ struct spdk_io_channel *shared_ch; ++ ++ /* Refcount of bdev channels using this resource */ ++ uint32_t ref; ++ ++ TAILQ_ENTRY(spdk_bdev_shared_resource) link; ++}; ++ ++#define BDEV_CH_RESET_IN_PROGRESS (1 << 0) ++#define BDEV_CH_QOS_ENABLED (1 << 1) ++ ++struct spdk_bdev_channel { ++ struct spdk_bdev *bdev; ++ ++ /* The channel for the underlying device */ ++ struct spdk_io_channel *channel; ++ ++ /* Per io_device per thread data */ ++ struct spdk_bdev_shared_resource *shared_resource; ++ ++ struct spdk_bdev_io_stat *stat; ++ ++ /* ++ * Count of I/O submitted to the underlying dev module through this channel ++ * and waiting for completion. ++ */ ++ uint64_t io_outstanding; ++ ++ /* ++ * List of all submitted I/Os including I/O that are generated via splitting. ++ */ ++ bdev_io_tailq_t io_submitted; ++ ++ /* ++ * List of spdk_bdev_io that are currently queued because they write to a locked ++ * LBA range. ++ */ ++ bdev_io_tailq_t io_locked; ++ ++ uint32_t flags; ++ ++ struct spdk_histogram_data *histogram; ++ ++#ifdef SPDK_CONFIG_VTUNE ++ uint64_t start_tsc; ++ uint64_t interval_tsc; ++ __itt_string_handle *handle; ++ struct spdk_bdev_io_stat *prev_stat; ++#endif ++ ++ bdev_io_tailq_t queued_resets; ++ ++ lba_range_tailq_t locked_ranges; ++}; ++ ++struct media_event_entry { ++ struct spdk_bdev_media_event event; ++ TAILQ_ENTRY(media_event_entry) tailq; ++}; ++ ++#define MEDIA_EVENT_POOL_SIZE 64 ++ ++struct spdk_bdev_desc { ++ struct spdk_bdev *bdev; ++ struct spdk_thread *thread; ++ struct { ++ spdk_bdev_event_cb_t event_fn; ++ void *ctx; ++ } callback; ++ bool closed; ++ bool write; ++ bool memory_domains_supported; ++ struct spdk_spinlock spinlock; ++ uint32_t refs; ++ TAILQ_HEAD(, media_event_entry) pending_media_events; ++ TAILQ_HEAD(, media_event_entry) free_media_events; ++ struct media_event_entry *media_events_buffer; ++ TAILQ_ENTRY(spdk_bdev_desc) link; ++ ++ uint64_t timeout_in_sec; ++ spdk_bdev_io_timeout_cb cb_fn; ++ void *cb_arg; ++ struct spdk_poller *io_timeout_poller; ++}; ++ ++struct spdk_bdev_iostat_ctx { ++ struct spdk_bdev_io_stat *stat; ++ spdk_bdev_get_device_stat_cb cb; ++ void *cb_arg; ++}; ++ ++struct set_qos_limit_ctx { ++ void (*cb_fn)(void *cb_arg, int status); ++ void *cb_arg; ++ struct spdk_bdev *bdev; ++}; ++ ++struct spdk_bdev_channel_iter { ++ spdk_bdev_for_each_channel_msg fn; ++ spdk_bdev_for_each_channel_done cpl; ++ struct spdk_io_channel_iter *i; ++ void *ctx; ++}; ++ ++struct spdk_bdev_io_error_stat { ++ uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; ++}; ++ ++#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) ++#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) ++#define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) ++#define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) ++ ++static inline void bdev_io_complete(void *ctx); ++ ++static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); ++static void bdev_write_zero_buffer_next(void *_bdev_io); ++ ++static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, void *_ctx); ++static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); ++ ++static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void 
*md_buf, uint64_t offset_blocks, ++ uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts, bool copy_opts); ++static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts, bool copy_opts); ++ ++static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, ++ uint64_t offset, uint64_t length, ++ lock_range_cb cb_fn, void *cb_arg); ++ ++static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, ++ uint64_t offset, uint64_t length, ++ lock_range_cb cb_fn, void *cb_arg); ++ ++static inline void bdev_io_complete(void *ctx); ++ ++static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); ++static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); ++ ++void ++spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) ++{ ++ if (!opts) { ++ SPDK_ERRLOG("opts should not be NULL\n"); ++ return; ++ } ++ ++ if (!opts_size) { ++ SPDK_ERRLOG("opts_size should not be zero value\n"); ++ return; ++ } ++ ++ opts->opts_size = opts_size; ++ ++#define SET_FIELD(field) \ ++ if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ ++ opts->field = g_bdev_opts.field; \ ++ } \ ++ ++ SET_FIELD(bdev_io_pool_size); ++ SET_FIELD(bdev_io_cache_size); ++ SET_FIELD(bdev_auto_examine); ++ SET_FIELD(small_buf_pool_size); ++ SET_FIELD(large_buf_pool_size); ++ ++ /* Do not remove this statement, you should always update this statement when you adding a new field, ++ * and do not forget to add the SET_FIELD statement for your added field. */ ++ SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); ++ ++#undef SET_FIELD ++} ++ ++SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", ++ "v23.05", 0); ++SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", ++ "v23.05", 0); ++int ++spdk_bdev_set_opts(struct spdk_bdev_opts *opts) ++{ ++ struct spdk_iobuf_opts iobuf_opts; ++ uint32_t min_pool_size; ++ int rc; ++ ++ if (!opts) { ++ SPDK_ERRLOG("opts cannot be NULL\n"); ++ return -1; ++ } ++ ++ if (!opts->opts_size) { ++ SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); ++ return -1; ++ } ++ ++ /* ++ * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem ++ * initialization. A second mgmt_ch will be created on the same thread when the application starts ++ * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
++ */ ++ min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); ++ if (opts->bdev_io_pool_size < min_pool_size) { ++ SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 ++ " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, ++ spdk_thread_get_count()); ++ SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); ++ return -1; ++ } ++ ++ if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { ++ SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); ++ } ++ if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { ++ SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); ++ } ++ ++#define SET_FIELD(field) \ ++ if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ ++ g_bdev_opts.field = opts->field; \ ++ } \ ++ ++ SET_FIELD(bdev_io_pool_size); ++ SET_FIELD(bdev_io_cache_size); ++ SET_FIELD(bdev_auto_examine); ++ SET_FIELD(small_buf_pool_size); ++ SET_FIELD(large_buf_pool_size); ++ ++ spdk_iobuf_get_opts(&iobuf_opts); ++ iobuf_opts.small_pool_count = opts->small_buf_pool_size; ++ iobuf_opts.large_pool_count = opts->large_buf_pool_size; ++ ++ rc = spdk_iobuf_set_opts(&iobuf_opts); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to set iobuf opts\n"); ++ return -1; ++ } ++ ++ g_bdev_opts.opts_size = opts->opts_size; ++ ++#undef SET_FIELD ++ ++ return 0; ++} ++ ++static struct spdk_bdev * ++bdev_get_by_name(const char *bdev_name) ++{ ++ struct spdk_bdev_name find; ++ struct spdk_bdev_name *res; ++ ++ find.name = (char *)bdev_name; ++ res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); ++ if (res != NULL) { ++ return res->bdev; ++ } ++ ++ return NULL; ++} ++ ++struct spdk_bdev * ++spdk_bdev_get_by_name(const char *bdev_name) ++{ ++ struct spdk_bdev *bdev; ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ bdev = bdev_get_by_name(bdev_name); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ return bdev; ++} ++ ++struct bdev_io_status_string { ++ enum spdk_bdev_io_status status; ++ const char *str; ++}; ++ ++static const struct bdev_io_status_string bdev_io_status_strings[] = { ++ { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, ++ { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, ++ { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, ++ { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, ++ { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, ++ { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, ++ { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, ++ { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, ++ { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, ++ { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, ++}; ++ ++static const char * ++bdev_io_status_get_string(enum spdk_bdev_io_status status) ++{ ++ uint32_t i; ++ ++ for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { ++ if (bdev_io_status_strings[i].status == status) { ++ return bdev_io_status_strings[i].str; ++ } ++ } ++ ++ return "reserved"; ++} ++ ++struct spdk_bdev_wait_for_examine_ctx { ++ struct spdk_poller *poller; ++ spdk_bdev_wait_for_examine_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static bool bdev_module_all_actions_completed(void); ++ ++static int ++bdev_wait_for_examine_cb(void *arg) ++{ ++ struct spdk_bdev_wait_for_examine_ctx *ctx = arg; ++ ++ if (!bdev_module_all_actions_completed()) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ spdk_poller_unregister(&ctx->poller); ++ ctx->cb_fn(ctx->cb_arg); ++ free(ctx); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void 
*cb_arg) ++{ ++ struct spdk_bdev_wait_for_examine_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ return -ENOMEM; ++ } ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); ++ ++ return 0; ++} ++ ++struct spdk_bdev_examine_item { ++ char *name; ++ TAILQ_ENTRY(spdk_bdev_examine_item) link; ++}; ++ ++TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); ++ ++struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( ++ g_bdev_examine_allowlist); ++ ++static inline bool ++bdev_examine_allowlist_check(const char *name) ++{ ++ struct spdk_bdev_examine_item *item; ++ TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { ++ if (strcmp(name, item->name) == 0) { ++ return true; ++ } ++ } ++ return false; ++} ++ ++static inline void ++bdev_examine_allowlist_free(void) ++{ ++ struct spdk_bdev_examine_item *item; ++ while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { ++ item = TAILQ_FIRST(&g_bdev_examine_allowlist); ++ TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); ++ free(item->name); ++ free(item); ++ } ++} ++ ++static inline bool ++bdev_in_examine_allowlist(struct spdk_bdev *bdev) ++{ ++ struct spdk_bdev_alias *tmp; ++ if (bdev_examine_allowlist_check(bdev->name)) { ++ return true; ++ } ++ TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { ++ if (bdev_examine_allowlist_check(tmp->alias.name)) { ++ return true; ++ } ++ } ++ return false; ++} ++ ++static inline bool ++bdev_ok_to_examine(struct spdk_bdev *bdev) ++{ ++ if (g_bdev_opts.bdev_auto_examine) { ++ return true; ++ } else { ++ return bdev_in_examine_allowlist(bdev); ++ } ++} ++ ++static void ++bdev_examine(struct spdk_bdev *bdev) ++{ ++ if (spdk_ssam_get_hot_restart() == true) { ++ return; ++ } ++ struct spdk_bdev_module *module; ++ uint32_t action; ++ ++ if (!bdev_ok_to_examine(bdev)) { ++ return; ++ } ++ ++ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (module->examine_config) { ++ spdk_spin_lock(&module->internal.spinlock); ++ action = module->internal.action_in_progress; ++ module->internal.action_in_progress++; ++ spdk_spin_unlock(&module->internal.spinlock); ++ module->examine_config(bdev); ++ if (action != module->internal.action_in_progress) { ++ SPDK_ERRLOG("examine_config for module %s did not call " ++ "spdk_bdev_module_examine_done()\n", module->name); ++ } ++ } ++ } ++ ++ if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { ++ module = bdev->internal.claim.v1.module; ++ if (module->examine_disk) { ++ spdk_spin_lock(&module->internal.spinlock); ++ module->internal.action_in_progress++; ++ spdk_spin_unlock(&module->internal.spinlock); ++ module->examine_disk(bdev); ++ } ++ return; ++ } ++ ++ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (module->examine_disk) { ++ spdk_spin_lock(&module->internal.spinlock); ++ module->internal.action_in_progress++; ++ spdk_spin_unlock(&module->internal.spinlock); ++ module->examine_disk(bdev); ++ } ++ } ++} ++ ++int ++spdk_bdev_examine(const char *name) ++{ ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_examine_item *item; ++ ++ if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { ++ SPDK_LOG_DEPRECATED(bdev_register_examine_thread); ++ } ++ ++ if (g_bdev_opts.bdev_auto_examine) { ++ SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); ++ return -EINVAL; ++ } ++ ++ if (bdev_examine_allowlist_check(name)) { ++ SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); ++ 
return -EEXIST; ++ } ++ ++ item = calloc(1, sizeof(*item)); ++ if (!item) { ++ return -ENOMEM; ++ } ++ item->name = strdup(name); ++ if (!item->name) { ++ free(item); ++ return -ENOMEM; ++ } ++ TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); ++ ++ bdev = spdk_bdev_get_by_name(name); ++ if (bdev) { ++ bdev_examine(bdev); ++ } ++ return 0; ++} ++ ++static inline void ++bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct spdk_bdev_examine_item *item; ++ TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_examine"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", item->name); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ } ++} ++ ++struct spdk_bdev * ++spdk_bdev_first(void) ++{ ++ struct spdk_bdev *bdev; ++ ++ bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); ++ if (bdev) { ++ SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); ++ } ++ ++ return bdev; ++} ++ ++struct spdk_bdev * ++spdk_bdev_next(struct spdk_bdev *prev) ++{ ++ struct spdk_bdev *bdev; ++ ++ bdev = TAILQ_NEXT(prev, internal.link); ++ if (bdev) { ++ SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); ++ } ++ ++ return bdev; ++} ++ ++static struct spdk_bdev * ++_bdev_next_leaf(struct spdk_bdev *bdev) ++{ ++ while (bdev != NULL) { ++ if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { ++ return bdev; ++ } else { ++ bdev = TAILQ_NEXT(bdev, internal.link); ++ } ++ } ++ ++ return bdev; ++} ++ ++struct spdk_bdev * ++spdk_bdev_first_leaf(void) ++{ ++ struct spdk_bdev *bdev; ++ ++ bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); ++ ++ if (bdev) { ++ SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); ++ } ++ ++ return bdev; ++} ++ ++struct spdk_bdev * ++spdk_bdev_next_leaf(struct spdk_bdev *prev) ++{ ++ struct spdk_bdev *bdev; ++ ++ bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); ++ ++ if (bdev) { ++ SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); ++ } ++ ++ return bdev; ++} ++ ++static inline bool ++bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) ++{ ++ return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; ++} ++ ++void ++spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) ++{ ++ struct iovec *iovs; ++ ++ if (bdev_io->u.bdev.iovs == NULL) { ++ bdev_io->u.bdev.iovs = &bdev_io->iov; ++ bdev_io->u.bdev.iovcnt = 1; ++ } ++ ++ iovs = bdev_io->u.bdev.iovs; ++ ++ assert(iovs != NULL); ++ assert(bdev_io->u.bdev.iovcnt >= 1); ++ ++ iovs[0].iov_base = buf; ++ iovs[0].iov_len = len; ++} ++ ++void ++spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) ++{ ++ assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); ++ bdev_io->u.bdev.md_buf = md_buf; ++} ++ ++static bool ++_is_buf_allocated(const struct iovec *iovs) ++{ ++ if (iovs == NULL) { ++ return false; ++ } ++ ++ return iovs[0].iov_base != NULL; ++} ++ ++static bool ++_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) ++{ ++ int i; ++ uintptr_t iov_base; ++ ++ if (spdk_likely(alignment == 1)) { ++ return true; ++ } ++ ++ for (i = 0; i < iovcnt; i++) { ++ iov_base = (uintptr_t)iovs[i].iov_base; ++ if ((iov_base & (alignment - 1)) != 0) { ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static void ++bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) ++{ ++ struct 
spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); ++ void *buf; ++ ++ if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { ++ buf = bdev_io->internal.buf; ++ bdev_io->internal.buf = NULL; ++ bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); ++ bdev_io->internal.get_aux_buf_cb = NULL; ++ } else { ++ assert(bdev_io->internal.get_buf_cb != NULL); ++ bdev_io->internal.get_buf_cb(ch, bdev_io, status); ++ bdev_io->internal.get_buf_cb = NULL; ++ } ++} ++ ++static void ++_bdev_io_pull_buffer_cpl(void *ctx, int rc) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ if (rc) { ++ SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ bdev_io_get_buf_complete(bdev_io, !rc); ++} ++ ++static void ++_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) ++{ ++ int rc = 0; ++ ++ /* save original md_buf */ ++ bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; ++ bdev_io->internal.orig_md_iov.iov_len = len; ++ bdev_io->internal.bounce_md_iov.iov_base = md_buf; ++ bdev_io->internal.bounce_md_iov.iov_len = len; ++ /* set bounce md_buf */ ++ bdev_io->u.bdev.md_buf = md_buf; ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ if (bdev_io_use_memory_domain(bdev_io)) { ++ rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, ++ bdev_io->internal.ext_opts->memory_domain_ctx, ++ &bdev_io->internal.orig_md_iov, 1, ++ &bdev_io->internal.bounce_md_iov, 1, ++ bdev_io->internal.data_transfer_cpl, ++ bdev_io); ++ if (rc == 0) { ++ /* Continue to submit IO in completion callback */ ++ return; ++ } ++ SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", ++ spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); ++ } else { ++ memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); ++ } ++ } ++ ++ assert(bdev_io->internal.data_transfer_cpl); ++ bdev_io->internal.data_transfer_cpl(bdev_io, rc); ++} ++ ++static void ++_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ uint64_t md_len; ++ void *buf; ++ ++ if (spdk_bdev_is_md_separate(bdev)) { ++ buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; ++ md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; ++ ++ assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); ++ ++ if (bdev_io->u.bdev.md_buf != NULL) { ++ _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); ++ return; ++ } else { ++ spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); ++ } ++ } ++ ++ bdev_io_get_buf_complete(bdev_io, true); ++} ++ ++static void ++_bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ if (rc) { ++ SPDK_ERRLOG("Failed to get data buffer\n"); ++ assert(bdev_io->internal.data_transfer_cpl); ++ bdev_io->internal.data_transfer_cpl(bdev_io, rc); ++ return; ++ } ++ ++ _bdev_io_set_md_buf(bdev_io); ++} ++ ++static void ++_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, ++ bdev_copy_bounce_buffer_cpl cpl_cb) ++{ ++ int rc = 0; ++ ++ bdev_io->internal.data_transfer_cpl = cpl_cb; ++ /* save original iovec */ ++ bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; ++ bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; ++ /* set bounce iov */ ++ bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; ++ bdev_io->u.bdev.iovcnt = 1; ++ /* set bounce buffer for this operation */ ++ 
bdev_io->u.bdev.iovs[0].iov_base = buf; ++ bdev_io->u.bdev.iovs[0].iov_len = len; ++ /* if this is write path, copy data from original buffer to bounce buffer */ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ if (bdev_io_use_memory_domain(bdev_io)) { ++ rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, ++ bdev_io->internal.ext_opts->memory_domain_ctx, ++ bdev_io->internal.orig_iovs, ++ (uint32_t) bdev_io->internal.orig_iovcnt, ++ bdev_io->u.bdev.iovs, 1, ++ _bdev_io_pull_bounce_data_buf_done, ++ bdev_io); ++ if (rc == 0) { ++ /* Continue to submit IO in completion callback */ ++ return; ++ } ++ SPDK_ERRLOG("Failed to pull data from memory domain %s\n", ++ spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); ++ } else { ++ spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); ++ } ++ } ++ ++ _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); ++} ++ ++static void ++_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ bool buf_allocated; ++ uint64_t alignment; ++ void *aligned_buf; ++ ++ bdev_io->internal.buf = buf; ++ ++ if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { ++ bdev_io_get_buf_complete(bdev_io, true); ++ return; ++ } ++ ++ alignment = spdk_bdev_get_buf_align(bdev); ++ buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); ++ aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); ++ ++ if (buf_allocated) { ++ _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); ++ /* Continue in completion callback */ ++ return; ++ } else { ++ spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); ++ } ++ ++ _bdev_io_set_md_buf(bdev_io); ++} ++ ++static inline uint64_t ++bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ uint64_t md_len, alignment; ++ ++ md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; ++ ++ /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ ++ alignment = spdk_bdev_get_buf_align(bdev) - 1; ++ ++ return len + alignment + md_len; ++} ++ ++static void ++_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) ++{ ++ struct spdk_bdev_mgmt_channel *ch; ++ ++ ch = bdev_io->internal.ch->shared_resource->mgmt_ch; ++ spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); ++} ++ ++static void ++bdev_io_put_buf(struct spdk_bdev_io *bdev_io) ++{ ++ assert(bdev_io->internal.buf != NULL); ++ _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); ++ bdev_io->internal.buf = NULL; ++} ++ ++void ++spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) ++{ ++ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; ++ ++ assert(buf != NULL); ++ _bdev_io_put_buf(bdev_io, buf, len); ++} ++ ++static void ++bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) ++{ ++ struct spdk_bdev *bdev = bdev_ch->bdev; ++ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; ++ struct spdk_bdev_io *bdev_io; ++ ++ if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { ++ /* ++ * Allow some more I/O to complete before retrying the nomem_io queue. 
++		 * Some drivers (such as nvme) cannot immediately take a new I/O in
++		 * the context of a completion, because the resources for the I/O are
++		 * not released until control returns to the bdev poller. Also, we
++		 * may require several small I/O to complete before a larger I/O
++		 * (that requires splitting) can be submitted.
++		 */
++		return;
++	}
++
++	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
++		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
++		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
++		bdev_io->internal.ch->io_outstanding++;
++		shared_resource->io_outstanding++;
++		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
++		bdev_io->internal.error.nvme.cdw0 = 0;
++		bdev_io->num_retries++;
++		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
++		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
++			break;
++		}
++	}
++}
++
++static inline void
++_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
++			       struct spdk_bdev_shared_resource *shared_resource)
++{
++	assert(bdev_ch->io_outstanding > 0);
++	assert(shared_resource->io_outstanding > 0);
++	bdev_ch->io_outstanding--;
++	shared_resource->io_outstanding--;
++}
++
++static inline bool
++_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
++{
++	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
++	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
++
++	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
++		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
++		/*
++		 * Wait for some of the outstanding I/O to complete before we
++		 * retry any of the nomem_io. Normally we will wait for
++		 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
++		 * depth channels we will instead wait for half to complete.
++		 */
++		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
++						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
++		return true;
++	}
++
++	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
++		bdev_ch_retry_io(bdev_ch);
++	}
++
++	return false;
++}
++
++static void
++_bdev_io_complete_push_bounce_done(void *ctx, int rc)
++{
++	struct spdk_bdev_io *bdev_io = ctx;
++	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
++	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
++
++	if (rc) {
++		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
++	}
++	/* We want to free the bounce buffer here since we know we're done with it (as opposed
++	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
++ */ ++ bdev_io_put_buf(bdev_io); ++ ++ /* Continue with IO completion flow */ ++ _bdev_io_decrement_outstanding(bdev_ch, shared_resource); ++ if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { ++ return; ++ } ++ ++ bdev_io_complete(bdev_io); ++} ++ ++static inline void ++_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) ++{ ++ int rc = 0; ++ ++ /* do the same for metadata buffer */ ++ if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { ++ assert(spdk_bdev_is_md_separate(bdev_io->bdev)); ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && ++ bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ if (bdev_io_use_memory_domain(bdev_io)) { ++ /* If memory domain is used then we need to call async push function */ ++ rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, ++ bdev_io->internal.ext_opts->memory_domain_ctx, ++ &bdev_io->internal.orig_md_iov, ++ (uint32_t)bdev_io->internal.orig_iovcnt, ++ &bdev_io->internal.bounce_md_iov, 1, ++ bdev_io->internal.data_transfer_cpl, ++ bdev_io); ++ if (rc == 0) { ++ /* Continue IO completion in async callback */ ++ return; ++ } ++ SPDK_ERRLOG("Failed to push md to memory domain %s\n", ++ spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); ++ } else { ++ memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, ++ bdev_io->internal.orig_md_iov.iov_len); ++ } ++ } ++ } ++ ++ assert(bdev_io->internal.data_transfer_cpl); ++ bdev_io->internal.data_transfer_cpl(bdev_io, rc); ++} ++ ++static void ++_bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ assert(bdev_io->internal.data_transfer_cpl); ++ ++ if (rc) { ++ bdev_io->internal.data_transfer_cpl(bdev_io, rc); ++ return; ++ } ++ ++ /* set original buffer for this io */ ++ bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; ++ bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; ++ /* disable bouncing buffer for this io */ ++ bdev_io->internal.orig_iovcnt = 0; ++ bdev_io->internal.orig_iovs = NULL; ++ ++ _bdev_io_push_bounce_md_buffer(bdev_io); ++} ++ ++static inline void ++_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) ++{ ++ int rc = 0; ++ ++ bdev_io->internal.data_transfer_cpl = cpl_cb; ++ ++ /* if this is read path, copy data from bounce buffer to original buffer */ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && ++ bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ if (bdev_io_use_memory_domain(bdev_io)) { ++ /* If memory domain is used then we need to call async push function */ ++ rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, ++ bdev_io->internal.ext_opts->memory_domain_ctx, ++ bdev_io->internal.orig_iovs, ++ (uint32_t)bdev_io->internal.orig_iovcnt, ++ &bdev_io->internal.bounce_iov, 1, ++ _bdev_io_push_bounce_data_buffer_done, ++ bdev_io); ++ if (rc == 0) { ++ /* Continue IO completion in async callback */ ++ return; ++ } ++ SPDK_ERRLOG("Failed to push data to memory domain %s\n", ++ spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); ++ } else { ++ spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, ++ bdev_io->internal.orig_iovcnt, ++ bdev_io->internal.bounce_iov.iov_base, ++ bdev_io->internal.bounce_iov.iov_len); ++ } ++ } ++ ++ _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); ++} ++ ++static void ++bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) ++{ ++ struct spdk_bdev_io *bdev_io; ++ ++ bdev_io 
= SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); ++ _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); ++} ++ ++static void ++bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) ++{ ++ struct spdk_bdev_mgmt_channel *mgmt_ch; ++ uint64_t max_len; ++ void *buf; ++ ++ assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); ++ mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; ++ max_len = bdev_io_get_max_buf_len(bdev_io, len); ++ ++ if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { ++ SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); ++ bdev_io_get_buf_complete(bdev_io, false); ++ return; ++ } ++ ++ bdev_io->internal.buf_len = len; ++ buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, ++ bdev_io_get_iobuf_cb); ++ if (buf != NULL) { ++ _bdev_io_set_buf(bdev_io, buf, len); ++ } ++} ++ ++void ++spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ uint64_t alignment; ++ ++ assert(cb != NULL); ++ bdev_io->internal.get_buf_cb = cb; ++ ++ alignment = spdk_bdev_get_buf_align(bdev); ++ ++ if (_is_buf_allocated(bdev_io->u.bdev.iovs) && ++ _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { ++ /* Buffer already present and aligned */ ++ cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); ++ return; ++ } ++ ++ bdev_io_get_buf(bdev_io, len); ++} ++ ++static void ++_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ if (!success) { ++ SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); ++ bdev_io_complete(bdev_io); ++ } else { ++ bdev_io_submit(bdev_io); ++ } ++} ++ ++static void ++_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, ++ uint64_t len) ++{ ++ assert(cb != NULL); ++ bdev_io->internal.get_buf_cb = cb; ++ ++ bdev_io_get_buf(bdev_io, len); ++} ++ ++void ++spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) ++{ ++ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; ++ ++ assert(cb != NULL); ++ assert(bdev_io->internal.get_aux_buf_cb == NULL); ++ bdev_io->internal.get_aux_buf_cb = cb; ++ bdev_io_get_buf(bdev_io, len); ++} ++ ++static int ++bdev_module_get_max_ctx_size(void) ++{ ++ struct spdk_bdev_module *bdev_module; ++ int max_bdev_module_size = 0; ++ ++ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { ++ max_bdev_module_size = bdev_module->get_ctx_size(); ++ } ++ } ++ ++ return max_bdev_module_size; ++} ++ ++static void ++bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ int i; ++ struct spdk_bdev_qos *qos = bdev->internal.qos; ++ uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; ++ ++ if (!qos) { ++ return; ++ } ++ ++ spdk_bdev_get_qos_rate_limits(bdev, limits); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bdev->name); ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (limits[i] > 0) { ++ spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); ++ } ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++void ++spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 
++{ ++ struct spdk_bdev_module *bdev_module; ++ struct spdk_bdev *bdev; ++ ++ assert(w != NULL); ++ ++ spdk_json_write_array_begin(w); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_set_options"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); ++ spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); ++ spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ ++ bdev_examine_allowlist_config_json(w); ++ ++ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (bdev_module->config_json) { ++ bdev_module->config_json(w); ++ } ++ } ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ ++ TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { ++ if (bdev->fn_table->write_config_json) { ++ bdev->fn_table->write_config_json(bdev, w); ++ } ++ ++ bdev_qos_config_json(bdev, w); ++ } ++ ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ /* This has to be last RPC in array to make sure all bdevs finished examine */ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_array_end(w); ++} ++ ++static void ++bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) ++{ ++ struct spdk_bdev_mgmt_channel *ch = ctx_buf; ++ struct spdk_bdev_io *bdev_io; ++ ++ spdk_iobuf_channel_fini(&ch->iobuf); ++ ++ while (!STAILQ_EMPTY(&ch->per_thread_cache)) { ++ bdev_io = STAILQ_FIRST(&ch->per_thread_cache); ++ STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); ++ ch->per_thread_cache_count--; ++ spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); ++ } ++ ++ assert(ch->per_thread_cache_count == 0); ++} ++ ++static int ++bdev_mgmt_channel_create(void *io_device, void *ctx_buf) ++{ ++ struct spdk_bdev_mgmt_channel *ch = ctx_buf; ++ struct spdk_bdev_io *bdev_io; ++ uint32_t i; ++ int rc; ++ ++ rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); ++ return -1; ++ } ++ ++ STAILQ_INIT(&ch->per_thread_cache); ++ ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; ++ ++ /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ ++ ch->per_thread_cache_count = 0; ++ for (i = 0; i < ch->bdev_io_cache_size; i++) { ++ bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); ++ if (bdev_io == NULL) { ++ SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); ++ assert(false); ++ bdev_mgmt_channel_destroy(io_device, ctx_buf); ++ return -1; ++ } ++ ch->per_thread_cache_count++; ++ STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); ++ } ++ ++ TAILQ_INIT(&ch->shared_resources); ++ TAILQ_INIT(&ch->io_wait_queue); ++ ++ return 0; ++} ++ ++static void ++bdev_init_complete(int rc) ++{ ++ spdk_bdev_init_cb cb_fn = g_init_cb_fn; ++ void *cb_arg = g_init_cb_arg; ++ struct spdk_bdev_module *m; ++ ++ g_bdev_mgr.init_complete = true; ++ g_init_cb_fn = NULL; ++ g_init_cb_arg = NULL; ++ ++ /* ++ * For modules that need to know when subsystem init is complete, ++ * inform them now. 
++ */ ++ if (rc == 0) { ++ TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (m->init_complete) { ++ m->init_complete(); ++ } ++ } ++ } ++ ++ cb_fn(cb_arg, rc); ++} ++ ++static bool ++bdev_module_all_actions_completed(void) ++{ ++ struct spdk_bdev_module *m; ++ ++ TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (m->internal.action_in_progress > 0) { ++ return false; ++ } ++ } ++ return true; ++} ++ ++static void ++bdev_module_action_complete(void) ++{ ++ /* ++ * Don't finish bdev subsystem initialization if ++ * module pre-initialization is still in progress, or ++ * the subsystem been already initialized. ++ */ ++ if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { ++ return; ++ } ++ ++ /* ++ * Check all bdev modules for inits/examinations in progress. If any ++ * exist, return immediately since we cannot finish bdev subsystem ++ * initialization until all are completed. ++ */ ++ if (!bdev_module_all_actions_completed()) { ++ return; ++ } ++ ++ /* ++ * Modules already finished initialization - now that all ++ * the bdev modules have finished their asynchronous I/O ++ * processing, the entire bdev layer can be marked as complete. ++ */ ++ bdev_init_complete(0); ++} ++ ++static void ++bdev_module_action_done(struct spdk_bdev_module *module) ++{ ++ spdk_spin_lock(&module->internal.spinlock); ++ assert(module->internal.action_in_progress > 0); ++ module->internal.action_in_progress--; ++ spdk_spin_unlock(&module->internal.spinlock); ++ bdev_module_action_complete(); ++} ++ ++void ++spdk_bdev_module_init_done(struct spdk_bdev_module *module) ++{ ++ assert(module->async_init); ++ bdev_module_action_done(module); ++} ++ ++void ++spdk_bdev_module_examine_done(struct spdk_bdev_module *module) ++{ ++ bdev_module_action_done(module); ++} ++ ++/** The last initialized bdev module */ ++static struct spdk_bdev_module *g_resume_bdev_module = NULL; ++ ++static void ++bdev_init_failed(void *cb_arg) ++{ ++ struct spdk_bdev_module *module = cb_arg; ++ ++ spdk_spin_lock(&module->internal.spinlock); ++ assert(module->internal.action_in_progress > 0); ++ module->internal.action_in_progress--; ++ spdk_spin_unlock(&module->internal.spinlock); ++ bdev_init_complete(-1); ++} ++ ++static int ++bdev_modules_init(void) ++{ ++ struct spdk_bdev_module *module; ++ int rc = 0; ++ ++ TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ g_resume_bdev_module = module; ++ if (module->async_init) { ++ spdk_spin_lock(&module->internal.spinlock); ++ module->internal.action_in_progress = 1; ++ spdk_spin_unlock(&module->internal.spinlock); ++ } ++ rc = module->module_init(); ++ if (rc != 0) { ++ /* Bump action_in_progress to prevent other modules from completion of modules_init ++ * Send message to defer application shutdown until resources are cleaned up */ ++ spdk_spin_lock(&module->internal.spinlock); ++ module->internal.action_in_progress = 1; ++ spdk_spin_unlock(&module->internal.spinlock); ++ spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); ++ return rc; ++ } ++ } ++ ++ g_resume_bdev_module = NULL; ++ return 0; ++} ++ ++void ++spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) ++{ ++ int rc = 0; ++ char mempool_name[32]; ++ ++ assert(cb_fn != NULL); ++ ++ g_init_cb_fn = cb_fn; ++ g_init_cb_arg = cb_arg; ++ ++ spdk_notify_type_register("bdev_register"); ++ spdk_notify_type_register("bdev_unregister"); ++ ++ snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); ++ ++ rc = spdk_iobuf_register_module("bdev"); ++ if (rc 
!= 0) { ++ SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); ++ bdev_init_complete(-1); ++ return; ++ } ++ ++ g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, ++ g_bdev_opts.bdev_io_pool_size, ++ sizeof(struct spdk_bdev_io) + ++ bdev_module_get_max_ctx_size(), ++ 0, ++ SPDK_ENV_SOCKET_ID_ANY); ++ ++ if (g_bdev_mgr.bdev_io_pool == NULL) { ++ SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); ++ bdev_init_complete(-1); ++ return; ++ } ++ ++ g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, ++ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!g_bdev_mgr.zero_buffer) { ++ SPDK_ERRLOG("create bdev zero buffer failed\n"); ++ bdev_init_complete(-1); ++ return; ++ } ++ ++#ifdef SPDK_CONFIG_VTUNE ++ SPDK_LOG_DEPRECATED(vtune_support); ++ g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); ++#endif ++ ++ spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, ++ bdev_mgmt_channel_destroy, ++ sizeof(struct spdk_bdev_mgmt_channel), ++ "bdev_mgr"); ++ ++ rc = bdev_modules_init(); ++ g_bdev_mgr.module_init_complete = true; ++ if (rc != 0) { ++ SPDK_ERRLOG("bdev modules init failed\n"); ++ return; ++ } ++ ++ bdev_module_action_complete(); ++} ++ ++static void ++bdev_mgr_unregister_cb(void *io_device) ++{ ++ spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; ++ ++ if (g_bdev_mgr.bdev_io_pool) { ++ if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { ++ SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", ++ spdk_mempool_count(g_bdev_mgr.bdev_io_pool), ++ g_bdev_opts.bdev_io_pool_size); ++ } ++ ++ spdk_mempool_free(g_bdev_mgr.bdev_io_pool); ++ } ++ ++ spdk_free(g_bdev_mgr.zero_buffer); ++ ++ bdev_examine_allowlist_free(); ++ ++ cb_fn(g_fini_cb_arg); ++ g_fini_cb_fn = NULL; ++ g_fini_cb_arg = NULL; ++ g_bdev_mgr.init_complete = false; ++ g_bdev_mgr.module_init_complete = false; ++} ++ ++static void ++bdev_module_fini_iter(void *arg) ++{ ++ struct spdk_bdev_module *bdev_module; ++ ++ /* FIXME: Handling initialization failures is broken now, ++ * so we won't even try cleaning up after successfully ++ * initialized modules. if module_init_complete is false, ++ * just call spdk_bdev_mgr_unregister_cb ++ */ ++ if (!g_bdev_mgr.module_init_complete) { ++ bdev_mgr_unregister_cb(NULL); ++ return; ++ } ++ ++ /* Start iterating from the last touched module */ ++ if (!g_resume_bdev_module) { ++ bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); ++ } else { ++ bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, ++ internal.tailq); ++ } ++ ++ while (bdev_module) { ++ if (bdev_module->async_fini) { ++ /* Save our place so we can resume later. We must ++ * save the variable here, before calling module_fini() ++ * below, because in some cases the module may immediately ++ * call spdk_bdev_module_fini_done() and re-enter ++ * this function to continue iterating. 
*/ ++ g_resume_bdev_module = bdev_module; ++ } ++ ++ if (bdev_module->module_fini) { ++ bdev_module->module_fini(); ++ } ++ ++ if (bdev_module->async_fini) { ++ return; ++ } ++ ++ bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, ++ internal.tailq); ++ } ++ ++ g_resume_bdev_module = NULL; ++ spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); ++} ++ ++void ++spdk_bdev_module_fini_done(void) ++{ ++ if (spdk_get_thread() != g_fini_thread) { ++ spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); ++ } else { ++ bdev_module_fini_iter(NULL); ++ } ++} ++ ++static void ++bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_bdev *bdev = cb_arg; ++ ++ if (bdeverrno && bdev) { ++ SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", ++ bdev->name); ++ ++ /* ++ * Since the call to spdk_bdev_unregister() failed, we have no way to free this ++ * bdev; try to continue by manually removing this bdev from the list and continue ++ * with the next bdev in the list. ++ */ ++ TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); ++ } ++ ++ if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { ++ SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); ++ /* ++ * Bdev module finish need to be deferred as we might be in the middle of some context ++ * (like bdev part free) that will use this bdev (or private bdev driver ctx data) ++ * after returning. ++ */ ++ spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); ++ return; ++ } ++ ++ /* ++ * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem ++ * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity ++ * to detect clean shutdown as opposed to run-time hot removal of the underlying ++ * base bdevs. ++ * ++ * Also, walk the list in the reverse order. ++ */ ++ for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); ++ bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { ++ SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", ++ bdev->name, bdev->internal.claim.v1.module->name); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ continue; ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); ++ spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); ++ return; ++ } ++ ++ /* ++ * If any bdev fails to unclaim underlying bdev properly, we may face the ++ * case of bdev list consisting of claimed bdevs only (if claims are managed ++ * correctly, this would mean there's a loop in the claims graph which is ++ * clearly impossible). Warn and unregister last bdev on the list then. ++ */ ++ for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); ++ bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { ++ SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); ++ spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); ++ return; ++ } ++} ++ ++static void ++bdev_module_fini_start_iter(void *arg) ++{ ++ struct spdk_bdev_module *bdev_module; ++ ++ if (!g_resume_bdev_module) { ++ bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); ++ } else { ++ bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); ++ } ++ ++ while (bdev_module) { ++ if (bdev_module->async_fini_start) { ++ /* Save our place so we can resume later. 
We must ++ * save the variable here, before calling fini_start() ++ * below, because in some cases the module may immediately ++ * call spdk_bdev_module_fini_start_done() and re-enter ++ * this function to continue iterating. */ ++ g_resume_bdev_module = bdev_module; ++ } ++ ++ if (bdev_module->fini_start) { ++ bdev_module->fini_start(); ++ } ++ ++ if (bdev_module->async_fini_start) { ++ return; ++ } ++ ++ bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); ++ } ++ ++ g_resume_bdev_module = NULL; ++ ++ bdev_finish_unregister_bdevs_iter(NULL, 0); ++} ++ ++void ++spdk_bdev_module_fini_start_done(void) ++{ ++ if (spdk_get_thread() != g_fini_thread) { ++ spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); ++ } else { ++ bdev_module_fini_start_iter(NULL); ++ } ++} ++ ++static void ++bdev_finish_wait_for_examine_done(void *cb_arg) ++{ ++ bdev_module_fini_start_iter(NULL); ++} ++ ++void ++spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) ++{ ++ int rc; ++ ++ assert(cb_fn != NULL); ++ ++ g_fini_thread = spdk_get_thread(); ++ ++ g_fini_cb_fn = cb_fn; ++ g_fini_cb_arg = cb_arg; ++ ++ rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); ++ if (rc != 0) { ++ SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); ++ bdev_finish_wait_for_examine_done(NULL); ++ } ++} ++ ++struct spdk_bdev_io * ++bdev_channel_get_io(struct spdk_bdev_channel *channel) ++{ ++ struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; ++ struct spdk_bdev_io *bdev_io; ++ ++ if (ch->per_thread_cache_count > 0) { ++ bdev_io = STAILQ_FIRST(&ch->per_thread_cache); ++ STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); ++ ch->per_thread_cache_count--; ++ } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { ++ /* ++ * Don't try to look for bdev_ios in the global pool if there are ++ * waiters on bdev_ios - we don't want this caller to jump the line. ++ */ ++ bdev_io = NULL; ++ } else { ++ bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); ++ } ++ ++ return bdev_io; ++} ++ ++void ++spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev_mgmt_channel *ch; ++ ++ assert(bdev_io != NULL); ++ assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); ++ ++ ch = bdev_io->internal.ch->shared_resource->mgmt_ch; ++ ++ if (bdev_io->internal.buf != NULL) { ++ bdev_io_put_buf(bdev_io); ++ } ++ ++ if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { ++ ch->per_thread_cache_count++; ++ STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); ++ while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { ++ struct spdk_bdev_io_wait_entry *entry; ++ ++ entry = TAILQ_FIRST(&ch->io_wait_queue); ++ TAILQ_REMOVE(&ch->io_wait_queue, entry, link); ++ entry->cb_fn(entry->cb_arg); ++ } ++ } else { ++ /* We should never have a full cache with entries on the io wait queue. 
*/ ++ assert(TAILQ_EMPTY(&ch->io_wait_queue)); ++ spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); ++ } ++} ++ ++static bool ++bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) ++{ ++ assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); ++ ++ switch (limit) { ++ case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: ++ return true; ++ case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: ++ case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: ++ case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: ++ return false; ++ case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: ++ default: ++ return false; ++ } ++} ++ ++static bool ++bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return true; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ if (bdev_io->u.bdev.zcopy.start) { ++ return true; ++ } else { ++ return false; ++ } ++ default: ++ return false; ++ } ++} ++ ++static bool ++bdev_is_read_io(struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ /* Bit 1 (0x2) set for read operation */ ++ if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { ++ return true; ++ } else { ++ return false; ++ } ++ case SPDK_BDEV_IO_TYPE_READ: ++ return true; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ /* Populate to read from disk */ ++ if (bdev_io->u.bdev.zcopy.populate) { ++ return true; ++ } else { ++ return false; ++ } ++ default: ++ return false; ++ } ++} ++ ++static uint64_t ++bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ return bdev_io->u.nvme_passthru.nbytes; ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return bdev_io->u.bdev.num_blocks * bdev->blocklen; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ /* Track the data in the start phase only */ ++ if (bdev_io->u.bdev.zcopy.start) { ++ return bdev_io->u.bdev.num_blocks * bdev->blocklen; ++ } else { ++ return 0; ++ } ++ default: ++ return 0; ++ } ++} ++ ++static bool ++bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool ++bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ if (bdev_is_read_io(io) == false) { ++ return false; ++ } ++ ++ return bdev_qos_rw_queue_io(limit, io); ++} ++ ++static bool ++bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ if (bdev_is_read_io(io) == true) { ++ return false; ++ } ++ ++ return bdev_qos_rw_queue_io(limit, io); ++} ++ ++static void ++bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ limit->remaining_this_timeslice--; ++} ++ ++static void ++bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); ++} ++ ++static void ++bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ if (bdev_is_read_io(io) == false) { ++ return; ++ } ++ ++ return bdev_qos_rw_bps_update_quota(limit, io); ++} ++ ++static void ++bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) ++{ ++ if 
(bdev_is_read_io(io) == true) { ++ return; ++ } ++ ++ return bdev_qos_rw_bps_update_quota(limit, io); ++} ++ ++static void ++bdev_qos_set_ops(struct spdk_bdev_qos *qos) ++{ ++ int i; ++ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { ++ qos->rate_limits[i].queue_io = NULL; ++ qos->rate_limits[i].update_quota = NULL; ++ continue; ++ } ++ ++ switch (i) { ++ case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: ++ qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; ++ qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; ++ break; ++ case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: ++ qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; ++ qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; ++ break; ++ case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: ++ qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; ++ qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; ++ break; ++ case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: ++ qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; ++ qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++static void ++_bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, ++ struct spdk_bdev_io *bdev_io, ++ enum spdk_bdev_io_status status) ++{ ++ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; ++ ++ bdev_io->internal.in_submit_request = true; ++ bdev_ch->io_outstanding++; ++ shared_resource->io_outstanding++; ++ spdk_bdev_io_complete(bdev_io, status); ++ bdev_io->internal.in_submit_request = false; ++} ++ ++static inline void ++bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_io_channel *ch = bdev_ch->channel; ++ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; ++ ++ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { ++ struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; ++ struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; ++ ++ if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || ++ bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { ++ _bdev_io_complete_in_submit(bdev_ch, bdev_io, ++ SPDK_BDEV_IO_STATUS_SUCCESS); ++ return; ++ } ++ } ++ ++ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && ++ bdev_io->bdev->split_on_write_unit && ++ bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { ++ SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", ++ bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); ++ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { ++ bdev_ch->io_outstanding++; ++ shared_resource->io_outstanding++; ++ bdev_io->internal.in_submit_request = true; ++ bdev->fn_table->submit_request(ch, bdev_io); ++ bdev_io->internal.in_submit_request = false; ++ } else { ++ TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); ++ } ++} ++ ++static bool ++bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) ++{ ++ int i; ++ ++ if (bdev_qos_io_to_limit(bdev_io) == true) { ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (!qos->rate_limits[i].queue_io) { ++ continue; ++ } ++ ++ if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], ++ bdev_io) == true) { ++ return true; ++ } ++ } ++ for (i = 0; i < 
SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (!qos->rate_limits[i].update_quota) { ++ continue; ++ } ++ ++ qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); ++ } ++ } ++ ++ return false; ++} ++ ++static inline void ++_bdev_io_do_submit(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct spdk_bdev_channel *ch = bdev_io->internal.ch; ++ ++ bdev_io_do_submit(ch, bdev_io); ++} ++ ++static int ++bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) ++{ ++ struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; ++ int submitted_ios = 0; ++ ++ TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { ++ if (!bdev_qos_queue_io(qos, bdev_io)) { ++ TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); ++ ++ if (bdev_io->internal.io_submit_ch) { ++ /* Send back the IO to the original thread for the actual processing. */ ++ bdev_io->internal.ch = bdev_io->internal.io_submit_ch; ++ bdev_io->internal.io_submit_ch = NULL; ++ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), ++ _bdev_io_do_submit, bdev_io); ++ } else { ++ bdev_io_do_submit(ch, bdev_io); ++ } ++ ++ submitted_ios++; ++ } ++ } ++ ++ return submitted_ios; ++} ++ ++static void ++bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) ++{ ++ int rc; ++ ++ bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; ++ bdev_io->internal.waitq_entry.cb_fn = cb_fn; ++ bdev_io->internal.waitq_entry.cb_arg = bdev_io; ++ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ &bdev_io->internal.waitq_entry); ++ if (rc != 0) { ++ SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ } ++} ++ ++static bool ++bdev_rw_should_split(struct spdk_bdev_io *bdev_io) ++{ ++ uint32_t io_boundary; ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ uint32_t max_size = bdev->max_segment_size; ++ int max_segs = bdev->max_num_segments; ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { ++ io_boundary = bdev->write_unit_size; ++ } else if (bdev->split_on_optimal_io_boundary) { ++ io_boundary = bdev->optimal_io_boundary; ++ } else { ++ io_boundary = 0; ++ } ++ ++ if (spdk_likely(!io_boundary && !max_segs && !max_size)) { ++ return false; ++ } ++ ++ if (io_boundary) { ++ uint64_t start_stripe, end_stripe; ++ ++ start_stripe = bdev_io->u.bdev.offset_blocks; ++ end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; ++ /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ ++ if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { ++ start_stripe >>= spdk_u32log2(io_boundary); ++ end_stripe >>= spdk_u32log2(io_boundary); ++ } else { ++ start_stripe /= io_boundary; ++ end_stripe /= io_boundary; ++ } ++ ++ if (start_stripe != end_stripe) { ++ return true; ++ } ++ } ++ ++ if (max_segs) { ++ if (bdev_io->u.bdev.iovcnt > max_segs) { ++ return true; ++ } ++ } ++ ++ if (max_size) { ++ for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { ++ if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { ++ return true; ++ } ++ } ++ } ++ ++ return false; ++} ++ ++static bool ++bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) ++{ ++ uint32_t num_unmap_segments; ++ ++ if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { ++ return false; ++ } ++ num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); ++ if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool ++bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) ++{ ++ if (!bdev_io->bdev->max_write_zeroes) { ++ return false; ++ } ++ ++ if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool ++bdev_copy_should_split(struct spdk_bdev_io *bdev_io) ++{ ++ if (bdev_io->bdev->max_copy != 0 && ++ bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { ++ return true; ++ } ++ ++ return false; ++} ++ ++static bool ++bdev_io_should_split(struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return bdev_rw_should_split(bdev_io); ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ return bdev_unmap_should_split(bdev_io); ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ return bdev_write_zeroes_should_split(bdev_io); ++ case SPDK_BDEV_IO_TYPE_COPY: ++ return bdev_copy_should_split(bdev_io); ++ default: ++ return false; ++ } ++} ++ ++static uint32_t ++_to_next_boundary(uint64_t offset, uint32_t boundary) ++{ ++ return (boundary - (offset % boundary)); ++} ++ ++static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); ++ ++static void _bdev_rw_split(void *_bdev_io); ++ ++static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); ++ ++static void ++_bdev_unmap_split(void *_bdev_io) ++{ ++ return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); ++} ++ ++static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); ++ ++static void ++_bdev_write_zeroes_split(void *_bdev_io) ++{ ++ return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); ++} ++ ++static void bdev_copy_split(struct spdk_bdev_io *bdev_io); ++ ++static void ++_bdev_copy_split(void *_bdev_io) ++{ ++ return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); ++} ++ ++static int ++bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) ++{ ++ int rc; ++ uint64_t current_offset, current_remaining, current_src_offset; ++ spdk_bdev_io_wait_cb io_wait_fn; ++ ++ current_offset = *offset; ++ current_remaining = *remaining; ++ ++ bdev_io->u.bdev.split_outstanding++; ++ ++ io_wait_fn = _bdev_rw_split; ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ iov, iovcnt, md_buf, current_offset, ++ num_blocks, ++ bdev_io_split_done, bdev_io, ++ bdev_io->internal.ext_opts, true); ++ 
break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ iov, iovcnt, md_buf, current_offset, ++ num_blocks, ++ bdev_io_split_done, bdev_io, ++ bdev_io->internal.ext_opts, true); ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ io_wait_fn = _bdev_unmap_split; ++ rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ current_offset, num_blocks, ++ bdev_io_split_done, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ io_wait_fn = _bdev_write_zeroes_split; ++ rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ current_offset, num_blocks, ++ bdev_io_split_done, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ io_wait_fn = _bdev_copy_split; ++ current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + ++ (current_offset - bdev_io->u.bdev.offset_blocks); ++ rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ current_offset, current_src_offset, num_blocks, ++ bdev_io_split_done, bdev_io); ++ break; ++ default: ++ assert(false); ++ rc = -EINVAL; ++ break; ++ } ++ ++ if (rc == 0) { ++ current_offset += num_blocks; ++ current_remaining -= num_blocks; ++ bdev_io->u.bdev.split_current_offset_blocks = current_offset; ++ bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; ++ *offset = current_offset; ++ *remaining = current_remaining; ++ } else { ++ bdev_io->u.bdev.split_outstanding--; ++ if (rc == -ENOMEM) { ++ if (bdev_io->u.bdev.split_outstanding == 0) { ++ /* No I/O is outstanding. Hence we should wait here. */ ++ bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); ++ } ++ } else { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ if (bdev_io->u.bdev.split_outstanding == 0) { ++ spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); ++ TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ } ++ } ++ } ++ ++ return rc; ++} ++ ++static void ++_bdev_rw_split(void *_bdev_io) ++{ ++ struct iovec *parent_iov, *iov; ++ struct spdk_bdev_io *bdev_io = _bdev_io; ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ uint64_t parent_offset, current_offset, remaining; ++ uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; ++ uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; ++ uint32_t iovcnt, iov_len, child_iovsize; ++ uint32_t blocklen = bdev->blocklen; ++ uint32_t io_boundary; ++ uint32_t max_segment_size = bdev->max_segment_size; ++ uint32_t max_child_iovcnt = bdev->max_num_segments; ++ void *md_buf = NULL; ++ int rc; ++ ++ max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; ++ max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : ++ SPDK_BDEV_IO_NUM_CHILD_IOV; ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { ++ io_boundary = bdev->write_unit_size; ++ } else if (bdev->split_on_optimal_io_boundary) { ++ io_boundary = bdev->optimal_io_boundary; ++ } else { ++ io_boundary = UINT32_MAX; ++ } ++ ++ remaining = bdev_io->u.bdev.split_remaining_num_blocks; ++ current_offset = bdev_io->u.bdev.split_current_offset_blocks; ++ parent_offset = bdev_io->u.bdev.offset_blocks; ++ parent_iov_offset = (current_offset - parent_offset) * blocklen; ++ parent_iovcnt = bdev_io->u.bdev.iovcnt; ++ ++ for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { ++ parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; ++ if (parent_iov_offset < parent_iov->iov_len) { ++ break; ++ } ++ parent_iov_offset -= parent_iov->iov_len; ++ } ++ ++ child_iovcnt = 0; ++ while (remaining > 0 && parent_iovpos < parent_iovcnt && ++ child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { ++ to_next_boundary = _to_next_boundary(current_offset, io_boundary); ++ to_next_boundary = spdk_min(remaining, to_next_boundary); ++ to_next_boundary_bytes = to_next_boundary * blocklen; ++ ++ iov = &bdev_io->child_iov[child_iovcnt]; ++ iovcnt = 0; ++ ++ if (bdev_io->u.bdev.md_buf) { ++ md_buf = (char *)bdev_io->u.bdev.md_buf + ++ (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); ++ } ++ ++ child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); ++ while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && ++ iovcnt < child_iovsize) { ++ parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; ++ iov_len = parent_iov->iov_len - parent_iov_offset; ++ ++ iov_len = spdk_min(iov_len, max_segment_size); ++ iov_len = spdk_min(iov_len, to_next_boundary_bytes); ++ to_next_boundary_bytes -= iov_len; ++ ++ bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; ++ bdev_io->child_iov[child_iovcnt].iov_len = iov_len; ++ ++ if (iov_len < parent_iov->iov_len - parent_iov_offset) { ++ parent_iov_offset += iov_len; ++ } else { ++ parent_iovpos++; ++ parent_iov_offset = 0; ++ } ++ child_iovcnt++; ++ iovcnt++; ++ } ++ ++ if (to_next_boundary_bytes > 0) { ++ /* We had to stop this child I/O early because we ran out of ++ * child_iov space or were limited by max_num_segments. ++ * Ensure the iovs to be aligned with block size and ++ * then adjust to_next_boundary before starting the ++ * child I/O. ++ */ ++ assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || ++ iovcnt == child_iovsize); ++ to_last_block_bytes = to_next_boundary_bytes % blocklen; ++ if (to_last_block_bytes != 0) { ++ uint32_t child_iovpos = child_iovcnt - 1; ++ /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV ++ * so the loop will naturally end ++ */ ++ ++ to_last_block_bytes = blocklen - to_last_block_bytes; ++ to_next_boundary_bytes += to_last_block_bytes; ++ while (to_last_block_bytes > 0 && iovcnt > 0) { ++ iov_len = spdk_min(to_last_block_bytes, ++ bdev_io->child_iov[child_iovpos].iov_len); ++ bdev_io->child_iov[child_iovpos].iov_len -= iov_len; ++ if (bdev_io->child_iov[child_iovpos].iov_len == 0) { ++ child_iovpos--; ++ if (--iovcnt == 0) { ++ /* If the child IO is less than a block size just return. ++ * If the first child IO of any split round is less than ++ * a block size, an error exit. 
++ */ ++ if (bdev_io->u.bdev.split_outstanding == 0) { ++ SPDK_ERRLOG("The first child io was less than a block size\n"); ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); ++ TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ } ++ ++ return; ++ } ++ } ++ ++ to_last_block_bytes -= iov_len; ++ ++ if (parent_iov_offset == 0) { ++ parent_iovpos--; ++ parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; ++ } ++ parent_iov_offset -= iov_len; ++ } ++ ++ assert(to_last_block_bytes == 0); ++ } ++ to_next_boundary -= to_next_boundary_bytes / blocklen; ++ } ++ ++ rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, ++ ¤t_offset, &remaining); ++ if (spdk_unlikely(rc)) { ++ return; ++ } ++ } ++} ++ ++static void ++bdev_unmap_split(struct spdk_bdev_io *bdev_io) ++{ ++ uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; ++ uint32_t num_children_reqs = 0; ++ int rc; ++ ++ offset = bdev_io->u.bdev.split_current_offset_blocks; ++ remaining = bdev_io->u.bdev.split_remaining_num_blocks; ++ max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; ++ ++ while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { ++ unmap_blocks = spdk_min(remaining, max_unmap_blocks); ++ ++ rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, ++ &offset, &remaining); ++ if (spdk_likely(rc == 0)) { ++ num_children_reqs++; ++ } else { ++ return; ++ } ++ } ++} ++ ++static void ++bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) ++{ ++ uint64_t offset, write_zeroes_blocks, remaining; ++ uint32_t num_children_reqs = 0; ++ int rc; ++ ++ offset = bdev_io->u.bdev.split_current_offset_blocks; ++ remaining = bdev_io->u.bdev.split_remaining_num_blocks; ++ ++ while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { ++ write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); ++ ++ rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, ++ &offset, &remaining); ++ if (spdk_likely(rc == 0)) { ++ num_children_reqs++; ++ } else { ++ return; ++ } ++ } ++} ++ ++static void ++bdev_copy_split(struct spdk_bdev_io *bdev_io) ++{ ++ uint64_t offset, copy_blocks, remaining; ++ uint32_t num_children_reqs = 0; ++ int rc; ++ ++ offset = bdev_io->u.bdev.split_current_offset_blocks; ++ remaining = bdev_io->u.bdev.split_remaining_num_blocks; ++ ++ assert(bdev_io->bdev->max_copy != 0); ++ while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { ++ copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); ++ ++ rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, ++ &offset, &remaining); ++ if (spdk_likely(rc == 0)) { ++ num_children_reqs++; ++ } else { ++ return; ++ } ++ } ++} ++ ++static void ++parent_bdev_io_complete(void *ctx, int rc) ++{ ++ struct spdk_bdev_io *parent_io = ctx; ++ ++ if (rc) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ ++ parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, ++ parent_io->internal.caller_ctx); ++} ++ ++static void ++bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *parent_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ parent_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; ++ /* If any child I/O failed, stop further splitting process. */ ++ parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; ++ parent_io->u.bdev.split_remaining_num_blocks = 0; ++ } ++ parent_io->u.bdev.split_outstanding--; ++ if (parent_io->u.bdev.split_outstanding != 0) { ++ return; ++ } ++ ++ /* ++ * Parent I/O finishes when all blocks are consumed. ++ */ ++ if (parent_io->u.bdev.split_remaining_num_blocks == 0) { ++ assert(parent_io->internal.cb != bdev_io_split_done); ++ spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); ++ TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); ++ ++ if (parent_io->internal.orig_iovcnt != 0) { ++ _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); ++ /* bdev IO will be completed in the callback */ ++ } else { ++ parent_bdev_io_complete(parent_io, 0); ++ } ++ return; ++ } ++ ++ /* ++ * Continue with the splitting process. This function will complete the parent I/O if the ++ * splitting is done. ++ */ ++ switch (parent_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ _bdev_rw_split(parent_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ bdev_unmap_split(parent_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ bdev_write_zeroes_split(parent_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ bdev_copy_split(parent_io); ++ break; ++ default: ++ assert(false); ++ break; ++ } ++} ++ ++static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success); ++ ++static void ++bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; ++ bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; ++ bdev_io->u.bdev.split_outstanding = 0; ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { ++ _bdev_rw_split(bdev_io); ++ } else { ++ assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); ++ spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ bdev_unmap_split(bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ bdev_write_zeroes_split(bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ bdev_copy_split(bdev_io); ++ break; ++ default: ++ assert(false); ++ break; ++ } ++} ++ ++static void ++bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ _bdev_rw_split(bdev_io); ++} ++ ++/* Explicitly mark this inline, since it's used as a function pointer and otherwise won't ++ * be inlined, at least on some compilers. 
++ */ ++static inline void ++_bdev_io_submit(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; ++ ++ if (spdk_likely(bdev_ch->flags == 0)) { ++ bdev_io_do_submit(bdev_ch, bdev_io); ++ return; ++ } ++ ++ if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { ++ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); ++ } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { ++ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && ++ bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { ++ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } else { ++ TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); ++ bdev_qos_io_submit(bdev_ch, bdev->internal.qos); ++ } ++ } else { ++ SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); ++ _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); ++ ++bool ++bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) ++{ ++ if (range1->length == 0 || range2->length == 0) { ++ return false; ++ } ++ ++ if (range1->offset + range1->length <= range2->offset) { ++ return false; ++ } ++ ++ if (range2->offset + range2->length <= range1->offset) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) ++{ ++ struct spdk_bdev_channel *ch = bdev_io->internal.ch; ++ struct lba_range r; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ /* Don't try to decode the NVMe command - just assume worst-case and that ++ * it overlaps a locked range. ++ */ ++ return true; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ case SPDK_BDEV_IO_TYPE_COPY: ++ r.offset = bdev_io->u.bdev.offset_blocks; ++ r.length = bdev_io->u.bdev.num_blocks; ++ if (!bdev_lba_range_overlapped(range, &r)) { ++ /* This I/O doesn't overlap the specified LBA range. */ ++ return false; ++ } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { ++ /* This I/O overlaps, but the I/O is on the same channel that locked this ++ * range, and the caller_ctx is the same as the locked_ctx. This means ++ * that this I/O is associated with the lock, and is allowed to execute. 
++ */ ++ return false; ++ } else { ++ return true; ++ } ++ default: ++ return false; ++ } ++} ++ ++void ++bdev_io_submit(struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); ++ struct spdk_bdev_channel *ch = bdev_io->internal.ch; ++ ++ assert(thread != NULL); ++ assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); ++ ++ if (!TAILQ_EMPTY(&ch->locked_ranges)) { ++ struct lba_range *range; ++ ++ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { ++ if (bdev_io_range_is_locked(bdev_io, range)) { ++ TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); ++ return; ++ } ++ } ++ } ++ ++ TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); ++ ++ bdev_io->internal.submit_tsc = spdk_get_ticks(); ++ spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, ++ (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, ++ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, ++ spdk_bdev_get_name(bdev)); ++ ++ if (bdev_io_should_split(bdev_io)) { ++ bdev_io_split(NULL, bdev_io); ++ return; ++ } ++ ++ if (ch->flags & BDEV_CH_QOS_ENABLED) { ++ if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { ++ _bdev_io_submit(bdev_io); ++ } else { ++ bdev_io->internal.io_submit_ch = ch; ++ bdev_io->internal.ch = bdev->internal.qos->ch; ++ spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); ++ } ++ } else { ++ _bdev_io_submit(bdev_io); ++ } ++} ++ ++static inline void ++_bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) ++{ ++ struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; ++ ++ /* Zero part we don't copy */ ++ memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); ++ memcpy(opts_copy, opts, opts->size); ++ opts_copy->size = sizeof(*opts_copy); ++ opts_copy->metadata = bdev_io->u.bdev.md_buf; ++ /* Save pointer to the copied ext_opts which will be used by bdev modules */ ++ bdev_io->u.bdev.ext_opts = opts_copy; ++} ++ ++static inline void ++_bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) ++{ ++ /* bdev doesn't support memory domains, thereby buffers in this IO request can't ++ * be accessed directly. It is needed to allocate buffers before issuing IO operation. ++ * For write operation we need to pull buffers from memory domain before submitting IO. 
++ * Once read operation completes, we need to use memory_domain push functionality to ++ * update data in original memory domain IO buffer ++ * This IO request will go through a regular IO flow, so clear memory domains pointers in ++ * the copied ext_opts */ ++ bdev_io->internal.ext_opts_copy.memory_domain = NULL; ++ bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; ++ _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++} ++ ++static inline void ++_bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, ++ struct spdk_bdev_ext_io_opts *opts, bool copy_opts) ++{ ++ if (opts) { ++ bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; ++ assert(opts->size <= sizeof(*opts)); ++ /* ++ * copy if size is smaller than opts struct to avoid having to check size ++ * on every access to bdev_io->u.bdev.ext_opts ++ */ ++ if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { ++ _bdev_io_copy_ext_opts(bdev_io, opts); ++ if (use_pull_push) { ++ _bdev_io_ext_use_bounce_buffer(bdev_io); ++ return; ++ } ++ } ++ } ++ bdev_io_submit(bdev_io); ++} ++ ++static void ++bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; ++ struct spdk_io_channel *ch = bdev_ch->channel; ++ ++ assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); ++ ++ bdev_io->internal.in_submit_request = true; ++ bdev->fn_table->submit_request(ch, bdev_io); ++ bdev_io->internal.in_submit_request = false; ++} ++ ++void ++bdev_io_init(struct spdk_bdev_io *bdev_io, ++ struct spdk_bdev *bdev, void *cb_arg, ++ spdk_bdev_io_completion_cb cb) ++{ ++ bdev_io->bdev = bdev; ++ bdev_io->internal.caller_ctx = cb_arg; ++ bdev_io->internal.cb = cb; ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; ++ bdev_io->internal.in_submit_request = false; ++ bdev_io->internal.buf = NULL; ++ bdev_io->internal.io_submit_ch = NULL; ++ bdev_io->internal.orig_iovs = NULL; ++ bdev_io->internal.orig_iovcnt = 0; ++ bdev_io->internal.orig_md_iov.iov_base = NULL; ++ bdev_io->internal.error.nvme.cdw0 = 0; ++ bdev_io->num_retries = 0; ++ bdev_io->internal.get_buf_cb = NULL; ++ bdev_io->internal.get_aux_buf_cb = NULL; ++ bdev_io->internal.ext_opts = NULL; ++ bdev_io->internal.data_transfer_cpl = NULL; ++} ++ ++static bool ++bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) ++{ ++ return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); ++} ++ ++bool ++spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) ++{ ++ bool supported; ++ ++ supported = bdev_io_type_supported(bdev, io_type); ++ ++ if (!supported) { ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ ++ supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ return supported; ++} ++ ++uint64_t ++spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) ++{ ++ return bdev_io->internal.submit_tsc; ++} ++ ++int ++spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ if (bdev->fn_table->dump_info_json) { ++ return bdev->fn_table->dump_info_json(bdev->ctxt, w); ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) ++{ ++ uint32_t max_per_timeslice = 0; ++ int i; ++ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { ++ qos->rate_limits[i].max_per_timeslice = 0; ++ continue; ++ } ++ ++ max_per_timeslice = qos->rate_limits[i].limit * ++ SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; ++ ++ qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, ++ qos->rate_limits[i].min_per_timeslice); ++ ++ qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; ++ } ++ ++ bdev_qos_set_ops(qos); ++} ++ ++static int ++bdev_channel_poll_qos(void *arg) ++{ ++ struct spdk_bdev_qos *qos = arg; ++ uint64_t now = spdk_get_ticks(); ++ int i; ++ ++ if (now < (qos->last_timeslice + qos->timeslice_size)) { ++ /* We received our callback earlier than expected - return ++ * immediately and wait to do accounting until at least one ++ * timeslice has actually expired. This should never happen ++ * with a well-behaved timer implementation. ++ */ ++ return SPDK_POLLER_IDLE; ++ } ++ ++ /* Reset for next round of rate limiting */ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ /* We may have allowed the IOs or bytes to slightly overrun in the last ++ * timeslice. remaining_this_timeslice is signed, so if it's negative ++ * here, we'll account for the overrun so that the next timeslice will ++ * be appropriately reduced. 
++ */ ++ if (qos->rate_limits[i].remaining_this_timeslice > 0) { ++ qos->rate_limits[i].remaining_this_timeslice = 0; ++ } ++ } ++ ++ while (now >= (qos->last_timeslice + qos->timeslice_size)) { ++ qos->last_timeslice += qos->timeslice_size; ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ qos->rate_limits[i].remaining_this_timeslice += ++ qos->rate_limits[i].max_per_timeslice; ++ } ++ } ++ ++ return bdev_qos_io_submit(qos->ch, qos); ++} ++ ++static void ++bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) ++{ ++ struct spdk_bdev_shared_resource *shared_resource; ++ struct lba_range *range; ++ struct spdk_bdev_io *bdev_io, *tmp; ++ ++ bdev_free_io_stat(ch->stat); ++#ifdef SPDK_CONFIG_VTUNE ++ bdev_free_io_stat(ch->prev_stat); ++#endif ++ ++ while (!TAILQ_EMPTY(&ch->locked_ranges)) { ++ range = TAILQ_FIRST(&ch->locked_ranges); ++ TAILQ_REMOVE(&ch->locked_ranges, range, tailq); ++ free(range); ++ } ++ ++ spdk_put_io_channel(ch->channel); ++ ++ shared_resource = ch->shared_resource; ++ ch->shared_resource = NULL; ++ ++ TAILQ_FOREACH_SAFE(bdev_io, &ch->io_submitted, internal.ch_link, tmp) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); ++ } ++ ++ assert(TAILQ_EMPTY(&ch->io_locked)); ++ assert(TAILQ_EMPTY(&ch->io_submitted)); ++ assert(ch->io_outstanding == 0); ++ assert(shared_resource->ref > 0); ++ shared_resource->ref--; ++ if (shared_resource->ref == 0) { ++ assert(shared_resource->io_outstanding == 0); ++ TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); ++ spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); ++ free(shared_resource); ++ } ++} ++ ++static void ++bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) ++{ ++ struct spdk_bdev_qos *qos = bdev->internal.qos; ++ int i; ++ ++ assert(spdk_spin_held(&bdev->internal.spinlock)); ++ ++ /* Rate limiting on this bdev enabled */ ++ if (qos) { ++ if (qos->ch == NULL) { ++ struct spdk_io_channel *io_ch; ++ ++ SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, ++ bdev->name, spdk_get_thread()); ++ ++ /* No qos channel has been selected, so set one up */ ++ ++ /* Take another reference to ch */ ++ io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); ++ assert(io_ch != NULL); ++ qos->ch = ch; ++ ++ qos->thread = spdk_io_channel_get_thread(io_ch); ++ ++ TAILQ_INIT(&qos->queued); ++ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (bdev_qos_is_iops_rate_limit(i) == true) { ++ qos->rate_limits[i].min_per_timeslice = ++ SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; ++ } else { ++ qos->rate_limits[i].min_per_timeslice = ++ SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; ++ } ++ ++ if (qos->rate_limits[i].limit == 0) { ++ qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; ++ } ++ } ++ bdev_qos_update_max_quota_per_timeslice(qos); ++ qos->timeslice_size = ++ SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; ++ qos->last_timeslice = spdk_get_ticks(); ++ qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, ++ qos, ++ SPDK_BDEV_QOS_TIMESLICE_IN_USEC); ++ } ++ ++ ch->flags |= BDEV_CH_QOS_ENABLED; ++ } ++} ++ ++struct poll_timeout_ctx { ++ struct spdk_bdev_desc *desc; ++ uint64_t timeout_in_sec; ++ spdk_bdev_io_timeout_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++bdev_desc_free(struct spdk_bdev_desc *desc) ++{ ++ spdk_spin_destroy(&desc->spinlock); ++ free(desc->media_events_buffer); ++ free(desc); ++} ++ ++static void ++bdev_channel_poll_timeout_io_done(struct 
spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct poll_timeout_ctx *ctx = _ctx; ++ struct spdk_bdev_desc *desc = ctx->desc; ++ ++ free(ctx); ++ ++ spdk_spin_lock(&desc->spinlock); ++ desc->refs--; ++ if (desc->closed == true && desc->refs == 0) { ++ spdk_spin_unlock(&desc->spinlock); ++ bdev_desc_free(desc); ++ return; ++ } ++ spdk_spin_unlock(&desc->spinlock); ++} ++ ++static void ++bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct poll_timeout_ctx *ctx = _ctx; ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); ++ struct spdk_bdev_desc *desc = ctx->desc; ++ struct spdk_bdev_io *bdev_io; ++ uint64_t now; ++ ++ spdk_spin_lock(&desc->spinlock); ++ if (desc->closed == true) { ++ spdk_spin_unlock(&desc->spinlock); ++ spdk_bdev_for_each_channel_continue(i, -1); ++ return; ++ } ++ spdk_spin_unlock(&desc->spinlock); ++ ++ now = spdk_get_ticks(); ++ TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { ++ /* Exclude any I/O that are generated via splitting. */ ++ if (bdev_io->internal.cb == bdev_io_split_done) { ++ continue; ++ } ++ ++ /* Once we find an I/O that has not timed out, we can immediately ++ * exit the loop. ++ */ ++ if (now < (bdev_io->internal.submit_tsc + ++ ctx->timeout_in_sec * spdk_get_ticks_hz())) { ++ goto end; ++ } ++ ++ if (bdev_io->internal.desc == desc) { ++ ctx->cb_fn(ctx->cb_arg, bdev_io); ++ } ++ } ++ ++end: ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static int ++bdev_poll_timeout_io(void *arg) ++{ ++ struct spdk_bdev_desc *desc = arg; ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct poll_timeout_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(struct poll_timeout_ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("failed to allocate memory\n"); ++ return SPDK_POLLER_BUSY; ++ } ++ ctx->desc = desc; ++ ctx->cb_arg = desc->cb_arg; ++ ctx->cb_fn = desc->cb_fn; ++ ctx->timeout_in_sec = desc->timeout_in_sec; ++ ++ /* Take a ref on the descriptor in case it gets closed while we are checking ++ * all of the channels. 
++ */ ++ spdk_spin_lock(&desc->spinlock); ++ desc->refs++; ++ spdk_spin_unlock(&desc->spinlock); ++ ++ spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, ++ bdev_channel_poll_timeout_io_done); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, ++ spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) ++{ ++ assert(desc->thread == spdk_get_thread()); ++ ++ spdk_poller_unregister(&desc->io_timeout_poller); ++ ++ if (timeout_in_sec) { ++ assert(cb_fn != NULL); ++ desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, ++ desc, ++ SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / ++ 1000); ++ if (desc->io_timeout_poller == NULL) { ++ SPDK_ERRLOG("can not register the desc timeout IO poller\n"); ++ return -1; ++ } ++ } ++ ++ desc->cb_fn = cb_fn; ++ desc->cb_arg = cb_arg; ++ desc->timeout_in_sec = timeout_in_sec; ++ ++ return 0; ++} ++ ++static int ++bdev_channel_create(void *io_device, void *ctx_buf) ++{ ++ struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); ++ struct spdk_bdev_channel *ch = ctx_buf; ++ struct spdk_io_channel *mgmt_io_ch; ++ struct spdk_bdev_mgmt_channel *mgmt_ch; ++ struct spdk_bdev_shared_resource *shared_resource; ++ struct lba_range *range; ++ ++ ch->bdev = bdev; ++ ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); ++ if (!ch->channel) { ++ return -1; ++ } ++ ++ spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, ++ spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); ++ ++ assert(ch->histogram == NULL); ++ if (bdev->internal.histogram_enabled) { ++ ch->histogram = spdk_histogram_data_alloc(); ++ if (ch->histogram == NULL) { ++ SPDK_ERRLOG("Could not allocate histogram\n"); ++ } ++ } ++ ++ mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); ++ if (!mgmt_io_ch) { ++ spdk_put_io_channel(ch->channel); ++ return -1; ++ } ++ ++ mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); ++ TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { ++ if (shared_resource->shared_ch == ch->channel) { ++ spdk_put_io_channel(mgmt_io_ch); ++ shared_resource->ref++; ++ break; ++ } ++ } ++ ++ if (shared_resource == NULL) { ++ shared_resource = calloc(1, sizeof(*shared_resource)); ++ if (shared_resource == NULL) { ++ spdk_put_io_channel(ch->channel); ++ spdk_put_io_channel(mgmt_io_ch); ++ return -1; ++ } ++ ++ shared_resource->mgmt_ch = mgmt_ch; ++ shared_resource->io_outstanding = 0; ++ TAILQ_INIT(&shared_resource->nomem_io); ++ shared_resource->nomem_threshold = 0; ++ shared_resource->shared_ch = ch->channel; ++ shared_resource->ref = 1; ++ TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); ++ } ++ ++ ch->io_outstanding = 0; ++ TAILQ_INIT(&ch->queued_resets); ++ TAILQ_INIT(&ch->locked_ranges); ++ ch->flags = 0; ++ ch->shared_resource = shared_resource; ++ ++ TAILQ_INIT(&ch->io_submitted); ++ TAILQ_INIT(&ch->io_locked); ++ ++ ch->stat = bdev_alloc_io_stat(false); ++ if (ch->stat == NULL) { ++ bdev_channel_destroy_resource(ch); ++ return -1; ++ } ++ ++ ch->stat->ticks_rate = spdk_get_ticks_hz(); ++ ++#ifdef SPDK_CONFIG_VTUNE ++ { ++ char *name; ++ __itt_init_ittlib(NULL, 0); ++ name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); ++ if (!name) { ++ bdev_channel_destroy_resource(ch); ++ return -1; ++ } ++ ch->handle = __itt_string_handle_create(name); ++ free(name); ++ ch->start_tsc = spdk_get_ticks(); ++ ch->interval_tsc = spdk_get_ticks_hz() / 100; ++ ch->prev_stat = bdev_alloc_io_stat(false); ++ if (ch->prev_stat == NULL) { ++ 
bdev_channel_destroy_resource(ch); ++ return -1; ++ } ++ } ++#endif ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ bdev_enable_qos(bdev, ch); ++ ++ TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { ++ struct lba_range *new_range; ++ ++ new_range = calloc(1, sizeof(*new_range)); ++ if (new_range == NULL) { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ bdev_channel_destroy_resource(ch); ++ return -1; ++ } ++ new_range->length = range->length; ++ new_range->offset = range->offset; ++ new_range->locked_ctx = range->locked_ctx; ++ TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); ++ } ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ return 0; ++} ++ ++static int ++bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, ++ void *cb_ctx) ++{ ++ struct spdk_bdev_channel *bdev_ch = cb_ctx; ++ struct spdk_bdev_io *bdev_io; ++ uint64_t buf_len; ++ ++ bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); ++ if (bdev_io->internal.ch == bdev_ch) { ++ buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); ++ spdk_iobuf_entry_abort(ch, entry, buf_len); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); ++ } ++ ++ return 0; ++} ++ ++/* ++ * Abort I/O that are waiting on a data buffer. ++ */ ++static void ++bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) ++{ ++ spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, ++ bdev_abort_all_buf_io_cb, ch); ++ spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, ++ bdev_abort_all_buf_io_cb, ch); ++} ++ ++/* ++ * Abort I/O that are queued waiting for submission. These types of I/O are ++ * linked using the spdk_bdev_io link TAILQ_ENTRY. ++ */ ++static void ++bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) ++{ ++ struct spdk_bdev_io *bdev_io, *tmp; ++ ++ TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { ++ if (bdev_io->internal.ch == ch) { ++ TAILQ_REMOVE(queue, bdev_io, internal.link); ++ /* ++ * spdk_bdev_io_complete() assumes that the completed I/O had ++ * been submitted to the bdev module. Since in this case it ++ * hadn't, bump io_outstanding to account for the decrement ++ * that spdk_bdev_io_complete() will do. 
++ */ ++ if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { ++ ch->io_outstanding++; ++ ch->shared_resource->io_outstanding++; ++ } ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); ++ } ++ } ++} ++ ++static bool ++bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) ++{ ++ struct spdk_bdev_io *bdev_io; ++ ++ TAILQ_FOREACH(bdev_io, queue, internal.link) { ++ if (bdev_io == bio_to_abort) { ++ TAILQ_REMOVE(queue, bio_to_abort, internal.link); ++ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++static int ++bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) ++{ ++ struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; ++ uint64_t buf_len; ++ ++ bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); ++ if (bdev_io == bio_to_abort) { ++ buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); ++ spdk_iobuf_entry_abort(ch, entry, buf_len); ++ spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static bool ++bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) ++{ ++ int rc; ++ ++ rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, ++ bdev_abort_buf_io_cb, bio_to_abort); ++ if (rc == 1) { ++ return true; ++ } ++ ++ rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, ++ bdev_abort_buf_io_cb, bio_to_abort); ++ return rc == 1; ++} ++ ++static void ++bdev_qos_channel_destroy(void *cb_arg) ++{ ++ struct spdk_bdev_qos *qos = cb_arg; ++ ++ spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); ++ spdk_poller_unregister(&qos->poller); ++ ++ SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); ++ ++ free(qos); ++} ++ ++static int ++bdev_qos_destroy(struct spdk_bdev *bdev) ++{ ++ int i; ++ ++ /* ++ * Cleanly shutting down the QoS poller is tricky, because ++ * during the asynchronous operation the user could open ++ * a new descriptor and create a new channel, spawning ++ * a new QoS poller. ++ * ++ * The strategy is to create a new QoS structure here and swap it ++ * in. The shutdown path then continues to refer to the old one ++ * until it completes and then releases it. ++ */ ++ struct spdk_bdev_qos *new_qos, *old_qos; ++ ++ old_qos = bdev->internal.qos; ++ ++ new_qos = calloc(1, sizeof(*new_qos)); ++ if (!new_qos) { ++ SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); ++ return -ENOMEM; ++ } ++ ++ /* Copy the old QoS data into the newly allocated structure */ ++ memcpy(new_qos, old_qos, sizeof(*new_qos)); ++ ++ /* Zero out the key parts of the QoS structure */ ++ new_qos->ch = NULL; ++ new_qos->thread = NULL; ++ new_qos->poller = NULL; ++ TAILQ_INIT(&new_qos->queued); ++ /* ++ * The limit member of spdk_bdev_qos_limit structure is not zeroed. ++ * It will be used later for the new QoS structure. ++ */ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ new_qos->rate_limits[i].remaining_this_timeslice = 0; ++ new_qos->rate_limits[i].min_per_timeslice = 0; ++ new_qos->rate_limits[i].max_per_timeslice = 0; ++ } ++ ++ bdev->internal.qos = new_qos; ++ ++ if (old_qos->thread == NULL) { ++ free(old_qos); ++ } else { ++ spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); ++ } ++ ++ /* It is safe to continue with destroying the bdev even though the QoS channel hasn't ++ * been destroyed yet. 
The destruction path will end up waiting for the final ++ * channel to be put before it releases resources. */ ++ ++ return 0; ++} ++ ++void ++spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) ++{ ++ total->bytes_read += add->bytes_read; ++ total->num_read_ops += add->num_read_ops; ++ total->bytes_written += add->bytes_written; ++ total->num_write_ops += add->num_write_ops; ++ total->bytes_unmapped += add->bytes_unmapped; ++ total->num_unmap_ops += add->num_unmap_ops; ++ total->bytes_copied += add->bytes_copied; ++ total->num_copy_ops += add->num_copy_ops; ++ total->read_latency_ticks += add->read_latency_ticks; ++ total->write_latency_ticks += add->write_latency_ticks; ++ total->unmap_latency_ticks += add->unmap_latency_ticks; ++ total->copy_latency_ticks += add->copy_latency_ticks; ++ if (total->max_read_latency_ticks < add->max_read_latency_ticks) { ++ total->max_read_latency_ticks = add->max_read_latency_ticks; ++ } ++ if (total->min_read_latency_ticks > add->min_read_latency_ticks) { ++ total->min_read_latency_ticks = add->min_read_latency_ticks; ++ } ++ if (total->max_write_latency_ticks < add->max_write_latency_ticks) { ++ total->max_write_latency_ticks = add->max_write_latency_ticks; ++ } ++ if (total->min_write_latency_ticks > add->min_write_latency_ticks) { ++ total->min_write_latency_ticks = add->min_write_latency_ticks; ++ } ++ if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { ++ total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; ++ } ++ if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { ++ total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; ++ } ++ if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { ++ total->max_copy_latency_ticks = add->max_copy_latency_ticks; ++ } ++ if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { ++ total->min_copy_latency_ticks = add->min_copy_latency_ticks; ++ } ++} ++ ++static void ++bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) ++{ ++ memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); ++ ++ if (to_stat->io_error != NULL && from_stat->io_error != NULL) { ++ memcpy(to_stat->io_error, from_stat->io_error, ++ sizeof(struct spdk_bdev_io_error_stat)); ++ } ++} ++ ++void ++spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) ++{ ++ stat->max_read_latency_ticks = 0; ++ stat->min_read_latency_ticks = UINT64_MAX; ++ stat->max_write_latency_ticks = 0; ++ stat->min_write_latency_ticks = UINT64_MAX; ++ stat->max_unmap_latency_ticks = 0; ++ stat->min_unmap_latency_ticks = UINT64_MAX; ++ stat->max_copy_latency_ticks = 0; ++ stat->min_copy_latency_ticks = UINT64_MAX; ++ ++ if (mode != SPDK_BDEV_RESET_STAT_ALL) { ++ return; ++ } ++ ++ stat->bytes_read = 0; ++ stat->num_read_ops = 0; ++ stat->bytes_written = 0; ++ stat->num_write_ops = 0; ++ stat->bytes_unmapped = 0; ++ stat->num_unmap_ops = 0; ++ stat->bytes_copied = 0; ++ stat->num_copy_ops = 0; ++ stat->read_latency_ticks = 0; ++ stat->write_latency_ticks = 0; ++ stat->unmap_latency_ticks = 0; ++ stat->copy_latency_ticks = 0; ++ ++ if (stat->io_error != NULL) { ++ memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); ++ } ++} ++ ++struct spdk_bdev_io_stat * ++bdev_alloc_io_stat(bool io_error_stat) ++{ ++ struct spdk_bdev_io_stat *stat; ++ ++ stat = malloc(sizeof(struct spdk_bdev_io_stat)); ++ if (stat == NULL) { ++ return NULL; ++ } ++ ++ if (io_error_stat) { ++ 
stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); ++ if (stat->io_error == NULL) { ++ free(stat); ++ return NULL; ++ } ++ } else { ++ stat->io_error = NULL; ++ } ++ ++ spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); ++ ++ return stat; ++} ++ ++void ++bdev_free_io_stat(struct spdk_bdev_io_stat *stat) ++{ ++ if (stat != NULL) { ++ free(stat->io_error); ++ free(stat); ++ } ++} ++ ++void ++spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) ++{ ++ int i; ++ ++ spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); ++ spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); ++ spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); ++ spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); ++ spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); ++ spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); ++ spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); ++ spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); ++ spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); ++ spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); ++ spdk_json_write_named_uint64(w, "min_read_latency_ticks", ++ stat->min_read_latency_ticks != UINT64_MAX ? ++ stat->min_read_latency_ticks : 0); ++ spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); ++ spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); ++ spdk_json_write_named_uint64(w, "min_write_latency_ticks", ++ stat->min_write_latency_ticks != UINT64_MAX ? ++ stat->min_write_latency_ticks : 0); ++ spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); ++ spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); ++ spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", ++ stat->min_unmap_latency_ticks != UINT64_MAX ? ++ stat->min_unmap_latency_ticks : 0); ++ spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); ++ spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); ++ spdk_json_write_named_uint64(w, "min_copy_latency_ticks", ++ stat->min_copy_latency_ticks != UINT64_MAX ? 
++ stat->min_copy_latency_ticks : 0); ++ ++ if (stat->io_error != NULL) { ++ spdk_json_write_named_object_begin(w, "io_error"); ++ for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { ++ if (stat->io_error->error_status[i] != 0) { ++ spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), ++ stat->io_error->error_status[i]); ++ } ++ } ++ spdk_json_write_object_end(w); ++ } ++} ++ ++static void ++bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) ++{ ++ struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; ++ struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; ++ ++ bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); ++ bdev_abort_all_buf_io(mgmt_ch, ch); ++ bdev_abort_all_buf_io(mgmt_ch, ch); ++} ++ ++static void ++bdev_channel_destroy(void *io_device, void *ctx_buf) ++{ ++ struct spdk_bdev_channel *ch = ctx_buf; ++ ++ SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, ++ spdk_get_thread()); ++ ++ spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, ++ spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); ++ ++ /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ ++ spdk_spin_lock(&ch->bdev->internal.spinlock); ++ spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); ++ spdk_spin_unlock(&ch->bdev->internal.spinlock); ++ ++ bdev_abort_all_queued_io(&ch->queued_resets, ch); ++ ++ bdev_channel_abort_queued_ios(ch); ++ ++ if (ch->histogram) { ++ spdk_histogram_data_free(ch->histogram); ++ } ++ ++ bdev_channel_destroy_resource(ch); ++} ++ ++/* ++ * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer ++ * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
++ */ ++static int ++bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) ++{ ++ struct spdk_bdev_name *tmp; ++ ++ bdev_name->name = strdup(name); ++ if (bdev_name->name == NULL) { ++ SPDK_ERRLOG("Unable to allocate bdev name\n"); ++ return -ENOMEM; ++ } ++ ++ bdev_name->bdev = bdev; ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ if (tmp != NULL) { ++ SPDK_ERRLOG("Bdev name %s already exists\n", name); ++ free(bdev_name->name); ++ return -EEXIST; ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) ++{ ++ RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); ++ free(bdev_name->name); ++} ++ ++static void ++bdev_name_del(struct spdk_bdev_name *bdev_name) ++{ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ bdev_name_del_unsafe(bdev_name); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++} ++ ++int ++spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) ++{ ++ struct spdk_bdev_alias *tmp; ++ int ret; ++ ++ if (alias == NULL) { ++ SPDK_ERRLOG("Empty alias passed\n"); ++ return -EINVAL; ++ } ++ ++ tmp = calloc(1, sizeof(*tmp)); ++ if (tmp == NULL) { ++ SPDK_ERRLOG("Unable to allocate alias\n"); ++ return -ENOMEM; ++ } ++ ++ ret = bdev_name_add(&tmp->alias, bdev, alias); ++ if (ret != 0) { ++ free(tmp); ++ return ret; ++ } ++ ++ TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); ++ ++ return 0; ++} ++ ++static int ++bdev_alias_del(struct spdk_bdev *bdev, const char *alias, ++ void (*alias_del_fn)(struct spdk_bdev_name *n)) ++{ ++ struct spdk_bdev_alias *tmp; ++ ++ TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { ++ if (strcmp(alias, tmp->alias.name) == 0) { ++ TAILQ_REMOVE(&bdev->aliases, tmp, tailq); ++ alias_del_fn(&tmp->alias); ++ free(tmp); ++ return 0; ++ } ++ } ++ ++ return -ENOENT; ++} ++ ++int ++spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) ++{ ++ int rc; ++ ++ rc = bdev_alias_del(bdev, alias, bdev_name_del); ++ if (rc == -ENOENT) { ++ SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); ++ } ++ ++ return rc; ++} ++ ++void ++spdk_bdev_alias_del_all(struct spdk_bdev *bdev) ++{ ++ struct spdk_bdev_alias *p, *tmp; ++ ++ TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { ++ TAILQ_REMOVE(&bdev->aliases, p, tailq); ++ bdev_name_del(&p->alias); ++ free(p); ++ } ++} ++ ++struct spdk_io_channel * ++spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) ++{ ++ return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); ++} ++ ++void * ++spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ void *ctx = NULL; ++ ++ if (bdev->fn_table->get_module_ctx) { ++ ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); ++ } ++ ++ return ctx; ++} ++ ++const char * ++spdk_bdev_get_module_name(const struct spdk_bdev *bdev) ++{ ++ return bdev->module->name; ++} ++ ++const char * ++spdk_bdev_get_name(const struct spdk_bdev *bdev) ++{ ++ return bdev->name; ++} ++ ++const char * ++spdk_bdev_get_product_name(const struct spdk_bdev *bdev) ++{ ++ return bdev->product_name; ++} ++ ++const struct spdk_bdev_aliases_list * ++spdk_bdev_get_aliases(const struct spdk_bdev *bdev) ++{ ++ return &bdev->aliases; ++} ++ ++uint32_t ++spdk_bdev_get_block_size(const struct spdk_bdev *bdev) ++{ ++ return bdev->blocklen; ++} ++ ++uint32_t ++spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) ++{ ++ return bdev->write_unit_size; ++} ++ 
++uint64_t ++spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) ++{ ++ return bdev->blockcnt; ++} ++ ++const char * ++spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) ++{ ++ return qos_rpc_type[type]; ++} ++ ++void ++spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) ++{ ++ int i; ++ ++ memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev->internal.qos) { ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (bdev->internal.qos->rate_limits[i].limit != ++ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { ++ limits[i] = bdev->internal.qos->rate_limits[i].limit; ++ if (bdev_qos_is_iops_rate_limit(i) == false) { ++ /* Change from Byte to Megabyte which is user visible. */ ++ limits[i] = limits[i] / 1024 / 1024; ++ } ++ } ++ } ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++} ++ ++size_t ++spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) ++{ ++ return 1 << bdev->required_alignment; ++} ++ ++uint32_t ++spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) ++{ ++ return bdev->optimal_io_boundary; ++} ++ ++bool ++spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) ++{ ++ return bdev->write_cache; ++} ++ ++const struct spdk_uuid * ++spdk_bdev_get_uuid(const struct spdk_bdev *bdev) ++{ ++ return &bdev->uuid; ++} ++ ++uint16_t ++spdk_bdev_get_acwu(const struct spdk_bdev *bdev) ++{ ++ return bdev->acwu; ++} ++ ++uint32_t ++spdk_bdev_get_md_size(const struct spdk_bdev *bdev) ++{ ++ return bdev->md_len; ++} ++ ++bool ++spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) ++{ ++ return (bdev->md_len != 0) && bdev->md_interleave; ++} ++ ++bool ++spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) ++{ ++ return (bdev->md_len != 0) && !bdev->md_interleave; ++} ++ ++bool ++spdk_bdev_is_zoned(const struct spdk_bdev *bdev) ++{ ++ return bdev->zoned; ++} ++ ++uint32_t ++spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) ++{ ++ if (spdk_bdev_is_md_interleaved(bdev)) { ++ return bdev->blocklen - bdev->md_len; ++ } else { ++ return bdev->blocklen; ++ } ++} ++ ++uint32_t ++spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) ++{ ++ return bdev->phys_blocklen; ++} ++ ++static uint32_t ++_bdev_get_block_size_with_md(const struct spdk_bdev *bdev) ++{ ++ if (!spdk_bdev_is_md_interleaved(bdev)) { ++ return bdev->blocklen + bdev->md_len; ++ } else { ++ return bdev->blocklen; ++ } ++} ++ ++/* We have to use the typedef in the function declaration to appease astyle. 
*/ ++typedef enum spdk_dif_type spdk_dif_type_t; ++ ++spdk_dif_type_t ++spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) ++{ ++ if (bdev->md_len != 0) { ++ return bdev->dif_type; ++ } else { ++ return SPDK_DIF_DISABLE; ++ } ++} ++ ++bool ++spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) ++{ ++ if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { ++ return bdev->dif_is_head_of_md; ++ } else { ++ return false; ++ } ++} ++ ++bool ++spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, ++ enum spdk_dif_check_type check_type) ++{ ++ if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { ++ return false; ++ } ++ ++ switch (check_type) { ++ case SPDK_DIF_CHECK_TYPE_REFTAG: ++ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; ++ case SPDK_DIF_CHECK_TYPE_APPTAG: ++ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; ++ case SPDK_DIF_CHECK_TYPE_GUARD: ++ return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; ++ default: ++ return false; ++ } ++} ++ ++uint32_t ++spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) ++{ ++ return bdev->max_copy; ++} ++ ++uint64_t ++spdk_bdev_get_qd(const struct spdk_bdev *bdev) ++{ ++ return bdev->internal.measured_queue_depth; ++} ++ ++uint64_t ++spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) ++{ ++ return bdev->internal.period; ++} ++ ++uint64_t ++spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) ++{ ++ return bdev->internal.weighted_io_time; ++} ++ ++uint64_t ++spdk_bdev_get_io_time(const struct spdk_bdev *bdev) ++{ ++ return bdev->internal.io_time; ++} ++ ++static void bdev_update_qd_sampling_period(void *ctx); ++ ++static void ++_calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; ++ ++ if (bdev->internal.measured_queue_depth) { ++ bdev->internal.io_time += bdev->internal.period; ++ bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; ++ } ++ ++ bdev->internal.qd_poll_in_progress = false; ++ ++ bdev_update_qd_sampling_period(bdev); ++} ++ ++static void ++_calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); ++ ++ bdev->internal.temporary_queue_depth += ch->io_outstanding; ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static int ++bdev_calculate_measured_queue_depth(void *ctx) ++{ ++ struct spdk_bdev *bdev = ctx; ++ ++ bdev->internal.qd_poll_in_progress = true; ++ bdev->internal.temporary_queue_depth = 0; ++ spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_update_qd_sampling_period(void *ctx) ++{ ++ struct spdk_bdev *bdev = ctx; ++ ++ if (bdev->internal.period == bdev->internal.new_period) { ++ return; ++ } ++ ++ if (bdev->internal.qd_poll_in_progress) { ++ return; ++ } ++ ++ bdev->internal.period = bdev->internal.new_period; ++ ++ spdk_poller_unregister(&bdev->internal.qd_poller); ++ if (bdev->internal.period != 0) { ++ bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, ++ bdev, bdev->internal.period); ++ } else { ++ spdk_bdev_close(bdev->internal.qd_desc); ++ bdev->internal.qd_desc = NULL; ++ } ++} ++ ++static void ++_tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++ SPDK_NOTICELOG("Unexpected event type: 
%d\n", type); ++} ++ ++void ++spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) ++{ ++ int rc; ++ ++ if (bdev->internal.new_period == period) { ++ return; ++ } ++ ++ bdev->internal.new_period = period; ++ ++ if (bdev->internal.qd_desc != NULL) { ++ assert(bdev->internal.period != 0); ++ ++ spdk_thread_send_msg(bdev->internal.qd_desc->thread, ++ bdev_update_qd_sampling_period, bdev); ++ return; ++ } ++ ++ assert(bdev->internal.period == 0); ++ ++ rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, ++ NULL, &bdev->internal.qd_desc); ++ if (rc != 0) { ++ return; ++ } ++ ++ bdev->internal.period = period; ++ bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, ++ bdev, period); ++} ++ ++struct bdev_get_current_qd_ctx { ++ uint64_t current_qd; ++ spdk_bdev_get_current_qd_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct bdev_get_current_qd_ctx *ctx = _ctx; ++ ++ ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); ++ ++ free(ctx); ++} ++ ++static void ++bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct bdev_get_current_qd_ctx *ctx = _ctx; ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); ++ ++ ctx->current_qd += bdev_ch->io_outstanding; ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++void ++spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, ++ void *cb_arg) ++{ ++ struct bdev_get_current_qd_ctx *ctx; ++ ++ assert(cb_fn != NULL); ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ cb_fn(bdev, 0, cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); ++} ++ ++static void ++_resize_notify(void *arg) ++{ ++ struct spdk_bdev_desc *desc = arg; ++ ++ spdk_spin_lock(&desc->spinlock); ++ desc->refs--; ++ if (!desc->closed) { ++ spdk_spin_unlock(&desc->spinlock); ++ desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, ++ desc->bdev, ++ desc->callback.ctx); ++ return; ++ } else if (0 == desc->refs) { ++ /* This descriptor was closed after this resize_notify message was sent. ++ * spdk_bdev_close() could not free the descriptor since this message was ++ * in flight, so we free it now using bdev_desc_free(). ++ */ ++ spdk_spin_unlock(&desc->spinlock); ++ bdev_desc_free(desc); ++ return; ++ } ++ spdk_spin_unlock(&desc->spinlock); ++} ++ ++int ++spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) ++{ ++ struct spdk_bdev_desc *desc; ++ int ret; ++ ++ if (size == bdev->blockcnt) { ++ return 0; ++ } ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ ++ /* bdev has open descriptors */ ++ if (!TAILQ_EMPTY(&bdev->internal.open_descs) && ++ bdev->blockcnt > size) { ++ ret = -EBUSY; ++ } else { ++ bdev->blockcnt = size; ++ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { ++ spdk_spin_lock(&desc->spinlock); ++ if (!desc->closed) { ++ desc->refs++; ++ spdk_thread_send_msg(desc->thread, _resize_notify, desc); ++ } ++ spdk_spin_unlock(&desc->spinlock); ++ } ++ ret = 0; ++ } ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ return ret; ++} ++ ++/* ++ * Convert I/O offset and length from bytes to blocks. ++ * ++ * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 
++ */ ++static uint64_t ++bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, ++ uint64_t num_bytes, uint64_t *num_blocks) ++{ ++ uint32_t block_size = bdev->blocklen; ++ uint8_t shift_cnt; ++ ++ /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ ++ if (spdk_likely(spdk_u32_is_pow2(block_size))) { ++ shift_cnt = spdk_u32log2(block_size); ++ *offset_blocks = offset_bytes >> shift_cnt; ++ *num_blocks = num_bytes >> shift_cnt; ++ return (offset_bytes - (*offset_blocks << shift_cnt)) | ++ (num_bytes - (*num_blocks << shift_cnt)); ++ } else { ++ *offset_blocks = offset_bytes / block_size; ++ *num_blocks = num_bytes / block_size; ++ return (offset_bytes % block_size) | (num_bytes % block_size); ++ } ++} ++ ++static bool ++bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) ++{ ++ /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there ++ * has been an overflow and hence the offset has been wrapped around */ ++ if (offset_blocks + num_blocks < offset_blocks) { ++ return false; ++ } ++ ++ /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ ++ if (offset_blocks + num_blocks > bdev->blockcnt) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++bdev_seek_complete_cb(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); ++} ++ ++static int ++bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, enum spdk_bdev_io_type io_type, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); ++ ++ /* Check if offset_blocks is valid looking at the validity of one block */ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = io_type; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ if (!spdk_bdev_io_type_supported(bdev, io_type)) { ++ /* In case bdev doesn't support seek to next data/hole offset, ++ * it is assumed that only data and no holes are present */ ++ if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { ++ bdev_io->u.bdev.seek.offset = offset_blocks; ++ } else { ++ bdev_io->u.bdev.seek.offset = UINT64_MAX; ++ } ++ ++ spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); ++ return 0; ++ } ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); ++} ++ ++int ++spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); ++} ++ ++uint64_t ++spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) ++{ ++ return 
bdev_io->u.bdev.seek.offset; ++} ++ ++static int ++bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, ++ void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_READ; ++ bdev_io->u.bdev.iovs = &bdev_io->iov; ++ bdev_io->u.bdev.iovs[0].iov_base = buf; ++ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; ++ bdev_io->u.bdev.iovcnt = 1; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.ext_opts = NULL; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, uint64_t offset, uint64_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ nbytes, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct iovec iov = { ++ .iov_base = buf, ++ }; ++ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(&iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, ++ cb, cb_arg); ++} ++ ++int ++spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset, uint64_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ nbytes, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++static int ++bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, ++ uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts, bool copy_opts) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if 
(!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_READ; ++ bdev_io->u.bdev.iovs = iov; ++ bdev_io->u.bdev.iovcnt = iovcnt; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->internal.ext_opts = opts; ++ bdev_io->u.bdev.ext_opts = opts; ++ ++ _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, ++ num_blocks, cb, cb_arg, NULL, false); ++} ++ ++int ++spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, ++ num_blocks, cb, cb_arg, NULL, false); ++} ++ ++static inline bool ++_bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) ++{ ++ /* ++ * We check if opts size is at least of size when we first introduced ++ * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members ++ * are not checked internal. ++ */ ++ return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + ++ sizeof(opts->metadata) && ++ opts->size <= sizeof(*opts) && ++ /* When memory domain is used, the user must provide data buffers */ ++ (!opts->memory_domain || (iov && iov[0].iov_base)); ++} ++ ++int ++spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts) ++{ ++ void *md = NULL; ++ ++ if (opts) { ++ if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { ++ return -EINVAL; ++ } ++ md = opts->metadata; ++ } ++ ++ if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md && !_is_buf_allocated(iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, ++ num_blocks, cb, cb_arg, opts, false); ++} ++ ++static int ++bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; ++ bdev_io->u.bdev.iovs = &bdev_io->iov; ++ bdev_io->u.bdev.iovs[0].iov_base = buf; ++ 
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; ++ bdev_io->u.bdev.iovcnt = 1; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.ext_opts = NULL; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, uint64_t offset, uint64_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ nbytes, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, ++ cb, cb_arg); ++} ++ ++int ++spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct iovec iov = { ++ .iov_base = buf, ++ }; ++ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(&iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, ++ cb, cb_arg); ++} ++ ++static int ++bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts, bool copy_opts) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; ++ bdev_io->u.bdev.iovs = iov; ++ bdev_io->u.bdev.iovcnt = iovcnt; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->internal.ext_opts = opts; ++ bdev_io->u.bdev.ext_opts = opts; ++ ++ _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset, uint64_t len, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ len, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ 
spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, ++ num_blocks, cb, cb_arg, NULL, false); ++} ++ ++int ++spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, ++ num_blocks, cb, cb_arg, NULL, false); ++} ++ ++int ++spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg, ++ struct spdk_bdev_ext_io_opts *opts) ++{ ++ void *md = NULL; ++ ++ if (opts) { ++ if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { ++ return -EINVAL; ++ } ++ md = opts->metadata; ++ } ++ ++ if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md && !_is_buf_allocated(iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, ++ num_blocks, cb, cb_arg, opts, false); ++} ++ ++static void ++bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *parent_io = cb_arg; ++ struct spdk_bdev *bdev = parent_io->bdev; ++ uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; ++ int i, rc = 0; ++ ++ if (!success) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); ++ spdk_bdev_free_io(bdev_io); ++ return; ++ } ++ ++ for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { ++ rc = memcmp(read_buf, ++ parent_io->u.bdev.iovs[i].iov_base, ++ parent_io->u.bdev.iovs[i].iov_len); ++ if (rc) { ++ break; ++ } ++ read_buf += parent_io->u.bdev.iovs[i].iov_len; ++ } ++ ++ if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { ++ rc = memcmp(bdev_io->u.bdev.md_buf, ++ parent_io->u.bdev.md_buf, ++ spdk_bdev_get_md_size(bdev)); ++ } ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (rc == 0) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); ++ } else { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; ++ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); ++ } ++} ++ ++static void ++bdev_compare_do_read(void *_bdev_io) ++{ ++ struct spdk_bdev_io *bdev_io = _bdev_io; ++ int rc; ++ ++ rc = spdk_bdev_read_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, ++ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, ++ bdev_compare_do_read_done, bdev_io); ++ ++ if (rc == -ENOMEM) { ++ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); ++ } else if (rc != 0) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ } ++} ++ ++static int ++bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev 
*bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; ++ bdev_io->u.bdev.iovs = iov; ++ bdev_io->u.bdev.iovcnt = iovcnt; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { ++ bdev_io_submit(bdev_io); ++ return 0; ++ } ++ ++ bdev_compare_do_read(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, ++ num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, void *md_buf, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, ++ num_blocks, cb, cb_arg); ++} ++ ++static int ++bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; ++ bdev_io->u.bdev.iovs = &bdev_io->iov; ++ bdev_io->u.bdev.iovs[0].iov_base = buf; ++ bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; ++ bdev_io->u.bdev.iovcnt = 1; ++ bdev_io->u.bdev.md_buf = md_buf; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { ++ bdev_io_submit(bdev_io); ++ return 0; ++ } ++ ++ bdev_compare_do_read(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, ++ cb, cb_arg); ++} ++ ++int ++spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ 
struct iovec iov = { ++ .iov_base = buf, ++ }; ++ ++ if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { ++ return -EINVAL; ++ } ++ ++ if (md_buf && !_is_buf_allocated(&iov)) { ++ return -EINVAL; ++ } ++ ++ return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, ++ cb, cb_arg); ++} ++ ++static void ++bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ if (unlock_status) { ++ SPDK_ERRLOG("LBA range unlock failed\n"); ++ } ++ ++ bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : ++ false, bdev_io->internal.caller_ctx); ++} ++ ++static void ++bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) ++{ ++ bdev_io->internal.status = status; ++ ++ bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, ++ bdev_comparev_and_writev_blocks_unlocked, bdev_io); ++} ++ ++static void ++bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *parent_io = cb_arg; ++ ++ if (!success) { ++ SPDK_ERRLOG("Compare and write operation failed\n"); ++ } ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ bdev_comparev_and_writev_blocks_unlock(parent_io, ++ success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void ++bdev_compare_and_write_do_write(void *_bdev_io) ++{ ++ struct spdk_bdev_io *bdev_io = _bdev_io; ++ int rc; ++ ++ rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, ++ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, ++ bdev_compare_and_write_do_write_done, bdev_io); ++ ++ ++ if (rc == -ENOMEM) { ++ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); ++ } else if (rc != 0) { ++ bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *parent_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); ++ return; ++ } ++ ++ bdev_compare_and_write_do_write(parent_io); ++} ++ ++static void ++bdev_compare_and_write_do_compare(void *_bdev_io) ++{ ++ struct spdk_bdev_io *bdev_io = _bdev_io; ++ int rc; ++ ++ rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, ++ bdev_compare_and_write_do_compare_done, bdev_io); ++ ++ if (rc == -ENOMEM) { ++ bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); ++ } else if (rc != 0) { ++ bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); ++ } ++} ++ ++static void ++bdev_comparev_and_writev_blocks_locked(void *ctx, int status) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ if (status) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ return; ++ } ++ ++ bdev_compare_and_write_do_compare(bdev_io); ++} ++ ++int ++spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel 
*ch, ++ struct iovec *compare_iov, int compare_iovcnt, ++ struct iovec *write_iov, int write_iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ if (num_blocks > bdev->acwu) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; ++ bdev_io->u.bdev.iovs = compare_iov; ++ bdev_io->u.bdev.iovcnt = compare_iovcnt; ++ bdev_io->u.bdev.fused_iovs = write_iov; ++ bdev_io->u.bdev.fused_iovcnt = write_iovcnt; ++ bdev_io->u.bdev.md_buf = NULL; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { ++ bdev_io_submit(bdev_io); ++ return 0; ++ } ++ ++ return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, ++ bdev_comparev_and_writev_blocks_locked, bdev_io); ++} ++ ++int ++spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ bool populate, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.iovs = iov; ++ bdev_io->u.bdev.iovcnt = iovcnt; ++ bdev_io->u.bdev.md_buf = NULL; ++ bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; ++ bdev_io->u.bdev.zcopy.commit = 0; ++ bdev_io->u.bdev.zcopy.start = 1; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ bdev_io_submit(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { ++ return -EINVAL; ++ } ++ ++ bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; ++ bdev_io->u.bdev.zcopy.start = 0; ++ bdev_io->internal.caller_ctx = cb_arg; ++ bdev_io->internal.cb = cb; ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; ++ ++ bdev_io_submit(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset, uint64_t len, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ len, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && ++ !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { ++ bdev_io_submit(bdev_io); ++ return 0; ++ } ++ ++ assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); ++ assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); ++ bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; ++ bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; ++ bdev_write_zero_buffer_next(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset, uint64_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ nbytes, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ if (num_blocks == 0) { ++ SPDK_ERRLOG("Can't unmap 0 bytes\n"); ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; ++ ++ bdev_io->u.bdev.iovs = &bdev_io->iov; ++ bdev_io->u.bdev.iovs[0].iov_base = NULL; ++ bdev_io->u.bdev.iovs[0].iov_len = 0; ++ bdev_io->u.bdev.iovcnt = 1; ++ 
++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ bdev_io->u.bdev.ext_opts = NULL; ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset, uint64_t length, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ uint64_t offset_blocks, num_blocks; ++ ++ if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, ++ length, &num_blocks) != 0) { ++ return -EINVAL; ++ } ++ ++ return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); ++} ++ ++int ++spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; ++ bdev_io->u.bdev.iovs = NULL; ++ bdev_io->u.bdev.iovcnt = 0; ++ bdev_io->u.bdev.offset_blocks = offset_blocks; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++static int bdev_reset_poll_for_outstanding_io(void *ctx); ++ ++static void ++bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_channel *ch = _ctx; ++ struct spdk_bdev_io *bdev_io; ++ ++ bdev_io = TAILQ_FIRST(&ch->queued_resets); ++ ++ if (status == -EBUSY) { ++ if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { ++ bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, ++ ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); ++ } else { ++ /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, ++ * start the reset. */ ++ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); ++ bdev_io_submit_reset(bdev_io); ++ } ++ } else { ++ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); ++ SPDK_DEBUGLOG(bdev, ++ "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", ++ ch->bdev->name); ++ /* Mark the completion status as a SUCCESS and complete the reset. */ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } ++} ++ ++static void ++bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); ++ int status = 0; ++ ++ if (cur_ch->io_outstanding > 0) { ++ /* If a channel has outstanding IO, set status to -EBUSY code. This will stop ++ * further iteration over the rest of the channels and pass non-zero status ++ * to the callback function. 
*/
++ status = -EBUSY;
++ }
++ spdk_bdev_for_each_channel_continue(i, status);
++}
++
++static int
++bdev_reset_poll_for_outstanding_io(void *ctx)
++{
++ struct spdk_bdev_channel *ch = ctx;
++ struct spdk_bdev_io *bdev_io;
++
++ bdev_io = TAILQ_FIRST(&ch->queued_resets);
++
++ spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
++ spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
++ bdev_reset_check_outstanding_io_done);
++
++ return SPDK_POLLER_BUSY;
++}
++
++static void
++bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
++{
++ struct spdk_bdev_channel *ch = _ctx;
++ struct spdk_bdev_io *bdev_io;
++
++ bdev_io = TAILQ_FIRST(&ch->queued_resets);
++
++ if (bdev->reset_io_drain_timeout == 0) {
++ TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
++
++ bdev_io_submit_reset(bdev_io);
++ return;
++ }
++
++ bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
++ (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());
++
++ /* In case bdev->reset_io_drain_timeout is not equal to zero,
++ * submit the reset to the underlying module only if outstanding I/O
++ * remain after reset_io_drain_timeout seconds have passed. */
++ spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
++ bdev_reset_check_outstanding_io_done);
++}
++
++static void
++bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
++ struct spdk_io_channel *ch, void *_ctx)
++{
++ struct spdk_bdev_channel *channel;
++ struct spdk_bdev_mgmt_channel *mgmt_channel;
++ struct spdk_bdev_shared_resource *shared_resource;
++ bdev_io_tailq_t tmp_queued;
++
++ TAILQ_INIT(&tmp_queued);
++
++ channel = __io_ch_to_bdev_ch(ch);
++ shared_resource = channel->shared_resource;
++ mgmt_channel = shared_resource->mgmt_ch;
++
++ channel->flags |= BDEV_CH_RESET_IN_PROGRESS;
++
++ if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
++ /* The QoS object is always valid and readable while
++ * the channel flag is set, so the lock here should not
++ * be necessary. We're not in the fast path though, so
++ * just take it anyway. */
++ spdk_spin_lock(&channel->bdev->internal.spinlock);
++ if (channel->bdev->internal.qos->ch == channel) {
++ TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
++ }
++ spdk_spin_unlock(&channel->bdev->internal.spinlock);
++ }
++
++ bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
++ bdev_abort_all_buf_io(mgmt_channel, channel);
++ bdev_abort_all_queued_io(&tmp_queued, channel);
++
++ spdk_bdev_for_each_channel_continue(i, 0);
++}
++
++static void
++bdev_start_reset(void *ctx)
++{
++ struct spdk_bdev_channel *ch = ctx;
++
++ spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
++ bdev_reset_freeze_channel_done);
++}
++
++static void
++bdev_channel_start_reset(struct spdk_bdev_channel *ch)
++{
++ struct spdk_bdev *bdev = ch->bdev;
++
++ assert(!TAILQ_EMPTY(&ch->queued_resets));
++
++ spdk_spin_lock(&bdev->internal.spinlock);
++ if (bdev->internal.reset_in_progress == NULL) {
++ bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
++ /*
++ * Take a channel reference for the target bdev for the life of this
++ * reset. This guards against the channel getting destroyed while
++ * spdk_bdev_for_each_channel() calls related to this reset IO are in
++ * progress. We will release the reference when this reset is
++ * completed. 
++ */ ++ bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); ++ bdev_start_reset(ch); ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++} ++ ++int ++spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->internal.submit_tsc = spdk_get_ticks(); ++ bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; ++ bdev_io->u.reset.ch_ref = NULL; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, ++ internal.ch_link); ++ ++ bdev_channel_start_reset(channel); ++ ++ return 0; ++} ++ ++void ++spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, ++ struct spdk_bdev_io_stat *stat) ++{ ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ bdev_get_io_stat(stat, channel->stat); ++} ++ ++static void ++bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; ++ ++ bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, ++ bdev_iostat_ctx->cb_arg, 0); ++ free(bdev_iostat_ctx); ++} ++ ++static void ++bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, void *_ctx) ++{ ++ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++void ++spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, ++ spdk_bdev_get_device_stat_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; ++ ++ assert(bdev != NULL); ++ assert(stat != NULL); ++ assert(cb != NULL); ++ ++ bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); ++ if (bdev_iostat_ctx == NULL) { ++ SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); ++ cb(bdev, stat, cb_arg, -ENOMEM); ++ return; ++ } ++ ++ bdev_iostat_ctx->stat = stat; ++ bdev_iostat_ctx->cb = cb; ++ bdev_iostat_ctx->cb_arg = cb_arg; ++ ++ /* Start with the statistics from previously deleted channels. */ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ /* Then iterate and add the statistics from each existing channel. 
*/ ++ spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, ++ bdev_get_device_stat_done); ++} ++ ++struct bdev_iostat_reset_ctx { ++ enum spdk_bdev_reset_stat_mode mode; ++ bdev_reset_device_stat_cb cb; ++ void *cb_arg; ++}; ++ ++static void ++bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct bdev_iostat_reset_ctx *ctx = _ctx; ++ ++ ctx->cb(bdev, ctx->cb_arg, 0); ++ ++ free(ctx); ++} ++ ++static void ++bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, void *_ctx) ++{ ++ struct bdev_iostat_reset_ctx *ctx = _ctx; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ spdk_bdev_reset_io_stat(channel->stat, ctx->mode); ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++void ++bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, ++ bdev_reset_device_stat_cb cb, void *cb_arg) ++{ ++ struct bdev_iostat_reset_ctx *ctx; ++ ++ assert(bdev != NULL); ++ assert(cb != NULL); ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); ++ cb(bdev, cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->mode = mode; ++ ctx->cb = cb; ++ ctx->cb_arg = cb_arg; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ spdk_bdev_reset_io_stat(bdev->internal.stat, mode); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ spdk_bdev_for_each_channel(bdev, ++ bdev_reset_each_channel_stat, ++ ctx, ++ bdev_reset_device_stat_done); ++} ++ ++int ++spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; ++ bdev_io->u.nvme_passthru.cmd = *cmd; ++ bdev_io->u.nvme_passthru.buf = buf; ++ bdev_io->u.nvme_passthru.nbytes = nbytes; ++ bdev_io->u.nvme_passthru.md_buf = NULL; ++ bdev_io->u.nvme_passthru.md_len = 0; ++ ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ /* ++ * Do not try to parse the NVMe command - we could maybe use bits in the opcode ++ * to easily determine if the command is a read or write, but for now just ++ * do not allow io_passthru with a read-only descriptor. 
++ */ ++ return -EBADF; ++ } ++ ++ if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; ++ bdev_io->u.nvme_passthru.cmd = *cmd; ++ bdev_io->u.nvme_passthru.buf = buf; ++ bdev_io->u.nvme_passthru.nbytes = nbytes; ++ bdev_io->u.nvme_passthru.md_buf = NULL; ++ bdev_io->u.nvme_passthru.md_len = 0; ++ ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++int ++spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ ++ if (!desc->write) { ++ /* ++ * Do not try to parse the NVMe command - we could maybe use bits in the opcode ++ * to easily determine if the command is a read or write, but for now just ++ * do not allow io_passthru with a read-only descriptor. ++ */ ++ return -EBADF; ++ } ++ ++ if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; ++ bdev_io->u.nvme_passthru.cmd = *cmd; ++ bdev_io->u.nvme_passthru.buf = buf; ++ bdev_io->u.nvme_passthru.nbytes = nbytes; ++ bdev_io->u.nvme_passthru.md_buf = md_buf; ++ bdev_io->u.nvme_passthru.md_len = md_len; ++ ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++static void bdev_abort_retry(void *ctx); ++static void bdev_abort(struct spdk_bdev_io *parent_io); ++ ++static void ++bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_channel *channel = bdev_io->internal.ch; ++ struct spdk_bdev_io *parent_io = cb_arg; ++ struct spdk_bdev_io *bio_to_abort, *tmp_io; ++ ++ bio_to_abort = bdev_io->u.abort.bio_to_abort; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ /* Check if the target I/O completed in the meantime. */ ++ TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { ++ if (tmp_io == bio_to_abort) { ++ break; ++ } ++ } ++ ++ /* If the target I/O still exists, set the parent to failed. */ ++ if (tmp_io != NULL) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ } ++ ++ parent_io->u.bdev.split_outstanding--; ++ if (parent_io->u.bdev.split_outstanding == 0) { ++ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { ++ bdev_abort_retry(parent_io); ++ } else { ++ bdev_io_complete(parent_io); ++ } ++ } ++} ++ ++static int ++bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, ++ struct spdk_bdev_io *bio_to_abort, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ ++ if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || ++ bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { ++ /* TODO: Abort reset or abort request. 
*/ ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (bdev_io == NULL) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { ++ bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; ++ ++ /* Parent abort request is not submitted directly, but to manage its ++ * execution add it to the submitted list here. ++ */ ++ bdev_io->internal.submit_tsc = spdk_get_ticks(); ++ TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); ++ ++ bdev_abort(bdev_io); ++ ++ return 0; ++ } ++ ++ bdev_io->u.abort.bio_to_abort = bio_to_abort; ++ ++ /* Submit the abort request to the underlying bdev module. */ ++ bdev_io_submit(bdev_io); ++ ++ return 0; ++} ++ ++static uint32_t ++_bdev_abort(struct spdk_bdev_io *parent_io) ++{ ++ struct spdk_bdev_desc *desc = parent_io->internal.desc; ++ struct spdk_bdev_channel *channel = parent_io->internal.ch; ++ void *bio_cb_arg; ++ struct spdk_bdev_io *bio_to_abort; ++ uint32_t matched_ios; ++ int rc; ++ ++ bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; ++ ++ /* matched_ios is returned and will be kept by the caller. ++ * ++ * This function will be used for two cases, 1) the same cb_arg is used for ++ * multiple I/Os, 2) a single large I/O is split into smaller ones. ++ * Incrementing split_outstanding directly here may confuse readers especially ++ * for the 1st case. ++ * ++ * Completion of I/O abort is processed after stack unwinding. Hence this trick ++ * works as expected. ++ */ ++ matched_ios = 0; ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { ++ if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { ++ continue; ++ } ++ ++ if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { ++ /* Any I/O which was submitted after this abort command should be excluded. */ ++ continue; ++ } ++ ++ rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); ++ if (rc != 0) { ++ if (rc == -ENOMEM) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; ++ } else { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ break; ++ } ++ matched_ios++; ++ } ++ ++ return matched_ios; ++} ++ ++static void ++bdev_abort_retry(void *ctx) ++{ ++ struct spdk_bdev_io *parent_io = ctx; ++ uint32_t matched_ios; ++ ++ matched_ios = _bdev_abort(parent_io); ++ ++ if (matched_ios == 0) { ++ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { ++ bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); ++ } else { ++ /* For retry, the case that no target I/O was found is success ++ * because it means target I/Os completed in the meantime. ++ */ ++ bdev_io_complete(parent_io); ++ } ++ return; ++ } ++ ++ /* Use split_outstanding to manage the progress of aborting I/Os. */ ++ parent_io->u.bdev.split_outstanding = matched_ios; ++} ++ ++static void ++bdev_abort(struct spdk_bdev_io *parent_io) ++{ ++ uint32_t matched_ios; ++ ++ matched_ios = _bdev_abort(parent_io); ++ ++ if (matched_ios == 0) { ++ if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { ++ bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); ++ } else { ++ /* The case the no target I/O was found is failure. 
*/ ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ bdev_io_complete(parent_io); ++ } ++ return; ++ } ++ ++ /* Use split_outstanding to manage the progress of aborting I/Os. */ ++ parent_io->u.bdev.split_outstanding = matched_ios; ++} ++ ++int ++spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ void *bio_cb_arg, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ struct spdk_bdev_io *bdev_io; ++ ++ if (bio_cb_arg == NULL) { ++ return -EINVAL; ++ } ++ ++ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { ++ return -ENOTSUP; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (bdev_io == NULL) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->internal.submit_tsc = spdk_get_ticks(); ++ bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; ++ ++ /* Parent abort request is not submitted directly, but to manage its execution, ++ * add it to the submitted list here. ++ */ ++ TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); ++ ++ bdev_abort(bdev_io); ++ ++ return 0; ++} ++ ++int ++spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, ++ struct spdk_bdev_io_wait_entry *entry) ++{ ++ struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); ++ struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; ++ ++ if (bdev != entry->bdev) { ++ SPDK_ERRLOG("bdevs do not match\n"); ++ return -EINVAL; ++ } ++ ++ if (mgmt_ch->per_thread_cache_count > 0) { ++ SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); ++ return -EINVAL; ++ } ++ ++ TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); ++ return 0; ++} ++ ++static inline void ++bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) ++{ ++ enum spdk_bdev_io_status io_status = bdev_io->internal.status; ++ struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; ++ uint64_t num_blocks = bdev_io->u.bdev.num_blocks; ++ uint32_t blocklen = bdev_io->bdev->blocklen; ++ ++ if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ io_stat->bytes_read += num_blocks * blocklen; ++ io_stat->num_read_ops++; ++ io_stat->read_latency_ticks += tsc_diff; ++ if (io_stat->max_read_latency_ticks < tsc_diff) { ++ io_stat->max_read_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_read_latency_ticks > tsc_diff) { ++ io_stat->min_read_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ io_stat->bytes_written += num_blocks * blocklen; ++ io_stat->num_write_ops++; ++ io_stat->write_latency_ticks += tsc_diff; ++ if (io_stat->max_write_latency_ticks < tsc_diff) { ++ io_stat->max_write_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_write_latency_ticks > tsc_diff) { ++ io_stat->min_write_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ io_stat->bytes_unmapped += num_blocks * blocklen; ++ io_stat->num_unmap_ops++; ++ io_stat->unmap_latency_ticks += tsc_diff; ++ if (io_stat->max_unmap_latency_ticks < tsc_diff) { ++ io_stat->max_unmap_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_unmap_latency_ticks > tsc_diff) { ++ io_stat->min_unmap_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ /* 
Track the data in the start phase only */ ++ if (bdev_io->u.bdev.zcopy.start) { ++ if (bdev_io->u.bdev.zcopy.populate) { ++ io_stat->bytes_read += num_blocks * blocklen; ++ io_stat->num_read_ops++; ++ io_stat->read_latency_ticks += tsc_diff; ++ if (io_stat->max_read_latency_ticks < tsc_diff) { ++ io_stat->max_read_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_read_latency_ticks > tsc_diff) { ++ io_stat->min_read_latency_ticks = tsc_diff; ++ } ++ } else { ++ io_stat->bytes_written += num_blocks * blocklen; ++ io_stat->num_write_ops++; ++ io_stat->write_latency_ticks += tsc_diff; ++ if (io_stat->max_write_latency_ticks < tsc_diff) { ++ io_stat->max_write_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_write_latency_ticks > tsc_diff) { ++ io_stat->min_write_latency_ticks = tsc_diff; ++ } ++ } ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ io_stat->bytes_copied += num_blocks * blocklen; ++ io_stat->num_copy_ops++; ++ bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; ++ if (io_stat->max_copy_latency_ticks < tsc_diff) { ++ io_stat->max_copy_latency_ticks = tsc_diff; ++ } ++ if (io_stat->min_copy_latency_ticks > tsc_diff) { ++ io_stat->min_copy_latency_ticks = tsc_diff; ++ } ++ break; ++ default: ++ break; ++ } ++ } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { ++ io_stat = bdev_io->bdev->internal.stat; ++ assert(io_stat->io_error != NULL); ++ ++ spdk_spin_lock(&bdev_io->bdev->internal.spinlock); ++ io_stat->io_error->error_status[-io_status - 1]++; ++ spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); ++ } ++ ++#ifdef SPDK_CONFIG_VTUNE ++ uint64_t now_tsc = spdk_get_ticks(); ++ if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { ++ uint64_t data[5]; ++ struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; ++ ++ data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; ++ data[1] = io_stat->bytes_read - prev_stat->bytes_read; ++ data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; ++ data[3] = io_stat->bytes_written - prev_stat->bytes_written; ++ data[4] = bdev_io->bdev->fn_table->get_spin_time ? ++ bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; ++ ++ __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, ++ __itt_metadata_u64, 5, data); ++ ++ memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); ++ bdev_io->internal.ch->start_tsc = now_tsc; ++ } ++#endif ++} ++ ++static inline void ++bdev_io_complete(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; ++ uint64_t tsc, tsc_diff; ++ ++ if (spdk_unlikely(bdev_io->internal.in_submit_request)) { ++ /* ++ * Defer completion to avoid potential infinite recursion if the ++ * user's completion callback issues a new I/O. 
++ */ ++ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), ++ bdev_io_complete, bdev_io); ++ return; ++ } ++ ++ tsc = spdk_get_ticks(); ++ tsc_diff = tsc - bdev_io->internal.submit_tsc; ++ spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, ++ bdev_io->internal.caller_ctx); ++ ++ TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); ++ ++ if (bdev_io->internal.ch->histogram) { ++ spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); ++ } ++ ++ bdev_io_update_io_stat(bdev_io, tsc_diff); ++ ++ assert(bdev_io->internal.cb != NULL); ++ assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); ++ ++ bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, ++ bdev_io->internal.caller_ctx); ++} ++ ++static void bdev_destroy_cb(void *io_device); ++ ++static void ++bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_io *bdev_io = _ctx; ++ ++ if (bdev_io->u.reset.ch_ref != NULL) { ++ spdk_put_io_channel(bdev_io->u.reset.ch_ref); ++ bdev_io->u.reset.ch_ref = NULL; ++ } ++ ++ bdev_io_complete(bdev_io); ++ ++ if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && ++ TAILQ_EMPTY(&bdev->internal.open_descs)) { ++ spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); ++ } ++} ++ ++static void ++bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_io *bdev_io = _ctx; ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct spdk_bdev_io *queued_reset; ++ ++ ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; ++ while (!TAILQ_EMPTY(&ch->queued_resets)) { ++ queued_reset = TAILQ_FIRST(&ch->queued_resets); ++ TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); ++ spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++void ++spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) ++{ ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; ++ struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; ++ ++ if (spdk_unlikely(spdk_get_shutdown_sig_received())) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * the bdev buf memory may have been released. ++ * Therefore, do not need to continue. 
++ */ ++ return; ++ } ++ ++ bdev_io->internal.status = status; ++ ++ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { ++ bool unlock_channels = false; ++ ++ if (status == SPDK_BDEV_IO_STATUS_NOMEM) { ++ SPDK_ERRLOG("NOMEM returned for reset\n"); ++ } ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev_io == bdev->internal.reset_in_progress) { ++ bdev->internal.reset_in_progress = NULL; ++ unlock_channels = true; ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ if (unlock_channels) { ++ spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, ++ bdev_reset_complete); ++ return; ++ } ++ } else { ++ if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { ++ _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); ++ /* bdev IO will be completed in the callback */ ++ return; ++ } ++ ++ _bdev_io_decrement_outstanding(bdev_ch, shared_resource); ++ if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { ++ return; ++ } ++ } ++ ++ bdev_io_complete(bdev_io); ++} ++ ++void ++spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, ++ enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) ++{ ++ if (sc == SPDK_SCSI_STATUS_GOOD) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } else { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; ++ bdev_io->internal.error.scsi.sc = sc; ++ bdev_io->internal.error.scsi.sk = sk; ++ bdev_io->internal.error.scsi.asc = asc; ++ bdev_io->internal.error.scsi.ascq = ascq; ++ } ++ ++ spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); ++} ++ ++void ++spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, ++ int *sc, int *sk, int *asc, int *ascq) ++{ ++ assert(sc != NULL); ++ assert(sk != NULL); ++ assert(asc != NULL); ++ assert(ascq != NULL); ++ ++ switch (bdev_io->internal.status) { ++ case SPDK_BDEV_IO_STATUS_SUCCESS: ++ *sc = SPDK_SCSI_STATUS_GOOD; ++ *sk = SPDK_SCSI_SENSE_NO_SENSE; ++ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; ++ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; ++ break; ++ case SPDK_BDEV_IO_STATUS_NVME_ERROR: ++ spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); ++ break; ++ case SPDK_BDEV_IO_STATUS_SCSI_ERROR: ++ *sc = bdev_io->internal.error.scsi.sc; ++ *sk = bdev_io->internal.error.scsi.sk; ++ *asc = bdev_io->internal.error.scsi.asc; ++ *ascq = bdev_io->internal.error.scsi.ascq; ++ break; ++ default: ++ *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; ++ *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; ++ *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; ++ *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; ++ break; ++ } ++} ++ ++void ++spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) ++{ ++ if (aio_result == 0) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } else { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; ++ } ++ ++ bdev_io->internal.error.aio_result = aio_result; ++ ++ spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); ++} ++ ++void ++spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) ++{ ++ assert(aio_result != NULL); ++ ++ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { ++ *aio_result = bdev_io->internal.error.aio_result; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ *aio_result = 0; ++ } else { ++ *aio_result = -EIO; ++ } ++} ++ ++void ++spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) ++{ ++ if (sct == SPDK_NVME_SCT_GENERIC && sc == 
SPDK_NVME_SC_SUCCESS) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; ++ } else { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; ++ } ++ ++ bdev_io->internal.error.nvme.cdw0 = cdw0; ++ bdev_io->internal.error.nvme.sct = sct; ++ bdev_io->internal.error.nvme.sc = sc; ++ ++ spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); ++} ++ ++void ++spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) ++{ ++ assert(sct != NULL); ++ assert(sc != NULL); ++ assert(cdw0 != NULL); ++ ++ if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { ++ *sct = SPDK_NVME_SCT_GENERIC; ++ *sc = SPDK_NVME_SC_SUCCESS; ++ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ *cdw0 = 0; ++ } else { ++ *cdw0 = 1U; ++ } ++ return; ++ } ++ ++ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { ++ *sct = bdev_io->internal.error.nvme.sct; ++ *sc = bdev_io->internal.error.nvme.sc; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ *sct = SPDK_NVME_SCT_GENERIC; ++ *sc = SPDK_NVME_SC_SUCCESS; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { ++ *sct = SPDK_NVME_SCT_GENERIC; ++ *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; ++ } else { ++ *sct = SPDK_NVME_SCT_GENERIC; ++ *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; ++ } ++ ++ *cdw0 = bdev_io->internal.error.nvme.cdw0; ++} ++ ++void ++spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, ++ int *first_sct, int *first_sc, int *second_sct, int *second_sc) ++{ ++ assert(first_sct != NULL); ++ assert(first_sc != NULL); ++ assert(second_sct != NULL); ++ assert(second_sc != NULL); ++ assert(cdw0 != NULL); ++ ++ if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { ++ if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && ++ bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { ++ *first_sct = bdev_io->internal.error.nvme.sct; ++ *first_sc = bdev_io->internal.error.nvme.sc; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; ++ } else { ++ *first_sct = SPDK_NVME_SCT_GENERIC; ++ *first_sc = SPDK_NVME_SC_SUCCESS; ++ *second_sct = bdev_io->internal.error.nvme.sct; ++ *second_sc = bdev_io->internal.error.nvme.sc; ++ } ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { ++ *first_sct = SPDK_NVME_SCT_GENERIC; ++ *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { ++ *first_sct = SPDK_NVME_SCT_GENERIC; ++ *first_sc = SPDK_NVME_SC_SUCCESS; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_SUCCESS; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { ++ *first_sct = SPDK_NVME_SCT_GENERIC; ++ *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; ++ } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { ++ *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; ++ *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; ++ } else { ++ *first_sct = SPDK_NVME_SCT_GENERIC; ++ *first_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; ++ *second_sct = SPDK_NVME_SCT_GENERIC; ++ *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; ++ } ++ ++ *cdw0 = bdev_io->internal.error.nvme.cdw0; ++} ++ ++struct spdk_thread * ++spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) ++{ ++ return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); ++} ++ ++struct spdk_io_channel * ++spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) ++{ ++ return bdev_io->internal.ch->channel; ++} ++ ++static int ++bdev_register(struct spdk_bdev *bdev) ++{ ++ char *bdev_name; ++ char uuid[SPDK_UUID_STRING_LEN]; ++ int ret; ++ ++ assert(bdev->module != NULL); ++ ++ if (!bdev->name) { ++ SPDK_ERRLOG("Bdev name is NULL\n"); ++ return -EINVAL; ++ } ++ ++ if (!strlen(bdev->name)) { ++ SPDK_ERRLOG("Bdev name must not be an empty string\n"); ++ return -EINVAL; ++ } ++ ++ /* Users often register their own I/O devices using the bdev name. In ++ * order to avoid conflicts, prepend bdev_. */ ++ bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); ++ if (!bdev_name) { ++ SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); ++ return -ENOMEM; ++ } ++ ++ bdev->internal.stat = bdev_alloc_io_stat(true); ++ if (!bdev->internal.stat) { ++ SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); ++ free(bdev_name); ++ return -ENOMEM; ++ } ++ ++ bdev->internal.status = SPDK_BDEV_STATUS_READY; ++ bdev->internal.measured_queue_depth = UINT64_MAX; ++ bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; ++ memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); ++ bdev->internal.qd_poller = NULL; ++ bdev->internal.qos = NULL; ++ ++ TAILQ_INIT(&bdev->internal.open_descs); ++ TAILQ_INIT(&bdev->internal.locked_ranges); ++ TAILQ_INIT(&bdev->internal.pending_locked_ranges); ++ TAILQ_INIT(&bdev->aliases); ++ ++ ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); ++ if (ret != 0) { ++ bdev_free_io_stat(bdev->internal.stat); ++ free(bdev_name); ++ return ret; ++ } ++ ++ /* UUID may be specified by the user or defined by bdev itself. ++ * Otherwise it will be generated here, so this field will never be empty. */ ++ if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { ++ spdk_uuid_generate(&bdev->uuid); ++ } ++ ++ /* Add the UUID alias only if it's different than the name */ ++ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); ++ if (strcmp(bdev->name, uuid) != 0) { ++ ret = spdk_bdev_alias_add(bdev, uuid); ++ if (ret != 0) { ++ SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); ++ bdev_name_del(&bdev->internal.bdev_name); ++ bdev_free_io_stat(bdev->internal.stat); ++ free(bdev_name); ++ return ret; ++ } ++ } ++ ++ if (spdk_bdev_get_buf_align(bdev) > 1) { ++ if (bdev->split_on_optimal_io_boundary) { ++ bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, ++ SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); ++ } else { ++ bdev->split_on_optimal_io_boundary = true; ++ bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; ++ } ++ } ++ ++ /* If the user didn't specify a write unit size, set it to one. 
*/ ++ if (bdev->write_unit_size == 0) { ++ bdev->write_unit_size = 1; ++ } ++ ++ /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ ++ if (bdev->acwu == 0) { ++ bdev->acwu = bdev->write_unit_size; ++ } ++ ++ if (bdev->phys_blocklen == 0) { ++ bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); ++ } ++ ++ bdev->internal.reset_in_progress = NULL; ++ bdev->internal.qd_poll_in_progress = false; ++ bdev->internal.period = 0; ++ bdev->internal.new_period = 0; ++ ++ spdk_io_device_register(__bdev_to_io_dev(bdev), ++ bdev_channel_create, bdev_channel_destroy, ++ sizeof(struct spdk_bdev_channel), ++ bdev_name); ++ ++ free(bdev_name); ++ ++ spdk_spin_init(&bdev->internal.spinlock); ++ ++ SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); ++ TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); ++ ++ return 0; ++} ++ ++static void ++bdev_destroy_cb(void *io_device) ++{ ++ int rc; ++ struct spdk_bdev *bdev; ++ spdk_bdev_unregister_cb cb_fn; ++ void *cb_arg; ++ ++ bdev = __bdev_from_io_dev(io_device); ++ ++ if (bdev->internal.unregister_td != spdk_get_thread()) { ++ spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); ++ return; ++ } ++ ++ cb_fn = bdev->internal.unregister_cb; ++ cb_arg = bdev->internal.unregister_ctx; ++ ++ spdk_spin_destroy(&bdev->internal.spinlock); ++ free(bdev->internal.qos); ++ bdev_free_io_stat(bdev->internal.stat); ++ ++ rc = bdev->fn_table->destruct(bdev->ctxt); ++ if (rc < 0) { ++ SPDK_ERRLOG("destruct failed\n"); ++ } ++ if (rc <= 0 && cb_fn != NULL) { ++ cb_fn(cb_arg, rc); ++ } ++} ++ ++void ++spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) ++{ ++ if (bdev->internal.unregister_cb != NULL) { ++ bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); ++ } ++} ++ ++static void ++_remove_notify(void *arg) ++{ ++ struct spdk_bdev_desc *desc = arg; ++ ++ spdk_spin_lock(&desc->spinlock); ++ desc->refs--; ++ ++ if (!desc->closed) { ++ spdk_spin_unlock(&desc->spinlock); ++ desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); ++ return; ++ } else if (0 == desc->refs) { ++ /* This descriptor was closed after this remove_notify message was sent. ++ * spdk_bdev_close() could not free the descriptor since this message was ++ * in flight, so we free it now using bdev_desc_free(). ++ */ ++ spdk_spin_unlock(&desc->spinlock); ++ bdev_desc_free(desc); ++ return; ++ } ++ spdk_spin_unlock(&desc->spinlock); ++} ++ ++/* returns: 0 - bdev removed and ready to be destructed. ++ * -EBUSY - bdev can't be destructed yet. */ ++static int ++bdev_unregister_unsafe(struct spdk_bdev *bdev) ++{ ++ struct spdk_bdev_desc *desc, *tmp; ++ int rc = 0; ++ char uuid[SPDK_UUID_STRING_LEN]; ++ ++ assert(spdk_spin_held(&g_bdev_mgr.spinlock)); ++ assert(spdk_spin_held(&bdev->internal.spinlock)); ++ ++ /* Notify each descriptor about hotremoval */ ++ TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { ++ rc = -EBUSY; ++ spdk_spin_lock(&desc->spinlock); ++ /* ++ * Defer invocation of the event_cb to a separate message that will ++ * run later on its thread. This ensures this context unwinds and ++ * we don't recursively unregister this bdev again if the event_cb ++ * immediately closes its descriptor. 
++ */ ++ desc->refs++; ++ spdk_thread_send_msg(desc->thread, _remove_notify, desc); ++ spdk_spin_unlock(&desc->spinlock); ++ } ++ ++ /* If there are no descriptors, proceed removing the bdev */ ++ if (rc == 0) { ++ TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); ++ SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); ++ ++ /* Delete the name and the UUID alias */ ++ spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); ++ bdev_name_del_unsafe(&bdev->internal.bdev_name); ++ bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); ++ ++ spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); ++ ++ if (bdev->internal.reset_in_progress != NULL) { ++ /* If reset is in progress, let the completion callback for reset ++ * unregister the bdev. ++ */ ++ rc = -EBUSY; ++ } ++ } ++ ++ return rc; ++} ++ ++static void ++bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); ++ ++ bdev_channel_abort_queued_ios(bdev_ch); ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ int rc; ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ spdk_spin_lock(&bdev->internal.spinlock); ++ /* ++ * Set the status to REMOVING after completing to abort channels. Otherwise, ++ * the last spdk_bdev_close() may call spdk_io_device_unregister() while ++ * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() ++ * may fail. ++ */ ++ bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; ++ rc = bdev_unregister_unsafe(bdev); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ if (rc == 0) { ++ spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); ++ } ++} ++ ++void ++spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) ++{ ++ struct spdk_thread *thread; ++ ++ SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); ++ ++ thread = spdk_get_thread(); ++ if (!thread) { ++ /* The user called this from a non-SPDK thread. 
*/ ++ if (cb_fn != NULL) { ++ cb_fn(cb_arg, -ENOTSUP); ++ } ++ return; ++ } ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || ++ bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ if (cb_fn) { ++ cb_fn(cb_arg, -EBUSY); ++ } ++ return; ++ } ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; ++ bdev->internal.unregister_cb = cb_fn; ++ bdev->internal.unregister_ctx = cb_arg; ++ bdev->internal.unregister_td = thread; ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ spdk_bdev_set_qd_sampling_period(bdev, 0); ++ ++ spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, ++ bdev_unregister); ++} ++ ++int ++spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, ++ spdk_bdev_unregister_cb cb_fn, void *cb_arg) ++{ ++ struct spdk_bdev_desc *desc; ++ struct spdk_bdev *bdev; ++ int rc; ++ ++ rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); ++ return rc; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ ++ if (bdev->module != module) { ++ spdk_bdev_close(desc); ++ SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", ++ bdev_name); ++ return -ENODEV; ++ } ++ ++ spdk_bdev_unregister(bdev, cb_fn, cb_arg); ++ ++ spdk_bdev_close(desc); ++ ++ return 0; ++} ++ ++static int ++bdev_start_qos(struct spdk_bdev *bdev) ++{ ++ struct set_qos_limit_ctx *ctx; ++ ++ /* Enable QoS */ ++ if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); ++ return -ENOMEM; ++ } ++ ctx->bdev = bdev; ++ spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) ++{ ++ struct spdk_thread *thread; ++ int rc = 0; ++ ++ thread = spdk_get_thread(); ++ if (!thread) { ++ SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); ++ return -ENOTSUP; ++ } ++ ++ SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, ++ spdk_get_thread()); ++ ++ desc->bdev = bdev; ++ desc->thread = thread; ++ desc->write = write; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || ++ bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return -ENODEV; ++ } ++ ++ if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { ++ SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", ++ bdev->name, bdev->internal.claim.v1.module->name); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return -EPERM; ++ } ++ ++ rc = bdev_start_qos(bdev); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return rc; ++ } ++ ++ TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ return 0; ++} ++ ++static int ++bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, ++ struct spdk_bdev_desc **_desc) ++{ ++ struct spdk_bdev_desc *desc; ++ unsigned int event_id; ++ ++ desc = calloc(1, sizeof(*desc)); ++ if (desc == 
NULL) { ++ SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INIT(&desc->pending_media_events); ++ TAILQ_INIT(&desc->free_media_events); ++ ++ desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; ++ desc->callback.event_fn = event_cb; ++ desc->callback.ctx = event_ctx; ++ spdk_spin_init(&desc->spinlock); ++ ++ if (bdev->media_events) { ++ desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, ++ sizeof(*desc->media_events_buffer)); ++ if (desc->media_events_buffer == NULL) { ++ SPDK_ERRLOG("Failed to initialize media event pool\n"); ++ bdev_desc_free(desc); ++ return -ENOMEM; ++ } ++ ++ for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { ++ TAILQ_INSERT_TAIL(&desc->free_media_events, ++ &desc->media_events_buffer[event_id], tailq); ++ } ++ } ++ ++ *_desc = desc; ++ ++ return 0; ++} ++ ++int ++spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, ++ void *event_ctx, struct spdk_bdev_desc **_desc) ++{ ++ struct spdk_bdev_desc *desc; ++ struct spdk_bdev *bdev; ++ int rc; ++ ++ if (event_cb == NULL) { ++ SPDK_ERRLOG("Missing event callback function\n"); ++ return -EINVAL; ++ } ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ ++ bdev = bdev_get_by_name(bdev_name); ++ ++ if (bdev == NULL) { ++ SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ return -ENODEV; ++ } ++ ++ rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); ++ if (rc != 0) { ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ return rc; ++ } ++ ++ rc = bdev_open(bdev, write, desc); ++ if (rc != 0) { ++ bdev_desc_free(desc); ++ desc = NULL; ++ } ++ ++ *_desc = desc; ++ ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ return rc; ++} ++ ++static void ++bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) ++{ ++ int rc; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ spdk_spin_lock(&desc->spinlock); ++ ++ TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); ++ ++ desc->closed = true; ++ ++ if (0 == desc->refs) { ++ spdk_spin_unlock(&desc->spinlock); ++ bdev_desc_free(desc); ++ } else { ++ spdk_spin_unlock(&desc->spinlock); ++ } ++ ++ /* If no more descriptors, kill QoS channel */ ++ if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { ++ SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", ++ bdev->name, spdk_get_thread()); ++ ++ if (bdev_qos_destroy(bdev)) { ++ /* There isn't anything we can do to recover here. Just let the ++ * old QoS poller keep running. The QoS handling won't change ++ * cores when the user allocates a new channel, but it won't break. */ ++ SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); ++ } ++ } ++ ++ if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { ++ rc = bdev_unregister_unsafe(bdev); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ if (rc == 0) { ++ spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); ++ } ++ } else { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ } ++} ++ ++void ++spdk_bdev_close(struct spdk_bdev_desc *desc) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ ++ SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, ++ spdk_get_thread()); ++ ++ assert(desc->thread == spdk_get_thread()); ++ ++ spdk_poller_unregister(&desc->io_timeout_poller); ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ ++ bdev_close(bdev, desc); ++ ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++} ++ ++static void ++bdev_register_finished(void *arg) ++{ ++ struct spdk_bdev_desc *desc = arg; ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ ++ spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ ++ bdev_close(bdev, desc); ++ ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++} ++ ++int ++spdk_bdev_register(struct spdk_bdev *bdev) ++{ ++ struct spdk_bdev_desc *desc; ++ int rc; ++ ++ if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { ++ SPDK_LOG_DEPRECATED(bdev_register_examine_thread); ++ } ++ ++ rc = bdev_register(bdev); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ /* A descriptor is opened to prevent bdev deletion during examination */ ++ rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ spdk_bdev_unregister(bdev, NULL, NULL); ++ return rc; ++ } ++ ++ rc = bdev_open(bdev, false, desc); ++ if (rc != 0) { ++ bdev_desc_free(desc); ++ spdk_bdev_unregister(bdev, NULL, NULL); ++ return rc; ++ } ++ ++ /* Examine configuration before initializing I/O */ ++ bdev_examine(bdev); ++ ++ rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); ++ if (rc != 0) { ++ bdev_close(bdev, desc); ++ spdk_bdev_unregister(bdev, NULL, NULL); ++ } ++ ++ return rc; ++} ++ ++int ++spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, ++ struct spdk_bdev_module *module) ++{ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ ++ if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { ++ SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, ++ bdev->internal.claim.v1.module->name); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return -EPERM; ++ } ++ ++ if (desc && !desc->write) { ++ desc->write = true; ++ } ++ ++ bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; ++ bdev->internal.claim.v1.module = module; ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return 0; ++} ++ ++void ++spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) ++{ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ ++ assert(bdev->internal.claim.v1.module != NULL); ++ assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); ++ bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; ++ bdev->internal.claim.v1.module = NULL; ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++} ++ ++struct spdk_bdev * ++spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) ++{ ++ assert(desc != NULL); ++ return desc->bdev; ++} ++ ++int ++spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) ++{ ++ struct spdk_bdev *bdev, *tmp; ++ struct spdk_bdev_desc *desc; ++ int rc = 0; ++ ++ assert(fn != NULL); ++ ++ 
spdk_spin_lock(&g_bdev_mgr.spinlock); ++ bdev = spdk_bdev_first(); ++ while (bdev != NULL) { ++ rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ break; ++ } ++ rc = bdev_open(bdev, false, desc); ++ if (rc != 0) { ++ bdev_desc_free(desc); ++ if (rc == -ENODEV) { ++ /* Ignore the error and move to the next bdev. */ ++ rc = 0; ++ bdev = spdk_bdev_next(bdev); ++ continue; ++ } ++ break; ++ } ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ rc = fn(ctx, bdev); ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ tmp = spdk_bdev_next(bdev); ++ bdev_close(bdev, desc); ++ if (rc != 0) { ++ break; ++ } ++ bdev = tmp; ++ } ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ return rc; ++} ++ ++int ++spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) ++{ ++ struct spdk_bdev *bdev, *tmp; ++ struct spdk_bdev_desc *desc; ++ int rc = 0; ++ ++ assert(fn != NULL); ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ bdev = spdk_bdev_first_leaf(); ++ while (bdev != NULL) { ++ rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ break; ++ } ++ rc = bdev_open(bdev, false, desc); ++ if (rc != 0) { ++ bdev_desc_free(desc); ++ if (rc == -ENODEV) { ++ /* Ignore the error and move to the next bdev. */ ++ rc = 0; ++ bdev = spdk_bdev_next_leaf(bdev); ++ continue; ++ } ++ break; ++ } ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ rc = fn(ctx, bdev); ++ ++ spdk_spin_lock(&g_bdev_mgr.spinlock); ++ tmp = spdk_bdev_next_leaf(bdev); ++ bdev_close(bdev, desc); ++ if (rc != 0) { ++ break; ++ } ++ bdev = tmp; ++ } ++ spdk_spin_unlock(&g_bdev_mgr.spinlock); ++ ++ return rc; ++} ++ ++void ++spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) ++{ ++ struct iovec *iovs; ++ int iovcnt; ++ ++ if (bdev_io == NULL) { ++ return; ++ } ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ iovs = bdev_io->u.bdev.iovs; ++ iovcnt = bdev_io->u.bdev.iovcnt; ++ break; ++ default: ++ iovs = NULL; ++ iovcnt = 0; ++ break; ++ } ++ ++ if (iovp) { ++ *iovp = iovs; ++ } ++ if (iovcntp) { ++ *iovcntp = iovcnt; ++ } ++} ++ ++void * ++spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) ++{ ++ if (bdev_io == NULL) { ++ return NULL; ++ } ++ ++ if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { ++ return NULL; ++ } ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || ++ bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ return bdev_io->u.bdev.md_buf; ++ } ++ ++ return NULL; ++} ++ ++void * ++spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) ++{ ++ if (bdev_io == NULL) { ++ assert(false); ++ return NULL; ++ } ++ ++ return bdev_io->internal.caller_ctx; ++} ++ ++void ++spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) ++{ ++ ++ if (spdk_bdev_module_list_find(bdev_module->name)) { ++ SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); ++ assert(false); ++ } ++ ++ spdk_spin_init(&bdev_module->internal.spinlock); ++ ++ /* ++ * Modules with examine callbacks must be initialized first, so they are ++ * ready to handle examine callbacks from later modules that will ++ * register physical bdevs. 
++ */ ++ if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { ++ TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); ++ } else { ++ TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); ++ } ++} ++ ++struct spdk_bdev_module * ++spdk_bdev_module_list_find(const char *name) ++{ ++ struct spdk_bdev_module *bdev_module; ++ ++ TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { ++ if (strcmp(name, bdev_module->name) == 0) { ++ break; ++ } ++ } ++ ++ return bdev_module; ++} ++ ++static void ++bdev_write_zero_buffer_next(void *_bdev_io) ++{ ++ struct spdk_bdev_io *bdev_io = _bdev_io; ++ uint64_t num_bytes, num_blocks; ++ void *md_buf = NULL; ++ int rc; ++ ++ num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * ++ bdev_io->u.bdev.split_remaining_num_blocks, ++ ZERO_BUFFER_SIZE); ++ num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); ++ num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; ++ ++ if (spdk_bdev_is_md_separate(bdev_io->bdev)) { ++ md_buf = (char *)g_bdev_mgr.zero_buffer + ++ spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; ++ } ++ ++ rc = bdev_write_blocks_with_md(bdev_io->internal.desc, ++ spdk_io_channel_from_ctx(bdev_io->internal.ch), ++ g_bdev_mgr.zero_buffer, md_buf, ++ bdev_io->u.bdev.split_current_offset_blocks, num_blocks, ++ bdev_write_zero_buffer_done, bdev_io); ++ if (rc == 0) { ++ bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; ++ bdev_io->u.bdev.split_current_offset_blocks += num_blocks; ++ } else if (rc == -ENOMEM) { ++ bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); ++ } else { ++ bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); ++ } ++} ++ ++static void ++bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *parent_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; ++ parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); ++ return; ++ } ++ ++ if (parent_io->u.bdev.split_remaining_num_blocks == 0) { ++ parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); ++ return; ++ } ++ ++ bdev_write_zero_buffer_next(parent_io); ++} ++ ++static void ++bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) ++{ ++ spdk_spin_lock(&ctx->bdev->internal.spinlock); ++ ctx->bdev->internal.qos_mod_in_progress = false; ++ spdk_spin_unlock(&ctx->bdev->internal.spinlock); ++ ++ if (ctx->cb_fn) { ++ ctx->cb_fn(ctx->cb_arg, status); ++ } ++ free(ctx); ++} ++ ++static void ++bdev_disable_qos_done(void *cb_arg) ++{ ++ struct set_qos_limit_ctx *ctx = cb_arg; ++ struct spdk_bdev *bdev = ctx->bdev; ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_qos *qos; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ qos = bdev->internal.qos; ++ bdev->internal.qos = NULL; ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ while (!TAILQ_EMPTY(&qos->queued)) { ++ /* Send queued I/O back to their original thread for resubmission. */ ++ bdev_io = TAILQ_FIRST(&qos->queued); ++ TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); ++ ++ if (bdev_io->internal.io_submit_ch) { ++ /* ++ * Channel was changed when sending it to the QoS thread - change it back ++ * before sending it back to the original thread. 
++ */ ++ bdev_io->internal.ch = bdev_io->internal.io_submit_ch; ++ bdev_io->internal.io_submit_ch = NULL; ++ } ++ ++ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), ++ _bdev_io_submit, bdev_io); ++ } ++ ++ if (qos->thread != NULL) { ++ spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); ++ spdk_poller_unregister(&qos->poller); ++ } ++ ++ free(qos); ++ ++ bdev_set_qos_limit_done(ctx, 0); ++} ++ ++static void ++bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct set_qos_limit_ctx *ctx = _ctx; ++ struct spdk_thread *thread; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ thread = bdev->internal.qos->thread; ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ if (thread != NULL) { ++ spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); ++ } else { ++ bdev_disable_qos_done(ctx); ++ } ++} ++ ++static void ++bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); ++ ++ bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_update_qos_rate_limit_msg(void *cb_arg) ++{ ++ struct set_qos_limit_ctx *ctx = cb_arg; ++ struct spdk_bdev *bdev = ctx->bdev; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ bdev_set_qos_limit_done(ctx, 0); ++} ++ ++static void ++bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ bdev_enable_qos(bdev, bdev_ch); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct set_qos_limit_ctx *ctx = _ctx; ++ ++ bdev_set_qos_limit_done(ctx, status); ++} ++ ++static void ++bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) ++{ ++ int i; ++ ++ assert(bdev->internal.qos != NULL); ++ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { ++ bdev->internal.qos->rate_limits[i].limit = limits[i]; ++ ++ if (limits[i] == 0) { ++ bdev->internal.qos->rate_limits[i].limit = ++ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; ++ } ++ } ++ } ++} ++ ++void ++spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, ++ void (*cb_fn)(void *cb_arg, int status), void *cb_arg) ++{ ++ struct set_qos_limit_ctx *ctx; ++ uint32_t limit_set_complement; ++ uint64_t min_limit_per_sec; ++ int i; ++ bool disable_rate_limit = true; ++ ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { ++ continue; ++ } ++ ++ if (limits[i] > 0) { ++ disable_rate_limit = false; ++ } ++ ++ if (bdev_qos_is_iops_rate_limit(i) == true) { ++ min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; ++ } else { ++ /* Change from megabyte to byte rate limit */ ++ limits[i] = limits[i] * 1024 * 1024; ++ min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; ++ } ++ ++ limit_set_complement = limits[i] % min_limit_per_sec; ++ if (limit_set_complement) { ++ SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", ++ limits[i], min_limit_per_sec); ++ limits[i] += min_limit_per_sec - limit_set_complement; 
++ SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); ++ } ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ cb_fn(cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ctx->bdev = bdev; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev->internal.qos_mod_in_progress) { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ free(ctx); ++ cb_fn(cb_arg, -EAGAIN); ++ return; ++ } ++ bdev->internal.qos_mod_in_progress = true; ++ ++ if (disable_rate_limit == true && bdev->internal.qos) { ++ for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { ++ if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && ++ (bdev->internal.qos->rate_limits[i].limit > 0 && ++ bdev->internal.qos->rate_limits[i].limit != ++ SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { ++ disable_rate_limit = false; ++ break; ++ } ++ } ++ } ++ ++ if (disable_rate_limit == false) { ++ if (bdev->internal.qos == NULL) { ++ bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); ++ if (!bdev->internal.qos) { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); ++ bdev_set_qos_limit_done(ctx, -ENOMEM); ++ return; ++ } ++ } ++ ++ if (bdev->internal.qos->thread == NULL) { ++ /* Enabling */ ++ bdev_set_qos_rate_limits(bdev, limits); ++ ++ spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, ++ bdev_enable_qos_done); ++ } else { ++ /* Updating */ ++ bdev_set_qos_rate_limits(bdev, limits); ++ ++ spdk_thread_send_msg(bdev->internal.qos->thread, ++ bdev_update_qos_rate_limit_msg, ctx); ++ } ++ } else { ++ if (bdev->internal.qos != NULL) { ++ bdev_set_qos_rate_limits(bdev, limits); ++ ++ /* Disabling */ ++ spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, ++ bdev_disable_qos_msg_done); ++ } else { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ bdev_set_qos_limit_done(ctx, 0); ++ return; ++ } ++ } ++ ++ spdk_spin_unlock(&bdev->internal.spinlock); ++} ++ ++struct spdk_bdev_histogram_ctx { ++ spdk_bdev_histogram_status_cb cb_fn; ++ void *cb_arg; ++ struct spdk_bdev *bdev; ++ int status; ++}; ++ ++static void ++bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_histogram_ctx *ctx = _ctx; ++ ++ spdk_spin_lock(&ctx->bdev->internal.spinlock); ++ ctx->bdev->internal.histogram_in_progress = false; ++ spdk_spin_unlock(&ctx->bdev->internal.spinlock); ++ ctx->cb_fn(ctx->cb_arg, ctx->status); ++ free(ctx); ++} ++ ++static void ++bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ ++ if (ch->histogram != NULL) { ++ spdk_histogram_data_free(ch->histogram); ++ ch->histogram = NULL; ++ } ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_histogram_ctx *ctx = _ctx; ++ ++ if (status != 0) { ++ ctx->status = status; ++ ctx->bdev->internal.histogram_enabled = false; ++ spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, ++ bdev_histogram_disable_channel_cb); ++ } else { ++ spdk_spin_lock(&ctx->bdev->internal.spinlock); ++ ctx->bdev->internal.histogram_in_progress = false; ++ spdk_spin_unlock(&ctx->bdev->internal.spinlock); ++ ctx->cb_fn(ctx->cb_arg, ctx->status); ++ free(ctx); ++ } ++} ++ ++static void ++bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev 
*bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ int status = 0; ++ ++ if (ch->histogram == NULL) { ++ ch->histogram = spdk_histogram_data_alloc(); ++ if (ch->histogram == NULL) { ++ status = -ENOMEM; ++ } ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, status); ++} ++ ++void ++spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, ++ void *cb_arg, bool enable) ++{ ++ struct spdk_bdev_histogram_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); ++ if (ctx == NULL) { ++ cb_fn(cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->bdev = bdev; ++ ctx->status = 0; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev->internal.histogram_in_progress) { ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ free(ctx); ++ cb_fn(cb_arg, -EAGAIN); ++ return; ++ } ++ ++ bdev->internal.histogram_in_progress = true; ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ bdev->internal.histogram_enabled = enable; ++ ++ if (enable) { ++ /* Allocate histogram for each channel */ ++ spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, ++ bdev_histogram_enable_channel_cb); ++ } else { ++ spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, ++ bdev_histogram_disable_channel_cb); ++ } ++} ++ ++struct spdk_bdev_histogram_data_ctx { ++ spdk_bdev_histogram_data_cb cb_fn; ++ void *cb_arg; ++ struct spdk_bdev *bdev; ++ /** merged histogram data from all channels */ ++ struct spdk_histogram_data *histogram; ++}; ++ ++static void ++bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_histogram_data_ctx *ctx = _ctx; ++ ++ ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); ++ free(ctx); ++} ++ ++static void ++bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct spdk_bdev_histogram_data_ctx *ctx = _ctx; ++ int status = 0; ++ ++ if (ch->histogram == NULL) { ++ status = -EFAULT; ++ } else { ++ spdk_histogram_data_merge(ctx->histogram, ch->histogram); ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, status); ++} ++ ++void ++spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, ++ spdk_bdev_histogram_data_cb cb_fn, ++ void *cb_arg) ++{ ++ struct spdk_bdev_histogram_data_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); ++ if (ctx == NULL) { ++ cb_fn(cb_arg, -ENOMEM, NULL); ++ return; ++ } ++ ++ ctx->bdev = bdev; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ ctx->histogram = histogram; ++ ++ spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, ++ bdev_histogram_get_channel_cb); ++} ++ ++void ++spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, ++ void *cb_arg) ++{ ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); ++ int status = 0; ++ ++ assert(cb_fn != NULL); ++ ++ if (bdev_ch->histogram == NULL) { ++ status = -EFAULT; ++ } ++ cb_fn(cb_arg, status, bdev_ch->histogram); ++} ++ ++size_t ++spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, ++ size_t max_events) ++{ ++ struct media_event_entry *entry; ++ size_t num_events = 0; ++ ++ for (; num_events < max_events; ++num_events) { ++ entry = TAILQ_FIRST(&desc->pending_media_events); ++ if (entry == NULL) { ++ 
break; ++ } ++ ++ events[num_events] = entry->event; ++ TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); ++ TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); ++ } ++ ++ return num_events; ++} ++ ++int ++spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, ++ size_t num_events) ++{ ++ struct spdk_bdev_desc *desc; ++ struct media_event_entry *entry; ++ size_t event_id; ++ int rc = 0; ++ ++ assert(bdev->media_events); ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { ++ if (desc->write) { ++ break; ++ } ++ } ++ ++ if (desc == NULL || desc->media_events_buffer == NULL) { ++ rc = -ENODEV; ++ goto out; ++ } ++ ++ for (event_id = 0; event_id < num_events; ++event_id) { ++ entry = TAILQ_FIRST(&desc->free_media_events); ++ if (entry == NULL) { ++ break; ++ } ++ ++ TAILQ_REMOVE(&desc->free_media_events, entry, tailq); ++ TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); ++ entry->event = events[event_id]; ++ } ++ ++ rc = event_id; ++out: ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return rc; ++} ++ ++void ++spdk_bdev_notify_media_management(struct spdk_bdev *bdev) ++{ ++ struct spdk_bdev_desc *desc; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { ++ if (!TAILQ_EMPTY(&desc->pending_media_events)) { ++ desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, ++ desc->callback.ctx); ++ } ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++} ++ ++struct locked_lba_range_ctx { ++ struct lba_range range; ++ struct spdk_bdev *bdev; ++ struct lba_range *current_range; ++ struct lba_range *owner_range; ++ struct spdk_poller *poller; ++ lock_range_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct locked_lba_range_ctx *ctx = _ctx; ++ ++ ctx->cb_fn(ctx->cb_arg, -ENOMEM); ++ free(ctx); ++} ++ ++static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, ++ struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); ++ ++static void ++bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct locked_lba_range_ctx *ctx = _ctx; ++ ++ if (status == -ENOMEM) { ++ /* One of the channels could not allocate a range object. ++ * So we have to go back and clean up any ranges that were ++ * allocated successfully before we return error status to ++ * the caller. We can reuse the unlock function to do that ++ * clean up. ++ */ ++ spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, ++ bdev_lock_error_cleanup_cb); ++ return; ++ } ++ ++ /* All channels have locked this range and no I/O overlapping the range ++ * are outstanding! Set the owner_ch for the range object for the ++ * locking channel, so that this channel will know that it is allowed ++ * to write to this range. ++ */ ++ ctx->owner_range->owner_ch = ctx->range.owner_ch; ++ ctx->cb_fn(ctx->cb_arg, status); ++ ++ /* Don't free the ctx here. Its range is in the bdev's global list of ++ * locked ranges still, and will be removed and freed when this range ++ * is later unlocked. 
++ */ ++} ++ ++static int ++bdev_lock_lba_range_check_io(void *_i) ++{ ++ struct spdk_bdev_channel_iter *i = _i; ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct locked_lba_range_ctx *ctx = i->ctx; ++ struct lba_range *range = ctx->current_range; ++ struct spdk_bdev_io *bdev_io; ++ ++ spdk_poller_unregister(&ctx->poller); ++ ++ /* The range is now in the locked_ranges, so no new IO can be submitted to this ++ * range. But we need to wait until any outstanding IO overlapping with this range ++ * are completed. ++ */ ++ TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { ++ if (bdev_io_range_is_locked(bdev_io, range)) { ++ ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); ++ return SPDK_POLLER_BUSY; ++ } ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct locked_lba_range_ctx *ctx = _ctx; ++ struct lba_range *range; ++ ++ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { ++ if (range->length == ctx->range.length && ++ range->offset == ctx->range.offset && ++ range->locked_ctx == ctx->range.locked_ctx) { ++ /* This range already exists on this channel, so don't add ++ * it again. This can happen when a new channel is created ++ * while the for_each_channel operation is in progress. ++ * Do not check for outstanding I/O in that case, since the ++ * range was locked before any I/O could be submitted to the ++ * new channel. ++ */ ++ spdk_bdev_for_each_channel_continue(i, 0); ++ return; ++ } ++ } ++ ++ range = calloc(1, sizeof(*range)); ++ if (range == NULL) { ++ spdk_bdev_for_each_channel_continue(i, -ENOMEM); ++ return; ++ } ++ ++ range->length = ctx->range.length; ++ range->offset = ctx->range.offset; ++ range->locked_ctx = ctx->range.locked_ctx; ++ ctx->current_range = range; ++ if (ctx->range.owner_ch == ch) { ++ /* This is the range object for the channel that will hold ++ * the lock. Store it in the ctx object so that we can easily ++ * set its owner_ch after the lock is finally acquired. ++ */ ++ ctx->owner_range = range; ++ } ++ TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); ++ bdev_lock_lba_range_check_io(i); ++} ++ ++static void ++bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) ++{ ++ assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); ++ ++ /* We will add a copy of this range to each channel now. 
*/ ++ spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, ++ bdev_lock_lba_range_cb); ++} ++ ++static bool ++bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) ++{ ++ struct lba_range *r; ++ ++ TAILQ_FOREACH(r, tailq, tailq) { ++ if (bdev_lba_range_overlapped(range, r)) { ++ return true; ++ } ++ } ++ return false; ++} ++ ++static int ++bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, ++ uint64_t offset, uint64_t length, ++ lock_range_cb cb_fn, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct locked_lba_range_ctx *ctx; ++ ++ if (cb_arg == NULL) { ++ SPDK_ERRLOG("cb_arg must not be NULL\n"); ++ return -EINVAL; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ return -ENOMEM; ++ } ++ ++ ctx->range.offset = offset; ++ ctx->range.length = length; ++ ctx->range.owner_ch = ch; ++ ctx->range.locked_ctx = cb_arg; ++ ctx->bdev = bdev; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { ++ /* There is an active lock overlapping with this range. ++ * Put it on the pending list until this range no ++ * longer overlaps with another. ++ */ ++ TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); ++ } else { ++ TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); ++ bdev_lock_lba_range_ctx(bdev, ctx); ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return 0; ++} ++ ++static void ++bdev_lock_lba_range_ctx_msg(void *_ctx) ++{ ++ struct locked_lba_range_ctx *ctx = _ctx; ++ ++ bdev_lock_lba_range_ctx(ctx->bdev, ctx); ++} ++ ++static void ++bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct locked_lba_range_ctx *ctx = _ctx; ++ struct locked_lba_range_ctx *pending_ctx; ++ struct lba_range *range, *tmp; ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ /* Check if there are any pending locked ranges that overlap with this range ++ * that was just unlocked. If there are, check that it doesn't overlap with any ++ * other locked ranges before calling bdev_lock_lba_range_ctx which will start ++ * the lock process. 
++ */ ++ TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { ++ if (bdev_lba_range_overlapped(range, &ctx->range) && ++ !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { ++ TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); ++ pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); ++ TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); ++ spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), ++ bdev_lock_lba_range_ctx_msg, pending_ctx); ++ } ++ } ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ ctx->cb_fn(ctx->cb_arg, status); ++ free(ctx); ++} ++ ++static void ++bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *_ch, void *_ctx) ++{ ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct locked_lba_range_ctx *ctx = _ctx; ++ TAILQ_HEAD(, spdk_bdev_io) io_locked; ++ struct spdk_bdev_io *bdev_io; ++ struct lba_range *range; ++ ++ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { ++ if (ctx->range.offset == range->offset && ++ ctx->range.length == range->length && ++ ctx->range.locked_ctx == range->locked_ctx) { ++ TAILQ_REMOVE(&ch->locked_ranges, range, tailq); ++ free(range); ++ break; ++ } ++ } ++ ++ /* Note: we should almost always be able to assert that the range specified ++ * was found. But there are some very rare corner cases where a new channel ++ * gets created simultaneously with a range unlock, where this function ++ * would execute on that new channel and wouldn't have the range. ++ * We also use this to clean up range allocations when a later allocation ++ * fails in the locking path. ++ * So we can't actually assert() here. ++ */ ++ ++ /* Swap the locked IO into a temporary list, and then try to submit them again. ++ * We could hyper-optimize this to only resubmit locked I/O that overlap ++ * with the range that was just unlocked, but this isn't a performance path so ++ * we go for simplicity here. ++ */ ++ TAILQ_INIT(&io_locked); ++ TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); ++ while (!TAILQ_EMPTY(&io_locked)) { ++ bdev_io = TAILQ_FIRST(&io_locked); ++ TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); ++ bdev_io_submit(bdev_io); ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, 0); ++} ++ ++static int ++bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, ++ uint64_t offset, uint64_t length, ++ lock_range_cb cb_fn, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); ++ struct locked_lba_range_ctx *ctx; ++ struct lba_range *range; ++ bool range_found = false; ++ ++ /* Let's make sure the specified channel actually has a lock on ++ * the specified range. Note that the range must match exactly. ++ */ ++ TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { ++ if (range->offset == offset && range->length == length && ++ range->owner_ch == ch && range->locked_ctx == cb_arg) { ++ range_found = true; ++ break; ++ } ++ } ++ ++ if (!range_found) { ++ return -EINVAL; ++ } ++ ++ spdk_spin_lock(&bdev->internal.spinlock); ++ /* We confirmed that this channel has locked the specified range. To ++ * start the unlock the process, we find the range in the bdev's locked_ranges ++ * and remove it. This ensures new channels don't inherit the locked range. 
++ * Then we will send a message to each channel (including the one specified ++ * here) to remove the range from its per-channel list. ++ */ ++ TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { ++ if (range->offset == offset && range->length == length && ++ range->locked_ctx == cb_arg) { ++ break; ++ } ++ } ++ if (range == NULL) { ++ assert(false); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ return -EINVAL; ++ } ++ TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); ++ ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); ++ spdk_spin_unlock(&bdev->internal.spinlock); ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, ++ bdev_unlock_lba_range_cb); ++ return 0; ++} ++ ++int ++spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, ++ int array_size) ++{ ++ if (!bdev) { ++ return -EINVAL; ++ } ++ ++ if (bdev->fn_table->get_memory_domains) { ++ return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); ++ } ++ ++ return 0; ++} ++ ++struct spdk_bdev_for_each_io_ctx { ++ void *ctx; ++ spdk_bdev_io_fn fn; ++ spdk_bdev_for_each_io_cb cb; ++}; ++ ++static void ++bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, ++ struct spdk_io_channel *io_ch, void *_ctx) ++{ ++ struct spdk_bdev_for_each_io_ctx *ctx = _ctx; ++ struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); ++ struct spdk_bdev_io *bdev_io; ++ int rc = 0; ++ ++ TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { ++ rc = ctx->fn(ctx->ctx, bdev_io); ++ if (rc != 0) { ++ break; ++ } ++ } ++ ++ spdk_bdev_for_each_channel_continue(i, rc); ++} ++ ++static void ++bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) ++{ ++ struct spdk_bdev_for_each_io_ctx *ctx = _ctx; ++ ++ ctx->cb(ctx->ctx, status); ++ ++ free(ctx); ++} ++ ++void ++spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, ++ spdk_bdev_for_each_io_cb cb) ++{ ++ struct spdk_bdev_for_each_io_ctx *ctx; ++ ++ assert(fn != NULL && cb != NULL); ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to allocate context.\n"); ++ cb(_ctx, -ENOMEM); ++ return; ++ } ++ ++ ctx->ctx = _ctx; ++ ctx->fn = fn; ++ ctx->cb = cb; ++ ++ spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, ++ bdev_for_each_io_done); ++} ++ ++void ++spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) ++{ ++ spdk_for_each_channel_continue(iter->i, status); ++} ++ ++static struct spdk_bdev * ++io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) ++{ ++ void *io_device = spdk_io_channel_iter_get_io_device(i); ++ ++ return __bdev_from_io_dev(io_device); ++} ++ ++static void ++bdev_each_channel_msg(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); ++ struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); ++ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); ++ ++ iter->i = i; ++ iter->fn(iter, bdev, ch, iter->ctx); ++} ++ ++static void ++bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) ++{ ++ struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); ++ struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); ++ ++ iter->i = i; ++ iter->cpl(bdev, iter->ctx, status); ++ ++ free(iter); ++} ++ ++void ++spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, ++ void 
*ctx, spdk_bdev_for_each_channel_done cpl) ++{ ++ struct spdk_bdev_channel_iter *iter; ++ ++ assert(bdev != NULL && fn != NULL && ctx != NULL); ++ ++ iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); ++ if (iter == NULL) { ++ SPDK_ERRLOG("Unable to allocate iterator\n"); ++ assert(false); ++ return; ++ } ++ ++ iter->fn = fn; ++ iter->cpl = cpl; ++ iter->ctx = ctx; ++ ++ spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, ++ iter, bdev_each_channel_cpl); ++} ++ ++int ++spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, ++ uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, ++ spdk_bdev_io_completion_cb cb, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); ++ ++ if (!desc->write) { ++ return -EBADF; ++ } ++ ++ if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { ++ SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); ++ return -ENOTSUP; ++ } ++ ++ if (num_blocks == 0) { ++ SPDK_ERRLOG("Can't copy 0 blocks\n"); ++ return -EINVAL; ++ } ++ ++ if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || ++ !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { ++ SPDK_DEBUGLOG(bdev, ++ "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", ++ dst_offset_blocks, src_offset_blocks, num_blocks); ++ return -EINVAL; ++ } ++ ++ bdev_io = bdev_channel_get_io(channel); ++ if (!bdev_io) { ++ return -ENOMEM; ++ } ++ ++ bdev_io->internal.ch = channel; ++ bdev_io->internal.desc = desc; ++ bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; ++ ++ bdev_io->u.bdev.offset_blocks = dst_offset_blocks; ++ bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; ++ bdev_io->u.bdev.num_blocks = num_blocks; ++ bdev_io->u.bdev.ext_opts = NULL; ++ bdev_io_init(bdev_io, bdev, cb_arg, cb); ++ ++ bdev_io_submit(bdev_io); ++ return 0; ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev) ++ ++SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) ++{ ++ struct spdk_trace_tpoint_opts opts[] = { ++ { ++ "BDEV_IO_START", TRACE_BDEV_IO_START, ++ OWNER_BDEV, OBJECT_BDEV_IO, 1, ++ { ++ { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, ++ { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, ++ { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, ++ { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, ++ { "name", SPDK_TRACE_ARG_TYPE_STR, 40} ++ } ++ }, ++ { ++ "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, ++ OWNER_BDEV, OBJECT_BDEV_IO, 0, ++ {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} ++ }, ++ { ++ "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, ++ OWNER_BDEV, OBJECT_NONE, 1, ++ { ++ { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, ++ { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} ++ } ++ }, ++ { ++ "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, ++ OWNER_BDEV, OBJECT_NONE, 0, ++ { ++ { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, ++ { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} ++ } ++ }, ++ }; ++ ++ ++ spdk_trace_register_owner(OWNER_BDEV, 'b'); ++ spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); ++ spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); ++ spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); ++ spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); ++} +diff --git a/lib/event/app.c b/lib/event/app.c +index 72ff985..3f1a9c6 100644 +--- a/lib/event/app.c ++++ b/lib/event/app.c +@@ -1,1323 +1,1336 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. 
All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/version.h" +- +-#include "spdk_internal/event.h" +- +-#include "spdk/assert.h" +-#include "spdk/env.h" +-#include "spdk/init.h" +-#include "spdk/log.h" +-#include "spdk/thread.h" +-#include "spdk/trace.h" +-#include "spdk/string.h" +-#include "spdk/scheduler.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +- +-#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE +-#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO +-#define SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES SPDK_DEFAULT_NUM_TRACE_ENTRIES +- +-#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1 +-#define SPDK_APP_DPDK_DEFAULT_MAIN_CORE -1 +-#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 +-#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" +-#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 +-#define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */ +- +-#define MAX_CPU_CORES 128 +- +-struct spdk_app { +- const char *json_config_file; +- bool json_config_ignore_errors; +- bool stopped; +- const char *rpc_addr; +- const char **rpc_allowlist; +- int shm_id; +- spdk_app_shutdown_cb shutdown_cb; +- int rc; +-}; +- +-static struct spdk_app g_spdk_app; +-static spdk_msg_fn g_start_fn = NULL; +-static void *g_start_arg = NULL; +-static struct spdk_thread *g_app_thread = NULL; +-static bool g_delay_subsystem_init = false; +-static bool g_shutdown_sig_received = false; +-static char *g_executable_name; +-static struct spdk_app_opts g_default_opts; +-static bool g_disable_cpumask_locks = false; +- +-static int g_core_locks[MAX_CPU_CORES]; +- +-int +-spdk_app_get_shm_id(void) +-{ +- return g_spdk_app.shm_id; +-} +- +-/* append one empty option to indicate the end of the array */ +-static const struct option g_cmdline_options[] = { +-#define CONFIG_FILE_OPT_IDX 'c' +- {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX}, +-#define LIMIT_COREDUMP_OPT_IDX 'd' +- {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX}, +-#define TPOINT_GROUP_OPT_IDX 'e' +- {"tpoint-group", required_argument, NULL, TPOINT_GROUP_OPT_IDX}, +-#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g' +- {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX}, +-#define HELP_OPT_IDX 'h' +- {"help", no_argument, NULL, HELP_OPT_IDX}, +-#define SHM_ID_OPT_IDX 'i' +- {"shm-id", required_argument, NULL, SHM_ID_OPT_IDX}, +-#define CPUMASK_OPT_IDX 'm' +- {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX}, +-#define MEM_CHANNELS_OPT_IDX 'n' +- {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX}, +-#define MAIN_CORE_OPT_IDX 'p' +- {"main-core", required_argument, NULL, MAIN_CORE_OPT_IDX}, +- {"master-core", required_argument, NULL, MAIN_CORE_OPT_IDX}, /* deprecated */ +-#define RPC_SOCKET_OPT_IDX 'r' +- {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX}, +-#define MEM_SIZE_OPT_IDX 's' +- {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX}, +-#define NO_PCI_OPT_IDX 'u' +- {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX}, +-#define VERSION_OPT_IDX 'v' +- {"version", no_argument, NULL, VERSION_OPT_IDX}, +-#define PCI_BLOCKED_OPT_IDX 'B' +- {"pci-blocked", required_argument, NULL, PCI_BLOCKED_OPT_IDX}, +- {"pci-blacklist", required_argument, NULL, PCI_BLOCKED_OPT_IDX}, /* deprecated */ +-#define LOGFLAG_OPT_IDX 'L' +- {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX}, +-#define HUGE_UNLINK_OPT_IDX 'R' +- {"huge-unlink", 
no_argument, NULL, HUGE_UNLINK_OPT_IDX}, +-#define PCI_ALLOWED_OPT_IDX 'A' +- {"pci-allowed", required_argument, NULL, PCI_ALLOWED_OPT_IDX}, +-#define PCI_WHITELIST_OPT_IDX 'W' +- {"pci-whitelist", required_argument, NULL, PCI_WHITELIST_OPT_IDX}, /* deprecated */ +-#define SILENCE_NOTICELOG_OPT_IDX 257 +- {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX}, +-#define WAIT_FOR_RPC_OPT_IDX 258 +- {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX}, +-#define HUGE_DIR_OPT_IDX 259 +- {"huge-dir", required_argument, NULL, HUGE_DIR_OPT_IDX}, +-#define NUM_TRACE_ENTRIES_OPT_IDX 260 +- {"num-trace-entries", required_argument, NULL, NUM_TRACE_ENTRIES_OPT_IDX}, +-#define MAX_REACTOR_DELAY_OPT_IDX 261 +- {"max-delay", required_argument, NULL, MAX_REACTOR_DELAY_OPT_IDX}, +-#define JSON_CONFIG_OPT_IDX 262 +- {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX}, +-#define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263 +- {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX}, +-#define IOVA_MODE_OPT_IDX 264 +- {"iova-mode", required_argument, NULL, IOVA_MODE_OPT_IDX}, +-#define BASE_VIRTADDR_OPT_IDX 265 +- {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX}, +-#define ENV_CONTEXT_OPT_IDX 266 +- {"env-context", required_argument, NULL, ENV_CONTEXT_OPT_IDX}, +-#define DISABLE_CPUMASK_LOCKS_OPT_IDX 267 +- {"disable-cpumask-locks", no_argument, NULL, DISABLE_CPUMASK_LOCKS_OPT_IDX}, +-#define RPCS_ALLOWED_OPT_IDX 268 +- {"rpcs-allowed", required_argument, NULL, RPCS_ALLOWED_OPT_IDX}, +-#define ENV_VF_TOKEN_OPT_IDX 269 +- {"vfio-vf-token", required_argument, NULL, ENV_VF_TOKEN_OPT_IDX}, +-#define MSG_MEMPOOL_SIZE_OPT_IDX 270 +- {"msg-mempool-size", required_argument, NULL, MSG_MEMPOOL_SIZE_OPT_IDX}, +-}; +- +-static void +-app_start_shutdown(void *ctx) +-{ +- if (g_spdk_app.shutdown_cb) { +- g_spdk_app.shutdown_cb(); +- g_spdk_app.shutdown_cb = NULL; +- } else { +- spdk_app_stop(0); +- } +-} +- +-void +-spdk_app_start_shutdown(void) +-{ +- spdk_thread_send_critical_msg(g_app_thread, app_start_shutdown); +-} +- +-static void +-__shutdown_signal(int signo) +-{ +- if (!g_shutdown_sig_received) { +- g_shutdown_sig_received = true; +- spdk_app_start_shutdown(); +- } +-} +- +-static int +-app_opts_validate(const char *app_opts) +-{ +- int i = 0, j; +- +- for (i = 0; app_opts[i] != '\0'; i++) { +- /* ignore getopt control characters */ +- if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') { +- continue; +- } +- +- for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) { +- if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) { +- return app_opts[i]; +- } +- } +- } +- return 0; +-} +- +-void +-spdk_app_opts_init(struct spdk_app_opts *opts, size_t opts_size) +-{ +- if (!opts) { +- SPDK_ERRLOG("opts should not be NULL\n"); +- return; +- } +- +- if (!opts_size) { +- SPDK_ERRLOG("opts_size should not be zero value\n"); +- return; +- } +- +- memset(opts, 0, opts_size); +- opts->opts_size = opts_size; +- +-#define SET_FIELD(field, value) \ +- if (offsetof(struct spdk_app_opts, field) + sizeof(opts->field) <= opts_size) { \ +- opts->field = value; \ +- } \ +- +- SET_FIELD(enable_coredump, true); +- SET_FIELD(shm_id, -1); +- SET_FIELD(mem_size, SPDK_APP_DPDK_DEFAULT_MEM_SIZE); +- SET_FIELD(main_core, SPDK_APP_DPDK_DEFAULT_MAIN_CORE); +- SET_FIELD(mem_channel, SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL); +- SET_FIELD(reactor_mask, SPDK_APP_DPDK_DEFAULT_CORE_MASK); +- SET_FIELD(base_virtaddr, SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR); +- SET_FIELD(print_level, 
SPDK_APP_DEFAULT_LOG_PRINT_LEVEL); +- SET_FIELD(rpc_addr, SPDK_DEFAULT_RPC_ADDR); +- SET_FIELD(num_entries, SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); +- SET_FIELD(delay_subsystem_init, false); +- SET_FIELD(disable_signal_handlers, false); +- SET_FIELD(msg_mempool_size, SPDK_DEFAULT_MSG_MEMPOOL_SIZE); +- SET_FIELD(rpc_allowlist, NULL); +-#undef SET_FIELD +-} +- +-static int +-app_setup_signal_handlers(struct spdk_app_opts *opts) +-{ +- struct sigaction sigact; +- sigset_t sigmask; +- int rc; +- +- sigemptyset(&sigmask); +- memset(&sigact, 0, sizeof(sigact)); +- sigemptyset(&sigact.sa_mask); +- +- sigact.sa_handler = SIG_IGN; +- rc = sigaction(SIGPIPE, &sigact, NULL); +- if (rc < 0) { +- SPDK_ERRLOG("sigaction(SIGPIPE) failed\n"); +- return rc; +- } +- +- /* Install the same handler for SIGINT and SIGTERM */ +- g_shutdown_sig_received = false; +- sigact.sa_handler = __shutdown_signal; +- rc = sigaction(SIGINT, &sigact, NULL); +- if (rc < 0) { +- SPDK_ERRLOG("sigaction(SIGINT) failed\n"); +- return rc; +- } +- sigaddset(&sigmask, SIGINT); +- +- rc = sigaction(SIGTERM, &sigact, NULL); +- if (rc < 0) { +- SPDK_ERRLOG("sigaction(SIGTERM) failed\n"); +- return rc; +- } +- sigaddset(&sigmask, SIGTERM); +- +- pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); +- +- return 0; +-} +- +-static void +-app_start_application(void) +-{ +- assert(spdk_get_thread() == g_app_thread); +- +- g_start_fn(g_start_arg); +-} +- +-static void +-app_start_rpc(int rc, void *arg1) +-{ +- if (rc) { +- spdk_app_stop(rc); +- return; +- } +- +- spdk_rpc_set_allowlist(g_spdk_app.rpc_allowlist); +- +- rc = spdk_rpc_initialize(g_spdk_app.rpc_addr); +- if (rc) { +- spdk_app_stop(rc); +- return; +- } +- +- if (!g_delay_subsystem_init) { +- spdk_rpc_set_state(SPDK_RPC_RUNTIME); +- app_start_application(); +- } +-} +- +-static int +-app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf) +-{ +- struct spdk_pci_addr *tmp = *list; +- size_t i = opts->num_pci_addr; +- +- tmp = realloc(tmp, sizeof(*tmp) * (i + 1)); +- if (tmp == NULL) { +- SPDK_ERRLOG("realloc error\n"); +- return -ENOMEM; +- } +- +- *list = tmp; +- if (spdk_pci_addr_parse(*list + i, bdf) < 0) { +- SPDK_ERRLOG("Invalid address %s\n", bdf); +- return -EINVAL; +- } +- +- opts->num_pci_addr++; +- return 0; +-} +- +-static int +-app_setup_env(struct spdk_app_opts *opts) +-{ +- struct spdk_env_opts env_opts = {}; +- int rc; +- +- if (opts == NULL) { +- rc = spdk_env_init(NULL); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to reinitialize SPDK env\n"); +- } +- +- return rc; +- } +- +- spdk_env_opts_init(&env_opts); +- +- env_opts.name = opts->name; +- env_opts.core_mask = opts->reactor_mask; +- env_opts.shm_id = opts->shm_id; +- env_opts.mem_channel = opts->mem_channel; +- env_opts.main_core = opts->main_core; +- env_opts.mem_size = opts->mem_size; +- env_opts.hugepage_single_segments = opts->hugepage_single_segments; +- env_opts.unlink_hugepage = opts->unlink_hugepage; +- env_opts.hugedir = opts->hugedir; +- env_opts.no_pci = opts->no_pci; +- env_opts.num_pci_addr = opts->num_pci_addr; +- env_opts.pci_blocked = opts->pci_blocked; +- env_opts.pci_allowed = opts->pci_allowed; +- env_opts.base_virtaddr = opts->base_virtaddr; +- env_opts.env_context = opts->env_context; +- env_opts.iova_mode = opts->iova_mode; +- env_opts.vf_token = opts->vf_token; +- +- rc = spdk_env_init(&env_opts); +- free(env_opts.pci_blocked); +- free(env_opts.pci_allowed); +- +- if (rc < 0) { +- SPDK_ERRLOG("Unable to initialize SPDK env\n"); +- } +- +- return rc; +-} +- +-static int 
+-app_setup_trace(struct spdk_app_opts *opts) +-{ +- char shm_name[64]; +- uint64_t tpoint_group_mask, tpoint_mask = -1ULL; +- char *end = NULL, *tpoint_group_mask_str, *tpoint_group_str = NULL; +- char *tp_g_str, *tpoint_group, *tpoints; +- bool error_found = false; +- uint64_t group_id; +- +- if (opts->shm_id >= 0) { +- snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id); +- } else { +- snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid()); +- } +- +- if (spdk_trace_init(shm_name, opts->num_entries) != 0) { +- return -1; +- } +- +- if (opts->tpoint_group_mask == NULL) { +- return 0; +- } +- +- tpoint_group_mask_str = strdup(opts->tpoint_group_mask); +- if (tpoint_group_mask_str == NULL) { +- SPDK_ERRLOG("Unable to get string of tpoint group mask from opts.\n"); +- return -1; +- } +- /* Save a pointer to the original value of the tpoint group mask string +- * to free later, because spdk_strsepq() modifies given char*. */ +- tp_g_str = tpoint_group_mask_str; +- while ((tpoint_group_str = spdk_strsepq(&tpoint_group_mask_str, ",")) != NULL) { +- if (strchr(tpoint_group_str, ':')) { +- /* Get the tpoint group mask */ +- tpoint_group = spdk_strsepq(&tpoint_group_str, ":"); +- /* Get the tpoint mask inside that group */ +- tpoints = spdk_strsepq(&tpoint_group_str, ":"); +- +- errno = 0; +- tpoint_group_mask = strtoull(tpoint_group, &end, 16); +- if (*end != '\0' || errno) { +- tpoint_group_mask = spdk_trace_create_tpoint_group_mask(tpoint_group); +- if (tpoint_group_mask == 0) { +- error_found = true; +- break; +- } +- } +- /* Check if tpoint group mask has only one bit set. +- * This is to avoid enabling individual tpoints in +- * more than one tracepoint group at once. */ +- if (!spdk_u64_is_pow2(tpoint_group_mask)) { +- SPDK_ERRLOG("Tpoint group mask: %s contains multiple tpoint groups.\n", tpoint_group); +- SPDK_ERRLOG("This is not supported, to prevent from activating tpoints by mistake.\n"); +- error_found = true; +- break; +- } +- +- errno = 0; +- tpoint_mask = strtoull(tpoints, &end, 16); +- if (*end != '\0' || errno) { +- error_found = true; +- break; +- } +- } else { +- errno = 0; +- tpoint_group_mask = strtoull(tpoint_group_str, &end, 16); +- if (*end != '\0' || errno) { +- tpoint_group_mask = spdk_trace_create_tpoint_group_mask(tpoint_group_str); +- if (tpoint_group_mask == 0) { +- error_found = true; +- break; +- } +- } +- tpoint_mask = -1ULL; +- } +- +- for (group_id = 0; group_id < SPDK_TRACE_MAX_GROUP_ID; ++group_id) { +- if (tpoint_group_mask & (1 << group_id)) { +- spdk_trace_set_tpoints(group_id, tpoint_mask); +- } +- } +- } +- +- if (error_found) { +- SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask); +- free(tp_g_str); +- return -1; +- } else { +- SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask); +- SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n", +- opts->name, +- opts->shm_id >= 0 ? "-i" : "-p", +- opts->shm_id >= 0 ? 
opts->shm_id : getpid()); +-#if defined(__linux__) +- SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name); +-#endif +- } +- free(tp_g_str); +- +- return 0; +-} +- +-static void +-bootstrap_fn(void *arg1) +-{ +- int rc; +- +- if (g_spdk_app.json_config_file) { +- g_delay_subsystem_init = false; +- spdk_subsystem_init_from_json_config(g_spdk_app.json_config_file, g_spdk_app.rpc_addr, +- app_start_rpc, +- NULL, !g_spdk_app.json_config_ignore_errors); +- } else { +- if (!g_delay_subsystem_init) { +- spdk_subsystem_init(app_start_rpc, NULL); +- } else { +- spdk_rpc_set_allowlist(g_spdk_app.rpc_allowlist); +- +- rc = spdk_rpc_initialize(g_spdk_app.rpc_addr); +- if (rc) { +- spdk_app_stop(rc); +- return; +- } +- } +- } +-} +- +-static void +-app_copy_opts(struct spdk_app_opts *opts, struct spdk_app_opts *opts_user, size_t opts_size) +-{ +- spdk_app_opts_init(opts, sizeof(*opts)); +- opts->opts_size = opts_size; +- +-#define SET_FIELD(field) \ +- if (offsetof(struct spdk_app_opts, field) + sizeof(opts->field) <= (opts->opts_size)) { \ +- opts->field = opts_user->field; \ +- } \ +- +- SET_FIELD(name); +- SET_FIELD(json_config_file); +- SET_FIELD(json_config_ignore_errors); +- SET_FIELD(rpc_addr); +- SET_FIELD(reactor_mask); +- SET_FIELD(tpoint_group_mask); +- SET_FIELD(shm_id); +- SET_FIELD(shutdown_cb); +- SET_FIELD(enable_coredump); +- SET_FIELD(mem_channel); +- SET_FIELD(main_core); +- SET_FIELD(mem_size); +- SET_FIELD(no_pci); +- SET_FIELD(hugepage_single_segments); +- SET_FIELD(unlink_hugepage); +- SET_FIELD(hugedir); +- SET_FIELD(print_level); +- SET_FIELD(num_pci_addr); +- SET_FIELD(pci_blocked); +- SET_FIELD(pci_allowed); +- SET_FIELD(iova_mode); +- SET_FIELD(delay_subsystem_init); +- SET_FIELD(num_entries); +- SET_FIELD(env_context); +- SET_FIELD(log); +- SET_FIELD(base_virtaddr); +- SET_FIELD(disable_signal_handlers); +- SET_FIELD(msg_mempool_size); +- SET_FIELD(rpc_allowlist); +- SET_FIELD(vf_token); +- +- /* You should not remove this statement, but need to update the assert statement +- * if you add a new field, and also add a corresponding SET_FIELD statement */ +- SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size"); +- +-#undef SET_FIELD +-} +- +-static void +-unclaim_cpu_cores(void) +-{ +- char core_name[40]; +- uint32_t i; +- +- for (i = 0; i < MAX_CPU_CORES; i++) { +- if (g_core_locks[i] != -1) { +- snprintf(core_name, sizeof(core_name), "/var/tmp/spdk_cpu_lock_%03d", i); +- close(g_core_locks[i]); +- g_core_locks[i] = -1; +- unlink(core_name); +- } +- } +-} +- +-static int +-claim_cpu_cores(uint32_t *failed_core) +-{ +- char core_name[40]; +- int core_fd, pid; +- int *core_map; +- uint32_t core; +- +- struct flock core_lock = { +- .l_type = F_WRLCK, +- .l_whence = SEEK_SET, +- .l_start = 0, +- .l_len = 0, +- }; +- +- SPDK_ENV_FOREACH_CORE(core) { +- if (g_core_locks[core] != -1) { +- /* If this core is locked already, do not try lock it again. */ +- continue; +- } +- +- snprintf(core_name, sizeof(core_name), "/var/tmp/spdk_cpu_lock_%03d", core); +- core_fd = open(core_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); +- if (core_fd == -1) { +- SPDK_ERRLOG("Could not open %s (%s).\n", core_name, spdk_strerror(errno)); +- /* Return number of core we failed to claim. 
*/ +- goto error; +- } +- +- if (ftruncate(core_fd, sizeof(int)) != 0) { +- SPDK_ERRLOG("Could not truncate %s (%s).\n", core_name, spdk_strerror(errno)); +- close(core_fd); +- goto error; +- } +- +- core_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED, core_fd, 0); +- if (core_map == MAP_FAILED) { +- SPDK_ERRLOG("Could not mmap core %s (%s).\n", core_name, spdk_strerror(errno)); +- close(core_fd); +- goto error; +- } +- +- if (fcntl(core_fd, F_SETLK, &core_lock) != 0) { +- pid = *core_map; +- SPDK_ERRLOG("Cannot create lock on core %" PRIu32 ", probably process %d has claimed it.\n", +- core, pid); +- munmap(core_map, sizeof(int)); +- close(core_fd); +- goto error; +- } +- +- /* We write the PID to the core lock file so that other processes trying +- * to claim the same core will know what process is holding the lock. */ +- *core_map = (int)getpid(); +- munmap(core_map, sizeof(int)); +- g_core_locks[core] = core_fd; +- /* Keep core_fd open to maintain the lock. */ +- } +- +- return 0; +- +-error: +- if (failed_core != NULL) { +- /* Set number of core we failed to claim. */ +- *failed_core = core; +- } +- unclaim_cpu_cores(); +- return -1; +-} +- +-int +-spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn, +- void *arg1) +-{ +- int rc; +- char *tty; +- struct spdk_cpuset tmp_cpumask = {}; +- static bool g_env_was_setup = false; +- struct spdk_app_opts opts_local = {}; +- struct spdk_app_opts *opts = &opts_local; +- uint32_t i; +- +- if (!opts_user) { +- SPDK_ERRLOG("opts_user should not be NULL\n"); +- return 1; +- } +- +- if (!opts_user->opts_size) { +- SPDK_ERRLOG("The opts_size in opts_user structure should not be zero value\n"); +- return 1; +- } +- +- if (opts_user->name == NULL) { +- SPDK_ERRLOG("spdk_app_opts::name not specified\n"); +- return 1; +- } +- +- app_copy_opts(opts, opts_user, opts_user->opts_size); +- +- if (!start_fn) { +- SPDK_ERRLOG("start_fn should not be NULL\n"); +- return 1; +- } +- +- tty = ttyname(STDERR_FILENO); +- if (opts->print_level > SPDK_LOG_WARN && +- isatty(STDERR_FILENO) && +- tty && +- !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) { +- printf("Warning: printing stderr to console terminal without -q option specified.\n"); +- printf("Suggest using --silence-noticelog to disable logging to stderr and\n"); +- printf("monitor syslog, or redirect stderr to a file.\n"); +- printf("(Delaying for 10 seconds...)\n"); +- sleep(10); +- } +- +- spdk_log_set_print_level(opts->print_level); +- +-#ifndef SPDK_NO_RLIMIT +- if (opts->enable_coredump) { +- struct rlimit core_limits; +- +- core_limits.rlim_cur = core_limits.rlim_max = SPDK_APP_DEFAULT_CORE_LIMIT; +- setrlimit(RLIMIT_CORE, &core_limits); +- } +-#endif +- +- memset(&g_spdk_app, 0, sizeof(g_spdk_app)); +- g_spdk_app.json_config_file = opts->json_config_file; +- g_spdk_app.json_config_ignore_errors = opts->json_config_ignore_errors; +- g_spdk_app.rpc_addr = opts->rpc_addr; +- g_spdk_app.rpc_allowlist = opts->rpc_allowlist; +- g_spdk_app.shm_id = opts->shm_id; +- g_spdk_app.shutdown_cb = opts->shutdown_cb; +- g_spdk_app.rc = 0; +- g_spdk_app.stopped = false; +- +- spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); +- +- /* Pass NULL to app_setup_env if SPDK app has been set up, in order to +- * indicate that this is a reinitialization. +- */ +- if (app_setup_env(g_env_was_setup ? 
NULL : opts) < 0) { +- return 1; +- } +- +- spdk_log_open(opts->log); +- +- /* Initialize each lock to -1 to indicate "empty" status */ +- for (i = 0; i < MAX_CPU_CORES; i++) { +- g_core_locks[i] = -1; +- } +- +- if (!g_disable_cpumask_locks) { +- if (claim_cpu_cores(NULL)) { +- SPDK_ERRLOG("Unable to acquire lock on assigned core mask - exiting.\n"); +- return 1; +- } +- } else { +- SPDK_NOTICELOG("CPU core locks deactivated.\n"); +- } +- +- SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count()); +- +- if ((rc = spdk_reactors_init(opts->msg_mempool_size)) != 0) { +- SPDK_ERRLOG("Reactor Initialization failed: rc = %d\n", rc); +- return 1; +- } +- +- spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true); +- +- /* Now that the reactors have been initialized, we can create an +- * initialization thread. */ +- g_app_thread = spdk_thread_create("app_thread", &tmp_cpumask); +- if (!g_app_thread) { +- SPDK_ERRLOG("Unable to create an spdk_thread for initialization\n"); +- return 1; +- } +- +- /* +- * Disable and ignore trace setup if setting num_entries +- * to be 0. +- * +- * Note the call to app_setup_trace() is located here +- * ahead of app_setup_signal_handlers(). +- * That's because there is not an easy/direct clean +- * way of unwinding alloc'd resources that can occur +- * in app_setup_signal_handlers(). +- */ +- if (opts->num_entries != 0 && app_setup_trace(opts) != 0) { +- return 1; +- } +- +- if (!opts->disable_signal_handlers && app_setup_signal_handlers(opts) != 0) { +- return 1; +- } +- +- g_delay_subsystem_init = opts->delay_subsystem_init; +- g_start_fn = start_fn; +- g_start_arg = arg1; +- +- spdk_thread_send_msg(g_app_thread, bootstrap_fn, NULL); +- +- /* This blocks until spdk_app_stop is called */ +- spdk_reactors_start(); +- +- g_env_was_setup = true; +- +- return g_spdk_app.rc; +-} +- +-void +-spdk_app_fini(void) +-{ +- spdk_trace_cleanup(); +- spdk_reactors_fini(); +- spdk_env_fini(); +- spdk_log_close(); +- unclaim_cpu_cores(); +-} +- +-static void +-_start_subsystem_fini(void *arg1) +-{ +- if (g_scheduling_in_progress) { +- spdk_thread_send_msg(g_app_thread, _start_subsystem_fini, NULL); +- return; +- } +- +- spdk_subsystem_fini(spdk_reactors_stop, NULL); +-} +- +-static int +-log_deprecation_hits(void *ctx, struct spdk_deprecation *dep) +-{ +- uint64_t hits = spdk_deprecation_get_hits(dep); +- +- if (hits == 0) { +- return 0; +- } +- +- SPDK_WARNLOG("%s: deprecation '%s' scheduled for removal in %s hit %" PRIu64 " times\n", +- spdk_deprecation_get_tag(dep), spdk_deprecation_get_description(dep), +- spdk_deprecation_get_remove_release(dep), hits); +- return 0; +-} +- +-static void +-app_stop(void *arg1) +-{ +- if (g_spdk_app.rc == 0) { +- g_spdk_app.rc = (int)(intptr_t)arg1; +- } +- +- if (g_spdk_app.stopped) { +- SPDK_NOTICELOG("spdk_app_stop called twice\n"); +- return; +- } +- +- spdk_rpc_finish(); +- g_spdk_app.stopped = true; +- spdk_log_for_each_deprecation(NULL, log_deprecation_hits); +- _start_subsystem_fini(NULL); +-} +- +-void +-spdk_app_stop(int rc) +-{ +- if (rc) { +- SPDK_WARNLOG("spdk_app_stop'd on non-zero\n"); +- } +- +- /* +- * We want to run spdk_subsystem_fini() from the same thread where spdk_subsystem_init() +- * was called. 
+- */ +- spdk_thread_send_msg(g_app_thread, app_stop, (void *)(intptr_t)rc); +-} +- +-struct spdk_thread * +-_spdk_get_app_thread(void) +-{ +- return g_app_thread; +-} +- +-static void +-usage(void (*app_usage)(void)) +-{ +- printf("%s [options]\n", g_executable_name); +- printf("options:\n"); +- printf(" -c, --config JSON config file (default %s)\n", +- g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none"); +- printf(" --json JSON config file (default %s)\n", +- g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none"); +- printf(" --json-ignore-init-errors\n"); +- printf(" don't exit on invalid config entry\n"); +- printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n"); +- printf(" -g, --single-file-segments\n"); +- printf(" force creating just one hugetlbfs file\n"); +- printf(" -h, --help show this usage\n"); +- printf(" -i, --shm-id shared memory ID (optional)\n"); +- printf(" -m, --cpumask core mask (like 0xF) or core list of '[]' embraced (like [0,1,10]) for DPDK\n"); +- printf(" -n, --mem-channels channel number of memory channels used for DPDK\n"); +- printf(" -p, --main-core main (primary) core for DPDK\n"); +- printf(" -r, --rpc-socket RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR); +- printf(" -s, --mem-size memory size in MB for DPDK (default: "); +-#ifndef __linux__ +- if (g_default_opts.mem_size <= 0) { +- printf("all hugepage memory)\n"); +- } else +-#endif +- { +- printf("%dMB)\n", g_default_opts.mem_size >= 0 ? g_default_opts.mem_size : 0); +- } +- printf(" --disable-cpumask-locks Disable CPU core lock files.\n"); +- printf(" --silence-noticelog disable notice level logging to stderr\n"); +- printf(" --msg-mempool-size global message memory pool size in count (default: %d)\n", +- SPDK_DEFAULT_MSG_MEMPOOL_SIZE); +- printf(" -u, --no-pci disable PCI access\n"); +- printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n"); +- printf(" --max-delay maximum reactor delay (in microseconds)\n"); +- printf(" -B, --pci-blocked \n"); +- printf(" pci addr to block (can be used more than once)\n"); +- printf(" -R, --huge-unlink unlink huge files after initialization\n"); +- printf(" -v, --version print SPDK version\n"); +- printf(" -A, --pci-allowed \n"); +- printf(" pci addr to allow (-B and -A cannot be used at the same time)\n"); +- printf(" --huge-dir use a specific hugetlbfs mount to reserve memory from\n"); +- printf(" --iova-mode set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n"); +- printf(" --base-virtaddr the base virtual address for DPDK (default: 0x200000000000)\n"); +- printf(" --num-trace-entries number of trace entries for each core, must be power of 2, setting 0 to disable trace (default %d)\n", +- SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); +- printf(" --rpcs-allowed comma-separated list of permitted RPCS\n"); +- printf(" --env-context Opaque context for use of the env implementation\n"); +- printf(" --vfio-vf-token VF token (UUID) shared between SR-IOV PF and VFs for vfio_pci driver\n"); +- spdk_log_usage(stdout, "-L"); +- spdk_trace_mask_usage(stdout, "-e"); +- if (app_usage) { +- app_usage(); +- } +-} +- +-spdk_app_parse_args_rvals_t +-spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, +- const char *app_getopt_str, const struct option *app_long_opts, +- int (*app_parse)(int ch, char *arg), +- void (*app_usage)(void)) +-{ +- int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len; +- struct option *cmdline_options; +- char 
*cmdline_short_opts = NULL; +- char *shm_id_str = NULL; +- enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL; +- long int tmp; +- +- memcpy(&g_default_opts, opts, sizeof(g_default_opts)); +- +- if (opts->json_config_file && access(opts->json_config_file, R_OK) != 0) { +- SPDK_WARNLOG("Can't read JSON configuration file '%s'\n", opts->json_config_file); +- opts->json_config_file = NULL; +- } +- +- if (app_long_opts == NULL) { +- app_long_opts_len = 0; +- } else { +- for (app_long_opts_len = 0; +- app_long_opts[app_long_opts_len].name != NULL; +- app_long_opts_len++); +- } +- +- global_long_opts_len = SPDK_COUNTOF(g_cmdline_options); +- +- cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options)); +- if (!cmdline_options) { +- SPDK_ERRLOG("Out of memory\n"); +- return SPDK_APP_PARSE_ARGS_FAIL; +- } +- +- memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options)); +- if (app_long_opts) { +- memcpy(&cmdline_options[global_long_opts_len], app_long_opts, +- app_long_opts_len * sizeof(*app_long_opts)); +- } +- +- if (app_getopt_str != NULL) { +- ch = app_opts_validate(app_getopt_str); +- if (ch) { +- SPDK_ERRLOG("Duplicated option '%c' between the generic and application specific spdk opts.\n", +- ch); +- goto out; +- } +- } +- +- cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING); +- if (!cmdline_short_opts) { +- SPDK_ERRLOG("Out of memory\n"); +- goto out; +- } +- +- g_executable_name = argv[0]; +- +- while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) { +- switch (ch) { +- case CONFIG_FILE_OPT_IDX: +- case JSON_CONFIG_OPT_IDX: +- opts->json_config_file = optarg; +- break; +- case JSON_CONFIG_IGNORE_INIT_ERRORS_IDX: +- opts->json_config_ignore_errors = true; +- break; +- case LIMIT_COREDUMP_OPT_IDX: +- opts->enable_coredump = false; +- break; +- case TPOINT_GROUP_OPT_IDX: +- opts->tpoint_group_mask = optarg; +- break; +- case SINGLE_FILE_SEGMENTS_OPT_IDX: +- opts->hugepage_single_segments = true; +- break; +- case HELP_OPT_IDX: +- usage(app_usage); +- retval = SPDK_APP_PARSE_ARGS_HELP; +- goto out; +- case SHM_ID_OPT_IDX: +- shm_id_str = optarg; +- /* a negative shm-id disables shared configuration file */ +- if (optarg[0] == '-') { +- shm_id_str++; +- } +- /* check if the positive value of provided shm_id can be parsed as +- * an integer +- */ +- opts->shm_id = spdk_strtol(shm_id_str, 0); +- if (opts->shm_id < 0) { +- SPDK_ERRLOG("Invalid shared memory ID %s\n", optarg); +- goto out; +- } +- if (optarg[0] == '-') { +- opts->shm_id = -opts->shm_id; +- } +- break; +- case CPUMASK_OPT_IDX: +- opts->reactor_mask = optarg; +- break; +- case DISABLE_CPUMASK_LOCKS_OPT_IDX: +- g_disable_cpumask_locks = true; +- break; +- case MEM_CHANNELS_OPT_IDX: +- opts->mem_channel = spdk_strtol(optarg, 0); +- if (opts->mem_channel < 0) { +- SPDK_ERRLOG("Invalid memory channel %s\n", optarg); +- goto out; +- } +- break; +- case MAIN_CORE_OPT_IDX: +- opts->main_core = spdk_strtol(optarg, 0); +- if (opts->main_core < 0) { +- SPDK_ERRLOG("Invalid main core %s\n", optarg); +- goto out; +- } +- break; +- case SILENCE_NOTICELOG_OPT_IDX: +- opts->print_level = SPDK_LOG_WARN; +- break; +- case RPC_SOCKET_OPT_IDX: +- opts->rpc_addr = optarg; +- break; +- case MEM_SIZE_OPT_IDX: { +- uint64_t mem_size_mb; +- bool mem_size_has_prefix; +- +- rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix); +- if (rc != 0) { +- SPDK_ERRLOG("invalid memory pool size `-s %s`\n", 
optarg); +- usage(app_usage); +- goto out; +- } +- +- if (mem_size_has_prefix) { +- /* the mem size is in MB by default, so if a prefix was +- * specified, we need to manually convert to MB. +- */ +- mem_size_mb /= 1024 * 1024; +- } +- +- if (mem_size_mb > INT_MAX) { +- SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); +- usage(app_usage); +- goto out; +- } +- +- opts->mem_size = (int) mem_size_mb; +- break; +- } +- case MSG_MEMPOOL_SIZE_OPT_IDX: +- tmp = spdk_strtol(optarg, 10); +- if (tmp <= 0) { +- SPDK_ERRLOG("Invalid message memory pool size %s\n", optarg); +- goto out; +- } +- +- opts->msg_mempool_size = (size_t)tmp; +- break; +- +- case NO_PCI_OPT_IDX: +- opts->no_pci = true; +- break; +- case WAIT_FOR_RPC_OPT_IDX: +- opts->delay_subsystem_init = true; +- break; +- case PCI_BLOCKED_OPT_IDX: +- if (opts->pci_allowed) { +- free(opts->pci_allowed); +- opts->pci_allowed = NULL; +- SPDK_ERRLOG("-B and -A cannot be used at the same time\n"); +- usage(app_usage); +- goto out; +- } +- +- rc = app_opts_add_pci_addr(opts, &opts->pci_blocked, optarg); +- if (rc != 0) { +- free(opts->pci_blocked); +- opts->pci_blocked = NULL; +- goto out; +- } +- break; +- case LOGFLAG_OPT_IDX: +- rc = spdk_log_set_flag(optarg); +- if (rc < 0) { +- SPDK_ERRLOG("unknown flag\n"); +- usage(app_usage); +- goto out; +- } +-#ifdef DEBUG +- opts->print_level = SPDK_LOG_DEBUG; +-#endif +- break; +- case HUGE_UNLINK_OPT_IDX: +- opts->unlink_hugepage = true; +- break; +- case PCI_WHITELIST_OPT_IDX: +- SPDK_WARNLOG("-W/--pci-whitelist is deprecated. Use -A/--pci-allowed.\n"); +- /* fallthrough */ +- case PCI_ALLOWED_OPT_IDX: +- if (opts->pci_blocked) { +- free(opts->pci_blocked); +- opts->pci_blocked = NULL; +- SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); +- usage(app_usage); +- goto out; +- } +- +- rc = app_opts_add_pci_addr(opts, &opts->pci_allowed, optarg); +- if (rc != 0) { +- free(opts->pci_allowed); +- opts->pci_allowed = NULL; +- goto out; +- } +- break; +- case BASE_VIRTADDR_OPT_IDX: +- tmp = spdk_strtoll(optarg, 0); +- if (tmp <= 0) { +- SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg); +- usage(app_usage); +- goto out; +- } +- opts->base_virtaddr = (uint64_t)tmp; +- break; +- case HUGE_DIR_OPT_IDX: +- opts->hugedir = optarg; +- break; +- case IOVA_MODE_OPT_IDX: +- opts->iova_mode = optarg; +- break; +- case NUM_TRACE_ENTRIES_OPT_IDX: +- tmp = spdk_strtoll(optarg, 0); +- if (tmp < 0) { +- SPDK_ERRLOG("Invalid num-trace-entries %s\n", optarg); +- usage(app_usage); +- goto out; +- } +- opts->num_entries = (uint64_t)tmp; +- if (opts->num_entries > 0 && opts->num_entries & (opts->num_entries - 1)) { +- SPDK_ERRLOG("num-trace-entries must be power of 2\n"); +- usage(app_usage); +- goto out; +- } +- break; +- case MAX_REACTOR_DELAY_OPT_IDX: +- SPDK_ERRLOG("Deprecation warning: The maximum allowed latency parameter is no longer supported.\n"); +- break; +- case ENV_CONTEXT_OPT_IDX: +- opts->env_context = optarg; +- break; +- case RPCS_ALLOWED_OPT_IDX: +- opts->rpc_allowlist = (const char **)spdk_strarray_from_string(optarg, ","); +- if (opts->rpc_allowlist == NULL) { +- SPDK_ERRLOG("Invalid --rpcs-allowed argument\n"); +- usage(app_usage); +- goto out; +- } +- break; +- case ENV_VF_TOKEN_OPT_IDX: +- opts->vf_token = optarg; +- break; +- case VERSION_OPT_IDX: +- printf(SPDK_VERSION_STRING"\n"); +- retval = SPDK_APP_PARSE_ARGS_HELP; +- goto out; +- case '?': +- /* +- * In the event getopt() above detects an option +- * in argv that is NOT in the getopt_str, +- * getopt() will return a '?' 
indicating failure. +- */ +- usage(app_usage); +- goto out; +- default: +- rc = app_parse(ch, optarg); +- if (rc) { +- SPDK_ERRLOG("Parsing application specific arguments failed: %d\n", rc); +- goto out; +- } +- } +- } +- +- if (opts->json_config_file && opts->delay_subsystem_init) { +- SPDK_ERRLOG("JSON configuration file can't be used together with --wait-for-rpc.\n"); +- goto out; +- } +- +- retval = SPDK_APP_PARSE_ARGS_SUCCESS; +-out: +- if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) { +- free(opts->pci_blocked); +- opts->pci_blocked = NULL; +- free(opts->pci_allowed); +- opts->pci_allowed = NULL; +- spdk_strarray_free((char **)opts->rpc_allowlist); +- opts->rpc_allowlist = NULL; +- } +- free(cmdline_short_opts); +- free(cmdline_options); +- return retval; +-} +- +-void +-spdk_app_usage(void) +-{ +- if (g_executable_name == NULL) { +- SPDK_ERRLOG("%s not valid before calling spdk_app_parse_args()\n", __func__); +- return; +- } +- +- usage(NULL); +-} +- +-static void +-rpc_framework_start_init_cpl(int rc, void *arg1) +-{ +- struct spdk_jsonrpc_request *request = arg1; +- +- assert(spdk_get_thread() == g_app_thread); +- +- if (rc) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "framework_initialization failed"); +- return; +- } +- +- spdk_rpc_set_state(SPDK_RPC_RUNTIME); +- app_start_application(); +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +- +-static void +-rpc_framework_start_init(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- if (params != NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "framework_start_init requires no parameters"); +- return; +- } +- +- spdk_subsystem_init(rpc_framework_start_init_cpl, request); +-} +-SPDK_RPC_REGISTER("framework_start_init", rpc_framework_start_init, SPDK_RPC_STARTUP) +- +-struct subsystem_init_poller_ctx { +- struct spdk_poller *init_poller; +- struct spdk_jsonrpc_request *request; +-}; +- +-static int +-rpc_subsystem_init_poller_ctx(void *ctx) +-{ +- struct subsystem_init_poller_ctx *poller_ctx = ctx; +- +- if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { +- spdk_jsonrpc_send_bool_response(poller_ctx->request, true); +- spdk_poller_unregister(&poller_ctx->init_poller); +- free(poller_ctx); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-rpc_framework_wait_init(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct subsystem_init_poller_ctx *ctx; +- +- if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { +- spdk_jsonrpc_send_bool_response(request, true); +- } else { +- ctx = malloc(sizeof(struct subsystem_init_poller_ctx)); +- if (ctx == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Unable to allocate memory for the request context\n"); +- return; +- } +- ctx->request = request; +- ctx->init_poller = SPDK_POLLER_REGISTER(rpc_subsystem_init_poller_ctx, ctx, 0); +- } +-} +-SPDK_RPC_REGISTER("framework_wait_init", rpc_framework_wait_init, +- SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +- +-static void +-rpc_framework_disable_cpumask_locks(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- if (params != NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "framework_disable_cpumask_locks" +- "requires no arguments"); +- return; +- } +- +- unclaim_cpu_cores(); +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("framework_disable_cpumask_locks", 
rpc_framework_disable_cpumask_locks, +- SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +- +-static void +-rpc_framework_enable_cpumask_locks(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- char msg[128]; +- int rc; +- uint32_t failed_core; +- +- if (params != NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "framework_enable_cpumask_locks" +- "requires no arguments"); +- return; +- } +- +- rc = claim_cpu_cores(&failed_core); +- if (rc) { +- snprintf(msg, sizeof(msg), "Failed to claim CPU core: %" PRIu32, failed_core); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); +- return; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("framework_enable_cpumask_locks", rpc_framework_enable_cpumask_locks, +- SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/version.h" ++ ++#include "spdk_internal/event.h" ++ ++#include "spdk/assert.h" ++#include "spdk/env.h" ++#include "spdk/init.h" ++#include "spdk/log.h" ++#include "spdk/thread.h" ++#include "spdk/trace.h" ++#include "spdk/string.h" ++#include "spdk/scheduler.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++ ++#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE ++#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO ++#define SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES SPDK_DEFAULT_NUM_TRACE_ENTRIES ++ ++#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1 ++#define SPDK_APP_DPDK_DEFAULT_MAIN_CORE -1 ++#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 ++#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" ++#define SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR 0x200000000000 ++#define SPDK_APP_DEFAULT_CORE_LIMIT 0x140000000 /* 5 GiB */ ++ ++#define MAX_CPU_CORES 128 ++ ++struct spdk_app { ++ const char *json_config_file; ++ bool json_config_ignore_errors; ++ bool stopped; ++ const char *rpc_addr; ++ const char **rpc_allowlist; ++ int shm_id; ++ spdk_app_shutdown_cb shutdown_cb; ++ int rc; ++}; ++ ++static struct spdk_app g_spdk_app; ++static spdk_msg_fn g_start_fn = NULL; ++static void *g_start_arg = NULL; ++static struct spdk_thread *g_app_thread = NULL; ++static bool g_delay_subsystem_init = false; ++static bool g_shutdown_sig_received = false; ++static char *g_executable_name; ++static struct spdk_app_opts g_default_opts; ++static bool g_disable_cpumask_locks = false; ++ ++static int g_core_locks[MAX_CPU_CORES]; ++ ++int ++spdk_app_get_shm_id(void) ++{ ++ return g_spdk_app.shm_id; ++} ++ ++bool spdk_get_shutdown_sig_received(void) ++{ ++ return g_shutdown_sig_received; ++} ++ ++/* append one empty option to indicate the end of the array */ ++static const struct option g_cmdline_options[] = { ++#define CONFIG_FILE_OPT_IDX 'c' ++ {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX}, ++#define LIMIT_COREDUMP_OPT_IDX 'd' ++ {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX}, ++#define TPOINT_GROUP_OPT_IDX 'e' ++ {"tpoint-group", required_argument, NULL, TPOINT_GROUP_OPT_IDX}, ++#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g' ++ {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX}, ++#define HELP_OPT_IDX 'h' ++ {"help", no_argument, NULL, HELP_OPT_IDX}, ++#define SHM_ID_OPT_IDX 'i' ++ {"shm-id", required_argument, NULL, 
SHM_ID_OPT_IDX}, ++#define CPUMASK_OPT_IDX 'm' ++ {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX}, ++#define MEM_CHANNELS_OPT_IDX 'n' ++ {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX}, ++#define MAIN_CORE_OPT_IDX 'p' ++ {"main-core", required_argument, NULL, MAIN_CORE_OPT_IDX}, ++ {"master-core", required_argument, NULL, MAIN_CORE_OPT_IDX}, /* deprecated */ ++#define RPC_SOCKET_OPT_IDX 'r' ++ {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX}, ++#define MEM_SIZE_OPT_IDX 's' ++ {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX}, ++#define NO_PCI_OPT_IDX 'u' ++ {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX}, ++#define VERSION_OPT_IDX 'v' ++ {"version", no_argument, NULL, VERSION_OPT_IDX}, ++#define PCI_BLOCKED_OPT_IDX 'B' ++ {"pci-blocked", required_argument, NULL, PCI_BLOCKED_OPT_IDX}, ++ {"pci-blacklist", required_argument, NULL, PCI_BLOCKED_OPT_IDX}, /* deprecated */ ++#define LOGFLAG_OPT_IDX 'L' ++ {"logflag", required_argument, NULL, LOGFLAG_OPT_IDX}, ++#define HUGE_UNLINK_OPT_IDX 'R' ++ {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX}, ++#define PCI_ALLOWED_OPT_IDX 'A' ++ {"pci-allowed", required_argument, NULL, PCI_ALLOWED_OPT_IDX}, ++#define PCI_WHITELIST_OPT_IDX 'W' ++ {"pci-whitelist", required_argument, NULL, PCI_WHITELIST_OPT_IDX}, /* deprecated */ ++#define SILENCE_NOTICELOG_OPT_IDX 257 ++ {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX}, ++#define WAIT_FOR_RPC_OPT_IDX 258 ++ {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX}, ++#define HUGE_DIR_OPT_IDX 259 ++ {"huge-dir", required_argument, NULL, HUGE_DIR_OPT_IDX}, ++#define NUM_TRACE_ENTRIES_OPT_IDX 260 ++ {"num-trace-entries", required_argument, NULL, NUM_TRACE_ENTRIES_OPT_IDX}, ++#define MAX_REACTOR_DELAY_OPT_IDX 261 ++ {"max-delay", required_argument, NULL, MAX_REACTOR_DELAY_OPT_IDX}, ++#define JSON_CONFIG_OPT_IDX 262 ++ {"json", required_argument, NULL, JSON_CONFIG_OPT_IDX}, ++#define JSON_CONFIG_IGNORE_INIT_ERRORS_IDX 263 ++ {"json-ignore-init-errors", no_argument, NULL, JSON_CONFIG_IGNORE_INIT_ERRORS_IDX}, ++#define IOVA_MODE_OPT_IDX 264 ++ {"iova-mode", required_argument, NULL, IOVA_MODE_OPT_IDX}, ++#define BASE_VIRTADDR_OPT_IDX 265 ++ {"base-virtaddr", required_argument, NULL, BASE_VIRTADDR_OPT_IDX}, ++#define ENV_CONTEXT_OPT_IDX 266 ++ {"env-context", required_argument, NULL, ENV_CONTEXT_OPT_IDX}, ++#define DISABLE_CPUMASK_LOCKS_OPT_IDX 267 ++ {"disable-cpumask-locks", no_argument, NULL, DISABLE_CPUMASK_LOCKS_OPT_IDX}, ++#define RPCS_ALLOWED_OPT_IDX 268 ++ {"rpcs-allowed", required_argument, NULL, RPCS_ALLOWED_OPT_IDX}, ++#define ENV_VF_TOKEN_OPT_IDX 269 ++ {"vfio-vf-token", required_argument, NULL, ENV_VF_TOKEN_OPT_IDX}, ++#define MSG_MEMPOOL_SIZE_OPT_IDX 270 ++ {"msg-mempool-size", required_argument, NULL, MSG_MEMPOOL_SIZE_OPT_IDX}, ++#define HOT_RESTART_OPT_IDX 271 ++ {"hot-restart", no_argument, NULL, HOT_RESTART_OPT_IDX}, ++}; ++ ++static void ++app_start_shutdown(void *ctx) ++{ ++ if (g_spdk_app.shutdown_cb) { ++ g_spdk_app.shutdown_cb(); ++ g_spdk_app.shutdown_cb = NULL; ++ } else { ++ spdk_app_stop(0); ++ } ++} ++ ++void ++spdk_app_start_shutdown(void) ++{ ++ spdk_thread_send_critical_msg(g_app_thread, app_start_shutdown); ++} ++ ++static void ++__shutdown_signal(int signo) ++{ ++ if (!g_shutdown_sig_received) { ++ g_shutdown_sig_received = true; ++ spdk_app_start_shutdown(); ++ } ++} ++ ++static int ++app_opts_validate(const char *app_opts) ++{ ++ int i = 0, j; ++ ++ for (i = 0; app_opts[i] != '\0'; i++) { ++ /* ignore getopt control 
characters */ ++ if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') { ++ continue; ++ } ++ ++ for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) { ++ if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) { ++ return app_opts[i]; ++ } ++ } ++ } ++ return 0; ++} ++ ++void ++spdk_app_opts_init(struct spdk_app_opts *opts, size_t opts_size) ++{ ++ if (!opts) { ++ SPDK_ERRLOG("opts should not be NULL\n"); ++ return; ++ } ++ ++ if (!opts_size) { ++ SPDK_ERRLOG("opts_size should not be zero value\n"); ++ return; ++ } ++ ++ memset(opts, 0, opts_size); ++ opts->opts_size = opts_size; ++ ++#define SET_FIELD(field, value) \ ++ if (offsetof(struct spdk_app_opts, field) + sizeof(opts->field) <= opts_size) { \ ++ opts->field = value; \ ++ } \ ++ ++ SET_FIELD(enable_coredump, true); ++ SET_FIELD(shm_id, -1); ++ SET_FIELD(mem_size, SPDK_APP_DPDK_DEFAULT_MEM_SIZE); ++ SET_FIELD(main_core, SPDK_APP_DPDK_DEFAULT_MAIN_CORE); ++ SET_FIELD(mem_channel, SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL); ++ SET_FIELD(reactor_mask, SPDK_APP_DPDK_DEFAULT_CORE_MASK); ++ SET_FIELD(base_virtaddr, SPDK_APP_DPDK_DEFAULT_BASE_VIRTADDR); ++ SET_FIELD(print_level, SPDK_APP_DEFAULT_LOG_PRINT_LEVEL); ++ SET_FIELD(rpc_addr, SPDK_DEFAULT_RPC_ADDR); ++ SET_FIELD(num_entries, SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); ++ SET_FIELD(delay_subsystem_init, false); ++ SET_FIELD(disable_signal_handlers, false); ++ SET_FIELD(msg_mempool_size, SPDK_DEFAULT_MSG_MEMPOOL_SIZE); ++ SET_FIELD(rpc_allowlist, NULL); ++#undef SET_FIELD ++} ++ ++static int ++app_setup_signal_handlers(struct spdk_app_opts *opts) ++{ ++ struct sigaction sigact; ++ sigset_t sigmask; ++ int rc; ++ ++ sigemptyset(&sigmask); ++ memset(&sigact, 0, sizeof(sigact)); ++ sigemptyset(&sigact.sa_mask); ++ ++ sigact.sa_handler = SIG_IGN; ++ rc = sigaction(SIGPIPE, &sigact, NULL); ++ if (rc < 0) { ++ SPDK_ERRLOG("sigaction(SIGPIPE) failed\n"); ++ return rc; ++ } ++ ++ /* Install the same handler for SIGINT and SIGTERM */ ++ g_shutdown_sig_received = false; ++ sigact.sa_handler = __shutdown_signal; ++ rc = sigaction(SIGINT, &sigact, NULL); ++ if (rc < 0) { ++ SPDK_ERRLOG("sigaction(SIGINT) failed\n"); ++ return rc; ++ } ++ sigaddset(&sigmask, SIGINT); ++ ++ rc = sigaction(SIGTERM, &sigact, NULL); ++ if (rc < 0) { ++ SPDK_ERRLOG("sigaction(SIGTERM) failed\n"); ++ return rc; ++ } ++ sigaddset(&sigmask, SIGTERM); ++ ++ pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); ++ ++ return 0; ++} ++ ++static void ++app_start_application(void) ++{ ++ assert(spdk_get_thread() == g_app_thread); ++ ++ g_start_fn(g_start_arg); ++} ++ ++static void ++app_start_rpc(int rc, void *arg1) ++{ ++ if (rc) { ++ spdk_app_stop(rc); ++ return; ++ } ++ ++ spdk_rpc_set_allowlist(g_spdk_app.rpc_allowlist); ++ ++ rc = spdk_rpc_initialize(g_spdk_app.rpc_addr, RPC_SELECT_INTERVAL); ++ if (rc) { ++ spdk_app_stop(rc); ++ return; ++ } ++ ++ if (!g_delay_subsystem_init) { ++ spdk_rpc_set_state(SPDK_RPC_RUNTIME); ++ app_start_application(); ++ } ++} ++ ++static int ++app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf) ++{ ++ struct spdk_pci_addr *tmp = *list; ++ size_t i = opts->num_pci_addr; ++ ++ tmp = realloc(tmp, sizeof(*tmp) * (i + 1)); ++ if (tmp == NULL) { ++ SPDK_ERRLOG("realloc error\n"); ++ return -ENOMEM; ++ } ++ ++ *list = tmp; ++ if (spdk_pci_addr_parse(*list + i, bdf) < 0) { ++ SPDK_ERRLOG("Invalid address %s\n", bdf); ++ return -EINVAL; ++ } ++ ++ opts->num_pci_addr++; ++ return 0; ++} ++ ++static int ++app_setup_env(struct spdk_app_opts *opts) ++{ ++ struct spdk_env_opts env_opts 
= {}; ++ int rc; ++ ++ if (opts == NULL) { ++ rc = spdk_env_init(NULL); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to reinitialize SPDK env\n"); ++ } ++ ++ return rc; ++ } ++ ++ spdk_env_opts_init(&env_opts); ++ ++ env_opts.name = opts->name; ++ env_opts.core_mask = opts->reactor_mask; ++ env_opts.shm_id = opts->shm_id; ++ env_opts.mem_channel = opts->mem_channel; ++ env_opts.main_core = opts->main_core; ++ env_opts.mem_size = opts->mem_size; ++ env_opts.hugepage_single_segments = opts->hugepage_single_segments; ++ env_opts.unlink_hugepage = opts->unlink_hugepage; ++ env_opts.hugedir = opts->hugedir; ++ env_opts.no_pci = opts->no_pci; ++ env_opts.num_pci_addr = opts->num_pci_addr; ++ env_opts.pci_blocked = opts->pci_blocked; ++ env_opts.pci_allowed = opts->pci_allowed; ++ env_opts.base_virtaddr = opts->base_virtaddr; ++ env_opts.env_context = opts->env_context; ++ env_opts.iova_mode = opts->iova_mode; ++ env_opts.vf_token = opts->vf_token; ++ env_opts.hot_restart = opts->hot_restart; ++ ++ rc = spdk_env_init(&env_opts); ++ free(env_opts.pci_blocked); ++ free(env_opts.pci_allowed); ++ ++ if (rc < 0) { ++ SPDK_ERRLOG("Unable to initialize SPDK env\n"); ++ } ++ ++ return rc; ++} ++ ++static int ++app_setup_trace(struct spdk_app_opts *opts) ++{ ++ char shm_name[64]; ++ uint64_t tpoint_group_mask, tpoint_mask = -1ULL; ++ char *end = NULL, *tpoint_group_mask_str, *tpoint_group_str = NULL; ++ char *tp_g_str, *tpoint_group, *tpoints; ++ bool error_found = false; ++ uint64_t group_id; ++ ++ if (opts->shm_id >= 0) { ++ snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id); ++ } else { ++ snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid()); ++ } ++ ++ if (spdk_trace_init(shm_name, opts->num_entries) != 0) { ++ return -1; ++ } ++ ++ if (opts->tpoint_group_mask == NULL) { ++ return 0; ++ } ++ ++ tpoint_group_mask_str = strdup(opts->tpoint_group_mask); ++ if (tpoint_group_mask_str == NULL) { ++ SPDK_ERRLOG("Unable to get string of tpoint group mask from opts.\n"); ++ return -1; ++ } ++ /* Save a pointer to the original value of the tpoint group mask string ++ * to free later, because spdk_strsepq() modifies given char*. */ ++ tp_g_str = tpoint_group_mask_str; ++ while ((tpoint_group_str = spdk_strsepq(&tpoint_group_mask_str, ",")) != NULL) { ++ if (strchr(tpoint_group_str, ':')) { ++ /* Get the tpoint group mask */ ++ tpoint_group = spdk_strsepq(&tpoint_group_str, ":"); ++ /* Get the tpoint mask inside that group */ ++ tpoints = spdk_strsepq(&tpoint_group_str, ":"); ++ ++ errno = 0; ++ tpoint_group_mask = strtoull(tpoint_group, &end, 16); ++ if (*end != '\0' || errno) { ++ tpoint_group_mask = spdk_trace_create_tpoint_group_mask(tpoint_group); ++ if (tpoint_group_mask == 0) { ++ error_found = true; ++ break; ++ } ++ } ++ /* Check if tpoint group mask has only one bit set. ++ * This is to avoid enabling individual tpoints in ++ * more than one tracepoint group at once. 
*/ ++ if (!spdk_u64_is_pow2(tpoint_group_mask)) { ++ SPDK_ERRLOG("Tpoint group mask: %s contains multiple tpoint groups.\n", tpoint_group); ++ SPDK_ERRLOG("This is not supported, to prevent from activating tpoints by mistake.\n"); ++ error_found = true; ++ break; ++ } ++ ++ errno = 0; ++ tpoint_mask = strtoull(tpoints, &end, 16); ++ if (*end != '\0' || errno) { ++ error_found = true; ++ break; ++ } ++ } else { ++ errno = 0; ++ tpoint_group_mask = strtoull(tpoint_group_str, &end, 16); ++ if (*end != '\0' || errno) { ++ tpoint_group_mask = spdk_trace_create_tpoint_group_mask(tpoint_group_str); ++ if (tpoint_group_mask == 0) { ++ error_found = true; ++ break; ++ } ++ } ++ tpoint_mask = -1ULL; ++ } ++ ++ for (group_id = 0; group_id < SPDK_TRACE_MAX_GROUP_ID; ++group_id) { ++ if (tpoint_group_mask & (1 << group_id)) { ++ spdk_trace_set_tpoints(group_id, tpoint_mask); ++ } ++ } ++ } ++ ++ if (error_found) { ++ SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask); ++ free(tp_g_str); ++ return -1; ++ } else { ++ SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask); ++ SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n", ++ opts->name, ++ opts->shm_id >= 0 ? "-i" : "-p", ++ opts->shm_id >= 0 ? opts->shm_id : getpid()); ++#if defined(__linux__) ++ SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name); ++#endif ++ } ++ free(tp_g_str); ++ ++ return 0; ++} ++ ++static void ++bootstrap_fn(void *arg1) ++{ ++ int rc; ++ ++ if (g_spdk_app.json_config_file) { ++ g_delay_subsystem_init = false; ++ spdk_subsystem_init_from_json_config(g_spdk_app.json_config_file, g_spdk_app.rpc_addr, ++ app_start_rpc, ++ NULL, !g_spdk_app.json_config_ignore_errors); ++ } else { ++ if (!g_delay_subsystem_init) { ++ spdk_subsystem_init(app_start_rpc, NULL); ++ } else { ++ spdk_rpc_set_allowlist(g_spdk_app.rpc_allowlist); ++ ++ rc = spdk_rpc_initialize(g_spdk_app.rpc_addr, RPC_SELECT_INTERVAL); ++ if (rc) { ++ spdk_app_stop(rc); ++ return; ++ } ++ } ++ } ++} ++ ++static void ++app_copy_opts(struct spdk_app_opts *opts, struct spdk_app_opts *opts_user, size_t opts_size) ++{ ++ spdk_app_opts_init(opts, sizeof(*opts)); ++ opts->opts_size = opts_size; ++ ++#define SET_FIELD(field) \ ++ if (offsetof(struct spdk_app_opts, field) + sizeof(opts->field) <= (opts->opts_size)) { \ ++ opts->field = opts_user->field; \ ++ } \ ++ ++ SET_FIELD(name); ++ SET_FIELD(json_config_file); ++ SET_FIELD(json_config_ignore_errors); ++ SET_FIELD(rpc_addr); ++ SET_FIELD(reactor_mask); ++ SET_FIELD(tpoint_group_mask); ++ SET_FIELD(shm_id); ++ SET_FIELD(shutdown_cb); ++ SET_FIELD(enable_coredump); ++ SET_FIELD(mem_channel); ++ SET_FIELD(main_core); ++ SET_FIELD(mem_size); ++ SET_FIELD(no_pci); ++ SET_FIELD(hugepage_single_segments); ++ SET_FIELD(unlink_hugepage); ++ SET_FIELD(hugedir); ++ SET_FIELD(print_level); ++ SET_FIELD(num_pci_addr); ++ SET_FIELD(pci_blocked); ++ SET_FIELD(pci_allowed); ++ SET_FIELD(iova_mode); ++ SET_FIELD(delay_subsystem_init); ++ SET_FIELD(num_entries); ++ SET_FIELD(env_context); ++ SET_FIELD(log); ++ SET_FIELD(base_virtaddr); ++ SET_FIELD(disable_signal_handlers); ++ SET_FIELD(hot_restart); ++ SET_FIELD(msg_mempool_size); ++ SET_FIELD(rpc_allowlist); ++ SET_FIELD(vf_token); ++ ++ /* You should not remove this statement, but need to update the assert statement ++ * if you add a new field, and also add a corresponding SET_FIELD statement */ ++ SPDK_STATIC_ASSERT(sizeof(struct spdk_app_opts) == 216, "Incorrect size"); ++ ++#undef 
SET_FIELD ++} ++ ++static void ++unclaim_cpu_cores(void) ++{ ++ char core_name[40]; ++ uint32_t i; ++ ++ for (i = 0; i < MAX_CPU_CORES; i++) { ++ if (g_core_locks[i] != -1) { ++ snprintf(core_name, sizeof(core_name), "/var/tmp/spdk_cpu_lock_%03d", i); ++ close(g_core_locks[i]); ++ g_core_locks[i] = -1; ++ unlink(core_name); ++ } ++ } ++} ++ ++static int ++claim_cpu_cores(uint32_t *failed_core) ++{ ++ char core_name[40]; ++ int core_fd, pid; ++ int *core_map; ++ uint32_t core; ++ ++ struct flock core_lock = { ++ .l_type = F_WRLCK, ++ .l_whence = SEEK_SET, ++ .l_start = 0, ++ .l_len = 0, ++ }; ++ ++ SPDK_ENV_FOREACH_CORE(core) { ++ if (g_core_locks[core] != -1) { ++ /* If this core is locked already, do not try lock it again. */ ++ continue; ++ } ++ ++ snprintf(core_name, sizeof(core_name), "/var/tmp/spdk_cpu_lock_%03d", core); ++ core_fd = open(core_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); ++ if (core_fd == -1) { ++ SPDK_ERRLOG("Could not open %s (%s).\n", core_name, spdk_strerror(errno)); ++ /* Return number of core we failed to claim. */ ++ goto error; ++ } ++ ++ if (ftruncate(core_fd, sizeof(int)) != 0) { ++ SPDK_ERRLOG("Could not truncate %s (%s).\n", core_name, spdk_strerror(errno)); ++ close(core_fd); ++ goto error; ++ } ++ ++ core_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED, core_fd, 0); ++ if (core_map == MAP_FAILED) { ++ SPDK_ERRLOG("Could not mmap core %s (%s).\n", core_name, spdk_strerror(errno)); ++ close(core_fd); ++ goto error; ++ } ++ ++ if (fcntl(core_fd, F_SETLK, &core_lock) != 0) { ++ pid = *core_map; ++ SPDK_ERRLOG("Cannot create lock on core %" PRIu32 ", probably process %d has claimed it.\n", ++ core, pid); ++ munmap(core_map, sizeof(int)); ++ close(core_fd); ++ goto error; ++ } ++ ++ /* We write the PID to the core lock file so that other processes trying ++ * to claim the same core will know what process is holding the lock. */ ++ *core_map = (int)getpid(); ++ munmap(core_map, sizeof(int)); ++ g_core_locks[core] = core_fd; ++ /* Keep core_fd open to maintain the lock. */ ++ } ++ ++ return 0; ++ ++error: ++ if (failed_core != NULL) { ++ /* Set number of core we failed to claim. 
*/ ++ *failed_core = core; ++ } ++ unclaim_cpu_cores(); ++ return -1; ++} ++ ++int ++spdk_app_start(struct spdk_app_opts *opts_user, spdk_msg_fn start_fn, ++ void *arg1) ++{ ++ int rc; ++ char *tty; ++ struct spdk_cpuset tmp_cpumask = {}; ++ static bool g_env_was_setup = false; ++ struct spdk_app_opts opts_local = {}; ++ struct spdk_app_opts *opts = &opts_local; ++ uint32_t i; ++ ++ if (!opts_user) { ++ SPDK_ERRLOG("opts_user should not be NULL\n"); ++ return 1; ++ } ++ ++ if (!opts_user->opts_size) { ++ SPDK_ERRLOG("The opts_size in opts_user structure should not be zero value\n"); ++ return 1; ++ } ++ ++ if (opts_user->name == NULL) { ++ SPDK_ERRLOG("spdk_app_opts::name not specified\n"); ++ return 1; ++ } ++ ++ app_copy_opts(opts, opts_user, opts_user->opts_size); ++ ++ if (!start_fn) { ++ SPDK_ERRLOG("start_fn should not be NULL\n"); ++ return 1; ++ } ++ ++ tty = ttyname(STDERR_FILENO); ++ if (opts->print_level > SPDK_LOG_WARN && ++ isatty(STDERR_FILENO) && ++ tty && ++ !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) { ++ printf("Warning: printing stderr to console terminal without -q option specified.\n"); ++ printf("Suggest using --silence-noticelog to disable logging to stderr and\n"); ++ printf("monitor syslog, or redirect stderr to a file.\n"); ++ printf("(Delaying for 10 seconds...)\n"); ++ sleep(10); ++ } ++ ++ spdk_log_set_print_level(opts->print_level); ++ ++#ifndef SPDK_NO_RLIMIT ++ if (opts->enable_coredump) { ++ struct rlimit core_limits; ++ ++ core_limits.rlim_cur = core_limits.rlim_max = SPDK_APP_DEFAULT_CORE_LIMIT; ++ setrlimit(RLIMIT_CORE, &core_limits); ++ } ++#endif ++ ++ memset(&g_spdk_app, 0, sizeof(g_spdk_app)); ++ g_spdk_app.json_config_file = opts->json_config_file; ++ g_spdk_app.json_config_ignore_errors = opts->json_config_ignore_errors; ++ g_spdk_app.rpc_addr = opts->rpc_addr; ++ g_spdk_app.rpc_allowlist = opts->rpc_allowlist; ++ g_spdk_app.shm_id = opts->shm_id; ++ g_spdk_app.shutdown_cb = opts->shutdown_cb; ++ g_spdk_app.rc = 0; ++ g_spdk_app.stopped = false; ++ ++ spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); ++ ++ /* Pass NULL to app_setup_env if SPDK app has been set up, in order to ++ * indicate that this is a reinitialization. ++ */ ++ if (app_setup_env(g_env_was_setup ? NULL : opts) < 0) { ++ return 1; ++ } ++ ++ spdk_log_open(opts->log); ++ ++ /* Initialize each lock to -1 to indicate "empty" status */ ++ for (i = 0; i < MAX_CPU_CORES; i++) { ++ g_core_locks[i] = -1; ++ } ++ ++ if (!g_disable_cpumask_locks) { ++ if (claim_cpu_cores(NULL)) { ++ SPDK_ERRLOG("Unable to acquire lock on assigned core mask - exiting.\n"); ++ return 1; ++ } ++ } else { ++ SPDK_NOTICELOG("CPU core locks deactivated.\n"); ++ } ++ ++ SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count()); ++ ++ if ((rc = spdk_reactors_init(opts->msg_mempool_size)) != 0) { ++ SPDK_ERRLOG("Reactor Initialization failed: rc = %d\n", rc); ++ return 1; ++ } ++ ++ spdk_cpuset_set_cpu(&tmp_cpumask, spdk_env_get_current_core(), true); ++ ++ /* Now that the reactors have been initialized, we can create an ++ * initialization thread. */ ++ g_app_thread = spdk_thread_create("app_thread", &tmp_cpumask); ++ if (!g_app_thread) { ++ SPDK_ERRLOG("Unable to create an spdk_thread for initialization\n"); ++ return 1; ++ } ++ ++ /* ++ * Disable and ignore trace setup if setting num_entries ++ * to be 0. ++ * ++ * Note the call to app_setup_trace() is located here ++ * ahead of app_setup_signal_handlers(). 
++ * That's because there is not an easy/direct clean ++ * way of unwinding alloc'd resources that can occur ++ * in app_setup_signal_handlers(). ++ */ ++ if (opts->num_entries != 0 && app_setup_trace(opts) != 0) { ++ return 1; ++ } ++ ++ if (!opts->disable_signal_handlers && app_setup_signal_handlers(opts) != 0) { ++ return 1; ++ } ++ ++ g_delay_subsystem_init = opts->delay_subsystem_init; ++ g_start_fn = start_fn; ++ g_start_arg = arg1; ++ ++ spdk_thread_send_msg(g_app_thread, bootstrap_fn, NULL); ++ ++ /* This blocks until spdk_app_stop is called */ ++ spdk_reactors_start(); ++ ++ g_env_was_setup = true; ++ ++ return g_spdk_app.rc; ++} ++ ++void ++spdk_app_fini(void) ++{ ++ spdk_trace_cleanup(); ++ spdk_reactors_fini(); ++ spdk_env_fini(); ++ spdk_log_close(); ++ unclaim_cpu_cores(); ++} ++ ++static void ++_start_subsystem_fini(void *arg1) ++{ ++ if (g_scheduling_in_progress) { ++ spdk_thread_send_msg(g_app_thread, _start_subsystem_fini, NULL); ++ return; ++ } ++ ++ spdk_subsystem_fini(spdk_reactors_stop, NULL); ++} ++ ++static int ++log_deprecation_hits(void *ctx, struct spdk_deprecation *dep) ++{ ++ uint64_t hits = spdk_deprecation_get_hits(dep); ++ ++ if (hits == 0) { ++ return 0; ++ } ++ ++ SPDK_WARNLOG("%s: deprecation '%s' scheduled for removal in %s hit %" PRIu64 " times\n", ++ spdk_deprecation_get_tag(dep), spdk_deprecation_get_description(dep), ++ spdk_deprecation_get_remove_release(dep), hits); ++ return 0; ++} ++ ++static void ++app_stop(void *arg1) ++{ ++ if (g_spdk_app.rc == 0) { ++ g_spdk_app.rc = (int)(intptr_t)arg1; ++ } ++ ++ if (g_spdk_app.stopped) { ++ SPDK_NOTICELOG("spdk_app_stop called twice\n"); ++ return; ++ } ++ ++ spdk_rpc_finish(); ++ g_spdk_app.stopped = true; ++ spdk_log_for_each_deprecation(NULL, log_deprecation_hits); ++ _start_subsystem_fini(NULL); ++} ++ ++void ++spdk_app_stop(int rc) ++{ ++ if (rc) { ++ SPDK_WARNLOG("spdk_app_stop'd on non-zero\n"); ++ } ++ ++ /* ++ * We want to run spdk_subsystem_fini() from the same thread where spdk_subsystem_init() ++ * was called. ++ */ ++ spdk_thread_send_msg(g_app_thread, app_stop, (void *)(intptr_t)rc); ++} ++ ++struct spdk_thread * ++_spdk_get_app_thread(void) ++{ ++ return g_app_thread; ++} ++ ++static void ++usage(void (*app_usage)(void)) ++{ ++ printf("%s [options]\n", g_executable_name); ++ printf("options:\n"); ++ printf(" -c, --config JSON config file (default %s)\n", ++ g_default_opts.json_config_file != NULL ? g_default_opts.json_config_file : "none"); ++ printf(" --json JSON config file (default %s)\n", ++ g_default_opts.json_config_file != NULL ? 
g_default_opts.json_config_file : "none"); ++ printf(" --json-ignore-init-errors\n"); ++ printf(" don't exit on invalid config entry\n"); ++ printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n"); ++ printf(" -g, --single-file-segments\n"); ++ printf(" force creating just one hugetlbfs file\n"); ++ printf(" -h, --help show this usage\n"); ++ printf(" -i, --shm-id shared memory ID (optional)\n"); ++ printf(" -m, --cpumask core mask (like 0xF) or core list of '[]' embraced (like [0,1,10]) for DPDK\n"); ++ printf(" -n, --mem-channels channel number of memory channels used for DPDK\n"); ++ printf(" -p, --main-core main (primary) core for DPDK\n"); ++ printf(" -r, --rpc-socket RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR); ++ printf(" -s, --mem-size memory size in MB for DPDK (default: "); ++#ifndef __linux__ ++ if (g_default_opts.mem_size <= 0) { ++ printf("all hugepage memory)\n"); ++ } else ++#endif ++ { ++ printf("%dMB)\n", g_default_opts.mem_size >= 0 ? g_default_opts.mem_size : 0); ++ } ++ printf(" --disable-cpumask-locks Disable CPU core lock files.\n"); ++ printf(" --silence-noticelog disable notice level logging to stderr\n"); ++ printf(" --msg-mempool-size global message memory pool size in count (default: %d)\n", ++ SPDK_DEFAULT_MSG_MEMPOOL_SIZE); ++ printf(" -u, --no-pci disable PCI access\n"); ++ printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n"); ++ printf(" --max-delay maximum reactor delay (in microseconds)\n"); ++ printf(" -B, --pci-blocked \n"); ++ printf(" pci addr to block (can be used more than once)\n"); ++ printf(" -R, --huge-unlink unlink huge files after initialization\n"); ++ printf(" -v, --version print SPDK version\n"); ++ printf(" -A, --pci-allowed \n"); ++ printf(" pci addr to allow (-B and -A cannot be used at the same time)\n"); ++ printf(" --huge-dir use a specific hugetlbfs mount to reserve memory from\n"); ++ printf(" --iova-mode set IOVA mode ('pa' for IOVA_PA and 'va' for IOVA_VA)\n"); ++ printf(" --base-virtaddr the base virtual address for DPDK (default: 0x200000000000)\n"); ++ printf(" --num-trace-entries number of trace entries for each core, must be power of 2, setting 0 to disable trace (default %d)\n", ++ SPDK_APP_DEFAULT_NUM_TRACE_ENTRIES); ++ printf(" --rpcs-allowed comma-separated list of permitted RPCS\n"); ++ printf(" --env-context Opaque context for use of the env implementation\n"); ++ printf(" --vfio-vf-token VF token (UUID) shared between SR-IOV PF and VFs for vfio_pci driver\n"); ++ printf(" --hot-restart enable hot restart\n"); ++ spdk_log_usage(stdout, "-L"); ++ spdk_trace_mask_usage(stdout, "-e"); ++ if (app_usage) { ++ app_usage(); ++ } ++} ++ ++spdk_app_parse_args_rvals_t ++spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, ++ const char *app_getopt_str, const struct option *app_long_opts, ++ int (*app_parse)(int ch, char *arg), ++ void (*app_usage)(void)) ++{ ++ int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len; ++ struct option *cmdline_options; ++ char *cmdline_short_opts = NULL; ++ char *shm_id_str = NULL; ++ enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL; ++ long int tmp; ++ ++ memcpy(&g_default_opts, opts, sizeof(g_default_opts)); ++ ++ if (opts->json_config_file && access(opts->json_config_file, R_OK) != 0) { ++ SPDK_WARNLOG("Can't read JSON configuration file '%s'\n", opts->json_config_file); ++ opts->json_config_file = NULL; ++ } ++ ++ if (app_long_opts == NULL) { ++ app_long_opts_len = 0; ++ } else { ++ for (app_long_opts_len 
= 0; ++ app_long_opts[app_long_opts_len].name != NULL; ++ app_long_opts_len++); ++ } ++ ++ global_long_opts_len = SPDK_COUNTOF(g_cmdline_options); ++ ++ cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options)); ++ if (!cmdline_options) { ++ SPDK_ERRLOG("Out of memory\n"); ++ return SPDK_APP_PARSE_ARGS_FAIL; ++ } ++ ++ memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options)); ++ if (app_long_opts) { ++ memcpy(&cmdline_options[global_long_opts_len], app_long_opts, ++ app_long_opts_len * sizeof(*app_long_opts)); ++ } ++ ++ if (app_getopt_str != NULL) { ++ ch = app_opts_validate(app_getopt_str); ++ if (ch) { ++ SPDK_ERRLOG("Duplicated option '%c' between the generic and application specific spdk opts.\n", ++ ch); ++ goto out; ++ } ++ } ++ ++ cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING); ++ if (!cmdline_short_opts) { ++ SPDK_ERRLOG("Out of memory\n"); ++ goto out; ++ } ++ ++ g_executable_name = argv[0]; ++ ++ while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) { ++ switch (ch) { ++ case CONFIG_FILE_OPT_IDX: ++ case JSON_CONFIG_OPT_IDX: ++ opts->json_config_file = optarg; ++ break; ++ case JSON_CONFIG_IGNORE_INIT_ERRORS_IDX: ++ opts->json_config_ignore_errors = true; ++ break; ++ case LIMIT_COREDUMP_OPT_IDX: ++ opts->enable_coredump = false; ++ break; ++ case TPOINT_GROUP_OPT_IDX: ++ opts->tpoint_group_mask = optarg; ++ break; ++ case SINGLE_FILE_SEGMENTS_OPT_IDX: ++ opts->hugepage_single_segments = true; ++ break; ++ case HELP_OPT_IDX: ++ usage(app_usage); ++ retval = SPDK_APP_PARSE_ARGS_HELP; ++ goto out; ++ case SHM_ID_OPT_IDX: ++ shm_id_str = optarg; ++ /* a negative shm-id disables shared configuration file */ ++ if (optarg[0] == '-') { ++ shm_id_str++; ++ } ++ /* check if the positive value of provided shm_id can be parsed as ++ * an integer ++ */ ++ opts->shm_id = spdk_strtol(shm_id_str, 0); ++ if (opts->shm_id < 0) { ++ SPDK_ERRLOG("Invalid shared memory ID %s\n", optarg); ++ goto out; ++ } ++ if (optarg[0] == '-') { ++ opts->shm_id = -opts->shm_id; ++ } ++ break; ++ case CPUMASK_OPT_IDX: ++ opts->reactor_mask = optarg; ++ break; ++ case DISABLE_CPUMASK_LOCKS_OPT_IDX: ++ g_disable_cpumask_locks = true; ++ break; ++ case MEM_CHANNELS_OPT_IDX: ++ opts->mem_channel = spdk_strtol(optarg, 0); ++ if (opts->mem_channel < 0) { ++ SPDK_ERRLOG("Invalid memory channel %s\n", optarg); ++ goto out; ++ } ++ break; ++ case MAIN_CORE_OPT_IDX: ++ opts->main_core = spdk_strtol(optarg, 0); ++ if (opts->main_core < 0) { ++ SPDK_ERRLOG("Invalid main core %s\n", optarg); ++ goto out; ++ } ++ break; ++ case SILENCE_NOTICELOG_OPT_IDX: ++ opts->print_level = SPDK_LOG_WARN; ++ break; ++ case RPC_SOCKET_OPT_IDX: ++ opts->rpc_addr = optarg; ++ break; ++ case MEM_SIZE_OPT_IDX: { ++ uint64_t mem_size_mb; ++ bool mem_size_has_prefix; ++ ++ rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix); ++ if (rc != 0) { ++ SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); ++ usage(app_usage); ++ goto out; ++ } ++ ++ if (mem_size_has_prefix) { ++ /* the mem size is in MB by default, so if a prefix was ++ * specified, we need to manually convert to MB. 
++ */ ++ mem_size_mb /= 1024 * 1024; ++ } ++ ++ if (mem_size_mb > INT_MAX) { ++ SPDK_ERRLOG("invalid memory pool size `-s %s`\n", optarg); ++ usage(app_usage); ++ goto out; ++ } ++ ++ opts->mem_size = (int) mem_size_mb; ++ break; ++ } ++ case MSG_MEMPOOL_SIZE_OPT_IDX: ++ tmp = spdk_strtol(optarg, 10); ++ if (tmp <= 0) { ++ SPDK_ERRLOG("Invalid message memory pool size %s\n", optarg); ++ goto out; ++ } ++ ++ opts->msg_mempool_size = (size_t)tmp; ++ break; ++ ++ case NO_PCI_OPT_IDX: ++ opts->no_pci = true; ++ break; ++ case WAIT_FOR_RPC_OPT_IDX: ++ opts->delay_subsystem_init = true; ++ break; ++ case PCI_BLOCKED_OPT_IDX: ++ if (opts->pci_allowed) { ++ free(opts->pci_allowed); ++ opts->pci_allowed = NULL; ++ SPDK_ERRLOG("-B and -A cannot be used at the same time\n"); ++ usage(app_usage); ++ goto out; ++ } ++ ++ rc = app_opts_add_pci_addr(opts, &opts->pci_blocked, optarg); ++ if (rc != 0) { ++ free(opts->pci_blocked); ++ opts->pci_blocked = NULL; ++ goto out; ++ } ++ break; ++ case LOGFLAG_OPT_IDX: ++ rc = spdk_log_set_flag(optarg); ++ if (rc < 0) { ++ SPDK_ERRLOG("unknown flag\n"); ++ usage(app_usage); ++ goto out; ++ } ++#ifdef DEBUG ++ opts->print_level = SPDK_LOG_DEBUG; ++#endif ++ break; ++ case HUGE_UNLINK_OPT_IDX: ++ opts->unlink_hugepage = true; ++ break; ++ case PCI_WHITELIST_OPT_IDX: ++ SPDK_WARNLOG("-W/--pci-whitelist is deprecated. Use -A/--pci-allowed.\n"); ++ /* fallthrough */ ++ case PCI_ALLOWED_OPT_IDX: ++ if (opts->pci_blocked) { ++ free(opts->pci_blocked); ++ opts->pci_blocked = NULL; ++ SPDK_ERRLOG("-B and -W cannot be used at the same time\n"); ++ usage(app_usage); ++ goto out; ++ } ++ ++ rc = app_opts_add_pci_addr(opts, &opts->pci_allowed, optarg); ++ if (rc != 0) { ++ free(opts->pci_allowed); ++ opts->pci_allowed = NULL; ++ goto out; ++ } ++ break; ++ case BASE_VIRTADDR_OPT_IDX: ++ tmp = spdk_strtoll(optarg, 0); ++ if (tmp <= 0) { ++ SPDK_ERRLOG("Invalid base-virtaddr %s\n", optarg); ++ usage(app_usage); ++ goto out; ++ } ++ opts->base_virtaddr = (uint64_t)tmp; ++ break; ++ case HUGE_DIR_OPT_IDX: ++ opts->hugedir = optarg; ++ break; ++ case IOVA_MODE_OPT_IDX: ++ opts->iova_mode = optarg; ++ break; ++ case NUM_TRACE_ENTRIES_OPT_IDX: ++ tmp = spdk_strtoll(optarg, 0); ++ if (tmp < 0) { ++ SPDK_ERRLOG("Invalid num-trace-entries %s\n", optarg); ++ usage(app_usage); ++ goto out; ++ } ++ opts->num_entries = (uint64_t)tmp; ++ if (opts->num_entries > 0 && opts->num_entries & (opts->num_entries - 1)) { ++ SPDK_ERRLOG("num-trace-entries must be power of 2\n"); ++ usage(app_usage); ++ goto out; ++ } ++ break; ++ case MAX_REACTOR_DELAY_OPT_IDX: ++ SPDK_ERRLOG("Deprecation warning: The maximum allowed latency parameter is no longer supported.\n"); ++ break; ++ case ENV_CONTEXT_OPT_IDX: ++ opts->env_context = optarg; ++ break; ++ case RPCS_ALLOWED_OPT_IDX: ++ opts->rpc_allowlist = (const char **)spdk_strarray_from_string(optarg, ","); ++ if (opts->rpc_allowlist == NULL) { ++ SPDK_ERRLOG("Invalid --rpcs-allowed argument\n"); ++ usage(app_usage); ++ goto out; ++ } ++ break; ++ case ENV_VF_TOKEN_OPT_IDX: ++ opts->vf_token = optarg; ++ break; ++ case VERSION_OPT_IDX: ++ printf(SPDK_VERSION_STRING"\n"); ++ retval = SPDK_APP_PARSE_ARGS_HELP; ++ goto out; ++ case HOT_RESTART_OPT_IDX: ++ opts->hot_restart = true; ++ break; ++ case '?': ++ /* ++ * In the event getopt() above detects an option ++ * in argv that is NOT in the getopt_str, ++ * getopt() will return a '?' indicating failure. 
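For context, a minimal sketch of how an application might feed its own options into spdk_app_parse_args() as parsed in the hunk above; the --foo option, my_parse() and my_usage() names here are hypothetical and not part of this patch.

#include <getopt.h>
#include <stdio.h>
#include <errno.h>
/* spdk_app_parse_args() and struct spdk_app_opts come from "spdk/event.h". */

static struct option g_my_long_opts[] = {
	{"foo", required_argument, NULL, 'f'},   /* hypothetical application option */
	{NULL, 0, NULL, 0}
};

static int
my_parse(int ch, char *arg)
{
	switch (ch) {
	case 'f':
		/* record the hypothetical --foo argument here */
		return 0;
	default:
		return -EINVAL;
	}
}

static void
my_usage(void)
{
	printf(" -f, --foo <val>          hypothetical application option\n");
}

/* In main(), after spdk_app_opts_init(&opts, sizeof(opts)):
 *   spdk_app_parse_args(argc, argv, &opts, "f:", g_my_long_opts, my_parse, my_usage);
 */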
++ */ ++ usage(app_usage); ++ goto out; ++ default: ++ rc = app_parse(ch, optarg); ++ if (rc) { ++ SPDK_ERRLOG("Parsing application specific arguments failed: %d\n", rc); ++ goto out; ++ } ++ } ++ } ++ ++ if (opts->json_config_file && opts->delay_subsystem_init) { ++ SPDK_ERRLOG("JSON configuration file can't be used together with --wait-for-rpc.\n"); ++ goto out; ++ } ++ ++ retval = SPDK_APP_PARSE_ARGS_SUCCESS; ++out: ++ if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) { ++ free(opts->pci_blocked); ++ opts->pci_blocked = NULL; ++ free(opts->pci_allowed); ++ opts->pci_allowed = NULL; ++ spdk_strarray_free((char **)opts->rpc_allowlist); ++ opts->rpc_allowlist = NULL; ++ } ++ free(cmdline_short_opts); ++ free(cmdline_options); ++ return retval; ++} ++ ++void ++spdk_app_usage(void) ++{ ++ if (g_executable_name == NULL) { ++ SPDK_ERRLOG("%s not valid before calling spdk_app_parse_args()\n", __func__); ++ return; ++ } ++ ++ usage(NULL); ++} ++ ++static void ++rpc_framework_start_init_cpl(int rc, void *arg1) ++{ ++ struct spdk_jsonrpc_request *request = arg1; ++ ++ assert(spdk_get_thread() == g_app_thread); ++ ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "framework_initialization failed"); ++ return; ++ } ++ ++ spdk_rpc_set_state(SPDK_RPC_RUNTIME); ++ app_start_application(); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++ ++static void ++rpc_framework_start_init(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ if (params != NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "framework_start_init requires no parameters"); ++ return; ++ } ++ ++ spdk_subsystem_init(rpc_framework_start_init_cpl, request); ++} ++SPDK_RPC_REGISTER("framework_start_init", rpc_framework_start_init, SPDK_RPC_STARTUP) ++ ++struct subsystem_init_poller_ctx { ++ struct spdk_poller *init_poller; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static int ++rpc_subsystem_init_poller_ctx(void *ctx) ++{ ++ struct subsystem_init_poller_ctx *poller_ctx = ctx; ++ ++ if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { ++ spdk_jsonrpc_send_bool_response(poller_ctx->request, true); ++ spdk_poller_unregister(&poller_ctx->init_poller); ++ free(poller_ctx); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++rpc_framework_wait_init(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct subsystem_init_poller_ctx *ctx; ++ ++ if (spdk_rpc_get_state() == SPDK_RPC_RUNTIME) { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } else { ++ ctx = malloc(sizeof(struct subsystem_init_poller_ctx)); ++ if (ctx == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Unable to allocate memory for the request context\n"); ++ return; ++ } ++ ctx->request = request; ++ ctx->init_poller = SPDK_POLLER_REGISTER(rpc_subsystem_init_poller_ctx, ctx, 0); ++ } ++} ++SPDK_RPC_REGISTER("framework_wait_init", rpc_framework_wait_init, ++ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) ++ ++static void ++rpc_framework_disable_cpumask_locks(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ if (params != NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "framework_disable_cpumask_locks" ++ "requires no arguments"); ++ return; ++ } ++ ++ unclaim_cpu_cores(); ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("framework_disable_cpumask_locks", 
rpc_framework_disable_cpumask_locks, ++ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) ++ ++static void ++rpc_framework_enable_cpumask_locks(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ char msg[128]; ++ int rc; ++ uint32_t failed_core; ++ ++ if (params != NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "framework_enable_cpumask_locks" ++ "requires no arguments"); ++ return; ++ } ++ ++ rc = claim_cpu_cores(&failed_core); ++ if (rc) { ++ snprintf(msg, sizeof(msg), "Failed to claim CPU core: %" PRIu32, failed_core); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("framework_enable_cpumask_locks", rpc_framework_enable_cpumask_locks, ++ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +diff --git a/lib/event/spdk_event.map b/lib/event/spdk_event.map +index 2d6d0dd..25b3a64 100644 +--- a/lib/event/spdk_event.map ++++ b/lib/event/spdk_event.map +@@ -1,40 +1,41 @@ +-{ +- global: +- +- # Public functions +- spdk_app_opts_init; +- spdk_app_start; +- spdk_app_fini; +- spdk_app_start_shutdown; +- spdk_app_stop; +- spdk_app_get_shm_id; +- spdk_app_parse_core_mask; +- spdk_app_get_core_mask; +- spdk_app_parse_args; +- spdk_app_usage; +- spdk_event_allocate; +- spdk_event_call; +- spdk_framework_enable_context_switch_monitor; +- spdk_framework_context_switch_monitor_enabled; +- +- # Public scheduler functions +- spdk_scheduler_set; +- spdk_scheduler_get; +- spdk_scheduler_register; +- spdk_scheduler_set_period; +- spdk_scheduler_get_period; +- spdk_governor_set; +- spdk_governor_get; +- spdk_governor_register; +- +- # Functions used by other SPDK libraries +- spdk_reactors_init; +- spdk_reactors_fini; +- spdk_reactors_start; +- spdk_reactors_stop; +- spdk_reactor_get; +- spdk_for_each_reactor; +- spdk_reactor_set_interrupt_mode; +- +- local: *; +-}; ++{ ++ global: ++ ++ # Public functions ++ spdk_app_opts_init; ++ spdk_app_start; ++ spdk_app_fini; ++ spdk_app_start_shutdown; ++ spdk_app_stop; ++ spdk_app_get_shm_id; ++ spdk_app_parse_core_mask; ++ spdk_app_get_core_mask; ++ spdk_app_parse_args; ++ spdk_app_usage; ++ spdk_event_allocate; ++ spdk_event_call; ++ spdk_framework_enable_context_switch_monitor; ++ spdk_framework_context_switch_monitor_enabled; ++ spdk_get_shutdown_sig_received; ++ ++ # Public scheduler functions ++ spdk_scheduler_set; ++ spdk_scheduler_get; ++ spdk_scheduler_register; ++ spdk_scheduler_set_period; ++ spdk_scheduler_get_period; ++ spdk_governor_set; ++ spdk_governor_get; ++ spdk_governor_register; ++ ++ # Functions used by other SPDK libraries ++ spdk_reactors_init; ++ spdk_reactors_fini; ++ spdk_reactors_start; ++ spdk_reactors_stop; ++ spdk_reactor_get; ++ spdk_for_each_reactor; ++ spdk_reactor_set_interrupt_mode; ++ ++ local: *; ++}; +diff --git a/lib/init/json_config.c b/lib/init/json_config.c +index 0d39506..d69bf2c 100644 +--- a/lib/init/json_config.c ++++ b/lib/init/json_config.c +@@ -1,640 +1,668 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/init.h" +-#include "spdk/util.h" +-#include "spdk/file.h" +-#include "spdk/log.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/jsonrpc.h" +-#include "spdk/rpc.h" +-#include "spdk/string.h" +- +-#include "spdk_internal/event.h" +- +-#define SPDK_DEBUG_APP_CFG(...) SPDK_DEBUGLOG(app_config, __VA_ARGS__) +- +-/* JSON configuration format is as follows +- * +- * { +- * "subsystems" : [ <<== *subsystems JSON array +- * { <<== *subsystems_it array entry pointer (iterator) +- * "subsystem": "<< SUBSYSTEM NAME >>", +- * "config": [ <<== *config JSON array +- * { <<== *config_it array entry pointer (iterator) +- * "method": "<< METHOD NAME >>", <<== *method +- * "params": { << PARAMS >> } <<== *params +- * }, +- * << MORE "config" ARRY ENTRIES >> +- * ] +- * }, +- * << MORE "subsystems" ARRAY ENTRIES >> +- * ] +- * +- * << ANYTHING ELSE IS IGNORED IN ROOT OBJECT>> +- * } +- * +- */ +- +-struct load_json_config_ctx; +-typedef void (*client_resp_handler)(struct load_json_config_ctx *, +- struct spdk_jsonrpc_client_response *); +- +-#define RPC_SOCKET_PATH_MAX SPDK_SIZEOF_MEMBER(struct sockaddr_un, sun_path) +- +-/* 1s connections timeout */ +-#define RPC_CLIENT_CONNECT_TIMEOUT_US (1U * 1000U * 1000U) +- +-/* +- * Currently there is no timeout in SPDK for any RPC command. This result that +- * we can't put a hard limit during configuration load as it most likely randomly fail. +- * So just print WARNLOG every 10s. */ +-#define RPC_CLIENT_REQUEST_TIMEOUT_US (10U * 1000 * 1000) +- +-struct load_json_config_ctx { +- /* Thread used during configuration. */ +- struct spdk_thread *thread; +- spdk_subsystem_init_fn cb_fn; +- void *cb_arg; +- bool stop_on_error; +- +- /* Current subsystem */ +- struct spdk_json_val *subsystems; /* "subsystems" array */ +- struct spdk_json_val *subsystems_it; /* current subsystem array position in "subsystems" array */ +- +- struct spdk_json_val *subsystem_name; /* current subsystem name */ +- +- /* Current "config" entry we are processing */ +- struct spdk_json_val *config; /* "config" array */ +- struct spdk_json_val *config_it; /* current config position in "config" array */ +- +- /* Current request id we are sending. */ +- uint32_t rpc_request_id; +- +- /* Whole configuration file read and parsed. */ +- size_t json_data_size; +- char *json_data; +- +- size_t values_cnt; +- struct spdk_json_val *values; +- +- char rpc_socket_path_temp[RPC_SOCKET_PATH_MAX + 1]; +- +- struct spdk_jsonrpc_client *client_conn; +- struct spdk_poller *client_conn_poller; +- +- client_resp_handler client_resp_cb; +- +- /* Timeout for current RPC client action. 
*/ +- uint64_t timeout; +-}; +- +-static void app_json_config_load_subsystem(void *_ctx); +- +-static void +-app_json_config_load_done(struct load_json_config_ctx *ctx, int rc) +-{ +- spdk_poller_unregister(&ctx->client_conn_poller); +- if (ctx->client_conn != NULL) { +- spdk_jsonrpc_client_close(ctx->client_conn); +- } +- +- spdk_rpc_finish(); +- +- SPDK_DEBUG_APP_CFG("Config load finished with rc %d\n", rc); +- ctx->cb_fn(rc, ctx->cb_arg); +- +- free(ctx->json_data); +- free(ctx->values); +- free(ctx); +-} +- +-static void +-rpc_client_set_timeout(struct load_json_config_ctx *ctx, uint64_t timeout_us) +-{ +- ctx->timeout = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / (1000 * 1000); +-} +- +-static int +-rpc_client_check_timeout(struct load_json_config_ctx *ctx) +-{ +- if (ctx->timeout < spdk_get_ticks()) { +- SPDK_WARNLOG("RPC client command timeout.\n"); +- return -ETIMEDOUT; +- } +- +- return 0; +-} +- +-struct json_write_buf { +- char data[1024]; +- unsigned cur_off; +-}; +- +-static int +-json_write_stdout(void *cb_ctx, const void *data, size_t size) +-{ +- struct json_write_buf *buf = cb_ctx; +- size_t rc; +- +- rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off, +- "%s", (const char *)data); +- if (rc > 0) { +- buf->cur_off += rc; +- } +- return rc == size ? 0 : -1; +-} +- +-static int +-rpc_client_poller(void *arg) +-{ +- struct load_json_config_ctx *ctx = arg; +- struct spdk_jsonrpc_client_response *resp; +- client_resp_handler cb; +- int rc; +- +- assert(spdk_get_thread() == ctx->thread); +- +- rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); +- if (rc == 0) { +- rc = rpc_client_check_timeout(ctx); +- if (rc == -ETIMEDOUT) { +- rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); +- rc = 0; +- } +- } +- +- if (rc == 0) { +- /* No response yet */ +- return SPDK_POLLER_BUSY; +- } else if (rc < 0) { +- app_json_config_load_done(ctx, rc); +- return SPDK_POLLER_BUSY; +- } +- +- resp = spdk_jsonrpc_client_get_response(ctx->client_conn); +- assert(resp); +- +- if (resp->error) { +- struct json_write_buf buf = {}; +- struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout, +- &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); +- +- if (w == NULL) { +- SPDK_ERRLOG("error response: (?)\n"); +- } else { +- spdk_json_write_val(w, resp->error); +- spdk_json_write_end(w); +- SPDK_ERRLOG("error response: \n%s\n", buf.data); +- } +- } +- +- if (resp->error && ctx->stop_on_error) { +- spdk_jsonrpc_client_free_response(resp); +- app_json_config_load_done(ctx, -EINVAL); +- } else { +- /* We have response so we must have callback for it. */ +- cb = ctx->client_resp_cb; +- assert(cb != NULL); +- +- /* Mark we are done with this handler. */ +- ctx->client_resp_cb = NULL; +- cb(ctx, resp); +- } +- +- +- return SPDK_POLLER_BUSY; +-} +- +-static int +-rpc_client_connect_poller(void *_ctx) +-{ +- struct load_json_config_ctx *ctx = _ctx; +- int rc; +- +- rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); +- if (rc != -ENOTCONN) { +- /* We are connected. 
Start regular poller and issue first request */ +- spdk_poller_unregister(&ctx->client_conn_poller); +- ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_poller, ctx, 100); +- app_json_config_load_subsystem(ctx); +- } else { +- rc = rpc_client_check_timeout(ctx); +- if (rc) { +- app_json_config_load_done(ctx, rc); +- } +- +- return SPDK_POLLER_IDLE; +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static int +-client_send_request(struct load_json_config_ctx *ctx, struct spdk_jsonrpc_client_request *request, +- client_resp_handler client_resp_cb) +-{ +- int rc; +- +- assert(spdk_get_thread() == ctx->thread); +- +- ctx->client_resp_cb = client_resp_cb; +- rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); +- rc = spdk_jsonrpc_client_send_request(ctx->client_conn, request); +- +- if (rc) { +- SPDK_DEBUG_APP_CFG("Sending request to client failed (%d)\n", rc); +- } +- +- return rc; +-} +- +-static int +-cap_string(const struct spdk_json_val *val, void *out) +-{ +- const struct spdk_json_val **vptr = out; +- +- if (val->type != SPDK_JSON_VAL_STRING) { +- return -EINVAL; +- } +- +- *vptr = val; +- return 0; +-} +- +-static int +-cap_object(const struct spdk_json_val *val, void *out) +-{ +- const struct spdk_json_val **vptr = out; +- +- if (val->type != SPDK_JSON_VAL_OBJECT_BEGIN) { +- return -EINVAL; +- } +- +- *vptr = val; +- return 0; +-} +- +- +-static int +-cap_array_or_null(const struct spdk_json_val *val, void *out) +-{ +- const struct spdk_json_val **vptr = out; +- +- if (val->type != SPDK_JSON_VAL_ARRAY_BEGIN && val->type != SPDK_JSON_VAL_NULL) { +- return -EINVAL; +- } +- +- *vptr = val; +- return 0; +-} +- +-struct config_entry { +- char *method; +- struct spdk_json_val *params; +-}; +- +-static struct spdk_json_object_decoder jsonrpc_cmd_decoders[] = { +- {"method", offsetof(struct config_entry, method), spdk_json_decode_string}, +- {"params", offsetof(struct config_entry, params), cap_object, true} +-}; +- +-static void app_json_config_load_subsystem_config_entry(void *_ctx); +- +-static void +-app_json_config_load_subsystem_config_entry_next(struct load_json_config_ctx *ctx, +- struct spdk_jsonrpc_client_response *resp) +-{ +- /* Don't care about the response */ +- spdk_jsonrpc_client_free_response(resp); +- +- ctx->config_it = spdk_json_next(ctx->config_it); +- app_json_config_load_subsystem_config_entry(ctx); +-} +- +-/* Load "config" entry */ +-static void +-app_json_config_load_subsystem_config_entry(void *_ctx) +-{ +- struct load_json_config_ctx *ctx = _ctx; +- struct spdk_jsonrpc_client_request *rpc_request; +- struct spdk_json_write_ctx *w; +- struct config_entry cfg = {}; +- struct spdk_json_val *params_end; +- size_t params_len = 0; +- uint32_t state_mask = 0, cur_state_mask, startup_runtime = SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME; +- int rc; +- +- if (ctx->config_it == NULL) { +- SPDK_DEBUG_APP_CFG("Subsystem '%.*s': configuration done.\n", ctx->subsystem_name->len, +- (char *)ctx->subsystem_name->start); +- ctx->subsystems_it = spdk_json_next(ctx->subsystems_it); +- /* Invoke later to avoid recurrence */ +- spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem, ctx); +- return; +- } +- +- if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders, +- SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) { +- SPDK_ERRLOG("Failed to decode config entry\n"); +- app_json_config_load_done(ctx, -EINVAL); +- goto out; +- } +- +- rc = spdk_rpc_get_method_state_mask(cfg.method, &state_mask); +- if (rc == -ENOENT) { +- SPDK_ERRLOG("Method '%s' was not found\n", 
cfg.method); +- app_json_config_load_done(ctx, rc); +- goto out; +- } +- cur_state_mask = spdk_rpc_get_state(); +- if ((state_mask & cur_state_mask) != cur_state_mask) { +- SPDK_DEBUG_APP_CFG("Method '%s' not allowed -> skipping\n", cfg.method); +- /* Invoke later to avoid recurrence */ +- ctx->config_it = spdk_json_next(ctx->config_it); +- spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); +- goto out; +- } +- if ((state_mask & startup_runtime) == startup_runtime && cur_state_mask == SPDK_RPC_RUNTIME) { +- /* Some methods are allowed to be run in both STARTUP and RUNTIME states. +- * We should not call such methods twice, so ignore the second attempt in RUNTIME state */ +- SPDK_DEBUG_APP_CFG("Method '%s' has already been run in STARTUP state\n", cfg.method); +- /* Invoke later to avoid recurrence */ +- ctx->config_it = spdk_json_next(ctx->config_it); +- spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); +- goto out; +- } +- +- SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); +- +- if (cfg.params) { +- /* Get _END by skipping params and going back by one element. */ +- params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; +- +- /* Need to add one character to include '}' */ +- params_len = params_end->start - cfg.params->start + 1; +- +- SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); +- } +- +- rpc_request = spdk_jsonrpc_client_create_request(); +- if (!rpc_request) { +- app_json_config_load_done(ctx, -errno); +- goto out; +- } +- +- w = spdk_jsonrpc_begin_request(rpc_request, ctx->rpc_request_id, NULL); +- if (!w) { +- spdk_jsonrpc_client_free_request(rpc_request); +- app_json_config_load_done(ctx, -ENOMEM); +- goto out; +- } +- +- spdk_json_write_named_string(w, "method", cfg.method); +- +- if (cfg.params) { +- /* No need to parse "params". Just dump the whole content of "params" +- * directly into the request and let the remote side verify it. */ +- spdk_json_write_name(w, "params"); +- spdk_json_write_val_raw(w, cfg.params->start, params_len); +- } +- +- spdk_jsonrpc_end_request(rpc_request, w); +- +- rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next); +- if (rc != 0) { +- app_json_config_load_done(ctx, -rc); +- goto out; +- } +-out: +- free(cfg.method); +-} +- +-static void +-subsystem_init_done(int rc, void *arg1) +-{ +- struct load_json_config_ctx *ctx = arg1; +- +- if (rc) { +- app_json_config_load_done(ctx, rc); +- return; +- } +- +- spdk_rpc_set_state(SPDK_RPC_RUNTIME); +- /* Another round. This time for RUNTIME methods */ +- SPDK_DEBUG_APP_CFG("'framework_start_init' done - continuing configuration\n"); +- +- assert(ctx != NULL); +- if (ctx->subsystems) { +- ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); +- } +- +- app_json_config_load_subsystem(ctx); +-} +- +-static struct spdk_json_object_decoder subsystem_decoders[] = { +- {"subsystem", offsetof(struct load_json_config_ctx, subsystem_name), cap_string}, +- {"config", offsetof(struct load_json_config_ctx, config), cap_array_or_null} +-}; +- +-/* +- * Start loading subsystem pointed by ctx->subsystems_it. This must point to the +- * beginning of the "subsystem" object in "subsystems" array or be NULL. If it is +- * NULL then no more subsystems to load. +- * +- * There are two iterations: +- * +- * In first iteration only STARTUP RPC methods are used, other methods are ignored. 
When +- * allsubsystems are walked the ctx->subsystems_it became NULL and "framework_start_init" +- * is called to let the SPDK move to RUNTIME state (initialize all subsystems) and +- * second iteration begins. +- * +- * In second iteration "subsystems" array is walked through again, this time only +- * RUNTIME RPC methods are used. When ctx->subsystems_it became NULL second time it +- * indicate that there is no more subsystems to load. The cb_fn is called to finish +- * configuration. +- */ +-static void +-app_json_config_load_subsystem(void *_ctx) +-{ +- struct load_json_config_ctx *ctx = _ctx; +- +- if (ctx->subsystems_it == NULL) { +- if (spdk_rpc_get_state() == SPDK_RPC_STARTUP) { +- SPDK_DEBUG_APP_CFG("No more entries for current state, calling 'framework_start_init'\n"); +- spdk_subsystem_init(subsystem_init_done, ctx); +- } else { +- app_json_config_load_done(ctx, 0); +- } +- +- return; +- } +- +- /* Capture subsystem name and config array */ +- if (spdk_json_decode_object(ctx->subsystems_it, subsystem_decoders, +- SPDK_COUNTOF(subsystem_decoders), ctx)) { +- SPDK_ERRLOG("Failed to parse subsystem configuration\n"); +- app_json_config_load_done(ctx, -EINVAL); +- return; +- } +- +- SPDK_DEBUG_APP_CFG("Loading subsystem '%.*s' configuration\n", ctx->subsystem_name->len, +- (char *)ctx->subsystem_name->start); +- +- /* Get 'config' array first configuration entry */ +- ctx->config_it = spdk_json_array_first(ctx->config); +- app_json_config_load_subsystem_config_entry(ctx); +-} +- +-static void * +-read_file(const char *filename, size_t *size) +-{ +- FILE *file = fopen(filename, "r"); +- void *data; +- +- if (file == NULL) { +- /* errno is set by fopen */ +- return NULL; +- } +- +- data = spdk_posix_file_load(file, size); +- fclose(file); +- return data; +-} +- +-static int +-app_json_config_read(const char *config_file, struct load_json_config_ctx *ctx) +-{ +- struct spdk_json_val *values = NULL; +- void *json = NULL, *end; +- ssize_t values_cnt, rc; +- size_t json_size; +- +- json = read_file(config_file, &json_size); +- if (!json) { +- SPDK_ERRLOG("Read JSON configuration file %s failed: %s\n", +- config_file, spdk_strerror(errno)); +- return -errno; +- } +- +- rc = spdk_json_parse(json, json_size, NULL, 0, &end, +- SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); +- if (rc < 0) { +- SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); +- goto err; +- } +- +- values_cnt = rc; +- values = calloc(values_cnt, sizeof(struct spdk_json_val)); +- if (values == NULL) { +- SPDK_ERRLOG("Out of memory\n"); +- goto err; +- } +- +- rc = spdk_json_parse(json, json_size, values, values_cnt, &end, +- SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); +- if (rc != values_cnt) { +- SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); +- goto err; +- } +- +- ctx->json_data = json; +- ctx->json_data_size = json_size; +- +- ctx->values = values; +- ctx->values_cnt = values_cnt; +- +- return 0; +-err: +- free(json); +- free(values); +- return rc; +-} +- +-void +-spdk_subsystem_init_from_json_config(const char *json_config_file, const char *rpc_addr, +- spdk_subsystem_init_fn cb_fn, void *cb_arg, +- bool stop_on_error) +-{ +- struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx)); +- int rc; +- +- assert(cb_fn); +- if (!ctx) { +- cb_fn(-ENOMEM, cb_arg); +- return; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- ctx->stop_on_error = stop_on_error; +- ctx->thread = spdk_get_thread(); +- +- rc = app_json_config_read(json_config_file, ctx); +- if (rc) { +- goto fail; +- } +- +- /* Capture 
subsystems array */ +- rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems); +- switch (rc) { +- case 0: +- /* Get first subsystem */ +- ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); +- if (ctx->subsystems_it == NULL) { +- SPDK_NOTICELOG("'subsystems' configuration is empty\n"); +- } +- break; +- case -EPROTOTYPE: +- SPDK_ERRLOG("Invalid JSON configuration: not enclosed in {}.\n"); +- goto fail; +- case -ENOENT: +- SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n"); +- break; +- case -EDOM: +- SPDK_ERRLOG("Invalid JSON configuration: 'subsystems' should be an array.\n"); +- goto fail; +- default: +- SPDK_ERRLOG("Failed to parse JSON configuration.\n"); +- goto fail; +- } +- +- /* If rpc_addr is not an Unix socket use default address as prefix. */ +- if (rpc_addr == NULL || rpc_addr[0] != '/') { +- rpc_addr = SPDK_DEFAULT_RPC_ADDR; +- } +- +- /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */ +- rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config", +- rpc_addr, getpid()); +- if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) { +- SPDK_ERRLOG("Socket name create failed\n"); +- goto fail; +- } +- +- rc = spdk_rpc_initialize(ctx->rpc_socket_path_temp); +- if (rc) { +- goto fail; +- } +- +- ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX); +- if (ctx->client_conn == NULL) { +- SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp); +- goto fail; +- } +- +- rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US); +- ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100); +- return; +- +-fail: +- app_json_config_load_done(ctx, -EINVAL); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(app_config) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/init.h" ++#include "spdk/util.h" ++#include "spdk/file.h" ++#include "spdk/log.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/jsonrpc.h" ++#include "spdk/rpc.h" ++#include "spdk/string.h" ++ ++#include "spdk_internal/event.h" ++ ++#define SPDK_DEBUG_APP_CFG(...) SPDK_DEBUGLOG(app_config, __VA_ARGS__) ++#define SPDK_JSON_CONFIG_HOT_RESTART_INTERVAL 4 /* 4us */ ++#define SPDK_JSON_CONFIG_SELECT_INTERNAL 4000 /* 4ms */ ++ ++/* JSON configuration format is as follows ++ * ++ * { ++ * "subsystems" : [ <<== *subsystems JSON array ++ * { <<== *subsystems_it array entry pointer (iterator) ++ * "subsystem": "<< SUBSYSTEM NAME >>", ++ * "config": [ <<== *config JSON array ++ * { <<== *config_it array entry pointer (iterator) ++ * "method": "<< METHOD NAME >>", <<== *method ++ * "params": { << PARAMS >> } <<== *params ++ * }, ++ * << MORE "config" ARRY ENTRIES >> ++ * ] ++ * }, ++ * << MORE "subsystems" ARRAY ENTRIES >> ++ * ] ++ * ++ * << ANYTHING ELSE IS IGNORED IN ROOT OBJECT>> ++ * } ++ * ++ */ ++ ++struct load_json_config_ctx; ++typedef void (*client_resp_handler)(struct load_json_config_ctx *, ++ struct spdk_jsonrpc_client_response *); ++ ++#define RPC_SOCKET_PATH_MAX SPDK_SIZEOF_MEMBER(struct sockaddr_un, sun_path) ++ ++/* 1s connections timeout */ ++#define RPC_CLIENT_CONNECT_TIMEOUT_US (1U * 1000U * 1000U) ++ ++/* ++ * Currently there is no timeout in SPDK for any RPC command. 
This result that ++ * we can't put a hard limit during configuration load as it most likely randomly fail. ++ * So just print WARNLOG every 10s. */ ++#define RPC_CLIENT_REQUEST_TIMEOUT_US (10U * 1000 * 1000) ++ ++struct load_json_config_ctx { ++ /* Thread used during configuration. */ ++ struct spdk_thread *thread; ++ spdk_subsystem_init_fn cb_fn; ++ void *cb_arg; ++ bool stop_on_error; ++ ++ /* Current subsystem */ ++ struct spdk_json_val *subsystems; /* "subsystems" array */ ++ struct spdk_json_val *subsystems_it; /* current subsystem array position in "subsystems" array */ ++ ++ struct spdk_json_val *subsystem_name; /* current subsystem name */ ++ ++ /* Current "config" entry we are processing */ ++ struct spdk_json_val *config; /* "config" array */ ++ struct spdk_json_val *config_it; /* current config position in "config" array */ ++ ++ /* Current request id we are sending. */ ++ uint32_t rpc_request_id; ++ ++ /* Whole configuration file read and parsed. */ ++ size_t json_data_size; ++ char *json_data; ++ ++ size_t values_cnt; ++ struct spdk_json_val *values; ++ ++ char rpc_socket_path_temp[RPC_SOCKET_PATH_MAX + 1]; ++ ++ struct spdk_jsonrpc_client *client_conn; ++ struct spdk_poller *client_conn_poller; ++ ++ client_resp_handler client_resp_cb; ++ ++ /* Timeout for current RPC client action. */ ++ uint64_t timeout; ++}; ++ ++static void app_json_config_load_subsystem(void *_ctx); ++ ++static void ++app_json_config_load_done(struct load_json_config_ctx *ctx, int rc) ++{ ++ spdk_poller_unregister(&ctx->client_conn_poller); ++ if (ctx->client_conn != NULL) { ++ spdk_jsonrpc_client_close(ctx->client_conn); ++ } ++ ++ spdk_rpc_finish(); ++ ++ SPDK_DEBUG_APP_CFG("Config load finished with rc %d\n", rc); ++ ctx->cb_fn(rc, ctx->cb_arg); ++ ++ free(ctx->json_data); ++ free(ctx->values); ++ free(ctx); ++} ++ ++static void ++rpc_client_set_timeout(struct load_json_config_ctx *ctx, uint64_t timeout_us) ++{ ++ ctx->timeout = spdk_get_ticks() + timeout_us * spdk_get_ticks_hz() / (1000 * 1000); ++} ++ ++static int ++rpc_client_check_timeout(struct load_json_config_ctx *ctx) ++{ ++ if (ctx->timeout < spdk_get_ticks()) { ++ SPDK_WARNLOG("RPC client command timeout.\n"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++struct json_write_buf { ++ char data[1024]; ++ unsigned cur_off; ++}; ++ ++static int ++json_write_stdout(void *cb_ctx, const void *data, size_t size) ++{ ++ struct json_write_buf *buf = cb_ctx; ++ size_t rc; ++ ++ rc = snprintf(buf->data + buf->cur_off, sizeof(buf->data) - buf->cur_off, ++ "%s", (const char *)data); ++ if (rc > 0) { ++ buf->cur_off += rc; ++ } ++ return rc == size ? 
0 : -1; ++} ++ ++static int ++rpc_client_poller(void *arg) ++{ ++ struct load_json_config_ctx *ctx = arg; ++ struct spdk_jsonrpc_client_response *resp; ++ client_resp_handler cb; ++ int rc; ++ ++ assert(spdk_get_thread() == ctx->thread); ++ ++ rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); ++ if (rc == 0) { ++ rc = rpc_client_check_timeout(ctx); ++ if (rc == -ETIMEDOUT) { ++ rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); ++ rc = 0; ++ } ++ } ++ ++ if (rc == 0) { ++ /* No response yet */ ++ return SPDK_POLLER_BUSY; ++ } else if (rc < 0) { ++ app_json_config_load_done(ctx, rc); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ resp = spdk_jsonrpc_client_get_response(ctx->client_conn); ++ assert(resp); ++ ++ if (resp->error) { ++ struct json_write_buf buf = {}; ++ struct spdk_json_write_ctx *w = spdk_json_write_begin(json_write_stdout, ++ &buf, SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); ++ ++ if (w == NULL) { ++ SPDK_ERRLOG("error response: (?)\n"); ++ } else { ++ spdk_json_write_val(w, resp->error); ++ spdk_json_write_end(w); ++ SPDK_ERRLOG("error response: \n%s\n", buf.data); ++ } ++ } ++ ++ if (resp->error && ctx->stop_on_error) { ++ spdk_jsonrpc_client_free_response(resp); ++ app_json_config_load_done(ctx, -EINVAL); ++ } else { ++ /* We have response so we must have callback for it. */ ++ cb = ctx->client_resp_cb; ++ assert(cb != NULL); ++ ++ /* Mark we are done with this handler. */ ++ ctx->client_resp_cb = NULL; ++ cb(ctx, resp); ++ } ++ ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++rpc_client_connect_poller(void *_ctx) ++{ ++ struct load_json_config_ctx *ctx = _ctx; ++ int rc; ++ ++ rc = spdk_jsonrpc_client_poll(ctx->client_conn, 0); ++ if (rc != -ENOTCONN) { ++ /* We are connected. Start regular poller and issue first request */ ++ spdk_poller_unregister(&ctx->client_conn_poller); ++ ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_poller, ctx, 100); ++ app_json_config_load_subsystem(ctx); ++ } else { ++ rc = rpc_client_check_timeout(ctx); ++ if (rc) { ++ app_json_config_load_done(ctx, rc); ++ } ++ ++ return SPDK_POLLER_IDLE; ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++client_send_request(struct load_json_config_ctx *ctx, struct spdk_jsonrpc_client_request *request, ++ client_resp_handler client_resp_cb) ++{ ++ int rc; ++ ++ assert(spdk_get_thread() == ctx->thread); ++ ++ ctx->client_resp_cb = client_resp_cb; ++ rpc_client_set_timeout(ctx, RPC_CLIENT_REQUEST_TIMEOUT_US); ++ rc = spdk_jsonrpc_client_send_request(ctx->client_conn, request); ++ ++ if (rc) { ++ SPDK_DEBUG_APP_CFG("Sending request to client failed (%d)\n", rc); ++ } ++ ++ return rc; ++} ++ ++static int ++cap_string(const struct spdk_json_val *val, void *out) ++{ ++ const struct spdk_json_val **vptr = out; ++ ++ if (val->type != SPDK_JSON_VAL_STRING) { ++ return -EINVAL; ++ } ++ ++ *vptr = val; ++ return 0; ++} ++ ++static int ++cap_object(const struct spdk_json_val *val, void *out) ++{ ++ const struct spdk_json_val **vptr = out; ++ ++ if (val->type != SPDK_JSON_VAL_OBJECT_BEGIN) { ++ return -EINVAL; ++ } ++ ++ *vptr = val; ++ return 0; ++} ++ ++ ++static int ++cap_array_or_null(const struct spdk_json_val *val, void *out) ++{ ++ const struct spdk_json_val **vptr = out; ++ ++ if (val->type != SPDK_JSON_VAL_ARRAY_BEGIN && val->type != SPDK_JSON_VAL_NULL) { ++ return -EINVAL; ++ } ++ ++ *vptr = val; ++ return 0; ++} ++ ++struct config_entry { ++ char *method; ++ struct spdk_json_val *params; ++}; ++ ++static struct spdk_json_object_decoder jsonrpc_cmd_decoders[] = { ++ {"method", 
offsetof(struct config_entry, method), spdk_json_decode_string}, ++ {"params", offsetof(struct config_entry, params), cap_object, true} ++}; ++ ++static void app_json_config_load_subsystem_config_entry(void *_ctx); ++ ++static void ++app_json_config_load_subsystem_config_entry_next(struct load_json_config_ctx *ctx, ++ struct spdk_jsonrpc_client_response *resp) ++{ ++ /* Don't care about the response */ ++ spdk_jsonrpc_client_free_response(resp); ++ ++ ctx->config_it = spdk_json_next(ctx->config_it); ++ app_json_config_load_subsystem_config_entry(ctx); ++} ++ ++/* Load "config" entry */ ++static void ++app_json_config_load_subsystem_config_entry(void *_ctx) ++{ ++ struct load_json_config_ctx *ctx = _ctx; ++ struct spdk_jsonrpc_client_request *rpc_request; ++ struct spdk_json_write_ctx *w; ++ struct config_entry cfg = {}; ++ struct spdk_json_val *params_end; ++ size_t params_len = 0; ++ uint32_t state_mask = 0, cur_state_mask, startup_runtime = SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME; ++ int rc; ++ ++ if (spdk_get_shutdown_sig_received()) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * rpc and thread may have been released. ++ * Therefore, dont continue. ++ */ ++ return; ++ } ++ ++ if (ctx->config_it == NULL) { ++ SPDK_DEBUG_APP_CFG("Subsystem '%.*s': configuration done.\n", ctx->subsystem_name->len, ++ (char *)ctx->subsystem_name->start); ++ ctx->subsystems_it = spdk_json_next(ctx->subsystems_it); ++ /* Invoke later to avoid recurrence */ ++ spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem, ctx); ++ return; ++ } ++ ++ if (spdk_json_decode_object(ctx->config_it, jsonrpc_cmd_decoders, ++ SPDK_COUNTOF(jsonrpc_cmd_decoders), &cfg)) { ++ SPDK_ERRLOG("Failed to decode config entry\n"); ++ app_json_config_load_done(ctx, -EINVAL); ++ goto out; ++ } ++ ++ rc = spdk_rpc_get_method_state_mask(cfg.method, &state_mask); ++ if (rc == -ENOENT) { ++ SPDK_ERRLOG("Method '%s' was not found\n", cfg.method); ++ app_json_config_load_done(ctx, rc); ++ goto out; ++ } ++ cur_state_mask = spdk_rpc_get_state(); ++ if ((state_mask & cur_state_mask) != cur_state_mask) { ++ SPDK_DEBUG_APP_CFG("Method '%s' not allowed -> skipping\n", cfg.method); ++ /* Invoke later to avoid recurrence */ ++ ctx->config_it = spdk_json_next(ctx->config_it); ++ spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); ++ goto out; ++ } ++ if ((state_mask & startup_runtime) == startup_runtime && cur_state_mask == SPDK_RPC_RUNTIME) { ++ /* Some methods are allowed to be run in both STARTUP and RUNTIME states. ++ * We should not call such methods twice, so ignore the second attempt in RUNTIME state */ ++ SPDK_DEBUG_APP_CFG("Method '%s' has already been run in STARTUP state\n", cfg.method); ++ /* Invoke later to avoid recurrence */ ++ ctx->config_it = spdk_json_next(ctx->config_it); ++ spdk_thread_send_msg(ctx->thread, app_json_config_load_subsystem_config_entry, ctx); ++ goto out; ++ } ++ ++ SPDK_DEBUG_APP_CFG("\tmethod: %s\n", cfg.method); ++ ++ if (cfg.params) { ++ /* Get _END by skipping params and going back by one element. 
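The spdk_get_shutdown_sig_received() check added above guards the config loader against running after a hot restart has begun. A minimal sketch of the same pattern in an arbitrary poller follows; the poller name is hypothetical and it assumes the prototype this patch adds is visible through "spdk/event.h".

#include "spdk/thread.h"
#include "spdk/event.h"   /* assumed to carry the prototype added by this patch */

static int
my_poller(void *arg)
{
	if (spdk_get_shutdown_sig_received()) {
		/* During hot restart the RPC server and threads may already be
		 * torn down, so skip the work, as the loader above does. */
		return SPDK_POLLER_IDLE;
	}

	/* ... normal periodic work ... */
	return SPDK_POLLER_BUSY;
}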
*/ ++ params_end = cfg.params + spdk_json_val_len(cfg.params) - 1; ++ ++ /* Need to add one character to include '}' */ ++ params_len = params_end->start - cfg.params->start + 1; ++ ++ SPDK_DEBUG_APP_CFG("\tparams: %.*s\n", (int)params_len, (char *)cfg.params->start); ++ } ++ ++ rpc_request = spdk_jsonrpc_client_create_request(); ++ if (!rpc_request) { ++ app_json_config_load_done(ctx, -errno); ++ goto out; ++ } ++ ++ w = spdk_jsonrpc_begin_request(rpc_request, ctx->rpc_request_id, NULL); ++ if (!w) { ++ spdk_jsonrpc_client_free_request(rpc_request); ++ app_json_config_load_done(ctx, -ENOMEM); ++ goto out; ++ } ++ ++ spdk_json_write_named_string(w, "method", cfg.method); ++ ++ if (cfg.params) { ++ /* No need to parse "params". Just dump the whole content of "params" ++ * directly into the request and let the remote side verify it. */ ++ spdk_json_write_name(w, "params"); ++ spdk_json_write_val_raw(w, cfg.params->start, params_len); ++ } ++ ++ spdk_jsonrpc_end_request(rpc_request, w); ++ ++ rc = client_send_request(ctx, rpc_request, app_json_config_load_subsystem_config_entry_next); ++ if (rc != 0) { ++ app_json_config_load_done(ctx, -rc); ++ goto out; ++ } ++out: ++ free(cfg.method); ++} ++ ++static void ++subsystem_init_done(int rc, void *arg1) ++{ ++ struct load_json_config_ctx *ctx = arg1; ++ ++ if (rc) { ++ app_json_config_load_done(ctx, rc); ++ return; ++ } ++ ++ spdk_rpc_set_state(SPDK_RPC_RUNTIME); ++ /* Another round. This time for RUNTIME methods */ ++ SPDK_DEBUG_APP_CFG("'framework_start_init' done - continuing configuration\n"); ++ ++ assert(ctx != NULL); ++ if (ctx->subsystems) { ++ ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); ++ } ++ ++ app_json_config_load_subsystem(ctx); ++} ++ ++static struct spdk_json_object_decoder subsystem_decoders[] = { ++ {"subsystem", offsetof(struct load_json_config_ctx, subsystem_name), cap_string}, ++ {"config", offsetof(struct load_json_config_ctx, config), cap_array_or_null} ++}; ++ ++/* ++ * Start loading subsystem pointed by ctx->subsystems_it. This must point to the ++ * beginning of the "subsystem" object in "subsystems" array or be NULL. If it is ++ * NULL then no more subsystems to load. ++ * ++ * There are two iterations: ++ * ++ * In first iteration only STARTUP RPC methods are used, other methods are ignored. When ++ * allsubsystems are walked the ctx->subsystems_it became NULL and "framework_start_init" ++ * is called to let the SPDK move to RUNTIME state (initialize all subsystems) and ++ * second iteration begins. ++ * ++ * In second iteration "subsystems" array is walked through again, this time only ++ * RUNTIME RPC methods are used. When ctx->subsystems_it became NULL second time it ++ * indicate that there is no more subsystems to load. The cb_fn is called to finish ++ * configuration. 
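A minimal caller sketch for the two-pass load described in the block comment below; the configuration path and callback name are hypothetical, while the function signature matches spdk_subsystem_init_from_json_config() as defined in this file.

#include "spdk/init.h"
#include "spdk/event.h"

static void
json_load_done(int rc, void *cb_arg)
{
	if (rc != 0) {
		/* Stop the application if either the STARTUP or RUNTIME pass failed. */
		spdk_app_stop(rc);
	}
}

/* From the app thread:
 *   spdk_subsystem_init_from_json_config("/path/to/config.json", opts.rpc_addr,
 *                                        json_load_done, NULL, true);
 */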
++ */ ++static void ++app_json_config_load_subsystem(void *_ctx) ++{ ++ struct load_json_config_ctx *ctx = _ctx; ++ ++ if (ctx->subsystems_it == NULL) { ++ if (spdk_rpc_get_state() == SPDK_RPC_STARTUP) { ++ SPDK_DEBUG_APP_CFG("No more entries for current state, calling 'framework_start_init'\n"); ++ spdk_subsystem_init(subsystem_init_done, ctx); ++ } else { ++ app_json_config_load_done(ctx, 0); ++ } ++ ++ return; ++ } ++ ++ /* Capture subsystem name and config array */ ++ if (spdk_json_decode_object(ctx->subsystems_it, subsystem_decoders, ++ SPDK_COUNTOF(subsystem_decoders), ctx)) { ++ SPDK_ERRLOG("Failed to parse subsystem configuration\n"); ++ app_json_config_load_done(ctx, -EINVAL); ++ return; ++ } ++ ++ SPDK_DEBUG_APP_CFG("Loading subsystem '%.*s' configuration\n", ctx->subsystem_name->len, ++ (char *)ctx->subsystem_name->start); ++ ++ /* Get 'config' array first configuration entry */ ++ ctx->config_it = spdk_json_array_first(ctx->config); ++ app_json_config_load_subsystem_config_entry(ctx); ++} ++ ++static void * ++read_file(const char *filename, size_t *size) ++{ ++ FILE *file = fopen(filename, "r"); ++ void *data; ++ ++ if (file == NULL) { ++ /* errno is set by fopen */ ++ return NULL; ++ } ++ ++ data = spdk_posix_file_load(file, size); ++ fclose(file); ++ return data; ++} ++ ++static int ++app_json_config_read(const char *config_file, struct load_json_config_ctx *ctx) ++{ ++ struct spdk_json_val *values = NULL; ++ void *json = NULL, *end; ++ ssize_t values_cnt, rc; ++ size_t json_size; ++ ++ json = read_file(config_file, &json_size); ++ if (!json) { ++ SPDK_ERRLOG("Read JSON configuration file %s failed: %s\n", ++ config_file, spdk_strerror(errno)); ++ return -errno; ++ } ++ ++ rc = spdk_json_parse(json, json_size, NULL, 0, &end, ++ SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); ++ if (rc < 0) { ++ SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); ++ goto err; ++ } ++ ++ values_cnt = rc; ++ values = calloc(values_cnt, sizeof(struct spdk_json_val)); ++ if (values == NULL) { ++ SPDK_ERRLOG("Out of memory\n"); ++ goto err; ++ } ++ ++ rc = spdk_json_parse(json, json_size, values, values_cnt, &end, ++ SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); ++ if (rc != values_cnt) { ++ SPDK_ERRLOG("Parsing JSON configuration failed (%zd)\n", rc); ++ goto err; ++ } ++ ++ ctx->json_data = json; ++ ctx->json_data_size = json_size; ++ ++ ctx->values = values; ++ ctx->values_cnt = values_cnt; ++ ++ return 0; ++err: ++ free(json); ++ free(values); ++ return rc; ++} ++ ++static bool g_hot_restart_flag = false; ++bool spdk_ssam_get_hot_restart(void) ++{ ++ return g_hot_restart_flag; ++} ++ ++void spdk_ssam_set_hot_restart(bool value) ++{ ++ g_hot_restart_flag = value; ++} ++ ++void ++spdk_subsystem_init_from_json_config(const char *json_config_file, const char *rpc_addr, ++ spdk_subsystem_init_fn cb_fn, void *cb_arg, ++ bool stop_on_error) ++{ ++ struct load_json_config_ctx *ctx = calloc(1, sizeof(*ctx)); ++ int rc; ++ int internal; ++ ++ assert(cb_fn); ++ if (!ctx) { ++ cb_fn(-ENOMEM, cb_arg); ++ return; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ctx->stop_on_error = stop_on_error; ++ ctx->thread = spdk_get_thread(); ++ ++ rc = app_json_config_read(json_config_file, ctx); ++ if (rc) { ++ goto fail; ++ } ++ ++ /* Capture subsystems array */ ++ rc = spdk_json_find_array(ctx->values, "subsystems", NULL, &ctx->subsystems); ++ switch (rc) { ++ case 0: ++ /* Get first subsystem */ ++ ctx->subsystems_it = spdk_json_array_first(ctx->subsystems); ++ if (ctx->subsystems_it == NULL) { ++ 
SPDK_NOTICELOG("'subsystems' configuration is empty\n"); ++ } ++ break; ++ case -EPROTOTYPE: ++ SPDK_ERRLOG("Invalid JSON configuration: not enclosed in {}.\n"); ++ goto fail; ++ case -ENOENT: ++ SPDK_WARNLOG("No 'subsystems' key JSON configuration file.\n"); ++ break; ++ case -EDOM: ++ SPDK_ERRLOG("Invalid JSON configuration: 'subsystems' should be an array.\n"); ++ goto fail; ++ default: ++ SPDK_ERRLOG("Failed to parse JSON configuration.\n"); ++ goto fail; ++ } ++ ++ /* If rpc_addr is not an Unix socket use default address as prefix. */ ++ if (rpc_addr == NULL || rpc_addr[0] != '/') { ++ rpc_addr = SPDK_DEFAULT_RPC_ADDR; ++ } ++ ++ /* FIXME: rpc client should use socketpair() instead of this temporary socket nonsense */ ++ rc = snprintf(ctx->rpc_socket_path_temp, sizeof(ctx->rpc_socket_path_temp), "%s.%d_config", ++ rpc_addr, getpid()); ++ if (rc >= (int)sizeof(ctx->rpc_socket_path_temp)) { ++ SPDK_ERRLOG("Socket name create failed\n"); ++ goto fail; ++ } ++ ++ if (spdk_ssam_get_hot_restart() == true) { ++ internal = SPDK_JSON_CONFIG_HOT_RESTART_INTERVAL; ++ } else { ++ internal = SPDK_JSON_CONFIG_SELECT_INTERNAL; ++ } ++ rc = spdk_rpc_initialize(ctx->rpc_socket_path_temp, internal); ++ if (rc) { ++ goto fail; ++ } ++ ++ ctx->client_conn = spdk_jsonrpc_client_connect(ctx->rpc_socket_path_temp, AF_UNIX); ++ if (ctx->client_conn == NULL) { ++ SPDK_ERRLOG("Failed to connect to '%s'\n", ctx->rpc_socket_path_temp); ++ goto fail; ++ } ++ ++ rpc_client_set_timeout(ctx, RPC_CLIENT_CONNECT_TIMEOUT_US); ++ ctx->client_conn_poller = SPDK_POLLER_REGISTER(rpc_client_connect_poller, ctx, 100); ++ return; ++ ++fail: ++ app_json_config_load_done(ctx, -EINVAL); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(app_config) +diff --git a/lib/init/rpc.c b/lib/init/rpc.c +index 04be7c6..23d6fa4 100644 +--- a/lib/init/rpc.c ++++ b/lib/init/rpc.c +@@ -1,61 +1,59 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/env.h" +-#include "spdk/init.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +-#include "spdk/rpc.h" +- +-#define RPC_SELECT_INTERVAL 4000 /* 4ms */ +- +-static struct spdk_poller *g_rpc_poller = NULL; +- +-static int +-rpc_subsystem_poll(void *arg) +-{ +- spdk_rpc_accept(); +- return SPDK_POLLER_BUSY; +-} +- +-int +-spdk_rpc_initialize(const char *listen_addr) +-{ +- int rc; +- +- if (listen_addr == NULL) { +- /* Not treated as an error */ +- return 0; +- } +- +- if (!spdk_rpc_verify_methods()) { +- return -EINVAL; +- } +- +- /* Listen on the requested address */ +- rc = spdk_rpc_listen(listen_addr); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr); +- /* TODO: Eventually, treat this as an error. But it historically has not +- * been and many tests rely on this gracefully failing. */ +- return 0; +- } +- +- spdk_rpc_set_state(SPDK_RPC_STARTUP); +- +- /* Register a poller to periodically check for RPCs */ +- g_rpc_poller = SPDK_POLLER_REGISTER(rpc_subsystem_poll, NULL, RPC_SELECT_INTERVAL); +- +- return 0; +-} +- +-void +-spdk_rpc_finish(void) +-{ +- spdk_rpc_close(); +- spdk_poller_unregister(&g_rpc_poller); +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/env.h" ++#include "spdk/init.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++#include "spdk/rpc.h" ++ ++static struct spdk_poller *g_rpc_poller = NULL; ++ ++static int ++rpc_subsystem_poll(void *arg) ++{ ++ spdk_rpc_accept(); ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++spdk_rpc_initialize(const char *listen_addr, int internval) ++{ ++ int rc; ++ ++ if (listen_addr == NULL) { ++ /* Not treated as an error */ ++ return 0; ++ } ++ ++ if (!spdk_rpc_verify_methods()) { ++ return -EINVAL; ++ } ++ ++ /* Listen on the requested address */ ++ rc = spdk_rpc_listen(listen_addr); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr); ++ /* TODO: Eventually, treat this as an error. But it historically has not ++ * been and many tests rely on this gracefully failing. */ ++ return 0; ++ } ++ ++ spdk_rpc_set_state(SPDK_RPC_STARTUP); ++ ++ /* Register a poller to periodically check for RPCs */ ++ g_rpc_poller = SPDK_POLLER_REGISTER(rpc_subsystem_poll, NULL, internval); ++ ++ return 0; ++} ++ ++void ++spdk_rpc_finish(void) ++{ ++ spdk_rpc_close(); ++ spdk_poller_unregister(&g_rpc_poller); ++} +diff --git a/lib/init/spdk_init.map b/lib/init/spdk_init.map +index c6061c4..dc5bb0f 100644 +--- a/lib/init/spdk_init.map ++++ b/lib/init/spdk_init.map +@@ -1,17 +1,19 @@ +-{ +- global: +- +- # Public functions +- spdk_add_subsystem; +- spdk_add_subsystem_depend; +- spdk_subsystem_init; +- spdk_subsystem_fini; +- spdk_subsystem_init_next; +- spdk_subsystem_fini_next; +- spdk_subsystem_init_from_json_config; +- +- spdk_rpc_initialize; +- spdk_rpc_finish; +- +- local: *; +-}; ++{ ++ global: ++ ++ # Public functions ++ spdk_add_subsystem; ++ spdk_add_subsystem_depend; ++ spdk_subsystem_init; ++ spdk_subsystem_fini; ++ spdk_subsystem_init_next; ++ spdk_subsystem_fini_next; ++ spdk_subsystem_init_from_json_config; ++ ++ spdk_rpc_initialize; ++ spdk_rpc_finish; ++ spdk_ssam_get_hot_restart; ++ spdk_ssam_set_hot_restart; ++ ++ local: *; ++}; +diff --git a/lib/scsi/lun.c b/lib/scsi/lun.c +index a106182..fbfd58d 100644 +--- a/lib/scsi/lun.c ++++ b/lib/scsi/lun.c +@@ -1,644 +1,656 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2008-2012 Daisuke Aoyama . +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "scsi_internal.h" +-#include "spdk/endian.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/util.h" +-#include "spdk/likely.h" +- +-static void scsi_lun_execute_tasks(struct spdk_scsi_lun *lun); +-static void _scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun); +- +-void +-scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- if (lun) { +- TAILQ_REMOVE(&lun->tasks, task, scsi_link); +- spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task); +- } +- task->cpl_fn(task); +-} +- +-static void +-scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- TAILQ_REMOVE(&lun->mgmt_tasks, task, scsi_link); +- +- task->cpl_fn(task); +- +- /* Try to execute the first pending mgmt task if it exists. 
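The rpc.c and spdk_init.map changes above extend spdk_rpc_initialize() with a poll-interval argument and export the hot-restart flag accessors. A minimal sketch of driving that interface follows; it assumes the declarations this patch adds are visible via "spdk/init.h", and the wrapper name is hypothetical.

#include "spdk/init.h"

static int
start_rpc_service(const char *listen_addr)
{
	/* Mirrors SPDK_JSON_CONFIG_HOT_RESTART_INTERVAL (4us) and
	 * SPDK_JSON_CONFIG_SELECT_INTERNAL (4000us) used in json_config.c above. */
	int poll_interval_us = spdk_ssam_get_hot_restart() ? 4 : 4000;

	return spdk_rpc_initialize(listen_addr, poll_interval_us);
}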
*/ +- _scsi_lun_execute_mgmt_task(lun); +-} +- +-static bool +-_scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun) +-{ +- return !TAILQ_EMPTY(&lun->pending_mgmt_tasks); +-} +- +-static bool +-scsi_lun_has_outstanding_mgmt_tasks(const struct spdk_scsi_lun *lun) +-{ +- return !TAILQ_EMPTY(&lun->mgmt_tasks); +-} +- +-static bool +-_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun) +-{ +- return !TAILQ_EMPTY(&lun->pending_tasks); +-} +- +-static bool +-scsi_lun_has_outstanding_tasks(const struct spdk_scsi_lun *lun) +-{ +- return !TAILQ_EMPTY(&lun->tasks); +-} +- +-/* Reset task have to wait until all prior outstanding tasks complete. */ +-static int +-scsi_lun_reset_check_outstanding_tasks(void *arg) +-{ +- struct spdk_scsi_task *task = (struct spdk_scsi_task *)arg; +- struct spdk_scsi_lun *lun = task->lun; +- +- if (scsi_lun_has_outstanding_tasks(lun)) { +- return SPDK_POLLER_BUSY; +- } +- spdk_poller_unregister(&lun->reset_poller); +- +- scsi_lun_complete_mgmt_task(lun, task); +- return SPDK_POLLER_BUSY; +-} +- +-void +-scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- if (task->status == SPDK_SCSI_STATUS_GOOD) { +- if (scsi_lun_has_outstanding_tasks(lun)) { +- lun->reset_poller = +- SPDK_POLLER_REGISTER(scsi_lun_reset_check_outstanding_tasks, +- task, 10); +- return; +- } +- } +- +- scsi_lun_complete_mgmt_task(lun, task); +-} +- +-static void +-scsi_lun_append_mgmt_task(struct spdk_scsi_lun *lun, +- struct spdk_scsi_task *task) +-{ +- TAILQ_INSERT_TAIL(&lun->pending_mgmt_tasks, task, scsi_link); +-} +- +-static bool +-_scsi_lun_handle_unit_attention(struct spdk_scsi_task *task) +-{ +- uint8_t *cdb = task->cdb; +- +- assert(task->cdb); +- +- switch (cdb[0]) { +- case SPDK_SPC_INQUIRY: +- case SPDK_SPC_REPORT_LUNS: +- case SPDK_SPC_REQUEST_SENSE: +- return false; +- default: +- return true; +- } +-} +- +-static void +-_scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun) +-{ +- struct spdk_scsi_task *task; +- +- if (!TAILQ_EMPTY(&lun->mgmt_tasks)) { +- return; +- } +- +- task = TAILQ_FIRST(&lun->pending_mgmt_tasks); +- if (spdk_likely(task == NULL)) { +- /* Try to execute all pending tasks */ +- scsi_lun_execute_tasks(lun); +- return; +- } +- TAILQ_REMOVE(&lun->pending_mgmt_tasks, task, scsi_link); +- +- TAILQ_INSERT_TAIL(&lun->mgmt_tasks, task, scsi_link); +- +- if (lun->removed) { +- task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; +- scsi_lun_complete_mgmt_task(lun, task); +- return; +- } +- +- switch (task->function) { +- case SPDK_SCSI_TASK_FUNC_ABORT_TASK: +- task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; +- SPDK_ERRLOG("ABORT_TASK failed\n"); +- break; +- +- case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET: +- task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; +- SPDK_ERRLOG("ABORT_TASK_SET failed\n"); +- break; +- +- case SPDK_SCSI_TASK_FUNC_LUN_RESET: +- bdev_scsi_reset(task); +- return; +- +- default: +- SPDK_ERRLOG("Unknown Task Management Function!\n"); +- /* +- * Task management functions other than those above should never +- * reach this point having been filtered by the frontend. Reject +- * the task as being unsupported. 
+- */ +- task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; +- break; +- } +- +- scsi_lun_complete_mgmt_task(lun, task); +-} +- +-void +-scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, +- struct spdk_scsi_task *task) +-{ +- scsi_lun_append_mgmt_task(lun, task); +- _scsi_lun_execute_mgmt_task(lun); +-} +- +-static void +-_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- int rc; +- +- task->status = SPDK_SCSI_STATUS_GOOD; +- spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task); +- TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); +- if (spdk_unlikely(lun->removed)) { +- spdk_scsi_task_process_abort(task); +- rc = SPDK_SCSI_TASK_COMPLETE; +- } else if (spdk_unlikely(lun->resizing) && _scsi_lun_handle_unit_attention(task)) { +- spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, +- SPDK_SCSI_SENSE_UNIT_ATTENTION, +- SPDK_SCSI_ASC_CAPACITY_DATA_HAS_CHANGED, +- SPDK_SCSI_ASCQ_CAPACITY_DATA_HAS_CHANGED); +- lun->resizing = false; +- rc = SPDK_SCSI_TASK_COMPLETE; +- } else { +- /* Check the command is allowed or not when reservation is exist */ +- if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) { +- rc = scsi2_reserve_check(task); +- } else { +- rc = scsi_pr_check(task); +- } +- if (spdk_unlikely(rc < 0)) { +- /* Reservation Conflict */ +- rc = SPDK_SCSI_TASK_COMPLETE; +- } else { +- rc = bdev_scsi_execute(task); +- } +- } +- +- switch (rc) { +- case SPDK_SCSI_TASK_PENDING: +- break; +- +- case SPDK_SCSI_TASK_COMPLETE: +- scsi_lun_complete_task(lun, task); +- break; +- +- default: +- abort(); +- } +-} +- +-static void +-scsi_lun_append_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- TAILQ_INSERT_TAIL(&lun->pending_tasks, task, scsi_link); +-} +- +-static void +-scsi_lun_execute_tasks(struct spdk_scsi_lun *lun) +-{ +- struct spdk_scsi_task *task, *task_tmp; +- +- TAILQ_FOREACH_SAFE(task, &lun->pending_tasks, scsi_link, task_tmp) { +- TAILQ_REMOVE(&lun->pending_tasks, task, scsi_link); +- _scsi_lun_execute_task(lun, task); +- } +-} +- +-void +-scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +-{ +- if (spdk_unlikely(_scsi_lun_has_pending_mgmt_tasks(lun))) { +- /* Add the IO task to pending list and wait for completion of +- * existing mgmt tasks. +- */ +- scsi_lun_append_task(lun, task); +- } else if (spdk_unlikely(_scsi_lun_has_pending_tasks(lun))) { +- /* If there is any pending IO task, append the IO task to the +- * tail of the pending list, and then execute all pending IO tasks +- * from the head to submit IO tasks in order. +- */ +- scsi_lun_append_task(lun, task); +- scsi_lun_execute_tasks(lun); +- } else { +- /* Execute the IO task directly. 
*/ +- _scsi_lun_execute_task(lun, task); +- } +-} +- +-static void +-_scsi_lun_remove(void *arg) +-{ +- struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; +- +- spdk_bdev_close(lun->bdev_desc); +- spdk_scsi_dev_delete_lun(lun->dev, lun); +- free(lun); +-} +- +-static void +-scsi_lun_remove(struct spdk_scsi_lun *lun) +-{ +- struct spdk_scsi_pr_registrant *reg, *tmp; +- +- TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { +- TAILQ_REMOVE(&lun->reg_head, reg, link); +- free(reg); +- } +- +- spdk_thread_exec_msg(lun->thread, _scsi_lun_remove, lun); +-} +- +-static int +-scsi_lun_check_io_channel(void *arg) +-{ +- struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; +- +- if (lun->io_channel) { +- return SPDK_POLLER_BUSY; +- } +- spdk_poller_unregister(&lun->hotremove_poller); +- +- scsi_lun_remove(lun); +- return SPDK_POLLER_BUSY; +-} +- +-static void +-scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun) +-{ +- struct spdk_scsi_lun_desc *desc, *tmp; +- +- if (lun->hotremove_cb) { +- lun->hotremove_cb(lun, lun->hotremove_ctx); +- } +- +- TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) { +- if (desc->hotremove_cb) { +- desc->hotremove_cb(lun, desc->hotremove_ctx); +- } else { +- spdk_scsi_lun_close(desc); +- } +- } +- +- if (lun->io_channel) { +- lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_io_channel, +- lun, 10); +- } else { +- scsi_lun_remove(lun); +- } +-} +- +-static int +-scsi_lun_check_outstanding_tasks(void *arg) +-{ +- struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; +- +- if (scsi_lun_has_outstanding_tasks(lun) || +- scsi_lun_has_outstanding_mgmt_tasks(lun)) { +- return SPDK_POLLER_BUSY; +- } +- spdk_poller_unregister(&lun->hotremove_poller); +- +- scsi_lun_notify_hot_remove(lun); +- return SPDK_POLLER_BUSY; +-} +- +-static void +-_scsi_lun_hot_remove(void *arg1) +-{ +- struct spdk_scsi_lun *lun = arg1; +- +- /* If lun->removed is set, no new task can be submitted to the LUN. +- * Execute previously queued tasks, which will be immediately aborted. +- */ +- scsi_lun_execute_tasks(lun); +- +- /* Then we only need to wait for all outstanding tasks to be completed +- * before notifying the upper layer about the removal. 
+- */ +- if (scsi_lun_has_outstanding_tasks(lun) || +- scsi_lun_has_outstanding_mgmt_tasks(lun)) { +- lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_outstanding_tasks, +- lun, 10); +- } else { +- scsi_lun_notify_hot_remove(lun); +- } +-} +- +-static void +-scsi_lun_hot_remove(void *remove_ctx) +-{ +- struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx; +- struct spdk_thread *thread; +- +- if (lun->removed) { +- return; +- } +- +- lun->removed = true; +- if (lun->io_channel == NULL) { +- _scsi_lun_hot_remove(lun); +- return; +- } +- +- thread = spdk_io_channel_get_thread(lun->io_channel); +- if (thread != spdk_get_thread()) { +- spdk_thread_send_msg(thread, _scsi_lun_hot_remove, lun); +- } else { +- _scsi_lun_hot_remove(lun); +- } +-} +- +-static void +-bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)event_ctx; +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", spdk_bdev_get_name(bdev)); +- scsi_lun_hot_remove(event_ctx); +- break; +- case SPDK_BDEV_EVENT_RESIZE: +- SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", spdk_bdev_get_name(bdev)); +- lun->resizing = true; +- if (lun->resize_cb) { +- lun->resize_cb(lun, lun->resize_ctx); +- } +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-/** +- * \brief Constructs a new spdk_scsi_lun object based on the provided parameters. +- * +- * \param bdev_name Bdev name to open and associate with this LUN +- * +- * \return NULL if bdev whose name matches is not found +- * \return pointer to the new spdk_scsi_lun object otherwise +- */ +-struct spdk_scsi_lun *scsi_lun_construct(const char *bdev_name, +- void (*resize_cb)(const struct spdk_scsi_lun *, void *), +- void *resize_ctx, +- void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), +- void *hotremove_ctx) +-{ +- struct spdk_scsi_lun *lun; +- int rc; +- +- if (bdev_name == NULL) { +- SPDK_ERRLOG("bdev_name must be non-NULL\n"); +- return NULL; +- } +- +- lun = calloc(1, sizeof(*lun)); +- if (lun == NULL) { +- SPDK_ERRLOG("could not allocate lun\n"); +- return NULL; +- } +- +- rc = spdk_bdev_open_ext(bdev_name, true, bdev_event_cb, lun, &lun->bdev_desc); +- +- if (rc != 0) { +- SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", bdev_name, rc); +- free(lun); +- return NULL; +- } +- +- lun->thread = spdk_get_thread(); +- +- TAILQ_INIT(&lun->tasks); +- TAILQ_INIT(&lun->pending_tasks); +- TAILQ_INIT(&lun->mgmt_tasks); +- TAILQ_INIT(&lun->pending_mgmt_tasks); +- +- /* Bdev is not removed while it is opened. 
*/ +- lun->bdev = spdk_bdev_desc_get_bdev(lun->bdev_desc); +- lun->io_channel = NULL; +- lun->hotremove_cb = hotremove_cb; +- lun->hotremove_ctx = hotremove_ctx; +- +- lun->resize_cb = resize_cb; +- lun->resize_ctx = resize_ctx; +- lun->resizing = false; +- +- TAILQ_INIT(&lun->open_descs); +- TAILQ_INIT(&lun->reg_head); +- +- return lun; +-} +- +-void +-scsi_lun_destruct(struct spdk_scsi_lun *lun) +-{ +- scsi_lun_hot_remove(lun); +-} +- +-int +-spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_lun_remove_cb_t hotremove_cb, +- void *hotremove_ctx, struct spdk_scsi_lun_desc **_desc) +-{ +- struct spdk_scsi_lun_desc *desc; +- +- desc = calloc(1, sizeof(*desc)); +- if (desc == NULL) { +- SPDK_ERRLOG("calloc() failed for LUN descriptor.\n"); +- return -ENOMEM; +- } +- +- TAILQ_INSERT_TAIL(&lun->open_descs, desc, link); +- +- desc->lun = lun; +- desc->hotremove_cb = hotremove_cb; +- desc->hotremove_ctx = hotremove_ctx; +- *_desc = desc; +- +- return 0; +-} +- +-void +-spdk_scsi_lun_close(struct spdk_scsi_lun_desc *desc) +-{ +- struct spdk_scsi_lun *lun = desc->lun; +- +- TAILQ_REMOVE(&lun->open_descs, desc, link); +- free(desc); +- +- assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL); +-} +- +-int +-scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun) +-{ +- if (lun->io_channel != NULL) { +- if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) { +- lun->ref++; +- return 0; +- } +- SPDK_ERRLOG("io_channel already allocated for lun %s\n", +- spdk_bdev_get_name(lun->bdev)); +- return -1; +- } +- +- lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc); +- if (lun->io_channel == NULL) { +- return -1; +- } +- lun->ref = 1; +- return 0; +-} +- +-void +-scsi_lun_free_io_channel(struct spdk_scsi_lun *lun) +-{ +- if (lun->io_channel == NULL) { +- return; +- } +- +- if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) { +- SPDK_ERRLOG("io_channel was freed by different thread\n"); +- return; +- } +- +- lun->ref--; +- if (lun->ref == 0) { +- spdk_put_io_channel(lun->io_channel); +- lun->io_channel = NULL; +- } +-} +- +-int +-spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun_desc *desc) +-{ +- struct spdk_scsi_lun *lun = desc->lun; +- +- return scsi_lun_allocate_io_channel(lun); +-} +- +-void +-spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun_desc *desc) +-{ +- struct spdk_scsi_lun *lun = desc->lun; +- +- scsi_lun_free_io_channel(lun); +-} +- +-int +-spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun) +-{ +- return lun->id; +-} +- +-const char * +-spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun) +-{ +- return spdk_bdev_get_name(lun->bdev); +-} +- +-const struct spdk_scsi_dev * +-spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun) +-{ +- return lun->dev; +-} +- +-bool +-scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, +- const struct spdk_scsi_port *initiator_port) +-{ +- struct spdk_scsi_task *task; +- +- if (initiator_port == NULL) { +- return _scsi_lun_has_pending_mgmt_tasks(lun) || +- scsi_lun_has_outstanding_mgmt_tasks(lun); +- } +- +- TAILQ_FOREACH(task, &lun->pending_mgmt_tasks, scsi_link) { +- if (task->initiator_port == initiator_port) { +- return true; +- } +- } +- +- TAILQ_FOREACH(task, &lun->mgmt_tasks, scsi_link) { +- if (task->initiator_port == initiator_port) { +- return true; +- } +- } +- +- return false; +-} +-/* This check includes both pending and submitted (outstanding) tasks. 
*/ +-bool +-scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, +- const struct spdk_scsi_port *initiator_port) +-{ +- struct spdk_scsi_task *task; +- +- if (initiator_port == NULL) { +- return _scsi_lun_has_pending_tasks(lun) || +- scsi_lun_has_outstanding_tasks(lun); +- } +- +- TAILQ_FOREACH(task, &lun->pending_tasks, scsi_link) { +- if (task->initiator_port == initiator_port) { +- return true; +- } +- } +- +- TAILQ_FOREACH(task, &lun->tasks, scsi_link) { +- if (task->initiator_port == initiator_port) { +- return true; +- } +- } +- +- return false; +-} +- +-bool +-spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun) +-{ +- return lun->removed; +-} +- +-bool +-spdk_scsi_lun_get_dif_ctx(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task, +- struct spdk_dif_ctx *dif_ctx) +-{ +- return bdev_scsi_get_dif_ctx(lun->bdev, task, dif_ctx); +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2008-2012 Daisuke Aoyama . ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "scsi_internal.h" ++#include "spdk/endian.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/util.h" ++#include "spdk/likely.h" ++#include "spdk/event.h" ++#include "spdk/bdev_module.h" ++ ++static void scsi_lun_execute_tasks(struct spdk_scsi_lun *lun); ++static void _scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun); ++ ++void ++scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ if (lun) { ++ TAILQ_REMOVE(&lun->tasks, task, scsi_link); ++ spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task); ++ } ++ task->cpl_fn(task); ++} ++ ++static void ++scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ TAILQ_REMOVE(&lun->mgmt_tasks, task, scsi_link); ++ ++ task->cpl_fn(task); ++ ++ /* Try to execute the first pending mgmt task if it exists. */ ++ _scsi_lun_execute_mgmt_task(lun); ++} ++ ++static bool ++_scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun) ++{ ++ return !TAILQ_EMPTY(&lun->pending_mgmt_tasks); ++} ++ ++static bool ++scsi_lun_has_outstanding_mgmt_tasks(const struct spdk_scsi_lun *lun) ++{ ++ return !TAILQ_EMPTY(&lun->mgmt_tasks); ++} ++ ++static bool ++_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun) ++{ ++ return !TAILQ_EMPTY(&lun->pending_tasks); ++} ++ ++static bool ++scsi_lun_has_outstanding_tasks(const struct spdk_scsi_lun *lun) ++{ ++ return !TAILQ_EMPTY(&lun->tasks); ++} ++ ++/* Reset task have to wait until all prior outstanding tasks complete. 
*/ ++static int ++scsi_lun_reset_check_outstanding_tasks(void *arg) ++{ ++ struct spdk_scsi_task *task = (struct spdk_scsi_task *)arg; ++ struct spdk_scsi_lun *lun = task->lun; ++ ++ if (scsi_lun_has_outstanding_tasks(lun)) { ++ return SPDK_POLLER_BUSY; ++ } ++ spdk_poller_unregister(&lun->reset_poller); ++ ++ scsi_lun_complete_mgmt_task(lun, task); ++ return SPDK_POLLER_BUSY; ++} ++ ++void ++scsi_lun_complete_reset_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ if (task->status == SPDK_SCSI_STATUS_GOOD) { ++ if (scsi_lun_has_outstanding_tasks(lun)) { ++ lun->reset_poller = ++ SPDK_POLLER_REGISTER(scsi_lun_reset_check_outstanding_tasks, ++ task, 10); ++ return; ++ } ++ } ++ ++ scsi_lun_complete_mgmt_task(lun, task); ++} ++ ++static void ++scsi_lun_append_mgmt_task(struct spdk_scsi_lun *lun, ++ struct spdk_scsi_task *task) ++{ ++ TAILQ_INSERT_TAIL(&lun->pending_mgmt_tasks, task, scsi_link); ++} ++ ++static bool ++_scsi_lun_handle_unit_attention(struct spdk_scsi_task *task) ++{ ++ uint8_t *cdb = task->cdb; ++ ++ assert(task->cdb); ++ ++ switch (cdb[0]) { ++ case SPDK_SPC_INQUIRY: ++ case SPDK_SPC_REPORT_LUNS: ++ case SPDK_SPC_REQUEST_SENSE: ++ return false; ++ default: ++ return true; ++ } ++} ++ ++static void ++_scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun) ++{ ++ struct spdk_scsi_task *task; ++ ++ if (!TAILQ_EMPTY(&lun->mgmt_tasks)) { ++ return; ++ } ++ ++ task = TAILQ_FIRST(&lun->pending_mgmt_tasks); ++ if (spdk_likely(task == NULL)) { ++ /* Try to execute all pending tasks */ ++ scsi_lun_execute_tasks(lun); ++ return; ++ } ++ TAILQ_REMOVE(&lun->pending_mgmt_tasks, task, scsi_link); ++ ++ TAILQ_INSERT_TAIL(&lun->mgmt_tasks, task, scsi_link); ++ ++ if (lun->removed) { ++ task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; ++ scsi_lun_complete_mgmt_task(lun, task); ++ return; ++ } ++ ++ switch (task->function) { ++ case SPDK_SCSI_TASK_FUNC_ABORT_TASK: ++ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; ++ SPDK_ERRLOG("ABORT_TASK failed\n"); ++ break; ++ ++ case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET: ++ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; ++ SPDK_ERRLOG("ABORT_TASK_SET failed\n"); ++ break; ++ ++ case SPDK_SCSI_TASK_FUNC_LUN_RESET: ++ bdev_scsi_reset(task); ++ return; ++ ++ default: ++ SPDK_ERRLOG("Unknown Task Management Function!\n"); ++ /* ++ * Task management functions other than those above should never ++ * reach this point having been filtered by the frontend. Reject ++ * the task as being unsupported. 
++ */ ++ task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; ++ break; ++ } ++ ++ scsi_lun_complete_mgmt_task(lun, task); ++} ++ ++void ++scsi_lun_execute_mgmt_task(struct spdk_scsi_lun *lun, ++ struct spdk_scsi_task *task) ++{ ++ scsi_lun_append_mgmt_task(lun, task); ++ _scsi_lun_execute_mgmt_task(lun); ++} ++ ++static void ++_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ int rc; ++ ++ task->status = SPDK_SCSI_STATUS_GOOD; ++ spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task); ++ TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); ++ if (spdk_unlikely(lun->removed)) { ++ spdk_scsi_task_process_abort(task); ++ rc = SPDK_SCSI_TASK_COMPLETE; ++ } else if (spdk_unlikely(lun->resizing) && _scsi_lun_handle_unit_attention(task)) { ++ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, ++ SPDK_SCSI_SENSE_UNIT_ATTENTION, ++ SPDK_SCSI_ASC_CAPACITY_DATA_HAS_CHANGED, ++ SPDK_SCSI_ASCQ_CAPACITY_DATA_HAS_CHANGED); ++ lun->resizing = false; ++ rc = SPDK_SCSI_TASK_COMPLETE; ++ } else { ++ /* Check the command is allowed or not when reservation is exist */ ++ if (spdk_unlikely(lun->reservation.flags & SCSI_SPC2_RESERVE)) { ++ rc = scsi2_reserve_check(task); ++ } else { ++ rc = scsi_pr_check(task); ++ } ++ if (spdk_unlikely(rc < 0)) { ++ /* Reservation Conflict */ ++ rc = SPDK_SCSI_TASK_COMPLETE; ++ } else { ++ rc = bdev_scsi_execute(task); ++ } ++ } ++ ++ switch (rc) { ++ case SPDK_SCSI_TASK_PENDING: ++ break; ++ ++ case SPDK_SCSI_TASK_COMPLETE: ++ scsi_lun_complete_task(lun, task); ++ break; ++ ++ default: ++ abort(); ++ } ++} ++ ++static void ++scsi_lun_append_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ TAILQ_INSERT_TAIL(&lun->pending_tasks, task, scsi_link); ++} ++ ++static void ++scsi_lun_execute_tasks(struct spdk_scsi_lun *lun) ++{ ++ struct spdk_scsi_task *task, *task_tmp; ++ ++ TAILQ_FOREACH_SAFE(task, &lun->pending_tasks, scsi_link, task_tmp) { ++ TAILQ_REMOVE(&lun->pending_tasks, task, scsi_link); ++ _scsi_lun_execute_task(lun, task); ++ } ++} ++ ++void ++scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) ++{ ++ if (spdk_unlikely(_scsi_lun_has_pending_mgmt_tasks(lun))) { ++ /* Add the IO task to pending list and wait for completion of ++ * existing mgmt tasks. ++ */ ++ scsi_lun_append_task(lun, task); ++ } else if (spdk_unlikely(_scsi_lun_has_pending_tasks(lun))) { ++ /* If there is any pending IO task, append the IO task to the ++ * tail of the pending list, and then execute all pending IO tasks ++ * from the head to submit IO tasks in order. ++ */ ++ scsi_lun_append_task(lun, task); ++ scsi_lun_execute_tasks(lun); ++ } else { ++ /* Execute the IO task directly. 
*/ ++ _scsi_lun_execute_task(lun, task); ++ } ++} ++ ++static void ++_scsi_lun_remove(void *arg) ++{ ++ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; ++ ++ spdk_bdev_close(lun->bdev_desc); ++ spdk_scsi_dev_delete_lun(lun->dev, lun); ++ free(lun); ++} ++ ++static void ++scsi_lun_remove(struct spdk_scsi_lun *lun) ++{ ++ struct spdk_scsi_pr_registrant *reg, *tmp; ++ ++ TAILQ_FOREACH_SAFE(reg, &lun->reg_head, link, tmp) { ++ TAILQ_REMOVE(&lun->reg_head, reg, link); ++ free(reg); ++ } ++ ++ spdk_thread_exec_msg(lun->thread, _scsi_lun_remove, lun); ++} ++ ++static int ++scsi_lun_check_io_channel(void *arg) ++{ ++ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; ++ ++ if (lun->io_channel) { ++ return SPDK_POLLER_BUSY; ++ } ++ spdk_poller_unregister(&lun->hotremove_poller); ++ ++ scsi_lun_remove(lun); ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun) ++{ ++ struct spdk_scsi_lun_desc *desc, *tmp; ++ ++ if (lun->hotremove_cb) { ++ lun->hotremove_cb(lun, lun->hotremove_ctx); ++ } ++ ++ TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) { ++ if (desc->hotremove_cb) { ++ desc->hotremove_cb(lun, desc->hotremove_ctx); ++ } else { ++ spdk_scsi_lun_close(desc); ++ } ++ } ++ ++ if (lun->io_channel) { ++ lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_io_channel, ++ lun, 10); ++ } else { ++ scsi_lun_remove(lun); ++ } ++} ++ ++static int ++scsi_lun_check_outstanding_tasks(void *arg) ++{ ++ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; ++ ++ if (scsi_lun_has_outstanding_tasks(lun) || ++ scsi_lun_has_outstanding_mgmt_tasks(lun)) { ++ return SPDK_POLLER_BUSY; ++ } ++ spdk_poller_unregister(&lun->hotremove_poller); ++ ++ scsi_lun_notify_hot_remove(lun); ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++_scsi_lun_hot_remove(void *arg1) ++{ ++ struct spdk_scsi_lun *lun = arg1; ++ ++ if (spdk_unlikely(spdk_get_shutdown_sig_received())) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * the task and bdev_io memory may have been released. ++ * Therefore, outstanding task are not executed in this scenario. ++ */ ++ scsi_lun_notify_hot_remove(lun); ++ return; ++ } ++ ++ /* If lun->removed is set, no new task can be submitted to the LUN. ++ * Execute previously queued tasks, which will be immediately aborted. ++ */ ++ scsi_lun_execute_tasks(lun); ++ ++ /* Then we only need to wait for all outstanding tasks to be completed ++ * before notifying the upper layer about the removal. 
++ */ ++ if (scsi_lun_has_outstanding_tasks(lun) || ++ scsi_lun_has_outstanding_mgmt_tasks(lun)) { ++ lun->hotremove_poller = SPDK_POLLER_REGISTER(scsi_lun_check_outstanding_tasks, ++ lun, 10); ++ } else { ++ scsi_lun_notify_hot_remove(lun); ++ } ++} ++ ++static void ++scsi_lun_hot_remove(void *remove_ctx) ++{ ++ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx; ++ struct spdk_thread *thread; ++ ++ if (lun->removed) { ++ return; ++ } ++ ++ lun->removed = true; ++ if (lun->io_channel == NULL) { ++ _scsi_lun_hot_remove(lun); ++ return; ++ } ++ ++ thread = spdk_io_channel_get_thread(lun->io_channel); ++ if (thread != spdk_get_thread()) { ++ spdk_thread_send_msg(thread, _scsi_lun_hot_remove, lun); ++ } else { ++ _scsi_lun_hot_remove(lun); ++ } ++} ++ ++static void ++bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)event_ctx; ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", spdk_bdev_get_name(bdev)); ++ scsi_lun_hot_remove(event_ctx); ++ break; ++ case SPDK_BDEV_EVENT_RESIZE: ++ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", spdk_bdev_get_name(bdev)); ++ lun->resizing = true; ++ if (lun->resize_cb) { ++ lun->resize_cb(lun, lun->resize_ctx); ++ } ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++/** ++ * \brief Constructs a new spdk_scsi_lun object based on the provided parameters. ++ * ++ * \param bdev_name Bdev name to open and associate with this LUN ++ * ++ * \return NULL if bdev whose name matches is not found ++ * \return pointer to the new spdk_scsi_lun object otherwise ++ */ ++struct spdk_scsi_lun *scsi_lun_construct(const char *bdev_name, ++ void (*resize_cb)(const struct spdk_scsi_lun *, void *), ++ void *resize_ctx, ++ void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), ++ void *hotremove_ctx) ++{ ++ struct spdk_scsi_lun *lun; ++ int rc; ++ ++ if (bdev_name == NULL) { ++ SPDK_ERRLOG("bdev_name must be non-NULL\n"); ++ return NULL; ++ } ++ ++ lun = calloc(1, sizeof(*lun)); ++ if (lun == NULL) { ++ SPDK_ERRLOG("could not allocate lun\n"); ++ return NULL; ++ } ++ ++ rc = spdk_bdev_open_ext(bdev_name, true, bdev_event_cb, lun, &lun->bdev_desc); ++ ++ if (rc != 0) { ++ SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", bdev_name, rc); ++ free(lun); ++ return NULL; ++ } ++ ++ lun->thread = spdk_get_thread(); ++ ++ TAILQ_INIT(&lun->tasks); ++ TAILQ_INIT(&lun->pending_tasks); ++ TAILQ_INIT(&lun->mgmt_tasks); ++ TAILQ_INIT(&lun->pending_mgmt_tasks); ++ ++ /* Bdev is not removed while it is opened. 
*/ ++ lun->bdev = spdk_bdev_desc_get_bdev(lun->bdev_desc); ++ lun->io_channel = NULL; ++ lun->hotremove_cb = hotremove_cb; ++ lun->hotremove_ctx = hotremove_ctx; ++ ++ lun->resize_cb = resize_cb; ++ lun->resize_ctx = resize_ctx; ++ lun->resizing = false; ++ ++ TAILQ_INIT(&lun->open_descs); ++ TAILQ_INIT(&lun->reg_head); ++ ++ return lun; ++} ++ ++void ++scsi_lun_destruct(struct spdk_scsi_lun *lun) ++{ ++ scsi_lun_hot_remove(lun); ++} ++ ++int ++spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_lun_remove_cb_t hotremove_cb, ++ void *hotremove_ctx, struct spdk_scsi_lun_desc **_desc) ++{ ++ struct spdk_scsi_lun_desc *desc; ++ ++ desc = calloc(1, sizeof(*desc)); ++ if (desc == NULL) { ++ SPDK_ERRLOG("calloc() failed for LUN descriptor.\n"); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INSERT_TAIL(&lun->open_descs, desc, link); ++ ++ desc->lun = lun; ++ desc->hotremove_cb = hotremove_cb; ++ desc->hotremove_ctx = hotremove_ctx; ++ *_desc = desc; ++ ++ return 0; ++} ++ ++void ++spdk_scsi_lun_close(struct spdk_scsi_lun_desc *desc) ++{ ++ struct spdk_scsi_lun *lun = desc->lun; ++ ++ TAILQ_REMOVE(&lun->open_descs, desc, link); ++ free(desc); ++ ++ assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL); ++} ++ ++int ++scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun) ++{ ++ if (lun->io_channel != NULL) { ++ if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) { ++ lun->ref++; ++ return 0; ++ } ++ SPDK_ERRLOG("io_channel already allocated for lun %s\n", ++ spdk_bdev_get_name(lun->bdev)); ++ return -1; ++ } ++ ++ lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc); ++ if (lun->io_channel == NULL) { ++ return -1; ++ } ++ lun->ref = 1; ++ return 0; ++} ++ ++void ++scsi_lun_free_io_channel(struct spdk_scsi_lun *lun) ++{ ++ if (lun->io_channel == NULL) { ++ return; ++ } ++ ++ if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) { ++ SPDK_ERRLOG("io_channel was freed by different thread\n"); ++ return; ++ } ++ ++ lun->ref--; ++ if (lun->ref == 0) { ++ spdk_put_io_channel(lun->io_channel); ++ lun->io_channel = NULL; ++ } ++} ++ ++int ++spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun_desc *desc) ++{ ++ struct spdk_scsi_lun *lun = desc->lun; ++ ++ return scsi_lun_allocate_io_channel(lun); ++} ++ ++void ++spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun_desc *desc) ++{ ++ struct spdk_scsi_lun *lun = desc->lun; ++ ++ scsi_lun_free_io_channel(lun); ++} ++ ++int ++spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun) ++{ ++ return lun->id; ++} ++ ++const char * ++spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun) ++{ ++ return spdk_bdev_get_name(lun->bdev); ++} ++ ++const struct spdk_scsi_dev * ++spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun) ++{ ++ return lun->dev; ++} ++ ++bool ++scsi_lun_has_pending_mgmt_tasks(const struct spdk_scsi_lun *lun, ++ const struct spdk_scsi_port *initiator_port) ++{ ++ struct spdk_scsi_task *task; ++ ++ if (initiator_port == NULL) { ++ return _scsi_lun_has_pending_mgmt_tasks(lun) || ++ scsi_lun_has_outstanding_mgmt_tasks(lun); ++ } ++ ++ TAILQ_FOREACH(task, &lun->pending_mgmt_tasks, scsi_link) { ++ if (task->initiator_port == initiator_port) { ++ return true; ++ } ++ } ++ ++ TAILQ_FOREACH(task, &lun->mgmt_tasks, scsi_link) { ++ if (task->initiator_port == initiator_port) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++/* This check includes both pending and submitted (outstanding) tasks. 
*/ ++bool ++scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun, ++ const struct spdk_scsi_port *initiator_port) ++{ ++ struct spdk_scsi_task *task; ++ ++ if (initiator_port == NULL) { ++ return _scsi_lun_has_pending_tasks(lun) || ++ scsi_lun_has_outstanding_tasks(lun); ++ } ++ ++ TAILQ_FOREACH(task, &lun->pending_tasks, scsi_link) { ++ if (task->initiator_port == initiator_port) { ++ return true; ++ } ++ } ++ ++ TAILQ_FOREACH(task, &lun->tasks, scsi_link) { ++ if (task->initiator_port == initiator_port) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++bool ++spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun) ++{ ++ return lun->removed; ++} ++ ++bool ++spdk_scsi_lun_get_dif_ctx(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task, ++ struct spdk_dif_ctx *dif_ctx) ++{ ++ return bdev_scsi_get_dif_ctx(lun->bdev, task, dif_ctx); ++} +diff --git a/lib/ssam/Makefile b/lib/ssam/Makefile +new file mode 100644 +index 0000000..93c1ec7 +--- /dev/null ++++ b/lib/ssam/Makefile +@@ -0,0 +1,49 @@ ++# ++# BSD LICENSE ++# ++# Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions ++# are met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in ++# the documentation and/or other materials provided with the ++# distribution. ++# * Neither the name of Intel Corporation nor the names of its ++# contributors may be used to endorse or promote products derived ++# from this software without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++CFLAGS += -I. 
-I../../dpdk/lib/eal/common ++CFLAGS += $(ENV_CFLAGS) ++ ++C_SRCS = ssam.c ssam_blk.c ssam_rpc.c \ ++ ssam_config.c ssam_scsi.c ssam_malloc.c ssam_device_pcie.c ++ ++LIBNAME = ssam ++ ++SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ssam.map) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/lib/ssam/spdk_ssam.map b/lib/ssam/spdk_ssam.map +new file mode 100644 +index 0000000..ed8b8cd +--- /dev/null ++++ b/lib/ssam/spdk_ssam.map +@@ -0,0 +1,16 @@ ++{ ++ global: ++ ++ # public functions ++ spdk_ssam_user_config_init; ++ spdk_ssam_init; ++ spdk_ssam_exit; ++ spdk_ssam_subsystem_fini; ++ spdk_ssam_subsystem_init; ++ spdk_ssam_config_json; ++ ssam_set_shm_created; ++ ssam_get_shm_created; ++ ssam_poller_start; ++ ++ local: *; ++}; +diff --git a/lib/ssam/ssam.c b/lib/ssam/ssam.c +new file mode 100644 +index 0000000..9326b05 +--- /dev/null ++++ b/lib/ssam/ssam.c +@@ -0,0 +1,1720 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++ ++#include "spdk/scsi_spec.h" ++#include "spdk/scsi.h" ++#include "spdk/stdinc.h" ++#include "spdk/env.h" ++#include "spdk/likely.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/memory.h" ++#include "spdk/barrier.h" ++#include "spdk/bdev_module.h" ++#include "spdk/bdev.h" ++#include "spdk/endian.h" ++ ++#include "ssam_internal.h" ++ ++#define SSAM_PF_NUM_MAX_VAL 31 ++#define SSAM_PF_PLUS_VF_NUM_MAX_VAL 4096 ++ ++#define INQUIRY_OFFSET(field) \ ++ offsetof(struct spdk_scsi_cdb_inquiry_data, field) + \ ++ sizeof(((struct spdk_scsi_cdb_inquiry_data *)0x0)->field) ++ ++#define IO_STUCK_TIMEOUT 120 ++#define SEND_EVENT_WAIT_TIME 10 ++#define VMIO_TYPE_VIRTIO_SCSI_CTRL 4 ++#define DEVICE_READY_TIMEOUT 15 ++#define DEVICE_READY_WAIT_TIME 100000 ++ ++bool g_ssam_subsystem_exit = false; ++ ++struct ssam_event_user_ctx { ++ bool session_freed; /* true if session has been freed */ ++ bool async_done; /* true if session event done */ ++ void *ctx; /* store user context pointer */ ++}; ++ ++struct ssam_session_fn_ctx { ++ /* Device session pointer obtained before enqueuing the event */ ++ struct spdk_ssam_session *smsession; ++ ++ spdk_ssam_session_rsp_fn *rsp_fn; ++ ++ void *rsp_ctx; ++ ++ /* User provided function to be executed on session's thread. */ ++ spdk_ssam_session_fn cb_fn; ++ /** ++ * User provided function to be called on the init thread ++ * after iterating through all sessions. ++ */ ++ spdk_ssam_session_cpl_fn cpl_fn; ++ ++ /* Custom user context */ ++ struct ssam_event_user_ctx user_ctx; ++ ++ /* Session start event time */ ++ uint64_t start_tsc; ++ ++ bool need_async; ++ ++ int rsp; ++}; ++ ++/* ssam total information */ ++struct spdk_ssam_info { ++ ssam_mempool_t *mp[SSAM_MAX_CORE_NUM]; ++}; ++ ++static struct spdk_ssam_info g_ssam_info; ++ ++/* Thread performing all ssam management operations */ ++static struct spdk_thread *g_ssam_init_thread; ++ ++static TAILQ_HEAD(, spdk_ssam_dev) g_ssam_devices = ++ TAILQ_HEAD_INITIALIZER(g_ssam_devices); ++ ++static pthread_mutex_t g_ssam_mutex = PTHREAD_MUTEX_INITIALIZER; ++ ++/* Save cpu mask when ssam management thread started */ ++static struct spdk_cpuset g_ssam_core_mask; ++ ++/* Call back when spdk_ssam_fini complete */ ++static spdk_ssam_fini_cb g_ssam_fini_cpl_cb; ++ ++static int spdk_ssam_init(void); ++ ++static int ++ssam_sessions_init(struct spdk_ssam_session ***smsession) ++{ ++ *smsession = (struct spdk_ssam_session **)calloc( ++ SSAM_MAX_SESSION_PER_DEV, sizeof(struct spdk_ssam_session *)); ++ if (*smsession == NULL) { ++ SPDK_ERRLOG("calloc sessions failed\n"); ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static int ++ssam_sessions_insert(struct spdk_ssam_session **smsessions, struct spdk_ssam_session *smsession) ++{ ++ uint16_t i = smsession->gfunc_id; ++ ++ if (smsessions[i] != NULL) { ++ SPDK_ERRLOG("smsessions already have such sesseion\n"); ++ return -ENOSPC; ++ } ++ ++ smsessions[i] = smsession; ++ ++ return 0; ++} ++ ++void ++ssam_sessions_remove(struct spdk_ssam_session **smsessions, struct spdk_ssam_session *smsession) ++{ ++ uint16_t i = smsession->gfunc_id; ++ ++ if (smsessions[i] == NULL) { ++ SPDK_WARNLOG("smsessions no such sesseion\n"); ++ return; ++ } ++ ++ smsessions[i] = NULL; ++ return; ++} ++ ++static struct spdk_ssam_session * ++ssam_sessions_first(int begin, struct spdk_ssam_session **smsessions) ++{ ++ int i; ++ ++ for (i = begin; i < SSAM_MAX_SESSION_PER_DEV; i++) { ++ if (smsessions[i] != NULL) { ++ return smsessions[i]; ++ } ++ } ++ return NULL; 
++} ++ ++bool ++ssam_sessions_empty(struct spdk_ssam_session **smsessions) ++{ ++ struct spdk_ssam_session *session; ++ ++ session = ssam_sessions_first(0, smsessions); ++ if (session == NULL) { ++ return true; ++ } ++ ++ return false; ++} ++ ++struct spdk_ssam_session * ++ssam_sessions_next(struct spdk_ssam_session **smsessions, struct spdk_ssam_session *smsession) ++{ ++ if (smsession == NULL) { ++ return ssam_sessions_first(0, smsessions); ++ } ++ if (smsession->gfunc_id == SSAM_MAX_SESSION_PER_DEV) { ++ return NULL; ++ } ++ return ssam_sessions_first(smsession->gfunc_id + 1, smsessions); ++} ++ ++void ++ssam_session_insert_io_wait(struct spdk_ssam_session *smsession, ++ struct spdk_ssam_session_io_wait *io_wait) ++{ ++ TAILQ_INSERT_TAIL(&smsession->smdev->io_wait_queue, io_wait, link); ++ smsession->smdev->io_wait_cnt++; ++} ++ ++static void ++ssam_session_remove_io_wait(struct spdk_ssam_dev *smdev, ++ struct spdk_ssam_session_io_wait *session_io_wait) ++{ ++ TAILQ_REMOVE(&smdev->io_wait_queue, session_io_wait, link); ++ smdev->io_wait_cnt--; ++} ++ ++void ++ssam_session_insert_io_wait_r(struct spdk_ssam_dev *smdev, ++ struct spdk_ssam_session_io_wait_r *io_wait_r) ++{ ++ TAILQ_INSERT_TAIL(&smdev->io_wait_queue_r, io_wait_r, link); ++ smdev->io_wait_r_cnt++; ++} ++ ++static void ++ssam_session_remove_io_wait_r(struct spdk_ssam_dev *smdev, ++ struct spdk_ssam_session_io_wait_r *session_io_wait_r) ++{ ++ TAILQ_REMOVE(&smdev->io_wait_queue_r, session_io_wait_r, link); ++ smdev->io_wait_r_cnt--; ++} ++ ++void ++ssam_session_destroy(struct spdk_ssam_session *smsession) ++{ ++ if (smsession == NULL || smsession->smdev == NULL) { ++ return; ++ } ++ /* Remove smsession from the queue in advance to prevent access by the poller thread. */ ++ if (!ssam_sessions_empty(smsession->smdev->smsessions)) { ++ ssam_sessions_remove(smsession->smdev->smsessions, smsession); ++ } ++ // The smdev poller is not deleted here, but at the end of the app. 
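++	// Only the lookup entry in smdev->smsessions is cleared here; the session
++	// memory itself is not freed by this function.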
++} ++ ++uint64_t ++ssam_get_diff_tsc(uint64_t tsc) ++{ ++ return spdk_get_ticks() - tsc; ++} ++ ++int ++spdk_ssam_check_gfunc_id(uint16_t gfunc_id) ++{ ++ enum ssam_device_type type; ++ ++ if (gfunc_id == SPDK_INVALID_GFUNC_ID) { ++ SPDK_ERRLOG("Check gfunc_id(%u) error\n", gfunc_id); ++ return -EINVAL; ++ } ++ ++ type = spdk_ssam_get_virtio_type(gfunc_id); ++ if (type >= SSAM_DEVICE_VIRTIO_MAX) { ++ SPDK_ERRLOG("Check gfunc_id(%u) virtio type(%d) error\n", gfunc_id, type); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ ++/* Find a tid which has minimum device */ ++static uint16_t ++ssam_get_min_payload_tid(uint16_t cpu_num) ++{ ++ if (cpu_num == 0) { ++ return SPDK_INVALID_TID; ++ } ++ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ /* All tid have been used, find a tid which has minimum device */ ++ uint32_t min = UINT32_MAX; ++ uint16_t tid = 0; ++ ++ TAILQ_FOREACH_SAFE(smdev, &g_ssam_devices, tailq, tmp) { ++ if (smdev->active_session_num < min) { ++ min = smdev->active_session_num; ++ tid = smdev->tid; ++ } ++ } ++ ++ return tid; ++} ++ ++/* Get a tid number */ ++uint16_t ++spdk_ssam_get_tid(void) ++{ ++ uint32_t cpu_num; ++ ++ cpu_num = spdk_cpuset_count(&g_ssam_core_mask); ++ if ((cpu_num == 0) || (cpu_num > UINT16_MAX)) { ++ /* If cpu_num > UINT16_MAX, the result of tid will overflow */ ++ SPDK_ERRLOG("CPU num %u not valid.\n", cpu_num); ++ return SPDK_INVALID_TID; ++ } ++ ++ return ssam_get_min_payload_tid((uint16_t)cpu_num); ++} ++ ++void ++spdk_ssam_lock(void) ++{ ++ pthread_mutex_lock(&g_ssam_mutex); ++} ++ ++int ++spdk_ssam_trylock(void) ++{ ++ return pthread_mutex_trylock(&g_ssam_mutex); ++} ++ ++void ++spdk_ssam_unlock(void) ++{ ++ pthread_mutex_unlock(&g_ssam_mutex); ++} ++ ++static struct spdk_ssam_session * ++spdk_ssam_session_find_in_dev(const struct spdk_ssam_dev *smdev, ++ uint16_t gfunc_id) ++{ ++ return smdev->smsessions[gfunc_id]; ++} ++ ++void ++ssam_dump_info_json(struct spdk_ssam_dev *smdev, uint16_t gfunc_id, ++ struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_json_write_named_array_begin(w, "session"); ++ if (gfunc_id == UINT16_MAX) { ++ smsession = ssam_sessions_next(smdev->smsessions, NULL); ++ while (smsession != NULL) { ++ smsession->backend->dump_info_json(smsession, w); ++ smsession = ssam_sessions_next(smdev->smsessions, smsession); ++ } ++ } else { ++ smsession = spdk_ssam_session_find_in_dev(smdev, gfunc_id); ++ smsession->backend->dump_info_json(smsession, w); ++ } ++ ++ spdk_json_write_array_end(w); ++} ++ ++const char * ++spdk_ssam_dev_get_name(const struct spdk_ssam_dev *smdev) ++{ ++ if (!smdev) { ++ return ""; ++ } ++ return smdev->name; ++} ++ ++const char * ++spdk_ssam_session_get_name(const struct spdk_ssam_session *smsession) ++{ ++ if (!smsession) { ++ return ""; ++ } ++ return smsession->name; ++} ++ ++struct spdk_ssam_dev * ++spdk_ssam_dev_next(const struct spdk_ssam_dev *smdev) ++{ ++ if (smdev == NULL) { ++ return TAILQ_FIRST(&g_ssam_devices); ++ } ++ ++ return TAILQ_NEXT(smdev, tailq); ++} ++ ++struct spdk_ssam_session * ++spdk_ssam_session_find(uint16_t gfunc_id) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ TAILQ_FOREACH_SAFE(smdev, &g_ssam_devices, tailq, tmp) { ++ smsession = spdk_ssam_session_find_in_dev(smdev, gfunc_id); ++ if (smsession != NULL) { ++ return smsession; ++ } ++ } ++ ++ return NULL; ++} ++ ++uint16_t ++spdk_ssam_get_gfunc_id_by_name(char *name) ++{ ++ 
struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ uint16_t gfunc_id; ++ TAILQ_FOREACH_SAFE(smdev, &g_ssam_devices, tailq, tmp) { ++ if (smdev != NULL && smdev->active_session_num > 0) { ++ for (gfunc_id = 0; gfunc_id <= SSAM_PF_NUM_MAX_VAL; gfunc_id++) { ++ smsession = spdk_ssam_session_find_in_dev(smdev, gfunc_id); ++ if (smsession != NULL && strcmp(name, smsession->name) == 0) { ++ return gfunc_id; ++ } ++ } ++ } ++ } ++ ++ SPDK_WARNLOG("controller(%s) is not existed\n", name); ++ return SPDK_INVALID_GFUNC_ID; ++} ++ ++static struct spdk_ssam_dev * ++spdk_ssam_dev_find(uint16_t tid) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ ++ TAILQ_FOREACH_SAFE(smdev, &g_ssam_devices, tailq, tmp) { ++ if (smdev->tid == tid) { ++ return smdev; ++ } ++ } ++ ++ return NULL; ++} ++ ++int ++ssam_mount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id) ++{ ++ uint16_t gfunc_id = smsession->gfunc_id; ++ uint16_t tid = smsession->smdev->tid; ++ ++ return ssam_function_mount(gfunc_id, lun_id, SSAM_MOUNT_NORMAL, tid); ++} ++ ++int ++ssam_umount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id) ++{ ++ int rc; ++ ++ rc = ssam_function_umount(smsession->gfunc_id, lun_id); ++ if (rc != 0) { ++ SPDK_WARNLOG("%s: function umount failed when add scsi tgt, %d.\n", smsession->name, rc); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++int ++ssam_remount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id) ++{ ++ return ssam_function_mount(smsession->gfunc_id, lun_id, SSAM_MOUNT_NORMAL, smsession->smdev->tid); ++} ++ ++static int ++ssam_remove_session(struct spdk_ssam_session *smsession) ++{ ++ int rc; ++ ++ if (smsession->backend->remove_session != NULL) { ++ rc = smsession->backend->remove_session(smsession); ++ if (rc != 0) { ++ SPDK_ERRLOG("session: %s can not be removed, task cnt %d.\n", ++ smsession->name, smsession->task_cnt); ++ return rc; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_dev_thread_exit(void *unused) ++{ ++ (void)unused; ++ spdk_thread_exit(spdk_get_thread()); ++} ++ ++static int ++ssam_tid_to_cpumask(uint16_t tid, struct spdk_cpuset *cpumask) ++{ ++ uint32_t core; ++ uint32_t lcore; ++ uint32_t cnt; ++ ++ for (lcore = 0, cnt = 0; lcore < SPDK_CPUSET_SIZE - 1; lcore++) { ++ if (spdk_cpuset_get_cpu(&g_ssam_core_mask, lcore)) { ++ if (cnt == tid) { ++ core = lcore; ++ spdk_cpuset_set_cpu(cpumask, core, true); ++ return 0; ++ } ++ cnt++; ++ } ++ } ++ ++ return -1; ++} ++ ++void ++ssam_session_start_done(struct spdk_ssam_session *smsession, int response) ++{ ++ if (response == 0) { ++ if (smsession->smdev->active_session_num == UINT32_MAX) { ++ SPDK_ERRLOG("smsession %s: active session num reached upper limit %u\n", ++ smsession->name, smsession->smdev->active_session_num); ++ return; ++ } ++ smsession->smdev->active_session_num++; ++ } ++} ++ ++void ++ssam_set_session_be_freed(void **ctx) ++{ ++ struct ssam_event_user_ctx *_ctx; ++ ++ if (ctx == NULL) { ++ return; ++ } ++ ++ _ctx = SPDK_CONTAINEROF(ctx, struct ssam_event_user_ctx, ctx); ++ _ctx->session_freed = true; ++} ++ ++void ++ssam_send_event_async_done(void **ctx) ++{ ++ struct ssam_event_user_ctx *_ctx; ++ ++ if (ctx == NULL) { ++ return; ++ } ++ ++ _ctx = SPDK_CONTAINEROF(ctx, struct ssam_event_user_ctx, ctx); ++ _ctx->async_done = true; ++} ++ ++void ++ssam_session_stop_done(struct spdk_ssam_session *smsession, int rsp, void **ctx) ++{ ++ if (rsp == 0) { ++ if (smsession->smdev->active_session_num > 
0) { ++ smsession->smdev->active_session_num--; ++ } else { ++ SPDK_ERRLOG("smsession %s: active session num reached lower limit %u\n", ++ smsession->name, smsession->smdev->active_session_num); ++ } ++ } ++ // Smdev cannot be free here ++ ++ /* Stop process need async */ ++ ssam_send_event_async_done(ctx); ++} ++ ++void ++ssam_session_unreg_response_cb(struct spdk_ssam_session *smsession) ++{ ++ smsession->rsp_fn = NULL; ++ smsession->rsp_ctx = NULL; ++} ++ ++static int ++ssam_dev_create_register(struct spdk_ssam_dev *smdev, uint16_t tid) ++{ ++ char name[NAME_MAX]; ++ struct spdk_cpuset cpumask; ++ int rc; ++ ++ smdev->tid = tid; ++ ++ rc = snprintf(name, NAME_MAX, "%s%u", "ssam.", smdev->tid); ++ if (rc < 0 || rc >= NAME_MAX) { ++ SPDK_ERRLOG("ssam dev name is too long, tid %u\n", tid); ++ return -EINVAL; ++ } ++ ++ spdk_cpuset_zero(&cpumask); ++ if (ssam_tid_to_cpumask(tid, &cpumask)) { ++ SPDK_ERRLOG("Can not find cpu for tid %u\n", tid); ++ return -EINVAL; ++ } ++ ++ smdev->name = strdup(name); ++ if (smdev->name == NULL) { ++ SPDK_ERRLOG("Failed to create name for ssam controller %s.\n", name); ++ return -EIO; ++ } ++ ++ smdev->thread = spdk_thread_create(smdev->name, &cpumask); ++ if (smdev->thread == NULL) { ++ SPDK_ERRLOG("Failed to create thread for ssam controller %s.\n", name); ++ free(smdev->name); ++ smdev->name = NULL; ++ return -EIO; ++ } ++ ++ rc = ssam_sessions_init(&smdev->smsessions); ++ if (rc != 0) { ++ return rc; ++ } ++ TAILQ_INSERT_TAIL(&g_ssam_devices, smdev, tailq); ++ TAILQ_INIT(&smdev->io_wait_queue); ++ TAILQ_INIT(&smdev->io_wait_queue_r); ++ ++ SPDK_NOTICELOG("Controller %s: new controller added, tid %u\n", smdev->name, tid); ++ ++ return 0; ++} ++ ++void ++ssam_dev_unregister(struct spdk_ssam_dev **dev) ++{ ++ struct spdk_ssam_dev *smdev = *dev; ++ struct spdk_thread *thread = smdev->thread; ++ ++ if (!ssam_sessions_empty(smdev->smsessions)) { ++ SPDK_NOTICELOG("Controller %s still has valid session.\n", ++ smdev->name); ++ return; ++ } ++ memset(smdev->smsessions, 0, SSAM_MAX_SESSION_PER_DEV * sizeof(struct spdk_ssam_session *)); ++ free(smdev->smsessions); ++ smdev->smsessions = NULL; ++ ++ // Used for hot restart. 
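++	// The stop poller is registered by ssam_dev_stop_worker_poller; release it here
++	// so that nothing still references the device once it is freed below.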
++ if (smdev->stop_poller != NULL) { ++ spdk_poller_unregister(&smdev->stop_poller); ++ smdev->stop_poller = NULL; ++ } ++ ++ SPDK_NOTICELOG("Controller %s: removed\n", smdev->name); ++ ++ free(smdev->name); ++ smdev->name = NULL; ++ spdk_ssam_lock(); ++ TAILQ_REMOVE(&g_ssam_devices, smdev, tailq); ++ spdk_ssam_unlock(); ++ ++ free(smdev); ++ smdev = NULL; ++ *dev = NULL; ++ ++ spdk_thread_send_msg(thread, ssam_dev_thread_exit, NULL); ++ ++ return; ++} ++ ++static int ++ssam_init_session_fields(struct spdk_ssam_session_reg_info *info, ++ struct spdk_ssam_dev *smdev, struct spdk_ssam_session *smsession) ++{ ++ smsession->mp = g_ssam_info.mp[smdev->tid % ssam_get_core_num()]; ++ smsession->initialized = true; ++ smsession->registered = true; ++ smsession->thread = smdev->thread; ++ smsession->backend = info->backend; ++ smsession->smdev = smdev; ++ smsession->gfunc_id = info->gfunc_id; ++ smsession->started = true; ++ smsession->rsp_fn = info->rsp_fn; ++ smsession->rsp_ctx = info->rsp_ctx; ++ smsession->max_queues = info->queues; ++ smsession->queue_size = SPDK_SSAM_DEFAULT_VQ_SIZE; ++ if (info->name == NULL) { ++ smsession->name = spdk_sprintf_alloc("%s_%s_%d", smdev->name, info->type_name, info->gfunc_id); ++ } else { ++ smsession->name = strdup(info->name); ++ } ++ if (smsession->name == NULL) { ++ SPDK_ERRLOG("smsession name alloc failed\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_add_session(struct spdk_ssam_session_reg_info *info, ++ struct spdk_ssam_dev *smdev, struct spdk_ssam_session **smsession) ++{ ++ struct spdk_ssam_session *l_stsession = NULL; ++ size_t with_ctx_len = sizeof(*l_stsession) + info->session_ctx_size; ++ int rc; ++ ++ if (smdev->active_session_num == SSAM_MAX_SESSION_PER_DEV) { ++ SPDK_ERRLOG("%s reached upper limit %u\n", smdev->name, SSAM_MAX_SESSION_PER_DEV); ++ return -EAGAIN; ++ } ++ ++ if (g_ssam_info.mp == NULL) { ++ SPDK_ERRLOG("No memory pool\n"); ++ return -ENOMEM; ++ } ++ ++ rc = posix_memalign((void **)&l_stsession, SPDK_CACHE_LINE_SIZE, with_ctx_len); ++ if (rc != 0) { ++ SPDK_ERRLOG("smsession alloc failed\n"); ++ return -ENOMEM; ++ } ++ memset(l_stsession, 0, with_ctx_len); ++ ++ rc = ssam_init_session_fields(info, smdev, l_stsession); ++ if (rc != 0) { ++ free(l_stsession); ++ l_stsession = NULL; ++ return rc; ++ } ++ ++ rc = ssam_sessions_insert(smdev->smsessions, l_stsession); ++ if (rc != 0) { ++ return rc; ++ } ++ *smsession = l_stsession; ++ if (smdev->type == VIRTIO_TYPE_UNKNOWN) { ++ smdev->type = info->backend->type; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_dev_register(struct spdk_ssam_dev **dev, uint16_t tid) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ int rc; ++ ++ smdev = calloc(1, sizeof(*smdev)); ++ if (smdev == NULL) { ++ SPDK_ERRLOG("Couldn't alloc device for tid %u.\n", tid); ++ return -1; ++ } ++ ++ rc = ssam_dev_create_register(smdev, tid); ++ if (rc != 0) { ++ free(smdev); ++ smdev = NULL; ++ return -1; ++ } ++ ++ *dev = smdev; ++ ++ return 0; ++} ++ ++int ++spdk_ssam_session_register(struct spdk_ssam_session_reg_info *info, ++ struct spdk_ssam_session **smsession) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ int rc; ++ ++ if (spdk_ssam_session_find(info->gfunc_id)) { ++ SPDK_ERRLOG("Session with function id %d already exists.\n", info->gfunc_id); ++ return -EEXIST; ++ } ++ ++ smdev = spdk_ssam_dev_find(info->tid); ++ if (smdev == NULL) { ++ // The smdev has been started during process initialization. Do not need to start the poller here. 
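++		// Without a device for this tid there is no worker thread to attach the
++		// session to, so the registration fails.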
++ SPDK_ERRLOG("No device with function id %d tid %u.\n", info->gfunc_id, info->tid); ++ return -ENODEV; ++ } ++ ++ rc = ssam_add_session(info, smdev, smsession); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return 0; ++} ++ ++int ++spdk_ssam_session_unregister(struct spdk_ssam_session *smsession) ++{ ++ int rc; ++ ++ if (smsession == NULL) { ++ SPDK_ERRLOG("smsession null.\n"); ++ return -EINVAL; ++ } ++ ++ if (smsession->task_cnt > 0) { ++ SPDK_ERRLOG("%s is processing I/O(%d) and cannot be deleted.\n", ++ smsession->name, smsession->task_cnt); ++ return -EBUSY; ++ } ++ ++ if (smsession->pending_async_op_num != 0) { ++ SPDK_ERRLOG("[OFFLOAD_SNIC] %s has internal events(%d) and cannot be deleted.\n", ++ smsession->name, smsession->pending_async_op_num); ++ return -EBUSY; ++ } ++ ++ rc = ssam_remove_session(smsession); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static void ssam_io_queue_handle(struct spdk_ssam_dev *smdev) ++{ ++ uint64_t count = 0; ++ uint64_t io_wait_cnt = smdev->io_wait_cnt; ++ while (count < io_wait_cnt) { ++ struct spdk_ssam_session_io_wait *io_wait = TAILQ_FIRST(&smdev->io_wait_queue); ++ ssam_session_remove_io_wait(smdev, io_wait); ++ if (io_wait->cb_fn != NULL) { ++ io_wait->cb_fn(io_wait->cb_arg); ++ } ++ count++; ++ } ++} ++ ++struct forward_ctx { ++ struct spdk_ssam_session *smsession; ++ struct ssam_request *io_req; ++}; ++ ++static void ++ssam_handle_forward_req(void *_ctx) ++{ ++ struct forward_ctx *ctx = (struct forward_ctx *)_ctx; ++ ctx->smsession->backend->request_worker(ctx->smsession, ctx->io_req); ++ free(ctx); ++} ++// The resent request that is polled at the beginning of the hot restart is not the smsession of this smdev ++// and needs to be forwarded to the corresponding smdev. ++// If the forwarding is successful, true is returned. Otherwise, false is returned. 
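++// The lookup below walks every registered device under g_ssam_mutex and, on a match,
++// hands the request to the owning session's thread via spdk_thread_send_msg.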
++static bool ++ssam_dev_forward_req(struct ssam_request *io_req) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct forward_ctx *ctx = NULL; ++ int rc; ++ spdk_ssam_lock(); ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ if (smdev->smsessions[io_req->gfunc_id] != NULL) { ++ ctx = calloc(1, sizeof(struct forward_ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("%s: calloc failed.\n", smdev->name); ++ goto out; ++ } ++ ctx->smsession = smdev->smsessions[io_req->gfunc_id]; ++ ctx->io_req = io_req; ++ rc = spdk_thread_send_msg(smdev->smsessions[io_req->gfunc_id]->thread, ssam_handle_forward_req, ctx); ++ if (rc) { ++ SPDK_ERRLOG("%s: send msg error %d.\n", smdev->name, rc); ++ free(ctx); ++ goto out; ++ } ++ spdk_ssam_unlock(); ++ return true; ++ } ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++out: ++ spdk_ssam_unlock(); ++ return false; ++} ++ ++struct ssam_dev_io_complete_arg { ++ struct spdk_ssam_dev *smdev; ++ struct ssam_io_response io_resp; ++}; ++ ++static void ++ssam_dev_io_complete_cb(void *arg) ++{ ++ struct ssam_dev_io_complete_arg *cb_arg = (struct ssam_dev_io_complete_arg *)arg; ++ int rc = ssam_io_complete(cb_arg->smdev->tid, &cb_arg->io_resp); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_dev_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smdev, io_wait_r); ++ return; ++ } ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ssam_dev_io_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, bool success) ++{ ++ struct ssam_io_response io_resp; ++ struct ssam_virtio_res *virtio_res = (struct ssam_virtio_res*)&io_resp.data; ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ struct iovec io_vec; ++ struct virtio_scsi_cmd_resp resp = {0}; ++ enum ssam_device_type type; ++ uint8_t res_status; ++ int rc; ++ type = spdk_ssam_get_virtio_type(io_req->gfunc_id); ++ ++ if (success) { ++ switch (type) { ++ case SSAM_DEVICE_VIRTIO_BLK: ++ res_status = VIRTIO_BLK_S_OK; ++ break; ++ case SSAM_DEVICE_VIRTIO_SCSI: ++ res_status = VIRTIO_SCSI_S_OK; ++ break; ++ default: ++ res_status = 0; // unknown type, maybe 0 means ok ++ } ++ } else { ++ SPDK_INFOLOG(ssam, "%s: io complete return error gfunc_id %u type %d.\n", ++ smdev->name, io_req->gfunc_id, type); ++ switch (type) { ++ case SSAM_DEVICE_VIRTIO_BLK: ++ res_status = VIRTIO_BLK_S_IOERR; ++ break; ++ case SSAM_DEVICE_VIRTIO_SCSI: ++ res_status = VIRTIO_SCSI_S_FAILURE; ++ break; ++ default: ++ res_status = 1; // unknown type, maybe 1 means error ++ } ++ } ++ ++ memset(&io_resp, 0, sizeof(io_resp)); ++ io_resp.gfunc_id = io_req->gfunc_id; ++ io_resp.iocb_id = io_req->iocb_id; ++ io_resp.status = io_req->status; ++ io_resp.flr_seq = io_req->flr_seq; ++ io_resp.req = io_req; ++ ++ virtio_res->iovs = &io_vec; ++ if (type == SSAM_DEVICE_VIRTIO_SCSI && io_cmd->writable) { ++ virtio_res->iovs->iov_base = io_cmd->iovs[1].iov_base; ++ virtio_res->iovs->iov_len = io_cmd->iovs[1].iov_len; ++ } else { ++ virtio_res->iovs->iov_base = io_cmd->iovs[io_cmd->iovcnt - 1].iov_base; ++ virtio_res->iovs->iov_len = io_cmd->iovs[io_cmd->iovcnt - 1].iov_len; ++ } ++ virtio_res->iovcnt = 1; ++ if (type == SSAM_DEVICE_VIRTIO_SCSI && io_req->type != VMIO_TYPE_VIRTIO_SCSI_CTRL) { ++ resp.response = res_status; ++ virtio_res->rsp = &resp; ++ virtio_res->rsp_len = sizeof(struct 
virtio_scsi_cmd_resp); ++ } else { ++ virtio_res->rsp = &res_status; ++ virtio_res->rsp_len = sizeof(res_status); ++ } ++ ++ rc = ssam_io_complete(smdev->tid, &io_resp); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_dev_io_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_dev_io_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smdev; ++ cb_arg->io_resp = io_resp; ++ io_wait_r->cb_fn = ssam_dev_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smdev, io_wait_r); ++ } ++} ++ ++static void ++ssam_dev_io_request(struct spdk_ssam_dev *smdev, struct ssam_request *io_req) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ ++ SPDK_INFOLOG(ssam_blk_data, "handling io tid=%u gfunc_id=%u type=%d rw=%u vqid=%u reqid=%u.\n", ++ smdev->tid, io_req->gfunc_id, io_req->type, io_req->req.cmd.writable, ++ io_req->req.cmd.virtio.vq_idx, io_req->req.cmd.virtio.req_idx); ++ ++ smsession = smdev->smsessions[io_req->gfunc_id]; ++ if (smsession == NULL) { ++ if (!ssam_dev_forward_req(io_req)) { ++ SPDK_INFOLOG(ssam, "%s: not have gfunc_id %u yet in io request.\n", ++ smdev->name, io_req->gfunc_id); ++ ssam_dev_io_complete(smdev, io_req, false); ++ } ++ return; ++ } ++ ++ smsession->backend->request_worker(smsession, io_req); ++ return; ++} ++ ++static void ssam_io_wait_r_queue_handle(struct spdk_ssam_dev *smdev) ++{ ++ uint64_t count = 0; ++ uint64_t io_wait_r_cnt = smdev->io_wait_r_cnt > SSAM_MAX_REQ_POLL_SIZE ? SSAM_MAX_REQ_POLL_SIZE : smdev->io_wait_r_cnt; ++ while (count < io_wait_r_cnt) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = TAILQ_FIRST(&smdev->io_wait_queue_r); ++ ssam_session_remove_io_wait_r(smdev, io_wait_r); ++ if (io_wait_r->cb_fn != NULL) { ++ io_wait_r->cb_fn(io_wait_r->cb_arg); ++ } ++ count++; ++ free(io_wait_r); ++ io_wait_r = NULL; ++ } ++} ++ ++static int ++ssam_dev_request_worker(void *arg) ++{ ++ int io_num; ++ struct ssam_request *io_req[SSAM_MAX_REQ_POLL_SIZE] = {0}; ++ struct spdk_ssam_dev *smdev = arg; ++ ++ // The I/O waiting due to insufficient memory needs to be processed first. ++ if (spdk_unlikely(smdev->io_wait_cnt > 0)) { ++ ssam_io_queue_handle(smdev); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ io_num = ssam_request_poll(smdev->tid, SSAM_MAX_REQ_POLL_SIZE, io_req); ++ if ((io_num <= 0) || (io_num > SSAM_MAX_REQ_POLL_SIZE)) { ++ /* ++ * The rpc delete callback is registered when the bdev deleting. spdk_put_io_channel ++ * executed the RPC delete callback.The stdev_io_no_data_request function continuously ++ * determines whether to perform the spdk_put_io_channel operation to ensure that the ++ * deletion of the bdev does not time out. 
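++		 * Waiting completions queued on io_wait_queue_r are drained in batches of at
++		 * most SSAM_MAX_REQ_POLL_SIZE entries per poller iteration.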
++ */ ++ if (spdk_unlikely(smdev->io_wait_r_cnt > 0)) { ++ ssam_io_wait_r_queue_handle(smdev); ++ } ++ return SPDK_POLLER_BUSY; ++ } ++ ++ if (spdk_unlikely(smdev->io_wait_r_cnt > 0)) { ++ ssam_io_wait_r_queue_handle(smdev); ++ } ++ ++ for (int i = 0; i < io_num; i++) { ++ ssam_dev_io_request(smdev, io_req[i]); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++ssam_dev_io_response(struct spdk_ssam_dev *smdev, const struct ssam_dma_rsp *dma_rsp) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ const struct spdk_ssam_dma_cb *dma_cb = (const struct spdk_ssam_dma_cb *)&dma_rsp->cb; ++ ++ SPDK_INFOLOG(ssam_blk_data, "handle dma resp tid=%u gfunc_id=%u rw=%u vqid=%u task_idx=%u statuc=%u.\n", ++ smdev->tid, dma_cb->gfunc_id, dma_cb->req_dir, ++ dma_cb->vq_idx, dma_cb->task_idx, dma_cb->status); ++ ++ smsession = smdev->smsessions[dma_cb->gfunc_id]; ++ if (smsession == NULL) { ++ smdev->discard_io_num++; ++ SPDK_ERRLOG("smsessions not have gfunc_id %u yet in io response.\n", dma_cb->gfunc_id); ++ return; ++ } ++ ++ smsession->backend->response_worker(smsession, (void *)dma_rsp); ++ ++ return; ++} ++ ++static void ++ssam_dev_print_stuck_io(struct spdk_ssam_dev *smdev) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ int i; ++ ++ for (i = 0; i < SSAM_MAX_SESSION_PER_DEV; i++) { ++ smsession = smdev->smsessions[i]; ++ if (smsession == NULL) { ++ continue; ++ } ++ if (smsession->task_cnt > 0) { ++ SPDK_ERRLOG("%s: %d IO stuck for %ds\n", smsession->name, ++ smsession->task_cnt, IO_STUCK_TIMEOUT); ++ if (smsession->backend->print_stuck_io_info != NULL) { ++ smsession->backend->print_stuck_io_info(smsession); ++ } ++ } ++ } ++} ++ ++static void ++ssam_dev_io_stuck_check(struct spdk_ssam_dev *smdev) ++{ ++ uint64_t diff_tsc = spdk_get_ticks() - smdev->io_stuck_tsc; ++ ++ if (smdev->io_num == 0) { ++ smdev->io_stuck_tsc = spdk_get_ticks(); ++ return; ++ } ++ ++ if ((diff_tsc / IO_STUCK_TIMEOUT) > spdk_get_ticks_hz()) { ++ ssam_dev_print_stuck_io(smdev); ++ smdev->io_stuck_tsc = spdk_get_ticks(); ++ } ++} ++ ++void ++ssam_dev_io_dec(struct spdk_ssam_dev *smdev) ++{ ++ smdev->io_num--; ++} ++ ++static int ++ssam_dev_response_worker(void *arg) ++{ ++ int io_num; ++ struct spdk_ssam_dev *smdev = arg; ++ struct ssam_dma_rsp dma_rsp[SSAM_MAX_RESP_POLL_SIZE] = {0}; ++ ++ uint64_t ticks = spdk_get_ticks(); ++ if (smdev->stat.poll_cur_tsc == 0) { ++ smdev->stat.poll_cur_tsc = ticks; ++ } else { ++ smdev->stat.poll_tsc += ticks - smdev->stat.poll_cur_tsc; ++ smdev->stat.poll_count++; ++ smdev->stat.poll_cur_tsc = ticks; ++ } ++ ++ io_num = ssam_dma_rsp_poll(smdev->tid, SSAM_MAX_RESP_POLL_SIZE, dma_rsp); ++ if (io_num <= 0 || io_num > SSAM_MAX_RESP_POLL_SIZE) { ++ ssam_dev_io_stuck_check(smdev); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ if (smdev->io_num < ((uint64_t)(uint32_t)io_num)) { ++ SPDK_ERRLOG("%s: DMA response IO num too much, should be %lu but %d\n", ++ smdev->name, smdev->io_num, io_num); ++ smdev->discard_io_num += io_num; ++ return SPDK_POLLER_BUSY; ++ } ++ smdev->io_stuck_tsc = spdk_get_ticks(); ++ ++ for (int i = 0; i < io_num; i++) { ++ ssam_dev_io_response(smdev, dma_rsp + i); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++ssam_dev_register_worker_poller(struct spdk_ssam_dev *smdev) ++{ ++ SPDK_NOTICELOG("%s: worker starting.\n", smdev->name); ++ if (smdev->requestq_poller == NULL) { ++ smdev->requestq_poller = SPDK_POLLER_REGISTER(ssam_dev_request_worker, smdev, 0); ++ if (smdev->requestq_poller == NULL) { ++ SPDK_WARNLOG("%s: stdev_request_worker start failed.\n", 
smdev->name); ++ return -1; ++ } ++ ++ SPDK_INFOLOG(ssam, "%s: started stdev_request_worker poller on lcore %d\n", ++ smdev->name, spdk_env_get_current_core()); ++ } ++ ++ if (smdev->responseq_poller == NULL) { ++ smdev->responseq_poller = SPDK_POLLER_REGISTER(ssam_dev_response_worker, smdev, 0); ++ if (smdev->responseq_poller == NULL) { ++ SPDK_WARNLOG("%s: stdev_response_worker start failed.\n", smdev->name); ++ return -1; ++ } ++ ++ SPDK_INFOLOG(ssam, "%s: started stdev_response_worker poller on lcore %d\n", ++ smdev->name, spdk_env_get_current_core()); ++ } ++ return 0; ++} ++ ++void ++ssam_dev_unregister_worker_poller(struct spdk_ssam_dev *smdev) ++{ ++ if (!ssam_sessions_empty(smdev->smsessions)) { ++ return; ++ } ++ ++ if (smdev->requestq_poller != NULL) { ++ spdk_poller_unregister(&smdev->requestq_poller); ++ smdev->requestq_poller = NULL; ++ } ++ ++ if (smdev->responseq_poller != NULL) { ++ spdk_poller_unregister(&smdev->responseq_poller); ++ smdev->responseq_poller = NULL; ++ } ++} ++// When stopping the worker, need to stop the two pollers first ++// and wait until all sessions are deleted, and then free smdev. ++static int ++ssam_dev_stop_poller(void *arg) { ++ struct spdk_ssam_dev *smdev = arg; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ // special processing is required for virtio-scsi, ++ // because In scsi scenarios, smsessions are not actively or passively removed. ++ if (smdev->type == VIRTIO_TYPE_SCSI && smdev->active_session_num > 0) { ++ for (int i = 0; i < SSAM_MAX_SESSION_PER_DEV; i++) { ++ if (smdev->smsessions[i] != NULL) { ++ smsession = smdev->smsessions[i]; ++ smsession->backend->remove_self(smsession); // remove session ++ } ++ } ++ } ++ ++ // 等待session全部被移除 ++ if (smdev->active_session_num != 0) { ++ return SPDK_POLLER_BUSY; ++ } ++ ++ // 删除smdev的资源 ++ ssam_dev_unregister(&smdev); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++ssam_dev_stop_worker_poller(void *args) ++{ ++ struct spdk_ssam_dev *smdev = (struct spdk_ssam_dev *)args; ++ ++ if (smdev->requestq_poller != NULL) { ++ spdk_poller_unregister(&smdev->requestq_poller); ++ smdev->requestq_poller = NULL; ++ } ++ ++ if (smdev->responseq_poller != NULL) { ++ spdk_poller_unregister(&smdev->responseq_poller); ++ smdev->responseq_poller = NULL; ++ } ++ ++ SPDK_NOTICELOG("%s: poller stopped.\n", smdev->name); ++ smdev->stop_poller = SPDK_POLLER_REGISTER(ssam_dev_stop_poller, smdev, 0); ++ if (smdev->stop_poller == NULL) { ++ SPDK_WARNLOG("%s: ssam_dev stop failed.\n", smdev->name); ++ } ++} ++// When starting the worker, need to start the two pollers first ++static void ++ssam_dev_start_worker_poller(void *args) ++{ ++ struct spdk_ssam_dev *smdev = (struct spdk_ssam_dev *)args; ++ ssam_dev_register_worker_poller(smdev); ++} ++ ++static void ++ssam_send_event_response(struct ssam_session_fn_ctx *ev_ctx) ++{ ++ if (ev_ctx->user_ctx.session_freed == true) { ++ goto out; ++ } ++ ++ if (*ev_ctx->rsp_fn != NULL) { ++ (*ev_ctx->rsp_fn)(ev_ctx->rsp_ctx, ev_ctx->rsp); ++ *ev_ctx->rsp_fn = NULL; ++ } ++ ++out: ++ /* ev_ctx be allocated by another thread */ ++ free(ev_ctx); ++ ev_ctx = NULL; ++} ++ ++static void ++ssam_check_send_event_timeout(struct ssam_session_fn_ctx *ev_ctx, spdk_msg_fn fn) ++{ ++ uint64_t diff_tsc = spdk_get_ticks() - ev_ctx->start_tsc; ++ struct spdk_ssam_session *smsession = ev_ctx->smsession; ++ ++ if ((diff_tsc / SEND_EVENT_WAIT_TIME) > spdk_get_ticks_hz()) { ++ /* If timeout, finish send msg, end the process */ ++ SPDK_ERRLOG("Send event to session %s time out.\n", 
smsession->name); ++ ev_ctx->rsp = -ETIMEDOUT; ++ ssam_send_event_response(ev_ctx); ++ return; ++ } ++ ++ spdk_thread_send_msg(spdk_get_thread(), fn, (void *)ev_ctx); ++ ++ return; ++} ++ ++static void ++ssam_send_event_finish(void *ctx) ++{ ++ struct ssam_session_fn_ctx *ev_ctx = ctx; ++ struct spdk_ssam_session *smsession = ev_ctx->smsession; ++ ++ if ((ev_ctx->rsp == 0) && (ev_ctx->need_async) && (ev_ctx->user_ctx.async_done == false)) { ++ ssam_check_send_event_timeout(ev_ctx, ssam_send_event_finish); ++ return; ++ } ++ ++ if (spdk_ssam_trylock() != 0) { ++ ssam_check_send_event_timeout(ev_ctx, ssam_send_event_finish); ++ return; ++ } ++ ++ if (smsession->pending_async_op_num > 0) { ++ smsession->pending_async_op_num--; ++ } else { ++ SPDK_ERRLOG("[OFFLOAD_SNIC] smsession %s: internal error.\n", smsession->name); ++ } ++ ++ /* If ev_ctx->cb_fn proccess failed, ev_ctx->cpl_fn will not excute */ ++ if ((ev_ctx->rsp == 0) && (ev_ctx->cpl_fn != NULL)) { ++ ev_ctx->cpl_fn(smsession, &ev_ctx->user_ctx.ctx); ++ } ++ ++ spdk_ssam_unlock(); ++ ++ ssam_send_event_response(ev_ctx); ++} ++ ++static void ++ssam_send_event(void *ctx) ++{ ++ struct ssam_session_fn_ctx *ev_ctx = ctx; ++ struct spdk_ssam_session *smsession = ev_ctx->smsession; ++ ++ if (spdk_ssam_trylock() != 0) { ++ ssam_check_send_event_timeout(ev_ctx, ssam_send_event); ++ return; ++ } ++ ++ if (smsession->initialized && (ev_ctx->cb_fn != NULL)) { ++ ev_ctx->user_ctx.async_done = false; ++ ev_ctx->rsp = ev_ctx->cb_fn(smsession, &ev_ctx->user_ctx.ctx); ++ } else { ++ ev_ctx->rsp = 0; ++ ev_ctx->user_ctx.async_done = true; ++ } ++ ++ spdk_ssam_unlock(); ++ // The judgment logic is used to adapt to the hot-restart. ++ // Because the session has been released during the hot restart, ++ // the following ssam_send_event_finish is not required. 
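++ // If the callback already marked the session as freed, only the event context is released here; otherwise completion is handed back to the init thread below.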
++ if (ev_ctx->user_ctx.session_freed) { ++ free(ev_ctx); ++ return; ++ } else { ++ ev_ctx->start_tsc = spdk_get_ticks(); ++ spdk_thread_send_msg(g_ssam_init_thread, ssam_send_event_finish, ctx); ++ } ++} ++ ++static spdk_ssam_session_rsp_fn g_rsp_fn = NULL; ++ ++int ++ssam_send_event_to_session(struct spdk_ssam_session *smsession, spdk_ssam_session_fn fn, ++ spdk_ssam_session_cpl_fn cpl_fn, struct spdk_ssam_send_event_flag send_event_flag, void *ctx) ++{ ++ struct ssam_session_fn_ctx *ev_ctx; ++ int rc; ++ ++ ev_ctx = calloc(1, sizeof(*ev_ctx)); ++ if (ev_ctx == NULL) { ++ SPDK_ERRLOG("Failed to alloc ssam event.\n"); ++ return -ENOMEM; ++ } ++ ++ ev_ctx->smsession = smsession; ++ ev_ctx->cb_fn = fn; ++ ev_ctx->cpl_fn = cpl_fn; ++ ev_ctx->need_async = send_event_flag.need_async; ++ if (send_event_flag.need_rsp == true) { ++ ev_ctx->rsp_fn = &smsession->rsp_fn; ++ ev_ctx->rsp_ctx = smsession->rsp_ctx; ++ } else { ++ ev_ctx->rsp_fn = &g_rsp_fn; ++ ev_ctx->rsp_ctx = NULL; ++ } ++ ++ ev_ctx->user_ctx.ctx = ctx; ++ ev_ctx->user_ctx.session_freed = false; ++ ++ if (smsession->pending_async_op_num < UINT32_MAX) { ++ smsession->pending_async_op_num++; ++ } else { ++ SPDK_ERRLOG("[OFFLOAD_SNIC] smsession %s: internel error, events stuck too much\n", smsession->name); ++ } ++ ++ ev_ctx->start_tsc = spdk_get_ticks(); ++ rc = spdk_thread_send_msg(smsession->thread, ssam_send_event, ev_ctx); ++ if (rc != 0) { ++ SPDK_ERRLOG("send thread msg failed\n"); ++ free(ev_ctx); ++ return rc; ++ } ++ return 0; ++} ++ ++void ++spdk_ssam_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_json_write_array_begin(w); ++ ++ spdk_ssam_lock(); ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ smsession = ssam_sessions_next(smdev->smsessions, NULL); ++ while (smsession != NULL) { ++ smsession->backend->write_config_json(smsession, w); ++ smsession = ssam_sessions_next(smdev->smsessions, smsession); ++ } ++ ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++ ++ spdk_ssam_unlock(); ++ ++ spdk_json_write_array_end(w); ++} ++ ++int ++ssam_get_config(struct spdk_ssam_session *smsession, uint8_t *config, ++ uint32_t len, uint16_t queues) ++{ ++ const struct spdk_ssam_session_backend *backend = smsession->backend; ++ ++ if (backend->ssam_get_config == NULL) { ++ return -1; ++ } ++ ++ return backend->ssam_get_config(smsession, config, len, queues); ++} ++ ++struct dev_destroy_ctx { ++ struct spdk_ssam_session *smsession; ++ void *args; ++}; ++ ++static void spdk_ssam_dev_destroy(void *arg) ++{ ++ struct dev_destroy_ctx *ctx = (struct dev_destroy_ctx *)arg; ++ ctx->smsession->backend->destroy_bdev_device(ctx->smsession, ctx->args); ++ free(ctx); ++} ++ ++void ++spdk_ssam_send_dev_destroy_msg(struct spdk_ssam_session *smsession, void *args) ++{ ++ struct dev_destroy_ctx *ctx = calloc(1, sizeof(struct dev_destroy_ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("%s: out of memory, destroy dev failed\n", smsession->name); ++ return; ++ } ++ ctx->smsession = smsession; ++ ctx->args = args; ++ spdk_thread_send_msg(g_ssam_init_thread, spdk_ssam_dev_destroy, ctx); ++} ++ ++void ++ssam_poller_start(void) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ spdk_ssam_lock(); ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ tmp = spdk_ssam_dev_next(smdev); ++ // Send the message to each smdev to start the worker on the smdev. 
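++ // ssam_dev_start_worker_poller then registers the request/response pollers on that smdev's own thread.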
++ spdk_thread_send_msg(smdev->thread, ssam_dev_start_worker_poller, smdev); ++ smdev = tmp; ++ } ++ spdk_ssam_unlock(); ++} ++ ++static void ++spdk_ssam_fini(void *arg) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_dev *tmp = NULL; ++ SPDK_WARNLOG("ssam is finishing\n"); ++ spdk_ssam_lock(); ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ tmp = spdk_ssam_dev_next(smdev); ++ // Send the message to each smdev to stop the worker on the smdev. ++ spdk_thread_send_msg(smdev->thread, ssam_dev_stop_worker_poller, smdev); ++ smdev = tmp; ++ } ++ spdk_ssam_unlock(); ++ ++ spdk_cpuset_zero(&g_ssam_core_mask); ++ ++ g_ssam_fini_cpl_cb(); ++} ++ ++static void * ++spdk_ssam_session_shutdown(void *arg) ++{ ++ SPDK_INFOLOG(ssam, "ssam session Exiting\n"); ++ spdk_thread_send_msg(g_ssam_init_thread, spdk_ssam_fini, NULL); ++ ++ return NULL; ++} ++ ++void ++spdk_ssam_subsystem_fini(spdk_ssam_fini_cb fini_cb) ++{ ++ if (spdk_get_thread() != g_ssam_init_thread) { ++ SPDK_ERRLOG("ssam finish thread not equal init thread, internel error\n"); ++ } ++ ++ g_ssam_fini_cpl_cb = fini_cb; ++ ++ spdk_ssam_session_shutdown(NULL); ++} ++ ++void ++spdk_ssam_subsystem_init(spdk_ssam_init_cb init_cb) ++{ ++ uint32_t i; ++ int ret; ++ int shm_id; ++ ++ g_ssam_init_thread = spdk_get_thread(); ++ if (g_ssam_init_thread == NULL) { ++ ret = -EBUSY; ++ SPDK_ERRLOG("get thread error\n"); ++ goto exit; ++ } ++ ++ /* init ssam core mask */ ++ spdk_cpuset_zero(&g_ssam_core_mask); ++ SPDK_ENV_FOREACH_CORE(i) { ++ spdk_cpuset_set_cpu(&g_ssam_core_mask, i, true); ++ } ++ ++ ret = ssam_set_core_num(spdk_cpuset_count(&g_ssam_core_mask)); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ ret = spdk_ssam_init(); ++ if (ret != 0) { ++ goto exit; ++ } ++ ++ if (!ssam_get_shm_created()) { ++ shm_id = shm_open(SSAM_SHM, O_CREAT | O_EXCL | O_RDWR, SSAM_SHM_PERMIT); ++ if (shm_id < 0) { ++ SPDK_ERRLOG("failed to create shared memory %s\n", SSAM_SHM); ++ ret = -1; ++ goto exit; ++ } ++ ssam_set_shm_created(true); ++ } ++ ++exit: ++ init_cb(ret); ++ return; ++} ++ ++// Initialize all smdev modules during submodule initialization. 
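++// One spdk_ssam_dev is registered per ssam core; if any registration fails, all devices registered so far are unregistered again.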
++static int ++ssam_smdev_init(void) ++{ ++ int rc = 0; ++ struct spdk_ssam_dev *smdev; ++ struct spdk_ssam_dev *tmp = NULL; ++ uint16_t core_num = ssam_get_core_num(); ++ for (uint16_t i = 0; i < core_num; ++i) { ++ rc = ssam_dev_register(&smdev, i); ++ if (rc != 0) { ++ goto out; ++ } ++ } ++ ++ rc = ssam_get_hot_upgrade_state(); ++ if (rc != 0) { ++ SPDK_ERRLOG(": virtio upgrade state failed.\n"); ++ return rc; ++ } ++ ++ return 0; ++out: ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ tmp = spdk_ssam_dev_next(smdev); ++ ssam_dev_unregister(&smdev); ++ smdev = tmp; ++ } ++ return rc; ++} ++ ++static int ++ssam_server_init(void) ++{ ++ uint32_t core_num = ssam_get_core_num(); ++ uint32_t mempool_size = (spdk_ssam_get_mempool_size() / core_num) & (~0U - 1); ++ uint32_t i; ++ ++ /* Disable dummy I/O for hot restart */ ++ ++ for (i = 0; i < core_num; i++) { ++ g_ssam_info.mp[i] = ssam_mempool_create(mempool_size * SSAM_MB, SSAM_DEFAULT_MEMPOOL_EXTRA_SIZE); ++ if (g_ssam_info.mp[i] == NULL) { ++ SPDK_ERRLOG("ssam create mempool[%d] failed, mempool_size = %uMB.\n", i, mempool_size); ++ return -ENOMEM; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_server_exit(void) ++{ ++ uint32_t core_num = ssam_get_core_num(); ++ uint32_t i; ++ ++ for (i = 0; i < core_num; i++) { ++ if (g_ssam_info.mp[i] != NULL) { ++ ssam_mempool_destroy(g_ssam_info.mp[i]); ++ g_ssam_info.mp[i] = NULL; ++ } ++ } ++ ++ memset(&g_ssam_info, 0x0, sizeof(struct spdk_ssam_info)); ++} ++ ++ ++static int ++ssam_check_device_status(void) ++{ ++ uint8_t ready = 0; ++ int times = 0; ++ int rc; ++ ++ do { ++ rc = ssam_check_device_ready(0, 0, &ready); ++ if (rc != 0) { ++ SPDK_ERRLOG("device check failed.\n"); ++ return rc; ++ } ++ ++ if (ready != 0) { ++ break; ++ } ++ ++ usleep(DEVICE_READY_WAIT_TIME); ++ times++; ++ } while (times < DEVICE_READY_TIMEOUT); ++ ++ if (ready == 0) { ++ SPDK_ERRLOG("device has not been ready after 1.5s.\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++ ++static int ++spdk_ssam_init(void) ++{ ++ int rc; ++ ++ rc = ssam_check_device_status(); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ rc = ssam_config_init(); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ rc = ssam_server_init(); ++ if (rc != 0) { ++ ssam_config_exit(); ++ return rc; ++ } ++ ++ rc = ssam_smdev_init(); ++ if (rc != 0) { ++ ssam_server_exit(); ++ ssam_config_exit(); ++ } ++ ++ return rc; ++} ++ ++void ++spdk_ssam_exit(void) ++{ ++ ssam_deinit_device_pcie_list(); ++ ssam_config_exit(); ++ ssam_server_exit(); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(ssam) +diff --git a/lib/ssam/ssam_blk.c b/lib/ssam/ssam_blk.c +new file mode 100644 +index 0000000..a13b9de +--- /dev/null ++++ b/lib/ssam/ssam_blk.c +@@ -0,0 +1,2127 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. 
++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++#include "spdk/env.h" ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++#include "spdk/thread.h" ++#include "spdk/likely.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "ssam_internal.h" ++ ++#define SESSION_STOP_POLLER_PERIOD 1000 ++#define ENQUEUE_TIMES_PER_IO 1000 ++ ++#define IOV_HEADER_TAIL_NUM 2 ++ ++#define SECTOR_SIZE 512 ++#define ALIGNMENT_2M (2048 * 1024) ++#define SERIAL_STRING_LEN 128 ++#define SMSESSION_STOP_TIMEOUT 2 // s ++#define PERF_STAT ++ ++/* Related to (SPDK_SSAM_IOVS_MAX * SPDK_SSAM_MAX_SEG_SIZE) */ ++#define PAYLOAD_SIZE_MAX (2048U * 2048) ++ ++#define RETRY_TIMEOUT 120 ++ ++/* Minimal set of features supported by every virtio-blk device */ ++#define SPDK_SSAM_BLK_FEATURES_BASE (SPDK_SSAM_FEATURES | \ ++ (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ ++ (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ ++ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | (1ULL << VIRTIO_BLK_F_BARRIER) | \ ++ (1ULL << VIRTIO_BLK_F_SCSI) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | \ ++ (1ULL << VIRTIO_BLK_F_MQ)) ++ ++extern bool g_ssam_subsystem_exit; ++ ++struct ssam_task_stat { ++ uint64_t start_tsc; ++ uint64_t dma_start_tsc; ++ uint64_t dma_end_tsc; ++ uint64_t bdev_start_tsc; ++ uint64_t bdev_func_tsc; ++ uint64_t bdev_end_tsc; ++ uint64_t complete_start_tsc; ++ uint64_t complete_end_tsc; ++}; ++ ++struct spdk_ssam_blk_task { ++ /* Returned status of I/O processing, it can be VIRTIO_BLK_S_OK, ++ * VIRTIO_BLK_S_IOERR or VIRTIO_BLK_S_UNSUPP ++ */ ++ volatile uint8_t *status; ++ ++ /* Number of bytes processed successfully */ ++ uint32_t used_len; ++ ++ /* Records the amount of valid data in the struct iovec iovs array. */ ++ uint32_t iovcnt; ++ struct ssam_iovec iovs; ++ ++ /* If set, the task is currently used for I/O processing. 
*/ ++ bool used; ++ ++ /* For bdev io wait */ ++ struct spdk_bdev_io_wait_entry bdev_io_wait; ++ struct spdk_ssam_session_io_wait session_io_wait; ++ struct spdk_ssam_blk_session *bsmsession; ++ ++ /* Size of whole payload in bytes */ ++ uint32_t payload_size; ++ ++ /* ssam request data */ ++ struct ssam_request *io_req; ++ ++ uint16_t vq_idx; ++ uint16_t req_idx; ++ uint16_t task_idx; ++ struct ssam_task_stat task_stat; ++}; ++ ++struct ssam_blk_stat { ++ uint64_t count; ++ uint64_t start_count; ++ uint64_t total_tsc; // pre_dma <- -> post_return ++ uint64_t dma_tsc; // pre_dma <- -> post_dma ++ uint64_t dma_count; ++ uint64_t dma_complete_count; ++ uint64_t bdev_tsc; // pre_bdev <- -> post_bdev ++ uint64_t bdev_submit_tsc; // <- spdk_bdev_xxx -> ++ uint64_t bdev_count; ++ uint64_t bdev_complete_count; ++ uint64_t complete_tsc; // pre_return <- -> post_return ++ uint64_t internel_tsc; // total_tsc - dma_tsc - bdev_tsc - complete_tsc ++ ++ uint64_t complete_read_ios; // Number of successfully completed read requests ++ uint64_t err_read_ios; // Number of failed completed read requests ++ uint64_t complete_write_ios; // Number of successfully completed write requests ++ uint64_t err_write_ios; // Number of failed completed write requests ++ uint64_t flush_ios; // Total number of flush requests ++ uint64_t complete_flush_ios; // Number of successfully completed flush requests ++ uint64_t err_flush_ios; // Number of failed completed flush requests ++ uint64_t other_ios; ++ uint64_t complete_other_ios; ++ uint64_t err_other_ios; ++ uint64_t fatal_ios; // Number of discarded requests ++ uint64_t io_retry; ++}; ++ ++struct spdk_ssam_blk_session { ++ /* The parent session must be the very first field in this struct */ ++ struct spdk_ssam_session smsession; ++ struct spdk_poller *stop_poller; ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_desc *bdev_desc; ++ struct spdk_io_channel *io_channel; ++ ++ /* volume id*/ ++ char *serial; ++ ++ /* accumulated I/O statistics */ ++ struct spdk_bdev_io_stat stat; ++ ++ // Current count of bdev operations for hot-restart. ++ int32_t bdev_count; ++ ++ // poller for waiting bdev finish when hot-restart ++ struct spdk_poller *stop_bdev_poller; ++ ++ /* controller statistics. 
*/ ++ struct ssam_blk_stat blk_stat; ++ ++ /* if set, all writes to the device will fail with ++ * VIRTIO_BLK_S_IOERR error code ++ */ ++ bool readonly; ++ ++ /* if set, indicate the session not have a bdev, all writes to the device ++ * will fail with VIRTIO_BLK_S_IOERR error code ++ */ ++ bool no_bdev; ++}; ++ ++struct ssam_blk_session_ctx { ++ struct spdk_ssam_blk_session *bsmsession; ++ void **user_ctx; ++}; ++ ++static const struct spdk_ssam_session_backend g_ssam_blk_session_backend; ++static int ssam_blk_remove_session(struct spdk_ssam_session *smsession); ++static void ssam_blk_request_worker(struct spdk_ssam_session *smsession, void *arg); ++static void ssam_blk_destroy_bdev_device(struct spdk_ssam_session *smsession, void *args); ++static void ssam_blk_response_worker(struct spdk_ssam_session *smsession, void *arg); ++static void ssam_blk_no_data_request_worker(struct spdk_ssam_session *smsession); ++static inline void ssam_request_queue_io(struct spdk_ssam_blk_task *task); ++static void ssam_task_complete(struct spdk_ssam_blk_task *task, uint8_t status); ++static void ssam_data_request_para(struct ssam_dma_request *dma_req, ++ struct spdk_ssam_blk_task *task, uint32_t type, uint8_t status); ++static void ssam_blk_print_stuck_io_info(struct spdk_ssam_session *smsession); ++static int ssam_process_blk_request(struct spdk_ssam_blk_task *task); ++static void ssam_free_task_pool(struct spdk_ssam_blk_session *bsmsession); ++static int ssam_blk_io_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, uint8_t status); ++static void ssam_session_io_resubmit(void *arg); ++ ++static inline struct spdk_ssam_blk_session * ++ssam_to_blk_session(struct spdk_ssam_session *smsession) ++{ ++ return (struct spdk_ssam_blk_session *)smsession; ++} ++ ++static void ++ssam_blk_dump_info_json(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", spdk_ssam_session_get_name(smsession)); ++ spdk_json_write_named_uint32(w, "function_id", (uint32_t)smsession->gfunc_id); ++ spdk_json_write_named_uint32(w, "queues", (uint32_t)smsession->max_queues); ++ ++ spdk_json_write_named_object_begin(w, "block"); ++ spdk_json_write_named_bool(w, "readonly", bsmsession->readonly); ++ spdk_json_write_name(w, "bdev"); ++ if (bsmsession->bdev != NULL) { ++ spdk_json_write_string(w, spdk_bdev_get_name(bsmsession->bdev)); ++ } else { ++ spdk_json_write_null(w); ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_dev_bdev_remove_cpl_cb(struct spdk_ssam_session *smsession, void **unnused) ++{ ++ /* All sessions have been notified, time to close the bdev */ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ if (bsmsession == NULL) { ++ return; ++ } ++ ++ if (bsmsession->bdev_desc != NULL) { ++ spdk_bdev_close(bsmsession->bdev_desc); ++ bsmsession->bdev_desc = NULL; ++ } ++ ++ /* bdev not create by ssam blk, no need be freed here */ ++ bsmsession->bdev = NULL; ++} ++ ++static void ++ssam_blk_stop_cpl_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ spdk_ssam_session_rsp_fn rsp_fn = smsession->rsp_fn; ++ void *rsp_ctx = smsession->rsp_ctx; ++ int rc; ++ ++ ssam_dev_bdev_remove_cpl_cb(smsession, NULL); ++ rc = ssam_virtio_blk_resize(smsession->gfunc_id, 0); ++ if (rc != 
0) { ++ SPDK_WARNLOG("%s: virtio blk resize failed when remove session.\n", smsession->name); ++ } ++ ++ /* Can not umount function here, whenever the gfunc_id must be mounted to ++ * the dummy tid or to the specific tid ++ */ ++ ++ if (smsession->gfunc_id > SSAM_PF_MAX_NUM) { ++ rc = ssam_virtio_blk_release_resource(smsession->gfunc_id); ++ } ++ ++ SPDK_NOTICELOG("BLK controller %s deleted\n", smsession->name); ++ ++ if (smsession->name != NULL) { ++ free(smsession->name); ++ smsession->name = NULL; ++ } ++ ++ ssam_set_session_be_freed(ctx); ++ memset(bsmsession, 0, sizeof(*bsmsession)); ++ free(bsmsession); ++ ++ if (rsp_fn != NULL) { ++ rsp_fn(rsp_ctx, 0); ++ rsp_fn = NULL; ++ } ++} ++ ++static void ++ssam_task_stat_tick(uint64_t *tsc) ++{ ++#ifdef PERF_STAT ++ *tsc = spdk_get_ticks(); ++#endif ++ return; ++} ++ ++static void ++ssam_blk_stat_statistics(struct spdk_ssam_blk_task *task, uint8_t status) ++{ ++#ifdef PERF_STAT ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ uint64_t dma_tsc = task->task_stat.dma_end_tsc - task->task_stat.dma_start_tsc; ++ uint64_t bdev_tsc = task->task_stat.bdev_end_tsc - task->task_stat.bdev_start_tsc; ++ uint64_t bdev_submit_tsc = task->task_stat.bdev_func_tsc - task->task_stat.bdev_start_tsc; ++ uint64_t complete_tsc = task->task_stat.complete_end_tsc - task->task_stat.complete_start_tsc; ++ uint64_t total_tsc = task->task_stat.complete_end_tsc - task->task_stat.start_tsc; ++ struct virtio_blk_outhdr *req = (struct virtio_blk_outhdr *)task->io_req->req.cmd.header; ++ ++ if (req->type == VIRTIO_BLK_T_IN) { // read ++ bsmsession->stat.read_latency_ticks += total_tsc; ++ bsmsession->stat.bytes_read += task->payload_size; ++ bsmsession->stat.num_read_ops++; ++ if (status == VIRTIO_BLK_S_OK) { ++ bsmsession->blk_stat.complete_read_ios++; ++ } else { ++ bsmsession->blk_stat.err_read_ios++; ++ } ++ } else if (req->type == VIRTIO_BLK_T_OUT) { // write ++ bsmsession->stat.write_latency_ticks += total_tsc; ++ bsmsession->stat.bytes_written += task->payload_size; ++ bsmsession->stat.num_write_ops++; ++ if (status == VIRTIO_BLK_S_OK) { ++ bsmsession->blk_stat.complete_write_ios++; ++ } else { ++ bsmsession->blk_stat.err_write_ios++; ++ } ++ } else if (req->type == VIRTIO_BLK_T_FLUSH) { // flush ++ bsmsession->blk_stat.flush_ios++; ++ if (status == VIRTIO_BLK_S_OK) { ++ bsmsession->blk_stat.complete_flush_ios++; ++ } else { ++ bsmsession->blk_stat.err_flush_ios++; ++ } ++ } else { ++ bsmsession->blk_stat.other_ios++; ++ if (status == VIRTIO_BLK_S_OK) { ++ bsmsession->blk_stat.complete_other_ios++; ++ } else { ++ bsmsession->blk_stat.err_other_ios++; ++ } ++ } ++ ++ bsmsession->blk_stat.dma_tsc += dma_tsc; ++ bsmsession->blk_stat.bdev_tsc += bdev_tsc; ++ bsmsession->blk_stat.bdev_submit_tsc += bdev_submit_tsc; ++ bsmsession->blk_stat.complete_tsc += complete_tsc; ++ bsmsession->blk_stat.total_tsc += total_tsc; ++ bsmsession->blk_stat.internel_tsc += total_tsc - complete_tsc - bdev_tsc - dma_tsc; ++ bsmsession->blk_stat.count += 1; ++#endif ++} ++ ++static void ++ssam_blk_configs(uint8_t *config, struct virtio_blk_config *blkcfg, ++ uint32_t len, struct spdk_bdev *bdev) ++{ ++ uint32_t cfg_len; ++ ++ /* minimum I/O size in blocks */ ++ blkcfg->min_io_size = 1; ++ ++ if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { ++ /* 32768 sectors is 16MiB, expressed in 512 Bytes */ ++ blkcfg->max_discard_sectors = 32768; ++ blkcfg->max_discard_seg = 1; ++ /* expressed in 512 Bytes sectors */ ++ blkcfg->discard_sector_alignment = 
blkcfg->blk_size / SECTOR_SIZE; ++ } ++ if (bdev && spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { ++ /* 32768 sectors is 16MiB, expressed in 512 Bytes */ ++ blkcfg->max_write_zeroes_sectors = 32768; ++ blkcfg->max_write_zeroes_seg = 1; ++ } ++ ++ cfg_len = sizeof(struct virtio_blk_config); ++ memcpy(config, blkcfg, (unsigned long)spdk_min(len, cfg_len)); ++ if (len < cfg_len) { ++ SPDK_NOTICELOG("Out config len %u < total config len %u\n", len, cfg_len); ++ } ++ ++ return; ++} ++ ++static int ++ssam_blk_get_config(struct spdk_ssam_session *smsession, uint8_t *config, ++ uint32_t len, uint16_t queues) ++{ ++ struct virtio_blk_config blkcfg; ++ struct spdk_ssam_blk_session *bsmsession = NULL; ++ struct spdk_bdev *bdev = NULL; ++ uint32_t blk_size; ++ uint64_t blkcnt; ++ ++ memset(&blkcfg, 0, sizeof(blkcfg)); ++ bsmsession = ssam_to_blk_session(smsession); ++ if (bsmsession == NULL) { ++ SPDK_ERRLOG("session is null.\n"); ++ return -1; ++ } ++ bdev = bsmsession->bdev; ++ if (bdev == NULL) { ++ return -1; ++ } ++ blk_size = spdk_bdev_get_block_size(bdev); ++ blkcnt = spdk_bdev_get_num_blocks(bdev); ++ /* ssam will use this configuration, this is the max capability of ++ * the ssam, configurations will be obtained through negotiation ++ * in the future. ++ */ ++ blkcfg.size_max = SPDK_SSAM_MAX_SEG_SIZE; ++ blkcfg.seg_max = SPDK_SSAM_IOVS_MAX; ++ ++ if (blk_size == 0) { ++ SPDK_ERRLOG("bdev's blk_size %u error.\n", blk_size); ++ return -1; ++ } ++ if (blkcnt > (UINT64_MAX / blk_size)) { ++ SPDK_ERRLOG("bdev's blkcnt %lu or blk_size %u out of range.\n", ++ blkcnt, blk_size); ++ return -1; ++ } ++ blkcfg.blk_size = blk_size; ++ /* expressed in 512 Bytes sectors */ ++ blkcfg.capacity = (blkcnt * blk_size) / 512; ++ blkcfg.num_queues = 1; // TODO: 1 change to queues after the VBS problem is fixed ++ ssam_blk_configs(config, &blkcfg, len, bdev); ++ ++ return 0; ++} ++ ++static void ++ssam_blk_write_config_json(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ if (bsmsession == NULL || bsmsession->bdev == NULL) { ++ return; ++ } ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "create_blk_controller"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bsmsession->bdev)); ++ char *gfunc_id = spdk_sprintf_alloc("%u", bsmsession->smsession.gfunc_id); ++ if (gfunc_id == NULL) { ++ SPDK_ERRLOG("alloc for gfunc_id failed\n"); ++ } else { ++ spdk_json_write_named_string(w, "index", gfunc_id); ++ free(gfunc_id); ++ } ++ spdk_json_write_named_bool(w, "readonly", bsmsession->readonly); ++ if (bsmsession->serial != NULL) { ++ spdk_json_write_named_string(w, "serial", bsmsession->serial); ++ } ++ if (bsmsession->smsession.gfunc_id > SSAM_PF_MAX_NUM) { ++ spdk_json_write_named_int32(w, "vqueue", (int32_t)bsmsession->smsession.max_queues); ++ } ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_blk_show_iostat_json(struct spdk_ssam_session *smsession, uint32_t id, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ struct spdk_bdev *bdev = spdk_ssam_get_session_bdev(smsession); ++ struct spdk_bdev_io_stat stat = {0}; ++ struct ssam_blk_stat blk_stat; ++ uint64_t ticks_hz = spdk_get_ticks_hz(); ++ uint64_t poll_count = smsession->smdev->stat.poll_count; ++ ++ 
memcpy(&stat, &bsmsession->stat, sizeof(struct spdk_bdev_io_stat)); /* a little question, mutex */ ++ memcpy(&blk_stat, &bsmsession->blk_stat, sizeof(struct ssam_blk_stat)); ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_uint32(w, "function_id", smsession->gfunc_id); ++ if (smsession->smdev->stat.poll_count == 0) { ++ poll_count = 1; ++ } ++ spdk_json_write_named_string_fmt(w, "poll_lat", "%.9f", ++ (float)smsession->smdev->stat.poll_tsc / poll_count / ticks_hz); ++ spdk_json_write_named_string(w, "bdev_name", (bdev == NULL) ? "" : spdk_bdev_get_name(bdev)); ++ spdk_json_write_named_uint64(w, "bytes_read", stat.bytes_read); ++ spdk_json_write_named_uint64(w, "num_read_ops", stat.num_read_ops); ++ spdk_json_write_named_uint64(w, "bytes_written", stat.bytes_written); ++ spdk_json_write_named_uint64(w, "num_write_ops", stat.num_write_ops); ++ spdk_json_write_named_uint64(w, "read_latency_ticks", stat.read_latency_ticks); ++ spdk_json_write_named_uint64(w, "write_latency_ticks", stat.write_latency_ticks); ++ spdk_json_write_named_uint64(w, "complete_read_ios", blk_stat.complete_read_ios); ++ spdk_json_write_named_uint64(w, "err_read_ios", blk_stat.err_read_ios); ++ spdk_json_write_named_uint64(w, "complete_write_ios", blk_stat.complete_write_ios); ++ spdk_json_write_named_uint64(w, "err_write_ios", blk_stat.err_write_ios); ++ spdk_json_write_named_uint64(w, "flush_ios", blk_stat.flush_ios); ++ spdk_json_write_named_uint64(w, "complete_flush_ios", blk_stat.complete_flush_ios); ++ spdk_json_write_named_uint64(w, "err_flush_ios", blk_stat.err_flush_ios); ++ spdk_json_write_named_uint64(w, "other_ios", blk_stat.other_ios); ++ spdk_json_write_named_uint64(w, "complete_other_ios", blk_stat.complete_other_ios); ++ spdk_json_write_named_uint64(w, "err_other_ios", blk_stat.err_other_ios); ++ ++ spdk_json_write_named_uint64(w, "fatal_ios", blk_stat.fatal_ios); ++ spdk_json_write_named_uint64(w, "io_retry", blk_stat.io_retry); ++ spdk_json_write_named_object_begin(w, "counters"); ++ spdk_json_write_named_uint64(w, "start_count", blk_stat.start_count); ++ spdk_json_write_named_uint64(w, "dma_count", blk_stat.dma_count); ++ spdk_json_write_named_uint64(w, "dma_complete_count", blk_stat.dma_complete_count); ++ spdk_json_write_named_uint64(w, "bdev_count", blk_stat.bdev_count); ++ spdk_json_write_named_uint64(w, "bdev_complete_count", blk_stat.bdev_complete_count); ++ spdk_json_write_object_end(w); ++ spdk_json_write_named_object_begin(w, "details"); ++ spdk_json_write_named_uint64(w, "count", blk_stat.count); ++ if (blk_stat.count == 0) { ++ blk_stat.count = 1; ++ } ++ spdk_json_write_named_string_fmt(w, "total_lat", "%.9f", (float)blk_stat.total_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "dma_lat", "%.9f", (float)blk_stat.dma_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "bdev_lat", "%.9f", (float)blk_stat.bdev_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "bdev_submit_lat", "%.9f", (float)blk_stat.bdev_submit_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "complete_lat", "%.9f", (float)blk_stat.complete_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "internel_lat", "%.9f", (float)blk_stat.internel_tsc / blk_stat.count / ticks_hz); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_blk_clear_iostat_json(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_session *bsmsession = 
ssam_to_blk_session(smsession); ++ memset(&bsmsession->stat, 0, sizeof(struct spdk_bdev_io_stat) - sizeof(uint64_t)); // exclude ticks_rate ++ memset(&bsmsession->blk_stat, 0, sizeof(struct ssam_blk_stat)); ++} ++ ++static struct spdk_bdev *ssam_blk_get_bdev(struct spdk_ssam_session *smsession, uint32_t id) ++{ ++ struct spdk_bdev *bdev = spdk_ssam_get_session_bdev(smsession); ++ ++ return bdev; ++} ++ ++static const struct spdk_ssam_session_backend g_ssam_blk_session_backend = { ++ .type = VIRTIO_TYPE_BLK, ++ .remove_session = ssam_blk_remove_session, ++ .request_worker = ssam_blk_request_worker, ++ .destroy_bdev_device = ssam_blk_destroy_bdev_device, ++ .response_worker = ssam_blk_response_worker, ++ .no_data_req_worker = ssam_blk_no_data_request_worker, ++ .ssam_get_config = ssam_blk_get_config, ++ .print_stuck_io_info = ssam_blk_print_stuck_io_info, ++ .dump_info_json = ssam_blk_dump_info_json, ++ .write_config_json = ssam_blk_write_config_json, ++ .show_iostat_json = ssam_blk_show_iostat_json, ++ .clear_iostat_json = ssam_blk_clear_iostat_json, ++ .get_bdev = ssam_blk_get_bdev, ++ .remove_self = NULL, ++}; ++ ++// Clean Smsession ++static int ++ssam_destroy_poller_cb(void *arg) ++{ ++ struct spdk_ssam_blk_session *bsmsession = (struct spdk_ssam_blk_session *)arg; ++ struct spdk_ssam_session *smsession = &bsmsession->smsession; ++ struct spdk_ssam_dev *smdev = smsession->smdev; ++ ++ SPDK_NOTICELOG("%s: remaining %u tasks\n", smsession->name, smsession->task_cnt); ++ ++ // stop poller ++ spdk_poller_unregister(&bsmsession->stop_bdev_poller); ++ ++ // remove session ++ ssam_sessions_remove(smdev->smsessions, smsession); ++ smdev->active_session_num--; ++ smsession->smdev = NULL; ++ ++ // put ioChannle ++ if (bsmsession->io_channel != NULL) { ++ spdk_put_io_channel(bsmsession->io_channel); ++ bsmsession->io_channel = NULL; ++ } ++ ++ // close bdev device, last step, async ++ spdk_ssam_send_dev_destroy_msg(smsession, NULL); ++ ++ // free smsession not here, but after close bdev device; ++ // see ssam_blk_destroy_bdev_device() ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++ssam_session_bdev_remove_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ int rc = 0; ++ ++ // smsession already removed ++ if (!smsession->started) { ++ return 0; ++ } else { ++ smsession->started = false; ++ } ++ ++ bsmsession->stop_bdev_poller = SPDK_POLLER_REGISTER(ssam_destroy_poller_cb, ++ bsmsession, 0); ++ ++ rc = ssam_virtio_blk_resize(smsession->gfunc_id, 0); ++ if (rc != 0) { ++ SPDK_WARNLOG("%s: virtio blk resize failed when remove session.\n", smsession->name); ++ } ++ ++ ssam_set_session_be_freed(ctx); ++ ssam_send_event_async_done(ctx); ++ ++ return 0; ++} ++ ++static void ++ssam_bdev_remove_cb(void *remove_ctx) ++{ ++ struct spdk_ssam_session *smsession = remove_ctx; ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = false, ++ .need_rsp = true, ++ }; ++ ++ SPDK_WARNLOG("%s: hot-removing bdev - all further requests will be stucked.\n", ++ smsession->name); ++ ++ ssam_send_event_to_session(smsession, ssam_session_bdev_remove_cb, ++ NULL, send_event_flag, NULL); ++} ++ ++static void ++ssam_session_bdev_resize_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ int rc; ++ ++ rc = ssam_virtio_blk_resize(smsession->gfunc_id, bsmsession->bdev->blockcnt); ++ if (rc != 0) { ++ SPDK_WARNLOG("%s: virtio blk resize 
failed.\n", smsession->name); ++ } ++} ++ ++static void ++ssam_blk_resize_cb(void *resize_ctx) ++{ ++ struct spdk_ssam_session *smsession = resize_ctx; ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = false, ++ .need_rsp = true, ++ }; ++ ++ ssam_send_event_to_session(smsession, NULL, ssam_session_bdev_resize_cb, send_event_flag, NULL); ++} ++ ++static void ++ssam_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ SPDK_DEBUGLOG(ssam_blk, "Bdev event: type %d, name %s\n", ++ type, bdev->name); ++ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_REMOVE)\n", ++ bdev->name); ++ ssam_bdev_remove_cb(event_ctx); ++ break; ++ case SPDK_BDEV_EVENT_RESIZE: ++ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", ++ bdev->name); ++ ssam_blk_resize_cb(event_ctx); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++static void ++ssam_free_task_pool(struct spdk_ssam_blk_session *bsmsession) ++{ ++ struct spdk_ssam_session *smsession = &bsmsession->smsession; ++ struct spdk_ssam_virtqueue *vq = NULL; ++ uint16_t max_queues = smsession->max_queues; ++ uint16_t i; ++ ++ if (max_queues > SPDK_SSAM_MAX_VQUEUES) { ++ return; ++ } ++ ++ for (i = 0; i < max_queues; i++) { ++ vq = &smsession->virtqueue[i]; ++ if (vq->tasks != NULL) { ++ spdk_free(vq->tasks); ++ vq->tasks = NULL; ++ } ++ ++ if (vq->index != NULL) { ++ spdk_free(vq->index); ++ vq->index = NULL; ++ } ++ } ++} ++ ++static int ++ssam_alloc_task_pool(struct spdk_ssam_blk_session *bsmsession) ++{ ++ struct spdk_ssam_session *smsession = &bsmsession->smsession; ++ struct spdk_ssam_virtqueue *vq = NULL; ++ struct spdk_ssam_blk_task *task = NULL; ++ uint16_t max_queues = smsession->max_queues; ++ uint32_t task_cnt = smsession->queue_size; ++ uint16_t i; ++ uint32_t j; ++ ++ if ((max_queues > SPDK_SSAM_MAX_VQUEUES) || (max_queues == 0)) { ++ SPDK_ERRLOG("%s: max_queues %u invalid\n", smsession->name, max_queues); ++ return -EINVAL; ++ } ++ ++ if ((task_cnt == 0) || (task_cnt > SPDK_SSAM_MAX_VQ_SIZE)) { ++ SPDK_ERRLOG("%s: virtuque size %u invalid\n", smsession->name, task_cnt); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < max_queues; i++) { ++ vq = &smsession->virtqueue[i]; ++ vq->smsession = smsession; ++ vq->num = task_cnt; ++ vq->use_num = 0; ++ vq->index_l = 0; ++ vq->index_r = 0; ++ vq->tasks = spdk_zmalloc(sizeof(struct spdk_ssam_blk_task) * task_cnt, ++ SPDK_CACHE_LINE_SIZE, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ vq->index = spdk_zmalloc(sizeof(uint32_t) * task_cnt, ++ SPDK_CACHE_LINE_SIZE, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (vq->tasks == NULL || vq->index == NULL) { ++ SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", ++ smsession->name, task_cnt, i); ++ ssam_free_task_pool(bsmsession); ++ return -ENOMEM; ++ } ++ for (j = 0; j < task_cnt; j++) { ++ task = &((struct spdk_ssam_blk_task *)vq->tasks)[j]; ++ task->bsmsession = bsmsession; ++ task->task_idx = j; ++ vq->index[j] = j; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_blk_print_stuck_io_info(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_task *tasks; ++ struct spdk_ssam_blk_task *task; ++ int i, j; ++ ++ for (i = 0; i < smsession->max_queues; i++) { ++ for (j = 0; j < smsession->queue_size; j++) { ++ tasks = (struct spdk_ssam_blk_task *)smsession->virtqueue[i].tasks; ++ task = &tasks[j]; ++ if 
(task == NULL) { ++ continue; ++ } ++ if (task->used) { ++ SPDK_INFOLOG(ssam_blk, "%s: stuck io payload_size %u, vq_idx %u, req_idx %u\n", ++ smsession->name, task->payload_size, task->vq_idx, task->req_idx); ++ } ++ } ++ } ++} ++ ++static uint16_t ++get_req_idx(struct spdk_ssam_blk_task *task) ++{ ++ return task->io_req->req.cmd.virtio.req_idx; ++} ++ ++static void ++ssam_blk_task_init(struct spdk_ssam_blk_task *task) ++{ ++ task->used = true; ++ task->iovcnt = 0; ++ task->io_req = NULL; ++ task->payload_size = 0; ++ memset(&task->task_stat, 0, sizeof(task->task_stat)); ++ ssam_task_stat_tick(&task->task_stat.start_tsc); ++} ++ ++static void ++ssam_blk_task_finish(struct spdk_ssam_blk_task *task) ++{ ++ struct spdk_ssam_session *smsession = &task->bsmsession->smsession; ++ struct spdk_ssam_virtqueue *vq = &smsession->virtqueue[task->vq_idx]; ++ ++ if (smsession->task_cnt == 0) { ++ SPDK_ERRLOG("smsession %s: task internel error\n", smsession->name); ++ return; ++ } ++ ++ task->io_req = NULL; ++ task->payload_size = 0; ++ ++ if (task->iovs.virt.sges[0].iov_base != NULL) { ++ ssam_mempool_free(smsession->mp, task->iovs.virt.sges[0].iov_base); ++ task->iovs.virt.sges[0].iov_base = NULL; ++ } ++ ++ memset(&task->iovs, 0, sizeof(task->iovs)); ++ ++ task->iovcnt = 0; ++ smsession->task_cnt--; ++ task->used = false; ++ vq->index[vq->index_l] = task->task_idx; ++ vq->index_l = (vq->index_l + 1) & 0xFF; ++ vq->use_num--; ++} ++ ++static int ++ssam_blk_io_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, uint8_t status) ++{ ++ struct ssam_io_response io_resp; ++ struct ssam_virtio_res *virtio_res = (struct ssam_virtio_res*)&io_resp.data; ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ struct iovec io_vec; ++ uint8_t res_status = status; ++ int rc; ++ ++ if (status != VIRTIO_BLK_S_OK) { ++ SPDK_ERRLOG("ssam io complete return error tid=%u gfunc_id:%u.\n", smdev->tid, io_req->gfunc_id); ++ } ++ ++ memset(&io_resp, 0, sizeof(io_resp)); ++ io_resp.gfunc_id = io_req->gfunc_id; ++ io_resp.iocb_id = io_req->iocb_id; ++ io_resp.status = io_req->status; ++ io_resp.req = io_req; ++ io_resp.flr_seq = io_req->flr_seq; ++ ++ virtio_res->iovs = &io_vec; ++ virtio_res->iovs->iov_base = io_cmd->iovs[io_cmd->iovcnt - 1].iov_base; ++ virtio_res->iovs->iov_len = io_cmd->iovs[io_cmd->iovcnt - 1].iov_len; ++ virtio_res->iovcnt = 1; ++ virtio_res->rsp = &res_status; ++ virtio_res->rsp_len = sizeof(res_status); ++ ++ rc = ssam_io_complete(smdev->tid, &io_resp); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ ssam_dev_io_dec(smdev); ++ return 0; ++} ++ ++struct ssam_task_complete_arg { ++ struct spdk_ssam_blk_task *task; ++ uint8_t status; ++}; ++ ++static void ++ssam_task_complete_cb(void *arg) ++{ ++ struct ssam_task_complete_arg *cb_arg = (struct ssam_task_complete_arg *)arg; ++ struct spdk_ssam_session *smsession = &cb_arg->task->bsmsession->smsession; ++ struct spdk_ssam_blk_task *task = cb_arg->task; ++ int rc = ssam_blk_io_complete(smsession->smdev, task->io_req, cb_arg->status); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_task_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ssam_task_stat_tick(&task->task_stat.complete_end_tsc); ++ ssam_blk_stat_statistics(task, cb_arg->status); ++ 
ssam_blk_task_finish(task); ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_task_complete(struct spdk_ssam_blk_task *task, uint8_t status) ++{ ++ struct spdk_ssam_session *smsession = &task->bsmsession->smsession; ++ if (status != VIRTIO_BLK_S_OK) { ++ SPDK_ERRLOG("ssam task return error tid=%u gfunc_id:%u.\n", ++ smsession->smdev->tid, task->io_req->gfunc_id); ++ } ++ SPDK_INFOLOG(ssam_blk_data, "handled io tid=%u gfunc_id=%u rw=%u vqid=%u reqid=%u status=%u.\n", ++ smsession->smdev->tid, smsession->gfunc_id, task->io_req->req.cmd.writable, ++ task->io_req->req.cmd.virtio.vq_idx, task->io_req->req.cmd.virtio.req_idx, status); ++ ssam_task_stat_tick(&task->task_stat.complete_start_tsc); ++ int rc = ssam_blk_io_complete(smsession->smdev, task->io_req, status); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_task_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_task_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->status = status; ++ cb_arg->task = task; ++ io_wait_r->cb_fn = ssam_task_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ssam_task_stat_tick(&task->task_stat.complete_end_tsc); ++ ssam_blk_stat_statistics(task, status); ++ ssam_blk_task_finish(task); ++} ++ ++struct ssam_blk_dma_data_request_arg { ++ struct spdk_ssam_dev *smdev; ++ struct spdk_ssam_blk_task *task; ++ struct ssam_dma_request dma_req; ++}; ++ ++static void ++ssam_blk_dma_data_request_cb(void *arg) ++{ ++ struct ssam_blk_dma_data_request_arg *cb_arg = (struct ssam_blk_dma_data_request_arg *)arg; ++ int ret = ssam_dma_data_request(cb_arg->smdev->tid, &cb_arg->dma_req); ++ if (ret == -ENOMEM || ret == -EIO) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_blk_dma_data_request_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smdev, io_wait_r); ++ return; ++ } ++ if (ret < 0) { ++ SPDK_ERRLOG("%s: ssam dma data request failed:%s\n", ++ cb_arg->task->bsmsession->smsession.name, spdk_strerror(-ret)); ++ ssam_task_complete(cb_arg->task, VIRTIO_BLK_S_IOERR); ++ } ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_res_dma_process(struct spdk_ssam_session *smsession, ++ struct spdk_ssam_blk_task *task, uint32_t type, uint8_t status) ++{ ++ struct ssam_dma_request dma_req = {0}; ++ uint16_t tid = smsession->smdev->tid; ++ int ret; ++ ++ ssam_data_request_para(&dma_req, task, type, status); ++ ssam_task_stat_tick(&task->task_stat.dma_start_tsc); ++ task->bsmsession->blk_stat.dma_count++; ++ ret = ssam_dma_data_request(tid, &dma_req); ++ if (ret == -ENOMEM || ret == -EIO) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_blk_dma_data_request_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_blk_dma_data_request_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smsession->smdev; ++ cb_arg->dma_req = dma_req; ++ cb_arg->task = task; ++ io_wait_r->cb_fn = ssam_blk_dma_data_request_cb; ++ io_wait_r->cb_arg = cb_arg; ++ 
ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ++ if (ret < 0) { ++ SPDK_ERRLOG("%s: ssam dma data request failed:%s\n", smsession->name, spdk_strerror(-ret)); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ } ++} ++ ++static void ++ssam_blk_request_finish(bool success, struct spdk_ssam_blk_task *task) ++{ ++ uint8_t res_status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; ++ const struct virtio_blk_outhdr *req = NULL; ++ struct spdk_ssam_session *smsession = &task->bsmsession->smsession; ++ if (res_status != VIRTIO_BLK_S_OK) { ++ SPDK_ERRLOG("request finish return error gfunc_id=%u.\n", smsession->gfunc_id); ++ } ++ ++ req = (struct virtio_blk_outhdr *)task->io_req->req.cmd.header; ++ switch (req->type) { ++ case VIRTIO_BLK_T_IN: ++ case VIRTIO_BLK_T_GET_ID: ++ ssam_res_dma_process(smsession, task, SSAM_REQUEST_DATA_STORE, res_status); ++ break; ++ ++ case VIRTIO_BLK_T_OUT: ++ case VIRTIO_BLK_T_DISCARD: ++ case VIRTIO_BLK_T_WRITE_ZEROES: ++ case VIRTIO_BLK_T_FLUSH: ++ ssam_task_complete(task, res_status); ++ break; ++ ++ default: ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ SPDK_ERRLOG("Not supported request type '%"PRIu32"'.\n", req->type); ++ break; ++ } ++} ++ ++static void ++ssam_blk_req_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_ssam_blk_task *task = cb_arg; ++ ++ if (spdk_unlikely(spdk_get_shutdown_sig_received())) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * the task and bdev_io memory may have been released. ++ * Therefore, task and bdev_io are not released in this scenario. ++ */ ++ return; ++ } ++ ++ /* Second part start of read and write */ ++ SPDK_INFOLOG(ssam_blk_data, "backend io finish tid=%u gfunc_id=%u rw=%u vqid=%u reqid=%u success=%d.\n", ++ task->bsmsession->smsession.smdev->tid, task->bsmsession->smsession.gfunc_id, ++ task->io_req->req.cmd.writable, task->io_req->req.cmd.virtio.vq_idx, task->io_req->req.cmd.virtio.req_idx, ++ success); ++ task->bsmsession->bdev_count--; ++ task->bsmsession->blk_stat.bdev_complete_count++; ++ ssam_task_stat_tick(&task->task_stat.bdev_end_tsc); ++ ++ spdk_bdev_free_io(bdev_io); ++ ssam_blk_request_finish(success, task); ++} ++ ++static int ++ssam_request_rc_process(int rc, struct spdk_ssam_blk_task *task) ++{ ++ if (rc == 0) { ++ return rc; ++ } ++ ++ if (rc == -ENOMEM) { ++ SPDK_WARNLOG("No memory, start to queue io.\n"); ++ ssam_request_queue_io(task); ++ } else { ++ SPDK_ERRLOG("IO error, gfunc_id=%u.\n", task->bsmsession->smsession.gfunc_id); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ return rc; ++} ++ ++static bool ++ssam_is_req_sector_err(uint64_t sector) ++{ ++ if (sector > (UINT64_MAX / SECTOR_SIZE)) { ++ SPDK_ERRLOG("req sector out of range, need less or equal than %lu, actually %lu\n", ++ (UINT64_MAX / SECTOR_SIZE), sector); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int ++ssam_virtio_read_write_process(struct spdk_ssam_blk_task *task, ++ const struct virtio_blk_outhdr *req) ++{ ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ struct ssam_io_message *io_cmd = NULL; ++ uint32_t payload_size = task->payload_size; ++ int rc; ++ ++ io_cmd = &task->io_req->req.cmd; ++ ++ if (ssam_is_req_sector_err(req->sector)) { ++ SPDK_ERRLOG("rw check sector error, gfunc_id=%u.\n", bsmsession->smsession.gfunc_id); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (spdk_unlikely(payload_size == 0 || (payload_size & (SECTOR_SIZE - 
1)) != 0)) { ++ SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512 Bytes (req_idx = %"PRIu16"), " ++ "payload_size = %u, iovcnt = %u.\n", req->type ? "WRITE" : "READ", ++ get_req_idx(task), payload_size, io_cmd->iovcnt); ++ ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ ++ if (req->type == VIRTIO_BLK_T_IN) { ++ bsmsession->bdev_count++; ++ ssam_task_stat_tick(&task->task_stat.bdev_start_tsc); ++ rc = spdk_bdev_readv(bsmsession->bdev_desc, bsmsession->io_channel, ++ task->iovs.virt.sges, task->iovcnt, req->sector * SECTOR_SIZE, ++ payload_size, ssam_blk_req_complete_cb, task); ++ ssam_task_stat_tick(&task->task_stat.bdev_func_tsc); ++ } else if (!bsmsession->readonly) { ++ bsmsession->bdev_count++; ++ ssam_task_stat_tick(&task->task_stat.bdev_start_tsc); ++ rc = spdk_bdev_writev(bsmsession->bdev_desc, bsmsession->io_channel, ++ task->iovs.virt.sges, task->iovcnt, req->sector * SECTOR_SIZE, ++ payload_size, ssam_blk_req_complete_cb, task); ++ ssam_task_stat_tick(&task->task_stat.bdev_func_tsc); ++ } else { ++ SPDK_DEBUGLOG(ssam_blk, "Device is in read-only mode!\n"); ++ rc = -1; ++ } ++ ++ return ssam_request_rc_process(rc, task); ++} ++ ++static int ++ssam_virtio_discard_process(struct spdk_ssam_blk_task *task) ++{ ++ int rc; ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ struct virtio_blk_discard_write_zeroes *desc = task->iovs.virt.sges[0].iov_base; ++ ++ if (ssam_is_req_sector_err(desc->sector)) { ++ SPDK_ERRLOG("discard check sector error, gfunc_id=%u.\n", bsmsession->smsession.gfunc_id); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (task->payload_size != sizeof(*desc)) { ++ SPDK_ERRLOG("Invalid discard payload size: %u\n", task->payload_size); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { ++ SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n"); ++ ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ bsmsession->bdev_count++; ++ rc = spdk_bdev_unmap(bsmsession->bdev_desc, bsmsession->io_channel, ++ desc->sector * SECTOR_SIZE, desc->num_sectors * SECTOR_SIZE, ++ ssam_blk_req_complete_cb, task); ++ ++ return ssam_request_rc_process(rc, task); ++} ++ ++static int ++ssam_virtio_write_zeroes_process(struct spdk_ssam_blk_task *task) ++{ ++ int rc; ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ struct virtio_blk_discard_write_zeroes *desc = task->iovs.virt.sges[0].iov_base; ++ ++ if (ssam_is_req_sector_err(desc->sector)) { ++ SPDK_ERRLOG("write zeros check sector error, gfunc_id=%u.\n", bsmsession->smsession.gfunc_id); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (task->payload_size != sizeof(*desc)) { ++ SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", task->payload_size); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { ++ SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n", ++ (uint64_t)desc->sector * SECTOR_SIZE, (uint64_t)desc->num_sectors * SECTOR_SIZE); ++ } ++ bsmsession->bdev_count++; ++ rc = spdk_bdev_write_zeroes(bsmsession->bdev_desc, bsmsession->io_channel, ++ desc->sector * SECTOR_SIZE, desc->num_sectors * SECTOR_SIZE, ssam_blk_req_complete_cb, task); ++ ++ return ssam_request_rc_process(rc, task); ++} ++ ++static int ++ssam_virtio_flush_process(struct spdk_ssam_blk_task *task, ++ const struct virtio_blk_outhdr *req) 
++{ ++ int rc; ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ uint64_t blockcnt = spdk_bdev_get_num_blocks(bsmsession->bdev); ++ uint32_t blocklen = spdk_bdev_get_block_size(bsmsession->bdev); ++ uint64_t flush_bytes; ++ ++ if (blocklen == 0) { ++ SPDK_ERRLOG("bdev's blocklen %u error.\n", blocklen); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ if (req->sector != 0) { ++ SPDK_ERRLOG("sector must be zero for flush command\n"); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ if (blockcnt > (UINT64_MAX / blocklen)) { ++ SPDK_ERRLOG("bdev's blockcnt %lu or blocklen %u out of range.\n", ++ blockcnt, blocklen); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ flush_bytes = blockcnt * blocklen; ++ bsmsession->bdev_count++; ++ rc = spdk_bdev_flush(bsmsession->bdev_desc, bsmsession->io_channel, ++ 0, flush_bytes, ssam_blk_req_complete_cb, task); ++ ++ return ssam_request_rc_process(rc, task); ++} ++ ++static int ++ssam_virtio_get_id_process(struct spdk_ssam_blk_task *task) ++{ ++ uint32_t used_length; ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ ++ if (task->iovcnt == 0 || task->payload_size == 0) { ++ SPDK_ERRLOG("check task param error, gfunc_id=%u.\n", bsmsession->smsession.gfunc_id); ++ ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ ++ used_length = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs.virt.sges[0].iov_len); ++ if (bsmsession->serial == NULL) { ++ spdk_strcpy_pad(task->iovs.virt.sges[0].iov_base, spdk_bdev_get_product_name(bsmsession->bdev), ++ used_length, ' '); ++ } else { ++ spdk_strcpy_pad(task->iovs.virt.sges[0].iov_base, bsmsession->serial, ++ used_length, ' '); ++ } ++ bsmsession->blk_stat.bdev_complete_count++; ++ ssam_blk_request_finish(true, task); ++ ++ return 0; ++} ++ ++static int ++ssam_io_process(struct spdk_ssam_blk_task *task, const struct virtio_blk_outhdr *req) ++{ ++ int rc; ++ SPDK_INFOLOG(ssam_blk_data, "backend io start tid=%u gfunc_id=%u reqtype=%d rw=%u vqid=%u reqid=%u offset=%llu length=%u.\n", ++ task->bsmsession->smsession.smdev->tid, task->bsmsession->smsession.gfunc_id, req->type, ++ task->io_req->req.cmd.writable, task->io_req->req.cmd.virtio.vq_idx, task->io_req->req.cmd.virtio.req_idx, ++ req->sector * SECTOR_SIZE, task->payload_size); ++ task->bsmsession->blk_stat.bdev_count++; ++ switch (req->type) { ++ case VIRTIO_BLK_T_IN: ++ case VIRTIO_BLK_T_OUT: ++ rc = ssam_virtio_read_write_process(task, req); ++ break; ++ case VIRTIO_BLK_T_DISCARD: ++ rc = ssam_virtio_discard_process(task); ++ break; ++ case VIRTIO_BLK_T_WRITE_ZEROES: ++ rc = ssam_virtio_write_zeroes_process(task); ++ break; ++ case VIRTIO_BLK_T_FLUSH: ++ rc = ssam_virtio_flush_process(task, req); ++ break; ++ case VIRTIO_BLK_T_GET_ID: ++ rc = ssam_virtio_get_id_process(task); ++ break; ++ default: ++ SPDK_ERRLOG("Not supported request type '%"PRIu32"'.\n", req->type); ++ ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ ++ return rc; ++} ++ ++static int ++ssam_process_blk_request(struct spdk_ssam_blk_task *task) ++{ ++ int ret; ++ struct iovec *iov = NULL; ++ const struct virtio_blk_outhdr *req = NULL; ++ struct ssam_io_message *io_cmd = NULL; ++ ++ io_cmd = &task->io_req->req.cmd; ++ /* get req header */ ++ if (spdk_unlikely(io_cmd->iovs[0].iov_len != sizeof(*req))) { ++ SPDK_ERRLOG("First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", ++ io_cmd->iovs[0].iov_len, sizeof(*req), get_req_idx(task)); ++ 
ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ ++ req = (struct virtio_blk_outhdr *)io_cmd->header; ++ /* get req tail */ ++ iov = &io_cmd->iovs[io_cmd->iovcnt - 1]; ++ if (spdk_unlikely(iov->iov_len != 1)) { ++ SPDK_ERRLOG("Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", ++ iov->iov_len, 1, get_req_idx(task)); ++ ssam_task_complete(task, VIRTIO_BLK_S_UNSUPP); ++ return -1; ++ } ++ ++ ret = ssam_io_process(task, req); ++ if (ret < 0) { ++ SPDK_ERRLOG("ssam io process failed(%d)\n", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_get_payload_size(struct ssam_request *io_req, uint32_t *payload_size) ++{ ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ uint32_t payload = 0; ++ uint32_t i; ++ ++ for (i = 1; i < io_cmd->iovcnt - 1; i++) { ++ if (spdk_unlikely((UINT32_MAX - io_cmd->iovs[i].iov_len) < payload)) { ++ SPDK_ERRLOG("payload size overflow\n"); ++ return -1; ++ } ++ payload += io_cmd->iovs[i].iov_len; ++ } ++ ++ if (spdk_unlikely(payload > PAYLOAD_SIZE_MAX)) { ++ SPDK_ERRLOG("payload size larger than %u, payload_size = %u\n", ++ PAYLOAD_SIZE_MAX, payload); ++ return -1; ++ } ++ ++ *payload_size = payload; ++ ++ return 0; ++} ++ ++static int ++ssam_task_iovs_memory_get(struct spdk_ssam_blk_task *task) ++{ ++ struct ssam_mempool *mp = task->bsmsession->smsession.mp; ++ void *buffer = NULL; ++ uint64_t phys_addr = 0; ++ ++ if (task->payload_size == 0) { ++ /* request type of VIRTIO_BLK_T_FLUSH does not have payload */ ++ task->iovs.virt.sges[0].iov_base = NULL; ++ return 0; ++ } ++ ++ task->iovs.virt.sges[0].iov_base = NULL; ++ task->iovs.phys.sges[0].iov_base = NULL; ++ task->iovs.virt.sges[0].iov_len = task->payload_size; ++ task->iovs.phys.sges[0].iov_len = task->payload_size; ++ task->iovcnt = 1; ++ ++ buffer = ssam_mempool_alloc(mp, task->payload_size, &phys_addr); ++ if (spdk_unlikely(buffer == NULL)) { ++ return -ENOMEM; ++ } ++ ++ /* ssam request max IO size is PAYLOAD_SIZE_MAX, only use one iov to save data */ ++ task->iovs.virt.sges[0].iov_base = buffer; ++ task->iovs.phys.sges[0].iov_base = (void *)phys_addr; ++ ++ return 0; ++} ++ ++static void ++ssam_data_request_para(struct ssam_dma_request *dma_req, struct spdk_ssam_blk_task *task, ++ uint32_t type, uint8_t status) ++{ ++ struct ssam_io_message *io_cmd = NULL; ++ struct spdk_ssam_dma_cb dma_cb = { ++ .status = status, ++ .req_dir = type, ++ .gfunc_id = task->io_req->gfunc_id, ++ .vq_idx = task->vq_idx, ++ .task_idx = task->task_idx ++ }; ++ ++ io_cmd = &task->io_req->req.cmd; ++ dma_req->cb = (void *)*(uint64_t *)&dma_cb; ++ dma_req->gfunc_id = task->io_req->gfunc_id; ++ dma_req->flr_seq = task->io_req->flr_seq; ++ dma_req->direction = type; ++ dma_req->data_len = task->payload_size; ++ if (type == SSAM_REQUEST_DATA_STORE) { ++ dma_req->src = task->iovs.phys.sges; ++ dma_req->src_num = task->iovcnt; ++ dma_req->dst = &io_cmd->iovs[1]; ++ /* dma data iovs does not contain header and tail */ ++ dma_req->dst_num = io_cmd->iovcnt - IOV_HEADER_TAIL_NUM; ++ } else if (type == SSAM_REQUEST_DATA_LOAD) { ++ dma_req->src = &io_cmd->iovs[1]; ++ /* dma data iovs does not contain header and tail */ ++ dma_req->src_num = io_cmd->iovcnt - IOV_HEADER_TAIL_NUM; ++ dma_req->dst = task->iovs.phys.sges; ++ dma_req->dst_num = task->iovcnt; ++ } ++} ++ ++static void ++ssam_request_dma_process(struct spdk_ssam_session *smsession, struct spdk_ssam_blk_task *task) ++{ ++ struct virtio_blk_outhdr *req = NULL; ++ int ret; ++ ++ req = (struct virtio_blk_outhdr 
*)task->io_req->req.cmd.header; ++ SPDK_INFOLOG(ssam_blk_data, "request dma request io tid=%u gfunc_id=%u reqtype=%d rw=%u vqid=%u reqid=%u.\n", ++ smsession->smdev->tid, smsession->gfunc_id, req->type, task->io_req->req.cmd.writable, ++ task->io_req->req.cmd.virtio.vq_idx, task->io_req->req.cmd.virtio.req_idx); ++ ++ switch (req->type) { ++ case VIRTIO_BLK_T_IN: ++ case VIRTIO_BLK_T_GET_ID: ++ case VIRTIO_BLK_T_FLUSH: ++ ret = ssam_process_blk_request(task); ++ if (ret < 0) { ++ SPDK_ERRLOG("====== Task: req_idx %u failed ======\n", task->req_idx); ++ } ++ break; ++ ++ case VIRTIO_BLK_T_OUT: ++ case VIRTIO_BLK_T_DISCARD: ++ case VIRTIO_BLK_T_WRITE_ZEROES: ++ /* dma request: Host -> ipu */ ++ ssam_res_dma_process(smsession, task, SSAM_REQUEST_DATA_LOAD, 0); ++ break; ++ ++ default: ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ SPDK_ERRLOG("Not supported request type '%"PRIu32"'.\n", req->type); ++ } ++} ++ ++struct ssam_blk_io_complete_arg { ++ struct spdk_ssam_dev *smdev; ++ struct ssam_request *io_req; ++}; ++ ++static void ++ssam_blk_io_complete_cb(void *arg) ++{ ++ struct ssam_blk_io_complete_arg *cb_arg = (struct ssam_blk_io_complete_arg *)arg; ++ int rc = ssam_blk_io_complete(cb_arg->smdev, cb_arg->io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_blk_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smdev, io_wait_r); ++ return; ++ } ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_process_blk_task(struct spdk_ssam_session *smsession, struct ssam_request *io_req, ++ uint16_t vq_idx, uint16_t req_idx, uint32_t payload_size) ++{ ++ int rc; ++ struct spdk_ssam_blk_task *task = NULL; ++ struct spdk_ssam_virtqueue *vq = &smsession->virtqueue[vq_idx]; ++ ++ if (spdk_unlikely(vq->use_num >= vq->num)) { ++ SPDK_ERRLOG("Session:%s vq(%hu) task_cnt(%u) limit(%u).\n", smsession->name, vq_idx, vq->use_num, vq->num); ++ rc = ssam_blk_io_complete(smsession->smdev, io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_blk_io_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_blk_io_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smsession->smdev; ++ cb_arg->io_req = io_req; ++ io_wait_r->cb_fn = ssam_blk_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ } ++ return; ++ } ++ ++ uint32_t index = vq->index[vq->index_r]; ++ task = &((struct spdk_ssam_blk_task *)vq->tasks)[index]; ++ if (spdk_unlikely(task->used)) { ++ SPDK_ERRLOG("%s: vq(%u) task with idx %u is already pending.\n", smsession->name, vq_idx, index); ++ rc = ssam_blk_io_complete(smsession->smdev, io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_blk_io_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_blk_io_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smsession->smdev; ++ cb_arg->io_req = io_req; 
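++			/* ssam_blk_io_complete() failed above; queue the completion so the io-wait poller can retry it via ssam_blk_io_complete_cb. */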
++ io_wait_r->cb_fn = ssam_blk_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ } ++ return; ++ } ++ ++ smsession->task_cnt++; ++ vq->index_r = (vq->index_r + 1) & 0xFF; ++ vq->use_num++; ++ ++ ssam_blk_task_init(task); ++ task->io_req = io_req; ++ task->vq_idx = vq_idx; ++ task->req_idx = req_idx; ++ task->payload_size = payload_size; ++ task->session_io_wait.cb_fn = ssam_session_io_resubmit; ++ task->session_io_wait.cb_arg = task; ++ ++ rc = ssam_task_iovs_memory_get(task); ++ if (rc != 0) { ++ ssam_session_insert_io_wait(smsession, &task->session_io_wait); ++ return; ++ } ++ ++ ssam_request_dma_process(smsession, task); ++ return; ++} ++ ++static void ++ssam_process_vq(struct spdk_ssam_session *smsession, struct ssam_request *io_req) ++{ ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ uint16_t vq_idx = io_cmd->virtio.vq_idx; ++ uint16_t req_idx = io_cmd->virtio.req_idx; ++ uint32_t payload_size = 0; ++ int rc; ++ ++ if (vq_idx >= smsession->max_queues) { ++ SPDK_ERRLOG("vq_idx out of range, need less than %u, actually %u\n", ++ smsession->max_queues, vq_idx); ++ goto err; ++ } ++ ++ if (io_req->status != SSAM_IO_STATUS_OK) { ++ SPDK_WARNLOG("%s: ssam request status invalid, but still process, status=%d\n", ++ smsession->name, io_req->status); ++ goto err; ++ } ++ ++ rc = ssam_get_payload_size(io_req, &payload_size); ++ if (rc != 0) { ++ goto err; ++ } ++ ++ ssam_process_blk_task(smsession, io_req, vq_idx, req_idx, payload_size); ++ return; ++ ++err: ++ rc = ssam_blk_io_complete(smsession->smdev, io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_blk_io_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_blk_io_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smsession->smdev; ++ cb_arg->io_req = io_req; ++ io_wait_r->cb_fn = ssam_blk_io_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ } ++ return; ++} ++ ++static void ++ssam_no_bdev_put_io_channel(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ if (smsession->task_cnt == 0 && (bsmsession->io_channel != NULL)) { ++ spdk_put_io_channel(bsmsession->io_channel); ++ bsmsession->io_channel = NULL; ++ } ++} ++ ++struct ssam_no_bdev_process_vq_arg { ++ struct spdk_ssam_session *smsession; ++ struct ssam_request *io_req; ++}; ++ ++static void ++ssam_no_bdev_process_vq_cb(void *arg) ++{ ++ struct ssam_no_bdev_process_vq_arg *cb_arg = (struct ssam_no_bdev_process_vq_arg *)arg; ++ int rc = ssam_blk_io_complete(cb_arg->smsession->smdev, cb_arg->io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_no_bdev_process_vq_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smsession->smdev, io_wait_r); ++ return; ++ } ++ ssam_no_bdev_put_io_channel(cb_arg->smsession); ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_no_bdev_process_vq(struct spdk_ssam_session *smsession, struct ssam_request *io_req) ++{ ++ SPDK_ERRLOG("gfunc_id %u No 
bdev, aborting request, return EIO\n", io_req->gfunc_id); ++ int rc = ssam_blk_io_complete(smsession->smdev, io_req, VIRTIO_BLK_S_IOERR); ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_no_bdev_process_vq_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_no_bdev_process_vq_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smsession = smsession; ++ cb_arg->io_req = io_req; ++ io_wait_r->cb_fn = ssam_no_bdev_process_vq_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ SPDK_WARNLOG("Aborting request because no this controller\n"); ++ ++ ssam_no_bdev_put_io_channel(smsession); ++} ++ ++static void ++ssam_blk_response_worker(struct spdk_ssam_session *smsession, void *arg) ++{ ++ struct ssam_dma_rsp *dma_rsp = (struct ssam_dma_rsp*)arg; ++ struct spdk_ssam_dma_cb *dma_cb = (struct spdk_ssam_dma_cb *)&dma_rsp->cb; ++ struct spdk_ssam_blk_task *task = NULL; ++ uint16_t vq_idx = dma_cb->vq_idx; ++ uint16_t task_idx = dma_cb->task_idx; ++ uint8_t req_dir = dma_cb->req_dir; ++ ++ if (vq_idx >= smsession->max_queues) { ++ smsession->smdev->discard_io_num++; ++ SPDK_ERRLOG("vq_idx out of range, need less than %u, actually %u\n", ++ smsession->max_queues, vq_idx); ++ return; ++ } ++ ++ task = &((struct spdk_ssam_blk_task *)smsession->virtqueue[vq_idx].tasks)[task_idx]; ++ if (dma_rsp->status != 0) { ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ SPDK_ERRLOG("dma data process failed!\n"); ++ return; ++ } ++ if (dma_rsp->last_flag == 0) { ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ SPDK_ERRLOG("last_flag should not equal 0!\n"); ++ return; ++ } ++ ssam_task_stat_tick(&task->task_stat.dma_end_tsc); ++ task->bsmsession->blk_stat.dma_complete_count++; ++ if (req_dir == SSAM_REQUEST_DATA_LOAD) { ++ /* Write data ready, start a request to backend */ ++ ssam_process_blk_request(task); ++ } else if (req_dir == SSAM_REQUEST_DATA_STORE) { ++ /* Data have been read by user, complete the task */ ++ ssam_task_complete(task, dma_cb->status); ++ } ++} ++ ++static int ++ssam_blk_check_io_req(struct spdk_ssam_dev *smdev, struct ssam_request *io_req) ++{ ++ struct ssam_io_message *io_cmd = NULL; ++ uint16_t vq_idx; ++ uint16_t req_idx; ++ const struct virtio_blk_outhdr *req = NULL; ++ ++ if (io_req == NULL) { ++ SPDK_ERRLOG("%s: received a NULL IO message\n", smdev->name); ++ return -1; ++ } ++ ++ io_cmd = &io_req->req.cmd; ++ vq_idx = io_cmd->virtio.vq_idx; ++ req_idx = io_cmd->virtio.req_idx; ++ req = (struct virtio_blk_outhdr *)io_cmd->header; ++ ++ if (io_cmd->iovs == NULL) { ++ SPDK_ERRLOG("%s: received an empty IO, vq_idx:%u, req_idx:%u\n", ++ smdev->name, vq_idx, req_idx); ++ return -1; ++ } ++ ++ if (io_cmd->iovcnt < IOV_HEADER_TAIL_NUM) { ++ SPDK_ERRLOG("%s: iovcnt %u less than %d but expected not less than %d\n", ++ smdev->name, io_cmd->iovcnt, IOV_HEADER_TAIL_NUM, IOV_HEADER_TAIL_NUM); ++ return -1; ++ } ++ ++ if ((io_cmd->iovcnt == IOV_HEADER_TAIL_NUM) && (req->type != VIRTIO_BLK_T_FLUSH)) { ++ SPDK_ERRLOG("%s: received an IO not contain valid data, iovcnt:%u, vq_idx:%u, " ++ "req_idx:%u, req_type:%u, req_ioprio:%u, req_sector:%llu\n", ++ smdev->name, io_cmd->iovcnt, vq_idx, req_idx, req->type, req->ioprio, req->sector); ++ return -1; ++ } ++ ++ if (io_cmd->iovcnt > (SPDK_SSAM_IOVS_MAX + IOV_HEADER_TAIL_NUM)) { ++ SPDK_ERRLOG("%s: received too 
much IO, iovcnt:%u, vq_idx:%u, req_idx:%u\n", ++ smdev->name, io_cmd->iovcnt, vq_idx, req_idx); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_blk_request_worker(struct spdk_ssam_session *smsession, void *arg) ++{ ++ struct spdk_ssam_dev *smdev = smsession->smdev; ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ struct ssam_request *io_req = (struct ssam_request*)arg; ++ int ret; ++ ++ smdev->io_num++; ++ bsmsession->blk_stat.start_count++; ++ ++ ret = ssam_blk_check_io_req(smdev, io_req); ++ if (ret < 0) { ++ smdev->discard_io_num++; ++ return; ++ } ++ ++ if (bsmsession->no_bdev) { ++ ssam_no_bdev_process_vq(smsession, io_req); ++ } else { ++ ssam_process_vq(smsession, io_req); ++ } ++} ++ ++static void ++ssam_blk_no_data_request_worker(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_session *bsmsession = NULL; ++ ++ bsmsession = ssam_to_blk_session(smsession); ++ if (bsmsession->no_bdev) { ++ ssam_no_bdev_put_io_channel(smsession); ++ } ++} ++ ++static void ++ssam_blk_destroy_bdev_device(struct spdk_ssam_session *smsession, void *args) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ spdk_bdev_close(bsmsession->bdev_desc); ++ ++ // free taskpool ++ ssam_free_task_pool(bsmsession); ++ ++ // free ++ free(bsmsession); ++} ++ ++static void ++ssam_request_resubmit(void *arg) ++{ ++ struct spdk_ssam_blk_task *task = (struct spdk_ssam_blk_task *)arg; ++ int rc; ++ ++ rc = ssam_process_blk_request(task); ++ if (rc == 0) { ++ SPDK_DEBUGLOG(ssam_blk_data, "====== Task: req_idx = %"PRIu16" resubmitted ======\n", ++ get_req_idx(task)); ++ } else { ++ SPDK_WARNLOG("====== Task: req_idx = %"PRIu16" failed ======\n", get_req_idx(task)); ++ } ++} ++ ++static inline void ++ssam_request_queue_io(struct spdk_ssam_blk_task *task) ++{ ++ int rc; ++ struct spdk_ssam_blk_session *bsmsession = task->bsmsession; ++ ++ task->bdev_io_wait.bdev = bsmsession->bdev; ++ task->bdev_io_wait.cb_fn = ssam_request_resubmit; ++ task->bdev_io_wait.cb_arg = task; ++ ++ rc = spdk_bdev_queue_io_wait(bsmsession->bdev, bsmsession->io_channel, &task->bdev_io_wait); ++ if (rc != 0) { ++ SPDK_ERRLOG("%s: failed to queue I/O, rc=%d\n", bsmsession->smsession.name, rc); ++ ssam_task_complete(task, VIRTIO_BLK_S_IOERR); ++ } ++} ++ ++static void ++ssam_session_io_resubmit(void *arg) ++{ ++ struct spdk_ssam_blk_task *task = (struct spdk_ssam_blk_task *)arg; ++ struct spdk_ssam_session *smsession = &task->bsmsession->smsession; ++ int rc; ++ ++ rc = ssam_task_iovs_memory_get(task); ++ if (rc != 0) { ++ ssam_session_insert_io_wait(smsession, &task->session_io_wait); ++ return; ++ } ++ ssam_request_dma_process(smsession, task); ++} ++ ++static void ++ssam_blk_start_post_cb(struct spdk_ssam_session *smsession, void **arg) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ int rc; ++ ++ rc = ssam_virtio_blk_resize(smsession->gfunc_id, bsmsession->bdev->blockcnt); ++ if (rc != 0) { ++ SPDK_WARNLOG("%s: virtio blk resize failed.\n", smsession->name); ++ } ++ ++ rc = ssam_mount_normal(smsession, 0); ++ if (rc != SSAM_MOUNT_OK) { ++ SPDK_WARNLOG("%s: mount ssam volume failed\n", smsession->name); ++ } ++ ++ // Smdev poller is not created here, but is created in the initialization process. 
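++	// Failures of the resize and mount above are only logged as warnings; the session was already started in ssam_blk_start_cb().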
++ SPDK_NOTICELOG("BLK controller %s created with bdev %s, queues %u\n", ++ smsession->name, spdk_bdev_get_name(bsmsession->bdev), smsession->max_queues); ++} ++ ++static int ++ssam_blk_start_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ if (bsmsession->bdev == NULL) { ++ SPDK_ERRLOG("%s: session not have a bdev.\n", smsession->name); ++ return -ENODEV; ++ } ++ ++ bsmsession->io_channel = spdk_bdev_get_io_channel(bsmsession->bdev_desc); ++ if (bsmsession->io_channel == NULL) { ++ ssam_free_task_pool(bsmsession); ++ SPDK_ERRLOG("%s: I/O channel allocation failed\n", smsession->name); ++ return -ENODEV; ++ } ++ ++ ssam_session_start_done(smsession, 0); ++ ++ ssam_send_event_async_done(ctx); ++ ++ return 0; ++} ++ ++static int ++ssam_blk_start(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = true, ++ .need_rsp = true, ++ }; ++ int rc = ssam_alloc_task_pool(bsmsession); ++ if (rc != 0) { ++ SPDK_ERRLOG("%s: failed to alloc task pool.\n", smsession->name); ++ return rc; ++ } ++ return ssam_send_event_to_session(smsession, ssam_blk_start_cb, ssam_blk_start_post_cb, send_event_flag, NULL); ++} ++ ++static void ++ssam_blk_destroy_session(struct ssam_blk_session_ctx *ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ctx->bsmsession; ++ struct spdk_ssam_session *smsession = &bsmsession->smsession; ++ ++ if (smsession->task_cnt > 0) { ++ return; ++ } ++ ++ /* If in ssam subsystem finish process, session registered flag will ++ * be set to false first, bdev will be removed in ssam_bdev_remove_cb() ++ * call back process, waiting for the call back process finish first. 
++ */ ++ if ((smsession->registered == false) && (bsmsession->bdev != NULL)) { ++ return; ++ } ++ ++ SPDK_NOTICELOG("%s: removing on lcore %d\n", ++ smsession->name, spdk_env_get_current_core()); ++ ++ ssam_session_destroy(smsession); ++ ++ if (bsmsession->io_channel != NULL) { ++ spdk_put_io_channel(bsmsession->io_channel); ++ bsmsession->io_channel = NULL; ++ } ++ ssam_free_task_pool(bsmsession); ++ ++ if (bsmsession->serial != NULL) { ++ free(bsmsession->serial); ++ } ++ spdk_poller_unregister(&bsmsession->stop_poller); ++ ++ ssam_session_stop_done(smsession, 0, ctx->user_ctx); ++ free(ctx); ++ ++ return; ++} ++ ++static int ++ssam_destroy_session_poller_cb(void *arg) ++{ ++ struct ssam_blk_session_ctx *ctx = arg; ++ ++ if (spdk_ssam_trylock() != 0) { ++ return SPDK_POLLER_BUSY; ++ } ++ ++ ssam_blk_destroy_session(ctx); ++ ++ spdk_ssam_unlock(); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++ssam_blk_stop_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ // smsession already removed ++ if (!smsession->started) { ++ return 0; ++ } else { ++ smsession->started = false; ++ } ++ ++ struct ssam_blk_session_ctx *_ctx = ++ (struct ssam_blk_session_ctx *)calloc(1, sizeof(struct ssam_blk_session_ctx)); ++ ++ if (_ctx == NULL) { ++ SPDK_ERRLOG("%s: calloc blk session ctx error.\n", smsession->name); ++ return -ENOMEM; ++ } ++ ++ _ctx->bsmsession = bsmsession; ++ _ctx->user_ctx = ctx; ++ ++ bsmsession->stop_poller = SPDK_POLLER_REGISTER(ssam_destroy_session_poller_cb, ++ _ctx, SESSION_STOP_POLLER_PERIOD); ++ if (bsmsession->stop_poller == NULL) { ++ SPDK_WARNLOG("%s: ssam_destroy_session_poller_cb start failed.\n", smsession->name); ++ ssam_session_stop_done(smsession, -EBUSY, ctx); ++ free(_ctx); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_blk_stop(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = true, ++ .need_rsp = true, ++ }; ++ return ssam_send_event_to_session(smsession, ssam_blk_stop_cb, ssam_blk_stop_cpl_cb, send_event_flag, NULL); ++} ++ ++static int ++ssam_blk_remove_session(struct spdk_ssam_session *smsession) ++{ ++ SPDK_NOTICELOG("session gfunc_id=%u removing\n", smsession->gfunc_id); ++ int ret = ssam_blk_stop(smsession); ++ if ((ret != 0) && (smsession->registered == true)) { ++ (void)ssam_remount_normal(smsession, 0); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++const char * ++spdk_ssam_get_bdev_name_by_gfunc_id(uint16_t gfunc_id) ++{ ++ struct spdk_ssam_session *smsession; ++ struct spdk_ssam_blk_session *bsmsession = NULL; ++ ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ return NULL; ++ } ++ bsmsession = ssam_to_blk_session(smsession); ++ ++ return spdk_bdev_get_name(bsmsession->bdev); ++} ++ ++struct spdk_bdev * ++spdk_ssam_get_session_bdev(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_blk_session *bsmsession = ssam_to_blk_session(smsession); ++ ++ return bsmsession->bdev; ++} ++ ++int ++spdk_ssam_blk_construct(struct spdk_ssam_session_reg_info *info, const char *dev_name, ++ bool readonly, char *serial) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ struct spdk_ssam_blk_session *bsmsession = NULL; ++ struct spdk_bdev *bdev = NULL; ++ uint32_t session_ctx_size = sizeof(struct spdk_ssam_blk_session) - ++ sizeof(struct spdk_ssam_session); ++ uint16_t tid; ++ int ret = 0; ++ int rc; ++ ++ spdk_ssam_lock(); ++ ++ tid = spdk_ssam_get_tid(); ++ if (tid 
== SPDK_INVALID_TID) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ info->tid = tid; ++ info->backend = &g_ssam_blk_session_backend; ++ info->session_ctx_size = session_ctx_size; ++ strncpy(info->type_name, SPDK_SESSION_TYPE_BLK, SPDK_SESSION_TYPE_MAX_LEN); ++ ret = spdk_ssam_session_register(info, &smsession); ++ if (ret != 0) { ++ goto out; ++ } ++ ++ bsmsession = ssam_to_blk_session(smsession); ++ ++ ret = spdk_bdev_open_ext(dev_name, true, ssam_bdev_event_cb, smsession, ++ &bsmsession->bdev_desc); ++ if (ret != 0) { ++ SPDK_ERRLOG("function id %d: could not open bdev, error:%s\n", info->gfunc_id, spdk_strerror(-ret)); ++ goto out; ++ } ++ bdev = spdk_bdev_desc_get_bdev(bsmsession->bdev_desc); ++ bsmsession->bdev = bdev; ++ bsmsession->readonly = readonly; ++ ++ if (serial == NULL) { ++ SPDK_INFOLOG(ssam_blk, "function id %d: not set volume id.\n", info->gfunc_id); ++ } else { ++ bsmsession->serial = calloc(SERIAL_STRING_LEN, sizeof(char)); ++ if (!bsmsession->serial) { ++ SPDK_ERRLOG("no memory for alloc.\n"); ++ goto out; ++ } ++ (void)snprintf(bsmsession->serial, SERIAL_STRING_LEN, "%s", serial); ++ } ++ ++ ret = ssam_blk_start(smsession); ++ if (ret != 0) { ++ SPDK_ERRLOG("%s: start failed\n", smsession->name); ++ goto out; ++ } ++ ++ SPDK_INFOLOG(ssam_blk, "function id %d: using bdev '%s'\n", info->gfunc_id, dev_name); ++out: ++ if ((ret != 0) && (smsession != NULL) && (smsession->smdev != NULL)) { ++ ssam_session_unreg_response_cb(smsession); ++ rc = spdk_ssam_session_unregister(smsession); ++ if (rc != 0) { ++ SPDK_ERRLOG("function id %d: blk construct failed and session remove failed, ret=%d\n", ++ info->gfunc_id, ret); ++ } ++ } ++ spdk_ssam_unlock(); ++ return ret; ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(ssam_blk) ++SPDK_LOG_REGISTER_COMPONENT(ssam_blk_data) +diff --git a/lib/ssam/ssam_config.c b/lib/ssam/ssam_config.c +new file mode 100644 +index 0000000..be301c0 +--- /dev/null ++++ b/lib/ssam/ssam_config.c +@@ -0,0 +1,614 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++ ++#include "spdk/string.h" ++#include "spdk/file.h" ++#include "ssam_internal.h" ++ ++#define SSAM_JSON_DEFAULT_MEMPOOL_SIZE 1024 ++#define SSAM_JSON_MAX_MEMPOOL_SIZE 10240 ++ ++enum ssam_dma_queue_num { ++ SSAM_DMA_QUEUE_NUM_DISABLE = 0, ++ SSAM_DMA_QUEUE_NUM_SMALL_IO = 1, ++ SSAM_DMA_QUEUE_NUM_DEFAULT = 2, ++ SSAM_DMA_QUEUE_NUM_LARGE_IO = 4, ++}; ++ ++struct ssam_user_config { ++ char *cfg_file_name; ++ uint32_t mempool_size; ++ uint32_t queues; ++ uint32_t dma_queue_num; ++ char *mode; ++}; ++ ++struct ssam_config { ++ struct ssam_user_config user_config; ++ struct ssam_hostep_info ep_info; ++ uint32_t core_num; ++ bool shm_created; ++}; ++ ++static struct ssam_config g_ssam_config; ++ ++static const struct spdk_json_object_decoder g_ssam_user_config_decoders[] = { ++ {"mempool_size_mb", offsetof(struct ssam_user_config, mempool_size), spdk_json_decode_uint32}, ++ {"queues", offsetof(struct ssam_user_config, queues), spdk_json_decode_uint32}, ++ {"mode", offsetof(struct ssam_user_config, mode), spdk_json_decode_string}, ++}; ++ ++static int ++ssam_heap_malloc(const char *type, size_t size, int socket_arg, ++ unsigned int flags, size_t align, size_t bound, bool contig, struct ssam_melem *mem) ++{ ++ void *addr = NULL; ++ unsigned long long pg_size; ++ int socket_id; ++ int rc; ++ uint64_t iova; ++ ++ addr = rte_malloc_socket(type, size, align, socket_arg); ++ if (addr == NULL) { ++ return -ENOMEM; ++ } ++ ++ rc = ssam_malloc_elem_from_addr(addr, &pg_size, &socket_id); ++ if (rc != 0) { ++ ssam_free_ex(addr); ++ return -ENOMEM; ++ } ++ ++ iova = rte_malloc_virt2iova(addr); ++ if (iova == RTE_BAD_IOVA) { ++ ssam_free_ex(addr); ++ return -ENOMEM; ++ } ++ ++ mem->addr = addr; ++ mem->iova = iova; ++ mem->page_sz = pg_size; ++ mem->socket_id = socket_id; ++ return 0; ++} ++ ++static int ++ssam_heap_free(void *addr) ++{ ++ return ssam_free_ex(addr); ++} ++ ++static uint8_t ++spdk_ssam_get_dma_queue_num_by_mode(void) ++{ ++ if (g_ssam_config.user_config.mode == NULL) { ++ return SSAM_DMA_QUEUE_NUM_DISABLE; ++ } ++ ++ if (!strcasecmp(g_ssam_config.user_config.mode, "default")) { ++ return SSAM_DMA_QUEUE_NUM_DEFAULT; ++ } else if (!strcasecmp(g_ssam_config.user_config.mode, "small-IO")) { ++ return SSAM_DMA_QUEUE_NUM_SMALL_IO; ++ } else if (!strcasecmp(g_ssam_config.user_config.mode, "large-IO")) { ++ return SSAM_DMA_QUEUE_NUM_LARGE_IO; ++ } ++ return SSAM_DMA_QUEUE_NUM_DISABLE; ++} ++ ++static void ++ssam_get_ssam_lib_init_config(struct ssam_lib_args *cfg) ++{ ++ uint32_t core_num = g_ssam_config.core_num; ++ ++ cfg->role = 1; ++ cfg->dma_queue_num = g_ssam_config.user_config.dma_queue_num; ++ cfg->ssam_heap_malloc = ssam_heap_malloc; ++ cfg->ssam_heap_free = ssam_heap_free; ++ ++ /* The number of tid is 1 greater than the number of cores. 
*/ ++ cfg->core_num = core_num; ++} ++ ++void ssam_set_shm_created(bool shm_created) ++{ ++ g_ssam_config.shm_created = shm_created; ++} ++ ++bool ssam_get_shm_created(void) ++{ ++ return g_ssam_config.shm_created; ++} ++ ++int ++ssam_set_core_num(uint32_t core_num) ++{ ++ if (core_num > SSAM_MAX_CORE_NUM) { ++ SPDK_ERRLOG("Invalid coremask, total cores need less or equal than %d, " ++ "actually %u, please check startup item.\n", ++ SSAM_MAX_CORE_NUM, core_num); ++ return -EINVAL; ++ } ++ if (g_ssam_config.user_config.dma_queue_num == SSAM_DMA_QUEUE_NUM_LARGE_IO ++ && core_num > SSAM_MAX_CORE_NUM_WITH_LARGE_IO) { ++ SPDK_ERRLOG("Invalid coremask, total cores need less or equal than %d, " ++ "actually %u, please check startup item.\n", ++ SSAM_MAX_CORE_NUM_WITH_LARGE_IO, core_num); ++ return -EINVAL; ++ } ++ g_ssam_config.core_num = core_num; ++ return 0; ++} ++ ++uint16_t ++ssam_get_core_num(void) ++{ ++ return (uint16_t)g_ssam_config.core_num; ++} ++ ++uint32_t ++spdk_ssam_get_mempool_size(void) ++{ ++ return g_ssam_config.user_config.mempool_size; ++} ++ ++uint16_t ++spdk_ssam_get_queues(void) ++{ ++ uint16_t cfg_queues = (uint16_t)g_ssam_config.user_config.queues; ++ ++ if (cfg_queues == 0) { ++ SPDK_INFOLOG(ssam_config, "Use default queues number: %u.\n", SPDK_SSAM_DEFAULT_VQUEUES); ++ return SPDK_SSAM_DEFAULT_VQUEUES; ++ } ++ return cfg_queues; ++} ++ ++enum ssam_device_type ++spdk_ssam_get_virtio_type(uint16_t gfunc_id) ++{ ++ uint16_t vf_start, vf_end; ++ struct ssam_pf_list *pf = g_ssam_config.ep_info.host_pf_list; ++ ++ for (uint32_t i = 0; i < SSAM_HOSTEP_NUM_MAX; i++) { ++ if (pf[i].pf_funcid == UINT16_MAX) { ++ continue; ++ } ++ if (gfunc_id == pf[i].pf_funcid) { ++ return pf[i].pf_type; ++ } ++ ++ vf_start = pf[i].vf_funcid_start; ++ if (((uint32_t)vf_start + (uint32_t)pf[i].vf_num) > UINT16_MAX) { ++ SPDK_ERRLOG("vf_start %u + vf_num %u out of range, need less or equal than %u.\n", ++ vf_start, pf[i].vf_num, UINT16_MAX); ++ continue; ++ } ++ vf_end = vf_start + pf[i].vf_num; ++ if ((gfunc_id >= vf_start) && (gfunc_id < vf_end)) { ++ return pf[i].pf_type; ++ } ++ } ++ ++ return SSAM_DEVICE_VIRTIO_MAX; ++} ++ ++static void ++ssam_get_virtio_blk_config(struct ssam_virtio_config *cfg) ++{ ++ struct virtio_blk_config *dev_cfg = (struct virtio_blk_config *)cfg->device_config; ++ ++ cfg->device_feature = SPDK_SSAM_VIRTIO_BLK_DEFAULT_FEATURE; ++ cfg->queue_num = g_ssam_config.user_config.queues; ++ cfg->config_len = sizeof(struct virtio_blk_config); ++ ++ memset(dev_cfg, 0, cfg->config_len); ++ dev_cfg->blk_size = 0x200; ++ dev_cfg->min_io_size = 0; ++ dev_cfg->capacity = 0; ++ dev_cfg->num_queues = cfg->queue_num; ++ dev_cfg->seg_max = 0x7d; ++ dev_cfg->size_max = 0x200000; ++ cfg->queue_size = VIRITO_DEFAULT_QUEUE_SIZE; ++ ++ return; ++} ++ ++static void ++ssam_get_virtio_scsi_config(struct ssam_virtio_config *cfg) ++{ ++ struct virtio_scsi_config *dev_cfg = (struct virtio_scsi_config *)cfg->device_config; ++ ++ cfg->device_feature = SPDK_SSAM_VIRTIO_SCSI_DEFAULT_FEATURE; ++ cfg->queue_num = g_ssam_config.user_config.queues; ++ cfg->config_len = sizeof(struct virtio_scsi_config); ++ ++ memset(dev_cfg, 0, sizeof(struct virtio_scsi_config)); ++ dev_cfg->num_queues = 0x04; ++ dev_cfg->seg_max = 0x6f; ++ dev_cfg->max_sectors = 0x1ff; ++ dev_cfg->cmd_per_lun = 0x80; ++ dev_cfg->event_info_size = 0; ++ dev_cfg->sense_size = 0x60; ++ dev_cfg->cdb_size = 0x20; ++ dev_cfg->max_channel = 0; ++ dev_cfg->max_target = SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; ++ dev_cfg->max_lun = 0xff; ++ cfg->queue_size 
= VIRITO_DEFAULT_QUEUE_SIZE; ++ ++ return; ++} ++ ++static int ++ssam_virtio_config_get(struct ssam_pf_list *pf, struct ssam_function_config *cfg) ++{ ++ int ret = 0; ++ ++ cfg->gfunc_id = pf->pf_funcid; ++ cfg->type = pf->pf_type; ++ switch (cfg->type) { ++ case SSAM_DEVICE_VIRTIO_BLK: ++ ssam_get_virtio_blk_config(&cfg->virtio_config); ++ break; ++ case SSAM_DEVICE_VIRTIO_SCSI: ++ ssam_get_virtio_scsi_config(&cfg->virtio_config); ++ break; ++ default: { ++ SPDK_ERRLOG("function config init fail (%d|%d) \n", cfg->gfunc_id, cfg->type); ++ ret = -EINVAL; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int ++spdk_ssam_setup_pf(struct ssam_pf_list *pf, struct ssam_function_config *cfg) ++{ ++ int rc; ++ ++ rc = ssam_setup_function(pf->pf_funcid, pf->vf_num, pf->pf_type); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam init function(%u) failed:%s\n", pf->pf_funcid, spdk_strerror(-rc)); ++ return rc; ++ } ++ rc = ssam_write_function_config(cfg); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam write function(%d) config failed:%s\n", cfg->gfunc_id, spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static int ++spdk_ssam_setup_vf(struct ssam_pf_list *pf, struct ssam_function_config *cfg) ++{ ++ struct ssam_function_config l_cfg; ++ uint16_t vf_funcid_start = pf->vf_funcid_start; ++ uint16_t vf_num = pf->vf_num; ++ int rc; ++ uint16_t i; ++ ++ if (((uint32_t)vf_funcid_start + (uint32_t)vf_num) > UINT16_MAX) { ++ SPDK_ERRLOG("vf_funcid_start %u or vf_num %u out of range.\n", ++ vf_funcid_start, vf_num); ++ return -1; ++ } ++ ++ memcpy(&l_cfg, cfg, sizeof(struct ssam_function_config)); ++ for (i = vf_funcid_start; i < vf_funcid_start + vf_num; i++) { ++ l_cfg.gfunc_id = i; ++ l_cfg.virtio_config.queue_num = SPDK_SSAM_VF_DEFAULTE_VQUEUES; ++ rc = ssam_write_function_config(&l_cfg); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam write function(%u) config failed:%s\n", i, spdk_strerror(-rc)); ++ return rc; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_virtio_config_init(struct ssam_hostep_info *ep_info) ++{ ++ int rc = 0; ++ uint32_t i; ++ struct ssam_function_config cfg = {0}; ++ struct ssam_pf_list *pf = ep_info->host_pf_list; ++ ++ if (ssam_get_shm_created()) { ++ /* If server is crashed from last time, no need setup config this time */ ++ return 0; ++ } ++ ++ for (i = 0; i < SSAM_HOSTEP_NUM_MAX; i++) { ++ if (pf[i].pf_funcid == UINT16_MAX) { ++ continue; ++ } ++ rc = ssam_virtio_config_get(&pf[i], &cfg); ++ if (rc != 0) { ++ return rc; ++ } ++ rc = spdk_ssam_setup_pf(&pf[i], &cfg); ++ if (rc != 0) { ++ return rc; ++ } ++ rc = spdk_ssam_setup_vf(&pf[i], &cfg); ++ if (rc != 0) { ++ return rc; ++ } ++ } ++ ++ return rc; ++} ++ ++static int ++ssam_virtio_init(void) ++{ ++ struct ssam_lib_args ssam_args; ++ struct ssam_hostep_info *ep_info = &g_ssam_config.ep_info; ++ int rc; ++ ++ ssam_get_ssam_lib_init_config(&ssam_args); ++ ++ rc = ssam_lib_init(&ssam_args, ep_info); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to init ssam:%s\n", spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ rc = ssam_virtio_config_init(ep_info); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam virtio device init failed:%s\n", spdk_strerror(-rc)); ++ if (ssam_lib_exit() != 0) { ++ SPDK_WARNLOG("ssam lib exit failed\n"); ++ } ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_user_config_default(void) ++{ ++ struct ssam_user_config *user_config = &g_ssam_config.user_config; ++ ++ user_config->mempool_size = SSAM_JSON_DEFAULT_MEMPOOL_SIZE; ++ /** ++ * If file param json file is not exist, queue number will be ++ * set default value 
SPDK_SSAM_DEFAULT_VQUEUES when user create controller. ++ */ ++ user_config->queues = SPDK_SSAM_DEFAULT_VQUEUES; ++ user_config->dma_queue_num = SSAM_DMA_QUEUE_NUM_DEFAULT; ++ user_config->mode = NULL; ++ ++ return -ENOENT; ++} ++ ++static int ++ssam_user_config_file_read(const char *config_file, size_t *file_len, ++ void **json, ssize_t *value_size) ++{ ++ FILE *read_json = fopen(config_file, "r"); ++ ssize_t ret; ++ void *end = NULL; ++ ++ if (read_json == NULL) { ++ if (errno != ENOENT) { ++ SPDK_ERRLOG("Read JSON configuration file \"%s\" failed\n", config_file); ++ return -1; ++ } ++ SPDK_WARNLOG("JSON config file:%s does not exist! Use default configuration.\n", ++ config_file); ++ return ssam_user_config_default(); ++ } ++ ++ void *load = spdk_posix_file_load(read_json, file_len); ++ fclose(read_json); ++ if (load == NULL) { ++ return -1; ++ } ++ ++ ret = spdk_json_parse(load, *file_len, NULL, 0, &end, SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); ++ if (ret < 0) { ++ SPDK_ERRLOG("Parsing JSON configuration file \"%s\" failed (%zd)\n", config_file, ret); ++ free(load); ++ load = NULL; ++ if (ret == -ENOENT) { // json file exists, but content is null ++ SPDK_ERRLOG("json file exists, but content is null\n"); ++ ret = -1; ++ } ++ return ret; ++ } ++ *json = load; ++ *value_size = ret; ++ ++ return 0; ++} ++ ++static void ++ssam_user_config_free(struct ssam_user_config *user_config) ++{ ++ if (user_config->mode != NULL) { ++ free(user_config->mode); ++ user_config->mode = NULL; ++ } ++} ++ ++static int ++ssam_user_config_parse(size_t file_len, void *json, ssize_t value_size) ++{ ++ struct spdk_json_val *value; ++ struct ssam_user_config *user_config = &g_ssam_config.user_config; ++ ssize_t ret; ++ void *end = NULL; ++ int rc; ++ ++ value = calloc(value_size, sizeof(struct spdk_json_val)); ++ if (value == NULL) { ++ SPDK_ERRLOG("Out of memory\n"); ++ free(json); ++ return -ENOMEM; ++ } ++ ++ ret = spdk_json_parse(json, file_len, value, value_size, &end, SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); ++ if (ret != value_size) { ++ SPDK_ERRLOG("Parsing JSON configuration file failed\n"); ++ free(json); ++ free(value); ++ return -1; ++ } ++ ++ /* resolve json values to struct spdk_ssam_json_config */ ++ ++ rc = spdk_json_decode_object(value, g_ssam_user_config_decoders, ++ SPDK_COUNTOF(g_ssam_user_config_decoders), user_config); ++ free(json); ++ free(value); ++ if (rc != 0) { ++ SPDK_ERRLOG("decode object failed:%s\n", spdk_strerror(-rc)); ++ ssam_user_config_free(user_config); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_user_config_check(void) ++{ ++ struct ssam_user_config *user_config = &g_ssam_config.user_config; ++ ++ if (user_config->mempool_size < SSAM_JSON_DEFAULT_MEMPOOL_SIZE) { ++ SPDK_ERRLOG("mempool_size_mb value in file %s out of range, need larger or equal than %u MB, actually %u MB.\n", ++ user_config->cfg_file_name, SSAM_JSON_DEFAULT_MEMPOOL_SIZE, user_config->mempool_size); ++ return -EINVAL; ++ } ++ ++ if (user_config->mempool_size > SSAM_JSON_MAX_MEMPOOL_SIZE) { ++ SPDK_ERRLOG("mempool_size_mb value in file %s out of range, need less or equal than %u MB, actually %u MB.\n", ++ user_config->cfg_file_name, SSAM_JSON_MAX_MEMPOOL_SIZE, user_config->mempool_size); ++ return -EINVAL; ++ } ++ ++ if (user_config->queues > SPDK_SSAM_MAX_VQUEUES) { ++ SPDK_ERRLOG("queues value in file %s out of range, need less or equal than %u, actually %u\n", ++ user_config->cfg_file_name, SPDK_SSAM_MAX_VQUEUES, user_config->queues); ++ return -EINVAL; ++ } ++ ++ if (user_config->queues == 0) 
{ ++ SPDK_ERRLOG("queues value in file %s out of range, need not equal to 0\n", ++ user_config->cfg_file_name); ++ return -EINVAL; ++ } ++ ++ user_config->dma_queue_num = spdk_ssam_get_dma_queue_num_by_mode(); ++ if (user_config->dma_queue_num == SSAM_DMA_QUEUE_NUM_DISABLE) { ++ SPDK_ERRLOG("Invalid mode in file %s, which should be chosen from default, small-IO, large-IO, " ++ "actually %s\n", ++ user_config->mode, ssam_rc_get_param_json_file_path()); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++int ++spdk_ssam_user_config_init(void) ++{ ++ size_t file_len = 0; ++ void *json = NULL; ++ ssize_t value_size = 0; ++ int rc; ++ struct ssam_user_config *user_config = &g_ssam_config.user_config; ++ ++ user_config->cfg_file_name = ssam_rc_get_param_json_file_path(); ++ rc = ssam_user_config_file_read(user_config->cfg_file_name, &file_len, &json, &value_size); ++ if (rc != 0) { ++ if (rc == -ENOENT) { ++ return 0; ++ } ++ return rc; ++ } ++ ++ rc = ssam_user_config_parse(file_len, json, value_size); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ rc = ssam_user_config_check(); ++ if (rc != 0) { ++ ssam_user_config_free(&g_ssam_config.user_config); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_virtio_exit(void) ++{ ++ int rc; ++ ++ rc = ssam_lib_exit(); ++ if (rc != 0) { ++ SPDK_WARNLOG("ssam lib exit failed\n"); ++ } ++} ++ ++int ++ssam_config_init(void) ++{ ++ int rc; ++ ++ rc = ssam_virtio_init(); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return 0; ++} ++ ++void ++ssam_config_exit(void) ++{ ++ ssam_virtio_exit(); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(ssam_config) +diff --git a/lib/ssam/ssam_config.h b/lib/ssam/ssam_config.h +new file mode 100644 +index 0000000..c835ef9 +--- /dev/null ++++ b/lib/ssam/ssam_config.h +@@ -0,0 +1,50 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#ifndef SSAM_CONFIG_H ++#define SSAM_CONFIG_H ++ ++int ssam_set_core_num(uint32_t core_num); ++ ++uint16_t ssam_get_core_num(void); ++ ++uint32_t spdk_ssam_get_mempool_size(void); ++ ++uint16_t spdk_ssam_get_queues(void); ++ ++enum ssam_device_type spdk_ssam_get_virtio_type(uint16_t gfunc_id); ++ ++int ssam_config_init(void); ++ ++void ssam_config_exit(void); ++ ++#endif /* SSAM_CONFIG_H */ +diff --git a/lib/ssam/ssam_device_pcie.c b/lib/ssam/ssam_device_pcie.c +new file mode 100644 +index 0000000..dd97f91 +--- /dev/null ++++ b/lib/ssam/ssam_device_pcie.c +@@ -0,0 +1,250 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include "spdk/string.h" ++#include "spdk/file.h" ++#include "ssam_internal.h" ++ ++#define SSAM_KEY_MAX_LEN 16 ++#define SSAM_TYPE_MAX_LEN 12 ++#define SSAM_DBDF_MAX_LEN 16 ++ ++struct ssam_device_pcie_info { ++ uint32_t func_id; ++ char type[SSAM_TYPE_MAX_LEN]; ++ char dbdf[SSAM_DBDF_MAX_LEN]; ++}; ++ ++struct ssam_device_pcie_list { ++ uint32_t size; ++ struct ssam_device_pcie_info *device_pcie_list; ++}; ++ ++static struct ssam_device_pcie_list g_ssam_device_pcie_list = { ++ .size = 0, ++ .device_pcie_list = NULL, ++}; ++ ++void ++ssam_deinit_device_pcie_list(void) ++{ ++ if (g_ssam_device_pcie_list.device_pcie_list != NULL) { ++ free(g_ssam_device_pcie_list.device_pcie_list); ++ g_ssam_device_pcie_list.device_pcie_list = NULL; ++ } ++} ++ ++static int ++ssam_alloc_device_pcie_list(struct spdk_json_val *values, size_t num_values) ++{ ++ size_t i; ++ uint32_t size = 0; ++ ++ for (i = 0; i < num_values; i++) { ++ if (values[i].type == SPDK_JSON_VAL_OBJECT_END) { ++ size++; ++ } ++ } ++ ++ if (g_ssam_device_pcie_list.device_pcie_list == NULL) { ++ g_ssam_device_pcie_list.size = size; ++ g_ssam_device_pcie_list.device_pcie_list = calloc(size, sizeof(struct ssam_device_pcie_info)); ++ if (g_ssam_device_pcie_list.device_pcie_list == NULL) { ++ SPDK_ERRLOG("Unable to allocate enough memory for device_pcie_list\n"); ++ return -ENOMEM; ++ } ++ } ++ return 0; ++} ++ ++static void ++ssam_set_device_pcie_index(struct spdk_json_val *value, uint32_t cur_index) ++{ ++ char val[16]; ++ uint32_t gfunc_id; ++ if (value->type != SPDK_JSON_VAL_NUMBER || value->len > 5) { ++ SPDK_ERRLOG("device pcie gfunc id is invalid, type: %u, len: %u\n", value->type, value->len); ++ return; ++ } ++ ++ memset(val, 0, 16); ++ memcpy(val, value->start, value->len); ++ gfunc_id = spdk_strtol(val, 10); ++ if (gfunc_id >= SPDK_INVALID_GFUNC_ID) { ++ SPDK_ERRLOG("device pcie gfunc id(%u) is more than %u\n", gfunc_id, SPDK_INVALID_GFUNC_ID); ++ return; ++ } ++ g_ssam_device_pcie_list.device_pcie_list[cur_index].func_id = gfunc_id; ++} ++ ++static void ++ssam_set_device_pcie_dbdf(struct spdk_json_val *value, uint32_t cur_index) ++{ ++ if (value->type != SPDK_JSON_VAL_STRING || value->len >= SSAM_DBDF_MAX_LEN) { ++ SPDK_ERRLOG("device pcie dbdf is invalid, type: %u, len: %u\n", value->type, value->len); ++ return; ++ } ++ ++ memset(g_ssam_device_pcie_list.device_pcie_list[cur_index].dbdf, 0, SSAM_DBDF_MAX_LEN); ++ memcpy(g_ssam_device_pcie_list.device_pcie_list[cur_index].dbdf, value->start, value->len); ++} ++ ++static void ++ssam_set_device_pcie_type(struct spdk_json_val *value, uint32_t cur_index) ++{ ++ if (value->type != SPDK_JSON_VAL_STRING || value->len >= SSAM_TYPE_MAX_LEN) { ++ SPDK_ERRLOG("device pcie type is invalid, type: %u, len: %u\n", value->type, value->len); ++ return; ++ } ++ ++ memset(g_ssam_device_pcie_list.device_pcie_list[cur_index].type, 0, SSAM_TYPE_MAX_LEN); ++ memcpy(g_ssam_device_pcie_list.device_pcie_list[cur_index].type, value->start, value->len); ++} ++ ++static void ++ssam_init_device_pcie_list_by_values(struct spdk_json_val *values, size_t num_values) ++{ ++ char key[SSAM_KEY_MAX_LEN]; ++ uint32_t cur_index = 0; ++ size_t i; ++ ++ for (i = 0; i < num_values; i++) { ++ if (values[i].type == SPDK_JSON_VAL_OBJECT_END) { ++ cur_index++; ++ } ++ if (values[i].type != SPDK_JSON_VAL_NAME || values[i].len >= SSAM_KEY_MAX_LEN) { ++ continue; ++ } ++ ++ memset(key, 0, SSAM_KEY_MAX_LEN); ++ memcpy(key, values[i].start, values[i].len); ++ ++ /* point to val */ ++ i++; ++ ++ if (strcmp(key, "index") 
== 0) { ++ ssam_set_device_pcie_index(&values[i], cur_index); ++ } else if (strcmp(key, "dbdf") == 0) { ++ ssam_set_device_pcie_dbdf(&values[i], cur_index); ++ } else if (strcmp(key, "type") == 0) { ++ ssam_set_device_pcie_type(&values[i], cur_index); ++ } ++ } ++} ++ ++int ++ssam_init_device_pcie_list(void) ++{ ++ FILE *fp = NULL; ++ void *buf = NULL; ++ ssize_t rc = 0; ++ size_t size; ++ size_t num_values; ++ struct spdk_json_val *values = NULL; ++ ++ fp = popen("dpak-smi info -t device_pcie_list -f storage", "r"); ++ if (fp == NULL) { ++ SPDK_ERRLOG("execute dpak-smi failed\n"); ++ return -EINVAL; ++ } ++ ++ buf = spdk_posix_file_load(fp, &size); ++ if (buf == NULL) { ++ SPDK_ERRLOG("get size of json failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_parse(buf, size, NULL, 0, NULL, SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS); ++ if (rc < 0) { ++ SPDK_ERRLOG("dpak-smi error: %s\n", (char *)buf); ++ goto invalid; ++ } ++ num_values = (size_t)rc; ++ values = calloc(num_values, sizeof(*values)); ++ if (values == NULL) { ++ SPDK_ERRLOG("Unable to allocate enough memory for values\n"); ++ rc = -ENOMEM; ++ goto invalid; ++ } ++ ++ rc = spdk_json_parse(buf, size, values, num_values, NULL, ++ SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS | SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); ++ if (rc <= 0) { ++ SPDK_ERRLOG("parse json to values failed\n"); ++ goto invalid; ++ } ++ ++ rc = ssam_alloc_device_pcie_list(values, num_values); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ ssam_init_device_pcie_list_by_values(values, num_values); ++ rc = 0; ++ ++invalid: ++ if (values != NULL) { ++ free(values); ++ values = NULL; ++ } ++ if (buf != NULL) { ++ free(buf); ++ buf = NULL; ++ } ++ if (fp != NULL) { ++ pclose(fp); ++ fp = NULL; ++ } ++ return rc; ++} ++ ++void ++ssam_dump_device_pcie_list(struct spdk_json_write_ctx *w) ++{ ++ uint32_t i; ++ spdk_json_write_named_array_begin(w, "device_pcie_list"); ++ for (i = 0; i < g_ssam_device_pcie_list.size; i++) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_uint32(w, "index", g_ssam_device_pcie_list.device_pcie_list[i].func_id); ++ spdk_json_write_named_string(w, "dbdf", g_ssam_device_pcie_list.device_pcie_list[i].dbdf); ++ spdk_json_write_named_string(w, "type", g_ssam_device_pcie_list.device_pcie_list[i].type); ++ spdk_json_write_object_end(w); ++ } ++ spdk_json_write_array_end(w); ++} ++ ++uint32_t ++ssam_get_device_pcie_list_size(void) ++{ ++ return g_ssam_device_pcie_list.size; ++} +\ No newline at end of file +diff --git a/lib/ssam/ssam_internal.h b/lib/ssam/ssam_internal.h +new file mode 100644 +index 0000000..f468b64 +--- /dev/null ++++ b/lib/ssam/ssam_internal.h +@@ -0,0 +1,532 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef SSAM_INTERNAL_H ++#define SSAM_INTERNAL_H ++ ++#include "stdint.h" ++ ++#include ++#include ++ ++#include "spdk_internal/thread.h" ++#include "spdk/log.h" ++#include "spdk/util.h" ++#include "spdk/rpc.h" ++#include "spdk/bdev.h" ++#include "spdk/ssam.h" ++#include "ssam_config.h" ++ ++#define SPDK_SSAM_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ ++ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ ++ (1ULL << VIRTIO_F_VERSION_1) | \ ++ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ ++ (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ ++ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ ++ (1ULL << VIRTIO_F_RING_PACKED)) ++ ++#define VIRITO_DEFAULT_QUEUE_SIZE 256 ++ ++#define SPDK_SSAM_VQ_MAX_SUBMISSIONS 16 ++#define SPDK_SSAM_MAX_VQUEUES 32 ++#define SPDK_SSAM_MAX_VQ_SIZE 256 ++#define SPDK_SSAM_VF_DEFAULTE_VQUEUES 1 ++#define SPDK_SSAM_BLK_MAX_VQ_SIZE 32 ++#define SSAM_JSON_DEFAULT_QUEUES_NUM 16 ++ ++/* ssam not support config vq size so far */ ++#define SPDK_SSAM_DEFAULT_VQ_SIZE SPDK_SSAM_MAX_VQ_SIZE ++#define SPDK_SSAM_DEFAULT_VQUEUES 16 ++#define SPDK_SSAM_IOVS_MAX 32 ++#define SPDK_SSAM_MAX_SEG_SIZE (32 * 1024) ++ ++#define SPDK_INVALID_GFUNC_ID UINT16_MAX ++#define SPDK_INVALID_CORE_ID UINT16_MAX ++#define SPDK_INVALID_VQUEUE_NUM UINT16_MAX ++ ++#define SSAM_PF_MAX_NUM 32 ++#define SPDK_SSAM_SCSI_CTRLR_MAX_DEVS 255 ++#define SSAM_VIRTIO_SCSI_LUN_ID 0x400001 ++#define SPDK_SSAM_SCSI_DEFAULT_VQUEUES 128 ++#define SSAM_MAX_SESSION_PER_DEV UINT16_MAX ++#define SSAM_DEFAULT_MEMPOOL_EXTRA_SIZE 0 ++#define SSAM_MAX_CORE_NUM 16 ++#define SSAM_MAX_CORE_NUM_WITH_LARGE_IO 10 ++ ++#define SPDK_LIMIT_LOG_MAX_INTERNEL_IN_MS 3000 ++#define SPDK_CONVERT_MS_TO_US 1000 ++ ++#define SPDK_SSAM_VIRTIO_BLK_DEFAULT_FEATURE 0x7f11001046 ++#define SPDK_SSAM_VIRTIO_SCSI_DEFAULT_FEATURE 0x7f11000007 ++ ++typedef void (*spdk_ssam_session_io_wait_cb)(void *cb_arg); ++ ++struct spdk_ssam_session_io_wait { ++ spdk_ssam_session_io_wait_cb cb_fn; ++ void *cb_arg; ++ TAILQ_ENTRY(spdk_ssam_session_io_wait) link; ++}; ++ ++typedef void (*spdk_ssam_session_io_wait_r_cb)(void *cb_arg); ++ ++struct spdk_ssam_session_io_wait_r { ++ spdk_ssam_session_io_wait_r_cb cb_fn; ++ void *cb_arg; ++ TAILQ_ENTRY(spdk_ssam_session_io_wait_r) link; ++}; ++ ++struct spdk_ssam_virtqueue { ++ void *tasks; ++ struct spdk_ssam_session *smsession; ++ uint32_t *index; ++ int num; ++ int use_num; ++ int index_l; ++ int index_r; ++}; ++ ++struct spdk_ssam_session_backend { ++ enum virtio_type type; ++ int (*remove_session)(struct spdk_ssam_session *smsession); ++ void (*remove_self)(struct spdk_ssam_session *smsession); ++ void (*request_worker)(struct spdk_ssam_session *smsession, void *arg); ++ void (*destroy_bdev_device)(struct spdk_ssam_session *smsession, 
void *args); ++ void (*response_worker)(struct spdk_ssam_session *smsession, void *arg); ++ void (*no_data_req_worker)(struct spdk_ssam_session *smsession); ++ ++ int (*ssam_get_config)(struct spdk_ssam_session *smsession, ++ uint8_t *config, uint32_t len, uint16_t queues); ++ int (*ssam_set_config)(struct spdk_ssam_session *smsession, ++ uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags); ++ ++ void (*print_stuck_io_info)(struct spdk_ssam_session *smsession); ++ ++ void (*dump_info_json)(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w); ++ void (*write_config_json)(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w); ++ void (*show_iostat_json)(struct spdk_ssam_session *smsession, uint32_t id, ++ struct spdk_json_write_ctx *w); ++ void (*clear_iostat_json)(struct spdk_ssam_session *smsession); ++ struct spdk_bdev *(*get_bdev)(struct spdk_ssam_session *smsession, uint32_t id); ++}; ++ ++struct spdk_ssam_session { ++ /* Unique session name, format as ssam.tid.gfunc_id. */ ++ char *name; ++ ++ struct spdk_ssam_dev *smdev; ++ ++ /* Session poller thread, same as ssam dev poller thread */ ++ struct spdk_thread *thread; ++ struct ssam_mempool *mp; ++ const struct spdk_ssam_session_backend *backend; ++ spdk_ssam_session_rsp_fn rsp_fn; ++ void *rsp_ctx; ++ struct spdk_ssam_virtqueue virtqueue[SPDK_SSAM_MAX_VQUEUES]; ++ ++ /* Number of processing tasks, can not remove session when task_cnt > 0 */ ++ int task_cnt; ++ ++ /* Number of pending asynchronous operations */ ++ uint32_t pending_async_op_num; ++ ++ /* ssam global virtual function id */ ++ uint16_t gfunc_id; ++ ++ /* Depth of virtio-blk virtqueue */ ++ uint16_t queue_size; ++ ++ /* Number of virtio-blk virtqueue */ ++ uint16_t max_queues; ++ bool started; ++ bool initialized; ++ ++ /* spdk_ssam_session_fn process finish flag */ ++ bool async_done; ++ ++ bool registered; ++ ++ TAILQ_ENTRY(spdk_ssam_session) tailq; ++}; ++ ++struct ssam_iovs { ++ struct iovec sges[SPDK_SSAM_IOVS_MAX]; ++}; ++ ++struct ssam_iovec { ++ struct ssam_iovs virt; /* virt's iov_base is virtual address */ ++ struct ssam_iovs phys; /* phys's iov_base is physical address */ ++}; ++ ++struct ssam_stat { ++ uint64_t poll_cur_tsc; ++ uint64_t poll_tsc; ++ uint64_t poll_count; ++}; ++ ++struct spdk_ssam_dev { ++ /* ssam device name, format as ssam.tid */ ++ char *name; ++ /* virtio type */ ++ enum virtio_type type; ++ ++ /* ssam device poller thread, same as session poller thread */ ++ struct spdk_thread *thread; ++ struct spdk_poller *requestq_poller; ++ struct spdk_poller *responseq_poller; ++ struct spdk_poller *stop_poller; ++ ++ /* Store sessions of this dev, max number is SSAM_MAX_SESSION_PER_DEV */ ++ struct spdk_ssam_session **smsessions; ++ ++ TAILQ_ENTRY(spdk_ssam_dev) tailq; ++ ++ /* IO num that is on flight */ ++ uint64_t io_num; ++ ++ uint64_t discard_io_num; ++ ++ /* IO stuck ticks in dma process */ ++ uint64_t io_stuck_tsc; ++ struct ssam_stat stat; ++ ++ uint64_t io_wait_cnt; ++ uint64_t io_wait_r_cnt; ++ ++ /* Number of started and actively polled sessions */ ++ uint32_t active_session_num; ++ ++ /* Information of tid, indicate from which ssam queue to receive or send data */ ++ uint16_t tid; ++ TAILQ_HEAD(, spdk_ssam_session_io_wait) io_wait_queue; ++ TAILQ_HEAD(, spdk_ssam_session_io_wait_r) io_wait_queue_r; ++}; ++ ++struct spdk_ssam_dma_cb { ++ uint8_t status; ++ uint8_t req_dir; ++ uint16_t vq_idx; ++ uint16_t task_idx; ++ uint16_t gfunc_id; ++}; ++ ++struct spdk_ssam_send_event_flag { ++ bool 
need_async; ++ bool need_rsp; ++}; ++ ++/** ++ * Remove a session from sessions array. ++ * ++ * \param smsessions sessions array. ++ * \param smsession the session to be removed. ++ */ ++void ssam_sessions_remove(struct spdk_ssam_session **smsessions, ++ struct spdk_ssam_session *smsession); ++ ++/** ++ * Check out whether sessions is empty or not. ++ * ++ * \param smsessions sessions array. ++ * \return true indicate sessions is empty or false not empty. ++*/ ++bool ssam_sessions_empty(struct spdk_ssam_session **smsessions); ++ ++/** ++ * Get next session in sessions array, begin with current session. ++ * ++ * \param smsessions sessions array. ++ * \param smsession the begin session. ++ * \return the next session found or null not found. ++ */ ++struct spdk_ssam_session * ssam_sessions_next(struct spdk_ssam_session **smsessions, ++ struct spdk_ssam_session *smsession); ++ ++/** ++ * Insert io wait task to session. ++ * ++ * \param smsession the session that io wait insert to. ++ * \param io_wait the io wait to be insert. ++ */ ++void ssam_session_insert_io_wait(struct spdk_ssam_session *smsession, ++ struct spdk_ssam_session_io_wait *io_wait); ++ ++/** ++ * Insert io wait compilete or dma task to smdev. ++ * ++ * \param smdev the smdev that io wait insert to. ++ * \param io_wait_r the io wait to be insert. ++ */ ++void ssam_session_insert_io_wait_r(struct spdk_ssam_dev *smdev, ++ struct spdk_ssam_session_io_wait_r *io_wait_r); ++ ++/** ++ * Remove session from sessions and then stop session dev poller. ++ * ++ * \param smsession the session that to be removed. ++ */ ++void ssam_session_destroy(struct spdk_ssam_session *smsession); ++ ++/** ++ * Show a ssam device info in json format. ++ * ++ * \param smdev ssam device. ++ * \param gfunc_id ssam global vf id. ++ * \param arg user-provided parameter. ++ */ ++void ssam_dump_info_json(struct spdk_ssam_dev *smdev, uint16_t gfunc_id, ++ struct spdk_json_write_ctx *w); ++ ++/** ++ * Get a ssam device name. ++ * ++ * \param smdev ssam device. ++ * \return ssam device name or NULL ++ */ ++const char *spdk_ssam_dev_get_name(const struct spdk_ssam_dev *smdev); ++ ++/** ++ * Get a ssam session name. ++ * ++ * \param smdev smsession session. ++ * \return ssam session name or NULL ++ */ ++const char *spdk_ssam_session_get_name(const struct spdk_ssam_session *smsession); ++ ++/** ++ * Call a function of the provided ssam session. ++ * The function will be called on this session's thread. ++ * ++ * \param smsession ssam session. ++ * \param fn function to call on each session's thread ++ * \param cpl_fn function to be called at the end of the ssam management thread. ++ * Optional, can be NULL. ++ * \param send_event_flag whether an asynchronous operation or response is required ++ * \param ctx additional argument to the both callbacks ++ * \return error code ++ */ ++int ssam_send_event_to_session(struct spdk_ssam_session *smsession, spdk_ssam_session_fn fn, ++ spdk_ssam_session_cpl_fn cpl_fn, struct spdk_ssam_send_event_flag send_event_flag, void *ctx); ++ ++/** ++ * Finish a blocking ssam_send_event_to_session() call and finally ++ * start the session. This must be called on the target lcore, which ++ * will now receive all session-related messages (e.g. from ++ * ssam_send_event_to_session()). ++ * ++ * Must be called under the global ssam lock. 
++ * ++ * \param smsession ssam session ++ * \param response return code ++ */ ++void ssam_session_start_done(struct spdk_ssam_session *smsession, int response); ++ ++/** ++ * Finish a blocking ssam_send_event_to_session() call and finally ++ * stop the session. This must be called on the session's lcore which ++ * used to receive all session-related messages (e.g. from ++ * ssam_send_event_to_session()). After this call, the session- ++ * related messages will be once again processed by any arbitrary thread. ++ * ++ * Must be called under the global ssam lock. ++ * ++ * \param smsession ssam session ++ * \param rsp return code ++ * \param ctx user context ++ */ ++void ssam_session_stop_done(struct spdk_ssam_session *smsession, int rsp, void **ctx); ++ ++/** ++ * Set session be freed, so that not access session any more. ++ * ++ * \param ctx user context ++ */ ++void ssam_set_session_be_freed(void **ctx); ++ ++/** ++ * Find a ssam device in the global g_ssam_devices list by gfunc_id, ++ * if find the ssam device, register a session to the existent ssam device ++ * sessions list, if not find, first create a ssam device to the global ++ * g_ssam_devices list, and then register a session to the new ssam device ++ * sessions list. ++ * ++ * Must be called under the global ssam lock. ++ * ++ * \param info ssam session register info. ++ * \param smsession ssam session created. ++ * \return 0 for success or negative for failed. ++ */ ++int spdk_ssam_session_register(struct spdk_ssam_session_reg_info *info, ++ struct spdk_ssam_session **smsession); ++ ++/** ++ * unregister smsession response call back function. ++ * ++ * \param smsession ssam session ++ */ ++void ssam_session_unreg_response_cb(struct spdk_ssam_session *smsession); ++ ++void ssam_dev_unregister(struct spdk_ssam_dev **dev); ++ ++void ssam_send_event_async_done(void **ctx); ++ ++void spdk_ssam_send_dev_destroy_msg(struct spdk_ssam_session *smsession, void *args); ++ ++/** ++ * Get ssam config. ++ * ++ * \param smsession ssam session ++ * \param config a memory region to store config. ++ * \param len the input config param memory region length. ++ * \return 0 success or -1 failed. ++ */ ++int ssam_get_config(struct spdk_ssam_session *smsession, uint8_t *config, ++ uint32_t len, uint16_t queues); ++ ++/** ++ * Mount gfunc_id volume to the ssam normal queue. ++ * ++ * \param smsession ssam session ++ * \param lun_id lun id of gfunc_id. ++ * ++ * \return 0 success or not 0 failed. ++ */ ++int ssam_mount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id); ++ ++/** ++ * Unmount function. ++ * ++ * \param smsession ssam session ++ * \param lun_id lun id of gfunc_id. ++ * ++ * \return 0 success or not 0 failed. ++ */ ++int ssam_umount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id); ++ ++/** ++ * Mount gfunc_id volume to the ssam normal queue again. ++ * ++ * \param smsession ssam session ++ * \param lun_id lun id of gfunc_id. ++ * ++ * \return 0 success or not 0 failed. ++ */ ++int ssam_remount_normal(struct spdk_ssam_session *smsession, uint32_t lun_id); ++ ++/** ++ * Register worker poller to dev. ++ * ++ * \param smdev the dev that to be registered worker poller. ++ * \return 0 success or not 0 failed. ++ */ ++int ssam_dev_register_worker_poller(struct spdk_ssam_dev *smdev); ++ ++/** ++ * Unregister worker poller for dev. ++ * ++ * \param smdev the dev that to be unregistered worker poller. 
++ */ ++void ssam_dev_unregister_worker_poller(struct spdk_ssam_dev *smdev); ++ ++/** ++ * Get the differential value of the current tsc. ++ * ++ * \param tsc the current tsc. ++ * \return the differential value. ++ */ ++uint64_t ssam_get_diff_tsc(uint64_t tsc); ++ ++/** ++ * Get the bdev name of the specific gfunc_id. ++ * ++ * \param gfunc_id ssam global vf id. ++ * ++ * \return the bdev name of gfunc_id ++ */ ++const char *spdk_ssam_get_bdev_name_by_gfunc_id(uint16_t gfunc_id); ++ ++/** ++ * Remove a ssam session. Remove a session associate to the unique gfunc_id, ++ * then remove the ssam device if the device not have a session any more. ++ * ++ * Notice that this interface cannot be reentrant, so must call spdk_ssam_lock first. ++ * ++ * \param smsession ssam session ++ * ++ * \return 0 on success, negative errno on error. ++ */ ++int spdk_ssam_session_unregister(struct spdk_ssam_session *smsession); ++ ++/** ++ * Get ssam iostat. ++ * ++ * \param smsession ssam session ++ * \param stat a memory region to store iostat. ++ */ ++void spdk_ssam_get_iostat(struct spdk_ssam_session *smsession, ++ struct spdk_bdev_io_stat *stat); ++ ++/** ++ * Decrease dev io num. ++ * ++ * \param smdev ssam device. ++ */ ++void ssam_dev_io_dec(struct spdk_ssam_dev *smdev); ++ ++/** ++ * Get ssam session bdev. ++ * ++ * \param smsession ssam session ++ * ++ * \return the session bdev. ++ */ ++struct spdk_bdev *spdk_ssam_get_session_bdev(struct spdk_ssam_session *smsession); ++ ++/** ++ * free memory with rte. ++ * ++ * \param smsession ssam session ++ * ++ * \return 0 on success. ++ */ ++int ssam_free_ex(void *addr); ++ ++/** ++ * Get elem info from memory addr. ++ * ++ * \param memory addr ++ * ++ */ ++int ssam_malloc_elem_from_addr(const void *data, unsigned long long *pg_size, int *socket_id); ++ ++#endif /* SSAM_INTERNAL_H */ +diff --git a/lib/ssam/ssam_malloc.c b/lib/ssam/ssam_malloc.c +new file mode 100644 +index 0000000..4c7b720 +--- /dev/null ++++ b/lib/ssam/ssam_malloc.c +@@ -0,0 +1,56 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "spdk/env.h" ++ ++#include "ssam_internal.h" ++ ++int ssam_free_ex(void *addr) ++{ ++ spdk_free(addr); ++ return 0; ++} ++ ++int ssam_malloc_elem_from_addr(const void *data, unsigned long long *pg_size, int *socket_id) ++{ ++ struct rte_memseg_list *msl = NULL; ++ ++ msl = rte_mem_virt2memseg_list(data); ++ if (msl == NULL) { ++ return -1; ++ } ++ ++ *socket_id = msl->socket_id; ++ *pg_size = msl->page_sz; ++ return 0; ++} +\ No newline at end of file +diff --git a/lib/ssam/ssam_rpc.c b/lib/ssam/ssam_rpc.c +new file mode 100644 +index 0000000..54c5f98 +--- /dev/null ++++ b/lib/ssam/ssam_rpc.c +@@ -0,0 +1,1811 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++#include ++#include "spdk/string.h" ++#include "spdk/env.h" ++#include "spdk/bdev_module.h" ++#include "spdk/ssam.h" ++#include "spdk/bdev.h" ++ ++#include "ssam_internal.h" ++#include "ssam_config.h" ++#include "rte_malloc.h" ++ ++static int spdk_ssam_rpc_get_gfunc_id_by_dbdf(char *dbdf, uint16_t *gfunc_id); ++ ++struct rpc_ssam_blk_ctrlr { ++ char *dev_name; ++ char *index; ++ bool readonly; ++ char *serial; ++ uint16_t vqueue; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_construct_ssam_blk_ctrlr[] = { ++ {"dev_name", offsetof(struct rpc_ssam_blk_ctrlr, dev_name), spdk_json_decode_string}, ++ {"index", offsetof(struct rpc_ssam_blk_ctrlr, index), spdk_json_decode_string}, ++ {"readonly", offsetof(struct rpc_ssam_blk_ctrlr, readonly), spdk_json_decode_bool, true}, ++ {"serial", offsetof(struct rpc_ssam_blk_ctrlr, serial), spdk_json_decode_string, true}, ++ {"vqueue", offsetof(struct rpc_ssam_blk_ctrlr, vqueue), spdk_json_decode_uint16, true}, ++}; ++ ++static void ++free_rpc_ssam_blk_ctrlr(struct rpc_ssam_blk_ctrlr *req) ++{ ++ if (req->dev_name != NULL) { ++ free(req->dev_name); ++ req->dev_name = NULL; ++ } ++ ++ if (req->index != NULL) { ++ free(req->index); ++ req->index = NULL; ++ } ++ ++ if (req->serial != NULL) { ++ free(req->serial); ++ req->serial = NULL; ++ } ++} ++ ++static int ++spdk_ssam_rpc_para_check(uint16_t gfunc_id) ++{ ++ int rc; ++ ++ rc = spdk_ssam_check_gfunc_id(gfunc_id); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static int ++spdk_ssam_rpc_para_check_type(uint16_t gfunc_id, enum ssam_device_type target_type) ++{ ++ int rc; ++ enum ssam_device_type type; ++ ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ type = spdk_ssam_get_virtio_type(gfunc_id); ++ if (type == target_type) { ++ return 0; ++ } ++ SPDK_ERRLOG("Invalid virtio type, need type %d, actually %d\n", target_type, type); ++ ++ return -EINVAL; ++} ++ ++static void ++rpc_ssam_send_response_cb(void *arg, int rsp) ++{ ++ struct spdk_jsonrpc_request *request = arg; ++ ++ if (rsp != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rsp)); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++ return; ++} ++ ++struct ssam_log_command_info { ++ char *user_name; ++ char *event; ++ char *src_addr; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_construct_log_command_info[] = { ++ {"user_name", offsetof(struct ssam_log_command_info, user_name), spdk_json_decode_string}, ++ {"event", offsetof(struct ssam_log_command_info, event), spdk_json_decode_string}, ++ {"src_addr", offsetof(struct ssam_log_command_info, src_addr), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_ssam_log_command_info(struct ssam_log_command_info *req) ++{ ++ if (req->user_name != NULL) { ++ free(req->user_name); ++ req->user_name = NULL; ++ } ++ if (req->event != NULL) { ++ free(req->event); ++ req->event = NULL; ++ } ++ if (req->src_addr != NULL) { ++ free(req->src_addr); ++ req->src_addr = NULL; ++ } ++} ++ ++static void ++rpc_ssam_log_command_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct ssam_log_command_info req = {0}; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("log info params error, skip\n"); ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_construct_log_command_info, ++ SPDK_COUNTOF(g_rpc_construct_log_command_info), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("decode cmd info failed\n"); ++ 
rc = -EINVAL; ++ goto invalid; ++ } ++ ++ SPDK_NOTICELOG("log event: from %s user %s event %s\n", req.src_addr, req.user_name, req.event); ++ ++invalid: ++ free_rpc_ssam_log_command_info(&req); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++} ++SPDK_RPC_REGISTER("log_command_info", rpc_ssam_log_command_info, ++ SPDK_RPC_RUNTIME) ++ ++static int ++rpc_ssam_session_reg_response_cb(struct spdk_ssam_session *smsession, ++ struct spdk_jsonrpc_request *request) ++{ ++ if (smsession->rsp_fn != NULL) { ++ return -1; ++ } ++ smsession->rsp_fn = rpc_ssam_send_response_cb; ++ smsession->rsp_ctx = request; ++ return 0; ++} ++ ++static void ++rpc_init_session_reg_info(struct spdk_ssam_session_reg_info *info, ++ uint16_t queues, uint16_t gfunc_id, struct spdk_jsonrpc_request *request) ++{ ++ info->queues = queues; ++ info->gfunc_id = gfunc_id; ++ info->rsp_ctx = (void *)request; ++ info->rsp_fn = rpc_ssam_send_response_cb; ++} ++ ++static void ++free_rpc_ssam_session_reg_info(struct spdk_ssam_session_reg_info *info) ++{ ++ if (info->name != NULL) { ++ free(info->name); ++ info->name = NULL; ++ } ++ if (info->dbdf != NULL) { ++ free(info->dbdf); ++ info->dbdf = NULL; ++ } ++} ++ ++static uint16_t ++rpc_ssam_get_gfunc_id_by_index(char *index) ++{ ++ uint16_t gfunc_id, i; ++ int rc; ++ if (strlen(index) <= 0x5) { ++ for (i = 0; i < strlen(index); i++) { ++ if (!isdigit(index[i])) { ++ return SPDK_INVALID_GFUNC_ID; ++ } ++ } ++ gfunc_id = spdk_strtol(index, 10) > SPDK_INVALID_GFUNC_ID ? SPDK_INVALID_GFUNC_ID : spdk_strtol(index, 10); ++ } else { ++ rc = spdk_ssam_rpc_get_gfunc_id_by_dbdf(index, &gfunc_id); ++ if (rc != 0) { ++ return SPDK_INVALID_GFUNC_ID; ++ } ++ } ++ return gfunc_id; ++} ++ ++static void ++ssam_set_virtio_blk_config(struct ssam_virtio_config *cfg, uint16_t queues) ++{ ++ struct virtio_blk_config *dev_cfg = (struct virtio_blk_config *)cfg->device_config; ++ ++ cfg->device_feature = SPDK_SSAM_VIRTIO_BLK_DEFAULT_FEATURE; ++ cfg->queue_num = queues; ++ cfg->config_len = sizeof(struct virtio_blk_config); ++ ++ memset(dev_cfg, 0, cfg->config_len); ++ dev_cfg->blk_size = 0x200; ++ dev_cfg->min_io_size = 0; ++ dev_cfg->capacity = 0; ++ dev_cfg->num_queues = cfg->queue_num; ++ dev_cfg->seg_max = 0x7d; ++ dev_cfg->size_max = 0x200000; ++ cfg->queue_size = VIRITO_DEFAULT_QUEUE_SIZE; ++ ++ return; ++} ++ ++static int ssam_get_vqueue(struct rpc_ssam_blk_ctrlr *req, uint16_t gfunc_id, uint16_t *queues) ++{ ++ if (gfunc_id <= SSAM_PF_MAX_NUM) { ++ if (req->vqueue != SPDK_INVALID_VQUEUE_NUM) { ++ SPDK_ERRLOG("The PF does not allow dynamic modification of the vqueue(%d).\n", req->vqueue); ++ return -1; ++ } ++ *queues = spdk_ssam_get_queues(); ++ return 0; ++ } ++ ++ if (req->vqueue == SPDK_INVALID_VQUEUE_NUM) { ++ *queues = SPDK_SSAM_VF_DEFAULTE_VQUEUES; ++ return 0; ++ } ++ ++ if (req->vqueue > SPDK_SSAM_BLK_MAX_VQ_SIZE || req->vqueue == 0) { ++ SPDK_ERRLOG("The queue number is out of range. 
Currently (%u) .\n", req->vqueue); ++ return -1; ++ } ++ ++ *queues = req->vqueue; ++ return 0; ++} ++ ++static int ssam_blk_controller_set_vqueue(uint16_t gfunc_id, uint16_t queues) ++{ ++ int rc; ++ struct ssam_function_config cfg = { 0 }; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ if (gfunc_id <= SSAM_PF_MAX_NUM) { ++ return 0; ++ } ++ ++ cfg.gfunc_id = gfunc_id; ++ cfg.type = SSAM_DEVICE_VIRTIO_BLK; ++ ssam_set_virtio_blk_config(&cfg.virtio_config, queues); ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ spdk_ssam_unlock(); ++ if (smsession != NULL) { ++ SPDK_ERRLOG("Session with function id %d already exists.\n", gfunc_id); ++ return -EEXIST; ++ } ++ ++ if (spdk_ssam_is_starting() == false) { ++ rc = ssam_write_function_config(&cfg); ++ if (rc != 0) { ++ SPDK_ERRLOG("ssam write function(%d) config failed:%s\n", cfg.gfunc_id, spdk_strerror(-rc)); ++ return rc; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++rpc_ssam_create_blk_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_ssam_session_reg_info info = {0}; ++ struct rpc_ssam_blk_ctrlr req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ uint16_t queues = 0; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_create_blk_controller params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ req.vqueue = SPDK_INVALID_VQUEUE_NUM; ++ rc = spdk_json_decode_object(params, g_rpc_construct_ssam_blk_ctrlr, ++ SPDK_COUNTOF(g_rpc_construct_ssam_blk_ctrlr), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = rpc_ssam_get_gfunc_id_by_index(req.index); ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_BLK); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ if (req.dev_name == NULL) { ++ rc = -ENODEV; ++ goto invalid; ++ } ++ ++ rc = ssam_get_vqueue(&req, gfunc_id, &queues); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ rc = ssam_blk_controller_set_vqueue(gfunc_id, queues); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ rpc_init_session_reg_info(&info, queues, gfunc_id, request); ++ ++ rc = spdk_ssam_blk_construct(&info, req.dev_name, req.readonly, req.serial); ++ if (rc < 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_blk_ctrlr(&req); ++ free_rpc_ssam_session_reg_info(&info); ++ return; ++ ++invalid: ++ free_rpc_ssam_blk_ctrlr(&req); ++ free_rpc_ssam_session_reg_info(&info); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("create_blk_controller", rpc_ssam_create_blk_controller, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_ssam_ctrlr { ++ char *index; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_delete_ssam_ctrlr_decoder[] = { ++ {"index", offsetof(struct rpc_delete_ssam_ctrlr, index), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_delete_ssam_ctrlr(struct rpc_delete_ssam_ctrlr *req) ++{ ++ if (req->index != NULL) { ++ free(req->index); ++ req->index = NULL; ++ } ++} ++ ++static void ++rpc_ssam_delete_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_ssam_ctrlr req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ struct spdk_ssam_session *smsession; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_delete_controller params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, 
g_rpc_delete_ssam_ctrlr_decoder, ++ SPDK_COUNTOF(g_rpc_delete_ssam_ctrlr_decoder), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = rpc_ssam_get_gfunc_id_by_index(req.index); ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ SPDK_ERRLOG("Couldn't find session with function id %d.\n", gfunc_id); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = rpc_ssam_session_reg_response_cb(smsession, request); ++ if (rc != 0) { ++ SPDK_ERRLOG("The controller is being operated.\n"); ++ rc = -EALREADY; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_session_unregister(smsession); ++ if (rc != 0) { ++ /* ++ * Unregitster response cb to avoid use request in the cb function, ++ * because if error happend, request will be responsed immediately ++ */ ++ ssam_session_unreg_response_cb(smsession); ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ spdk_ssam_unlock(); ++ ++ free_rpc_delete_ssam_ctrlr(&req); ++ return; ++ ++invalid: ++ free_rpc_delete_ssam_ctrlr(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("delete_controller", rpc_ssam_delete_controller, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_ssam_scsi_ctrlr { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_delete_ssam_scsi_ctrlr_decoder[] = { ++ {"name", offsetof(struct rpc_delete_ssam_scsi_ctrlr, name), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_delete_ssam_scsi_ctrlrs(struct rpc_delete_ssam_scsi_ctrlr *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_delete_scsi_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_ssam_scsi_ctrlr req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ struct spdk_ssam_session *smsession; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_delete_controller params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_delete_ssam_scsi_ctrlr_decoder, ++ SPDK_COUNTOF(g_rpc_delete_ssam_scsi_ctrlr_decoder), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ SPDK_ERRLOG("Couldn't find session with function id %d.\n", gfunc_id); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = rpc_ssam_session_reg_response_cb(smsession, request); ++ if (rc != 0) { ++ SPDK_ERRLOG("The controller is being operated.\n"); ++ rc = -EALREADY; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_session_unregister(smsession); ++ if (rc != 0) { ++ /* ++ * Unregitster response cb to avoid use request in the cb function, ++ * because if error happend, request will be responsed immediately ++ */ ++ ssam_session_unreg_response_cb(smsession); ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ spdk_ssam_unlock(); ++ ++ free_rpc_delete_ssam_scsi_ctrlrs(&req); ++ return; ++ ++invalid: ++ 
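/*
 * Error path: release the decoded request and return the failure code to the
 * JSON-RPC client right away. On the success path above, the function returns
 * without responding here; the reply is sent later through the
 * rpc_ssam_send_response_cb registered on the session.
 */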
free_rpc_delete_ssam_scsi_ctrlrs(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("delete_scsi_controller", rpc_ssam_delete_scsi_controller, SPDK_RPC_RUNTIME) ++ ++struct rpc_get_ssam_ctrlrs { ++ uint32_t function_id; ++ char *dbdf; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_get_ssam_ctrlrs_decoder[] = { ++ {"function_id", offsetof(struct rpc_get_ssam_ctrlrs, function_id), spdk_json_decode_uint32, true}, ++ {"dbdf", offsetof(struct rpc_get_ssam_ctrlrs, dbdf), spdk_json_decode_string, true}, ++}; ++ ++static void ++free_rpc_get_ssam_ctrlrs(struct rpc_get_ssam_ctrlrs *req) ++{ ++ if (req->dbdf != NULL) { ++ free(req->dbdf); ++ req->dbdf = NULL; ++ } ++} ++ ++static void ++_rpc_get_ssam_controller(struct spdk_json_write_ctx *w, ++ struct spdk_ssam_dev *smdev, uint16_t gfunc_id) ++{ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "ctrlr", spdk_ssam_dev_get_name(smdev)); ++ spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", ++ spdk_cpuset_fmt(spdk_thread_get_cpumask(smdev->thread))); ++ spdk_json_write_named_uint32(w, "session_num", (uint32_t)smdev->active_session_num); ++ ++ spdk_json_write_named_object_begin(w, "backend_specific"); ++ ssam_dump_info_json(smdev, gfunc_id, w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static int ++rpc_ssam_show_controllers(struct spdk_jsonrpc_request *request, uint16_t gfunc_id) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_json_write_ctx *w = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_ssam_lock(); ++ if (gfunc_id != SPDK_INVALID_GFUNC_ID) { ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ spdk_ssam_unlock(); ++ return -ENODEV; ++ } ++ ++ smdev = smsession->smdev; ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ _rpc_get_ssam_controller(w, smdev, gfunc_id); ++ spdk_ssam_unlock(); ++ ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ return 0; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ _rpc_get_ssam_controller(w, smdev, gfunc_id); ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++ spdk_ssam_unlock(); ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++ return 0; ++} ++ ++static int ++rpc_ssam_show_scsi_controllers(struct spdk_jsonrpc_request *request, uint16_t gfunc_id) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_json_write_ctx *w = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_ssam_lock(); ++ if (gfunc_id != SPDK_INVALID_GFUNC_ID) { ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ spdk_ssam_unlock(); ++ return -ENODEV; ++ } else if (smsession->backend->type != VIRTIO_TYPE_SCSI) { ++ spdk_ssam_unlock(); ++ return -EINVAL; ++ } ++ ++ smdev = smsession->smdev; ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ smsession = smdev->smsessions[gfunc_id]; ++ smsession->backend->dump_info_json(smsession, w); ++ spdk_ssam_unlock(); ++ ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ return 0; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ smsession = ssam_sessions_next(smdev->smsessions, NULL); ++ while 
(smsession != NULL) { ++ if (smsession->backend->type == VIRTIO_TYPE_SCSI) { ++ smsession->backend->dump_info_json(smsession, w); ++ } ++ smsession = ssam_sessions_next(smdev->smsessions, smsession); ++ } ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++ spdk_ssam_unlock(); ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++ return 0; ++} ++ ++static void ++rpc_ssam_get_controllers(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_get_ssam_ctrlrs req = { ++ .function_id = SPDK_INVALID_GFUNC_ID, ++ .dbdf = NULL, ++ }; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ ++ if (params != NULL) { ++ rc = spdk_json_decode_object(params, g_rpc_get_ssam_ctrlrs_decoder, ++ SPDK_COUNTOF(g_rpc_get_ssam_ctrlrs_decoder), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ } ++ ++ if (req.function_id != SPDK_INVALID_GFUNC_ID && req.dbdf != NULL) { ++ SPDK_ERRLOG("get_controllers can have at most one parameter\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ if (req.function_id != SPDK_INVALID_GFUNC_ID) { ++ gfunc_id = req.function_id; ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ } ++ ++ if (req.dbdf != NULL) { ++ rc = spdk_ssam_rpc_get_gfunc_id_by_dbdf(req.dbdf, &gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ } ++ ++ rc = rpc_ssam_show_controllers(request, gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_get_ssam_ctrlrs(&req); ++ return; ++ ++invalid: ++ free_rpc_get_ssam_ctrlrs(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("get_controllers", rpc_ssam_get_controllers, SPDK_RPC_RUNTIME) ++ ++struct rpc_get_ssam_scsi_ctrlrs { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_get_ssam_scsi_ctrlrs_decoder[] = { ++ {"name", offsetof(struct rpc_get_ssam_scsi_ctrlrs, name), spdk_json_decode_string, true}, ++}; ++ ++static void ++free_rpc_ssam_ctrlrs(struct rpc_get_ssam_scsi_ctrlrs *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_get_scsi_controllers(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_get_ssam_scsi_ctrlrs req = { ++ .name = NULL, ++ }; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ ++ if (params != NULL) { ++ rc = spdk_json_decode_object(params, g_rpc_get_ssam_scsi_ctrlrs_decoder, ++ SPDK_COUNTOF(g_rpc_get_ssam_scsi_ctrlrs_decoder), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ } ++ ++ if (req.name != NULL) { ++ gfunc_id = spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ } ++ ++ rc = rpc_ssam_show_scsi_controllers(request, gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_ctrlrs(&req); ++ return; ++ ++invalid: ++ free_rpc_ssam_ctrlrs(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("get_scsi_controllers", rpc_ssam_get_scsi_controllers, SPDK_RPC_RUNTIME) ++ ++struct rpc_ssam_controller_get_iostat { ++ uint32_t function_id; ++ char *dbdf; ++}; ++ ++static const struct 
spdk_json_object_decoder g_rpc_ssam_controller_get_iostat_decoder[] = { ++ {"function_id", offsetof(struct rpc_ssam_controller_get_iostat, function_id), spdk_json_decode_uint32, true}, ++ {"dbdf", offsetof(struct rpc_ssam_controller_get_iostat, dbdf), spdk_json_decode_string, true}, ++}; ++ ++static void ++free_rpc_ssam_controller_get_iostat(struct rpc_ssam_controller_get_iostat *req) ++{ ++ if (req->dbdf != NULL) { ++ free(req->dbdf); ++ req->dbdf = NULL; ++ } ++} ++ ++static int ++rpc_ssam_show_iostat(struct spdk_jsonrpc_request *request, uint16_t gfunc_id) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_json_write_ctx *w = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_ssam_lock(); ++ if (gfunc_id != SPDK_INVALID_GFUNC_ID) { ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ spdk_ssam_unlock(); ++ return -ENODEV; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_uint64(w, "tick_rate", spdk_get_ticks_hz()); ++ spdk_json_write_named_array_begin(w, "dbdfs"); ++ ++ if (smsession->backend->show_iostat_json != NULL) { ++ smsession->backend->show_iostat_json(smsession, SPDK_SSAM_SCSI_CTRLR_MAX_DEVS, w); ++ } ++ ++ spdk_ssam_unlock(); ++ ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ return 0; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_uint64(w, "tick_rate", spdk_get_ticks_hz()); ++ spdk_json_write_named_array_begin(w, "dbdfs"); ++ ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ smsession = ssam_sessions_next(smdev->smsessions, NULL); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", smdev->name); ++ spdk_json_write_named_uint64(w, "flight_io", smdev->io_num); ++ spdk_json_write_named_uint64(w, "discard_io_num", smdev->discard_io_num); ++ spdk_json_write_named_uint64(w, "wait_io", smdev->io_wait_cnt); ++ spdk_json_write_named_uint64(w, "wait_io_r", smdev->io_wait_r_cnt); ++ spdk_json_write_object_end(w); ++ while (smsession != NULL) { ++ if (smsession->backend->show_iostat_json != NULL) { ++ smsession->backend->show_iostat_json(smsession, SPDK_SSAM_SCSI_CTRLR_MAX_DEVS, w); ++ } ++ smsession = ssam_sessions_next(smdev->smsessions, smsession); ++ } ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++ ++ spdk_ssam_unlock(); ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ return 0; ++} ++ ++static void ++rpc_ssam_controller_get_iostat(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_ssam_controller_get_iostat req = { ++ .function_id = SPDK_INVALID_GFUNC_ID, ++ .dbdf = NULL, ++ }; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ ++ if (params != NULL) { ++ rc = spdk_json_decode_object(params, g_rpc_ssam_controller_get_iostat_decoder, ++ SPDK_COUNTOF(g_rpc_ssam_controller_get_iostat_decoder), &req); ++ if (rc != 0) { ++ SPDK_DEBUGLOG(ssam_rpc, "spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ } ++ ++ if (req.function_id != SPDK_INVALID_GFUNC_ID && req.dbdf != NULL) { ++ SPDK_ERRLOG("controller_get_iostat can have at most one parameter\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ if (req.function_id != SPDK_INVALID_GFUNC_ID) { ++ gfunc_id = req.function_id; ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ } ++ ++ if (req.dbdf != 
NULL) { ++ rc = spdk_ssam_rpc_get_gfunc_id_by_dbdf(req.dbdf, &gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ } ++ ++ rc = rpc_ssam_show_iostat(request, gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_controller_get_iostat(&req); ++ return; ++ ++invalid: ++ free_rpc_ssam_controller_get_iostat(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("controller_get_iostat", rpc_ssam_controller_get_iostat, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_ssam_clear_iostat(void) ++{ ++ struct spdk_ssam_dev *smdev = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_ssam_lock(); ++ smdev = spdk_ssam_dev_next(NULL); ++ while (smdev != NULL) { ++ smsession = ssam_sessions_next(smdev->smsessions, NULL); ++ while (smsession != NULL) { ++ if (smsession->backend->clear_iostat_json != NULL) { ++ smsession->backend->clear_iostat_json(smsession); ++ } ++ smsession = ssam_sessions_next(smdev->smsessions, smsession); ++ } ++ smdev = spdk_ssam_dev_next(smdev); ++ } ++ spdk_ssam_unlock(); ++} ++ ++static void ++rpc_ssam_controller_clear_iostat(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ rpc_ssam_clear_iostat(); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++} ++SPDK_RPC_REGISTER("controller_clear_iostat", rpc_ssam_controller_clear_iostat, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_resize { ++ uint32_t function_id; ++ uint64_t new_size_in_mb; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_bdev_resize[] = { ++ {"function_id", offsetof(struct rpc_bdev_resize, function_id), spdk_json_decode_uint32}, ++ {"new_size_in_mb", offsetof(struct rpc_bdev_resize, new_size_in_mb), spdk_json_decode_uint64}, ++}; ++ ++static int ++ssam_bdev_resize(struct spdk_bdev *bdev, uint64_t new_size_in_mb) ++{ ++ char *bdev_name = bdev->name; ++ int rc; ++ uint64_t current_size_in_mb; ++ uint64_t new_size_in_byte; ++ ++ if (bdev->blocklen == 0) { ++ SPDK_ERRLOG("The blocklen of bdev %s is zero\n", bdev_name); ++ return -EINVAL; ++ } ++ ++ if (UINT64_MAX / bdev->blockcnt < bdev->blocklen) { ++ SPDK_ERRLOG("The old size of bdev is too large, blockcnt: %lu, blocklen: %u\n", ++ bdev->blockcnt, bdev->blocklen); ++ return -EINVAL; ++ } ++ ++ if (new_size_in_mb == 0) { ++ goto end; ++ } ++ ++ current_size_in_mb = bdev->blocklen * bdev->blockcnt / SSAM_MB; ++ if (new_size_in_mb < current_size_in_mb) { ++ SPDK_ERRLOG("The new bdev size must not be smaller than current bdev size\n"); ++ return -EINVAL; ++ } ++ ++ if (UINT64_MAX / new_size_in_mb < SSAM_MB) { ++ SPDK_ERRLOG("The new bdev size is too large\n"); ++ return -EINVAL; ++ } ++ ++end: ++ new_size_in_byte = new_size_in_mb * SSAM_MB; ++ ++ rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen); ++ if (rc != 0) { ++ SPDK_ERRLOG("failed to notify block cnt change\n"); ++ return -EINVAL; ++ } ++ SPDK_NOTICELOG("bdev %s resize %lu(mb) done.\n", bdev->name, new_size_in_mb); ++ ++ return 0; ++} ++ ++static void ++rpc_ssam_bdev_resize(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_resize req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ struct spdk_ssam_session *smsession = NULL; ++ struct spdk_bdev *bdev = NULL; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_bdev_resize params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ 
} ++ ++ rc = spdk_json_decode_object(params, g_rpc_bdev_resize, ++ SPDK_COUNTOF(g_rpc_bdev_resize), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = req.function_id; ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_BLK); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ SPDK_ERRLOG("Before resize target, there need to create controller.\n"); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ if (smsession->backend->get_bdev != NULL) { ++ bdev = smsession->backend->get_bdev(smsession, 0); ++ } ++ if (bdev == NULL) { ++ SPDK_ERRLOG("The controller hasn't correlated to a bdev.\n"); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ spdk_ssam_unlock(); ++ ++ rc = ssam_bdev_resize(bdev, req.new_size_in_mb); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("bdev_resize", rpc_ssam_bdev_resize, SPDK_RPC_RUNTIME) ++ ++struct rpc_scsi_bdev_resize { ++ char *name; ++ uint32_t tgt_id; ++ uint64_t new_size_in_mb; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_scsi_bdev_resize[] = { ++ {"name", offsetof(struct rpc_scsi_bdev_resize, name), spdk_json_decode_string}, ++ {"tgt_id", offsetof(struct rpc_scsi_bdev_resize, tgt_id), spdk_json_decode_uint32}, ++ {"new_size_in_mb", offsetof(struct rpc_scsi_bdev_resize, new_size_in_mb), spdk_json_decode_uint64}, ++}; ++ ++static void ++free_rpc_scsi_bdev_resize(struct rpc_scsi_bdev_resize *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_scsi_bdev_resize(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_scsi_bdev_resize req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ struct spdk_ssam_session *smsession = NULL; ++ struct spdk_bdev *bdev = NULL; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_bdev_resize params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_scsi_bdev_resize, ++ SPDK_COUNTOF(g_rpc_scsi_bdev_resize), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_SCSI); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ SPDK_ERRLOG("Before resize target, there need to create controller.\n"); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ if (smsession->backend->get_bdev != NULL) { ++ bdev = smsession->backend->get_bdev(smsession, req.tgt_id); ++ } ++ if (bdev == NULL) { ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ spdk_ssam_unlock(); ++ ++ rc = ssam_bdev_resize(bdev, req.new_size_in_mb); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_scsi_bdev_resize(&req); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_scsi_bdev_resize(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ 
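The handlers in this file all follow the same JSON-RPC plumbing: a parameter struct, an spdk_json_object_decoder table, a handler that decodes and validates the request, and an SPDK_RPC_REGISTER entry. The sketch below distills that pattern into one minimal, self-contained method; the method name "example_method", the struct rpc_example_params and the decoder table are illustrative only and are not part of the patch.

#include "spdk/rpc.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"

/* Hypothetical parameter block for the illustrative "example_method" RPC. */
struct rpc_example_params {
	uint32_t function_id;
};

static const struct spdk_json_object_decoder g_rpc_example_decoder[] = {
	{"function_id", offsetof(struct rpc_example_params, function_id), spdk_json_decode_uint32},
};

static void
rpc_example_method(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params)
{
	struct rpc_example_params req = {0};

	/* Decode and validate the JSON params, mirroring the handlers above. */
	if (params == NULL ||
	    spdk_json_decode_object(params, g_rpc_example_decoder,
				    SPDK_COUNTOF(g_rpc_example_decoder), &req) != 0) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
						 spdk_strerror(EINVAL));
		return;
	}

	/* ... perform the operation keyed by req.function_id ... */
	spdk_jsonrpc_send_bool_response(request, true);
}
SPDK_RPC_REGISTER("example_method", rpc_example_method, SPDK_RPC_RUNTIME)

Against a running target, such a method is driven by a standard JSON-RPC 2.0 request; for instance, the bdev_resize method registered above would be invoked with a request of the shape {"jsonrpc": "2.0", "id": 1, "method": "bdev_resize", "params": {"function_id": 10, "new_size_in_mb": 1024}}, where the id and the parameter values are illustrative.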
++SPDK_RPC_REGISTER("scsi_bdev_resize", rpc_ssam_scsi_bdev_resize, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_aio_resize { ++ char *name; ++ uint64_t new_size_in_mb; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_bdev_aio_resize[] = { ++ {"name", offsetof(struct rpc_bdev_aio_resize, name), spdk_json_decode_string}, ++ {"new_size_in_mb", offsetof(struct rpc_bdev_aio_resize, new_size_in_mb), spdk_json_decode_uint64}, ++}; ++ ++static void ++free_rpc_ssam_bdev_aio_resize(struct rpc_bdev_aio_resize *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_bdev_aio_resize(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_aio_resize req = {0}; ++ struct spdk_bdev *bdev = NULL; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_bdev_resize params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_bdev_aio_resize, ++ SPDK_COUNTOF(g_rpc_bdev_aio_resize), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ if (req.name) { ++ bdev = spdk_bdev_get_by_name(req.name); ++ if (bdev == NULL) { ++ SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ } ++ ++ rc = ssam_bdev_resize(bdev, req.new_size_in_mb); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_bdev_aio_resize(&req); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_ssam_bdev_aio_resize(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("bdev_aio_resize", rpc_ssam_bdev_aio_resize, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_os_ready(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ int rc = 0; ++ int fd; ++ char *enable = "1"; ++ ++ fd = open(SSAM_STORAGE_READY_FILE, O_RDWR); ++ if (fd < 0) { ++ SPDK_ERRLOG("Open storage ready file failed.\n"); ++ rc = EPERM; ++ goto invalid; ++ } ++ ++ rc = write(fd, enable, strlen(enable)); ++ if (rc < 0) { ++ SPDK_ERRLOG("Write storage ready file failed.\n"); ++ close(fd); ++ goto invalid; ++ } ++ ++ close(fd); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("os_ready", rpc_os_ready, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_set_os_status(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ int rc = 0; ++ int fd; ++ char *disable = "0"; ++ ++ fd = open(SSAM_STORAGE_READY_FILE, O_RDWR); ++ if (fd < 0) { ++ SPDK_ERRLOG("Open storage ready file failed.\n"); ++ rc = -EPERM; ++ goto invalid; ++ } ++ ++ rc = write(fd, disable, strlen(disable)); ++ if (rc < 0) { ++ SPDK_ERRLOG("Write storage ready file failed.\n"); ++ close(fd); ++ goto invalid; ++ } ++ ++ close(fd); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("os_not_ready", rpc_set_os_status, SPDK_RPC_RUNTIME) ++ ++struct rpc_create_scsi_controller { ++ char *dbdf; ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_create_scsi_controller[] = { ++ {"dbdf", offsetof(struct rpc_create_scsi_controller, dbdf), spdk_json_decode_string}, ++ {"name", offsetof(struct rpc_create_scsi_controller, 
name), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_ssam_create_scsi_controller(struct rpc_create_scsi_controller *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++ if (req->dbdf != NULL) { ++ free(req->dbdf); ++ req->dbdf = NULL; ++ } ++} ++ ++static int ++spdk_ssam_rpc_get_gfunc_id_by_dbdf(char *dbdf, uint16_t *gfunc_id) ++{ ++ int rc; ++ uint32_t dbdf_num; ++ ++ rc = ssam_dbdf_str2num(dbdf, &dbdf_num); ++ if (rc != 0) { ++ SPDK_ERRLOG("convert dbdf(%s) to num failed, rc: %d.\n", dbdf, rc); ++ return -EINVAL; ++ } ++ ++ rc = ssam_get_funcid_by_dbdf(dbdf_num, gfunc_id); ++ if (rc != 0) { ++ SPDK_ERRLOG("find gfuncid by dbdf(%u) failed, rc: %d.\n", dbdf_num, rc); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++spdk_ssam_rpc_para_check_name(char *name) ++{ ++ uint16_t gfunc_id = spdk_ssam_get_gfunc_id_by_name(name); ++ if (gfunc_id == SPDK_INVALID_GFUNC_ID) { ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++static void ++rpc_ssam_create_scsi_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_ssam_session_reg_info info = {0}; ++ struct rpc_create_scsi_controller req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ uint16_t queues; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_create_scsi_controller params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_create_scsi_controller, ++ SPDK_COUNTOF(g_rpc_create_scsi_controller), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_rpc_para_check_name(req.name); ++ if (rc != 0) { ++ SPDK_ERRLOG("controller name(%s) is existed\n", req.name); ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_rpc_get_gfunc_id_by_dbdf(req.dbdf, &gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_SCSI); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ queues = spdk_ssam_get_queues(); ++ if (queues > SPDK_SSAM_MAX_VQUEUES) { ++ SPDK_ERRLOG("Queue number out of range, need less or equal than %u, actually %u.\n", ++ SPDK_SSAM_MAX_VQUEUES, queues); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rpc_init_session_reg_info(&info, queues, gfunc_id, request); ++ ++ info.name = strdup(req.name); ++ if (info.name == NULL) { ++ SPDK_ERRLOG("Failed to create name(%s) for ssam session reg info.\n", req.name); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ info.dbdf = strdup(req.dbdf); ++ if (info.dbdf == NULL) { ++ SPDK_ERRLOG("Failed to create dbdf(%s) for ssam session reg info.\n", req.dbdf); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_scsi_construct(&info); ++ if (rc < 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_create_scsi_controller(&req); ++ free_rpc_ssam_session_reg_info(&info); ++ return; ++ ++invalid: ++ free_rpc_ssam_create_scsi_controller(&req); ++ free_rpc_ssam_session_reg_info(&info); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++ return; ++} ++ ++SPDK_RPC_REGISTER("create_scsi_controller", rpc_ssam_create_scsi_controller, SPDK_RPC_RUNTIME) ++ ++struct rpc_scsi_controller_add_target { ++ char *name; ++ int32_t scsi_tgt_num; ++ char *bdev_name; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_scsi_controller_add_target[] = { ++ {"name", offsetof(struct rpc_scsi_controller_add_target, name), spdk_json_decode_string}, ++ {"scsi_tgt_num", offsetof(struct 
rpc_scsi_controller_add_target, scsi_tgt_num), spdk_json_decode_uint32}, ++ {"bdev_name", offsetof(struct rpc_scsi_controller_add_target, bdev_name), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_ssam_scsi_ctrlr_add_target(struct rpc_scsi_controller_add_target *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++ if (req->bdev_name != NULL) { ++ free(req->bdev_name); ++ req->bdev_name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_scsi_controller_add_target(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_scsi_controller_add_target req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ struct spdk_ssam_session *smsession; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_scsi_controller_add_target params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_scsi_controller_add_target, ++ SPDK_COUNTOF(g_rpc_scsi_controller_add_target), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_SCSI); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ SPDK_ERRLOG("Before adding a SCSI target, there should be a SCSI controller.\n"); ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = rpc_ssam_session_reg_response_cb(smsession, request); ++ if (rc != 0) { ++ SPDK_ERRLOG("The controller is being operated.\n"); ++ rc = -EALREADY; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_scsi_dev_add_tgt(smsession, req.scsi_tgt_num, req.bdev_name); ++ if (rc != 0) { ++ /* ++ * Unregitster response cb to avoid use request in the cb function, ++ * because if error happend, request will be responsed immediately ++ */ ++ ssam_session_unreg_response_cb(smsession); ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ spdk_ssam_unlock(); ++ ++ free_rpc_ssam_scsi_ctrlr_add_target(&req); ++ return; ++ ++invalid: ++ free_rpc_ssam_scsi_ctrlr_add_target(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("scsi_controller_add_target", rpc_ssam_scsi_controller_add_target, SPDK_RPC_RUNTIME) ++ ++struct rpc_scsi_controller_remove_target { ++ char *name; ++ int32_t scsi_tgt_num; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_scsi_controller_remove_target[] = { ++ {"name", offsetof(struct rpc_scsi_controller_remove_target, name), spdk_json_decode_string}, ++ {"scsi_tgt_num", offsetof(struct rpc_scsi_controller_remove_target, scsi_tgt_num), spdk_json_decode_int32}, ++}; ++ ++static void ++free_rpc_scsi_controller_remove_target(struct rpc_scsi_controller_remove_target *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static void ++rpc_ssam_scsi_controller_remove_target(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_scsi_controller_remove_target req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_scsi_controller_remove_target params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_scsi_controller_remove_target, ++ 
SPDK_COUNTOF(g_rpc_scsi_controller_remove_target), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check_type(gfunc_id, SSAM_DEVICE_VIRTIO_SCSI); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ spdk_ssam_lock(); ++ ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ rc = -ENODEV; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = rpc_ssam_session_reg_response_cb(smsession, request); ++ if (rc != 0) { ++ SPDK_ERRLOG("The controller is being operated.\n"); ++ rc = -EALREADY; ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ ++ rc = spdk_ssam_scsi_dev_remove_tgt(smsession, req.scsi_tgt_num, ++ rpc_ssam_send_response_cb, request); ++ if (rc != 0) { ++ /* ++ * Unregitster response cb to avoid use request in the cb function, ++ * because if error happend, request will be responsed immediately ++ */ ++ ssam_session_unreg_response_cb(smsession); ++ spdk_ssam_unlock(); ++ goto invalid; ++ } ++ spdk_ssam_unlock(); ++ free_rpc_scsi_controller_remove_target(&req); ++ return; ++ ++invalid: ++ free_rpc_scsi_controller_remove_target(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("scsi_controller_remove_target", rpc_ssam_scsi_controller_remove_target, SPDK_RPC_RUNTIME) ++ ++struct rpc_ssam_scsi_device_iostat { ++ char *name; ++ int32_t scsi_tgt_num; ++}; ++ ++static const struct spdk_json_object_decoder g_rpc_ssam_scsi_device_iostat[] = { ++ {"name", offsetof(struct rpc_ssam_scsi_device_iostat, name), spdk_json_decode_string}, ++ {"scsi_tgt_num", offsetof(struct rpc_ssam_scsi_device_iostat, scsi_tgt_num), spdk_json_decode_int32}, ++}; ++ ++static void ++free_rpc_ssam_scsi_device_iostat(struct rpc_ssam_scsi_device_iostat *req) ++{ ++ if (req->name != NULL) { ++ free(req->name); ++ req->name = NULL; ++ } ++} ++ ++static int ++rpc_ssam_show_scsi_iostat(struct spdk_jsonrpc_request *request, uint16_t gfunc_id, uint16_t scsi_tgt_num) ++{ ++ struct spdk_json_write_ctx *w = NULL; ++ struct spdk_ssam_session *smsession = NULL; ++ ++ spdk_ssam_lock(); ++ smsession = spdk_ssam_session_find(gfunc_id); ++ if (smsession == NULL) { ++ spdk_ssam_unlock(); ++ return -ENODEV; ++ } else if (smsession->backend->type != VIRTIO_TYPE_SCSI) { ++ spdk_ssam_unlock(); ++ return -EINVAL; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ ++ if (smsession->backend->show_iostat_json != NULL) { ++ smsession->backend->show_iostat_json(smsession, scsi_tgt_num, w); ++ } ++ ++ spdk_ssam_unlock(); ++ ++ spdk_jsonrpc_end_result(request, w); ++ return 0; ++} ++ ++static void ++rpc_ssam_scsi_device_iostat(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_ssam_scsi_device_iostat req = {0}; ++ uint16_t gfunc_id = SPDK_INVALID_GFUNC_ID; ++ int rc; ++ ++ if (params == NULL) { ++ SPDK_ERRLOG("rpc_ssam_scsi_device_iostat params null\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_json_decode_object(params, g_rpc_ssam_scsi_device_iostat, ++ SPDK_COUNTOF(g_rpc_ssam_scsi_device_iostat), &req); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ if (req.scsi_tgt_num < 0 || req.scsi_tgt_num > SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("scsi_tgt_num is out of range\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ gfunc_id = 
spdk_ssam_get_gfunc_id_by_name(req.name); ++ rc = spdk_ssam_rpc_para_check(gfunc_id); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ rc = rpc_ssam_show_scsi_iostat(request, gfunc_id, req.scsi_tgt_num); ++ if (rc != 0) { ++ goto invalid; ++ } ++ ++ free_rpc_ssam_scsi_device_iostat(&req); ++ return; ++ ++invalid: ++ free_rpc_ssam_scsi_device_iostat(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ return; ++} ++SPDK_RPC_REGISTER("scsi_device_iostat", rpc_ssam_scsi_device_iostat, SPDK_RPC_RUNTIME) ++ ++struct rpc_limit_log_interval { ++ int interval; ++}; ++ ++static void ++rpc_ssam_device_pcie_list(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_json_write_ctx *w = NULL; ++ int rc; ++ uint32_t size = ssam_get_device_pcie_list_size(); ++ if (size == 0) { ++ rc = ssam_init_device_pcie_list(); ++ if (rc != 0) { ++ SPDK_ERRLOG("init device_pcie_list failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ return; ++ } ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ ++ ssam_dump_device_pcie_list(w); ++ ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ return; ++} ++ ++SPDK_RPC_REGISTER("device_pcie_list", rpc_ssam_device_pcie_list, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_ssam_config_remove(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ int rc; ++ ++ char *json_config_file = ssam_rc_get_recover_json_file_path(); ++ rc = access(json_config_file, F_OK); ++ if (rc != 0) { ++ SPDK_ERRLOG("Json config file not found.\n"); ++ goto invalid; ++ } ++ ++ rc = unlink(json_config_file); ++ if (rc != 0) { ++ SPDK_ERRLOG("Json config file remove failed.\n"); ++ goto invalid; ++ } ++ ++ SPDK_NOTICELOG("Json config file remove successfully.\n"); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++ ++SPDK_RPC_REGISTER("config_remove", rpc_ssam_config_remove, SPDK_RPC_RUNTIME) ++ ++SPDK_LOG_REGISTER_COMPONENT(ssam_rpc) +diff --git a/lib/ssam/ssam_scsi.c b/lib/ssam/ssam_scsi.c +new file mode 100644 +index 0000000..d6913c0 +--- /dev/null ++++ b/lib/ssam/ssam_scsi.c +@@ -0,0 +1,2418 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "linux/virtio_scsi.h" ++ ++#include ++#include ++ ++ ++#include "spdk/likely.h" ++#include "spdk/scsi_spec.h" ++#include "spdk/env.h" ++#include "spdk/scsi.h" ++#include "spdk/ssam.h" ++#include "spdk/string.h" ++#include "spdk/bdev_module.h" ++ ++#include "ssam_internal.h" ++ ++#define SESSION_STOP_POLLER_PERIOD 1000 ++#define IOV_HEADER_TAIL_NUM 2 ++#define PAYLOAD_SIZE_MAX (2048U * 2048) ++#define VMIO_TYPE_VIRTIO_SCSI_CTRL 4 ++#define SSAM_SPDK_SCSI_DEV_MAX_LUN 1 ++#define SSAM_SENSE_DATE_LEN 32 ++#define PERF_STAT ++ ++/* Features supported by virtio-scsi lib. */ ++#define SPDK_SSAM_SCSI_FEATURES (SPDK_SSAM_FEATURES | \ ++ (1ULL << VIRTIO_SCSI_F_INOUT) | \ ++ (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ ++ (1ULL << VIRTIO_SCSI_F_CHANGE) | \ ++ (1ULL << VIRTIO_SCSI_F_T10_PI)) ++ ++/* Features that are specified in VIRTIO SCSI but currently not supported: ++ * - Live migration not supported yet ++ * - T10 PI ++ */ ++#define SPDK_SSAM_SCSI_DISABLED_FEATURES (SPDK_SSAM_DISABLED_FEATURES | \ ++ (1ULL << VIRTIO_SCSI_F_T10_PI)) ++ ++/* ssam-user-scsi support protocol features */ ++#define SPDK_SSAM_SCSI_PROTOCOL_FEATURES (1ULL << SSAM_USER_PROTOCOL_F_INFLIGHT_SHMFD) ++ ++enum spdk_scsi_dev_ssam_status { ++ /* Target ID is empty. */ ++ SSAM_SCSI_DEV_EMPTY, ++ ++ /* Target is still being added. */ ++ SSAM_SCSI_DEV_ADDING, ++ ++ /* Target ID occupied. */ ++ SSAM_SCSI_DEV_PRESENT, ++ ++ /* Target ID is occupied but removal is in progress. */ ++ SSAM_SCSI_DEV_REMOVING, ++ ++ /* In session - device (SCSI target) seen but removed. 
*/ ++ SSAM_SCSI_DEV_REMOVED, ++}; ++ ++struct ssam_scsi_stat { ++ uint64_t count; ++ uint64_t total_tsc; // pre_dma <- -> post_return ++ uint64_t dma_tsc; // pre_dma <- -> post_dma ++ uint64_t bdev_tsc; // pre_bdev <- -> post_bdev ++ uint64_t bdev_submit_tsc; // <- spdk_bdev_xxx -> ++ uint64_t complete_tsc; // pre_return <- -> post_return ++ uint64_t internel_tsc; // total_tsc - dma_tsc - bdev_tsc - complete_tsc ++ ++ uint64_t complete_read_ios; // Number of successfully completed read requests ++ uint64_t err_read_ios; // Number of failed completed read requests ++ uint64_t complete_write_ios; // Number of successfully completed write requests ++ uint64_t err_write_ios; // Number of failed completed write requests ++ uint64_t flush_ios; // Total number of flush requests ++ uint64_t complete_flush_ios; // Number of successfully completed flush requests ++ uint64_t err_flush_ios; // Number of failed completed flush requests ++ uint64_t fatal_ios; ++ uint64_t io_retry; ++ ++ uint64_t start_count; ++ uint64_t dma_count; ++ uint64_t dma_complete_count; ++ uint64_t bdev_count; ++ uint64_t bdev_complete_count; ++}; ++ ++struct spdk_scsi_dev_io_state { ++ struct spdk_bdev_io_stat stat; ++ uint64_t submit_tsc; ++ struct ssam_scsi_stat scsi_stat; ++}; ++ ++/** Context for a SCSI target in a ssam device */ ++struct spdk_scsi_dev_ssam_state { ++ struct spdk_scsi_dev_io_state *io_stat[SSAM_SPDK_SCSI_DEV_MAX_LUN]; ++ struct spdk_scsi_dev *dev; ++ ++ enum spdk_scsi_dev_ssam_status status; ++ ++ uint64_t flight_io; ++}; ++ ++struct ssam_scsi_tgt_hotplug_ctx { ++ unsigned scsi_tgt_num; ++}; ++ ++struct spdk_ssam_scsi_session { ++ struct spdk_ssam_session smsession; ++ int ref; ++ bool registered; ++ struct spdk_poller *stop_poller; ++ struct spdk_scsi_dev_ssam_state scsi_dev_state[SPDK_SSAM_SCSI_CTRLR_MAX_DEVS]; ++ char *dbdf; ++}; ++ ++struct ssam_scsi_session_ctx { ++ struct spdk_ssam_scsi_session *ssmsession; ++ void **user_ctx; ++}; ++ ++struct ssam_scsi_task_stat { ++ uint64_t start_tsc; ++ uint64_t dma_start_tsc; ++ uint64_t dma_end_tsc; ++ uint64_t bdev_start_tsc; ++ uint64_t bdev_func_tsc; ++ uint64_t bdev_end_tsc; ++ uint64_t complete_start_tsc; ++ uint64_t complete_end_tsc; ++}; ++ ++struct spdk_ssam_scsi_task { ++ struct spdk_scsi_task scsi_task; ++ /* Returned status of I/O processing, it can be VIRTIO_BLK_S_OK, ++ * VIRTIO_BLK_S_IOERR or VIRTIO_BLK_S_UNSUPP ++ */ ++ union { ++ struct virtio_scsi_cmd_resp resp; ++ struct virtio_scsi_ctrl_tmf_resp tmf_resp; ++ }; ++ ++ /* Number of bytes processed successfully */ ++ uint32_t used_len; ++ ++ /* Records the amount of valid data in the struct iovec iovs array. */ ++ uint32_t iovcnt; ++ struct ssam_iovec iovs; ++ ++ /* If set, the task is currently used for I/O processing. 
*/ ++ bool used; ++ ++ /* For bdev io wait */ ++ struct spdk_ssam_scsi_session *ssmsession; ++ struct spdk_ssam_session_io_wait session_io_wait; ++ ++ /* ssam request data */ ++ struct ssam_request *io_req; ++ ++ uint16_t vq_idx; ++ uint16_t task_idx; ++ int32_t tgt_id; ++ struct spdk_ssam_session *smsession; ++ struct spdk_scsi_dev *scsi_dev; ++ struct ssam_scsi_task_stat task_stat; ++}; ++ ++struct ssam_add_tgt_ev_ctx { ++ char *bdev_name; ++ int tgt_num; ++}; ++ ++static void ssam_scsi_request_worker(struct spdk_ssam_session *smsession, void *arg); ++static void ssam_scsi_destroy_bdev_device(struct spdk_ssam_session *smsession, void *args); ++static void ssam_scsi_response_worker(struct spdk_ssam_session *smsession, void *arg); ++static int ssam_scsi_remove_session(struct spdk_ssam_session *smsession); ++static void ssam_scsi_remove_self(struct spdk_ssam_session *smsession); ++static void ssam_scsi_dump_info_json(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w); ++static void ssam_scsi_write_config_json(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w); ++static int ssam_scsi_get_config(struct spdk_ssam_session *smsession, uint8_t *config, ++ uint32_t len, uint16_t queues); ++static void ssam_scsi_show_iostat_json(struct spdk_ssam_session *smsession, uint32_t id, ++ struct spdk_json_write_ctx *w); ++static void ssam_scsi_clear_iostat_json(struct spdk_ssam_session *smsession); ++static void ssam_scsi_print_stuck_io_info(struct spdk_ssam_session *smsession); ++static void ssam_scsi_req_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, uint8_t status); ++static struct spdk_bdev *ssam_scsi_get_bdev(struct spdk_ssam_session *smsession, uint32_t id); ++ ++static void ssam_free_scsi_task_pool(struct spdk_ssam_scsi_session *ssmsession); ++static int spdk_ssam_scsi_dev_hot_remove_tgt(struct spdk_ssam_session *smsession, void **_ctx); ++static void ssam_scsi_process_io_task(struct spdk_ssam_session *smsession, struct spdk_ssam_scsi_task *task); ++static int ssam_scsi_task_iovs_memory_get(struct spdk_ssam_scsi_task *task, uint32_t payload_size); ++static void ssam_scsi_submit_io_task(struct spdk_ssam_scsi_task *task); ++static void ssam_scsi_destruct_tgt(struct spdk_ssam_scsi_session *ssmsession, int scsi_tgt_num); ++ ++static const struct spdk_ssam_session_backend g_ssam_scsi_session_backend = { ++ .type = VIRTIO_TYPE_SCSI, ++ .request_worker = ssam_scsi_request_worker, ++ .destroy_bdev_device = ssam_scsi_destroy_bdev_device, ++ .response_worker = ssam_scsi_response_worker, ++ .remove_session = ssam_scsi_remove_session, ++ .remove_self = ssam_scsi_remove_self, ++ .print_stuck_io_info = ssam_scsi_print_stuck_io_info, ++ .dump_info_json = ssam_scsi_dump_info_json, ++ .write_config_json = ssam_scsi_write_config_json, ++ .ssam_get_config = ssam_scsi_get_config, ++ .show_iostat_json = ssam_scsi_show_iostat_json, ++ .clear_iostat_json = ssam_scsi_clear_iostat_json, ++ .get_bdev = ssam_scsi_get_bdev, ++}; ++ ++static void ++ssam_scsi_task_stat_tick(uint64_t *tsc) ++{ ++#ifdef PERF_STAT ++ *tsc = spdk_get_ticks(); ++#endif ++ return; ++} ++ ++static void ++ssam_scsi_stat_statistics(struct spdk_ssam_scsi_task *task) ++{ ++#ifdef PERF_STAT ++ if (task->scsi_task.lun == NULL || task->io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL || ++ task->task_stat.bdev_func_tsc == 0 || task->task_stat.bdev_end_tsc == 0) { ++ return; ++ } ++ ++ int32_t lun_id = spdk_scsi_lun_get_id(task->scsi_task.lun); ++ struct ssam_scsi_stat *scsi_stat = 
&task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[lun_id]->scsi_stat; ++ ++ uint64_t dma_tsc = task->task_stat.dma_end_tsc - task->task_stat.dma_start_tsc; ++ uint64_t bdev_tsc = task->task_stat.bdev_end_tsc - task->task_stat.bdev_start_tsc; ++ uint64_t bdev_submit_tsc = task->task_stat.bdev_func_tsc - task->task_stat.bdev_start_tsc; ++ uint64_t complete_tsc = task->task_stat.complete_end_tsc - task->task_stat.complete_start_tsc; ++ uint64_t total_tsc = task->task_stat.complete_end_tsc - task->task_stat.start_tsc; ++ ++ struct ssam_io_message *io_cmd = &task->io_req->req.cmd; ++ if (io_cmd->writable) { /* read io */ ++ if (task->scsi_task.status == SPDK_SCSI_STATUS_GOOD) { ++ scsi_stat->complete_read_ios++; ++ } else { ++ scsi_stat->err_read_ios++; ++ } ++ } else { ++ if (task->scsi_task.status == SPDK_SCSI_STATUS_GOOD) { ++ scsi_stat->complete_write_ios++; ++ } else { ++ scsi_stat->err_write_ios++; ++ } ++ } ++ ++ scsi_stat->dma_tsc += dma_tsc; ++ scsi_stat->bdev_tsc += bdev_tsc; ++ scsi_stat->bdev_submit_tsc += bdev_submit_tsc; ++ scsi_stat->complete_tsc += complete_tsc; ++ scsi_stat->total_tsc += total_tsc; ++ scsi_stat->internel_tsc += total_tsc - complete_tsc - bdev_tsc - dma_tsc; ++ scsi_stat->count += 1; ++#endif ++} ++ ++static uint32_t ++ssam_scsi_tgtid_to_lunid(uint32_t tgt_id) ++{ ++ return (((tgt_id) << 0x8) | SSAM_VIRTIO_SCSI_LUN_ID); ++} ++ ++static int ++ssam_scsi_get_config(struct spdk_ssam_session *smsession, uint8_t *config, ++ uint32_t len, uint16_t queues) ++{ ++ struct virtio_scsi_config scsi_cfg; ++ scsi_cfg.num_queues = 0x80; ++ scsi_cfg.seg_max = 0x6f; ++ scsi_cfg.max_sectors = 0x1ff; ++ scsi_cfg.cmd_per_lun = 0x80; ++ scsi_cfg.event_info_size = 0; ++ scsi_cfg.sense_size = 0x60; ++ scsi_cfg.cdb_size = 0x20; ++ scsi_cfg.max_channel = 0; ++ scsi_cfg.max_target = SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; ++ scsi_cfg.max_lun = 0xff; ++ ++ memcpy(config, (void*)&scsi_cfg, sizeof(struct virtio_scsi_config)); ++ return 0; ++} ++ ++static int ++ssam_scsi_send_event(struct spdk_ssam_session *smsession, unsigned scsi_dev_num, ++ uint32_t event, uint32_t reason) ++{ ++ struct virtio_scsi_event vscsi_event = {0}; ++ int ret; ++ ++ vscsi_event.event = event; ++ vscsi_event.reason = reason; ++ ++ vscsi_event.lun[0] = 1; ++ vscsi_event.lun[0x1] = (uint8_t)scsi_dev_num; ++ vscsi_event.lun[0x2] = 0; ++ vscsi_event.lun[0x3] = 0; ++ memset(&vscsi_event.lun[0x4], 0, 0x4); ++ ++ ret = ssam_send_action(smsession->gfunc_id, SSAM_FUNCTION_ACTION_SCSI_EVENT, ++ (const void*)&vscsi_event, sizeof(struct virtio_scsi_event)); ++ if (ret < 0) { ++ SPDK_ERRLOG("%s: SCSI target %d send event %u(reason %u) failed: %s.\n", ++ smsession->name, scsi_dev_num, event, reason, strerror(-ret)); ++ } ++ return ret; ++} ++ ++static void ++ssam_scsi_stop_cpl_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ spdk_ssam_session_rsp_fn rsp_fn = smsession->rsp_fn; ++ void *rsp_ctx = smsession->rsp_ctx; ++ ++ SPDK_NOTICELOG("SCSI controller %s deleted\n", smsession->name); ++ ++ if (smsession->name != NULL) { ++ free(smsession->name); ++ smsession->name = NULL; ++ } ++ ++ if (ssmsession->dbdf != NULL) { ++ free(ssmsession->dbdf); ++ ssmsession->dbdf = NULL; ++ } ++ ++ ssam_set_session_be_freed(ctx); ++ memset(ssmsession, 0, sizeof(*ssmsession)); ++ free(ssmsession); ++ ++ if (rsp_fn != NULL) { ++ rsp_fn(rsp_ctx, 0); ++ rsp_fn = NULL; ++ } ++} ++ ++static void ++ssam_scsi_destroy_session(struct ssam_scsi_session_ctx *ctx) 
++{ ++ struct spdk_ssam_session *smsession = &ctx->ssmsession->smsession; ++ struct spdk_ssam_scsi_session *ssmsession = ctx->ssmsession; ++ ++ if (smsession->task_cnt > 0) { ++ return; ++ } ++ ++ if (ssmsession->ref > 0) { ++ return; ++ } ++ ++ ssam_session_destroy(smsession); ++ ++ ssmsession->registered = false; ++ spdk_poller_unregister(&ssmsession->stop_poller); ++ ssam_free_scsi_task_pool(ssmsession); ++ ssam_session_stop_done(&ssmsession->smsession, 0, ctx->user_ctx); ++ free(ctx); ++ ++ return; ++} ++ ++static int ++ssam_scsi_destroy_session_poller_cb(void *arg) ++{ ++ struct ssam_scsi_session_ctx *ctx = arg; ++ ++ if (spdk_ssam_trylock() != 0) { ++ return SPDK_POLLER_BUSY; ++ } ++ ++ ssam_scsi_destroy_session(ctx); ++ ++ spdk_ssam_unlock(); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++ssam_scsi_stop_cb(struct spdk_ssam_session *smsession, void **ctx) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ struct ssam_scsi_session_ctx *_ctx = ++ (struct ssam_scsi_session_ctx *)calloc(1, sizeof(struct ssam_scsi_session_ctx)); ++ ++ if (_ctx == NULL) { ++ SPDK_ERRLOG("%s: calloc scsi session ctx error.\n", smsession->name); ++ return -ENOMEM; ++ } ++ ++ _ctx->ssmsession = ssmsession; ++ _ctx->user_ctx = ctx; ++ ++ ssmsession->stop_poller = SPDK_POLLER_REGISTER(ssam_scsi_destroy_session_poller_cb, ++ _ctx, SESSION_STOP_POLLER_PERIOD); ++ if (ssmsession->stop_poller == NULL) { ++ SPDK_ERRLOG("%s: ssam_destroy_session_poller_cb start failed.\n", smsession->name); ++ ssam_session_stop_done(smsession, -EBUSY, ctx); ++ free(_ctx); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ ++static int ++ssam_scsi_stop(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = true, ++ .need_rsp = true, ++ }; ++ return ssam_send_event_to_session(smsession, ssam_scsi_stop_cb, ssam_scsi_stop_cpl_cb, send_event_flag, NULL); ++} ++ ++// sync interface for hot-remove ++static void ++ssam_scsi_remove_self(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ // no need error ++ if (ssmsession->ref > 0) { ++ return; // still have targets ++ } ++ ++ SPDK_NOTICELOG("%s: is being freed\n", smsession->name); ++ ++ ssmsession->registered = false; ++ ssam_free_scsi_task_pool(ssmsession); ++ ++ ssam_sessions_remove(smsession->smdev->smsessions, smsession); ++ ++ if (smsession->smdev->active_session_num > 0) { ++ smsession->smdev->active_session_num--; ++ } ++ smsession->smdev = NULL; ++ // free smsession ++ free(smsession->name); ++ free(ssmsession->dbdf); ++ free(ssmsession); ++} ++ ++// async interface ++static int ++ssam_scsi_remove_session(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ int ret; ++ ++ if (smsession->registered && ssmsession->ref != 0) { ++ SPDK_ERRLOG("%s: SCSI target %d is still present.\n", smsession->name, ssmsession->ref); ++ return -EBUSY; ++ } ++ ++ ret = ssam_scsi_stop(smsession); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct spdk_scsi_dev * ++spdk_ssam_scsi_dev_get_tgt(struct spdk_ssam_scsi_session *ssmsession, uint8_t num) ++{ ++ if (ssmsession == NULL) { ++ SPDK_ERRLOG("ssmsession is null.\n"); ++ return NULL; ++ } ++ if (num >= SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("%s: tgt num %u over %u.\n", ssmsession->smsession.name, num, SPDK_SSAM_SCSI_CTRLR_MAX_DEVS); ++ return NULL; ++ } 
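++ /* Only targets in the SSAM_SCSI_DEV_PRESENT state are handed back to callers; targets that are still being added or are in the middle of removal are treated as absent. */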
++ if (ssmsession->scsi_dev_state[num].status != SSAM_SCSI_DEV_PRESENT) { ++ return NULL; ++ } ++ ++ if (ssmsession->scsi_dev_state[num].dev == NULL) { ++ SPDK_ERRLOG("%s: no tgt num %u device.\n", ssmsession->smsession.name, num); ++ return NULL; ++ } ++ return ssmsession->scsi_dev_state[num].dev; ++} ++ ++static void ++ssam_scsi_dump_device_info(struct spdk_ssam_session *smsession, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ struct spdk_scsi_dev *sdev; ++ struct spdk_scsi_lun *lun; ++ int32_t tgt_id; ++ ++ spdk_json_write_named_array_begin(w, "scsi_targets"); ++ for (tgt_id = 0; tgt_id < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; tgt_id++) { ++ sdev = spdk_ssam_scsi_dev_get_tgt(ssmsession, tgt_id); ++ if (!sdev) { ++ continue; ++ } ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_uint32(w, "scsi_target_num", tgt_id); ++ spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev)); ++ spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev)); ++ lun = spdk_scsi_dev_get_lun(sdev, 0); ++ if (!lun) { ++ continue; ++ } ++ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); ++ ++ spdk_json_write_object_end(w); ++ } ++ ++ spdk_json_write_array_end(w); ++} ++ ++static void ++ssam_scsi_dump_info_json(struct spdk_ssam_session *smsession, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "dbdf", ssmsession->dbdf); ++ spdk_json_write_named_string(w, "name", spdk_ssam_session_get_name(smsession)); ++ spdk_json_write_named_uint32(w, "function_id", (uint32_t)smsession->gfunc_id); ++ spdk_json_write_named_uint32(w, "queues", (uint32_t)smsession->max_queues); ++ spdk_json_write_named_string(w, "ctrlr", spdk_ssam_dev_get_name(smsession->smdev)); ++ spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", ++ spdk_cpuset_fmt(spdk_thread_get_cpumask(smsession->smdev->thread))); ++ ++ ssam_scsi_dump_device_info(smsession, w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_scsi_write_config_json(struct spdk_ssam_session *smsession, ++ struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ struct spdk_scsi_dev *sdev; ++ struct spdk_scsi_lun *lun; ++ int32_t tgt_id; ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "create_scsi_controller"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "dbdf", ssmsession->dbdf); ++ spdk_json_write_named_string(w, "name", smsession->name); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ ++ for (tgt_id = 0; tgt_id < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; tgt_id++) { ++ sdev = spdk_ssam_scsi_dev_get_tgt(ssmsession, tgt_id); ++ if (!sdev) { ++ continue; ++ } ++ ++ lun = spdk_scsi_dev_get_lun(sdev, 0); ++ if (!lun) { ++ SPDK_ERRLOG("%s: no lun, continue.\n", smsession->name); ++ continue; ++ } ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "scsi_controller_add_target"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", smsession->name); ++ spdk_json_write_named_uint32(w, "scsi_tgt_num", tgt_id); ++ ++ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); ++ spdk_json_write_object_end(w); ++ ++ 
spdk_json_write_object_end(w); ++ } ++} ++ ++static void ++ssam_scsi_show_tgt_iostat_json(struct spdk_ssam_scsi_session *ssmsession, ++ struct spdk_json_write_ctx *w, int32_t tgt_id, struct spdk_scsi_dev *sdev) ++{ ++ struct spdk_scsi_dev_io_state *io_stat; ++ struct spdk_scsi_lun *lun; ++ struct ssam_scsi_stat scsi_stat; ++ uint64_t ticks_hz = spdk_get_ticks_hz(); ++ uint64_t count; ++ uint64_t poll_count; ++ ++ lun = spdk_scsi_dev_get_lun(sdev, 0); ++ if (lun == NULL) { ++ return; ++ } ++ ++ io_stat = ssmsession->scsi_dev_state[tgt_id].io_stat[0]; ++ if (io_stat == NULL) { ++ SPDK_ERRLOG("No scsi iostat, tgt_id %d\n", tgt_id); ++ return; ++ } ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_uint32(w, "scsi_dev_num", tgt_id); ++ spdk_json_write_named_uint32(w, "id", spdk_scsi_dev_get_id(sdev)); ++ spdk_json_write_named_string(w, "target_name", spdk_scsi_dev_get_name(sdev)); ++ ++ memcpy(&scsi_stat, &io_stat->scsi_stat, sizeof(struct ssam_scsi_stat)); ++ ++ spdk_json_write_named_int32(w, "id", spdk_scsi_lun_get_id(lun)); ++ spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); ++ spdk_json_write_named_uint64(w, "bytes_read", io_stat->stat.bytes_read); ++ spdk_json_write_named_uint64(w, "num_read_ops", io_stat->stat.num_read_ops); ++ spdk_json_write_named_uint64(w, "bytes_written", io_stat->stat.bytes_written); ++ spdk_json_write_named_uint64(w, "num_write_ops", io_stat->stat.num_write_ops); ++ spdk_json_write_named_uint64(w, "read_latency_ticks", io_stat->stat.read_latency_ticks); ++ spdk_json_write_named_uint64(w, "write_latency_ticks", io_stat->stat.write_latency_ticks); ++ ++ spdk_json_write_named_uint64(w, "complete_read_ios", scsi_stat.complete_read_ios); ++ spdk_json_write_named_uint64(w, "err_read_ios", scsi_stat.err_read_ios); ++ spdk_json_write_named_uint64(w, "complete_write_ios", scsi_stat.complete_write_ios); ++ spdk_json_write_named_uint64(w, "err_write_ios", scsi_stat.err_write_ios); ++ spdk_json_write_named_uint64(w, "flush_ios", scsi_stat.flush_ios); ++ spdk_json_write_named_uint64(w, "complete_flush_ios", scsi_stat.complete_flush_ios); ++ spdk_json_write_named_uint64(w, "err_flush_ios", scsi_stat.err_flush_ios); ++ spdk_json_write_named_uint64(w, "fatal_ios", scsi_stat.fatal_ios); ++ spdk_json_write_named_uint64(w, "io_retry", scsi_stat.io_retry); ++ ++ spdk_json_write_named_uint64(w, "start_count", scsi_stat.start_count); ++ spdk_json_write_named_uint64(w, "dma_count", scsi_stat.dma_count); ++ spdk_json_write_named_uint64(w, "dma_complete_count", scsi_stat.dma_complete_count); ++ spdk_json_write_named_uint64(w, "bdev_count", scsi_stat.bdev_count); ++ spdk_json_write_named_uint64(w, "bdev_complete_count", scsi_stat.bdev_complete_count); ++ spdk_json_write_named_uint64(w, "flight_io", ssmsession->scsi_dev_state[tgt_id].flight_io); ++ ++ if (scsi_stat.count == 0) { ++ count = 1; ++ } else { ++ count = scsi_stat.count; ++ } ++ ++ if (ssmsession->smsession.smdev->stat.poll_count == 0) { ++ poll_count = 1; ++ } else { ++ poll_count = ssmsession->smsession.smdev->stat.poll_count; ++ } ++ ++ spdk_json_write_named_string_fmt(w, "poll_lat", "%.9f", ++ (float)ssmsession->smsession.smdev->stat.poll_tsc / poll_count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "total_lat", "%.9f", (float)scsi_stat.total_tsc / count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "dma_lat", "%.9f", (float)scsi_stat.dma_tsc / count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "bdev_lat", "%.9f", (float)scsi_stat.bdev_tsc / count / ticks_hz); 
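++ /* Each *_lat field is an average latency in seconds: accumulated ticks divided by the completed-request count and by the tick rate. bdev_submit_lat covers only the spdk_scsi_dev_queue_task() call itself, and internel_lat is what remains of total_tsc once dma, bdev and completion time are subtracted. */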
++ spdk_json_write_named_string_fmt(w, "bdev_submit_lat", "%.9f", (float)scsi_stat.bdev_submit_tsc / count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "complete_lat", "%.9f", (float)scsi_stat.complete_tsc / count / ticks_hz); ++ spdk_json_write_named_string_fmt(w, "internel_lat", "%.9f", (float)scsi_stat.internel_tsc / count / ticks_hz); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_scsi_show_iostat_json(struct spdk_ssam_session *smsession, uint32_t id, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ struct spdk_scsi_dev *sdev; ++ int32_t tgt_id; ++ ++ if (id != SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ sdev = spdk_ssam_scsi_dev_get_tgt(ssmsession, id); ++ if (sdev != NULL) { ++ ssam_scsi_show_tgt_iostat_json(ssmsession, w, id, sdev); ++ } else { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_object_end(w); ++ } ++ return; ++ } ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_uint32(w, "function_id", smsession->gfunc_id); ++ ++ spdk_json_write_named_array_begin(w, "scsi_target"); ++ ++ for (tgt_id = 0; tgt_id < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; tgt_id++) { ++ sdev = spdk_ssam_scsi_dev_get_tgt(ssmsession, tgt_id); ++ if (!sdev) { ++ continue; ++ } ++ ssam_scsi_show_tgt_iostat_json(ssmsession, w, tgt_id, sdev); ++ } ++ ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++} ++ ++static void ++ssam_scsi_clear_iostat_json(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ struct spdk_scsi_dev_io_state *io_stat; ++ int32_t tgt_id; ++ int32_t lun_id; ++ for (tgt_id = 0; tgt_id < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; tgt_id++) { ++ for (lun_id = 0; lun_id < SSAM_SPDK_SCSI_DEV_MAX_LUN; lun_id++) { ++ io_stat = ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id]; ++ if (io_stat == NULL) { ++ continue; ++ } ++ memset(io_stat, 0, sizeof(struct spdk_scsi_dev_io_state)); ++ } ++ } ++ return; ++} ++ ++static struct spdk_bdev * ++ssam_scsi_get_bdev(struct spdk_ssam_session *smsession, uint32_t tgt_id) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ struct spdk_scsi_dev *scsi_dev; ++ struct spdk_scsi_lun *scsi_lun = NULL; ++ const char *bdev_name = NULL; ++ if (tgt_id >= SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("%s: tgt %d invalid\n", smsession->name, tgt_id); ++ return NULL; ++ } ++ if (ssmsession->scsi_dev_state[tgt_id].dev == NULL) { ++ SPDK_ERRLOG("%s: tgt %d not be created\n", smsession->name, tgt_id); ++ return NULL; ++ } ++ ++ scsi_dev = ssmsession->scsi_dev_state[tgt_id].dev; ++ // lun id use 0 ++ scsi_lun = spdk_scsi_dev_get_lun(scsi_dev, 0); ++ if (scsi_lun == NULL) { ++ return NULL; ++ } ++ bdev_name = spdk_scsi_lun_get_bdev_name(scsi_lun); ++ if (bdev_name == NULL) { ++ return NULL; ++ } ++ return spdk_bdev_get_by_name(bdev_name); ++} ++ ++static int ++ssam_scsi_iostat_construct(struct spdk_ssam_scsi_session *ssmsession, int32_t tgt_id, ++ int *lun_id_list, int num_luns) ++{ ++ struct spdk_scsi_dev_io_state *io_stat; ++ int32_t lun_id; ++ int i; ++ ++ for (i = 0; i < num_luns; i++) { ++ lun_id = lun_id_list[i]; ++ io_stat = ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id]; ++ if (io_stat != NULL) { ++ SPDK_ERRLOG("io_stat with tgt %d lun %d already exist\n", tgt_id, lun_id); ++ return -EEXIST; ++ } ++ ++ io_stat = calloc(1, sizeof(*io_stat)); ++ if (io_stat == NULL) { ++ SPDK_ERRLOG("Could not allocate io_stat for tgt %d lun 
%d\n", tgt_id, lun_id); ++ return -ENOMEM; ++ } ++ ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id] = io_stat; ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_scsi_iostat_destruct(struct spdk_scsi_dev_ssam_state *state) ++{ ++ int32_t lun_id; ++ ++ for (lun_id = 0; lun_id < SSAM_SPDK_SCSI_DEV_MAX_LUN; lun_id++) { ++ if (state->io_stat[lun_id] != NULL) { ++ free(state->io_stat[lun_id]); ++ state->io_stat[lun_id] = NULL; ++ } ++ } ++ ++ return; ++} ++ ++static void ++ssam_remove_scsi_tgt(struct spdk_ssam_scsi_session *ssmsession, unsigned scsi_tgt_num) ++{ ++ struct spdk_scsi_dev_ssam_state *state = &ssmsession->scsi_dev_state[scsi_tgt_num]; ++ struct spdk_ssam_session *smsession = &ssmsession->smsession; ++ spdk_ssam_session_rsp_fn rsp_fn = smsession->rsp_fn; ++ void *rsp_ctx = smsession->rsp_ctx; ++ ++ smsession->rsp_fn = NULL; ++ smsession->rsp_ctx = NULL; ++ ++ // delete scsi port ++ spdk_scsi_dev_delete_port(state->dev, 0); ++ ++ // destruct scsi dev ++ spdk_scsi_dev_destruct(state->dev, NULL, NULL); ++ state->dev = NULL; ++ ++ // free iostat ++ ssam_scsi_iostat_destruct(state); ++ state->status = SSAM_SCSI_DEV_EMPTY; ++ ++ // ref-- ++ if (ssmsession->ref > 0) { ++ ssmsession->ref--; ++ } else { ++ SPDK_ERRLOG("%s: ref internel error\n", smsession->name); ++ } ++ if (rsp_fn != NULL) { ++ rsp_fn(rsp_ctx, 0); ++ rsp_fn = NULL; ++ } ++ SPDK_NOTICELOG("%s: target %u is removed\n", smsession->name, scsi_tgt_num); ++} ++ ++static int ++ssam_scsi_get_payload_size(struct ssam_request *io_req, uint32_t *payload_size) ++{ ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ uint32_t payload = 0; ++ uint32_t first_vec; ++ uint32_t end_vec; ++ uint32_t loop; ++ ++ if (io_cmd->writable) { /* read io */ ++ /* FROM_DEV: [req][resp][write_buf]...[write_buf ]*, write_buf start at index 2 */ ++ first_vec = 2; ++ end_vec = io_cmd->iovcnt - 1; ++ } else { /* write io */ ++ first_vec = 1; ++ /* TO_DEV: [req][read_buf]...[read_buf][resp], read_buf last index is iovnt-2 */ ++ end_vec = io_cmd->iovcnt - 2; ++ } ++ ++ for (loop = first_vec; loop <= end_vec; loop++) { ++ if (spdk_unlikely((UINT32_MAX - io_cmd->iovs[loop].iov_len) < payload)) { ++ SPDK_ERRLOG("payload size overflow\n"); ++ return -1; ++ } ++ payload += io_cmd->iovs[loop].iov_len; ++ } ++ ++ if (spdk_unlikely(payload > PAYLOAD_SIZE_MAX)) { ++ SPDK_ERRLOG("payload size larger than %u, payload_size = %u\n", ++ PAYLOAD_SIZE_MAX, payload); ++ return -1; ++ } ++ ++ *payload_size = payload; ++ ++ return 0; ++} ++ ++static void ++ssam_session_io_resubmit(void *arg) ++{ ++ struct spdk_ssam_scsi_task *task = (struct spdk_ssam_scsi_task *)arg; ++ struct spdk_ssam_session *smsession = &task->ssmsession->smsession; ++ uint32_t payload_size = task->scsi_task.transfer_len; ++ int rc; ++ ++ rc = ssam_scsi_task_iovs_memory_get(task, payload_size); ++ if (rc != 0) { ++ ssam_session_insert_io_wait(smsession, &task->session_io_wait); ++ return; ++ } ++ ssam_scsi_process_io_task(smsession, task); ++} ++ ++static void ++ssam_scsi_task_init(struct spdk_ssam_scsi_task *task) ++{ ++ memset(&task->scsi_task, 0, sizeof(struct spdk_scsi_task)); ++ ++ task->used = true; ++ task->iovcnt = 0; ++ task->io_req = NULL; ++ task->session_io_wait.cb_fn = ssam_session_io_resubmit; ++ task->session_io_wait.cb_arg = task; ++} ++ ++static void ++ssam_scsi_task_dma_request_para(struct ssam_dma_request *data_request, struct spdk_ssam_scsi_task *task, ++ uint32_t type, uint8_t status) ++{ ++ struct spdk_scsi_task *scsi_task = &task->scsi_task; ++ struct ssam_io_message *io_cmd = 
NULL; ++ struct spdk_ssam_dma_cb dma_cb = { ++ .status = status, ++ .req_dir = type, ++ .gfunc_id = task->io_req->gfunc_id, ++ .vq_idx = task->vq_idx, ++ .task_idx = task->task_idx ++ }; ++ ++ io_cmd = &task->io_req->req.cmd; ++ data_request->cb = (void *)*(uint64_t *)&dma_cb; ++ data_request->gfunc_id = task->io_req->gfunc_id; ++ data_request->flr_seq = task->io_req->flr_seq; ++ data_request->direction = type; ++ data_request->data_len = scsi_task->transfer_len; ++ if (type == SSAM_REQUEST_DATA_STORE) { ++ data_request->src = task->iovs.phys.sges; ++ data_request->src_num = task->iovcnt; ++ /* FROM_DEV: [req][resp][write_buf]...[write_buf ]*, write_buf start at index 2 */ ++ data_request->dst = &io_cmd->iovs[2]; ++ /* dma data iovs does not contain header and tail */ ++ data_request->dst_num = io_cmd->iovcnt - IOV_HEADER_TAIL_NUM; ++ } else if (type == SSAM_REQUEST_DATA_LOAD) { ++ data_request->src = &io_cmd->iovs[1]; ++ /* dma data iovs does not contain header and tail */ ++ data_request->src_num = io_cmd->iovcnt - IOV_HEADER_TAIL_NUM; ++ data_request->dst = task->iovs.phys.sges; ++ data_request->dst_num = task->iovcnt; ++ } ++} ++ ++static void ++ssam_scsi_task_finish(struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_ssam_session *smsession = task->smsession; ++ struct spdk_ssam_virtqueue *vq = &smsession->virtqueue[task->vq_idx]; ++ ++ if (smsession->task_cnt == 0) { ++ SPDK_ERRLOG("%s: task count internel error\n", smsession->name); ++ return; ++ } ++ ++ task->io_req = NULL; ++ ++ if (task->iovs.virt.sges[0].iov_base != NULL) { ++ ssam_mempool_free(smsession->mp, task->iovs.virt.sges[0].iov_base); ++ task->iovs.virt.sges[0].iov_base = NULL; ++ } ++ ++ memset(&task->iovs, 0, sizeof(task->iovs)); ++ ++ task->iovcnt = 0; ++ smsession->task_cnt--; ++ task->used = false; ++ vq->index[vq->index_l] = task->task_idx; ++ vq->index_l = (vq->index_l + 1) & 0xFF; ++ vq->use_num--; ++} ++ ++static int ++ssam_scsi_io_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, void *rsp_buf, uint32_t rsp_len) ++{ ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ struct ssam_virtio_res *virtio_res = NULL; ++ struct ssam_io_response io_resp; ++ struct iovec io_vec; ++ int rc; ++ ++ memset(&io_resp, 0, sizeof(io_resp)); ++ io_resp.gfunc_id = io_req->gfunc_id; ++ io_resp.iocb_id = io_req->iocb_id; ++ io_resp.status = io_req->status; ++ io_resp.req = io_req; ++ io_resp.flr_seq = io_req->flr_seq; ++ ++ virtio_res = (struct ssam_virtio_res*)&io_resp.data; ++ virtio_res->iovs = &io_vec; ++ if (io_cmd->writable) { /* FROM_DEV: [req][resp][write_buf]...[write_buf ] */ ++ virtio_res->iovs->iov_base = io_cmd->iovs[1].iov_base; ++ virtio_res->iovs->iov_len = io_cmd->iovs[1].iov_len; ++ } else { /* TO_DEV: [req][read_buf]...[read_buf][resp] */ ++ virtio_res->iovs->iov_base = io_cmd->iovs[io_cmd->iovcnt - 1].iov_base; ++ virtio_res->iovs->iov_len = io_cmd->iovs[io_cmd->iovcnt - 1].iov_len; ++ } ++ virtio_res->iovcnt = 1; ++ virtio_res->rsp = rsp_buf; ++ virtio_res->rsp_len = rsp_len; ++ ++ rc = ssam_io_complete(smdev->tid, &io_resp); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ ssam_dev_io_dec(smdev); ++ return 0; ++} ++ ++struct ssam_scsi_req_complete_arg { ++ struct spdk_ssam_dev *smdev; ++ struct ssam_request *io_req; ++ uint8_t status; ++}; ++ ++static void ++ssam_scsi_req_complete_cb(void *arg) ++{ ++ struct ssam_scsi_req_complete_arg *cb_arg = (struct ssam_scsi_req_complete_arg *)arg; ++ struct virtio_scsi_cmd_resp resp = {0}; ++ struct virtio_scsi_ctrl_tmf_resp tmf_resp = {0}; ++ int rc; ++ 
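++ /* Retry path for request completion: a control (TMF) request is answered with a tmf_resp and a data command with a cmd_resp; if completion fails again, this callback re-queues itself on the io-wait list. */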
++ if (spdk_unlikely(cb_arg->io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ tmf_resp.response = cb_arg->status; ++ rc = ssam_scsi_io_complete(cb_arg->smdev, cb_arg->io_req, &tmf_resp, sizeof(struct virtio_scsi_ctrl_tmf_resp)); ++ } else { ++ resp.response = cb_arg->status; ++ rc = ssam_scsi_io_complete(cb_arg->smdev, cb_arg->io_req, &resp, sizeof(struct virtio_scsi_cmd_resp)); ++ } ++ ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_scsi_req_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smdev, io_wait_r); ++ return; ++ } ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_scsi_req_complete(struct spdk_ssam_dev *smdev, struct ssam_request *io_req, uint8_t status) ++{ ++ struct virtio_scsi_cmd_resp resp = {0}; ++ struct virtio_scsi_ctrl_tmf_resp tmf_resp = {0}; ++ int rc; ++ ++ if (spdk_unlikely(io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ tmf_resp.response = status; ++ rc = ssam_scsi_io_complete(smdev, io_req, &tmf_resp, sizeof(struct virtio_scsi_ctrl_tmf_resp)); ++ } else { ++ resp.response = status; ++ rc = ssam_scsi_io_complete(smdev, io_req, &resp, sizeof(struct virtio_scsi_cmd_resp)); ++ } ++ ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_scsi_req_complete_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_scsi_req_complete_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smdev; ++ cb_arg->io_req = io_req; ++ cb_arg->status = status; ++ io_wait_r->cb_fn = ssam_scsi_req_complete_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smdev, io_wait_r); ++ } ++} ++ ++static void ++ssam_scsi_task_put(struct spdk_ssam_scsi_task *task) ++{ ++ memset(&task->resp, 0, sizeof(task->resp)); ++ if (task->io_req->type != VMIO_TYPE_VIRTIO_SCSI_CTRL) { ++ task->ssmsession->scsi_dev_state[task->tgt_id].flight_io--; ++ } ++ spdk_scsi_task_put(&task->scsi_task); ++} ++ ++static void ++ssam_scsi_submit_completion_cb(void *arg) ++{ ++ struct spdk_ssam_scsi_task *task = (struct spdk_ssam_scsi_task*)arg; ++ struct spdk_ssam_session *smsession = task->smsession; ++ int rc; ++ ++ if (spdk_unlikely(task->io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ rc = ssam_scsi_io_complete(smsession->smdev, task->io_req, &task->tmf_resp, ++ sizeof(struct virtio_scsi_ctrl_tmf_resp)); ++ } else { ++ rc = ssam_scsi_io_complete(smsession->smdev, task->io_req, &task->resp, ++ sizeof(struct virtio_scsi_cmd_resp)); ++ } ++ ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_scsi_submit_completion_cb; ++ io_wait_r->cb_arg = task; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ssam_scsi_task_stat_tick(&task->task_stat.complete_end_tsc); ++ ssam_scsi_stat_statistics(task); ++ ++ /* after spdk_task_construct called, put task */ ++ ssam_scsi_task_put(task); ++} ++ ++static void ++ssam_scsi_submit_completion(struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_ssam_session *smsession = task->smsession; ++ 
struct ssam_request *io_req = task->io_req; ++ int rc; ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.complete_start_tsc); ++ if (spdk_unlikely(io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ rc = ssam_scsi_io_complete(smsession->smdev, io_req, &task->tmf_resp, ++ sizeof(struct virtio_scsi_ctrl_tmf_resp)); ++ } else { ++ rc = ssam_scsi_io_complete(smsession->smdev, io_req, &task->resp, sizeof(struct virtio_scsi_cmd_resp)); ++ } ++ ++ if (rc != 0) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_scsi_submit_completion_cb; ++ io_wait_r->cb_arg = task; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ssam_scsi_task_stat_tick(&task->task_stat.complete_end_tsc); ++ ssam_scsi_stat_statistics(task); ++ ++ /* after spdk_task_construct called, put task */ ++ ssam_scsi_task_put(task); ++} ++ ++struct ssam_scsi_dma_data_request_arg { ++ struct spdk_ssam_dev *smdev; ++ struct spdk_ssam_scsi_task *task; ++ struct ssam_dma_request dma_req; ++}; ++ ++static void ++ssam_scsi_dma_data_request_cb(void *arg) ++{ ++ struct ssam_scsi_dma_data_request_arg *cb_arg = (struct ssam_scsi_dma_data_request_arg *)arg; ++ int ret = ssam_dma_data_request(cb_arg->smdev->tid, &cb_arg->dma_req); ++ if (ret == -ENOMEM || ret == -EIO) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ if (io_wait_r == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ io_wait_r->cb_fn = ssam_scsi_dma_data_request_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(cb_arg->smdev, io_wait_r); ++ return; ++ } ++ if (ret < 0) { ++ SPDK_ERRLOG("ssam dma data request failed(%d)\n", ret); ++ cb_arg->task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(cb_arg->task); ++ } ++ free(cb_arg); ++ cb_arg = NULL; ++} ++ ++static void ++ssam_scsi_task_dma_request(struct spdk_ssam_scsi_task *task, enum data_request_dma_type data_dir) ++{ ++ struct spdk_ssam_session *smsession = task->smsession; ++ struct ssam_dma_request data_request = {0}; ++ int ret = 0; ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.dma_start_tsc); ++ task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[0]->scsi_stat.dma_count++; ++ ++ switch (data_dir) { ++ case SSAM_REQUEST_DATA_STORE: ++ ssam_scsi_task_dma_request_para(&data_request, task, SSAM_REQUEST_DATA_STORE, 0); ++ ++ /* dma request: ipu -> Host */ ++ ret = ssam_dma_data_request(smsession->smdev->tid, &data_request); ++ break; ++ ++ case SSAM_REQUEST_DATA_LOAD: ++ ssam_scsi_task_dma_request_para(&data_request, task, SSAM_REQUEST_DATA_LOAD, 0); ++ ++ /* dma request: Host -> ipu */ ++ ret = ssam_dma_data_request(smsession->smdev->tid, &data_request); ++ break; ++ ++ default: ++ SPDK_ERRLOG("Invalid data dir: %u.\n", data_dir); ++ break; ++ } ++ ++ if (ret == -ENOMEM || ret == -EIO) { ++ struct spdk_ssam_session_io_wait_r *io_wait_r = ++ calloc(1, sizeof(struct spdk_ssam_session_io_wait_r)); ++ struct ssam_scsi_dma_data_request_arg *cb_arg = ++ calloc(1, sizeof(struct ssam_scsi_dma_data_request_arg)); ++ if (io_wait_r == NULL || cb_arg == NULL) { ++ SPDK_ERRLOG("calloc for io_wait_r failed\n"); ++ sleep(1); ++ raise(SIGTERM); ++ } ++ cb_arg->smdev = smsession->smdev; ++ cb_arg->dma_req = data_request; ++ cb_arg->task = task; ++ io_wait_r->cb_fn = 
ssam_scsi_dma_data_request_cb; ++ io_wait_r->cb_arg = cb_arg; ++ ssam_session_insert_io_wait_r(smsession->smdev, io_wait_r); ++ return; ++ } ++ ++ if (ret < 0) { ++ SPDK_ERRLOG("ssam dma data request failed(%d)\n", ret); ++ task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(task); ++ } ++} ++ ++static void ++ssam_scsi_task_copy_resp(struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_scsi_task *scsi_task = &task->scsi_task; ++ ++ if (spdk_unlikely(task->io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ task->tmf_resp.response = scsi_task->status; ++ } else { ++ task->resp.status = scsi_task->status; ++ if (spdk_unlikely(scsi_task->sense_data_len > SSAM_SENSE_DATE_LEN)) { ++ return; ++ } ++ if (scsi_task->status != SPDK_SCSI_STATUS_GOOD) { ++ memcpy(task->resp.sense, scsi_task->sense_data, scsi_task->sense_data_len); ++ task->resp.sense_len = scsi_task->sense_data_len; ++ } ++ ++ if (scsi_task->transfer_len != scsi_task->length) { ++ SPDK_ERRLOG("task transfer_len(%u) not equal length(%u), internel error.\n", ++ scsi_task->transfer_len, scsi_task->length); ++ } ++ ++ task->resp.resid = scsi_task->length - scsi_task->data_transferred; ++ } ++} ++ ++static void ++ssam_scsi_read_task_cpl_cb(struct spdk_scsi_task *scsi_task) ++{ ++ if (spdk_unlikely(spdk_get_shutdown_sig_received())) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * the task and bdev_io memory may have been released. ++ * Therefore, task and bdev_io are not released in this scenario. ++ */ ++ return; ++ } ++ struct spdk_ssam_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_ssam_scsi_task, scsi_task); ++ int32_t tgt_id = task->tgt_id; ++ int32_t lun_id = spdk_scsi_lun_get_id(scsi_task->lun); ++ struct spdk_scsi_dev_io_state *io_stat = task->ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id]; ++ ++ /* Second part start of read */ ++ io_stat->submit_tsc = spdk_get_ticks(); ++ ++ ssam_scsi_task_copy_resp(task); ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.bdev_end_tsc); ++ task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[0]->scsi_stat.bdev_complete_count++; ++ ++ /* 1) Read request without data is no need to dma; ++ 2) Read request failed just complete it. ++ */ ++ if (scsi_task->length == 0 || scsi_task->status != SPDK_SCSI_STATUS_GOOD) { ++ ssam_scsi_submit_completion(task); ++ return; ++ } ++ ++ /* Dma data from IPU to HOST */ ++ ssam_scsi_task_dma_request(task, SSAM_REQUEST_DATA_STORE); ++ ++ return; ++} ++ ++static void ++ssam_scsi_write_task_cpl_cb(struct spdk_scsi_task *scsi_task) ++{ ++ if (spdk_unlikely(spdk_get_shutdown_sig_received())) { ++ /* ++ * In the hot restart process, when this callback is triggered, ++ * the task and bdev_io memory may have been released. ++ * Therefore, task and bdev_io are not released in this scenario. 
++ */ ++ return; ++ } ++ struct spdk_ssam_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_ssam_scsi_task, scsi_task); ++ int32_t tgt_id = task->tgt_id; ++ int32_t lun_id = spdk_scsi_lun_get_id(scsi_task->lun); ++ struct spdk_scsi_dev_io_state *io_stat = task->ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id]; ++ uint32_t payload_size = task->scsi_task.transfer_len; ++ ++ /* Second part start of write */ ++ io_stat->submit_tsc = spdk_get_ticks(); ++ ++ /* copy result from spdk_scsi_task to spdk_ssam_scsi_task->resp */ ++ ssam_scsi_task_copy_resp(task); ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.bdev_end_tsc); ++ task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[0]->scsi_stat.bdev_complete_count++; ++ ++ ssam_scsi_submit_completion(task); ++ /* Second part end of write */ ++ io_stat->stat.write_latency_ticks += ssam_get_diff_tsc(io_stat->submit_tsc); ++ io_stat->stat.bytes_written += payload_size; ++ io_stat->stat.num_write_ops++; ++ ++ return; ++} ++ ++static void ++ssam_scsi_ctl_task_cpl_cb(struct spdk_scsi_task *scsi_task) ++{ ++ struct spdk_ssam_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_ssam_scsi_task, scsi_task); ++ ++ ssam_scsi_task_copy_resp(task); ++ ++ ssam_scsi_submit_completion(task); ++} ++ ++static void ++ssam_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) ++{ ++ struct spdk_ssam_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_ssam_scsi_task, scsi_task); ++ ++ ssam_scsi_task_finish(task); ++} ++ ++static int ++ssam_scsi_task_init_target(struct spdk_ssam_scsi_task *task, const __u8 *lun) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = task->ssmsession; ++ struct spdk_scsi_dev_ssam_state *state = NULL; ++ int32_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; ++ int32_t tgt_id = lun[1]; ++ ++ if (lun[0] != 1 || tgt_id >= SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("First byte must be 1 and second is target\n"); ++ ssmsession->smsession.smdev->discard_io_num++; ++ return -1; ++ } ++ ++ state = &ssmsession->scsi_dev_state[tgt_id]; ++ task->scsi_dev = state->dev; ++ if (state->dev == NULL || state->status != SSAM_SCSI_DEV_PRESENT) { ++ return -1; ++ } ++ ++ task->tgt_id = tgt_id; ++ task->scsi_task.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); ++ task->scsi_task.lun = spdk_scsi_dev_get_lun(state->dev, lun_id); ++ if (task->scsi_task.lun == NULL) { ++ SPDK_ERRLOG("Failed to init scsi task lun by lun_id(%d)\n", lun_id); ++ return -1; ++ } ++ return 0; ++} ++ ++static void ++ssam_scsi_submit_io_task(struct spdk_ssam_scsi_task *task) ++{ ++ task->resp.response = VIRTIO_SCSI_S_OK; ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.bdev_start_tsc); ++ spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi_task); ++ task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[0]->scsi_stat.bdev_count++; ++ ssam_scsi_task_stat_tick(&task->task_stat.bdev_func_tsc); ++ ++ SPDK_DEBUGLOG(ssam_blk_data, "====== Task: task_idx %u submitted ======\n", task->task_idx); ++} ++ ++static int ++ssam_scsi_task_iovs_memory_get(struct spdk_ssam_scsi_task *task, uint32_t payload_size) ++{ ++ struct ssam_mempool *mp = task->smsession->mp; ++ void *buffer = NULL; ++ uint64_t phys_addr = 0; ++ uint32_t alloc_size; ++ ++ if (payload_size == 0) { /* A little strange */ ++ alloc_size = 1; /* Alloc one iov at least */ ++ } else { ++ alloc_size = payload_size; ++ } ++ ++ buffer = ssam_mempool_alloc(mp, alloc_size, &phys_addr); ++ if (spdk_unlikely(buffer == NULL)) { ++ return -ENOMEM; ++ } ++ ++ /* ssam request max IO size is PAYLOAD_SIZE_MAX, 
only use one iov to save data */ ++ task->iovs.virt.sges[0].iov_base = buffer; ++ task->iovs.phys.sges[0].iov_base = (void *)phys_addr; ++ task->iovs.virt.sges[0].iov_len = payload_size; ++ task->iovs.phys.sges[0].iov_len = payload_size; ++ task->iovcnt = 1; ++ ++ return 0; ++} ++ ++static void ++scsi_mgmt_task_submit(struct spdk_ssam_scsi_task *task, enum spdk_scsi_task_func func) ++{ ++ task->tmf_resp.response = VIRTIO_SCSI_S_OK; ++ task->scsi_task.function = func; ++ spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi_task); ++} ++ ++static void ++ssam_scsi_process_ctl_task(struct spdk_ssam_session *smsession, struct spdk_ssam_scsi_task *task) ++{ ++ struct virtio_scsi_ctrl_tmf_req *ctrl_req = (struct virtio_scsi_ctrl_tmf_req *)task->io_req->req.cmd.header; ++ int32_t lun_id = spdk_scsi_lun_get_id(task->scsi_task.lun); ++ struct spdk_scsi_dev_io_state *io_stat = task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[lun_id]; ++ int ret = 0; ++ ++ spdk_scsi_task_construct(&task->scsi_task, ssam_scsi_ctl_task_cpl_cb, ssam_scsi_task_free_cb); ++ ret = ssam_scsi_task_init_target(task, ctrl_req->lun); ++ if (ret < 0) { ++ task->tmf_resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(task); ++ return; ++ } ++ ++ switch (ctrl_req->type) { ++ case VIRTIO_SCSI_T_TMF: ++ /* Check if we are processing a valid request */ ++ if (task->scsi_dev == NULL) { ++ task->tmf_resp.response = VIRTIO_SCSI_S_BAD_TARGET; ++ ssam_scsi_submit_completion(task); ++ break; ++ } ++ ++ switch (ctrl_req->subtype) { ++ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: ++ /* Handle LUN reset */ ++ SPDK_DEBUGLOG(ssam_scsi, "%s: LUN reset\n", smsession->name); ++ ++ scsi_mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); ++ return; ++ default: ++ task->tmf_resp.response = VIRTIO_SCSI_S_ABORTED; ++ ssam_scsi_submit_completion(task); ++ /* Unsupported command */ ++ SPDK_DEBUGLOG(ssam_scsi, "%s: unsupported TMF command %x\n", ++ smsession->name, ctrl_req->subtype); ++ break; ++ } ++ break; ++ ++ case VIRTIO_SCSI_T_AN_QUERY: ++ case VIRTIO_SCSI_T_AN_SUBSCRIBE: ++ task->tmf_resp.response = VIRTIO_SCSI_S_ABORTED; ++ ssam_scsi_submit_completion(task); ++ break; ++ ++ default: ++ SPDK_DEBUGLOG(ssam_scsi, "%s: Unsupported control command %x\n", ++ smsession->name, ctrl_req->type); ++ io_stat->scsi_stat.fatal_ios++; ++ break; ++ } ++} ++ ++static void ++ssam_scsi_io_task_construct(struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_scsi_task *scsi_task = &task->scsi_task; ++ struct ssam_io_message *io_cmd = &task->io_req->req.cmd; ++ ++ if (io_cmd->writable) { /* read io */ ++ spdk_scsi_task_construct(scsi_task, ssam_scsi_read_task_cpl_cb, ssam_scsi_task_free_cb); ++ } else { /* write io */ ++ spdk_scsi_task_construct(scsi_task, ssam_scsi_write_task_cpl_cb, ssam_scsi_task_free_cb); ++ } ++} ++ ++static int32_t ++ssam_scsi_io_task_setup(struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_scsi_task *scsi_task = &task->scsi_task; ++ struct ssam_io_message *io_cmd = &task->io_req->req.cmd; ++ struct virtio_scsi_cmd_req *req = (struct virtio_scsi_cmd_req *)io_cmd->header; ++ uint32_t payload_size; ++ int ret; ++ ++ ssam_scsi_io_task_construct(task); ++ ++ ret = ssam_scsi_get_payload_size(task->io_req, &payload_size); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ ret = ssam_scsi_task_init_target(task, req->lun); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ scsi_task->dxfer_dir = (io_cmd->writable ? 
SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV); ++ scsi_task->iovs = task->iovs.virt.sges; ++ scsi_task->cdb = req->cdb; ++ scsi_task->transfer_len = payload_size; ++ scsi_task->length = payload_size; ++ ++ ret = ssam_scsi_task_iovs_memory_get(task, payload_size); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void ++ssam_scsi_process_io_task(struct spdk_ssam_session *smsession, struct spdk_ssam_scsi_task *task) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ struct spdk_scsi_dev_io_state *io_stat; ++ uint64_t cur_tsc; ++ int32_t lun_id; ++ ++ ssmsession->scsi_dev_state[task->tgt_id].flight_io++; ++ ++ if (spdk_unlikely(task->scsi_task.lun == NULL)) { ++ spdk_scsi_task_process_null_lun(&task->scsi_task); ++ task->resp.response = VIRTIO_SCSI_S_OK; ++ ssam_scsi_submit_completion(task); ++ return; ++ } ++ ++ lun_id = spdk_scsi_lun_get_id(task->scsi_task.lun); ++ io_stat = ssmsession->scsi_dev_state[task->tgt_id].io_stat[lun_id]; ++ if (io_stat == NULL) { ++ SPDK_ERRLOG("No io_stat with tgt %d lun %d\n", task->tgt_id, lun_id); ++ task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(task); ++ return; ++ } ++ /* First part start of read and write */ ++ cur_tsc = spdk_get_ticks(); ++ io_stat->submit_tsc = cur_tsc; ++ memset(&task->task_stat, 0, sizeof(task->task_stat)); ++ task->task_stat.start_tsc = cur_tsc; ++ io_stat->scsi_stat.start_count++; ++ ++ switch (task->scsi_task.dxfer_dir) { ++ case SPDK_SCSI_DIR_FROM_DEV: /* read: read data from backend to ipu, then dma to host */ ++ ssam_scsi_submit_io_task(task); ++ /* First part end of read */ ++ uint8_t rw_type = task->scsi_task.cdb[0]; ++ if (rw_type == SPDK_SBC_READ_6 || rw_type == SPDK_SBC_READ_10 || ++ rw_type == SPDK_SBC_READ_12 || rw_type == SPDK_SBC_READ_16) { ++ io_stat->stat.read_latency_ticks += ssam_get_diff_tsc(io_stat->submit_tsc); ++ io_stat->stat.bytes_read += task->scsi_task.transfer_len; ++ io_stat->stat.num_read_ops++; ++ } ++ break; ++ ++ case SPDK_SCSI_DIR_TO_DEV: /* write: dma data from host to ipu, then submit to backend */ ++ ssam_scsi_task_dma_request(task, SSAM_REQUEST_DATA_LOAD); ++ break; ++ ++ default: ++ SPDK_ERRLOG("scsi task dxfer dir error, dir is %u.\n", task->scsi_task.dxfer_dir); ++ io_stat->scsi_stat.fatal_ios++; ++ break; ++ } ++} ++ ++static void ++ssam_scsi_pre_process_io_task(struct spdk_ssam_session *smsession, struct spdk_ssam_scsi_task *task) ++{ ++ int ret; ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session*)smsession; ++ ++ ret = ssam_scsi_io_task_setup(task); ++ if (ret != 0) { ++ if (ret == -ENOMEM) { ++ ssam_session_insert_io_wait(smsession, &task->session_io_wait); ++ return; ++ } ++ task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssmsession->scsi_dev_state[task->tgt_id].flight_io++; ++ ssam_scsi_submit_completion(task); ++ return; ++ } ++ ++ ssam_scsi_process_io_task(smsession, task); ++} ++ ++static void ++ssam_scsi_process_request(struct spdk_ssam_session *smsession, struct ssam_request *io_req, ++ uint16_t vq_idx) ++{ ++ struct spdk_ssam_scsi_task *task = NULL; ++ struct spdk_ssam_virtqueue *vq = &smsession->virtqueue[vq_idx]; ++ ++ if (spdk_unlikely(vq->use_num >= vq->num)) { ++ SPDK_ERRLOG("Session:%s vq(%hu) task_cnt(%u) limit(%u).\n", smsession->name, vq_idx, vq->use_num, vq->num); ++ ssam_scsi_req_complete(smsession->smdev, io_req, VIRTIO_SCSI_S_FAILURE); ++ return; ++ } ++ ++ uint32_t index = vq->index[vq->index_r]; ++ task = &((struct spdk_ssam_scsi_task 
*)vq->tasks)[index]; ++ if (spdk_unlikely(task->used)) { ++ SPDK_ERRLOG("%s: vq(%hu) task_idx(%hu) is already pending.\n", smsession->name, vq_idx, task->task_idx); ++ ssam_scsi_req_complete(smsession->smdev, io_req, VIRTIO_SCSI_S_FAILURE); ++ return; ++ } ++ ++ smsession->task_cnt++; ++ vq->index_r = (vq->index_r + 1) & 0xFF; ++ vq->use_num++; ++ ssam_scsi_task_init(task); ++ task->io_req = io_req; ++ ++ if (spdk_unlikely(io_req->type == VMIO_TYPE_VIRTIO_SCSI_CTRL)) { ++ ssam_scsi_process_ctl_task(smsession, task); ++ } else { ++ ssam_scsi_pre_process_io_task(smsession, task); ++ } ++ ++ return; ++} ++ ++static void ++ssam_scsi_request_worker(struct spdk_ssam_session *smsession, void *arg) ++{ ++ struct ssam_request *io_req = (struct ssam_request*)arg; ++ struct ssam_io_message *io_cmd = &io_req->req.cmd; ++ struct spdk_ssam_dev *smdev = smsession->smdev; ++ struct virtio_scsi_cmd_req *req = (struct virtio_scsi_cmd_req *)io_cmd->header; ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ uint16_t vq_idx = io_cmd->virtio.vq_idx; ++ uint32_t tgt_id = req->lun[1]; ++ ++ smdev->io_num++; ++ ++ if (vq_idx >= smsession->max_queues) { ++ SPDK_ERRLOG("vq_idx out of range, need less than %u, actually %u\n", ++ smsession->max_queues, vq_idx); ++ goto err; ++ } ++ ++ if (io_req->status != SSAM_IO_STATUS_OK) { ++ SPDK_WARNLOG("%s: ssam request status invalid, but still process, status=%d\n", ++ smsession->name, io_req->status); ++ goto err; ++ } ++ ++ if (ssmsession->scsi_dev_state[tgt_id].status != SSAM_SCSI_DEV_PRESENT) { ++ /* If dev has been deleted, return io err */ ++ goto err; ++ } ++ ++ ssam_scsi_process_request(smsession, io_req, vq_idx); ++ ++ return; ++ ++err: ++ ssam_scsi_req_complete(smsession->smdev, io_req, VIRTIO_SCSI_S_FAILURE); ++ return; ++} ++ ++static void ++ssam_scsi_response_worker(struct spdk_ssam_session *smsession, void *arg) ++{ ++ struct ssam_dma_rsp *dma_rsp = (struct ssam_dma_rsp*)arg; ++ const struct spdk_ssam_dma_cb *dma_cb = (const struct spdk_ssam_dma_cb *)&dma_rsp->cb; ++ struct spdk_ssam_scsi_task *task = NULL; ++ uint16_t vq_idx = dma_cb->vq_idx; ++ uint16_t task_idx = dma_cb->task_idx; ++ uint8_t req_dir = dma_cb->req_dir; ++ ++ if (spdk_unlikely(vq_idx >= smsession->max_queues)) { ++ smsession->smdev->discard_io_num++; ++ SPDK_ERRLOG("vq_idx out of range, need less than %u, actually %u\n", ++ smsession->max_queues, vq_idx); ++ return; ++ } ++ ++ task = &((struct spdk_ssam_scsi_task *)smsession->virtqueue[vq_idx].tasks)[task_idx]; ++ if (dma_rsp->status != 0) { ++ task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(task); ++ SPDK_ERRLOG("dma data process failed!\n"); ++ return; ++ } ++ if (dma_rsp->last_flag == 0) { ++ task->resp.response = VIRTIO_SCSI_S_FAILURE; ++ ssam_scsi_submit_completion(task); ++ SPDK_ERRLOG("last_flag should not equal 0!\n"); ++ return; ++ } ++ int32_t tgt_id = task->tgt_id; ++ int32_t lun_id = spdk_scsi_lun_get_id(task->scsi_task.lun); ++ struct spdk_scsi_dev_io_state *io_stat = task->ssmsession->scsi_dev_state[tgt_id].io_stat[lun_id]; ++ ++ ssam_scsi_task_stat_tick(&task->task_stat.dma_end_tsc); ++ task->ssmsession->scsi_dev_state[task->tgt_id].io_stat[0]->scsi_stat.dma_complete_count++; ++ ++ if (req_dir == SSAM_REQUEST_DATA_LOAD) { ++ /* Write: write data ready, submit task to backend */ ++ ssam_scsi_submit_io_task(task); ++ /* First part end of write */ ++ io_stat->stat.write_latency_ticks += ssam_get_diff_tsc(io_stat->submit_tsc); ++ } else if (req_dir == 
SSAM_REQUEST_DATA_STORE) {
++ /* Read: data has been read by the user, complete the task */
++ task->resp.response = VIRTIO_SCSI_S_OK;
++ ssam_scsi_submit_completion(task);
++ /* Second part end of read */
++ io_stat->stat.read_latency_ticks += ssam_get_diff_tsc(io_stat->submit_tsc);
++ } else {
++ io_stat->scsi_stat.fatal_ios++;
++ }
++}
++
++static void
++ssam_scsi_destroy_bdev_device(struct spdk_ssam_session *smsession, void *args)
++{
++ unsigned scsi_tgt_num = (unsigned)(uintptr_t)(args);
++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++
++ ssam_remove_scsi_tgt(ssmsession, scsi_tgt_num);
++}
++
++static void
++ssam_free_scsi_task_pool(struct spdk_ssam_scsi_session *ssmsession)
++{
++ struct spdk_ssam_session *smsession = &ssmsession->smsession;
++ struct spdk_ssam_virtqueue *vq = NULL;
++ uint16_t max_queues = smsession->max_queues;
++ uint16_t i;
++
++ if (max_queues > SPDK_SSAM_MAX_VQUEUES) {
++ return;
++ }
++
++ for (i = 0; i < max_queues; i++) {
++ vq = &smsession->virtqueue[i];
++ if (vq->tasks != NULL) {
++ spdk_free(vq->tasks);
++ vq->tasks = NULL;
++ }
++
++ if (vq->index != NULL) {
++ spdk_free(vq->index);
++ vq->index = NULL;
++ }
++ }
++}
++
++static int
++ssam_alloc_scsi_task_pool(struct spdk_ssam_scsi_session *ssmsession)
++{
++ struct spdk_ssam_session *smsession = &ssmsession->smsession;
++ struct spdk_ssam_virtqueue *vq = NULL;
++ struct spdk_ssam_scsi_task *task = NULL;
++ uint16_t max_queues = smsession->max_queues;
++ uint32_t task_cnt = smsession->queue_size;
++ uint16_t i;
++ uint32_t j;
++
++ if ((max_queues > SPDK_SSAM_MAX_VQUEUES) || (max_queues == 0)) {
++ SPDK_ERRLOG("%s: max_queues %u invalid\n", smsession->name, max_queues);
++ return -EINVAL;
++ }
++
++ if ((task_cnt == 0) || (task_cnt > SPDK_SSAM_MAX_VQ_SIZE)) {
++ SPDK_ERRLOG("%s: virtqueue size %u invalid\n", smsession->name, task_cnt);
++ return -EINVAL;
++ }
++
++ for (i = 0; i < max_queues; i++) {
++ vq = &smsession->virtqueue[i];
++ vq->smsession = smsession;
++ vq->num = task_cnt;
++ vq->use_num = 0;
++ vq->index_l = 0;
++ vq->index_r = 0;
++ vq->tasks = spdk_zmalloc(sizeof(struct spdk_ssam_scsi_task) * task_cnt,
++ SPDK_CACHE_LINE_SIZE, NULL,
++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
++ vq->index = spdk_zmalloc(sizeof(uint32_t) * task_cnt,
++ SPDK_CACHE_LINE_SIZE, NULL,
++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
++ if (vq->tasks == NULL || vq->index == NULL) {
++ SPDK_ERRLOG("%s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n",
++ smsession->name, task_cnt, i);
++ ssam_free_scsi_task_pool(ssmsession);
++ return -ENOMEM;
++ }
++
++ for (j = 0; j < task_cnt; j++) {
++ task = &((struct spdk_ssam_scsi_task *)vq->tasks)[j];
++ task->ssmsession = ssmsession;
++ task->smsession = &ssmsession->smsession;
++ task->vq_idx = i;
++ task->task_idx = j;
++ vq->index[j] = j;
++ }
++ }
++
++ return 0;
++}
++
++static void
++ssam_scsi_print_stuck_io_info(struct spdk_ssam_session *smsession)
++{
++ struct spdk_ssam_scsi_task *tasks;
++ struct spdk_ssam_scsi_task *task;
++ int i, j;
++
++ for (i = 0; i < smsession->max_queues; i++) {
++ for (j = 0; j < smsession->queue_size; j++) {
++ tasks = (struct spdk_ssam_scsi_task *)smsession->virtqueue[i].tasks;
++ task = &tasks[j];
++ if (task == NULL) {
++ continue;
++ }
++ if (task->used) {
++ SPDK_INFOLOG(ssam_scsi, "%s: stuck io payload_size %u, vq_idx %u, task_idx %u\n",
++ smsession->name, task->scsi_task.length, task->vq_idx, task->task_idx);
++ }
++ }
++ }
++}
++
++static int
++ssam_scsi_start_cb(struct
spdk_ssam_session *smsession, void **unused) ++{ ++ SPDK_NOTICELOG("SCSI controller %s created with queues %u\n", ++ smsession->name, smsession->max_queues); ++ ++ ssam_session_start_done(smsession, 0); ++ ++ return 0; ++} ++ ++static int ++ssam_scsi_start(struct spdk_ssam_session *smsession) ++{ ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ struct spdk_ssam_send_event_flag send_event_flag = { ++ .need_async = false, ++ .need_rsp = true, ++ }; ++ int rc = ssam_alloc_scsi_task_pool(ssmsession); ++ if (rc != 0) { ++ SPDK_ERRLOG("%s: failed to alloc task pool.\n", smsession->name); ++ return rc; ++ } ++ return ssam_send_event_to_session(smsession, ssam_scsi_start_cb, NULL, send_event_flag, NULL); ++} ++ ++static int ++ssam_scsi_session_connect(struct spdk_ssam_session *smsession, uint16_t queues) ++{ ++ uint16_t queue_cnt = queues; ++ ++ if (queue_cnt == 0) { ++ queue_cnt = SPDK_SSAM_SCSI_DEFAULT_VQUEUES; ++ } ++ ++ smsession->max_queues = queue_cnt; ++ smsession->queue_size = SPDK_SSAM_DEFAULT_VQ_SIZE; ++ ++ return ssam_scsi_start(smsession); ++} ++ ++int ++spdk_ssam_scsi_construct(struct spdk_ssam_session_reg_info *info) ++{ ++ struct spdk_ssam_session *smsession = NULL; ++ struct spdk_ssam_scsi_session *ssmsession = NULL; ++ uint32_t session_ctx_size = sizeof(struct spdk_ssam_scsi_session) - sizeof(struct spdk_ssam_session); ++ uint16_t tid; ++ int rc = 0; ++ ++ spdk_ssam_lock(); ++ ++ tid = spdk_ssam_get_tid(); ++ if (tid == SPDK_INVALID_TID) { ++ spdk_ssam_unlock(); ++ return -EINVAL; ++ } ++ ++ info->tid = tid; ++ info->backend = &g_ssam_scsi_session_backend; ++ info->session_ctx_size = session_ctx_size; ++ strncpy(info->type_name, SPDK_SESSION_TYPE_SCSI, SPDK_SESSION_TYPE_MAX_LEN); ++ rc = spdk_ssam_session_register(info, &smsession); ++ if (rc != 0) { ++ spdk_ssam_unlock(); ++ return rc; ++ } ++ ++ ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ ssmsession->registered = true; ++ ssmsession->dbdf = strdup(info->dbdf); ++ if (ssmsession->dbdf == NULL) { ++ spdk_ssam_session_unregister(smsession); ++ spdk_ssam_unlock(); ++ return -EINVAL; ++ } ++ ++ rc = ssam_scsi_session_connect(smsession, info->queues); ++ if (rc != 0) { ++ ssam_session_unreg_response_cb(smsession); ++ spdk_ssam_session_unregister(smsession); ++ spdk_ssam_unlock(); ++ return -EINVAL; ++ } ++ ++ spdk_ssam_unlock(); ++ ++ return 0; ++} ++ ++static int ++spdk_ssam_get_scsi_tgt_num(struct spdk_ssam_scsi_session *ssmsession, int *scsi_tgt_num_out) ++{ ++ int scsi_tgt_num = *scsi_tgt_num_out; ++ if (scsi_tgt_num < 0) { ++ for (scsi_tgt_num = 0; scsi_tgt_num < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; scsi_tgt_num++) { ++ if (ssmsession->scsi_dev_state[scsi_tgt_num].dev == NULL) { ++ break; ++ } ++ } ++ ++ if (scsi_tgt_num == SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("%s: all SCSI target slots are already in use.\n", ssmsession->smsession.name); ++ return -ENOSPC; ++ } ++ } else { ++ if (scsi_tgt_num >= SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) { ++ SPDK_ERRLOG("%s: SCSI target number is too big (got %d, max %d)\n", ++ ssmsession->smsession.name, scsi_tgt_num, SPDK_SSAM_SCSI_CTRLR_MAX_DEVS - 1); ++ return -EINVAL; ++ } ++ } ++ *scsi_tgt_num_out = scsi_tgt_num; ++ return 0; ++} ++ ++static int ssam_scsi_dev_param_changed(struct spdk_ssam_scsi_session *ssmsession, ++ unsigned scsi_tgt_num) ++{ ++ struct spdk_scsi_dev_ssam_state *state = &ssmsession->scsi_dev_state[scsi_tgt_num]; ++ ++ if (state->dev == NULL) { ++ return 0; ++ } ++ int rc = ssam_scsi_send_event(&ssmsession->smsession, 
scsi_tgt_num, VIRTIO_SCSI_T_PARAM_CHANGE, 0x2a | (0x09 << 0x8));
++ if (rc != 0) {
++ SPDK_ERRLOG("%s: tgt %d resize send action failed\n", ssmsession->smsession.name, scsi_tgt_num);
++ return rc;
++ }
++
++ return 0;
++}
++
++static unsigned
++ssam_get_scsi_dev_num(const struct spdk_ssam_scsi_session *ssmsession,
++ const struct spdk_scsi_lun *lun)
++{
++ const struct spdk_scsi_dev *scsi_dev;
++ unsigned scsi_dev_num;
++
++ scsi_dev = spdk_scsi_lun_get_dev(lun);
++ for (scsi_dev_num = 0; scsi_dev_num < SPDK_SSAM_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) {
++ if (ssmsession->scsi_dev_state[scsi_dev_num].dev == scsi_dev) {
++ break;
++ }
++ }
++ return scsi_dev_num;
++}
++
++static void
++ssam_scsi_lun_resize(const struct spdk_scsi_lun *lun, void *arg)
++{
++ struct spdk_ssam_scsi_session *ssmsession = arg;
++ unsigned scsi_dev_num;
++
++ scsi_dev_num = ssam_get_scsi_dev_num(ssmsession, lun);
++ if (scsi_dev_num == SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) {
++ /* The entire device has been already removed. */
++ return;
++ }
++
++ (void)ssam_scsi_dev_param_changed(ssmsession, scsi_dev_num);
++}
++
++static void
++ssam_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg)
++{
++ struct ssam_scsi_tgt_hotplug_ctx *ctx;
++ struct spdk_ssam_scsi_session *ssmsession = arg;
++ struct spdk_ssam_send_event_flag send_event_flag = {
++ .need_async = false,
++ .need_rsp = false,
++ };
++ unsigned scsi_dev_num;
++
++ scsi_dev_num = ssam_get_scsi_dev_num(ssmsession, lun);
++ if (scsi_dev_num == SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) {
++ /* The entire device has been already removed. */
++ return;
++ }
++
++ ctx = calloc(1, sizeof(*ctx));
++ if (ctx == NULL) {
++ SPDK_ERRLOG("calloc failed\n");
++ return;
++ }
++
++ ctx->scsi_tgt_num = scsi_dev_num;
++ ssam_send_event_to_session(&ssmsession->smsession, spdk_ssam_scsi_dev_hot_remove_tgt,
++ NULL, send_event_flag, ctx);
++}
++
++static int
++ssam_scsi_session_add_tgt(struct spdk_ssam_session *smsession, void **ctx)
++{
++ struct ssam_add_tgt_ev_ctx *args = (struct ssam_add_tgt_ev_ctx *)(*ctx);
++ unsigned scsi_tgt_num = args->tgt_num;
++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++ int rc;
++
++ rc = spdk_scsi_dev_allocate_io_channels(ssmsession->scsi_dev_state[scsi_tgt_num].dev);
++ if (rc != 0) {
++ SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n",
++ smsession->name, scsi_tgt_num);
++ }
++
++ rc = ssam_scsi_send_event(smsession, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN);
++ if (rc != 0) {
++ SPDK_WARNLOG("%s: send event %d(reason %d) to target %hu failed, ret: %d, host may not boot.\n",
++ smsession->name, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN, scsi_tgt_num, rc);
++ if (rc == -ENOSPC) {
++ spdk_scsi_dev_free_io_channels(ssmsession->scsi_dev_state[scsi_tgt_num].dev);
++ ssam_scsi_destruct_tgt(ssmsession, scsi_tgt_num);
++ return rc;
++ }
++ }
++
++ ssmsession->scsi_dev_state[scsi_tgt_num].status = SSAM_SCSI_DEV_PRESENT;
++ ssmsession->scsi_dev_state[scsi_tgt_num].flight_io = 0;
++
++ return 0;
++}
++
++static void
++ssam_scsi_dev_add_tgt_cpl_cb(struct spdk_ssam_session *smsession, void **ctx)
++{
++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++ struct ssam_add_tgt_ev_ctx *args = (struct ssam_add_tgt_ev_ctx *)(*ctx);
++ unsigned scsi_tgt_num = args->tgt_num;
++ ssmsession->ref++;
++
++ SPDK_NOTICELOG("SCSI controller %s target %u added with bdev %s\n",
++ smsession->name, scsi_tgt_num, args->bdev_name);
++
++ free(args->bdev_name); ++ args->bdev_name = NULL; ++ free(args); ++} ++ ++struct ssam_scsi_session_remove_tgt_arg { ++ struct spdk_ssam_session *smsession; ++ unsigned scsi_tgt_num; ++}; ++ ++static void ++ssam_scsi_session_remove_tgt_cpl(struct spdk_ssam_session *smsession, void **_ctx) ++{ ++ struct ssam_scsi_tgt_hotplug_ctx *ctx = *_ctx; ++ unsigned scsi_tgt_num = ctx->scsi_tgt_num; ++ int rc; ++ rc = ssam_umount_normal(smsession, ssam_scsi_tgtid_to_lunid(scsi_tgt_num)); ++ if (rc != 0) { ++ SPDK_ERRLOG("%s: function umount failed when remove scsi tgt:%s.\n", ++ smsession->name, strerror(-rc)); ++ } ++ free(ctx); ++} ++ ++static int ++ssam_scsi_session_remove_tgt(struct spdk_ssam_session *smsession, void **_ctx) ++{ ++ struct ssam_scsi_tgt_hotplug_ctx *ctx = *_ctx; ++ unsigned scsi_tgt_num = ctx->scsi_tgt_num; ++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession; ++ struct spdk_scsi_dev_ssam_state *state = &ssmsession->scsi_dev_state[scsi_tgt_num]; ++ int rc = 0; ++ ++ if (state->status != SSAM_SCSI_DEV_PRESENT) { ++ SPDK_WARNLOG("%s: SCSI target %u is not present, skip.\n", smsession->name, scsi_tgt_num); ++ rc = -ENODEV; ++ goto out; ++ } ++ ++ if (ssmsession->scsi_dev_state[scsi_tgt_num].flight_io != 0) { ++ SPDK_ERRLOG("%s: SCSI target %u is busy.\n", smsession->name, scsi_tgt_num); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ state->status = SSAM_SCSI_DEV_REMOVING; ++ ++ SPDK_NOTICELOG("%s: target %d is removing\n", smsession->name, scsi_tgt_num); ++ ++ rc = ssam_scsi_send_event(smsession, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, ++ VIRTIO_SCSI_EVT_RESET_REMOVED); ++ if (rc != 0) { ++ SPDK_ERRLOG("%s: scsi send remove event failed\n", smsession->name); ++ if (rc == -ENOSPC) { ++ state->status = SSAM_SCSI_DEV_PRESENT; ++ goto out; ++ } ++ } ++ ++ spdk_scsi_dev_free_io_channels(state->dev); ++ ++ spdk_ssam_send_dev_destroy_msg(smsession, (void *)(uintptr_t)scsi_tgt_num); ++ ++ // free ctx see ssam_scsi_session_remove_tgt_cpl() ++ return rc; ++ ++out: ++ free(ctx); ++ ++ return rc; ++} ++ ++static int ++ssam_scsi_construct_tgt(struct spdk_ssam_scsi_session *ssmsession, int scsi_tgt_num, ++ const char *bdev_name) ++{ ++ struct spdk_scsi_dev_ssam_state *state = NULL; ++ char target_name[SPDK_SCSI_DEV_MAX_NAME] = {0}; ++ int lun_id_list[SSAM_SPDK_SCSI_DEV_MAX_LUN]; ++ const char *bdev_names_list[SSAM_SPDK_SCSI_DEV_MAX_LUN]; ++ int rc; ++ ++ state = &ssmsession->scsi_dev_state[scsi_tgt_num]; ++ if (state->dev != NULL) { ++ SPDK_ERRLOG("%s: SCSI target %u already occupied\n", ssmsession->smsession.name, scsi_tgt_num); ++ return -EEXIST; ++ } ++ ++ (void)snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); ++ lun_id_list[0] = 0; ++ bdev_names_list[0] = (char *)bdev_name; ++ ++ state->status = SSAM_SCSI_DEV_ADDING; ++ rc = ssam_scsi_iostat_construct(ssmsession, scsi_tgt_num, lun_id_list, SSAM_SPDK_SCSI_DEV_MAX_LUN); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ state->dev = spdk_scsi_dev_construct_ext(target_name, bdev_names_list, lun_id_list, ++ SSAM_SPDK_SCSI_DEV_MAX_LUN, ++ SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, ++ ssam_scsi_lun_resize, ssmsession, ++ ssam_scsi_lun_hotremove, ssmsession); ++ if (state->dev == NULL) { ++ SPDK_ERRLOG("%s: couldn't create SCSI target %u using bdev '%s'\n", ++ ssmsession->smsession.name, scsi_tgt_num, bdev_name); ++ rc = -EINVAL; ++ goto dev_fail; ++ } ++ ++ rc = spdk_scsi_dev_add_port(state->dev, 0, "ssam"); ++ if (rc != 0) { ++ goto port_fail; ++ } ++ ++ return rc; ++ ++port_fail: ++ 
spdk_scsi_dev_destruct(state->dev, NULL, NULL);
++
++dev_fail:
++ ssam_scsi_iostat_destruct(state);
++
++ return rc;
++}
++
++static void
++ssam_scsi_destruct_tgt(struct spdk_ssam_scsi_session *ssmsession, int scsi_tgt_num)
++{
++ struct spdk_scsi_dev_ssam_state *state = NULL;
++ state = &ssmsession->scsi_dev_state[scsi_tgt_num];
++
++ if (state->dev) {
++ spdk_scsi_dev_delete_port(state->dev, 0);
++ spdk_scsi_dev_destruct(state->dev, NULL, NULL);
++ state->dev = NULL;
++ }
++ ssam_scsi_iostat_destruct(state);
++
++ state->status = SSAM_SCSI_DEV_EMPTY;
++}
++
++int
++spdk_ssam_scsi_dev_add_tgt(struct spdk_ssam_session *smsession, int scsi_tgt_num,
++ const char *bdev_name)
++{
++ int rc;
++ struct spdk_ssam_scsi_session *ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++ struct ssam_add_tgt_ev_ctx *ctx = calloc(1, sizeof(struct ssam_add_tgt_ev_ctx));
++ struct spdk_ssam_send_event_flag send_event_flag = {
++ .need_async = false,
++ .need_rsp = true,
++ };
++
++ if (ctx == NULL) {
++ SPDK_ERRLOG("calloc ssam_add_tgt_ev_ctx failed\n");
++ return -ENOMEM;
++ }
++
++ if (bdev_name == NULL) {
++ SPDK_ERRLOG("No lun name specified\n");
++ free(ctx);
++ return -EINVAL;
++ }
++
++ ctx->bdev_name = spdk_sprintf_alloc("%s", bdev_name);
++ if (ctx->bdev_name == NULL) {
++ SPDK_ERRLOG("calloc ssam_add_tgt_ev_ctx bdev_name failed\n");
++ free(ctx);
++ return -ENOMEM;
++ }
++
++ rc = spdk_ssam_get_scsi_tgt_num(ssmsession, &scsi_tgt_num);
++ if (rc < 0) {
++ free(ctx->bdev_name);
++ free(ctx);
++ return rc;
++ }
++
++ rc = ssam_mount_normal(smsession, ssam_scsi_tgtid_to_lunid(scsi_tgt_num));
++ if (rc != SSAM_MOUNT_OK) {
++ SPDK_ERRLOG("%s: mount ssam volume failed, tgt id %d\n", smsession->name, scsi_tgt_num);
++ free(ctx->bdev_name);
++ free(ctx);
++ return rc;
++ }
++
++ rc = ssam_scsi_construct_tgt(ssmsession, scsi_tgt_num, bdev_name);
++ if (rc != 0) {
++ free(ctx->bdev_name);
++ free(ctx);
++ return rc;
++ }
++
++ ctx->tgt_num = scsi_tgt_num;
++ rc = ssam_send_event_to_session(&ssmsession->smsession, ssam_scsi_session_add_tgt,
++ ssam_scsi_dev_add_tgt_cpl_cb, send_event_flag, (void *)ctx);
++ if (rc != 0) {
++ ssam_scsi_destruct_tgt(ssmsession, scsi_tgt_num);
++ free(ctx->bdev_name);
++ free(ctx);
++ return rc;
++ }
++
++ SPDK_INFOLOG(ssam_scsi, "%s: added SCSI target %u using bdev '%s'\n",
++ ssmsession->smsession.name, scsi_tgt_num, bdev_name);
++
++ return 0;
++}
++
++static int
++spdk_ssam_scsi_dev_hot_remove_tgt(struct spdk_ssam_session *smsession, void **_ctx)
++{
++ int rc = 0;
++ struct ssam_scsi_tgt_hotplug_ctx *ctx = *_ctx;
++ struct spdk_ssam_scsi_session *ssmsession;
++ ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++ unsigned scsi_tgt_num = ctx->scsi_tgt_num;
++ if (!ssmsession) {
++ SPDK_ERRLOG("invalid SCSI device\n");
++ rc = -EINVAL;
++ goto out;
++ }
++
++ struct spdk_scsi_dev_ssam_state *scsi_dev_state = &ssmsession->scsi_dev_state[scsi_tgt_num];
++ if (scsi_dev_state->dev == NULL) {
++ /* Nothing to do */
++ SPDK_WARNLOG("%s: There is no need to remove scsi target\n", smsession->name);
++ rc = -ENODEV;
++ goto out;
++ }
++
++ if (scsi_dev_state->status != SSAM_SCSI_DEV_PRESENT) {
++ SPDK_INFOLOG(ssam_scsi, "%s: SCSI target %u is being removed\n", smsession->name, scsi_tgt_num);
++ rc = 0;
++ goto out;
++ }
++
++ scsi_dev_state->status = SSAM_SCSI_DEV_REMOVING;
++
++ SPDK_NOTICELOG("%s: target %d is hot removing\n", smsession->name, scsi_tgt_num);
++
++ rc = ssam_scsi_send_event(smsession, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET,
++ VIRTIO_SCSI_EVT_RESET_REMOVED);
++ if (rc != 0) {
++ SPDK_ERRLOG("%s: scsi send remove event failed\n", smsession->name);
++ if (rc == -ENOSPC) {
++ scsi_dev_state->status = SSAM_SCSI_DEV_PRESENT;
++ goto out;
++ }
++ }
++
++ spdk_scsi_dev_free_io_channels(scsi_dev_state->dev);
++
++ spdk_ssam_send_dev_destroy_msg(smsession, (void *)(uintptr_t)scsi_tgt_num);
++
++out:
++ free(ctx);
++ return rc;
++}
++
++int
++spdk_ssam_scsi_dev_remove_tgt(struct spdk_ssam_session *smsession, unsigned scsi_tgt_num,
++ spdk_ssam_session_rsp_fn cb_fn, void *cb_arg)
++{
++ struct spdk_ssam_scsi_session *ssmsession;
++ struct ssam_scsi_tgt_hotplug_ctx *ctx;
++ struct spdk_ssam_send_event_flag send_event_flag = {
++ .need_async = false,
++ .need_rsp = true,
++ };
++
++ if (scsi_tgt_num >= SPDK_SSAM_SCSI_CTRLR_MAX_DEVS) {
++ SPDK_ERRLOG("%s: invalid SCSI target number %d\n", smsession->name, scsi_tgt_num);
++ return -EINVAL;
++ }
++
++ ssmsession = (struct spdk_ssam_scsi_session *)smsession;
++ if (!ssmsession) {
++ SPDK_ERRLOG("Invalid SCSI device when removing a SCSI target.\n");
++ return -EINVAL;
++ }
++
++ ctx = calloc(1, sizeof(*ctx));
++ if (ctx == NULL) {
++ SPDK_ERRLOG("calloc failed\n");
++ return -ENOMEM;
++ }
++
++ ctx->scsi_tgt_num = scsi_tgt_num;
++
++ ssam_send_event_to_session(smsession, ssam_scsi_session_remove_tgt,
++ ssam_scsi_session_remove_tgt_cpl, send_event_flag, ctx);
++
++ return 0;
++}
++
++SPDK_LOG_REGISTER_COMPONENT(ssam_scsi)
+diff --git a/lib/ssam_adapter/Makefile b/lib/ssam_adapter/Makefile
+new file mode 100644
+index 0000000..c76ed1c
+--- /dev/null
++++ b/lib/ssam_adapter/Makefile
+@@ -0,0 +1,48 @@
++#
++# BSD LICENSE
++#
++# Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions
++# are met:
++#
++# * Redistributions of source code must retain the above copyright
++# notice, this list of conditions and the following disclaimer.
++# * Redistributions in binary form must reproduce the above copyright
++# notice, this list of conditions and the following disclaimer in
++# the documentation and/or other materials provided with the
++# distribution.
++# * Neither the name of Intel Corporation nor the names of its
++# contributors may be used to endorse or promote products derived
++# from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++
++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
++
++SO_VER := 1
++SO_MINOR := 0
++
++CFLAGS += -I.
++CFLAGS += $(ENV_CFLAGS) ++ ++C_SRCS = ssam_adapter.c ++ ++LIBNAME = ssam_adapter ++ ++SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_ssam_adapter.map) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/lib/ssam_adapter/spdk_ssam_adapter.h b/lib/ssam_adapter/spdk_ssam_adapter.h +new file mode 100644 +index 0000000..bc0c3ca +--- /dev/null ++++ b/lib/ssam_adapter/spdk_ssam_adapter.h +@@ -0,0 +1,65 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef SPDK_SSAM_ADAPTER_H ++#define SPDK_SSAM_ADAPTER_H ++ ++enum ssam_log_level { ++ /** All messages will be suppressed. 
*/ ++ SSAMLOG_DISABLED = -1, ++ SSAMLOG_ERROR = 0, ++ SSAMLOG_WARN, ++ SSAMLOG_NOTICE, ++ SSAMLOG_INFO, ++ SSAMLOG_DEBUG, ++}; ++ ++struct ssam_log_para { ++ enum ssam_log_level level; ++ int line; ++ char *file; ++ char *func; ++}; ++ ++struct ssam_log_flag { ++ TAILQ_ENTRY(ssam_log_flag) tailq; ++ const char *name; ++ bool enabled; ++}; ++ ++void spdk_adapt_vlog(struct ssam_log_para *input, const char *format, va_list ap); ++void spdk_adapt_log_register_flag(const char *name, struct ssam_log_flag *flag); ++uint64_t spdk_adapt_vtophys(const void *buf, uint64_t *size); ++void *spdk_adapt_dma_malloc(size_t size, size_t align, uint64_t *phys_addr); ++void spdk_adapt_dma_free(void *buf); ++ ++#endif /* SPDK_SSAM_ADAPTER_H */ +diff --git a/lib/ssam_adapter/spdk_ssam_adapter.map b/lib/ssam_adapter/spdk_ssam_adapter.map +new file mode 100644 +index 0000000..679edf2 +--- /dev/null ++++ b/lib/ssam_adapter/spdk_ssam_adapter.map +@@ -0,0 +1,12 @@ ++{ ++ global: ++ ++ # Public functions in spdk_ssam_adapter.h ++ spdk_adapt_vlog; ++ spdk_adapt_log_register_flag; ++ spdk_adapt_vtophys; ++ spdk_adapt_dma_malloc; ++ spdk_adapt_dma_free; ++ ++ local: *; ++}; +\ No newline at end of file +diff --git a/lib/ssam_adapter/ssam_adapter.c b/lib/ssam_adapter/ssam_adapter.c +new file mode 100644 +index 0000000..d3d37a0 +--- /dev/null ++++ b/lib/ssam_adapter/ssam_adapter.c +@@ -0,0 +1,62 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2022. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include "spdk/log.h" ++#include "spdk/stdinc.h" ++#include "spdk/env.h" ++ ++#include "spdk_ssam_adapter.h" ++ ++inline void spdk_adapt_vlog(struct ssam_log_para *input, const char *format, va_list ap) ++{ ++ spdk_vlog((enum spdk_log_level)input->level, input->file, input->line, input->func, format, ap); ++} ++ ++inline uint64_t spdk_adapt_vtophys(const void *buf, uint64_t *size) ++{ ++ return spdk_vtophys(buf, size); ++} ++ ++inline void *spdk_adapt_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) ++{ ++ return spdk_dma_malloc(size, align, phys_addr); ++} ++ ++inline void spdk_adapt_dma_free(void *buf) ++{ ++ spdk_dma_free(buf); ++} ++ ++inline void spdk_adapt_log_register_flag(const char *name, struct ssam_log_flag *flag) ++{ ++ spdk_log_register_flag(name, (struct spdk_log_flag *)flag); ++} +diff --git a/mk/spdk.lib_deps.mk b/mk/spdk.lib_deps.mk +index 834675e..c89eabd 100644 +--- a/mk/spdk.lib_deps.mk ++++ b/mk/spdk.lib_deps.mk +@@ -1,187 +1,188 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES +-# All rights reserved. +-# +- +-# A quick note on organization: +-# +-# Each grouping is independent from itself. it depends only on libraries +-# in the grouping above it. All dependencies are listed alphabetically within +-# groups. The only exception to this is the JSON_LIBS grouping which is a special +-# case since they almost always occur together. +- +-JSON_LIBS := json jsonrpc rpc +- +-DEPDIRS-env_ocf := +-DEPDIRS-log := +-DEPDIRS-rte_vhost := +- +-DEPDIRS-env_dpdk := log util +- +-DEPDIRS-ioat := log +-DEPDIRS-idxd := log util +-DEPDIRS-sock := log $(JSON_LIBS) +-DEPDIRS-util := log +-DEPDIRS-vmd := log util +-DEPDIRS-dma := log +-DEPDIRS-trace_parser := log +-ifeq ($(OS),Linux) +-DEPDIRS-vfio_user := log +-endif +-ifeq ($(CONFIG_VFIO_USER),y) +-DEPDIRS-vfu_tgt := log util thread $(JSON_LIBS) +-endif +- +-DEPDIRS-conf := log util +-DEPDIRS-json := log util +-DEPDIRS-rdma := log util +-DEPDIRS-reduce := log util +-DEPDIRS-thread := log util trace +- +-DEPDIRS-nvme := log sock util trace +-ifeq ($(OS),Linux) +-DEPDIRS-nvme += vfio_user +-endif +-ifeq ($(CONFIG_RDMA),y) +-DEPDIRS-nvme += rdma dma +-endif +- +-DEPDIRS-blob := log util thread dma +-DEPDIRS-accel := log util thread json rpc jsonrpc dma +-DEPDIRS-jsonrpc := log util json +-DEPDIRS-virtio := log util json thread vfio_user +- +-DEPDIRS-lvol := log util blob +-DEPDIRS-rpc := log util json jsonrpc +- +-DEPDIRS-net := log util $(JSON_LIBS) +-DEPDIRS-notify := log util $(JSON_LIBS) +-DEPDIRS-trace := log util $(JSON_LIBS) +- +-DEPDIRS-bdev := log util thread $(JSON_LIBS) notify trace dma +-DEPDIRS-blobfs := log thread blob trace util +-DEPDIRS-event := log util thread $(JSON_LIBS) trace init +-DEPDIRS-init := jsonrpc json log rpc thread util +- +-DEPDIRS-ftl := log util thread bdev trace +-DEPDIRS-nbd := log util thread $(JSON_LIBS) bdev +-ifeq ($(CONFIG_UBLK),y) +-DEPDIRS-ublk := log util thread $(JSON_LIBS) bdev +-endif +-DEPDIRS-nvmf := accel log sock util nvme thread $(JSON_LIBS) trace bdev +-ifeq ($(CONFIG_RDMA),y) +-DEPDIRS-nvmf += rdma +-endif +-ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) +-DEPDIRS-mlx5 = log rdma util +-endif +-DEPDIRS-scsi := log util thread $(JSON_LIBS) trace bdev +- +-DEPDIRS-iscsi := log sock util conf thread $(JSON_LIBS) trace scsi +-DEPDIRS-vhost = log util thread $(JSON_LIBS) bdev scsi +- +-# ------------------------------------------------------------------------ +-# Start module/ directory - 
This section extends the organizational pattern from +-# above. However, it introduces several more groupings which may not strictly follow +-# the ordering pattern above. These are used for convenience and to help quickly +-# determine the unique dependencies of a given module. It is also grouped by directory. +- +-BDEV_DEPS = log util $(JSON_LIBS) bdev +-BDEV_DEPS_THREAD = $(BDEV_DEPS) thread +- +-# module/blob +-DEPDIRS-blob_bdev := log thread bdev +- +-# module/blobfs +-DEPDIRS-blobfs_bdev := $(BDEV_DEPS_THREAD) blob_bdev blobfs +-ifeq ($(CONFIG_FUSE),y) +-DEPDIRS-blobfs_bdev += event +-endif +- +-# module/accel +-DEPDIRS-accel_ioat := log ioat thread jsonrpc rpc accel +-DEPDIRS-accel_dsa := log idxd thread $(JSON_LIBS) accel trace +-DEPDIRS-accel_iaa := log idxd thread $(JSON_LIBS) accel trace +-DEPDIRS-accel_dpdk_cryptodev := log thread $(JSON_LIBS) accel +-DEPDIRS-accel_dpdk_compressdev := log thread $(JSON_LIBS) accel util +- +-ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) +-DEPDIRS-accel_mlx5 := accel thread log mlx5 rdma util +-endif +- +-# module/env_dpdk +-DEPDIRS-env_dpdk_rpc := log $(JSON_LIBS) +- +-# module/sock +-DEPDIRS-sock_posix := log sock util +-DEPDIRS-sock_uring := log sock util +- +-# module/scheduler +-DEPDIRS-scheduler_dynamic := event log thread util json +-ifeq (y,$(DPDK_POWER)) +-DEPDIRS-scheduler_dpdk_governor := event log +-DEPDIRS-scheduler_gscheduler := event log +-endif +- +-# module/bdev +-ifeq ($(OS),Linux) +-DEPDIRS-bdev_ftl := $(BDEV_DEPS) ftl +-endif +-DEPDIRS-bdev_gpt := bdev json log thread util +- +-DEPDIRS-bdev_error := $(BDEV_DEPS) +-DEPDIRS-bdev_lvol := $(BDEV_DEPS) lvol blob blob_bdev +-DEPDIRS-bdev_rpc := $(BDEV_DEPS) +-DEPDIRS-bdev_split := $(BDEV_DEPS) +- +-DEPDIRS-bdev_aio := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_compress := $(BDEV_DEPS_THREAD) reduce accel +-DEPDIRS-bdev_crypto := $(BDEV_DEPS_THREAD) accel +-DEPDIRS-bdev_delay := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_iscsi := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_malloc := $(BDEV_DEPS_THREAD) accel +-DEPDIRS-bdev_null := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_nvme = $(BDEV_DEPS_THREAD) accel nvme trace +-DEPDIRS-bdev_ocf := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_passthru := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_pmem := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_raid := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_rbd := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_uring := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_virtio := $(BDEV_DEPS_THREAD) virtio +-DEPDIRS-bdev_zone_block := $(BDEV_DEPS_THREAD) +-DEPDIRS-bdev_xnvme := $(BDEV_DEPS_THREAD) +- +-# module/event +- +-# module/event/subsystems +-# These depdirs include subsystem interdependencies which +-# are not related to symbols, but are defined directly in +-# the SPDK event subsystem code. 
+-DEPDIRS-event_accel := init accel event_iobuf +-DEPDIRS-event_vmd := init vmd $(JSON_LIBS) log thread util +- +-DEPDIRS-event_bdev := init bdev event_accel event_vmd event_sock event_iobuf +- +-DEPDIRS-event_scheduler := event init json log +- +-DEPDIRS-event_nbd := init nbd event_bdev +-ifeq ($(CONFIG_UBLK),y) +-DEPDIRS-event_ublk := init ublk event_bdev +-endif +-DEPDIRS-event_nvmf := init nvmf event_bdev event_scheduler event_sock thread log bdev util $(JSON_LIBS) +-DEPDIRS-event_scsi := init scsi event_bdev +- +-DEPDIRS-event_iscsi := init iscsi event_scheduler event_scsi event_sock +-DEPDIRS-event_vhost_blk := init vhost +-DEPDIRS-event_vhost_scsi := init vhost event_scheduler event_scsi +-DEPDIRS-event_sock := init sock +-DEPDIRS-event_vfu_tgt := init vfu_tgt +-DEPDIRS-event_iobuf := init log thread util $(JSON_LIBS) +- +-# module/vfu_device +- +-ifeq ($(CONFIG_VFIO_USER),y) +-DEPDIRS-vfu_device := $(BDEV_DEPS_THREAD) scsi vfu_tgt +-endif ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES ++# All rights reserved. ++# ++ ++# A quick note on organization: ++# ++# Each grouping is independent from itself. it depends only on libraries ++# in the grouping above it. All dependencies are listed alphabetically within ++# groups. The only exception to this is the JSON_LIBS grouping which is a special ++# case since they almost always occur together. ++ ++JSON_LIBS := json jsonrpc rpc ++ ++DEPDIRS-env_ocf := ++DEPDIRS-log := ++DEPDIRS-rte_vhost := ++ ++DEPDIRS-env_dpdk := log util ++ ++DEPDIRS-ioat := log ++DEPDIRS-idxd := log util ++DEPDIRS-sock := log $(JSON_LIBS) ++DEPDIRS-util := log ++DEPDIRS-vmd := log util ++DEPDIRS-dma := log ++DEPDIRS-trace_parser := log ++ifeq ($(OS),Linux) ++DEPDIRS-vfio_user := log ++endif ++ifeq ($(CONFIG_VFIO_USER),y) ++DEPDIRS-vfu_tgt := log util thread $(JSON_LIBS) ++endif ++ ++DEPDIRS-conf := log util ++DEPDIRS-json := log util ++DEPDIRS-rdma := log util ++DEPDIRS-reduce := log util ++DEPDIRS-thread := log util trace ++ ++DEPDIRS-nvme := log sock util trace ++ifeq ($(OS),Linux) ++DEPDIRS-nvme += vfio_user ++endif ++ifeq ($(CONFIG_RDMA),y) ++DEPDIRS-nvme += rdma dma ++endif ++ ++DEPDIRS-blob := log util thread dma ++DEPDIRS-accel := log util thread json rpc jsonrpc dma ++DEPDIRS-jsonrpc := log util json ++DEPDIRS-virtio := log util json thread vfio_user ++ ++DEPDIRS-lvol := log util blob ++DEPDIRS-rpc := log util json jsonrpc ++ ++DEPDIRS-net := log util $(JSON_LIBS) ++DEPDIRS-notify := log util $(JSON_LIBS) ++DEPDIRS-trace := log util $(JSON_LIBS) ++ ++DEPDIRS-bdev := log util thread $(JSON_LIBS) notify trace dma ++DEPDIRS-blobfs := log thread blob trace util ++DEPDIRS-event := log util thread $(JSON_LIBS) trace init ++DEPDIRS-init := jsonrpc json log rpc thread util ++ ++DEPDIRS-ftl := log util thread bdev trace ++DEPDIRS-nbd := log util thread $(JSON_LIBS) bdev ++ifeq ($(CONFIG_UBLK),y) ++DEPDIRS-ublk := log util thread $(JSON_LIBS) bdev ++endif ++DEPDIRS-nvmf := accel log sock util nvme thread $(JSON_LIBS) trace bdev ++ifeq ($(CONFIG_RDMA),y) ++DEPDIRS-nvmf += rdma ++endif ++ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) ++DEPDIRS-mlx5 = log rdma util ++endif ++DEPDIRS-scsi := log util thread $(JSON_LIBS) trace bdev ++ ++DEPDIRS-iscsi := log sock util conf thread $(JSON_LIBS) trace scsi ++DEPDIRS-vhost = log util thread $(JSON_LIBS) bdev scsi ++ ++# ------------------------------------------------------------------------ ++# Start module/ directory - This section extends the 
organizational pattern from ++# above. However, it introduces several more groupings which may not strictly follow ++# the ordering pattern above. These are used for convenience and to help quickly ++# determine the unique dependencies of a given module. It is also grouped by directory. ++ ++BDEV_DEPS = log util $(JSON_LIBS) bdev ++BDEV_DEPS_THREAD = $(BDEV_DEPS) thread ++ ++# module/blob ++DEPDIRS-blob_bdev := log thread bdev ++ ++# module/blobfs ++DEPDIRS-blobfs_bdev := $(BDEV_DEPS_THREAD) blob_bdev blobfs ++ifeq ($(CONFIG_FUSE),y) ++DEPDIRS-blobfs_bdev += event ++endif ++ ++# module/accel ++DEPDIRS-accel_ioat := log ioat thread jsonrpc rpc accel ++DEPDIRS-accel_dsa := log idxd thread $(JSON_LIBS) accel trace ++DEPDIRS-accel_iaa := log idxd thread $(JSON_LIBS) accel trace ++DEPDIRS-accel_dpdk_cryptodev := log thread $(JSON_LIBS) accel ++DEPDIRS-accel_dpdk_compressdev := log thread $(JSON_LIBS) accel util ++ ++ifeq ($(CONFIG_RDMA_PROV),mlx5_dv) ++DEPDIRS-accel_mlx5 := accel thread log mlx5 rdma util ++endif ++ ++# module/env_dpdk ++DEPDIRS-env_dpdk_rpc := log $(JSON_LIBS) ++ ++# module/sock ++DEPDIRS-sock_posix := log sock util ++DEPDIRS-sock_uring := log sock util ++ ++# module/scheduler ++DEPDIRS-scheduler_dynamic := event log thread util json ++ifeq (y,$(DPDK_POWER)) ++DEPDIRS-scheduler_dpdk_governor := event log ++DEPDIRS-scheduler_gscheduler := event log ++endif ++ ++# module/bdev ++ifeq ($(OS),Linux) ++DEPDIRS-bdev_ftl := $(BDEV_DEPS) ftl ++endif ++DEPDIRS-bdev_gpt := bdev json log thread util ++ ++DEPDIRS-bdev_error := $(BDEV_DEPS) ++DEPDIRS-bdev_lvol := $(BDEV_DEPS) lvol blob blob_bdev ++DEPDIRS-bdev_rpc := $(BDEV_DEPS) ++DEPDIRS-bdev_split := $(BDEV_DEPS) ++ ++DEPDIRS-bdev_aio := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_compress := $(BDEV_DEPS_THREAD) reduce accel ++DEPDIRS-bdev_crypto := $(BDEV_DEPS_THREAD) accel ++DEPDIRS-bdev_delay := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_iscsi := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_malloc := $(BDEV_DEPS_THREAD) accel ++DEPDIRS-bdev_null := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_nvme = $(BDEV_DEPS_THREAD) accel nvme trace ++DEPDIRS-bdev_ocf := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_passthru := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_pmem := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_raid := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_rbd := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_uring := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_virtio := $(BDEV_DEPS_THREAD) virtio ++DEPDIRS-bdev_zone_block := $(BDEV_DEPS_THREAD) ++DEPDIRS-bdev_xnvme := $(BDEV_DEPS_THREAD) ++ ++# module/event ++ ++# module/event/subsystems ++# These depdirs include subsystem interdependencies which ++# are not related to symbols, but are defined directly in ++# the SPDK event subsystem code. 
++DEPDIRS-event_accel := init accel event_iobuf ++DEPDIRS-event_vmd := init vmd $(JSON_LIBS) log thread util ++ ++DEPDIRS-event_bdev := init bdev event_accel event_vmd event_sock event_iobuf ++ ++DEPDIRS-event_scheduler := event init json log ++ ++DEPDIRS-event_nbd := init nbd event_bdev ++ifeq ($(CONFIG_UBLK),y) ++DEPDIRS-event_ublk := init ublk event_bdev ++endif ++DEPDIRS-event_nvmf := init nvmf event_bdev event_scheduler event_sock thread log bdev util $(JSON_LIBS) ++DEPDIRS-event_scsi := init scsi event_bdev ++ ++DEPDIRS-event_iscsi := init iscsi event_scheduler event_scsi event_sock ++DEPDIRS-event_vhost_blk := init vhost ++DEPDIRS-event_vhost_scsi := init vhost event_scheduler event_scsi ++DEPDIRS-event_sock := init sock ++DEPDIRS-event_vfu_tgt := init vfu_tgt ++DEPDIRS-event_iobuf := init log thread util $(JSON_LIBS) ++DEPDIRS-event_ssam := event init ssam event_scsi ++ ++# module/vfu_device ++ ++ifeq ($(CONFIG_VFIO_USER),y) ++DEPDIRS-vfu_device := $(BDEV_DEPS_THREAD) scsi vfu_tgt ++endif +diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c +index 803b524..ca0b1cb 100644 +--- a/module/bdev/nvme/bdev_nvme.c ++++ b/module/bdev/nvme/bdev_nvme.c +@@ -1,7344 +1,7344 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "bdev_nvme.h" +- +-#include "spdk/accel.h" +-#include "spdk/config.h" +-#include "spdk/endian.h" +-#include "spdk/bdev.h" +-#include "spdk/json.h" +-#include "spdk/likely.h" +-#include "spdk/nvme.h" +-#include "spdk/nvme_ocssd.h" +-#include "spdk/nvme_zns.h" +-#include "spdk/opal.h" +-#include "spdk/thread.h" +-#include "spdk/trace.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +- +-#include "spdk_internal/usdt.h" +-#include "spdk_internal/trace_defs.h" +- +-#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true +-#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) +- +-#define NSID_STR_LEN 10 +- +-static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); +- +-struct nvme_bdev_io { +- /** array of iovecs to transfer. */ +- struct iovec *iovs; +- +- /** Number of iovecs in iovs array. */ +- int iovcnt; +- +- /** Current iovec position. */ +- int iovpos; +- +- /** Offset in current iovec. */ +- uint32_t iov_offset; +- +- /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path +- * being reset in a reset I/O. +- */ +- struct nvme_io_path *io_path; +- +- /** array of iovecs to transfer. */ +- struct iovec *fused_iovs; +- +- /** Number of iovecs in iovs array. */ +- int fused_iovcnt; +- +- /** Current iovec position. */ +- int fused_iovpos; +- +- /** Offset in current iovec. 
*/ +- uint32_t fused_iov_offset; +- +- /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ +- struct spdk_nvme_cpl cpl; +- +- /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ +- struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; +- +- /** Originating thread */ +- struct spdk_thread *orig_thread; +- +- /** Keeps track if first of fused commands was submitted */ +- bool first_fused_submitted; +- +- /** Keeps track if first of fused commands was completed */ +- bool first_fused_completed; +- +- /** Temporary pointer to zone report buffer */ +- struct spdk_nvme_zns_zone_report *zone_report_buf; +- +- /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ +- uint64_t handled_zones; +- +- /** Expiration value in ticks to retry the current I/O. */ +- uint64_t retry_ticks; +- +- /* How many times the current I/O was retried. */ +- int32_t retry_count; +- +- /* Current tsc at submit time. */ +- uint64_t submit_tsc; +-}; +- +-struct nvme_probe_skip_entry { +- struct spdk_nvme_transport_id trid; +- TAILQ_ENTRY(nvme_probe_skip_entry) tailq; +-}; +-/* All the controllers deleted by users via RPC are skipped by hotplug monitor */ +-static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( +- g_skipped_nvme_ctrlrs); +- +-static struct spdk_bdev_nvme_opts g_opts = { +- .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, +- .timeout_us = 0, +- .timeout_admin_us = 0, +- .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, +- .transport_retry_count = 4, +- .arbitration_burst = 0, +- .low_priority_weight = 0, +- .medium_priority_weight = 0, +- .high_priority_weight = 0, +- .nvme_adminq_poll_period_us = 10000ULL, +- .nvme_ioq_poll_period_us = 0, +- .io_queue_requests = 0, +- .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, +- .bdev_retry_count = 3, +- .transport_ack_timeout = 0, +- .ctrlr_loss_timeout_sec = 0, +- .reconnect_delay_sec = 0, +- .fast_io_fail_timeout_sec = 0, +- .disable_auto_failback = false, +- .generate_uuids = false, +- .transport_tos = 0, +- .nvme_error_stat = false, +- .io_path_stat = false, +-}; +- +-#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL +-#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL +- +-static int g_hot_insert_nvme_controller_index = 0; +-static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; +-static bool g_nvme_hotplug_enabled = false; +-struct spdk_thread *g_bdev_nvme_init_thread; +-static struct spdk_poller *g_hotplug_poller; +-static struct spdk_poller *g_hotplug_probe_poller; +-static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; +- +-static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx); +-static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx); +-static int bdev_nvme_library_init(void); +-static void bdev_nvme_library_fini(void); +-static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, +- struct spdk_bdev_io *bdev_io); +-static void bdev_nvme_submit_request(struct spdk_io_channel *ch, +- struct spdk_bdev_io *bdev_io); +-static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); +-static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 
+- void *md, uint64_t lba_count, uint64_t lba); +-static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); +-static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, +- uint64_t zslba, uint32_t flags); +-static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags); +-static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, +- struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, +- int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags); +-static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, +- uint32_t num_zones, struct spdk_bdev_zone_info *info); +-static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, +- enum spdk_bdev_zone_action action); +-static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, +- struct nvme_bdev_io *bio, +- struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); +-static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, +- void *buf, size_t nbytes); +-static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, +- void *buf, size_t nbytes, void *md_buf, size_t md_len); +-static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, +- struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); +-static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); +-static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); +-static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); +-static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); +-static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); +- +-static struct nvme_ns *nvme_ns_alloc(void); +-static void nvme_ns_free(struct nvme_ns *ns); +- +-static int +-nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) +-{ +- return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; +-} +- +-RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); +- +-struct spdk_nvme_qpair * +-bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) +-{ +- struct nvme_ctrlr_channel *ctrlr_ch; +- +- assert(ctrlr_io_ch != NULL); +- +- ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); +- +- return ctrlr_ch->qpair->qpair; +-} +- +-static int +-bdev_nvme_get_ctx_size(void) +-{ +- return sizeof(struct nvme_bdev_io); +-} +- +-static struct spdk_bdev_module nvme_if = { +- .name = "nvme", +- .async_fini = true, +- .module_init = bdev_nvme_library_init, +- .module_fini = bdev_nvme_library_fini, +- .config_json = bdev_nvme_config_json, +- .get_ctx_size = bdev_nvme_get_ctx_size, +- +-}; +-SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) +- +-struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); +-pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; +-bool g_bdev_nvme_module_finish; +- +-struct nvme_bdev_ctrlr * +-nvme_bdev_ctrlr_get_by_name(const char *name) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- +- TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { +- if (strcmp(name, nbdev_ctrlr->name) == 0) { +- break; +- } +- } +- +- return nbdev_ctrlr; +-} +- +-static struct nvme_ctrlr * +-nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, +- const struct spdk_nvme_transport_id *trid) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- +- TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { +- if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { +- break; +- } +- } +- +- return nvme_ctrlr; +-} +- +-static struct nvme_bdev * +-nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) +-{ +- struct nvme_bdev *bdev; +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { +- if (bdev->nsid == nsid) { +- break; +- } +- } +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- return bdev; +-} +- +-struct nvme_ns * +-nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) +-{ +- struct nvme_ns ns; +- +- assert(nsid > 0); +- +- ns.id = nsid; +- return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); +-} +- +-struct nvme_ns * +-nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) +-{ +- return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); +-} +- +-struct nvme_ns * +-nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) +-{ +- if (ns == NULL) { +- return NULL; +- } +- +- return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); +-} +- +-static struct nvme_ctrlr * +-nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct nvme_ctrlr *nvme_ctrlr = NULL; +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { +- nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); +- if (nvme_ctrlr != NULL) { +- break; +- } +- } +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- return nvme_ctrlr; +-} +- +-struct nvme_ctrlr * +-nvme_ctrlr_get_by_name(const char *name) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct nvme_ctrlr *nvme_ctrlr = NULL; +- +- if (name == NULL) { +- return NULL; +- } +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); +- if (nbdev_ctrlr != NULL) { +- nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); +- } +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- return nvme_ctrlr; +-} +- +-void 
+-nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { +- fn(nbdev_ctrlr, ctx); +- } +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +-} +- +-void +-nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) +-{ +- const char *trtype_str; +- const char *adrfam_str; +- +- trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); +- if (trtype_str) { +- spdk_json_write_named_string(w, "trtype", trtype_str); +- } +- +- adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); +- if (adrfam_str) { +- spdk_json_write_named_string(w, "adrfam", adrfam_str); +- } +- +- if (trid->traddr[0] != '\0') { +- spdk_json_write_named_string(w, "traddr", trid->traddr); +- } +- +- if (trid->trsvcid[0] != '\0') { +- spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); +- } +- +- if (trid->subnqn[0] != '\0') { +- spdk_json_write_named_string(w, "subnqn", trid->subnqn); +- } +-} +- +-static void +-nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, +- struct nvme_ctrlr *nvme_ctrlr) +-{ +- SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- +- TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); +- if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- return; +- } +- TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); +- +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); +- +- free(nbdev_ctrlr->name); +- free(nbdev_ctrlr); +-} +- +-static void +-_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct nvme_path_id *path_id, *tmp_path; +- struct nvme_ns *ns, *tmp_ns; +- +- free(nvme_ctrlr->copied_ana_desc); +- spdk_free(nvme_ctrlr->ana_log_page); +- +- if (nvme_ctrlr->opal_dev) { +- spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); +- nvme_ctrlr->opal_dev = NULL; +- } +- +- if (nvme_ctrlr->nbdev_ctrlr) { +- nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); +- } +- +- RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { +- RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); +- nvme_ns_free(ns); +- } +- +- TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { +- TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); +- free(path_id); +- } +- +- pthread_mutex_destroy(&nvme_ctrlr->mutex); +- +- free(nvme_ctrlr); +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); +- spdk_bdev_module_fini_done(); +- return; +- } +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +-} +- +-static int +-nvme_detach_poller(void *arg) +-{ +- struct nvme_ctrlr *nvme_ctrlr = arg; +- int rc; +- +- rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); +- if (rc != -EAGAIN) { +- spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); +- _nvme_ctrlr_delete(nvme_ctrlr); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) +-{ +- int rc; +- +- spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); +- +- /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ +- spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); +- +- /* If we got here, the reset/detach poller cannot be active */ 
+- assert(nvme_ctrlr->reset_detach_poller == NULL); +- nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, +- nvme_ctrlr, 1000); +- if (nvme_ctrlr->reset_detach_poller == NULL) { +- SPDK_ERRLOG("Failed to register detach poller\n"); +- goto error; +- } +- +- rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to detach the NVMe controller\n"); +- goto error; +- } +- +- return; +-error: +- /* We don't have a good way to handle errors here, so just do what we can and delete the +- * controller without detaching the underlying NVMe device. +- */ +- spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); +- _nvme_ctrlr_delete(nvme_ctrlr); +-} +- +-static void +-nvme_ctrlr_unregister_cb(void *io_device) +-{ +- struct nvme_ctrlr *nvme_ctrlr = io_device; +- +- nvme_ctrlr_delete(nvme_ctrlr); +-} +- +-static void +-nvme_ctrlr_unregister(void *ctx) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); +-} +- +-static bool +-nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) +-{ +- if (!nvme_ctrlr->destruct) { +- return false; +- } +- +- if (nvme_ctrlr->ref > 0) { +- return false; +- } +- +- if (nvme_ctrlr->resetting) { +- return false; +- } +- +- if (nvme_ctrlr->ana_log_page_updating) { +- return false; +- } +- +- if (nvme_ctrlr->io_path_cache_clearing) { +- return false; +- } +- +- return true; +-} +- +-static void +-nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) +-{ +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); +- +- assert(nvme_ctrlr->ref > 0); +- nvme_ctrlr->ref--; +- +- if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return; +- } +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); +-} +- +-static void +-bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) +-{ +- nbdev_ch->current_io_path = NULL; +- nbdev_ch->rr_counter = 0; +-} +- +-static struct nvme_io_path * +-_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) +-{ +- struct nvme_io_path *io_path; +- +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- if (io_path->nvme_ns == nvme_ns) { +- break; +- } +- } +- +- return io_path; +-} +- +-static int +-_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) +-{ +- struct nvme_io_path *io_path; +- struct spdk_io_channel *ch; +- struct nvme_ctrlr_channel *ctrlr_ch; +- struct nvme_qpair *nvme_qpair; +- +- io_path = calloc(1, sizeof(*io_path)); +- if (io_path == NULL) { +- SPDK_ERRLOG("Failed to alloc io_path.\n"); +- return -ENOMEM; +- } +- +- if (g_opts.io_path_stat) { +- io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); +- if (io_path->stat == NULL) { +- free(io_path); +- SPDK_ERRLOG("Failed to alloc io_path stat.\n"); +- return -ENOMEM; +- } +- spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); +- } +- +- io_path->nvme_ns = nvme_ns; +- +- ch = spdk_get_io_channel(nvme_ns->ctrlr); +- if (ch == NULL) { +- free(io_path->stat); +- free(io_path); +- SPDK_ERRLOG("Failed to alloc io_channel.\n"); +- return -ENOMEM; +- } +- +- ctrlr_ch = spdk_io_channel_get_ctx(ch); +- +- nvme_qpair = ctrlr_ch->qpair; +- assert(nvme_qpair != NULL); +- +- io_path->qpair = nvme_qpair; +- 
TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); +- +- io_path->nbdev_ch = nbdev_ch; +- STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); +- +- bdev_nvme_clear_current_io_path(nbdev_ch); +- +- return 0; +-} +- +-static void +-_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) +-{ +- struct spdk_io_channel *ch; +- struct nvme_qpair *nvme_qpair; +- struct nvme_ctrlr_channel *ctrlr_ch; +- struct nvme_bdev *nbdev; +- +- nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); +- +- /* Add the statistics to nvme_ns before this path is destroyed. */ +- pthread_mutex_lock(&nbdev->mutex); +- if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { +- spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); +- } +- pthread_mutex_unlock(&nbdev->mutex); +- +- bdev_nvme_clear_current_io_path(nbdev_ch); +- +- STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); +- +- nvme_qpair = io_path->qpair; +- assert(nvme_qpair != NULL); +- +- TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); +- +- ctrlr_ch = nvme_qpair->ctrlr_ch; +- assert(ctrlr_ch != NULL); +- +- ch = spdk_io_channel_from_ctx(ctrlr_ch); +- spdk_put_io_channel(ch); +- +- free(io_path->stat); +- free(io_path); +-} +- +-static void +-_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) +-{ +- struct nvme_io_path *io_path, *tmp_io_path; +- +- STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { +- _bdev_nvme_delete_io_path(nbdev_ch, io_path); +- } +-} +- +-static int +-bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_bdev_channel *nbdev_ch = ctx_buf; +- struct nvme_bdev *nbdev = io_device; +- struct nvme_ns *nvme_ns; +- int rc; +- +- STAILQ_INIT(&nbdev_ch->io_path_list); +- TAILQ_INIT(&nbdev_ch->retry_io_list); +- +- pthread_mutex_lock(&nbdev->mutex); +- +- nbdev_ch->mp_policy = nbdev->mp_policy; +- nbdev_ch->mp_selector = nbdev->mp_selector; +- nbdev_ch->rr_min_io = nbdev->rr_min_io; +- +- TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { +- rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); +- if (rc != 0) { +- pthread_mutex_unlock(&nbdev->mutex); +- +- _bdev_nvme_delete_io_paths(nbdev_ch); +- return rc; +- } +- } +- pthread_mutex_unlock(&nbdev->mutex); +- +- return 0; +-} +- +-/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. +- * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
+- */ +-static inline void +-__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, +- const struct spdk_nvme_cpl *cpl) +-{ +- spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, +- (uintptr_t)bdev_io); +- if (cpl) { +- spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); +- } else { +- spdk_bdev_io_complete(bdev_io, status); +- } +-} +- +-static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); +- +-static void +-bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_bdev_channel *nbdev_ch = ctx_buf; +- +- bdev_nvme_abort_retry_ios(nbdev_ch); +- _bdev_nvme_delete_io_paths(nbdev_ch); +-} +- +-static inline bool +-bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_NVME_ADMIN: +- case SPDK_BDEV_IO_TYPE_ABORT: +- return true; +- default: +- break; +- } +- +- return false; +-} +- +-static inline bool +-nvme_ns_is_accessible(struct nvme_ns *nvme_ns) +-{ +- if (spdk_unlikely(nvme_ns->ana_state_updating)) { +- return false; +- } +- +- switch (nvme_ns->ana_state) { +- case SPDK_NVME_ANA_OPTIMIZED_STATE: +- case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: +- return true; +- default: +- break; +- } +- +- return false; +-} +- +-static inline bool +-nvme_io_path_is_connected(struct nvme_io_path *io_path) +-{ +- if (spdk_unlikely(io_path->qpair->qpair == NULL)) { +- return false; +- } +- +- if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) != +- SPDK_NVME_QPAIR_FAILURE_NONE)) { +- return false; +- } +- +- if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) { +- return false; +- } +- +- if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) != +- SPDK_NVME_QPAIR_FAILURE_NONE) { +- return false; +- } +- +- return true; +-} +- +-static inline bool +-nvme_io_path_is_available(struct nvme_io_path *io_path) +-{ +- if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { +- return false; +- } +- +- if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { +- return false; +- } +- +- return true; +-} +- +-static inline bool +-nvme_io_path_is_failed(struct nvme_io_path *io_path) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- +- nvme_ctrlr = io_path->qpair->ctrlr; +- +- if (nvme_ctrlr->destruct) { +- return true; +- } +- +- if (nvme_ctrlr->fast_io_fail_timedout) { +- return true; +- } +- +- if (nvme_ctrlr->resetting) { +- if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { +- return false; +- } else { +- return true; +- } +- } +- +- if (nvme_ctrlr->reconnect_is_delayed) { +- return false; +- } +- +- if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { +- return true; +- } else { +- return false; +- } +-} +- +-static bool +-nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) +-{ +- if (nvme_ctrlr->destruct) { +- return false; +- } +- +- if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { +- return false; +- } +- +- if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { +- return false; +- } +- +- return true; +-} +- +-/* Simulate circular linked list. 
*/ +-static inline struct nvme_io_path * +-nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) +-{ +- struct nvme_io_path *next_path; +- +- if (prev_path != NULL) { +- next_path = STAILQ_NEXT(prev_path, stailq); +- if (next_path != NULL) { +- return next_path; +- } +- } +- +- return STAILQ_FIRST(&nbdev_ch->io_path_list); +-} +- +-static struct nvme_io_path * +-_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) +-{ +- struct nvme_io_path *io_path, *start, *non_optimized = NULL; +- +- start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); +- +- io_path = start; +- do { +- if (spdk_likely(nvme_io_path_is_connected(io_path) && +- !io_path->nvme_ns->ana_state_updating)) { +- switch (io_path->nvme_ns->ana_state) { +- case SPDK_NVME_ANA_OPTIMIZED_STATE: +- nbdev_ch->current_io_path = io_path; +- return io_path; +- case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: +- if (non_optimized == NULL) { +- non_optimized = io_path; +- } +- break; +- default: +- break; +- } +- } +- io_path = nvme_io_path_get_next(nbdev_ch, io_path); +- } while (io_path != start); +- +- if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { +- /* We come here only if there is no optimized path. Cache even non_optimized +- * path for load balance across multiple non_optimized paths. +- */ +- nbdev_ch->current_io_path = non_optimized; +- } +- +- return non_optimized; +-} +- +-static struct nvme_io_path * +-_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) +-{ +- struct nvme_io_path *io_path; +- struct nvme_io_path *optimized = NULL, *non_optimized = NULL; +- uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; +- uint32_t num_outstanding_reqs; +- +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { +- /* The device is currently resetting. */ +- continue; +- } +- +- if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { +- continue; +- } +- +- num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); +- switch (io_path->nvme_ns->ana_state) { +- case SPDK_NVME_ANA_OPTIMIZED_STATE: +- if (num_outstanding_reqs < opt_min_qd) { +- opt_min_qd = num_outstanding_reqs; +- optimized = io_path; +- } +- break; +- case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: +- if (num_outstanding_reqs < non_opt_min_qd) { +- non_opt_min_qd = num_outstanding_reqs; +- non_optimized = io_path; +- } +- break; +- default: +- break; +- } +- } +- +- /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ +- if (optimized != NULL) { +- return optimized; +- } +- +- return non_optimized; +-} +- +-static inline struct nvme_io_path * +-bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) +-{ +- if (spdk_likely(nbdev_ch->current_io_path != NULL)) { +- if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { +- return nbdev_ch->current_io_path; +- } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { +- if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { +- return nbdev_ch->current_io_path; +- } +- nbdev_ch->rr_counter = 0; +- } +- } +- +- if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || +- nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { +- return _bdev_nvme_find_io_path(nbdev_ch); +- } else { +- return _bdev_nvme_find_io_path_min_qd(nbdev_ch); +- } +-} +- +-/* Return true if there is any io_path whose qpair is active or ctrlr is not failed, +- * or false otherwise. 
+- * +- * If any io_path has an active qpair but find_io_path() returned NULL, its namespace +- * is likely to be non-accessible now but may become accessible. +- * +- * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr +- * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed +- * when starting to reset it but it is set to failed when the reset failed. Hence, if +- * a ctrlr is unfailed, it is likely that it works fine or is resetting. +- */ +-static bool +-any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) +-{ +- struct nvme_io_path *io_path; +- +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- if (io_path->nvme_ns->ana_transition_timedout) { +- continue; +- } +- +- if (nvme_io_path_is_connected(io_path) || +- !nvme_io_path_is_failed(io_path)) { +- return true; +- } +- } +- +- return false; +-} +- +-static void +-bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) +-{ +- struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- struct spdk_io_channel *ch; +- +- if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { +- _bdev_nvme_submit_request(nbdev_ch, bdev_io); +- } else { +- ch = spdk_io_channel_from_ctx(nbdev_ch); +- bdev_nvme_submit_request(ch, bdev_io); +- } +-} +- +-static int +-bdev_nvme_retry_ios(void *arg) +-{ +- struct nvme_bdev_channel *nbdev_ch = arg; +- struct spdk_bdev_io *bdev_io, *tmp_bdev_io; +- struct nvme_bdev_io *bio; +- uint64_t now, delay_us; +- +- now = spdk_get_ticks(); +- +- TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { +- bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- if (bio->retry_ticks > now) { +- break; +- } +- +- TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); +- +- bdev_nvme_retry_io(nbdev_ch, bdev_io); +- } +- +- spdk_poller_unregister(&nbdev_ch->retry_io_poller); +- +- bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); +- if (bdev_io != NULL) { +- bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- +- delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); +- +- nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, +- delay_us); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, +- struct nvme_bdev_io *bio, uint64_t delay_ms) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- struct spdk_bdev_io *tmp_bdev_io; +- struct nvme_bdev_io *tmp_bio; +- +- bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; +- +- TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { +- tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; +- +- if (tmp_bio->retry_ticks <= bio->retry_ticks) { +- TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, +- module_link); +- return; +- } +- } +- +- /* No earlier I/Os were found. This I/O must be the new head. 
*/ +- TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); +- +- spdk_poller_unregister(&nbdev_ch->retry_io_poller); +- +- nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, +- delay_ms * 1000ULL); +-} +- +-static void +-bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) +-{ +- struct spdk_bdev_io *bdev_io, *tmp_io; +- +- TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { +- TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); +- __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); +- } +- +- spdk_poller_unregister(&nbdev_ch->retry_io_poller); +-} +- +-static int +-bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, +- struct nvme_bdev_io *bio_to_abort) +-{ +- struct spdk_bdev_io *bdev_io_to_abort; +- +- TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { +- if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { +- TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); +- __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); +- return 0; +- } +- } +- +- return -ENOENT; +-} +- +-static void +-bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev *nbdev; +- uint16_t sct, sc; +- +- assert(spdk_nvme_cpl_is_error(cpl)); +- +- nbdev = bdev_io->bdev->ctxt; +- +- if (nbdev->err_stat == NULL) { +- return; +- } +- +- sct = cpl->status.sct; +- sc = cpl->status.sc; +- +- pthread_mutex_lock(&nbdev->mutex); +- +- nbdev->err_stat->status_type[sct]++; +- switch (sct) { +- case SPDK_NVME_SCT_GENERIC: +- case SPDK_NVME_SCT_COMMAND_SPECIFIC: +- case SPDK_NVME_SCT_MEDIA_ERROR: +- case SPDK_NVME_SCT_PATH: +- nbdev->err_stat->status[sct][sc]++; +- break; +- default: +- break; +- } +- +- pthread_mutex_unlock(&nbdev->mutex); +-} +- +-static inline void +-bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- uint64_t num_blocks = bdev_io->u.bdev.num_blocks; +- uint32_t blocklen = bdev_io->bdev->blocklen; +- struct spdk_bdev_io_stat *stat; +- uint64_t tsc_diff; +- +- if (bio->io_path->stat == NULL) { +- return; +- } +- +- tsc_diff = spdk_get_ticks() - bio->submit_tsc; +- stat = bio->io_path->stat; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- stat->bytes_read += num_blocks * blocklen; +- stat->num_read_ops++; +- stat->read_latency_ticks += tsc_diff; +- if (stat->max_read_latency_ticks < tsc_diff) { +- stat->max_read_latency_ticks = tsc_diff; +- } +- if (stat->min_read_latency_ticks > tsc_diff) { +- stat->min_read_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- stat->bytes_written += num_blocks * blocklen; +- stat->num_write_ops++; +- stat->write_latency_ticks += tsc_diff; +- if (stat->max_write_latency_ticks < tsc_diff) { +- stat->max_write_latency_ticks = tsc_diff; +- } +- if (stat->min_write_latency_ticks > tsc_diff) { +- stat->min_write_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- stat->bytes_unmapped += num_blocks * blocklen; +- stat->num_unmap_ops++; +- stat->unmap_latency_ticks += tsc_diff; +- if (stat->max_unmap_latency_ticks < tsc_diff) { +- stat->max_unmap_latency_ticks = tsc_diff; +- } +- if (stat->min_unmap_latency_ticks > tsc_diff) { +- stat->min_unmap_latency_ticks = tsc_diff; +- } +- break; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- /* Track the data in the start phase only */ +- if 
(!bdev_io->u.bdev.zcopy.start) { +- break; +- } +- if (bdev_io->u.bdev.zcopy.populate) { +- stat->bytes_read += num_blocks * blocklen; +- stat->num_read_ops++; +- stat->read_latency_ticks += tsc_diff; +- if (stat->max_read_latency_ticks < tsc_diff) { +- stat->max_read_latency_ticks = tsc_diff; +- } +- if (stat->min_read_latency_ticks > tsc_diff) { +- stat->min_read_latency_ticks = tsc_diff; +- } +- } else { +- stat->bytes_written += num_blocks * blocklen; +- stat->num_write_ops++; +- stat->write_latency_ticks += tsc_diff; +- if (stat->max_write_latency_ticks < tsc_diff) { +- stat->max_write_latency_ticks = tsc_diff; +- } +- if (stat->min_write_latency_ticks > tsc_diff) { +- stat->min_write_latency_ticks = tsc_diff; +- } +- } +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- stat->bytes_copied += num_blocks * blocklen; +- stat->num_copy_ops++; +- stat->copy_latency_ticks += tsc_diff; +- if (stat->max_copy_latency_ticks < tsc_diff) { +- stat->max_copy_latency_ticks = tsc_diff; +- } +- if (stat->min_copy_latency_ticks > tsc_diff) { +- stat->min_copy_latency_ticks = tsc_diff; +- } +- break; +- default: +- break; +- } +-} +- +-static inline void +-bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, +- const struct spdk_nvme_cpl *cpl) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- struct nvme_bdev_channel *nbdev_ch; +- struct nvme_io_path *io_path; +- struct nvme_ctrlr *nvme_ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- uint64_t delay_ms; +- +- assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); +- +- if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { +- bdev_nvme_update_io_path_stat(bio); +- goto complete; +- } +- +- /* Update error counts before deciding if retry is needed. +- * Hence, error counts may be more than the number of I/O errors. 
+- */ +- bdev_nvme_update_nvme_error_stat(bdev_io, cpl); +- +- if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || +- (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { +- goto complete; +- } +- +- nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); +- +- assert(bio->io_path != NULL); +- io_path = bio->io_path; +- +- nvme_ctrlr = io_path->qpair->ctrlr; +- +- if (spdk_nvme_cpl_is_path_error(cpl) || +- spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || +- !nvme_io_path_is_available(io_path) || +- !nvme_ctrlr_is_available(nvme_ctrlr)) { +- bdev_nvme_clear_current_io_path(nbdev_ch); +- bio->io_path = NULL; +- if (spdk_nvme_cpl_is_ana_error(cpl)) { +- if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { +- io_path->nvme_ns->ana_state_updating = true; +- } +- } +- if (!any_io_path_may_become_available(nbdev_ch)) { +- goto complete; +- } +- delay_ms = 0; +- } else { +- bio->retry_count++; +- +- cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); +- +- if (cpl->status.crd != 0) { +- delay_ms = cdata->crdt[cpl->status.crd] * 100; +- } else { +- delay_ms = 0; +- } +- } +- +- bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); +- return; +- +-complete: +- bio->retry_count = 0; +- bio->submit_tsc = 0; +- __bdev_nvme_io_complete(bdev_io, 0, cpl); +-} +- +-static inline void +-bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- struct nvme_bdev_channel *nbdev_ch; +- enum spdk_bdev_io_status io_status; +- +- switch (rc) { +- case 0: +- io_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- break; +- case -ENOMEM: +- io_status = SPDK_BDEV_IO_STATUS_NOMEM; +- break; +- case -ENXIO: +- nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); +- +- bdev_nvme_clear_current_io_path(nbdev_ch); +- bio->io_path = NULL; +- +- if (any_io_path_may_become_available(nbdev_ch)) { +- bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); +- return; +- } +- +- /* fallthrough */ +- default: +- io_status = SPDK_BDEV_IO_STATUS_FAILED; +- break; +- } +- +- bio->retry_count = 0; +- bio->submit_tsc = 0; +- __bdev_nvme_io_complete(bdev_io, io_status, NULL); +-} +- +-static inline void +-bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- enum spdk_bdev_io_status io_status; +- +- switch (rc) { +- case 0: +- io_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- break; +- case -ENOMEM: +- io_status = SPDK_BDEV_IO_STATUS_NOMEM; +- break; +- case -ENXIO: +- /* fallthrough */ +- default: +- io_status = SPDK_BDEV_IO_STATUS_FAILED; +- break; +- } +- +- __bdev_nvme_io_complete(bdev_io, io_status, NULL); +-} +- +-static void +-bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- assert(nvme_ctrlr->io_path_cache_clearing == true); +- nvme_ctrlr->io_path_cache_clearing = false; +- +- if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return; +- } +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- nvme_ctrlr_unregister(nvme_ctrlr); +-} +- +-static void +-_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) +-{ +- struct nvme_io_path *io_path; +- +- TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { +- bdev_nvme_clear_current_io_path(io_path->nbdev_ch); +- } +-} +- +-static void 
+-bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); +- +- assert(ctrlr_ch->qpair != NULL); +- +- _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) +-{ +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (!nvme_ctrlr_is_available(nvme_ctrlr) || +- nvme_ctrlr->io_path_cache_clearing) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return; +- } +- +- nvme_ctrlr->io_path_cache_clearing = true; +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_for_each_channel(nvme_ctrlr, +- bdev_nvme_clear_io_path_cache, +- NULL, +- bdev_nvme_clear_io_path_caches_done); +-} +- +-static struct nvme_qpair * +-nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +-{ +- struct nvme_qpair *nvme_qpair; +- +- TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { +- if (nvme_qpair->qpair == qpair) { +- break; +- } +- } +- +- return nvme_qpair; +-} +- +-static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); +- +-static void +-bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) +-{ +- struct nvme_poll_group *group = poll_group_ctx; +- struct nvme_qpair *nvme_qpair; +- struct nvme_ctrlr_channel *ctrlr_ch; +- +- nvme_qpair = nvme_poll_group_get_qpair(group, qpair); +- if (nvme_qpair == NULL) { +- return; +- } +- +- if (nvme_qpair->qpair != NULL) { +- spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); +- nvme_qpair->qpair = NULL; +- } +- +- _bdev_nvme_clear_io_path_cache(nvme_qpair); +- +- ctrlr_ch = nvme_qpair->ctrlr_ch; +- +- if (ctrlr_ch != NULL) { +- if (ctrlr_ch->reset_iter != NULL) { +- /* If we are already in a full reset sequence, we do not have +- * to restart it. Just move to the next ctrlr_channel. +- */ +- SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", +- qpair); +- spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); +- ctrlr_ch->reset_iter = NULL; +- } else { +- /* qpair was disconnected unexpectedly. Reset controller for recovery. */ +- SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); +- bdev_nvme_failover(nvme_qpair->ctrlr, false); +- } +- } else { +- /* In this case, ctrlr_channel is already deleted. */ +- SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); +- nvme_qpair_delete(nvme_qpair); +- } +-} +- +-static void +-bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) +-{ +- struct nvme_qpair *nvme_qpair; +- +- TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { +- if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { +- continue; +- } +- +- if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != +- SPDK_NVME_QPAIR_FAILURE_NONE) { +- _bdev_nvme_clear_io_path_cache(nvme_qpair); +- } +- } +-} +- +-static int +-bdev_nvme_poll(void *arg) +-{ +- struct nvme_poll_group *group = arg; +- int64_t num_completions; +- +- if (group->collect_spin_stat && group->start_ticks == 0) { +- group->start_ticks = spdk_get_ticks(); +- } +- +- num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, +- bdev_nvme_disconnected_qpair_cb); +- if (group->collect_spin_stat) { +- if (num_completions > 0) { +- if (group->end_ticks != 0) { +- group->spin_ticks += (group->end_ticks - group->start_ticks); +- group->end_ticks = 0; +- } +- group->start_ticks = 0; +- } else { +- group->end_ticks = spdk_get_ticks(); +- } +- } +- +- if (spdk_unlikely(num_completions < 0)) { +- bdev_nvme_check_io_qpairs(group); +- } +- +- return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +-} +- +-static int bdev_nvme_poll_adminq(void *arg); +- +-static void +-bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) +-{ +- spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); +- +- nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, +- nvme_ctrlr, new_period_us); +-} +- +-static int +-bdev_nvme_poll_adminq(void *arg) +-{ +- int32_t rc; +- struct nvme_ctrlr *nvme_ctrlr = arg; +- nvme_ctrlr_disconnected_cb disconnected_cb; +- +- assert(nvme_ctrlr != NULL); +- +- rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); +- if (rc < 0) { +- disconnected_cb = nvme_ctrlr->disconnected_cb; +- nvme_ctrlr->disconnected_cb = NULL; +- +- if (rc == -ENXIO && disconnected_cb != NULL) { +- bdev_nvme_change_adminq_poll_period(nvme_ctrlr, +- g_opts.nvme_adminq_poll_period_us); +- disconnected_cb(nvme_ctrlr); +- } else { +- bdev_nvme_failover(nvme_ctrlr, false); +- } +- } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != +- SPDK_NVME_QPAIR_FAILURE_NONE) { +- bdev_nvme_clear_io_path_caches(nvme_ctrlr); +- } +- +- return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; +-} +- +-static void +-_bdev_nvme_unregister_dev_cb(void *io_device) +-{ +- struct nvme_bdev *nvme_disk = io_device; +- +- free(nvme_disk->disk.name); +- free(nvme_disk->err_stat); +- free(nvme_disk); +-} +- +-static int +-bdev_nvme_destruct(void *ctx) +-{ +- struct nvme_bdev *nvme_disk = ctx; +- struct nvme_ns *nvme_ns, *tmp_nvme_ns; +- +- SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); +- +- TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { +- pthread_mutex_lock(&nvme_ns->ctrlr->mutex); +- +- nvme_ns->bdev = NULL; +- +- assert(nvme_ns->id > 0); +- +- if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { +- pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); +- +- nvme_ctrlr_release(nvme_ns->ctrlr); +- nvme_ns_free(nvme_ns); +- } else { +- pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); +- } +- } +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- +- spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); +- +- return 0; +-} +- +-static int +-bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- struct spdk_nvme_io_qpair_opts opts; +- struct spdk_nvme_qpair *qpair; +- int rc; +- +- nvme_ctrlr = nvme_qpair->ctrlr; +- +- spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); +- opts.delay_cmd_submit = g_opts.delay_cmd_submit; +- opts.create_only = true; +- opts.async_mode = true; +- opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); +- g_opts.io_queue_requests = opts.io_queue_requests; +- +- qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); +- if (qpair == NULL) { +- return -1; +- } +- +- SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, +- spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); +- +- assert(nvme_qpair->group != NULL); +- +- rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); +- goto err; +- } +- +- rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to connect I/O qpair.\n"); +- goto err; +- } +- +- nvme_qpair->qpair = qpair; +- +- if (!g_opts.disable_auto_failback) { +- _bdev_nvme_clear_io_path_cache(nvme_qpair); +- } +- +- return 0; +- +-err: +- spdk_nvme_ctrlr_free_io_qpair(qpair); +- +- return rc; +-} +- +-static void +-bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); +- enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; +- struct spdk_bdev_io *bdev_io; +- +- if (spdk_io_channel_iter_get_ctx(i) != NULL) { +- status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- +- while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { +- bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); +- TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); +- __bdev_nvme_io_complete(bdev_io, status, NULL); +- } +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) +-{ +- struct nvme_path_id *path_id, *next_path; +- int rc __attribute__((unused)); +- +- path_id = TAILQ_FIRST(&nvme_ctrlr->trids); +- assert(path_id); +- 
assert(path_id == nvme_ctrlr->active_path_id); +- next_path = TAILQ_NEXT(path_id, link); +- +- path_id->is_failed = true; +- +- if (next_path) { +- assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); +- +- SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, +- path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); +- +- spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); +- nvme_ctrlr->active_path_id = next_path; +- rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); +- assert(rc == 0); +- TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); +- if (!remove) { +- /** Shuffle the old trid to the end of the list and use the new one. +- * Allows for round robin through multiple connections. +- */ +- TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); +- } else { +- free(path_id); +- } +- } +-} +- +-static bool +-bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) +-{ +- int32_t elapsed; +- +- if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || +- nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { +- return false; +- } +- +- elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); +- if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { +- return true; +- } else { +- return false; +- } +-} +- +-static bool +-bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) +-{ +- uint32_t elapsed; +- +- if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { +- return false; +- } +- +- elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); +- if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { +- return true; +- } else { +- return false; +- } +-} +- +-static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); +- +-static void +-nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) +-{ +- int rc; +- +- rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); +- if (rc != 0) { +- /* Disconnect fails if ctrlr is already resetting or removed. In this case, +- * fail the reset sequence immediately. +- */ +- bdev_nvme_reset_complete(nvme_ctrlr, false); +- return; +- } +- +- /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. +- * Set callback here to execute the specified operation after ctrlr is really disconnected. +- */ +- assert(nvme_ctrlr->disconnected_cb == NULL); +- nvme_ctrlr->disconnected_cb = cb_fn; +- +- /* During disconnection, reduce the period to poll adminq more often. */ +- bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); +-} +- +-enum bdev_nvme_op_after_reset { +- OP_NONE, +- OP_COMPLETE_PENDING_DESTRUCT, +- OP_DESTRUCT, +- OP_DELAYED_RECONNECT, +-}; +- +-typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; +- +-static _bdev_nvme_op_after_reset +-bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) +-{ +- if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { +- /* Complete pending destruct after reset completes. 
*/ +- return OP_COMPLETE_PENDING_DESTRUCT; +- } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { +- nvme_ctrlr->reset_start_tsc = 0; +- return OP_NONE; +- } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { +- return OP_DESTRUCT; +- } else { +- if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { +- nvme_ctrlr->fast_io_fail_timedout = true; +- } +- bdev_nvme_failover_trid(nvme_ctrlr, false); +- return OP_DELAYED_RECONNECT; +- } +-} +- +-static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); +-static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); +- +-static int +-bdev_nvme_reconnect_delay_timer_expired(void *ctx) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); +- +- assert(nvme_ctrlr->reconnect_is_delayed == true); +- nvme_ctrlr->reconnect_is_delayed = false; +- +- if (nvme_ctrlr->destruct) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return SPDK_POLLER_BUSY; +- } +- +- assert(nvme_ctrlr->resetting == false); +- nvme_ctrlr->resetting = true; +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); +- +- bdev_nvme_reconnect_ctrlr(nvme_ctrlr); +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) +-{ +- spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); +- +- assert(nvme_ctrlr->reconnect_is_delayed == false); +- nvme_ctrlr->reconnect_is_delayed = true; +- +- assert(nvme_ctrlr->reconnect_delay_timer == NULL); +- nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, +- nvme_ctrlr, +- nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); +-} +- +-static void +-_bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); +- bool success = spdk_io_channel_iter_get_ctx(i) == NULL; +- struct nvme_path_id *path_id; +- bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; +- void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; +- enum bdev_nvme_op_after_reset op_after_reset; +- +- assert(nvme_ctrlr->thread == spdk_get_thread()); +- +- nvme_ctrlr->reset_cb_fn = NULL; +- nvme_ctrlr->reset_cb_arg = NULL; +- +- if (!success) { +- SPDK_ERRLOG("Resetting controller failed.\n"); +- } else { +- SPDK_NOTICELOG("Resetting controller successful.\n"); +- } +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- nvme_ctrlr->resetting = false; +- +- path_id = TAILQ_FIRST(&nvme_ctrlr->trids); +- assert(path_id != NULL); +- assert(path_id == nvme_ctrlr->active_path_id); +- +- path_id->is_failed = !success; +- +- op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- if (reset_cb_fn) { +- reset_cb_fn(reset_cb_arg, success); +- } +- +- switch (op_after_reset) { +- case OP_COMPLETE_PENDING_DESTRUCT: +- nvme_ctrlr_unregister(nvme_ctrlr); +- break; +- case OP_DESTRUCT: +- _bdev_nvme_delete(nvme_ctrlr, false); +- break; +- case OP_DELAYED_RECONNECT: +- nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); +- break; +- default: +- break; +- } +-} +- +-static void +-bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) +-{ +- /* Make sure we clear any pending resets before returning. 
*/ +- spdk_for_each_channel(nvme_ctrlr, +- bdev_nvme_complete_pending_resets, +- success ? NULL : (void *)0x1, +- _bdev_nvme_reset_complete); +-} +- +-static void +-bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); +- +- bdev_nvme_reset_complete(nvme_ctrlr, false); +-} +- +-static void +-bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); +- struct nvme_qpair *nvme_qpair; +- +- nvme_qpair = ctrlr_ch->qpair; +- assert(nvme_qpair != NULL); +- +- _bdev_nvme_clear_io_path_cache(nvme_qpair); +- +- if (nvme_qpair->qpair != NULL) { +- spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); +- +- /* The current full reset sequence will move to the next +- * ctrlr_channel after the qpair is actually disconnected. +- */ +- assert(ctrlr_ch->reset_iter == NULL); +- ctrlr_ch->reset_iter = i; +- } else { +- spdk_for_each_channel_continue(i, 0); +- } +-} +- +-static void +-bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); +- +- if (status == 0) { +- bdev_nvme_reset_complete(nvme_ctrlr, true); +- } else { +- /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ +- spdk_for_each_channel(nvme_ctrlr, +- bdev_nvme_reset_destroy_qpair, +- NULL, +- bdev_nvme_reset_create_qpairs_failed); +- } +-} +- +-static void +-bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); +- int rc; +- +- rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); +- +- spdk_for_each_channel_continue(i, rc); +-} +- +-static int +-bdev_nvme_reconnect_ctrlr_poll(void *arg) +-{ +- struct nvme_ctrlr *nvme_ctrlr = arg; +- int rc = -ETIMEDOUT; +- +- if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { +- rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); +- if (rc == -EAGAIN) { +- return SPDK_POLLER_BUSY; +- } +- } +- +- spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); +- if (rc == 0) { +- /* Recreate all of the I/O queue pairs */ +- spdk_for_each_channel(nvme_ctrlr, +- bdev_nvme_reset_create_qpair, +- NULL, +- bdev_nvme_reset_create_qpairs_done); +- } else { +- bdev_nvme_reset_complete(nvme_ctrlr, false); +- } +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) +-{ +- spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); +- +- SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); +- assert(nvme_ctrlr->reset_detach_poller == NULL); +- nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, +- nvme_ctrlr, 0); +-} +- +-static void +-bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); +- +- SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); +- assert(status == 0); +- +- if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { +- bdev_nvme_reconnect_ctrlr(nvme_ctrlr); +- } else { +- nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); +- } +-} +- +-static void +-bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) +-{ +- spdk_for_each_channel(nvme_ctrlr, +- 
bdev_nvme_reset_destroy_qpair, +- NULL, +- bdev_nvme_reset_ctrlr); +-} +- +-static void +-_bdev_nvme_reset(void *ctx) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- assert(nvme_ctrlr->resetting == true); +- assert(nvme_ctrlr->thread == spdk_get_thread()); +- +- if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { +- nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); +- } else { +- bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); +- } +-} +- +-static int +-bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) +-{ +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (nvme_ctrlr->destruct) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return -ENXIO; +- } +- +- if (nvme_ctrlr->resetting) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); +- return -EBUSY; +- } +- +- if (nvme_ctrlr->reconnect_is_delayed) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- SPDK_NOTICELOG("Reconnect is already scheduled.\n"); +- return -EBUSY; +- } +- +- nvme_ctrlr->resetting = true; +- +- assert(nvme_ctrlr->reset_start_tsc == 0); +- nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); +- return 0; +-} +- +-int +-bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) +-{ +- int rc; +- +- rc = bdev_nvme_reset(nvme_ctrlr); +- if (rc == 0) { +- nvme_ctrlr->reset_cb_fn = cb_fn; +- nvme_ctrlr->reset_cb_arg = cb_arg; +- } +- return rc; +-} +- +-static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); +- +-static void +-bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) +-{ +- enum spdk_bdev_io_status io_status; +- +- if (bio->cpl.cdw0 == 0) { +- io_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } else { +- io_status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- +- __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); +-} +- +-static void +-_bdev_nvme_reset_io_continue(void *ctx) +-{ +- struct nvme_bdev_io *bio = ctx; +- struct nvme_io_path *prev_io_path, *next_io_path; +- int rc; +- +- prev_io_path = bio->io_path; +- bio->io_path = NULL; +- +- if (bio->cpl.cdw0 != 0) { +- goto complete; +- } +- +- next_io_path = STAILQ_NEXT(prev_io_path, stailq); +- if (next_io_path == NULL) { +- goto complete; +- } +- +- rc = _bdev_nvme_reset_io(next_io_path, bio); +- if (rc == 0) { +- return; +- } +- +- bio->cpl.cdw0 = 1; +- +-complete: +- bdev_nvme_reset_io_complete(bio); +-} +- +-static void +-bdev_nvme_reset_io_continue(void *cb_arg, bool success) +-{ +- struct nvme_bdev_io *bio = cb_arg; +- +- bio->cpl.cdw0 = !success; +- +- spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); +-} +- +-static int +-_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) +-{ +- struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; +- struct nvme_ctrlr_channel *ctrlr_ch; +- struct spdk_bdev_io *bdev_io; +- int rc; +- +- rc = bdev_nvme_reset(nvme_ctrlr); +- if (rc == 0) { +- assert(bio->io_path == NULL); +- bio->io_path = io_path; +- +- assert(nvme_ctrlr->reset_cb_fn == NULL); +- assert(nvme_ctrlr->reset_cb_arg == NULL); +- nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; +- nvme_ctrlr->reset_cb_arg = bio; +- } else if (rc == -EBUSY) { +- ctrlr_ch = io_path->qpair->ctrlr_ch; +- assert(ctrlr_ch != NULL); +- /* +- * Reset call is queued only if it is from the app framework. 
This is on purpose so that +- * we don't interfere with the app framework reset strategy. i.e. we are deferring to the +- * upper level. If they are in the middle of a reset, we won't try to schedule another one. +- */ +- bdev_io = spdk_bdev_io_from_ctx(bio); +- TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); +- } else { +- return rc; +- } +- +- return 0; +-} +- +-static void +-bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) +-{ +- struct nvme_io_path *io_path; +- int rc; +- +- bio->cpl.cdw0 = 0; +- bio->orig_thread = spdk_get_thread(); +- +- /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. +- * +- * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. +- * This will be done in the following patches. +- */ +- io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); +- assert(io_path != NULL); +- +- rc = _bdev_nvme_reset_io(io_path, bio); +- if (rc != 0) { +- bio->cpl.cdw0 = 1; +- bdev_nvme_reset_io_complete(bio); +- } +-} +- +-static int +-bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) +-{ +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (nvme_ctrlr->destruct) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- /* Don't bother resetting if the controller is in the process of being destructed. */ +- return -ENXIO; +- } +- +- if (nvme_ctrlr->resetting) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); +- return -EBUSY; +- } +- +- bdev_nvme_failover_trid(nvme_ctrlr, remove); +- +- if (nvme_ctrlr->reconnect_is_delayed) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- SPDK_NOTICELOG("Reconnect is already scheduled.\n"); +- +- /* We rely on the next reconnect for the failover. */ +- return 0; +- } +- +- nvme_ctrlr->resetting = true; +- +- assert(nvme_ctrlr->reset_start_tsc == 0); +- nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); +- +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); +- return 0; +-} +- +-static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, +- uint64_t num_blocks); +- +-static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, +- uint64_t num_blocks); +- +-static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, +- uint64_t src_offset_blocks, +- uint64_t num_blocks); +- +-static void +-bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- struct spdk_bdev *bdev = bdev_io->bdev; +- int ret; +- +- if (!success) { +- ret = -EINVAL; +- goto exit; +- } +- +- if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { +- ret = -ENXIO; +- goto exit; +- } +- +- ret = bdev_nvme_readv(bio, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags, +- bdev_io->u.bdev.ext_opts); +- +-exit: +- if (spdk_unlikely(ret != 0)) { +- bdev_nvme_io_complete(bio, ret); +- } +-} +- +-static inline void +-_bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) +-{ +- struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct nvme_bdev_io *nbdev_io_to_abort; +- int rc = 0; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- if (bdev_io->u.bdev.iovs && 
bdev_io->u.bdev.iovs[0].iov_base) { +- rc = bdev_nvme_readv(nbdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags, +- bdev_io->u.bdev.ext_opts); +- } else { +- spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev->blocklen); +- rc = 0; +- } +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- rc = bdev_nvme_writev(nbdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags, +- bdev_io->u.bdev.ext_opts); +- break; +- case SPDK_BDEV_IO_TYPE_COMPARE: +- rc = bdev_nvme_comparev(nbdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags); +- break; +- case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: +- rc = bdev_nvme_comparev_and_writev(nbdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.fused_iovs, +- bdev_io->u.bdev.fused_iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags); +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- rc = bdev_nvme_unmap(nbdev_io, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- rc = bdev_nvme_write_zeroes(nbdev_io, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks); +- break; +- case SPDK_BDEV_IO_TYPE_RESET: +- nbdev_io->io_path = NULL; +- bdev_nvme_reset_io(nbdev_ch, nbdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- bdev_nvme_io_complete(nbdev_io, 0); +- break; +- case SPDK_BDEV_IO_TYPE_ZONE_APPEND: +- rc = bdev_nvme_zone_appendv(nbdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks, +- bdev->dif_check_flags); +- break; +- case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: +- rc = bdev_nvme_get_zone_info(nbdev_io, +- bdev_io->u.zone_mgmt.zone_id, +- bdev_io->u.zone_mgmt.num_zones, +- bdev_io->u.zone_mgmt.buf); +- break; +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +- rc = bdev_nvme_zone_management(nbdev_io, +- bdev_io->u.zone_mgmt.zone_id, +- bdev_io->u.zone_mgmt.zone_action); +- break; +- case SPDK_BDEV_IO_TYPE_NVME_ADMIN: +- nbdev_io->io_path = NULL; +- bdev_nvme_admin_passthru(nbdev_ch, +- nbdev_io, +- &bdev_io->u.nvme_passthru.cmd, +- bdev_io->u.nvme_passthru.buf, +- bdev_io->u.nvme_passthru.nbytes); +- break; +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- rc = bdev_nvme_io_passthru(nbdev_io, +- &bdev_io->u.nvme_passthru.cmd, +- bdev_io->u.nvme_passthru.buf, +- bdev_io->u.nvme_passthru.nbytes); +- break; +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- rc = bdev_nvme_io_passthru_md(nbdev_io, +- &bdev_io->u.nvme_passthru.cmd, +- bdev_io->u.nvme_passthru.buf, +- bdev_io->u.nvme_passthru.nbytes, +- bdev_io->u.nvme_passthru.md_buf, +- bdev_io->u.nvme_passthru.md_len); +- break; +- case SPDK_BDEV_IO_TYPE_ABORT: +- nbdev_io->io_path = NULL; +- nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; +- bdev_nvme_abort(nbdev_ch, +- nbdev_io, +- nbdev_io_to_abort); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- rc = bdev_nvme_copy(nbdev_io, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.copy.src_offset_blocks, +- bdev_io->u.bdev.num_blocks); +- break; +- default: +- rc = -EINVAL; +- break; +- } +- +- if 
(spdk_unlikely(rc != 0)) { +- bdev_nvme_io_complete(nbdev_io, rc); +- } +-} +- +-static void +-bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); +- struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; +- +- if (spdk_likely(nbdev_io->submit_tsc == 0)) { +- nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); +- } else { +- /* There are cases where submit_tsc != 0, i.e. retry I/O. +- * We need to update submit_tsc here. +- */ +- nbdev_io->submit_tsc = spdk_get_ticks(); +- } +- +- spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); +- nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); +- if (spdk_unlikely(!nbdev_io->io_path)) { +- if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { +- bdev_nvme_io_complete(nbdev_io, -ENXIO); +- return; +- } +- +- /* Admin commands do not use the optimal I/O path. +- * Simply fall through even if it is not found. +- */ +- } +- +- _bdev_nvme_submit_request(nbdev_ch, bdev_io); +-} +- +-static bool +-bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- struct nvme_bdev *nbdev = ctx; +- struct nvme_ns *nvme_ns; +- struct spdk_nvme_ns *ns; +- struct spdk_nvme_ctrlr *ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- +- nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); +- assert(nvme_ns != NULL); +- ns = nvme_ns->ns; +- ctrlr = spdk_nvme_ns_get_ctrlr(ns); +- +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_NVME_ADMIN: +- case SPDK_BDEV_IO_TYPE_NVME_IO: +- case SPDK_BDEV_IO_TYPE_ABORT: +- return true; +- +- case SPDK_BDEV_IO_TYPE_COMPARE: +- return spdk_nvme_ns_supports_compare(ns); +- +- case SPDK_BDEV_IO_TYPE_NVME_IO_MD: +- return spdk_nvme_ns_get_md_size(ns) ? 
true : false; +- +- case SPDK_BDEV_IO_TYPE_UNMAP: +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- return cdata->oncs.dsm; +- +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- return cdata->oncs.write_zeroes; +- +- case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: +- if (spdk_nvme_ctrlr_get_flags(ctrlr) & +- SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { +- return true; +- } +- return false; +- +- case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +- return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; +- +- case SPDK_BDEV_IO_TYPE_ZONE_APPEND: +- return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && +- spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; +- +- case SPDK_BDEV_IO_TYPE_COPY: +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- return cdata->oncs.copy; +- +- default: +- return false; +- } +-} +- +-static int +-nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) +-{ +- struct nvme_qpair *nvme_qpair; +- struct spdk_io_channel *pg_ch; +- int rc; +- +- nvme_qpair = calloc(1, sizeof(*nvme_qpair)); +- if (!nvme_qpair) { +- SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); +- return -1; +- } +- +- TAILQ_INIT(&nvme_qpair->io_path_list); +- +- nvme_qpair->ctrlr = nvme_ctrlr; +- nvme_qpair->ctrlr_ch = ctrlr_ch; +- +- pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); +- if (!pg_ch) { +- free(nvme_qpair); +- return -1; +- } +- +- nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); +- +-#ifdef SPDK_CONFIG_VTUNE +- nvme_qpair->group->collect_spin_stat = true; +-#else +- nvme_qpair->group->collect_spin_stat = false; +-#endif +- +- rc = bdev_nvme_create_qpair(nvme_qpair); +- if (rc != 0) { +- /* nvme_ctrlr can't create IO qpair if connection is down. +- * +- * If reconnect_delay_sec is non-zero, creating IO qpair is retried +- * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, +- * submitted IO will be queued until IO qpair is successfully created. +- * +- * Hence, if both are satisfied, ignore the failure. 
+- */ +- if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { +- spdk_put_io_channel(pg_ch); +- free(nvme_qpair); +- return rc; +- } +- } +- +- TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); +- +- ctrlr_ch->qpair = nvme_qpair; +- +- pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); +- nvme_qpair->ctrlr->ref++; +- pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); +- +- return 0; +-} +- +-static int +-bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_ctrlr *nvme_ctrlr = io_device; +- struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; +- +- TAILQ_INIT(&ctrlr_ch->pending_resets); +- +- return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); +-} +- +-static void +-nvme_qpair_delete(struct nvme_qpair *nvme_qpair) +-{ +- assert(nvme_qpair->group != NULL); +- +- TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); +- +- spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); +- +- nvme_ctrlr_release(nvme_qpair->ctrlr); +- +- free(nvme_qpair); +-} +- +-static void +-bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; +- struct nvme_qpair *nvme_qpair; +- +- nvme_qpair = ctrlr_ch->qpair; +- assert(nvme_qpair != NULL); +- +- _bdev_nvme_clear_io_path_cache(nvme_qpair); +- +- if (nvme_qpair->qpair != NULL) { +- if (ctrlr_ch->reset_iter == NULL) { +- spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); +- } else { +- /* Skip current ctrlr_channel in a full reset sequence because +- * it is being deleted now. The qpair is already being disconnected. +- * We do not have to restart disconnecting it. +- */ +- spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); +- } +- +- /* We cannot release a reference to the poll group now. +- * The qpair may be disconnected asynchronously later. +- * We need to poll it until it is actually disconnected. +- * Just detach the qpair from the deleting ctrlr_channel. 
+- */ +- nvme_qpair->ctrlr_ch = NULL; +- } else { +- assert(ctrlr_ch->reset_iter == NULL); +- +- nvme_qpair_delete(nvme_qpair); +- } +-} +- +-static void +-bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, +- uint32_t iov_cnt, uint32_t seed, +- spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) +-{ +- struct nvme_poll_group *group = ctx; +- int rc; +- +- assert(group->accel_channel != NULL); +- assert(cb_fn != NULL); +- +- rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); +- if (rc) { +- /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ +- if (rc == -ENOMEM || rc == -EINVAL) { +- cb_fn(cb_arg, rc); +- } +- SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); +- } +-} +- +-static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { +- .table_size = sizeof(struct spdk_nvme_accel_fn_table), +- .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, +-}; +- +-static int +-bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_poll_group *group = ctx_buf; +- +- TAILQ_INIT(&group->qpair_list); +- +- group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); +- if (group->group == NULL) { +- return -1; +- } +- +- group->accel_channel = spdk_accel_get_io_channel(); +- if (!group->accel_channel) { +- spdk_nvme_poll_group_destroy(group->group); +- SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", +- group); +- return -1; +- } +- +- group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); +- +- if (group->poller == NULL) { +- spdk_put_io_channel(group->accel_channel); +- spdk_nvme_poll_group_destroy(group->group); +- return -1; +- } +- +- return 0; +-} +- +-static void +-bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) +-{ +- struct nvme_poll_group *group = ctx_buf; +- +- assert(TAILQ_EMPTY(&group->qpair_list)); +- +- if (group->accel_channel) { +- spdk_put_io_channel(group->accel_channel); +- } +- +- spdk_poller_unregister(&group->poller); +- if (spdk_nvme_poll_group_destroy(group->group)) { +- SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); +- assert(false); +- } +-} +- +-static struct spdk_io_channel * +-bdev_nvme_get_io_channel(void *ctx) +-{ +- struct nvme_bdev *nvme_bdev = ctx; +- +- return spdk_get_io_channel(nvme_bdev); +-} +- +-static void * +-bdev_nvme_get_module_ctx(void *ctx) +-{ +- struct nvme_bdev *nvme_bdev = ctx; +- struct nvme_ns *nvme_ns; +- +- if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { +- return NULL; +- } +- +- nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); +- if (!nvme_ns) { +- return NULL; +- } +- +- return nvme_ns->ns; +-} +- +-static const char * +-_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) +-{ +- switch (ana_state) { +- case SPDK_NVME_ANA_OPTIMIZED_STATE: +- return "optimized"; +- case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: +- return "non_optimized"; +- case SPDK_NVME_ANA_INACCESSIBLE_STATE: +- return "inaccessible"; +- case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: +- return "persistent_loss"; +- case SPDK_NVME_ANA_CHANGE_STATE: +- return "change"; +- default: +- return NULL; +- } +-} +- +-static int +-bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) +-{ +- struct spdk_memory_domain **_domains = NULL; +- struct nvme_bdev *nbdev = ctx; +- struct nvme_ns *nvme_ns; +- int i = 0, _array_size = array_size; +- int rc = 0; +- +- 
TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { +- if (domains && array_size >= i) { +- _domains = &domains[i]; +- } else { +- _domains = NULL; +- } +- rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); +- if (rc > 0) { +- i += rc; +- if (_array_size >= rc) { +- _array_size -= rc; +- } else { +- _array_size = 0; +- } +- } else if (rc < 0) { +- return rc; +- } +- } +- +- return i; +-} +- +-static const char * +-nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) +-{ +- if (nvme_ctrlr->destruct) { +- return "deleting"; +- } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { +- return "failed"; +- } else if (nvme_ctrlr->resetting) { +- return "resetting"; +- } else if (nvme_ctrlr->reconnect_is_delayed > 0) { +- return "reconnect_is_delayed"; +- } else { +- return "enabled"; +- } +-} +- +-void +-nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct spdk_nvme_transport_id *trid; +- const struct spdk_nvme_ctrlr_opts *opts; +- const struct spdk_nvme_ctrlr_data *cdata; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); +- +-#ifdef SPDK_CONFIG_NVME_CUSE +- size_t cuse_name_size = 128; +- char cuse_name[cuse_name_size]; +- +- int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); +- if (rc == 0) { +- spdk_json_write_named_string(w, "cuse_device", cuse_name); +- } +-#endif +- trid = &nvme_ctrlr->active_path_id->trid; +- spdk_json_write_named_object_begin(w, "trid"); +- nvme_bdev_dump_trid_json(trid, w); +- spdk_json_write_object_end(w); +- +- cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); +- spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); +- +- opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); +- spdk_json_write_named_object_begin(w, "host"); +- spdk_json_write_named_string(w, "nqn", opts->hostnqn); +- spdk_json_write_named_string(w, "addr", opts->src_addr); +- spdk_json_write_named_string(w, "svcid", opts->src_svcid); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-nvme_namespace_info_json(struct spdk_json_write_ctx *w, +- struct nvme_ns *nvme_ns) +-{ +- struct spdk_nvme_ns *ns; +- struct spdk_nvme_ctrlr *ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- const struct spdk_nvme_transport_id *trid; +- union spdk_nvme_vs_register vs; +- const struct spdk_nvme_ns_data *nsdata; +- char buf[128]; +- +- ns = nvme_ns->ns; +- ctrlr = spdk_nvme_ns_get_ctrlr(ns); +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); +- vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); +- +- spdk_json_write_object_begin(w); +- +- if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { +- spdk_json_write_named_string(w, "pci_address", trid->traddr); +- } +- +- spdk_json_write_named_object_begin(w, "trid"); +- +- nvme_bdev_dump_trid_json(trid, w); +- +- spdk_json_write_object_end(w); +- +-#ifdef SPDK_CONFIG_NVME_CUSE +- size_t cuse_name_size = 128; +- char cuse_name[cuse_name_size]; +- +- int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), +- cuse_name, &cuse_name_size); +- if (rc == 0) { +- spdk_json_write_named_string(w, "cuse_device", cuse_name); +- } +-#endif +- +- spdk_json_write_named_object_begin(w, "ctrlr_data"); +- +- spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); +- +- spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); +- +- snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); +- 
spdk_str_trim(buf); +- spdk_json_write_named_string(w, "model_number", buf); +- +- snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); +- spdk_str_trim(buf); +- spdk_json_write_named_string(w, "serial_number", buf); +- +- snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); +- spdk_str_trim(buf); +- spdk_json_write_named_string(w, "firmware_revision", buf); +- +- if (cdata->subnqn[0] != '\0') { +- spdk_json_write_named_string(w, "subnqn", cdata->subnqn); +- } +- +- spdk_json_write_named_object_begin(w, "oacs"); +- +- spdk_json_write_named_uint32(w, "security", cdata->oacs.security); +- spdk_json_write_named_uint32(w, "format", cdata->oacs.format); +- spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); +- spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); +- spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "vs"); +- +- spdk_json_write_name(w, "nvme_version"); +- if (vs.bits.ter) { +- spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); +- } else { +- spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); +- } +- +- spdk_json_write_object_end(w); +- +- nsdata = spdk_nvme_ns_get_data(ns); +- +- spdk_json_write_named_object_begin(w, "ns_data"); +- +- spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); +- +- if (cdata->cmic.ana_reporting) { +- spdk_json_write_named_string(w, "ana_state", +- _nvme_ana_state_str(nvme_ns->ana_state)); +- } +- +- spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); +- +- spdk_json_write_object_end(w); +- +- if (cdata->oacs.security) { +- spdk_json_write_named_object_begin(w, "security"); +- +- spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); +- +- spdk_json_write_object_end(w); +- } +- +- spdk_json_write_object_end(w); +-} +- +-static const char * +-nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) +-{ +- switch (nbdev->mp_policy) { +- case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: +- return "active_passive"; +- case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: +- return "active_active"; +- default: +- assert(false); +- return "invalid"; +- } +-} +- +-static int +-bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct nvme_bdev *nvme_bdev = ctx; +- struct nvme_ns *nvme_ns; +- +- pthread_mutex_lock(&nvme_bdev->mutex); +- spdk_json_write_named_array_begin(w, "nvme"); +- TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { +- nvme_namespace_info_json(w, nvme_ns); +- } +- spdk_json_write_array_end(w); +- spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); +- pthread_mutex_unlock(&nvme_bdev->mutex); +- +- return 0; +-} +- +-static void +-bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- /* No config per bdev needed */ +-} +- +-static uint64_t +-bdev_nvme_get_spin_time(struct spdk_io_channel *ch) +-{ +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); +- struct nvme_io_path *io_path; +- struct nvme_poll_group *group; +- uint64_t spin_time = 0; +- +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- group = io_path->qpair->group; +- +- if (!group || !group->collect_spin_stat) { +- continue; +- } +- +- if (group->end_ticks != 0) { +- group->spin_ticks += (group->end_ticks - group->start_ticks); +- group->end_ticks = 0; +- } 
+- +- spin_time += group->spin_ticks; +- group->start_ticks = 0; +- group->spin_ticks = 0; +- } +- +- return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); +-} +- +-static void +-bdev_nvme_reset_device_stat(void *ctx) +-{ +- struct nvme_bdev *nbdev = ctx; +- +- if (nbdev->err_stat != NULL) { +- memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); +- } +-} +- +-/* JSON string should be lowercases and underscore delimited string. */ +-static void +-bdev_nvme_format_nvme_status(char *dst, const char *src) +-{ +- char tmp[256]; +- +- spdk_strcpy_replace(dst, 256, src, " - ", "_"); +- spdk_strcpy_replace(tmp, 256, dst, "-", "_"); +- spdk_strcpy_replace(dst, 256, tmp, " ", "_"); +- spdk_strlwr(dst); +-} +- +-static void +-bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct nvme_bdev *nbdev = ctx; +- struct spdk_nvme_status status = {}; +- uint16_t sct, sc; +- char status_json[256]; +- const char *status_str; +- +- if (nbdev->err_stat == NULL) { +- return; +- } +- +- spdk_json_write_named_object_begin(w, "nvme_error"); +- +- spdk_json_write_named_object_begin(w, "status_type"); +- for (sct = 0; sct < 8; sct++) { +- if (nbdev->err_stat->status_type[sct] == 0) { +- continue; +- } +- status.sct = sct; +- +- status_str = spdk_nvme_cpl_get_status_type_string(&status); +- assert(status_str != NULL); +- bdev_nvme_format_nvme_status(status_json, status_str); +- +- spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); +- } +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "status_code"); +- for (sct = 0; sct < 4; sct++) { +- status.sct = sct; +- for (sc = 0; sc < 256; sc++) { +- if (nbdev->err_stat->status[sct][sc] == 0) { +- continue; +- } +- status.sc = sc; +- +- status_str = spdk_nvme_cpl_get_status_string(&status); +- assert(status_str != NULL); +- bdev_nvme_format_nvme_status(status_json, status_str); +- +- spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); +- } +- } +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static const struct spdk_bdev_fn_table nvmelib_fn_table = { +- .destruct = bdev_nvme_destruct, +- .submit_request = bdev_nvme_submit_request, +- .io_type_supported = bdev_nvme_io_type_supported, +- .get_io_channel = bdev_nvme_get_io_channel, +- .dump_info_json = bdev_nvme_dump_info_json, +- .write_config_json = bdev_nvme_write_config_json, +- .get_spin_time = bdev_nvme_get_spin_time, +- .get_module_ctx = bdev_nvme_get_module_ctx, +- .get_memory_domains = bdev_nvme_get_memory_domains, +- .reset_device_stat = bdev_nvme_reset_device_stat, +- .dump_device_stat_json = bdev_nvme_dump_device_stat_json, +-}; +- +-typedef int (*bdev_nvme_parse_ana_log_page_cb)( +- const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); +- +-static int +-bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, +- bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) +-{ +- struct spdk_nvme_ana_group_descriptor *copied_desc; +- uint8_t *orig_desc; +- uint32_t i, desc_size, copy_len; +- int rc = 0; +- +- if (nvme_ctrlr->ana_log_page == NULL) { +- return -EINVAL; +- } +- +- copied_desc = nvme_ctrlr->copied_ana_desc; +- +- orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); +- copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); +- +- for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { +- memcpy(copied_desc, orig_desc, copy_len); +- +- rc = cb_fn(copied_desc, cb_arg); +- if 
(rc != 0) { +- break; +- } +- +- desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + +- copied_desc->num_of_nsid * sizeof(uint32_t); +- orig_desc += desc_size; +- copy_len -= desc_size; +- } +- +- return rc; +-} +- +-static int +-nvme_ns_ana_transition_timedout(void *ctx) +-{ +- struct nvme_ns *nvme_ns = ctx; +- +- spdk_poller_unregister(&nvme_ns->anatt_timer); +- nvme_ns->ana_transition_timedout = true; +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-_nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, +- const struct spdk_nvme_ana_group_descriptor *desc) +-{ +- const struct spdk_nvme_ctrlr_data *cdata; +- +- nvme_ns->ana_group_id = desc->ana_group_id; +- nvme_ns->ana_state = desc->ana_state; +- nvme_ns->ana_state_updating = false; +- +- switch (nvme_ns->ana_state) { +- case SPDK_NVME_ANA_OPTIMIZED_STATE: +- case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: +- nvme_ns->ana_transition_timedout = false; +- spdk_poller_unregister(&nvme_ns->anatt_timer); +- break; +- +- case SPDK_NVME_ANA_INACCESSIBLE_STATE: +- case SPDK_NVME_ANA_CHANGE_STATE: +- if (nvme_ns->anatt_timer != NULL) { +- break; +- } +- +- cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); +- nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, +- nvme_ns, +- cdata->anatt * SPDK_SEC_TO_USEC); +- break; +- default: +- break; +- } +-} +- +-static int +-nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) +-{ +- struct nvme_ns *nvme_ns = cb_arg; +- uint32_t i; +- +- for (i = 0; i < desc->num_of_nsid; i++) { +- if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { +- continue; +- } +- +- _nvme_ns_set_ana_state(nvme_ns, desc); +- return 1; +- } +- +- return 0; +-} +- +-static void +-merge_nsid_sn_strings(const char *sn, char *nsid, int8_t *out) +-{ +- int i = 0, j = 0; +- int sn_len = strlen(sn), nsid_len = strlen(nsid); +- +- for (i = 0; i < nsid_len; i++) { +- out[i] = nsid[i]; +- } +- +- /* Since last few characters are more likely to be unique, +- * even among the devices from the same manufacturer, +- * we use serial number in reverse. We also skip the +- * terminating character of serial number string. */ +- for (j = sn_len - 1; j >= 0; j--) { +- if (i == SPDK_UUID_STRING_LEN - 1) { +- break; +- } +- +- /* There may be a lot of spaces in serial number string +- * and they will generate equally large number of the +- * same character, so just skip them. */ +- if (sn[j] == ' ') { +- continue; +- } +- +- out[i] = sn[j]; +- i++; +- } +-} +- +-/* Dictionary of characters for UUID generation. */ +-static char dict[17] = "0123456789abcdef"; +- +-static struct spdk_uuid +-nvme_generate_uuid(const char *sn, uint32_t nsid) +-{ +- struct spdk_uuid new_uuid; +- char buf[SPDK_UUID_STRING_LEN] = {'\0'}, merged_str[SPDK_UUID_STRING_LEN] = {'\0'}; +- char nsid_str[NSID_STR_LEN] = {'\0'}, tmp; +- uint64_t i = 0, j = 0, rem, dict_size = strlen(dict); +- int rc; +- +- assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); +- +- snprintf(nsid_str, NSID_STR_LEN, "%" PRIu32, nsid); +- +- merge_nsid_sn_strings(sn, nsid_str, merged_str); +- +- while (i < SPDK_UUID_STRING_LEN) { +- /* If 'j' is equal to indexes, where '-' should be placed, +- * insert this character and continue the loop without +- * increasing 'i'. */ +- if ((j == 8 || j == 13 || j == 18 || j == 23)) { +- buf[j] = '-'; +- j++; +- +- /* Break, if we ran out of characters in +- * serial number and namespace ID string. 
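The string shuffling done by merge_nsid_sn_strings() and nvme_generate_uuid() in the hunk above is easier to see end to end in a condensed, standalone form. The sketch below is illustrative only: it is not part of the patch, the helper name make_stable_uuid_str is invented, and the padding rule is simplified (the merged string is reused cyclically) rather than a byte-for-byte re-statement of the removed code.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Condensed, illustrative variant: build a 36-character UUID-shaped string
 * from the NSID followed by the serial number in reverse (spaces skipped),
 * mapping every character into the hex alphabet and inserting dashes at the
 * usual 8-4-4-4-12 positions. Same inputs always produce the same output. */
static void
make_stable_uuid_str(const char *sn, unsigned int nsid, char out[37])
{
	static const char dict[] = "0123456789abcdef";
	char merged[64] = {0};
	size_t len, i, j;

	len = (size_t)snprintf(merged, sizeof(merged), "%u", nsid);
	for (j = strlen(sn); j > 0 && len + 1 < sizeof(merged); j--) {
		if (sn[j - 1] != ' ') {
			merged[len++] = sn[j - 1];
		}
	}

	for (i = 0, j = 0; j < 36; j++) {
		if (j == 8 || j == 13 || j == 18 || j == 23) {
			out[j] = '-';	/* fixed dash positions of a UUID string */
			continue;
		}
		/* Walk the merged string (cyclically, as a simplification) and
		 * map anything that is not already a hex digit into dict[]. */
		char c = (char)tolower((unsigned char)merged[i % len]);
		out[j] = isxdigit((unsigned char)c) ? c : dict[(unsigned char)c % 16];
		i++;
	}
	out[36] = '\0';
}

int
main(void)
{
	char uuid[37];

	make_stable_uuid_str("S3EVNX0J801337      ", 1, uuid);
	printf("%s\n", uuid);	/* identical SN + NSID always yields the same string */
	return 0;
}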
*/ +- if (j == strlen(merged_str)) { +- break; +- } +- continue; +- } +- +- /* Change character in shuffled string to lower case. */ +- tmp = tolower(merged_str[i]); +- +- if (isxdigit(tmp)) { +- /* If character can be represented by a hex +- * value as is, copy it to the result buffer. */ +- buf[j] = tmp; +- } else { +- /* Otherwise get its code and divide it +- * by the number of elements in dictionary. +- * The remainder will be the index of dictionary +- * character to replace tmp value with. */ +- rem = tmp % dict_size; +- buf[j] = dict[rem]; +- } +- +- i++; +- j++; +- +- /* Break, if we ran out of characters in +- * serial number and namespace ID string. */ +- if (j == strlen(merged_str)) { +- break; +- } +- } +- +- /* If there are not enough values to fill UUID, +- * the rest is taken from dictionary characters. */ +- i = 0; +- while (j < SPDK_UUID_STRING_LEN - 1) { +- if ((j == 8 || j == 13 || j == 18 || j == 23)) { +- buf[j] = '-'; +- j++; +- continue; +- } +- buf[j] = dict[i % dict_size]; +- i++; +- j++; +- } +- +- rc = spdk_uuid_parse(&new_uuid, buf); +- if (rc != 0) { +- SPDK_ERRLOG("Unexpected spdk_uuid_parse failure on %s.\n", buf); +- assert(false); +- } +- +- return new_uuid; +-} +- +-static int +-nvme_disk_create(struct spdk_bdev *disk, const char *base_name, +- struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, +- uint32_t prchk_flags, void *ctx) +-{ +- const struct spdk_uuid *uuid; +- const uint8_t *nguid; +- const struct spdk_nvme_ctrlr_data *cdata; +- const struct spdk_nvme_ns_data *nsdata; +- const struct spdk_nvme_ctrlr_opts *opts; +- enum spdk_nvme_csi csi; +- uint32_t atomic_bs, phys_bs, bs; +- char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- csi = spdk_nvme_ns_get_csi(ns); +- opts = spdk_nvme_ctrlr_get_opts(ctrlr); +- +- switch (csi) { +- case SPDK_NVME_CSI_NVM: +- disk->product_name = "NVMe disk"; +- break; +- case SPDK_NVME_CSI_ZNS: +- disk->product_name = "NVMe ZNS disk"; +- disk->zoned = true; +- disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); +- disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / +- spdk_nvme_ns_get_extended_sector_size(ns); +- disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); +- disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); +- break; +- default: +- SPDK_ERRLOG("unsupported CSI: %u\n", csi); +- return -ENOTSUP; +- } +- +- disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); +- if (!disk->name) { +- return -ENOMEM; +- } +- +- disk->write_cache = 0; +- if (cdata->vwc.present) { +- /* Enable if the Volatile Write Cache exists */ +- disk->write_cache = 1; +- } +- if (cdata->oncs.write_zeroes) { +- disk->max_write_zeroes = UINT16_MAX + 1; +- } +- disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); +- disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); +- disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); +- /* NVMe driver will split one request into multiple requests +- * based on MDTS and stripe boundary, the bdev layer will use +- * max_segment_size and max_num_segments to split one big IO +- * into multiple requests, then small request can't run out +- * of NVMe internal requests data structure. 
+- */ +- if (opts && opts->io_queue_requests) { +- disk->max_num_segments = opts->io_queue_requests / 2; +- } +- disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); +- +- nguid = spdk_nvme_ns_get_nguid(ns); +- if (!nguid) { +- uuid = spdk_nvme_ns_get_uuid(ns); +- if (uuid) { +- disk->uuid = *uuid; +- } else if (g_opts.generate_uuids) { +- spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); +- disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); +- } +- } else { +- memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); +- } +- +- nsdata = spdk_nvme_ns_get_data(ns); +- bs = spdk_nvme_ns_get_sector_size(ns); +- atomic_bs = bs; +- phys_bs = bs; +- if (nsdata->nabo == 0) { +- if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { +- atomic_bs = bs * (1 + nsdata->nawupf); +- } else { +- atomic_bs = bs * (1 + cdata->awupf); +- } +- } +- if (nsdata->nsfeat.optperf) { +- phys_bs = bs * (1 + nsdata->npwg); +- } +- disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); +- +- disk->md_len = spdk_nvme_ns_get_md_size(ns); +- if (disk->md_len != 0) { +- disk->md_interleave = nsdata->flbas.extended; +- disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); +- if (disk->dif_type != SPDK_DIF_DISABLE) { +- disk->dif_is_head_of_md = nsdata->dps.md_start; +- disk->dif_check_flags = prchk_flags; +- } +- } +- +- if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & +- SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { +- disk->acwu = 0; +- } else if (nsdata->nsfeat.ns_atomic_write_unit) { +- disk->acwu = nsdata->nacwu + 1; /* 0-based */ +- } else { +- disk->acwu = cdata->acwu + 1; /* 0-based */ +- } +- +- if (cdata->oncs.copy) { +- /* For now bdev interface allows only single segment copy */ +- disk->max_copy = nsdata->mssrl; +- } +- +- disk->ctxt = ctx; +- disk->fn_table = &nvmelib_fn_table; +- disk->module = &nvme_if; +- +- return 0; +-} +- +-static int +-nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) +-{ +- struct nvme_bdev *bdev; +- int rc; +- +- bdev = calloc(1, sizeof(*bdev)); +- if (!bdev) { +- SPDK_ERRLOG("bdev calloc() failed\n"); +- return -ENOMEM; +- } +- +- if (g_opts.nvme_error_stat) { +- bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); +- if (!bdev->err_stat) { +- SPDK_ERRLOG("err_stat calloc() failed\n"); +- free(bdev); +- return -ENOMEM; +- } +- } +- +- rc = pthread_mutex_init(&bdev->mutex, NULL); +- if (rc != 0) { +- free(bdev->err_stat); +- free(bdev); +- return rc; +- } +- +- bdev->ref = 1; +- bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; +- bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; +- bdev->rr_min_io = UINT32_MAX; +- TAILQ_INIT(&bdev->nvme_ns_list); +- TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); +- bdev->opal = nvme_ctrlr->opal_dev != NULL; +- +- rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, +- nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create NVMe disk\n"); +- pthread_mutex_destroy(&bdev->mutex); +- free(bdev->err_stat); +- free(bdev); +- return rc; +- } +- +- spdk_io_device_register(bdev, +- bdev_nvme_create_bdev_channel_cb, +- bdev_nvme_destroy_bdev_channel_cb, +- sizeof(struct nvme_bdev_channel), +- bdev->disk.name); +- +- rc = spdk_bdev_register(&bdev->disk); +- if (rc != 0) { +- SPDK_ERRLOG("spdk_bdev_register() failed\n"); +- spdk_io_device_unregister(bdev, NULL); +- pthread_mutex_destroy(&bdev->mutex); +- free(bdev->disk.name); +- free(bdev->err_stat); +- free(bdev); +- return rc; +- } +- +- 
nvme_ns->bdev = bdev; +- bdev->nsid = nvme_ns->id; +- +- bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; +- TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); +- +- return 0; +-} +- +-static bool +-bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) +-{ +- const struct spdk_nvme_ns_data *nsdata1, *nsdata2; +- const struct spdk_uuid *uuid1, *uuid2; +- +- nsdata1 = spdk_nvme_ns_get_data(ns1); +- nsdata2 = spdk_nvme_ns_get_data(ns2); +- uuid1 = spdk_nvme_ns_get_uuid(ns1); +- uuid2 = spdk_nvme_ns_get_uuid(ns2); +- +- return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && +- nsdata1->eui64 == nsdata2->eui64 && +- ((uuid1 == NULL && uuid2 == NULL) || +- (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && +- spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); +-} +- +-static bool +-hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, +- struct spdk_nvme_ctrlr_opts *opts) +-{ +- struct nvme_probe_skip_entry *entry; +- +- TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { +- if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { +- return false; +- } +- } +- +- opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; +- opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; +- opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; +- opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; +- opts->disable_read_ana_log_page = true; +- +- SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); +- +- return true; +-} +- +-static void +-nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, +- cpl->status.sct); +- bdev_nvme_reset(nvme_ctrlr); +- } else if (cpl->cdw0 & 0x1) { +- SPDK_WARNLOG("Specified command could not be aborted.\n"); +- bdev_nvme_reset(nvme_ctrlr); +- } +-} +- +-static void +-timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, +- struct spdk_nvme_qpair *qpair, uint16_t cid) +-{ +- struct nvme_ctrlr *nvme_ctrlr = cb_arg; +- union spdk_nvme_csts_register csts; +- int rc; +- +- assert(nvme_ctrlr->ctrlr == ctrlr); +- +- SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); +- +- /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O +- * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we +- * would submit another fabrics cmd on the admin queue to read CSTS and check for its +- * completion recursively. +- */ +- if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { +- csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); +- if (csts.bits.cfs) { +- SPDK_ERRLOG("Controller Fatal Status, reset required\n"); +- bdev_nvme_reset(nvme_ctrlr); +- return; +- } +- } +- +- switch (g_opts.action_on_timeout) { +- case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: +- if (qpair) { +- /* Don't send abort to ctrlr when ctrlr is not available. */ +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (!nvme_ctrlr_is_available(nvme_ctrlr)) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- SPDK_NOTICELOG("Quit abort. 
Ctrlr is not available.\n"); +- return; +- } +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, +- nvme_abort_cpl, nvme_ctrlr); +- if (rc == 0) { +- return; +- } +- +- SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); +- } +- +- /* FALLTHROUGH */ +- case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: +- bdev_nvme_reset(nvme_ctrlr); +- break; +- case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: +- SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); +- break; +- default: +- SPDK_ERRLOG("An invalid timeout action value is found.\n"); +- break; +- } +-} +- +-static struct nvme_ns * +-nvme_ns_alloc(void) +-{ +- struct nvme_ns *nvme_ns; +- +- nvme_ns = calloc(1, sizeof(struct nvme_ns)); +- if (nvme_ns == NULL) { +- return NULL; +- } +- +- if (g_opts.io_path_stat) { +- nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); +- if (nvme_ns->stat == NULL) { +- free(nvme_ns); +- return NULL; +- } +- spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); +- } +- +- return nvme_ns; +-} +- +-static void +-nvme_ns_free(struct nvme_ns *nvme_ns) +-{ +- free(nvme_ns->stat); +- free(nvme_ns); +-} +- +-static void +-nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) +-{ +- struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; +- struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; +- +- if (rc == 0) { +- nvme_ns->probe_ctx = NULL; +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- nvme_ctrlr->ref++; +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- } else { +- RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); +- nvme_ns_free(nvme_ns); +- } +- +- if (ctx) { +- ctx->populates_in_progress--; +- if (ctx->populates_in_progress == 0) { +- nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); +- } +- } +-} +- +-static void +-bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); +- struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); +- int rc; +- +- rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); +- } +- +- spdk_for_each_channel_continue(i, rc); +-} +- +-static void +-bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); +- struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); +- struct nvme_io_path *io_path; +- +- io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); +- if (io_path != NULL) { +- _bdev_nvme_delete_io_path(nbdev_ch, io_path); +- } +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); +- +- nvme_ctrlr_populate_namespace_done(nvme_ns, -1); +-} +- +-static void +-bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); +- struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); +- +- if (status == 0) { +- nvme_ctrlr_populate_namespace_done(nvme_ns, 0); +- } else { +- /* Delete the added io_paths and fail populating the namespace. 
*/ +- spdk_for_each_channel(bdev, +- bdev_nvme_delete_io_path, +- nvme_ns, +- bdev_nvme_add_io_path_failed); +- } +-} +- +-static int +-nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) +-{ +- struct nvme_ns *tmp_ns; +- const struct spdk_nvme_ns_data *nsdata; +- +- nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); +- if (!nsdata->nmic.can_share) { +- SPDK_ERRLOG("Namespace cannot be shared.\n"); +- return -EINVAL; +- } +- +- pthread_mutex_lock(&bdev->mutex); +- +- tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); +- assert(tmp_ns != NULL); +- +- if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { +- pthread_mutex_unlock(&bdev->mutex); +- SPDK_ERRLOG("Namespaces are not identical.\n"); +- return -EINVAL; +- } +- +- bdev->ref++; +- TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); +- nvme_ns->bdev = bdev; +- +- pthread_mutex_unlock(&bdev->mutex); +- +- /* Add nvme_io_path to nvme_bdev_channels dynamically. */ +- spdk_for_each_channel(bdev, +- bdev_nvme_add_io_path, +- nvme_ns, +- bdev_nvme_add_io_path_done); +- +- return 0; +-} +- +-static void +-nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) +-{ +- struct spdk_nvme_ns *ns; +- struct nvme_bdev *bdev; +- int rc = 0; +- +- ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); +- if (!ns) { +- SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); +- rc = -EINVAL; +- goto done; +- } +- +- nvme_ns->ns = ns; +- nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; +- +- if (nvme_ctrlr->ana_log_page != NULL) { +- bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); +- } +- +- bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); +- if (bdev == NULL) { +- rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); +- } else { +- rc = nvme_bdev_add_ns(bdev, nvme_ns); +- if (rc == 0) { +- return; +- } +- } +-done: +- nvme_ctrlr_populate_namespace_done(nvme_ns, rc); +-} +- +-static void +-nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) +-{ +- struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; +- +- assert(nvme_ctrlr != NULL); +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); +- +- if (nvme_ns->bdev != NULL) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return; +- } +- +- nvme_ns_free(nvme_ns); +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- nvme_ctrlr_release(nvme_ctrlr); +-} +- +-static void +-bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); +- +- nvme_ctrlr_depopulate_namespace_done(nvme_ns); +-} +- +-static void +-nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) +-{ +- struct nvme_bdev *bdev; +- +- spdk_poller_unregister(&nvme_ns->anatt_timer); +- +- bdev = nvme_ns->bdev; +- if (bdev != NULL) { +- pthread_mutex_lock(&bdev->mutex); +- +- assert(bdev->ref > 0); +- bdev->ref--; +- if (bdev->ref == 0) { +- pthread_mutex_unlock(&bdev->mutex); +- +- spdk_bdev_unregister(&bdev->disk, NULL, NULL); +- } else { +- /* spdk_bdev_unregister() is not called until the last nvme_ns is +- * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list +- * and clear nvme_ns->bdev here. +- */ +- TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); +- nvme_ns->bdev = NULL; +- +- pthread_mutex_unlock(&bdev->mutex); +- +- /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, +- * we call depopulate_namespace_done() to avoid use-after-free. 
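The dynamic io_path add/delete paths above share one control-flow shape: dispatch a step to every channel, let each step report a status, and run a single done callback when iteration ends. A framework-free, illustrative analogue of that shape is sketched below; all names are invented, and stopping at the first failing step is a simplification of this sketch, not a claim about the real iterator's exact semantics.

#include <stdio.h>

/* Illustrative per-item iteration: visit each channel, let every step report
 * a status via the "continue" helper, then fire one done callback. */
struct chan_iter {
	const int *channels;
	int count;
	int idx;
	void (*step)(struct chan_iter *it, int channel);
	void (*done)(struct chan_iter *it, int status);
};

static void
chan_iter_next(struct chan_iter *it)
{
	if (it->idx == it->count) {
		it->done(it, 0);	/* nothing (left) to visit */
		return;
	}
	it->step(it, it->channels[it->idx]);
}

static void
chan_iter_continue(struct chan_iter *it, int status)
{
	if (status != 0) {
		it->done(it, status);	/* a step failed; finish early */
		return;
	}
	it->idx++;
	chan_iter_next(it);
}

/* Example step/done pair: pretend to add an I/O path on each channel. */
static void
add_io_path_step(struct chan_iter *it, int channel)
{
	printf("add io_path on channel %d\n", channel);
	chan_iter_continue(it, 0);
}

static void
add_io_path_done(struct chan_iter *it, int status)
{
	(void)it;
	printf("iteration finished, status=%d\n", status);
}

int
main(void)
{
	const int channels[] = {0, 1, 2, 3};
	struct chan_iter it = {channels, 4, 0, add_io_path_step, add_io_path_done};

	chan_iter_next(&it);
	return 0;
}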
+- */ +- spdk_for_each_channel(bdev, +- bdev_nvme_delete_io_path, +- nvme_ns, +- bdev_nvme_delete_io_path_done); +- return; +- } +- } +- +- nvme_ctrlr_depopulate_namespace_done(nvme_ns); +-} +- +-static void +-nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx) +-{ +- struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; +- struct nvme_ns *nvme_ns, *next; +- struct spdk_nvme_ns *ns; +- struct nvme_bdev *bdev; +- uint32_t nsid; +- int rc; +- uint64_t num_sectors; +- +- if (ctx) { +- /* Initialize this count to 1 to handle the populate functions +- * calling nvme_ctrlr_populate_namespace_done() immediately. +- */ +- ctx->populates_in_progress = 1; +- } +- +- /* First loop over our existing namespaces and see if they have been +- * removed. */ +- nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); +- while (nvme_ns != NULL) { +- next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); +- +- if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { +- /* NS is still there but attributes may have changed */ +- ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); +- num_sectors = spdk_nvme_ns_get_num_sectors(ns); +- bdev = nvme_ns->bdev; +- assert(bdev != NULL); +- if (bdev->disk.blockcnt != num_sectors) { +- SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", +- nvme_ns->id, +- bdev->disk.name, +- bdev->disk.blockcnt, +- num_sectors); +- rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); +- if (rc != 0) { +- SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", +- bdev->disk.name, rc); +- } +- } +- } else { +- /* Namespace was removed */ +- nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); +- } +- +- nvme_ns = next; +- } +- +- /* Loop through all of the namespaces at the nvme level and see if any of them are new */ +- nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); +- while (nsid != 0) { +- nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); +- +- if (nvme_ns == NULL) { +- /* Found a new one */ +- nvme_ns = nvme_ns_alloc(); +- if (nvme_ns == NULL) { +- SPDK_ERRLOG("Failed to allocate namespace\n"); +- /* This just fails to attach the namespace. It may work on a future attempt. */ +- continue; +- } +- +- nvme_ns->id = nsid; +- nvme_ns->ctrlr = nvme_ctrlr; +- +- nvme_ns->bdev = NULL; +- +- if (ctx) { +- ctx->populates_in_progress++; +- } +- nvme_ns->probe_ctx = ctx; +- +- RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); +- +- nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); +- } +- +- nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); +- } +- +- if (ctx) { +- /* Decrement this count now that the loop is over to account +- * for the one we started with. If the count is then 0, we +- * know any populate_namespace functions completed immediately, +- * so we'll kick the callback here. 
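The populates_in_progress counting explained in the comment above is a common in-flight-operations pattern: seed the counter with 1 before dispatching, decrement it on every completion, and drop the seed after the dispatch loop so the final decrement fires the callback exactly once, even when every operation completes synchronously. A condensed, illustrative sketch with invented names:

#include <stdio.h>

/* Illustrative in-flight counter: the extra "seed" reference taken before
 * dispatch guarantees the done callback cannot fire while dispatching is
 * still in progress. */
struct batch {
	int in_progress;
	void (*done)(struct batch *b);
};

static void
batch_op_done(struct batch *b)
{
	if (--b->in_progress == 0) {
		b->done(b);
	}
}

static void
start_op(struct batch *b, int id)
{
	b->in_progress++;
	printf("op %d started\n", id);
	batch_op_done(b);	/* completes immediately in this sketch */
}

static void
batch_done(struct batch *b)
{
	(void)b;
	printf("all operations finished\n");
}

int
main(void)
{
	struct batch b = {1, batch_done};	/* seed the counter with 1 */
	int id;

	for (id = 0; id < 3; id++) {
		start_op(&b, id);
	}
	batch_op_done(&b);	/* drop the seed; fires batch_done if nothing is pending */
	return 0;
}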
+- */ +- ctx->populates_in_progress--; +- if (ctx->populates_in_progress == 0) { +- nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); +- } +- } +- +-} +- +-static void +-nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct nvme_ns *nvme_ns, *tmp; +- +- RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { +- nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); +- } +-} +- +-static uint32_t +-nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- uint32_t nsid, ns_count = 0; +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- +- for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); +- nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { +- ns_count++; +- } +- +- return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * +- sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * +- sizeof(uint32_t); +-} +- +-static int +-nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, +- void *cb_arg) +-{ +- struct nvme_ctrlr *nvme_ctrlr = cb_arg; +- struct nvme_ns *nvme_ns; +- uint32_t i, nsid; +- +- for (i = 0; i < desc->num_of_nsid; i++) { +- nsid = desc->nsid[i]; +- if (nsid == 0) { +- continue; +- } +- +- nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); +- +- assert(nvme_ns != NULL); +- if (nvme_ns == NULL) { +- /* Target told us that an inactive namespace had an ANA change */ +- continue; +- } +- +- _nvme_ns_set_ana_state(nvme_ns, desc); +- } +- +- return 0; +-} +- +-static void +-bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct nvme_ns *nvme_ns; +- +- spdk_free(nvme_ctrlr->ana_log_page); +- nvme_ctrlr->ana_log_page = NULL; +- +- for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); +- nvme_ns != NULL; +- nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { +- nvme_ns->ana_state_updating = false; +- nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; +- } +-} +- +-static void +-nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { +- bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, +- nvme_ctrlr); +- } else { +- bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); +- } +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- assert(nvme_ctrlr->ana_log_page_updating == true); +- nvme_ctrlr->ana_log_page_updating = false; +- +- if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- nvme_ctrlr_unregister(nvme_ctrlr); +- } else { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- bdev_nvme_clear_io_path_caches(nvme_ctrlr); +- } +-} +- +-static int +-nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) +-{ +- uint32_t ana_log_page_size; +- int rc; +- +- if (nvme_ctrlr->ana_log_page == NULL) { +- return -EINVAL; +- } +- +- ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); +- +- if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { +- SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", +- ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); +- return -EINVAL; +- } +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (!nvme_ctrlr_is_available(nvme_ctrlr) || +- nvme_ctrlr->ana_log_page_updating) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return -EBUSY; +- } +- +- nvme_ctrlr->ana_log_page_updating = true; +- 
pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, +- SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, +- SPDK_NVME_GLOBAL_NS_TAG, +- nvme_ctrlr->ana_log_page, +- ana_log_page_size, 0, +- nvme_ctrlr_read_ana_log_page_done, +- nvme_ctrlr); +- if (rc != 0) { +- nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); +- } +- +- return rc; +-} +- +-static void +-dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +-} +- +-struct bdev_nvme_set_preferred_path_ctx { +- struct spdk_bdev_desc *desc; +- struct nvme_ns *nvme_ns; +- bdev_nvme_set_preferred_path_cb cb_fn; +- void *cb_arg; +-}; +- +-static void +-bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- assert(ctx != NULL); +- assert(ctx->desc != NULL); +- assert(ctx->cb_fn != NULL); +- +- spdk_bdev_close(ctx->desc); +- +- ctx->cb_fn(ctx->cb_arg, status); +- +- free(ctx); +-} +- +-static void +-_bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) +-{ +- struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); +- struct nvme_io_path *io_path, *prev; +- +- prev = NULL; +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- if (io_path->nvme_ns == ctx->nvme_ns) { +- break; +- } +- prev = io_path; +- } +- +- if (io_path != NULL) { +- if (prev != NULL) { +- STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); +- STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); +- } +- +- /* We can set io_path to nbdev_ch->current_io_path directly here. +- * However, it needs to be conditional. To simplify the code, +- * just clear nbdev_ch->current_io_path and let find_io_path() +- * fill it. +- * +- * Automatic failback may be disabled. Hence even if the io_path is +- * already at the head, clear nbdev_ch->current_io_path. +- */ +- bdev_nvme_clear_current_io_path(nbdev_ch); +- } +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static struct nvme_ns * +-bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) +-{ +- struct nvme_ns *nvme_ns, *prev; +- const struct spdk_nvme_ctrlr_data *cdata; +- +- prev = NULL; +- TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { +- cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); +- +- if (cdata->cntlid == cntlid) { +- break; +- } +- prev = nvme_ns; +- } +- +- if (nvme_ns != NULL && prev != NULL) { +- TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); +- TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); +- } +- +- return nvme_ns; +-} +- +-/* This function supports only multipath mode. There is only a single I/O path +- * for each NVMe-oF controller. Hence, just move the matched I/O path to the +- * head of the I/O path list for each NVMe bdev channel. +- * +- * NVMe bdev channel may be acquired after completing this function. move the +- * matched namespace to the head of the namespace list for the NVMe bdev too. 
+- */ +-void +-bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, +- bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) +-{ +- struct bdev_nvme_set_preferred_path_ctx *ctx; +- struct spdk_bdev *bdev; +- struct nvme_bdev *nbdev; +- int rc = 0; +- +- assert(cb_fn != NULL); +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to alloc context.\n"); +- rc = -ENOMEM; +- goto err_alloc; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to open bdev %s.\n", name); +- goto err_open; +- } +- +- bdev = spdk_bdev_desc_get_bdev(ctx->desc); +- +- if (bdev->module != &nvme_if) { +- SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); +- rc = -ENODEV; +- goto err_bdev; +- } +- +- nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); +- +- pthread_mutex_lock(&nbdev->mutex); +- +- ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); +- if (ctx->nvme_ns == NULL) { +- pthread_mutex_unlock(&nbdev->mutex); +- +- SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); +- rc = -ENODEV; +- goto err_bdev; +- } +- +- pthread_mutex_unlock(&nbdev->mutex); +- +- spdk_for_each_channel(nbdev, +- _bdev_nvme_set_preferred_path, +- ctx, +- bdev_nvme_set_preferred_path_done); +- return; +- +-err_bdev: +- spdk_bdev_close(ctx->desc); +-err_open: +- free(ctx); +-err_alloc: +- cb_fn(cb_arg, rc); +-} +- +-struct bdev_nvme_set_multipath_policy_ctx { +- struct spdk_bdev_desc *desc; +- bdev_nvme_set_multipath_policy_cb cb_fn; +- void *cb_arg; +-}; +- +-static void +-bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- assert(ctx != NULL); +- assert(ctx->desc != NULL); +- assert(ctx->cb_fn != NULL); +- +- spdk_bdev_close(ctx->desc); +- +- ctx->cb_fn(ctx->cb_arg, status); +- +- free(ctx); +-} +- +-static void +-_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); +- struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); +- +- nbdev_ch->mp_policy = nbdev->mp_policy; +- nbdev_ch->mp_selector = nbdev->mp_selector; +- nbdev_ch->rr_min_io = nbdev->rr_min_io; +- bdev_nvme_clear_current_io_path(nbdev_ch); +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-void +-bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, +- enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, +- bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) +-{ +- struct bdev_nvme_set_multipath_policy_ctx *ctx; +- struct spdk_bdev *bdev; +- struct nvme_bdev *nbdev; +- int rc; +- +- assert(cb_fn != NULL); +- +- if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { +- if (rr_min_io == UINT32_MAX) { +- rr_min_io = 1; +- } else if (rr_min_io == 0) { +- rc = -EINVAL; +- goto exit; +- } +- } else if (rr_min_io != UINT32_MAX) { +- rc = -EINVAL; +- goto exit; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to alloc context.\n"); +- rc = -ENOMEM; +- goto exit; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to open bdev 
%s.\n", name); +- rc = -ENODEV; +- goto err_open; +- } +- +- bdev = spdk_bdev_desc_get_bdev(ctx->desc); +- if (bdev->module != &nvme_if) { +- SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); +- rc = -ENODEV; +- goto err_module; +- } +- nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); +- +- pthread_mutex_lock(&nbdev->mutex); +- nbdev->mp_policy = policy; +- nbdev->mp_selector = selector; +- nbdev->rr_min_io = rr_min_io; +- pthread_mutex_unlock(&nbdev->mutex); +- +- spdk_for_each_channel(nbdev, +- _bdev_nvme_set_multipath_policy, +- ctx, +- bdev_nvme_set_multipath_policy_done); +- return; +- +-err_module: +- spdk_bdev_close(ctx->desc); +-err_open: +- free(ctx); +-exit: +- cb_fn(cb_arg, rc); +-} +- +-static void +-aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_ctrlr *nvme_ctrlr = arg; +- union spdk_nvme_async_event_completion event; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- SPDK_WARNLOG("AER request execute failed\n"); +- return; +- } +- +- event.raw = cpl->cdw0; +- if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && +- (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { +- nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); +- } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && +- (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { +- nvme_ctrlr_read_ana_log_page(nvme_ctrlr); +- } +-} +- +-static void +-populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) +-{ +- if (ctx->cb_fn) { +- ctx->cb_fn(ctx->cb_ctx, count, rc); +- } +- +- ctx->namespaces_populated = true; +- if (ctx->probe_done) { +- /* The probe was already completed, so we need to free the context +- * here. This can happen for cases like OCSSD, where we need to +- * send additional commands to the SSD after attach. +- */ +- free(ctx); +- } +-} +- +-static void +-nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx) +-{ +- spdk_io_device_register(nvme_ctrlr, +- bdev_nvme_create_ctrlr_channel_cb, +- bdev_nvme_destroy_ctrlr_channel_cb, +- sizeof(struct nvme_ctrlr_channel), +- nvme_ctrlr->nbdev_ctrlr->name); +- +- nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); +-} +- +-static void +-nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_ctrlr *nvme_ctrlr = _ctx; +- struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; +- +- nvme_ctrlr->probe_ctx = NULL; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- nvme_ctrlr_delete(nvme_ctrlr); +- +- if (ctx != NULL) { +- populate_namespaces_cb(ctx, 0, -1); +- } +- return; +- } +- +- nvme_ctrlr_create_done(nvme_ctrlr, ctx); +-} +- +-static int +-nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx) +-{ +- struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- uint32_t ana_log_page_size; +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- +- /* Set buffer size enough to include maximum number of allowed namespaces. 
*/ +- ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * +- sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * +- sizeof(uint32_t); +- +- nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, +- SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); +- if (nvme_ctrlr->ana_log_page == NULL) { +- SPDK_ERRLOG("could not allocate ANA log page buffer\n"); +- return -ENXIO; +- } +- +- /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. +- * Hence copy each descriptor to a temporary area when parsing it. +- * +- * Allocate a buffer whose size is as large as ANA log page buffer because +- * we do not know the size of a descriptor until actually reading it. +- */ +- nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); +- if (nvme_ctrlr->copied_ana_desc == NULL) { +- SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); +- return -ENOMEM; +- } +- +- nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; +- +- nvme_ctrlr->probe_ctx = ctx; +- +- /* Then, set the read size only to include the current active namespaces. */ +- ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); +- +- if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { +- SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", +- ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); +- return -EINVAL; +- } +- +- return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, +- SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, +- SPDK_NVME_GLOBAL_NS_TAG, +- nvme_ctrlr->ana_log_page, +- ana_log_page_size, 0, +- nvme_ctrlr_init_ana_log_page_done, +- nvme_ctrlr); +-} +- +-/* hostnqn and subnqn were already verified before attaching a controller. +- * Hence check only the multipath capability and cntlid here. 
+- */ +-static bool +-bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) +-{ +- struct nvme_ctrlr *tmp; +- const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- +- if (!cdata->cmic.multi_ctrlr) { +- SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); +- return false; +- } +- +- TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { +- tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); +- +- if (!tmp_cdata->cmic.multi_ctrlr) { +- SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); +- return false; +- } +- if (cdata->cntlid == tmp_cdata->cntlid) { +- SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); +- return false; +- } +- } +- +- return true; +-} +- +-static int +-nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; +- int rc = 0; +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- +- nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); +- if (nbdev_ctrlr != NULL) { +- if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { +- rc = -EINVAL; +- goto exit; +- } +- } else { +- nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); +- if (nbdev_ctrlr == NULL) { +- SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); +- rc = -ENOMEM; +- goto exit; +- } +- nbdev_ctrlr->name = strdup(name); +- if (nbdev_ctrlr->name == NULL) { +- SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); +- free(nbdev_ctrlr); +- goto exit; +- } +- TAILQ_INIT(&nbdev_ctrlr->ctrlrs); +- TAILQ_INIT(&nbdev_ctrlr->bdevs); +- TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); +- } +- nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; +- TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); +-exit: +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- return rc; +-} +- +-static int +-nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, +- const char *name, +- const struct spdk_nvme_transport_id *trid, +- struct nvme_async_probe_ctx *ctx) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- struct nvme_path_id *path_id; +- const struct spdk_nvme_ctrlr_data *cdata; +- int rc; +- +- nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("Failed to allocate device struct\n"); +- return -ENOMEM; +- } +- +- rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); +- if (rc != 0) { +- free(nvme_ctrlr); +- return rc; +- } +- +- TAILQ_INIT(&nvme_ctrlr->trids); +- +- RB_INIT(&nvme_ctrlr->namespaces); +- +- path_id = calloc(1, sizeof(*path_id)); +- if (path_id == NULL) { +- SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); +- rc = -ENOMEM; +- goto err; +- } +- +- path_id->trid = *trid; +- if (ctx != NULL) { +- memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); +- memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); +- } +- nvme_ctrlr->active_path_id = path_id; +- TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); +- +- nvme_ctrlr->thread = spdk_get_thread(); +- nvme_ctrlr->ctrlr = ctrlr; +- nvme_ctrlr->ref = 1; +- +- if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { +- SPDK_ERRLOG("OCSSDs are not supported"); +- rc = -ENOTSUP; +- goto err; +- } +- +- if (ctx != NULL) { +- memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); +- } else { +- bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); +- } +- +- nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 
nvme_ctrlr, +- g_opts.nvme_adminq_poll_period_us); +- +- if (g_opts.timeout_us > 0) { +- /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ +- /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ +- uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? +- g_opts.timeout_us : g_opts.timeout_admin_us; +- spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, +- adm_timeout_us, timeout_cb, nvme_ctrlr); +- } +- +- spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); +- spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); +- +- if (spdk_nvme_ctrlr_get_flags(ctrlr) & +- SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { +- nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); +- } +- +- rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); +- if (rc != 0) { +- goto err; +- } +- +- cdata = spdk_nvme_ctrlr_get_data(ctrlr); +- +- if (cdata->cmic.ana_reporting) { +- rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); +- if (rc == 0) { +- return 0; +- } +- } else { +- nvme_ctrlr_create_done(nvme_ctrlr, ctx); +- return 0; +- } +- +-err: +- nvme_ctrlr_delete(nvme_ctrlr); +- return rc; +-} +- +-void +-bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) +-{ +- opts->prchk_flags = 0; +- opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; +- opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; +- opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; +-} +- +-static void +-attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, +- struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) +-{ +- char *name; +- +- name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); +- if (!name) { +- SPDK_ERRLOG("Failed to assign name to NVMe device\n"); +- return; +- } +- +- if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { +- SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); +- } else { +- SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); +- } +- +- free(name); +-} +- +-static void +-_nvme_ctrlr_destruct(void *ctx) +-{ +- struct nvme_ctrlr *nvme_ctrlr = ctx; +- +- nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); +- nvme_ctrlr_release(nvme_ctrlr); +-} +- +-static int +-_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) +-{ +- struct nvme_probe_skip_entry *entry; +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- /* The controller's destruction was already started */ +- if (nvme_ctrlr->destruct) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return 0; +- } +- +- if (!hotplug && +- nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { +- entry = calloc(1, sizeof(*entry)); +- if (!entry) { +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- return -ENOMEM; +- } +- entry->trid = nvme_ctrlr->active_path_id->trid; +- TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); +- } +- +- nvme_ctrlr->destruct = true; +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- _nvme_ctrlr_destruct(nvme_ctrlr); +- +- return 0; +-} +- +-static void +-remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) +-{ +- struct nvme_ctrlr *nvme_ctrlr = cb_ctx; +- +- _bdev_nvme_delete(nvme_ctrlr, true); +-} +- +-static int +-bdev_nvme_hotplug_probe(void *arg) +-{ +- if (g_hotplug_probe_ctx == NULL) { +- spdk_poller_unregister(&g_hotplug_probe_poller); +- return SPDK_POLLER_IDLE; +- } +- +- if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { +- g_hotplug_probe_ctx = NULL; +- 
spdk_poller_unregister(&g_hotplug_probe_poller); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static int +-bdev_nvme_hotplug(void *arg) +-{ +- struct spdk_nvme_transport_id trid_pcie; +- +- if (g_hotplug_probe_ctx) { +- return SPDK_POLLER_BUSY; +- } +- +- memset(&trid_pcie, 0, sizeof(trid_pcie)); +- spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); +- +- g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, +- hotplug_probe_cb, attach_cb, NULL); +- +- if (g_hotplug_probe_ctx) { +- assert(g_hotplug_probe_poller == NULL); +- g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-void +-bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) +-{ +- *opts = g_opts; +-} +- +-static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, +- uint32_t reconnect_delay_sec, +- uint32_t fast_io_fail_timeout_sec); +- +-static int +-bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) +-{ +- if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { +- /* Can't set timeout_admin_us without also setting timeout_us */ +- SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); +- return -EINVAL; +- } +- +- if (opts->bdev_retry_count < -1) { +- SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); +- return -EINVAL; +- } +- +- if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, +- opts->reconnect_delay_sec, +- opts->fast_io_fail_timeout_sec)) { +- return -EINVAL; +- } +- +- return 0; +-} +- +-int +-bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) +-{ +- int ret; +- +- ret = bdev_nvme_validate_opts(opts); +- if (ret) { +- SPDK_WARNLOG("Failed to set nvme opts.\n"); +- return ret; +- } +- +- if (g_bdev_nvme_init_thread != NULL) { +- if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { +- return -EPERM; +- } +- } +- +- if (opts->rdma_srq_size != 0) { +- struct spdk_nvme_transport_opts drv_opts; +- +- spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); +- drv_opts.rdma_srq_size = opts->rdma_srq_size; +- +- ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); +- if (ret) { +- SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); +- return ret; +- } +- } +- +- g_opts = *opts; +- +- return 0; +-} +- +-struct set_nvme_hotplug_ctx { +- uint64_t period_us; +- bool enabled; +- spdk_msg_fn fn; +- void *fn_ctx; +-}; +- +-static void +-set_nvme_hotplug_period_cb(void *_ctx) +-{ +- struct set_nvme_hotplug_ctx *ctx = _ctx; +- +- spdk_poller_unregister(&g_hotplug_poller); +- if (ctx->enabled) { +- g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); +- } +- +- g_nvme_hotplug_poll_period_us = ctx->period_us; +- g_nvme_hotplug_enabled = ctx->enabled; +- if (ctx->fn) { +- ctx->fn(ctx->fn_ctx); +- } +- +- free(ctx); +-} +- +-int +-bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) +-{ +- struct set_nvme_hotplug_ctx *ctx; +- +- if (enabled == true && !spdk_process_is_primary()) { +- return -EPERM; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- return -ENOMEM; +- } +- +- period_us = period_us == 0 ? 
NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; +- ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); +- ctx->enabled = enabled; +- ctx->fn = cb; +- ctx->fn_ctx = cb_ctx; +- +- spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); +- return 0; +-} +- +-static void +-nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, +- struct nvme_async_probe_ctx *ctx) +-{ +- struct nvme_ns *nvme_ns; +- struct nvme_bdev *nvme_bdev; +- size_t j; +- +- assert(nvme_ctrlr != NULL); +- +- if (ctx->names == NULL) { +- populate_namespaces_cb(ctx, 0, 0); +- return; +- } +- +- /* +- * Report the new bdevs that were created in this call. +- * There can be more than one bdev per NVMe controller. +- */ +- j = 0; +- nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); +- while (nvme_ns != NULL) { +- nvme_bdev = nvme_ns->bdev; +- if (j < ctx->count) { +- ctx->names[j] = nvme_bdev->disk.name; +- j++; +- } else { +- SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", +- ctx->count); +- populate_namespaces_cb(ctx, 0, -ERANGE); +- return; +- } +- +- nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); +- } +- +- populate_namespaces_cb(ctx, j, 0); +-} +- +-static int +-bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, +- struct spdk_nvme_ctrlr *new_ctrlr, +- struct spdk_nvme_transport_id *trid) +-{ +- struct nvme_path_id *tmp_trid; +- +- if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { +- SPDK_ERRLOG("PCIe failover is not supported.\n"); +- return -ENOTSUP; +- } +- +- /* Currently we only support failover to the same transport type. */ +- if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { +- SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", +- spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), +- spdk_nvme_transport_id_trtype_str(trid->trtype)); +- return -EINVAL; +- } +- +- +- /* Currently we only support failover to the same NQN. */ +- if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { +- SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", +- nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); +- return -EINVAL; +- } +- +- /* Skip all the other checks if we've already registered this path. 
*/ +- TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { +- if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { +- SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, +- trid->subnqn); +- return -EEXIST; +- } +- } +- +- return 0; +-} +- +-static int +-bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, +- struct spdk_nvme_ctrlr *new_ctrlr) +-{ +- struct nvme_ns *nvme_ns; +- struct spdk_nvme_ns *new_ns; +- +- nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); +- while (nvme_ns != NULL) { +- new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); +- assert(new_ns != NULL); +- +- if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { +- return -EINVAL; +- } +- +- nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); +- } +- +- return 0; +-} +- +-static int +-_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, +- struct spdk_nvme_transport_id *trid) +-{ +- struct nvme_path_id *new_trid, *tmp_trid; +- +- new_trid = calloc(1, sizeof(*new_trid)); +- if (new_trid == NULL) { +- return -ENOMEM; +- } +- new_trid->trid = *trid; +- new_trid->is_failed = false; +- +- TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { +- if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { +- TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); +- return 0; +- } +- } +- +- TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); +- return 0; +-} +- +-/* This is the case that a secondary path is added to an existing +- * nvme_ctrlr for failover. After checking if it can access the same +- * namespaces as the primary path, it is disconnected until failover occurs. +- */ +-static int +-bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, +- struct spdk_nvme_ctrlr *new_ctrlr, +- struct spdk_nvme_transport_id *trid) +-{ +- int rc; +- +- assert(nvme_ctrlr != NULL); +- +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- +- rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); +- if (rc != 0) { +- goto exit; +- } +- +- rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); +- if (rc != 0) { +- goto exit; +- } +- +- rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); +- +-exit: +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_nvme_detach(new_ctrlr); +- +- return rc; +-} +- +-static void +-connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, +- struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +-{ +- struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; +- struct nvme_async_probe_ctx *ctx; +- int rc; +- +- ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); +- ctx->ctrlr_attached = true; +- +- rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); +- if (rc != 0) { +- populate_namespaces_cb(ctx, 0, rc); +- } +-} +- +-static void +-connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, +- struct spdk_nvme_ctrlr *ctrlr, +- const struct spdk_nvme_ctrlr_opts *opts) +-{ +- struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; +- struct nvme_ctrlr *nvme_ctrlr; +- struct nvme_async_probe_ctx *ctx; +- int rc; +- +- ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); +- ctx->ctrlr_attached = true; +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); +- if (nvme_ctrlr) { +- rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); +- } else { +- rc = -ENODEV; +- } +- +- populate_namespaces_cb(ctx, 0, rc); +-} +- +-static int +-bdev_nvme_async_poll(void *arg) +-{ +- struct nvme_async_probe_ctx *ctx = 
arg; +- int rc; +- +- rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); +- if (spdk_unlikely(rc != -EAGAIN)) { +- ctx->probe_done = true; +- spdk_poller_unregister(&ctx->poller); +- if (!ctx->ctrlr_attached) { +- /* The probe is done, but no controller was attached. +- * That means we had a failure, so report -EIO back to +- * the caller (usually the RPC). populate_namespaces_cb() +- * will take care of freeing the nvme_async_probe_ctx. +- */ +- populate_namespaces_cb(ctx, 0, -EIO); +- } else if (ctx->namespaces_populated) { +- /* The namespaces for the attached controller were all +- * populated and the response was already sent to the +- * caller (usually the RPC). So free the context here. +- */ +- free(ctx); +- } +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static bool +-bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, +- uint32_t reconnect_delay_sec, +- uint32_t fast_io_fail_timeout_sec) +-{ +- if (ctrlr_loss_timeout_sec < -1) { +- SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); +- return false; +- } else if (ctrlr_loss_timeout_sec == -1) { +- if (reconnect_delay_sec == 0) { +- SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); +- return false; +- } else if (fast_io_fail_timeout_sec != 0 && +- fast_io_fail_timeout_sec < reconnect_delay_sec) { +- SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); +- return false; +- } +- } else if (ctrlr_loss_timeout_sec != 0) { +- if (reconnect_delay_sec == 0) { +- SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); +- return false; +- } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { +- SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); +- return false; +- } else if (fast_io_fail_timeout_sec != 0) { +- if (fast_io_fail_timeout_sec < reconnect_delay_sec) { +- SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); +- return false; +- } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { +- SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); +- return false; +- } +- } +- } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { +- SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); +- return false; +- } +- +- return true; +-} +- +-int +-bdev_nvme_create(struct spdk_nvme_transport_id *trid, +- const char *base_name, +- const char **names, +- uint32_t count, +- spdk_bdev_create_nvme_fn cb_fn, +- void *cb_ctx, +- struct spdk_nvme_ctrlr_opts *drv_opts, +- struct nvme_ctrlr_opts *bdev_opts, +- bool multipath) +-{ +- struct nvme_probe_skip_entry *entry, *tmp; +- struct nvme_async_probe_ctx *ctx; +- spdk_nvme_attach_cb attach_cb; +- +- /* TODO expand this check to include both the host and target TRIDs. +- * Only if both are the same should we fail. 
+- */ +- if (nvme_ctrlr_get(trid) != NULL) { +- SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); +- return -EEXIST; +- } +- +- if (bdev_opts != NULL && +- !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, +- bdev_opts->reconnect_delay_sec, +- bdev_opts->fast_io_fail_timeout_sec)) { +- return -EINVAL; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- return -ENOMEM; +- } +- ctx->base_name = base_name; +- ctx->names = names; +- ctx->count = count; +- ctx->cb_fn = cb_fn; +- ctx->cb_ctx = cb_ctx; +- ctx->trid = *trid; +- +- if (bdev_opts) { +- memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); +- } else { +- bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); +- } +- +- if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { +- TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { +- if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { +- TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); +- free(entry); +- break; +- } +- } +- } +- +- if (drv_opts) { +- memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); +- } else { +- spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); +- } +- +- ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; +- ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; +- ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; +- ctx->drv_opts.disable_read_ana_log_page = true; +- ctx->drv_opts.transport_tos = g_opts.transport_tos; +- +- if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { +- attach_cb = connect_attach_cb; +- } else { +- attach_cb = connect_set_failover_cb; +- } +- +- ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); +- if (ctx->probe_ctx == NULL) { +- SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); +- free(ctx); +- return -ENODEV; +- } +- ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); +- +- return 0; +-} +- +-int +-bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; +- struct nvme_path_id *p, *t; +- int rc = -ENXIO; +- +- if (name == NULL || path_id == NULL) { +- return -EINVAL; +- } +- +- nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); +- if (nbdev_ctrlr == NULL) { +- SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); +- return -ENODEV; +- } +- +- TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { +- TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { +- if (path_id->trid.trtype != 0) { +- if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { +- if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { +- continue; +- } +- } else { +- if (path_id->trid.trtype != p->trid.trtype) { +- continue; +- } +- } +- } +- +- if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { +- if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { +- continue; +- } +- } +- +- if (path_id->trid.adrfam != 0) { +- if (path_id->trid.adrfam != p->trid.adrfam) { +- continue; +- } +- } +- +- if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { +- if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { +- continue; +- } +- } +- +- if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { +- if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { +- continue; +- } 
+- } +- +- if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { +- if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { +- continue; +- } +- } +- +- if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { +- if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { +- continue; +- } +- } +- +- /* If we made it here, then this path is a match! Now we need to remove it. */ +- if (p == nvme_ctrlr->active_path_id) { +- /* This is the active path in use right now. The active path is always the first in the list. */ +- +- if (!TAILQ_NEXT(p, link)) { +- /* The current path is the only path. */ +- rc = _bdev_nvme_delete(nvme_ctrlr, false); +- } else { +- /* There is an alternative path. */ +- rc = bdev_nvme_failover(nvme_ctrlr, true); +- } +- } else { +- /* We are not using the specified path. */ +- TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); +- free(p); +- rc = 0; +- } +- +- if (rc < 0 && rc != -ENXIO) { +- return rc; +- } +- +- +- } +- } +- +- /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ +- return rc; +-} +- +-#define DISCOVERY_INFOLOG(ctx, format, ...) \ +- SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); +- +-#define DISCOVERY_ERRLOG(ctx, format, ...) \ +- SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); +- +-struct discovery_entry_ctx { +- char name[128]; +- struct spdk_nvme_transport_id trid; +- struct spdk_nvme_ctrlr_opts drv_opts; +- struct spdk_nvmf_discovery_log_page_entry entry; +- TAILQ_ENTRY(discovery_entry_ctx) tailq; +- struct discovery_ctx *ctx; +-}; +- +-struct discovery_ctx { +- char *name; +- spdk_bdev_nvme_start_discovery_fn start_cb_fn; +- spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; +- void *cb_ctx; +- struct spdk_nvme_probe_ctx *probe_ctx; +- struct spdk_nvme_detach_ctx *detach_ctx; +- struct spdk_nvme_ctrlr *ctrlr; +- struct spdk_nvme_transport_id trid; +- struct discovery_entry_ctx *entry_ctx_in_use; +- struct spdk_poller *poller; +- struct spdk_nvme_ctrlr_opts drv_opts; +- struct nvme_ctrlr_opts bdev_opts; +- struct spdk_nvmf_discovery_log_page *log_page; +- TAILQ_ENTRY(discovery_ctx) tailq; +- TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; +- TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; +- int rc; +- bool wait_for_attach; +- uint64_t timeout_ticks; +- /* Denotes that the discovery service is being started. We're waiting +- * for the initial connection to the discovery controller to be +- * established and attach discovered NVM ctrlrs. +- */ +- bool initializing; +- /* Denotes if a discovery is currently in progress for this context. +- * That includes connecting to newly discovered subsystems. Used to +- * ensure we do not start a new discovery until an existing one is +- * complete. +- */ +- bool in_progress; +- +- /* Denotes if another discovery is needed after the one in progress +- * completes. Set when we receive an AER completion while a discovery +- * is already in progress. +- */ +- bool pending; +- +- /* Signal to the discovery context poller that it should stop the +- * discovery service, including detaching from the current discovery +- * controller. +- */ +- bool stop; +- +- struct spdk_thread *calling_thread; +- uint32_t index; +- uint32_t attach_in_progress; +- char *hostnqn; +- +- /* Denotes if the discovery service was started by the mdns discovery. 
+- */ +- bool from_mdns_discovery_service; +-}; +- +-TAILQ_HEAD(discovery_ctxs, discovery_ctx); +-static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); +- +-static void get_discovery_log_page(struct discovery_ctx *ctx); +- +-static void +-free_discovery_ctx(struct discovery_ctx *ctx) +-{ +- free(ctx->log_page); +- free(ctx->hostnqn); +- free(ctx->name); +- free(ctx); +-} +- +-static void +-discovery_complete(struct discovery_ctx *ctx) +-{ +- ctx->initializing = false; +- ctx->in_progress = false; +- if (ctx->pending) { +- ctx->pending = false; +- get_discovery_log_page(ctx); +- } +-} +- +-static void +-build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, +- struct spdk_nvmf_discovery_log_page_entry *entry) +-{ +- char *space; +- +- trid->trtype = entry->trtype; +- trid->adrfam = entry->adrfam; +- memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); +- memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); +- memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); +- +- /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. +- * But the log page entries typically pad them with spaces, not zeroes. +- * So add a NULL terminator to each of these fields at the appropriate +- * location. +- */ +- space = strchr(trid->traddr, ' '); +- if (space) { +- *space = 0; +- } +- space = strchr(trid->trsvcid, ' '); +- if (space) { +- *space = 0; +- } +- space = strchr(trid->subnqn, ' '); +- if (space) { +- *space = 0; +- } +-} +- +-static void +-stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) +-{ +- ctx->stop = true; +- ctx->stop_cb_fn = cb_fn; +- ctx->cb_ctx = cb_ctx; +- +- while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { +- struct discovery_entry_ctx *entry_ctx; +- struct nvme_path_id path = {}; +- +- entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); +- path.trid = entry_ctx->trid; +- bdev_nvme_delete(entry_ctx->name, &path); +- TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); +- free(entry_ctx); +- } +- +- while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { +- struct discovery_entry_ctx *entry_ctx; +- +- entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); +- TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); +- free(entry_ctx); +- } +- +- free(ctx->entry_ctx_in_use); +- ctx->entry_ctx_in_use = NULL; +-} +- +-static void +-discovery_remove_controllers(struct discovery_ctx *ctx) +-{ +- struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; +- struct discovery_entry_ctx *entry_ctx, *tmp; +- struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; +- struct spdk_nvme_transport_id old_trid; +- uint64_t numrec, i; +- bool found; +- +- numrec = from_le64(&log_page->numrec); +- TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { +- found = false; +- old_entry = &entry_ctx->entry; +- build_trid_from_log_page_entry(&old_trid, old_entry); +- for (i = 0; i < numrec; i++) { +- new_entry = &log_page->entries[i]; +- if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { +- DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", +- old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); +- found = true; +- break; +- } +- } +- if (!found) { +- struct nvme_path_id path = {}; +- +- DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", +- old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); +- +- path.trid = entry_ctx->trid; +- bdev_nvme_delete(entry_ctx->name, &path); +- TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); +- free(entry_ctx); +- } +- } +- 
free(log_page); +- ctx->log_page = NULL; +- discovery_complete(ctx); +-} +- +-static void +-complete_discovery_start(struct discovery_ctx *ctx, int status) +-{ +- ctx->timeout_ticks = 0; +- ctx->rc = status; +- if (ctx->start_cb_fn) { +- ctx->start_cb_fn(ctx->cb_ctx, status); +- ctx->start_cb_fn = NULL; +- ctx->cb_ctx = NULL; +- } +-} +- +-static void +-discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) +-{ +- struct discovery_entry_ctx *entry_ctx = cb_ctx; +- struct discovery_ctx *ctx = entry_ctx->ctx; +- +- DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); +- ctx->attach_in_progress--; +- if (ctx->attach_in_progress == 0) { +- complete_discovery_start(ctx, ctx->rc); +- if (ctx->initializing && ctx->rc != 0) { +- DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); +- stop_discovery(ctx, NULL, ctx->cb_ctx); +- } else { +- discovery_remove_controllers(ctx); +- } +- } +-} +- +-static struct discovery_entry_ctx * +-create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) +-{ +- struct discovery_entry_ctx *new_ctx; +- +- new_ctx = calloc(1, sizeof(*new_ctx)); +- if (new_ctx == NULL) { +- DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); +- return NULL; +- } +- +- new_ctx->ctx = ctx; +- memcpy(&new_ctx->trid, trid, sizeof(*trid)); +- spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); +- snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); +- return new_ctx; +-} +- +-static void +-discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, +- struct spdk_nvmf_discovery_log_page *log_page) +-{ +- struct discovery_ctx *ctx = cb_arg; +- struct discovery_entry_ctx *entry_ctx, *tmp; +- struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; +- uint64_t numrec, i; +- bool found; +- +- if (rc || spdk_nvme_cpl_is_error(cpl)) { +- DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); +- return; +- } +- +- ctx->log_page = log_page; +- assert(ctx->attach_in_progress == 0); +- numrec = from_le64(&log_page->numrec); +- TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { +- TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); +- free(entry_ctx); +- } +- for (i = 0; i < numrec; i++) { +- found = false; +- new_entry = &log_page->entries[i]; +- if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { +- struct discovery_entry_ctx *new_ctx; +- struct spdk_nvme_transport_id trid = {}; +- +- build_trid_from_log_page_entry(&trid, new_entry); +- new_ctx = create_discovery_entry_ctx(ctx, &trid); +- if (new_ctx == NULL) { +- DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); +- break; +- } +- +- TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); +- continue; +- } +- TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { +- old_entry = &entry_ctx->entry; +- if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { +- found = true; +- break; +- } +- } +- if (!found) { +- struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; +- struct discovery_ctx *d_ctx; +- +- TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { +- TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { +- if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, +- sizeof(new_entry->subnqn))) { +- break; +- } +- } +- if (subnqn_ctx) { +- break; +- } +- } +- +- new_ctx = calloc(1, sizeof(*new_ctx)); +- if (new_ctx == NULL) { +- DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); +- break; 
+- } +- +- new_ctx->ctx = ctx; +- memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); +- build_trid_from_log_page_entry(&new_ctx->trid, new_entry); +- if (subnqn_ctx) { +- snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); +- DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", +- new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, +- new_ctx->name); +- } else { +- snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); +- DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", +- new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, +- new_ctx->name); +- } +- spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); +- snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); +- rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, +- discovery_attach_controller_done, new_ctx, +- &new_ctx->drv_opts, &ctx->bdev_opts, true); +- if (rc == 0) { +- TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); +- ctx->attach_in_progress++; +- } else { +- DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); +- } +- } +- } +- +- if (ctx->attach_in_progress == 0) { +- discovery_remove_controllers(ctx); +- } +-} +- +-static void +-get_discovery_log_page(struct discovery_ctx *ctx) +-{ +- int rc; +- +- assert(ctx->in_progress == false); +- ctx->in_progress = true; +- rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); +- if (rc != 0) { +- DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); +- } +- DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); +-} +- +-static void +-discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) +-{ +- struct discovery_ctx *ctx = arg; +- uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- DISCOVERY_ERRLOG(ctx, "aer failed\n"); +- return; +- } +- +- if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { +- DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); +- return; +- } +- +- DISCOVERY_INFOLOG(ctx, "got aer\n"); +- if (ctx->in_progress) { +- ctx->pending = true; +- return; +- } +- +- get_discovery_log_page(ctx); +-} +- +-static void +-discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, +- struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +-{ +- struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; +- struct discovery_ctx *ctx; +- +- ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); +- +- DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); +- ctx->probe_ctx = NULL; +- ctx->ctrlr = ctrlr; +- +- if (ctx->rc != 0) { +- DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", +- ctx->rc); +- return; +- } +- +- spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); +-} +- +-static int +-discovery_poller(void *arg) +-{ +- struct discovery_ctx *ctx = arg; +- struct spdk_nvme_transport_id *trid; +- int rc; +- +- if (ctx->detach_ctx) { +- rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); +- if (rc != -EAGAIN) { +- ctx->detach_ctx = NULL; +- ctx->ctrlr = NULL; +- } +- } else if (ctx->stop) { +- if (ctx->ctrlr != NULL) { +- rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); +- if (rc == 0) { +- return SPDK_POLLER_BUSY; +- } +- DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); +- } +- spdk_poller_unregister(&ctx->poller); +- TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); +- 
assert(ctx->start_cb_fn == NULL); +- if (ctx->stop_cb_fn != NULL) { +- ctx->stop_cb_fn(ctx->cb_ctx); +- } +- free_discovery_ctx(ctx); +- } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { +- if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { +- DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); +- assert(ctx->initializing); +- spdk_poller_unregister(&ctx->poller); +- TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); +- complete_discovery_start(ctx, -ETIMEDOUT); +- stop_discovery(ctx, NULL, NULL); +- free_discovery_ctx(ctx); +- return SPDK_POLLER_BUSY; +- } +- +- assert(ctx->entry_ctx_in_use == NULL); +- ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); +- TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); +- trid = &ctx->entry_ctx_in_use->trid; +- ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); +- if (ctx->probe_ctx) { +- spdk_poller_unregister(&ctx->poller); +- ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); +- } else { +- DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); +- TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); +- ctx->entry_ctx_in_use = NULL; +- } +- } else if (ctx->probe_ctx) { +- if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { +- DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); +- complete_discovery_start(ctx, -ETIMEDOUT); +- return SPDK_POLLER_BUSY; +- } +- +- rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); +- if (rc != -EAGAIN) { +- if (ctx->rc != 0) { +- assert(ctx->initializing); +- stop_discovery(ctx, NULL, ctx->cb_ctx); +- } else { +- assert(rc == 0); +- DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); +- ctx->rc = rc; +- get_discovery_log_page(ctx); +- } +- } +- } else { +- if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { +- DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); +- complete_discovery_start(ctx, -ETIMEDOUT); +- /* We need to wait until all NVM ctrlrs are attached before we stop the +- * discovery service to make sure we don't detach a ctrlr that is still +- * being attached. 
+- */ +- if (ctx->attach_in_progress == 0) { +- stop_discovery(ctx, NULL, ctx->cb_ctx); +- return SPDK_POLLER_BUSY; +- } +- } +- +- rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); +- if (rc < 0) { +- spdk_poller_unregister(&ctx->poller); +- ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); +- TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); +- ctx->entry_ctx_in_use = NULL; +- +- rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); +- if (rc != 0) { +- DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); +- ctx->ctrlr = NULL; +- } +- } +- } +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-start_discovery_poller(void *arg) +-{ +- struct discovery_ctx *ctx = arg; +- +- TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); +- ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); +-} +- +-int +-bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, +- const char *base_name, +- struct spdk_nvme_ctrlr_opts *drv_opts, +- struct nvme_ctrlr_opts *bdev_opts, +- uint64_t attach_timeout, +- bool from_mdns, +- spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) +-{ +- struct discovery_ctx *ctx; +- struct discovery_entry_ctx *discovery_entry_ctx; +- +- snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); +- TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { +- if (strcmp(ctx->name, base_name) == 0) { +- return -EEXIST; +- } +- +- if (ctx->entry_ctx_in_use != NULL) { +- if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { +- return -EEXIST; +- } +- } +- +- TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { +- if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { +- return -EEXIST; +- } +- } +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- return -ENOMEM; +- } +- +- ctx->name = strdup(base_name); +- if (ctx->name == NULL) { +- free_discovery_ctx(ctx); +- return -ENOMEM; +- } +- memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); +- memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); +- ctx->from_mdns_discovery_service = from_mdns; +- ctx->bdev_opts.from_discovery_service = true; +- ctx->calling_thread = spdk_get_thread(); +- ctx->start_cb_fn = cb_fn; +- ctx->cb_ctx = cb_ctx; +- ctx->initializing = true; +- if (ctx->start_cb_fn) { +- /* We can use this when dumping json to denote if this RPC parameter +- * was specified or not. 
+- */ +- ctx->wait_for_attach = true; +- } +- if (attach_timeout != 0) { +- ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * +- spdk_get_ticks_hz() / 1000ull; +- } +- TAILQ_INIT(&ctx->nvm_entry_ctxs); +- TAILQ_INIT(&ctx->discovery_entry_ctxs); +- memcpy(&ctx->trid, trid, sizeof(*trid)); +- /* Even if user did not specify hostnqn, we can still strdup("\0"); */ +- ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); +- if (ctx->hostnqn == NULL) { +- free_discovery_ctx(ctx); +- return -ENOMEM; +- } +- discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); +- if (discovery_entry_ctx == NULL) { +- DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); +- free_discovery_ctx(ctx); +- return -ENOMEM; +- } +- +- TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); +- spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); +- return 0; +-} +- +-int +-bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) +-{ +- struct discovery_ctx *ctx; +- +- TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { +- if (strcmp(name, ctx->name) == 0) { +- if (ctx->stop) { +- return -EALREADY; +- } +- /* If we're still starting the discovery service and ->rc is non-zero, we're +- * going to stop it as soon as we can +- */ +- if (ctx->initializing && ctx->rc != 0) { +- return -EALREADY; +- } +- stop_discovery(ctx, cb_fn, cb_ctx); +- return 0; +- } +- } +- +- return -ENOENT; +-} +- +-static int +-bdev_nvme_library_init(void) +-{ +- g_bdev_nvme_init_thread = spdk_get_thread(); +- +- spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, +- bdev_nvme_destroy_poll_group_cb, +- sizeof(struct nvme_poll_group), "nvme_poll_groups"); +- +- return 0; +-} +- +-static void +-bdev_nvme_fini_destruct_ctrlrs(void) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct nvme_ctrlr *nvme_ctrlr; +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { +- TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { +- pthread_mutex_lock(&nvme_ctrlr->mutex); +- if (nvme_ctrlr->destruct) { +- /* This controller's destruction was already started +- * before the application started shutting down +- */ +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- continue; +- } +- nvme_ctrlr->destruct = true; +- pthread_mutex_unlock(&nvme_ctrlr->mutex); +- +- spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, +- nvme_ctrlr); +- } +- } +- +- g_bdev_nvme_module_finish = true; +- if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); +- spdk_bdev_module_fini_done(); +- return; +- } +- +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +-} +- +-static void +-check_discovery_fini(void *arg) +-{ +- if (TAILQ_EMPTY(&g_discovery_ctxs)) { +- bdev_nvme_fini_destruct_ctrlrs(); +- } +-} +- +-static void +-bdev_nvme_library_fini(void) +-{ +- struct nvme_probe_skip_entry *entry, *entry_tmp; +- struct discovery_ctx *ctx; +- +- spdk_poller_unregister(&g_hotplug_poller); +- free(g_hotplug_probe_ctx); +- g_hotplug_probe_ctx = NULL; +- +- TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { +- TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); +- free(entry); +- } +- +- assert(spdk_get_thread() == g_bdev_nvme_init_thread); +- if (TAILQ_EMPTY(&g_discovery_ctxs)) { +- bdev_nvme_fini_destruct_ctrlrs(); +- } else { +- TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { +- stop_discovery(ctx, check_discovery_fini, NULL); +- 
} +- } +-} +- +-static void +-bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- struct spdk_bdev *bdev = bdev_io->bdev; +- struct spdk_dif_ctx dif_ctx; +- struct spdk_dif_error err_blk = {}; +- int rc; +- +- rc = spdk_dif_ctx_init(&dif_ctx, +- bdev->blocklen, bdev->md_len, bdev->md_interleave, +- bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, +- bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); +- if (rc != 0) { +- SPDK_ERRLOG("Initialization of DIF context failed\n"); +- return; +- } +- +- if (bdev->md_interleave) { +- rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); +- } else { +- struct iovec md_iov = { +- .iov_base = bdev_io->u.bdev.md_buf, +- .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, +- }; +- +- rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); +- } +- +- if (rc != 0) { +- SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", +- err_blk.err_type, err_blk.err_offset); +- } else { +- SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); +- } +-} +- +-static void +-bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- if (spdk_nvme_cpl_is_success(cpl)) { +- /* Run PI verification for read data buffer. */ +- bdev_nvme_verify_pi_error(bio); +- } +- +- /* Return original completion status */ +- bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); +-} +- +-static void +-bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- int ret; +- +- if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { +- SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", +- cpl->status.sct, cpl->status.sc); +- +- /* Save completion status to use after verifying PI error. */ +- bio->cpl = *cpl; +- +- if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { +- /* Read without PI checking to verify PI error. */ +- ret = bdev_nvme_no_pi_readv(bio, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.num_blocks, +- bdev_io->u.bdev.offset_blocks); +- if (ret == 0) { +- return; +- } +- } +- } +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static void +-bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- if (spdk_nvme_cpl_is_pi_error(cpl)) { +- SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", +- cpl->status.sct, cpl->status.sc); +- /* Run PI verification for write data buffer if PI error is detected. */ +- bdev_nvme_verify_pi_error(bio); +- } +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static void +-bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- +- /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. +- * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 
+- */ +- bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; +- +- if (spdk_nvme_cpl_is_pi_error(cpl)) { +- SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", +- cpl->status.sct, cpl->status.sc); +- /* Run PI verification for zone append data buffer if PI error is detected. */ +- bdev_nvme_verify_pi_error(bio); +- } +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static void +-bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- if (spdk_nvme_cpl_is_pi_error(cpl)) { +- SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", +- cpl->status.sct, cpl->status.sc); +- /* Run PI verification for compare data buffer if PI error is detected. */ +- bdev_nvme_verify_pi_error(bio); +- } +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static void +-bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- /* Compare operation completion */ +- if (!bio->first_fused_completed) { +- /* Save compare result for write callback */ +- bio->cpl = *cpl; +- bio->first_fused_completed = true; +- return; +- } +- +- /* Write operation completion */ +- if (spdk_nvme_cpl_is_error(&bio->cpl)) { +- /* If bio->cpl is already an error, it means the compare operation failed. In that case, +- * complete the IO with the compare operation's status. +- */ +- if (!spdk_nvme_cpl_is_error(cpl)) { +- SPDK_ERRLOG("Unexpected write success after compare failure.\n"); +- } +- +- bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); +- } else { +- bdev_nvme_io_complete_nvme_status(bio, cpl); +- } +-} +- +-static void +-bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static int +-fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) +-{ +- switch (desc->zt) { +- case SPDK_NVME_ZONE_TYPE_SEQWR: +- info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; +- break; +- default: +- SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); +- return -EIO; +- } +- +- switch (desc->zs) { +- case SPDK_NVME_ZONE_STATE_EMPTY: +- info->state = SPDK_BDEV_ZONE_STATE_EMPTY; +- break; +- case SPDK_NVME_ZONE_STATE_IOPEN: +- info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; +- break; +- case SPDK_NVME_ZONE_STATE_EOPEN: +- info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; +- break; +- case SPDK_NVME_ZONE_STATE_CLOSED: +- info->state = SPDK_BDEV_ZONE_STATE_CLOSED; +- break; +- case SPDK_NVME_ZONE_STATE_RONLY: +- info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; +- break; +- case SPDK_NVME_ZONE_STATE_FULL: +- info->state = SPDK_BDEV_ZONE_STATE_FULL; +- break; +- case SPDK_NVME_ZONE_STATE_OFFLINE: +- info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; +- break; +- default: +- SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); +- return -EIO; +- } +- +- info->zone_id = desc->zslba; +- info->write_pointer = desc->wp; +- info->capacity = desc->zcap; +- +- return 0; +-} +- +-static void +-bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; +- uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; +- struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; +- uint64_t max_zones_per_buf, i; +- uint32_t zone_report_bufsize; +- struct spdk_nvme_ns *ns; +- struct 
spdk_nvme_qpair *qpair; +- int ret; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- goto out_complete_io_nvme_cpl; +- } +- +- if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { +- ret = -ENXIO; +- goto out_complete_io_ret; +- } +- +- ns = bio->io_path->nvme_ns->ns; +- qpair = bio->io_path->qpair->qpair; +- +- zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); +- max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / +- sizeof(bio->zone_report_buf->descs[0]); +- +- if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { +- ret = -EINVAL; +- goto out_complete_io_ret; +- } +- +- if (!bio->zone_report_buf->nr_zones) { +- ret = -EINVAL; +- goto out_complete_io_ret; +- } +- +- for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { +- ret = fill_zone_from_report(&info[bio->handled_zones], +- &bio->zone_report_buf->descs[i]); +- if (ret) { +- goto out_complete_io_ret; +- } +- bio->handled_zones++; +- } +- +- if (bio->handled_zones < zones_to_copy) { +- uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); +- uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); +- +- memset(bio->zone_report_buf, 0, zone_report_bufsize); +- ret = spdk_nvme_zns_report_zones(ns, qpair, +- bio->zone_report_buf, zone_report_bufsize, +- slba, SPDK_NVME_ZRA_LIST_ALL, true, +- bdev_nvme_get_zone_info_done, bio); +- if (!ret) { +- return; +- } else { +- goto out_complete_io_ret; +- } +- } +- +-out_complete_io_nvme_cpl: +- free(bio->zone_report_buf); +- bio->zone_report_buf = NULL; +- bdev_nvme_io_complete_nvme_status(bio, cpl); +- return; +- +-out_complete_io_ret: +- free(bio->zone_report_buf); +- bio->zone_report_buf = NULL; +- bdev_nvme_io_complete(bio, ret); +-} +- +-static void +-bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- bdev_nvme_io_complete_nvme_status(bio, cpl); +-} +- +-static void +-bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) +-{ +- struct nvme_bdev_io *bio = ctx; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- const struct spdk_nvme_cpl *cpl = &bio->cpl; +- +- assert(bdev_nvme_io_type_is_admin(bdev_io->type)); +- +- __bdev_nvme_io_complete(bdev_io, 0, cpl); +-} +- +-static void +-bdev_nvme_abort_complete(void *ctx) +-{ +- struct nvme_bdev_io *bio = ctx; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- +- if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { +- __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); +- } else { +- __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); +- } +-} +- +-static void +-bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- bio->cpl = *cpl; +- spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); +-} +- +-static void +-bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct nvme_bdev_io *bio = ref; +- +- bio->cpl = *cpl; +- spdk_thread_send_msg(bio->orig_thread, +- bdev_nvme_admin_passthru_complete_nvme_status, bio); +-} +- +-static void +-bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) +-{ +- struct nvme_bdev_io *bio = ref; +- struct iovec *iov; +- +- bio->iov_offset = sgl_offset; +- for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { +- iov = &bio->iovs[bio->iovpos]; +- if (bio->iov_offset < iov->iov_len) { +- break; +- } +- +- bio->iov_offset -= iov->iov_len; +- } +-} +- +-static int 
+-bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) +-{ +- struct nvme_bdev_io *bio = ref; +- struct iovec *iov; +- +- assert(bio->iovpos < bio->iovcnt); +- +- iov = &bio->iovs[bio->iovpos]; +- +- *address = iov->iov_base; +- *length = iov->iov_len; +- +- if (bio->iov_offset) { +- assert(bio->iov_offset <= iov->iov_len); +- *address += bio->iov_offset; +- *length -= bio->iov_offset; +- } +- +- bio->iov_offset += *length; +- if (bio->iov_offset == iov->iov_len) { +- bio->iovpos++; +- bio->iov_offset = 0; +- } +- +- return 0; +-} +- +-static void +-bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) +-{ +- struct nvme_bdev_io *bio = ref; +- struct iovec *iov; +- +- bio->fused_iov_offset = sgl_offset; +- for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { +- iov = &bio->fused_iovs[bio->fused_iovpos]; +- if (bio->fused_iov_offset < iov->iov_len) { +- break; +- } +- +- bio->fused_iov_offset -= iov->iov_len; +- } +-} +- +-static int +-bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) +-{ +- struct nvme_bdev_io *bio = ref; +- struct iovec *iov; +- +- assert(bio->fused_iovpos < bio->fused_iovcnt); +- +- iov = &bio->fused_iovs[bio->fused_iovpos]; +- +- *address = iov->iov_base; +- *length = iov->iov_len; +- +- if (bio->fused_iov_offset) { +- assert(bio->fused_iov_offset <= iov->iov_len); +- *address += bio->fused_iov_offset; +- *length -= bio->fused_iov_offset; +- } +- +- bio->fused_iov_offset += *length; +- if (bio->fused_iov_offset == iov->iov_len) { +- bio->fused_iovpos++; +- bio->fused_iov_offset = 0; +- } +- +- return 0; +-} +- +-static int +-bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba) +-{ +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", +- lba_count, lba); +- +- bio->iovs = iov; +- bio->iovcnt = iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- +- rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, +- bio->io_path->qpair->qpair, +- lba, lba_count, +- bdev_nvme_no_pi_readv_done, bio, 0, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- md, 0, 0); +- +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); +- } +- return rc; +-} +- +-static int +-bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, +- struct spdk_bdev_ext_io_opts *ext_opts) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", +- lba_count, lba); +- +- bio->iovs = iov; +- bio->iovcnt = iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- +- if (ext_opts) { +- bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); +- bio->ext_opts.memory_domain = ext_opts->memory_domain; +- bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; +- bio->ext_opts.io_flags = flags; +- bio->ext_opts.metadata = md; +- +- rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, +- bdev_nvme_readv_done, bio, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- &bio->ext_opts); +- } else if (iovcnt == 1) { +- rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, +- lba_count, +- bdev_nvme_readv_done, bio, +- flags, +- 0, 0); +- } else { +- rc = 
spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, +- bdev_nvme_readv_done, bio, flags, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- md, 0, 0); +- } +- +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("readv failed: rc = %d\n", rc); +- } +- return rc; +-} +- +-static int +-bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", +- lba_count, lba); +- +- bio->iovs = iov; +- bio->iovcnt = iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- +- if (ext_opts) { +- bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); +- bio->ext_opts.memory_domain = ext_opts->memory_domain; +- bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; +- bio->ext_opts.io_flags = flags; +- bio->ext_opts.metadata = md; +- +- rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, +- bdev_nvme_writev_done, bio, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- &bio->ext_opts); +- } else if (iovcnt == 1) { +- rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, +- lba_count, +- bdev_nvme_writev_done, bio, +- flags, +- 0, 0); +- } else { +- rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, +- bdev_nvme_writev_done, bio, flags, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- md, 0, 0); +- } +- +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("writev failed: rc = %d\n", rc); +- } +- return rc; +-} +- +-static int +-bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t zslba, +- uint32_t flags) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", +- lba_count, zslba); +- +- bio->iovs = iov; +- bio->iovcnt = iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- +- if (iovcnt == 1) { +- rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, +- lba_count, +- bdev_nvme_zone_appendv_done, bio, +- flags, +- 0, 0); +- } else { +- rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, +- bdev_nvme_zone_appendv_done, bio, flags, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- md, 0, 0); +- } +- +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("zone append failed: rc = %d\n", rc); +- } +- return rc; +-} +- +-static int +-bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, +- uint32_t flags) +-{ +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", +- lba_count, lba); +- +- bio->iovs = iov; +- bio->iovcnt = iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- +- rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, +- bio->io_path->qpair->qpair, +- lba, lba_count, +- bdev_nvme_comparev_done, bio, flags, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, +- md, 0, 0); +- +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("comparev failed: rc = %d\n", rc); +- } +- return rc; +-} +- +-static int +-bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec 
*cmp_iov, int cmp_iovcnt, +- struct iovec *write_iov, int write_iovcnt, +- void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- int rc; +- +- SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", +- lba_count, lba); +- +- bio->iovs = cmp_iov; +- bio->iovcnt = cmp_iovcnt; +- bio->iovpos = 0; +- bio->iov_offset = 0; +- bio->fused_iovs = write_iov; +- bio->fused_iovcnt = write_iovcnt; +- bio->fused_iovpos = 0; +- bio->fused_iov_offset = 0; +- +- if (bdev_io->num_retries == 0) { +- bio->first_fused_submitted = false; +- bio->first_fused_completed = false; +- } +- +- if (!bio->first_fused_submitted) { +- flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; +- memset(&bio->cpl, 0, sizeof(bio->cpl)); +- +- rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, +- bdev_nvme_comparev_and_writev_done, bio, flags, +- bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); +- if (rc == 0) { +- bio->first_fused_submitted = true; +- flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; +- } else { +- if (rc != -ENOMEM) { +- SPDK_ERRLOG("compare failed: rc = %d\n", rc); +- } +- return rc; +- } +- } +- +- flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; +- +- rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, +- bdev_nvme_comparev_and_writev_done, bio, flags, +- bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); +- if (rc != 0 && rc != -ENOMEM) { +- SPDK_ERRLOG("write failed: rc = %d\n", rc); +- rc = 0; +- } +- +- return rc; +-} +- +-static int +-bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) +-{ +- struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; +- struct spdk_nvme_dsm_range *range; +- uint64_t offset, remaining; +- uint64_t num_ranges_u64; +- uint16_t num_ranges; +- int rc; +- +- num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / +- SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; +- if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { +- SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); +- return -EINVAL; +- } +- num_ranges = (uint16_t)num_ranges_u64; +- +- offset = offset_blocks; +- remaining = num_blocks; +- range = &dsm_ranges[0]; +- +- /* Fill max-size ranges until the remaining blocks fit into one range */ +- while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { +- range->attributes.raw = 0; +- range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; +- range->starting_lba = offset; +- +- offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; +- remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; +- range++; +- } +- +- /* Final range describes the remaining blocks */ +- range->attributes.raw = 0; +- range->length = remaining; +- range->starting_lba = offset; +- +- rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, +- bio->io_path->qpair->qpair, +- SPDK_NVME_DSM_ATTR_DEALLOCATE, +- dsm_ranges, num_ranges, +- bdev_nvme_queued_done, bio); +- +- return rc; +-} +- +-static int +-bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) +-{ +- if (num_blocks > UINT16_MAX + 1) { +- SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); +- return -EINVAL; +- } +- +- return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 
+- bio->io_path->qpair->qpair, +- offset_blocks, num_blocks, +- bdev_nvme_queued_done, bio, +- 0); +-} +- +-static int +-bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, +- struct spdk_bdev_zone_info *info) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); +- uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); +- uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); +- +- if (zone_id % zone_size != 0) { +- return -EINVAL; +- } +- +- if (num_zones > total_zones || !num_zones) { +- return -EINVAL; +- } +- +- assert(!bio->zone_report_buf); +- bio->zone_report_buf = calloc(1, zone_report_bufsize); +- if (!bio->zone_report_buf) { +- return -ENOMEM; +- } +- +- bio->handled_zones = 0; +- +- return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, +- zone_id, SPDK_NVME_ZRA_LIST_ALL, true, +- bdev_nvme_get_zone_info_done, bio); +-} +- +-static int +-bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, +- enum spdk_bdev_zone_action action) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- +- switch (action) { +- case SPDK_BDEV_ZONE_CLOSE: +- return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, +- bdev_nvme_zone_management_done, bio); +- case SPDK_BDEV_ZONE_FINISH: +- return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, +- bdev_nvme_zone_management_done, bio); +- case SPDK_BDEV_ZONE_OPEN: +- return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, +- bdev_nvme_zone_management_done, bio); +- case SPDK_BDEV_ZONE_RESET: +- return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, +- bdev_nvme_zone_management_done, bio); +- case SPDK_BDEV_ZONE_OFFLINE: +- return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, +- bdev_nvme_zone_management_done, bio); +- default: +- return -EINVAL; +- } +-} +- +-static void +-bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, +- struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) +-{ +- struct nvme_io_path *io_path; +- struct nvme_ctrlr *nvme_ctrlr; +- uint32_t max_xfer_size; +- int rc = -ENXIO; +- +- /* Choose the first ctrlr which is not failed. */ +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- nvme_ctrlr = io_path->qpair->ctrlr; +- +- /* We should skip any unavailable nvme_ctrlr rather than checking +- * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
+- */ +- if (!nvme_ctrlr_is_available(nvme_ctrlr)) { +- continue; +- } +- +- max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); +- +- if (nbytes > max_xfer_size) { +- SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); +- rc = -EINVAL; +- goto err; +- } +- +- bio->io_path = io_path; +- bio->orig_thread = spdk_get_thread(); +- +- rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, +- bdev_nvme_admin_passthru_done, bio); +- if (rc == 0) { +- return; +- } +- } +- +-err: +- bdev_nvme_admin_passthru_complete(bio, rc); +-} +- +-static int +-bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, +- void *buf, size_t nbytes) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); +- struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); +- +- if (nbytes > max_xfer_size) { +- SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); +- return -EINVAL; +- } +- +- /* +- * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, +- * so fill it out automatically. +- */ +- cmd->nsid = spdk_nvme_ns_get_id(ns); +- +- return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, +- (uint32_t)nbytes, bdev_nvme_queued_done, bio); +-} +- +-static int +-bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, +- void *buf, size_t nbytes, void *md_buf, size_t md_len) +-{ +- struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; +- struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; +- size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); +- uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); +- struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); +- +- if (nbytes > max_xfer_size) { +- SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); +- return -EINVAL; +- } +- +- if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { +- SPDK_ERRLOG("invalid meta data buffer size\n"); +- return -EINVAL; +- } +- +- /* +- * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, +- * so fill it out automatically. +- */ +- cmd->nsid = spdk_nvme_ns_get_id(ns); +- +- return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, +- (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); +-} +- +-static void +-bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, +- struct nvme_bdev_io *bio_to_abort) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); +- struct nvme_io_path *io_path; +- struct nvme_ctrlr *nvme_ctrlr; +- int rc = 0; +- +- bio->orig_thread = spdk_get_thread(); +- +- rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); +- if (rc == 0) { +- __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); +- return; +- } +- +- rc = 0; +- +- /* Even admin commands, they were submitted to only nvme_ctrlrs which were +- * on any io_path. So traverse the io_path list for not only I/O commands +- * but also admin commands. +- */ +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- nvme_ctrlr = io_path->qpair->ctrlr; +- +- rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, +- io_path->qpair->qpair, +- bio_to_abort, +- bdev_nvme_abort_done, bio); +- if (rc == -ENOENT) { +- /* If no command was found in I/O qpair, the target command may be +- * admin command. 
+- */ +- rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, +- NULL, +- bio_to_abort, +- bdev_nvme_abort_done, bio); +- } +- +- if (rc != -ENOENT) { +- break; +- } +- } +- +- if (rc != 0) { +- /* If no command was found or there was any error, complete the abort +- * request with failure. +- */ +- __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); +- } +-} +- +-static int +-bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, +- uint64_t num_blocks) +-{ +- struct spdk_nvme_scc_source_range range = { +- .slba = src_offset_blocks, +- .nlb = num_blocks - 1 +- }; +- +- return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, +- bio->io_path->qpair->qpair, +- &range, 1, dst_offset_blocks, +- bdev_nvme_queued_done, bio); +-} +- +-static void +-bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) +-{ +- const char *action; +- +- if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { +- action = "reset"; +- } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { +- action = "abort"; +- } else { +- action = "none"; +- } +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "action_on_timeout", action); +- spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); +- spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); +- spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); +- spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); +- spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); +- spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); +- spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); +- spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); +- spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); +- spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); +- spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); +- spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); +- spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); +- spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); +- spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); +- spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); +- spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); +- spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); +- spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); +- spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) +-{ +- struct spdk_nvme_transport_id trid; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", ctx->name); +- 
spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); +- +- trid = ctx->trid; +- memset(trid.subnqn, 0, sizeof(trid.subnqn)); +- nvme_bdev_dump_trid_json(&trid, w); +- +- spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); +- spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); +- spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); +- spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", +- ctx->bdev_opts.fast_io_fail_timeout_sec); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, +- struct nvme_ctrlr *nvme_ctrlr) +-{ +- struct spdk_nvme_transport_id *trid; +- const struct spdk_nvme_ctrlr_opts *opts; +- +- if (nvme_ctrlr->opts.from_discovery_service) { +- /* Do not emit an RPC for this - it will be implicitly +- * covered by a separate bdev_nvme_start_discovery or +- * bdev_nvme_start_mdns_discovery RPC. +- */ +- return; +- } +- +- trid = &nvme_ctrlr->active_path_id->trid; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); +- nvme_bdev_dump_trid_json(trid, w); +- spdk_json_write_named_bool(w, "prchk_reftag", +- (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); +- spdk_json_write_named_bool(w, "prchk_guard", +- (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); +- spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); +- spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); +- spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", +- nvme_ctrlr->opts.fast_io_fail_timeout_sec); +- +- opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); +- spdk_json_write_named_bool(w, "hdgst", opts->header_digest); +- spdk_json_write_named_bool(w, "ddgst", opts->data_digest); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); +- spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static int +-bdev_nvme_config_json(struct spdk_json_write_ctx *w) +-{ +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- struct nvme_ctrlr *nvme_ctrlr; +- struct discovery_ctx *ctx; +- +- bdev_nvme_opts_config_json(w); +- +- pthread_mutex_lock(&g_bdev_nvme_mutex); +- +- TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { +- TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { +- nvme_ctrlr_config_json(w, nvme_ctrlr); +- } +- } +- +- TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { +- if (!ctx->from_mdns_discovery_service) { +- bdev_nvme_discovery_config_json(w, ctx); +- } +- } +- +- bdev_nvme_mdns_discovery_config_json(w); +- +- /* Dump as last parameter to give all NVMe bdevs chance to be constructed +- * before enabling hotplug poller. 
+- */ +- bdev_nvme_hotplug_config_json(w); +- +- pthread_mutex_unlock(&g_bdev_nvme_mutex); +- return 0; +-} +- +-struct spdk_nvme_ctrlr * +-bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) +-{ +- struct nvme_bdev *nbdev; +- struct nvme_ns *nvme_ns; +- +- if (!bdev || bdev->module != &nvme_if) { +- return NULL; +- } +- +- nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); +- nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); +- assert(nvme_ns != NULL); +- +- return nvme_ns->ctrlr->ctrlr; +-} +- +-void +-nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) +-{ +- struct nvme_ns *nvme_ns = io_path->nvme_ns; +- struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; +- const struct spdk_nvme_ctrlr_data *cdata; +- const struct spdk_nvme_transport_id *trid; +- const char *adrfam_str; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); +- +- cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); +- trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); +- +- spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); +- spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); +- spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); +- spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); +- +- spdk_json_write_named_object_begin(w, "transport"); +- spdk_json_write_named_string(w, "trtype", trid->trstring); +- spdk_json_write_named_string(w, "traddr", trid->traddr); +- if (trid->trsvcid[0] != '\0') { +- spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); +- } +- adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); +- if (adrfam_str) { +- spdk_json_write_named_string(w, "adrfam", adrfam_str); +- } +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-void +-bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) +-{ +- struct discovery_ctx *ctx; +- struct discovery_entry_ctx *entry_ctx; +- +- spdk_json_write_array_begin(w); +- TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", ctx->name); +- +- spdk_json_write_named_object_begin(w, "trid"); +- nvme_bdev_dump_trid_json(&ctx->trid, w); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_array_begin(w, "referrals"); +- TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_object_begin(w, "trid"); +- nvme_bdev_dump_trid_json(&entry_ctx->trid, w); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- } +- spdk_json_write_array_end(w); +- +- spdk_json_write_object_end(w); +- } +- spdk_json_write_array_end(w); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) +- +-SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) +-{ +- struct spdk_trace_tpoint_opts opts[] = { +- { +- "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, +- OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, +- {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} +- }, +- { +- "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, +- OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, +- {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} +- } +- }; +- +- +- spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); +- spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); +- spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); +- spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); +- 
spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); +- spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "bdev_nvme.h" ++ ++#include "spdk/accel.h" ++#include "spdk/config.h" ++#include "spdk/endian.h" ++#include "spdk/bdev.h" ++#include "spdk/json.h" ++#include "spdk/likely.h" ++#include "spdk/nvme.h" ++#include "spdk/nvme_ocssd.h" ++#include "spdk/nvme_zns.h" ++#include "spdk/opal.h" ++#include "spdk/thread.h" ++#include "spdk/trace.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++ ++#include "spdk_internal/usdt.h" ++#include "spdk_internal/trace_defs.h" ++ ++#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true ++#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) ++ ++#define NSID_STR_LEN 10 ++ ++static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); ++ ++struct nvme_bdev_io { ++ /** array of iovecs to transfer. */ ++ struct iovec *iovs; ++ ++ /** Number of iovecs in iovs array. */ ++ int iovcnt; ++ ++ /** Current iovec position. */ ++ int iovpos; ++ ++ /** Offset in current iovec. */ ++ uint32_t iov_offset; ++ ++ /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path ++ * being reset in a reset I/O. ++ */ ++ struct nvme_io_path *io_path; ++ ++ /** array of iovecs to transfer. */ ++ struct iovec *fused_iovs; ++ ++ /** Number of iovecs in iovs array. */ ++ int fused_iovcnt; ++ ++ /** Current iovec position. */ ++ int fused_iovpos; ++ ++ /** Offset in current iovec. */ ++ uint32_t fused_iov_offset; ++ ++ /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ ++ struct spdk_nvme_cpl cpl; ++ ++ /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ ++ struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; ++ ++ /** Originating thread */ ++ struct spdk_thread *orig_thread; ++ ++ /** Keeps track if first of fused commands was submitted */ ++ bool first_fused_submitted; ++ ++ /** Keeps track if first of fused commands was completed */ ++ bool first_fused_completed; ++ ++ /** Temporary pointer to zone report buffer */ ++ struct spdk_nvme_zns_zone_report *zone_report_buf; ++ ++ /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ ++ uint64_t handled_zones; ++ ++ /** Expiration value in ticks to retry the current I/O. */ ++ uint64_t retry_ticks; ++ ++ /* How many times the current I/O was retried. */ ++ int32_t retry_count; ++ ++ /* Current tsc at submit time. 
*/ ++ uint64_t submit_tsc; ++}; ++ ++struct nvme_probe_skip_entry { ++ struct spdk_nvme_transport_id trid; ++ TAILQ_ENTRY(nvme_probe_skip_entry) tailq; ++}; ++/* All the controllers deleted by users via RPC are skipped by hotplug monitor */ ++static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( ++ g_skipped_nvme_ctrlrs); ++ ++static struct spdk_bdev_nvme_opts g_opts = { ++ .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, ++ .timeout_us = 0, ++ .timeout_admin_us = 0, ++ .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, ++ .transport_retry_count = 4, ++ .arbitration_burst = 0, ++ .low_priority_weight = 0, ++ .medium_priority_weight = 0, ++ .high_priority_weight = 0, ++ .nvme_adminq_poll_period_us = 10000ULL, ++ .nvme_ioq_poll_period_us = 0, ++ .io_queue_requests = 0, ++ .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, ++ .bdev_retry_count = 3, ++ .transport_ack_timeout = 0, ++ .ctrlr_loss_timeout_sec = 0, ++ .reconnect_delay_sec = 0, ++ .fast_io_fail_timeout_sec = 0, ++ .disable_auto_failback = false, ++ .generate_uuids = false, ++ .transport_tos = 0, ++ .nvme_error_stat = false, ++ .io_path_stat = false, ++}; ++ ++#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL ++#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL ++ ++static int g_hot_insert_nvme_controller_index = 0; ++static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; ++static bool g_nvme_hotplug_enabled = false; ++struct spdk_thread *g_bdev_nvme_init_thread; ++static struct spdk_poller *g_hotplug_poller; ++static struct spdk_poller *g_hotplug_probe_poller; ++static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; ++ ++static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx); ++static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx); ++static int bdev_nvme_library_init(void); ++static void bdev_nvme_library_fini(void); ++static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, ++ struct spdk_bdev_io *bdev_io); ++static void bdev_nvme_submit_request(struct spdk_io_channel *ch, ++ struct spdk_bdev_io *bdev_io); ++static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); ++static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba); ++static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts); ++static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, ++ uint64_t zslba, uint32_t flags); ++static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags); ++static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, ++ struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, ++ int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags); ++static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, ++ uint32_t num_zones, struct spdk_bdev_zone_info *info); ++static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, ++ enum 
spdk_bdev_zone_action action); ++static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, ++ struct nvme_bdev_io *bio, ++ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); ++static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, ++ void *buf, size_t nbytes); ++static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, ++ void *buf, size_t nbytes, void *md_buf, size_t md_len); ++static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, ++ struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); ++static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); ++static int bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr); ++static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove); ++static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); ++static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); ++ ++static struct nvme_ns *nvme_ns_alloc(void); ++static void nvme_ns_free(struct nvme_ns *ns); ++ ++static int ++nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) ++{ ++ return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; ++} ++ ++RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); ++ ++struct spdk_nvme_qpair * ++bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) ++{ ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ ++ assert(ctrlr_io_ch != NULL); ++ ++ ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); ++ ++ return ctrlr_ch->qpair->qpair; ++} ++ ++static int ++bdev_nvme_get_ctx_size(void) ++{ ++ return sizeof(struct nvme_bdev_io); ++} ++ ++static struct spdk_bdev_module nvme_if = { ++ .name = "nvme", ++ .async_fini = true, ++ .module_init = bdev_nvme_library_init, ++ .module_fini = bdev_nvme_library_fini, ++ .config_json = bdev_nvme_config_json, ++ .get_ctx_size = bdev_nvme_get_ctx_size, ++ ++}; ++SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) ++ ++struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); ++pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; ++bool g_bdev_nvme_module_finish; ++ ++struct nvme_bdev_ctrlr * ++nvme_bdev_ctrlr_get_by_name(const char *name) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ ++ TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { ++ if (strcmp(name, nbdev_ctrlr->name) == 0) { ++ break; ++ } ++ } ++ ++ return nbdev_ctrlr; ++} ++ ++static struct nvme_ctrlr * ++nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, ++ const struct spdk_nvme_transport_id *trid) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ ++ TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { ++ if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { ++ break; ++ } ++ } ++ ++ return nvme_ctrlr; ++} ++ ++static struct nvme_bdev * ++nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) ++{ ++ struct nvme_bdev *bdev; ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { ++ if (bdev->nsid == nsid) { ++ break; ++ } ++ } ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ return bdev; ++} ++ ++struct nvme_ns * ++nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) ++{ ++ struct nvme_ns ns; ++ ++ assert(nsid > 0); ++ ++ ns.id = nsid; ++ return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); ++} ++ ++struct nvme_ns * ++nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); ++} ++ ++struct nvme_ns * 
++nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) ++{ ++ if (ns == NULL) { ++ return NULL; ++ } ++ ++ return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); ++} ++ ++static struct nvme_ctrlr * ++nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct nvme_ctrlr *nvme_ctrlr = NULL; ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { ++ nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); ++ if (nvme_ctrlr != NULL) { ++ break; ++ } ++ } ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ return nvme_ctrlr; ++} ++ ++struct nvme_ctrlr * ++nvme_ctrlr_get_by_name(const char *name) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct nvme_ctrlr *nvme_ctrlr = NULL; ++ ++ if (name == NULL) { ++ return NULL; ++ } ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); ++ if (nbdev_ctrlr != NULL) { ++ nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); ++ } ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ return nvme_ctrlr; ++} ++ ++void ++nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { ++ fn(nbdev_ctrlr, ctx); ++ } ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++} ++ ++void ++nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) ++{ ++ const char *trtype_str; ++ const char *adrfam_str; ++ ++ trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); ++ if (trtype_str) { ++ spdk_json_write_named_string(w, "trtype", trtype_str); ++ } ++ ++ adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); ++ if (adrfam_str) { ++ spdk_json_write_named_string(w, "adrfam", adrfam_str); ++ } ++ ++ if (trid->traddr[0] != '\0') { ++ spdk_json_write_named_string(w, "traddr", trid->traddr); ++ } ++ ++ if (trid->trsvcid[0] != '\0') { ++ spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); ++ } ++ ++ if (trid->subnqn[0] != '\0') { ++ spdk_json_write_named_string(w, "subnqn", trid->subnqn); ++ } ++} ++ ++static void ++nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, ++ struct nvme_ctrlr *nvme_ctrlr) ++{ ++ SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ ++ TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); ++ if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ return; ++ } ++ TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); ++ ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); ++ ++ free(nbdev_ctrlr->name); ++ free(nbdev_ctrlr); ++} ++ ++static void ++_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct nvme_path_id *path_id, *tmp_path; ++ struct nvme_ns *ns, *tmp_ns; ++ ++ free(nvme_ctrlr->copied_ana_desc); ++ spdk_free(nvme_ctrlr->ana_log_page); ++ ++ if (nvme_ctrlr->opal_dev) { ++ spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); ++ nvme_ctrlr->opal_dev = NULL; ++ } ++ ++ if (nvme_ctrlr->nbdev_ctrlr) { ++ nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); ++ } ++ ++ RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { ++ RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); ++ nvme_ns_free(ns); ++ } ++ ++ TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { ++ 
TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); ++ free(path_id); ++ } ++ ++ pthread_mutex_destroy(&nvme_ctrlr->mutex); ++ ++ free(nvme_ctrlr); ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); ++ spdk_bdev_module_fini_done(); ++ return; ++ } ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++} ++ ++static int ++nvme_detach_poller(void *arg) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = arg; ++ int rc; ++ ++ rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); ++ if (rc != -EAGAIN) { ++ spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); ++ _nvme_ctrlr_delete(nvme_ctrlr); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ int rc; ++ ++ spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); ++ ++ /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ ++ spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); ++ ++ /* If we got here, the reset/detach poller cannot be active */ ++ assert(nvme_ctrlr->reset_detach_poller == NULL); ++ nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, ++ nvme_ctrlr, 1000); ++ if (nvme_ctrlr->reset_detach_poller == NULL) { ++ SPDK_ERRLOG("Failed to register detach poller\n"); ++ goto error; ++ } ++ ++ rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to detach the NVMe controller\n"); ++ goto error; ++ } ++ ++ return; ++error: ++ /* We don't have a good way to handle errors here, so just do what we can and delete the ++ * controller without detaching the underlying NVMe device. 
++ */ ++ spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); ++ _nvme_ctrlr_delete(nvme_ctrlr); ++} ++ ++static void ++nvme_ctrlr_unregister_cb(void *io_device) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = io_device; ++ ++ nvme_ctrlr_delete(nvme_ctrlr); ++} ++ ++static void ++nvme_ctrlr_unregister(void *ctx) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); ++} ++ ++static bool ++nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ if (!nvme_ctrlr->destruct) { ++ return false; ++ } ++ ++ if (nvme_ctrlr->ref > 0) { ++ return false; ++ } ++ ++ if (nvme_ctrlr->resetting) { ++ return false; ++ } ++ ++ if (nvme_ctrlr->ana_log_page_updating) { ++ return false; ++ } ++ ++ if (nvme_ctrlr->io_path_cache_clearing) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); ++ ++ assert(nvme_ctrlr->ref > 0); ++ nvme_ctrlr->ref--; ++ ++ if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return; ++ } ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); ++} ++ ++static void ++bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) ++{ ++ nbdev_ch->current_io_path = NULL; ++ nbdev_ch->rr_counter = 0; ++} ++ ++static struct nvme_io_path * ++_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) ++{ ++ struct nvme_io_path *io_path; ++ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ if (io_path->nvme_ns == nvme_ns) { ++ break; ++ } ++ } ++ ++ return io_path; ++} ++ ++static int ++_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) ++{ ++ struct nvme_io_path *io_path; ++ struct spdk_io_channel *ch; ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ struct nvme_qpair *nvme_qpair; ++ ++ io_path = calloc(1, sizeof(*io_path)); ++ if (io_path == NULL) { ++ SPDK_ERRLOG("Failed to alloc io_path.\n"); ++ return -ENOMEM; ++ } ++ ++ if (g_opts.io_path_stat) { ++ io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); ++ if (io_path->stat == NULL) { ++ free(io_path); ++ SPDK_ERRLOG("Failed to alloc io_path stat.\n"); ++ return -ENOMEM; ++ } ++ spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); ++ } ++ ++ io_path->nvme_ns = nvme_ns; ++ ++ ch = spdk_get_io_channel(nvme_ns->ctrlr); ++ if (ch == NULL) { ++ free(io_path->stat); ++ free(io_path); ++ SPDK_ERRLOG("Failed to alloc io_channel.\n"); ++ return -ENOMEM; ++ } ++ ++ ctrlr_ch = spdk_io_channel_get_ctx(ch); ++ ++ nvme_qpair = ctrlr_ch->qpair; ++ assert(nvme_qpair != NULL); ++ ++ io_path->qpair = nvme_qpair; ++ TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); ++ ++ io_path->nbdev_ch = nbdev_ch; ++ STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); ++ ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ ++ return 0; ++} ++ ++static void ++_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) ++{ ++ struct spdk_io_channel *ch; ++ struct nvme_qpair *nvme_qpair; ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ struct nvme_bdev *nbdev; ++ ++ nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); ++ ++ /* Add the statistics to nvme_ns before this path is destroyed. 
*/ ++ pthread_mutex_lock(&nbdev->mutex); ++ if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { ++ spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); ++ } ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ ++ STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); ++ ++ nvme_qpair = io_path->qpair; ++ assert(nvme_qpair != NULL); ++ ++ TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); ++ ++ ctrlr_ch = nvme_qpair->ctrlr_ch; ++ assert(ctrlr_ch != NULL); ++ ++ ch = spdk_io_channel_from_ctx(ctrlr_ch); ++ spdk_put_io_channel(ch); ++ ++ free(io_path->stat); ++ free(io_path); ++} ++ ++static void ++_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) ++{ ++ struct nvme_io_path *io_path, *tmp_io_path; ++ ++ STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { ++ _bdev_nvme_delete_io_path(nbdev_ch, io_path); ++ } ++} ++ ++static int ++bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_bdev_channel *nbdev_ch = ctx_buf; ++ struct nvme_bdev *nbdev = io_device; ++ struct nvme_ns *nvme_ns; ++ int rc; ++ ++ STAILQ_INIT(&nbdev_ch->io_path_list); ++ TAILQ_INIT(&nbdev_ch->retry_io_list); ++ ++ pthread_mutex_lock(&nbdev->mutex); ++ ++ nbdev_ch->mp_policy = nbdev->mp_policy; ++ nbdev_ch->mp_selector = nbdev->mp_selector; ++ nbdev_ch->rr_min_io = nbdev->rr_min_io; ++ ++ TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { ++ rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); ++ if (rc != 0) { ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ _bdev_nvme_delete_io_paths(nbdev_ch); ++ return rc; ++ } ++ } ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ return 0; ++} ++ ++/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. ++ * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
++ */ ++static inline void ++__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, ++ const struct spdk_nvme_cpl *cpl) ++{ ++ spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, ++ (uintptr_t)bdev_io); ++ if (cpl) { ++ spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); ++ } else { ++ spdk_bdev_io_complete(bdev_io, status); ++ } ++} ++ ++static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); ++ ++static void ++bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_bdev_channel *nbdev_ch = ctx_buf; ++ ++ bdev_nvme_abort_retry_ios(nbdev_ch); ++ _bdev_nvme_delete_io_paths(nbdev_ch); ++} ++ ++static inline bool ++bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_NVME_ADMIN: ++ case SPDK_BDEV_IO_TYPE_ABORT: ++ return true; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static inline bool ++nvme_ns_is_accessible(struct nvme_ns *nvme_ns) ++{ ++ if (spdk_unlikely(nvme_ns->ana_state_updating)) { ++ return false; ++ } ++ ++ switch (nvme_ns->ana_state) { ++ case SPDK_NVME_ANA_OPTIMIZED_STATE: ++ case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: ++ return true; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static inline bool ++nvme_io_path_is_connected(struct nvme_io_path *io_path) ++{ ++ if (spdk_unlikely(io_path->qpair->qpair == NULL)) { ++ return false; ++ } ++ ++ if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(io_path->qpair->qpair) != ++ SPDK_NVME_QPAIR_FAILURE_NONE)) { ++ return false; ++ } ++ ++ if (spdk_unlikely(io_path->qpair->ctrlr_ch->reset_iter != NULL)) { ++ return false; ++ } ++ ++ if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(io_path->qpair->ctrlr->ctrlr) != ++ SPDK_NVME_QPAIR_FAILURE_NONE) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static inline bool ++nvme_io_path_is_available(struct nvme_io_path *io_path) ++{ ++ if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { ++ return false; ++ } ++ ++ if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static inline bool ++nvme_io_path_is_failed(struct nvme_io_path *io_path) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ ++ nvme_ctrlr = io_path->qpair->ctrlr; ++ ++ if (nvme_ctrlr->destruct) { ++ return true; ++ } ++ ++ if (nvme_ctrlr->fast_io_fail_timedout) { ++ return true; ++ } ++ ++ if (nvme_ctrlr->resetting) { ++ if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { ++ return false; ++ } else { ++ return true; ++ } ++ } ++ ++ if (nvme_ctrlr->reconnect_is_delayed) { ++ return false; ++ } ++ ++ if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool ++nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ if (nvme_ctrlr->destruct) { ++ return false; ++ } ++ ++ if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { ++ return false; ++ } ++ ++ if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { ++ return false; ++ } ++ ++ return true; ++} ++ ++/* Simulate circular linked list. 
*/ ++static inline struct nvme_io_path * ++nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) ++{ ++ struct nvme_io_path *next_path; ++ ++ if (prev_path != NULL) { ++ next_path = STAILQ_NEXT(prev_path, stailq); ++ if (next_path != NULL) { ++ return next_path; ++ } ++ } ++ ++ return STAILQ_FIRST(&nbdev_ch->io_path_list); ++} ++ ++static struct nvme_io_path * ++_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) ++{ ++ struct nvme_io_path *io_path, *start, *non_optimized = NULL; ++ ++ start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); ++ ++ io_path = start; ++ do { ++ if (spdk_likely(nvme_io_path_is_connected(io_path) && ++ !io_path->nvme_ns->ana_state_updating)) { ++ switch (io_path->nvme_ns->ana_state) { ++ case SPDK_NVME_ANA_OPTIMIZED_STATE: ++ nbdev_ch->current_io_path = io_path; ++ return io_path; ++ case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: ++ if (non_optimized == NULL) { ++ non_optimized = io_path; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ io_path = nvme_io_path_get_next(nbdev_ch, io_path); ++ } while (io_path != start); ++ ++ if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { ++ /* We come here only if there is no optimized path. Cache even non_optimized ++ * path for load balance across multiple non_optimized paths. ++ */ ++ nbdev_ch->current_io_path = non_optimized; ++ } ++ ++ return non_optimized; ++} ++ ++static struct nvme_io_path * ++_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) ++{ ++ struct nvme_io_path *io_path; ++ struct nvme_io_path *optimized = NULL, *non_optimized = NULL; ++ uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; ++ uint32_t num_outstanding_reqs; ++ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ if (spdk_unlikely(!nvme_io_path_is_connected(io_path))) { ++ /* The device is currently resetting. */ ++ continue; ++ } ++ ++ if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { ++ continue; ++ } ++ ++ num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); ++ switch (io_path->nvme_ns->ana_state) { ++ case SPDK_NVME_ANA_OPTIMIZED_STATE: ++ if (num_outstanding_reqs < opt_min_qd) { ++ opt_min_qd = num_outstanding_reqs; ++ optimized = io_path; ++ } ++ break; ++ case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: ++ if (num_outstanding_reqs < non_opt_min_qd) { ++ non_opt_min_qd = num_outstanding_reqs; ++ non_optimized = io_path; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ ++ /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ ++ if (optimized != NULL) { ++ return optimized; ++ } ++ ++ return non_optimized; ++} ++ ++static inline struct nvme_io_path * ++bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) ++{ ++ if (spdk_likely(nbdev_ch->current_io_path != NULL)) { ++ if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { ++ return nbdev_ch->current_io_path; ++ } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { ++ if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { ++ return nbdev_ch->current_io_path; ++ } ++ nbdev_ch->rr_counter = 0; ++ } ++ } ++ ++ if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || ++ nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { ++ return _bdev_nvme_find_io_path(nbdev_ch); ++ } else { ++ return _bdev_nvme_find_io_path_min_qd(nbdev_ch); ++ } ++} ++ ++/* Return true if there is any io_path whose qpair is active or ctrlr is not failed, ++ * or false otherwise. 
++ * ++ * If any io_path has an active qpair but find_io_path() returned NULL, its namespace ++ * is likely to be non-accessible now but may become accessible. ++ * ++ * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr ++ * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed ++ * when starting to reset it but it is set to failed when the reset failed. Hence, if ++ * a ctrlr is unfailed, it is likely that it works fine or is resetting. ++ */ ++static bool ++any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) ++{ ++ struct nvme_io_path *io_path; ++ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ if (io_path->nvme_ns->ana_transition_timedout) { ++ continue; ++ } ++ ++ if (nvme_io_path_is_connected(io_path) || ++ !nvme_io_path_is_failed(io_path)) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++static void ++bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ struct spdk_io_channel *ch; ++ ++ if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { ++ _bdev_nvme_submit_request(nbdev_ch, bdev_io); ++ } else { ++ ch = spdk_io_channel_from_ctx(nbdev_ch); ++ bdev_nvme_submit_request(ch, bdev_io); ++ } ++} ++ ++static int ++bdev_nvme_retry_ios(void *arg) ++{ ++ struct nvme_bdev_channel *nbdev_ch = arg; ++ struct spdk_bdev_io *bdev_io, *tmp_bdev_io; ++ struct nvme_bdev_io *bio; ++ uint64_t now, delay_us; ++ ++ now = spdk_get_ticks(); ++ ++ TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { ++ bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ if (bio->retry_ticks > now) { ++ break; ++ } ++ ++ TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); ++ ++ bdev_nvme_retry_io(nbdev_ch, bdev_io); ++ } ++ ++ spdk_poller_unregister(&nbdev_ch->retry_io_poller); ++ ++ bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); ++ if (bdev_io != NULL) { ++ bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ ++ delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); ++ ++ nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, ++ delay_us); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, ++ struct nvme_bdev_io *bio, uint64_t delay_ms) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ struct spdk_bdev_io *tmp_bdev_io; ++ struct nvme_bdev_io *tmp_bio; ++ ++ bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; ++ ++ TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { ++ tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; ++ ++ if (tmp_bio->retry_ticks <= bio->retry_ticks) { ++ TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, ++ module_link); ++ return; ++ } ++ } ++ ++ /* No earlier I/Os were found. This I/O must be the new head. 
*/ ++ TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); ++ ++ spdk_poller_unregister(&nbdev_ch->retry_io_poller); ++ ++ nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, ++ delay_ms * 1000ULL); ++} ++ ++static void ++bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) ++{ ++ struct spdk_bdev_io *bdev_io, *tmp_io; ++ ++ TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { ++ TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); ++ __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); ++ } ++ ++ spdk_poller_unregister(&nbdev_ch->retry_io_poller); ++} ++ ++static int ++bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, ++ struct nvme_bdev_io *bio_to_abort) ++{ ++ struct spdk_bdev_io *bdev_io_to_abort; ++ ++ TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { ++ if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { ++ TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); ++ __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); ++ return 0; ++ } ++ } ++ ++ return -ENOENT; ++} ++ ++static void ++bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev *nbdev; ++ uint16_t sct, sc; ++ ++ assert(spdk_nvme_cpl_is_error(cpl)); ++ ++ nbdev = bdev_io->bdev->ctxt; ++ ++ if (nbdev->err_stat == NULL) { ++ return; ++ } ++ ++ sct = cpl->status.sct; ++ sc = cpl->status.sc; ++ ++ pthread_mutex_lock(&nbdev->mutex); ++ ++ nbdev->err_stat->status_type[sct]++; ++ switch (sct) { ++ case SPDK_NVME_SCT_GENERIC: ++ case SPDK_NVME_SCT_COMMAND_SPECIFIC: ++ case SPDK_NVME_SCT_MEDIA_ERROR: ++ case SPDK_NVME_SCT_PATH: ++ nbdev->err_stat->status[sct][sc]++; ++ break; ++ default: ++ break; ++ } ++ ++ pthread_mutex_unlock(&nbdev->mutex); ++} ++ ++static inline void ++bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ uint64_t num_blocks = bdev_io->u.bdev.num_blocks; ++ uint32_t blocklen = bdev_io->bdev->blocklen; ++ struct spdk_bdev_io_stat *stat; ++ uint64_t tsc_diff; ++ ++ if (bio->io_path->stat == NULL) { ++ return; ++ } ++ ++ tsc_diff = spdk_get_ticks() - bio->submit_tsc; ++ stat = bio->io_path->stat; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ stat->bytes_read += num_blocks * blocklen; ++ stat->num_read_ops++; ++ stat->read_latency_ticks += tsc_diff; ++ if (stat->max_read_latency_ticks < tsc_diff) { ++ stat->max_read_latency_ticks = tsc_diff; ++ } ++ if (stat->min_read_latency_ticks > tsc_diff) { ++ stat->min_read_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ stat->bytes_written += num_blocks * blocklen; ++ stat->num_write_ops++; ++ stat->write_latency_ticks += tsc_diff; ++ if (stat->max_write_latency_ticks < tsc_diff) { ++ stat->max_write_latency_ticks = tsc_diff; ++ } ++ if (stat->min_write_latency_ticks > tsc_diff) { ++ stat->min_write_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ stat->bytes_unmapped += num_blocks * blocklen; ++ stat->num_unmap_ops++; ++ stat->unmap_latency_ticks += tsc_diff; ++ if (stat->max_unmap_latency_ticks < tsc_diff) { ++ stat->max_unmap_latency_ticks = tsc_diff; ++ } ++ if (stat->min_unmap_latency_ticks > tsc_diff) { ++ stat->min_unmap_latency_ticks = tsc_diff; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ /* Track the data in the start phase only */ ++ if 
(!bdev_io->u.bdev.zcopy.start) { ++ break; ++ } ++ if (bdev_io->u.bdev.zcopy.populate) { ++ stat->bytes_read += num_blocks * blocklen; ++ stat->num_read_ops++; ++ stat->read_latency_ticks += tsc_diff; ++ if (stat->max_read_latency_ticks < tsc_diff) { ++ stat->max_read_latency_ticks = tsc_diff; ++ } ++ if (stat->min_read_latency_ticks > tsc_diff) { ++ stat->min_read_latency_ticks = tsc_diff; ++ } ++ } else { ++ stat->bytes_written += num_blocks * blocklen; ++ stat->num_write_ops++; ++ stat->write_latency_ticks += tsc_diff; ++ if (stat->max_write_latency_ticks < tsc_diff) { ++ stat->max_write_latency_ticks = tsc_diff; ++ } ++ if (stat->min_write_latency_ticks > tsc_diff) { ++ stat->min_write_latency_ticks = tsc_diff; ++ } ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ stat->bytes_copied += num_blocks * blocklen; ++ stat->num_copy_ops++; ++ stat->copy_latency_ticks += tsc_diff; ++ if (stat->max_copy_latency_ticks < tsc_diff) { ++ stat->max_copy_latency_ticks = tsc_diff; ++ } ++ if (stat->min_copy_latency_ticks > tsc_diff) { ++ stat->min_copy_latency_ticks = tsc_diff; ++ } ++ break; ++ default: ++ break; ++ } ++} ++ ++static inline void ++bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, ++ const struct spdk_nvme_cpl *cpl) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ struct nvme_bdev_channel *nbdev_ch; ++ struct nvme_io_path *io_path; ++ struct nvme_ctrlr *nvme_ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ uint64_t delay_ms; ++ ++ assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); ++ ++ if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { ++ bdev_nvme_update_io_path_stat(bio); ++ goto complete; ++ } ++ ++ /* Update error counts before deciding if retry is needed. ++ * Hence, error counts may be more than the number of I/O errors. 
++ */ ++ bdev_nvme_update_nvme_error_stat(bdev_io, cpl); ++ ++ if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || ++ (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { ++ goto complete; ++ } ++ ++ nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); ++ ++ assert(bio->io_path != NULL); ++ io_path = bio->io_path; ++ ++ nvme_ctrlr = io_path->qpair->ctrlr; ++ ++ if (spdk_nvme_cpl_is_path_error(cpl) || ++ spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || ++ !nvme_io_path_is_available(io_path) || ++ !nvme_ctrlr_is_available(nvme_ctrlr)) { ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ bio->io_path = NULL; ++ if (spdk_nvme_cpl_is_ana_error(cpl)) { ++ if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { ++ io_path->nvme_ns->ana_state_updating = true; ++ } ++ } ++ if (!any_io_path_may_become_available(nbdev_ch)) { ++ goto complete; ++ } ++ delay_ms = 0; ++ } else { ++ bio->retry_count++; ++ ++ cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); ++ ++ if (cpl->status.crd != 0) { ++ delay_ms = cdata->crdt[cpl->status.crd] * 100; ++ } else { ++ delay_ms = 0; ++ } ++ } ++ ++ bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); ++ return; ++ ++complete: ++ bio->retry_count = 0; ++ bio->submit_tsc = 0; ++ __bdev_nvme_io_complete(bdev_io, 0, cpl); ++} ++ ++static inline void ++bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ struct nvme_bdev_channel *nbdev_ch; ++ enum spdk_bdev_io_status io_status; ++ ++ switch (rc) { ++ case 0: ++ io_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ break; ++ case -ENOMEM: ++ io_status = SPDK_BDEV_IO_STATUS_NOMEM; ++ break; ++ case -ENXIO: ++ nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); ++ ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ bio->io_path = NULL; ++ ++ if (any_io_path_may_become_available(nbdev_ch)) { ++ bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); ++ return; ++ } ++ ++ /* fallthrough */ ++ default: ++ io_status = SPDK_BDEV_IO_STATUS_FAILED; ++ break; ++ } ++ ++ bio->retry_count = 0; ++ bio->submit_tsc = 0; ++ __bdev_nvme_io_complete(bdev_io, io_status, NULL); ++} ++ ++static inline void ++bdev_nvme_admin_passthru_complete(struct nvme_bdev_io *bio, int rc) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ enum spdk_bdev_io_status io_status; ++ ++ switch (rc) { ++ case 0: ++ io_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ break; ++ case -ENOMEM: ++ io_status = SPDK_BDEV_IO_STATUS_NOMEM; ++ break; ++ case -ENXIO: ++ /* fallthrough */ ++ default: ++ io_status = SPDK_BDEV_IO_STATUS_FAILED; ++ break; ++ } ++ ++ __bdev_nvme_io_complete(bdev_io, io_status, NULL); ++} ++ ++static void ++bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ assert(nvme_ctrlr->io_path_cache_clearing == true); ++ nvme_ctrlr->io_path_cache_clearing = false; ++ ++ if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return; ++ } ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ nvme_ctrlr_unregister(nvme_ctrlr); ++} ++ ++static void ++_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) ++{ ++ struct nvme_io_path *io_path; ++ ++ TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { ++ bdev_nvme_clear_current_io_path(io_path->nbdev_ch); ++ } ++} ++ ++static void 
++bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); ++ ++ assert(ctrlr_ch->qpair != NULL); ++ ++ _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (!nvme_ctrlr_is_available(nvme_ctrlr) || ++ nvme_ctrlr->io_path_cache_clearing) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return; ++ } ++ ++ nvme_ctrlr->io_path_cache_clearing = true; ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_for_each_channel(nvme_ctrlr, ++ bdev_nvme_clear_io_path_cache, ++ NULL, ++ bdev_nvme_clear_io_path_caches_done); ++} ++ ++static struct nvme_qpair * ++nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) ++{ ++ struct nvme_qpair *nvme_qpair; ++ ++ TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { ++ if (nvme_qpair->qpair == qpair) { ++ break; ++ } ++ } ++ ++ return nvme_qpair; ++} ++ ++static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); ++ ++static void ++bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) ++{ ++ struct nvme_poll_group *group = poll_group_ctx; ++ struct nvme_qpair *nvme_qpair; ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ ++ nvme_qpair = nvme_poll_group_get_qpair(group, qpair); ++ if (nvme_qpair == NULL) { ++ return; ++ } ++ ++ if (nvme_qpair->qpair != NULL) { ++ spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); ++ nvme_qpair->qpair = NULL; ++ } ++ ++ _bdev_nvme_clear_io_path_cache(nvme_qpair); ++ ++ ctrlr_ch = nvme_qpair->ctrlr_ch; ++ ++ if (ctrlr_ch != NULL) { ++ if (ctrlr_ch->reset_iter != NULL) { ++ /* If we are already in a full reset sequence, we do not have ++ * to restart it. Just move to the next ctrlr_channel. ++ */ ++ SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", ++ qpair); ++ spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); ++ ctrlr_ch->reset_iter = NULL; ++ } else { ++ /* qpair was disconnected unexpectedly. Reset controller for recovery. */ ++ SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); ++ bdev_nvme_failover(nvme_qpair->ctrlr, false); ++ } ++ } else { ++ /* In this case, ctrlr_channel is already deleted. */ ++ SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); ++ nvme_qpair_delete(nvme_qpair); ++ } ++} ++ ++static void ++bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) ++{ ++ struct nvme_qpair *nvme_qpair; ++ ++ TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { ++ if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { ++ continue; ++ } ++ ++ if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != ++ SPDK_NVME_QPAIR_FAILURE_NONE) { ++ _bdev_nvme_clear_io_path_cache(nvme_qpair); ++ } ++ } ++} ++ ++static int ++bdev_nvme_poll(void *arg) ++{ ++ struct nvme_poll_group *group = arg; ++ int64_t num_completions; ++ ++ if (group->collect_spin_stat && group->start_ticks == 0) { ++ group->start_ticks = spdk_get_ticks(); ++ } ++ ++ num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, ++ bdev_nvme_disconnected_qpair_cb); ++ if (group->collect_spin_stat) { ++ if (num_completions > 0) { ++ if (group->end_ticks != 0) { ++ group->spin_ticks += (group->end_ticks - group->start_ticks); ++ group->end_ticks = 0; ++ } ++ group->start_ticks = 0; ++ } else { ++ group->end_ticks = spdk_get_ticks(); ++ } ++ } ++ ++ if (spdk_unlikely(num_completions < 0)) { ++ bdev_nvme_check_io_qpairs(group); ++ } ++ ++ return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; ++} ++ ++static int bdev_nvme_poll_adminq(void *arg); ++ ++static void ++bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) ++{ ++ spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); ++ ++ nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, ++ nvme_ctrlr, new_period_us); ++} ++ ++static int ++bdev_nvme_poll_adminq(void *arg) ++{ ++ int32_t rc; ++ struct nvme_ctrlr *nvme_ctrlr = arg; ++ nvme_ctrlr_disconnected_cb disconnected_cb; ++ ++ assert(nvme_ctrlr != NULL); ++ ++ rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); ++ if (rc < 0) { ++ disconnected_cb = nvme_ctrlr->disconnected_cb; ++ nvme_ctrlr->disconnected_cb = NULL; ++ ++ if (rc == -ENXIO && disconnected_cb != NULL) { ++ bdev_nvme_change_adminq_poll_period(nvme_ctrlr, ++ g_opts.nvme_adminq_poll_period_us); ++ disconnected_cb(nvme_ctrlr); ++ } else { ++ bdev_nvme_failover(nvme_ctrlr, false); ++ } ++ } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != ++ SPDK_NVME_QPAIR_FAILURE_NONE) { ++ bdev_nvme_clear_io_path_caches(nvme_ctrlr); ++ } ++ ++ return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; ++} ++ ++static void ++_bdev_nvme_unregister_dev_cb(void *io_device) ++{ ++ struct nvme_bdev *nvme_disk = io_device; ++ ++ free(nvme_disk->disk.name); ++ free(nvme_disk->err_stat); ++ free(nvme_disk); ++} ++ ++static int ++bdev_nvme_destruct(void *ctx) ++{ ++ struct nvme_bdev *nvme_disk = ctx; ++ struct nvme_ns *nvme_ns, *tmp_nvme_ns; ++ ++ SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); ++ ++ TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { ++ pthread_mutex_lock(&nvme_ns->ctrlr->mutex); ++ ++ nvme_ns->bdev = NULL; ++ ++ assert(nvme_ns->id > 0); ++ ++ if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { ++ pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); ++ ++ nvme_ctrlr_release(nvme_ns->ctrlr); ++ nvme_ns_free(nvme_ns); ++ } else { ++ pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); ++ } ++ } ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ ++ spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb); ++ ++ return 0; ++} ++ ++static int ++bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct spdk_nvme_io_qpair_opts opts; ++ struct spdk_nvme_qpair *qpair; ++ int rc; ++ ++ nvme_ctrlr = nvme_qpair->ctrlr; ++ ++ spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); ++ opts.delay_cmd_submit = g_opts.delay_cmd_submit; ++ opts.create_only = true; ++ opts.async_mode = true; ++ opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); ++ g_opts.io_queue_requests = opts.io_queue_requests; ++ ++ qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); ++ if (qpair == NULL) { ++ return -1; ++ } ++ ++ SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, ++ spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); ++ ++ assert(nvme_qpair->group != NULL); ++ ++ rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); ++ goto err; ++ } ++ ++ rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to connect I/O qpair.\n"); ++ goto err; ++ } ++ ++ nvme_qpair->qpair = qpair; ++ ++ if (!g_opts.disable_auto_failback) { ++ _bdev_nvme_clear_io_path_cache(nvme_qpair); ++ } ++ ++ return 0; ++ ++err: ++ spdk_nvme_ctrlr_free_io_qpair(qpair); ++ ++ return rc; ++} ++ ++static void ++bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); ++ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ struct spdk_bdev_io *bdev_io; ++ ++ if (spdk_io_channel_iter_get_ctx(i) != NULL) { ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ ++ while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { ++ bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); ++ TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); ++ __bdev_nvme_io_complete(bdev_io, status, NULL); ++ } ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove) ++{ ++ struct nvme_path_id *path_id, *next_path; ++ int rc __attribute__((unused)); ++ ++ path_id = TAILQ_FIRST(&nvme_ctrlr->trids); ++ assert(path_id); ++ 
assert(path_id == nvme_ctrlr->active_path_id); ++ next_path = TAILQ_NEXT(path_id, link); ++ ++ path_id->is_failed = true; ++ ++ if (next_path) { ++ assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); ++ ++ SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, ++ path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); ++ ++ spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); ++ nvme_ctrlr->active_path_id = next_path; ++ rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); ++ assert(rc == 0); ++ TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); ++ if (!remove) { ++ /** Shuffle the old trid to the end of the list and use the new one. ++ * Allows for round robin through multiple connections. ++ */ ++ TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); ++ } else { ++ free(path_id); ++ } ++ } ++} ++ ++static bool ++bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ int32_t elapsed; ++ ++ if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || ++ nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { ++ return false; ++ } ++ ++ elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); ++ if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool ++bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ uint32_t elapsed; ++ ++ if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { ++ return false; ++ } ++ ++ elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); ++ if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static void bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); ++ ++static void ++nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) ++{ ++ int rc; ++ ++ rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); ++ if (rc != 0) { ++ /* Disconnect fails if ctrlr is already resetting or removed. In this case, ++ * fail the reset sequence immediately. ++ */ ++ bdev_nvme_reset_complete(nvme_ctrlr, false); ++ return; ++ } ++ ++ /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. ++ * Set callback here to execute the specified operation after ctrlr is really disconnected. ++ */ ++ assert(nvme_ctrlr->disconnected_cb == NULL); ++ nvme_ctrlr->disconnected_cb = cb_fn; ++ ++ /* During disconnection, reduce the period to poll adminq more often. */ ++ bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); ++} ++ ++enum bdev_nvme_op_after_reset { ++ OP_NONE, ++ OP_COMPLETE_PENDING_DESTRUCT, ++ OP_DESTRUCT, ++ OP_DELAYED_RECONNECT, ++}; ++ ++typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; ++ ++static _bdev_nvme_op_after_reset ++bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) ++{ ++ if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { ++ /* Complete pending destruct after reset completes. 
*/ ++ return OP_COMPLETE_PENDING_DESTRUCT; ++ } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { ++ nvme_ctrlr->reset_start_tsc = 0; ++ return OP_NONE; ++ } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { ++ return OP_DESTRUCT; ++ } else { ++ if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { ++ nvme_ctrlr->fast_io_fail_timedout = true; ++ } ++ bdev_nvme_failover_trid(nvme_ctrlr, false); ++ return OP_DELAYED_RECONNECT; ++ } ++} ++ ++static int _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); ++static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); ++ ++static int ++bdev_nvme_reconnect_delay_timer_expired(void *ctx) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); ++ ++ assert(nvme_ctrlr->reconnect_is_delayed == true); ++ nvme_ctrlr->reconnect_is_delayed = false; ++ ++ if (nvme_ctrlr->destruct) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ assert(nvme_ctrlr->resetting == false); ++ nvme_ctrlr->resetting = true; ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); ++ ++ bdev_nvme_reconnect_ctrlr(nvme_ctrlr); ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); ++ ++ assert(nvme_ctrlr->reconnect_is_delayed == false); ++ nvme_ctrlr->reconnect_is_delayed = true; ++ ++ assert(nvme_ctrlr->reconnect_delay_timer == NULL); ++ nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, ++ nvme_ctrlr, ++ nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); ++} ++ ++static void ++_bdev_nvme_reset_complete(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); ++ bool success = spdk_io_channel_iter_get_ctx(i) == NULL; ++ struct nvme_path_id *path_id; ++ bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn; ++ void *reset_cb_arg = nvme_ctrlr->reset_cb_arg; ++ enum bdev_nvme_op_after_reset op_after_reset; ++ ++ assert(nvme_ctrlr->thread == spdk_get_thread()); ++ ++ nvme_ctrlr->reset_cb_fn = NULL; ++ nvme_ctrlr->reset_cb_arg = NULL; ++ ++ if (!success) { ++ SPDK_ERRLOG("Resetting controller failed.\n"); ++ } else { ++ SPDK_NOTICELOG("Resetting controller successful.\n"); ++ } ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ nvme_ctrlr->resetting = false; ++ ++ path_id = TAILQ_FIRST(&nvme_ctrlr->trids); ++ assert(path_id != NULL); ++ assert(path_id == nvme_ctrlr->active_path_id); ++ ++ path_id->is_failed = !success; ++ ++ op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ if (reset_cb_fn) { ++ reset_cb_fn(reset_cb_arg, success); ++ } ++ ++ switch (op_after_reset) { ++ case OP_COMPLETE_PENDING_DESTRUCT: ++ nvme_ctrlr_unregister(nvme_ctrlr); ++ break; ++ case OP_DESTRUCT: ++ _bdev_nvme_delete(nvme_ctrlr, false); ++ break; ++ case OP_DELAYED_RECONNECT: ++ nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) ++{ ++ /* Make sure we clear any pending resets before returning. 
*/ ++ spdk_for_each_channel(nvme_ctrlr, ++ bdev_nvme_complete_pending_resets, ++ success ? NULL : (void *)0x1, ++ _bdev_nvme_reset_complete); ++} ++ ++static void ++bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); ++ ++ bdev_nvme_reset_complete(nvme_ctrlr, false); ++} ++ ++static void ++bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); ++ struct nvme_qpair *nvme_qpair; ++ ++ nvme_qpair = ctrlr_ch->qpair; ++ assert(nvme_qpair != NULL); ++ ++ _bdev_nvme_clear_io_path_cache(nvme_qpair); ++ ++ if (nvme_qpair->qpair != NULL) { ++ spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); ++ ++ /* The current full reset sequence will move to the next ++ * ctrlr_channel after the qpair is actually disconnected. ++ */ ++ assert(ctrlr_ch->reset_iter == NULL); ++ ctrlr_ch->reset_iter = i; ++ } else { ++ spdk_for_each_channel_continue(i, 0); ++ } ++} ++ ++static void ++bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); ++ ++ if (status == 0) { ++ bdev_nvme_reset_complete(nvme_ctrlr, true); ++ } else { ++ /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ ++ spdk_for_each_channel(nvme_ctrlr, ++ bdev_nvme_reset_destroy_qpair, ++ NULL, ++ bdev_nvme_reset_create_qpairs_failed); ++ } ++} ++ ++static void ++bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); ++ int rc; ++ ++ rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); ++ ++ spdk_for_each_channel_continue(i, rc); ++} ++ ++static int ++bdev_nvme_reconnect_ctrlr_poll(void *arg) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = arg; ++ int rc = -ETIMEDOUT; ++ ++ if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { ++ rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); ++ if (rc == -EAGAIN) { ++ return SPDK_POLLER_BUSY; ++ } ++ } ++ ++ spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); ++ if (rc == 0) { ++ /* Recreate all of the I/O queue pairs */ ++ spdk_for_each_channel(nvme_ctrlr, ++ bdev_nvme_reset_create_qpair, ++ NULL, ++ bdev_nvme_reset_create_qpairs_done); ++ } else { ++ bdev_nvme_reset_complete(nvme_ctrlr, false); ++ } ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); ++ ++ SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); ++ assert(nvme_ctrlr->reset_detach_poller == NULL); ++ nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, ++ nvme_ctrlr, 0); ++} ++ ++static void ++bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); ++ ++ SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); ++ assert(status == 0); ++ ++ if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { ++ bdev_nvme_reconnect_ctrlr(nvme_ctrlr); ++ } else { ++ nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); ++ } ++} ++ ++static void ++bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ spdk_for_each_channel(nvme_ctrlr, ++ 
bdev_nvme_reset_destroy_qpair, ++ NULL, ++ bdev_nvme_reset_ctrlr); ++} ++ ++static void ++_bdev_nvme_reset(void *ctx) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ assert(nvme_ctrlr->resetting == true); ++ assert(nvme_ctrlr->thread == spdk_get_thread()); ++ ++ if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { ++ nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); ++ } else { ++ bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); ++ } ++} ++ ++static int ++bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (nvme_ctrlr->destruct) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return -ENXIO; ++ } ++ ++ if (nvme_ctrlr->resetting) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); ++ return -EBUSY; ++ } ++ ++ if (nvme_ctrlr->reconnect_is_delayed) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ SPDK_NOTICELOG("Reconnect is already scheduled.\n"); ++ return -EBUSY; ++ } ++ ++ nvme_ctrlr->resetting = true; ++ ++ assert(nvme_ctrlr->reset_start_tsc == 0); ++ nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); ++ return 0; ++} ++ ++int ++bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg) ++{ ++ int rc; ++ ++ rc = bdev_nvme_reset(nvme_ctrlr); ++ if (rc == 0) { ++ nvme_ctrlr->reset_cb_fn = cb_fn; ++ nvme_ctrlr->reset_cb_arg = cb_arg; ++ } ++ return rc; ++} ++ ++static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); ++ ++static void ++bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) ++{ ++ enum spdk_bdev_io_status io_status; ++ ++ if (bio->cpl.cdw0 == 0) { ++ io_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } else { ++ io_status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ ++ __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); ++} ++ ++static void ++_bdev_nvme_reset_io_continue(void *ctx) ++{ ++ struct nvme_bdev_io *bio = ctx; ++ struct nvme_io_path *prev_io_path, *next_io_path; ++ int rc; ++ ++ prev_io_path = bio->io_path; ++ bio->io_path = NULL; ++ ++ if (bio->cpl.cdw0 != 0) { ++ goto complete; ++ } ++ ++ next_io_path = STAILQ_NEXT(prev_io_path, stailq); ++ if (next_io_path == NULL) { ++ goto complete; ++ } ++ ++ rc = _bdev_nvme_reset_io(next_io_path, bio); ++ if (rc == 0) { ++ return; ++ } ++ ++ bio->cpl.cdw0 = 1; ++ ++complete: ++ bdev_nvme_reset_io_complete(bio); ++} ++ ++static void ++bdev_nvme_reset_io_continue(void *cb_arg, bool success) ++{ ++ struct nvme_bdev_io *bio = cb_arg; ++ ++ bio->cpl.cdw0 = !success; ++ ++ spdk_thread_send_msg(bio->orig_thread, _bdev_nvme_reset_io_continue, bio); ++} ++ ++static int ++_bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ struct spdk_bdev_io *bdev_io; ++ int rc; ++ ++ rc = bdev_nvme_reset(nvme_ctrlr); ++ if (rc == 0) { ++ assert(bio->io_path == NULL); ++ bio->io_path = io_path; ++ ++ assert(nvme_ctrlr->reset_cb_fn == NULL); ++ assert(nvme_ctrlr->reset_cb_arg == NULL); ++ nvme_ctrlr->reset_cb_fn = bdev_nvme_reset_io_continue; ++ nvme_ctrlr->reset_cb_arg = bio; ++ } else if (rc == -EBUSY) { ++ ctrlr_ch = io_path->qpair->ctrlr_ch; ++ assert(ctrlr_ch != NULL); ++ /* ++ * Reset call is queued only if it is from the app framework. 
This is on purpose so that ++ * we don't interfere with the app framework reset strategy. i.e. we are deferring to the ++ * upper level. If they are in the middle of a reset, we won't try to schedule another one. ++ */ ++ bdev_io = spdk_bdev_io_from_ctx(bio); ++ TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); ++ } else { ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) ++{ ++ struct nvme_io_path *io_path; ++ int rc; ++ ++ bio->cpl.cdw0 = 0; ++ bio->orig_thread = spdk_get_thread(); ++ ++ /* Reset only the first nvme_ctrlr in the nvme_bdev_ctrlr for now. ++ * ++ * TODO: Reset all nvme_ctrlrs in the nvme_bdev_ctrlr sequentially. ++ * This will be done in the following patches. ++ */ ++ io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); ++ assert(io_path != NULL); ++ ++ rc = _bdev_nvme_reset_io(io_path, bio); ++ if (rc != 0) { ++ bio->cpl.cdw0 = 1; ++ bdev_nvme_reset_io_complete(bio); ++ } ++} ++ ++static int ++bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove) ++{ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (nvme_ctrlr->destruct) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ /* Don't bother resetting if the controller is in the process of being destructed. */ ++ return -ENXIO; ++ } ++ ++ if (nvme_ctrlr->resetting) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); ++ return -EBUSY; ++ } ++ ++ bdev_nvme_failover_trid(nvme_ctrlr, remove); ++ ++ if (nvme_ctrlr->reconnect_is_delayed) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ SPDK_NOTICELOG("Reconnect is already scheduled.\n"); ++ ++ /* We rely on the next reconnect for the failover. */ ++ return 0; ++ } ++ ++ nvme_ctrlr->resetting = true; ++ ++ assert(nvme_ctrlr->reset_start_tsc == 0); ++ nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); ++ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset, nvme_ctrlr); ++ return 0; ++} ++ ++static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, ++ uint64_t num_blocks); ++ ++static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, ++ uint64_t num_blocks); ++ ++static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, ++ uint64_t src_offset_blocks, ++ uint64_t num_blocks); ++ ++static void ++bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ int ret; ++ ++ if (!success) { ++ ret = -EINVAL; ++ goto exit; ++ } ++ ++ if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { ++ ret = -ENXIO; ++ goto exit; ++ } ++ ++ ret = bdev_nvme_readv(bio, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags, ++ bdev_io->u.bdev.ext_opts); ++ ++exit: ++ if (spdk_unlikely(ret != 0)) { ++ bdev_nvme_io_complete(bio, ret); ++ } ++} ++ ++static inline void ++_bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct nvme_bdev_io *nbdev_io_to_abort; ++ int rc = 0; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ if (bdev_io->u.bdev.iovs && 
bdev_io->u.bdev.iovs[0].iov_base) { ++ rc = bdev_nvme_readv(nbdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags, ++ bdev_io->u.bdev.ext_opts); ++ } else { ++ spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev->blocklen); ++ rc = 0; ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ rc = bdev_nvme_writev(nbdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags, ++ bdev_io->u.bdev.ext_opts); ++ break; ++ case SPDK_BDEV_IO_TYPE_COMPARE: ++ rc = bdev_nvme_comparev(nbdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags); ++ break; ++ case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: ++ rc = bdev_nvme_comparev_and_writev(nbdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.fused_iovs, ++ bdev_io->u.bdev.fused_iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags); ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ rc = bdev_nvme_unmap(nbdev_io, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ rc = bdev_nvme_write_zeroes(nbdev_io, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks); ++ break; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ nbdev_io->io_path = NULL; ++ bdev_nvme_reset_io(nbdev_ch, nbdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ bdev_nvme_io_complete(nbdev_io, 0); ++ break; ++ case SPDK_BDEV_IO_TYPE_ZONE_APPEND: ++ rc = bdev_nvme_zone_appendv(nbdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks, ++ bdev->dif_check_flags); ++ break; ++ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: ++ rc = bdev_nvme_get_zone_info(nbdev_io, ++ bdev_io->u.zone_mgmt.zone_id, ++ bdev_io->u.zone_mgmt.num_zones, ++ bdev_io->u.zone_mgmt.buf); ++ break; ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++ rc = bdev_nvme_zone_management(nbdev_io, ++ bdev_io->u.zone_mgmt.zone_id, ++ bdev_io->u.zone_mgmt.zone_action); ++ break; ++ case SPDK_BDEV_IO_TYPE_NVME_ADMIN: ++ nbdev_io->io_path = NULL; ++ bdev_nvme_admin_passthru(nbdev_ch, ++ nbdev_io, ++ &bdev_io->u.nvme_passthru.cmd, ++ bdev_io->u.nvme_passthru.buf, ++ bdev_io->u.nvme_passthru.nbytes); ++ break; ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ rc = bdev_nvme_io_passthru(nbdev_io, ++ &bdev_io->u.nvme_passthru.cmd, ++ bdev_io->u.nvme_passthru.buf, ++ bdev_io->u.nvme_passthru.nbytes); ++ break; ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ rc = bdev_nvme_io_passthru_md(nbdev_io, ++ &bdev_io->u.nvme_passthru.cmd, ++ bdev_io->u.nvme_passthru.buf, ++ bdev_io->u.nvme_passthru.nbytes, ++ bdev_io->u.nvme_passthru.md_buf, ++ bdev_io->u.nvme_passthru.md_len); ++ break; ++ case SPDK_BDEV_IO_TYPE_ABORT: ++ nbdev_io->io_path = NULL; ++ nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; ++ bdev_nvme_abort(nbdev_ch, ++ nbdev_io, ++ nbdev_io_to_abort); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ rc = bdev_nvme_copy(nbdev_io, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.copy.src_offset_blocks, ++ bdev_io->u.bdev.num_blocks); ++ break; ++ default: ++ rc = -EINVAL; ++ break; ++ } ++ ++ if 
(spdk_unlikely(rc != 0)) { ++ bdev_nvme_io_complete(nbdev_io, rc); ++ } ++} ++ ++static void ++bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); ++ struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; ++ ++ if (spdk_likely(nbdev_io->submit_tsc == 0)) { ++ nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); ++ } else { ++ /* There are cases where submit_tsc != 0, i.e. retry I/O. ++ * We need to update submit_tsc here. ++ */ ++ nbdev_io->submit_tsc = spdk_get_ticks(); ++ } ++ ++ spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); ++ nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); ++ if (spdk_unlikely(!nbdev_io->io_path)) { ++ if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { ++ bdev_nvme_io_complete(nbdev_io, -ENXIO); ++ return; ++ } ++ ++ /* Admin commands do not use the optimal I/O path. ++ * Simply fall through even if it is not found. ++ */ ++ } ++ ++ _bdev_nvme_submit_request(nbdev_ch, bdev_io); ++} ++ ++static bool ++bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ struct nvme_bdev *nbdev = ctx; ++ struct nvme_ns *nvme_ns; ++ struct spdk_nvme_ns *ns; ++ struct spdk_nvme_ctrlr *ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ ++ nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); ++ assert(nvme_ns != NULL); ++ ns = nvme_ns->ns; ++ ctrlr = spdk_nvme_ns_get_ctrlr(ns); ++ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_NVME_ADMIN: ++ case SPDK_BDEV_IO_TYPE_NVME_IO: ++ case SPDK_BDEV_IO_TYPE_ABORT: ++ return true; ++ ++ case SPDK_BDEV_IO_TYPE_COMPARE: ++ return spdk_nvme_ns_supports_compare(ns); ++ ++ case SPDK_BDEV_IO_TYPE_NVME_IO_MD: ++ return spdk_nvme_ns_get_md_size(ns) ? 
true : false; ++ ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ return cdata->oncs.dsm; ++ ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ return cdata->oncs.write_zeroes; ++ ++ case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: ++ if (spdk_nvme_ctrlr_get_flags(ctrlr) & ++ SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { ++ return true; ++ } ++ return false; ++ ++ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++ return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; ++ ++ case SPDK_BDEV_IO_TYPE_ZONE_APPEND: ++ return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && ++ spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; ++ ++ case SPDK_BDEV_IO_TYPE_COPY: ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ return cdata->oncs.copy; ++ ++ default: ++ return false; ++ } ++} ++ ++static int ++nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) ++{ ++ struct nvme_qpair *nvme_qpair; ++ struct spdk_io_channel *pg_ch; ++ int rc; ++ ++ nvme_qpair = calloc(1, sizeof(*nvme_qpair)); ++ if (!nvme_qpair) { ++ SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); ++ return -1; ++ } ++ ++ TAILQ_INIT(&nvme_qpair->io_path_list); ++ ++ nvme_qpair->ctrlr = nvme_ctrlr; ++ nvme_qpair->ctrlr_ch = ctrlr_ch; ++ ++ pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); ++ if (!pg_ch) { ++ free(nvme_qpair); ++ return -1; ++ } ++ ++ nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); ++ ++#ifdef SPDK_CONFIG_VTUNE ++ nvme_qpair->group->collect_spin_stat = true; ++#else ++ nvme_qpair->group->collect_spin_stat = false; ++#endif ++ ++ rc = bdev_nvme_create_qpair(nvme_qpair); ++ if (rc != 0) { ++ /* nvme_ctrlr can't create IO qpair if connection is down. ++ * ++ * If reconnect_delay_sec is non-zero, creating IO qpair is retried ++ * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, ++ * submitted IO will be queued until IO qpair is successfully created. ++ * ++ * Hence, if both are satisfied, ignore the failure. 
++ */ ++ if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { ++ spdk_put_io_channel(pg_ch); ++ free(nvme_qpair); ++ return rc; ++ } ++ } ++ ++ TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); ++ ++ ctrlr_ch->qpair = nvme_qpair; ++ ++ pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); ++ nvme_qpair->ctrlr->ref++; ++ pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); ++ ++ return 0; ++} ++ ++static int ++bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = io_device; ++ struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; ++ ++ TAILQ_INIT(&ctrlr_ch->pending_resets); ++ ++ return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); ++} ++ ++static void ++nvme_qpair_delete(struct nvme_qpair *nvme_qpair) ++{ ++ assert(nvme_qpair->group != NULL); ++ ++ TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); ++ ++ spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); ++ ++ nvme_ctrlr_release(nvme_qpair->ctrlr); ++ ++ free(nvme_qpair); ++} ++ ++static void ++bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; ++ struct nvme_qpair *nvme_qpair; ++ ++ nvme_qpair = ctrlr_ch->qpair; ++ assert(nvme_qpair != NULL); ++ ++ _bdev_nvme_clear_io_path_cache(nvme_qpair); ++ ++ if (nvme_qpair->qpair != NULL) { ++ if (ctrlr_ch->reset_iter == NULL) { ++ spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); ++ } else { ++ /* Skip current ctrlr_channel in a full reset sequence because ++ * it is being deleted now. The qpair is already being disconnected. ++ * We do not have to restart disconnecting it. ++ */ ++ spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); ++ } ++ ++ /* We cannot release a reference to the poll group now. ++ * The qpair may be disconnected asynchronously later. ++ * We need to poll it until it is actually disconnected. ++ * Just detach the qpair from the deleting ctrlr_channel. 
++ */ ++ nvme_qpair->ctrlr_ch = NULL; ++ } else { ++ assert(ctrlr_ch->reset_iter == NULL); ++ ++ nvme_qpair_delete(nvme_qpair); ++ } ++} ++ ++static void ++bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, ++ uint32_t iov_cnt, uint32_t seed, ++ spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) ++{ ++ struct nvme_poll_group *group = ctx; ++ int rc; ++ ++ assert(group->accel_channel != NULL); ++ assert(cb_fn != NULL); ++ ++ rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); ++ if (rc) { ++ /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ ++ if (rc == -ENOMEM || rc == -EINVAL) { ++ cb_fn(cb_arg, rc); ++ } ++ SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); ++ } ++} ++ ++static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { ++ .table_size = sizeof(struct spdk_nvme_accel_fn_table), ++ .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, ++}; ++ ++static int ++bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_poll_group *group = ctx_buf; ++ ++ TAILQ_INIT(&group->qpair_list); ++ ++ group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); ++ if (group->group == NULL) { ++ return -1; ++ } ++ ++ group->accel_channel = spdk_accel_get_io_channel(); ++ if (!group->accel_channel) { ++ spdk_nvme_poll_group_destroy(group->group); ++ SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", ++ group); ++ return -1; ++ } ++ ++ group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); ++ ++ if (group->poller == NULL) { ++ spdk_put_io_channel(group->accel_channel); ++ spdk_nvme_poll_group_destroy(group->group); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) ++{ ++ struct nvme_poll_group *group = ctx_buf; ++ ++ assert(TAILQ_EMPTY(&group->qpair_list)); ++ ++ if (group->accel_channel) { ++ spdk_put_io_channel(group->accel_channel); ++ } ++ ++ spdk_poller_unregister(&group->poller); ++ if (spdk_nvme_poll_group_destroy(group->group)) { ++ SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); ++ assert(false); ++ } ++} ++ ++static struct spdk_io_channel * ++bdev_nvme_get_io_channel(void *ctx) ++{ ++ struct nvme_bdev *nvme_bdev = ctx; ++ ++ return spdk_get_io_channel(nvme_bdev); ++} ++ ++static void * ++bdev_nvme_get_module_ctx(void *ctx) ++{ ++ struct nvme_bdev *nvme_bdev = ctx; ++ struct nvme_ns *nvme_ns; ++ ++ if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { ++ return NULL; ++ } ++ ++ nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); ++ if (!nvme_ns) { ++ return NULL; ++ } ++ ++ return nvme_ns->ns; ++} ++ ++static const char * ++_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) ++{ ++ switch (ana_state) { ++ case SPDK_NVME_ANA_OPTIMIZED_STATE: ++ return "optimized"; ++ case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: ++ return "non_optimized"; ++ case SPDK_NVME_ANA_INACCESSIBLE_STATE: ++ return "inaccessible"; ++ case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: ++ return "persistent_loss"; ++ case SPDK_NVME_ANA_CHANGE_STATE: ++ return "change"; ++ default: ++ return NULL; ++ } ++} ++ ++static int ++bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) ++{ ++ struct spdk_memory_domain **_domains = NULL; ++ struct nvme_bdev *nbdev = ctx; ++ struct nvme_ns *nvme_ns; ++ int i = 0, _array_size = array_size; ++ int rc = 0; ++ ++ 
TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { ++ if (domains && array_size >= i) { ++ _domains = &domains[i]; ++ } else { ++ _domains = NULL; ++ } ++ rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); ++ if (rc > 0) { ++ i += rc; ++ if (_array_size >= rc) { ++ _array_size -= rc; ++ } else { ++ _array_size = 0; ++ } ++ } else if (rc < 0) { ++ return rc; ++ } ++ } ++ ++ return i; ++} ++ ++static const char * ++nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ if (nvme_ctrlr->destruct) { ++ return "deleting"; ++ } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { ++ return "failed"; ++ } else if (nvme_ctrlr->resetting) { ++ return "resetting"; ++ } else if (nvme_ctrlr->reconnect_is_delayed > 0) { ++ return "reconnect_is_delayed"; ++ } else { ++ return "enabled"; ++ } ++} ++ ++void ++nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct spdk_nvme_transport_id *trid; ++ const struct spdk_nvme_ctrlr_opts *opts; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); ++ ++#ifdef SPDK_CONFIG_NVME_CUSE ++ size_t cuse_name_size = 128; ++ char cuse_name[cuse_name_size]; ++ ++ int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); ++ if (rc == 0) { ++ spdk_json_write_named_string(w, "cuse_device", cuse_name); ++ } ++#endif ++ trid = &nvme_ctrlr->active_path_id->trid; ++ spdk_json_write_named_object_begin(w, "trid"); ++ nvme_bdev_dump_trid_json(trid, w); ++ spdk_json_write_object_end(w); ++ ++ cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); ++ spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); ++ ++ opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); ++ spdk_json_write_named_object_begin(w, "host"); ++ spdk_json_write_named_string(w, "nqn", opts->hostnqn); ++ spdk_json_write_named_string(w, "addr", opts->src_addr); ++ spdk_json_write_named_string(w, "svcid", opts->src_svcid); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++nvme_namespace_info_json(struct spdk_json_write_ctx *w, ++ struct nvme_ns *nvme_ns) ++{ ++ struct spdk_nvme_ns *ns; ++ struct spdk_nvme_ctrlr *ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ const struct spdk_nvme_transport_id *trid; ++ union spdk_nvme_vs_register vs; ++ const struct spdk_nvme_ns_data *nsdata; ++ char buf[128]; ++ ++ ns = nvme_ns->ns; ++ ctrlr = spdk_nvme_ns_get_ctrlr(ns); ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); ++ vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); ++ ++ spdk_json_write_object_begin(w); ++ ++ if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { ++ spdk_json_write_named_string(w, "pci_address", trid->traddr); ++ } ++ ++ spdk_json_write_named_object_begin(w, "trid"); ++ ++ nvme_bdev_dump_trid_json(trid, w); ++ ++ spdk_json_write_object_end(w); ++ ++#ifdef SPDK_CONFIG_NVME_CUSE ++ size_t cuse_name_size = 128; ++ char cuse_name[cuse_name_size]; ++ ++ int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), ++ cuse_name, &cuse_name_size); ++ if (rc == 0) { ++ spdk_json_write_named_string(w, "cuse_device", cuse_name); ++ } ++#endif ++ ++ spdk_json_write_named_object_begin(w, "ctrlr_data"); ++ ++ spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); ++ ++ spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); ++ ++ snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); ++ 
spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "model_number", buf); ++ ++ snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); ++ spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "serial_number", buf); ++ ++ snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); ++ spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "firmware_revision", buf); ++ ++ if (cdata->subnqn[0] != '\0') { ++ spdk_json_write_named_string(w, "subnqn", cdata->subnqn); ++ } ++ ++ spdk_json_write_named_object_begin(w, "oacs"); ++ ++ spdk_json_write_named_uint32(w, "security", cdata->oacs.security); ++ spdk_json_write_named_uint32(w, "format", cdata->oacs.format); ++ spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); ++ spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); ++ spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "vs"); ++ ++ spdk_json_write_name(w, "nvme_version"); ++ if (vs.bits.ter) { ++ spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); ++ } else { ++ spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); ++ } ++ ++ spdk_json_write_object_end(w); ++ ++ nsdata = spdk_nvme_ns_get_data(ns); ++ ++ spdk_json_write_named_object_begin(w, "ns_data"); ++ ++ spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); ++ ++ if (cdata->cmic.ana_reporting) { ++ spdk_json_write_named_string(w, "ana_state", ++ _nvme_ana_state_str(nvme_ns->ana_state)); ++ } ++ ++ spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); ++ ++ spdk_json_write_object_end(w); ++ ++ if (cdata->oacs.security) { ++ spdk_json_write_named_object_begin(w, "security"); ++ ++ spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); ++ ++ spdk_json_write_object_end(w); ++ } ++ ++ spdk_json_write_object_end(w); ++} ++ ++static const char * ++nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) ++{ ++ switch (nbdev->mp_policy) { ++ case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: ++ return "active_passive"; ++ case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: ++ return "active_active"; ++ default: ++ assert(false); ++ return "invalid"; ++ } ++} ++ ++static int ++bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct nvme_bdev *nvme_bdev = ctx; ++ struct nvme_ns *nvme_ns; ++ ++ pthread_mutex_lock(&nvme_bdev->mutex); ++ spdk_json_write_named_array_begin(w, "nvme"); ++ TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { ++ nvme_namespace_info_json(w, nvme_ns); ++ } ++ spdk_json_write_array_end(w); ++ spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); ++ pthread_mutex_unlock(&nvme_bdev->mutex); ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ /* No config per bdev needed */ ++} ++ ++static uint64_t ++bdev_nvme_get_spin_time(struct spdk_io_channel *ch) ++{ ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); ++ struct nvme_io_path *io_path; ++ struct nvme_poll_group *group; ++ uint64_t spin_time = 0; ++ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ group = io_path->qpair->group; ++ ++ if (!group || !group->collect_spin_stat) { ++ continue; ++ } ++ ++ if (group->end_ticks != 0) { ++ group->spin_ticks += (group->end_ticks - group->start_ticks); ++ group->end_ticks = 0; ++ } 
++ ++ spin_time += group->spin_ticks; ++ group->start_ticks = 0; ++ group->spin_ticks = 0; ++ } ++ ++ return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); ++} ++ ++static void ++bdev_nvme_reset_device_stat(void *ctx) ++{ ++ struct nvme_bdev *nbdev = ctx; ++ ++ if (nbdev->err_stat != NULL) { ++ memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); ++ } ++} ++ ++/* JSON string should be lowercases and underscore delimited string. */ ++static void ++bdev_nvme_format_nvme_status(char *dst, const char *src) ++{ ++ char tmp[256]; ++ ++ spdk_strcpy_replace(dst, 256, src, " - ", "_"); ++ spdk_strcpy_replace(tmp, 256, dst, "-", "_"); ++ spdk_strcpy_replace(dst, 256, tmp, " ", "_"); ++ spdk_strlwr(dst); ++} ++ ++static void ++bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct nvme_bdev *nbdev = ctx; ++ struct spdk_nvme_status status = {}; ++ uint16_t sct, sc; ++ char status_json[256]; ++ const char *status_str; ++ ++ if (nbdev->err_stat == NULL) { ++ return; ++ } ++ ++ spdk_json_write_named_object_begin(w, "nvme_error"); ++ ++ spdk_json_write_named_object_begin(w, "status_type"); ++ for (sct = 0; sct < 8; sct++) { ++ if (nbdev->err_stat->status_type[sct] == 0) { ++ continue; ++ } ++ status.sct = sct; ++ ++ status_str = spdk_nvme_cpl_get_status_type_string(&status); ++ assert(status_str != NULL); ++ bdev_nvme_format_nvme_status(status_json, status_str); ++ ++ spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "status_code"); ++ for (sct = 0; sct < 4; sct++) { ++ status.sct = sct; ++ for (sc = 0; sc < 256; sc++) { ++ if (nbdev->err_stat->status[sct][sc] == 0) { ++ continue; ++ } ++ status.sc = sc; ++ ++ status_str = spdk_nvme_cpl_get_status_string(&status); ++ assert(status_str != NULL); ++ bdev_nvme_format_nvme_status(status_json, status_str); ++ ++ spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); ++ } ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static const struct spdk_bdev_fn_table nvmelib_fn_table = { ++ .destruct = bdev_nvme_destruct, ++ .submit_request = bdev_nvme_submit_request, ++ .io_type_supported = bdev_nvme_io_type_supported, ++ .get_io_channel = bdev_nvme_get_io_channel, ++ .dump_info_json = bdev_nvme_dump_info_json, ++ .write_config_json = bdev_nvme_write_config_json, ++ .get_spin_time = bdev_nvme_get_spin_time, ++ .get_module_ctx = bdev_nvme_get_module_ctx, ++ .get_memory_domains = bdev_nvme_get_memory_domains, ++ .reset_device_stat = bdev_nvme_reset_device_stat, ++ .dump_device_stat_json = bdev_nvme_dump_device_stat_json, ++}; ++ ++typedef int (*bdev_nvme_parse_ana_log_page_cb)( ++ const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); ++ ++static int ++bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, ++ bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) ++{ ++ struct spdk_nvme_ana_group_descriptor *copied_desc; ++ uint8_t *orig_desc; ++ uint32_t i, desc_size, copy_len; ++ int rc = 0; ++ ++ if (nvme_ctrlr->ana_log_page == NULL) { ++ return -EINVAL; ++ } ++ ++ copied_desc = nvme_ctrlr->copied_ana_desc; ++ ++ orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); ++ copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); ++ ++ for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { ++ memcpy(copied_desc, orig_desc, copy_len); ++ ++ rc = cb_fn(copied_desc, cb_arg); ++ if 
(rc != 0) { ++ break; ++ } ++ ++ desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + ++ copied_desc->num_of_nsid * sizeof(uint32_t); ++ orig_desc += desc_size; ++ copy_len -= desc_size; ++ } ++ ++ return rc; ++} ++ ++static int ++nvme_ns_ana_transition_timedout(void *ctx) ++{ ++ struct nvme_ns *nvme_ns = ctx; ++ ++ spdk_poller_unregister(&nvme_ns->anatt_timer); ++ nvme_ns->ana_transition_timedout = true; ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++_nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, ++ const struct spdk_nvme_ana_group_descriptor *desc) ++{ ++ const struct spdk_nvme_ctrlr_data *cdata; ++ ++ nvme_ns->ana_group_id = desc->ana_group_id; ++ nvme_ns->ana_state = desc->ana_state; ++ nvme_ns->ana_state_updating = false; ++ ++ switch (nvme_ns->ana_state) { ++ case SPDK_NVME_ANA_OPTIMIZED_STATE: ++ case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: ++ nvme_ns->ana_transition_timedout = false; ++ spdk_poller_unregister(&nvme_ns->anatt_timer); ++ break; ++ ++ case SPDK_NVME_ANA_INACCESSIBLE_STATE: ++ case SPDK_NVME_ANA_CHANGE_STATE: ++ if (nvme_ns->anatt_timer != NULL) { ++ break; ++ } ++ ++ cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); ++ nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, ++ nvme_ns, ++ cdata->anatt * SPDK_SEC_TO_USEC); ++ break; ++ default: ++ break; ++ } ++} ++ ++static int ++nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) ++{ ++ struct nvme_ns *nvme_ns = cb_arg; ++ uint32_t i; ++ ++ for (i = 0; i < desc->num_of_nsid; i++) { ++ if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { ++ continue; ++ } ++ ++ _nvme_ns_set_ana_state(nvme_ns, desc); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static void ++merge_nsid_sn_strings(const char *sn, char *nsid, int8_t *out) ++{ ++ int i = 0, j = 0; ++ int sn_len = strlen(sn), nsid_len = strlen(nsid); ++ ++ for (i = 0; i < nsid_len; i++) { ++ out[i] = nsid[i]; ++ } ++ ++ /* Since last few characters are more likely to be unique, ++ * even among the devices from the same manufacturer, ++ * we use serial number in reverse. We also skip the ++ * terminating character of serial number string. */ ++ for (j = sn_len - 1; j >= 0; j--) { ++ if (i == SPDK_UUID_STRING_LEN - 1) { ++ break; ++ } ++ ++ /* There may be a lot of spaces in serial number string ++ * and they will generate equally large number of the ++ * same character, so just skip them. */ ++ if (sn[j] == ' ') { ++ continue; ++ } ++ ++ out[i] = sn[j]; ++ i++; ++ } ++} ++ ++/* Dictionary of characters for UUID generation. */ ++static char dict[17] = "0123456789abcdef"; ++ ++static struct spdk_uuid ++nvme_generate_uuid(const char *sn, uint32_t nsid) ++{ ++ struct spdk_uuid new_uuid; ++ char buf[SPDK_UUID_STRING_LEN] = {'\0'}, merged_str[SPDK_UUID_STRING_LEN] = {'\0'}; ++ char nsid_str[NSID_STR_LEN] = {'\0'}, tmp; ++ uint64_t i = 0, j = 0, rem, dict_size = strlen(dict); ++ int rc; ++ ++ assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); ++ ++ snprintf(nsid_str, NSID_STR_LEN, "%" PRIu32, nsid); ++ ++ merge_nsid_sn_strings(sn, nsid_str, merged_str); ++ ++ while (i < SPDK_UUID_STRING_LEN) { ++ /* If 'j' is equal to indexes, where '-' should be placed, ++ * insert this character and continue the loop without ++ * increasing 'i'. */ ++ if ((j == 8 || j == 13 || j == 18 || j == 23)) { ++ buf[j] = '-'; ++ j++; ++ ++ /* Break, if we ran out of characters in ++ * serial number and namespace ID string. 
*/ ++ if (j == strlen(merged_str)) { ++ break; ++ } ++ continue; ++ } ++ ++ /* Change character in shuffled string to lower case. */ ++ tmp = tolower(merged_str[i]); ++ ++ if (isxdigit(tmp)) { ++ /* If character can be represented by a hex ++ * value as is, copy it to the result buffer. */ ++ buf[j] = tmp; ++ } else { ++ /* Otherwise get its code and divide it ++ * by the number of elements in dictionary. ++ * The remainder will be the index of dictionary ++ * character to replace tmp value with. */ ++ rem = tmp % dict_size; ++ buf[j] = dict[rem]; ++ } ++ ++ i++; ++ j++; ++ ++ /* Break, if we ran out of characters in ++ * serial number and namespace ID string. */ ++ if (j == strlen(merged_str)) { ++ break; ++ } ++ } ++ ++ /* If there are not enough values to fill UUID, ++ * the rest is taken from dictionary characters. */ ++ i = 0; ++ while (j < SPDK_UUID_STRING_LEN - 1) { ++ if ((j == 8 || j == 13 || j == 18 || j == 23)) { ++ buf[j] = '-'; ++ j++; ++ continue; ++ } ++ buf[j] = dict[i % dict_size]; ++ i++; ++ j++; ++ } ++ ++ rc = spdk_uuid_parse(&new_uuid, buf); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unexpected spdk_uuid_parse failure on %s.\n", buf); ++ assert(false); ++ } ++ ++ return new_uuid; ++} ++ ++static int ++nvme_disk_create(struct spdk_bdev *disk, const char *base_name, ++ struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, ++ uint32_t prchk_flags, void *ctx) ++{ ++ const struct spdk_uuid *uuid; ++ const uint8_t *nguid; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ const struct spdk_nvme_ns_data *nsdata; ++ const struct spdk_nvme_ctrlr_opts *opts; ++ enum spdk_nvme_csi csi; ++ uint32_t atomic_bs, phys_bs, bs; ++ char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ csi = spdk_nvme_ns_get_csi(ns); ++ opts = spdk_nvme_ctrlr_get_opts(ctrlr); ++ ++ switch (csi) { ++ case SPDK_NVME_CSI_NVM: ++ disk->product_name = "NVMe disk"; ++ break; ++ case SPDK_NVME_CSI_ZNS: ++ disk->product_name = "NVMe ZNS disk"; ++ disk->zoned = true; ++ disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); ++ disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / ++ spdk_nvme_ns_get_extended_sector_size(ns); ++ disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); ++ disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); ++ break; ++ default: ++ SPDK_ERRLOG("unsupported CSI: %u\n", csi); ++ return -ENOTSUP; ++ } ++ ++ disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); ++ if (!disk->name) { ++ return -ENOMEM; ++ } ++ ++ disk->write_cache = 0; ++ if (cdata->vwc.present) { ++ /* Enable if the Volatile Write Cache exists */ ++ disk->write_cache = 1; ++ } ++ if (cdata->oncs.write_zeroes) { ++ disk->max_write_zeroes = UINT16_MAX + 1; ++ } ++ disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); ++ disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); ++ disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); ++ /* NVMe driver will split one request into multiple requests ++ * based on MDTS and stripe boundary, the bdev layer will use ++ * max_segment_size and max_num_segments to split one big IO ++ * into multiple requests, then small request can't run out ++ * of NVMe internal requests data structure. 
++ */ ++ if (opts && opts->io_queue_requests) { ++ disk->max_num_segments = opts->io_queue_requests / 2; ++ } ++ disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); ++ ++ nguid = spdk_nvme_ns_get_nguid(ns); ++ if (!nguid) { ++ uuid = spdk_nvme_ns_get_uuid(ns); ++ if (uuid) { ++ disk->uuid = *uuid; ++ } else if (g_opts.generate_uuids) { ++ spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); ++ disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); ++ } ++ } else { ++ memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); ++ } ++ ++ nsdata = spdk_nvme_ns_get_data(ns); ++ bs = spdk_nvme_ns_get_sector_size(ns); ++ atomic_bs = bs; ++ phys_bs = bs; ++ if (nsdata->nabo == 0) { ++ if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { ++ atomic_bs = bs * (1 + nsdata->nawupf); ++ } else { ++ atomic_bs = bs * (1 + cdata->awupf); ++ } ++ } ++ if (nsdata->nsfeat.optperf) { ++ phys_bs = bs * (1 + nsdata->npwg); ++ } ++ disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); ++ ++ disk->md_len = spdk_nvme_ns_get_md_size(ns); ++ if (disk->md_len != 0) { ++ disk->md_interleave = nsdata->flbas.extended; ++ disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); ++ if (disk->dif_type != SPDK_DIF_DISABLE) { ++ disk->dif_is_head_of_md = nsdata->dps.md_start; ++ disk->dif_check_flags = prchk_flags; ++ } ++ } ++ ++ if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & ++ SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { ++ disk->acwu = 0; ++ } else if (nsdata->nsfeat.ns_atomic_write_unit) { ++ disk->acwu = nsdata->nacwu + 1; /* 0-based */ ++ } else { ++ disk->acwu = cdata->acwu + 1; /* 0-based */ ++ } ++ ++ if (cdata->oncs.copy) { ++ /* For now bdev interface allows only single segment copy */ ++ disk->max_copy = nsdata->mssrl; ++ } ++ ++ disk->ctxt = ctx; ++ disk->fn_table = &nvmelib_fn_table; ++ disk->module = &nvme_if; ++ ++ return 0; ++} ++ ++static int ++nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) ++{ ++ struct nvme_bdev *bdev; ++ int rc; ++ ++ bdev = calloc(1, sizeof(*bdev)); ++ if (!bdev) { ++ SPDK_ERRLOG("bdev calloc() failed\n"); ++ return -ENOMEM; ++ } ++ ++ if (g_opts.nvme_error_stat) { ++ bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); ++ if (!bdev->err_stat) { ++ SPDK_ERRLOG("err_stat calloc() failed\n"); ++ free(bdev); ++ return -ENOMEM; ++ } ++ } ++ ++ rc = pthread_mutex_init(&bdev->mutex, NULL); ++ if (rc != 0) { ++ free(bdev->err_stat); ++ free(bdev); ++ return rc; ++ } ++ ++ bdev->ref = 1; ++ bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; ++ bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; ++ bdev->rr_min_io = UINT32_MAX; ++ TAILQ_INIT(&bdev->nvme_ns_list); ++ TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); ++ bdev->opal = nvme_ctrlr->opal_dev != NULL; ++ ++ rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ctrlr, ++ nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create NVMe disk\n"); ++ pthread_mutex_destroy(&bdev->mutex); ++ free(bdev->err_stat); ++ free(bdev); ++ return rc; ++ } ++ ++ spdk_io_device_register(bdev, ++ bdev_nvme_create_bdev_channel_cb, ++ bdev_nvme_destroy_bdev_channel_cb, ++ sizeof(struct nvme_bdev_channel), ++ bdev->disk.name); ++ ++ rc = spdk_bdev_register(&bdev->disk); ++ if (rc != 0) { ++ SPDK_ERRLOG("spdk_bdev_register() failed\n"); ++ spdk_io_device_unregister(bdev, NULL); ++ pthread_mutex_destroy(&bdev->mutex); ++ free(bdev->disk.name); ++ free(bdev->err_stat); ++ free(bdev); ++ return rc; ++ } ++ ++ 
nvme_ns->bdev = bdev; ++ bdev->nsid = nvme_ns->id; ++ ++ bdev->nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; ++ TAILQ_INSERT_TAIL(&nvme_ctrlr->nbdev_ctrlr->bdevs, bdev, tailq); ++ ++ return 0; ++} ++ ++static bool ++bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) ++{ ++ const struct spdk_nvme_ns_data *nsdata1, *nsdata2; ++ const struct spdk_uuid *uuid1, *uuid2; ++ ++ nsdata1 = spdk_nvme_ns_get_data(ns1); ++ nsdata2 = spdk_nvme_ns_get_data(ns2); ++ uuid1 = spdk_nvme_ns_get_uuid(ns1); ++ uuid2 = spdk_nvme_ns_get_uuid(ns2); ++ ++ return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && ++ nsdata1->eui64 == nsdata2->eui64 && ++ ((uuid1 == NULL && uuid2 == NULL) || ++ (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && ++ spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); ++} ++ ++static bool ++hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, ++ struct spdk_nvme_ctrlr_opts *opts) ++{ ++ struct nvme_probe_skip_entry *entry; ++ ++ TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { ++ if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { ++ return false; ++ } ++ } ++ ++ opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; ++ opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; ++ opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; ++ opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; ++ opts->disable_read_ana_log_page = true; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); ++ ++ return true; ++} ++ ++static void ++nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, ++ cpl->status.sct); ++ bdev_nvme_reset(nvme_ctrlr); ++ } else if (cpl->cdw0 & 0x1) { ++ SPDK_WARNLOG("Specified command could not be aborted.\n"); ++ bdev_nvme_reset(nvme_ctrlr); ++ } ++} ++ ++static void ++timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, ++ struct spdk_nvme_qpair *qpair, uint16_t cid) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = cb_arg; ++ union spdk_nvme_csts_register csts; ++ int rc; ++ ++ assert(nvme_ctrlr->ctrlr == ctrlr); ++ ++ SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); ++ ++ /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O ++ * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we ++ * would submit another fabrics cmd on the admin queue to read CSTS and check for its ++ * completion recursively. ++ */ ++ if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { ++ csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); ++ if (csts.bits.cfs) { ++ SPDK_ERRLOG("Controller Fatal Status, reset required\n"); ++ bdev_nvme_reset(nvme_ctrlr); ++ return; ++ } ++ } ++ ++ switch (g_opts.action_on_timeout) { ++ case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: ++ if (qpair) { ++ /* Don't send abort to ctrlr when ctrlr is not available. */ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (!nvme_ctrlr_is_available(nvme_ctrlr)) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ SPDK_NOTICELOG("Quit abort. 
Ctrlr is not available.\n"); ++ return; ++ } ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, ++ nvme_abort_cpl, nvme_ctrlr); ++ if (rc == 0) { ++ return; ++ } ++ ++ SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); ++ } ++ ++ /* FALLTHROUGH */ ++ case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: ++ bdev_nvme_reset(nvme_ctrlr); ++ break; ++ case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: ++ SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); ++ break; ++ default: ++ SPDK_ERRLOG("An invalid timeout action value is found.\n"); ++ break; ++ } ++} ++ ++static struct nvme_ns * ++nvme_ns_alloc(void) ++{ ++ struct nvme_ns *nvme_ns; ++ ++ nvme_ns = calloc(1, sizeof(struct nvme_ns)); ++ if (nvme_ns == NULL) { ++ return NULL; ++ } ++ ++ if (g_opts.io_path_stat) { ++ nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); ++ if (nvme_ns->stat == NULL) { ++ free(nvme_ns); ++ return NULL; ++ } ++ spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); ++ } ++ ++ return nvme_ns; ++} ++ ++static void ++nvme_ns_free(struct nvme_ns *nvme_ns) ++{ ++ free(nvme_ns->stat); ++ free(nvme_ns); ++} ++ ++static void ++nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; ++ struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; ++ ++ if (rc == 0) { ++ nvme_ns->probe_ctx = NULL; ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ nvme_ctrlr->ref++; ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ } else { ++ RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); ++ nvme_ns_free(nvme_ns); ++ } ++ ++ if (ctx) { ++ ctx->populates_in_progress--; ++ if (ctx->populates_in_progress == 0) { ++ nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); ++ } ++ } ++} ++ ++static void ++bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); ++ struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); ++ int rc; ++ ++ rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); ++ } ++ ++ spdk_for_each_channel_continue(i, rc); ++} ++ ++static void ++bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); ++ struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_io_path *io_path; ++ ++ io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); ++ if (io_path != NULL) { ++ _bdev_nvme_delete_io_path(nbdev_ch, io_path); ++ } ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); ++ ++ nvme_ctrlr_populate_namespace_done(nvme_ns, -1); ++} ++ ++static void ++bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); ++ ++ if (status == 0) { ++ nvme_ctrlr_populate_namespace_done(nvme_ns, 0); ++ } else { ++ /* Delete the added io_paths and fail populating the namespace. 
*/ ++ spdk_for_each_channel(bdev, ++ bdev_nvme_delete_io_path, ++ nvme_ns, ++ bdev_nvme_add_io_path_failed); ++ } ++} ++ ++static int ++nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) ++{ ++ struct nvme_ns *tmp_ns; ++ const struct spdk_nvme_ns_data *nsdata; ++ ++ nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); ++ if (!nsdata->nmic.can_share) { ++ SPDK_ERRLOG("Namespace cannot be shared.\n"); ++ return -EINVAL; ++ } ++ ++ pthread_mutex_lock(&bdev->mutex); ++ ++ tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); ++ assert(tmp_ns != NULL); ++ ++ if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { ++ pthread_mutex_unlock(&bdev->mutex); ++ SPDK_ERRLOG("Namespaces are not identical.\n"); ++ return -EINVAL; ++ } ++ ++ bdev->ref++; ++ TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); ++ nvme_ns->bdev = bdev; ++ ++ pthread_mutex_unlock(&bdev->mutex); ++ ++ /* Add nvme_io_path to nvme_bdev_channels dynamically. */ ++ spdk_for_each_channel(bdev, ++ bdev_nvme_add_io_path, ++ nvme_ns, ++ bdev_nvme_add_io_path_done); ++ ++ return 0; ++} ++ ++static void ++nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) ++{ ++ struct spdk_nvme_ns *ns; ++ struct nvme_bdev *bdev; ++ int rc = 0; ++ ++ ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); ++ if (!ns) { ++ SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); ++ rc = -EINVAL; ++ goto done; ++ } ++ ++ nvme_ns->ns = ns; ++ nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; ++ ++ if (nvme_ctrlr->ana_log_page != NULL) { ++ bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); ++ } ++ ++ bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); ++ if (bdev == NULL) { ++ rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); ++ } else { ++ rc = nvme_bdev_add_ns(bdev, nvme_ns); ++ if (rc == 0) { ++ return; ++ } ++ } ++done: ++ nvme_ctrlr_populate_namespace_done(nvme_ns, rc); ++} ++ ++static void ++nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; ++ ++ assert(nvme_ctrlr != NULL); ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); ++ ++ if (nvme_ns->bdev != NULL) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return; ++ } ++ ++ nvme_ns_free(nvme_ns); ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ nvme_ctrlr_release(nvme_ctrlr); ++} ++ ++static void ++bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); ++ ++ nvme_ctrlr_depopulate_namespace_done(nvme_ns); ++} ++ ++static void ++nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) ++{ ++ struct nvme_bdev *bdev; ++ ++ spdk_poller_unregister(&nvme_ns->anatt_timer); ++ ++ bdev = nvme_ns->bdev; ++ if (bdev != NULL) { ++ pthread_mutex_lock(&bdev->mutex); ++ ++ assert(bdev->ref > 0); ++ bdev->ref--; ++ if (bdev->ref == 0) { ++ pthread_mutex_unlock(&bdev->mutex); ++ ++ spdk_bdev_unregister(&bdev->disk, NULL, NULL); ++ } else { ++ /* spdk_bdev_unregister() is not called until the last nvme_ns is ++ * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list ++ * and clear nvme_ns->bdev here. ++ */ ++ TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); ++ nvme_ns->bdev = NULL; ++ ++ pthread_mutex_unlock(&bdev->mutex); ++ ++ /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, ++ * we call depopulate_namespace_done() to avoid use-after-free. 
++ */ ++ spdk_for_each_channel(bdev, ++ bdev_nvme_delete_io_path, ++ nvme_ns, ++ bdev_nvme_delete_io_path_done); ++ return; ++ } ++ } ++ ++ nvme_ctrlr_depopulate_namespace_done(nvme_ns); ++} ++ ++static void ++nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx) ++{ ++ struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; ++ struct nvme_ns *nvme_ns, *next; ++ struct spdk_nvme_ns *ns; ++ struct nvme_bdev *bdev; ++ uint32_t nsid; ++ int rc; ++ uint64_t num_sectors; ++ ++ if (ctx) { ++ /* Initialize this count to 1 to handle the populate functions ++ * calling nvme_ctrlr_populate_namespace_done() immediately. ++ */ ++ ctx->populates_in_progress = 1; ++ } ++ ++ /* First loop over our existing namespaces and see if they have been ++ * removed. */ ++ nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ++ while (nvme_ns != NULL) { ++ next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); ++ ++ if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { ++ /* NS is still there but attributes may have changed */ ++ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); ++ num_sectors = spdk_nvme_ns_get_num_sectors(ns); ++ bdev = nvme_ns->bdev; ++ assert(bdev != NULL); ++ if (bdev->disk.blockcnt != num_sectors) { ++ SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", ++ nvme_ns->id, ++ bdev->disk.name, ++ bdev->disk.blockcnt, ++ num_sectors); ++ rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); ++ if (rc != 0) { ++ SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", ++ bdev->disk.name, rc); ++ } ++ } ++ } else { ++ /* Namespace was removed */ ++ nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); ++ } ++ ++ nvme_ns = next; ++ } ++ ++ /* Loop through all of the namespaces at the nvme level and see if any of them are new */ ++ nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); ++ while (nsid != 0) { ++ nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); ++ ++ if (nvme_ns == NULL) { ++ /* Found a new one */ ++ nvme_ns = nvme_ns_alloc(); ++ if (nvme_ns == NULL) { ++ SPDK_ERRLOG("Failed to allocate namespace\n"); ++ /* This just fails to attach the namespace. It may work on a future attempt. */ ++ continue; ++ } ++ ++ nvme_ns->id = nsid; ++ nvme_ns->ctrlr = nvme_ctrlr; ++ ++ nvme_ns->bdev = NULL; ++ ++ if (ctx) { ++ ctx->populates_in_progress++; ++ } ++ nvme_ns->probe_ctx = ctx; ++ ++ RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); ++ ++ nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); ++ } ++ ++ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); ++ } ++ ++ if (ctx) { ++ /* Decrement this count now that the loop is over to account ++ * for the one we started with. If the count is then 0, we ++ * know any populate_namespace functions completed immediately, ++ * so we'll kick the callback here. 
++ */ ++ ctx->populates_in_progress--; ++ if (ctx->populates_in_progress == 0) { ++ nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); ++ } ++ } ++ ++} ++ ++static void ++nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct nvme_ns *nvme_ns, *tmp; ++ ++ RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { ++ nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); ++ } ++} ++ ++static uint32_t ++nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ uint32_t nsid, ns_count = 0; ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ ++ for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); ++ nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { ++ ns_count++; ++ } ++ ++ return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * ++ sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * ++ sizeof(uint32_t); ++} ++ ++static int ++nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, ++ void *cb_arg) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = cb_arg; ++ struct nvme_ns *nvme_ns; ++ uint32_t i, nsid; ++ ++ for (i = 0; i < desc->num_of_nsid; i++) { ++ nsid = desc->nsid[i]; ++ if (nsid == 0) { ++ continue; ++ } ++ ++ nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); ++ ++ assert(nvme_ns != NULL); ++ if (nvme_ns == NULL) { ++ /* Target told us that an inactive namespace had an ANA change */ ++ continue; ++ } ++ ++ _nvme_ns_set_ana_state(nvme_ns, desc); ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct nvme_ns *nvme_ns; ++ ++ spdk_free(nvme_ctrlr->ana_log_page); ++ nvme_ctrlr->ana_log_page = NULL; ++ ++ for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ++ nvme_ns != NULL; ++ nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { ++ nvme_ns->ana_state_updating = false; ++ nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; ++ } ++} ++ ++static void ++nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { ++ bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, ++ nvme_ctrlr); ++ } else { ++ bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); ++ } ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ assert(nvme_ctrlr->ana_log_page_updating == true); ++ nvme_ctrlr->ana_log_page_updating = false; ++ ++ if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ nvme_ctrlr_unregister(nvme_ctrlr); ++ } else { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ bdev_nvme_clear_io_path_caches(nvme_ctrlr); ++ } ++} ++ ++static int ++nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) ++{ ++ uint32_t ana_log_page_size; ++ int rc; ++ ++ if (nvme_ctrlr->ana_log_page == NULL) { ++ return -EINVAL; ++ } ++ ++ ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); ++ ++ if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { ++ SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", ++ ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); ++ return -EINVAL; ++ } ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (!nvme_ctrlr_is_available(nvme_ctrlr) || ++ nvme_ctrlr->ana_log_page_updating) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return -EBUSY; ++ } ++ ++ nvme_ctrlr->ana_log_page_updating = true; ++ 
pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, ++ SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, ++ SPDK_NVME_GLOBAL_NS_TAG, ++ nvme_ctrlr->ana_log_page, ++ ana_log_page_size, 0, ++ nvme_ctrlr_read_ana_log_page_done, ++ nvme_ctrlr); ++ if (rc != 0) { ++ nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); ++ } ++ ++ return rc; ++} ++ ++static void ++dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++} ++ ++struct bdev_nvme_set_preferred_path_ctx { ++ struct spdk_bdev_desc *desc; ++ struct nvme_ns *nvme_ns; ++ bdev_nvme_set_preferred_path_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ assert(ctx != NULL); ++ assert(ctx->desc != NULL); ++ assert(ctx->cb_fn != NULL); ++ ++ spdk_bdev_close(ctx->desc); ++ ++ ctx->cb_fn(ctx->cb_arg, status); ++ ++ free(ctx); ++} ++ ++static void ++_bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) ++{ ++ struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); ++ struct nvme_io_path *io_path, *prev; ++ ++ prev = NULL; ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ if (io_path->nvme_ns == ctx->nvme_ns) { ++ break; ++ } ++ prev = io_path; ++ } ++ ++ if (io_path != NULL) { ++ if (prev != NULL) { ++ STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); ++ STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); ++ } ++ ++ /* We can set io_path to nbdev_ch->current_io_path directly here. ++ * However, it needs to be conditional. To simplify the code, ++ * just clear nbdev_ch->current_io_path and let find_io_path() ++ * fill it. ++ * ++ * Automatic failback may be disabled. Hence even if the io_path is ++ * already at the head, clear nbdev_ch->current_io_path. ++ */ ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ } ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static struct nvme_ns * ++bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) ++{ ++ struct nvme_ns *nvme_ns, *prev; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ ++ prev = NULL; ++ TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { ++ cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); ++ ++ if (cdata->cntlid == cntlid) { ++ break; ++ } ++ prev = nvme_ns; ++ } ++ ++ if (nvme_ns != NULL && prev != NULL) { ++ TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); ++ TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); ++ } ++ ++ return nvme_ns; ++} ++ ++/* This function supports only multipath mode. There is only a single I/O path ++ * for each NVMe-oF controller. Hence, just move the matched I/O path to the ++ * head of the I/O path list for each NVMe bdev channel. ++ * ++ * NVMe bdev channel may be acquired after completing this function. move the ++ * matched namespace to the head of the namespace list for the NVMe bdev too. 
++ */ ++void ++bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, ++ bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) ++{ ++ struct bdev_nvme_set_preferred_path_ctx *ctx; ++ struct spdk_bdev *bdev; ++ struct nvme_bdev *nbdev; ++ int rc = 0; ++ ++ assert(cb_fn != NULL); ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to alloc context.\n"); ++ rc = -ENOMEM; ++ goto err_alloc; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to open bdev %s.\n", name); ++ goto err_open; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(ctx->desc); ++ ++ if (bdev->module != &nvme_if) { ++ SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); ++ rc = -ENODEV; ++ goto err_bdev; ++ } ++ ++ nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); ++ ++ pthread_mutex_lock(&nbdev->mutex); ++ ++ ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); ++ if (ctx->nvme_ns == NULL) { ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); ++ rc = -ENODEV; ++ goto err_bdev; ++ } ++ ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ spdk_for_each_channel(nbdev, ++ _bdev_nvme_set_preferred_path, ++ ctx, ++ bdev_nvme_set_preferred_path_done); ++ return; ++ ++err_bdev: ++ spdk_bdev_close(ctx->desc); ++err_open: ++ free(ctx); ++err_alloc: ++ cb_fn(cb_arg, rc); ++} ++ ++struct bdev_nvme_set_multipath_policy_ctx { ++ struct spdk_bdev_desc *desc; ++ bdev_nvme_set_multipath_policy_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ assert(ctx != NULL); ++ assert(ctx->desc != NULL); ++ assert(ctx->cb_fn != NULL); ++ ++ spdk_bdev_close(ctx->desc); ++ ++ ctx->cb_fn(ctx->cb_arg, status); ++ ++ free(ctx); ++} ++ ++static void ++_bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); ++ struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); ++ ++ nbdev_ch->mp_policy = nbdev->mp_policy; ++ nbdev_ch->mp_selector = nbdev->mp_selector; ++ nbdev_ch->rr_min_io = nbdev->rr_min_io; ++ bdev_nvme_clear_current_io_path(nbdev_ch); ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++void ++bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, ++ enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, ++ bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) ++{ ++ struct bdev_nvme_set_multipath_policy_ctx *ctx; ++ struct spdk_bdev *bdev; ++ struct nvme_bdev *nbdev; ++ int rc; ++ ++ assert(cb_fn != NULL); ++ ++ if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { ++ if (rr_min_io == UINT32_MAX) { ++ rr_min_io = 1; ++ } else if (rr_min_io == 0) { ++ rc = -EINVAL; ++ goto exit; ++ } ++ } else if (rr_min_io != UINT32_MAX) { ++ rc = -EINVAL; ++ goto exit; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to alloc context.\n"); ++ rc = -ENOMEM; ++ goto exit; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to open bdev 
%s.\n", name); ++ rc = -ENODEV; ++ goto err_open; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(ctx->desc); ++ if (bdev->module != &nvme_if) { ++ SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); ++ rc = -ENODEV; ++ goto err_module; ++ } ++ nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); ++ ++ pthread_mutex_lock(&nbdev->mutex); ++ nbdev->mp_policy = policy; ++ nbdev->mp_selector = selector; ++ nbdev->rr_min_io = rr_min_io; ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ spdk_for_each_channel(nbdev, ++ _bdev_nvme_set_multipath_policy, ++ ctx, ++ bdev_nvme_set_multipath_policy_done); ++ return; ++ ++err_module: ++ spdk_bdev_close(ctx->desc); ++err_open: ++ free(ctx); ++exit: ++ cb_fn(cb_arg, rc); ++} ++ ++static void ++aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = arg; ++ union spdk_nvme_async_event_completion event; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ SPDK_WARNLOG("AER request execute failed\n"); ++ return; ++ } ++ ++ event.raw = cpl->cdw0; ++ if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && ++ (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { ++ nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); ++ } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && ++ (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { ++ nvme_ctrlr_read_ana_log_page(nvme_ctrlr); ++ } ++} ++ ++static void ++populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) ++{ ++ if (ctx->cb_fn) { ++ ctx->cb_fn(ctx->cb_ctx, count, rc); ++ } ++ ++ ctx->namespaces_populated = true; ++ if (ctx->probe_done) { ++ /* The probe was already completed, so we need to free the context ++ * here. This can happen for cases like OCSSD, where we need to ++ * send additional commands to the SSD after attach. ++ */ ++ free(ctx); ++ } ++} ++ ++static void ++nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx) ++{ ++ spdk_io_device_register(nvme_ctrlr, ++ bdev_nvme_create_ctrlr_channel_cb, ++ bdev_nvme_destroy_ctrlr_channel_cb, ++ sizeof(struct nvme_ctrlr_channel), ++ nvme_ctrlr->nbdev_ctrlr->name); ++ ++ nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); ++} ++ ++static void ++nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = _ctx; ++ struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; ++ ++ nvme_ctrlr->probe_ctx = NULL; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ nvme_ctrlr_delete(nvme_ctrlr); ++ ++ if (ctx != NULL) { ++ populate_namespaces_cb(ctx, 0, -1); ++ } ++ return; ++ } ++ ++ nvme_ctrlr_create_done(nvme_ctrlr, ctx); ++} ++ ++static int ++nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx) ++{ ++ struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ uint32_t ana_log_page_size; ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ ++ /* Set buffer size enough to include maximum number of allowed namespaces. 
*/ ++ ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * ++ sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * ++ sizeof(uint32_t); ++ ++ nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, ++ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); ++ if (nvme_ctrlr->ana_log_page == NULL) { ++ SPDK_ERRLOG("could not allocate ANA log page buffer\n"); ++ return -ENXIO; ++ } ++ ++ /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. ++ * Hence copy each descriptor to a temporary area when parsing it. ++ * ++ * Allocate a buffer whose size is as large as ANA log page buffer because ++ * we do not know the size of a descriptor until actually reading it. ++ */ ++ nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); ++ if (nvme_ctrlr->copied_ana_desc == NULL) { ++ SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); ++ return -ENOMEM; ++ } ++ ++ nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; ++ ++ nvme_ctrlr->probe_ctx = ctx; ++ ++ /* Then, set the read size only to include the current active namespaces. */ ++ ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); ++ ++ if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { ++ SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", ++ ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); ++ return -EINVAL; ++ } ++ ++ return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, ++ SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, ++ SPDK_NVME_GLOBAL_NS_TAG, ++ nvme_ctrlr->ana_log_page, ++ ana_log_page_size, 0, ++ nvme_ctrlr_init_ana_log_page_done, ++ nvme_ctrlr); ++} ++ ++/* hostnqn and subnqn were already verified before attaching a controller. ++ * Hence check only the multipath capability and cntlid here. 
++ */ ++static bool ++bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) ++{ ++ struct nvme_ctrlr *tmp; ++ const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ ++ if (!cdata->cmic.multi_ctrlr) { ++ SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); ++ return false; ++ } ++ ++ TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { ++ tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); ++ ++ if (!tmp_cdata->cmic.multi_ctrlr) { ++ SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); ++ return false; ++ } ++ if (cdata->cntlid == tmp_cdata->cntlid) { ++ SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++static int ++nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; ++ int rc = 0; ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ ++ nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); ++ if (nbdev_ctrlr != NULL) { ++ if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { ++ rc = -EINVAL; ++ goto exit; ++ } ++ } else { ++ nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); ++ if (nbdev_ctrlr == NULL) { ++ SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); ++ rc = -ENOMEM; ++ goto exit; ++ } ++ nbdev_ctrlr->name = strdup(name); ++ if (nbdev_ctrlr->name == NULL) { ++ SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); ++ free(nbdev_ctrlr); ++ goto exit; ++ } ++ TAILQ_INIT(&nbdev_ctrlr->ctrlrs); ++ TAILQ_INIT(&nbdev_ctrlr->bdevs); ++ TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); ++ } ++ nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; ++ TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); ++exit: ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ return rc; ++} ++ ++static int ++nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, ++ const char *name, ++ const struct spdk_nvme_transport_id *trid, ++ struct nvme_async_probe_ctx *ctx) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct nvme_path_id *path_id; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ int rc; ++ ++ nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("Failed to allocate device struct\n"); ++ return -ENOMEM; ++ } ++ ++ rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); ++ if (rc != 0) { ++ free(nvme_ctrlr); ++ return rc; ++ } ++ ++ TAILQ_INIT(&nvme_ctrlr->trids); ++ ++ RB_INIT(&nvme_ctrlr->namespaces); ++ ++ path_id = calloc(1, sizeof(*path_id)); ++ if (path_id == NULL) { ++ SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ path_id->trid = *trid; ++ if (ctx != NULL) { ++ memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); ++ memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); ++ } ++ nvme_ctrlr->active_path_id = path_id; ++ TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); ++ ++ nvme_ctrlr->thread = spdk_get_thread(); ++ nvme_ctrlr->ctrlr = ctrlr; ++ nvme_ctrlr->ref = 1; ++ ++ if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { ++ SPDK_ERRLOG("OCSSDs are not supported"); ++ rc = -ENOTSUP; ++ goto err; ++ } ++ ++ if (ctx != NULL) { ++ memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); ++ } else { ++ bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); ++ } ++ ++ nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 
nvme_ctrlr, ++ g_opts.nvme_adminq_poll_period_us); ++ ++ if (g_opts.timeout_us > 0) { ++ /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ ++ /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ ++ uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? ++ g_opts.timeout_us : g_opts.timeout_admin_us; ++ spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, ++ adm_timeout_us, timeout_cb, nvme_ctrlr); ++ } ++ ++ spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); ++ spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); ++ ++ if (spdk_nvme_ctrlr_get_flags(ctrlr) & ++ SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { ++ nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); ++ } ++ ++ rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); ++ if (rc != 0) { ++ goto err; ++ } ++ ++ cdata = spdk_nvme_ctrlr_get_data(ctrlr); ++ ++ if (cdata->cmic.ana_reporting) { ++ rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); ++ if (rc == 0) { ++ return 0; ++ } ++ } else { ++ nvme_ctrlr_create_done(nvme_ctrlr, ctx); ++ return 0; ++ } ++ ++err: ++ nvme_ctrlr_delete(nvme_ctrlr); ++ return rc; ++} ++ ++void ++bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) ++{ ++ opts->prchk_flags = 0; ++ opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; ++ opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; ++ opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; ++} ++ ++static void ++attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, ++ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) ++{ ++ char *name; ++ ++ name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); ++ if (!name) { ++ SPDK_ERRLOG("Failed to assign name to NVMe device\n"); ++ return; ++ } ++ ++ if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { ++ SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); ++ } else { ++ SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); ++ } ++ ++ free(name); ++} ++ ++static void ++_nvme_ctrlr_destruct(void *ctx) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = ctx; ++ ++ nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); ++ nvme_ctrlr_release(nvme_ctrlr); ++} ++ ++static int ++_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) ++{ ++ struct nvme_probe_skip_entry *entry; ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ /* The controller's destruction was already started */ ++ if (nvme_ctrlr->destruct) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return 0; ++ } ++ ++ if (!hotplug && ++ nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { ++ entry = calloc(1, sizeof(*entry)); ++ if (!entry) { ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ return -ENOMEM; ++ } ++ entry->trid = nvme_ctrlr->active_path_id->trid; ++ TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); ++ } ++ ++ nvme_ctrlr->destruct = true; ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ _nvme_ctrlr_destruct(nvme_ctrlr); ++ ++ return 0; ++} ++ ++static void ++remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) ++{ ++ struct nvme_ctrlr *nvme_ctrlr = cb_ctx; ++ ++ _bdev_nvme_delete(nvme_ctrlr, true); ++} ++ ++static int ++bdev_nvme_hotplug_probe(void *arg) ++{ ++ if (g_hotplug_probe_ctx == NULL) { ++ spdk_poller_unregister(&g_hotplug_probe_poller); ++ return SPDK_POLLER_IDLE; ++ } ++ ++ if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { ++ g_hotplug_probe_ctx = NULL; ++ 
spdk_poller_unregister(&g_hotplug_probe_poller); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static int ++bdev_nvme_hotplug(void *arg) ++{ ++ struct spdk_nvme_transport_id trid_pcie; ++ ++ if (g_hotplug_probe_ctx) { ++ return SPDK_POLLER_BUSY; ++ } ++ ++ memset(&trid_pcie, 0, sizeof(trid_pcie)); ++ spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); ++ ++ g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, ++ hotplug_probe_cb, attach_cb, NULL); ++ ++ if (g_hotplug_probe_ctx) { ++ assert(g_hotplug_probe_poller == NULL); ++ g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++void ++bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) ++{ ++ *opts = g_opts; ++} ++ ++static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, ++ uint32_t reconnect_delay_sec, ++ uint32_t fast_io_fail_timeout_sec); ++ ++static int ++bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) ++{ ++ if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { ++ /* Can't set timeout_admin_us without also setting timeout_us */ ++ SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); ++ return -EINVAL; ++ } ++ ++ if (opts->bdev_retry_count < -1) { ++ SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); ++ return -EINVAL; ++ } ++ ++ if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, ++ opts->reconnect_delay_sec, ++ opts->fast_io_fail_timeout_sec)) { ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int ++bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) ++{ ++ int ret; ++ ++ ret = bdev_nvme_validate_opts(opts); ++ if (ret) { ++ SPDK_WARNLOG("Failed to set nvme opts.\n"); ++ return ret; ++ } ++ ++ if (g_bdev_nvme_init_thread != NULL) { ++ if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { ++ return -EPERM; ++ } ++ } ++ ++ if (opts->rdma_srq_size != 0) { ++ struct spdk_nvme_transport_opts drv_opts; ++ ++ spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); ++ drv_opts.rdma_srq_size = opts->rdma_srq_size; ++ ++ ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); ++ if (ret) { ++ SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); ++ return ret; ++ } ++ } ++ ++ g_opts = *opts; ++ ++ return 0; ++} ++ ++struct set_nvme_hotplug_ctx { ++ uint64_t period_us; ++ bool enabled; ++ spdk_msg_fn fn; ++ void *fn_ctx; ++}; ++ ++static void ++set_nvme_hotplug_period_cb(void *_ctx) ++{ ++ struct set_nvme_hotplug_ctx *ctx = _ctx; ++ ++ spdk_poller_unregister(&g_hotplug_poller); ++ if (ctx->enabled) { ++ g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); ++ } ++ ++ g_nvme_hotplug_poll_period_us = ctx->period_us; ++ g_nvme_hotplug_enabled = ctx->enabled; ++ if (ctx->fn) { ++ ctx->fn(ctx->fn_ctx); ++ } ++ ++ free(ctx); ++} ++ ++int ++bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) ++{ ++ struct set_nvme_hotplug_ctx *ctx; ++ ++ if (enabled == true && !spdk_process_is_primary()) { ++ return -EPERM; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ return -ENOMEM; ++ } ++ ++ period_us = period_us == 0 ? 
NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; ++ ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); ++ ctx->enabled = enabled; ++ ctx->fn = cb; ++ ctx->fn_ctx = cb_ctx; ++ ++ spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); ++ return 0; ++} ++ ++static void ++nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, ++ struct nvme_async_probe_ctx *ctx) ++{ ++ struct nvme_ns *nvme_ns; ++ struct nvme_bdev *nvme_bdev; ++ size_t j; ++ ++ assert(nvme_ctrlr != NULL); ++ ++ if (ctx->names == NULL) { ++ populate_namespaces_cb(ctx, 0, 0); ++ return; ++ } ++ ++ /* ++ * Report the new bdevs that were created in this call. ++ * There can be more than one bdev per NVMe controller. ++ */ ++ j = 0; ++ nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ++ while (nvme_ns != NULL) { ++ nvme_bdev = nvme_ns->bdev; ++ if (j < ctx->count) { ++ ctx->names[j] = nvme_bdev->disk.name; ++ j++; ++ } else { ++ SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", ++ ctx->count); ++ populate_namespaces_cb(ctx, 0, -ERANGE); ++ return; ++ } ++ ++ nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); ++ } ++ ++ populate_namespaces_cb(ctx, j, 0); ++} ++ ++static int ++bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, ++ struct spdk_nvme_ctrlr *new_ctrlr, ++ struct spdk_nvme_transport_id *trid) ++{ ++ struct nvme_path_id *tmp_trid; ++ ++ if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { ++ SPDK_ERRLOG("PCIe failover is not supported.\n"); ++ return -ENOTSUP; ++ } ++ ++ /* Currently we only support failover to the same transport type. */ ++ if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { ++ SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", ++ spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), ++ spdk_nvme_transport_id_trtype_str(trid->trtype)); ++ return -EINVAL; ++ } ++ ++ ++ /* Currently we only support failover to the same NQN. */ ++ if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { ++ SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", ++ nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); ++ return -EINVAL; ++ } ++ ++ /* Skip all the other checks if we've already registered this path. 
*/ ++ TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { ++ if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { ++ SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, ++ trid->subnqn); ++ return -EEXIST; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, ++ struct spdk_nvme_ctrlr *new_ctrlr) ++{ ++ struct nvme_ns *nvme_ns; ++ struct spdk_nvme_ns *new_ns; ++ ++ nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ++ while (nvme_ns != NULL) { ++ new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); ++ assert(new_ns != NULL); ++ ++ if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { ++ return -EINVAL; ++ } ++ ++ nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); ++ } ++ ++ return 0; ++} ++ ++static int ++_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, ++ struct spdk_nvme_transport_id *trid) ++{ ++ struct nvme_path_id *new_trid, *tmp_trid; ++ ++ new_trid = calloc(1, sizeof(*new_trid)); ++ if (new_trid == NULL) { ++ return -ENOMEM; ++ } ++ new_trid->trid = *trid; ++ new_trid->is_failed = false; ++ ++ TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { ++ if (tmp_trid->is_failed && tmp_trid != nvme_ctrlr->active_path_id) { ++ TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); ++ return 0; ++ } ++ } ++ ++ TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); ++ return 0; ++} ++ ++/* This is the case that a secondary path is added to an existing ++ * nvme_ctrlr for failover. After checking if it can access the same ++ * namespaces as the primary path, it is disconnected until failover occurs. ++ */ ++static int ++bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, ++ struct spdk_nvme_ctrlr *new_ctrlr, ++ struct spdk_nvme_transport_id *trid) ++{ ++ int rc; ++ ++ assert(nvme_ctrlr != NULL); ++ ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ ++ rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); ++ if (rc != 0) { ++ goto exit; ++ } ++ ++ rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); ++ if (rc != 0) { ++ goto exit; ++ } ++ ++ rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); ++ ++exit: ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_nvme_detach(new_ctrlr); ++ ++ return rc; ++} ++ ++static void ++connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, ++ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) ++{ ++ struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; ++ struct nvme_async_probe_ctx *ctx; ++ int rc; ++ ++ ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); ++ ctx->ctrlr_attached = true; ++ ++ rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); ++ if (rc != 0) { ++ populate_namespaces_cb(ctx, 0, rc); ++ } ++} ++ ++static void ++connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, ++ struct spdk_nvme_ctrlr *ctrlr, ++ const struct spdk_nvme_ctrlr_opts *opts) ++{ ++ struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct nvme_async_probe_ctx *ctx; ++ int rc; ++ ++ ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); ++ ctx->ctrlr_attached = true; ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); ++ if (nvme_ctrlr) { ++ rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); ++ } else { ++ rc = -ENODEV; ++ } ++ ++ populate_namespaces_cb(ctx, 0, rc); ++} ++ ++static int ++bdev_nvme_async_poll(void *arg) ++{ ++ struct nvme_async_probe_ctx *ctx = 
arg; ++ int rc; ++ ++ rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); ++ if (spdk_unlikely(rc != -EAGAIN)) { ++ ctx->probe_done = true; ++ spdk_poller_unregister(&ctx->poller); ++ if (!ctx->ctrlr_attached) { ++ /* The probe is done, but no controller was attached. ++ * That means we had a failure, so report -EIO back to ++ * the caller (usually the RPC). populate_namespaces_cb() ++ * will take care of freeing the nvme_async_probe_ctx. ++ */ ++ populate_namespaces_cb(ctx, 0, -EIO); ++ } else if (ctx->namespaces_populated) { ++ /* The namespaces for the attached controller were all ++ * populated and the response was already sent to the ++ * caller (usually the RPC). So free the context here. ++ */ ++ free(ctx); ++ } ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static bool ++bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, ++ uint32_t reconnect_delay_sec, ++ uint32_t fast_io_fail_timeout_sec) ++{ ++ if (ctrlr_loss_timeout_sec < -1) { ++ SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); ++ return false; ++ } else if (ctrlr_loss_timeout_sec == -1) { ++ if (reconnect_delay_sec == 0) { ++ SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); ++ return false; ++ } else if (fast_io_fail_timeout_sec != 0 && ++ fast_io_fail_timeout_sec < reconnect_delay_sec) { ++ SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); ++ return false; ++ } ++ } else if (ctrlr_loss_timeout_sec != 0) { ++ if (reconnect_delay_sec == 0) { ++ SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); ++ return false; ++ } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { ++ SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); ++ return false; ++ } else if (fast_io_fail_timeout_sec != 0) { ++ if (fast_io_fail_timeout_sec < reconnect_delay_sec) { ++ SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); ++ return false; ++ } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { ++ SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); ++ return false; ++ } ++ } ++ } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { ++ SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++int ++bdev_nvme_create(struct spdk_nvme_transport_id *trid, ++ const char *base_name, ++ const char **names, ++ uint32_t count, ++ spdk_bdev_create_nvme_fn cb_fn, ++ void *cb_ctx, ++ struct spdk_nvme_ctrlr_opts *drv_opts, ++ struct nvme_ctrlr_opts *bdev_opts, ++ bool multipath) ++{ ++ struct nvme_probe_skip_entry *entry, *tmp; ++ struct nvme_async_probe_ctx *ctx; ++ spdk_nvme_attach_cb attach_cb; ++ ++ /* TODO expand this check to include both the host and target TRIDs. ++ * Only if both are the same should we fail. 
++ */ ++ if (nvme_ctrlr_get(trid) != NULL) { ++ SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); ++ return -EEXIST; ++ } ++ ++ if (bdev_opts != NULL && ++ !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, ++ bdev_opts->reconnect_delay_sec, ++ bdev_opts->fast_io_fail_timeout_sec)) { ++ return -EINVAL; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ return -ENOMEM; ++ } ++ ctx->base_name = base_name; ++ ctx->names = names; ++ ctx->count = count; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_ctx = cb_ctx; ++ ctx->trid = *trid; ++ ++ if (bdev_opts) { ++ memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); ++ } else { ++ bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); ++ } ++ ++ if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { ++ TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { ++ if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { ++ TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); ++ free(entry); ++ break; ++ } ++ } ++ } ++ ++ if (drv_opts) { ++ memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); ++ } else { ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); ++ } ++ ++ ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; ++ ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; ++ ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; ++ ctx->drv_opts.disable_read_ana_log_page = true; ++ ctx->drv_opts.transport_tos = g_opts.transport_tos; ++ ++ if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { ++ attach_cb = connect_attach_cb; ++ } else { ++ attach_cb = connect_set_failover_cb; ++ } ++ ++ ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); ++ if (ctx->probe_ctx == NULL) { ++ SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); ++ free(ctx); ++ return -ENODEV; ++ } ++ ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); ++ ++ return 0; ++} ++ ++int ++bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; ++ struct nvme_path_id *p, *t; ++ int rc = -ENXIO; ++ ++ if (name == NULL || path_id == NULL) { ++ return -EINVAL; ++ } ++ ++ nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); ++ if (nbdev_ctrlr == NULL) { ++ SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); ++ return -ENODEV; ++ } ++ ++ TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { ++ TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { ++ if (path_id->trid.trtype != 0) { ++ if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { ++ if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { ++ continue; ++ } ++ } else { ++ if (path_id->trid.trtype != p->trid.trtype) { ++ continue; ++ } ++ } ++ } ++ ++ if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { ++ if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { ++ continue; ++ } ++ } ++ ++ if (path_id->trid.adrfam != 0) { ++ if (path_id->trid.adrfam != p->trid.adrfam) { ++ continue; ++ } ++ } ++ ++ if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { ++ if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { ++ continue; ++ } ++ } ++ ++ if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { ++ if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { ++ continue; ++ } 
++ } ++ ++ if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { ++ if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { ++ continue; ++ } ++ } ++ ++ if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { ++ if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { ++ continue; ++ } ++ } ++ ++ /* If we made it here, then this path is a match! Now we need to remove it. */ ++ if (p == nvme_ctrlr->active_path_id) { ++ /* This is the active path in use right now. The active path is always the first in the list. */ ++ ++ if (!TAILQ_NEXT(p, link)) { ++ /* The current path is the only path. */ ++ rc = _bdev_nvme_delete(nvme_ctrlr, false); ++ } else { ++ /* There is an alternative path. */ ++ rc = bdev_nvme_failover(nvme_ctrlr, true); ++ } ++ } else { ++ /* We are not using the specified path. */ ++ TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); ++ free(p); ++ rc = 0; ++ } ++ ++ if (rc < 0 && rc != -ENXIO) { ++ return rc; ++ } ++ ++ ++ } ++ } ++ ++ /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ ++ return rc; ++} ++ ++#define DISCOVERY_INFOLOG(ctx, format, ...) \ ++ SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); ++ ++#define DISCOVERY_ERRLOG(ctx, format, ...) \ ++ SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); ++ ++struct discovery_entry_ctx { ++ char name[128]; ++ struct spdk_nvme_transport_id trid; ++ struct spdk_nvme_ctrlr_opts drv_opts; ++ struct spdk_nvmf_discovery_log_page_entry entry; ++ TAILQ_ENTRY(discovery_entry_ctx) tailq; ++ struct discovery_ctx *ctx; ++}; ++ ++struct discovery_ctx { ++ char *name; ++ spdk_bdev_nvme_start_discovery_fn start_cb_fn; ++ spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; ++ void *cb_ctx; ++ struct spdk_nvme_probe_ctx *probe_ctx; ++ struct spdk_nvme_detach_ctx *detach_ctx; ++ struct spdk_nvme_ctrlr *ctrlr; ++ struct spdk_nvme_transport_id trid; ++ struct discovery_entry_ctx *entry_ctx_in_use; ++ struct spdk_poller *poller; ++ struct spdk_nvme_ctrlr_opts drv_opts; ++ struct nvme_ctrlr_opts bdev_opts; ++ struct spdk_nvmf_discovery_log_page *log_page; ++ TAILQ_ENTRY(discovery_ctx) tailq; ++ TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; ++ TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; ++ int rc; ++ bool wait_for_attach; ++ uint64_t timeout_ticks; ++ /* Denotes that the discovery service is being started. We're waiting ++ * for the initial connection to the discovery controller to be ++ * established and attach discovered NVM ctrlrs. ++ */ ++ bool initializing; ++ /* Denotes if a discovery is currently in progress for this context. ++ * That includes connecting to newly discovered subsystems. Used to ++ * ensure we do not start a new discovery until an existing one is ++ * complete. ++ */ ++ bool in_progress; ++ ++ /* Denotes if another discovery is needed after the one in progress ++ * completes. Set when we receive an AER completion while a discovery ++ * is already in progress. ++ */ ++ bool pending; ++ ++ /* Signal to the discovery context poller that it should stop the ++ * discovery service, including detaching from the current discovery ++ * controller. ++ */ ++ bool stop; ++ ++ struct spdk_thread *calling_thread; ++ uint32_t index; ++ uint32_t attach_in_progress; ++ char *hostnqn; ++ ++ /* Denotes if the discovery service was started by the mdns discovery. 
++ */ ++ bool from_mdns_discovery_service; ++}; ++ ++TAILQ_HEAD(discovery_ctxs, discovery_ctx); ++static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); ++ ++static void get_discovery_log_page(struct discovery_ctx *ctx); ++ ++static void ++free_discovery_ctx(struct discovery_ctx *ctx) ++{ ++ free(ctx->log_page); ++ free(ctx->hostnqn); ++ free(ctx->name); ++ free(ctx); ++} ++ ++static void ++discovery_complete(struct discovery_ctx *ctx) ++{ ++ ctx->initializing = false; ++ ctx->in_progress = false; ++ if (ctx->pending) { ++ ctx->pending = false; ++ get_discovery_log_page(ctx); ++ } ++} ++ ++static void ++build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, ++ struct spdk_nvmf_discovery_log_page_entry *entry) ++{ ++ char *space; ++ ++ trid->trtype = entry->trtype; ++ trid->adrfam = entry->adrfam; ++ memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); ++ memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); ++ memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn)); ++ ++ /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. ++ * But the log page entries typically pad them with spaces, not zeroes. ++ * So add a NULL terminator to each of these fields at the appropriate ++ * location. ++ */ ++ space = strchr(trid->traddr, ' '); ++ if (space) { ++ *space = 0; ++ } ++ space = strchr(trid->trsvcid, ' '); ++ if (space) { ++ *space = 0; ++ } ++ space = strchr(trid->subnqn, ' '); ++ if (space) { ++ *space = 0; ++ } ++} ++ ++static void ++stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) ++{ ++ ctx->stop = true; ++ ctx->stop_cb_fn = cb_fn; ++ ctx->cb_ctx = cb_ctx; ++ ++ while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { ++ struct discovery_entry_ctx *entry_ctx; ++ struct nvme_path_id path = {}; ++ ++ entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); ++ path.trid = entry_ctx->trid; ++ bdev_nvme_delete(entry_ctx->name, &path); ++ TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); ++ free(entry_ctx); ++ } ++ ++ while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { ++ struct discovery_entry_ctx *entry_ctx; ++ ++ entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); ++ TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); ++ free(entry_ctx); ++ } ++ ++ free(ctx->entry_ctx_in_use); ++ ctx->entry_ctx_in_use = NULL; ++} ++ ++static void ++discovery_remove_controllers(struct discovery_ctx *ctx) ++{ ++ struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; ++ struct discovery_entry_ctx *entry_ctx, *tmp; ++ struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; ++ struct spdk_nvme_transport_id old_trid; ++ uint64_t numrec, i; ++ bool found; ++ ++ numrec = from_le64(&log_page->numrec); ++ TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { ++ found = false; ++ old_entry = &entry_ctx->entry; ++ build_trid_from_log_page_entry(&old_trid, old_entry); ++ for (i = 0; i < numrec; i++) { ++ new_entry = &log_page->entries[i]; ++ if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { ++ DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", ++ old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); ++ found = true; ++ break; ++ } ++ } ++ if (!found) { ++ struct nvme_path_id path = {}; ++ ++ DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", ++ old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); ++ ++ path.trid = entry_ctx->trid; ++ bdev_nvme_delete(entry_ctx->name, &path); ++ TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); ++ free(entry_ctx); ++ } ++ } ++ 
free(log_page); ++ ctx->log_page = NULL; ++ discovery_complete(ctx); ++} ++ ++static void ++complete_discovery_start(struct discovery_ctx *ctx, int status) ++{ ++ ctx->timeout_ticks = 0; ++ ctx->rc = status; ++ if (ctx->start_cb_fn) { ++ ctx->start_cb_fn(ctx->cb_ctx, status); ++ ctx->start_cb_fn = NULL; ++ ctx->cb_ctx = NULL; ++ } ++} ++ ++static void ++discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) ++{ ++ struct discovery_entry_ctx *entry_ctx = cb_ctx; ++ struct discovery_ctx *ctx = entry_ctx->ctx; ++ ++ DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); ++ ctx->attach_in_progress--; ++ if (ctx->attach_in_progress == 0) { ++ complete_discovery_start(ctx, ctx->rc); ++ if (ctx->initializing && ctx->rc != 0) { ++ DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); ++ stop_discovery(ctx, NULL, ctx->cb_ctx); ++ } else { ++ discovery_remove_controllers(ctx); ++ } ++ } ++} ++ ++static struct discovery_entry_ctx * ++create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) ++{ ++ struct discovery_entry_ctx *new_ctx; ++ ++ new_ctx = calloc(1, sizeof(*new_ctx)); ++ if (new_ctx == NULL) { ++ DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); ++ return NULL; ++ } ++ ++ new_ctx->ctx = ctx; ++ memcpy(&new_ctx->trid, trid, sizeof(*trid)); ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); ++ snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); ++ return new_ctx; ++} ++ ++static void ++discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, ++ struct spdk_nvmf_discovery_log_page *log_page) ++{ ++ struct discovery_ctx *ctx = cb_arg; ++ struct discovery_entry_ctx *entry_ctx, *tmp; ++ struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; ++ uint64_t numrec, i; ++ bool found; ++ ++ if (rc || spdk_nvme_cpl_is_error(cpl)) { ++ DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); ++ return; ++ } ++ ++ ctx->log_page = log_page; ++ assert(ctx->attach_in_progress == 0); ++ numrec = from_le64(&log_page->numrec); ++ TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { ++ TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); ++ free(entry_ctx); ++ } ++ for (i = 0; i < numrec; i++) { ++ found = false; ++ new_entry = &log_page->entries[i]; ++ if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { ++ struct discovery_entry_ctx *new_ctx; ++ struct spdk_nvme_transport_id trid = {}; ++ ++ build_trid_from_log_page_entry(&trid, new_entry); ++ new_ctx = create_discovery_entry_ctx(ctx, &trid); ++ if (new_ctx == NULL) { ++ DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); ++ break; ++ } ++ ++ TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); ++ continue; ++ } ++ TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { ++ old_entry = &entry_ctx->entry; ++ if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { ++ found = true; ++ break; ++ } ++ } ++ if (!found) { ++ struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; ++ struct discovery_ctx *d_ctx; ++ ++ TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { ++ TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { ++ if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, ++ sizeof(new_entry->subnqn))) { ++ break; ++ } ++ } ++ if (subnqn_ctx) { ++ break; ++ } ++ } ++ ++ new_ctx = calloc(1, sizeof(*new_ctx)); ++ if (new_ctx == NULL) { ++ DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); ++ break; 
++ } ++ ++ new_ctx->ctx = ctx; ++ memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); ++ build_trid_from_log_page_entry(&new_ctx->trid, new_entry); ++ if (subnqn_ctx) { ++ snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); ++ DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", ++ new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, ++ new_ctx->name); ++ } else { ++ snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); ++ DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", ++ new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, ++ new_ctx->name); ++ } ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); ++ snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); ++ rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, ++ discovery_attach_controller_done, new_ctx, ++ &new_ctx->drv_opts, &ctx->bdev_opts, true); ++ if (rc == 0) { ++ TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); ++ ctx->attach_in_progress++; ++ } else { ++ DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); ++ } ++ } ++ } ++ ++ if (ctx->attach_in_progress == 0) { ++ discovery_remove_controllers(ctx); ++ } ++} ++ ++static void ++get_discovery_log_page(struct discovery_ctx *ctx) ++{ ++ int rc; ++ ++ assert(ctx->in_progress == false); ++ ctx->in_progress = true; ++ rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); ++ if (rc != 0) { ++ DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); ++ } ++ DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); ++} ++ ++static void ++discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) ++{ ++ struct discovery_ctx *ctx = arg; ++ uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ DISCOVERY_ERRLOG(ctx, "aer failed\n"); ++ return; ++ } ++ ++ if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { ++ DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); ++ return; ++ } ++ ++ DISCOVERY_INFOLOG(ctx, "got aer\n"); ++ if (ctx->in_progress) { ++ ctx->pending = true; ++ return; ++ } ++ ++ get_discovery_log_page(ctx); ++} ++ ++static void ++discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, ++ struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) ++{ ++ struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; ++ struct discovery_ctx *ctx; ++ ++ ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); ++ ++ DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); ++ ctx->probe_ctx = NULL; ++ ctx->ctrlr = ctrlr; ++ ++ if (ctx->rc != 0) { ++ DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", ++ ctx->rc); ++ return; ++ } ++ ++ spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); ++} ++ ++static int ++discovery_poller(void *arg) ++{ ++ struct discovery_ctx *ctx = arg; ++ struct spdk_nvme_transport_id *trid; ++ int rc; ++ ++ if (ctx->detach_ctx) { ++ rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); ++ if (rc != -EAGAIN) { ++ ctx->detach_ctx = NULL; ++ ctx->ctrlr = NULL; ++ } ++ } else if (ctx->stop) { ++ if (ctx->ctrlr != NULL) { ++ rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); ++ if (rc == 0) { ++ return SPDK_POLLER_BUSY; ++ } ++ DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); ++ } ++ spdk_poller_unregister(&ctx->poller); ++ TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); ++ 
assert(ctx->start_cb_fn == NULL); ++ if (ctx->stop_cb_fn != NULL) { ++ ctx->stop_cb_fn(ctx->cb_ctx); ++ } ++ free_discovery_ctx(ctx); ++ } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { ++ if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { ++ DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); ++ assert(ctx->initializing); ++ spdk_poller_unregister(&ctx->poller); ++ TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); ++ complete_discovery_start(ctx, -ETIMEDOUT); ++ stop_discovery(ctx, NULL, NULL); ++ free_discovery_ctx(ctx); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ assert(ctx->entry_ctx_in_use == NULL); ++ ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); ++ TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); ++ trid = &ctx->entry_ctx_in_use->trid; ++ ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); ++ if (ctx->probe_ctx) { ++ spdk_poller_unregister(&ctx->poller); ++ ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); ++ } else { ++ DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); ++ TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); ++ ctx->entry_ctx_in_use = NULL; ++ } ++ } else if (ctx->probe_ctx) { ++ if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { ++ DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); ++ complete_discovery_start(ctx, -ETIMEDOUT); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); ++ if (rc != -EAGAIN) { ++ if (ctx->rc != 0) { ++ assert(ctx->initializing); ++ stop_discovery(ctx, NULL, ctx->cb_ctx); ++ } else { ++ assert(rc == 0); ++ DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); ++ ctx->rc = rc; ++ get_discovery_log_page(ctx); ++ } ++ } ++ } else { ++ if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { ++ DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); ++ complete_discovery_start(ctx, -ETIMEDOUT); ++ /* We need to wait until all NVM ctrlrs are attached before we stop the ++ * discovery service to make sure we don't detach a ctrlr that is still ++ * being attached. 
++ */ ++ if (ctx->attach_in_progress == 0) { ++ stop_discovery(ctx, NULL, ctx->cb_ctx); ++ return SPDK_POLLER_BUSY; ++ } ++ } ++ ++ rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); ++ if (rc < 0) { ++ spdk_poller_unregister(&ctx->poller); ++ ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); ++ TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); ++ ctx->entry_ctx_in_use = NULL; ++ ++ rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); ++ if (rc != 0) { ++ DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); ++ ctx->ctrlr = NULL; ++ } ++ } ++ } ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++start_discovery_poller(void *arg) ++{ ++ struct discovery_ctx *ctx = arg; ++ ++ TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); ++ ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); ++} ++ ++int ++bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, ++ const char *base_name, ++ struct spdk_nvme_ctrlr_opts *drv_opts, ++ struct nvme_ctrlr_opts *bdev_opts, ++ uint64_t attach_timeout, ++ bool from_mdns, ++ spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) ++{ ++ struct discovery_ctx *ctx; ++ struct discovery_entry_ctx *discovery_entry_ctx; ++ ++ snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); ++ TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { ++ if (strcmp(ctx->name, base_name) == 0) { ++ return -EEXIST; ++ } ++ ++ if (ctx->entry_ctx_in_use != NULL) { ++ if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { ++ return -EEXIST; ++ } ++ } ++ ++ TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { ++ if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { ++ return -EEXIST; ++ } ++ } ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ return -ENOMEM; ++ } ++ ++ ctx->name = strdup(base_name); ++ if (ctx->name == NULL) { ++ free_discovery_ctx(ctx); ++ return -ENOMEM; ++ } ++ memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); ++ memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); ++ ctx->from_mdns_discovery_service = from_mdns; ++ ctx->bdev_opts.from_discovery_service = true; ++ ctx->calling_thread = spdk_get_thread(); ++ ctx->start_cb_fn = cb_fn; ++ ctx->cb_ctx = cb_ctx; ++ ctx->initializing = true; ++ if (ctx->start_cb_fn) { ++ /* We can use this when dumping json to denote if this RPC parameter ++ * was specified or not. 
++ */ ++ ctx->wait_for_attach = true; ++ } ++ if (attach_timeout != 0) { ++ ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * ++ spdk_get_ticks_hz() / 1000ull; ++ } ++ TAILQ_INIT(&ctx->nvm_entry_ctxs); ++ TAILQ_INIT(&ctx->discovery_entry_ctxs); ++ memcpy(&ctx->trid, trid, sizeof(*trid)); ++ /* Even if user did not specify hostnqn, we can still strdup("\0"); */ ++ ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); ++ if (ctx->hostnqn == NULL) { ++ free_discovery_ctx(ctx); ++ return -ENOMEM; ++ } ++ discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); ++ if (discovery_entry_ctx == NULL) { ++ DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); ++ free_discovery_ctx(ctx); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); ++ spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); ++ return 0; ++} ++ ++int ++bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) ++{ ++ struct discovery_ctx *ctx; ++ ++ TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { ++ if (strcmp(name, ctx->name) == 0) { ++ if (ctx->stop) { ++ return -EALREADY; ++ } ++ /* If we're still starting the discovery service and ->rc is non-zero, we're ++ * going to stop it as soon as we can ++ */ ++ if (ctx->initializing && ctx->rc != 0) { ++ return -EALREADY; ++ } ++ stop_discovery(ctx, cb_fn, cb_ctx); ++ return 0; ++ } ++ } ++ ++ return -ENOENT; ++} ++ ++static int ++bdev_nvme_library_init(void) ++{ ++ g_bdev_nvme_init_thread = spdk_get_thread(); ++ ++ spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, ++ bdev_nvme_destroy_poll_group_cb, ++ sizeof(struct nvme_poll_group), "nvme_poll_groups"); ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_fini_destruct_ctrlrs(void) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct nvme_ctrlr *nvme_ctrlr; ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { ++ TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { ++ pthread_mutex_lock(&nvme_ctrlr->mutex); ++ if (nvme_ctrlr->destruct) { ++ /* This controller's destruction was already started ++ * before the application started shutting down ++ */ ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ continue; ++ } ++ nvme_ctrlr->destruct = true; ++ pthread_mutex_unlock(&nvme_ctrlr->mutex); ++ ++ spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, ++ nvme_ctrlr); ++ } ++ } ++ ++ g_bdev_nvme_module_finish = true; ++ if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); ++ spdk_bdev_module_fini_done(); ++ return; ++ } ++ ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++} ++ ++static void ++check_discovery_fini(void *arg) ++{ ++ if (TAILQ_EMPTY(&g_discovery_ctxs)) { ++ bdev_nvme_fini_destruct_ctrlrs(); ++ } ++} ++ ++static void ++bdev_nvme_library_fini(void) ++{ ++ struct nvme_probe_skip_entry *entry, *entry_tmp; ++ struct discovery_ctx *ctx; ++ ++ spdk_poller_unregister(&g_hotplug_poller); ++ free(g_hotplug_probe_ctx); ++ g_hotplug_probe_ctx = NULL; ++ ++ TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { ++ TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); ++ free(entry); ++ } ++ ++ assert(spdk_get_thread() == g_bdev_nvme_init_thread); ++ if (TAILQ_EMPTY(&g_discovery_ctxs)) { ++ bdev_nvme_fini_destruct_ctrlrs(); ++ } else { ++ TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { ++ stop_discovery(ctx, check_discovery_fini, NULL); ++ 
} ++ } ++} ++ ++static void ++bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ struct spdk_bdev *bdev = bdev_io->bdev; ++ struct spdk_dif_ctx dif_ctx; ++ struct spdk_dif_error err_blk = {}; ++ int rc; ++ ++ rc = spdk_dif_ctx_init(&dif_ctx, ++ bdev->blocklen, bdev->md_len, bdev->md_interleave, ++ bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, ++ bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); ++ if (rc != 0) { ++ SPDK_ERRLOG("Initialization of DIF context failed\n"); ++ return; ++ } ++ ++ if (bdev->md_interleave) { ++ rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); ++ } else { ++ struct iovec md_iov = { ++ .iov_base = bdev_io->u.bdev.md_buf, ++ .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, ++ }; ++ ++ rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); ++ } ++ ++ if (rc != 0) { ++ SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", ++ err_blk.err_type, err_blk.err_offset); ++ } else { ++ SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); ++ } ++} ++ ++static void ++bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ if (spdk_nvme_cpl_is_success(cpl)) { ++ /* Run PI verification for read data buffer. */ ++ bdev_nvme_verify_pi_error(bio); ++ } ++ ++ /* Return original completion status */ ++ bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); ++} ++ ++static void ++bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ int ret; ++ ++ if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { ++ SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", ++ cpl->status.sct, cpl->status.sc); ++ ++ /* Save completion status to use after verifying PI error. */ ++ bio->cpl = *cpl; ++ ++ if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { ++ /* Read without PI checking to verify PI error. */ ++ ret = bdev_nvme_no_pi_readv(bio, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->u.bdev.offset_blocks); ++ if (ret == 0) { ++ return; ++ } ++ } ++ } ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static void ++bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ if (spdk_nvme_cpl_is_pi_error(cpl)) { ++ SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", ++ cpl->status.sct, cpl->status.sc); ++ /* Run PI verification for write data buffer if PI error is detected. */ ++ bdev_nvme_verify_pi_error(bio); ++ } ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static void ++bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ ++ /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. ++ * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 
++ */ ++ bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; ++ ++ if (spdk_nvme_cpl_is_pi_error(cpl)) { ++ SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", ++ cpl->status.sct, cpl->status.sc); ++ /* Run PI verification for zone append data buffer if PI error is detected. */ ++ bdev_nvme_verify_pi_error(bio); ++ } ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static void ++bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ if (spdk_nvme_cpl_is_pi_error(cpl)) { ++ SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", ++ cpl->status.sct, cpl->status.sc); ++ /* Run PI verification for compare data buffer if PI error is detected. */ ++ bdev_nvme_verify_pi_error(bio); ++ } ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static void ++bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ /* Compare operation completion */ ++ if (!bio->first_fused_completed) { ++ /* Save compare result for write callback */ ++ bio->cpl = *cpl; ++ bio->first_fused_completed = true; ++ return; ++ } ++ ++ /* Write operation completion */ ++ if (spdk_nvme_cpl_is_error(&bio->cpl)) { ++ /* If bio->cpl is already an error, it means the compare operation failed. In that case, ++ * complete the IO with the compare operation's status. ++ */ ++ if (!spdk_nvme_cpl_is_error(cpl)) { ++ SPDK_ERRLOG("Unexpected write success after compare failure.\n"); ++ } ++ ++ bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); ++ } else { ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++ } ++} ++ ++static void ++bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static int ++fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) ++{ ++ switch (desc->zt) { ++ case SPDK_NVME_ZONE_TYPE_SEQWR: ++ info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; ++ break; ++ default: ++ SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); ++ return -EIO; ++ } ++ ++ switch (desc->zs) { ++ case SPDK_NVME_ZONE_STATE_EMPTY: ++ info->state = SPDK_BDEV_ZONE_STATE_EMPTY; ++ break; ++ case SPDK_NVME_ZONE_STATE_IOPEN: ++ info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; ++ break; ++ case SPDK_NVME_ZONE_STATE_EOPEN: ++ info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; ++ break; ++ case SPDK_NVME_ZONE_STATE_CLOSED: ++ info->state = SPDK_BDEV_ZONE_STATE_CLOSED; ++ break; ++ case SPDK_NVME_ZONE_STATE_RONLY: ++ info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; ++ break; ++ case SPDK_NVME_ZONE_STATE_FULL: ++ info->state = SPDK_BDEV_ZONE_STATE_FULL; ++ break; ++ case SPDK_NVME_ZONE_STATE_OFFLINE: ++ info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; ++ break; ++ default: ++ SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); ++ return -EIO; ++ } ++ ++ info->zone_id = desc->zslba; ++ info->write_pointer = desc->wp; ++ info->capacity = desc->zcap; ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; ++ uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; ++ struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; ++ uint64_t max_zones_per_buf, i; ++ uint32_t zone_report_bufsize; ++ struct spdk_nvme_ns *ns; ++ struct 
spdk_nvme_qpair *qpair; ++ int ret; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ goto out_complete_io_nvme_cpl; ++ } ++ ++ if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { ++ ret = -ENXIO; ++ goto out_complete_io_ret; ++ } ++ ++ ns = bio->io_path->nvme_ns->ns; ++ qpair = bio->io_path->qpair->qpair; ++ ++ zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); ++ max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / ++ sizeof(bio->zone_report_buf->descs[0]); ++ ++ if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { ++ ret = -EINVAL; ++ goto out_complete_io_ret; ++ } ++ ++ if (!bio->zone_report_buf->nr_zones) { ++ ret = -EINVAL; ++ goto out_complete_io_ret; ++ } ++ ++ for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { ++ ret = fill_zone_from_report(&info[bio->handled_zones], ++ &bio->zone_report_buf->descs[i]); ++ if (ret) { ++ goto out_complete_io_ret; ++ } ++ bio->handled_zones++; ++ } ++ ++ if (bio->handled_zones < zones_to_copy) { ++ uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); ++ uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); ++ ++ memset(bio->zone_report_buf, 0, zone_report_bufsize); ++ ret = spdk_nvme_zns_report_zones(ns, qpair, ++ bio->zone_report_buf, zone_report_bufsize, ++ slba, SPDK_NVME_ZRA_LIST_ALL, true, ++ bdev_nvme_get_zone_info_done, bio); ++ if (!ret) { ++ return; ++ } else { ++ goto out_complete_io_ret; ++ } ++ } ++ ++out_complete_io_nvme_cpl: ++ free(bio->zone_report_buf); ++ bio->zone_report_buf = NULL; ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++ return; ++ ++out_complete_io_ret: ++ free(bio->zone_report_buf); ++ bio->zone_report_buf = NULL; ++ bdev_nvme_io_complete(bio, ret); ++} ++ ++static void ++bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ bdev_nvme_io_complete_nvme_status(bio, cpl); ++} ++ ++static void ++bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) ++{ ++ struct nvme_bdev_io *bio = ctx; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ const struct spdk_nvme_cpl *cpl = &bio->cpl; ++ ++ assert(bdev_nvme_io_type_is_admin(bdev_io->type)); ++ ++ __bdev_nvme_io_complete(bdev_io, 0, cpl); ++} ++ ++static void ++bdev_nvme_abort_complete(void *ctx) ++{ ++ struct nvme_bdev_io *bio = ctx; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ ++ if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { ++ __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); ++ } else { ++ __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); ++ } ++} ++ ++static void ++bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ bio->cpl = *cpl; ++ spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_complete, bio); ++} ++ ++static void ++bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct nvme_bdev_io *bio = ref; ++ ++ bio->cpl = *cpl; ++ spdk_thread_send_msg(bio->orig_thread, ++ bdev_nvme_admin_passthru_complete_nvme_status, bio); ++} ++ ++static void ++bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct iovec *iov; ++ ++ bio->iov_offset = sgl_offset; ++ for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { ++ iov = &bio->iovs[bio->iovpos]; ++ if (bio->iov_offset < iov->iov_len) { ++ break; ++ } ++ ++ bio->iov_offset -= iov->iov_len; ++ } ++} ++ ++static int 
++bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct iovec *iov; ++ ++ assert(bio->iovpos < bio->iovcnt); ++ ++ iov = &bio->iovs[bio->iovpos]; ++ ++ *address = iov->iov_base; ++ *length = iov->iov_len; ++ ++ if (bio->iov_offset) { ++ assert(bio->iov_offset <= iov->iov_len); ++ *address += bio->iov_offset; ++ *length -= bio->iov_offset; ++ } ++ ++ bio->iov_offset += *length; ++ if (bio->iov_offset == iov->iov_len) { ++ bio->iovpos++; ++ bio->iov_offset = 0; ++ } ++ ++ return 0; ++} ++ ++static void ++bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct iovec *iov; ++ ++ bio->fused_iov_offset = sgl_offset; ++ for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { ++ iov = &bio->fused_iovs[bio->fused_iovpos]; ++ if (bio->fused_iov_offset < iov->iov_len) { ++ break; ++ } ++ ++ bio->fused_iov_offset -= iov->iov_len; ++ } ++} ++ ++static int ++bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) ++{ ++ struct nvme_bdev_io *bio = ref; ++ struct iovec *iov; ++ ++ assert(bio->fused_iovpos < bio->fused_iovcnt); ++ ++ iov = &bio->fused_iovs[bio->fused_iovpos]; ++ ++ *address = iov->iov_base; ++ *length = iov->iov_len; ++ ++ if (bio->fused_iov_offset) { ++ assert(bio->fused_iov_offset <= iov->iov_len); ++ *address += bio->fused_iov_offset; ++ *length -= bio->fused_iov_offset; ++ } ++ ++ bio->fused_iov_offset += *length; ++ if (bio->fused_iov_offset == iov->iov_len) { ++ bio->fused_iovpos++; ++ bio->fused_iov_offset = 0; ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba) ++{ ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", ++ lba_count, lba); ++ ++ bio->iovs = iov; ++ bio->iovcnt = iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ ++ rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, ++ bio->io_path->qpair->qpair, ++ lba, lba_count, ++ bdev_nvme_no_pi_readv_done, bio, 0, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ md, 0, 0); ++ ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); ++ } ++ return rc; ++} ++ ++static int ++bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, ++ struct spdk_bdev_ext_io_opts *ext_opts) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", ++ lba_count, lba); ++ ++ bio->iovs = iov; ++ bio->iovcnt = iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ ++ if (ext_opts) { ++ bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); ++ bio->ext_opts.memory_domain = ext_opts->memory_domain; ++ bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; ++ bio->ext_opts.io_flags = flags; ++ bio->ext_opts.metadata = md; ++ ++ rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, ++ bdev_nvme_readv_done, bio, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ &bio->ext_opts); ++ } else if (iovcnt == 1) { ++ rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, ++ lba_count, ++ bdev_nvme_readv_done, bio, ++ flags, ++ 0, 0); ++ } else { ++ rc = 
spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, ++ bdev_nvme_readv_done, bio, flags, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ md, 0, 0); ++ } ++ ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("readv failed: rc = %d\n", rc); ++ } ++ return rc; ++} ++ ++static int ++bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", ++ lba_count, lba); ++ ++ bio->iovs = iov; ++ bio->iovcnt = iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ ++ if (ext_opts) { ++ bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); ++ bio->ext_opts.memory_domain = ext_opts->memory_domain; ++ bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; ++ bio->ext_opts.io_flags = flags; ++ bio->ext_opts.metadata = md; ++ ++ rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, ++ bdev_nvme_writev_done, bio, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ &bio->ext_opts); ++ } else if (iovcnt == 1) { ++ rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, ++ lba_count, ++ bdev_nvme_writev_done, bio, ++ flags, ++ 0, 0); ++ } else { ++ rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, ++ bdev_nvme_writev_done, bio, flags, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ md, 0, 0); ++ } ++ ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("writev failed: rc = %d\n", rc); ++ } ++ return rc; ++} ++ ++static int ++bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t zslba, ++ uint32_t flags) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", ++ lba_count, zslba); ++ ++ bio->iovs = iov; ++ bio->iovcnt = iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ ++ if (iovcnt == 1) { ++ rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, ++ lba_count, ++ bdev_nvme_zone_appendv_done, bio, ++ flags, ++ 0, 0); ++ } else { ++ rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, ++ bdev_nvme_zone_appendv_done, bio, flags, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ md, 0, 0); ++ } ++ ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("zone append failed: rc = %d\n", rc); ++ } ++ return rc; ++} ++ ++static int ++bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, ++ uint32_t flags) ++{ ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", ++ lba_count, lba); ++ ++ bio->iovs = iov; ++ bio->iovcnt = iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ ++ rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, ++ bio->io_path->qpair->qpair, ++ lba, lba_count, ++ bdev_nvme_comparev_done, bio, flags, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, ++ md, 0, 0); ++ ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("comparev failed: rc = %d\n", rc); ++ } ++ return rc; ++} ++ ++static int ++bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec 
*cmp_iov, int cmp_iovcnt, ++ struct iovec *write_iov, int write_iovcnt, ++ void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ int rc; ++ ++ SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", ++ lba_count, lba); ++ ++ bio->iovs = cmp_iov; ++ bio->iovcnt = cmp_iovcnt; ++ bio->iovpos = 0; ++ bio->iov_offset = 0; ++ bio->fused_iovs = write_iov; ++ bio->fused_iovcnt = write_iovcnt; ++ bio->fused_iovpos = 0; ++ bio->fused_iov_offset = 0; ++ ++ if (bdev_io->num_retries == 0) { ++ bio->first_fused_submitted = false; ++ bio->first_fused_completed = false; ++ } ++ ++ if (!bio->first_fused_submitted) { ++ flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; ++ memset(&bio->cpl, 0, sizeof(bio->cpl)); ++ ++ rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, ++ bdev_nvme_comparev_and_writev_done, bio, flags, ++ bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); ++ if (rc == 0) { ++ bio->first_fused_submitted = true; ++ flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; ++ } else { ++ if (rc != -ENOMEM) { ++ SPDK_ERRLOG("compare failed: rc = %d\n", rc); ++ } ++ return rc; ++ } ++ } ++ ++ flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; ++ ++ rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, ++ bdev_nvme_comparev_and_writev_done, bio, flags, ++ bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); ++ if (rc != 0 && rc != -ENOMEM) { ++ SPDK_ERRLOG("write failed: rc = %d\n", rc); ++ rc = 0; ++ } ++ ++ return rc; ++} ++ ++static int ++bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) ++{ ++ struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; ++ struct spdk_nvme_dsm_range *range; ++ uint64_t offset, remaining; ++ uint64_t num_ranges_u64; ++ uint16_t num_ranges; ++ int rc; ++ ++ num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / ++ SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; ++ if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { ++ SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); ++ return -EINVAL; ++ } ++ num_ranges = (uint16_t)num_ranges_u64; ++ ++ offset = offset_blocks; ++ remaining = num_blocks; ++ range = &dsm_ranges[0]; ++ ++ /* Fill max-size ranges until the remaining blocks fit into one range */ ++ while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { ++ range->attributes.raw = 0; ++ range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; ++ range->starting_lba = offset; ++ ++ offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; ++ remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; ++ range++; ++ } ++ ++ /* Final range describes the remaining blocks */ ++ range->attributes.raw = 0; ++ range->length = remaining; ++ range->starting_lba = offset; ++ ++ rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, ++ bio->io_path->qpair->qpair, ++ SPDK_NVME_DSM_ATTR_DEALLOCATE, ++ dsm_ranges, num_ranges, ++ bdev_nvme_queued_done, bio); ++ ++ return rc; ++} ++ ++static int ++bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) ++{ ++ if (num_blocks > UINT16_MAX + 1) { ++ SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); ++ return -EINVAL; ++ } ++ ++ return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 
++ bio->io_path->qpair->qpair, ++ offset_blocks, num_blocks, ++ bdev_nvme_queued_done, bio, ++ 0); ++} ++ ++static int ++bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, ++ struct spdk_bdev_zone_info *info) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); ++ uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); ++ uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); ++ ++ if (zone_id % zone_size != 0) { ++ return -EINVAL; ++ } ++ ++ if (num_zones > total_zones || !num_zones) { ++ return -EINVAL; ++ } ++ ++ assert(!bio->zone_report_buf); ++ bio->zone_report_buf = calloc(1, zone_report_bufsize); ++ if (!bio->zone_report_buf) { ++ return -ENOMEM; ++ } ++ ++ bio->handled_zones = 0; ++ ++ return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, ++ zone_id, SPDK_NVME_ZRA_LIST_ALL, true, ++ bdev_nvme_get_zone_info_done, bio); ++} ++ ++static int ++bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, ++ enum spdk_bdev_zone_action action) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ ++ switch (action) { ++ case SPDK_BDEV_ZONE_CLOSE: ++ return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, ++ bdev_nvme_zone_management_done, bio); ++ case SPDK_BDEV_ZONE_FINISH: ++ return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, ++ bdev_nvme_zone_management_done, bio); ++ case SPDK_BDEV_ZONE_OPEN: ++ return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, ++ bdev_nvme_zone_management_done, bio); ++ case SPDK_BDEV_ZONE_RESET: ++ return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, ++ bdev_nvme_zone_management_done, bio); ++ case SPDK_BDEV_ZONE_OFFLINE: ++ return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, ++ bdev_nvme_zone_management_done, bio); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ++bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, ++ struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) ++{ ++ struct nvme_io_path *io_path; ++ struct nvme_ctrlr *nvme_ctrlr; ++ uint32_t max_xfer_size; ++ int rc = -ENXIO; ++ ++ /* Choose the first ctrlr which is not failed. */ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ nvme_ctrlr = io_path->qpair->ctrlr; ++ ++ /* We should skip any unavailable nvme_ctrlr rather than checking ++ * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
++ */ ++ if (!nvme_ctrlr_is_available(nvme_ctrlr)) { ++ continue; ++ } ++ ++ max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); ++ ++ if (nbytes > max_xfer_size) { ++ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); ++ rc = -EINVAL; ++ goto err; ++ } ++ ++ bio->io_path = io_path; ++ bio->orig_thread = spdk_get_thread(); ++ ++ rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, ++ bdev_nvme_admin_passthru_done, bio); ++ if (rc == 0) { ++ return; ++ } ++ } ++ ++err: ++ bdev_nvme_admin_passthru_complete(bio, rc); ++} ++ ++static int ++bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, ++ void *buf, size_t nbytes) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); ++ struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); ++ ++ if (nbytes > max_xfer_size) { ++ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); ++ return -EINVAL; ++ } ++ ++ /* ++ * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, ++ * so fill it out automatically. ++ */ ++ cmd->nsid = spdk_nvme_ns_get_id(ns); ++ ++ return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, ++ (uint32_t)nbytes, bdev_nvme_queued_done, bio); ++} ++ ++static int ++bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, ++ void *buf, size_t nbytes, void *md_buf, size_t md_len) ++{ ++ struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; ++ struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; ++ size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); ++ uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); ++ struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); ++ ++ if (nbytes > max_xfer_size) { ++ SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); ++ return -EINVAL; ++ } ++ ++ if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { ++ SPDK_ERRLOG("invalid meta data buffer size\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, ++ * so fill it out automatically. ++ */ ++ cmd->nsid = spdk_nvme_ns_get_id(ns); ++ ++ return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, ++ (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); ++} ++ ++static void ++bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, ++ struct nvme_bdev_io *bio_to_abort) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); ++ struct nvme_io_path *io_path; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int rc = 0; ++ ++ bio->orig_thread = spdk_get_thread(); ++ ++ rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); ++ if (rc == 0) { ++ __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); ++ return; ++ } ++ ++ rc = 0; ++ ++ /* Even admin commands, they were submitted to only nvme_ctrlrs which were ++ * on any io_path. So traverse the io_path list for not only I/O commands ++ * but also admin commands. ++ */ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ nvme_ctrlr = io_path->qpair->ctrlr; ++ ++ rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, ++ io_path->qpair->qpair, ++ bio_to_abort, ++ bdev_nvme_abort_done, bio); ++ if (rc == -ENOENT) { ++ /* If no command was found in I/O qpair, the target command may be ++ * admin command. 
++ */ ++ rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ctrlr->ctrlr, ++ NULL, ++ bio_to_abort, ++ bdev_nvme_abort_done, bio); ++ } ++ ++ if (rc != -ENOENT) { ++ break; ++ } ++ } ++ ++ if (rc != 0) { ++ /* If no command was found or there was any error, complete the abort ++ * request with failure. ++ */ ++ __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); ++ } ++} ++ ++static int ++bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, ++ uint64_t num_blocks) ++{ ++ struct spdk_nvme_scc_source_range range = { ++ .slba = src_offset_blocks, ++ .nlb = num_blocks - 1 ++ }; ++ ++ return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, ++ bio->io_path->qpair->qpair, ++ &range, 1, dst_offset_blocks, ++ bdev_nvme_queued_done, bio); ++} ++ ++static void ++bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) ++{ ++ const char *action; ++ ++ if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { ++ action = "reset"; ++ } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { ++ action = "abort"; ++ } else { ++ action = "none"; ++ } ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "action_on_timeout", action); ++ spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); ++ spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); ++ spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); ++ spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); ++ spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); ++ spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); ++ spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); ++ spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); ++ spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); ++ spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); ++ spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); ++ spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); ++ spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); ++ spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); ++ spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); ++ spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); ++ spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); ++ spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); ++ spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); ++ spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) ++{ ++ struct spdk_nvme_transport_id trid; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", ctx->name); ++ 
spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); ++ ++ trid = ctx->trid; ++ memset(trid.subnqn, 0, sizeof(trid.subnqn)); ++ nvme_bdev_dump_trid_json(&trid, w); ++ ++ spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); ++ spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); ++ spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); ++ spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", ++ ctx->bdev_opts.fast_io_fail_timeout_sec); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, ++ struct nvme_ctrlr *nvme_ctrlr) ++{ ++ struct spdk_nvme_transport_id *trid; ++ const struct spdk_nvme_ctrlr_opts *opts; ++ ++ if (nvme_ctrlr->opts.from_discovery_service) { ++ /* Do not emit an RPC for this - it will be implicitly ++ * covered by a separate bdev_nvme_start_discovery or ++ * bdev_nvme_start_mdns_discovery RPC. ++ */ ++ return; ++ } ++ ++ trid = &nvme_ctrlr->active_path_id->trid; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); ++ nvme_bdev_dump_trid_json(trid, w); ++ spdk_json_write_named_bool(w, "prchk_reftag", ++ (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); ++ spdk_json_write_named_bool(w, "prchk_guard", ++ (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); ++ spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); ++ spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); ++ spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", ++ nvme_ctrlr->opts.fast_io_fail_timeout_sec); ++ ++ opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); ++ spdk_json_write_named_bool(w, "hdgst", opts->header_digest); ++ spdk_json_write_named_bool(w, "ddgst", opts->data_digest); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); ++ spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static int ++bdev_nvme_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct discovery_ctx *ctx; ++ ++ bdev_nvme_opts_config_json(w); ++ ++ pthread_mutex_lock(&g_bdev_nvme_mutex); ++ ++ TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { ++ TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { ++ nvme_ctrlr_config_json(w, nvme_ctrlr); ++ } ++ } ++ ++ TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { ++ if (!ctx->from_mdns_discovery_service) { ++ bdev_nvme_discovery_config_json(w, ctx); ++ } ++ } ++ ++ bdev_nvme_mdns_discovery_config_json(w); ++ ++ /* Dump as last parameter to give all NVMe bdevs chance to be constructed ++ * before enabling hotplug poller. 
++ */ ++ bdev_nvme_hotplug_config_json(w); ++ ++ pthread_mutex_unlock(&g_bdev_nvme_mutex); ++ return 0; ++} ++ ++struct spdk_nvme_ctrlr * ++bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) ++{ ++ struct nvme_bdev *nbdev; ++ struct nvme_ns *nvme_ns; ++ ++ if (!bdev || bdev->module != &nvme_if) { ++ return NULL; ++ } ++ ++ nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); ++ nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); ++ assert(nvme_ns != NULL); ++ ++ return nvme_ns->ctrlr->ctrlr; ++} ++ ++void ++nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) ++{ ++ struct nvme_ns *nvme_ns = io_path->nvme_ns; ++ struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; ++ const struct spdk_nvme_ctrlr_data *cdata; ++ const struct spdk_nvme_transport_id *trid; ++ const char *adrfam_str; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); ++ ++ cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); ++ trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); ++ ++ spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); ++ spdk_json_write_named_bool(w, "current", io_path == io_path->nbdev_ch->current_io_path); ++ spdk_json_write_named_bool(w, "connected", nvme_io_path_is_connected(io_path)); ++ spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); ++ ++ spdk_json_write_named_object_begin(w, "transport"); ++ spdk_json_write_named_string(w, "trtype", trid->trstring); ++ spdk_json_write_named_string(w, "traddr", trid->traddr); ++ if (trid->trsvcid[0] != '\0') { ++ spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); ++ } ++ adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); ++ if (adrfam_str) { ++ spdk_json_write_named_string(w, "adrfam", adrfam_str); ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++void ++bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) ++{ ++ struct discovery_ctx *ctx; ++ struct discovery_entry_ctx *entry_ctx; ++ ++ spdk_json_write_array_begin(w); ++ TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", ctx->name); ++ ++ spdk_json_write_named_object_begin(w, "trid"); ++ nvme_bdev_dump_trid_json(&ctx->trid, w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_array_begin(w, "referrals"); ++ TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_object_begin(w, "trid"); ++ nvme_bdev_dump_trid_json(&entry_ctx->trid, w); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ } ++ spdk_json_write_array_end(w); ++ ++ spdk_json_write_object_end(w); ++ } ++ spdk_json_write_array_end(w); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) ++ ++SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) ++{ ++ struct spdk_trace_tpoint_opts opts[] = { ++ { ++ "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, ++ OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, ++ {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} ++ }, ++ { ++ "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, ++ OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, ++ {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} ++ } ++ }; ++ ++ ++ spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); ++ spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); ++ spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); ++ spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); ++ 
spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); ++ spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); ++} +diff --git a/module/bdev/nvme/bdev_nvme.h b/module/bdev/nvme/bdev_nvme.h +index 3540b23..adb2112 100644 +--- a/module/bdev/nvme/bdev_nvme.h ++++ b/module/bdev/nvme/bdev_nvme.h +@@ -1,380 +1,380 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. +- */ +- +-#ifndef SPDK_BDEV_NVME_H +-#define SPDK_BDEV_NVME_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/queue.h" +-#include "spdk/nvme.h" +-#include "spdk/bdev_module.h" +-#include "spdk/jsonrpc.h" +- +-TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr); +-extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs; +-extern pthread_mutex_t g_bdev_nvme_mutex; +-extern bool g_bdev_nvme_module_finish; +-extern struct spdk_thread *g_bdev_nvme_init_thread; +- +-#define NVME_MAX_CONTROLLERS 1024 +- +-enum bdev_nvme_multipath_policy { +- BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE, +- BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE, +-}; +- +-enum bdev_nvme_multipath_selector { +- BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1, +- BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH, +-}; +- +-typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc); +-typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status); +-typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx); +- +-struct nvme_ctrlr_opts { +- uint32_t prchk_flags; +- int32_t ctrlr_loss_timeout_sec; +- uint32_t reconnect_delay_sec; +- uint32_t fast_io_fail_timeout_sec; +- bool from_discovery_service; +-}; +- +-struct nvme_async_probe_ctx { +- struct spdk_nvme_probe_ctx *probe_ctx; +- const char *base_name; +- const char **names; +- uint32_t count; +- struct spdk_poller *poller; +- struct spdk_nvme_transport_id trid; +- struct nvme_ctrlr_opts bdev_opts; +- struct spdk_nvme_ctrlr_opts drv_opts; +- spdk_bdev_create_nvme_fn cb_fn; +- void *cb_ctx; +- uint32_t populates_in_progress; +- bool ctrlr_attached; +- bool probe_done; +- bool namespaces_populated; +-}; +- +-struct nvme_ns { +- uint32_t id; +- struct spdk_nvme_ns *ns; +- struct nvme_ctrlr *ctrlr; +- struct nvme_bdev *bdev; +- uint32_t ana_group_id; +- enum spdk_nvme_ana_state ana_state; +- bool ana_state_updating; +- bool ana_transition_timedout; +- struct spdk_poller *anatt_timer; +- struct nvme_async_probe_ctx *probe_ctx; +- TAILQ_ENTRY(nvme_ns) tailq; +- RB_ENTRY(nvme_ns) node; +- +- /** +- * record io path stat before destroyed. 
Allocation of stat is +- * decided by option io_path_stat of RPC +- * bdev_nvme_set_options +- */ +- struct spdk_bdev_io_stat *stat; +-}; +- +-struct nvme_bdev_io; +-struct nvme_bdev_ctrlr; +-struct nvme_bdev; +-struct nvme_io_path; +- +-struct nvme_path_id { +- struct spdk_nvme_transport_id trid; +- struct spdk_nvme_host_id hostid; +- TAILQ_ENTRY(nvme_path_id) link; +- bool is_failed; +-}; +- +-typedef void (*bdev_nvme_reset_cb)(void *cb_arg, bool success); +-typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr); +- +-struct nvme_ctrlr { +- /** +- * points to pinned, physically contiguous memory region; +- * contains 4KB IDENTIFY structure for controller which is +- * target for CONTROLLER IDENTIFY command during initialization +- */ +- struct spdk_nvme_ctrlr *ctrlr; +- struct nvme_path_id *active_path_id; +- int ref; +- +- uint32_t resetting : 1; +- uint32_t reconnect_is_delayed : 1; +- uint32_t fast_io_fail_timedout : 1; +- uint32_t destruct : 1; +- uint32_t ana_log_page_updating : 1; +- uint32_t io_path_cache_clearing : 1; +- +- struct nvme_ctrlr_opts opts; +- +- RB_HEAD(nvme_ns_tree, nvme_ns) namespaces; +- +- struct spdk_opal_dev *opal_dev; +- +- struct spdk_poller *adminq_timer_poller; +- struct spdk_thread *thread; +- +- bdev_nvme_reset_cb reset_cb_fn; +- void *reset_cb_arg; +- /* Poller used to check for reset/detach completion */ +- struct spdk_poller *reset_detach_poller; +- struct spdk_nvme_detach_ctx *detach_ctx; +- +- uint64_t reset_start_tsc; +- struct spdk_poller *reconnect_delay_timer; +- +- nvme_ctrlr_disconnected_cb disconnected_cb; +- +- /** linked list pointer for device list */ +- TAILQ_ENTRY(nvme_ctrlr) tailq; +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- +- TAILQ_HEAD(nvme_paths, nvme_path_id) trids; +- +- uint32_t max_ana_log_page_size; +- struct spdk_nvme_ana_page *ana_log_page; +- struct spdk_nvme_ana_group_descriptor *copied_ana_desc; +- +- struct nvme_async_probe_ctx *probe_ctx; +- +- pthread_mutex_t mutex; +-}; +- +-struct nvme_bdev_ctrlr { +- char *name; +- TAILQ_HEAD(, nvme_ctrlr) ctrlrs; +- TAILQ_HEAD(, nvme_bdev) bdevs; +- TAILQ_ENTRY(nvme_bdev_ctrlr) tailq; +-}; +- +-struct nvme_error_stat { +- uint32_t status_type[8]; +- uint32_t status[4][256]; +-}; +- +-struct nvme_bdev { +- struct spdk_bdev disk; +- uint32_t nsid; +- struct nvme_bdev_ctrlr *nbdev_ctrlr; +- pthread_mutex_t mutex; +- int ref; +- enum bdev_nvme_multipath_policy mp_policy; +- enum bdev_nvme_multipath_selector mp_selector; +- uint32_t rr_min_io; +- TAILQ_HEAD(, nvme_ns) nvme_ns_list; +- bool opal; +- TAILQ_ENTRY(nvme_bdev) tailq; +- struct nvme_error_stat *err_stat; +-}; +- +-struct nvme_qpair { +- struct nvme_ctrlr *ctrlr; +- struct spdk_nvme_qpair *qpair; +- struct nvme_poll_group *group; +- struct nvme_ctrlr_channel *ctrlr_ch; +- +- /* The following is used to update io_path cache of nvme_bdev_channels. */ +- TAILQ_HEAD(, nvme_io_path) io_path_list; +- +- TAILQ_ENTRY(nvme_qpair) tailq; +-}; +- +-struct nvme_ctrlr_channel { +- struct nvme_qpair *qpair; +- TAILQ_HEAD(, spdk_bdev_io) pending_resets; +- +- struct spdk_io_channel_iter *reset_iter; +-}; +- +-struct nvme_io_path { +- struct nvme_ns *nvme_ns; +- struct nvme_qpair *qpair; +- STAILQ_ENTRY(nvme_io_path) stailq; +- +- /* The following are used to update io_path cache of the nvme_bdev_channel. 
*/ +- struct nvme_bdev_channel *nbdev_ch; +- TAILQ_ENTRY(nvme_io_path) tailq; +- +- /* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */ +- struct spdk_bdev_io_stat *stat; +-}; +- +-struct nvme_bdev_channel { +- struct nvme_io_path *current_io_path; +- enum bdev_nvme_multipath_policy mp_policy; +- enum bdev_nvme_multipath_selector mp_selector; +- uint32_t rr_min_io; +- uint32_t rr_counter; +- STAILQ_HEAD(, nvme_io_path) io_path_list; +- TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list; +- struct spdk_poller *retry_io_poller; +-}; +- +-struct nvme_poll_group { +- struct spdk_nvme_poll_group *group; +- struct spdk_io_channel *accel_channel; +- struct spdk_poller *poller; +- bool collect_spin_stat; +- uint64_t spin_ticks; +- uint64_t start_ticks; +- uint64_t end_ticks; +- TAILQ_HEAD(, nvme_qpair) qpair_list; +-}; +- +-void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path); +- +-struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name); +- +-struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name); +- +-typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx); +- +-void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx); +- +-void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, +- struct spdk_json_write_ctx *w); +- +-void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr); +- +-struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid); +-struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr); +-struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns); +- +-enum spdk_bdev_timeout_action { +- SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0, +- SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET, +- SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT, +-}; +- +-struct spdk_bdev_nvme_opts { +- enum spdk_bdev_timeout_action action_on_timeout; +- uint64_t timeout_us; +- uint64_t timeout_admin_us; +- uint32_t keep_alive_timeout_ms; +- /* The number of attempts per I/O in the transport layer before an I/O fails. */ +- uint32_t transport_retry_count; +- uint32_t arbitration_burst; +- uint32_t low_priority_weight; +- uint32_t medium_priority_weight; +- uint32_t high_priority_weight; +- uint64_t nvme_adminq_poll_period_us; +- uint64_t nvme_ioq_poll_period_us; +- uint32_t io_queue_requests; +- bool delay_cmd_submit; +- /* The number of attempts per I/O in the bdev layer before an I/O fails. 
*/ +- int32_t bdev_retry_count; +- uint8_t transport_ack_timeout; +- int32_t ctrlr_loss_timeout_sec; +- uint32_t reconnect_delay_sec; +- uint32_t fast_io_fail_timeout_sec; +- bool disable_auto_failback; +- bool generate_uuids; +- /* Type of Service - RDMA only */ +- uint8_t transport_tos; +- bool nvme_error_stat; +- uint32_t rdma_srq_size; +- bool io_path_stat; +-}; +- +-struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); +-void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); +-int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); +-int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); +- +-void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts); +- +-int bdev_nvme_create(struct spdk_nvme_transport_id *trid, +- const char *base_name, +- const char **names, +- uint32_t count, +- spdk_bdev_create_nvme_fn cb_fn, +- void *cb_ctx, +- struct spdk_nvme_ctrlr_opts *drv_opts, +- struct nvme_ctrlr_opts *bdev_opts, +- bool multipath); +- +-int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name, +- struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts, +- uint64_t timeout, bool from_mdns, +- spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx); +-int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, +- void *cb_ctx); +-void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w); +- +-int bdev_nvme_start_mdns_discovery(const char *base_name, +- const char *svcname, +- struct spdk_nvme_ctrlr_opts *drv_opts, +- struct nvme_ctrlr_opts *bdev_opts); +-int bdev_nvme_stop_mdns_discovery(const char *name); +-void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request); +-void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w); +- +-struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev); +- +-/** +- * Delete NVMe controller with all bdevs on top of it, or delete the specified path +- * if there is any alternative path. Requires to pass name of NVMe controller. +- * +- * \param name NVMe controller name +- * \param path_id The specified path to remove (optional) +- * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found +- */ +-int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id); +- +-/** +- * Reset NVMe controller. +- * +- * \param nvme_ctrlr The specified NVMe controller to reset +- * \param cb_fn Function to be called back after reset completes +- * \param cb_arg Argument for callback function +- * \return zero on success. Negated errno on the following error conditions: +- * -ENXIO: controller is being destroyed. +- * -EBUSY: controller is already being reset. +- */ +-int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg); +- +-typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc); +- +-/** +- * Set the preferred I/O path for an NVMe bdev in multipath mode. +- * +- * NOTE: This function does not support NVMe bdevs in failover mode. +- * +- * \param name NVMe bdev name +- * \param cntlid NVMe-oF controller ID +- * \param cb_fn Function to be called back after completion. +- * \param cb_arg Argument for callback function. 
+- */ +-void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, +- bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg); +- +-typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc); +- +-/** +- * Set multipath policy of the NVMe bdev. +- * +- * \param name NVMe bdev name +- * \param policy Multipath policy (active-passive or active-active) +- * \param selector Multipath selector (round_robin, queue_depth) +- * \param rr_min_io Number of IO to route to a path before switching to another for round-robin +- * \param cb_fn Function to be called back after completion. +- */ +-void bdev_nvme_set_multipath_policy(const char *name, +- enum bdev_nvme_multipath_policy policy, +- enum bdev_nvme_multipath_selector selector, +- uint32_t rr_min_io, +- bdev_nvme_set_multipath_policy_cb cb_fn, +- void *cb_arg); +- +-#endif /* SPDK_BDEV_NVME_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_NVME_H ++#define SPDK_BDEV_NVME_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/queue.h" ++#include "spdk/nvme.h" ++#include "spdk/bdev_module.h" ++#include "spdk/jsonrpc.h" ++ ++TAILQ_HEAD(nvme_bdev_ctrlrs, nvme_bdev_ctrlr); ++extern struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs; ++extern pthread_mutex_t g_bdev_nvme_mutex; ++extern bool g_bdev_nvme_module_finish; ++extern struct spdk_thread *g_bdev_nvme_init_thread; ++ ++#define NVME_MAX_CONTROLLERS 1024 ++ ++enum bdev_nvme_multipath_policy { ++ BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE, ++ BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE, ++}; ++ ++enum bdev_nvme_multipath_selector { ++ BDEV_NVME_MP_SELECTOR_ROUND_ROBIN = 1, ++ BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH, ++}; ++ ++typedef void (*spdk_bdev_create_nvme_fn)(void *ctx, size_t bdev_count, int rc); ++typedef void (*spdk_bdev_nvme_start_discovery_fn)(void *ctx, int status); ++typedef void (*spdk_bdev_nvme_stop_discovery_fn)(void *ctx); ++ ++struct nvme_ctrlr_opts { ++ uint32_t prchk_flags; ++ int32_t ctrlr_loss_timeout_sec; ++ uint32_t reconnect_delay_sec; ++ uint32_t fast_io_fail_timeout_sec; ++ bool from_discovery_service; ++}; ++ ++struct nvme_async_probe_ctx { ++ struct spdk_nvme_probe_ctx *probe_ctx; ++ const char *base_name; ++ const char **names; ++ uint32_t count; ++ struct spdk_poller *poller; ++ struct spdk_nvme_transport_id trid; ++ struct nvme_ctrlr_opts bdev_opts; ++ struct spdk_nvme_ctrlr_opts drv_opts; ++ spdk_bdev_create_nvme_fn cb_fn; ++ void *cb_ctx; ++ uint32_t populates_in_progress; ++ bool ctrlr_attached; ++ bool probe_done; ++ bool namespaces_populated; ++}; ++ ++struct nvme_ns { ++ uint32_t id; ++ struct spdk_nvme_ns *ns; ++ struct nvme_ctrlr *ctrlr; ++ struct nvme_bdev *bdev; ++ uint32_t ana_group_id; ++ enum spdk_nvme_ana_state ana_state; ++ bool ana_state_updating; ++ bool ana_transition_timedout; ++ struct spdk_poller *anatt_timer; ++ struct nvme_async_probe_ctx *probe_ctx; ++ TAILQ_ENTRY(nvme_ns) tailq; ++ RB_ENTRY(nvme_ns) node; ++ ++ /** ++ * record io path stat before destroyed. 
Allocation of stat is ++ * decided by option io_path_stat of RPC ++ * bdev_nvme_set_options ++ */ ++ struct spdk_bdev_io_stat *stat; ++}; ++ ++struct nvme_bdev_io; ++struct nvme_bdev_ctrlr; ++struct nvme_bdev; ++struct nvme_io_path; ++ ++struct nvme_path_id { ++ struct spdk_nvme_transport_id trid; ++ struct spdk_nvme_host_id hostid; ++ TAILQ_ENTRY(nvme_path_id) link; ++ bool is_failed; ++}; ++ ++typedef void (*bdev_nvme_reset_cb)(void *cb_arg, bool success); ++typedef void (*nvme_ctrlr_disconnected_cb)(struct nvme_ctrlr *nvme_ctrlr); ++ ++struct nvme_ctrlr { ++ /** ++ * points to pinned, physically contiguous memory region; ++ * contains 4KB IDENTIFY structure for controller which is ++ * target for CONTROLLER IDENTIFY command during initialization ++ */ ++ struct spdk_nvme_ctrlr *ctrlr; ++ struct nvme_path_id *active_path_id; ++ int ref; ++ ++ uint32_t resetting : 1; ++ uint32_t reconnect_is_delayed : 1; ++ uint32_t fast_io_fail_timedout : 1; ++ uint32_t destruct : 1; ++ uint32_t ana_log_page_updating : 1; ++ uint32_t io_path_cache_clearing : 1; ++ ++ struct nvme_ctrlr_opts opts; ++ ++ RB_HEAD(nvme_ns_tree, nvme_ns) namespaces; ++ ++ struct spdk_opal_dev *opal_dev; ++ ++ struct spdk_poller *adminq_timer_poller; ++ struct spdk_thread *thread; ++ ++ bdev_nvme_reset_cb reset_cb_fn; ++ void *reset_cb_arg; ++ /* Poller used to check for reset/detach completion */ ++ struct spdk_poller *reset_detach_poller; ++ struct spdk_nvme_detach_ctx *detach_ctx; ++ ++ uint64_t reset_start_tsc; ++ struct spdk_poller *reconnect_delay_timer; ++ ++ nvme_ctrlr_disconnected_cb disconnected_cb; ++ ++ /** linked list pointer for device list */ ++ TAILQ_ENTRY(nvme_ctrlr) tailq; ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ ++ TAILQ_HEAD(nvme_paths, nvme_path_id) trids; ++ ++ uint32_t max_ana_log_page_size; ++ struct spdk_nvme_ana_page *ana_log_page; ++ struct spdk_nvme_ana_group_descriptor *copied_ana_desc; ++ ++ struct nvme_async_probe_ctx *probe_ctx; ++ ++ pthread_mutex_t mutex; ++}; ++ ++struct nvme_bdev_ctrlr { ++ char *name; ++ TAILQ_HEAD(, nvme_ctrlr) ctrlrs; ++ TAILQ_HEAD(, nvme_bdev) bdevs; ++ TAILQ_ENTRY(nvme_bdev_ctrlr) tailq; ++}; ++ ++struct nvme_error_stat { ++ uint32_t status_type[8]; ++ uint32_t status[4][256]; ++}; ++ ++struct nvme_bdev { ++ struct spdk_bdev disk; ++ uint32_t nsid; ++ struct nvme_bdev_ctrlr *nbdev_ctrlr; ++ pthread_mutex_t mutex; ++ int ref; ++ enum bdev_nvme_multipath_policy mp_policy; ++ enum bdev_nvme_multipath_selector mp_selector; ++ uint32_t rr_min_io; ++ TAILQ_HEAD(, nvme_ns) nvme_ns_list; ++ bool opal; ++ TAILQ_ENTRY(nvme_bdev) tailq; ++ struct nvme_error_stat *err_stat; ++}; ++ ++struct nvme_qpair { ++ struct nvme_ctrlr *ctrlr; ++ struct spdk_nvme_qpair *qpair; ++ struct nvme_poll_group *group; ++ struct nvme_ctrlr_channel *ctrlr_ch; ++ ++ /* The following is used to update io_path cache of nvme_bdev_channels. */ ++ TAILQ_HEAD(, nvme_io_path) io_path_list; ++ ++ TAILQ_ENTRY(nvme_qpair) tailq; ++}; ++ ++struct nvme_ctrlr_channel { ++ struct nvme_qpair *qpair; ++ TAILQ_HEAD(, spdk_bdev_io) pending_resets; ++ ++ struct spdk_io_channel_iter *reset_iter; ++}; ++ ++struct nvme_io_path { ++ struct nvme_ns *nvme_ns; ++ struct nvme_qpair *qpair; ++ STAILQ_ENTRY(nvme_io_path) stailq; ++ ++ /* The following are used to update io_path cache of the nvme_bdev_channel. 
*/ ++ struct nvme_bdev_channel *nbdev_ch; ++ TAILQ_ENTRY(nvme_io_path) tailq; ++ ++ /* allocation of stat is decided by option io_path_stat of RPC bdev_nvme_set_options */ ++ struct spdk_bdev_io_stat *stat; ++}; ++ ++struct nvme_bdev_channel { ++ struct nvme_io_path *current_io_path; ++ enum bdev_nvme_multipath_policy mp_policy; ++ enum bdev_nvme_multipath_selector mp_selector; ++ uint32_t rr_min_io; ++ uint32_t rr_counter; ++ STAILQ_HEAD(, nvme_io_path) io_path_list; ++ TAILQ_HEAD(retry_io_head, spdk_bdev_io) retry_io_list; ++ struct spdk_poller *retry_io_poller; ++}; ++ ++struct nvme_poll_group { ++ struct spdk_nvme_poll_group *group; ++ struct spdk_io_channel *accel_channel; ++ struct spdk_poller *poller; ++ bool collect_spin_stat; ++ uint64_t spin_ticks; ++ uint64_t start_ticks; ++ uint64_t end_ticks; ++ TAILQ_HEAD(, nvme_qpair) qpair_list; ++}; ++ ++void nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path); ++ ++struct nvme_ctrlr *nvme_ctrlr_get_by_name(const char *name); ++ ++struct nvme_bdev_ctrlr *nvme_bdev_ctrlr_get_by_name(const char *name); ++ ++typedef void (*nvme_bdev_ctrlr_for_each_fn)(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx); ++ ++void nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx); ++ ++void nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, ++ struct spdk_json_write_ctx *w); ++ ++void nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr); ++ ++struct nvme_ns *nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid); ++struct nvme_ns *nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr); ++struct nvme_ns *nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns); ++ ++enum spdk_bdev_timeout_action { ++ SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0, ++ SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET, ++ SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT, ++}; ++ ++struct spdk_bdev_nvme_opts { ++ enum spdk_bdev_timeout_action action_on_timeout; ++ uint64_t timeout_us; ++ uint64_t timeout_admin_us; ++ uint32_t keep_alive_timeout_ms; ++ /* The number of attempts per I/O in the transport layer before an I/O fails. */ ++ uint32_t transport_retry_count; ++ uint32_t arbitration_burst; ++ uint32_t low_priority_weight; ++ uint32_t medium_priority_weight; ++ uint32_t high_priority_weight; ++ uint64_t nvme_adminq_poll_period_us; ++ uint64_t nvme_ioq_poll_period_us; ++ uint32_t io_queue_requests; ++ bool delay_cmd_submit; ++ /* The number of attempts per I/O in the bdev layer before an I/O fails. 
*/ ++ int32_t bdev_retry_count; ++ uint8_t transport_ack_timeout; ++ int32_t ctrlr_loss_timeout_sec; ++ uint32_t reconnect_delay_sec; ++ uint32_t fast_io_fail_timeout_sec; ++ bool disable_auto_failback; ++ bool generate_uuids; ++ /* Type of Service - RDMA only */ ++ uint8_t transport_tos; ++ bool nvme_error_stat; ++ uint32_t rdma_srq_size; ++ bool io_path_stat; ++}; ++ ++struct spdk_nvme_qpair *bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); ++void bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); ++int bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); ++int bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx); ++ ++void bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts); ++ ++int bdev_nvme_create(struct spdk_nvme_transport_id *trid, ++ const char *base_name, ++ const char **names, ++ uint32_t count, ++ spdk_bdev_create_nvme_fn cb_fn, ++ void *cb_ctx, ++ struct spdk_nvme_ctrlr_opts *drv_opts, ++ struct nvme_ctrlr_opts *bdev_opts, ++ bool multipath); ++ ++int bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, const char *base_name, ++ struct spdk_nvme_ctrlr_opts *drv_opts, struct nvme_ctrlr_opts *bdev_opts, ++ uint64_t timeout, bool from_mdns, ++ spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx); ++int bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, ++ void *cb_ctx); ++void bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w); ++ ++int bdev_nvme_start_mdns_discovery(const char *base_name, ++ const char *svcname, ++ struct spdk_nvme_ctrlr_opts *drv_opts, ++ struct nvme_ctrlr_opts *bdev_opts); ++int bdev_nvme_stop_mdns_discovery(const char *name); ++void bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request); ++void bdev_nvme_mdns_discovery_config_json(struct spdk_json_write_ctx *w); ++ ++struct spdk_nvme_ctrlr *bdev_nvme_get_ctrlr(struct spdk_bdev *bdev); ++ ++/** ++ * Delete NVMe controller with all bdevs on top of it, or delete the specified path ++ * if there is any alternative path. Requires to pass name of NVMe controller. ++ * ++ * \param name NVMe controller name ++ * \param path_id The specified path to remove (optional) ++ * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found ++ */ ++int bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id); ++ ++/** ++ * Reset NVMe controller. ++ * ++ * \param nvme_ctrlr The specified NVMe controller to reset ++ * \param cb_fn Function to be called back after reset completes ++ * \param cb_arg Argument for callback function ++ * \return zero on success. Negated errno on the following error conditions: ++ * -ENXIO: controller is being destroyed. ++ * -EBUSY: controller is already being reset. ++ */ ++int bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg); ++ ++typedef void (*bdev_nvme_set_preferred_path_cb)(void *cb_arg, int rc); ++ ++/** ++ * Set the preferred I/O path for an NVMe bdev in multipath mode. ++ * ++ * NOTE: This function does not support NVMe bdevs in failover mode. ++ * ++ * \param name NVMe bdev name ++ * \param cntlid NVMe-oF controller ID ++ * \param cb_fn Function to be called back after completion. ++ * \param cb_arg Argument for callback function. 
++ */ ++void bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, ++ bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg); ++ ++typedef void (*bdev_nvme_set_multipath_policy_cb)(void *cb_arg, int rc); ++ ++/** ++ * Set multipath policy of the NVMe bdev. ++ * ++ * \param name NVMe bdev name ++ * \param policy Multipath policy (active-passive or active-active) ++ * \param selector Multipath selector (round_robin, queue_depth) ++ * \param rr_min_io Number of IO to route to a path before switching to another for round-robin ++ * \param cb_fn Function to be called back after completion. ++ */ ++void bdev_nvme_set_multipath_policy(const char *name, ++ enum bdev_nvme_multipath_policy policy, ++ enum bdev_nvme_multipath_selector selector, ++ uint32_t rr_min_io, ++ bdev_nvme_set_multipath_policy_cb cb_fn, ++ void *cb_arg); ++ ++#endif /* SPDK_BDEV_NVME_H */ +diff --git a/module/bdev/nvme/bdev_nvme_cuse_rpc.c b/module/bdev/nvme/bdev_nvme_cuse_rpc.c +index 0ceb47c..90a71ab 100644 +--- a/module/bdev/nvme/bdev_nvme_cuse_rpc.c ++++ b/module/bdev/nvme/bdev_nvme_cuse_rpc.c +@@ -1,118 +1,118 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "bdev_nvme.h" +- +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/nvme.h" +- +-#include "spdk/log.h" +- +-struct rpc_nvme_cuse_register { +- char *name; +-}; +- +-static void +-free_rpc_nvme_cuse_register(struct rpc_nvme_cuse_register *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_nvme_cuse_register_decoders[] = { +- {"name", offsetof(struct rpc_nvme_cuse_register, name), spdk_json_decode_string}, +-}; +- +-static void +-rpc_nvme_cuse_register(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_nvme_cuse_register req = {}; +- struct nvme_ctrlr *bdev_ctrlr = NULL; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_nvme_cuse_register_decoders, +- SPDK_COUNTOF(rpc_nvme_cuse_register_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev_ctrlr = nvme_ctrlr_get_by_name(req.name); +- if (!bdev_ctrlr) { +- SPDK_ERRLOG("No such controller\n"); +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- rc = spdk_nvme_cuse_register(bdev_ctrlr->ctrlr); +- if (rc) { +- SPDK_ERRLOG("Failed to register CUSE devices: %s\n", spdk_strerror(-rc)); +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_nvme_cuse_register(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_cuse_register", rpc_nvme_cuse_register, SPDK_RPC_RUNTIME) +- +-struct rpc_nvme_cuse_unregister { +- char *name; +-}; +- +-static void +-free_rpc_nvme_cuse_unregister(struct rpc_nvme_cuse_unregister *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_nvme_cuse_unregister_decoders[] = { +- {"name", offsetof(struct rpc_nvme_cuse_unregister, name), spdk_json_decode_string, true}, +-}; +- +-static void +-rpc_nvme_cuse_unregister(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_nvme_cuse_unregister req = {}; +- struct nvme_ctrlr *bdev_ctrlr = NULL; +- int rc; +- +- if 
(spdk_json_decode_object(params, rpc_nvme_cuse_unregister_decoders, +- SPDK_COUNTOF(rpc_nvme_cuse_unregister_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev_ctrlr = nvme_ctrlr_get_by_name(req.name); +- if (!bdev_ctrlr) { +- SPDK_ERRLOG("No such controller\n"); +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- rc = spdk_nvme_cuse_unregister(bdev_ctrlr->ctrlr); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_nvme_cuse_unregister(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_cuse_unregister", rpc_nvme_cuse_unregister, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "bdev_nvme.h" ++ ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/nvme.h" ++ ++#include "spdk/log.h" ++ ++struct rpc_nvme_cuse_register { ++ char *name; ++}; ++ ++static void ++free_rpc_nvme_cuse_register(struct rpc_nvme_cuse_register *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_nvme_cuse_register_decoders[] = { ++ {"name", offsetof(struct rpc_nvme_cuse_register, name), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_nvme_cuse_register(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_nvme_cuse_register req = {}; ++ struct nvme_ctrlr *bdev_ctrlr = NULL; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_nvme_cuse_register_decoders, ++ SPDK_COUNTOF(rpc_nvme_cuse_register_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev_ctrlr = nvme_ctrlr_get_by_name(req.name); ++ if (!bdev_ctrlr) { ++ SPDK_ERRLOG("No such controller\n"); ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ rc = spdk_nvme_cuse_register(bdev_ctrlr->ctrlr); ++ if (rc) { ++ SPDK_ERRLOG("Failed to register CUSE devices: %s\n", spdk_strerror(-rc)); ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_nvme_cuse_register(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_cuse_register", rpc_nvme_cuse_register, SPDK_RPC_RUNTIME) ++ ++struct rpc_nvme_cuse_unregister { ++ char *name; ++}; ++ ++static void ++free_rpc_nvme_cuse_unregister(struct rpc_nvme_cuse_unregister *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_nvme_cuse_unregister_decoders[] = { ++ {"name", offsetof(struct rpc_nvme_cuse_unregister, name), spdk_json_decode_string, true}, ++}; ++ ++static void ++rpc_nvme_cuse_unregister(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_nvme_cuse_unregister req = {}; ++ struct nvme_ctrlr *bdev_ctrlr = NULL; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_nvme_cuse_unregister_decoders, ++ SPDK_COUNTOF(rpc_nvme_cuse_unregister_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ 
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev_ctrlr = nvme_ctrlr_get_by_name(req.name); ++ if (!bdev_ctrlr) { ++ SPDK_ERRLOG("No such controller\n"); ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ rc = spdk_nvme_cuse_unregister(bdev_ctrlr->ctrlr); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_nvme_cuse_unregister(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_cuse_unregister", rpc_nvme_cuse_unregister, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/nvme/bdev_nvme_rpc.c b/module/bdev/nvme/bdev_nvme_rpc.c +index 1d32c3d..b15f20c 100644 +--- a/module/bdev/nvme/bdev_nvme_rpc.c ++++ b/module/bdev/nvme/bdev_nvme_rpc.c +@@ -1,2656 +1,2656 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "bdev_nvme.h" +- +-#include "spdk/config.h" +- +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/env.h" +-#include "spdk/nvme.h" +-#include "spdk/nvme_spec.h" +- +-#include "spdk/log.h" +-#include "spdk/bdev_module.h" +- +-struct open_descriptors { +- void *desc; +- struct spdk_bdev *bdev; +- TAILQ_ENTRY(open_descriptors) tqlst; +- struct spdk_thread *thread; +-}; +-typedef TAILQ_HEAD(, open_descriptors) open_descriptors_t; +- +-static int +-rpc_decode_action_on_timeout(const struct spdk_json_val *val, void *out) +-{ +- enum spdk_bdev_timeout_action *action = out; +- +- if (spdk_json_strequal(val, "none") == true) { +- *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE; +- } else if (spdk_json_strequal(val, "abort") == true) { +- *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; +- } else if (spdk_json_strequal(val, "reset") == true) { +- *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: action_on_timeout\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = { +- {"action_on_timeout", offsetof(struct spdk_bdev_nvme_opts, action_on_timeout), rpc_decode_action_on_timeout, true}, +- {"timeout_us", offsetof(struct spdk_bdev_nvme_opts, timeout_us), spdk_json_decode_uint64, true}, +- {"timeout_admin_us", offsetof(struct spdk_bdev_nvme_opts, timeout_admin_us), spdk_json_decode_uint64, true}, +- {"keep_alive_timeout_ms", offsetof(struct spdk_bdev_nvme_opts, keep_alive_timeout_ms), spdk_json_decode_uint32, true}, +- {"retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true}, +- {"arbitration_burst", offsetof(struct spdk_bdev_nvme_opts, arbitration_burst), spdk_json_decode_uint32, true}, +- {"low_priority_weight", offsetof(struct spdk_bdev_nvme_opts, low_priority_weight), spdk_json_decode_uint32, true}, +- {"medium_priority_weight", offsetof(struct spdk_bdev_nvme_opts, medium_priority_weight), spdk_json_decode_uint32, true}, +- {"high_priority_weight", offsetof(struct spdk_bdev_nvme_opts, high_priority_weight), spdk_json_decode_uint32, true}, +- {"nvme_adminq_poll_period_us", 
offsetof(struct spdk_bdev_nvme_opts, nvme_adminq_poll_period_us), spdk_json_decode_uint64, true}, +- {"nvme_ioq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_ioq_poll_period_us), spdk_json_decode_uint64, true}, +- {"io_queue_requests", offsetof(struct spdk_bdev_nvme_opts, io_queue_requests), spdk_json_decode_uint32, true}, +- {"delay_cmd_submit", offsetof(struct spdk_bdev_nvme_opts, delay_cmd_submit), spdk_json_decode_bool, true}, +- {"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true}, +- {"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true}, +- {"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true}, +- {"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, +- {"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true}, +- {"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, +- {"disable_auto_failback", offsetof(struct spdk_bdev_nvme_opts, disable_auto_failback), spdk_json_decode_bool, true}, +- {"generate_uuids", offsetof(struct spdk_bdev_nvme_opts, generate_uuids), spdk_json_decode_bool, true}, +- {"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true}, +- {"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true}, +- {"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true}, +- {"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true}, +-}; +- +-static void +-rpc_bdev_nvme_set_options(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct spdk_bdev_nvme_opts opts; +- int rc; +- +- bdev_nvme_get_opts(&opts); +- if (params && spdk_json_decode_object(params, rpc_bdev_nvme_options_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_options_decoders), +- &opts)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- return; +- } +- +- rc = bdev_nvme_set_opts(&opts); +- if (rc == -EPERM) { +- spdk_jsonrpc_send_error_response(request, -EPERM, +- "RPC not permitted with nvme controllers already attached"); +- } else if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- } else { +- spdk_jsonrpc_send_bool_response(request, true); +- } +- +- return; +-} +-SPDK_RPC_REGISTER("bdev_nvme_set_options", rpc_bdev_nvme_set_options, +- SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_hotplug { +- bool enabled; +- uint64_t period_us; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_hotplug_decoders[] = { +- {"enable", offsetof(struct rpc_bdev_nvme_hotplug, enabled), spdk_json_decode_bool, false}, +- {"period_us", offsetof(struct rpc_bdev_nvme_hotplug, period_us), spdk_json_decode_uint64, true}, +-}; +- +-static void +-rpc_bdev_nvme_set_hotplug_done(void *ctx) +-{ +- struct spdk_jsonrpc_request *request = ctx; +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +- +-static void +-rpc_bdev_nvme_set_hotplug(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_hotplug req = {false, 0}; +- int rc; +- +- 
if (spdk_json_decode_object(params, rpc_bdev_nvme_hotplug_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_hotplug_decoders), &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = bdev_nvme_set_hotplug(req.enabled, req.period_us, rpc_bdev_nvme_set_hotplug_done, +- request); +- if (rc) { +- goto invalid; +- } +- +- return; +-invalid: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("bdev_nvme_set_hotplug", rpc_bdev_nvme_set_hotplug, SPDK_RPC_RUNTIME) +- +-enum bdev_nvme_multipath_mode { +- BDEV_NVME_MP_MODE_FAILOVER, +- BDEV_NVME_MP_MODE_MULTIPATH, +- BDEV_NVME_MP_MODE_DISABLE, +-}; +- +-struct rpc_bdev_nvme_attach_controller { +- char *name; +- char *trtype; +- char *adrfam; +- char *traddr; +- char *trsvcid; +- char *priority; +- char *subnqn; +- char *hostnqn; +- char *hostaddr; +- char *hostsvcid; +- char *psk; +- enum bdev_nvme_multipath_mode multipath; +- struct nvme_ctrlr_opts bdev_opts; +- struct spdk_nvme_ctrlr_opts drv_opts; +-}; +- +-static void +-free_rpc_bdev_nvme_attach_controller(struct rpc_bdev_nvme_attach_controller *req) +-{ +- free(req->name); +- free(req->trtype); +- free(req->adrfam); +- free(req->traddr); +- free(req->trsvcid); +- free(req->priority); +- free(req->subnqn); +- free(req->hostnqn); +- free(req->hostaddr); +- free(req->hostsvcid); +- free(req->psk); +-} +- +-static int +-bdev_nvme_decode_reftag(const struct spdk_json_val *val, void *out) +-{ +- uint32_t *flag = out; +- bool reftag; +- int rc; +- +- rc = spdk_json_decode_bool(val, &reftag); +- if (rc == 0 && reftag == true) { +- *flag |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; +- } +- +- return rc; +-} +- +-static int +-bdev_nvme_decode_guard(const struct spdk_json_val *val, void *out) +-{ +- uint32_t *flag = out; +- bool guard; +- int rc; +- +- rc = spdk_json_decode_bool(val, &guard); +- if (rc == 0 && guard == true) { +- *flag |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; +- } +- +- return rc; +-} +- +-static int +-bdev_nvme_decode_multipath(const struct spdk_json_val *val, void *out) +-{ +- enum bdev_nvme_multipath_mode *multipath = out; +- +- if (spdk_json_strequal(val, "failover") == true) { +- *multipath = BDEV_NVME_MP_MODE_FAILOVER; +- } else if (spdk_json_strequal(val, "multipath") == true) { +- *multipath = BDEV_NVME_MP_MODE_MULTIPATH; +- } else if (spdk_json_strequal(val, "disable") == true) { +- *multipath = BDEV_NVME_MP_MODE_DISABLE; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: multipath\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_attach_controller_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_attach_controller, name), spdk_json_decode_string}, +- {"trtype", offsetof(struct rpc_bdev_nvme_attach_controller, trtype), spdk_json_decode_string}, +- {"traddr", offsetof(struct rpc_bdev_nvme_attach_controller, traddr), spdk_json_decode_string}, +- +- {"adrfam", offsetof(struct rpc_bdev_nvme_attach_controller, adrfam), spdk_json_decode_string, true}, +- {"trsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, trsvcid), spdk_json_decode_string, true}, +- {"priority", offsetof(struct rpc_bdev_nvme_attach_controller, priority), spdk_json_decode_string, true}, +- {"subnqn", offsetof(struct rpc_bdev_nvme_attach_controller, subnqn), spdk_json_decode_string, true}, +- {"hostnqn", offsetof(struct rpc_bdev_nvme_attach_controller, hostnqn), spdk_json_decode_string, true}, +- {"hostaddr", offsetof(struct 
rpc_bdev_nvme_attach_controller, hostaddr), spdk_json_decode_string, true}, +- {"hostsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, hostsvcid), spdk_json_decode_string, true}, +- +- {"prchk_reftag", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.prchk_flags), bdev_nvme_decode_reftag, true}, +- {"prchk_guard", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.prchk_flags), bdev_nvme_decode_guard, true}, +- {"hdgst", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.header_digest), spdk_json_decode_bool, true}, +- {"ddgst", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.data_digest), spdk_json_decode_bool, true}, +- {"fabrics_connect_timeout_us", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.fabrics_connect_timeout_us), spdk_json_decode_uint64, true}, +- {"multipath", offsetof(struct rpc_bdev_nvme_attach_controller, multipath), bdev_nvme_decode_multipath, true}, +- {"num_io_queues", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.num_io_queues), spdk_json_decode_uint32, true}, +- {"ctrlr_loss_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, +- {"reconnect_delay_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.reconnect_delay_sec), spdk_json_decode_uint32, true}, +- {"fast_io_fail_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, +- {"psk", offsetof(struct rpc_bdev_nvme_attach_controller, psk), spdk_json_decode_string, true}, +-}; +- +-#define NVME_MAX_BDEVS_PER_RPC 128 +- +-struct rpc_bdev_nvme_attach_controller_ctx { +- struct rpc_bdev_nvme_attach_controller req; +- uint32_t count; +- size_t bdev_count; +- const char *names[NVME_MAX_BDEVS_PER_RPC]; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_attach_controller_examined(void *cb_ctx) +-{ +- struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx; +- struct spdk_jsonrpc_request *request = ctx->request; +- struct spdk_json_write_ctx *w; +- size_t i; +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- for (i = 0; i < ctx->bdev_count; i++) { +- spdk_json_write_string(w, ctx->names[i]); +- } +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- +- free_rpc_bdev_nvme_attach_controller(&ctx->req); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) +-{ +- struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx; +- struct spdk_jsonrpc_request *request = ctx->request; +- +- if (rc < 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- free_rpc_bdev_nvme_attach_controller(&ctx->req); +- free(ctx); +- return; +- } +- +- ctx->bdev_count = bdev_count; +- spdk_bdev_wait_for_examine(rpc_bdev_nvme_attach_controller_examined, ctx); +-} +- +-static void +-rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_attach_controller_ctx *ctx; +- struct spdk_nvme_transport_id trid = {}; +- const struct spdk_nvme_ctrlr_opts *drv_opts; +- const struct spdk_nvme_transport_id *ctrlr_trid; +- struct nvme_ctrlr *ctrlr = NULL; +- size_t len, maxlen; +- bool multipath = false; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- 
spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts)); +- bdev_nvme_get_default_ctrlr_opts(&ctx->req.bdev_opts); +- /* For now, initialize the multipath parameter to add a failover path. This maintains backward +- * compatibility with past behavior. In the future, this behavior will change to "disable". */ +- ctx->req.multipath = BDEV_NVME_MP_MODE_FAILOVER; +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- /* Parse trstring */ +- rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", +- ctx->req.trtype); +- goto cleanup; +- } +- +- /* Parse trtype */ +- rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); +- assert(rc == 0); +- +- /* Parse traddr */ +- maxlen = sizeof(trid.traddr); +- len = strnlen(ctx->req.traddr, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", +- ctx->req.traddr); +- goto cleanup; +- } +- memcpy(trid.traddr, ctx->req.traddr, len + 1); +- +- /* Parse adrfam */ +- if (ctx->req.adrfam) { +- rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", +- ctx->req.adrfam); +- goto cleanup; +- } +- } +- +- /* Parse trsvcid */ +- if (ctx->req.trsvcid) { +- maxlen = sizeof(trid.trsvcid); +- len = strnlen(ctx->req.trsvcid, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", +- ctx->req.trsvcid); +- goto cleanup; +- } +- memcpy(trid.trsvcid, ctx->req.trsvcid, len + 1); +- } +- +- /* Parse priority for the NVMe-oF transport connection */ +- if (ctx->req.priority) { +- trid.priority = spdk_strtol(ctx->req.priority, 10); +- } +- +- /* Parse subnqn */ +- if (ctx->req.subnqn) { +- maxlen = sizeof(trid.subnqn); +- len = strnlen(ctx->req.subnqn, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", +- ctx->req.subnqn); +- goto cleanup; +- } +- memcpy(trid.subnqn, ctx->req.subnqn, len + 1); +- } +- +- if (ctx->req.hostnqn) { +- snprintf(ctx->req.drv_opts.hostnqn, sizeof(ctx->req.drv_opts.hostnqn), "%s", +- ctx->req.hostnqn); +- } +- +- if (ctx->req.psk) { +- snprintf(ctx->req.drv_opts.psk, sizeof(ctx->req.drv_opts.psk), "%s", +- ctx->req.psk); +- } +- +- if (ctx->req.hostaddr) { +- maxlen = sizeof(ctx->req.drv_opts.src_addr); +- len = strnlen(ctx->req.hostaddr, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostaddr too long: %s", +- ctx->req.hostaddr); +- goto cleanup; +- } +- snprintf(ctx->req.drv_opts.src_addr, maxlen, "%s", ctx->req.hostaddr); +- } +- +- if (ctx->req.hostsvcid) { +- maxlen = sizeof(ctx->req.drv_opts.src_svcid); +- len = strnlen(ctx->req.hostsvcid, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostsvcid too long: %s", +- ctx->req.hostsvcid); +- goto cleanup; +- } +- 
snprintf(ctx->req.drv_opts.src_svcid, maxlen, "%s", ctx->req.hostsvcid); +- } +- +- ctrlr = nvme_ctrlr_get_by_name(ctx->req.name); +- +- if (ctrlr) { +- /* This controller already exists. Check what the user wants to do. */ +- if (ctx->req.multipath == BDEV_NVME_MP_MODE_DISABLE) { +- /* The user does not want to do any form of multipathing. */ +- spdk_jsonrpc_send_error_response_fmt(request, -EALREADY, +- "A controller named %s already exists and multipath is disabled\n", +- ctx->req.name); +- goto cleanup; +- } +- +- assert(ctx->req.multipath == BDEV_NVME_MP_MODE_FAILOVER || +- ctx->req.multipath == BDEV_NVME_MP_MODE_MULTIPATH); +- +- /* The user wants to add this as a failover path or add this to create multipath. */ +- drv_opts = spdk_nvme_ctrlr_get_opts(ctrlr->ctrlr); +- ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr->ctrlr); +- +- if (strncmp(trid.traddr, ctrlr_trid->traddr, sizeof(trid.traddr)) == 0 && +- strncmp(trid.trsvcid, ctrlr_trid->trsvcid, sizeof(trid.trsvcid)) == 0 && +- strncmp(ctx->req.drv_opts.src_addr, drv_opts->src_addr, sizeof(drv_opts->src_addr)) == 0 && +- strncmp(ctx->req.drv_opts.src_svcid, drv_opts->src_svcid, sizeof(drv_opts->src_svcid)) == 0) { +- /* Exactly same network path can't be added a second time */ +- spdk_jsonrpc_send_error_response_fmt(request, -EALREADY, +- "A controller named %s already exists with the specified network path\n", +- ctx->req.name); +- goto cleanup; +- } +- +- if (strncmp(trid.subnqn, +- ctrlr_trid->subnqn, +- SPDK_NVMF_NQN_MAX_LEN) != 0) { +- /* Different SUBNQN is not allowed when specifying the same controller name. */ +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, +- "A controller named %s already exists, but uses a different subnqn (%s)\n", +- ctx->req.name, ctrlr_trid->subnqn); +- goto cleanup; +- } +- +- if (strncmp(ctx->req.drv_opts.hostnqn, drv_opts->hostnqn, SPDK_NVMF_NQN_MAX_LEN) != 0) { +- /* Different HOSTNQN is not allowed when specifying the same controller name. */ +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, +- "A controller named %s already exists, but uses a different hostnqn (%s)\n", +- ctx->req.name, drv_opts->hostnqn); +- goto cleanup; +- } +- +- if (ctx->req.bdev_opts.prchk_flags) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, +- "A controller named %s already exists. To add a path, do not specify PI options.\n", +- ctx->req.name); +- goto cleanup; +- } +- +- ctx->req.bdev_opts.prchk_flags = ctrlr->opts.prchk_flags; +- } +- +- if (ctx->req.multipath == BDEV_NVME_MP_MODE_MULTIPATH) { +- multipath = true; +- } +- +- if (ctx->req.drv_opts.num_io_queues == 0 || ctx->req.drv_opts.num_io_queues > UINT16_MAX + 1) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, +- "num_io_queues out of bounds, min: %u max: %u\n", +- 1, UINT16_MAX + 1); +- goto cleanup; +- } +- +- ctx->request = request; +- ctx->count = NVME_MAX_BDEVS_PER_RPC; +- /* Should already be zero due to the calloc(), but set explicitly for clarity. 
*/ +- ctx->req.bdev_opts.from_discovery_service = false; +- rc = bdev_nvme_create(&trid, ctx->req.name, ctx->names, ctx->count, +- rpc_bdev_nvme_attach_controller_done, ctx, &ctx->req.drv_opts, +- &ctx->req.bdev_opts, multipath); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- return; +- +-cleanup: +- free_rpc_bdev_nvme_attach_controller(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_attach_controller", rpc_bdev_nvme_attach_controller, +- SPDK_RPC_RUNTIME) +- +-static void +-rpc_dump_nvme_bdev_controller_info(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx) +-{ +- struct spdk_json_write_ctx *w = ctx; +- struct nvme_ctrlr *nvme_ctrlr; +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", nbdev_ctrlr->name); +- +- spdk_json_write_named_array_begin(w, "ctrlrs"); +- TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { +- nvme_ctrlr_info_json(w, nvme_ctrlr); +- } +- spdk_json_write_array_end(w); +- spdk_json_write_object_end(w); +-} +- +-struct rpc_bdev_nvme_get_controllers { +- char *name; +-}; +- +-static void +-free_rpc_bdev_nvme_get_controllers(struct rpc_bdev_nvme_get_controllers *r) +-{ +- free(r->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_get_controllers_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_get_controllers, name), spdk_json_decode_string, true}, +-}; +- +-static void +-rpc_bdev_nvme_get_controllers(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_get_controllers req = {}; +- struct spdk_json_write_ctx *w; +- struct nvme_bdev_ctrlr *nbdev_ctrlr = NULL; +- +- if (params && spdk_json_decode_object(params, rpc_bdev_nvme_get_controllers_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_get_controllers_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- if (req.name) { +- nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name); +- if (nbdev_ctrlr == NULL) { +- SPDK_ERRLOG("ctrlr '%s' does not exist\n", req.name); +- spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Controller %s does not exist", req.name); +- goto cleanup; +- } +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- +- if (nbdev_ctrlr != NULL) { +- rpc_dump_nvme_bdev_controller_info(nbdev_ctrlr, w); +- } else { +- nvme_bdev_ctrlr_for_each(rpc_dump_nvme_bdev_controller_info, w); +- } +- +- spdk_json_write_array_end(w); +- +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_bdev_nvme_get_controllers(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_controllers", rpc_bdev_nvme_get_controllers, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_detach_controller { +- char *name; +- char *trtype; +- char *adrfam; +- char *traddr; +- char *trsvcid; +- char *subnqn; +- char *hostaddr; +- char *hostsvcid; +-}; +- +-static void +-free_rpc_bdev_nvme_detach_controller(struct rpc_bdev_nvme_detach_controller *req) +-{ +- free(req->name); +- free(req->trtype); +- free(req->adrfam); +- free(req->traddr); +- free(req->trsvcid); +- free(req->subnqn); +- free(req->hostaddr); +- free(req->hostsvcid); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_detach_controller_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_detach_controller, name), spdk_json_decode_string}, +- {"trtype", offsetof(struct 
rpc_bdev_nvme_detach_controller, trtype), spdk_json_decode_string, true}, +- {"traddr", offsetof(struct rpc_bdev_nvme_detach_controller, traddr), spdk_json_decode_string, true}, +- {"adrfam", offsetof(struct rpc_bdev_nvme_detach_controller, adrfam), spdk_json_decode_string, true}, +- {"trsvcid", offsetof(struct rpc_bdev_nvme_detach_controller, trsvcid), spdk_json_decode_string, true}, +- {"subnqn", offsetof(struct rpc_bdev_nvme_detach_controller, subnqn), spdk_json_decode_string, true}, +- {"hostaddr", offsetof(struct rpc_bdev_nvme_detach_controller, hostaddr), spdk_json_decode_string, true}, +- {"hostsvcid", offsetof(struct rpc_bdev_nvme_detach_controller, hostsvcid), spdk_json_decode_string, true}, +-}; +- +-static void +-rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_detach_controller req = {NULL}; +- struct nvme_path_id path = {}; +- size_t len, maxlen; +- int rc = 0; +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_detach_controller_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_detach_controller_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- if (req.trtype != NULL) { +- rc = spdk_nvme_transport_id_populate_trstring(&path.trid, req.trtype); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", +- req.trtype); +- goto cleanup; +- } +- +- rc = spdk_nvme_transport_id_parse_trtype(&path.trid.trtype, req.trtype); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", +- req.trtype); +- goto cleanup; +- } +- } +- +- if (req.traddr != NULL) { +- maxlen = sizeof(path.trid.traddr); +- len = strnlen(req.traddr, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", +- req.traddr); +- goto cleanup; +- } +- memcpy(path.trid.traddr, req.traddr, len + 1); +- } +- +- if (req.adrfam != NULL) { +- rc = spdk_nvme_transport_id_parse_adrfam(&path.trid.adrfam, req.adrfam); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse adrfam: %s\n", req.adrfam); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", +- req.adrfam); +- goto cleanup; +- } +- } +- +- if (req.trsvcid != NULL) { +- maxlen = sizeof(path.trid.trsvcid); +- len = strnlen(req.trsvcid, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", +- req.trsvcid); +- goto cleanup; +- } +- memcpy(path.trid.trsvcid, req.trsvcid, len + 1); +- } +- +- /* Parse subnqn */ +- if (req.subnqn != NULL) { +- maxlen = sizeof(path.trid.subnqn); +- len = strnlen(req.subnqn, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", +- req.subnqn); +- goto cleanup; +- } +- memcpy(path.trid.subnqn, req.subnqn, len + 1); +- } +- +- if (req.hostaddr) { +- maxlen = sizeof(path.hostid.hostaddr); +- len = strnlen(req.hostaddr, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostaddr too long: %s", +- req.hostaddr); +- goto cleanup; +- } +- snprintf(path.hostid.hostaddr, maxlen, "%s", req.hostaddr); +- } +- +- if (req.hostsvcid) { +- maxlen = sizeof(path.hostid.hostsvcid); +- len = strnlen(req.hostsvcid, 
maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostsvcid too long: %s", +- req.hostsvcid); +- goto cleanup; +- } +- snprintf(path.hostid.hostsvcid, maxlen, "%s", req.hostsvcid); +- } +- +- rc = bdev_nvme_delete(req.name, &path); +- +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_bdev_nvme_detach_controller(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_detach_controller", rpc_bdev_nvme_detach_controller, +- SPDK_RPC_RUNTIME) +- +-struct rpc_apply_firmware { +- char *filename; +- char *bdev_name; +-}; +- +-static void +-free_rpc_apply_firmware(struct rpc_apply_firmware *req) +-{ +- free(req->filename); +- free(req->bdev_name); +-} +- +-static const struct spdk_json_object_decoder rpc_apply_firmware_decoders[] = { +- {"filename", offsetof(struct rpc_apply_firmware, filename), spdk_json_decode_string}, +- {"bdev_name", offsetof(struct rpc_apply_firmware, bdev_name), spdk_json_decode_string}, +-}; +- +-struct firmware_update_info { +- void *fw_image; +- void *p; +- unsigned int size; +- unsigned int size_remaining; +- unsigned int offset; +- unsigned int transfer; +- +- void *desc; +- struct spdk_io_channel *ch; +- struct spdk_jsonrpc_request *request; +- struct spdk_nvme_ctrlr *ctrlr; +- open_descriptors_t desc_head; +- struct rpc_apply_firmware *req; +-}; +- +-static void +-_apply_firmware_cleanup(void *ctx) +-{ +- struct spdk_bdev_desc *desc = ctx; +- +- spdk_bdev_close(desc); +-} +- +-static void +-apply_firmware_cleanup(void *cb_arg) +-{ +- struct open_descriptors *opt, *tmp; +- struct firmware_update_info *firm_ctx = cb_arg; +- +- if (!firm_ctx) { +- return; +- } +- +- if (firm_ctx->fw_image) { +- spdk_free(firm_ctx->fw_image); +- } +- +- if (firm_ctx->req) { +- free_rpc_apply_firmware(firm_ctx->req); +- free(firm_ctx->req); +- } +- +- if (firm_ctx->ch) { +- spdk_put_io_channel(firm_ctx->ch); +- } +- +- TAILQ_FOREACH_SAFE(opt, &firm_ctx->desc_head, tqlst, tmp) { +- TAILQ_REMOVE(&firm_ctx->desc_head, opt, tqlst); +- /* Close the underlying bdev on its same opened thread. */ +- if (opt->thread && opt->thread != spdk_get_thread()) { +- spdk_thread_send_msg(opt->thread, _apply_firmware_cleanup, opt->desc); +- } else { +- spdk_bdev_close(opt->desc); +- } +- free(opt); +- } +- free(firm_ctx); +-} +- +-static void +-apply_firmware_complete_reset(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_json_write_ctx *w; +- struct firmware_update_info *firm_ctx = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "firmware commit failed."); +- apply_firmware_cleanup(firm_ctx); +- return; +- } +- +- if (spdk_nvme_ctrlr_reset(firm_ctx->ctrlr) != 0) { +- spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Controller reset failed."); +- apply_firmware_cleanup(firm_ctx); +- return; +- } +- +- w = spdk_jsonrpc_begin_result(firm_ctx->request); +- spdk_json_write_string(w, "firmware commit succeeded. 
Controller reset in progress."); +- spdk_jsonrpc_end_result(firm_ctx->request, w); +- apply_firmware_cleanup(firm_ctx); +-} +- +-static void +-apply_firmware_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_nvme_cmd cmd = {}; +- struct spdk_nvme_fw_commit fw_commit; +- int slot = 0; +- int rc; +- struct firmware_update_info *firm_ctx = cb_arg; +- enum spdk_nvme_fw_commit_action commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG; +- +- spdk_bdev_free_io(bdev_io); +- +- if (!success) { +- spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "firmware download failed ."); +- apply_firmware_cleanup(firm_ctx); +- return; +- } +- +- firm_ctx->p += firm_ctx->transfer; +- firm_ctx->offset += firm_ctx->transfer; +- firm_ctx->size_remaining -= firm_ctx->transfer; +- +- switch (firm_ctx->size_remaining) { +- case 0: +- /* firmware download completed. Commit firmware */ +- memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); +- fw_commit.fs = slot; +- fw_commit.ca = commit_action; +- +- cmd.opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; +- memcpy(&cmd.cdw10, &fw_commit, sizeof(uint32_t)); +- rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, NULL, 0, +- apply_firmware_complete_reset, firm_ctx); +- if (rc) { +- spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "firmware commit failed."); +- apply_firmware_cleanup(firm_ctx); +- return; +- } +- break; +- default: +- firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); +- cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; +- +- cmd.cdw10 = spdk_nvme_bytes_to_numd(firm_ctx->transfer); +- cmd.cdw11 = firm_ctx->offset >> 2; +- rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p, +- firm_ctx->transfer, apply_firmware_complete, firm_ctx); +- if (rc) { +- spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "firmware download failed."); +- apply_firmware_cleanup(firm_ctx); +- return; +- } +- break; +- } +-} +- +-static void +-apply_firmware_open_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) +-{ +-} +- +-static void +-rpc_bdev_nvme_apply_firmware(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- int rc; +- int fd = -1; +- struct stat fw_stat; +- struct spdk_nvme_ctrlr *ctrlr; +- char msg[1024]; +- struct spdk_bdev *bdev; +- struct spdk_bdev *bdev2; +- struct open_descriptors *opt; +- struct spdk_bdev_desc *desc; +- struct spdk_nvme_cmd cmd = {}; +- struct firmware_update_info *firm_ctx; +- +- firm_ctx = calloc(1, sizeof(struct firmware_update_info)); +- if (!firm_ctx) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Memory allocation error."); +- return; +- } +- firm_ctx->fw_image = NULL; +- TAILQ_INIT(&firm_ctx->desc_head); +- firm_ctx->request = request; +- +- firm_ctx->req = calloc(1, sizeof(struct rpc_apply_firmware)); +- if (!firm_ctx->req) { +- snprintf(msg, sizeof(msg), "Memory allocation error."); +- goto err; +- } +- +- if (spdk_json_decode_object(params, rpc_apply_firmware_decoders, +- SPDK_COUNTOF(rpc_apply_firmware_decoders), firm_ctx->req)) { +- snprintf(msg, sizeof(msg), "spdk_json_decode_object failed."); +- goto err; +- } +- +- if ((bdev = spdk_bdev_get_by_name(firm_ctx->req->bdev_name)) == NULL) { +- snprintf(msg, sizeof(msg), "bdev %s were not found", firm_ctx->req->bdev_name); +- goto err; +- } +- +- if ((ctrlr = 
bdev_nvme_get_ctrlr(bdev)) == NULL) { +- snprintf(msg, sizeof(msg), "Controller information for %s were not found.", +- firm_ctx->req->bdev_name); +- goto err; +- } +- firm_ctx->ctrlr = ctrlr; +- +- for (bdev2 = spdk_bdev_first(); bdev2; bdev2 = spdk_bdev_next(bdev2)) { +- +- if (bdev_nvme_get_ctrlr(bdev2) != ctrlr) { +- continue; +- } +- +- if (!(opt = malloc(sizeof(struct open_descriptors)))) { +- snprintf(msg, sizeof(msg), "Memory allocation error."); +- goto err; +- } +- +- if (spdk_bdev_open_ext(spdk_bdev_get_name(bdev2), true, apply_firmware_open_cb, NULL, &desc) != 0) { +- snprintf(msg, sizeof(msg), "Device %s is in use.", firm_ctx->req->bdev_name); +- free(opt); +- goto err; +- } +- +- /* Save the thread where the base device is opened */ +- opt->thread = spdk_get_thread(); +- +- opt->desc = desc; +- opt->bdev = bdev; +- TAILQ_INSERT_TAIL(&firm_ctx->desc_head, opt, tqlst); +- } +- +- /* +- * find a descriptor associated with our bdev +- */ +- firm_ctx->desc = NULL; +- TAILQ_FOREACH(opt, &firm_ctx->desc_head, tqlst) { +- if (opt->bdev == bdev) { +- firm_ctx->desc = opt->desc; +- break; +- } +- } +- +- if (!firm_ctx->desc) { +- snprintf(msg, sizeof(msg), "No descriptor were found."); +- goto err; +- } +- +- firm_ctx->ch = spdk_bdev_get_io_channel(firm_ctx->desc); +- if (!firm_ctx->ch) { +- snprintf(msg, sizeof(msg), "No channels were found."); +- goto err; +- } +- +- fd = open(firm_ctx->req->filename, O_RDONLY); +- if (fd < 0) { +- snprintf(msg, sizeof(msg), "open file failed."); +- goto err; +- } +- +- rc = fstat(fd, &fw_stat); +- if (rc < 0) { +- close(fd); +- snprintf(msg, sizeof(msg), "fstat failed."); +- goto err; +- } +- +- firm_ctx->size = fw_stat.st_size; +- if (fw_stat.st_size % 4) { +- close(fd); +- snprintf(msg, sizeof(msg), "Firmware image size is not multiple of 4."); +- goto err; +- } +- +- firm_ctx->fw_image = spdk_zmalloc(firm_ctx->size, 4096, NULL, +- SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!firm_ctx->fw_image) { +- close(fd); +- snprintf(msg, sizeof(msg), "Memory allocation error."); +- goto err; +- } +- firm_ctx->p = firm_ctx->fw_image; +- +- if (read(fd, firm_ctx->p, firm_ctx->size) != ((ssize_t)(firm_ctx->size))) { +- close(fd); +- snprintf(msg, sizeof(msg), "Read firmware image failed!"); +- goto err; +- } +- close(fd); +- +- firm_ctx->offset = 0; +- firm_ctx->size_remaining = firm_ctx->size; +- firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); +- +- cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; +- cmd.cdw10 = spdk_nvme_bytes_to_numd(firm_ctx->transfer); +- cmd.cdw11 = firm_ctx->offset >> 2; +- +- rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p, +- firm_ctx->transfer, apply_firmware_complete, firm_ctx); +- if (rc == 0) { +- /* normal return here. 
*/ +- return; +- } +- +- snprintf(msg, sizeof(msg), "Read firmware image failed!"); +-err: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); +- apply_firmware_cleanup(firm_ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_apply_firmware", rpc_bdev_nvme_apply_firmware, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_transport_stat_ctx { +- struct spdk_jsonrpc_request *request; +- struct spdk_json_write_ctx *w; +-}; +- +-static void +-rpc_bdev_nvme_rdma_stats(struct spdk_json_write_ctx *w, +- struct spdk_nvme_transport_poll_group_stat *stat) +-{ +- struct spdk_nvme_rdma_device_stat *device_stats; +- uint32_t i; +- +- spdk_json_write_named_array_begin(w, "devices"); +- +- for (i = 0; i < stat->rdma.num_devices; i++) { +- device_stats = &stat->rdma.device_stats[i]; +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "dev_name", device_stats->name); +- spdk_json_write_named_uint64(w, "polls", device_stats->polls); +- spdk_json_write_named_uint64(w, "idle_polls", device_stats->idle_polls); +- spdk_json_write_named_uint64(w, "completions", device_stats->completions); +- spdk_json_write_named_uint64(w, "queued_requests", device_stats->queued_requests); +- spdk_json_write_named_uint64(w, "total_send_wrs", device_stats->total_send_wrs); +- spdk_json_write_named_uint64(w, "send_doorbell_updates", device_stats->send_doorbell_updates); +- spdk_json_write_named_uint64(w, "total_recv_wrs", device_stats->total_recv_wrs); +- spdk_json_write_named_uint64(w, "recv_doorbell_updates", device_stats->recv_doorbell_updates); +- spdk_json_write_object_end(w); +- } +- spdk_json_write_array_end(w); +-} +- +-static void +-rpc_bdev_nvme_pcie_stats(struct spdk_json_write_ctx *w, +- struct spdk_nvme_transport_poll_group_stat *stat) +-{ +- spdk_json_write_named_uint64(w, "polls", stat->pcie.polls); +- spdk_json_write_named_uint64(w, "idle_polls", stat->pcie.idle_polls); +- spdk_json_write_named_uint64(w, "completions", stat->pcie.completions); +- spdk_json_write_named_uint64(w, "cq_mmio_doorbell_updates", stat->pcie.cq_mmio_doorbell_updates); +- spdk_json_write_named_uint64(w, "cq_shadow_doorbell_updates", +- stat->pcie.cq_shadow_doorbell_updates); +- spdk_json_write_named_uint64(w, "queued_requests", stat->pcie.queued_requests); +- spdk_json_write_named_uint64(w, "submitted_requests", stat->pcie.submitted_requests); +- spdk_json_write_named_uint64(w, "sq_mmio_doorbell_updates", stat->pcie.sq_mmio_doorbell_updates); +- spdk_json_write_named_uint64(w, "sq_shadow_doorbell_updates", +- stat->pcie.sq_shadow_doorbell_updates); +-} +- +-static void +-rpc_bdev_nvme_tcp_stats(struct spdk_json_write_ctx *w, +- struct spdk_nvme_transport_poll_group_stat *stat) +-{ +- spdk_json_write_named_uint64(w, "polls", stat->tcp.polls); +- spdk_json_write_named_uint64(w, "idle_polls", stat->tcp.idle_polls); +- spdk_json_write_named_uint64(w, "socket_completions", stat->tcp.socket_completions); +- spdk_json_write_named_uint64(w, "nvme_completions", stat->tcp.nvme_completions); +- spdk_json_write_named_uint64(w, "queued_requests", stat->tcp.queued_requests); +- spdk_json_write_named_uint64(w, "submitted_requests", stat->tcp.submitted_requests); +-} +- +-static void +-rpc_bdev_nvme_stats_per_channel(struct spdk_io_channel_iter *i) +-{ +- struct rpc_bdev_nvme_transport_stat_ctx *ctx; +- struct spdk_io_channel *ch; +- struct nvme_poll_group *group; +- struct spdk_nvme_poll_group_stat *stat; +- struct spdk_nvme_transport_poll_group_stat *tr_stat; +- uint32_t j; +- int rc; +- +- ctx = 
spdk_io_channel_iter_get_ctx(i); +- ch = spdk_io_channel_iter_get_channel(i); +- group = spdk_io_channel_get_ctx(ch); +- +- rc = spdk_nvme_poll_group_get_stats(group->group, &stat); +- if (rc) { +- spdk_for_each_channel_continue(i, rc); +- return; +- } +- +- spdk_json_write_object_begin(ctx->w); +- spdk_json_write_named_string(ctx->w, "thread", spdk_thread_get_name(spdk_get_thread())); +- spdk_json_write_named_array_begin(ctx->w, "transports"); +- +- for (j = 0; j < stat->num_transports; j++) { +- tr_stat = stat->transport_stat[j]; +- spdk_json_write_object_begin(ctx->w); +- spdk_json_write_named_string(ctx->w, "trname", spdk_nvme_transport_id_trtype_str(tr_stat->trtype)); +- +- switch (stat->transport_stat[j]->trtype) { +- case SPDK_NVME_TRANSPORT_RDMA: +- rpc_bdev_nvme_rdma_stats(ctx->w, tr_stat); +- break; +- case SPDK_NVME_TRANSPORT_PCIE: +- case SPDK_NVME_TRANSPORT_VFIOUSER: +- rpc_bdev_nvme_pcie_stats(ctx->w, tr_stat); +- break; +- case SPDK_NVME_TRANSPORT_TCP: +- rpc_bdev_nvme_tcp_stats(ctx->w, tr_stat); +- break; +- default: +- SPDK_WARNLOG("Can't handle trtype %d %s\n", tr_stat->trtype, +- spdk_nvme_transport_id_trtype_str(tr_stat->trtype)); +- } +- spdk_json_write_object_end(ctx->w); +- } +- /* transports array */ +- spdk_json_write_array_end(ctx->w); +- spdk_json_write_object_end(ctx->w); +- +- spdk_nvme_poll_group_free_stats(group->group, stat); +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-rpc_bdev_nvme_stats_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct rpc_bdev_nvme_transport_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- spdk_json_write_array_end(ctx->w); +- spdk_json_write_object_end(ctx->w); +- spdk_jsonrpc_end_result(ctx->request, ctx->w); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_get_transport_statistics(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_transport_stat_ctx *ctx; +- +- if (params) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "'bdev_nvme_get_transport_statistics' requires no arguments"); +- return; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Memory allocation error"); +- return; +- } +- ctx->request = request; +- ctx->w = spdk_jsonrpc_begin_result(ctx->request); +- spdk_json_write_object_begin(ctx->w); +- spdk_json_write_named_array_begin(ctx->w, "poll_groups"); +- +- spdk_for_each_channel(&g_nvme_bdev_ctrlrs, +- rpc_bdev_nvme_stats_per_channel, +- ctx, +- rpc_bdev_nvme_stats_done); +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_transport_statistics", rpc_bdev_nvme_get_transport_statistics, +- SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_reset_controller_req { +- char *name; +-}; +- +-static void +-free_rpc_bdev_nvme_reset_controller_req(struct rpc_bdev_nvme_reset_controller_req *r) +-{ +- free(r->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_reset_controller_req_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_reset_controller_req, name), spdk_json_decode_string}, +-}; +- +-struct rpc_bdev_nvme_reset_controller_ctx { +- struct spdk_jsonrpc_request *request; +- bool success; +- struct spdk_thread *orig_thread; +-}; +- +-static void +-_rpc_bdev_nvme_reset_controller_cb(void *_ctx) +-{ +- struct rpc_bdev_nvme_reset_controller_ctx *ctx = _ctx; +- +- spdk_jsonrpc_send_bool_response(ctx->request, ctx->success); +- +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_reset_controller_cb(void 
*cb_arg, bool success) +-{ +- struct rpc_bdev_nvme_reset_controller_ctx *ctx = cb_arg; +- +- ctx->success = success; +- +- spdk_thread_send_msg(ctx->orig_thread, _rpc_bdev_nvme_reset_controller_cb, ctx); +-} +- +-static void +-rpc_bdev_nvme_reset_controller(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_reset_controller_req req = {NULL}; +- struct rpc_bdev_nvme_reset_controller_ctx *ctx; +- struct nvme_ctrlr *nvme_ctrlr; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Memory allocation failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Memory allocation failed"); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_reset_controller_req_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_reset_controller_req_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(EINVAL)); +- goto err; +- } +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(req.name); +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("Failed at device lookup\n"); +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto err; +- } +- +- ctx->request = request; +- ctx->orig_thread = spdk_get_thread(); +- +- rc = bdev_nvme_reset_rpc(nvme_ctrlr, rpc_bdev_nvme_reset_controller_cb, ctx); +- if (rc != 0) { +- SPDK_NOTICELOG("Failed at bdev_nvme_reset_rpc\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); +- goto err; +- } +- +- free_rpc_bdev_nvme_reset_controller_req(&req); +- return; +- +-err: +- free_rpc_bdev_nvme_reset_controller_req(&req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_reset_controller", rpc_bdev_nvme_reset_controller, SPDK_RPC_RUNTIME) +- +-struct rpc_get_controller_health_info { +- char *name; +-}; +- +-struct spdk_nvme_health_info_context { +- struct spdk_jsonrpc_request *request; +- struct spdk_nvme_ctrlr *ctrlr; +- struct spdk_nvme_health_information_page health_page; +-}; +- +-static void +-free_rpc_get_controller_health_info(struct rpc_get_controller_health_info *r) +-{ +- free(r->name); +-} +- +-static const struct spdk_json_object_decoder rpc_get_controller_health_info_decoders[] = { +- {"name", offsetof(struct rpc_get_controller_health_info, name), spdk_json_decode_string, true}, +-}; +- +-static void +-nvme_health_info_cleanup(struct spdk_nvme_health_info_context *context, bool response) +-{ +- if (response == true) { +- spdk_jsonrpc_send_error_response(context->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Internal error."); +- } +- +- free(context); +-} +- +-static void +-get_health_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +-{ +- int i; +- char buf[128]; +- struct spdk_nvme_health_info_context *context = cb_arg; +- struct spdk_jsonrpc_request *request = context->request; +- struct spdk_json_write_ctx *w; +- struct spdk_nvme_ctrlr *ctrlr = context->ctrlr; +- const struct spdk_nvme_transport_id *trid = NULL; +- const struct spdk_nvme_ctrlr_data *cdata = NULL; +- struct spdk_nvme_health_information_page *health_page = NULL; +- +- if (spdk_nvme_cpl_is_error(cpl)) { +- nvme_health_info_cleanup(context, true); +- SPDK_ERRLOG("get log page failed\n"); +- return; +- } +- +- if (ctrlr == NULL) { +- nvme_health_info_cleanup(context, true); +- SPDK_ERRLOG("ctrlr is NULL\n"); +- return; +- } else { +- trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); +- cdata 
= spdk_nvme_ctrlr_get_data(ctrlr); +- health_page = &(context->health_page); +- } +- +- w = spdk_jsonrpc_begin_result(request); +- +- spdk_json_write_object_begin(w); +- snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); +- spdk_str_trim(buf); +- spdk_json_write_named_string(w, "model_number", buf); +- snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); +- spdk_str_trim(buf); +- spdk_json_write_named_string(w, "serial_number", buf); +- snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); +- spdk_str_trim(buf); +- spdk_json_write_named_string(w, "firmware_revision", buf); +- spdk_json_write_named_string(w, "traddr", trid->traddr); +- spdk_json_write_named_uint64(w, "temperature_celsius", health_page->temperature - 273); +- spdk_json_write_named_uint64(w, "available_spare_percentage", health_page->available_spare); +- spdk_json_write_named_uint64(w, "available_spare_threshold_percentage", +- health_page->available_spare_threshold); +- spdk_json_write_named_uint64(w, "percentage_used", health_page->percentage_used); +- spdk_json_write_named_uint128(w, "data_units_read", +- health_page->data_units_read[0], health_page->data_units_read[1]); +- spdk_json_write_named_uint128(w, "data_units_written", +- health_page->data_units_written[0], health_page->data_units_written[1]); +- spdk_json_write_named_uint128(w, "host_read_commands", +- health_page->host_read_commands[0], health_page->host_read_commands[1]); +- spdk_json_write_named_uint128(w, "host_write_commands", +- health_page->host_write_commands[0], health_page->host_write_commands[1]); +- spdk_json_write_named_uint128(w, "controller_busy_time", +- health_page->controller_busy_time[0], health_page->controller_busy_time[1]); +- spdk_json_write_named_uint128(w, "power_cycles", +- health_page->power_cycles[0], health_page->power_cycles[1]); +- spdk_json_write_named_uint128(w, "power_on_hours", +- health_page->power_on_hours[0], health_page->power_on_hours[1]); +- spdk_json_write_named_uint128(w, "unsafe_shutdowns", +- health_page->unsafe_shutdowns[0], health_page->unsafe_shutdowns[1]); +- spdk_json_write_named_uint128(w, "media_errors", +- health_page->media_errors[0], health_page->media_errors[1]); +- spdk_json_write_named_uint128(w, "num_err_log_entries", +- health_page->num_error_info_log_entries[0], health_page->num_error_info_log_entries[1]); +- spdk_json_write_named_uint64(w, "warning_temperature_time_minutes", health_page->warning_temp_time); +- spdk_json_write_named_uint64(w, "critical_composite_temperature_time_minutes", +- health_page->critical_temp_time); +- for (i = 0; i < 8; i++) { +- if (health_page->temp_sensor[i] != 0) { +- spdk_json_write_named_uint64(w, "temperature_sensor_celsius", health_page->temp_sensor[i] - 273); +- } +- } +- spdk_json_write_object_end(w); +- +- spdk_jsonrpc_end_result(request, w); +- nvme_health_info_cleanup(context, false); +-} +- +-static void +-get_health_log_page(struct spdk_nvme_health_info_context *context) +-{ +- struct spdk_nvme_ctrlr *ctrlr = context->ctrlr; +- +- if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION, +- SPDK_NVME_GLOBAL_NS_TAG, +- &(context->health_page), sizeof(context->health_page), 0, +- get_health_log_page_completion, context)) { +- nvme_health_info_cleanup(context, true); +- SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); +- } +-} +- +-static void +-get_temperature_threshold_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) +-{ +- struct spdk_nvme_health_info_context *context = cb_arg; +- +- if 
(spdk_nvme_cpl_is_error(cpl)) { +- nvme_health_info_cleanup(context, true); +- SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed in completion\n"); +- } else { +- get_health_log_page(context); +- } +-} +- +-static int +-get_temperature_threshold_feature(struct spdk_nvme_health_info_context *context) +-{ +- struct spdk_nvme_cmd cmd = {}; +- +- cmd.opc = SPDK_NVME_OPC_GET_FEATURES; +- cmd.cdw10 = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD; +- +- return spdk_nvme_ctrlr_cmd_admin_raw(context->ctrlr, &cmd, NULL, 0, +- get_temperature_threshold_feature_completion, context); +-} +- +-static void +-get_controller_health_info(struct spdk_jsonrpc_request *request, struct spdk_nvme_ctrlr *ctrlr) +-{ +- struct spdk_nvme_health_info_context *context; +- +- context = calloc(1, sizeof(struct spdk_nvme_health_info_context)); +- if (!context) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Memory allocation error."); +- return; +- } +- +- context->request = request; +- context->ctrlr = ctrlr; +- +- if (get_temperature_threshold_feature(context)) { +- nvme_health_info_cleanup(context, true); +- SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed to submit\n"); +- } +- +- return; +-} +- +-static void +-rpc_bdev_nvme_get_controller_health_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_get_controller_health_info req = {}; +- struct nvme_ctrlr *nvme_ctrlr = NULL; +- +- if (!params) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Missing device name"); +- +- return; +- } +- if (spdk_json_decode_object(params, rpc_get_controller_health_info_decoders, +- SPDK_COUNTOF(rpc_get_controller_health_info_decoders), &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- free_rpc_get_controller_health_info(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Invalid parameters"); +- +- return; +- } +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(req.name); +- +- if (!nvme_ctrlr) { +- SPDK_ERRLOG("nvme ctrlr name '%s' does not exist\n", req.name); +- free_rpc_get_controller_health_info(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Device not found"); +- return; +- } +- +- get_controller_health_info(request, nvme_ctrlr->ctrlr); +- free_rpc_get_controller_health_info(&req); +- +- return; +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_controller_health_info", +- rpc_bdev_nvme_get_controller_health_info, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_start_discovery { +- char *name; +- char *trtype; +- char *adrfam; +- char *traddr; +- char *trsvcid; +- char *hostnqn; +- bool wait_for_attach; +- uint64_t attach_timeout_ms; +- struct spdk_nvme_ctrlr_opts opts; +- struct nvme_ctrlr_opts bdev_opts; +-}; +- +-static void +-free_rpc_bdev_nvme_start_discovery(struct rpc_bdev_nvme_start_discovery *req) +-{ +- free(req->name); +- free(req->trtype); +- free(req->adrfam); +- free(req->traddr); +- free(req->trsvcid); +- free(req->hostnqn); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_start_discovery_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_start_discovery, name), spdk_json_decode_string}, +- {"trtype", offsetof(struct rpc_bdev_nvme_start_discovery, trtype), spdk_json_decode_string}, +- {"traddr", offsetof(struct rpc_bdev_nvme_start_discovery, traddr), spdk_json_decode_string}, +- {"adrfam", offsetof(struct rpc_bdev_nvme_start_discovery, adrfam), spdk_json_decode_string, true}, 
+- {"trsvcid", offsetof(struct rpc_bdev_nvme_start_discovery, trsvcid), spdk_json_decode_string, true}, +- {"hostnqn", offsetof(struct rpc_bdev_nvme_start_discovery, hostnqn), spdk_json_decode_string, true}, +- {"wait_for_attach", offsetof(struct rpc_bdev_nvme_start_discovery, wait_for_attach), spdk_json_decode_bool, true}, +- {"attach_timeout_ms", offsetof(struct rpc_bdev_nvme_start_discovery, attach_timeout_ms), spdk_json_decode_uint64, true}, +- {"ctrlr_loss_timeout_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, +- {"reconnect_delay_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.reconnect_delay_sec), spdk_json_decode_uint32, true}, +- {"fast_io_fail_timeout_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, +-}; +- +-struct rpc_bdev_nvme_start_discovery_ctx { +- struct rpc_bdev_nvme_start_discovery req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_start_discovery_done(void *ctx, int status) +-{ +- struct spdk_jsonrpc_request *request = ctx; +- +- if (status != 0) { +- spdk_jsonrpc_send_error_response(request, status, spdk_strerror(-status)); +- } else { +- spdk_jsonrpc_send_bool_response(request, true); +- } +-} +- +-static void +-rpc_bdev_nvme_start_discovery(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_start_discovery_ctx *ctx; +- struct spdk_nvme_transport_id trid = {}; +- size_t len, maxlen; +- int rc; +- spdk_bdev_nvme_start_discovery_fn cb_fn; +- void *cb_ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.opts, sizeof(ctx->req.opts)); +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_start_discovery_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_start_discovery_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- /* Parse trstring */ +- rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", +- ctx->req.trtype); +- goto cleanup; +- } +- +- /* Parse trtype */ +- rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); +- assert(rc == 0); +- +- /* Parse traddr */ +- maxlen = sizeof(trid.traddr); +- len = strnlen(ctx->req.traddr, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", +- ctx->req.traddr); +- goto cleanup; +- } +- memcpy(trid.traddr, ctx->req.traddr, len + 1); +- +- /* Parse adrfam */ +- if (ctx->req.adrfam) { +- rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam); +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", +- ctx->req.adrfam); +- goto cleanup; +- } +- } +- +- /* Parse trsvcid */ +- if (ctx->req.trsvcid) { +- maxlen = sizeof(trid.trsvcid); +- len = strnlen(ctx->req.trsvcid, maxlen); +- if (len == maxlen) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", +- 
ctx->req.trsvcid); +- goto cleanup; +- } +- memcpy(trid.trsvcid, ctx->req.trsvcid, len + 1); +- } +- +- if (ctx->req.hostnqn) { +- snprintf(ctx->req.opts.hostnqn, sizeof(ctx->req.opts.hostnqn), "%s", +- ctx->req.hostnqn); +- } +- +- if (ctx->req.attach_timeout_ms != 0) { +- ctx->req.wait_for_attach = true; +- } +- +- ctx->request = request; +- cb_fn = ctx->req.wait_for_attach ? rpc_bdev_nvme_start_discovery_done : NULL; +- cb_ctx = ctx->req.wait_for_attach ? request : NULL; +- rc = bdev_nvme_start_discovery(&trid, ctx->req.name, &ctx->req.opts, &ctx->req.bdev_opts, +- ctx->req.attach_timeout_ms, false, cb_fn, cb_ctx); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- } else if (!ctx->req.wait_for_attach) { +- rpc_bdev_nvme_start_discovery_done(request, 0); +- } +- +-cleanup: +- free_rpc_bdev_nvme_start_discovery(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_start_discovery", rpc_bdev_nvme_start_discovery, +- SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_stop_discovery { +- char *name; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_stop_discovery_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_stop_discovery, name), spdk_json_decode_string}, +-}; +- +-struct rpc_bdev_nvme_stop_discovery_ctx { +- struct rpc_bdev_nvme_stop_discovery req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_stop_discovery_done(void *cb_ctx) +-{ +- struct rpc_bdev_nvme_stop_discovery_ctx *ctx = cb_ctx; +- +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- free(ctx->req.name); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_stop_discovery(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_stop_discovery_ctx *ctx; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_stop_discovery_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_stop_discovery_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- ctx->request = request; +- rc = bdev_nvme_stop_discovery(ctx->req.name, rpc_bdev_nvme_stop_discovery_done, ctx); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- return; +- +-cleanup: +- free(ctx->req.name); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_stop_discovery", rpc_bdev_nvme_stop_discovery, +- SPDK_RPC_RUNTIME) +- +-static void +-rpc_bdev_nvme_get_discovery_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct spdk_json_write_ctx *w; +- +- w = spdk_jsonrpc_begin_result(request); +- bdev_nvme_get_discovery_info(w); +- spdk_jsonrpc_end_result(request, w); +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_discovery_info", rpc_bdev_nvme_get_discovery_info, +- SPDK_RPC_RUNTIME) +- +-enum error_injection_cmd_type { +- NVME_ADMIN_CMD = 1, +- NVME_IO_CMD, +-}; +- +-struct rpc_add_error_injection { +- char *name; +- enum error_injection_cmd_type cmd_type; +- uint8_t opc; +- bool do_not_submit; +- uint64_t timeout_in_us; +- uint32_t err_count; +- uint8_t sct; +- uint8_t sc; +-}; +- +-static void +-free_rpc_add_error_injection(struct rpc_add_error_injection *req) +-{ +- free(req->name); +-} +- +-static int 
+-rpc_error_injection_decode_cmd_type(const struct spdk_json_val *val, void *out) +-{ +- int *cmd_type = out; +- +- if (spdk_json_strequal(val, "admin")) { +- *cmd_type = NVME_ADMIN_CMD; +- } else if (spdk_json_strequal(val, "io")) { +- *cmd_type = NVME_IO_CMD; +- } else { +- SPDK_ERRLOG("Invalid parameter value: cmd_type\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static const struct spdk_json_object_decoder rpc_add_error_injection_decoders[] = { +- { "name", offsetof(struct rpc_add_error_injection, name), spdk_json_decode_string }, +- { "cmd_type", offsetof(struct rpc_add_error_injection, cmd_type), rpc_error_injection_decode_cmd_type }, +- { "opc", offsetof(struct rpc_add_error_injection, opc), spdk_json_decode_uint8 }, +- { "do_not_submit", offsetof(struct rpc_add_error_injection, do_not_submit), spdk_json_decode_bool, true }, +- { "timeout_in_us", offsetof(struct rpc_add_error_injection, timeout_in_us), spdk_json_decode_uint64, true }, +- { "err_count", offsetof(struct rpc_add_error_injection, err_count), spdk_json_decode_uint32, true }, +- { "sct", offsetof(struct rpc_add_error_injection, sct), spdk_json_decode_uint8, true}, +- { "sc", offsetof(struct rpc_add_error_injection, sc), spdk_json_decode_uint8, true}, +-}; +- +-struct rpc_add_error_injection_ctx { +- struct spdk_jsonrpc_request *request; +- struct rpc_add_error_injection rpc; +-}; +- +-static void +-rpc_add_error_injection_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct rpc_add_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- if (status) { +- spdk_jsonrpc_send_error_response(ctx->request, status, +- "Failed to add the error injection."); +- } else { +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } +- +- free_rpc_add_error_injection(&ctx->rpc); +- free(ctx); +-} +- +-static void +-rpc_add_error_injection_per_channel(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); +- struct rpc_add_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); +- struct spdk_nvme_qpair *qpair = ctrlr_ch->qpair->qpair; +- struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->qpair->ctrlr->ctrlr; +- int rc = 0; +- +- if (qpair != NULL) { +- rc = spdk_nvme_qpair_add_cmd_error_injection(ctrlr, qpair, ctx->rpc.opc, +- ctx->rpc.do_not_submit, ctx->rpc.timeout_in_us, ctx->rpc.err_count, +- ctx->rpc.sct, ctx->rpc.sc); +- } +- +- spdk_for_each_channel_continue(i, rc); +-} +- +-static void +-rpc_bdev_nvme_add_error_injection( +- struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_add_error_injection_ctx *ctx; +- struct nvme_ctrlr *nvme_ctrlr; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- ctx->rpc.err_count = 1; +- ctx->request = request; +- +- if (spdk_json_decode_object(params, +- rpc_add_error_injection_decoders, +- SPDK_COUNTOF(rpc_add_error_injection_decoders), +- &ctx->rpc)) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, +- "Failed to parse the request"); +- goto cleanup; +- } +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->rpc.name); +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("No controller with specified name was found.\n"); +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- if (ctx->rpc.cmd_type == NVME_IO_CMD) { +- spdk_for_each_channel(nvme_ctrlr, +- 
rpc_add_error_injection_per_channel, +- ctx, +- rpc_add_error_injection_done); +- +- return; +- } else { +- rc = spdk_nvme_qpair_add_cmd_error_injection(nvme_ctrlr->ctrlr, NULL, ctx->rpc.opc, +- ctx->rpc.do_not_submit, ctx->rpc.timeout_in_us, ctx->rpc.err_count, +- ctx->rpc.sct, ctx->rpc.sc); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, -rc, +- "Failed to add the error injection"); +- } else { +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } +- } +- +-cleanup: +- free_rpc_add_error_injection(&ctx->rpc); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_add_error_injection", rpc_bdev_nvme_add_error_injection, +- SPDK_RPC_RUNTIME) +- +-struct rpc_remove_error_injection { +- char *name; +- enum error_injection_cmd_type cmd_type; +- uint8_t opc; +-}; +- +-static void +-free_rpc_remove_error_injection(struct rpc_remove_error_injection *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_remove_error_injection_decoders[] = { +- { "name", offsetof(struct rpc_remove_error_injection, name), spdk_json_decode_string }, +- { "cmd_type", offsetof(struct rpc_remove_error_injection, cmd_type), rpc_error_injection_decode_cmd_type }, +- { "opc", offsetof(struct rpc_remove_error_injection, opc), spdk_json_decode_uint8 }, +-}; +- +-struct rpc_remove_error_injection_ctx { +- struct spdk_jsonrpc_request *request; +- struct rpc_remove_error_injection rpc; +-}; +- +-static void +-rpc_remove_error_injection_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct rpc_remove_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- if (status) { +- spdk_jsonrpc_send_error_response(ctx->request, status, +- "Failed to remove the error injection."); +- } else { +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } +- +- free_rpc_remove_error_injection(&ctx->rpc); +- free(ctx); +-} +- +-static void +-rpc_remove_error_injection_per_channel(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); +- struct rpc_remove_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); +- struct spdk_nvme_qpair *qpair = ctrlr_ch->qpair->qpair; +- struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->qpair->ctrlr->ctrlr; +- +- if (qpair != NULL) { +- spdk_nvme_qpair_remove_cmd_error_injection(ctrlr, qpair, ctx->rpc.opc); +- } +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-rpc_bdev_nvme_remove_error_injection(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_remove_error_injection_ctx *ctx; +- struct nvme_ctrlr *nvme_ctrlr; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- ctx->request = request; +- +- if (spdk_json_decode_object(params, +- rpc_remove_error_injection_decoders, +- SPDK_COUNTOF(rpc_remove_error_injection_decoders), +- &ctx->rpc)) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, +- "Failed to parse the request"); +- goto cleanup; +- } +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->rpc.name); +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("No controller with specified name was found.\n"); +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- if (ctx->rpc.cmd_type == NVME_IO_CMD) { +- spdk_for_each_channel(nvme_ctrlr, +- rpc_remove_error_injection_per_channel, +- ctx, +- rpc_remove_error_injection_done); +- return; +- } 
else { +- spdk_nvme_qpair_remove_cmd_error_injection(nvme_ctrlr->ctrlr, NULL, ctx->rpc.opc); +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } +- +-cleanup: +- free_rpc_remove_error_injection(&ctx->rpc); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_remove_error_injection", rpc_bdev_nvme_remove_error_injection, +- SPDK_RPC_RUNTIME) +- +-struct rpc_get_io_paths { +- char *name; +-}; +- +-static void +-free_rpc_get_io_paths(struct rpc_get_io_paths *r) +-{ +- free(r->name); +-} +- +-static const struct spdk_json_object_decoder rpc_get_io_paths_decoders[] = { +- {"name", offsetof(struct rpc_get_io_paths, name), spdk_json_decode_string, true}, +-}; +- +-struct rpc_get_io_paths_ctx { +- struct rpc_get_io_paths req; +- struct spdk_jsonrpc_request *request; +- struct spdk_json_write_ctx *w; +-}; +- +-static void +-rpc_bdev_nvme_get_io_paths_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct rpc_get_io_paths_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- +- spdk_json_write_array_end(ctx->w); +- +- spdk_json_write_object_end(ctx->w); +- +- spdk_jsonrpc_end_result(ctx->request, ctx->w); +- +- free_rpc_get_io_paths(&ctx->req); +- free(ctx); +-} +- +-static void +-_rpc_bdev_nvme_get_io_paths(struct spdk_io_channel_iter *i) +-{ +- struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_poll_group *group = spdk_io_channel_get_ctx(_ch); +- struct rpc_get_io_paths_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct nvme_qpair *qpair; +- struct nvme_io_path *io_path; +- struct nvme_bdev *nbdev; +- +- spdk_json_write_object_begin(ctx->w); +- +- spdk_json_write_named_string(ctx->w, "thread", spdk_thread_get_name(spdk_get_thread())); +- +- spdk_json_write_named_array_begin(ctx->w, "io_paths"); +- +- TAILQ_FOREACH(qpair, &group->qpair_list, tailq) { +- TAILQ_FOREACH(io_path, &qpair->io_path_list, tailq) { +- nbdev = io_path->nvme_ns->bdev; +- +- if (ctx->req.name != NULL && +- strcmp(ctx->req.name, nbdev->disk.name) != 0) { +- continue; +- } +- +- nvme_io_path_info_json(ctx->w, io_path); +- } +- } +- +- spdk_json_write_array_end(ctx->w); +- +- spdk_json_write_object_end(ctx->w); +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-rpc_bdev_nvme_get_io_paths(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_get_io_paths_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (params != NULL && +- spdk_json_decode_object(params, rpc_get_io_paths_decoders, +- SPDK_COUNTOF(rpc_get_io_paths_decoders), +- &ctx->req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "bdev_nvme_get_io_paths requires no parameters"); +- +- free_rpc_get_io_paths(&ctx->req); +- free(ctx); +- return; +- } +- +- ctx->request = request; +- ctx->w = spdk_jsonrpc_begin_result(request); +- +- spdk_json_write_object_begin(ctx->w); +- +- spdk_json_write_named_array_begin(ctx->w, "poll_groups"); +- +- spdk_for_each_channel(&g_nvme_bdev_ctrlrs, +- _rpc_bdev_nvme_get_io_paths, +- ctx, +- rpc_bdev_nvme_get_io_paths_done); +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_io_paths", rpc_bdev_nvme_get_io_paths, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_set_preferred_path { +- char *name; +- uint16_t cntlid; +-}; +- +-static void +-free_rpc_bdev_nvme_set_preferred_path(struct rpc_bdev_nvme_set_preferred_path *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder 
rpc_bdev_nvme_set_preferred_path_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_set_preferred_path, name), spdk_json_decode_string}, +- {"cntlid", offsetof(struct rpc_bdev_nvme_set_preferred_path, cntlid), spdk_json_decode_uint16}, +-}; +- +-struct rpc_bdev_nvme_set_preferred_path_ctx { +- struct rpc_bdev_nvme_set_preferred_path req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_set_preferred_path_done(void *cb_arg, int rc) +-{ +- struct rpc_bdev_nvme_set_preferred_path_ctx *ctx = cb_arg; +- +- if (rc == 0) { +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } else { +- spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc)); +- } +- +- free_rpc_bdev_nvme_set_preferred_path(&ctx->req); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_set_preferred_path(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_set_preferred_path_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_set_preferred_path_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_set_preferred_path_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- ctx->request = request; +- +- bdev_nvme_set_preferred_path(ctx->req.name, ctx->req.cntlid, +- rpc_bdev_nvme_set_preferred_path_done, ctx); +- return; +- +-cleanup: +- free_rpc_bdev_nvme_set_preferred_path(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path, +- SPDK_RPC_RUNTIME) +- +-struct rpc_set_multipath_policy { +- char *name; +- enum bdev_nvme_multipath_policy policy; +- enum bdev_nvme_multipath_selector selector; +- uint32_t rr_min_io; +-}; +- +-static void +-free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req) +-{ +- free(req->name); +-} +- +-static int +-rpc_decode_mp_policy(const struct spdk_json_val *val, void *out) +-{ +- enum bdev_nvme_multipath_policy *policy = out; +- +- if (spdk_json_strequal(val, "active_passive") == true) { +- *policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; +- } else if (spdk_json_strequal(val, "active_active") == true) { +- *policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: policy\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static int +-rpc_decode_mp_selector(const struct spdk_json_val *val, void *out) +-{ +- enum bdev_nvme_multipath_selector *selector = out; +- +- if (spdk_json_strequal(val, "round_robin") == true) { +- *selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; +- } else if (spdk_json_strequal(val, "queue_depth") == true) { +- *selector = BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: selector\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = { +- {"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string}, +- {"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy}, +- {"selector", offsetof(struct rpc_set_multipath_policy, selector), rpc_decode_mp_selector, true}, +- {"rr_min_io", offsetof(struct rpc_set_multipath_policy, rr_min_io), spdk_json_decode_uint32, true}, +-}; +- +-struct 
rpc_set_multipath_policy_ctx { +- struct rpc_set_multipath_policy req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc) +-{ +- struct rpc_set_multipath_policy_ctx *ctx = cb_arg; +- +- if (rc == 0) { +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- } else { +- spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc)); +- } +- +- free_rpc_set_multipath_policy(&ctx->req); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_set_multipath_policy_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- ctx->req.rr_min_io = UINT32_MAX; +- +- if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders, +- SPDK_COUNTOF(rpc_set_multipath_policy_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- ctx->request = request; +- +- if (ctx->req.policy != BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && ctx->req.selector > 0) { +- SPDK_ERRLOG("selector only works in active_active mode\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy, ctx->req.selector, +- ctx->req.rr_min_io, +- rpc_bdev_nvme_set_multipath_policy_done, ctx); +- return; +- +-cleanup: +- free_rpc_set_multipath_policy(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy, +- SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_start_mdns_discovery { +- char *name; +- char *svcname; +- char *hostnqn; +- struct spdk_nvme_ctrlr_opts opts; +- struct nvme_ctrlr_opts bdev_opts; +-}; +- +-static void +-free_rpc_bdev_nvme_start_mdns_discovery(struct rpc_bdev_nvme_start_mdns_discovery *req) +-{ +- free(req->name); +- free(req->svcname); +- free(req->hostnqn); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_start_mdns_discovery_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, name), spdk_json_decode_string}, +- {"svcname", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, svcname), spdk_json_decode_string}, +- {"hostnqn", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, hostnqn), spdk_json_decode_string, true}, +-}; +- +-struct rpc_bdev_nvme_start_mdns_discovery_ctx { +- struct rpc_bdev_nvme_start_mdns_discovery req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_start_mdns_discovery(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_start_mdns_discovery_ctx *ctx; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.opts, sizeof(ctx->req.opts)); +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_start_mdns_discovery_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_start_mdns_discovery_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- 
"spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- if (ctx->req.hostnqn) { +- snprintf(ctx->req.opts.hostnqn, sizeof(ctx->req.opts.hostnqn), "%s", +- ctx->req.hostnqn); +- } +- ctx->request = request; +- rc = bdev_nvme_start_mdns_discovery(ctx->req.name, ctx->req.svcname, &ctx->req.opts, +- &ctx->req.bdev_opts); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- } else { +- spdk_jsonrpc_send_bool_response(request, true); +- } +- +-cleanup: +- free_rpc_bdev_nvme_start_mdns_discovery(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_start_mdns_discovery", rpc_bdev_nvme_start_mdns_discovery, +- SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_stop_mdns_discovery { +- char *name; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_stop_mdns_discovery_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_stop_mdns_discovery, name), spdk_json_decode_string}, +-}; +- +-struct rpc_bdev_nvme_stop_mdns_discovery_ctx { +- struct rpc_bdev_nvme_stop_mdns_discovery req; +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-rpc_bdev_nvme_stop_mdns_discovery(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_stop_mdns_discovery_ctx *ctx; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_stop_mdns_discovery_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_stop_mdns_discovery_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- ctx->request = request; +- rc = bdev_nvme_stop_mdns_discovery(ctx->req.name); +- +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- spdk_jsonrpc_send_bool_response(ctx->request, true); +- +-cleanup: +- free(ctx->req.name); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_stop_mdns_discovery", rpc_bdev_nvme_stop_mdns_discovery, +- SPDK_RPC_RUNTIME) +- +-static void +-rpc_bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- bdev_nvme_get_mdns_discovery_info(request); +-} +- +-SPDK_RPC_REGISTER("bdev_nvme_get_mdns_discovery_info", rpc_bdev_nvme_get_mdns_discovery_info, +- SPDK_RPC_RUNTIME) +- +-struct rpc_get_path_stat { +- char *name; +-}; +- +-struct path_stat { +- struct spdk_bdev_io_stat stat; +- struct spdk_nvme_transport_id trid; +- struct nvme_ns *ns; +-}; +- +-struct rpc_bdev_nvme_path_stat_ctx { +- struct spdk_jsonrpc_request *request; +- struct path_stat *path_stat; +- uint32_t num_paths; +- struct spdk_bdev_desc *desc; +-}; +- +-static void +-free_rpc_get_path_stat(struct rpc_get_path_stat *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_get_path_stat_decoders[] = { +- {"name", offsetof(struct rpc_get_path_stat, name), spdk_json_decode_string}, +-}; +- +-static void +-dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +-} +- +-static void +-rpc_bdev_nvme_path_stat_per_channel(struct spdk_io_channel_iter *i) +-{ +- struct rpc_bdev_nvme_path_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); +- struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 
+- struct nvme_io_path *io_path; +- struct path_stat *path_stat; +- uint32_t j; +- +- assert(ctx->num_paths != 0); +- +- for (j = 0; j < ctx->num_paths; j++) { +- path_stat = &ctx->path_stat[j]; +- +- STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { +- if (path_stat->ns == io_path->nvme_ns) { +- assert(io_path->stat != NULL); +- spdk_bdev_add_io_stat(&path_stat->stat, io_path->stat); +- } +- } +- } +- +- spdk_for_each_channel_continue(i, 0); +-} +- +-static void +-rpc_bdev_nvme_path_stat_done(struct spdk_io_channel_iter *i, int status) +-{ +- struct rpc_bdev_nvme_path_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); +- struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); +- struct spdk_json_write_ctx *w; +- struct path_stat *path_stat; +- uint32_t j; +- +- assert(ctx->num_paths != 0); +- +- w = spdk_jsonrpc_begin_result(ctx->request); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", nbdev->disk.name); +- spdk_json_write_named_array_begin(w, "stats"); +- +- for (j = 0; j < ctx->num_paths; j++) { +- path_stat = &ctx->path_stat[j]; +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_object_begin(w, "trid"); +- nvme_bdev_dump_trid_json(&path_stat->trid, w); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "stat"); +- spdk_bdev_dump_io_stat_json(&path_stat->stat, w); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +- } +- +- spdk_json_write_array_end(w); +- spdk_json_write_object_end(w); +- spdk_jsonrpc_end_result(ctx->request, w); +- +- spdk_bdev_close(ctx->desc); +- free(ctx->path_stat); +- free(ctx); +-} +- +-static void +-rpc_bdev_nvme_get_path_iostat(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_get_path_stat req = {}; +- struct spdk_bdev_desc *desc = NULL; +- struct spdk_bdev *bdev; +- struct nvme_bdev *nbdev; +- struct nvme_ns *nvme_ns; +- struct path_stat *path_stat; +- struct rpc_bdev_nvme_path_stat_ctx *ctx; +- struct spdk_bdev_nvme_opts opts; +- uint32_t num_paths = 0, i = 0; +- int rc; +- +- bdev_nvme_get_opts(&opts); +- if (!opts.io_path_stat) { +- SPDK_ERRLOG("RPC not enabled if enable_io_path_stat is false\n"); +- spdk_jsonrpc_send_error_response(request, -EPERM, +- "RPC not enabled if enable_io_path_stat is false"); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_get_path_stat_decoders, +- SPDK_COUNTOF(rpc_get_path_stat_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- free_rpc_get_path_stat(&req); +- return; +- } +- +- rc = spdk_bdev_open_ext(req.name, false, dummy_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to open bdev '%s': %d\n", req.name, rc); +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- free_rpc_get_path_stat(&req); +- return; +- } +- +- free_rpc_get_path_stat(&req); +- +- ctx = calloc(1, sizeof(struct rpc_bdev_nvme_path_stat_ctx)); +- if (ctx == NULL) { +- spdk_bdev_close(desc); +- SPDK_ERRLOG("Failed to allocate rpc_bdev_nvme_path_stat_ctx struct\n"); +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- bdev = spdk_bdev_desc_get_bdev(desc); +- nbdev = bdev->ctxt; +- +- pthread_mutex_lock(&nbdev->mutex); +- if (nbdev->ref == 0) { +- rc = -ENOENT; +- goto err; +- } +- +- num_paths = nbdev->ref; +- path_stat = calloc(num_paths, sizeof(struct path_stat)); 
+- if (path_stat == NULL) { +- rc = -ENOMEM; +- SPDK_ERRLOG("Failed to allocate memory for path_stat.\n"); +- goto err; +- } +- +- /* store the history stat */ +- TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { +- assert(i < num_paths); +- path_stat[i].ns = nvme_ns; +- path_stat[i].trid = nvme_ns->ctrlr->active_path_id->trid; +- +- assert(nvme_ns->stat != NULL); +- memcpy(&path_stat[i].stat, nvme_ns->stat, sizeof(struct spdk_bdev_io_stat)); +- i++; +- } +- pthread_mutex_unlock(&nbdev->mutex); +- +- ctx->request = request; +- ctx->desc = desc; +- ctx->path_stat = path_stat; +- ctx->num_paths = num_paths; +- +- spdk_for_each_channel(nbdev, +- rpc_bdev_nvme_path_stat_per_channel, +- ctx, +- rpc_bdev_nvme_path_stat_done); +- return; +- +-err: +- pthread_mutex_unlock(&nbdev->mutex); +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- spdk_bdev_close(desc); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_nvme_get_path_iostat", rpc_bdev_nvme_get_path_iostat, +- SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019-2021 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "bdev_nvme.h" ++ ++#include "spdk/config.h" ++ ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/env.h" ++#include "spdk/nvme.h" ++#include "spdk/nvme_spec.h" ++ ++#include "spdk/log.h" ++#include "spdk/bdev_module.h" ++ ++struct open_descriptors { ++ void *desc; ++ struct spdk_bdev *bdev; ++ TAILQ_ENTRY(open_descriptors) tqlst; ++ struct spdk_thread *thread; ++}; ++typedef TAILQ_HEAD(, open_descriptors) open_descriptors_t; ++ ++static int ++rpc_decode_action_on_timeout(const struct spdk_json_val *val, void *out) ++{ ++ enum spdk_bdev_timeout_action *action = out; ++ ++ if (spdk_json_strequal(val, "none") == true) { ++ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE; ++ } else if (spdk_json_strequal(val, "abort") == true) { ++ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; ++ } else if (spdk_json_strequal(val, "reset") == true) { ++ *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: action_on_timeout\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = { ++ {"action_on_timeout", offsetof(struct spdk_bdev_nvme_opts, action_on_timeout), rpc_decode_action_on_timeout, true}, ++ {"timeout_us", offsetof(struct spdk_bdev_nvme_opts, timeout_us), spdk_json_decode_uint64, true}, ++ {"timeout_admin_us", offsetof(struct spdk_bdev_nvme_opts, timeout_admin_us), spdk_json_decode_uint64, true}, ++ {"keep_alive_timeout_ms", offsetof(struct spdk_bdev_nvme_opts, keep_alive_timeout_ms), spdk_json_decode_uint32, true}, ++ {"retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true}, ++ {"arbitration_burst", offsetof(struct spdk_bdev_nvme_opts, arbitration_burst), spdk_json_decode_uint32, true}, ++ {"low_priority_weight", offsetof(struct spdk_bdev_nvme_opts, low_priority_weight), spdk_json_decode_uint32, true}, ++ {"medium_priority_weight", offsetof(struct spdk_bdev_nvme_opts, medium_priority_weight), spdk_json_decode_uint32, true}, ++ {"high_priority_weight", offsetof(struct spdk_bdev_nvme_opts, high_priority_weight), 
spdk_json_decode_uint32, true}, ++ {"nvme_adminq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_adminq_poll_period_us), spdk_json_decode_uint64, true}, ++ {"nvme_ioq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_ioq_poll_period_us), spdk_json_decode_uint64, true}, ++ {"io_queue_requests", offsetof(struct spdk_bdev_nvme_opts, io_queue_requests), spdk_json_decode_uint32, true}, ++ {"delay_cmd_submit", offsetof(struct spdk_bdev_nvme_opts, delay_cmd_submit), spdk_json_decode_bool, true}, ++ {"transport_retry_count", offsetof(struct spdk_bdev_nvme_opts, transport_retry_count), spdk_json_decode_uint32, true}, ++ {"bdev_retry_count", offsetof(struct spdk_bdev_nvme_opts, bdev_retry_count), spdk_json_decode_int32, true}, ++ {"transport_ack_timeout", offsetof(struct spdk_bdev_nvme_opts, transport_ack_timeout), spdk_json_decode_uint8, true}, ++ {"ctrlr_loss_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, ++ {"reconnect_delay_sec", offsetof(struct spdk_bdev_nvme_opts, reconnect_delay_sec), spdk_json_decode_uint32, true}, ++ {"fast_io_fail_timeout_sec", offsetof(struct spdk_bdev_nvme_opts, fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, ++ {"disable_auto_failback", offsetof(struct spdk_bdev_nvme_opts, disable_auto_failback), spdk_json_decode_bool, true}, ++ {"generate_uuids", offsetof(struct spdk_bdev_nvme_opts, generate_uuids), spdk_json_decode_bool, true}, ++ {"transport_tos", offsetof(struct spdk_bdev_nvme_opts, transport_tos), spdk_json_decode_uint8, true}, ++ {"nvme_error_stat", offsetof(struct spdk_bdev_nvme_opts, nvme_error_stat), spdk_json_decode_bool, true}, ++ {"rdma_srq_size", offsetof(struct spdk_bdev_nvme_opts, rdma_srq_size), spdk_json_decode_uint32, true}, ++ {"io_path_stat", offsetof(struct spdk_bdev_nvme_opts, io_path_stat), spdk_json_decode_bool, true}, ++}; ++ ++static void ++rpc_bdev_nvme_set_options(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_bdev_nvme_opts opts; ++ int rc; ++ ++ bdev_nvme_get_opts(&opts); ++ if (params && spdk_json_decode_object(params, rpc_bdev_nvme_options_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_options_decoders), ++ &opts)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ return; ++ } ++ ++ rc = bdev_nvme_set_opts(&opts); ++ if (rc == -EPERM) { ++ spdk_jsonrpc_send_error_response(request, -EPERM, ++ "RPC not permitted with nvme controllers already attached"); ++ } else if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++ ++ return; ++} ++SPDK_RPC_REGISTER("bdev_nvme_set_options", rpc_bdev_nvme_set_options, ++ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_hotplug { ++ bool enabled; ++ uint64_t period_us; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_hotplug_decoders[] = { ++ {"enable", offsetof(struct rpc_bdev_nvme_hotplug, enabled), spdk_json_decode_bool, false}, ++ {"period_us", offsetof(struct rpc_bdev_nvme_hotplug, period_us), spdk_json_decode_uint64, true}, ++}; ++ ++static void ++rpc_bdev_nvme_set_hotplug_done(void *ctx) ++{ ++ struct spdk_jsonrpc_request *request = ctx; ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++ ++static void ++rpc_bdev_nvme_set_hotplug(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ 
++ struct rpc_bdev_nvme_hotplug req = {false, 0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_hotplug_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_hotplug_decoders), &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = bdev_nvme_set_hotplug(req.enabled, req.period_us, rpc_bdev_nvme_set_hotplug_done, ++ request); ++ if (rc) { ++ goto invalid; ++ } ++ ++ return; ++invalid: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("bdev_nvme_set_hotplug", rpc_bdev_nvme_set_hotplug, SPDK_RPC_RUNTIME) ++ ++enum bdev_nvme_multipath_mode { ++ BDEV_NVME_MP_MODE_FAILOVER, ++ BDEV_NVME_MP_MODE_MULTIPATH, ++ BDEV_NVME_MP_MODE_DISABLE, ++}; ++ ++struct rpc_bdev_nvme_attach_controller { ++ char *name; ++ char *trtype; ++ char *adrfam; ++ char *traddr; ++ char *trsvcid; ++ char *priority; ++ char *subnqn; ++ char *hostnqn; ++ char *hostaddr; ++ char *hostsvcid; ++ char *psk; ++ enum bdev_nvme_multipath_mode multipath; ++ struct nvme_ctrlr_opts bdev_opts; ++ struct spdk_nvme_ctrlr_opts drv_opts; ++}; ++ ++static void ++free_rpc_bdev_nvme_attach_controller(struct rpc_bdev_nvme_attach_controller *req) ++{ ++ free(req->name); ++ free(req->trtype); ++ free(req->adrfam); ++ free(req->traddr); ++ free(req->trsvcid); ++ free(req->priority); ++ free(req->subnqn); ++ free(req->hostnqn); ++ free(req->hostaddr); ++ free(req->hostsvcid); ++ free(req->psk); ++} ++ ++static int ++bdev_nvme_decode_reftag(const struct spdk_json_val *val, void *out) ++{ ++ uint32_t *flag = out; ++ bool reftag; ++ int rc; ++ ++ rc = spdk_json_decode_bool(val, &reftag); ++ if (rc == 0 && reftag == true) { ++ *flag |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; ++ } ++ ++ return rc; ++} ++ ++static int ++bdev_nvme_decode_guard(const struct spdk_json_val *val, void *out) ++{ ++ uint32_t *flag = out; ++ bool guard; ++ int rc; ++ ++ rc = spdk_json_decode_bool(val, &guard); ++ if (rc == 0 && guard == true) { ++ *flag |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; ++ } ++ ++ return rc; ++} ++ ++static int ++bdev_nvme_decode_multipath(const struct spdk_json_val *val, void *out) ++{ ++ enum bdev_nvme_multipath_mode *multipath = out; ++ ++ if (spdk_json_strequal(val, "failover") == true) { ++ *multipath = BDEV_NVME_MP_MODE_FAILOVER; ++ } else if (spdk_json_strequal(val, "multipath") == true) { ++ *multipath = BDEV_NVME_MP_MODE_MULTIPATH; ++ } else if (spdk_json_strequal(val, "disable") == true) { ++ *multipath = BDEV_NVME_MP_MODE_DISABLE; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: multipath\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_attach_controller_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_attach_controller, name), spdk_json_decode_string}, ++ {"trtype", offsetof(struct rpc_bdev_nvme_attach_controller, trtype), spdk_json_decode_string}, ++ {"traddr", offsetof(struct rpc_bdev_nvme_attach_controller, traddr), spdk_json_decode_string}, ++ ++ {"adrfam", offsetof(struct rpc_bdev_nvme_attach_controller, adrfam), spdk_json_decode_string, true}, ++ {"trsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, trsvcid), spdk_json_decode_string, true}, ++ {"priority", offsetof(struct rpc_bdev_nvme_attach_controller, priority), spdk_json_decode_string, true}, ++ {"subnqn", offsetof(struct rpc_bdev_nvme_attach_controller, subnqn), spdk_json_decode_string, true}, ++ {"hostnqn", offsetof(struct rpc_bdev_nvme_attach_controller, hostnqn), 
spdk_json_decode_string, true}, ++ {"hostaddr", offsetof(struct rpc_bdev_nvme_attach_controller, hostaddr), spdk_json_decode_string, true}, ++ {"hostsvcid", offsetof(struct rpc_bdev_nvme_attach_controller, hostsvcid), spdk_json_decode_string, true}, ++ ++ {"prchk_reftag", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.prchk_flags), bdev_nvme_decode_reftag, true}, ++ {"prchk_guard", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.prchk_flags), bdev_nvme_decode_guard, true}, ++ {"hdgst", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.header_digest), spdk_json_decode_bool, true}, ++ {"ddgst", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.data_digest), spdk_json_decode_bool, true}, ++ {"fabrics_connect_timeout_us", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.fabrics_connect_timeout_us), spdk_json_decode_uint64, true}, ++ {"multipath", offsetof(struct rpc_bdev_nvme_attach_controller, multipath), bdev_nvme_decode_multipath, true}, ++ {"num_io_queues", offsetof(struct rpc_bdev_nvme_attach_controller, drv_opts.num_io_queues), spdk_json_decode_uint32, true}, ++ {"ctrlr_loss_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, ++ {"reconnect_delay_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.reconnect_delay_sec), spdk_json_decode_uint32, true}, ++ {"fast_io_fail_timeout_sec", offsetof(struct rpc_bdev_nvme_attach_controller, bdev_opts.fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, ++ {"psk", offsetof(struct rpc_bdev_nvme_attach_controller, psk), spdk_json_decode_string, true}, ++}; ++ ++#define NVME_MAX_BDEVS_PER_RPC 128 ++ ++struct rpc_bdev_nvme_attach_controller_ctx { ++ struct rpc_bdev_nvme_attach_controller req; ++ uint32_t count; ++ size_t bdev_count; ++ const char *names[NVME_MAX_BDEVS_PER_RPC]; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_attach_controller_examined(void *cb_ctx) ++{ ++ struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx; ++ struct spdk_jsonrpc_request *request = ctx->request; ++ struct spdk_json_write_ctx *w; ++ size_t i; ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ for (i = 0; i < ctx->bdev_count; i++) { ++ spdk_json_write_string(w, ctx->names[i]); ++ } ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++ free_rpc_bdev_nvme_attach_controller(&ctx->req); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) ++{ ++ struct rpc_bdev_nvme_attach_controller_ctx *ctx = cb_ctx; ++ struct spdk_jsonrpc_request *request = ctx->request; ++ ++ if (rc < 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ free_rpc_bdev_nvme_attach_controller(&ctx->req); ++ free(ctx); ++ return; ++ } ++ ++ ctx->bdev_count = bdev_count; ++ spdk_bdev_wait_for_examine(rpc_bdev_nvme_attach_controller_examined, ctx); ++} ++ ++static void ++rpc_bdev_nvme_attach_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_attach_controller_ctx *ctx; ++ struct spdk_nvme_transport_id trid = {}; ++ const struct spdk_nvme_ctrlr_opts *drv_opts; ++ const struct spdk_nvme_transport_id *ctrlr_trid; ++ struct nvme_ctrlr *ctrlr = NULL; ++ size_t len, maxlen; ++ bool multipath = false; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ 
spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.drv_opts, sizeof(ctx->req.drv_opts)); ++ bdev_nvme_get_default_ctrlr_opts(&ctx->req.bdev_opts); ++ /* For now, initialize the multipath parameter to add a failover path. This maintains backward ++ * compatibility with past behavior. In the future, this behavior will change to "disable". */ ++ ctx->req.multipath = BDEV_NVME_MP_MODE_FAILOVER; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_attach_controller_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_attach_controller_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ /* Parse trstring */ ++ rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", ++ ctx->req.trtype); ++ goto cleanup; ++ } ++ ++ /* Parse trtype */ ++ rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); ++ assert(rc == 0); ++ ++ /* Parse traddr */ ++ maxlen = sizeof(trid.traddr); ++ len = strnlen(ctx->req.traddr, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", ++ ctx->req.traddr); ++ goto cleanup; ++ } ++ memcpy(trid.traddr, ctx->req.traddr, len + 1); ++ ++ /* Parse adrfam */ ++ if (ctx->req.adrfam) { ++ rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", ++ ctx->req.adrfam); ++ goto cleanup; ++ } ++ } ++ ++ /* Parse trsvcid */ ++ if (ctx->req.trsvcid) { ++ maxlen = sizeof(trid.trsvcid); ++ len = strnlen(ctx->req.trsvcid, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", ++ ctx->req.trsvcid); ++ goto cleanup; ++ } ++ memcpy(trid.trsvcid, ctx->req.trsvcid, len + 1); ++ } ++ ++ /* Parse priority for the NVMe-oF transport connection */ ++ if (ctx->req.priority) { ++ trid.priority = spdk_strtol(ctx->req.priority, 10); ++ } ++ ++ /* Parse subnqn */ ++ if (ctx->req.subnqn) { ++ maxlen = sizeof(trid.subnqn); ++ len = strnlen(ctx->req.subnqn, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", ++ ctx->req.subnqn); ++ goto cleanup; ++ } ++ memcpy(trid.subnqn, ctx->req.subnqn, len + 1); ++ } ++ ++ if (ctx->req.hostnqn) { ++ snprintf(ctx->req.drv_opts.hostnqn, sizeof(ctx->req.drv_opts.hostnqn), "%s", ++ ctx->req.hostnqn); ++ } ++ ++ if (ctx->req.psk) { ++ snprintf(ctx->req.drv_opts.psk, sizeof(ctx->req.drv_opts.psk), "%s", ++ ctx->req.psk); ++ } ++ ++ if (ctx->req.hostaddr) { ++ maxlen = sizeof(ctx->req.drv_opts.src_addr); ++ len = strnlen(ctx->req.hostaddr, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostaddr too long: %s", ++ ctx->req.hostaddr); ++ goto cleanup; ++ } ++ snprintf(ctx->req.drv_opts.src_addr, maxlen, "%s", ctx->req.hostaddr); ++ } ++ ++ if (ctx->req.hostsvcid) { ++ maxlen = sizeof(ctx->req.drv_opts.src_svcid); ++ len = strnlen(ctx->req.hostsvcid, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, 
-EINVAL, "hostsvcid too long: %s", ++ ctx->req.hostsvcid); ++ goto cleanup; ++ } ++ snprintf(ctx->req.drv_opts.src_svcid, maxlen, "%s", ctx->req.hostsvcid); ++ } ++ ++ ctrlr = nvme_ctrlr_get_by_name(ctx->req.name); ++ ++ if (ctrlr) { ++ /* This controller already exists. Check what the user wants to do. */ ++ if (ctx->req.multipath == BDEV_NVME_MP_MODE_DISABLE) { ++ /* The user does not want to do any form of multipathing. */ ++ spdk_jsonrpc_send_error_response_fmt(request, -EALREADY, ++ "A controller named %s already exists and multipath is disabled\n", ++ ctx->req.name); ++ goto cleanup; ++ } ++ ++ assert(ctx->req.multipath == BDEV_NVME_MP_MODE_FAILOVER || ++ ctx->req.multipath == BDEV_NVME_MP_MODE_MULTIPATH); ++ ++ /* The user wants to add this as a failover path or add this to create multipath. */ ++ drv_opts = spdk_nvme_ctrlr_get_opts(ctrlr->ctrlr); ++ ctrlr_trid = spdk_nvme_ctrlr_get_transport_id(ctrlr->ctrlr); ++ ++ if (strncmp(trid.traddr, ctrlr_trid->traddr, sizeof(trid.traddr)) == 0 && ++ strncmp(trid.trsvcid, ctrlr_trid->trsvcid, sizeof(trid.trsvcid)) == 0 && ++ strncmp(ctx->req.drv_opts.src_addr, drv_opts->src_addr, sizeof(drv_opts->src_addr)) == 0 && ++ strncmp(ctx->req.drv_opts.src_svcid, drv_opts->src_svcid, sizeof(drv_opts->src_svcid)) == 0) { ++ /* Exactly same network path can't be added a second time */ ++ spdk_jsonrpc_send_error_response_fmt(request, -EALREADY, ++ "A controller named %s already exists with the specified network path\n", ++ ctx->req.name); ++ goto cleanup; ++ } ++ ++ if (strncmp(trid.subnqn, ++ ctrlr_trid->subnqn, ++ SPDK_NVMF_NQN_MAX_LEN) != 0) { ++ /* Different SUBNQN is not allowed when specifying the same controller name. */ ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, ++ "A controller named %s already exists, but uses a different subnqn (%s)\n", ++ ctx->req.name, ctrlr_trid->subnqn); ++ goto cleanup; ++ } ++ ++ if (strncmp(ctx->req.drv_opts.hostnqn, drv_opts->hostnqn, SPDK_NVMF_NQN_MAX_LEN) != 0) { ++ /* Different HOSTNQN is not allowed when specifying the same controller name. */ ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, ++ "A controller named %s already exists, but uses a different hostnqn (%s)\n", ++ ctx->req.name, drv_opts->hostnqn); ++ goto cleanup; ++ } ++ ++ if (ctx->req.bdev_opts.prchk_flags) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, ++ "A controller named %s already exists. To add a path, do not specify PI options.\n", ++ ctx->req.name); ++ goto cleanup; ++ } ++ ++ ctx->req.bdev_opts.prchk_flags = ctrlr->opts.prchk_flags; ++ } ++ ++ if (ctx->req.multipath == BDEV_NVME_MP_MODE_MULTIPATH) { ++ multipath = true; ++ } ++ ++ if (ctx->req.drv_opts.num_io_queues == 0 || ctx->req.drv_opts.num_io_queues > UINT16_MAX + 1) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, ++ "num_io_queues out of bounds, min: %u max: %u\n", ++ 1, UINT16_MAX + 1); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ ctx->count = NVME_MAX_BDEVS_PER_RPC; ++ /* Should already be zero due to the calloc(), but set explicitly for clarity. 
*/ ++ ctx->req.bdev_opts.from_discovery_service = false; ++ rc = bdev_nvme_create(&trid, ctx->req.name, ctx->names, ctx->count, ++ rpc_bdev_nvme_attach_controller_done, ctx, &ctx->req.drv_opts, ++ &ctx->req.bdev_opts, multipath); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ return; ++ ++cleanup: ++ free_rpc_bdev_nvme_attach_controller(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_attach_controller", rpc_bdev_nvme_attach_controller, ++ SPDK_RPC_RUNTIME) ++ ++static void ++rpc_dump_nvme_bdev_controller_info(struct nvme_bdev_ctrlr *nbdev_ctrlr, void *ctx) ++{ ++ struct spdk_json_write_ctx *w = ctx; ++ struct nvme_ctrlr *nvme_ctrlr; ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", nbdev_ctrlr->name); ++ ++ spdk_json_write_named_array_begin(w, "ctrlrs"); ++ TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { ++ nvme_ctrlr_info_json(w, nvme_ctrlr); ++ } ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++} ++ ++struct rpc_bdev_nvme_get_controllers { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_nvme_get_controllers(struct rpc_bdev_nvme_get_controllers *r) ++{ ++ free(r->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_get_controllers_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_get_controllers, name), spdk_json_decode_string, true}, ++}; ++ ++static void ++rpc_bdev_nvme_get_controllers(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_get_controllers req = {}; ++ struct spdk_json_write_ctx *w; ++ struct nvme_bdev_ctrlr *nbdev_ctrlr = NULL; ++ ++ if (params && spdk_json_decode_object(params, rpc_bdev_nvme_get_controllers_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_get_controllers_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ if (req.name) { ++ nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(req.name); ++ if (nbdev_ctrlr == NULL) { ++ SPDK_ERRLOG("ctrlr '%s' does not exist\n", req.name); ++ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Controller %s does not exist", req.name); ++ goto cleanup; ++ } ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ if (nbdev_ctrlr != NULL) { ++ rpc_dump_nvme_bdev_controller_info(nbdev_ctrlr, w); ++ } else { ++ nvme_bdev_ctrlr_for_each(rpc_dump_nvme_bdev_controller_info, w); ++ } ++ ++ spdk_json_write_array_end(w); ++ ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_bdev_nvme_get_controllers(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_controllers", rpc_bdev_nvme_get_controllers, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_detach_controller { ++ char *name; ++ char *trtype; ++ char *adrfam; ++ char *traddr; ++ char *trsvcid; ++ char *subnqn; ++ char *hostaddr; ++ char *hostsvcid; ++}; ++ ++static void ++free_rpc_bdev_nvme_detach_controller(struct rpc_bdev_nvme_detach_controller *req) ++{ ++ free(req->name); ++ free(req->trtype); ++ free(req->adrfam); ++ free(req->traddr); ++ free(req->trsvcid); ++ free(req->subnqn); ++ free(req->hostaddr); ++ free(req->hostsvcid); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_detach_controller_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_detach_controller, name), spdk_json_decode_string}, ++ {"trtype", offsetof(struct 
rpc_bdev_nvme_detach_controller, trtype), spdk_json_decode_string, true}, ++ {"traddr", offsetof(struct rpc_bdev_nvme_detach_controller, traddr), spdk_json_decode_string, true}, ++ {"adrfam", offsetof(struct rpc_bdev_nvme_detach_controller, adrfam), spdk_json_decode_string, true}, ++ {"trsvcid", offsetof(struct rpc_bdev_nvme_detach_controller, trsvcid), spdk_json_decode_string, true}, ++ {"subnqn", offsetof(struct rpc_bdev_nvme_detach_controller, subnqn), spdk_json_decode_string, true}, ++ {"hostaddr", offsetof(struct rpc_bdev_nvme_detach_controller, hostaddr), spdk_json_decode_string, true}, ++ {"hostsvcid", offsetof(struct rpc_bdev_nvme_detach_controller, hostsvcid), spdk_json_decode_string, true}, ++}; ++ ++static void ++rpc_bdev_nvme_detach_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_detach_controller req = {NULL}; ++ struct nvme_path_id path = {}; ++ size_t len, maxlen; ++ int rc = 0; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_detach_controller_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_detach_controller_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ if (req.trtype != NULL) { ++ rc = spdk_nvme_transport_id_populate_trstring(&path.trid, req.trtype); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", ++ req.trtype); ++ goto cleanup; ++ } ++ ++ rc = spdk_nvme_transport_id_parse_trtype(&path.trid.trtype, req.trtype); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", ++ req.trtype); ++ goto cleanup; ++ } ++ } ++ ++ if (req.traddr != NULL) { ++ maxlen = sizeof(path.trid.traddr); ++ len = strnlen(req.traddr, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", ++ req.traddr); ++ goto cleanup; ++ } ++ memcpy(path.trid.traddr, req.traddr, len + 1); ++ } ++ ++ if (req.adrfam != NULL) { ++ rc = spdk_nvme_transport_id_parse_adrfam(&path.trid.adrfam, req.adrfam); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse adrfam: %s\n", req.adrfam); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", ++ req.adrfam); ++ goto cleanup; ++ } ++ } ++ ++ if (req.trsvcid != NULL) { ++ maxlen = sizeof(path.trid.trsvcid); ++ len = strnlen(req.trsvcid, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", ++ req.trsvcid); ++ goto cleanup; ++ } ++ memcpy(path.trid.trsvcid, req.trsvcid, len + 1); ++ } ++ ++ /* Parse subnqn */ ++ if (req.subnqn != NULL) { ++ maxlen = sizeof(path.trid.subnqn); ++ len = strnlen(req.subnqn, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "subnqn too long: %s", ++ req.subnqn); ++ goto cleanup; ++ } ++ memcpy(path.trid.subnqn, req.subnqn, len + 1); ++ } ++ ++ if (req.hostaddr) { ++ maxlen = sizeof(path.hostid.hostaddr); ++ len = strnlen(req.hostaddr, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostaddr too long: %s", ++ req.hostaddr); ++ goto cleanup; ++ } ++ snprintf(path.hostid.hostaddr, maxlen, "%s", req.hostaddr); ++ } ++ ++ if (req.hostsvcid) { ++ maxlen = sizeof(path.hostid.hostsvcid); ++ len = strnlen(req.hostsvcid, 
maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "hostsvcid too long: %s", ++ req.hostsvcid); ++ goto cleanup; ++ } ++ snprintf(path.hostid.hostsvcid, maxlen, "%s", req.hostsvcid); ++ } ++ ++ rc = bdev_nvme_delete(req.name, &path); ++ ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_bdev_nvme_detach_controller(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_detach_controller", rpc_bdev_nvme_detach_controller, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_apply_firmware { ++ char *filename; ++ char *bdev_name; ++}; ++ ++static void ++free_rpc_apply_firmware(struct rpc_apply_firmware *req) ++{ ++ free(req->filename); ++ free(req->bdev_name); ++} ++ ++static const struct spdk_json_object_decoder rpc_apply_firmware_decoders[] = { ++ {"filename", offsetof(struct rpc_apply_firmware, filename), spdk_json_decode_string}, ++ {"bdev_name", offsetof(struct rpc_apply_firmware, bdev_name), spdk_json_decode_string}, ++}; ++ ++struct firmware_update_info { ++ void *fw_image; ++ void *p; ++ unsigned int size; ++ unsigned int size_remaining; ++ unsigned int offset; ++ unsigned int transfer; ++ ++ void *desc; ++ struct spdk_io_channel *ch; ++ struct spdk_jsonrpc_request *request; ++ struct spdk_nvme_ctrlr *ctrlr; ++ open_descriptors_t desc_head; ++ struct rpc_apply_firmware *req; ++}; ++ ++static void ++_apply_firmware_cleanup(void *ctx) ++{ ++ struct spdk_bdev_desc *desc = ctx; ++ ++ spdk_bdev_close(desc); ++} ++ ++static void ++apply_firmware_cleanup(void *cb_arg) ++{ ++ struct open_descriptors *opt, *tmp; ++ struct firmware_update_info *firm_ctx = cb_arg; ++ ++ if (!firm_ctx) { ++ return; ++ } ++ ++ if (firm_ctx->fw_image) { ++ spdk_free(firm_ctx->fw_image); ++ } ++ ++ if (firm_ctx->req) { ++ free_rpc_apply_firmware(firm_ctx->req); ++ free(firm_ctx->req); ++ } ++ ++ if (firm_ctx->ch) { ++ spdk_put_io_channel(firm_ctx->ch); ++ } ++ ++ TAILQ_FOREACH_SAFE(opt, &firm_ctx->desc_head, tqlst, tmp) { ++ TAILQ_REMOVE(&firm_ctx->desc_head, opt, tqlst); ++ /* Close the underlying bdev on its same opened thread. */ ++ if (opt->thread && opt->thread != spdk_get_thread()) { ++ spdk_thread_send_msg(opt->thread, _apply_firmware_cleanup, opt->desc); ++ } else { ++ spdk_bdev_close(opt->desc); ++ } ++ free(opt); ++ } ++ free(firm_ctx); ++} ++ ++static void ++apply_firmware_complete_reset(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_json_write_ctx *w; ++ struct firmware_update_info *firm_ctx = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "firmware commit failed."); ++ apply_firmware_cleanup(firm_ctx); ++ return; ++ } ++ ++ if (spdk_nvme_ctrlr_reset(firm_ctx->ctrlr) != 0) { ++ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Controller reset failed."); ++ apply_firmware_cleanup(firm_ctx); ++ return; ++ } ++ ++ w = spdk_jsonrpc_begin_result(firm_ctx->request); ++ spdk_json_write_string(w, "firmware commit succeeded. 
Controller reset in progress."); ++ spdk_jsonrpc_end_result(firm_ctx->request, w); ++ apply_firmware_cleanup(firm_ctx); ++} ++ ++static void ++apply_firmware_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_nvme_cmd cmd = {}; ++ struct spdk_nvme_fw_commit fw_commit; ++ int slot = 0; ++ int rc; ++ struct firmware_update_info *firm_ctx = cb_arg; ++ enum spdk_nvme_fw_commit_action commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (!success) { ++ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "firmware download failed ."); ++ apply_firmware_cleanup(firm_ctx); ++ return; ++ } ++ ++ firm_ctx->p += firm_ctx->transfer; ++ firm_ctx->offset += firm_ctx->transfer; ++ firm_ctx->size_remaining -= firm_ctx->transfer; ++ ++ switch (firm_ctx->size_remaining) { ++ case 0: ++ /* firmware download completed. Commit firmware */ ++ memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); ++ fw_commit.fs = slot; ++ fw_commit.ca = commit_action; ++ ++ cmd.opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; ++ memcpy(&cmd.cdw10, &fw_commit, sizeof(uint32_t)); ++ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, NULL, 0, ++ apply_firmware_complete_reset, firm_ctx); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "firmware commit failed."); ++ apply_firmware_cleanup(firm_ctx); ++ return; ++ } ++ break; ++ default: ++ firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); ++ cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; ++ ++ cmd.cdw10 = spdk_nvme_bytes_to_numd(firm_ctx->transfer); ++ cmd.cdw11 = firm_ctx->offset >> 2; ++ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p, ++ firm_ctx->transfer, apply_firmware_complete, firm_ctx); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "firmware download failed."); ++ apply_firmware_cleanup(firm_ctx); ++ return; ++ } ++ break; ++ } ++} ++ ++static void ++apply_firmware_open_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx) ++{ ++} ++ ++static void ++rpc_bdev_nvme_apply_firmware(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ int rc; ++ int fd = -1; ++ struct stat fw_stat; ++ struct spdk_nvme_ctrlr *ctrlr; ++ char msg[1024]; ++ struct spdk_bdev *bdev; ++ struct spdk_bdev *bdev2; ++ struct open_descriptors *opt; ++ struct spdk_bdev_desc *desc; ++ struct spdk_nvme_cmd cmd = {}; ++ struct firmware_update_info *firm_ctx; ++ ++ firm_ctx = calloc(1, sizeof(struct firmware_update_info)); ++ if (!firm_ctx) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Memory allocation error."); ++ return; ++ } ++ firm_ctx->fw_image = NULL; ++ TAILQ_INIT(&firm_ctx->desc_head); ++ firm_ctx->request = request; ++ ++ firm_ctx->req = calloc(1, sizeof(struct rpc_apply_firmware)); ++ if (!firm_ctx->req) { ++ snprintf(msg, sizeof(msg), "Memory allocation error."); ++ goto err; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_apply_firmware_decoders, ++ SPDK_COUNTOF(rpc_apply_firmware_decoders), firm_ctx->req)) { ++ snprintf(msg, sizeof(msg), "spdk_json_decode_object failed."); ++ goto err; ++ } ++ ++ if ((bdev = spdk_bdev_get_by_name(firm_ctx->req->bdev_name)) == NULL) { ++ snprintf(msg, sizeof(msg), "bdev %s were not found", firm_ctx->req->bdev_name); ++ goto err; ++ } ++ ++ if ((ctrlr = 
bdev_nvme_get_ctrlr(bdev)) == NULL) { ++ snprintf(msg, sizeof(msg), "Controller information for %s were not found.", ++ firm_ctx->req->bdev_name); ++ goto err; ++ } ++ firm_ctx->ctrlr = ctrlr; ++ ++ for (bdev2 = spdk_bdev_first(); bdev2; bdev2 = spdk_bdev_next(bdev2)) { ++ ++ if (bdev_nvme_get_ctrlr(bdev2) != ctrlr) { ++ continue; ++ } ++ ++ if (!(opt = malloc(sizeof(struct open_descriptors)))) { ++ snprintf(msg, sizeof(msg), "Memory allocation error."); ++ goto err; ++ } ++ ++ if (spdk_bdev_open_ext(spdk_bdev_get_name(bdev2), true, apply_firmware_open_cb, NULL, &desc) != 0) { ++ snprintf(msg, sizeof(msg), "Device %s is in use.", firm_ctx->req->bdev_name); ++ free(opt); ++ goto err; ++ } ++ ++ /* Save the thread where the base device is opened */ ++ opt->thread = spdk_get_thread(); ++ ++ opt->desc = desc; ++ opt->bdev = bdev; ++ TAILQ_INSERT_TAIL(&firm_ctx->desc_head, opt, tqlst); ++ } ++ ++ /* ++ * find a descriptor associated with our bdev ++ */ ++ firm_ctx->desc = NULL; ++ TAILQ_FOREACH(opt, &firm_ctx->desc_head, tqlst) { ++ if (opt->bdev == bdev) { ++ firm_ctx->desc = opt->desc; ++ break; ++ } ++ } ++ ++ if (!firm_ctx->desc) { ++ snprintf(msg, sizeof(msg), "No descriptor were found."); ++ goto err; ++ } ++ ++ firm_ctx->ch = spdk_bdev_get_io_channel(firm_ctx->desc); ++ if (!firm_ctx->ch) { ++ snprintf(msg, sizeof(msg), "No channels were found."); ++ goto err; ++ } ++ ++ fd = open(firm_ctx->req->filename, O_RDONLY); ++ if (fd < 0) { ++ snprintf(msg, sizeof(msg), "open file failed."); ++ goto err; ++ } ++ ++ rc = fstat(fd, &fw_stat); ++ if (rc < 0) { ++ close(fd); ++ snprintf(msg, sizeof(msg), "fstat failed."); ++ goto err; ++ } ++ ++ firm_ctx->size = fw_stat.st_size; ++ if (fw_stat.st_size % 4) { ++ close(fd); ++ snprintf(msg, sizeof(msg), "Firmware image size is not multiple of 4."); ++ goto err; ++ } ++ ++ firm_ctx->fw_image = spdk_zmalloc(firm_ctx->size, 4096, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!firm_ctx->fw_image) { ++ close(fd); ++ snprintf(msg, sizeof(msg), "Memory allocation error."); ++ goto err; ++ } ++ firm_ctx->p = firm_ctx->fw_image; ++ ++ if (read(fd, firm_ctx->p, firm_ctx->size) != ((ssize_t)(firm_ctx->size))) { ++ close(fd); ++ snprintf(msg, sizeof(msg), "Read firmware image failed!"); ++ goto err; ++ } ++ close(fd); ++ ++ firm_ctx->offset = 0; ++ firm_ctx->size_remaining = firm_ctx->size; ++ firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); ++ ++ cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; ++ cmd.cdw10 = spdk_nvme_bytes_to_numd(firm_ctx->transfer); ++ cmd.cdw11 = firm_ctx->offset >> 2; ++ ++ rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p, ++ firm_ctx->transfer, apply_firmware_complete, firm_ctx); ++ if (rc == 0) { ++ /* normal return here. 
*/ ++ return; ++ } ++ ++ snprintf(msg, sizeof(msg), "Read firmware image failed!"); ++err: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); ++ apply_firmware_cleanup(firm_ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_apply_firmware", rpc_bdev_nvme_apply_firmware, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_transport_stat_ctx { ++ struct spdk_jsonrpc_request *request; ++ struct spdk_json_write_ctx *w; ++}; ++ ++static void ++rpc_bdev_nvme_rdma_stats(struct spdk_json_write_ctx *w, ++ struct spdk_nvme_transport_poll_group_stat *stat) ++{ ++ struct spdk_nvme_rdma_device_stat *device_stats; ++ uint32_t i; ++ ++ spdk_json_write_named_array_begin(w, "devices"); ++ ++ for (i = 0; i < stat->rdma.num_devices; i++) { ++ device_stats = &stat->rdma.device_stats[i]; ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "dev_name", device_stats->name); ++ spdk_json_write_named_uint64(w, "polls", device_stats->polls); ++ spdk_json_write_named_uint64(w, "idle_polls", device_stats->idle_polls); ++ spdk_json_write_named_uint64(w, "completions", device_stats->completions); ++ spdk_json_write_named_uint64(w, "queued_requests", device_stats->queued_requests); ++ spdk_json_write_named_uint64(w, "total_send_wrs", device_stats->total_send_wrs); ++ spdk_json_write_named_uint64(w, "send_doorbell_updates", device_stats->send_doorbell_updates); ++ spdk_json_write_named_uint64(w, "total_recv_wrs", device_stats->total_recv_wrs); ++ spdk_json_write_named_uint64(w, "recv_doorbell_updates", device_stats->recv_doorbell_updates); ++ spdk_json_write_object_end(w); ++ } ++ spdk_json_write_array_end(w); ++} ++ ++static void ++rpc_bdev_nvme_pcie_stats(struct spdk_json_write_ctx *w, ++ struct spdk_nvme_transport_poll_group_stat *stat) ++{ ++ spdk_json_write_named_uint64(w, "polls", stat->pcie.polls); ++ spdk_json_write_named_uint64(w, "idle_polls", stat->pcie.idle_polls); ++ spdk_json_write_named_uint64(w, "completions", stat->pcie.completions); ++ spdk_json_write_named_uint64(w, "cq_mmio_doorbell_updates", stat->pcie.cq_mmio_doorbell_updates); ++ spdk_json_write_named_uint64(w, "cq_shadow_doorbell_updates", ++ stat->pcie.cq_shadow_doorbell_updates); ++ spdk_json_write_named_uint64(w, "queued_requests", stat->pcie.queued_requests); ++ spdk_json_write_named_uint64(w, "submitted_requests", stat->pcie.submitted_requests); ++ spdk_json_write_named_uint64(w, "sq_mmio_doorbell_updates", stat->pcie.sq_mmio_doorbell_updates); ++ spdk_json_write_named_uint64(w, "sq_shadow_doorbell_updates", ++ stat->pcie.sq_shadow_doorbell_updates); ++} ++ ++static void ++rpc_bdev_nvme_tcp_stats(struct spdk_json_write_ctx *w, ++ struct spdk_nvme_transport_poll_group_stat *stat) ++{ ++ spdk_json_write_named_uint64(w, "polls", stat->tcp.polls); ++ spdk_json_write_named_uint64(w, "idle_polls", stat->tcp.idle_polls); ++ spdk_json_write_named_uint64(w, "socket_completions", stat->tcp.socket_completions); ++ spdk_json_write_named_uint64(w, "nvme_completions", stat->tcp.nvme_completions); ++ spdk_json_write_named_uint64(w, "queued_requests", stat->tcp.queued_requests); ++ spdk_json_write_named_uint64(w, "submitted_requests", stat->tcp.submitted_requests); ++} ++ ++static void ++rpc_bdev_nvme_stats_per_channel(struct spdk_io_channel_iter *i) ++{ ++ struct rpc_bdev_nvme_transport_stat_ctx *ctx; ++ struct spdk_io_channel *ch; ++ struct nvme_poll_group *group; ++ struct spdk_nvme_poll_group_stat *stat; ++ struct spdk_nvme_transport_poll_group_stat *tr_stat; ++ uint32_t j; ++ int rc; ++ ++ ctx = 
spdk_io_channel_iter_get_ctx(i); ++ ch = spdk_io_channel_iter_get_channel(i); ++ group = spdk_io_channel_get_ctx(ch); ++ ++ rc = spdk_nvme_poll_group_get_stats(group->group, &stat); ++ if (rc) { ++ spdk_for_each_channel_continue(i, rc); ++ return; ++ } ++ ++ spdk_json_write_object_begin(ctx->w); ++ spdk_json_write_named_string(ctx->w, "thread", spdk_thread_get_name(spdk_get_thread())); ++ spdk_json_write_named_array_begin(ctx->w, "transports"); ++ ++ for (j = 0; j < stat->num_transports; j++) { ++ tr_stat = stat->transport_stat[j]; ++ spdk_json_write_object_begin(ctx->w); ++ spdk_json_write_named_string(ctx->w, "trname", spdk_nvme_transport_id_trtype_str(tr_stat->trtype)); ++ ++ switch (stat->transport_stat[j]->trtype) { ++ case SPDK_NVME_TRANSPORT_RDMA: ++ rpc_bdev_nvme_rdma_stats(ctx->w, tr_stat); ++ break; ++ case SPDK_NVME_TRANSPORT_PCIE: ++ case SPDK_NVME_TRANSPORT_VFIOUSER: ++ rpc_bdev_nvme_pcie_stats(ctx->w, tr_stat); ++ break; ++ case SPDK_NVME_TRANSPORT_TCP: ++ rpc_bdev_nvme_tcp_stats(ctx->w, tr_stat); ++ break; ++ default: ++ SPDK_WARNLOG("Can't handle trtype %d %s\n", tr_stat->trtype, ++ spdk_nvme_transport_id_trtype_str(tr_stat->trtype)); ++ } ++ spdk_json_write_object_end(ctx->w); ++ } ++ /* transports array */ ++ spdk_json_write_array_end(ctx->w); ++ spdk_json_write_object_end(ctx->w); ++ ++ spdk_nvme_poll_group_free_stats(group->group, stat); ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++rpc_bdev_nvme_stats_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct rpc_bdev_nvme_transport_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ spdk_json_write_array_end(ctx->w); ++ spdk_json_write_object_end(ctx->w); ++ spdk_jsonrpc_end_result(ctx->request, ctx->w); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_get_transport_statistics(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_transport_stat_ctx *ctx; ++ ++ if (params) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "'bdev_nvme_get_transport_statistics' requires no arguments"); ++ return; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Memory allocation error"); ++ return; ++ } ++ ctx->request = request; ++ ctx->w = spdk_jsonrpc_begin_result(ctx->request); ++ spdk_json_write_object_begin(ctx->w); ++ spdk_json_write_named_array_begin(ctx->w, "poll_groups"); ++ ++ spdk_for_each_channel(&g_nvme_bdev_ctrlrs, ++ rpc_bdev_nvme_stats_per_channel, ++ ctx, ++ rpc_bdev_nvme_stats_done); ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_transport_statistics", rpc_bdev_nvme_get_transport_statistics, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_reset_controller_req { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_nvme_reset_controller_req(struct rpc_bdev_nvme_reset_controller_req *r) ++{ ++ free(r->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_reset_controller_req_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_reset_controller_req, name), spdk_json_decode_string}, ++}; ++ ++struct rpc_bdev_nvme_reset_controller_ctx { ++ struct spdk_jsonrpc_request *request; ++ bool success; ++ struct spdk_thread *orig_thread; ++}; ++ ++static void ++_rpc_bdev_nvme_reset_controller_cb(void *_ctx) ++{ ++ struct rpc_bdev_nvme_reset_controller_ctx *ctx = _ctx; ++ ++ spdk_jsonrpc_send_bool_response(ctx->request, ctx->success); ++ ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_reset_controller_cb(void 
*cb_arg, bool success) ++{ ++ struct rpc_bdev_nvme_reset_controller_ctx *ctx = cb_arg; ++ ++ ctx->success = success; ++ ++ spdk_thread_send_msg(ctx->orig_thread, _rpc_bdev_nvme_reset_controller_cb, ctx); ++} ++ ++static void ++rpc_bdev_nvme_reset_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_reset_controller_req req = {NULL}; ++ struct rpc_bdev_nvme_reset_controller_ctx *ctx; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Memory allocation failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Memory allocation failed"); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_reset_controller_req_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_reset_controller_req_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(EINVAL)); ++ goto err; ++ } ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(req.name); ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("Failed at device lookup\n"); ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto err; ++ } ++ ++ ctx->request = request; ++ ctx->orig_thread = spdk_get_thread(); ++ ++ rc = bdev_nvme_reset_rpc(nvme_ctrlr, rpc_bdev_nvme_reset_controller_cb, ctx); ++ if (rc != 0) { ++ SPDK_NOTICELOG("Failed at bdev_nvme_reset_rpc\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); ++ goto err; ++ } ++ ++ free_rpc_bdev_nvme_reset_controller_req(&req); ++ return; ++ ++err: ++ free_rpc_bdev_nvme_reset_controller_req(&req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_reset_controller", rpc_bdev_nvme_reset_controller, SPDK_RPC_RUNTIME) ++ ++struct rpc_get_controller_health_info { ++ char *name; ++}; ++ ++struct spdk_nvme_health_info_context { ++ struct spdk_jsonrpc_request *request; ++ struct spdk_nvme_ctrlr *ctrlr; ++ struct spdk_nvme_health_information_page health_page; ++}; ++ ++static void ++free_rpc_get_controller_health_info(struct rpc_get_controller_health_info *r) ++{ ++ free(r->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_get_controller_health_info_decoders[] = { ++ {"name", offsetof(struct rpc_get_controller_health_info, name), spdk_json_decode_string, true}, ++}; ++ ++static void ++nvme_health_info_cleanup(struct spdk_nvme_health_info_context *context, bool response) ++{ ++ if (response == true) { ++ spdk_jsonrpc_send_error_response(context->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Internal error."); ++ } ++ ++ free(context); ++} ++ ++static void ++get_health_log_page_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) ++{ ++ int i; ++ char buf[128]; ++ struct spdk_nvme_health_info_context *context = cb_arg; ++ struct spdk_jsonrpc_request *request = context->request; ++ struct spdk_json_write_ctx *w; ++ struct spdk_nvme_ctrlr *ctrlr = context->ctrlr; ++ const struct spdk_nvme_transport_id *trid = NULL; ++ const struct spdk_nvme_ctrlr_data *cdata = NULL; ++ struct spdk_nvme_health_information_page *health_page = NULL; ++ ++ if (spdk_nvme_cpl_is_error(cpl)) { ++ nvme_health_info_cleanup(context, true); ++ SPDK_ERRLOG("get log page failed\n"); ++ return; ++ } ++ ++ if (ctrlr == NULL) { ++ nvme_health_info_cleanup(context, true); ++ SPDK_ERRLOG("ctrlr is NULL\n"); ++ return; ++ } else { ++ trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); ++ cdata 
= spdk_nvme_ctrlr_get_data(ctrlr); ++ health_page = &(context->health_page); ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ ++ spdk_json_write_object_begin(w); ++ snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); ++ spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "model_number", buf); ++ snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); ++ spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "serial_number", buf); ++ snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); ++ spdk_str_trim(buf); ++ spdk_json_write_named_string(w, "firmware_revision", buf); ++ spdk_json_write_named_string(w, "traddr", trid->traddr); ++ spdk_json_write_named_uint64(w, "temperature_celsius", health_page->temperature - 273); ++ spdk_json_write_named_uint64(w, "available_spare_percentage", health_page->available_spare); ++ spdk_json_write_named_uint64(w, "available_spare_threshold_percentage", ++ health_page->available_spare_threshold); ++ spdk_json_write_named_uint64(w, "percentage_used", health_page->percentage_used); ++ spdk_json_write_named_uint128(w, "data_units_read", ++ health_page->data_units_read[0], health_page->data_units_read[1]); ++ spdk_json_write_named_uint128(w, "data_units_written", ++ health_page->data_units_written[0], health_page->data_units_written[1]); ++ spdk_json_write_named_uint128(w, "host_read_commands", ++ health_page->host_read_commands[0], health_page->host_read_commands[1]); ++ spdk_json_write_named_uint128(w, "host_write_commands", ++ health_page->host_write_commands[0], health_page->host_write_commands[1]); ++ spdk_json_write_named_uint128(w, "controller_busy_time", ++ health_page->controller_busy_time[0], health_page->controller_busy_time[1]); ++ spdk_json_write_named_uint128(w, "power_cycles", ++ health_page->power_cycles[0], health_page->power_cycles[1]); ++ spdk_json_write_named_uint128(w, "power_on_hours", ++ health_page->power_on_hours[0], health_page->power_on_hours[1]); ++ spdk_json_write_named_uint128(w, "unsafe_shutdowns", ++ health_page->unsafe_shutdowns[0], health_page->unsafe_shutdowns[1]); ++ spdk_json_write_named_uint128(w, "media_errors", ++ health_page->media_errors[0], health_page->media_errors[1]); ++ spdk_json_write_named_uint128(w, "num_err_log_entries", ++ health_page->num_error_info_log_entries[0], health_page->num_error_info_log_entries[1]); ++ spdk_json_write_named_uint64(w, "warning_temperature_time_minutes", health_page->warning_temp_time); ++ spdk_json_write_named_uint64(w, "critical_composite_temperature_time_minutes", ++ health_page->critical_temp_time); ++ for (i = 0; i < 8; i++) { ++ if (health_page->temp_sensor[i] != 0) { ++ spdk_json_write_named_uint64(w, "temperature_sensor_celsius", health_page->temp_sensor[i] - 273); ++ } ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_jsonrpc_end_result(request, w); ++ nvme_health_info_cleanup(context, false); ++} ++ ++static void ++get_health_log_page(struct spdk_nvme_health_info_context *context) ++{ ++ struct spdk_nvme_ctrlr *ctrlr = context->ctrlr; ++ ++ if (spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION, ++ SPDK_NVME_GLOBAL_NS_TAG, ++ &(context->health_page), sizeof(context->health_page), 0, ++ get_health_log_page_completion, context)) { ++ nvme_health_info_cleanup(context, true); ++ SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_get_log_page() failed\n"); ++ } ++} ++ ++static void ++get_temperature_threshold_feature_completion(void *cb_arg, const struct spdk_nvme_cpl *cpl) ++{ ++ struct spdk_nvme_health_info_context *context = cb_arg; ++ ++ if 
(spdk_nvme_cpl_is_error(cpl)) { ++ nvme_health_info_cleanup(context, true); ++ SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed in completion\n"); ++ } else { ++ get_health_log_page(context); ++ } ++} ++ ++static int ++get_temperature_threshold_feature(struct spdk_nvme_health_info_context *context) ++{ ++ struct spdk_nvme_cmd cmd = {}; ++ ++ cmd.opc = SPDK_NVME_OPC_GET_FEATURES; ++ cmd.cdw10 = SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD; ++ ++ return spdk_nvme_ctrlr_cmd_admin_raw(context->ctrlr, &cmd, NULL, 0, ++ get_temperature_threshold_feature_completion, context); ++} ++ ++static void ++get_controller_health_info(struct spdk_jsonrpc_request *request, struct spdk_nvme_ctrlr *ctrlr) ++{ ++ struct spdk_nvme_health_info_context *context; ++ ++ context = calloc(1, sizeof(struct spdk_nvme_health_info_context)); ++ if (!context) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Memory allocation error."); ++ return; ++ } ++ ++ context->request = request; ++ context->ctrlr = ctrlr; ++ ++ if (get_temperature_threshold_feature(context)) { ++ nvme_health_info_cleanup(context, true); ++ SPDK_ERRLOG("feature SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD failed to submit\n"); ++ } ++ ++ return; ++} ++ ++static void ++rpc_bdev_nvme_get_controller_health_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_get_controller_health_info req = {}; ++ struct nvme_ctrlr *nvme_ctrlr = NULL; ++ ++ if (!params) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Missing device name"); ++ ++ return; ++ } ++ if (spdk_json_decode_object(params, rpc_get_controller_health_info_decoders, ++ SPDK_COUNTOF(rpc_get_controller_health_info_decoders), &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ free_rpc_get_controller_health_info(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Invalid parameters"); ++ ++ return; ++ } ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(req.name); ++ ++ if (!nvme_ctrlr) { ++ SPDK_ERRLOG("nvme ctrlr name '%s' does not exist\n", req.name); ++ free_rpc_get_controller_health_info(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Device not found"); ++ return; ++ } ++ ++ get_controller_health_info(request, nvme_ctrlr->ctrlr); ++ free_rpc_get_controller_health_info(&req); ++ ++ return; ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_controller_health_info", ++ rpc_bdev_nvme_get_controller_health_info, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_start_discovery { ++ char *name; ++ char *trtype; ++ char *adrfam; ++ char *traddr; ++ char *trsvcid; ++ char *hostnqn; ++ bool wait_for_attach; ++ uint64_t attach_timeout_ms; ++ struct spdk_nvme_ctrlr_opts opts; ++ struct nvme_ctrlr_opts bdev_opts; ++}; ++ ++static void ++free_rpc_bdev_nvme_start_discovery(struct rpc_bdev_nvme_start_discovery *req) ++{ ++ free(req->name); ++ free(req->trtype); ++ free(req->adrfam); ++ free(req->traddr); ++ free(req->trsvcid); ++ free(req->hostnqn); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_start_discovery_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_start_discovery, name), spdk_json_decode_string}, ++ {"trtype", offsetof(struct rpc_bdev_nvme_start_discovery, trtype), spdk_json_decode_string}, ++ {"traddr", offsetof(struct rpc_bdev_nvme_start_discovery, traddr), spdk_json_decode_string}, ++ {"adrfam", offsetof(struct rpc_bdev_nvme_start_discovery, adrfam), spdk_json_decode_string, true}, 
++ {"trsvcid", offsetof(struct rpc_bdev_nvme_start_discovery, trsvcid), spdk_json_decode_string, true}, ++ {"hostnqn", offsetof(struct rpc_bdev_nvme_start_discovery, hostnqn), spdk_json_decode_string, true}, ++ {"wait_for_attach", offsetof(struct rpc_bdev_nvme_start_discovery, wait_for_attach), spdk_json_decode_bool, true}, ++ {"attach_timeout_ms", offsetof(struct rpc_bdev_nvme_start_discovery, attach_timeout_ms), spdk_json_decode_uint64, true}, ++ {"ctrlr_loss_timeout_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.ctrlr_loss_timeout_sec), spdk_json_decode_int32, true}, ++ {"reconnect_delay_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.reconnect_delay_sec), spdk_json_decode_uint32, true}, ++ {"fast_io_fail_timeout_sec", offsetof(struct rpc_bdev_nvme_start_discovery, bdev_opts.fast_io_fail_timeout_sec), spdk_json_decode_uint32, true}, ++}; ++ ++struct rpc_bdev_nvme_start_discovery_ctx { ++ struct rpc_bdev_nvme_start_discovery req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_start_discovery_done(void *ctx, int status) ++{ ++ struct spdk_jsonrpc_request *request = ctx; ++ ++ if (status != 0) { ++ spdk_jsonrpc_send_error_response(request, status, spdk_strerror(-status)); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++} ++ ++static void ++rpc_bdev_nvme_start_discovery(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_start_discovery_ctx *ctx; ++ struct spdk_nvme_transport_id trid = {}; ++ size_t len, maxlen; ++ int rc; ++ spdk_bdev_nvme_start_discovery_fn cb_fn; ++ void *cb_ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.opts, sizeof(ctx->req.opts)); ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_start_discovery_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_start_discovery_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ /* Parse trstring */ ++ rc = spdk_nvme_transport_id_populate_trstring(&trid, ctx->req.trtype); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse trtype: %s\n", ctx->req.trtype); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse trtype: %s", ++ ctx->req.trtype); ++ goto cleanup; ++ } ++ ++ /* Parse trtype */ ++ rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, ctx->req.trtype); ++ assert(rc == 0); ++ ++ /* Parse traddr */ ++ maxlen = sizeof(trid.traddr); ++ len = strnlen(ctx->req.traddr, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "traddr too long: %s", ++ ctx->req.traddr); ++ goto cleanup; ++ } ++ memcpy(trid.traddr, ctx->req.traddr, len + 1); ++ ++ /* Parse adrfam */ ++ if (ctx->req.adrfam) { ++ rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, ctx->req.adrfam); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to parse adrfam: %s\n", ctx->req.adrfam); ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "Failed to parse adrfam: %s", ++ ctx->req.adrfam); ++ goto cleanup; ++ } ++ } ++ ++ /* Parse trsvcid */ ++ if (ctx->req.trsvcid) { ++ maxlen = sizeof(trid.trsvcid); ++ len = strnlen(ctx->req.trsvcid, maxlen); ++ if (len == maxlen) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, "trsvcid too long: %s", ++ 
ctx->req.trsvcid); ++ goto cleanup; ++ } ++ memcpy(trid.trsvcid, ctx->req.trsvcid, len + 1); ++ } ++ ++ if (ctx->req.hostnqn) { ++ snprintf(ctx->req.opts.hostnqn, sizeof(ctx->req.opts.hostnqn), "%s", ++ ctx->req.hostnqn); ++ } ++ ++ if (ctx->req.attach_timeout_ms != 0) { ++ ctx->req.wait_for_attach = true; ++ } ++ ++ ctx->request = request; ++ cb_fn = ctx->req.wait_for_attach ? rpc_bdev_nvme_start_discovery_done : NULL; ++ cb_ctx = ctx->req.wait_for_attach ? request : NULL; ++ rc = bdev_nvme_start_discovery(&trid, ctx->req.name, &ctx->req.opts, &ctx->req.bdev_opts, ++ ctx->req.attach_timeout_ms, false, cb_fn, cb_ctx); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ } else if (!ctx->req.wait_for_attach) { ++ rpc_bdev_nvme_start_discovery_done(request, 0); ++ } ++ ++cleanup: ++ free_rpc_bdev_nvme_start_discovery(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_start_discovery", rpc_bdev_nvme_start_discovery, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_stop_discovery { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_stop_discovery_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_stop_discovery, name), spdk_json_decode_string}, ++}; ++ ++struct rpc_bdev_nvme_stop_discovery_ctx { ++ struct rpc_bdev_nvme_stop_discovery req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_stop_discovery_done(void *cb_ctx) ++{ ++ struct rpc_bdev_nvme_stop_discovery_ctx *ctx = cb_ctx; ++ ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ free(ctx->req.name); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_stop_discovery(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_stop_discovery_ctx *ctx; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_stop_discovery_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_stop_discovery_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ rc = bdev_nvme_stop_discovery(ctx->req.name, rpc_bdev_nvme_stop_discovery_done, ctx); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ return; ++ ++cleanup: ++ free(ctx->req.name); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_stop_discovery", rpc_bdev_nvme_stop_discovery, ++ SPDK_RPC_RUNTIME) ++ ++static void ++rpc_bdev_nvme_get_discovery_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_json_write_ctx *w; ++ ++ w = spdk_jsonrpc_begin_result(request); ++ bdev_nvme_get_discovery_info(w); ++ spdk_jsonrpc_end_result(request, w); ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_discovery_info", rpc_bdev_nvme_get_discovery_info, ++ SPDK_RPC_RUNTIME) ++ ++enum error_injection_cmd_type { ++ NVME_ADMIN_CMD = 1, ++ NVME_IO_CMD, ++}; ++ ++struct rpc_add_error_injection { ++ char *name; ++ enum error_injection_cmd_type cmd_type; ++ uint8_t opc; ++ bool do_not_submit; ++ uint64_t timeout_in_us; ++ uint32_t err_count; ++ uint8_t sct; ++ uint8_t sc; ++}; ++ ++static void ++free_rpc_add_error_injection(struct rpc_add_error_injection *req) ++{ ++ free(req->name); ++} ++ ++static int 
++rpc_error_injection_decode_cmd_type(const struct spdk_json_val *val, void *out) ++{ ++ int *cmd_type = out; ++ ++ if (spdk_json_strequal(val, "admin")) { ++ *cmd_type = NVME_ADMIN_CMD; ++ } else if (spdk_json_strequal(val, "io")) { ++ *cmd_type = NVME_IO_CMD; ++ } else { ++ SPDK_ERRLOG("Invalid parameter value: cmd_type\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct spdk_json_object_decoder rpc_add_error_injection_decoders[] = { ++ { "name", offsetof(struct rpc_add_error_injection, name), spdk_json_decode_string }, ++ { "cmd_type", offsetof(struct rpc_add_error_injection, cmd_type), rpc_error_injection_decode_cmd_type }, ++ { "opc", offsetof(struct rpc_add_error_injection, opc), spdk_json_decode_uint8 }, ++ { "do_not_submit", offsetof(struct rpc_add_error_injection, do_not_submit), spdk_json_decode_bool, true }, ++ { "timeout_in_us", offsetof(struct rpc_add_error_injection, timeout_in_us), spdk_json_decode_uint64, true }, ++ { "err_count", offsetof(struct rpc_add_error_injection, err_count), spdk_json_decode_uint32, true }, ++ { "sct", offsetof(struct rpc_add_error_injection, sct), spdk_json_decode_uint8, true}, ++ { "sc", offsetof(struct rpc_add_error_injection, sc), spdk_json_decode_uint8, true}, ++}; ++ ++struct rpc_add_error_injection_ctx { ++ struct spdk_jsonrpc_request *request; ++ struct rpc_add_error_injection rpc; ++}; ++ ++static void ++rpc_add_error_injection_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct rpc_add_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response(ctx->request, status, ++ "Failed to add the error injection."); ++ } else { ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } ++ ++ free_rpc_add_error_injection(&ctx->rpc); ++ free(ctx); ++} ++ ++static void ++rpc_add_error_injection_per_channel(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); ++ struct rpc_add_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); ++ struct spdk_nvme_qpair *qpair = ctrlr_ch->qpair->qpair; ++ struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->qpair->ctrlr->ctrlr; ++ int rc = 0; ++ ++ if (qpair != NULL) { ++ rc = spdk_nvme_qpair_add_cmd_error_injection(ctrlr, qpair, ctx->rpc.opc, ++ ctx->rpc.do_not_submit, ctx->rpc.timeout_in_us, ctx->rpc.err_count, ++ ctx->rpc.sct, ctx->rpc.sc); ++ } ++ ++ spdk_for_each_channel_continue(i, rc); ++} ++ ++static void ++rpc_bdev_nvme_add_error_injection( ++ struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_add_error_injection_ctx *ctx; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ctx->rpc.err_count = 1; ++ ctx->request = request; ++ ++ if (spdk_json_decode_object(params, ++ rpc_add_error_injection_decoders, ++ SPDK_COUNTOF(rpc_add_error_injection_decoders), ++ &ctx->rpc)) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, ++ "Failed to parse the request"); ++ goto cleanup; ++ } ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->rpc.name); ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("No controller with specified name was found.\n"); ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ if (ctx->rpc.cmd_type == NVME_IO_CMD) { ++ spdk_for_each_channel(nvme_ctrlr, ++ 
rpc_add_error_injection_per_channel, ++ ctx, ++ rpc_add_error_injection_done); ++ ++ return; ++ } else { ++ rc = spdk_nvme_qpair_add_cmd_error_injection(nvme_ctrlr->ctrlr, NULL, ctx->rpc.opc, ++ ctx->rpc.do_not_submit, ctx->rpc.timeout_in_us, ctx->rpc.err_count, ++ ctx->rpc.sct, ctx->rpc.sc); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, -rc, ++ "Failed to add the error injection"); ++ } else { ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } ++ } ++ ++cleanup: ++ free_rpc_add_error_injection(&ctx->rpc); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_add_error_injection", rpc_bdev_nvme_add_error_injection, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_remove_error_injection { ++ char *name; ++ enum error_injection_cmd_type cmd_type; ++ uint8_t opc; ++}; ++ ++static void ++free_rpc_remove_error_injection(struct rpc_remove_error_injection *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_remove_error_injection_decoders[] = { ++ { "name", offsetof(struct rpc_remove_error_injection, name), spdk_json_decode_string }, ++ { "cmd_type", offsetof(struct rpc_remove_error_injection, cmd_type), rpc_error_injection_decode_cmd_type }, ++ { "opc", offsetof(struct rpc_remove_error_injection, opc), spdk_json_decode_uint8 }, ++}; ++ ++struct rpc_remove_error_injection_ctx { ++ struct spdk_jsonrpc_request *request; ++ struct rpc_remove_error_injection rpc; ++}; ++ ++static void ++rpc_remove_error_injection_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct rpc_remove_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response(ctx->request, status, ++ "Failed to remove the error injection."); ++ } else { ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } ++ ++ free_rpc_remove_error_injection(&ctx->rpc); ++ free(ctx); ++} ++ ++static void ++rpc_remove_error_injection_per_channel(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); ++ struct rpc_remove_error_injection_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); ++ struct spdk_nvme_qpair *qpair = ctrlr_ch->qpair->qpair; ++ struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->qpair->ctrlr->ctrlr; ++ ++ if (qpair != NULL) { ++ spdk_nvme_qpair_remove_cmd_error_injection(ctrlr, qpair, ctx->rpc.opc); ++ } ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++rpc_bdev_nvme_remove_error_injection(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_remove_error_injection_ctx *ctx; ++ struct nvme_ctrlr *nvme_ctrlr; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ctx->request = request; ++ ++ if (spdk_json_decode_object(params, ++ rpc_remove_error_injection_decoders, ++ SPDK_COUNTOF(rpc_remove_error_injection_decoders), ++ &ctx->rpc)) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, ++ "Failed to parse the request"); ++ goto cleanup; ++ } ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->rpc.name); ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("No controller with specified name was found.\n"); ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ if (ctx->rpc.cmd_type == NVME_IO_CMD) { ++ spdk_for_each_channel(nvme_ctrlr, ++ rpc_remove_error_injection_per_channel, ++ ctx, ++ rpc_remove_error_injection_done); ++ return; ++ } 
else { ++ spdk_nvme_qpair_remove_cmd_error_injection(nvme_ctrlr->ctrlr, NULL, ctx->rpc.opc); ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } ++ ++cleanup: ++ free_rpc_remove_error_injection(&ctx->rpc); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_remove_error_injection", rpc_bdev_nvme_remove_error_injection, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_get_io_paths { ++ char *name; ++}; ++ ++static void ++free_rpc_get_io_paths(struct rpc_get_io_paths *r) ++{ ++ free(r->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_get_io_paths_decoders[] = { ++ {"name", offsetof(struct rpc_get_io_paths, name), spdk_json_decode_string, true}, ++}; ++ ++struct rpc_get_io_paths_ctx { ++ struct rpc_get_io_paths req; ++ struct spdk_jsonrpc_request *request; ++ struct spdk_json_write_ctx *w; ++}; ++ ++static void ++rpc_bdev_nvme_get_io_paths_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct rpc_get_io_paths_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ ++ spdk_json_write_array_end(ctx->w); ++ ++ spdk_json_write_object_end(ctx->w); ++ ++ spdk_jsonrpc_end_result(ctx->request, ctx->w); ++ ++ free_rpc_get_io_paths(&ctx->req); ++ free(ctx); ++} ++ ++static void ++_rpc_bdev_nvme_get_io_paths(struct spdk_io_channel_iter *i) ++{ ++ struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_poll_group *group = spdk_io_channel_get_ctx(_ch); ++ struct rpc_get_io_paths_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_qpair *qpair; ++ struct nvme_io_path *io_path; ++ struct nvme_bdev *nbdev; ++ ++ spdk_json_write_object_begin(ctx->w); ++ ++ spdk_json_write_named_string(ctx->w, "thread", spdk_thread_get_name(spdk_get_thread())); ++ ++ spdk_json_write_named_array_begin(ctx->w, "io_paths"); ++ ++ TAILQ_FOREACH(qpair, &group->qpair_list, tailq) { ++ TAILQ_FOREACH(io_path, &qpair->io_path_list, tailq) { ++ nbdev = io_path->nvme_ns->bdev; ++ ++ if (ctx->req.name != NULL && ++ strcmp(ctx->req.name, nbdev->disk.name) != 0) { ++ continue; ++ } ++ ++ nvme_io_path_info_json(ctx->w, io_path); ++ } ++ } ++ ++ spdk_json_write_array_end(ctx->w); ++ ++ spdk_json_write_object_end(ctx->w); ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++rpc_bdev_nvme_get_io_paths(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_get_io_paths_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (params != NULL && ++ spdk_json_decode_object(params, rpc_get_io_paths_decoders, ++ SPDK_COUNTOF(rpc_get_io_paths_decoders), ++ &ctx->req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "bdev_nvme_get_io_paths requires no parameters"); ++ ++ free_rpc_get_io_paths(&ctx->req); ++ free(ctx); ++ return; ++ } ++ ++ ctx->request = request; ++ ctx->w = spdk_jsonrpc_begin_result(request); ++ ++ spdk_json_write_object_begin(ctx->w); ++ ++ spdk_json_write_named_array_begin(ctx->w, "poll_groups"); ++ ++ spdk_for_each_channel(&g_nvme_bdev_ctrlrs, ++ _rpc_bdev_nvme_get_io_paths, ++ ctx, ++ rpc_bdev_nvme_get_io_paths_done); ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_io_paths", rpc_bdev_nvme_get_io_paths, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_set_preferred_path { ++ char *name; ++ uint16_t cntlid; ++}; ++ ++static void ++free_rpc_bdev_nvme_set_preferred_path(struct rpc_bdev_nvme_set_preferred_path *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder 
rpc_bdev_nvme_set_preferred_path_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_set_preferred_path, name), spdk_json_decode_string}, ++ {"cntlid", offsetof(struct rpc_bdev_nvme_set_preferred_path, cntlid), spdk_json_decode_uint16}, ++}; ++ ++struct rpc_bdev_nvme_set_preferred_path_ctx { ++ struct rpc_bdev_nvme_set_preferred_path req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_set_preferred_path_done(void *cb_arg, int rc) ++{ ++ struct rpc_bdev_nvme_set_preferred_path_ctx *ctx = cb_arg; ++ ++ if (rc == 0) { ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc)); ++ } ++ ++ free_rpc_bdev_nvme_set_preferred_path(&ctx->req); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_set_preferred_path(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_set_preferred_path_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_set_preferred_path_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_set_preferred_path_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ ++ bdev_nvme_set_preferred_path(ctx->req.name, ctx->req.cntlid, ++ rpc_bdev_nvme_set_preferred_path_done, ctx); ++ return; ++ ++cleanup: ++ free_rpc_bdev_nvme_set_preferred_path(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_set_preferred_path", rpc_bdev_nvme_set_preferred_path, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_set_multipath_policy { ++ char *name; ++ enum bdev_nvme_multipath_policy policy; ++ enum bdev_nvme_multipath_selector selector; ++ uint32_t rr_min_io; ++}; ++ ++static void ++free_rpc_set_multipath_policy(struct rpc_set_multipath_policy *req) ++{ ++ free(req->name); ++} ++ ++static int ++rpc_decode_mp_policy(const struct spdk_json_val *val, void *out) ++{ ++ enum bdev_nvme_multipath_policy *policy = out; ++ ++ if (spdk_json_strequal(val, "active_passive") == true) { ++ *policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; ++ } else if (spdk_json_strequal(val, "active_active") == true) { ++ *policy = BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: policy\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++rpc_decode_mp_selector(const struct spdk_json_val *val, void *out) ++{ ++ enum bdev_nvme_multipath_selector *selector = out; ++ ++ if (spdk_json_strequal(val, "round_robin") == true) { ++ *selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; ++ } else if (spdk_json_strequal(val, "queue_depth") == true) { ++ *selector = BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: selector\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct spdk_json_object_decoder rpc_set_multipath_policy_decoders[] = { ++ {"name", offsetof(struct rpc_set_multipath_policy, name), spdk_json_decode_string}, ++ {"policy", offsetof(struct rpc_set_multipath_policy, policy), rpc_decode_mp_policy}, ++ {"selector", offsetof(struct rpc_set_multipath_policy, selector), rpc_decode_mp_selector, true}, ++ {"rr_min_io", offsetof(struct rpc_set_multipath_policy, rr_min_io), spdk_json_decode_uint32, true}, ++}; ++ ++struct 
rpc_set_multipath_policy_ctx { ++ struct rpc_set_multipath_policy req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_set_multipath_policy_done(void *cb_arg, int rc) ++{ ++ struct rpc_set_multipath_policy_ctx *ctx = cb_arg; ++ ++ if (rc == 0) { ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(ctx->request, rc, spdk_strerror(-rc)); ++ } ++ ++ free_rpc_set_multipath_policy(&ctx->req); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_set_multipath_policy(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_set_multipath_policy_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ ctx->req.rr_min_io = UINT32_MAX; ++ ++ if (spdk_json_decode_object(params, rpc_set_multipath_policy_decoders, ++ SPDK_COUNTOF(rpc_set_multipath_policy_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ ++ if (ctx->req.policy != BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && ctx->req.selector > 0) { ++ SPDK_ERRLOG("selector only works in active_active mode\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev_nvme_set_multipath_policy(ctx->req.name, ctx->req.policy, ctx->req.selector, ++ ctx->req.rr_min_io, ++ rpc_bdev_nvme_set_multipath_policy_done, ctx); ++ return; ++ ++cleanup: ++ free_rpc_set_multipath_policy(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_set_multipath_policy", rpc_bdev_nvme_set_multipath_policy, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_start_mdns_discovery { ++ char *name; ++ char *svcname; ++ char *hostnqn; ++ struct spdk_nvme_ctrlr_opts opts; ++ struct nvme_ctrlr_opts bdev_opts; ++}; ++ ++static void ++free_rpc_bdev_nvme_start_mdns_discovery(struct rpc_bdev_nvme_start_mdns_discovery *req) ++{ ++ free(req->name); ++ free(req->svcname); ++ free(req->hostnqn); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_start_mdns_discovery_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, name), spdk_json_decode_string}, ++ {"svcname", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, svcname), spdk_json_decode_string}, ++ {"hostnqn", offsetof(struct rpc_bdev_nvme_start_mdns_discovery, hostnqn), spdk_json_decode_string, true}, ++}; ++ ++struct rpc_bdev_nvme_start_mdns_discovery_ctx { ++ struct rpc_bdev_nvme_start_mdns_discovery req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_start_mdns_discovery(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_start_mdns_discovery_ctx *ctx; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->req.opts, sizeof(ctx->req.opts)); ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_start_mdns_discovery_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_start_mdns_discovery_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ 
"spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ if (ctx->req.hostnqn) { ++ snprintf(ctx->req.opts.hostnqn, sizeof(ctx->req.opts.hostnqn), "%s", ++ ctx->req.hostnqn); ++ } ++ ctx->request = request; ++ rc = bdev_nvme_start_mdns_discovery(ctx->req.name, ctx->req.svcname, &ctx->req.opts, ++ &ctx->req.bdev_opts); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++ ++cleanup: ++ free_rpc_bdev_nvme_start_mdns_discovery(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_start_mdns_discovery", rpc_bdev_nvme_start_mdns_discovery, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_stop_mdns_discovery { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_stop_mdns_discovery_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_stop_mdns_discovery, name), spdk_json_decode_string}, ++}; ++ ++struct rpc_bdev_nvme_stop_mdns_discovery_ctx { ++ struct rpc_bdev_nvme_stop_mdns_discovery req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++rpc_bdev_nvme_stop_mdns_discovery(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_stop_mdns_discovery_ctx *ctx; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_stop_mdns_discovery_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_stop_mdns_discovery_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ rc = bdev_nvme_stop_mdns_discovery(ctx->req.name); ++ ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++ ++cleanup: ++ free(ctx->req.name); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_stop_mdns_discovery", rpc_bdev_nvme_stop_mdns_discovery, ++ SPDK_RPC_RUNTIME) ++ ++static void ++rpc_bdev_nvme_get_mdns_discovery_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ bdev_nvme_get_mdns_discovery_info(request); ++} ++ ++SPDK_RPC_REGISTER("bdev_nvme_get_mdns_discovery_info", rpc_bdev_nvme_get_mdns_discovery_info, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_get_path_stat { ++ char *name; ++}; ++ ++struct path_stat { ++ struct spdk_bdev_io_stat stat; ++ struct spdk_nvme_transport_id trid; ++ struct nvme_ns *ns; ++}; ++ ++struct rpc_bdev_nvme_path_stat_ctx { ++ struct spdk_jsonrpc_request *request; ++ struct path_stat *path_stat; ++ uint32_t num_paths; ++ struct spdk_bdev_desc *desc; ++}; ++ ++static void ++free_rpc_get_path_stat(struct rpc_get_path_stat *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_get_path_stat_decoders[] = { ++ {"name", offsetof(struct rpc_get_path_stat, name), spdk_json_decode_string}, ++}; ++ ++static void ++dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++} ++ ++static void ++rpc_bdev_nvme_path_stat_per_channel(struct spdk_io_channel_iter *i) ++{ ++ struct rpc_bdev_nvme_path_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); ++ struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 
++ struct nvme_io_path *io_path; ++ struct path_stat *path_stat; ++ uint32_t j; ++ ++ assert(ctx->num_paths != 0); ++ ++ for (j = 0; j < ctx->num_paths; j++) { ++ path_stat = &ctx->path_stat[j]; ++ ++ STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { ++ if (path_stat->ns == io_path->nvme_ns) { ++ assert(io_path->stat != NULL); ++ spdk_bdev_add_io_stat(&path_stat->stat, io_path->stat); ++ } ++ } ++ } ++ ++ spdk_for_each_channel_continue(i, 0); ++} ++ ++static void ++rpc_bdev_nvme_path_stat_done(struct spdk_io_channel_iter *i, int status) ++{ ++ struct rpc_bdev_nvme_path_stat_ctx *ctx = spdk_io_channel_iter_get_ctx(i); ++ struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); ++ struct spdk_json_write_ctx *w; ++ struct path_stat *path_stat; ++ uint32_t j; ++ ++ assert(ctx->num_paths != 0); ++ ++ w = spdk_jsonrpc_begin_result(ctx->request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", nbdev->disk.name); ++ spdk_json_write_named_array_begin(w, "stats"); ++ ++ for (j = 0; j < ctx->num_paths; j++) { ++ path_stat = &ctx->path_stat[j]; ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_object_begin(w, "trid"); ++ nvme_bdev_dump_trid_json(&path_stat->trid, w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "stat"); ++ spdk_bdev_dump_io_stat_json(&path_stat->stat, w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ } ++ ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(ctx->request, w); ++ ++ spdk_bdev_close(ctx->desc); ++ free(ctx->path_stat); ++ free(ctx); ++} ++ ++static void ++rpc_bdev_nvme_get_path_iostat(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_get_path_stat req = {}; ++ struct spdk_bdev_desc *desc = NULL; ++ struct spdk_bdev *bdev; ++ struct nvme_bdev *nbdev; ++ struct nvme_ns *nvme_ns; ++ struct path_stat *path_stat; ++ struct rpc_bdev_nvme_path_stat_ctx *ctx; ++ struct spdk_bdev_nvme_opts opts; ++ uint32_t num_paths = 0, i = 0; ++ int rc; ++ ++ bdev_nvme_get_opts(&opts); ++ if (!opts.io_path_stat) { ++ SPDK_ERRLOG("RPC not enabled if enable_io_path_stat is false\n"); ++ spdk_jsonrpc_send_error_response(request, -EPERM, ++ "RPC not enabled if enable_io_path_stat is false"); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_get_path_stat_decoders, ++ SPDK_COUNTOF(rpc_get_path_stat_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ free_rpc_get_path_stat(&req); ++ return; ++ } ++ ++ rc = spdk_bdev_open_ext(req.name, false, dummy_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to open bdev '%s': %d\n", req.name, rc); ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ free_rpc_get_path_stat(&req); ++ return; ++ } ++ ++ free_rpc_get_path_stat(&req); ++ ++ ctx = calloc(1, sizeof(struct rpc_bdev_nvme_path_stat_ctx)); ++ if (ctx == NULL) { ++ spdk_bdev_close(desc); ++ SPDK_ERRLOG("Failed to allocate rpc_bdev_nvme_path_stat_ctx struct\n"); ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ nbdev = bdev->ctxt; ++ ++ pthread_mutex_lock(&nbdev->mutex); ++ if (nbdev->ref == 0) { ++ rc = -ENOENT; ++ goto err; ++ } ++ ++ num_paths = nbdev->ref; ++ path_stat = calloc(num_paths, sizeof(struct path_stat)); 
++ if (path_stat == NULL) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("Failed to allocate memory for path_stat.\n"); ++ goto err; ++ } ++ ++ /* store the history stat */ ++ TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { ++ assert(i < num_paths); ++ path_stat[i].ns = nvme_ns; ++ path_stat[i].trid = nvme_ns->ctrlr->active_path_id->trid; ++ ++ assert(nvme_ns->stat != NULL); ++ memcpy(&path_stat[i].stat, nvme_ns->stat, sizeof(struct spdk_bdev_io_stat)); ++ i++; ++ } ++ pthread_mutex_unlock(&nbdev->mutex); ++ ++ ctx->request = request; ++ ctx->desc = desc; ++ ctx->path_stat = path_stat; ++ ctx->num_paths = num_paths; ++ ++ spdk_for_each_channel(nbdev, ++ rpc_bdev_nvme_path_stat_per_channel, ++ ctx, ++ rpc_bdev_nvme_path_stat_done); ++ return; ++ ++err: ++ pthread_mutex_unlock(&nbdev->mutex); ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ spdk_bdev_close(desc); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_nvme_get_path_iostat", rpc_bdev_nvme_get_path_iostat, ++ SPDK_RPC_RUNTIME) +diff --git a/module/bdev/nvme/nvme_rpc.c b/module/bdev/nvme/nvme_rpc.c +index 82fc849..5bb13df 100644 +--- a/module/bdev/nvme/nvme_rpc.c ++++ b/module/bdev/nvme/nvme_rpc.c +@@ -1,463 +1,463 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +- +-#include "bdev_nvme.h" +-#include "spdk/base64.h" +- +-enum spdk_nvme_rpc_type { +- NVME_ADMIN_CMD = 1, +- NVME_IO_CMD, +-}; +- +-struct rpc_bdev_nvme_send_cmd_req { +- char *name; +- int cmd_type; +- int data_direction; +- uint32_t timeout_ms; +- uint32_t data_len; +- uint32_t md_len; +- +- struct spdk_nvme_cmd *cmdbuf; +- char *data; +- char *md; +-}; +- +-struct rpc_bdev_nvme_send_cmd_resp { +- char *cpl_text; +- char *data_text; +- char *md_text; +-}; +- +-struct rpc_bdev_nvme_send_cmd_ctx { +- struct spdk_jsonrpc_request *jsonrpc_request; +- struct rpc_bdev_nvme_send_cmd_req req; +- struct rpc_bdev_nvme_send_cmd_resp resp; +- struct nvme_ctrlr *nvme_ctrlr; +- struct spdk_io_channel *ctrlr_io_ch; +-}; +- +-static void +-free_rpc_bdev_nvme_send_cmd_ctx(struct rpc_bdev_nvme_send_cmd_ctx *ctx) +-{ +- assert(ctx != NULL); +- +- free(ctx->req.name); +- free(ctx->req.cmdbuf); +- spdk_free(ctx->req.data); +- spdk_free(ctx->req.md); +- free(ctx->resp.cpl_text); +- free(ctx->resp.data_text); +- free(ctx->resp.md_text); +- free(ctx); +-} +- +-static int +-rpc_bdev_nvme_send_cmd_resp_construct(struct rpc_bdev_nvme_send_cmd_resp *resp, +- struct rpc_bdev_nvme_send_cmd_req *req, +- const struct spdk_nvme_cpl *cpl) +-{ +- resp->cpl_text = malloc(spdk_base64_get_encoded_strlen(sizeof(*cpl)) + 1); +- if (!resp->cpl_text) { +- return -ENOMEM; +- } +- spdk_base64_urlsafe_encode(resp->cpl_text, cpl, sizeof(*cpl)); +- +- if (req->data_direction == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { +- if (req->data_len) { +- resp->data_text = malloc(spdk_base64_get_encoded_strlen(req->data_len) + 1); +- if (!resp->data_text) { +- return -ENOMEM; +- } +- spdk_base64_urlsafe_encode(resp->data_text, req->data, req->data_len); +- } +- if (req->md_len) { +- resp->md_text = malloc(spdk_base64_get_encoded_strlen(req->md_len) + 1); +- if (!resp->md_text) { +- return -ENOMEM; +- } +- spdk_base64_urlsafe_encode(resp->md_text, req->md, req->md_len); +- } +- } +- +- return 0; +-} +- +-static void +-rpc_bdev_nvme_send_cmd_complete(struct rpc_bdev_nvme_send_cmd_ctx *ctx, +- 
const struct spdk_nvme_cpl *cpl) +-{ +- struct spdk_jsonrpc_request *request = ctx->jsonrpc_request; +- struct spdk_json_write_ctx *w; +- int ret; +- +- ret = rpc_bdev_nvme_send_cmd_resp_construct(&ctx->resp, &ctx->req, cpl); +- if (ret) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- spdk_strerror(-ret)); +- goto out; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "cpl", ctx->resp.cpl_text); +- +- if (ctx->resp.data_text) { +- spdk_json_write_named_string(w, "data", ctx->resp.data_text); +- } +- +- if (ctx->resp.md_text) { +- spdk_json_write_named_string(w, "metadata", ctx->resp.md_text); +- } +- +- spdk_json_write_object_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-out: +- free_rpc_bdev_nvme_send_cmd_ctx(ctx); +- return; +-} +- +-static void +-nvme_rpc_bdev_nvme_cb(void *ref, const struct spdk_nvme_cpl *cpl) +-{ +- struct rpc_bdev_nvme_send_cmd_ctx *ctx = (struct rpc_bdev_nvme_send_cmd_ctx *)ref; +- +- if (ctx->ctrlr_io_ch) { +- spdk_put_io_channel(ctx->ctrlr_io_ch); +- ctx->ctrlr_io_ch = NULL; +- } +- +- rpc_bdev_nvme_send_cmd_complete(ctx, cpl); +-} +- +-static int +-nvme_rpc_admin_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, +- void *buf, uint32_t nbytes, uint32_t timeout_ms) +-{ +- struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; +- int ret; +- +- ret = spdk_nvme_ctrlr_cmd_admin_raw(_nvme_ctrlr->ctrlr, cmd, buf, +- nbytes, nvme_rpc_bdev_nvme_cb, ctx); +- +- return ret; +-} +- +-static int +-nvme_rpc_io_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, +- void *buf, uint32_t nbytes, void *md_buf, uint32_t md_len, +- uint32_t timeout_ms) +-{ +- struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; +- struct spdk_nvme_qpair *io_qpair; +- int ret; +- +- ctx->ctrlr_io_ch = spdk_get_io_channel(_nvme_ctrlr); +- io_qpair = bdev_nvme_get_io_qpair(ctx->ctrlr_io_ch); +- +- ret = spdk_nvme_ctrlr_cmd_io_raw_with_md(_nvme_ctrlr->ctrlr, io_qpair, +- cmd, buf, nbytes, md_buf, nvme_rpc_bdev_nvme_cb, ctx); +- if (ret) { +- spdk_put_io_channel(ctx->ctrlr_io_ch); +- } +- +- return ret; +- +-} +- +-static int +-rpc_bdev_nvme_send_cmd_exec(struct rpc_bdev_nvme_send_cmd_ctx *ctx) +-{ +- struct rpc_bdev_nvme_send_cmd_req *req = &ctx->req; +- int ret = -EINVAL; +- +- switch (req->cmd_type) { +- case NVME_ADMIN_CMD: +- ret = nvme_rpc_admin_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, +- req->data_len, req->timeout_ms); +- break; +- case NVME_IO_CMD: +- ret = nvme_rpc_io_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, +- req->data_len, req->md, req->md_len, req->timeout_ms); +- break; +- } +- +- return ret; +-} +- +-static int +-rpc_decode_cmd_type(const struct spdk_json_val *val, void *out) +-{ +- int *cmd_type = out; +- +- if (spdk_json_strequal(val, "admin") == true) { +- *cmd_type = NVME_ADMIN_CMD; +- } else if (spdk_json_strequal(val, "io") == true) { +- *cmd_type = NVME_IO_CMD; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: cmd_type\n"); +- return -EINVAL; +- } +- +- return 0; +-} +- +-static int +-rpc_decode_data_direction(const struct spdk_json_val *val, void *out) +-{ +- int *data_direction = out; +- +- if (spdk_json_strequal(val, "h2c") == true) { +- *data_direction = SPDK_NVME_DATA_HOST_TO_CONTROLLER; +- } else if (spdk_json_strequal(val, "c2h") == true) { +- *data_direction = SPDK_NVME_DATA_CONTROLLER_TO_HOST; +- } else { +- SPDK_NOTICELOG("Invalid parameter value: data_direction\n"); +- return -EINVAL; +- } +- +- 
return 0; +-} +- +-static int +-rpc_decode_cmdbuf(const struct spdk_json_val *val, void *out) +-{ +- char *text = NULL; +- size_t text_strlen, raw_len; +- struct spdk_nvme_cmd *cmdbuf, **_cmdbuf = out; +- int rc; +- +- rc = spdk_json_decode_string(val, &text); +- if (rc) { +- return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; +- } +- +- text_strlen = strlen(text); +- raw_len = spdk_base64_get_decoded_len(text_strlen); +- cmdbuf = malloc(raw_len); +- if (!cmdbuf) { +- rc = -ENOMEM; +- goto out; +- } +- +- rc = spdk_base64_urlsafe_decode(cmdbuf, &raw_len, text); +- if (rc) { +- free(cmdbuf); +- goto out; +- } +- if (raw_len != sizeof(*cmdbuf)) { +- rc = -EINVAL; +- free(cmdbuf); +- goto out; +- } +- +- *_cmdbuf = cmdbuf; +- +-out: +- free(text); +- return rc; +-} +- +-static int +-rpc_decode_data(const struct spdk_json_val *val, void *out) +-{ +- struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; +- char *text = NULL; +- size_t text_strlen; +- int rc; +- +- rc = spdk_json_decode_string(val, &text); +- if (rc) { +- return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; +- } +- text_strlen = strlen(text); +- +- if (req->data_len) { +- /* data_len is decoded by param "data_len" */ +- if (req->data_len != spdk_base64_get_decoded_len(text_strlen)) { +- rc = -EINVAL; +- goto out; +- } +- } else { +- req->data_len = spdk_base64_get_decoded_len(text_strlen); +- req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, +- NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!req->data) { +- rc = -ENOMEM; +- goto out; +- } +- } +- +- rc = spdk_base64_urlsafe_decode(req->data, (size_t *)&req->data_len, text); +- +-out: +- free(text); +- return rc; +-} +- +-static int +-rpc_decode_data_len(const struct spdk_json_val *val, void *out) +-{ +- struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; +- uint32_t data_len; +- int rc; +- +- rc = spdk_json_decode_uint32(val, &data_len); +- if (rc) { +- return rc; +- } +- +- if (req->data_len) { +- /* data_len is decoded by param "data" */ +- if (req->data_len != data_len) { +- rc = -EINVAL; +- } +- } else { +- req->data_len = data_len; +- req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, +- NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!req->data) { +- rc = -ENOMEM; +- } +- } +- +- return rc; +-} +- +-static int +-rpc_decode_metadata(const struct spdk_json_val *val, void *out) +-{ +- struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; +- char *text = NULL; +- size_t text_strlen; +- int rc; +- +- rc = spdk_json_decode_string(val, &text); +- if (rc) { +- return val->type == SPDK_JSON_VAL_STRING ? 
-ENOMEM : -EINVAL; +- } +- text_strlen = strlen(text); +- +- if (req->md_len) { +- /* md_len is decoded by param "metadata_len" */ +- if (req->md_len != spdk_base64_get_decoded_len(text_strlen)) { +- rc = -EINVAL; +- goto out; +- } +- } else { +- req->md_len = spdk_base64_get_decoded_len(text_strlen); +- req->md = spdk_malloc(req->md_len, 0x1000, NULL, +- SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!req->md) { +- rc = -ENOMEM; +- goto out; +- } +- } +- +- rc = spdk_base64_urlsafe_decode(req->md, (size_t *)&req->md_len, text); +- +-out: +- free(text); +- return rc; +-} +- +-static int +-rpc_decode_metadata_len(const struct spdk_json_val *val, void *out) +-{ +- struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; +- uint32_t md_len; +- int rc; +- +- rc = spdk_json_decode_uint32(val, &md_len); +- if (rc) { +- return rc; +- } +- +- if (req->md_len) { +- /* md_len is decoded by param "metadata" */ +- if (req->md_len != md_len) { +- rc = -EINVAL; +- } +- } else { +- req->md_len = md_len; +- req->md = spdk_malloc(req->md_len, 0x1000, NULL, +- SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (!req->md) { +- rc = -ENOMEM; +- } +- } +- +- return rc; +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_send_cmd_req_decoders[] = { +- {"name", offsetof(struct rpc_bdev_nvme_send_cmd_req, name), spdk_json_decode_string}, +- {"cmd_type", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmd_type), rpc_decode_cmd_type}, +- {"data_direction", offsetof(struct rpc_bdev_nvme_send_cmd_req, data_direction), rpc_decode_data_direction}, +- {"cmdbuf", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmdbuf), rpc_decode_cmdbuf}, +- {"timeout_ms", offsetof(struct rpc_bdev_nvme_send_cmd_req, timeout_ms), spdk_json_decode_uint32, true}, +- {"data_len", 0, rpc_decode_data_len, true}, +- {"metadata_len", 0, rpc_decode_metadata_len, true}, +- {"data", 0, rpc_decode_data, true}, +- {"metadata", 0, rpc_decode_metadata, true}, +-}; +- +-static void +-rpc_bdev_nvme_send_cmd(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_send_cmd_ctx *ctx; +- int ret, error_code; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- SPDK_ERRLOG("Failed at Malloc ctx\n"); +- error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; +- ret = -ENOMEM; +- goto invalid; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_send_cmd_req_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_send_cmd_req_decoders), +- &ctx->req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; +- ret = -EINVAL; +- goto invalid; +- } +- +- ctx->nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->req.name); +- if (ctx->nvme_ctrlr == NULL) { +- SPDK_ERRLOG("Failed at device lookup\n"); +- error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; +- ret = -EINVAL; +- goto invalid; +- } +- +- ctx->jsonrpc_request = request; +- +- ret = rpc_bdev_nvme_send_cmd_exec(ctx); +- if (ret < 0) { +- SPDK_NOTICELOG("Failed at rpc_bdev_nvme_send_cmd_exec\n"); +- error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; +- goto invalid; +- } +- +- return; +- +-invalid: +- if (ctx != NULL) { +- free_rpc_bdev_nvme_send_cmd_ctx(ctx); +- } +- spdk_jsonrpc_send_error_response(request, error_code, spdk_strerror(-ret)); +-} +-SPDK_RPC_REGISTER("bdev_nvme_send_cmd", rpc_bdev_nvme_send_cmd, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++ ++#include "bdev_nvme.h" ++#include "spdk/base64.h" ++ ++enum spdk_nvme_rpc_type { ++ NVME_ADMIN_CMD = 1, ++ NVME_IO_CMD, ++}; ++ ++struct rpc_bdev_nvme_send_cmd_req { ++ char *name; ++ int cmd_type; ++ int data_direction; ++ uint32_t timeout_ms; ++ uint32_t data_len; ++ uint32_t md_len; ++ ++ struct spdk_nvme_cmd *cmdbuf; ++ char *data; ++ char *md; ++}; ++ ++struct rpc_bdev_nvme_send_cmd_resp { ++ char *cpl_text; ++ char *data_text; ++ char *md_text; ++}; ++ ++struct rpc_bdev_nvme_send_cmd_ctx { ++ struct spdk_jsonrpc_request *jsonrpc_request; ++ struct rpc_bdev_nvme_send_cmd_req req; ++ struct rpc_bdev_nvme_send_cmd_resp resp; ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct spdk_io_channel *ctrlr_io_ch; ++}; ++ ++static void ++free_rpc_bdev_nvme_send_cmd_ctx(struct rpc_bdev_nvme_send_cmd_ctx *ctx) ++{ ++ assert(ctx != NULL); ++ ++ free(ctx->req.name); ++ free(ctx->req.cmdbuf); ++ spdk_free(ctx->req.data); ++ spdk_free(ctx->req.md); ++ free(ctx->resp.cpl_text); ++ free(ctx->resp.data_text); ++ free(ctx->resp.md_text); ++ free(ctx); ++} ++ ++static int ++rpc_bdev_nvme_send_cmd_resp_construct(struct rpc_bdev_nvme_send_cmd_resp *resp, ++ struct rpc_bdev_nvme_send_cmd_req *req, ++ const struct spdk_nvme_cpl *cpl) ++{ ++ resp->cpl_text = malloc(spdk_base64_get_encoded_strlen(sizeof(*cpl)) + 1); ++ if (!resp->cpl_text) { ++ return -ENOMEM; ++ } ++ spdk_base64_urlsafe_encode(resp->cpl_text, cpl, sizeof(*cpl)); ++ ++ if (req->data_direction == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { ++ if (req->data_len) { ++ resp->data_text = malloc(spdk_base64_get_encoded_strlen(req->data_len) + 1); ++ if (!resp->data_text) { ++ return -ENOMEM; ++ } ++ spdk_base64_urlsafe_encode(resp->data_text, req->data, req->data_len); ++ } ++ if (req->md_len) { ++ resp->md_text = malloc(spdk_base64_get_encoded_strlen(req->md_len) + 1); ++ if (!resp->md_text) { ++ return -ENOMEM; ++ } ++ spdk_base64_urlsafe_encode(resp->md_text, req->md, req->md_len); ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++rpc_bdev_nvme_send_cmd_complete(struct rpc_bdev_nvme_send_cmd_ctx *ctx, ++ const struct spdk_nvme_cpl *cpl) ++{ ++ struct spdk_jsonrpc_request *request = ctx->jsonrpc_request; ++ struct spdk_json_write_ctx *w; ++ int ret; ++ ++ ret = rpc_bdev_nvme_send_cmd_resp_construct(&ctx->resp, &ctx->req, cpl); ++ if (ret) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-ret)); ++ goto out; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "cpl", ctx->resp.cpl_text); ++ ++ if (ctx->resp.data_text) { ++ spdk_json_write_named_string(w, "data", ctx->resp.data_text); ++ } ++ ++ if (ctx->resp.md_text) { ++ spdk_json_write_named_string(w, "metadata", ctx->resp.md_text); ++ } ++ ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++out: ++ free_rpc_bdev_nvme_send_cmd_ctx(ctx); ++ return; ++} ++ ++static void ++nvme_rpc_bdev_nvme_cb(void *ref, const struct spdk_nvme_cpl *cpl) ++{ ++ struct rpc_bdev_nvme_send_cmd_ctx *ctx = (struct rpc_bdev_nvme_send_cmd_ctx *)ref; ++ ++ if (ctx->ctrlr_io_ch) { ++ spdk_put_io_channel(ctx->ctrlr_io_ch); ++ ctx->ctrlr_io_ch = NULL; ++ } ++ ++ rpc_bdev_nvme_send_cmd_complete(ctx, cpl); ++} ++ ++static int ++nvme_rpc_admin_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, ++ void 
*buf, uint32_t nbytes, uint32_t timeout_ms) ++{ ++ struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; ++ int ret; ++ ++ ret = spdk_nvme_ctrlr_cmd_admin_raw(_nvme_ctrlr->ctrlr, cmd, buf, ++ nbytes, nvme_rpc_bdev_nvme_cb, ctx); ++ ++ return ret; ++} ++ ++static int ++nvme_rpc_io_cmd_bdev_nvme(struct rpc_bdev_nvme_send_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, ++ void *buf, uint32_t nbytes, void *md_buf, uint32_t md_len, ++ uint32_t timeout_ms) ++{ ++ struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; ++ struct spdk_nvme_qpair *io_qpair; ++ int ret; ++ ++ ctx->ctrlr_io_ch = spdk_get_io_channel(_nvme_ctrlr); ++ io_qpair = bdev_nvme_get_io_qpair(ctx->ctrlr_io_ch); ++ ++ ret = spdk_nvme_ctrlr_cmd_io_raw_with_md(_nvme_ctrlr->ctrlr, io_qpair, ++ cmd, buf, nbytes, md_buf, nvme_rpc_bdev_nvme_cb, ctx); ++ if (ret) { ++ spdk_put_io_channel(ctx->ctrlr_io_ch); ++ } ++ ++ return ret; ++ ++} ++ ++static int ++rpc_bdev_nvme_send_cmd_exec(struct rpc_bdev_nvme_send_cmd_ctx *ctx) ++{ ++ struct rpc_bdev_nvme_send_cmd_req *req = &ctx->req; ++ int ret = -EINVAL; ++ ++ switch (req->cmd_type) { ++ case NVME_ADMIN_CMD: ++ ret = nvme_rpc_admin_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, ++ req->data_len, req->timeout_ms); ++ break; ++ case NVME_IO_CMD: ++ ret = nvme_rpc_io_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, ++ req->data_len, req->md, req->md_len, req->timeout_ms); ++ break; ++ } ++ ++ return ret; ++} ++ ++static int ++rpc_decode_cmd_type(const struct spdk_json_val *val, void *out) ++{ ++ int *cmd_type = out; ++ ++ if (spdk_json_strequal(val, "admin") == true) { ++ *cmd_type = NVME_ADMIN_CMD; ++ } else if (spdk_json_strequal(val, "io") == true) { ++ *cmd_type = NVME_IO_CMD; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: cmd_type\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++rpc_decode_data_direction(const struct spdk_json_val *val, void *out) ++{ ++ int *data_direction = out; ++ ++ if (spdk_json_strequal(val, "h2c") == true) { ++ *data_direction = SPDK_NVME_DATA_HOST_TO_CONTROLLER; ++ } else if (spdk_json_strequal(val, "c2h") == true) { ++ *data_direction = SPDK_NVME_DATA_CONTROLLER_TO_HOST; ++ } else { ++ SPDK_NOTICELOG("Invalid parameter value: data_direction\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++rpc_decode_cmdbuf(const struct spdk_json_val *val, void *out) ++{ ++ char *text = NULL; ++ size_t text_strlen, raw_len; ++ struct spdk_nvme_cmd *cmdbuf, **_cmdbuf = out; ++ int rc; ++ ++ rc = spdk_json_decode_string(val, &text); ++ if (rc) { ++ return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; ++ } ++ ++ text_strlen = strlen(text); ++ raw_len = spdk_base64_get_decoded_len(text_strlen); ++ cmdbuf = malloc(raw_len); ++ if (!cmdbuf) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ ++ rc = spdk_base64_urlsafe_decode(cmdbuf, &raw_len, text); ++ if (rc) { ++ free(cmdbuf); ++ goto out; ++ } ++ if (raw_len != sizeof(*cmdbuf)) { ++ rc = -EINVAL; ++ free(cmdbuf); ++ goto out; ++ } ++ ++ *_cmdbuf = cmdbuf; ++ ++out: ++ free(text); ++ return rc; ++} ++ ++static int ++rpc_decode_data(const struct spdk_json_val *val, void *out) ++{ ++ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; ++ char *text = NULL; ++ size_t text_strlen; ++ int rc; ++ ++ rc = spdk_json_decode_string(val, &text); ++ if (rc) { ++ return val->type == SPDK_JSON_VAL_STRING ? 
-ENOMEM : -EINVAL; ++ } ++ text_strlen = strlen(text); ++ ++ if (req->data_len) { ++ /* data_len is decoded by param "data_len" */ ++ if (req->data_len != spdk_base64_get_decoded_len(text_strlen)) { ++ rc = -EINVAL; ++ goto out; ++ } ++ } else { ++ req->data_len = spdk_base64_get_decoded_len(text_strlen); ++ req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, ++ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!req->data) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ rc = spdk_base64_urlsafe_decode(req->data, (size_t *)&req->data_len, text); ++ ++out: ++ free(text); ++ return rc; ++} ++ ++static int ++rpc_decode_data_len(const struct spdk_json_val *val, void *out) ++{ ++ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; ++ uint32_t data_len; ++ int rc; ++ ++ rc = spdk_json_decode_uint32(val, &data_len); ++ if (rc) { ++ return rc; ++ } ++ ++ if (req->data_len) { ++ /* data_len is decoded by param "data" */ ++ if (req->data_len != data_len) { ++ rc = -EINVAL; ++ } ++ } else { ++ req->data_len = data_len; ++ req->data = spdk_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, ++ NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!req->data) { ++ rc = -ENOMEM; ++ } ++ } ++ ++ return rc; ++} ++ ++static int ++rpc_decode_metadata(const struct spdk_json_val *val, void *out) ++{ ++ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; ++ char *text = NULL; ++ size_t text_strlen; ++ int rc; ++ ++ rc = spdk_json_decode_string(val, &text); ++ if (rc) { ++ return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; ++ } ++ text_strlen = strlen(text); ++ ++ if (req->md_len) { ++ /* md_len is decoded by param "metadata_len" */ ++ if (req->md_len != spdk_base64_get_decoded_len(text_strlen)) { ++ rc = -EINVAL; ++ goto out; ++ } ++ } else { ++ req->md_len = spdk_base64_get_decoded_len(text_strlen); ++ req->md = spdk_malloc(req->md_len, 0x1000, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!req->md) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ rc = spdk_base64_urlsafe_decode(req->md, (size_t *)&req->md_len, text); ++ ++out: ++ free(text); ++ return rc; ++} ++ ++static int ++rpc_decode_metadata_len(const struct spdk_json_val *val, void *out) ++{ ++ struct rpc_bdev_nvme_send_cmd_req *req = (struct rpc_bdev_nvme_send_cmd_req *)out; ++ uint32_t md_len; ++ int rc; ++ ++ rc = spdk_json_decode_uint32(val, &md_len); ++ if (rc) { ++ return rc; ++ } ++ ++ if (req->md_len) { ++ /* md_len is decoded by param "metadata" */ ++ if (req->md_len != md_len) { ++ rc = -EINVAL; ++ } ++ } else { ++ req->md_len = md_len; ++ req->md = spdk_malloc(req->md_len, 0x1000, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (!req->md) { ++ rc = -ENOMEM; ++ } ++ } ++ ++ return rc; ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_send_cmd_req_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_nvme_send_cmd_req, name), spdk_json_decode_string}, ++ {"cmd_type", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmd_type), rpc_decode_cmd_type}, ++ {"data_direction", offsetof(struct rpc_bdev_nvme_send_cmd_req, data_direction), rpc_decode_data_direction}, ++ {"cmdbuf", offsetof(struct rpc_bdev_nvme_send_cmd_req, cmdbuf), rpc_decode_cmdbuf}, ++ {"timeout_ms", offsetof(struct rpc_bdev_nvme_send_cmd_req, timeout_ms), spdk_json_decode_uint32, true}, ++ {"data_len", 0, rpc_decode_data_len, true}, ++ {"metadata_len", 0, rpc_decode_metadata_len, true}, ++ {"data", 0, rpc_decode_data, true}, ++ {"metadata", 
0, rpc_decode_metadata, true}, ++}; ++ ++static void ++rpc_bdev_nvme_send_cmd(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_send_cmd_ctx *ctx; ++ int ret, error_code; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("Failed at Malloc ctx\n"); ++ error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; ++ ret = -ENOMEM; ++ goto invalid; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_send_cmd_req_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_send_cmd_req_decoders), ++ &ctx->req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; ++ ret = -EINVAL; ++ goto invalid; ++ } ++ ++ ctx->nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->req.name); ++ if (ctx->nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("Failed at device lookup\n"); ++ error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; ++ ret = -EINVAL; ++ goto invalid; ++ } ++ ++ ctx->jsonrpc_request = request; ++ ++ ret = rpc_bdev_nvme_send_cmd_exec(ctx); ++ if (ret < 0) { ++ SPDK_NOTICELOG("Failed at rpc_bdev_nvme_send_cmd_exec\n"); ++ error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; ++ goto invalid; ++ } ++ ++ return; ++ ++invalid: ++ if (ctx != NULL) { ++ free_rpc_bdev_nvme_send_cmd_ctx(ctx); ++ } ++ spdk_jsonrpc_send_error_response(request, error_code, spdk_strerror(-ret)); ++} ++SPDK_RPC_REGISTER("bdev_nvme_send_cmd", rpc_bdev_nvme_send_cmd, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/nvme/vbdev_opal.c b/module/bdev/nvme/vbdev_opal.c +index 863ca54..3d6d0ae 100644 +--- a/module/bdev/nvme/vbdev_opal.c ++++ b/module/bdev/nvme/vbdev_opal.c +@@ -1,613 +1,613 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/opal.h" +-#include "spdk/bdev_module.h" +-#include "vbdev_opal.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +- +-/* OPAL locking range only supports operations on nsid=1 for now */ +-#define NSID_SUPPORTED 1 +- +-struct opal_vbdev { +- char *name; +- struct nvme_ctrlr *nvme_ctrlr; +- struct spdk_opal_dev *opal_dev; +- struct spdk_bdev_part *bdev_part; +- +- uint8_t locking_range_id; +- uint64_t range_start; +- uint64_t range_length; +- struct vbdev_opal_part_base *opal_base; +- +- TAILQ_ENTRY(opal_vbdev) tailq; +-}; +- +-static TAILQ_HEAD(, opal_vbdev) g_opal_vbdev = +- TAILQ_HEAD_INITIALIZER(g_opal_vbdev); +- +-struct vbdev_opal_bdev_io { +- struct spdk_io_channel *ch; +- struct spdk_bdev_io *bdev_io; +- struct spdk_bdev_io_wait_entry bdev_io_wait; +-}; +- +-struct vbdev_opal_channel { +- struct spdk_bdev_part_channel part_ch; +-}; +- +-struct vbdev_opal_part_base { +- char *nvme_ctrlr_name; +- struct spdk_bdev_part_base *part_base; +- SPDK_BDEV_PART_TAILQ part_tailq; +- TAILQ_ENTRY(vbdev_opal_part_base) tailq; +-}; +- +-static TAILQ_HEAD(, vbdev_opal_part_base) g_opal_base = TAILQ_HEAD_INITIALIZER(g_opal_base); +- +-static void _vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); +- +-static void vbdev_opal_examine(struct spdk_bdev *bdev); +- +-static void +-vbdev_opal_delete(struct opal_vbdev *opal_bdev) +-{ +- TAILQ_REMOVE(&g_opal_vbdev, opal_bdev, tailq); +- free(opal_bdev->name); +- free(opal_bdev); +- opal_bdev = NULL; +-} +- +-static void +-vbdev_opal_clear(void) +-{ +- struct opal_vbdev *opal_bdev, *tmp; +- +- TAILQ_FOREACH_SAFE(opal_bdev, &g_opal_vbdev, tailq, tmp) { +- vbdev_opal_delete(opal_bdev); +- } +-} +- +-static int +-vbdev_opal_init(void) +-{ +- /* TODO */ +- return 0; +-} +- +-static 
void +-vbdev_opal_fini(void) +-{ +- vbdev_opal_clear(); +-} +- +-static int +-vbdev_opal_get_ctx_size(void) +-{ +- return sizeof(struct vbdev_opal_bdev_io); +-} +- +-/* delete all the config of the same base bdev */ +-static void +-vbdev_opal_delete_all_base_config(struct vbdev_opal_part_base *base) +-{ +- char *nvme_ctrlr_name = base->nvme_ctrlr_name; +- struct opal_vbdev *bdev, *tmp_bdev; +- +- TAILQ_FOREACH_SAFE(bdev, &g_opal_vbdev, tailq, tmp_bdev) { +- if (!strcmp(nvme_ctrlr_name, bdev->nvme_ctrlr->nbdev_ctrlr->name)) { +- vbdev_opal_delete(bdev); +- } +- } +-} +- +-static int +-_vbdev_opal_destruct(void *ctx) +-{ +- struct spdk_bdev_part *part = ctx; +- +- return spdk_bdev_part_free(part); +-} +- +-static void +-vbdev_opal_base_free(void *ctx) +-{ +- struct vbdev_opal_part_base *base = ctx; +- +- TAILQ_REMOVE(&g_opal_base, base, tailq); +- +- free(base->nvme_ctrlr_name); +- free(base); +-} +- +-static void +-vbdev_opal_resubmit_io(void *arg) +-{ +- struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)arg; +- +- _vbdev_opal_submit_request(io_ctx->ch, io_ctx->bdev_io); +-} +- +-static void +-vbdev_opal_queue_io(struct vbdev_opal_bdev_io *io_ctx) +-{ +- struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(io_ctx->ch); +- int rc; +- +- io_ctx->bdev_io_wait.bdev = io_ctx->bdev_io->bdev; +- io_ctx->bdev_io_wait.cb_fn = vbdev_opal_resubmit_io; +- io_ctx->bdev_io_wait.cb_arg = io_ctx; +- +- rc = spdk_bdev_queue_io_wait(io_ctx->bdev_io->bdev, ch->part_ch.base_ch, &io_ctx->bdev_io_wait); +- +- if (rc != 0) { +- SPDK_ERRLOG("Queue io failed in vbdev_opal_queue_io: %d\n", rc); +- spdk_bdev_io_complete(io_ctx->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-_vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +-{ +- struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(_ch); +- struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)bdev_io->driver_ctx; +- int rc; +- +- rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); +- if (rc) { +- if (rc == -ENOMEM) { +- SPDK_DEBUGLOG(vbdev_opal, "opal: no memory, queue io.\n"); +- io_ctx->ch = _ch; +- io_ctx->bdev_io = bdev_io; +- vbdev_opal_queue_io(io_ctx); +- } else { +- SPDK_ERRLOG("opal: error on io submission, rc=%d.\n", rc); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- } +-} +- +-static void +-vbdev_opal_io_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- _vbdev_opal_submit_request(ch, bdev_io); +-} +- +-static void +-vbdev_opal_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, vbdev_opal_io_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- default: +- _vbdev_opal_submit_request(ch, bdev_io); +- break; +- } +-} +- +-struct spdk_opal_locking_range_info * +-vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, const char *password) +-{ +- struct opal_vbdev *vbdev; +- struct nvme_ctrlr *nvme_ctrlr; +- int locking_range_id; +- int rc; +- +- TAILQ_FOREACH(vbdev, &g_opal_vbdev, tailq) { +- if (strcmp(vbdev->name, opal_bdev_name) == 0) { +- break; +- } +- } +- +- if (vbdev == NULL) { +- SPDK_ERRLOG("%s not found\n", opal_bdev_name); +- return NULL; +- } +- +- nvme_ctrlr = vbdev->nvme_ctrlr; +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("can't find 
nvme_ctrlr of %s\n", vbdev->name); +- return NULL; +- } +- +- locking_range_id = vbdev->locking_range_id; +- rc = spdk_opal_cmd_get_locking_range_info(nvme_ctrlr->opal_dev, password, +- OPAL_ADMIN1, locking_range_id); +- if (rc) { +- SPDK_ERRLOG("Get locking range info error: %d\n", rc); +- return NULL; +- } +- +- return spdk_opal_get_locking_range_info(nvme_ctrlr->opal_dev, locking_range_id); +-} +- +-static int +-vbdev_opal_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct spdk_bdev_part *part = ctx; +- struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); +- uint64_t offset = spdk_bdev_part_get_offset_blocks(part); +- +- spdk_json_write_named_object_begin(w, "opal"); +- +- spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); +- spdk_json_write_named_uint64(w, "offset_blocks", offset); +- +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-static void +-vbdev_opal_base_bdev_hotremove_cb(void *_part_base) +-{ +- struct spdk_bdev_part_base *part_base = _part_base; +- struct vbdev_opal_part_base *base = spdk_bdev_part_base_get_ctx(part_base); +- +- spdk_bdev_part_base_hotremove(part_base, spdk_bdev_part_base_get_tailq(part_base)); +- vbdev_opal_delete_all_base_config(base); +-} +- +-static bool +-vbdev_opal_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- struct spdk_bdev_part *part = ctx; +- struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); +- +- return spdk_bdev_io_type_supported(base_bdev, io_type); +-} +- +-static struct spdk_bdev_fn_table opal_vbdev_fn_table = { +- .destruct = _vbdev_opal_destruct, +- .submit_request = vbdev_opal_submit_request, +- .io_type_supported = vbdev_opal_io_type_supported, +- .dump_info_json = vbdev_opal_dump_info_json, +- .write_config_json = NULL, +-}; +- +-static struct spdk_bdev_module opal_if = { +- .name = "opal", +- .module_init = vbdev_opal_init, +- .module_fini = vbdev_opal_fini, +- .get_ctx_size = vbdev_opal_get_ctx_size, +- .examine_config = vbdev_opal_examine, +- .config_json = NULL, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(opal, &opal_if) +- +-int +-vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, +- uint64_t range_start, uint64_t range_length, const char *password) +-{ +- int rc; +- char *opal_vbdev_name; +- char *base_bdev_name; +- struct nvme_ctrlr *nvme_ctrlr; +- struct opal_vbdev *opal_bdev; +- struct vbdev_opal_part_base *opal_part_base = NULL; +- struct spdk_bdev_part *part_bdev; +- struct nvme_bdev *nvme_bdev; +- struct nvme_ns *nvme_ns; +- +- if (nsid != NSID_SUPPORTED) { +- SPDK_ERRLOG("nsid %d not supported", nsid); +- return -EINVAL; +- } +- +- nvme_ctrlr = nvme_ctrlr_get_by_name(nvme_ctrlr_name); +- if (!nvme_ctrlr) { +- SPDK_ERRLOG("get nvme ctrlr failed\n"); +- return -ENODEV; +- } +- +- if (!nvme_ctrlr->opal_dev) { +- SPDK_ERRLOG("Opal not supported\n"); +- return -ENOTSUP; +- } +- +- opal_bdev = calloc(1, sizeof(struct opal_vbdev)); +- if (!opal_bdev) { +- SPDK_ERRLOG("allocation for opal_bdev failed\n"); +- return -ENOMEM; +- } +- +- opal_bdev->locking_range_id = locking_range_id; +- opal_bdev->range_start = range_start; +- opal_bdev->range_length = range_length; +- +- opal_bdev->nvme_ctrlr = nvme_ctrlr; +- opal_bdev->opal_dev = nvme_ctrlr->opal_dev; +- +- nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); +- if (nvme_ns == NULL) { +- free(opal_bdev); +- return -ENODEV; +- } +- +- nvme_bdev = nvme_ns->bdev; +- assert(nvme_bdev != NULL); +- base_bdev_name = nvme_bdev->disk.name; +- +- /* traverse base 
list to see if part_base is already create for this base bdev */ +- TAILQ_FOREACH(opal_part_base, &g_opal_base, tailq) { +- if (!strcmp(spdk_bdev_part_base_get_bdev_name(opal_part_base->part_base), base_bdev_name)) { +- break; +- } +- } +- +- /* If there is not a corresponding opal_part_base, a new opal_part_base will be created. +- For each new part_base, there will be one tailq to store all the parts of this base */ +- if (opal_part_base == NULL) { +- opal_part_base = calloc(1, sizeof(*opal_part_base)); +- if (opal_part_base == NULL) { +- SPDK_ERRLOG("Could not allocate opal_part_base\n"); +- free(opal_bdev); +- return -ENOMEM; +- } +- TAILQ_INIT(&opal_part_base->part_tailq); +- +- rc = spdk_bdev_part_base_construct_ext(base_bdev_name, +- vbdev_opal_base_bdev_hotremove_cb, &opal_if, +- &opal_vbdev_fn_table, &opal_part_base->part_tailq, +- vbdev_opal_base_free, opal_part_base, +- sizeof(struct vbdev_opal_channel), NULL, NULL, +- &opal_part_base->part_base); +- if (rc != 0) { +- if (rc != -ENODEV) { +- SPDK_ERRLOG("Could not allocate part_base\n"); +- } +- free(opal_bdev); +- free(opal_part_base); +- return rc; +- } +- opal_part_base->nvme_ctrlr_name = strdup(nvme_ctrlr_name); +- if (opal_part_base->nvme_ctrlr_name == NULL) { +- free(opal_bdev); +- spdk_bdev_part_base_free(opal_part_base->part_base); +- return -ENOMEM; +- } +- +- TAILQ_INSERT_TAIL(&g_opal_base, opal_part_base, tailq); +- } +- assert(opal_part_base != NULL); +- opal_bdev->opal_base = opal_part_base; +- +- part_bdev = calloc(1, sizeof(struct spdk_bdev_part)); +- if (!part_bdev) { +- SPDK_ERRLOG("Could not allocate part_bdev\n"); +- free(opal_bdev); +- return -ENOMEM; +- } +- +- TAILQ_INSERT_TAIL(&g_opal_vbdev, opal_bdev, tailq); +- opal_vbdev_name = spdk_sprintf_alloc("%sr%" PRIu8, base_bdev_name, +- opal_bdev->locking_range_id); /* e.g.: nvme0n1r1 */ +- if (opal_vbdev_name == NULL) { +- SPDK_ERRLOG("Could not allocate opal_vbdev_name\n"); +- rc = -ENOMEM; +- goto err; +- } +- +- opal_bdev->name = opal_vbdev_name; +- rc = spdk_opal_cmd_setup_locking_range(opal_bdev->opal_dev, OPAL_ADMIN1, +- opal_bdev->locking_range_id, opal_bdev->range_start, +- opal_bdev->range_length, password); +- if (rc) { +- SPDK_ERRLOG("Error construct %s\n", opal_vbdev_name); +- goto err; +- } +- +- rc = spdk_bdev_part_construct(part_bdev, opal_bdev->opal_base->part_base, opal_vbdev_name, +- opal_bdev->range_start, opal_bdev->range_length, "Opal locking range"); +- if (rc) { +- SPDK_ERRLOG("Could not allocate bdev part\n"); +- goto err; +- } +- +- /* lock this bdev initially */ +- rc = spdk_opal_cmd_lock_unlock(opal_bdev->opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, locking_range_id, +- password); +- if (rc) { +- SPDK_ERRLOG("Error lock %s\n", opal_vbdev_name); +- goto err; +- } +- +- opal_bdev->bdev_part = part_bdev; +- return 0; +- +-err: +- vbdev_opal_delete(opal_bdev); +- free(part_bdev); +- return rc; +-} +- +-static void +-vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev) +-{ +- struct spdk_bdev_part *part = opal_bdev->bdev_part; +- +- assert(opal_bdev->opal_base != NULL); +- assert(part != NULL); +- +- if (opal_bdev->range_start == spdk_bdev_part_get_offset_blocks(part)) { +- spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL); +- } +- vbdev_opal_delete(opal_bdev); +-} +- +-int +-vbdev_opal_destruct(const char *bdev_name, const char *password) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- int locking_range_id; +- int rc; +- struct opal_vbdev *opal_bdev; +- +- TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { +- if (strcmp(opal_bdev->name, 
bdev_name) == 0) { +- break; +- } +- } +- +- if (opal_bdev == NULL) { +- SPDK_ERRLOG("%s not found\n", bdev_name); +- rc = -ENODEV; +- goto err; +- } +- +- locking_range_id = opal_bdev->locking_range_id; +- +- nvme_ctrlr = opal_bdev->nvme_ctrlr; +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", bdev_name); +- return -ENODEV; +- } +- +- /* secure erase locking range */ +- rc = spdk_opal_cmd_secure_erase_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, +- password); +- if (rc) { +- SPDK_ERRLOG("opal erase locking range failed\n"); +- goto err; +- } +- +- /* reset the locking range to 0 */ +- rc = spdk_opal_cmd_setup_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, 0, +- 0, password); +- if (rc) { +- SPDK_ERRLOG("opal reset locking range failed\n"); +- goto err; +- } +- +- spdk_opal_free_locking_range_info(opal_bdev->opal_dev, locking_range_id); +- vbdev_opal_destruct_bdev(opal_bdev); +- return 0; +- +-err: +- return rc; +-} +- +-static void +-vbdev_opal_examine(struct spdk_bdev *bdev) +-{ +- /* TODO */ +- spdk_bdev_module_examine_done(&opal_if); +-} +- +-int +-vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password, +- const char *lock_state) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- int locking_range_id; +- int rc; +- enum spdk_opal_lock_state state_flag; +- struct opal_vbdev *opal_bdev; +- +- TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { +- if (strcmp(opal_bdev->name, bdev_name) == 0) { +- break; +- } +- } +- +- if (opal_bdev == NULL) { +- SPDK_ERRLOG("%s not found\n", bdev_name); +- return -ENODEV; +- } +- +- nvme_ctrlr = opal_bdev->nvme_ctrlr; +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name); +- return -ENODEV; +- } +- +- if (strcasecmp(lock_state, "READWRITE") == 0) { +- state_flag = OPAL_READWRITE; +- } else if (strcasecmp(lock_state, "READONLY") == 0) { +- state_flag = OPAL_READONLY; +- } else if (strcasecmp(lock_state, "RWLOCK") == 0) { +- state_flag = OPAL_RWLOCK; +- } else { +- SPDK_ERRLOG("Invalid OPAL lock state input\n"); +- return -EINVAL; +- } +- +- locking_range_id = opal_bdev->locking_range_id; +- rc = spdk_opal_cmd_lock_unlock(nvme_ctrlr->opal_dev, user_id, state_flag, locking_range_id, +- password); +- if (rc) { +- SPDK_ERRLOG("%s lock/unlock failure: %d\n", bdev_name, rc); +- } +- +- return rc; +-} +- +-int +-vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, uint16_t user_id, +- const char *user_password) +-{ +- struct nvme_ctrlr *nvme_ctrlr; +- int locking_range_id; +- int rc; +- struct opal_vbdev *opal_bdev; +- +- TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { +- if (strcmp(opal_bdev->name, bdev_name) == 0) { +- break; +- } +- } +- +- if (opal_bdev == NULL) { +- SPDK_ERRLOG("%s not found\n", bdev_name); +- return -ENODEV; +- } +- +- nvme_ctrlr = opal_bdev->nvme_ctrlr; +- if (nvme_ctrlr == NULL) { +- SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name); +- return -ENODEV; +- } +- +- rc = spdk_opal_cmd_enable_user(nvme_ctrlr->opal_dev, user_id, admin_password); +- if (rc) { +- SPDK_ERRLOG("%s enable user error: %d\n", bdev_name, rc); +- return rc; +- } +- +- rc = spdk_opal_cmd_set_new_passwd(nvme_ctrlr->opal_dev, user_id, user_password, admin_password, +- true); +- if (rc) { +- SPDK_ERRLOG("%s set user password error: %d\n", bdev_name, rc); +- return rc; +- } +- +- locking_range_id = opal_bdev->locking_range_id; +- rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, 
locking_range_id, +- OPAL_READONLY, admin_password); +- if (rc) { +- SPDK_ERRLOG("%s add user READONLY priority error: %d\n", bdev_name, rc); +- return rc; +- } +- +- rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id, +- OPAL_READWRITE, admin_password); +- if (rc) { +- SPDK_ERRLOG("%s add user READWRITE priority error: %d\n", bdev_name, rc); +- return rc; +- } +- +- return 0; +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vbdev_opal) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/opal.h" ++#include "spdk/bdev_module.h" ++#include "vbdev_opal.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++ ++/* OPAL locking range only supports operations on nsid=1 for now */ ++#define NSID_SUPPORTED 1 ++ ++struct opal_vbdev { ++ char *name; ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct spdk_opal_dev *opal_dev; ++ struct spdk_bdev_part *bdev_part; ++ ++ uint8_t locking_range_id; ++ uint64_t range_start; ++ uint64_t range_length; ++ struct vbdev_opal_part_base *opal_base; ++ ++ TAILQ_ENTRY(opal_vbdev) tailq; ++}; ++ ++static TAILQ_HEAD(, opal_vbdev) g_opal_vbdev = ++ TAILQ_HEAD_INITIALIZER(g_opal_vbdev); ++ ++struct vbdev_opal_bdev_io { ++ struct spdk_io_channel *ch; ++ struct spdk_bdev_io *bdev_io; ++ struct spdk_bdev_io_wait_entry bdev_io_wait; ++}; ++ ++struct vbdev_opal_channel { ++ struct spdk_bdev_part_channel part_ch; ++}; ++ ++struct vbdev_opal_part_base { ++ char *nvme_ctrlr_name; ++ struct spdk_bdev_part_base *part_base; ++ SPDK_BDEV_PART_TAILQ part_tailq; ++ TAILQ_ENTRY(vbdev_opal_part_base) tailq; ++}; ++ ++static TAILQ_HEAD(, vbdev_opal_part_base) g_opal_base = TAILQ_HEAD_INITIALIZER(g_opal_base); ++ ++static void _vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); ++ ++static void vbdev_opal_examine(struct spdk_bdev *bdev); ++ ++static void ++vbdev_opal_delete(struct opal_vbdev *opal_bdev) ++{ ++ TAILQ_REMOVE(&g_opal_vbdev, opal_bdev, tailq); ++ free(opal_bdev->name); ++ free(opal_bdev); ++ opal_bdev = NULL; ++} ++ ++static void ++vbdev_opal_clear(void) ++{ ++ struct opal_vbdev *opal_bdev, *tmp; ++ ++ TAILQ_FOREACH_SAFE(opal_bdev, &g_opal_vbdev, tailq, tmp) { ++ vbdev_opal_delete(opal_bdev); ++ } ++} ++ ++static int ++vbdev_opal_init(void) ++{ ++ /* TODO */ ++ return 0; ++} ++ ++static void ++vbdev_opal_fini(void) ++{ ++ vbdev_opal_clear(); ++} ++ ++static int ++vbdev_opal_get_ctx_size(void) ++{ ++ return sizeof(struct vbdev_opal_bdev_io); ++} ++ ++/* delete all the config of the same base bdev */ ++static void ++vbdev_opal_delete_all_base_config(struct vbdev_opal_part_base *base) ++{ ++ char *nvme_ctrlr_name = base->nvme_ctrlr_name; ++ struct opal_vbdev *bdev, *tmp_bdev; ++ ++ TAILQ_FOREACH_SAFE(bdev, &g_opal_vbdev, tailq, tmp_bdev) { ++ if (!strcmp(nvme_ctrlr_name, bdev->nvme_ctrlr->nbdev_ctrlr->name)) { ++ vbdev_opal_delete(bdev); ++ } ++ } ++} ++ ++static int ++_vbdev_opal_destruct(void *ctx) ++{ ++ struct spdk_bdev_part *part = ctx; ++ ++ return spdk_bdev_part_free(part); ++} ++ ++static void ++vbdev_opal_base_free(void *ctx) ++{ ++ struct vbdev_opal_part_base *base = ctx; ++ ++ TAILQ_REMOVE(&g_opal_base, base, tailq); ++ ++ free(base->nvme_ctrlr_name); ++ free(base); ++} ++ ++static void ++vbdev_opal_resubmit_io(void *arg) ++{ ++ struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)arg; ++ ++ _vbdev_opal_submit_request(io_ctx->ch, io_ctx->bdev_io); ++} ++ ++static void ++vbdev_opal_queue_io(struct 
vbdev_opal_bdev_io *io_ctx) ++{ ++ struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(io_ctx->ch); ++ int rc; ++ ++ io_ctx->bdev_io_wait.bdev = io_ctx->bdev_io->bdev; ++ io_ctx->bdev_io_wait.cb_fn = vbdev_opal_resubmit_io; ++ io_ctx->bdev_io_wait.cb_arg = io_ctx; ++ ++ rc = spdk_bdev_queue_io_wait(io_ctx->bdev_io->bdev, ch->part_ch.base_ch, &io_ctx->bdev_io_wait); ++ ++ if (rc != 0) { ++ SPDK_ERRLOG("Queue io failed in vbdev_opal_queue_io: %d\n", rc); ++ spdk_bdev_io_complete(io_ctx->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++_vbdev_opal_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct vbdev_opal_channel *ch = spdk_io_channel_get_ctx(_ch); ++ struct vbdev_opal_bdev_io *io_ctx = (struct vbdev_opal_bdev_io *)bdev_io->driver_ctx; ++ int rc; ++ ++ rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); ++ if (rc) { ++ if (rc == -ENOMEM) { ++ SPDK_DEBUGLOG(vbdev_opal, "opal: no memory, queue io.\n"); ++ io_ctx->ch = _ch; ++ io_ctx->bdev_io = bdev_io; ++ vbdev_opal_queue_io(io_ctx); ++ } else { ++ SPDK_ERRLOG("opal: error on io submission, rc=%d.\n", rc); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ } ++} ++ ++static void ++vbdev_opal_io_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ _vbdev_opal_submit_request(ch, bdev_io); ++} ++ ++static void ++vbdev_opal_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, vbdev_opal_io_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ default: ++ _vbdev_opal_submit_request(ch, bdev_io); ++ break; ++ } ++} ++ ++struct spdk_opal_locking_range_info * ++vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, const char *password) ++{ ++ struct opal_vbdev *vbdev; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int locking_range_id; ++ int rc; ++ ++ TAILQ_FOREACH(vbdev, &g_opal_vbdev, tailq) { ++ if (strcmp(vbdev->name, opal_bdev_name) == 0) { ++ break; ++ } ++ } ++ ++ if (vbdev == NULL) { ++ SPDK_ERRLOG("%s not found\n", opal_bdev_name); ++ return NULL; ++ } ++ ++ nvme_ctrlr = vbdev->nvme_ctrlr; ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", vbdev->name); ++ return NULL; ++ } ++ ++ locking_range_id = vbdev->locking_range_id; ++ rc = spdk_opal_cmd_get_locking_range_info(nvme_ctrlr->opal_dev, password, ++ OPAL_ADMIN1, locking_range_id); ++ if (rc) { ++ SPDK_ERRLOG("Get locking range info error: %d\n", rc); ++ return NULL; ++ } ++ ++ return spdk_opal_get_locking_range_info(nvme_ctrlr->opal_dev, locking_range_id); ++} ++ ++static int ++vbdev_opal_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_bdev_part *part = ctx; ++ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); ++ uint64_t offset = spdk_bdev_part_get_offset_blocks(part); ++ ++ spdk_json_write_named_object_begin(w, "opal"); ++ ++ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); ++ spdk_json_write_named_uint64(w, "offset_blocks", offset); ++ ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++static void ++vbdev_opal_base_bdev_hotremove_cb(void *_part_base) ++{ ++ struct spdk_bdev_part_base *part_base = _part_base; ++ struct vbdev_opal_part_base *base = spdk_bdev_part_base_get_ctx(part_base); ++ ++ 
spdk_bdev_part_base_hotremove(part_base, spdk_bdev_part_base_get_tailq(part_base)); ++ vbdev_opal_delete_all_base_config(base); ++} ++ ++static bool ++vbdev_opal_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ struct spdk_bdev_part *part = ctx; ++ struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(part); ++ ++ return spdk_bdev_io_type_supported(base_bdev, io_type); ++} ++ ++static struct spdk_bdev_fn_table opal_vbdev_fn_table = { ++ .destruct = _vbdev_opal_destruct, ++ .submit_request = vbdev_opal_submit_request, ++ .io_type_supported = vbdev_opal_io_type_supported, ++ .dump_info_json = vbdev_opal_dump_info_json, ++ .write_config_json = NULL, ++}; ++ ++static struct spdk_bdev_module opal_if = { ++ .name = "opal", ++ .module_init = vbdev_opal_init, ++ .module_fini = vbdev_opal_fini, ++ .get_ctx_size = vbdev_opal_get_ctx_size, ++ .examine_config = vbdev_opal_examine, ++ .config_json = NULL, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(opal, &opal_if) ++ ++int ++vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, ++ uint64_t range_start, uint64_t range_length, const char *password) ++{ ++ int rc; ++ char *opal_vbdev_name; ++ char *base_bdev_name; ++ struct nvme_ctrlr *nvme_ctrlr; ++ struct opal_vbdev *opal_bdev; ++ struct vbdev_opal_part_base *opal_part_base = NULL; ++ struct spdk_bdev_part *part_bdev; ++ struct nvme_bdev *nvme_bdev; ++ struct nvme_ns *nvme_ns; ++ ++ if (nsid != NSID_SUPPORTED) { ++ SPDK_ERRLOG("nsid %d not supported", nsid); ++ return -EINVAL; ++ } ++ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(nvme_ctrlr_name); ++ if (!nvme_ctrlr) { ++ SPDK_ERRLOG("get nvme ctrlr failed\n"); ++ return -ENODEV; ++ } ++ ++ if (!nvme_ctrlr->opal_dev) { ++ SPDK_ERRLOG("Opal not supported\n"); ++ return -ENOTSUP; ++ } ++ ++ opal_bdev = calloc(1, sizeof(struct opal_vbdev)); ++ if (!opal_bdev) { ++ SPDK_ERRLOG("allocation for opal_bdev failed\n"); ++ return -ENOMEM; ++ } ++ ++ opal_bdev->locking_range_id = locking_range_id; ++ opal_bdev->range_start = range_start; ++ opal_bdev->range_length = range_length; ++ ++ opal_bdev->nvme_ctrlr = nvme_ctrlr; ++ opal_bdev->opal_dev = nvme_ctrlr->opal_dev; ++ ++ nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); ++ if (nvme_ns == NULL) { ++ free(opal_bdev); ++ return -ENODEV; ++ } ++ ++ nvme_bdev = nvme_ns->bdev; ++ assert(nvme_bdev != NULL); ++ base_bdev_name = nvme_bdev->disk.name; ++ ++ /* traverse base list to see if part_base is already create for this base bdev */ ++ TAILQ_FOREACH(opal_part_base, &g_opal_base, tailq) { ++ if (!strcmp(spdk_bdev_part_base_get_bdev_name(opal_part_base->part_base), base_bdev_name)) { ++ break; ++ } ++ } ++ ++ /* If there is not a corresponding opal_part_base, a new opal_part_base will be created. 
++ For each new part_base, there will be one tailq to store all the parts of this base */ ++ if (opal_part_base == NULL) { ++ opal_part_base = calloc(1, sizeof(*opal_part_base)); ++ if (opal_part_base == NULL) { ++ SPDK_ERRLOG("Could not allocate opal_part_base\n"); ++ free(opal_bdev); ++ return -ENOMEM; ++ } ++ TAILQ_INIT(&opal_part_base->part_tailq); ++ ++ rc = spdk_bdev_part_base_construct_ext(base_bdev_name, ++ vbdev_opal_base_bdev_hotremove_cb, &opal_if, ++ &opal_vbdev_fn_table, &opal_part_base->part_tailq, ++ vbdev_opal_base_free, opal_part_base, ++ sizeof(struct vbdev_opal_channel), NULL, NULL, ++ &opal_part_base->part_base); ++ if (rc != 0) { ++ if (rc != -ENODEV) { ++ SPDK_ERRLOG("Could not allocate part_base\n"); ++ } ++ free(opal_bdev); ++ free(opal_part_base); ++ return rc; ++ } ++ opal_part_base->nvme_ctrlr_name = strdup(nvme_ctrlr_name); ++ if (opal_part_base->nvme_ctrlr_name == NULL) { ++ free(opal_bdev); ++ spdk_bdev_part_base_free(opal_part_base->part_base); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_opal_base, opal_part_base, tailq); ++ } ++ assert(opal_part_base != NULL); ++ opal_bdev->opal_base = opal_part_base; ++ ++ part_bdev = calloc(1, sizeof(struct spdk_bdev_part)); ++ if (!part_bdev) { ++ SPDK_ERRLOG("Could not allocate part_bdev\n"); ++ free(opal_bdev); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_opal_vbdev, opal_bdev, tailq); ++ opal_vbdev_name = spdk_sprintf_alloc("%sr%" PRIu8, base_bdev_name, ++ opal_bdev->locking_range_id); /* e.g.: nvme0n1r1 */ ++ if (opal_vbdev_name == NULL) { ++ SPDK_ERRLOG("Could not allocate opal_vbdev_name\n"); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ opal_bdev->name = opal_vbdev_name; ++ rc = spdk_opal_cmd_setup_locking_range(opal_bdev->opal_dev, OPAL_ADMIN1, ++ opal_bdev->locking_range_id, opal_bdev->range_start, ++ opal_bdev->range_length, password); ++ if (rc) { ++ SPDK_ERRLOG("Error construct %s\n", opal_vbdev_name); ++ goto err; ++ } ++ ++ rc = spdk_bdev_part_construct(part_bdev, opal_bdev->opal_base->part_base, opal_vbdev_name, ++ opal_bdev->range_start, opal_bdev->range_length, "Opal locking range"); ++ if (rc) { ++ SPDK_ERRLOG("Could not allocate bdev part\n"); ++ goto err; ++ } ++ ++ /* lock this bdev initially */ ++ rc = spdk_opal_cmd_lock_unlock(opal_bdev->opal_dev, OPAL_ADMIN1, OPAL_RWLOCK, locking_range_id, ++ password); ++ if (rc) { ++ SPDK_ERRLOG("Error lock %s\n", opal_vbdev_name); ++ goto err; ++ } ++ ++ opal_bdev->bdev_part = part_bdev; ++ return 0; ++ ++err: ++ vbdev_opal_delete(opal_bdev); ++ free(part_bdev); ++ return rc; ++} ++ ++static void ++vbdev_opal_destruct_bdev(struct opal_vbdev *opal_bdev) ++{ ++ struct spdk_bdev_part *part = opal_bdev->bdev_part; ++ ++ assert(opal_bdev->opal_base != NULL); ++ assert(part != NULL); ++ ++ if (opal_bdev->range_start == spdk_bdev_part_get_offset_blocks(part)) { ++ spdk_bdev_unregister(spdk_bdev_part_get_bdev(part), NULL, NULL); ++ } ++ vbdev_opal_delete(opal_bdev); ++} ++ ++int ++vbdev_opal_destruct(const char *bdev_name, const char *password) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ int locking_range_id; ++ int rc; ++ struct opal_vbdev *opal_bdev; ++ ++ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { ++ if (strcmp(opal_bdev->name, bdev_name) == 0) { ++ break; ++ } ++ } ++ ++ if (opal_bdev == NULL) { ++ SPDK_ERRLOG("%s not found\n", bdev_name); ++ rc = -ENODEV; ++ goto err; ++ } ++ ++ locking_range_id = opal_bdev->locking_range_id; ++ ++ nvme_ctrlr = opal_bdev->nvme_ctrlr; ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", bdev_name); 
++ return -ENODEV; ++ } ++ ++ /* secure erase locking range */ ++ rc = spdk_opal_cmd_secure_erase_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, ++ password); ++ if (rc) { ++ SPDK_ERRLOG("opal erase locking range failed\n"); ++ goto err; ++ } ++ ++ /* reset the locking range to 0 */ ++ rc = spdk_opal_cmd_setup_locking_range(nvme_ctrlr->opal_dev, OPAL_ADMIN1, locking_range_id, 0, ++ 0, password); ++ if (rc) { ++ SPDK_ERRLOG("opal reset locking range failed\n"); ++ goto err; ++ } ++ ++ spdk_opal_free_locking_range_info(opal_bdev->opal_dev, locking_range_id); ++ vbdev_opal_destruct_bdev(opal_bdev); ++ return 0; ++ ++err: ++ return rc; ++} ++ ++static void ++vbdev_opal_examine(struct spdk_bdev *bdev) ++{ ++ /* TODO */ ++ spdk_bdev_module_examine_done(&opal_if); ++} ++ ++int ++vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password, ++ const char *lock_state) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ int locking_range_id; ++ int rc; ++ enum spdk_opal_lock_state state_flag; ++ struct opal_vbdev *opal_bdev; ++ ++ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { ++ if (strcmp(opal_bdev->name, bdev_name) == 0) { ++ break; ++ } ++ } ++ ++ if (opal_bdev == NULL) { ++ SPDK_ERRLOG("%s not found\n", bdev_name); ++ return -ENODEV; ++ } ++ ++ nvme_ctrlr = opal_bdev->nvme_ctrlr; ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name); ++ return -ENODEV; ++ } ++ ++ if (strcasecmp(lock_state, "READWRITE") == 0) { ++ state_flag = OPAL_READWRITE; ++ } else if (strcasecmp(lock_state, "READONLY") == 0) { ++ state_flag = OPAL_READONLY; ++ } else if (strcasecmp(lock_state, "RWLOCK") == 0) { ++ state_flag = OPAL_RWLOCK; ++ } else { ++ SPDK_ERRLOG("Invalid OPAL lock state input\n"); ++ return -EINVAL; ++ } ++ ++ locking_range_id = opal_bdev->locking_range_id; ++ rc = spdk_opal_cmd_lock_unlock(nvme_ctrlr->opal_dev, user_id, state_flag, locking_range_id, ++ password); ++ if (rc) { ++ SPDK_ERRLOG("%s lock/unlock failure: %d\n", bdev_name, rc); ++ } ++ ++ return rc; ++} ++ ++int ++vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, uint16_t user_id, ++ const char *user_password) ++{ ++ struct nvme_ctrlr *nvme_ctrlr; ++ int locking_range_id; ++ int rc; ++ struct opal_vbdev *opal_bdev; ++ ++ TAILQ_FOREACH(opal_bdev, &g_opal_vbdev, tailq) { ++ if (strcmp(opal_bdev->name, bdev_name) == 0) { ++ break; ++ } ++ } ++ ++ if (opal_bdev == NULL) { ++ SPDK_ERRLOG("%s not found\n", bdev_name); ++ return -ENODEV; ++ } ++ ++ nvme_ctrlr = opal_bdev->nvme_ctrlr; ++ if (nvme_ctrlr == NULL) { ++ SPDK_ERRLOG("can't find nvme_ctrlr of %s\n", opal_bdev->name); ++ return -ENODEV; ++ } ++ ++ rc = spdk_opal_cmd_enable_user(nvme_ctrlr->opal_dev, user_id, admin_password); ++ if (rc) { ++ SPDK_ERRLOG("%s enable user error: %d\n", bdev_name, rc); ++ return rc; ++ } ++ ++ rc = spdk_opal_cmd_set_new_passwd(nvme_ctrlr->opal_dev, user_id, user_password, admin_password, ++ true); ++ if (rc) { ++ SPDK_ERRLOG("%s set user password error: %d\n", bdev_name, rc); ++ return rc; ++ } ++ ++ locking_range_id = opal_bdev->locking_range_id; ++ rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id, ++ OPAL_READONLY, admin_password); ++ if (rc) { ++ SPDK_ERRLOG("%s add user READONLY priority error: %d\n", bdev_name, rc); ++ return rc; ++ } ++ ++ rc = spdk_opal_cmd_add_user_to_locking_range(nvme_ctrlr->opal_dev, user_id, locking_range_id, ++ OPAL_READWRITE, admin_password); ++ if (rc) { ++ SPDK_ERRLOG("%s add user 
READWRITE priority error: %d\n", bdev_name, rc); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vbdev_opal) +diff --git a/module/bdev/nvme/vbdev_opal.h b/module/bdev/nvme/vbdev_opal.h +index 4f8e931..bd6076d 100644 +--- a/module/bdev/nvme/vbdev_opal.h ++++ b/module/bdev/nvme/vbdev_opal.h +@@ -1,26 +1,26 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_VBDEV_OPAL_H +-#define SPDK_VBDEV_OPAL_H +- +-#include "spdk/bdev_module.h" +-#include "bdev_nvme.h" +- +-int vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, +- uint64_t range_start, uint64_t range_length, const char *password); +- +-struct spdk_opal_locking_range_info *vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, +- const char *password); +- +-int vbdev_opal_destruct(const char *bdev_name, const char *password); +- +-int vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, +- uint16_t user_id, const char *user_password); +- +-int vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password, +- const char *lock_state); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_VBDEV_OPAL_H ++#define SPDK_VBDEV_OPAL_H ++ ++#include "spdk/bdev_module.h" ++#include "bdev_nvme.h" ++ ++int vbdev_opal_create(const char *nvme_ctrlr_name, uint32_t nsid, uint8_t locking_range_id, ++ uint64_t range_start, uint64_t range_length, const char *password); ++ ++struct spdk_opal_locking_range_info *vbdev_opal_get_info_from_bdev(const char *opal_bdev_name, ++ const char *password); ++ ++int vbdev_opal_destruct(const char *bdev_name, const char *password); ++ ++int vbdev_opal_enable_new_user(const char *bdev_name, const char *admin_password, ++ uint16_t user_id, const char *user_password); ++ ++int vbdev_opal_set_lock_state(const char *bdev_name, uint16_t user_id, const char *password, ++ const char *lock_state); ++ ++#endif +diff --git a/module/bdev/nvme/vbdev_opal_rpc.c b/module/bdev/nvme/vbdev_opal_rpc.c +index 706b8ac..f1384ce 100644 +--- a/module/bdev/nvme/vbdev_opal_rpc.c ++++ b/module/bdev/nvme/vbdev_opal_rpc.c +@@ -1,411 +1,411 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +-#include "spdk/opal.h" +- +-#include "vbdev_opal.h" +- +-struct rpc_bdev_nvme_opal_init { +- char *nvme_ctrlr_name; +- char *password; +-}; +- +-static void +-free_rpc_bdev_nvme_opal_init(struct rpc_bdev_nvme_opal_init *req) +-{ +- free(req->nvme_ctrlr_name); +- free(req->password); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_init_decoders[] = { +- {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_init, nvme_ctrlr_name), spdk_json_decode_string}, +- {"password", offsetof(struct rpc_bdev_nvme_opal_init, password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_nvme_opal_init(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_opal_init req = {}; +- struct nvme_ctrlr *nvme_ctrlr; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_init_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_opal_init_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- /* check if opal supported */ +- nvme_ctrlr = nvme_ctrlr_get_by_name(req.nvme_ctrlr_name); +- if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { +- SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- /* take ownership */ +- rc = spdk_opal_cmd_take_ownership(nvme_ctrlr->opal_dev, req.password); +- if (rc) { +- SPDK_ERRLOG("Take ownership failure: %d\n", rc); +- switch (rc) { +- case -EBUSY: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "SP Busy, try again later"); +- break; +- case -EACCES: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "This drive is already enabled"); +- break; +- default: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); +- } +- goto out; +- } +- +- /* activate locking SP */ +- rc = spdk_opal_cmd_activate_locking_sp(nvme_ctrlr->opal_dev, req.password); +- if (rc) { +- SPDK_ERRLOG("Activate locking SP failure: %d\n", rc); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-out: +- free_rpc_bdev_nvme_opal_init(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_opal_init", rpc_bdev_nvme_opal_init, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_nvme_opal_revert { +- char *nvme_ctrlr_name; +- char *password; +-}; +- +-static void +-free_rpc_bdev_nvme_opal_revert(struct rpc_bdev_nvme_opal_revert *req) +-{ +- free(req->nvme_ctrlr_name); +- free(req->password); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_revert_decoders[] = { +- {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_revert, nvme_ctrlr_name), spdk_json_decode_string}, +- {"password", offsetof(struct rpc_bdev_nvme_opal_revert, password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_nvme_opal_revert(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_nvme_opal_revert req = {}; +- struct nvme_ctrlr *nvme_ctrlr; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_revert_decoders, +- SPDK_COUNTOF(rpc_bdev_nvme_opal_revert_decoders), +- &req)) { 
+- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- /* check if opal supported */ +- nvme_ctrlr = nvme_ctrlr_get_by_name(req.nvme_ctrlr_name); +- if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { +- SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- /* TODO: delete all opal vbdev before revert TPer */ +- +- rc = spdk_opal_cmd_revert_tper(nvme_ctrlr->opal_dev, req.password); +- if (rc) { +- SPDK_ERRLOG("Revert TPer failure: %d\n", rc); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-out: +- free_rpc_bdev_nvme_opal_revert(&req); +-} +-SPDK_RPC_REGISTER("bdev_nvme_opal_revert", rpc_bdev_nvme_opal_revert, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_opal_create { +- char *nvme_ctrlr_name; +- uint32_t nsid; +- uint16_t locking_range_id; +- uint64_t range_start; +- uint64_t range_length; +- char *password; +-}; +- +-static void +-free_rpc_bdev_opal_create(struct rpc_bdev_opal_create *req) +-{ +- free(req->nvme_ctrlr_name); +- free(req->password); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_opal_create_decoders[] = { +- {"nvme_ctrlr_name", offsetof(struct rpc_bdev_opal_create, nvme_ctrlr_name), spdk_json_decode_string}, +- {"nsid", offsetof(struct rpc_bdev_opal_create, nsid), spdk_json_decode_uint32}, +- {"locking_range_id", offsetof(struct rpc_bdev_opal_create, locking_range_id), spdk_json_decode_uint16}, +- {"range_start", offsetof(struct rpc_bdev_opal_create, range_start), spdk_json_decode_uint64}, +- {"range_length", offsetof(struct rpc_bdev_opal_create, range_length), spdk_json_decode_uint64}, +- {"password", offsetof(struct rpc_bdev_opal_create, password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_opal_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_opal_create req = {}; +- struct spdk_json_write_ctx *w; +- char *opal_bdev_name; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_opal_create_decoders, +- SPDK_COUNTOF(rpc_bdev_opal_create_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = vbdev_opal_create(req.nvme_ctrlr_name, req.nsid, req.locking_range_id, req.range_start, +- req.range_length, req.password); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Failed to create opal vbdev from '%s': %s", +- req.nvme_ctrlr_name, spdk_strerror(-rc)); +- goto out; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- opal_bdev_name = spdk_sprintf_alloc("%sn%dr%d", req.nvme_ctrlr_name, req.nsid, +- req.locking_range_id); +- spdk_json_write_string(w, opal_bdev_name); +- spdk_jsonrpc_end_result(request, w); +- free(opal_bdev_name); +- +-out: +- free_rpc_bdev_opal_create(&req); +-} +-SPDK_RPC_REGISTER("bdev_opal_create", rpc_bdev_opal_create, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_opal_get_info { +- char *bdev_name; +- char *password; +-}; +- +-static void +-free_rpc_bdev_opal_get_info(struct rpc_bdev_opal_get_info *req) +-{ +- free(req->bdev_name); +- free(req->password); +-} +- +-static const struct 
spdk_json_object_decoder rpc_bdev_opal_get_info_decoders[] = { +- {"bdev_name", offsetof(struct rpc_bdev_opal_get_info, bdev_name), spdk_json_decode_string}, +- {"password", offsetof(struct rpc_bdev_opal_get_info, password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_opal_get_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_opal_get_info req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_opal_locking_range_info *info; +- +- if (spdk_json_decode_object(params, rpc_bdev_opal_get_info_decoders, +- SPDK_COUNTOF(rpc_bdev_opal_get_info_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- info = vbdev_opal_get_info_from_bdev(req.bdev_name, req.password); +- if (info == NULL) { +- SPDK_ERRLOG("Get opal info failure\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); +- goto out; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "name", req.bdev_name); +- spdk_json_write_named_uint64(w, "range_start", info->range_start); +- spdk_json_write_named_uint64(w, "range_length", info->range_length); +- spdk_json_write_named_bool(w, "read_lock_enabled", info->read_lock_enabled); +- spdk_json_write_named_bool(w, "write_lock_enabled", info->write_lock_enabled); +- spdk_json_write_named_bool(w, "read_locked", info->read_locked); +- spdk_json_write_named_bool(w, "write_locked", info->write_locked); +- +- spdk_json_write_object_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-out: +- free_rpc_bdev_opal_get_info(&req); +-} +-SPDK_RPC_REGISTER("bdev_opal_get_info", rpc_bdev_opal_get_info, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_opal_delete { +- char *bdev_name; +- char *password; +-}; +- +-static void +-free_rpc_bdev_opal_delete(struct rpc_bdev_opal_delete *req) +-{ +- free(req->bdev_name); +- free(req->password); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_opal_delete_decoders[] = { +- {"bdev_name", offsetof(struct rpc_bdev_opal_delete, bdev_name), spdk_json_decode_string}, +- {"password", offsetof(struct rpc_bdev_opal_delete, password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_opal_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_opal_delete req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_opal_delete_decoders, +- SPDK_COUNTOF(rpc_bdev_opal_delete_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = vbdev_opal_destruct(req.bdev_name, req.password); +- if (rc < 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-out: +- free_rpc_bdev_opal_delete(&req); +-} +-SPDK_RPC_REGISTER("bdev_opal_delete", rpc_bdev_opal_delete, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_opal_set_lock_state { +- char *bdev_name; +- uint16_t user_id; +- char *password; +- char *lock_state; +-}; +- +-static void +-free_rpc_bdev_opal_set_lock_state(struct rpc_bdev_opal_set_lock_state *req) +-{ +- free(req->bdev_name); +- free(req->password); +- free(req->lock_state); +-} +- +-static const struct 
spdk_json_object_decoder rpc_bdev_opal_set_lock_state_decoders[] = { +- {"bdev_name", offsetof(struct rpc_bdev_opal_set_lock_state, bdev_name), spdk_json_decode_string}, +- {"user_id", offsetof(struct rpc_bdev_opal_set_lock_state, user_id), spdk_json_decode_uint16}, +- {"password", offsetof(struct rpc_bdev_opal_set_lock_state, password), spdk_json_decode_string}, +- {"lock_state", offsetof(struct rpc_bdev_opal_set_lock_state, lock_state), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_opal_set_lock_state(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_opal_set_lock_state req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_opal_set_lock_state_decoders, +- SPDK_COUNTOF(rpc_bdev_opal_set_lock_state_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = vbdev_opal_set_lock_state(req.bdev_name, req.user_id, req.password, req.lock_state); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-out: +- free_rpc_bdev_opal_set_lock_state(&req); +-} +-SPDK_RPC_REGISTER("bdev_opal_set_lock_state", rpc_bdev_opal_set_lock_state, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_opal_new_user { +- char *bdev_name; +- char *admin_password; +- uint16_t user_id; +- char *user_password; +-}; +- +-static void +-free_rpc_bdev_opal_new_user(struct rpc_bdev_opal_new_user *req) +-{ +- free(req->bdev_name); +- free(req->admin_password); +- free(req->user_password); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_opal_new_user_decoders[] = { +- {"bdev_name", offsetof(struct rpc_bdev_opal_new_user, bdev_name), spdk_json_decode_string}, +- {"admin_password", offsetof(struct rpc_bdev_opal_new_user, admin_password), spdk_json_decode_string}, +- {"user_id", offsetof(struct rpc_bdev_opal_new_user, user_id), spdk_json_decode_uint16}, +- {"user_password", offsetof(struct rpc_bdev_opal_new_user, user_password), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_opal_new_user(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_opal_new_user req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_opal_new_user_decoders, +- SPDK_COUNTOF(rpc_bdev_opal_new_user_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = vbdev_opal_enable_new_user(req.bdev_name, req.admin_password, req.user_id, +- req.user_password); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-out: +- free_rpc_bdev_opal_new_user(&req); +-} +-SPDK_RPC_REGISTER("bdev_opal_new_user", rpc_bdev_opal_new_user, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++#include "spdk/opal.h" ++ ++#include "vbdev_opal.h" ++ ++struct rpc_bdev_nvme_opal_init { ++ char *nvme_ctrlr_name; ++ char *password; ++}; ++ ++static void ++free_rpc_bdev_nvme_opal_init(struct rpc_bdev_nvme_opal_init *req) ++{ ++ free(req->nvme_ctrlr_name); ++ free(req->password); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_init_decoders[] = { ++ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_init, nvme_ctrlr_name), spdk_json_decode_string}, ++ {"password", offsetof(struct rpc_bdev_nvme_opal_init, password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_nvme_opal_init(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_opal_init req = {}; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_init_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_opal_init_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ /* check if opal supported */ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(req.nvme_ctrlr_name); ++ if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { ++ SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ /* take ownership */ ++ rc = spdk_opal_cmd_take_ownership(nvme_ctrlr->opal_dev, req.password); ++ if (rc) { ++ SPDK_ERRLOG("Take ownership failure: %d\n", rc); ++ switch (rc) { ++ case -EBUSY: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "SP Busy, try again later"); ++ break; ++ case -EACCES: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "This drive is already enabled"); ++ break; ++ default: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); ++ } ++ goto out; ++ } ++ ++ /* activate locking SP */ ++ rc = spdk_opal_cmd_activate_locking_sp(nvme_ctrlr->opal_dev, req.password); ++ if (rc) { ++ SPDK_ERRLOG("Activate locking SP failure: %d\n", rc); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++out: ++ free_rpc_bdev_nvme_opal_init(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_opal_init", rpc_bdev_nvme_opal_init, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_nvme_opal_revert { ++ char *nvme_ctrlr_name; ++ char *password; ++}; ++ ++static void ++free_rpc_bdev_nvme_opal_revert(struct rpc_bdev_nvme_opal_revert *req) ++{ ++ free(req->nvme_ctrlr_name); ++ free(req->password); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_nvme_opal_revert_decoders[] = { ++ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_nvme_opal_revert, nvme_ctrlr_name), spdk_json_decode_string}, ++ {"password", offsetof(struct rpc_bdev_nvme_opal_revert, password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_nvme_opal_revert(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_nvme_opal_revert req = {}; ++ struct nvme_ctrlr *nvme_ctrlr; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_nvme_opal_revert_decoders, ++ SPDK_COUNTOF(rpc_bdev_nvme_opal_revert_decoders), ++ &req)) { 
++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ /* check if opal supported */ ++ nvme_ctrlr = nvme_ctrlr_get_by_name(req.nvme_ctrlr_name); ++ if (nvme_ctrlr == NULL || nvme_ctrlr->opal_dev == NULL) { ++ SPDK_ERRLOG("%s not support opal\n", req.nvme_ctrlr_name); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ /* TODO: delete all opal vbdev before revert TPer */ ++ ++ rc = spdk_opal_cmd_revert_tper(nvme_ctrlr->opal_dev, req.password); ++ if (rc) { ++ SPDK_ERRLOG("Revert TPer failure: %d\n", rc); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++out: ++ free_rpc_bdev_nvme_opal_revert(&req); ++} ++SPDK_RPC_REGISTER("bdev_nvme_opal_revert", rpc_bdev_nvme_opal_revert, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_opal_create { ++ char *nvme_ctrlr_name; ++ uint32_t nsid; ++ uint16_t locking_range_id; ++ uint64_t range_start; ++ uint64_t range_length; ++ char *password; ++}; ++ ++static void ++free_rpc_bdev_opal_create(struct rpc_bdev_opal_create *req) ++{ ++ free(req->nvme_ctrlr_name); ++ free(req->password); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_opal_create_decoders[] = { ++ {"nvme_ctrlr_name", offsetof(struct rpc_bdev_opal_create, nvme_ctrlr_name), spdk_json_decode_string}, ++ {"nsid", offsetof(struct rpc_bdev_opal_create, nsid), spdk_json_decode_uint32}, ++ {"locking_range_id", offsetof(struct rpc_bdev_opal_create, locking_range_id), spdk_json_decode_uint16}, ++ {"range_start", offsetof(struct rpc_bdev_opal_create, range_start), spdk_json_decode_uint64}, ++ {"range_length", offsetof(struct rpc_bdev_opal_create, range_length), spdk_json_decode_uint64}, ++ {"password", offsetof(struct rpc_bdev_opal_create, password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_opal_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_opal_create req = {}; ++ struct spdk_json_write_ctx *w; ++ char *opal_bdev_name; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_opal_create_decoders, ++ SPDK_COUNTOF(rpc_bdev_opal_create_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ rc = vbdev_opal_create(req.nvme_ctrlr_name, req.nsid, req.locking_range_id, req.range_start, ++ req.range_length, req.password); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Failed to create opal vbdev from '%s': %s", ++ req.nvme_ctrlr_name, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ opal_bdev_name = spdk_sprintf_alloc("%sn%dr%d", req.nvme_ctrlr_name, req.nsid, ++ req.locking_range_id); ++ spdk_json_write_string(w, opal_bdev_name); ++ spdk_jsonrpc_end_result(request, w); ++ free(opal_bdev_name); ++ ++out: ++ free_rpc_bdev_opal_create(&req); ++} ++SPDK_RPC_REGISTER("bdev_opal_create", rpc_bdev_opal_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_opal_get_info { ++ char *bdev_name; ++ char *password; ++}; ++ ++static void ++free_rpc_bdev_opal_get_info(struct rpc_bdev_opal_get_info *req) ++{ ++ free(req->bdev_name); ++ free(req->password); ++} ++ ++static const struct 
spdk_json_object_decoder rpc_bdev_opal_get_info_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_bdev_opal_get_info, bdev_name), spdk_json_decode_string}, ++ {"password", offsetof(struct rpc_bdev_opal_get_info, password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_opal_get_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_opal_get_info req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_opal_locking_range_info *info; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_opal_get_info_decoders, ++ SPDK_COUNTOF(rpc_bdev_opal_get_info_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ info = vbdev_opal_get_info_from_bdev(req.bdev_name, req.password); ++ if (info == NULL) { ++ SPDK_ERRLOG("Get opal info failure\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); ++ goto out; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "name", req.bdev_name); ++ spdk_json_write_named_uint64(w, "range_start", info->range_start); ++ spdk_json_write_named_uint64(w, "range_length", info->range_length); ++ spdk_json_write_named_bool(w, "read_lock_enabled", info->read_lock_enabled); ++ spdk_json_write_named_bool(w, "write_lock_enabled", info->write_lock_enabled); ++ spdk_json_write_named_bool(w, "read_locked", info->read_locked); ++ spdk_json_write_named_bool(w, "write_locked", info->write_locked); ++ ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++out: ++ free_rpc_bdev_opal_get_info(&req); ++} ++SPDK_RPC_REGISTER("bdev_opal_get_info", rpc_bdev_opal_get_info, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_opal_delete { ++ char *bdev_name; ++ char *password; ++}; ++ ++static void ++free_rpc_bdev_opal_delete(struct rpc_bdev_opal_delete *req) ++{ ++ free(req->bdev_name); ++ free(req->password); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_opal_delete_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_bdev_opal_delete, bdev_name), spdk_json_decode_string}, ++ {"password", offsetof(struct rpc_bdev_opal_delete, password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_opal_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_opal_delete req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_opal_delete_decoders, ++ SPDK_COUNTOF(rpc_bdev_opal_delete_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ rc = vbdev_opal_destruct(req.bdev_name, req.password); ++ if (rc < 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++out: ++ free_rpc_bdev_opal_delete(&req); ++} ++SPDK_RPC_REGISTER("bdev_opal_delete", rpc_bdev_opal_delete, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_opal_set_lock_state { ++ char *bdev_name; ++ uint16_t user_id; ++ char *password; ++ char *lock_state; ++}; ++ ++static void ++free_rpc_bdev_opal_set_lock_state(struct rpc_bdev_opal_set_lock_state *req) ++{ ++ free(req->bdev_name); ++ free(req->password); ++ free(req->lock_state); ++} ++ ++static const struct 
spdk_json_object_decoder rpc_bdev_opal_set_lock_state_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_bdev_opal_set_lock_state, bdev_name), spdk_json_decode_string}, ++ {"user_id", offsetof(struct rpc_bdev_opal_set_lock_state, user_id), spdk_json_decode_uint16}, ++ {"password", offsetof(struct rpc_bdev_opal_set_lock_state, password), spdk_json_decode_string}, ++ {"lock_state", offsetof(struct rpc_bdev_opal_set_lock_state, lock_state), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_opal_set_lock_state(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_opal_set_lock_state req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_opal_set_lock_state_decoders, ++ SPDK_COUNTOF(rpc_bdev_opal_set_lock_state_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ rc = vbdev_opal_set_lock_state(req.bdev_name, req.user_id, req.password, req.lock_state); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++out: ++ free_rpc_bdev_opal_set_lock_state(&req); ++} ++SPDK_RPC_REGISTER("bdev_opal_set_lock_state", rpc_bdev_opal_set_lock_state, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_opal_new_user { ++ char *bdev_name; ++ char *admin_password; ++ uint16_t user_id; ++ char *user_password; ++}; ++ ++static void ++free_rpc_bdev_opal_new_user(struct rpc_bdev_opal_new_user *req) ++{ ++ free(req->bdev_name); ++ free(req->admin_password); ++ free(req->user_password); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_opal_new_user_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_bdev_opal_new_user, bdev_name), spdk_json_decode_string}, ++ {"admin_password", offsetof(struct rpc_bdev_opal_new_user, admin_password), spdk_json_decode_string}, ++ {"user_id", offsetof(struct rpc_bdev_opal_new_user, user_id), spdk_json_decode_uint16}, ++ {"user_password", offsetof(struct rpc_bdev_opal_new_user, user_password), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_opal_new_user(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_opal_new_user req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_opal_new_user_decoders, ++ SPDK_COUNTOF(rpc_bdev_opal_new_user_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ rc = vbdev_opal_enable_new_user(req.bdev_name, req.admin_password, req.user_id, ++ req.user_password); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++out: ++ free_rpc_bdev_opal_new_user(&req); ++} ++SPDK_RPC_REGISTER("bdev_opal_new_user", rpc_bdev_opal_new_user, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/ocf/Makefile b/module/bdev/ocf/Makefile +index 19eb889..c9b9b0c 100644 +--- a/module/bdev/ocf/Makefile ++++ b/module/bdev/ocf/Makefile +@@ -1,24 +1,24 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+- +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-CFLAGS += $(ENV_CFLAGS) -I$(SPDK_ROOT_DIR)/lib/env_ocf -I$(SPDK_ROOT_DIR)/lib/env_ocf/include +-C_SRCS = $(shell ls *.c) +- +-LIBNAME := bdev_ocf +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +- +-OCF_ENV := $(call spdk_lib_list_to_static_libs,ocfenv) +- +-$(LIB) : $(OCF_ENV) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++CFLAGS += $(ENV_CFLAGS) -I$(SPDK_ROOT_DIR)/lib/env_ocf -I$(SPDK_ROOT_DIR)/lib/env_ocf/include ++C_SRCS = $(shell ls *.c) ++ ++LIBNAME := bdev_ocf ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++ ++OCF_ENV := $(call spdk_lib_list_to_static_libs,ocfenv) ++ ++$(LIB) : $(OCF_ENV) +diff --git a/module/bdev/ocf/ctx.c b/module/bdev/ocf/ctx.c +index f22ed15..a291e95 100644 +--- a/module/bdev/ocf/ctx.c ++++ b/module/bdev/ocf/ctx.c +@@ -1,496 +1,496 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#include +-#include +- +-#include "spdk/env.h" +-#include "spdk/log.h" +- +-#include "ctx.h" +-#include "data.h" +- +-ocf_ctx_t vbdev_ocf_ctx; +- +-static ctx_data_t * +-vbdev_ocf_ctx_data_alloc(uint32_t pages) +-{ +- struct bdev_ocf_data *data; +- void *buf; +- uint32_t sz; +- +- data = vbdev_ocf_data_alloc(1); +- +- sz = pages * PAGE_SIZE; +- buf = spdk_malloc(sz, PAGE_SIZE, NULL, +- SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (buf == NULL) { +- return NULL; +- } +- +- vbdev_ocf_iovs_add(data, buf, sz); +- +- data->size = sz; +- +- return data; +-} +- +-static void +-vbdev_ocf_ctx_data_free(ctx_data_t *ctx_data) +-{ +- struct bdev_ocf_data *data = ctx_data; +- int i; +- +- if (!data) { +- return; +- } +- +- for (i = 0; i < data->iovcnt; i++) { +- spdk_free(data->iovs[i].iov_base); +- } +- +- vbdev_ocf_data_free(data); +-} +- +-static int +-vbdev_ocf_ctx_data_mlock(ctx_data_t *ctx_data) +-{ +- /* TODO [mlock]: add mlock option */ +- return 0; +-} +- +-static void +-vbdev_ocf_ctx_data_munlock(ctx_data_t *ctx_data) +-{ +- /* TODO [mlock]: add mlock option */ +-} +- +-static size_t +-iovec_flatten(struct iovec *iov, size_t iovcnt, void *buf, size_t size, size_t offset) +-{ +- size_t i, len, done = 0; +- +- for (i = 0; i < iovcnt; i++) { +- if (offset >= iov[i].iov_len) { +- offset -= iov[i].iov_len; +- continue; +- } +- +- if (iov[i].iov_base == NULL) { +- continue; +- } +- +- if (done >= size) { +- break; +- } +- +- len = MIN(size - done, iov[i].iov_len - offset); +- memcpy(buf, iov[i].iov_base + offset, len); +- buf += len; +- done += len; +- offset = 0; +- } +- +- return done; +-} +- +-static uint32_t +-vbdev_ocf_ctx_data_rd(void *dst, ctx_data_t *src, uint32_t size) +-{ +- struct bdev_ocf_data *s = src; +- uint32_t size_local; +- +- size_local = iovec_flatten(s->iovs, s->iovcnt, dst, size, s->seek); +- s->seek += size_local; +- +- return size_local; +-} +- +-static size_t +-buf_to_iovec(const void *buf, size_t size, struct iovec *iov, size_t iovcnt, size_t offset) +-{ +- size_t i, len, done = 0; +- +- for (i = 0; i < iovcnt; i++) { +- if (offset >= iov[i].iov_len) { +- offset -= iov[i].iov_len; +- continue; +- } +- +- if (iov[i].iov_base == NULL) { +- continue; +- } +- +- if (done >= size) { +- break; +- } +- +- len 
= MIN(size - done, iov[i].iov_len - offset); +- memcpy(iov[i].iov_base + offset, buf, len); +- buf += len; +- done += len; +- offset = 0; +- } +- +- return done; +-} +- +-static uint32_t +-vbdev_ocf_ctx_data_wr(ctx_data_t *dst, const void *src, uint32_t size) +-{ +- struct bdev_ocf_data *d = dst; +- uint32_t size_local; +- +- size_local = buf_to_iovec(src, size, d->iovs, d->iovcnt, d->seek); +- d->seek += size_local; +- +- return size_local; +-} +- +-static size_t +-iovset(struct iovec *iov, size_t iovcnt, int byte, size_t size, size_t offset) +-{ +- size_t i, len, done = 0; +- +- for (i = 0; i < iovcnt; i++) { +- if (offset >= iov[i].iov_len) { +- offset -= iov[i].iov_len; +- continue; +- } +- +- if (iov[i].iov_base == NULL) { +- continue; +- } +- +- if (done >= size) { +- break; +- } +- +- len = MIN(size - done, iov[i].iov_len - offset); +- memset(iov[i].iov_base + offset, byte, len); +- done += len; +- offset = 0; +- } +- +- return done; +-} +- +-static uint32_t +-vbdev_ocf_ctx_data_zero(ctx_data_t *dst, uint32_t size) +-{ +- struct bdev_ocf_data *d = dst; +- uint32_t size_local; +- +- size_local = iovset(d->iovs, d->iovcnt, 0, size, d->seek); +- d->seek += size_local; +- +- return size_local; +-} +- +-static uint32_t +-vbdev_ocf_ctx_data_seek(ctx_data_t *dst, ctx_data_seek_t seek, uint32_t offset) +-{ +- struct bdev_ocf_data *d = dst; +- uint32_t off = 0; +- +- switch (seek) { +- case ctx_data_seek_begin: +- off = MIN(offset, d->size); +- d->seek = off; +- break; +- case ctx_data_seek_current: +- off = MIN(offset, d->size - d->seek); +- d->seek += off; +- break; +- } +- +- return off; +-} +- +-static uint64_t +-vbdev_ocf_ctx_data_cpy(ctx_data_t *dst, ctx_data_t *src, uint64_t to, +- uint64_t from, uint64_t bytes) +-{ +- struct bdev_ocf_data *s = src; +- struct bdev_ocf_data *d = dst; +- uint32_t it_iov = 0; +- uint32_t it_off = 0; +- uint32_t n, sz; +- +- bytes = MIN(bytes, s->size - from); +- bytes = MIN(bytes, d->size - to); +- sz = bytes; +- +- while (from || bytes) { +- if (s->iovs[it_iov].iov_len == it_off) { +- it_iov++; +- it_off = 0; +- continue; +- } +- +- if (from) { +- n = MIN(from, s->iovs[it_iov].iov_len); +- from -= n; +- } else { +- n = MIN(bytes, s->iovs[it_iov].iov_len); +- buf_to_iovec(s->iovs[it_iov].iov_base + it_off, n, d->iovs, d->iovcnt, to); +- bytes -= n; +- to += n; +- } +- +- it_off += n; +- } +- +- return sz; +-} +- +-static void +-vbdev_ocf_ctx_data_secure_erase(ctx_data_t *ctx_data) +-{ +- struct bdev_ocf_data *data = ctx_data; +- struct iovec *iovs = data->iovs; +- int i; +- +- for (i = 0; i < data->iovcnt; i++) { +- if (env_memset(iovs[i].iov_base, iovs[i].iov_len, 0)) { +- assert(false); +- } +- } +-} +- +-int +-vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops) +-{ +- int rc; +- struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); +- +- pthread_mutex_lock(&ctx->lock); +- rc = ocf_queue_create(cache, queue, ops); +- pthread_mutex_unlock(&ctx->lock); +- return rc; +-} +- +-void +-vbdev_ocf_queue_put(ocf_queue_t queue) +-{ +- ocf_cache_t cache = ocf_queue_get_cache(queue); +- struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); +- +- pthread_mutex_lock(&ctx->lock); +- ocf_queue_put(queue); +- pthread_mutex_unlock(&ctx->lock); +-} +- +-void +-vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx) +-{ +- if (env_atomic_dec_return(&ctx->refcnt) == 0) { +- pthread_mutex_destroy(&ctx->lock); +- free(ctx); +- } +-} +- +-void +-vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx) +-{ +- 
env_atomic_inc(&ctx->refcnt); +-} +- +-struct cleaner_priv { +- struct spdk_poller *poller; +- ocf_queue_t queue; +- uint64_t next_run; +-}; +- +-static int +-cleaner_poll(void *arg) +-{ +- ocf_cleaner_t cleaner = arg; +- struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner); +- uint32_t iono = ocf_queue_pending_io(priv->queue); +- int i, max = spdk_min(32, iono); +- +- for (i = 0; i < max; i++) { +- ocf_queue_run_single(priv->queue); +- } +- +- if (spdk_get_ticks() >= priv->next_run) { +- ocf_cleaner_run(cleaner, priv->queue); +- return SPDK_POLLER_BUSY; +- } +- +- if (iono > 0) { +- return SPDK_POLLER_BUSY; +- } else { +- return SPDK_POLLER_IDLE; +- } +-} +- +-static void +-cleaner_cmpl(ocf_cleaner_t c, uint32_t interval) +-{ +- struct cleaner_priv *priv = ocf_cleaner_get_priv(c); +- +- priv->next_run = spdk_get_ticks() + ((interval * spdk_get_ticks_hz()) / 1000); +-} +- +-static void +-cleaner_queue_kick(ocf_queue_t q) +-{ +-} +- +-static void +-cleaner_queue_stop(ocf_queue_t q) +-{ +- struct cleaner_priv *cpriv = ocf_queue_get_priv(q); +- +- if (cpriv) { +- spdk_poller_unregister(&cpriv->poller); +- free(cpriv); +- } +-} +- +-const struct ocf_queue_ops cleaner_queue_ops = { +- .kick_sync = cleaner_queue_kick, +- .kick = cleaner_queue_kick, +- .stop = cleaner_queue_stop, +-}; +- +-static int +-vbdev_ocf_ctx_cleaner_init(ocf_cleaner_t c) +-{ +- int rc; +- struct cleaner_priv *priv = calloc(1, sizeof(*priv)); +- ocf_cache_t cache = ocf_cleaner_get_cache(c); +- struct vbdev_ocf_cache_ctx *cctx = ocf_cache_get_priv(cache); +- +- if (priv == NULL) { +- return -ENOMEM; +- } +- +- rc = vbdev_ocf_queue_create(cache, &priv->queue, &cleaner_queue_ops); +- if (rc) { +- free(priv); +- return rc; +- } +- +- ocf_queue_set_priv(priv->queue, priv); +- +- cctx->cleaner_queue = priv->queue; +- +- ocf_cleaner_set_cmpl(c, cleaner_cmpl); +- ocf_cleaner_set_priv(c, priv); +- +- return 0; +-} +- +-static void +-vbdev_ocf_ctx_cleaner_stop(ocf_cleaner_t c) +-{ +- struct cleaner_priv *priv = ocf_cleaner_get_priv(c); +- +- vbdev_ocf_queue_put(priv->queue); +-} +- +-static void +-vbdev_ocf_ctx_cleaner_kick(ocf_cleaner_t cleaner) +-{ +- struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner); +- +- if (priv->poller) { +- return; +- } +- +- /* We start cleaner poller at the same thread where cache was created +- * TODO: allow user to specify core at which cleaner should run */ +- priv->poller = SPDK_POLLER_REGISTER(cleaner_poll, cleaner, 0); +-} +- +-/* This function is main way by which OCF communicates with user +- * We don't want to use SPDK_LOG here because debugging information that is +- * associated with every print message is not helpful in callback that only prints info +- * while the real source is somewhere in OCF code */ +-static int +-vbdev_ocf_ctx_log_printf(ocf_logger_t logger, ocf_logger_lvl_t lvl, +- const char *fmt, va_list args) +-{ +- int spdk_lvl; +- +- switch (lvl) { +- case log_emerg: +- case log_alert: +- case log_crit: +- case log_err: +- spdk_lvl = SPDK_LOG_ERROR; +- break; +- +- case log_warn: +- spdk_lvl = SPDK_LOG_WARN; +- break; +- +- case log_notice: +- spdk_lvl = SPDK_LOG_NOTICE; +- break; +- +- case log_info: +- case log_debug: +- default: +- spdk_lvl = SPDK_LOG_INFO; +- } +- +- spdk_vlog(spdk_lvl, NULL, -1, NULL, fmt, args); +- return 0; +-} +- +-static const struct ocf_ctx_config vbdev_ocf_ctx_cfg = { +- .name = "OCF SPDK", +- +- .ops = { +- .data = { +- .alloc = vbdev_ocf_ctx_data_alloc, +- .free = vbdev_ocf_ctx_data_free, +- .mlock = vbdev_ocf_ctx_data_mlock, +- .munlock = 
vbdev_ocf_ctx_data_munlock, +- .read = vbdev_ocf_ctx_data_rd, +- .write = vbdev_ocf_ctx_data_wr, +- .zero = vbdev_ocf_ctx_data_zero, +- .seek = vbdev_ocf_ctx_data_seek, +- .copy = vbdev_ocf_ctx_data_cpy, +- .secure_erase = vbdev_ocf_ctx_data_secure_erase, +- }, +- +- .cleaner = { +- .init = vbdev_ocf_ctx_cleaner_init, +- .stop = vbdev_ocf_ctx_cleaner_stop, +- .kick = vbdev_ocf_ctx_cleaner_kick, +- }, +- +- .logger = { +- .print = vbdev_ocf_ctx_log_printf, +- .dump_stack = NULL, +- }, +- +- }, +-}; +- +-int +-vbdev_ocf_ctx_init(void) +-{ +- int ret; +- +- ret = ocf_ctx_create(&vbdev_ocf_ctx, &vbdev_ocf_ctx_cfg); +- if (ret < 0) { +- return ret; +- } +- +- return 0; +-} +- +-void +-vbdev_ocf_ctx_cleanup(void) +-{ +- ocf_ctx_put(vbdev_ocf_ctx); +- vbdev_ocf_ctx = NULL; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include ++#include ++ ++#include "spdk/env.h" ++#include "spdk/log.h" ++ ++#include "ctx.h" ++#include "data.h" ++ ++ocf_ctx_t vbdev_ocf_ctx; ++ ++static ctx_data_t * ++vbdev_ocf_ctx_data_alloc(uint32_t pages) ++{ ++ struct bdev_ocf_data *data; ++ void *buf; ++ uint32_t sz; ++ ++ data = vbdev_ocf_data_alloc(1); ++ ++ sz = pages * PAGE_SIZE; ++ buf = spdk_malloc(sz, PAGE_SIZE, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (buf == NULL) { ++ return NULL; ++ } ++ ++ vbdev_ocf_iovs_add(data, buf, sz); ++ ++ data->size = sz; ++ ++ return data; ++} ++ ++static void ++vbdev_ocf_ctx_data_free(ctx_data_t *ctx_data) ++{ ++ struct bdev_ocf_data *data = ctx_data; ++ int i; ++ ++ if (!data) { ++ return; ++ } ++ ++ for (i = 0; i < data->iovcnt; i++) { ++ spdk_free(data->iovs[i].iov_base); ++ } ++ ++ vbdev_ocf_data_free(data); ++} ++ ++static int ++vbdev_ocf_ctx_data_mlock(ctx_data_t *ctx_data) ++{ ++ /* TODO [mlock]: add mlock option */ ++ return 0; ++} ++ ++static void ++vbdev_ocf_ctx_data_munlock(ctx_data_t *ctx_data) ++{ ++ /* TODO [mlock]: add mlock option */ ++} ++ ++static size_t ++iovec_flatten(struct iovec *iov, size_t iovcnt, void *buf, size_t size, size_t offset) ++{ ++ size_t i, len, done = 0; ++ ++ for (i = 0; i < iovcnt; i++) { ++ if (offset >= iov[i].iov_len) { ++ offset -= iov[i].iov_len; ++ continue; ++ } ++ ++ if (iov[i].iov_base == NULL) { ++ continue; ++ } ++ ++ if (done >= size) { ++ break; ++ } ++ ++ len = MIN(size - done, iov[i].iov_len - offset); ++ memcpy(buf, iov[i].iov_base + offset, len); ++ buf += len; ++ done += len; ++ offset = 0; ++ } ++ ++ return done; ++} ++ ++static uint32_t ++vbdev_ocf_ctx_data_rd(void *dst, ctx_data_t *src, uint32_t size) ++{ ++ struct bdev_ocf_data *s = src; ++ uint32_t size_local; ++ ++ size_local = iovec_flatten(s->iovs, s->iovcnt, dst, size, s->seek); ++ s->seek += size_local; ++ ++ return size_local; ++} ++ ++static size_t ++buf_to_iovec(const void *buf, size_t size, struct iovec *iov, size_t iovcnt, size_t offset) ++{ ++ size_t i, len, done = 0; ++ ++ for (i = 0; i < iovcnt; i++) { ++ if (offset >= iov[i].iov_len) { ++ offset -= iov[i].iov_len; ++ continue; ++ } ++ ++ if (iov[i].iov_base == NULL) { ++ continue; ++ } ++ ++ if (done >= size) { ++ break; ++ } ++ ++ len = MIN(size - done, iov[i].iov_len - offset); ++ memcpy(iov[i].iov_base + offset, buf, len); ++ buf += len; ++ done += len; ++ offset = 0; ++ } ++ ++ return done; ++} ++ ++static uint32_t ++vbdev_ocf_ctx_data_wr(ctx_data_t *dst, const void *src, uint32_t size) ++{ ++ struct bdev_ocf_data *d = dst; ++ uint32_t size_local; ++ ++ size_local = buf_to_iovec(src, size, d->iovs, d->iovcnt, 
d->seek); ++ d->seek += size_local; ++ ++ return size_local; ++} ++ ++static size_t ++iovset(struct iovec *iov, size_t iovcnt, int byte, size_t size, size_t offset) ++{ ++ size_t i, len, done = 0; ++ ++ for (i = 0; i < iovcnt; i++) { ++ if (offset >= iov[i].iov_len) { ++ offset -= iov[i].iov_len; ++ continue; ++ } ++ ++ if (iov[i].iov_base == NULL) { ++ continue; ++ } ++ ++ if (done >= size) { ++ break; ++ } ++ ++ len = MIN(size - done, iov[i].iov_len - offset); ++ memset(iov[i].iov_base + offset, byte, len); ++ done += len; ++ offset = 0; ++ } ++ ++ return done; ++} ++ ++static uint32_t ++vbdev_ocf_ctx_data_zero(ctx_data_t *dst, uint32_t size) ++{ ++ struct bdev_ocf_data *d = dst; ++ uint32_t size_local; ++ ++ size_local = iovset(d->iovs, d->iovcnt, 0, size, d->seek); ++ d->seek += size_local; ++ ++ return size_local; ++} ++ ++static uint32_t ++vbdev_ocf_ctx_data_seek(ctx_data_t *dst, ctx_data_seek_t seek, uint32_t offset) ++{ ++ struct bdev_ocf_data *d = dst; ++ uint32_t off = 0; ++ ++ switch (seek) { ++ case ctx_data_seek_begin: ++ off = MIN(offset, d->size); ++ d->seek = off; ++ break; ++ case ctx_data_seek_current: ++ off = MIN(offset, d->size - d->seek); ++ d->seek += off; ++ break; ++ } ++ ++ return off; ++} ++ ++static uint64_t ++vbdev_ocf_ctx_data_cpy(ctx_data_t *dst, ctx_data_t *src, uint64_t to, ++ uint64_t from, uint64_t bytes) ++{ ++ struct bdev_ocf_data *s = src; ++ struct bdev_ocf_data *d = dst; ++ uint32_t it_iov = 0; ++ uint32_t it_off = 0; ++ uint32_t n, sz; ++ ++ bytes = MIN(bytes, s->size - from); ++ bytes = MIN(bytes, d->size - to); ++ sz = bytes; ++ ++ while (from || bytes) { ++ if (s->iovs[it_iov].iov_len == it_off) { ++ it_iov++; ++ it_off = 0; ++ continue; ++ } ++ ++ if (from) { ++ n = MIN(from, s->iovs[it_iov].iov_len); ++ from -= n; ++ } else { ++ n = MIN(bytes, s->iovs[it_iov].iov_len); ++ buf_to_iovec(s->iovs[it_iov].iov_base + it_off, n, d->iovs, d->iovcnt, to); ++ bytes -= n; ++ to += n; ++ } ++ ++ it_off += n; ++ } ++ ++ return sz; ++} ++ ++static void ++vbdev_ocf_ctx_data_secure_erase(ctx_data_t *ctx_data) ++{ ++ struct bdev_ocf_data *data = ctx_data; ++ struct iovec *iovs = data->iovs; ++ int i; ++ ++ for (i = 0; i < data->iovcnt; i++) { ++ if (env_memset(iovs[i].iov_base, iovs[i].iov_len, 0)) { ++ assert(false); ++ } ++ } ++} ++ ++int ++vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops) ++{ ++ int rc; ++ struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); ++ ++ pthread_mutex_lock(&ctx->lock); ++ rc = ocf_queue_create(cache, queue, ops); ++ pthread_mutex_unlock(&ctx->lock); ++ return rc; ++} ++ ++void ++vbdev_ocf_queue_put(ocf_queue_t queue) ++{ ++ ocf_cache_t cache = ocf_queue_get_cache(queue); ++ struct vbdev_ocf_cache_ctx *ctx = ocf_cache_get_priv(cache); ++ ++ pthread_mutex_lock(&ctx->lock); ++ ocf_queue_put(queue); ++ pthread_mutex_unlock(&ctx->lock); ++} ++ ++void ++vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx) ++{ ++ if (env_atomic_dec_return(&ctx->refcnt) == 0) { ++ pthread_mutex_destroy(&ctx->lock); ++ free(ctx); ++ } ++} ++ ++void ++vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx) ++{ ++ env_atomic_inc(&ctx->refcnt); ++} ++ ++struct cleaner_priv { ++ struct spdk_poller *poller; ++ ocf_queue_t queue; ++ uint64_t next_run; ++}; ++ ++static int ++cleaner_poll(void *arg) ++{ ++ ocf_cleaner_t cleaner = arg; ++ struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner); ++ uint32_t iono = ocf_queue_pending_io(priv->queue); ++ int i, max = spdk_min(32, iono); ++ ++ for (i = 0; i < 
max; i++) { ++ ocf_queue_run_single(priv->queue); ++ } ++ ++ if (spdk_get_ticks() >= priv->next_run) { ++ ocf_cleaner_run(cleaner, priv->queue); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ if (iono > 0) { ++ return SPDK_POLLER_BUSY; ++ } else { ++ return SPDK_POLLER_IDLE; ++ } ++} ++ ++static void ++cleaner_cmpl(ocf_cleaner_t c, uint32_t interval) ++{ ++ struct cleaner_priv *priv = ocf_cleaner_get_priv(c); ++ ++ priv->next_run = spdk_get_ticks() + ((interval * spdk_get_ticks_hz()) / 1000); ++} ++ ++static void ++cleaner_queue_kick(ocf_queue_t q) ++{ ++} ++ ++static void ++cleaner_queue_stop(ocf_queue_t q) ++{ ++ struct cleaner_priv *cpriv = ocf_queue_get_priv(q); ++ ++ if (cpriv) { ++ spdk_poller_unregister(&cpriv->poller); ++ free(cpriv); ++ } ++} ++ ++const struct ocf_queue_ops cleaner_queue_ops = { ++ .kick_sync = cleaner_queue_kick, ++ .kick = cleaner_queue_kick, ++ .stop = cleaner_queue_stop, ++}; ++ ++static int ++vbdev_ocf_ctx_cleaner_init(ocf_cleaner_t c) ++{ ++ int rc; ++ struct cleaner_priv *priv = calloc(1, sizeof(*priv)); ++ ocf_cache_t cache = ocf_cleaner_get_cache(c); ++ struct vbdev_ocf_cache_ctx *cctx = ocf_cache_get_priv(cache); ++ ++ if (priv == NULL) { ++ return -ENOMEM; ++ } ++ ++ rc = vbdev_ocf_queue_create(cache, &priv->queue, &cleaner_queue_ops); ++ if (rc) { ++ free(priv); ++ return rc; ++ } ++ ++ ocf_queue_set_priv(priv->queue, priv); ++ ++ cctx->cleaner_queue = priv->queue; ++ ++ ocf_cleaner_set_cmpl(c, cleaner_cmpl); ++ ocf_cleaner_set_priv(c, priv); ++ ++ return 0; ++} ++ ++static void ++vbdev_ocf_ctx_cleaner_stop(ocf_cleaner_t c) ++{ ++ struct cleaner_priv *priv = ocf_cleaner_get_priv(c); ++ ++ vbdev_ocf_queue_put(priv->queue); ++} ++ ++static void ++vbdev_ocf_ctx_cleaner_kick(ocf_cleaner_t cleaner) ++{ ++ struct cleaner_priv *priv = ocf_cleaner_get_priv(cleaner); ++ ++ if (priv->poller) { ++ return; ++ } ++ ++ /* We start cleaner poller at the same thread where cache was created ++ * TODO: allow user to specify core at which cleaner should run */ ++ priv->poller = SPDK_POLLER_REGISTER(cleaner_poll, cleaner, 0); ++} ++ ++/* This function is main way by which OCF communicates with user ++ * We don't want to use SPDK_LOG here because debugging information that is ++ * associated with every print message is not helpful in callback that only prints info ++ * while the real source is somewhere in OCF code */ ++static int ++vbdev_ocf_ctx_log_printf(ocf_logger_t logger, ocf_logger_lvl_t lvl, ++ const char *fmt, va_list args) ++{ ++ int spdk_lvl; ++ ++ switch (lvl) { ++ case log_emerg: ++ case log_alert: ++ case log_crit: ++ case log_err: ++ spdk_lvl = SPDK_LOG_ERROR; ++ break; ++ ++ case log_warn: ++ spdk_lvl = SPDK_LOG_WARN; ++ break; ++ ++ case log_notice: ++ spdk_lvl = SPDK_LOG_NOTICE; ++ break; ++ ++ case log_info: ++ case log_debug: ++ default: ++ spdk_lvl = SPDK_LOG_INFO; ++ } ++ ++ spdk_vlog(spdk_lvl, NULL, -1, NULL, fmt, args); ++ return 0; ++} ++ ++static const struct ocf_ctx_config vbdev_ocf_ctx_cfg = { ++ .name = "OCF SPDK", ++ ++ .ops = { ++ .data = { ++ .alloc = vbdev_ocf_ctx_data_alloc, ++ .free = vbdev_ocf_ctx_data_free, ++ .mlock = vbdev_ocf_ctx_data_mlock, ++ .munlock = vbdev_ocf_ctx_data_munlock, ++ .read = vbdev_ocf_ctx_data_rd, ++ .write = vbdev_ocf_ctx_data_wr, ++ .zero = vbdev_ocf_ctx_data_zero, ++ .seek = vbdev_ocf_ctx_data_seek, ++ .copy = vbdev_ocf_ctx_data_cpy, ++ .secure_erase = vbdev_ocf_ctx_data_secure_erase, ++ }, ++ ++ .cleaner = { ++ .init = vbdev_ocf_ctx_cleaner_init, ++ .stop = vbdev_ocf_ctx_cleaner_stop, ++ .kick = 
vbdev_ocf_ctx_cleaner_kick, ++ }, ++ ++ .logger = { ++ .print = vbdev_ocf_ctx_log_printf, ++ .dump_stack = NULL, ++ }, ++ ++ }, ++}; ++ ++int ++vbdev_ocf_ctx_init(void) ++{ ++ int ret; ++ ++ ret = ocf_ctx_create(&vbdev_ocf_ctx, &vbdev_ocf_ctx_cfg); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ return 0; ++} ++ ++void ++vbdev_ocf_ctx_cleanup(void) ++{ ++ ocf_ctx_put(vbdev_ocf_ctx); ++ vbdev_ocf_ctx = NULL; ++} +diff --git a/module/bdev/ocf/ctx.h b/module/bdev/ocf/ctx.h +index 730e1cb..6396bc7 100644 +--- a/module/bdev/ocf/ctx.h ++++ b/module/bdev/ocf/ctx.h +@@ -1,38 +1,38 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef VBDEV_OCF_CTX_H +-#define VBDEV_OCF_CTX_H +- +-#include +-#include "ocf_env.h" +-#include "spdk/thread.h" +- +-extern ocf_ctx_t vbdev_ocf_ctx; +- +-#define OCF_WRITE_FLUSH 11 +- +-#define SPDK_OBJECT 1 +- +-/* Context of cache instance */ +-struct vbdev_ocf_cache_ctx { +- ocf_queue_t mngt_queue; +- ocf_queue_t cleaner_queue; +- pthread_mutex_t lock; +- env_atomic refcnt; +-}; +- +-void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx); +-void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx); +- +-int vbdev_ocf_ctx_init(void); +-void vbdev_ocf_ctx_cleanup(void); +- +-/* Thread safe queue creation and deletion +- * These are wrappers for original ocf_queue_create() and ocf_queue_put() */ +-int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops); +-void vbdev_ocf_queue_put(ocf_queue_t queue); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef VBDEV_OCF_CTX_H ++#define VBDEV_OCF_CTX_H ++ ++#include ++#include "ocf_env.h" ++#include "spdk/thread.h" ++ ++extern ocf_ctx_t vbdev_ocf_ctx; ++ ++#define OCF_WRITE_FLUSH 11 ++ ++#define SPDK_OBJECT 1 ++ ++/* Context of cache instance */ ++struct vbdev_ocf_cache_ctx { ++ ocf_queue_t mngt_queue; ++ ocf_queue_t cleaner_queue; ++ pthread_mutex_t lock; ++ env_atomic refcnt; ++}; ++ ++void vbdev_ocf_cache_ctx_put(struct vbdev_ocf_cache_ctx *ctx); ++void vbdev_ocf_cache_ctx_get(struct vbdev_ocf_cache_ctx *ctx); ++ ++int vbdev_ocf_ctx_init(void); ++void vbdev_ocf_ctx_cleanup(void); ++ ++/* Thread safe queue creation and deletion ++ * These are wrappers for original ocf_queue_create() and ocf_queue_put() */ ++int vbdev_ocf_queue_create(ocf_cache_t cache, ocf_queue_t *queue, const struct ocf_queue_ops *ops); ++void vbdev_ocf_queue_put(ocf_queue_t queue); ++ ++#endif +diff --git a/module/bdev/ocf/data.c b/module/bdev/ocf/data.c +index ff3f347..93f852d 100644 +--- a/module/bdev/ocf/data.c ++++ b/module/bdev/ocf/data.c +@@ -1,94 +1,94 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include +-#include "spdk/bdev.h" +-#include "data.h" +- +-struct bdev_ocf_data * +-vbdev_ocf_data_alloc(uint32_t iovcnt) +-{ +- struct bdev_ocf_data *data; +- +- data = env_malloc(sizeof(*data), ENV_MEM_NOIO); +- if (!data) { +- return NULL; +- } +- +- data->seek = 0; +- +- if (iovcnt) { +- data->iovs = env_malloc(sizeof(*data->iovs) * iovcnt, ENV_MEM_NOIO); +- if (!data->iovs) { +- env_free(data); +- return NULL; +- } +- } +- +- data->iovcnt = 0; +- data->iovalloc = iovcnt; +- +- return data; +-} +- +-void +-vbdev_ocf_data_free(struct bdev_ocf_data *data) +-{ +- if (!data) { +- return; +- } +- +- if (data->iovalloc != 0) { +- env_free(data->iovs); +- } +- +- env_free(data); +-} +- +-void +-vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len) +-{ +- assert(NULL != data); +- assert(data->iovalloc != -1); +- +- if (data->iovcnt == data->iovalloc) { +- /* TODO: Realloc iovs */ +- SPDK_ERRLOG("IOV error\n"); +- } +- +- data->iovs[data->iovcnt].iov_base = base; +- data->iovs[data->iovcnt].iov_len = len; +- data->iovcnt++; +-} +- +-struct bdev_ocf_data * +-vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_ocf_data *data; +- +- if (bdev_io == NULL) { +- return NULL; +- } +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_READ: +- assert(bdev_io->u.bdev.iovs); +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- break; +- default: +- SPDK_ERRLOG("Unsupported IO type %d\n", bdev_io->type); +- return NULL; +- } +- +- data = (struct bdev_ocf_data *)bdev_io->driver_ctx; +- data->iovs = bdev_io->u.bdev.iovs; +- data->iovcnt = bdev_io->u.bdev.iovcnt; +- data->size = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; +- +- return data; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include ++#include "spdk/bdev.h" ++#include "data.h" ++ ++struct bdev_ocf_data * ++vbdev_ocf_data_alloc(uint32_t iovcnt) ++{ ++ struct bdev_ocf_data *data; ++ ++ data = env_malloc(sizeof(*data), ENV_MEM_NOIO); ++ if (!data) { ++ return NULL; ++ } ++ ++ data->seek = 0; ++ ++ if (iovcnt) { ++ data->iovs = env_malloc(sizeof(*data->iovs) * iovcnt, ENV_MEM_NOIO); ++ if (!data->iovs) { ++ env_free(data); ++ return NULL; ++ } ++ } ++ ++ data->iovcnt = 0; ++ data->iovalloc = iovcnt; ++ ++ return data; ++} ++ ++void ++vbdev_ocf_data_free(struct bdev_ocf_data *data) ++{ ++ if (!data) { ++ return; ++ } ++ ++ if (data->iovalloc != 0) { ++ env_free(data->iovs); ++ } ++ ++ env_free(data); ++} ++ ++void ++vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len) ++{ ++ assert(NULL != data); ++ assert(data->iovalloc != -1); ++ ++ if (data->iovcnt == data->iovalloc) { ++ /* TODO: Realloc iovs */ ++ SPDK_ERRLOG("IOV error\n"); ++ } ++ ++ data->iovs[data->iovcnt].iov_base = base; ++ data->iovs[data->iovcnt].iov_len = len; ++ data->iovcnt++; ++} ++ ++struct bdev_ocf_data * ++vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_ocf_data *data; ++ ++ if (bdev_io == NULL) { ++ return NULL; ++ } ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_READ: ++ assert(bdev_io->u.bdev.iovs); ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ break; ++ default: ++ SPDK_ERRLOG("Unsupported IO type %d\n", bdev_io->type); ++ return NULL; ++ } ++ ++ data = (struct bdev_ocf_data *)bdev_io->driver_ctx; ++ data->iovs = bdev_io->u.bdev.iovs; ++ data->iovcnt = bdev_io->u.bdev.iovcnt; ++ data->size = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; ++ ++ return data; ++} +diff --git a/module/bdev/ocf/data.h b/module/bdev/ocf/data.h +index e1b571d..bcf3bb8 100644 +--- a/module/bdev/ocf/data.h ++++ b/module/bdev/ocf/data.h +@@ -1,30 +1,30 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef VBDEV_OCF_DATA_H +-#define VBDEV_OCF_DATA_H +- +-#include "ocf_env.h" +-#include "spdk/bdev_module.h" +- +-struct bdev_ocf_data { +- struct iovec *iovs; +- int iovcnt; +- int iovalloc; +- uint32_t size; +- uint32_t seek; +-}; +- +-struct bdev_ocf_data *vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io); +- +-struct bdev_ocf_data *vbdev_ocf_data_alloc(uint32_t nvecs); +- +-void vbdev_ocf_data_free(struct bdev_ocf_data *data); +- +-struct bdev_ocf_data *vbdev_ocf_data_from_iov(struct iovec *iovs); +- +-void vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#ifndef VBDEV_OCF_DATA_H ++#define VBDEV_OCF_DATA_H ++ ++#include "ocf_env.h" ++#include "spdk/bdev_module.h" ++ ++struct bdev_ocf_data { ++ struct iovec *iovs; ++ int iovcnt; ++ int iovalloc; ++ uint32_t size; ++ uint32_t seek; ++}; ++ ++struct bdev_ocf_data *vbdev_ocf_data_from_spdk_io(struct spdk_bdev_io *bdev_io); ++ ++struct bdev_ocf_data *vbdev_ocf_data_alloc(uint32_t nvecs); ++ ++void vbdev_ocf_data_free(struct bdev_ocf_data *data); ++ ++struct bdev_ocf_data *vbdev_ocf_data_from_iov(struct iovec *iovs); ++ ++void vbdev_ocf_iovs_add(struct bdev_ocf_data *data, void *base, size_t len); ++ ++#endif +diff --git a/module/bdev/ocf/stats.c b/module/bdev/ocf/stats.c +index bbc91f5..e5b9343 100644 +--- a/module/bdev/ocf/stats.c ++++ b/module/bdev/ocf/stats.c +@@ -1,81 +1,81 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "ctx.h" +-#include "stats.h" +- +-int +-vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats) +-{ +- int status; +- ocf_core_t core; +- +- status = ocf_core_get_by_name(cache, core_name, strlen(core_name), &core); +- if (status) { +- return status; +- } +- +- return ocf_stats_collect_core(core, &stats->usage, &stats->reqs, &stats->blocks, &stats->errors); +-} +- +-#define WJSON_STAT(w, stats, group, field, units) \ +- spdk_json_write_named_object_begin(w, #field); \ +- spdk_json_write_named_uint64(w, "count", stats->group.field.value); \ +- spdk_json_write_named_string_fmt(w, "percentage", "%lu.%lu", \ +- stats->group.field.fraction / 100, stats->group.field.fraction % 100); \ +- spdk_json_write_named_string(w, "units", units); \ +- spdk_json_write_object_end(w); +- +-void +-vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats) +-{ +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_object_begin(w, "usage"); +- WJSON_STAT(w, stats, usage, occupancy, "4KiB blocks"); +- WJSON_STAT(w, stats, usage, free, "4KiB blocks"); +- WJSON_STAT(w, stats, usage, clean, "4KiB blocks"); +- WJSON_STAT(w, stats, usage, dirty, "4KiB blocks"); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "requests"); +- WJSON_STAT(w, stats, reqs, rd_hits, "Requests"); +- WJSON_STAT(w, stats, reqs, rd_partial_misses, "Requests"); +- WJSON_STAT(w, stats, reqs, rd_full_misses, "Requests"); +- WJSON_STAT(w, stats, reqs, rd_total, "Requests"); +- WJSON_STAT(w, stats, reqs, wr_hits, "Requests"); +- WJSON_STAT(w, stats, reqs, wr_partial_misses, "Requests"); +- WJSON_STAT(w, stats, reqs, wr_full_misses, "Requests"); +- WJSON_STAT(w, stats, reqs, wr_total, "Requests"); +- WJSON_STAT(w, stats, reqs, rd_pt, "Requests"); +- WJSON_STAT(w, stats, reqs, wr_pt, "Requests"); +- WJSON_STAT(w, stats, reqs, serviced, "Requests"); +- WJSON_STAT(w, stats, reqs, total, "Requests"); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "blocks"); +- WJSON_STAT(w, stats, blocks, core_volume_rd, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, core_volume_wr, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, core_volume_total, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, cache_volume_rd, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, cache_volume_wr, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, cache_volume_total, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, volume_rd, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, volume_wr, "4KiB blocks"); +- WJSON_STAT(w, stats, blocks, volume_total, "4KiB 
blocks"); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "errors"); +- WJSON_STAT(w, stats, errors, core_volume_rd, "Requests"); +- WJSON_STAT(w, stats, errors, core_volume_wr, "Requests"); +- WJSON_STAT(w, stats, errors, core_volume_total, "Requests"); +- WJSON_STAT(w, stats, errors, cache_volume_rd, "Requests"); +- WJSON_STAT(w, stats, errors, cache_volume_wr, "Requests"); +- WJSON_STAT(w, stats, errors, cache_volume_total, "Requests"); +- WJSON_STAT(w, stats, errors, total, "Requests"); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "ctx.h" ++#include "stats.h" ++ ++int ++vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats) ++{ ++ int status; ++ ocf_core_t core; ++ ++ status = ocf_core_get_by_name(cache, core_name, strlen(core_name), &core); ++ if (status) { ++ return status; ++ } ++ ++ return ocf_stats_collect_core(core, &stats->usage, &stats->reqs, &stats->blocks, &stats->errors); ++} ++ ++#define WJSON_STAT(w, stats, group, field, units) \ ++ spdk_json_write_named_object_begin(w, #field); \ ++ spdk_json_write_named_uint64(w, "count", stats->group.field.value); \ ++ spdk_json_write_named_string_fmt(w, "percentage", "%lu.%lu", \ ++ stats->group.field.fraction / 100, stats->group.field.fraction % 100); \ ++ spdk_json_write_named_string(w, "units", units); \ ++ spdk_json_write_object_end(w); ++ ++void ++vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats) ++{ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_object_begin(w, "usage"); ++ WJSON_STAT(w, stats, usage, occupancy, "4KiB blocks"); ++ WJSON_STAT(w, stats, usage, free, "4KiB blocks"); ++ WJSON_STAT(w, stats, usage, clean, "4KiB blocks"); ++ WJSON_STAT(w, stats, usage, dirty, "4KiB blocks"); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "requests"); ++ WJSON_STAT(w, stats, reqs, rd_hits, "Requests"); ++ WJSON_STAT(w, stats, reqs, rd_partial_misses, "Requests"); ++ WJSON_STAT(w, stats, reqs, rd_full_misses, "Requests"); ++ WJSON_STAT(w, stats, reqs, rd_total, "Requests"); ++ WJSON_STAT(w, stats, reqs, wr_hits, "Requests"); ++ WJSON_STAT(w, stats, reqs, wr_partial_misses, "Requests"); ++ WJSON_STAT(w, stats, reqs, wr_full_misses, "Requests"); ++ WJSON_STAT(w, stats, reqs, wr_total, "Requests"); ++ WJSON_STAT(w, stats, reqs, rd_pt, "Requests"); ++ WJSON_STAT(w, stats, reqs, wr_pt, "Requests"); ++ WJSON_STAT(w, stats, reqs, serviced, "Requests"); ++ WJSON_STAT(w, stats, reqs, total, "Requests"); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "blocks"); ++ WJSON_STAT(w, stats, blocks, core_volume_rd, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, core_volume_wr, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, core_volume_total, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, cache_volume_rd, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, cache_volume_wr, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, cache_volume_total, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, volume_rd, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, volume_wr, "4KiB blocks"); ++ WJSON_STAT(w, stats, blocks, volume_total, "4KiB blocks"); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "errors"); ++ WJSON_STAT(w, stats, errors, core_volume_rd, "Requests"); ++ WJSON_STAT(w, stats, errors, 
core_volume_wr, "Requests"); ++ WJSON_STAT(w, stats, errors, core_volume_total, "Requests"); ++ WJSON_STAT(w, stats, errors, cache_volume_rd, "Requests"); ++ WJSON_STAT(w, stats, errors, cache_volume_wr, "Requests"); ++ WJSON_STAT(w, stats, errors, cache_volume_total, "Requests"); ++ WJSON_STAT(w, stats, errors, total, "Requests"); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} +diff --git a/module/bdev/ocf/stats.h b/module/bdev/ocf/stats.h +index 9bfcc36..8292c5c 100644 +--- a/module/bdev/ocf/stats.h ++++ b/module/bdev/ocf/stats.h +@@ -1,23 +1,23 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef VBDEV_OCF_STATS_H +-#define VBDEV_OCF_STATS_H +- +-#include "spdk/json.h" +-#include +- +-struct vbdev_ocf_stats { +- struct ocf_stats_usage usage; +- struct ocf_stats_requests reqs; +- struct ocf_stats_blocks blocks; +- struct ocf_stats_errors errors; +-}; +- +-int vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats); +- +-void vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef VBDEV_OCF_STATS_H ++#define VBDEV_OCF_STATS_H ++ ++#include "spdk/json.h" ++#include ++ ++struct vbdev_ocf_stats { ++ struct ocf_stats_usage usage; ++ struct ocf_stats_requests reqs; ++ struct ocf_stats_blocks blocks; ++ struct ocf_stats_errors errors; ++}; ++ ++int vbdev_ocf_stats_get(ocf_cache_t cache, char *core_name, struct vbdev_ocf_stats *stats); ++ ++void vbdev_ocf_stats_write_json(struct spdk_json_write_ctx *w, struct vbdev_ocf_stats *stats); ++ ++#endif +diff --git a/module/bdev/ocf/utils.c b/module/bdev/ocf/utils.c +index 4c70c70..e7ffdbf 100644 +--- a/module/bdev/ocf/utils.c ++++ b/module/bdev/ocf/utils.c +@@ -1,134 +1,134 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/log.h" +- +-#include "utils.h" +-#include "vbdev_ocf.h" +- +-static char *cache_modes[ocf_cache_mode_max] = { +- [ocf_cache_mode_wt] = "wt", +- [ocf_cache_mode_wb] = "wb", +- [ocf_cache_mode_wa] = "wa", +- [ocf_cache_mode_pt] = "pt", +- [ocf_cache_mode_wi] = "wi", +- [ocf_cache_mode_wo] = "wo", +-}; +- +-static char *seqcutoff_policies[ocf_seq_cutoff_policy_max] = { +- [ocf_seq_cutoff_policy_always] = "always", +- [ocf_seq_cutoff_policy_full] = "full", +- [ocf_seq_cutoff_policy_never] = "never", +-}; +- +-ocf_cache_mode_t +-ocf_get_cache_mode(const char *cache_mode) +-{ +- int i; +- +- for (i = 0; i < ocf_cache_mode_max; i++) { +- if (strcmp(cache_mode, cache_modes[i]) == 0) { +- return i; +- } +- } +- +- return ocf_cache_mode_none; +-} +- +-const char * +-ocf_get_cache_modename(ocf_cache_mode_t mode) +-{ +- if (mode > ocf_cache_mode_none && mode < ocf_cache_mode_max) { +- return cache_modes[mode]; +- } else { +- return NULL; +- } +-} +- +-int +-ocf_get_cache_line_size(ocf_cache_t cache) +-{ +- return ocf_cache_get_line_size(cache) / KiB; +-} +- +-ocf_seq_cutoff_policy +-ocf_get_seqcutoff_policy(const char *policy_name) +-{ +- int policy; +- +- for (policy = 0; policy < ocf_seq_cutoff_policy_max; policy++) +- if (!strcmp(policy_name, seqcutoff_policies[policy])) { +- return policy; +- } +- +- return ocf_seq_cutoff_policy_max; +-} +- +-int +-vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, +- vbdev_ocf_mngt_callback cb, void *cb_arg) +-{ +- if (vbdev->mngt_ctx.current_step) { +- return -EBUSY; +- } +- +- memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); +- +- vbdev->mngt_ctx.current_step = path; +- vbdev->mngt_ctx.cb = cb; +- vbdev->mngt_ctx.cb_arg = cb_arg; +- +- (*vbdev->mngt_ctx.current_step)(vbdev); +- +- return 0; +-} +- +-void +-vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status) +-{ +- if (status) { +- vbdev->mngt_ctx.status = status; +- } +- +- if (vbdev->mngt_ctx.status && rollback_path) { +- vbdev->mngt_ctx.poller_fn = NULL; +- vbdev->mngt_ctx.current_step = rollback_path; +- (*vbdev->mngt_ctx.current_step)(vbdev); +- return; +- } +- +- if (vbdev->mngt_ctx.cb) { +- vbdev->mngt_ctx.cb(vbdev->mngt_ctx.status, vbdev, vbdev->mngt_ctx.cb_arg); +- } +- +- memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); +-} +- +-void +-vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status) +-{ +- if (vbdev->mngt_ctx.current_step == NULL) { +- return; +- } +- +- assert((*vbdev->mngt_ctx.current_step) != NULL); +- +- vbdev->mngt_ctx.status = status; +- +- vbdev->mngt_ctx.current_step++; +- if (*vbdev->mngt_ctx.current_step) { +- (*vbdev->mngt_ctx.current_step)(vbdev); +- return; +- } +- +- vbdev_ocf_mngt_stop(vbdev, NULL, 0); +-} +- +-int +-vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev) +-{ +- return vbdev->mngt_ctx.status; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/log.h" ++ ++#include "utils.h" ++#include "vbdev_ocf.h" ++ ++static char *cache_modes[ocf_cache_mode_max] = { ++ [ocf_cache_mode_wt] = "wt", ++ [ocf_cache_mode_wb] = "wb", ++ [ocf_cache_mode_wa] = "wa", ++ [ocf_cache_mode_pt] = "pt", ++ [ocf_cache_mode_wi] = "wi", ++ [ocf_cache_mode_wo] = "wo", ++}; ++ ++static char *seqcutoff_policies[ocf_seq_cutoff_policy_max] = { ++ [ocf_seq_cutoff_policy_always] = "always", ++ [ocf_seq_cutoff_policy_full] = "full", ++ [ocf_seq_cutoff_policy_never] = "never", ++}; ++ ++ocf_cache_mode_t ++ocf_get_cache_mode(const char *cache_mode) ++{ ++ int i; ++ ++ for (i = 0; i < ocf_cache_mode_max; i++) { ++ if (strcmp(cache_mode, cache_modes[i]) == 0) { ++ return i; ++ } ++ } ++ ++ return ocf_cache_mode_none; ++} ++ ++const char * ++ocf_get_cache_modename(ocf_cache_mode_t mode) ++{ ++ if (mode > ocf_cache_mode_none && mode < ocf_cache_mode_max) { ++ return cache_modes[mode]; ++ } else { ++ return NULL; ++ } ++} ++ ++int ++ocf_get_cache_line_size(ocf_cache_t cache) ++{ ++ return ocf_cache_get_line_size(cache) / KiB; ++} ++ ++ocf_seq_cutoff_policy ++ocf_get_seqcutoff_policy(const char *policy_name) ++{ ++ int policy; ++ ++ for (policy = 0; policy < ocf_seq_cutoff_policy_max; policy++) ++ if (!strcmp(policy_name, seqcutoff_policies[policy])) { ++ return policy; ++ } ++ ++ return ocf_seq_cutoff_policy_max; ++} ++ ++int ++vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, ++ vbdev_ocf_mngt_callback cb, void *cb_arg) ++{ ++ if (vbdev->mngt_ctx.current_step) { ++ return -EBUSY; ++ } ++ ++ memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); ++ ++ vbdev->mngt_ctx.current_step = path; ++ vbdev->mngt_ctx.cb = cb; ++ vbdev->mngt_ctx.cb_arg = cb_arg; ++ ++ (*vbdev->mngt_ctx.current_step)(vbdev); ++ ++ return 0; ++} ++ ++void ++vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status) ++{ ++ if (status) { ++ vbdev->mngt_ctx.status = status; ++ } ++ ++ if (vbdev->mngt_ctx.status && rollback_path) { ++ vbdev->mngt_ctx.poller_fn = NULL; ++ vbdev->mngt_ctx.current_step = rollback_path; ++ (*vbdev->mngt_ctx.current_step)(vbdev); ++ return; ++ } ++ ++ if (vbdev->mngt_ctx.cb) { ++ vbdev->mngt_ctx.cb(vbdev->mngt_ctx.status, vbdev, vbdev->mngt_ctx.cb_arg); ++ } ++ ++ memset(&vbdev->mngt_ctx, 0, sizeof(vbdev->mngt_ctx)); ++} ++ ++void ++vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status) ++{ ++ if (vbdev->mngt_ctx.current_step == NULL) { ++ return; ++ } ++ ++ assert((*vbdev->mngt_ctx.current_step) != NULL); ++ ++ vbdev->mngt_ctx.status = status; ++ ++ vbdev->mngt_ctx.current_step++; ++ if (*vbdev->mngt_ctx.current_step) { ++ (*vbdev->mngt_ctx.current_step)(vbdev); ++ return; ++ } ++ ++ vbdev_ocf_mngt_stop(vbdev, NULL, 0); ++} ++ ++int ++vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev) ++{ ++ return vbdev->mngt_ctx.status; ++} +diff --git a/module/bdev/ocf/utils.h b/module/bdev/ocf/utils.h +index 8f1688c..696bd52 100644 +--- a/module/bdev/ocf/utils.h ++++ b/module/bdev/ocf/utils.h +@@ -1,45 +1,45 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#ifndef VBDEV_OCF_UTILS_H +-#define VBDEV_OCF_UTILS_H +- +-#include +-#include "vbdev_ocf.h" +- +-ocf_cache_mode_t ocf_get_cache_mode(const char *cache_mode); +-const char *ocf_get_cache_modename(ocf_cache_mode_t mode); +- +-/* Get cache line size in KiB units */ +-int ocf_get_cache_line_size(ocf_cache_t cache); +- +-/* Get sequential cutoff policy by name */ +-ocf_seq_cutoff_policy ocf_get_seqcutoff_policy(const char *policy_name); +- +-/* Initiate management operation +- * Receives NULL terminated array of functions (path) +- * and callback (cb) +- * and callback argument (cb_arg) +- * This function may fail with ENOMEM or EBUSY */ +-int vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, +- vbdev_ocf_mngt_callback cb, void *cb_arg); +- +-/* Continue execution with polling operation (fn) +- * fn must invoke vbdev_ocf_mngt_continue() to stop polling +- * Poller has default timeout of 5 seconds */ +-void vbdev_ocf_mngt_poll(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn fn); +- +-/* Continue execution with next function that is on path +- * If next function is NULL, finish management operation and invoke callback */ +-void vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status); +- +-/* Stop the execution, if status is non zero set it, +- * if rollback function is not null invoke rollback +- * else invoke callback with last status returned */ +-void vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status); +- +-/* Get status */ +-int vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev); +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef VBDEV_OCF_UTILS_H ++#define VBDEV_OCF_UTILS_H ++ ++#include ++#include "vbdev_ocf.h" ++ ++ocf_cache_mode_t ocf_get_cache_mode(const char *cache_mode); ++const char *ocf_get_cache_modename(ocf_cache_mode_t mode); ++ ++/* Get cache line size in KiB units */ ++int ocf_get_cache_line_size(ocf_cache_t cache); ++ ++/* Get sequential cutoff policy by name */ ++ocf_seq_cutoff_policy ocf_get_seqcutoff_policy(const char *policy_name); ++ ++/* Initiate management operation ++ * Receives NULL terminated array of functions (path) ++ * and callback (cb) ++ * and callback argument (cb_arg) ++ * This function may fail with ENOMEM or EBUSY */ ++int vbdev_ocf_mngt_start(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *path, ++ vbdev_ocf_mngt_callback cb, void *cb_arg); ++ ++/* Continue execution with polling operation (fn) ++ * fn must invoke vbdev_ocf_mngt_continue() to stop polling ++ * Poller has default timeout of 5 seconds */ ++void vbdev_ocf_mngt_poll(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn fn); ++ ++/* Continue execution with next function that is on path ++ * If next function is NULL, finish management operation and invoke callback */ ++void vbdev_ocf_mngt_continue(struct vbdev_ocf *vbdev, int status); ++ ++/* Stop the execution, if status is non zero set it, ++ * if rollback function is not null invoke rollback ++ * else invoke callback with last status returned */ ++void vbdev_ocf_mngt_stop(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int status); ++ ++/* Get status */ ++int vbdev_ocf_mngt_get_status(struct vbdev_ocf *vbdev); ++#endif +diff --git a/module/bdev/ocf/vbdev_ocf.c b/module/bdev/ocf/vbdev_ocf.c +index 0c12856..47b87ac 100644 +--- a/module/bdev/ocf/vbdev_ocf.c ++++ b/module/bdev/ocf/vbdev_ocf.c +@@ -1,1830 +1,1830 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel 
Corporation. +- * All rights reserved. +- */ +- +-#include +-#include +-#include +- +-#include "ctx.h" +-#include "data.h" +-#include "volume.h" +-#include "utils.h" +-#include "vbdev_ocf.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +-#include "spdk/cpuset.h" +- +-static struct spdk_bdev_module ocf_if; +- +-static TAILQ_HEAD(, vbdev_ocf) g_ocf_vbdev_head +- = TAILQ_HEAD_INITIALIZER(g_ocf_vbdev_head); +- +-static TAILQ_HEAD(, examining_bdev) g_ocf_examining_bdevs_head +- = TAILQ_HEAD_INITIALIZER(g_ocf_examining_bdevs_head); +- +-bool g_fini_started = false; +- +-/* Structure for keeping list of bdevs that are claimed but not used yet */ +-struct examining_bdev { +- struct spdk_bdev *bdev; +- TAILQ_ENTRY(examining_bdev) tailq; +-}; +- +-/* Add bdev to list of claimed */ +-static void +-examine_start(struct spdk_bdev *bdev) +-{ +- struct examining_bdev *entry = malloc(sizeof(*entry)); +- +- assert(entry); +- entry->bdev = bdev; +- TAILQ_INSERT_TAIL(&g_ocf_examining_bdevs_head, entry, tailq); +-} +- +-/* Find bdev on list of claimed bdevs, then remove it, +- * if it was the last one on list then report examine done */ +-static void +-examine_done(int status, struct vbdev_ocf *vbdev, void *cb_arg) +-{ +- struct spdk_bdev *bdev = cb_arg; +- struct examining_bdev *entry, *safe, *found = NULL; +- +- TAILQ_FOREACH_SAFE(entry, &g_ocf_examining_bdevs_head, tailq, safe) { +- if (entry->bdev == bdev) { +- if (found) { +- goto remove; +- } else { +- found = entry; +- } +- } +- } +- +- assert(found); +- spdk_bdev_module_examine_done(&ocf_if); +- +-remove: +- TAILQ_REMOVE(&g_ocf_examining_bdevs_head, found, tailq); +- free(found); +-} +- +-/* Free allocated strings and structure itself +- * Used at shutdown only */ +-static void +-free_vbdev(struct vbdev_ocf *vbdev) +-{ +- if (!vbdev) { +- return; +- } +- +- free(vbdev->name); +- free(vbdev->cache.name); +- free(vbdev->core.name); +- free(vbdev); +-} +- +-/* Get existing cache base +- * that is attached to other vbdev */ +-static struct vbdev_ocf_base * +-get_other_cache_base(struct vbdev_ocf_base *base) +-{ +- struct vbdev_ocf *vbdev; +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (&vbdev->cache == base || !vbdev->cache.attached) { +- continue; +- } +- if (!strcmp(vbdev->cache.name, base->name)) { +- return &vbdev->cache; +- } +- } +- +- return NULL; +-} +- +-static bool +-is_ocf_cache_running(struct vbdev_ocf *vbdev) +-{ +- if (vbdev->cache.attached && vbdev->ocf_cache) { +- return ocf_cache_is_running(vbdev->ocf_cache); +- } +- return false; +-} +- +-/* Get existing OCF cache instance +- * that is started by other vbdev */ +-static ocf_cache_t +-get_other_cache_instance(struct vbdev_ocf *vbdev) +-{ +- struct vbdev_ocf *cmp; +- +- TAILQ_FOREACH(cmp, &g_ocf_vbdev_head, tailq) { +- if (cmp->state.doing_finish || cmp == vbdev) { +- continue; +- } +- if (strcmp(cmp->cache.name, vbdev->cache.name)) { +- continue; +- } +- if (is_ocf_cache_running(cmp)) { +- return cmp->ocf_cache; +- } +- } +- +- return NULL; +-} +- +-static void +-_remove_base_bdev(void *ctx) +-{ +- struct spdk_bdev_desc *desc = ctx; +- +- spdk_bdev_close(desc); +-} +- +-/* Close and unclaim base bdev */ +-static void +-remove_base_bdev(struct vbdev_ocf_base *base) +-{ +- if (base->attached) { +- if (base->management_channel) { +- spdk_put_io_channel(base->management_channel); +- } +- +- spdk_bdev_module_release_bdev(base->bdev); +- /* Close the underlying bdev on its same opened thread. 
*/ +- if (base->thread && base->thread != spdk_get_thread()) { +- spdk_thread_send_msg(base->thread, _remove_base_bdev, base->desc); +- } else { +- spdk_bdev_close(base->desc); +- } +- base->attached = false; +- } +-} +- +-/* Finish unregister operation */ +-static void +-unregister_finish(struct vbdev_ocf *vbdev) +-{ +- spdk_bdev_destruct_done(&vbdev->exp_bdev, vbdev->state.stop_status); +- +- if (vbdev->ocf_cache) { +- ocf_mngt_cache_put(vbdev->ocf_cache); +- } +- +- if (vbdev->cache_ctx) { +- vbdev_ocf_cache_ctx_put(vbdev->cache_ctx); +- } +- vbdev_ocf_mngt_continue(vbdev, 0); +-} +- +-static void +-close_core_bdev(struct vbdev_ocf *vbdev) +-{ +- remove_base_bdev(&vbdev->core); +- vbdev_ocf_mngt_continue(vbdev, 0); +-} +- +-static void +-remove_core_cmpl(void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = priv; +- +- ocf_mngt_cache_unlock(vbdev->ocf_cache); +- vbdev_ocf_mngt_continue(vbdev, error); +-} +- +-/* Try to lock cache, then remove core */ +-static void +-remove_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; +- +- if (error) { +- SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", +- error, vbdev->name); +- vbdev_ocf_mngt_continue(vbdev, error); +- return; +- } +- +- ocf_mngt_cache_remove_core(vbdev->ocf_core, remove_core_cmpl, vbdev); +-} +- +-/* Detach core base */ +-static void +-detach_core(struct vbdev_ocf *vbdev) +-{ +- if (is_ocf_cache_running(vbdev)) { +- ocf_mngt_cache_lock(vbdev->ocf_cache, remove_core_cache_lock_cmpl, vbdev); +- } else { +- vbdev_ocf_mngt_continue(vbdev, 0); +- } +-} +- +-static void +-close_cache_bdev(struct vbdev_ocf *vbdev) +-{ +- remove_base_bdev(&vbdev->cache); +- vbdev_ocf_mngt_continue(vbdev, 0); +-} +- +-/* Detach cache base */ +-static void +-detach_cache(struct vbdev_ocf *vbdev) +-{ +- vbdev->state.stop_status = vbdev->mngt_ctx.status; +- +- /* If some other vbdev references this cache bdev, +- * we detach this only by changing the flag, without actual close */ +- if (get_other_cache_base(&vbdev->cache)) { +- vbdev->cache.attached = false; +- } +- +- vbdev_ocf_mngt_continue(vbdev, 0); +-} +- +-static void +-stop_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = priv; +- +- vbdev_ocf_queue_put(vbdev->cache_ctx->mngt_queue); +- ocf_mngt_cache_unlock(cache); +- +- vbdev_ocf_mngt_continue(vbdev, error); +-} +- +-/* Try to lock cache, then stop it */ +-static void +-stop_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; +- +- if (error) { +- SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", +- error, vbdev->name); +- vbdev_ocf_mngt_continue(vbdev, error); +- return; +- } +- +- ocf_mngt_cache_stop(vbdev->ocf_cache, stop_vbdev_cmpl, vbdev); +-} +- +-/* Stop OCF cache object +- * vbdev_ocf is not operational after this */ +-static void +-stop_vbdev(struct vbdev_ocf *vbdev) +-{ +- if (!is_ocf_cache_running(vbdev)) { +- vbdev_ocf_mngt_continue(vbdev, 0); +- return; +- } +- +- if (!g_fini_started && get_other_cache_instance(vbdev)) { +- SPDK_NOTICELOG("Not stopping cache instance '%s'" +- " because it is referenced by other OCF bdev\n", +- vbdev->cache.name); +- vbdev_ocf_mngt_continue(vbdev, 0); +- return; +- } +- +- ocf_mngt_cache_lock(vbdev->ocf_cache, stop_vbdev_cache_lock_cmpl, vbdev); +-} +- +-static void +-flush_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = priv; +- +- ocf_mngt_cache_unlock(cache); +- 
vbdev_ocf_mngt_continue(vbdev, error); +-} +- +-static void +-flush_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; +- +- if (error) { +- SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", +- error, vbdev->name); +- vbdev_ocf_mngt_continue(vbdev, error); +- return; +- } +- +- ocf_mngt_cache_flush(vbdev->ocf_cache, flush_vbdev_cmpl, vbdev); +-} +- +-static void +-flush_vbdev(struct vbdev_ocf *vbdev) +-{ +- if (!is_ocf_cache_running(vbdev)) { +- vbdev_ocf_mngt_continue(vbdev, -EINVAL); +- return; +- } +- +- ocf_mngt_cache_lock(vbdev->ocf_cache, flush_vbdev_cache_lock_cmpl, vbdev); +-} +- +-/* Procedures called during dirty unregister */ +-vbdev_ocf_mngt_fn unregister_path_dirty[] = { +- flush_vbdev, +- stop_vbdev, +- detach_cache, +- close_cache_bdev, +- detach_core, +- close_core_bdev, +- unregister_finish, +- NULL +-}; +- +-/* Procedures called during clean unregister */ +-vbdev_ocf_mngt_fn unregister_path_clean[] = { +- flush_vbdev, +- detach_core, +- close_core_bdev, +- stop_vbdev, +- detach_cache, +- close_cache_bdev, +- unregister_finish, +- NULL +-}; +- +-/* Start asynchronous management operation using unregister_path */ +-static void +-unregister_cb(void *opaque) +-{ +- struct vbdev_ocf *vbdev = opaque; +- vbdev_ocf_mngt_fn *unregister_path; +- int rc; +- +- unregister_path = vbdev->state.doing_clean_delete ? +- unregister_path_clean : unregister_path_dirty; +- +- rc = vbdev_ocf_mngt_start(vbdev, unregister_path, NULL, NULL); +- if (rc) { +- SPDK_ERRLOG("Unable to unregister OCF bdev: %d\n", rc); +- spdk_bdev_destruct_done(&vbdev->exp_bdev, rc); +- } +-} +- +-/* Clean remove case - remove core and then cache, this order +- * will remove instance permanently */ +-static void +-_vbdev_ocf_destruct_clean(struct vbdev_ocf *vbdev) +-{ +- if (vbdev->core.attached) { +- detach_core(vbdev); +- close_core_bdev(vbdev); +- } +- +- if (vbdev->cache.attached) { +- detach_cache(vbdev); +- close_cache_bdev(vbdev); +- } +-} +- +-/* Dirty shutdown/hot remove case - remove cache and then core, this order +- * will allow us to recover this instance in the future */ +-static void +-_vbdev_ocf_destruct_dirty(struct vbdev_ocf *vbdev) +-{ +- if (vbdev->cache.attached) { +- detach_cache(vbdev); +- close_cache_bdev(vbdev); +- } +- +- if (vbdev->core.attached) { +- detach_core(vbdev); +- close_core_bdev(vbdev); +- } +-} +- +-/* Unregister io device with callback to unregister_cb +- * This function is called during spdk_bdev_unregister */ +-static int +-vbdev_ocf_destruct(void *opaque) +-{ +- struct vbdev_ocf *vbdev = opaque; +- +- if (vbdev->state.doing_finish) { +- return -EALREADY; +- } +- +- if (vbdev->state.starting && !vbdev->state.started) { +- /* Prevent before detach cache/core during register path of +- this bdev */ +- return -EBUSY; +- } +- +- vbdev->state.doing_finish = true; +- +- if (vbdev->state.started) { +- spdk_io_device_unregister(vbdev, unregister_cb); +- /* Return 1 because unregister is delayed */ +- return 1; +- } +- +- if (vbdev->state.doing_clean_delete) { +- _vbdev_ocf_destruct_clean(vbdev); +- } else { +- _vbdev_ocf_destruct_dirty(vbdev); +- } +- +- return 0; +-} +- +-/* Stop OCF cache and unregister SPDK bdev */ +-int +-vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg) +-{ +- int rc = 0; +- +- if (vbdev->state.started) { +- spdk_bdev_unregister(&vbdev->exp_bdev, cb, cb_arg); +- } else { +- rc = vbdev_ocf_destruct(vbdev); +- if (rc == 0 && cb) { +- cb(cb_arg, 0); +- } +- 
} +- +- return rc; +-} +- +-/* Remove cores permanently and then stop OCF cache and unregister SPDK bdev */ +-int +-vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), +- void *cb_arg) +-{ +- vbdev->state.doing_clean_delete = true; +- +- return vbdev_ocf_delete(vbdev, cb, cb_arg); +-} +- +- +-/* If vbdev is online, return its object */ +-struct vbdev_ocf * +-vbdev_ocf_get_by_name(const char *name) +-{ +- struct vbdev_ocf *vbdev; +- +- if (name == NULL) { +- assert(false); +- return NULL; +- } +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (vbdev->name == NULL || vbdev->state.doing_finish) { +- continue; +- } +- if (strcmp(vbdev->name, name) == 0) { +- return vbdev; +- } +- } +- return NULL; +-} +- +-/* Return matching base if parent vbdev is online */ +-struct vbdev_ocf_base * +-vbdev_ocf_get_base_by_name(const char *name) +-{ +- struct vbdev_ocf *vbdev; +- +- if (name == NULL) { +- assert(false); +- return NULL; +- } +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (vbdev->state.doing_finish) { +- continue; +- } +- +- if (vbdev->cache.name && strcmp(vbdev->cache.name, name) == 0) { +- return &vbdev->cache; +- } +- if (vbdev->core.name && strcmp(vbdev->core.name, name) == 0) { +- return &vbdev->core; +- } +- } +- return NULL; +-} +- +-/* Execute fn for each OCF device that is online or waits for base devices */ +-void +-vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx) +-{ +- struct vbdev_ocf *vbdev; +- +- assert(fn != NULL); +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (!vbdev->state.doing_finish) { +- fn(vbdev, ctx); +- } +- } +-} +- +-/* Called from OCF when SPDK_IO is completed */ +-static void +-vbdev_ocf_io_submit_cb(struct ocf_io *io, int error) +-{ +- struct spdk_bdev_io *bdev_io = io->priv1; +- +- if (error == 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } else if (error == -OCF_ERR_NO_MEM) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- } else { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- +- ocf_io_put(io); +-} +- +-/* Configure io parameters and send it to OCF */ +-static int +-io_submit_to_ocf(struct spdk_bdev_io *bdev_io, struct ocf_io *io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_READ: +- ocf_core_submit_io(io); +- return 0; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- ocf_core_submit_flush(io); +- return 0; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- ocf_core_submit_discard(io); +- return 0; +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- default: +- SPDK_ERRLOG("Unsupported IO type: %d\n", bdev_io->type); +- return -EINVAL; +- } +-} +- +-/* Submit SPDK-IO to OCF */ +-static void +-io_handle(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct vbdev_ocf *vbdev = bdev_io->bdev->ctxt; +- struct ocf_io *io = NULL; +- struct bdev_ocf_data *data = NULL; +- struct vbdev_ocf_qctx *qctx = spdk_io_channel_get_ctx(ch); +- uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; +- uint64_t offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; +- int dir, flags = 0; +- int err; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- dir = OCF_READ; +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- dir = OCF_WRITE; +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- dir = OCF_WRITE; +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- dir = OCF_WRITE; +- break; +- default: +- err = -EINVAL; +- goto fail; +- } +- +- if (bdev_io->type == 
SPDK_BDEV_IO_TYPE_FLUSH) { +- flags = OCF_WRITE_FLUSH; +- } +- +- io = ocf_core_new_io(vbdev->ocf_core, qctx->queue, offset, len, dir, 0, flags); +- if (!io) { +- err = -ENOMEM; +- goto fail; +- } +- +- data = vbdev_ocf_data_from_spdk_io(bdev_io); +- if (!data) { +- err = -ENOMEM; +- goto fail; +- } +- +- err = ocf_io_set_data(io, data, 0); +- if (err) { +- goto fail; +- } +- +- ocf_io_set_cmpl(io, bdev_io, NULL, vbdev_ocf_io_submit_cb); +- +- err = io_submit_to_ocf(bdev_io, io); +- if (err) { +- goto fail; +- } +- +- return; +- +-fail: +- if (io) { +- ocf_io_put(io); +- } +- +- if (err == -ENOMEM) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- } else { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-vbdev_ocf_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- io_handle(ch, bdev_io); +-} +- +-/* Called from bdev layer when an io to Cache vbdev is submitted */ +-static void +-vbdev_ocf_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- /* User does not have to allocate io vectors for the request, +- * so in case they are not allocated, we allocate them here */ +- spdk_bdev_io_get_buf(bdev_io, vbdev_ocf_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- io_handle(ch, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- default: +- SPDK_ERRLOG("Unknown I/O type %d\n", bdev_io->type); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- break; +- } +-} +- +-/* Called from bdev layer */ +-static bool +-vbdev_ocf_io_type_supported(void *opaque, enum spdk_bdev_io_type io_type) +-{ +- struct vbdev_ocf *vbdev = opaque; +- +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- return spdk_bdev_io_type_supported(vbdev->core.bdev, io_type); +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- default: +- return false; +- } +-} +- +-/* Called from bdev layer */ +-static struct spdk_io_channel * +-vbdev_ocf_get_io_channel(void *opaque) +-{ +- struct vbdev_ocf *bdev = opaque; +- +- return spdk_get_io_channel(bdev); +-} +- +-static int +-vbdev_ocf_dump_info_json(void *opaque, struct spdk_json_write_ctx *w) +-{ +- struct vbdev_ocf *vbdev = opaque; +- +- spdk_json_write_named_string(w, "cache_device", vbdev->cache.name); +- spdk_json_write_named_string(w, "core_device", vbdev->core.name); +- +- spdk_json_write_named_string(w, "mode", +- ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); +- spdk_json_write_named_uint32(w, "cache_line_size", +- ocf_get_cache_line_size(vbdev->ocf_cache)); +- spdk_json_write_named_bool(w, "metadata_volatile", +- vbdev->cfg.cache.metadata_volatile); +- +- return 0; +-} +- +-static void +-vbdev_ocf_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct vbdev_ocf *vbdev = bdev->ctxt; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_ocf_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", vbdev->name); +- spdk_json_write_named_string(w, "mode", +- 
ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); +- spdk_json_write_named_uint32(w, "cache_line_size", +- ocf_get_cache_line_size(vbdev->ocf_cache)); +- spdk_json_write_named_string(w, "cache_bdev_name", vbdev->cache.name); +- spdk_json_write_named_string(w, "core_bdev_name", vbdev->core.name); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-/* Cache vbdev function table +- * Used by bdev layer */ +-static struct spdk_bdev_fn_table cache_dev_fn_table = { +- .destruct = vbdev_ocf_destruct, +- .io_type_supported = vbdev_ocf_io_type_supported, +- .submit_request = vbdev_ocf_submit_request, +- .get_io_channel = vbdev_ocf_get_io_channel, +- .write_config_json = vbdev_ocf_write_json_config, +- .dump_info_json = vbdev_ocf_dump_info_json, +-}; +- +-/* Poller function for the OCF queue +- * We execute OCF requests here synchronously */ +-static int +-queue_poll(void *opaque) +-{ +- struct vbdev_ocf_qctx *qctx = opaque; +- uint32_t iono = ocf_queue_pending_io(qctx->queue); +- int i, max = spdk_min(32, iono); +- +- for (i = 0; i < max; i++) { +- ocf_queue_run_single(qctx->queue); +- } +- +- if (iono > 0) { +- return SPDK_POLLER_BUSY; +- } else { +- return SPDK_POLLER_IDLE; +- } +-} +- +-/* Called during ocf_submit_io, ocf_purge* +- * and any other requests that need to submit io */ +-static void +-vbdev_ocf_ctx_queue_kick(ocf_queue_t q) +-{ +-} +- +-/* OCF queue deinitialization +- * Called at ocf_cache_stop */ +-static void +-vbdev_ocf_ctx_queue_stop(ocf_queue_t q) +-{ +- struct vbdev_ocf_qctx *qctx = ocf_queue_get_priv(q); +- +- if (qctx) { +- spdk_put_io_channel(qctx->cache_ch); +- spdk_put_io_channel(qctx->core_ch); +- spdk_poller_unregister(&qctx->poller); +- if (qctx->allocated) { +- free(qctx); +- } +- } +-} +- +-/* Queue ops is an interface for running queue thread +- * stop() operation in called just before queue gets destroyed */ +-const struct ocf_queue_ops queue_ops = { +- .kick_sync = vbdev_ocf_ctx_queue_kick, +- .kick = vbdev_ocf_ctx_queue_kick, +- .stop = vbdev_ocf_ctx_queue_stop, +-}; +- +-/* Called on cache vbdev creation at every thread +- * We allocate OCF queues here and SPDK poller for it */ +-static int +-io_device_create_cb(void *io_device, void *ctx_buf) +-{ +- struct vbdev_ocf *vbdev = io_device; +- struct vbdev_ocf_qctx *qctx = ctx_buf; +- int rc; +- +- rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &qctx->queue, &queue_ops); +- if (rc) { +- return rc; +- } +- +- ocf_queue_set_priv(qctx->queue, qctx); +- +- qctx->vbdev = vbdev; +- qctx->cache_ch = spdk_bdev_get_io_channel(vbdev->cache.desc); +- qctx->core_ch = spdk_bdev_get_io_channel(vbdev->core.desc); +- qctx->poller = SPDK_POLLER_REGISTER(queue_poll, qctx, 0); +- +- return rc; +-} +- +-/* Called per thread +- * Put OCF queue and relaunch poller with new context to finish pending requests */ +-static void +-io_device_destroy_cb(void *io_device, void *ctx_buf) +-{ +- /* Making a copy of context to use it after io channel will be destroyed */ +- struct vbdev_ocf_qctx *copy = malloc(sizeof(*copy)); +- struct vbdev_ocf_qctx *qctx = ctx_buf; +- +- if (copy) { +- ocf_queue_set_priv(qctx->queue, copy); +- memcpy(copy, qctx, sizeof(*copy)); +- spdk_poller_unregister(&qctx->poller); +- copy->poller = SPDK_POLLER_REGISTER(queue_poll, copy, 0); +- copy->allocated = true; +- } else { +- SPDK_ERRLOG("Unable to stop OCF queue properly: %s\n", +- spdk_strerror(ENOMEM)); +- } +- +- vbdev_ocf_queue_put(qctx->queue); +-} +- +-/* OCF management queue deinitialization */ +-static void 
+-vbdev_ocf_ctx_mngt_queue_stop(ocf_queue_t q) +-{ +- struct spdk_poller *poller = ocf_queue_get_priv(q); +- +- if (poller) { +- spdk_poller_unregister(&poller); +- } +-} +- +-static int +-mngt_queue_poll(void *opaque) +-{ +- ocf_queue_t q = opaque; +- uint32_t iono = ocf_queue_pending_io(q); +- int i, max = spdk_min(32, iono); +- +- for (i = 0; i < max; i++) { +- ocf_queue_run_single(q); +- } +- +- if (iono > 0) { +- return SPDK_POLLER_BUSY; +- } else { +- return SPDK_POLLER_IDLE; +- } +-} +- +-static void +-vbdev_ocf_ctx_mngt_queue_kick(ocf_queue_t q) +-{ +-} +- +-/* Queue ops is an interface for running queue thread +- * stop() operation in called just before queue gets destroyed */ +-const struct ocf_queue_ops mngt_queue_ops = { +- .kick_sync = NULL, +- .kick = vbdev_ocf_ctx_mngt_queue_kick, +- .stop = vbdev_ocf_ctx_mngt_queue_stop, +-}; +- +-static void +-vbdev_ocf_mngt_exit(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int rc) +-{ +- vbdev->state.starting = false; +- vbdev_ocf_mngt_stop(vbdev, rollback_path, rc); +-} +- +-/* Create exported spdk object */ +-static void +-finish_register(struct vbdev_ocf *vbdev) +-{ +- int result; +- +- /* Copy properties of the base bdev */ +- vbdev->exp_bdev.blocklen = vbdev->core.bdev->blocklen; +- vbdev->exp_bdev.write_cache = vbdev->core.bdev->write_cache; +- vbdev->exp_bdev.required_alignment = vbdev->core.bdev->required_alignment; +- +- vbdev->exp_bdev.name = vbdev->name; +- vbdev->exp_bdev.product_name = "SPDK OCF"; +- +- vbdev->exp_bdev.blockcnt = vbdev->core.bdev->blockcnt; +- vbdev->exp_bdev.ctxt = vbdev; +- vbdev->exp_bdev.fn_table = &cache_dev_fn_table; +- vbdev->exp_bdev.module = &ocf_if; +- +- /* Finally register vbdev in SPDK */ +- spdk_io_device_register(vbdev, io_device_create_cb, io_device_destroy_cb, +- sizeof(struct vbdev_ocf_qctx), vbdev->name); +- result = spdk_bdev_register(&vbdev->exp_bdev); +- if (result) { +- SPDK_ERRLOG("Could not register exposed bdev %s\n", +- vbdev->name); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, result); +- return; +- } else { +- vbdev->state.started = true; +- } +- +- vbdev_ocf_mngt_continue(vbdev, result); +-} +- +-static void +-add_core_cmpl(ocf_cache_t cache, ocf_core_t core, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = priv; +- +- ocf_mngt_cache_unlock(cache); +- +- if (error) { +- SPDK_ERRLOG("Error %d, failed to add core device to cache instance %s," +- "starting rollback\n", error, vbdev->name); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); +- return; +- } else { +- vbdev->ocf_core = core; +- } +- +- vbdev_ocf_mngt_continue(vbdev, error); +-} +- +-/* Try to lock cache, then add core */ +-static void +-add_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; +- +- if (error) { +- SPDK_ERRLOG("Error %d, can not lock cache instance %s," +- "starting rollback\n", error, vbdev->name); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); +- } +- ocf_mngt_cache_add_core(vbdev->ocf_cache, &vbdev->cfg.core, add_core_cmpl, vbdev); +-} +- +-/* Add core for existing OCF cache instance */ +-static void +-add_core(struct vbdev_ocf *vbdev) +-{ +- ocf_mngt_cache_lock(vbdev->ocf_cache, add_core_cache_lock_cmpl, vbdev); +-} +- +-static void +-start_cache_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct vbdev_ocf *vbdev = priv; +- uint64_t mem_needed; +- +- ocf_mngt_cache_unlock(cache); +- +- if (error) { +- SPDK_ERRLOG("Error %d during start cache %s, starting rollback\n", +- 
error, vbdev->name); +- +- if (error == -OCF_ERR_NO_MEM) { +- ocf_mngt_get_ram_needed(cache, &vbdev->cfg.device, &mem_needed); +- +- SPDK_NOTICELOG("Try to increase hugepage memory size or cache line size. " +- "For your configuration:\nDevice size: %"PRIu64" bytes\n" +- "Cache line size: %"PRIu64" bytes\nFree memory needed to start " +- "cache: %"PRIu64" bytes\n", vbdev->cache.bdev->blockcnt * +- vbdev->cache.bdev->blocklen, vbdev->cfg.cache.cache_line_size, +- mem_needed); +- } +- +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); +- return; +- } +- +- vbdev_ocf_mngt_continue(vbdev, error); +-} +- +-static int +-create_management_queue(struct vbdev_ocf *vbdev) +-{ +- struct spdk_poller *mngt_poller; +- int rc; +- +- rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &vbdev->cache_ctx->mngt_queue, &mngt_queue_ops); +- if (rc) { +- SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc); +- return rc; +- } +- +- mngt_poller = SPDK_POLLER_REGISTER(mngt_queue_poll, vbdev->cache_ctx->mngt_queue, 100); +- if (mngt_poller == NULL) { +- SPDK_ERRLOG("Unable to initiate mngt request: %s", spdk_strerror(ENOMEM)); +- return -ENOMEM; +- } +- +- ocf_queue_set_priv(vbdev->cache_ctx->mngt_queue, mngt_poller); +- ocf_mngt_cache_set_mngt_queue(vbdev->ocf_cache, vbdev->cache_ctx->mngt_queue); +- +- return 0; +-} +- +-/* Start OCF cache, attach caching device */ +-static void +-start_cache(struct vbdev_ocf *vbdev) +-{ +- ocf_cache_t existing; +- uint32_t cache_block_size = vbdev->cache.bdev->blocklen; +- uint32_t core_block_size = vbdev->core.bdev->blocklen; +- int rc; +- +- if (is_ocf_cache_running(vbdev)) { +- vbdev_ocf_mngt_stop(vbdev, NULL, -EALREADY); +- return; +- } +- +- if (cache_block_size > core_block_size) { +- SPDK_ERRLOG("Cache bdev block size (%d) is bigger then core bdev block size (%d)\n", +- cache_block_size, core_block_size); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -EINVAL); +- return; +- } +- +- existing = get_other_cache_instance(vbdev); +- if (existing) { +- SPDK_NOTICELOG("OCF bdev %s connects to existing cache device %s\n", +- vbdev->name, vbdev->cache.name); +- vbdev->ocf_cache = existing; +- ocf_mngt_cache_get(vbdev->ocf_cache); +- vbdev->cache_ctx = ocf_cache_get_priv(existing); +- vbdev_ocf_cache_ctx_get(vbdev->cache_ctx); +- vbdev_ocf_mngt_continue(vbdev, 0); +- return; +- } +- +- vbdev->cache_ctx = calloc(1, sizeof(struct vbdev_ocf_cache_ctx)); +- if (vbdev->cache_ctx == NULL) { +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -ENOMEM); +- return; +- } +- +- vbdev_ocf_cache_ctx_get(vbdev->cache_ctx); +- pthread_mutex_init(&vbdev->cache_ctx->lock, NULL); +- +- rc = ocf_mngt_cache_start(vbdev_ocf_ctx, &vbdev->ocf_cache, &vbdev->cfg.cache, NULL); +- if (rc) { +- SPDK_ERRLOG("Could not start cache %s: %d\n", vbdev->name, rc); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc); +- return; +- } +- ocf_mngt_cache_get(vbdev->ocf_cache); +- +- ocf_cache_set_priv(vbdev->ocf_cache, vbdev->cache_ctx); +- +- rc = create_management_queue(vbdev); +- if (rc) { +- SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc); +- vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc); +- return; +- } +- +- if (vbdev->cfg.loadq) { +- ocf_mngt_cache_load(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev); +- } else { +- ocf_mngt_cache_attach(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev); +- } +-} +- +-/* Procedures called during register operation */ +-vbdev_ocf_mngt_fn register_path[] = { +- start_cache, +- add_core, +- finish_register, +- NULL +-}; +- 
+-/* Start cache instance and register OCF bdev */ +-static void +-register_vbdev(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_callback cb, void *cb_arg) +-{ +- int rc; +- +- if (!(vbdev->core.attached && vbdev->cache.attached) || vbdev->state.started) { +- cb(-EPERM, vbdev, cb_arg); +- return; +- } +- +- vbdev->state.starting = true; +- rc = vbdev_ocf_mngt_start(vbdev, register_path, cb, cb_arg); +- if (rc) { +- cb(rc, vbdev, cb_arg); +- } +-} +- +-/* Init OCF configuration options +- * for core and cache devices */ +-static void +-init_vbdev_config(struct vbdev_ocf *vbdev) +-{ +- struct vbdev_ocf_config *cfg = &vbdev->cfg; +- +- /* Initialize OCF defaults first */ +- ocf_mngt_cache_device_config_set_default(&cfg->device); +- ocf_mngt_cache_config_set_default(&cfg->cache); +- ocf_mngt_core_config_set_default(&cfg->core); +- +- snprintf(cfg->cache.name, sizeof(cfg->cache.name), "%s", vbdev->name); +- snprintf(cfg->core.name, sizeof(cfg->core.name), "%s", vbdev->core.name); +- +- cfg->device.open_cores = false; +- cfg->device.perform_test = false; +- cfg->device.discard_on_start = false; +- +- vbdev->cfg.cache.locked = true; +- +- cfg->core.volume_type = SPDK_OBJECT; +- cfg->device.volume_type = SPDK_OBJECT; +- +- if (vbdev->cfg.loadq) { +- /* When doing cache_load(), we need to set try_add to true, +- * otherwise OCF will interpret this core as new +- * instead of the inactive one */ +- vbdev->cfg.core.try_add = true; +- } else { +- /* When cache is initialized as new, set force flag to true, +- * to ignore warnings about existing metadata */ +- cfg->device.force = true; +- } +- +- /* Serialize bdev names in OCF UUID to interpret on future loads +- * Core UUID is a triple of (core name, vbdev name, cache name) +- * Cache UUID is cache bdev name */ +- cfg->device.uuid.size = strlen(vbdev->cache.name) + 1; +- cfg->device.uuid.data = vbdev->cache.name; +- +- snprintf(vbdev->uuid, VBDEV_OCF_MD_MAX_LEN, "%s %s %s", +- vbdev->core.name, vbdev->name, vbdev->cache.name); +- cfg->core.uuid.size = strlen(vbdev->uuid) + 1; +- cfg->core.uuid.data = vbdev->uuid; +- vbdev->uuid[strlen(vbdev->core.name)] = 0; +- vbdev->uuid[strlen(vbdev->core.name) + 1 + strlen(vbdev->name)] = 0; +-} +- +-/* Allocate vbdev structure object and add it to the global list */ +-static int +-init_vbdev(const char *vbdev_name, +- const char *cache_mode_name, +- const uint64_t cache_line_size, +- const char *cache_name, +- const char *core_name, +- bool loadq) +-{ +- struct vbdev_ocf *vbdev; +- int rc = 0; +- +- if (spdk_bdev_get_by_name(vbdev_name) || vbdev_ocf_get_by_name(vbdev_name)) { +- SPDK_ERRLOG("Device with name '%s' already exists\n", vbdev_name); +- return -EPERM; +- } +- +- vbdev = calloc(1, sizeof(*vbdev)); +- if (!vbdev) { +- goto error_mem; +- } +- +- vbdev->name = strdup(vbdev_name); +- if (!vbdev->name) { +- goto error_mem; +- } +- +- vbdev->cache.name = strdup(cache_name); +- if (!vbdev->cache.name) { +- goto error_mem; +- } +- +- vbdev->core.name = strdup(core_name); +- if (!vbdev->core.name) { +- goto error_mem; +- } +- +- vbdev->cache.parent = vbdev; +- vbdev->core.parent = vbdev; +- vbdev->cache.is_cache = true; +- vbdev->core.is_cache = false; +- vbdev->cfg.loadq = loadq; +- +- init_vbdev_config(vbdev); +- +- if (cache_mode_name) { +- vbdev->cfg.cache.cache_mode +- = ocf_get_cache_mode(cache_mode_name); +- } else if (!loadq) { /* In load path it is OK to pass NULL as cache mode */ +- SPDK_ERRLOG("No cache mode specified\n"); +- rc = -EINVAL; +- goto error_free; +- } +- if (vbdev->cfg.cache.cache_mode < 0) { +- 
SPDK_ERRLOG("Incorrect cache mode '%s'\n", cache_mode_name); +- rc = -EINVAL; +- goto error_free; +- } +- +- ocf_cache_line_size_t set_cache_line_size = cache_line_size ? +- (ocf_cache_line_size_t)cache_line_size * KiB : +- ocf_cache_line_size_default; +- if (set_cache_line_size == 0) { +- SPDK_ERRLOG("Cache line size should be non-zero.\n"); +- rc = -EINVAL; +- goto error_free; +- } +- vbdev->cfg.device.cache_line_size = set_cache_line_size; +- vbdev->cfg.cache.cache_line_size = set_cache_line_size; +- +- TAILQ_INSERT_TAIL(&g_ocf_vbdev_head, vbdev, tailq); +- return rc; +- +-error_mem: +- rc = -ENOMEM; +-error_free: +- free_vbdev(vbdev); +- return rc; +-} +- +-SPDK_LOG_DEPRECATION_REGISTER(bdev_ocf, "bdev_ocf support", "SPDK 23.05", 0); +- +-/* Read configuration file at the start of SPDK application +- * This adds vbdevs to global list if some mentioned in config */ +-static int +-vbdev_ocf_init(void) +-{ +- int status; +- +- SPDK_LOG_DEPRECATED(bdev_ocf); +- +- status = vbdev_ocf_ctx_init(); +- if (status) { +- SPDK_ERRLOG("OCF ctx initialization failed with=%d\n", status); +- return status; +- } +- +- status = vbdev_ocf_volume_init(); +- if (status) { +- vbdev_ocf_ctx_cleanup(); +- SPDK_ERRLOG("OCF volume initialization failed with=%d\n", status); +- return status; +- } +- +- return status; +-} +- +-/* Called after application shutdown started +- * Release memory of allocated structures here */ +-static void +-vbdev_ocf_module_fini(void) +-{ +- struct vbdev_ocf *vbdev; +- +- while ((vbdev = TAILQ_FIRST(&g_ocf_vbdev_head))) { +- TAILQ_REMOVE(&g_ocf_vbdev_head, vbdev, tailq); +- free_vbdev(vbdev); +- } +- +- vbdev_ocf_volume_cleanup(); +- vbdev_ocf_ctx_cleanup(); +-} +- +-/* When base device gets unplugged this is called +- * We will unregister cache vbdev here +- * When cache device is removed, we delete every OCF bdev that used it */ +-static void +-hotremove_cb(struct vbdev_ocf_base *base) +-{ +- struct vbdev_ocf *vbdev; +- +- if (!base->is_cache) { +- if (base->parent->state.doing_finish) { +- return; +- } +- +- SPDK_NOTICELOG("Deinitializing '%s' because its core device '%s' was removed\n", +- base->parent->name, base->name); +- vbdev_ocf_delete(base->parent, NULL, NULL); +- return; +- } +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (vbdev->state.doing_finish) { +- continue; +- } +- if (strcmp(base->name, vbdev->cache.name) == 0) { +- SPDK_NOTICELOG("Deinitializing '%s' because" +- " its cache device '%s' was removed\n", +- vbdev->name, base->name); +- vbdev_ocf_delete(vbdev, NULL, NULL); +- } +- } +-} +- +-static void +-base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- if (event_ctx) { +- hotremove_cb(event_ctx); +- } +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-/* Open base SPDK bdev and claim it */ +-static int +-attach_base(struct vbdev_ocf_base *base) +-{ +- int status; +- +- if (base->attached) { +- return -EALREADY; +- } +- +- /* If base cache bdev was already opened by other vbdev, +- * we just copy its descriptor here */ +- if (base->is_cache) { +- struct vbdev_ocf_base *existing = get_other_cache_base(base); +- if (existing) { +- base->desc = existing->desc; +- base->management_channel = existing->management_channel; +- base->attached = true; +- return 0; +- } +- } +- +- status = spdk_bdev_open_ext(base->name, true, base_bdev_event_cb, base, &base->desc); +- if (status) { +- SPDK_ERRLOG("Unable 
to open device '%s' for writing\n", base->name); +- return status; +- } +- +- status = spdk_bdev_module_claim_bdev(base->bdev, base->desc, +- &ocf_if); +- if (status) { +- SPDK_ERRLOG("Unable to claim device '%s'\n", base->name); +- spdk_bdev_close(base->desc); +- return status; +- } +- +- base->management_channel = spdk_bdev_get_io_channel(base->desc); +- if (!base->management_channel) { +- SPDK_ERRLOG("Unable to get io channel '%s'\n", base->name); +- spdk_bdev_module_release_bdev(base->bdev); +- spdk_bdev_close(base->desc); +- return -ENOMEM; +- } +- +- /* Save the thread where the base device is opened */ +- base->thread = spdk_get_thread(); +- +- base->attached = true; +- return status; +-} +- +-/* Attach base bdevs */ +-static int +-attach_base_bdevs(struct vbdev_ocf *vbdev, +- struct spdk_bdev *cache_bdev, +- struct spdk_bdev *core_bdev) +-{ +- int rc = 0; +- +- if (cache_bdev) { +- vbdev->cache.bdev = cache_bdev; +- rc |= attach_base(&vbdev->cache); +- } +- +- if (core_bdev) { +- vbdev->core.bdev = core_bdev; +- rc |= attach_base(&vbdev->core); +- } +- +- return rc; +-} +- +-/* Init and then start vbdev if all base devices are present */ +-void +-vbdev_ocf_construct(const char *vbdev_name, +- const char *cache_mode_name, +- const uint64_t cache_line_size, +- const char *cache_name, +- const char *core_name, +- bool loadq, +- void (*cb)(int, struct vbdev_ocf *, void *), +- void *cb_arg) +-{ +- int rc; +- struct spdk_bdev *cache_bdev = spdk_bdev_get_by_name(cache_name); +- struct spdk_bdev *core_bdev = spdk_bdev_get_by_name(core_name); +- struct vbdev_ocf *vbdev; +- +- rc = init_vbdev(vbdev_name, cache_mode_name, cache_line_size, cache_name, core_name, loadq); +- if (rc) { +- cb(rc, NULL, cb_arg); +- return; +- } +- +- vbdev = vbdev_ocf_get_by_name(vbdev_name); +- if (vbdev == NULL) { +- cb(-ENODEV, NULL, cb_arg); +- return; +- } +- +- if (cache_bdev == NULL) { +- SPDK_NOTICELOG("OCF bdev '%s' is waiting for cache device '%s' to connect\n", +- vbdev->name, cache_name); +- } +- if (core_bdev == NULL) { +- SPDK_NOTICELOG("OCF bdev '%s' is waiting for core device '%s' to connect\n", +- vbdev->name, core_name); +- } +- +- rc = attach_base_bdevs(vbdev, cache_bdev, core_bdev); +- if (rc) { +- cb(rc, vbdev, cb_arg); +- return; +- } +- +- if (core_bdev && cache_bdev) { +- register_vbdev(vbdev, cb, cb_arg); +- } else { +- cb(0, vbdev, cb_arg); +- } +-} +- +-/* Set new cache mode on OCF cache */ +-void +-vbdev_ocf_set_cache_mode(struct vbdev_ocf *vbdev, +- const char *cache_mode_name, +- void (*cb)(int, struct vbdev_ocf *, void *), +- void *cb_arg) +-{ +- ocf_cache_t cache; +- ocf_cache_mode_t cache_mode; +- int rc; +- +- cache = vbdev->ocf_cache; +- cache_mode = ocf_get_cache_mode(cache_mode_name); +- +- rc = ocf_mngt_cache_trylock(cache); +- if (rc) { +- cb(rc, vbdev, cb_arg); +- return; +- } +- +- rc = ocf_mngt_cache_set_mode(cache, cache_mode); +- ocf_mngt_cache_unlock(cache); +- cb(rc, vbdev, cb_arg); +-} +- +-/* Set sequential cutoff parameters on OCF cache */ +-void +-vbdev_ocf_set_seqcutoff(struct vbdev_ocf *vbdev, const char *policy_name, uint32_t threshold, +- uint32_t promotion_count, void (*cb)(int, void *), void *cb_arg) +-{ +- ocf_cache_t cache; +- ocf_seq_cutoff_policy policy; +- int rc; +- +- cache = vbdev->ocf_cache; +- +- policy = ocf_get_seqcutoff_policy(policy_name); +- if (policy == ocf_seq_cutoff_policy_max) { +- cb(OCF_ERR_INVAL, cb_arg); +- return; +- } +- +- rc = ocf_mngt_cache_trylock(cache); +- if (rc) { +- cb(rc, cb_arg); +- return; +- } +- +- rc = 
ocf_mngt_core_set_seq_cutoff_policy_all(cache, policy); +- if (rc) { +- goto end; +- } +- +- if (threshold) { +- threshold = threshold * KiB; +- +- rc = ocf_mngt_core_set_seq_cutoff_threshold_all(cache, threshold); +- if (rc) { +- goto end; +- } +- } +- +- if (promotion_count) { +- rc = ocf_mngt_core_set_seq_cutoff_promotion_count_all(cache, promotion_count); +- } +- +-end: +- ocf_mngt_cache_unlock(cache); +- cb(rc, cb_arg); +-} +- +-/* This called if new device is created in SPDK application +- * If that device named as one of base bdevs of OCF vbdev, +- * claim and open them */ +-static void +-vbdev_ocf_examine(struct spdk_bdev *bdev) +-{ +- const char *bdev_name = spdk_bdev_get_name(bdev); +- struct vbdev_ocf *vbdev; +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (vbdev->state.doing_finish) { +- continue; +- } +- +- if (!strcmp(bdev_name, vbdev->cache.name)) { +- attach_base_bdevs(vbdev, bdev, NULL); +- continue; +- } +- if (!strcmp(bdev_name, vbdev->core.name)) { +- attach_base_bdevs(vbdev, NULL, bdev); +- break; +- } +- } +- spdk_bdev_module_examine_done(&ocf_if); +-} +- +-struct metadata_probe_ctx { +- struct vbdev_ocf_base base; +- ocf_volume_t volume; +- +- struct ocf_volume_uuid *core_uuids; +- unsigned int uuid_count; +- +- int result; +- int refcnt; +-}; +- +-static void +-_examine_ctx_put(void *ctx) +-{ +- struct spdk_bdev_desc *desc = ctx; +- +- spdk_bdev_close(desc); +-} +- +-static void +-examine_ctx_put(struct metadata_probe_ctx *ctx) +-{ +- unsigned int i; +- +- ctx->refcnt--; +- if (ctx->refcnt > 0) { +- return; +- } +- +- if (ctx->result) { +- SPDK_ERRLOG("OCF metadata probe for bdev '%s' failed with %d\n", +- spdk_bdev_get_name(ctx->base.bdev), ctx->result); +- } +- +- if (ctx->base.desc) { +- /* Close the underlying bdev on its same opened thread. 
*/ +- if (ctx->base.thread && ctx->base.thread != spdk_get_thread()) { +- spdk_thread_send_msg(ctx->base.thread, _examine_ctx_put, ctx->base.desc); +- } else { +- spdk_bdev_close(ctx->base.desc); +- } +- } +- +- if (ctx->volume) { +- ocf_volume_destroy(ctx->volume); +- } +- +- if (ctx->core_uuids) { +- for (i = 0; i < ctx->uuid_count; i++) { +- free(ctx->core_uuids[i].data); +- } +- } +- free(ctx->core_uuids); +- +- examine_done(ctx->result, NULL, ctx->base.bdev); +- free(ctx); +-} +- +-static void +-metadata_probe_construct_cb(int rc, struct vbdev_ocf *vbdev, void *vctx) +-{ +- struct metadata_probe_ctx *ctx = vctx; +- +- examine_ctx_put(ctx); +-} +- +-/* This is second callback for ocf_metadata_probe_cores() +- * Here we create vbdev configurations based on UUIDs */ +-static void +-metadata_probe_cores_construct(void *priv, int error, unsigned int num_cores) +-{ +- struct metadata_probe_ctx *ctx = priv; +- const char *vbdev_name; +- const char *core_name; +- const char *cache_name; +- unsigned int i; +- +- if (error) { +- ctx->result = error; +- examine_ctx_put(ctx); +- return; +- } +- +- for (i = 0; i < num_cores; i++) { +- core_name = ocf_uuid_to_str(&ctx->core_uuids[i]); +- vbdev_name = core_name + strlen(core_name) + 1; +- cache_name = vbdev_name + strlen(vbdev_name) + 1; +- +- if (strcmp(ctx->base.bdev->name, cache_name)) { +- SPDK_NOTICELOG("OCF metadata found on %s belongs to bdev named '%s'\n", +- ctx->base.bdev->name, cache_name); +- } +- +- ctx->refcnt++; +- vbdev_ocf_construct(vbdev_name, NULL, 0, cache_name, core_name, true, +- metadata_probe_construct_cb, ctx); +- } +- +- examine_ctx_put(ctx); +-} +- +-/* This callback is called after OCF reads cores UUIDs from cache metadata +- * Here we allocate memory for those UUIDs and call ocf_metadata_probe_cores() again */ +-static void +-metadata_probe_cores_get_num(void *priv, int error, unsigned int num_cores) +-{ +- struct metadata_probe_ctx *ctx = priv; +- unsigned int i; +- +- if (error) { +- ctx->result = error; +- examine_ctx_put(ctx); +- return; +- } +- +- ctx->uuid_count = num_cores; +- ctx->core_uuids = calloc(num_cores, sizeof(struct ocf_volume_uuid)); +- if (!ctx->core_uuids) { +- ctx->result = -ENOMEM; +- examine_ctx_put(ctx); +- return; +- } +- +- for (i = 0; i < ctx->uuid_count; i++) { +- ctx->core_uuids[i].size = OCF_VOLUME_UUID_MAX_SIZE; +- ctx->core_uuids[i].data = malloc(OCF_VOLUME_UUID_MAX_SIZE); +- if (!ctx->core_uuids[i].data) { +- ctx->result = -ENOMEM; +- examine_ctx_put(ctx); +- return; +- } +- } +- +- ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, ctx->core_uuids, ctx->uuid_count, +- metadata_probe_cores_construct, ctx); +-} +- +-static void +-metadata_probe_cb(void *priv, int rc, +- struct ocf_metadata_probe_status *status) +-{ +- struct metadata_probe_ctx *ctx = priv; +- +- if (rc) { +- /* -ENODATA means device does not have cache metadata on it */ +- if (rc != -OCF_ERR_NO_METADATA) { +- ctx->result = rc; +- } +- examine_ctx_put(ctx); +- return; +- } +- +- ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, NULL, 0, +- metadata_probe_cores_get_num, ctx); +-} +- +-/* This is called after vbdev_ocf_examine +- * It allows to delay application initialization +- * until all OCF bdevs get registered +- * If vbdev has all of its base devices it starts asynchronously here +- * We first check if bdev appears in configuration, +- * if not we do metadata_probe() to create its configuration from bdev metadata */ +-static void +-vbdev_ocf_examine_disk(struct spdk_bdev *bdev) +-{ +- const char *bdev_name = 
spdk_bdev_get_name(bdev); +- struct vbdev_ocf *vbdev; +- struct metadata_probe_ctx *ctx; +- bool created_from_config = false; +- int rc; +- +- examine_start(bdev); +- +- TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { +- if (vbdev->state.doing_finish || vbdev->state.started) { +- continue; +- } +- +- if (!strcmp(bdev_name, vbdev->cache.name)) { +- examine_start(bdev); +- register_vbdev(vbdev, examine_done, bdev); +- created_from_config = true; +- continue; +- } +- if (!strcmp(bdev_name, vbdev->core.name)) { +- examine_start(bdev); +- register_vbdev(vbdev, examine_done, bdev); +- examine_done(0, NULL, bdev); +- return; +- } +- } +- +- /* If devices is discovered during config we do not check for metadata */ +- if (created_from_config) { +- examine_done(0, NULL, bdev); +- return; +- } +- +- /* Metadata probe path +- * We create temporary OCF volume and a temporary base structure +- * to use them for ocf_metadata_probe() and for bottom adapter IOs +- * Then we get UUIDs of core devices an create configurations based on them */ +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- examine_done(-ENOMEM, NULL, bdev); +- return; +- } +- +- ctx->base.bdev = bdev; +- ctx->refcnt = 1; +- +- rc = spdk_bdev_open_ext(bdev_name, true, base_bdev_event_cb, NULL, &ctx->base.desc); +- if (rc) { +- ctx->result = rc; +- examine_ctx_put(ctx); +- return; +- } +- +- rc = ocf_ctx_volume_create(vbdev_ocf_ctx, &ctx->volume, NULL, SPDK_OBJECT); +- if (rc) { +- ctx->result = rc; +- examine_ctx_put(ctx); +- return; +- } +- +- rc = ocf_volume_open(ctx->volume, &ctx->base); +- if (rc) { +- ctx->result = rc; +- examine_ctx_put(ctx); +- return; +- } +- +- /* Save the thread where the base device is opened */ +- ctx->base.thread = spdk_get_thread(); +- +- ocf_metadata_probe(vbdev_ocf_ctx, ctx->volume, metadata_probe_cb, ctx); +-} +- +-static int +-vbdev_ocf_get_ctx_size(void) +-{ +- return sizeof(struct bdev_ocf_data); +-} +- +-static void +-fini_start(void) +-{ +- g_fini_started = true; +-} +- +-/* Module-global function table +- * Does not relate to vbdev instances */ +-static struct spdk_bdev_module ocf_if = { +- .name = "ocf", +- .module_init = vbdev_ocf_init, +- .fini_start = fini_start, +- .module_fini = vbdev_ocf_module_fini, +- .get_ctx_size = vbdev_ocf_get_ctx_size, +- .examine_config = vbdev_ocf_examine, +- .examine_disk = vbdev_ocf_examine_disk, +-}; +-SPDK_BDEV_MODULE_REGISTER(ocf, &ocf_if); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include ++#include ++#include ++ ++#include "ctx.h" ++#include "data.h" ++#include "volume.h" ++#include "utils.h" ++#include "vbdev_ocf.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++#include "spdk/cpuset.h" ++ ++static struct spdk_bdev_module ocf_if; ++ ++static TAILQ_HEAD(, vbdev_ocf) g_ocf_vbdev_head ++ = TAILQ_HEAD_INITIALIZER(g_ocf_vbdev_head); ++ ++static TAILQ_HEAD(, examining_bdev) g_ocf_examining_bdevs_head ++ = TAILQ_HEAD_INITIALIZER(g_ocf_examining_bdevs_head); ++ ++bool g_fini_started = false; ++ ++/* Structure for keeping list of bdevs that are claimed but not used yet */ ++struct examining_bdev { ++ struct spdk_bdev *bdev; ++ TAILQ_ENTRY(examining_bdev) tailq; ++}; ++ ++/* Add bdev to list of claimed */ ++static void ++examine_start(struct spdk_bdev *bdev) ++{ ++ struct examining_bdev *entry = malloc(sizeof(*entry)); ++ ++ assert(entry); ++ entry->bdev = bdev; ++ TAILQ_INSERT_TAIL(&g_ocf_examining_bdevs_head, entry, tailq); ++} ++ ++/* Find bdev on list of claimed bdevs, then remove it, ++ * if it was the last one on list then report examine done */ ++static void ++examine_done(int status, struct vbdev_ocf *vbdev, void *cb_arg) ++{ ++ struct spdk_bdev *bdev = cb_arg; ++ struct examining_bdev *entry, *safe, *found = NULL; ++ ++ TAILQ_FOREACH_SAFE(entry, &g_ocf_examining_bdevs_head, tailq, safe) { ++ if (entry->bdev == bdev) { ++ if (found) { ++ goto remove; ++ } else { ++ found = entry; ++ } ++ } ++ } ++ ++ assert(found); ++ spdk_bdev_module_examine_done(&ocf_if); ++ ++remove: ++ TAILQ_REMOVE(&g_ocf_examining_bdevs_head, found, tailq); ++ free(found); ++} ++ ++/* Free allocated strings and structure itself ++ * Used at shutdown only */ ++static void ++free_vbdev(struct vbdev_ocf *vbdev) ++{ ++ if (!vbdev) { ++ return; ++ } ++ ++ free(vbdev->name); ++ free(vbdev->cache.name); ++ free(vbdev->core.name); ++ free(vbdev); ++} ++ ++/* Get existing cache base ++ * that is attached to other vbdev */ ++static struct vbdev_ocf_base * ++get_other_cache_base(struct vbdev_ocf_base *base) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (&vbdev->cache == base || !vbdev->cache.attached) { ++ continue; ++ } ++ if (!strcmp(vbdev->cache.name, base->name)) { ++ return &vbdev->cache; ++ } ++ } ++ ++ return NULL; ++} ++ ++static bool ++is_ocf_cache_running(struct vbdev_ocf *vbdev) ++{ ++ if (vbdev->cache.attached && vbdev->ocf_cache) { ++ return ocf_cache_is_running(vbdev->ocf_cache); ++ } ++ return false; ++} ++ ++/* Get existing OCF cache instance ++ * that is started by other vbdev */ ++static ocf_cache_t ++get_other_cache_instance(struct vbdev_ocf *vbdev) ++{ ++ struct vbdev_ocf *cmp; ++ ++ TAILQ_FOREACH(cmp, &g_ocf_vbdev_head, tailq) { ++ if (cmp->state.doing_finish || cmp == vbdev) { ++ continue; ++ } ++ if (strcmp(cmp->cache.name, vbdev->cache.name)) { ++ continue; ++ } ++ if (is_ocf_cache_running(cmp)) { ++ return cmp->ocf_cache; ++ } ++ } ++ ++ return NULL; ++} ++ ++static void ++_remove_base_bdev(void *ctx) ++{ ++ struct spdk_bdev_desc *desc = ctx; ++ ++ spdk_bdev_close(desc); ++} ++ ++/* Close and unclaim base bdev */ ++static void ++remove_base_bdev(struct vbdev_ocf_base *base) ++{ ++ if (base->attached) { ++ if (base->management_channel) { ++ spdk_put_io_channel(base->management_channel); ++ } ++ ++ spdk_bdev_module_release_bdev(base->bdev); ++ /* Close the underlying bdev on its same opened thread. 
*/ ++ if (base->thread && base->thread != spdk_get_thread()) { ++ spdk_thread_send_msg(base->thread, _remove_base_bdev, base->desc); ++ } else { ++ spdk_bdev_close(base->desc); ++ } ++ base->attached = false; ++ } ++} ++ ++/* Finish unregister operation */ ++static void ++unregister_finish(struct vbdev_ocf *vbdev) ++{ ++ spdk_bdev_destruct_done(&vbdev->exp_bdev, vbdev->state.stop_status); ++ ++ if (vbdev->ocf_cache) { ++ ocf_mngt_cache_put(vbdev->ocf_cache); ++ } ++ ++ if (vbdev->cache_ctx) { ++ vbdev_ocf_cache_ctx_put(vbdev->cache_ctx); ++ } ++ vbdev_ocf_mngt_continue(vbdev, 0); ++} ++ ++static void ++close_core_bdev(struct vbdev_ocf *vbdev) ++{ ++ remove_base_bdev(&vbdev->core); ++ vbdev_ocf_mngt_continue(vbdev, 0); ++} ++ ++static void ++remove_core_cmpl(void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = priv; ++ ++ ocf_mngt_cache_unlock(vbdev->ocf_cache); ++ vbdev_ocf_mngt_continue(vbdev, error); ++} ++ ++/* Try to lock cache, then remove core */ ++static void ++remove_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", ++ error, vbdev->name); ++ vbdev_ocf_mngt_continue(vbdev, error); ++ return; ++ } ++ ++ ocf_mngt_cache_remove_core(vbdev->ocf_core, remove_core_cmpl, vbdev); ++} ++ ++/* Detach core base */ ++static void ++detach_core(struct vbdev_ocf *vbdev) ++{ ++ if (is_ocf_cache_running(vbdev)) { ++ ocf_mngt_cache_lock(vbdev->ocf_cache, remove_core_cache_lock_cmpl, vbdev); ++ } else { ++ vbdev_ocf_mngt_continue(vbdev, 0); ++ } ++} ++ ++static void ++close_cache_bdev(struct vbdev_ocf *vbdev) ++{ ++ remove_base_bdev(&vbdev->cache); ++ vbdev_ocf_mngt_continue(vbdev, 0); ++} ++ ++/* Detach cache base */ ++static void ++detach_cache(struct vbdev_ocf *vbdev) ++{ ++ vbdev->state.stop_status = vbdev->mngt_ctx.status; ++ ++ /* If some other vbdev references this cache bdev, ++ * we detach this only by changing the flag, without actual close */ ++ if (get_other_cache_base(&vbdev->cache)) { ++ vbdev->cache.attached = false; ++ } ++ ++ vbdev_ocf_mngt_continue(vbdev, 0); ++} ++ ++static void ++stop_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = priv; ++ ++ vbdev_ocf_queue_put(vbdev->cache_ctx->mngt_queue); ++ ocf_mngt_cache_unlock(cache); ++ ++ vbdev_ocf_mngt_continue(vbdev, error); ++} ++ ++/* Try to lock cache, then stop it */ ++static void ++stop_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", ++ error, vbdev->name); ++ vbdev_ocf_mngt_continue(vbdev, error); ++ return; ++ } ++ ++ ocf_mngt_cache_stop(vbdev->ocf_cache, stop_vbdev_cmpl, vbdev); ++} ++ ++/* Stop OCF cache object ++ * vbdev_ocf is not operational after this */ ++static void ++stop_vbdev(struct vbdev_ocf *vbdev) ++{ ++ if (!is_ocf_cache_running(vbdev)) { ++ vbdev_ocf_mngt_continue(vbdev, 0); ++ return; ++ } ++ ++ if (!g_fini_started && get_other_cache_instance(vbdev)) { ++ SPDK_NOTICELOG("Not stopping cache instance '%s'" ++ " because it is referenced by other OCF bdev\n", ++ vbdev->cache.name); ++ vbdev_ocf_mngt_continue(vbdev, 0); ++ return; ++ } ++ ++ ocf_mngt_cache_lock(vbdev->ocf_cache, stop_vbdev_cache_lock_cmpl, vbdev); ++} ++ ++static void ++flush_vbdev_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = priv; ++ ++ ocf_mngt_cache_unlock(cache); ++ 
vbdev_ocf_mngt_continue(vbdev, error); ++} ++ ++static void ++flush_vbdev_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d, can not lock cache instance %s\n", ++ error, vbdev->name); ++ vbdev_ocf_mngt_continue(vbdev, error); ++ return; ++ } ++ ++ ocf_mngt_cache_flush(vbdev->ocf_cache, flush_vbdev_cmpl, vbdev); ++} ++ ++static void ++flush_vbdev(struct vbdev_ocf *vbdev) ++{ ++ if (!is_ocf_cache_running(vbdev)) { ++ vbdev_ocf_mngt_continue(vbdev, -EINVAL); ++ return; ++ } ++ ++ ocf_mngt_cache_lock(vbdev->ocf_cache, flush_vbdev_cache_lock_cmpl, vbdev); ++} ++ ++/* Procedures called during dirty unregister */ ++vbdev_ocf_mngt_fn unregister_path_dirty[] = { ++ flush_vbdev, ++ stop_vbdev, ++ detach_cache, ++ close_cache_bdev, ++ detach_core, ++ close_core_bdev, ++ unregister_finish, ++ NULL ++}; ++ ++/* Procedures called during clean unregister */ ++vbdev_ocf_mngt_fn unregister_path_clean[] = { ++ flush_vbdev, ++ detach_core, ++ close_core_bdev, ++ stop_vbdev, ++ detach_cache, ++ close_cache_bdev, ++ unregister_finish, ++ NULL ++}; ++ ++/* Start asynchronous management operation using unregister_path */ ++static void ++unregister_cb(void *opaque) ++{ ++ struct vbdev_ocf *vbdev = opaque; ++ vbdev_ocf_mngt_fn *unregister_path; ++ int rc; ++ ++ unregister_path = vbdev->state.doing_clean_delete ? ++ unregister_path_clean : unregister_path_dirty; ++ ++ rc = vbdev_ocf_mngt_start(vbdev, unregister_path, NULL, NULL); ++ if (rc) { ++ SPDK_ERRLOG("Unable to unregister OCF bdev: %d\n", rc); ++ spdk_bdev_destruct_done(&vbdev->exp_bdev, rc); ++ } ++} ++ ++/* Clean remove case - remove core and then cache, this order ++ * will remove instance permanently */ ++static void ++_vbdev_ocf_destruct_clean(struct vbdev_ocf *vbdev) ++{ ++ if (vbdev->core.attached) { ++ detach_core(vbdev); ++ close_core_bdev(vbdev); ++ } ++ ++ if (vbdev->cache.attached) { ++ detach_cache(vbdev); ++ close_cache_bdev(vbdev); ++ } ++} ++ ++/* Dirty shutdown/hot remove case - remove cache and then core, this order ++ * will allow us to recover this instance in the future */ ++static void ++_vbdev_ocf_destruct_dirty(struct vbdev_ocf *vbdev) ++{ ++ if (vbdev->cache.attached) { ++ detach_cache(vbdev); ++ close_cache_bdev(vbdev); ++ } ++ ++ if (vbdev->core.attached) { ++ detach_core(vbdev); ++ close_core_bdev(vbdev); ++ } ++} ++ ++/* Unregister io device with callback to unregister_cb ++ * This function is called during spdk_bdev_unregister */ ++static int ++vbdev_ocf_destruct(void *opaque) ++{ ++ struct vbdev_ocf *vbdev = opaque; ++ ++ if (vbdev->state.doing_finish) { ++ return -EALREADY; ++ } ++ ++ if (vbdev->state.starting && !vbdev->state.started) { ++ /* Prevent before detach cache/core during register path of ++ this bdev */ ++ return -EBUSY; ++ } ++ ++ vbdev->state.doing_finish = true; ++ ++ if (vbdev->state.started) { ++ spdk_io_device_unregister(vbdev, unregister_cb); ++ /* Return 1 because unregister is delayed */ ++ return 1; ++ } ++ ++ if (vbdev->state.doing_clean_delete) { ++ _vbdev_ocf_destruct_clean(vbdev); ++ } else { ++ _vbdev_ocf_destruct_dirty(vbdev); ++ } ++ ++ return 0; ++} ++ ++/* Stop OCF cache and unregister SPDK bdev */ ++int ++vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg) ++{ ++ int rc = 0; ++ ++ if (vbdev->state.started) { ++ spdk_bdev_unregister(&vbdev->exp_bdev, cb, cb_arg); ++ } else { ++ rc = vbdev_ocf_destruct(vbdev); ++ if (rc == 0 && cb) { ++ cb(cb_arg, 0); ++ } ++ 
} ++ ++ return rc; ++} ++ ++/* Remove cores permanently and then stop OCF cache and unregister SPDK bdev */ ++int ++vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), ++ void *cb_arg) ++{ ++ vbdev->state.doing_clean_delete = true; ++ ++ return vbdev_ocf_delete(vbdev, cb, cb_arg); ++} ++ ++ ++/* If vbdev is online, return its object */ ++struct vbdev_ocf * ++vbdev_ocf_get_by_name(const char *name) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ if (name == NULL) { ++ assert(false); ++ return NULL; ++ } ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (vbdev->name == NULL || vbdev->state.doing_finish) { ++ continue; ++ } ++ if (strcmp(vbdev->name, name) == 0) { ++ return vbdev; ++ } ++ } ++ return NULL; ++} ++ ++/* Return matching base if parent vbdev is online */ ++struct vbdev_ocf_base * ++vbdev_ocf_get_base_by_name(const char *name) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ if (name == NULL) { ++ assert(false); ++ return NULL; ++ } ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (vbdev->state.doing_finish) { ++ continue; ++ } ++ ++ if (vbdev->cache.name && strcmp(vbdev->cache.name, name) == 0) { ++ return &vbdev->cache; ++ } ++ if (vbdev->core.name && strcmp(vbdev->core.name, name) == 0) { ++ return &vbdev->core; ++ } ++ } ++ return NULL; ++} ++ ++/* Execute fn for each OCF device that is online or waits for base devices */ ++void ++vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ assert(fn != NULL); ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (!vbdev->state.doing_finish) { ++ fn(vbdev, ctx); ++ } ++ } ++} ++ ++/* Called from OCF when SPDK_IO is completed */ ++static void ++vbdev_ocf_io_submit_cb(struct ocf_io *io, int error) ++{ ++ struct spdk_bdev_io *bdev_io = io->priv1; ++ ++ if (error == 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } else if (error == -OCF_ERR_NO_MEM) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ } else { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ ++ ocf_io_put(io); ++} ++ ++/* Configure io parameters and send it to OCF */ ++static int ++io_submit_to_ocf(struct spdk_bdev_io *bdev_io, struct ocf_io *io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_READ: ++ ocf_core_submit_io(io); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ ocf_core_submit_flush(io); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ ocf_core_submit_discard(io); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ default: ++ SPDK_ERRLOG("Unsupported IO type: %d\n", bdev_io->type); ++ return -EINVAL; ++ } ++} ++ ++/* Submit SPDK-IO to OCF */ ++static void ++io_handle(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct vbdev_ocf *vbdev = bdev_io->bdev->ctxt; ++ struct ocf_io *io = NULL; ++ struct bdev_ocf_data *data = NULL; ++ struct vbdev_ocf_qctx *qctx = spdk_io_channel_get_ctx(ch); ++ uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; ++ uint64_t offset = bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen; ++ int dir, flags = 0; ++ int err; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ dir = OCF_READ; ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ dir = OCF_WRITE; ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ dir = OCF_WRITE; ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ dir = OCF_WRITE; ++ break; ++ default: ++ err = -EINVAL; ++ goto fail; ++ } ++ ++ if (bdev_io->type == 
SPDK_BDEV_IO_TYPE_FLUSH) { ++ flags = OCF_WRITE_FLUSH; ++ } ++ ++ io = ocf_core_new_io(vbdev->ocf_core, qctx->queue, offset, len, dir, 0, flags); ++ if (!io) { ++ err = -ENOMEM; ++ goto fail; ++ } ++ ++ data = vbdev_ocf_data_from_spdk_io(bdev_io); ++ if (!data) { ++ err = -ENOMEM; ++ goto fail; ++ } ++ ++ err = ocf_io_set_data(io, data, 0); ++ if (err) { ++ goto fail; ++ } ++ ++ ocf_io_set_cmpl(io, bdev_io, NULL, vbdev_ocf_io_submit_cb); ++ ++ err = io_submit_to_ocf(bdev_io, io); ++ if (err) { ++ goto fail; ++ } ++ ++ return; ++ ++fail: ++ if (io) { ++ ocf_io_put(io); ++ } ++ ++ if (err == -ENOMEM) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ } else { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++vbdev_ocf_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ io_handle(ch, bdev_io); ++} ++ ++/* Called from bdev layer when an io to Cache vbdev is submitted */ ++static void ++vbdev_ocf_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ /* User does not have to allocate io vectors for the request, ++ * so in case they are not allocated, we allocate them here */ ++ spdk_bdev_io_get_buf(bdev_io, vbdev_ocf_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ io_handle(ch, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ default: ++ SPDK_ERRLOG("Unknown I/O type %d\n", bdev_io->type); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ break; ++ } ++} ++ ++/* Called from bdev layer */ ++static bool ++vbdev_ocf_io_type_supported(void *opaque, enum spdk_bdev_io_type io_type) ++{ ++ struct vbdev_ocf *vbdev = opaque; ++ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ return spdk_bdev_io_type_supported(vbdev->core.bdev, io_type); ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ default: ++ return false; ++ } ++} ++ ++/* Called from bdev layer */ ++static struct spdk_io_channel * ++vbdev_ocf_get_io_channel(void *opaque) ++{ ++ struct vbdev_ocf *bdev = opaque; ++ ++ return spdk_get_io_channel(bdev); ++} ++ ++static int ++vbdev_ocf_dump_info_json(void *opaque, struct spdk_json_write_ctx *w) ++{ ++ struct vbdev_ocf *vbdev = opaque; ++ ++ spdk_json_write_named_string(w, "cache_device", vbdev->cache.name); ++ spdk_json_write_named_string(w, "core_device", vbdev->core.name); ++ ++ spdk_json_write_named_string(w, "mode", ++ ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); ++ spdk_json_write_named_uint32(w, "cache_line_size", ++ ocf_get_cache_line_size(vbdev->ocf_cache)); ++ spdk_json_write_named_bool(w, "metadata_volatile", ++ vbdev->cfg.cache.metadata_volatile); ++ ++ return 0; ++} ++ ++static void ++vbdev_ocf_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct vbdev_ocf *vbdev = bdev->ctxt; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_ocf_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", vbdev->name); ++ spdk_json_write_named_string(w, "mode", ++ 
ocf_get_cache_modename(ocf_cache_get_mode(vbdev->ocf_cache))); ++ spdk_json_write_named_uint32(w, "cache_line_size", ++ ocf_get_cache_line_size(vbdev->ocf_cache)); ++ spdk_json_write_named_string(w, "cache_bdev_name", vbdev->cache.name); ++ spdk_json_write_named_string(w, "core_bdev_name", vbdev->core.name); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++/* Cache vbdev function table ++ * Used by bdev layer */ ++static struct spdk_bdev_fn_table cache_dev_fn_table = { ++ .destruct = vbdev_ocf_destruct, ++ .io_type_supported = vbdev_ocf_io_type_supported, ++ .submit_request = vbdev_ocf_submit_request, ++ .get_io_channel = vbdev_ocf_get_io_channel, ++ .write_config_json = vbdev_ocf_write_json_config, ++ .dump_info_json = vbdev_ocf_dump_info_json, ++}; ++ ++/* Poller function for the OCF queue ++ * We execute OCF requests here synchronously */ ++static int ++queue_poll(void *opaque) ++{ ++ struct vbdev_ocf_qctx *qctx = opaque; ++ uint32_t iono = ocf_queue_pending_io(qctx->queue); ++ int i, max = spdk_min(32, iono); ++ ++ for (i = 0; i < max; i++) { ++ ocf_queue_run_single(qctx->queue); ++ } ++ ++ if (iono > 0) { ++ return SPDK_POLLER_BUSY; ++ } else { ++ return SPDK_POLLER_IDLE; ++ } ++} ++ ++/* Called during ocf_submit_io, ocf_purge* ++ * and any other requests that need to submit io */ ++static void ++vbdev_ocf_ctx_queue_kick(ocf_queue_t q) ++{ ++} ++ ++/* OCF queue deinitialization ++ * Called at ocf_cache_stop */ ++static void ++vbdev_ocf_ctx_queue_stop(ocf_queue_t q) ++{ ++ struct vbdev_ocf_qctx *qctx = ocf_queue_get_priv(q); ++ ++ if (qctx) { ++ spdk_put_io_channel(qctx->cache_ch); ++ spdk_put_io_channel(qctx->core_ch); ++ spdk_poller_unregister(&qctx->poller); ++ if (qctx->allocated) { ++ free(qctx); ++ } ++ } ++} ++ ++/* Queue ops is an interface for running queue thread ++ * stop() operation in called just before queue gets destroyed */ ++const struct ocf_queue_ops queue_ops = { ++ .kick_sync = vbdev_ocf_ctx_queue_kick, ++ .kick = vbdev_ocf_ctx_queue_kick, ++ .stop = vbdev_ocf_ctx_queue_stop, ++}; ++ ++/* Called on cache vbdev creation at every thread ++ * We allocate OCF queues here and SPDK poller for it */ ++static int ++io_device_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct vbdev_ocf *vbdev = io_device; ++ struct vbdev_ocf_qctx *qctx = ctx_buf; ++ int rc; ++ ++ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &qctx->queue, &queue_ops); ++ if (rc) { ++ return rc; ++ } ++ ++ ocf_queue_set_priv(qctx->queue, qctx); ++ ++ qctx->vbdev = vbdev; ++ qctx->cache_ch = spdk_bdev_get_io_channel(vbdev->cache.desc); ++ qctx->core_ch = spdk_bdev_get_io_channel(vbdev->core.desc); ++ qctx->poller = SPDK_POLLER_REGISTER(queue_poll, qctx, 0); ++ ++ return rc; ++} ++ ++/* Called per thread ++ * Put OCF queue and relaunch poller with new context to finish pending requests */ ++static void ++io_device_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ /* Making a copy of context to use it after io channel will be destroyed */ ++ struct vbdev_ocf_qctx *copy = malloc(sizeof(*copy)); ++ struct vbdev_ocf_qctx *qctx = ctx_buf; ++ ++ if (copy) { ++ ocf_queue_set_priv(qctx->queue, copy); ++ memcpy(copy, qctx, sizeof(*copy)); ++ spdk_poller_unregister(&qctx->poller); ++ copy->poller = SPDK_POLLER_REGISTER(queue_poll, copy, 0); ++ copy->allocated = true; ++ } else { ++ SPDK_ERRLOG("Unable to stop OCF queue properly: %s\n", ++ spdk_strerror(ENOMEM)); ++ } ++ ++ vbdev_ocf_queue_put(qctx->queue); ++} ++ ++/* OCF management queue deinitialization */ ++static void 
++vbdev_ocf_ctx_mngt_queue_stop(ocf_queue_t q) ++{ ++ struct spdk_poller *poller = ocf_queue_get_priv(q); ++ ++ if (poller) { ++ spdk_poller_unregister(&poller); ++ } ++} ++ ++static int ++mngt_queue_poll(void *opaque) ++{ ++ ocf_queue_t q = opaque; ++ uint32_t iono = ocf_queue_pending_io(q); ++ int i, max = spdk_min(32, iono); ++ ++ for (i = 0; i < max; i++) { ++ ocf_queue_run_single(q); ++ } ++ ++ if (iono > 0) { ++ return SPDK_POLLER_BUSY; ++ } else { ++ return SPDK_POLLER_IDLE; ++ } ++} ++ ++static void ++vbdev_ocf_ctx_mngt_queue_kick(ocf_queue_t q) ++{ ++} ++ ++/* Queue ops is an interface for running queue thread ++ * stop() operation in called just before queue gets destroyed */ ++const struct ocf_queue_ops mngt_queue_ops = { ++ .kick_sync = NULL, ++ .kick = vbdev_ocf_ctx_mngt_queue_kick, ++ .stop = vbdev_ocf_ctx_mngt_queue_stop, ++}; ++ ++static void ++vbdev_ocf_mngt_exit(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_fn *rollback_path, int rc) ++{ ++ vbdev->state.starting = false; ++ vbdev_ocf_mngt_stop(vbdev, rollback_path, rc); ++} ++ ++/* Create exported spdk object */ ++static void ++finish_register(struct vbdev_ocf *vbdev) ++{ ++ int result; ++ ++ /* Copy properties of the base bdev */ ++ vbdev->exp_bdev.blocklen = vbdev->core.bdev->blocklen; ++ vbdev->exp_bdev.write_cache = vbdev->core.bdev->write_cache; ++ vbdev->exp_bdev.required_alignment = vbdev->core.bdev->required_alignment; ++ ++ vbdev->exp_bdev.name = vbdev->name; ++ vbdev->exp_bdev.product_name = "SPDK OCF"; ++ ++ vbdev->exp_bdev.blockcnt = vbdev->core.bdev->blockcnt; ++ vbdev->exp_bdev.ctxt = vbdev; ++ vbdev->exp_bdev.fn_table = &cache_dev_fn_table; ++ vbdev->exp_bdev.module = &ocf_if; ++ ++ /* Finally register vbdev in SPDK */ ++ spdk_io_device_register(vbdev, io_device_create_cb, io_device_destroy_cb, ++ sizeof(struct vbdev_ocf_qctx), vbdev->name); ++ result = spdk_bdev_register(&vbdev->exp_bdev); ++ if (result) { ++ SPDK_ERRLOG("Could not register exposed bdev %s\n", ++ vbdev->name); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, result); ++ return; ++ } else { ++ vbdev->state.started = true; ++ } ++ ++ vbdev_ocf_mngt_continue(vbdev, result); ++} ++ ++static void ++add_core_cmpl(ocf_cache_t cache, ocf_core_t core, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = priv; ++ ++ ocf_mngt_cache_unlock(cache); ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d, failed to add core device to cache instance %s," ++ "starting rollback\n", error, vbdev->name); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); ++ return; ++ } else { ++ vbdev->ocf_core = core; ++ } ++ ++ vbdev_ocf_mngt_continue(vbdev, error); ++} ++ ++/* Try to lock cache, then add core */ ++static void ++add_core_cache_lock_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = (struct vbdev_ocf *)priv; ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d, can not lock cache instance %s," ++ "starting rollback\n", error, vbdev->name); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); ++ } ++ ocf_mngt_cache_add_core(vbdev->ocf_cache, &vbdev->cfg.core, add_core_cmpl, vbdev); ++} ++ ++/* Add core for existing OCF cache instance */ ++static void ++add_core(struct vbdev_ocf *vbdev) ++{ ++ ocf_mngt_cache_lock(vbdev->ocf_cache, add_core_cache_lock_cmpl, vbdev); ++} ++ ++static void ++start_cache_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct vbdev_ocf *vbdev = priv; ++ uint64_t mem_needed; ++ ++ ocf_mngt_cache_unlock(cache); ++ ++ if (error) { ++ SPDK_ERRLOG("Error %d during start cache %s, starting rollback\n", ++ 
error, vbdev->name); ++ ++ if (error == -OCF_ERR_NO_MEM) { ++ ocf_mngt_get_ram_needed(cache, &vbdev->cfg.device, &mem_needed); ++ ++ SPDK_NOTICELOG("Try to increase hugepage memory size or cache line size. " ++ "For your configuration:\nDevice size: %"PRIu64" bytes\n" ++ "Cache line size: %"PRIu64" bytes\nFree memory needed to start " ++ "cache: %"PRIu64" bytes\n", vbdev->cache.bdev->blockcnt * ++ vbdev->cache.bdev->blocklen, vbdev->cfg.cache.cache_line_size, ++ mem_needed); ++ } ++ ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, error); ++ return; ++ } ++ ++ vbdev_ocf_mngt_continue(vbdev, error); ++} ++ ++static int ++create_management_queue(struct vbdev_ocf *vbdev) ++{ ++ struct spdk_poller *mngt_poller; ++ int rc; ++ ++ rc = vbdev_ocf_queue_create(vbdev->ocf_cache, &vbdev->cache_ctx->mngt_queue, &mngt_queue_ops); ++ if (rc) { ++ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc); ++ return rc; ++ } ++ ++ mngt_poller = SPDK_POLLER_REGISTER(mngt_queue_poll, vbdev->cache_ctx->mngt_queue, 100); ++ if (mngt_poller == NULL) { ++ SPDK_ERRLOG("Unable to initiate mngt request: %s", spdk_strerror(ENOMEM)); ++ return -ENOMEM; ++ } ++ ++ ocf_queue_set_priv(vbdev->cache_ctx->mngt_queue, mngt_poller); ++ ocf_mngt_cache_set_mngt_queue(vbdev->ocf_cache, vbdev->cache_ctx->mngt_queue); ++ ++ return 0; ++} ++ ++/* Start OCF cache, attach caching device */ ++static void ++start_cache(struct vbdev_ocf *vbdev) ++{ ++ ocf_cache_t existing; ++ uint32_t cache_block_size = vbdev->cache.bdev->blocklen; ++ uint32_t core_block_size = vbdev->core.bdev->blocklen; ++ int rc; ++ ++ if (is_ocf_cache_running(vbdev)) { ++ vbdev_ocf_mngt_stop(vbdev, NULL, -EALREADY); ++ return; ++ } ++ ++ if (cache_block_size > core_block_size) { ++ SPDK_ERRLOG("Cache bdev block size (%d) is bigger then core bdev block size (%d)\n", ++ cache_block_size, core_block_size); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -EINVAL); ++ return; ++ } ++ ++ existing = get_other_cache_instance(vbdev); ++ if (existing) { ++ SPDK_NOTICELOG("OCF bdev %s connects to existing cache device %s\n", ++ vbdev->name, vbdev->cache.name); ++ vbdev->ocf_cache = existing; ++ ocf_mngt_cache_get(vbdev->ocf_cache); ++ vbdev->cache_ctx = ocf_cache_get_priv(existing); ++ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx); ++ vbdev_ocf_mngt_continue(vbdev, 0); ++ return; ++ } ++ ++ vbdev->cache_ctx = calloc(1, sizeof(struct vbdev_ocf_cache_ctx)); ++ if (vbdev->cache_ctx == NULL) { ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, -ENOMEM); ++ return; ++ } ++ ++ vbdev_ocf_cache_ctx_get(vbdev->cache_ctx); ++ pthread_mutex_init(&vbdev->cache_ctx->lock, NULL); ++ ++ rc = ocf_mngt_cache_start(vbdev_ocf_ctx, &vbdev->ocf_cache, &vbdev->cfg.cache, NULL); ++ if (rc) { ++ SPDK_ERRLOG("Could not start cache %s: %d\n", vbdev->name, rc); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc); ++ return; ++ } ++ ocf_mngt_cache_get(vbdev->ocf_cache); ++ ++ ocf_cache_set_priv(vbdev->ocf_cache, vbdev->cache_ctx); ++ ++ rc = create_management_queue(vbdev); ++ if (rc) { ++ SPDK_ERRLOG("Unable to create mngt_queue: %d\n", rc); ++ vbdev_ocf_mngt_exit(vbdev, unregister_path_dirty, rc); ++ return; ++ } ++ ++ if (vbdev->cfg.loadq) { ++ ocf_mngt_cache_load(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev); ++ } else { ++ ocf_mngt_cache_attach(vbdev->ocf_cache, &vbdev->cfg.device, start_cache_cmpl, vbdev); ++ } ++} ++ ++/* Procedures called during register operation */ ++vbdev_ocf_mngt_fn register_path[] = { ++ start_cache, ++ add_core, ++ finish_register, ++ NULL ++}; ++ 
++/* Start cache instance and register OCF bdev */ ++static void ++register_vbdev(struct vbdev_ocf *vbdev, vbdev_ocf_mngt_callback cb, void *cb_arg) ++{ ++ int rc; ++ ++ if (!(vbdev->core.attached && vbdev->cache.attached) || vbdev->state.started) { ++ cb(-EPERM, vbdev, cb_arg); ++ return; ++ } ++ ++ vbdev->state.starting = true; ++ rc = vbdev_ocf_mngt_start(vbdev, register_path, cb, cb_arg); ++ if (rc) { ++ cb(rc, vbdev, cb_arg); ++ } ++} ++ ++/* Init OCF configuration options ++ * for core and cache devices */ ++static void ++init_vbdev_config(struct vbdev_ocf *vbdev) ++{ ++ struct vbdev_ocf_config *cfg = &vbdev->cfg; ++ ++ /* Initialize OCF defaults first */ ++ ocf_mngt_cache_device_config_set_default(&cfg->device); ++ ocf_mngt_cache_config_set_default(&cfg->cache); ++ ocf_mngt_core_config_set_default(&cfg->core); ++ ++ snprintf(cfg->cache.name, sizeof(cfg->cache.name), "%s", vbdev->name); ++ snprintf(cfg->core.name, sizeof(cfg->core.name), "%s", vbdev->core.name); ++ ++ cfg->device.open_cores = false; ++ cfg->device.perform_test = false; ++ cfg->device.discard_on_start = false; ++ ++ vbdev->cfg.cache.locked = true; ++ ++ cfg->core.volume_type = SPDK_OBJECT; ++ cfg->device.volume_type = SPDK_OBJECT; ++ ++ if (vbdev->cfg.loadq) { ++ /* When doing cache_load(), we need to set try_add to true, ++ * otherwise OCF will interpret this core as new ++ * instead of the inactive one */ ++ vbdev->cfg.core.try_add = true; ++ } else { ++ /* When cache is initialized as new, set force flag to true, ++ * to ignore warnings about existing metadata */ ++ cfg->device.force = true; ++ } ++ ++ /* Serialize bdev names in OCF UUID to interpret on future loads ++ * Core UUID is a triple of (core name, vbdev name, cache name) ++ * Cache UUID is cache bdev name */ ++ cfg->device.uuid.size = strlen(vbdev->cache.name) + 1; ++ cfg->device.uuid.data = vbdev->cache.name; ++ ++ snprintf(vbdev->uuid, VBDEV_OCF_MD_MAX_LEN, "%s %s %s", ++ vbdev->core.name, vbdev->name, vbdev->cache.name); ++ cfg->core.uuid.size = strlen(vbdev->uuid) + 1; ++ cfg->core.uuid.data = vbdev->uuid; ++ vbdev->uuid[strlen(vbdev->core.name)] = 0; ++ vbdev->uuid[strlen(vbdev->core.name) + 1 + strlen(vbdev->name)] = 0; ++} ++ ++/* Allocate vbdev structure object and add it to the global list */ ++static int ++init_vbdev(const char *vbdev_name, ++ const char *cache_mode_name, ++ const uint64_t cache_line_size, ++ const char *cache_name, ++ const char *core_name, ++ bool loadq) ++{ ++ struct vbdev_ocf *vbdev; ++ int rc = 0; ++ ++ if (spdk_bdev_get_by_name(vbdev_name) || vbdev_ocf_get_by_name(vbdev_name)) { ++ SPDK_ERRLOG("Device with name '%s' already exists\n", vbdev_name); ++ return -EPERM; ++ } ++ ++ vbdev = calloc(1, sizeof(*vbdev)); ++ if (!vbdev) { ++ goto error_mem; ++ } ++ ++ vbdev->name = strdup(vbdev_name); ++ if (!vbdev->name) { ++ goto error_mem; ++ } ++ ++ vbdev->cache.name = strdup(cache_name); ++ if (!vbdev->cache.name) { ++ goto error_mem; ++ } ++ ++ vbdev->core.name = strdup(core_name); ++ if (!vbdev->core.name) { ++ goto error_mem; ++ } ++ ++ vbdev->cache.parent = vbdev; ++ vbdev->core.parent = vbdev; ++ vbdev->cache.is_cache = true; ++ vbdev->core.is_cache = false; ++ vbdev->cfg.loadq = loadq; ++ ++ init_vbdev_config(vbdev); ++ ++ if (cache_mode_name) { ++ vbdev->cfg.cache.cache_mode ++ = ocf_get_cache_mode(cache_mode_name); ++ } else if (!loadq) { /* In load path it is OK to pass NULL as cache mode */ ++ SPDK_ERRLOG("No cache mode specified\n"); ++ rc = -EINVAL; ++ goto error_free; ++ } ++ if (vbdev->cfg.cache.cache_mode < 0) { ++ 
SPDK_ERRLOG("Incorrect cache mode '%s'\n", cache_mode_name); ++ rc = -EINVAL; ++ goto error_free; ++ } ++ ++ ocf_cache_line_size_t set_cache_line_size = cache_line_size ? ++ (ocf_cache_line_size_t)cache_line_size * KiB : ++ ocf_cache_line_size_default; ++ if (set_cache_line_size == 0) { ++ SPDK_ERRLOG("Cache line size should be non-zero.\n"); ++ rc = -EINVAL; ++ goto error_free; ++ } ++ vbdev->cfg.device.cache_line_size = set_cache_line_size; ++ vbdev->cfg.cache.cache_line_size = set_cache_line_size; ++ ++ TAILQ_INSERT_TAIL(&g_ocf_vbdev_head, vbdev, tailq); ++ return rc; ++ ++error_mem: ++ rc = -ENOMEM; ++error_free: ++ free_vbdev(vbdev); ++ return rc; ++} ++ ++SPDK_LOG_DEPRECATION_REGISTER(bdev_ocf, "bdev_ocf support", "SPDK 23.05", 0); ++ ++/* Read configuration file at the start of SPDK application ++ * This adds vbdevs to global list if some mentioned in config */ ++static int ++vbdev_ocf_init(void) ++{ ++ int status; ++ ++ SPDK_LOG_DEPRECATED(bdev_ocf); ++ ++ status = vbdev_ocf_ctx_init(); ++ if (status) { ++ SPDK_ERRLOG("OCF ctx initialization failed with=%d\n", status); ++ return status; ++ } ++ ++ status = vbdev_ocf_volume_init(); ++ if (status) { ++ vbdev_ocf_ctx_cleanup(); ++ SPDK_ERRLOG("OCF volume initialization failed with=%d\n", status); ++ return status; ++ } ++ ++ return status; ++} ++ ++/* Called after application shutdown started ++ * Release memory of allocated structures here */ ++static void ++vbdev_ocf_module_fini(void) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ while ((vbdev = TAILQ_FIRST(&g_ocf_vbdev_head))) { ++ TAILQ_REMOVE(&g_ocf_vbdev_head, vbdev, tailq); ++ free_vbdev(vbdev); ++ } ++ ++ vbdev_ocf_volume_cleanup(); ++ vbdev_ocf_ctx_cleanup(); ++} ++ ++/* When base device gets unplugged this is called ++ * We will unregister cache vbdev here ++ * When cache device is removed, we delete every OCF bdev that used it */ ++static void ++hotremove_cb(struct vbdev_ocf_base *base) ++{ ++ struct vbdev_ocf *vbdev; ++ ++ if (!base->is_cache) { ++ if (base->parent->state.doing_finish) { ++ return; ++ } ++ ++ SPDK_NOTICELOG("Deinitializing '%s' because its core device '%s' was removed\n", ++ base->parent->name, base->name); ++ vbdev_ocf_delete(base->parent, NULL, NULL); ++ return; ++ } ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (vbdev->state.doing_finish) { ++ continue; ++ } ++ if (strcmp(base->name, vbdev->cache.name) == 0) { ++ SPDK_NOTICELOG("Deinitializing '%s' because" ++ " its cache device '%s' was removed\n", ++ vbdev->name, base->name); ++ vbdev_ocf_delete(vbdev, NULL, NULL); ++ } ++ } ++} ++ ++static void ++base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ if (event_ctx) { ++ hotremove_cb(event_ctx); ++ } ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++/* Open base SPDK bdev and claim it */ ++static int ++attach_base(struct vbdev_ocf_base *base) ++{ ++ int status; ++ ++ if (base->attached) { ++ return -EALREADY; ++ } ++ ++ /* If base cache bdev was already opened by other vbdev, ++ * we just copy its descriptor here */ ++ if (base->is_cache) { ++ struct vbdev_ocf_base *existing = get_other_cache_base(base); ++ if (existing) { ++ base->desc = existing->desc; ++ base->management_channel = existing->management_channel; ++ base->attached = true; ++ return 0; ++ } ++ } ++ ++ status = spdk_bdev_open_ext(base->name, true, base_bdev_event_cb, base, &base->desc); ++ if (status) { ++ SPDK_ERRLOG("Unable 
to open device '%s' for writing\n", base->name); ++ return status; ++ } ++ ++ status = spdk_bdev_module_claim_bdev(base->bdev, base->desc, ++ &ocf_if); ++ if (status) { ++ SPDK_ERRLOG("Unable to claim device '%s'\n", base->name); ++ spdk_bdev_close(base->desc); ++ return status; ++ } ++ ++ base->management_channel = spdk_bdev_get_io_channel(base->desc); ++ if (!base->management_channel) { ++ SPDK_ERRLOG("Unable to get io channel '%s'\n", base->name); ++ spdk_bdev_module_release_bdev(base->bdev); ++ spdk_bdev_close(base->desc); ++ return -ENOMEM; ++ } ++ ++ /* Save the thread where the base device is opened */ ++ base->thread = spdk_get_thread(); ++ ++ base->attached = true; ++ return status; ++} ++ ++/* Attach base bdevs */ ++static int ++attach_base_bdevs(struct vbdev_ocf *vbdev, ++ struct spdk_bdev *cache_bdev, ++ struct spdk_bdev *core_bdev) ++{ ++ int rc = 0; ++ ++ if (cache_bdev) { ++ vbdev->cache.bdev = cache_bdev; ++ rc |= attach_base(&vbdev->cache); ++ } ++ ++ if (core_bdev) { ++ vbdev->core.bdev = core_bdev; ++ rc |= attach_base(&vbdev->core); ++ } ++ ++ return rc; ++} ++ ++/* Init and then start vbdev if all base devices are present */ ++void ++vbdev_ocf_construct(const char *vbdev_name, ++ const char *cache_mode_name, ++ const uint64_t cache_line_size, ++ const char *cache_name, ++ const char *core_name, ++ bool loadq, ++ void (*cb)(int, struct vbdev_ocf *, void *), ++ void *cb_arg) ++{ ++ int rc; ++ struct spdk_bdev *cache_bdev = spdk_bdev_get_by_name(cache_name); ++ struct spdk_bdev *core_bdev = spdk_bdev_get_by_name(core_name); ++ struct vbdev_ocf *vbdev; ++ ++ rc = init_vbdev(vbdev_name, cache_mode_name, cache_line_size, cache_name, core_name, loadq); ++ if (rc) { ++ cb(rc, NULL, cb_arg); ++ return; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(vbdev_name); ++ if (vbdev == NULL) { ++ cb(-ENODEV, NULL, cb_arg); ++ return; ++ } ++ ++ if (cache_bdev == NULL) { ++ SPDK_NOTICELOG("OCF bdev '%s' is waiting for cache device '%s' to connect\n", ++ vbdev->name, cache_name); ++ } ++ if (core_bdev == NULL) { ++ SPDK_NOTICELOG("OCF bdev '%s' is waiting for core device '%s' to connect\n", ++ vbdev->name, core_name); ++ } ++ ++ rc = attach_base_bdevs(vbdev, cache_bdev, core_bdev); ++ if (rc) { ++ cb(rc, vbdev, cb_arg); ++ return; ++ } ++ ++ if (core_bdev && cache_bdev) { ++ register_vbdev(vbdev, cb, cb_arg); ++ } else { ++ cb(0, vbdev, cb_arg); ++ } ++} ++ ++/* Set new cache mode on OCF cache */ ++void ++vbdev_ocf_set_cache_mode(struct vbdev_ocf *vbdev, ++ const char *cache_mode_name, ++ void (*cb)(int, struct vbdev_ocf *, void *), ++ void *cb_arg) ++{ ++ ocf_cache_t cache; ++ ocf_cache_mode_t cache_mode; ++ int rc; ++ ++ cache = vbdev->ocf_cache; ++ cache_mode = ocf_get_cache_mode(cache_mode_name); ++ ++ rc = ocf_mngt_cache_trylock(cache); ++ if (rc) { ++ cb(rc, vbdev, cb_arg); ++ return; ++ } ++ ++ rc = ocf_mngt_cache_set_mode(cache, cache_mode); ++ ocf_mngt_cache_unlock(cache); ++ cb(rc, vbdev, cb_arg); ++} ++ ++/* Set sequential cutoff parameters on OCF cache */ ++void ++vbdev_ocf_set_seqcutoff(struct vbdev_ocf *vbdev, const char *policy_name, uint32_t threshold, ++ uint32_t promotion_count, void (*cb)(int, void *), void *cb_arg) ++{ ++ ocf_cache_t cache; ++ ocf_seq_cutoff_policy policy; ++ int rc; ++ ++ cache = vbdev->ocf_cache; ++ ++ policy = ocf_get_seqcutoff_policy(policy_name); ++ if (policy == ocf_seq_cutoff_policy_max) { ++ cb(OCF_ERR_INVAL, cb_arg); ++ return; ++ } ++ ++ rc = ocf_mngt_cache_trylock(cache); ++ if (rc) { ++ cb(rc, cb_arg); ++ return; ++ } ++ ++ rc = 
ocf_mngt_core_set_seq_cutoff_policy_all(cache, policy); ++ if (rc) { ++ goto end; ++ } ++ ++ if (threshold) { ++ threshold = threshold * KiB; ++ ++ rc = ocf_mngt_core_set_seq_cutoff_threshold_all(cache, threshold); ++ if (rc) { ++ goto end; ++ } ++ } ++ ++ if (promotion_count) { ++ rc = ocf_mngt_core_set_seq_cutoff_promotion_count_all(cache, promotion_count); ++ } ++ ++end: ++ ocf_mngt_cache_unlock(cache); ++ cb(rc, cb_arg); ++} ++ ++/* This called if new device is created in SPDK application ++ * If that device named as one of base bdevs of OCF vbdev, ++ * claim and open them */ ++static void ++vbdev_ocf_examine(struct spdk_bdev *bdev) ++{ ++ const char *bdev_name = spdk_bdev_get_name(bdev); ++ struct vbdev_ocf *vbdev; ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (vbdev->state.doing_finish) { ++ continue; ++ } ++ ++ if (!strcmp(bdev_name, vbdev->cache.name)) { ++ attach_base_bdevs(vbdev, bdev, NULL); ++ continue; ++ } ++ if (!strcmp(bdev_name, vbdev->core.name)) { ++ attach_base_bdevs(vbdev, NULL, bdev); ++ break; ++ } ++ } ++ spdk_bdev_module_examine_done(&ocf_if); ++} ++ ++struct metadata_probe_ctx { ++ struct vbdev_ocf_base base; ++ ocf_volume_t volume; ++ ++ struct ocf_volume_uuid *core_uuids; ++ unsigned int uuid_count; ++ ++ int result; ++ int refcnt; ++}; ++ ++static void ++_examine_ctx_put(void *ctx) ++{ ++ struct spdk_bdev_desc *desc = ctx; ++ ++ spdk_bdev_close(desc); ++} ++ ++static void ++examine_ctx_put(struct metadata_probe_ctx *ctx) ++{ ++ unsigned int i; ++ ++ ctx->refcnt--; ++ if (ctx->refcnt > 0) { ++ return; ++ } ++ ++ if (ctx->result) { ++ SPDK_ERRLOG("OCF metadata probe for bdev '%s' failed with %d\n", ++ spdk_bdev_get_name(ctx->base.bdev), ctx->result); ++ } ++ ++ if (ctx->base.desc) { ++ /* Close the underlying bdev on its same opened thread. 
*/ ++ if (ctx->base.thread && ctx->base.thread != spdk_get_thread()) { ++ spdk_thread_send_msg(ctx->base.thread, _examine_ctx_put, ctx->base.desc); ++ } else { ++ spdk_bdev_close(ctx->base.desc); ++ } ++ } ++ ++ if (ctx->volume) { ++ ocf_volume_destroy(ctx->volume); ++ } ++ ++ if (ctx->core_uuids) { ++ for (i = 0; i < ctx->uuid_count; i++) { ++ free(ctx->core_uuids[i].data); ++ } ++ } ++ free(ctx->core_uuids); ++ ++ examine_done(ctx->result, NULL, ctx->base.bdev); ++ free(ctx); ++} ++ ++static void ++metadata_probe_construct_cb(int rc, struct vbdev_ocf *vbdev, void *vctx) ++{ ++ struct metadata_probe_ctx *ctx = vctx; ++ ++ examine_ctx_put(ctx); ++} ++ ++/* This is second callback for ocf_metadata_probe_cores() ++ * Here we create vbdev configurations based on UUIDs */ ++static void ++metadata_probe_cores_construct(void *priv, int error, unsigned int num_cores) ++{ ++ struct metadata_probe_ctx *ctx = priv; ++ const char *vbdev_name; ++ const char *core_name; ++ const char *cache_name; ++ unsigned int i; ++ ++ if (error) { ++ ctx->result = error; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ for (i = 0; i < num_cores; i++) { ++ core_name = ocf_uuid_to_str(&ctx->core_uuids[i]); ++ vbdev_name = core_name + strlen(core_name) + 1; ++ cache_name = vbdev_name + strlen(vbdev_name) + 1; ++ ++ if (strcmp(ctx->base.bdev->name, cache_name)) { ++ SPDK_NOTICELOG("OCF metadata found on %s belongs to bdev named '%s'\n", ++ ctx->base.bdev->name, cache_name); ++ } ++ ++ ctx->refcnt++; ++ vbdev_ocf_construct(vbdev_name, NULL, 0, cache_name, core_name, true, ++ metadata_probe_construct_cb, ctx); ++ } ++ ++ examine_ctx_put(ctx); ++} ++ ++/* This callback is called after OCF reads cores UUIDs from cache metadata ++ * Here we allocate memory for those UUIDs and call ocf_metadata_probe_cores() again */ ++static void ++metadata_probe_cores_get_num(void *priv, int error, unsigned int num_cores) ++{ ++ struct metadata_probe_ctx *ctx = priv; ++ unsigned int i; ++ ++ if (error) { ++ ctx->result = error; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ ctx->uuid_count = num_cores; ++ ctx->core_uuids = calloc(num_cores, sizeof(struct ocf_volume_uuid)); ++ if (!ctx->core_uuids) { ++ ctx->result = -ENOMEM; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ for (i = 0; i < ctx->uuid_count; i++) { ++ ctx->core_uuids[i].size = OCF_VOLUME_UUID_MAX_SIZE; ++ ctx->core_uuids[i].data = malloc(OCF_VOLUME_UUID_MAX_SIZE); ++ if (!ctx->core_uuids[i].data) { ++ ctx->result = -ENOMEM; ++ examine_ctx_put(ctx); ++ return; ++ } ++ } ++ ++ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, ctx->core_uuids, ctx->uuid_count, ++ metadata_probe_cores_construct, ctx); ++} ++ ++static void ++metadata_probe_cb(void *priv, int rc, ++ struct ocf_metadata_probe_status *status) ++{ ++ struct metadata_probe_ctx *ctx = priv; ++ ++ if (rc) { ++ /* -ENODATA means device does not have cache metadata on it */ ++ if (rc != -OCF_ERR_NO_METADATA) { ++ ctx->result = rc; ++ } ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ ocf_metadata_probe_cores(vbdev_ocf_ctx, ctx->volume, NULL, 0, ++ metadata_probe_cores_get_num, ctx); ++} ++ ++/* This is called after vbdev_ocf_examine ++ * It allows to delay application initialization ++ * until all OCF bdevs get registered ++ * If vbdev has all of its base devices it starts asynchronously here ++ * We first check if bdev appears in configuration, ++ * if not we do metadata_probe() to create its configuration from bdev metadata */ ++static void ++vbdev_ocf_examine_disk(struct spdk_bdev *bdev) ++{ ++ const char *bdev_name = 
spdk_bdev_get_name(bdev); ++ struct vbdev_ocf *vbdev; ++ struct metadata_probe_ctx *ctx; ++ bool created_from_config = false; ++ int rc; ++ ++ examine_start(bdev); ++ ++ TAILQ_FOREACH(vbdev, &g_ocf_vbdev_head, tailq) { ++ if (vbdev->state.doing_finish || vbdev->state.started) { ++ continue; ++ } ++ ++ if (!strcmp(bdev_name, vbdev->cache.name)) { ++ examine_start(bdev); ++ register_vbdev(vbdev, examine_done, bdev); ++ created_from_config = true; ++ continue; ++ } ++ if (!strcmp(bdev_name, vbdev->core.name)) { ++ examine_start(bdev); ++ register_vbdev(vbdev, examine_done, bdev); ++ examine_done(0, NULL, bdev); ++ return; ++ } ++ } ++ ++ /* If devices is discovered during config we do not check for metadata */ ++ if (created_from_config) { ++ examine_done(0, NULL, bdev); ++ return; ++ } ++ ++ /* Metadata probe path ++ * We create temporary OCF volume and a temporary base structure ++ * to use them for ocf_metadata_probe() and for bottom adapter IOs ++ * Then we get UUIDs of core devices an create configurations based on them */ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ examine_done(-ENOMEM, NULL, bdev); ++ return; ++ } ++ ++ ctx->base.bdev = bdev; ++ ctx->refcnt = 1; ++ ++ rc = spdk_bdev_open_ext(bdev_name, true, base_bdev_event_cb, NULL, &ctx->base.desc); ++ if (rc) { ++ ctx->result = rc; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ rc = ocf_ctx_volume_create(vbdev_ocf_ctx, &ctx->volume, NULL, SPDK_OBJECT); ++ if (rc) { ++ ctx->result = rc; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ rc = ocf_volume_open(ctx->volume, &ctx->base); ++ if (rc) { ++ ctx->result = rc; ++ examine_ctx_put(ctx); ++ return; ++ } ++ ++ /* Save the thread where the base device is opened */ ++ ctx->base.thread = spdk_get_thread(); ++ ++ ocf_metadata_probe(vbdev_ocf_ctx, ctx->volume, metadata_probe_cb, ctx); ++} ++ ++static int ++vbdev_ocf_get_ctx_size(void) ++{ ++ return sizeof(struct bdev_ocf_data); ++} ++ ++static void ++fini_start(void) ++{ ++ g_fini_started = true; ++} ++ ++/* Module-global function table ++ * Does not relate to vbdev instances */ ++static struct spdk_bdev_module ocf_if = { ++ .name = "ocf", ++ .module_init = vbdev_ocf_init, ++ .fini_start = fini_start, ++ .module_fini = vbdev_ocf_module_fini, ++ .get_ctx_size = vbdev_ocf_get_ctx_size, ++ .examine_config = vbdev_ocf_examine, ++ .examine_disk = vbdev_ocf_examine_disk, ++}; ++SPDK_BDEV_MODULE_REGISTER(ocf, &ocf_if); +diff --git a/module/bdev/ocf/vbdev_ocf.h b/module/bdev/ocf/vbdev_ocf.h +index 2e54291..bc9bcfb 100644 +--- a/module/bdev/ocf/vbdev_ocf.h ++++ b/module/bdev/ocf/vbdev_ocf.h +@@ -1,206 +1,206 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_VBDEV_OCF_H +-#define SPDK_VBDEV_OCF_H +- +-#include +- +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +- +-#define VBDEV_OCF_MD_MAX_LEN 4096 +- +-struct vbdev_ocf; +- +-/* Context for OCF queue poller +- * Used for mapping SPDK threads to OCF queues */ +-struct vbdev_ocf_qctx { +- /* OCF queue. Contains OCF requests */ +- struct ocf_queue *queue; +- /* Poller for OCF queue. 
Runs OCF requests */ +- struct spdk_poller *poller; +- /* Reference to parent vbdev */ +- struct vbdev_ocf *vbdev; +- /* Base devices channels */ +- struct spdk_io_channel *cache_ch; +- struct spdk_io_channel *core_ch; +- /* If true, we have to free this context on queue stop */ +- bool allocated; +- /* Link to per-bdev list of queue contexts */ +- TAILQ_ENTRY(vbdev_ocf_qctx) tailq; +-}; +- +-/* Important states */ +-struct vbdev_ocf_state { +- /* From the moment when clean delete started */ +- bool doing_clean_delete; +- /* From the moment when finish started */ +- bool doing_finish; +- /* From the moment when reset IO received, until it is completed */ +- bool doing_reset; +- /* From the moment when exp_bdev is registered */ +- bool started; +- /* From the moment when register path started */ +- bool starting; +- /* Status of last attempt for stopping this device */ +- int stop_status; +-}; +- +-/* +- * OCF cache configuration options +- */ +-struct vbdev_ocf_config { +- /* Initial cache configuration */ +- struct ocf_mngt_cache_config cache; +- +- /* Cache device config */ +- struct ocf_mngt_cache_device_config device; +- +- /* Core initial config */ +- struct ocf_mngt_core_config core; +- +- /* Load flag, if set to true, then we will try load cache instance from disk, +- * otherwise we will create new cache on that disk */ +- bool loadq; +-}; +- +-/* Types for management operations */ +-typedef void (*vbdev_ocf_mngt_fn)(struct vbdev_ocf *); +-typedef void (*vbdev_ocf_mngt_callback)(int, struct vbdev_ocf *, void *); +- +-/* Context for asynchronous management operations +- * Single management operation usually contains a list of sub procedures, +- * this structure handles sharing between those sub procedures */ +-struct vbdev_ocf_mngt_ctx { +- /* Pointer to function that is currently being executed +- * It gets incremented on each step until it dereferences to NULL */ +- vbdev_ocf_mngt_fn *current_step; +- +- /* Function that gets invoked by poller on each iteration */ +- vbdev_ocf_mngt_fn poller_fn; +- /* Poller timeout time stamp - when the poller should stop with error */ +- uint64_t timeout_ts; +- +- /* Status of management operation */ +- int status; +- +- /* External callback and its argument */ +- vbdev_ocf_mngt_callback cb; +- void *cb_arg; +-}; +- +-/* Base device info */ +-struct vbdev_ocf_base { +- /* OCF internal name */ +- char *name; +- +- /* True if this is a caching device */ +- bool is_cache; +- +- /* Connected SPDK block device */ +- struct spdk_bdev *bdev; +- +- /* SPDK device io handle */ +- struct spdk_bdev_desc *desc; +- +- /* True if SPDK bdev has been claimed and opened for writing */ +- bool attached; +- +- /* Channel for cleaner operations */ +- struct spdk_io_channel *management_channel; +- +- /* Reference to main vbdev */ +- struct vbdev_ocf *parent; +- +- /* thread where base device is opened */ +- struct spdk_thread *thread; +-}; +- +-/* +- * The main information provider +- * It's also registered as io_device +- */ +-struct vbdev_ocf { +- /* Exposed unique name */ +- char *name; +- +- /* Base bdevs */ +- struct vbdev_ocf_base cache; +- struct vbdev_ocf_base core; +- +- /* Base bdevs OCF objects */ +- ocf_cache_t ocf_cache; +- ocf_core_t ocf_core; +- +- /* Parameters */ +- struct vbdev_ocf_config cfg; +- struct vbdev_ocf_state state; +- +- /* Management context */ +- struct vbdev_ocf_mngt_ctx mngt_ctx; +- +- /* Cache context */ +- struct vbdev_ocf_cache_ctx *cache_ctx; +- +- /* Status of flushing operation */ +- struct { +- bool in_progress; +- int status; 
+- } flush; +- +- /* Exposed SPDK bdev. Registered in bdev layer */ +- struct spdk_bdev exp_bdev; +- +- /* OCF uuid for core device of this vbdev */ +- char uuid[VBDEV_OCF_MD_MAX_LEN]; +- +- /* Link to global list of this type structures */ +- TAILQ_ENTRY(vbdev_ocf) tailq; +-}; +- +-void vbdev_ocf_construct( +- const char *vbdev_name, +- const char *cache_mode_name, +- const uint64_t cache_line_size, +- const char *cache_name, +- const char *core_name, +- bool loadq, +- void (*cb)(int, struct vbdev_ocf *, void *), +- void *cb_arg); +- +-/* If vbdev is online, return its object */ +-struct vbdev_ocf *vbdev_ocf_get_by_name(const char *name); +- +-/* Return matching base if parent vbdev is online */ +-struct vbdev_ocf_base *vbdev_ocf_get_base_by_name(const char *name); +- +-/* Stop OCF cache and unregister SPDK bdev */ +-int vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg); +- +-int vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg); +- +-/* Set new cache mode on OCF cache */ +-void vbdev_ocf_set_cache_mode( +- struct vbdev_ocf *vbdev, +- const char *cache_mode_name, +- void (*cb)(int, struct vbdev_ocf *, void *), +- void *cb_arg); +- +-/* Set sequential cutoff parameters on OCF cache */ +-void vbdev_ocf_set_seqcutoff( +- struct vbdev_ocf *vbdev, +- const char *policy_name, +- uint32_t threshold, +- uint32_t promotion_count, +- void (*cb)(int, void *), +- void *cb_arg); +- +-typedef void (*vbdev_ocf_foreach_fn)(struct vbdev_ocf *, void *); +- +-/* Execute fn for each OCF device that is online or waits for base devices */ +-void vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_VBDEV_OCF_H ++#define SPDK_VBDEV_OCF_H ++ ++#include ++ ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++ ++#define VBDEV_OCF_MD_MAX_LEN 4096 ++ ++struct vbdev_ocf; ++ ++/* Context for OCF queue poller ++ * Used for mapping SPDK threads to OCF queues */ ++struct vbdev_ocf_qctx { ++ /* OCF queue. Contains OCF requests */ ++ struct ocf_queue *queue; ++ /* Poller for OCF queue. 
Runs OCF requests */ ++ struct spdk_poller *poller; ++ /* Reference to parent vbdev */ ++ struct vbdev_ocf *vbdev; ++ /* Base devices channels */ ++ struct spdk_io_channel *cache_ch; ++ struct spdk_io_channel *core_ch; ++ /* If true, we have to free this context on queue stop */ ++ bool allocated; ++ /* Link to per-bdev list of queue contexts */ ++ TAILQ_ENTRY(vbdev_ocf_qctx) tailq; ++}; ++ ++/* Important states */ ++struct vbdev_ocf_state { ++ /* From the moment when clean delete started */ ++ bool doing_clean_delete; ++ /* From the moment when finish started */ ++ bool doing_finish; ++ /* From the moment when reset IO received, until it is completed */ ++ bool doing_reset; ++ /* From the moment when exp_bdev is registered */ ++ bool started; ++ /* From the moment when register path started */ ++ bool starting; ++ /* Status of last attempt for stopping this device */ ++ int stop_status; ++}; ++ ++/* ++ * OCF cache configuration options ++ */ ++struct vbdev_ocf_config { ++ /* Initial cache configuration */ ++ struct ocf_mngt_cache_config cache; ++ ++ /* Cache device config */ ++ struct ocf_mngt_cache_device_config device; ++ ++ /* Core initial config */ ++ struct ocf_mngt_core_config core; ++ ++ /* Load flag, if set to true, then we will try load cache instance from disk, ++ * otherwise we will create new cache on that disk */ ++ bool loadq; ++}; ++ ++/* Types for management operations */ ++typedef void (*vbdev_ocf_mngt_fn)(struct vbdev_ocf *); ++typedef void (*vbdev_ocf_mngt_callback)(int, struct vbdev_ocf *, void *); ++ ++/* Context for asynchronous management operations ++ * Single management operation usually contains a list of sub procedures, ++ * this structure handles sharing between those sub procedures */ ++struct vbdev_ocf_mngt_ctx { ++ /* Pointer to function that is currently being executed ++ * It gets incremented on each step until it dereferences to NULL */ ++ vbdev_ocf_mngt_fn *current_step; ++ ++ /* Function that gets invoked by poller on each iteration */ ++ vbdev_ocf_mngt_fn poller_fn; ++ /* Poller timeout time stamp - when the poller should stop with error */ ++ uint64_t timeout_ts; ++ ++ /* Status of management operation */ ++ int status; ++ ++ /* External callback and its argument */ ++ vbdev_ocf_mngt_callback cb; ++ void *cb_arg; ++}; ++ ++/* Base device info */ ++struct vbdev_ocf_base { ++ /* OCF internal name */ ++ char *name; ++ ++ /* True if this is a caching device */ ++ bool is_cache; ++ ++ /* Connected SPDK block device */ ++ struct spdk_bdev *bdev; ++ ++ /* SPDK device io handle */ ++ struct spdk_bdev_desc *desc; ++ ++ /* True if SPDK bdev has been claimed and opened for writing */ ++ bool attached; ++ ++ /* Channel for cleaner operations */ ++ struct spdk_io_channel *management_channel; ++ ++ /* Reference to main vbdev */ ++ struct vbdev_ocf *parent; ++ ++ /* thread where base device is opened */ ++ struct spdk_thread *thread; ++}; ++ ++/* ++ * The main information provider ++ * It's also registered as io_device ++ */ ++struct vbdev_ocf { ++ /* Exposed unique name */ ++ char *name; ++ ++ /* Base bdevs */ ++ struct vbdev_ocf_base cache; ++ struct vbdev_ocf_base core; ++ ++ /* Base bdevs OCF objects */ ++ ocf_cache_t ocf_cache; ++ ocf_core_t ocf_core; ++ ++ /* Parameters */ ++ struct vbdev_ocf_config cfg; ++ struct vbdev_ocf_state state; ++ ++ /* Management context */ ++ struct vbdev_ocf_mngt_ctx mngt_ctx; ++ ++ /* Cache context */ ++ struct vbdev_ocf_cache_ctx *cache_ctx; ++ ++ /* Status of flushing operation */ ++ struct { ++ bool in_progress; ++ int status; 
++ } flush; ++ ++ /* Exposed SPDK bdev. Registered in bdev layer */ ++ struct spdk_bdev exp_bdev; ++ ++ /* OCF uuid for core device of this vbdev */ ++ char uuid[VBDEV_OCF_MD_MAX_LEN]; ++ ++ /* Link to global list of this type structures */ ++ TAILQ_ENTRY(vbdev_ocf) tailq; ++}; ++ ++void vbdev_ocf_construct( ++ const char *vbdev_name, ++ const char *cache_mode_name, ++ const uint64_t cache_line_size, ++ const char *cache_name, ++ const char *core_name, ++ bool loadq, ++ void (*cb)(int, struct vbdev_ocf *, void *), ++ void *cb_arg); ++ ++/* If vbdev is online, return its object */ ++struct vbdev_ocf *vbdev_ocf_get_by_name(const char *name); ++ ++/* Return matching base if parent vbdev is online */ ++struct vbdev_ocf_base *vbdev_ocf_get_base_by_name(const char *name); ++ ++/* Stop OCF cache and unregister SPDK bdev */ ++int vbdev_ocf_delete(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg); ++ ++int vbdev_ocf_delete_clean(struct vbdev_ocf *vbdev, void (*cb)(void *, int), void *cb_arg); ++ ++/* Set new cache mode on OCF cache */ ++void vbdev_ocf_set_cache_mode( ++ struct vbdev_ocf *vbdev, ++ const char *cache_mode_name, ++ void (*cb)(int, struct vbdev_ocf *, void *), ++ void *cb_arg); ++ ++/* Set sequential cutoff parameters on OCF cache */ ++void vbdev_ocf_set_seqcutoff( ++ struct vbdev_ocf *vbdev, ++ const char *policy_name, ++ uint32_t threshold, ++ uint32_t promotion_count, ++ void (*cb)(int, void *), ++ void *cb_arg); ++ ++typedef void (*vbdev_ocf_foreach_fn)(struct vbdev_ocf *, void *); ++ ++/* Execute fn for each OCF device that is online or waits for base devices */ ++void vbdev_ocf_foreach(vbdev_ocf_foreach_fn fn, void *ctx); ++ ++#endif +diff --git a/module/bdev/ocf/vbdev_ocf_rpc.c b/module/bdev/ocf/vbdev_ocf_rpc.c +index b31cb24..4ea1d80 100644 +--- a/module/bdev/ocf/vbdev_ocf_rpc.c ++++ b/module/bdev/ocf/vbdev_ocf_rpc.c +@@ -1,566 +1,566 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "vbdev_ocf.h" +-#include "stats.h" +-#include "utils.h" +-#include "spdk/log.h" +-#include "spdk/rpc.h" +-#include "spdk/string.h" +- +-/* Common structure to hold the name parameter for RPC methods using bdev name only. */ +-struct rpc_bdev_ocf_name { +- char *name; /* main vbdev name */ +-}; +- +-/* Common free function for RPC methods using bdev name only. */ +-static void +-free_rpc_bdev_ocf_name(struct rpc_bdev_ocf_name *r) +-{ +- free(r->name); +-} +- +-/* Common function to decode the name input parameter for RPC methods using bdev name only. */ +-static const struct spdk_json_object_decoder rpc_bdev_ocf_name_decoders[] = { +- {"name", offsetof(struct rpc_bdev_ocf_name, name), spdk_json_decode_string}, +-}; +- +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_bdev_ocf_create { +- char *name; /* main vbdev */ +- char *mode; /* OCF mode (choose one) */ +- uint64_t cache_line_size; /* OCF cache line size */ +- char *cache_bdev_name; /* sub bdev */ +- char *core_bdev_name; /* sub bdev */ +-}; +- +-static void +-free_rpc_bdev_ocf_create(struct rpc_bdev_ocf_create *r) +-{ +- free(r->name); +- free(r->core_bdev_name); +- free(r->cache_bdev_name); +- free(r->mode); +-} +- +-/* Structure to decode the input parameters for this RPC method. 
*/ +-static const struct spdk_json_object_decoder rpc_bdev_ocf_create_decoders[] = { +- {"name", offsetof(struct rpc_bdev_ocf_create, name), spdk_json_decode_string}, +- {"mode", offsetof(struct rpc_bdev_ocf_create, mode), spdk_json_decode_string}, +- {"cache_line_size", offsetof(struct rpc_bdev_ocf_create, cache_line_size), spdk_json_decode_uint64, true}, +- {"cache_bdev_name", offsetof(struct rpc_bdev_ocf_create, cache_bdev_name), spdk_json_decode_string}, +- {"core_bdev_name", offsetof(struct rpc_bdev_ocf_create, core_bdev_name), spdk_json_decode_string}, +-}; +- +-static void +-construct_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- struct spdk_json_write_ctx *w; +- +- if (status) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not create OCF vbdev: %d", +- status); +- } else { +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, vbdev->name); +- spdk_jsonrpc_end_result(request, w); +- } +-} +- +-static void +-rpc_bdev_ocf_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_create req = {NULL}; +- int ret; +- +- ret = spdk_json_decode_object(params, rpc_bdev_ocf_create_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_create_decoders), +- &req); +- if (ret) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- free_rpc_bdev_ocf_create(&req); +- return; +- } +- +- vbdev_ocf_construct(req.name, req.mode, req.cache_line_size, req.cache_bdev_name, +- req.core_bdev_name, false, construct_cb, request); +- free_rpc_bdev_ocf_create(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_create", rpc_bdev_ocf_create, SPDK_RPC_RUNTIME) +- +-static void +-delete_cb(void *cb_arg, int status) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (status) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not delete OCF vbdev: %d", +- status); +- } else { +- spdk_jsonrpc_send_bool_response(request, true); +- } +-} +- +-static void +-rpc_bdev_ocf_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_name req = {NULL}; +- struct vbdev_ocf *vbdev; +- int status; +- +- status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), +- &req); +- if (status) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto end; +- } +- +- vbdev = vbdev_ocf_get_by_name(req.name); +- if (vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- goto end; +- } +- +- status = vbdev_ocf_delete_clean(vbdev, delete_cb, request); +- if (status) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not delete OCF vbdev: %s", +- spdk_strerror(-status)); +- goto end; +- } +- +-end: +- free_rpc_bdev_ocf_name(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_delete", rpc_bdev_ocf_delete, SPDK_RPC_RUNTIME) +- +-struct get_ocf_stats_ctx { +- struct spdk_jsonrpc_request *request; +- char *core_name; +-}; +- +-static void +-rpc_bdev_ocf_get_stats_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct get_ocf_stats_ctx *ctx = (struct get_ocf_stats_ctx *) priv; +- struct spdk_json_write_ctx *w; +- struct vbdev_ocf_stats stats; +- +- if (error) { +- goto end; +- } +- +- error = 
vbdev_ocf_stats_get(cache, ctx->core_name, &stats); +- +- ocf_mngt_cache_read_unlock(cache); +- +- if (error) { +- goto end; +- } +- +- w = spdk_jsonrpc_begin_result(ctx->request); +- vbdev_ocf_stats_write_json(w, &stats); +- spdk_jsonrpc_end_result(ctx->request, w); +- +-end: +- if (error) { +- spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not get stats: %s", +- spdk_strerror(-error)); +- } +- free(ctx); +-} +- +-static void +-rpc_bdev_ocf_get_stats(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_name req = {NULL}; +- struct vbdev_ocf *vbdev; +- struct get_ocf_stats_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Not enough memory to process request"); +- goto end; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- free(ctx); +- goto end; +- } +- +- vbdev = vbdev_ocf_get_by_name(req.name); +- if (vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- free(ctx); +- goto end; +- } +- +- ctx->core_name = vbdev->core.name; +- ctx->request = request; +- ocf_mngt_cache_read_lock(vbdev->ocf_cache, rpc_bdev_ocf_get_stats_cmpl, ctx); +- +-end: +- free_rpc_bdev_ocf_name(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_get_stats", rpc_bdev_ocf_get_stats, SPDK_RPC_RUNTIME) +- +-/* Structure to decode the input parameters for this RPC method. */ +-static const struct spdk_json_object_decoder rpc_bdev_ocf_get_bdevs_decoders[] = { +- {"name", offsetof(struct rpc_bdev_ocf_name, name), spdk_json_decode_string, true}, +-}; +- +-struct bdev_get_bdevs_ctx { +- char *name; +- struct spdk_json_write_ctx *w; +-}; +- +-static void +-bdev_get_bdevs_fn(struct vbdev_ocf *vbdev, void *ctx) +-{ +- struct bdev_get_bdevs_ctx *cctx = ctx; +- struct spdk_json_write_ctx *w = cctx->w; +- +- if (cctx->name != NULL && +- strcmp(vbdev->name, cctx->name) && +- strcmp(vbdev->cache.name, cctx->name) && +- strcmp(vbdev->core.name, cctx->name)) { +- return; +- } +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", vbdev->name); +- spdk_json_write_named_bool(w, "started", vbdev->state.started); +- +- spdk_json_write_named_object_begin(w, "cache"); +- spdk_json_write_named_string(w, "name", vbdev->cache.name); +- spdk_json_write_named_bool(w, "attached", vbdev->cache.attached); +- spdk_json_write_object_end(w); +- +- spdk_json_write_named_object_begin(w, "core"); +- spdk_json_write_named_string(w, "name", vbdev->core.name); +- spdk_json_write_named_bool(w, "attached", vbdev->core.attached); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-rpc_bdev_ocf_get_bdevs(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct spdk_json_write_ctx *w; +- struct rpc_bdev_ocf_name req = {NULL}; +- struct bdev_get_bdevs_ctx cctx; +- +- if (params && spdk_json_decode_object(params, rpc_bdev_ocf_get_bdevs_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_get_bdevs_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto end; +- } +- +- if (req.name) { +- if (!(vbdev_ocf_get_by_name(req.name) || 
vbdev_ocf_get_base_by_name(req.name))) { +- spdk_jsonrpc_send_error_response(request, +- SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- goto end; +- } +- } +- +- w = spdk_jsonrpc_begin_result(request); +- +- cctx.name = req.name; +- cctx.w = w; +- +- spdk_json_write_array_begin(w); +- vbdev_ocf_foreach(bdev_get_bdevs_fn, &cctx); +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-end: +- free_rpc_bdev_ocf_name(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_get_bdevs", rpc_bdev_ocf_get_bdevs, SPDK_RPC_RUNTIME) +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_bdev_ocf_set_cache_mode { +- char *name; /* main vbdev name */ +- char *mode; /* OCF cache mode to switch to */ +-}; +- +-static void +-free_rpc_bdev_ocf_set_cache_mode(struct rpc_bdev_ocf_set_cache_mode *r) +-{ +- free(r->name); +- free(r->mode); +-} +- +-/* Structure to decode the input parameters for this RPC method. */ +-static const struct spdk_json_object_decoder rpc_bdev_ocf_set_cache_mode_decoders[] = { +- {"name", offsetof(struct rpc_bdev_ocf_set_cache_mode, name), spdk_json_decode_string}, +- {"mode", offsetof(struct rpc_bdev_ocf_set_cache_mode, mode), spdk_json_decode_string}, +-}; +- +-static void +-cache_mode_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- struct spdk_json_write_ctx *w; +- +- if (status) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not change OCF vbdev cache mode: %d", +- status); +- } else { +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, ocf_get_cache_modename( +- ocf_cache_get_mode(vbdev->ocf_cache))); +- spdk_jsonrpc_end_result(request, w); +- } +-} +- +-static void +-rpc_bdev_ocf_set_cache_mode(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_set_cache_mode req = {NULL}; +- struct vbdev_ocf *vbdev; +- int status; +- +- status = spdk_json_decode_object(params, rpc_bdev_ocf_set_cache_mode_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_set_cache_mode_decoders), +- &req); +- if (status) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto end; +- } +- +- vbdev = vbdev_ocf_get_by_name(req.name); +- if (vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- goto end; +- } +- +- vbdev_ocf_set_cache_mode(vbdev, req.mode, cache_mode_cb, request); +- +-end: +- free_rpc_bdev_ocf_set_cache_mode(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_set_cache_mode", rpc_bdev_ocf_set_cache_mode, SPDK_RPC_RUNTIME) +- +-static void +-seqcutoff_cb(int status, void *cb_arg) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (status) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "OCF could not set sequential cutoff parameters: %d", status); +- } else { +- spdk_jsonrpc_send_bool_response(request, true); +- } +-} +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_bdev_ocf_set_seqcutoff { +- char *name; /* main vbdev name */ +- char *policy; +- uint32_t threshold; +- uint32_t promotion_count; +-}; +- +-static void +-free_rpc_bdev_ocf_set_seqcutoff(struct rpc_bdev_ocf_set_seqcutoff *r) +-{ +- free(r->name); +- free(r->policy); +-} +- +-/* Structure to decode the input parameters for this RPC method. 
*/ +-static const struct spdk_json_object_decoder rpc_bdev_ocf_set_seqcutoff_decoders[] = { +- {"name", offsetof(struct rpc_bdev_ocf_set_seqcutoff, name), spdk_json_decode_string}, +- {"policy", offsetof(struct rpc_bdev_ocf_set_seqcutoff, policy), spdk_json_decode_string}, +- {"threshold", offsetof(struct rpc_bdev_ocf_set_seqcutoff, threshold), spdk_json_decode_uint32, true}, +- {"promotion_count", offsetof(struct rpc_bdev_ocf_set_seqcutoff, promotion_count), spdk_json_decode_uint32, true}, +-}; +- +-static void +-rpc_bdev_ocf_set_seqcutoff(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_set_seqcutoff req = {NULL}; +- struct vbdev_ocf *vbdev; +- int ret; +- +- ret = spdk_json_decode_object(params, rpc_bdev_ocf_set_seqcutoff_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_set_seqcutoff_decoders), &req); +- if (ret) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto end; +- } +- +- vbdev = vbdev_ocf_get_by_name(req.name); +- if (vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- goto end; +- } +- +- vbdev_ocf_set_seqcutoff(vbdev, req.policy, req.threshold, req.promotion_count, seqcutoff_cb, +- request); +- +-end: +- free_rpc_bdev_ocf_set_seqcutoff(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_set_seqcutoff", rpc_bdev_ocf_set_seqcutoff, SPDK_RPC_RUNTIME) +- +-struct get_ocf_flush_start_ctx { +- struct spdk_jsonrpc_request *request; +- struct vbdev_ocf *vbdev; +-}; +- +-static void +-rpc_bdev_ocf_flush_start_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct get_ocf_flush_start_ctx *ctx = priv; +- +- ctx->vbdev->flush.in_progress = false; +- ctx->vbdev->flush.status = error; +- +- ocf_mngt_cache_read_unlock(cache); +- +- free(ctx); +-} +- +-static void +-rpc_bdev_ocf_flush_start_lock_cmpl(ocf_cache_t cache, void *priv, int error) +-{ +- struct get_ocf_flush_start_ctx *ctx = priv; +- +- if (error) { +- spdk_jsonrpc_send_error_response_fmt(ctx->request, +- SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Could not lock cache: %d", error); +- free(ctx); +- return; +- } +- +- ctx->vbdev->flush.in_progress = true; +- ocf_mngt_cache_flush(cache, rpc_bdev_ocf_flush_start_cmpl, ctx); +- +- spdk_jsonrpc_send_bool_response(ctx->request, true); +-} +- +-static void +-rpc_bdev_ocf_flush_start(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_name req = {NULL}; +- struct get_ocf_flush_start_ctx *ctx; +- int status; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Not enough memory to process request"); +- goto end; +- } +- +- status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), +- &req); +- if (status) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- free(ctx); +- goto end; +- } +- +- ctx->vbdev = vbdev_ocf_get_by_name(req.name); +- if (ctx->vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- free(ctx); +- goto end; +- } +- +- if (!ctx->vbdev->ocf_cache) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Couldn't flush cache: device not attached"); +- free(ctx); +- goto end; +- } +- +- ctx->request = request; +- ocf_mngt_cache_read_lock(ctx->vbdev->ocf_cache, 
rpc_bdev_ocf_flush_start_lock_cmpl, ctx); +- +-end: +- free_rpc_bdev_ocf_name(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_flush_start", rpc_bdev_ocf_flush_start, SPDK_RPC_RUNTIME) +- +-static void +-rpc_bdev_ocf_flush_status(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_ocf_name req = {NULL}; +- struct spdk_json_write_ctx *w; +- struct vbdev_ocf *vbdev; +- int status; +- +- status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, +- SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), +- &req); +- if (status) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto end; +- } +- +- vbdev = vbdev_ocf_get_by_name(req.name); +- if (vbdev == NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(ENODEV)); +- goto end; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_bool(w, "in_progress", vbdev->flush.in_progress); +- if (!vbdev->flush.in_progress) { +- spdk_json_write_named_int32(w, "status", vbdev->flush.status); +- } +- spdk_json_write_object_end(w); +- +- spdk_jsonrpc_end_result(request, w); +- +-end: +- free_rpc_bdev_ocf_name(&req); +-} +-SPDK_RPC_REGISTER("bdev_ocf_flush_status", rpc_bdev_ocf_flush_status, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "vbdev_ocf.h" ++#include "stats.h" ++#include "utils.h" ++#include "spdk/log.h" ++#include "spdk/rpc.h" ++#include "spdk/string.h" ++ ++/* Common structure to hold the name parameter for RPC methods using bdev name only. */ ++struct rpc_bdev_ocf_name { ++ char *name; /* main vbdev name */ ++}; ++ ++/* Common free function for RPC methods using bdev name only. */ ++static void ++free_rpc_bdev_ocf_name(struct rpc_bdev_ocf_name *r) ++{ ++ free(r->name); ++} ++ ++/* Common function to decode the name input parameter for RPC methods using bdev name only. */ ++static const struct spdk_json_object_decoder rpc_bdev_ocf_name_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_ocf_name, name), spdk_json_decode_string}, ++}; ++ ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_bdev_ocf_create { ++ char *name; /* main vbdev */ ++ char *mode; /* OCF mode (choose one) */ ++ uint64_t cache_line_size; /* OCF cache line size */ ++ char *cache_bdev_name; /* sub bdev */ ++ char *core_bdev_name; /* sub bdev */ ++}; ++ ++static void ++free_rpc_bdev_ocf_create(struct rpc_bdev_ocf_create *r) ++{ ++ free(r->name); ++ free(r->core_bdev_name); ++ free(r->cache_bdev_name); ++ free(r->mode); ++} ++ ++/* Structure to decode the input parameters for this RPC method. 
*/ ++static const struct spdk_json_object_decoder rpc_bdev_ocf_create_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_ocf_create, name), spdk_json_decode_string}, ++ {"mode", offsetof(struct rpc_bdev_ocf_create, mode), spdk_json_decode_string}, ++ {"cache_line_size", offsetof(struct rpc_bdev_ocf_create, cache_line_size), spdk_json_decode_uint64, true}, ++ {"cache_bdev_name", offsetof(struct rpc_bdev_ocf_create, cache_bdev_name), spdk_json_decode_string}, ++ {"core_bdev_name", offsetof(struct rpc_bdev_ocf_create, core_bdev_name), spdk_json_decode_string}, ++}; ++ ++static void ++construct_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ struct spdk_json_write_ctx *w; ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not create OCF vbdev: %d", ++ status); ++ } else { ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, vbdev->name); ++ spdk_jsonrpc_end_result(request, w); ++ } ++} ++ ++static void ++rpc_bdev_ocf_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_create req = {NULL}; ++ int ret; ++ ++ ret = spdk_json_decode_object(params, rpc_bdev_ocf_create_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_create_decoders), ++ &req); ++ if (ret) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ free_rpc_bdev_ocf_create(&req); ++ return; ++ } ++ ++ vbdev_ocf_construct(req.name, req.mode, req.cache_line_size, req.cache_bdev_name, ++ req.core_bdev_name, false, construct_cb, request); ++ free_rpc_bdev_ocf_create(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_create", rpc_bdev_ocf_create, SPDK_RPC_RUNTIME) ++ ++static void ++delete_cb(void *cb_arg, int status) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not delete OCF vbdev: %d", ++ status); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++} ++ ++static void ++rpc_bdev_ocf_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_name req = {NULL}; ++ struct vbdev_ocf *vbdev; ++ int status; ++ ++ status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), ++ &req); ++ if (status) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto end; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(req.name); ++ if (vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ goto end; ++ } ++ ++ status = vbdev_ocf_delete_clean(vbdev, delete_cb, request); ++ if (status) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not delete OCF vbdev: %s", ++ spdk_strerror(-status)); ++ goto end; ++ } ++ ++end: ++ free_rpc_bdev_ocf_name(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_delete", rpc_bdev_ocf_delete, SPDK_RPC_RUNTIME) ++ ++struct get_ocf_stats_ctx { ++ struct spdk_jsonrpc_request *request; ++ char *core_name; ++}; ++ ++static void ++rpc_bdev_ocf_get_stats_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct get_ocf_stats_ctx *ctx = (struct get_ocf_stats_ctx *) priv; ++ struct spdk_json_write_ctx *w; ++ struct vbdev_ocf_stats stats; ++ ++ if (error) { ++ goto end; ++ } ++ ++ error = 
vbdev_ocf_stats_get(cache, ctx->core_name, &stats); ++ ++ ocf_mngt_cache_read_unlock(cache); ++ ++ if (error) { ++ goto end; ++ } ++ ++ w = spdk_jsonrpc_begin_result(ctx->request); ++ vbdev_ocf_stats_write_json(w, &stats); ++ spdk_jsonrpc_end_result(ctx->request, w); ++ ++end: ++ if (error) { ++ spdk_jsonrpc_send_error_response_fmt(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not get stats: %s", ++ spdk_strerror(-error)); ++ } ++ free(ctx); ++} ++ ++static void ++rpc_bdev_ocf_get_stats(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_name req = {NULL}; ++ struct vbdev_ocf *vbdev; ++ struct get_ocf_stats_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Not enough memory to process request"); ++ goto end; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ free(ctx); ++ goto end; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(req.name); ++ if (vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ free(ctx); ++ goto end; ++ } ++ ++ ctx->core_name = vbdev->core.name; ++ ctx->request = request; ++ ocf_mngt_cache_read_lock(vbdev->ocf_cache, rpc_bdev_ocf_get_stats_cmpl, ctx); ++ ++end: ++ free_rpc_bdev_ocf_name(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_get_stats", rpc_bdev_ocf_get_stats, SPDK_RPC_RUNTIME) ++ ++/* Structure to decode the input parameters for this RPC method. */ ++static const struct spdk_json_object_decoder rpc_bdev_ocf_get_bdevs_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_ocf_name, name), spdk_json_decode_string, true}, ++}; ++ ++struct bdev_get_bdevs_ctx { ++ char *name; ++ struct spdk_json_write_ctx *w; ++}; ++ ++static void ++bdev_get_bdevs_fn(struct vbdev_ocf *vbdev, void *ctx) ++{ ++ struct bdev_get_bdevs_ctx *cctx = ctx; ++ struct spdk_json_write_ctx *w = cctx->w; ++ ++ if (cctx->name != NULL && ++ strcmp(vbdev->name, cctx->name) && ++ strcmp(vbdev->cache.name, cctx->name) && ++ strcmp(vbdev->core.name, cctx->name)) { ++ return; ++ } ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", vbdev->name); ++ spdk_json_write_named_bool(w, "started", vbdev->state.started); ++ ++ spdk_json_write_named_object_begin(w, "cache"); ++ spdk_json_write_named_string(w, "name", vbdev->cache.name); ++ spdk_json_write_named_bool(w, "attached", vbdev->cache.attached); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_named_object_begin(w, "core"); ++ spdk_json_write_named_string(w, "name", vbdev->core.name); ++ spdk_json_write_named_bool(w, "attached", vbdev->core.attached); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++rpc_bdev_ocf_get_bdevs(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_json_write_ctx *w; ++ struct rpc_bdev_ocf_name req = {NULL}; ++ struct bdev_get_bdevs_ctx cctx; ++ ++ if (params && spdk_json_decode_object(params, rpc_bdev_ocf_get_bdevs_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_get_bdevs_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto end; ++ } ++ ++ if (req.name) { ++ if (!(vbdev_ocf_get_by_name(req.name) || 
vbdev_ocf_get_base_by_name(req.name))) { ++ spdk_jsonrpc_send_error_response(request, ++ SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ goto end; ++ } ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ ++ cctx.name = req.name; ++ cctx.w = w; ++ ++ spdk_json_write_array_begin(w); ++ vbdev_ocf_foreach(bdev_get_bdevs_fn, &cctx); ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++end: ++ free_rpc_bdev_ocf_name(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_get_bdevs", rpc_bdev_ocf_get_bdevs, SPDK_RPC_RUNTIME) ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_bdev_ocf_set_cache_mode { ++ char *name; /* main vbdev name */ ++ char *mode; /* OCF cache mode to switch to */ ++}; ++ ++static void ++free_rpc_bdev_ocf_set_cache_mode(struct rpc_bdev_ocf_set_cache_mode *r) ++{ ++ free(r->name); ++ free(r->mode); ++} ++ ++/* Structure to decode the input parameters for this RPC method. */ ++static const struct spdk_json_object_decoder rpc_bdev_ocf_set_cache_mode_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_ocf_set_cache_mode, name), spdk_json_decode_string}, ++ {"mode", offsetof(struct rpc_bdev_ocf_set_cache_mode, mode), spdk_json_decode_string}, ++}; ++ ++static void ++cache_mode_cb(int status, struct vbdev_ocf *vbdev, void *cb_arg) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ struct spdk_json_write_ctx *w; ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not change OCF vbdev cache mode: %d", ++ status); ++ } else { ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, ocf_get_cache_modename( ++ ocf_cache_get_mode(vbdev->ocf_cache))); ++ spdk_jsonrpc_end_result(request, w); ++ } ++} ++ ++static void ++rpc_bdev_ocf_set_cache_mode(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_set_cache_mode req = {NULL}; ++ struct vbdev_ocf *vbdev; ++ int status; ++ ++ status = spdk_json_decode_object(params, rpc_bdev_ocf_set_cache_mode_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_set_cache_mode_decoders), ++ &req); ++ if (status) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto end; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(req.name); ++ if (vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ goto end; ++ } ++ ++ vbdev_ocf_set_cache_mode(vbdev, req.mode, cache_mode_cb, request); ++ ++end: ++ free_rpc_bdev_ocf_set_cache_mode(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_set_cache_mode", rpc_bdev_ocf_set_cache_mode, SPDK_RPC_RUNTIME) ++ ++static void ++seqcutoff_cb(int status, void *cb_arg) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (status) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "OCF could not set sequential cutoff parameters: %d", status); ++ } else { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } ++} ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_bdev_ocf_set_seqcutoff { ++ char *name; /* main vbdev name */ ++ char *policy; ++ uint32_t threshold; ++ uint32_t promotion_count; ++}; ++ ++static void ++free_rpc_bdev_ocf_set_seqcutoff(struct rpc_bdev_ocf_set_seqcutoff *r) ++{ ++ free(r->name); ++ free(r->policy); ++} ++ ++/* Structure to decode the input parameters for this RPC method. 
*/ ++static const struct spdk_json_object_decoder rpc_bdev_ocf_set_seqcutoff_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_ocf_set_seqcutoff, name), spdk_json_decode_string}, ++ {"policy", offsetof(struct rpc_bdev_ocf_set_seqcutoff, policy), spdk_json_decode_string}, ++ {"threshold", offsetof(struct rpc_bdev_ocf_set_seqcutoff, threshold), spdk_json_decode_uint32, true}, ++ {"promotion_count", offsetof(struct rpc_bdev_ocf_set_seqcutoff, promotion_count), spdk_json_decode_uint32, true}, ++}; ++ ++static void ++rpc_bdev_ocf_set_seqcutoff(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_set_seqcutoff req = {NULL}; ++ struct vbdev_ocf *vbdev; ++ int ret; ++ ++ ret = spdk_json_decode_object(params, rpc_bdev_ocf_set_seqcutoff_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_set_seqcutoff_decoders), &req); ++ if (ret) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto end; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(req.name); ++ if (vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ goto end; ++ } ++ ++ vbdev_ocf_set_seqcutoff(vbdev, req.policy, req.threshold, req.promotion_count, seqcutoff_cb, ++ request); ++ ++end: ++ free_rpc_bdev_ocf_set_seqcutoff(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_set_seqcutoff", rpc_bdev_ocf_set_seqcutoff, SPDK_RPC_RUNTIME) ++ ++struct get_ocf_flush_start_ctx { ++ struct spdk_jsonrpc_request *request; ++ struct vbdev_ocf *vbdev; ++}; ++ ++static void ++rpc_bdev_ocf_flush_start_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct get_ocf_flush_start_ctx *ctx = priv; ++ ++ ctx->vbdev->flush.in_progress = false; ++ ctx->vbdev->flush.status = error; ++ ++ ocf_mngt_cache_read_unlock(cache); ++ ++ free(ctx); ++} ++ ++static void ++rpc_bdev_ocf_flush_start_lock_cmpl(ocf_cache_t cache, void *priv, int error) ++{ ++ struct get_ocf_flush_start_ctx *ctx = priv; ++ ++ if (error) { ++ spdk_jsonrpc_send_error_response_fmt(ctx->request, ++ SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Could not lock cache: %d", error); ++ free(ctx); ++ return; ++ } ++ ++ ctx->vbdev->flush.in_progress = true; ++ ocf_mngt_cache_flush(cache, rpc_bdev_ocf_flush_start_cmpl, ctx); ++ ++ spdk_jsonrpc_send_bool_response(ctx->request, true); ++} ++ ++static void ++rpc_bdev_ocf_flush_start(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_name req = {NULL}; ++ struct get_ocf_flush_start_ctx *ctx; ++ int status; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Not enough memory to process request"); ++ goto end; ++ } ++ ++ status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), ++ &req); ++ if (status) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ free(ctx); ++ goto end; ++ } ++ ++ ctx->vbdev = vbdev_ocf_get_by_name(req.name); ++ if (ctx->vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ free(ctx); ++ goto end; ++ } ++ ++ if (!ctx->vbdev->ocf_cache) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Couldn't flush cache: device not attached"); ++ free(ctx); ++ goto end; ++ } ++ ++ ctx->request = request; ++ ocf_mngt_cache_read_lock(ctx->vbdev->ocf_cache, 
rpc_bdev_ocf_flush_start_lock_cmpl, ctx); ++ ++end: ++ free_rpc_bdev_ocf_name(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_flush_start", rpc_bdev_ocf_flush_start, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_bdev_ocf_flush_status(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_ocf_name req = {NULL}; ++ struct spdk_json_write_ctx *w; ++ struct vbdev_ocf *vbdev; ++ int status; ++ ++ status = spdk_json_decode_object(params, rpc_bdev_ocf_name_decoders, ++ SPDK_COUNTOF(rpc_bdev_ocf_name_decoders), ++ &req); ++ if (status) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto end; ++ } ++ ++ vbdev = vbdev_ocf_get_by_name(req.name); ++ if (vbdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(ENODEV)); ++ goto end; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_bool(w, "in_progress", vbdev->flush.in_progress); ++ if (!vbdev->flush.in_progress) { ++ spdk_json_write_named_int32(w, "status", vbdev->flush.status); ++ } ++ spdk_json_write_object_end(w); ++ ++ spdk_jsonrpc_end_result(request, w); ++ ++end: ++ free_rpc_bdev_ocf_name(&req); ++} ++SPDK_RPC_REGISTER("bdev_ocf_flush_status", rpc_bdev_ocf_flush_status, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/ocf/volume.c b/module/bdev/ocf/volume.c +index 4d606dc..b91f54b 100644 +--- a/module/bdev/ocf/volume.c ++++ b/module/bdev/ocf/volume.c +@@ -1,420 +1,420 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include +- +-#include "spdk/bdev_module.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +- +-#include "data.h" +-#include "volume.h" +-#include "ctx.h" +-#include "vbdev_ocf.h" +- +-static int +-vbdev_ocf_volume_open(ocf_volume_t volume, void *opts) +-{ +- struct vbdev_ocf_base **priv = ocf_volume_get_priv(volume); +- struct vbdev_ocf_base *base; +- +- if (opts) { +- base = opts; +- } else { +- base = vbdev_ocf_get_base_by_name(ocf_volume_get_uuid(volume)->data); +- if (base == NULL) { +- return -ENODEV; +- } +- } +- +- *priv = base; +- +- return 0; +-} +- +-static void +-vbdev_ocf_volume_close(ocf_volume_t volume) +-{ +-} +- +-static uint64_t +-vbdev_ocf_volume_get_length(ocf_volume_t volume) +-{ +- struct vbdev_ocf_base *base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(volume)); +- uint64_t len; +- +- len = base->bdev->blocklen * base->bdev->blockcnt; +- +- return len; +-} +- +-static int +-vbdev_ocf_volume_io_set_data(struct ocf_io *io, ctx_data_t *data, +- uint32_t offset) +-{ +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- +- io_ctx->offset = offset; +- io_ctx->data = data; +- +- assert(io_ctx->data != NULL); +- if (io_ctx->data->iovs && offset >= io_ctx->data->size) { +- return -ENOBUFS; +- } +- +- return 0; +-} +- +-static ctx_data_t * +-vbdev_ocf_volume_io_get_data(struct ocf_io *io) +-{ +- return ocf_get_io_ctx(io)->data; +-} +- +-static void +-vbdev_ocf_volume_io_get(struct ocf_io *io) +-{ +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- +- io_ctx->ref++; +-} +- +-static void +-vbdev_ocf_volume_io_put(struct ocf_io *io) +-{ +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- +- if (--io_ctx->ref) { +- return; +- } +-} +- +-static int +-get_starting_vec(struct iovec *iovs, int iovcnt, int *offset) +-{ +- int i; +- size_t off; +- +- off = *offset; +- +- for (i = 0; i < iovcnt; i++) { +- if 
(off < iovs[i].iov_len) { +- *offset = off; +- return i; +- } +- off -= iovs[i].iov_len; +- } +- +- return -1; +-} +- +-static void +-initialize_cpy_vector(struct iovec *cpy_vec, int cpy_vec_len, struct iovec *orig_vec, +- int orig_vec_len, +- size_t offset, size_t bytes) +-{ +- void *curr_base; +- int len, i; +- +- i = 0; +- +- while (bytes > 0) { +- curr_base = orig_vec[i].iov_base + offset; +- len = MIN(bytes, orig_vec[i].iov_len - offset); +- +- cpy_vec[i].iov_base = curr_base; +- cpy_vec[i].iov_len = len; +- +- bytes -= len; +- offset = 0; +- i++; +- } +-} +- +-static void +-vbdev_ocf_volume_submit_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *opaque) +-{ +- struct ocf_io *io; +- struct ocf_io_ctx *io_ctx; +- +- assert(opaque); +- +- io = opaque; +- io_ctx = ocf_get_io_ctx(io); +- assert(io_ctx != NULL); +- +- if (!success) { +- io_ctx->error = io_ctx->error ? : -OCF_ERR_IO; +- } +- +- if (io_ctx->iovs_allocated && bdev_io != NULL) { +- env_free(bdev_io->u.bdev.iovs); +- } +- +- if (io_ctx->error) { +- SPDK_DEBUGLOG(vbdev_ocf_volume, +- "base returned error on io submission: %d\n", io_ctx->error); +- } +- +- if (io->io_queue == NULL && io_ctx->ch != NULL) { +- spdk_put_io_channel(io_ctx->ch); +- } +- +- vbdev_ocf_volume_io_put(io); +- if (bdev_io) { +- spdk_bdev_free_io(bdev_io); +- } +- +- if (--io_ctx->rq_cnt == 0) { +- io->end(io, io_ctx->error); +- } +-} +- +-static int +-prepare_submit(struct ocf_io *io) +-{ +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- struct vbdev_ocf_qctx *qctx; +- struct vbdev_ocf_base *base; +- ocf_queue_t q = io->io_queue; +- ocf_cache_t cache; +- struct vbdev_ocf_cache_ctx *cctx; +- int rc = 0; +- +- io_ctx->rq_cnt++; +- if (io_ctx->rq_cnt != 1) { +- return 0; +- } +- +- vbdev_ocf_volume_io_get(io); +- base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(ocf_io_get_volume(io))); +- +- if (io->io_queue == NULL) { +- /* In case IO is initiated by OCF, queue is unknown +- * so we have to get io channel ourselves */ +- io_ctx->ch = spdk_bdev_get_io_channel(base->desc); +- if (io_ctx->ch == NULL) { +- return -EPERM; +- } +- return 0; +- } +- +- cache = ocf_queue_get_cache(q); +- cctx = ocf_cache_get_priv(cache); +- if (cctx == NULL) { +- return -EFAULT; +- } +- +- if (q == cctx->cleaner_queue || q == cctx->mngt_queue) { +- io_ctx->ch = base->management_channel; +- return 0; +- } +- +- qctx = ocf_queue_get_priv(q); +- if (qctx == NULL) { +- return -EFAULT; +- } +- +- if (base->is_cache) { +- io_ctx->ch = qctx->cache_ch; +- } else { +- io_ctx->ch = qctx->core_ch; +- } +- +- return rc; +-} +- +-static void +-vbdev_ocf_volume_submit_flush(struct ocf_io *io) +-{ +- struct vbdev_ocf_base *base = +- *((struct vbdev_ocf_base **) +- ocf_volume_get_priv(ocf_io_get_volume(io))); +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- int status; +- +- status = prepare_submit(io); +- if (status) { +- SPDK_ERRLOG("Preparing io failed with status=%d\n", status); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- return; +- } +- +- status = spdk_bdev_flush( +- base->desc, io_ctx->ch, +- io->addr, io->bytes, +- vbdev_ocf_volume_submit_io_cb, io); +- if (status) { +- /* Since callback is not called, we need to do it manually to free io structures */ +- SPDK_ERRLOG("Submission failed with status=%d\n", status); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- } +-} +- +-static void +-vbdev_ocf_volume_submit_io(struct ocf_io *io) +-{ +- struct vbdev_ocf_base *base = +- *((struct vbdev_ocf_base **) +- ocf_volume_get_priv(ocf_io_get_volume(io))); +- struct 
ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- struct iovec *iovs; +- int iovcnt, status = 0, i, offset; +- uint64_t addr, len; +- +- if (io->flags == OCF_WRITE_FLUSH) { +- vbdev_ocf_volume_submit_flush(io); +- return; +- } +- +- status = prepare_submit(io); +- if (status) { +- SPDK_ERRLOG("Preparing io failed with status=%d\n", status); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- return; +- } +- +- /* IO fields */ +- addr = io->addr; +- len = io->bytes; +- offset = io_ctx->offset; +- +- if (len < io_ctx->data->size) { +- if (io_ctx->data->iovcnt == 1) { +- if (io->dir == OCF_READ) { +- status = spdk_bdev_read(base->desc, io_ctx->ch, +- io_ctx->data->iovs[0].iov_base + offset, addr, len, +- vbdev_ocf_volume_submit_io_cb, io); +- } else if (io->dir == OCF_WRITE) { +- status = spdk_bdev_write(base->desc, io_ctx->ch, +- io_ctx->data->iovs[0].iov_base + offset, addr, len, +- vbdev_ocf_volume_submit_io_cb, io); +- } +- goto end; +- } else { +- i = get_starting_vec(io_ctx->data->iovs, io_ctx->data->iovcnt, &offset); +- +- if (i < 0) { +- SPDK_ERRLOG("offset bigger than data size\n"); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- return; +- } +- +- iovcnt = io_ctx->data->iovcnt - i; +- +- io_ctx->iovs_allocated = true; +- iovs = env_malloc(sizeof(*iovs) * iovcnt, ENV_MEM_NOIO); +- +- if (!iovs) { +- SPDK_ERRLOG("allocation failed\n"); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- return; +- } +- +- initialize_cpy_vector(iovs, io_ctx->data->iovcnt, &io_ctx->data->iovs[i], +- iovcnt, offset, len); +- } +- } else { +- iovs = io_ctx->data->iovs; +- iovcnt = io_ctx->data->iovcnt; +- } +- +- if (io->dir == OCF_READ) { +- status = spdk_bdev_readv(base->desc, io_ctx->ch, +- iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); +- } else if (io->dir == OCF_WRITE) { +- status = spdk_bdev_writev(base->desc, io_ctx->ch, +- iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); +- } +- +-end: +- if (status) { +- if (status == -ENOMEM) { +- io_ctx->error = -OCF_ERR_NO_MEM; +- } else { +- SPDK_ERRLOG("submission failed with status=%d\n", status); +- } +- +- /* Since callback is not called, we need to do it manually to free io structures */ +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- } +-} +- +-static void +-vbdev_ocf_volume_submit_discard(struct ocf_io *io) +-{ +- struct vbdev_ocf_base *base = +- *((struct vbdev_ocf_base **) +- ocf_volume_get_priv(ocf_io_get_volume(io))); +- struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); +- int status = 0; +- +- status = prepare_submit(io); +- if (status) { +- SPDK_ERRLOG("Preparing io failed with status=%d\n", status); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- return; +- } +- +- status = spdk_bdev_unmap( +- base->desc, io_ctx->ch, +- io->addr, io->bytes, +- vbdev_ocf_volume_submit_io_cb, io); +- if (status) { +- /* Since callback is not called, we need to do it manually to free io structures */ +- SPDK_ERRLOG("Submission failed with status=%d\n", status); +- vbdev_ocf_volume_submit_io_cb(NULL, false, io); +- } +-} +- +-static void +-vbdev_ocf_volume_submit_metadata(struct ocf_io *io) +-{ +- /* Implement with persistent metadata support */ +-} +- +-static unsigned int +-vbdev_ocf_volume_get_max_io_size(ocf_volume_t volume) +-{ +- return 131072; +-} +- +-static struct ocf_volume_properties vbdev_volume_props = { +- .name = "SPDK_block_device", +- .io_priv_size = sizeof(struct ocf_io_ctx), +- .volume_priv_size = sizeof(struct vbdev_ocf_base *), +- .caps = { +- .atomic_writes = 0 /* to enable need to have 
ops->submit_metadata */ +- }, +- .ops = { +- .open = vbdev_ocf_volume_open, +- .close = vbdev_ocf_volume_close, +- .get_length = vbdev_ocf_volume_get_length, +- .submit_io = vbdev_ocf_volume_submit_io, +- .submit_discard = vbdev_ocf_volume_submit_discard, +- .submit_flush = vbdev_ocf_volume_submit_flush, +- .get_max_io_size = vbdev_ocf_volume_get_max_io_size, +- .submit_metadata = vbdev_ocf_volume_submit_metadata, +- }, +- .io_ops = { +- .set_data = vbdev_ocf_volume_io_set_data, +- .get_data = vbdev_ocf_volume_io_get_data, +- }, +-}; +- +-int +-vbdev_ocf_volume_init(void) +-{ +- return ocf_ctx_register_volume_type(vbdev_ocf_ctx, SPDK_OBJECT, &vbdev_volume_props); +-} +- +-void +-vbdev_ocf_volume_cleanup(void) +-{ +- ocf_ctx_unregister_volume_type(vbdev_ocf_ctx, SPDK_OBJECT); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vbdev_ocf_volume) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include ++ ++#include "spdk/bdev_module.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++ ++#include "data.h" ++#include "volume.h" ++#include "ctx.h" ++#include "vbdev_ocf.h" ++ ++static int ++vbdev_ocf_volume_open(ocf_volume_t volume, void *opts) ++{ ++ struct vbdev_ocf_base **priv = ocf_volume_get_priv(volume); ++ struct vbdev_ocf_base *base; ++ ++ if (opts) { ++ base = opts; ++ } else { ++ base = vbdev_ocf_get_base_by_name(ocf_volume_get_uuid(volume)->data); ++ if (base == NULL) { ++ return -ENODEV; ++ } ++ } ++ ++ *priv = base; ++ ++ return 0; ++} ++ ++static void ++vbdev_ocf_volume_close(ocf_volume_t volume) ++{ ++} ++ ++static uint64_t ++vbdev_ocf_volume_get_length(ocf_volume_t volume) ++{ ++ struct vbdev_ocf_base *base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(volume)); ++ uint64_t len; ++ ++ len = base->bdev->blocklen * base->bdev->blockcnt; ++ ++ return len; ++} ++ ++static int ++vbdev_ocf_volume_io_set_data(struct ocf_io *io, ctx_data_t *data, ++ uint32_t offset) ++{ ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ ++ io_ctx->offset = offset; ++ io_ctx->data = data; ++ ++ assert(io_ctx->data != NULL); ++ if (io_ctx->data->iovs && offset >= io_ctx->data->size) { ++ return -ENOBUFS; ++ } ++ ++ return 0; ++} ++ ++static ctx_data_t * ++vbdev_ocf_volume_io_get_data(struct ocf_io *io) ++{ ++ return ocf_get_io_ctx(io)->data; ++} ++ ++static void ++vbdev_ocf_volume_io_get(struct ocf_io *io) ++{ ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ ++ io_ctx->ref++; ++} ++ ++static void ++vbdev_ocf_volume_io_put(struct ocf_io *io) ++{ ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ ++ if (--io_ctx->ref) { ++ return; ++ } ++} ++ ++static int ++get_starting_vec(struct iovec *iovs, int iovcnt, int *offset) ++{ ++ int i; ++ size_t off; ++ ++ off = *offset; ++ ++ for (i = 0; i < iovcnt; i++) { ++ if (off < iovs[i].iov_len) { ++ *offset = off; ++ return i; ++ } ++ off -= iovs[i].iov_len; ++ } ++ ++ return -1; ++} ++ ++static void ++initialize_cpy_vector(struct iovec *cpy_vec, int cpy_vec_len, struct iovec *orig_vec, ++ int orig_vec_len, ++ size_t offset, size_t bytes) ++{ ++ void *curr_base; ++ int len, i; ++ ++ i = 0; ++ ++ while (bytes > 0) { ++ curr_base = orig_vec[i].iov_base + offset; ++ len = MIN(bytes, orig_vec[i].iov_len - offset); ++ ++ cpy_vec[i].iov_base = curr_base; ++ cpy_vec[i].iov_len = len; ++ ++ bytes -= len; ++ offset = 0; ++ i++; ++ } ++} ++ ++static void ++vbdev_ocf_volume_submit_io_cb(struct spdk_bdev_io *bdev_io, bool success, void *opaque) ++{ ++ struct ocf_io *io; ++ struct 
ocf_io_ctx *io_ctx; ++ ++ assert(opaque); ++ ++ io = opaque; ++ io_ctx = ocf_get_io_ctx(io); ++ assert(io_ctx != NULL); ++ ++ if (!success) { ++ io_ctx->error = io_ctx->error ? : -OCF_ERR_IO; ++ } ++ ++ if (io_ctx->iovs_allocated && bdev_io != NULL) { ++ env_free(bdev_io->u.bdev.iovs); ++ } ++ ++ if (io_ctx->error) { ++ SPDK_DEBUGLOG(vbdev_ocf_volume, ++ "base returned error on io submission: %d\n", io_ctx->error); ++ } ++ ++ if (io->io_queue == NULL && io_ctx->ch != NULL) { ++ spdk_put_io_channel(io_ctx->ch); ++ } ++ ++ vbdev_ocf_volume_io_put(io); ++ if (bdev_io) { ++ spdk_bdev_free_io(bdev_io); ++ } ++ ++ if (--io_ctx->rq_cnt == 0) { ++ io->end(io, io_ctx->error); ++ } ++} ++ ++static int ++prepare_submit(struct ocf_io *io) ++{ ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ struct vbdev_ocf_qctx *qctx; ++ struct vbdev_ocf_base *base; ++ ocf_queue_t q = io->io_queue; ++ ocf_cache_t cache; ++ struct vbdev_ocf_cache_ctx *cctx; ++ int rc = 0; ++ ++ io_ctx->rq_cnt++; ++ if (io_ctx->rq_cnt != 1) { ++ return 0; ++ } ++ ++ vbdev_ocf_volume_io_get(io); ++ base = *((struct vbdev_ocf_base **)ocf_volume_get_priv(ocf_io_get_volume(io))); ++ ++ if (io->io_queue == NULL) { ++ /* In case IO is initiated by OCF, queue is unknown ++ * so we have to get io channel ourselves */ ++ io_ctx->ch = spdk_bdev_get_io_channel(base->desc); ++ if (io_ctx->ch == NULL) { ++ return -EPERM; ++ } ++ return 0; ++ } ++ ++ cache = ocf_queue_get_cache(q); ++ cctx = ocf_cache_get_priv(cache); ++ if (cctx == NULL) { ++ return -EFAULT; ++ } ++ ++ if (q == cctx->cleaner_queue || q == cctx->mngt_queue) { ++ io_ctx->ch = base->management_channel; ++ return 0; ++ } ++ ++ qctx = ocf_queue_get_priv(q); ++ if (qctx == NULL) { ++ return -EFAULT; ++ } ++ ++ if (base->is_cache) { ++ io_ctx->ch = qctx->cache_ch; ++ } else { ++ io_ctx->ch = qctx->core_ch; ++ } ++ ++ return rc; ++} ++ ++static void ++vbdev_ocf_volume_submit_flush(struct ocf_io *io) ++{ ++ struct vbdev_ocf_base *base = ++ *((struct vbdev_ocf_base **) ++ ocf_volume_get_priv(ocf_io_get_volume(io))); ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ int status; ++ ++ status = prepare_submit(io); ++ if (status) { ++ SPDK_ERRLOG("Preparing io failed with status=%d\n", status); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ return; ++ } ++ ++ status = spdk_bdev_flush( ++ base->desc, io_ctx->ch, ++ io->addr, io->bytes, ++ vbdev_ocf_volume_submit_io_cb, io); ++ if (status) { ++ /* Since callback is not called, we need to do it manually to free io structures */ ++ SPDK_ERRLOG("Submission failed with status=%d\n", status); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ } ++} ++ ++static void ++vbdev_ocf_volume_submit_io(struct ocf_io *io) ++{ ++ struct vbdev_ocf_base *base = ++ *((struct vbdev_ocf_base **) ++ ocf_volume_get_priv(ocf_io_get_volume(io))); ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ struct iovec *iovs; ++ int iovcnt, status = 0, i, offset; ++ uint64_t addr, len; ++ ++ if (io->flags == OCF_WRITE_FLUSH) { ++ vbdev_ocf_volume_submit_flush(io); ++ return; ++ } ++ ++ status = prepare_submit(io); ++ if (status) { ++ SPDK_ERRLOG("Preparing io failed with status=%d\n", status); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ return; ++ } ++ ++ /* IO fields */ ++ addr = io->addr; ++ len = io->bytes; ++ offset = io_ctx->offset; ++ ++ if (len < io_ctx->data->size) { ++ if (io_ctx->data->iovcnt == 1) { ++ if (io->dir == OCF_READ) { ++ status = spdk_bdev_read(base->desc, io_ctx->ch, ++ io_ctx->data->iovs[0].iov_base + offset, addr, len, ++ 
vbdev_ocf_volume_submit_io_cb, io); ++ } else if (io->dir == OCF_WRITE) { ++ status = spdk_bdev_write(base->desc, io_ctx->ch, ++ io_ctx->data->iovs[0].iov_base + offset, addr, len, ++ vbdev_ocf_volume_submit_io_cb, io); ++ } ++ goto end; ++ } else { ++ i = get_starting_vec(io_ctx->data->iovs, io_ctx->data->iovcnt, &offset); ++ ++ if (i < 0) { ++ SPDK_ERRLOG("offset bigger than data size\n"); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ return; ++ } ++ ++ iovcnt = io_ctx->data->iovcnt - i; ++ ++ io_ctx->iovs_allocated = true; ++ iovs = env_malloc(sizeof(*iovs) * iovcnt, ENV_MEM_NOIO); ++ ++ if (!iovs) { ++ SPDK_ERRLOG("allocation failed\n"); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ return; ++ } ++ ++ initialize_cpy_vector(iovs, io_ctx->data->iovcnt, &io_ctx->data->iovs[i], ++ iovcnt, offset, len); ++ } ++ } else { ++ iovs = io_ctx->data->iovs; ++ iovcnt = io_ctx->data->iovcnt; ++ } ++ ++ if (io->dir == OCF_READ) { ++ status = spdk_bdev_readv(base->desc, io_ctx->ch, ++ iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); ++ } else if (io->dir == OCF_WRITE) { ++ status = spdk_bdev_writev(base->desc, io_ctx->ch, ++ iovs, iovcnt, addr, len, vbdev_ocf_volume_submit_io_cb, io); ++ } ++ ++end: ++ if (status) { ++ if (status == -ENOMEM) { ++ io_ctx->error = -OCF_ERR_NO_MEM; ++ } else { ++ SPDK_ERRLOG("submission failed with status=%d\n", status); ++ } ++ ++ /* Since callback is not called, we need to do it manually to free io structures */ ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ } ++} ++ ++static void ++vbdev_ocf_volume_submit_discard(struct ocf_io *io) ++{ ++ struct vbdev_ocf_base *base = ++ *((struct vbdev_ocf_base **) ++ ocf_volume_get_priv(ocf_io_get_volume(io))); ++ struct ocf_io_ctx *io_ctx = ocf_get_io_ctx(io); ++ int status = 0; ++ ++ status = prepare_submit(io); ++ if (status) { ++ SPDK_ERRLOG("Preparing io failed with status=%d\n", status); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ return; ++ } ++ ++ status = spdk_bdev_unmap( ++ base->desc, io_ctx->ch, ++ io->addr, io->bytes, ++ vbdev_ocf_volume_submit_io_cb, io); ++ if (status) { ++ /* Since callback is not called, we need to do it manually to free io structures */ ++ SPDK_ERRLOG("Submission failed with status=%d\n", status); ++ vbdev_ocf_volume_submit_io_cb(NULL, false, io); ++ } ++} ++ ++static void ++vbdev_ocf_volume_submit_metadata(struct ocf_io *io) ++{ ++ /* Implement with persistent metadata support */ ++} ++ ++static unsigned int ++vbdev_ocf_volume_get_max_io_size(ocf_volume_t volume) ++{ ++ return 131072; ++} ++ ++static struct ocf_volume_properties vbdev_volume_props = { ++ .name = "SPDK_block_device", ++ .io_priv_size = sizeof(struct ocf_io_ctx), ++ .volume_priv_size = sizeof(struct vbdev_ocf_base *), ++ .caps = { ++ .atomic_writes = 0 /* to enable need to have ops->submit_metadata */ ++ }, ++ .ops = { ++ .open = vbdev_ocf_volume_open, ++ .close = vbdev_ocf_volume_close, ++ .get_length = vbdev_ocf_volume_get_length, ++ .submit_io = vbdev_ocf_volume_submit_io, ++ .submit_discard = vbdev_ocf_volume_submit_discard, ++ .submit_flush = vbdev_ocf_volume_submit_flush, ++ .get_max_io_size = vbdev_ocf_volume_get_max_io_size, ++ .submit_metadata = vbdev_ocf_volume_submit_metadata, ++ }, ++ .io_ops = { ++ .set_data = vbdev_ocf_volume_io_set_data, ++ .get_data = vbdev_ocf_volume_io_get_data, ++ }, ++}; ++ ++int ++vbdev_ocf_volume_init(void) ++{ ++ return ocf_ctx_register_volume_type(vbdev_ocf_ctx, SPDK_OBJECT, &vbdev_volume_props); ++} ++ ++void ++vbdev_ocf_volume_cleanup(void) ++{ 
++ ocf_ctx_unregister_volume_type(vbdev_ocf_ctx, SPDK_OBJECT); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vbdev_ocf_volume) +diff --git a/module/bdev/ocf/volume.h b/module/bdev/ocf/volume.h +index 02749ac..2aaa575 100644 +--- a/module/bdev/ocf/volume.h ++++ b/module/bdev/ocf/volume.h +@@ -1,35 +1,35 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef VBDEV_OCF_DOBJ_H +-#define VBDEV_OCF_DOBJ_H +- +-#include +- +-#include "ctx.h" +-#include "data.h" +- +-/* ocf_io context +- * It is initialized from io size and offset */ +-struct ocf_io_ctx { +- struct bdev_ocf_data *data; +- struct spdk_io_channel *ch; +- uint32_t offset; +- int ref; +- int rq_cnt; +- int error; +- bool iovs_allocated; +-}; +- +-int vbdev_ocf_volume_init(void); +-void vbdev_ocf_volume_cleanup(void); +- +-static inline struct ocf_io_ctx * +-ocf_get_io_ctx(struct ocf_io *io) +-{ +- return ocf_io_get_priv(io); +-} +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef VBDEV_OCF_DOBJ_H ++#define VBDEV_OCF_DOBJ_H ++ ++#include ++ ++#include "ctx.h" ++#include "data.h" ++ ++/* ocf_io context ++ * It is initialized from io size and offset */ ++struct ocf_io_ctx { ++ struct bdev_ocf_data *data; ++ struct spdk_io_channel *ch; ++ uint32_t offset; ++ int ref; ++ int rq_cnt; ++ int error; ++ bool iovs_allocated; ++}; ++ ++int vbdev_ocf_volume_init(void); ++void vbdev_ocf_volume_cleanup(void); ++ ++static inline struct ocf_io_ctx * ++ocf_get_io_ctx(struct ocf_io *io) ++{ ++ return ocf_io_get_priv(io); ++} ++ ++#endif +diff --git a/module/bdev/passthru/Makefile b/module/bdev/passthru/Makefile +index 6fb95d3..ccfa0a3 100644 +--- a/module/bdev/passthru/Makefile ++++ b/module/bdev/passthru/Makefile +@@ -1,19 +1,19 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +- +-C_SRCS = vbdev_passthru.c vbdev_passthru_rpc.c +-LIBNAME = bdev_passthru +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ ++ ++C_SRCS = vbdev_passthru.c vbdev_passthru_rpc.c ++LIBNAME = bdev_passthru ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/passthru/vbdev_passthru.c b/module/bdev/passthru/vbdev_passthru.c +index 7f60586..08812fb 100644 +--- a/module/bdev/passthru/vbdev_passthru.c ++++ b/module/bdev/passthru/vbdev_passthru.c +@@ -1,759 +1,759 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-/* +- * This is a simple example of a virtual block device module that passes IO +- * down to a bdev (or bdevs) that its configured to attach to. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "vbdev_passthru.h" +-#include "spdk/rpc.h" +-#include "spdk/env.h" +-#include "spdk/endian.h" +-#include "spdk/string.h" +-#include "spdk/thread.h" +-#include "spdk/util.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +- +- +-static int vbdev_passthru_init(void); +-static int vbdev_passthru_get_ctx_size(void); +-static void vbdev_passthru_examine(struct spdk_bdev *bdev); +-static void vbdev_passthru_finish(void); +-static int vbdev_passthru_config_json(struct spdk_json_write_ctx *w); +- +-static struct spdk_bdev_module passthru_if = { +- .name = "passthru", +- .module_init = vbdev_passthru_init, +- .get_ctx_size = vbdev_passthru_get_ctx_size, +- .examine_config = vbdev_passthru_examine, +- .module_fini = vbdev_passthru_finish, +- .config_json = vbdev_passthru_config_json +-}; +- +-SPDK_BDEV_MODULE_REGISTER(passthru, &passthru_if) +- +-/* List of pt_bdev names and their base bdevs via configuration file. +- * Used so we can parse the conf once at init and use this list in examine(). +- */ +-struct bdev_names { +- char *vbdev_name; +- char *bdev_name; +- TAILQ_ENTRY(bdev_names) link; +-}; +-static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names); +- +-/* List of virtual bdevs and associated info for each. */ +-struct vbdev_passthru { +- struct spdk_bdev *base_bdev; /* the thing we're attaching to */ +- struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ +- struct spdk_bdev pt_bdev; /* the PT virtual bdev */ +- TAILQ_ENTRY(vbdev_passthru) link; +- struct spdk_thread *thread; /* thread where base device is opened */ +-}; +-static TAILQ_HEAD(, vbdev_passthru) g_pt_nodes = TAILQ_HEAD_INITIALIZER(g_pt_nodes); +- +-/* The pt vbdev channel struct. It is allocated and freed on my behalf by the io channel code. +- * If this vbdev needed to implement a poller or a queue for IO, this is where those things +- * would be defined. This passthru bdev doesn't actually need to allocate a channel, it could +- * simply pass back the channel of the bdev underneath it but for example purposes we will +- * present its own to the upper layers. +- */ +-struct pt_io_channel { +- struct spdk_io_channel *base_ch; /* IO channel of base device */ +-}; +- +-/* Just for fun, this pt_bdev module doesn't need it but this is essentially a per IO +- * context that we get handed by the bdev layer. +- */ +-struct passthru_bdev_io { +- uint8_t test; +- +- /* bdev related */ +- struct spdk_io_channel *ch; +- +- /* for bdev_io_wait */ +- struct spdk_bdev_io_wait_entry bdev_io_wait; +-}; +- +-static void vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); +- +- +-/* Callback for unregistering the IO device. */ +-static void +-_device_unregister_cb(void *io_device) +-{ +- struct vbdev_passthru *pt_node = io_device; +- +- /* Done with this pt_node. */ +- free(pt_node->pt_bdev.name); +- free(pt_node); +-} +- +-/* Wrapper for the bdev close operation. */ +-static void +-_vbdev_passthru_destruct(void *ctx) +-{ +- struct spdk_bdev_desc *desc = ctx; +- +- spdk_bdev_close(desc); +-} +- +-/* Called after we've unregistered following a hot remove callback. +- * Our finish entry point will be called next. +- */ +-static int +-vbdev_passthru_destruct(void *ctx) +-{ +- struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; +- +- /* It is important to follow this exact sequence of steps for destroying +- * a vbdev... 
+- */ +- +- TAILQ_REMOVE(&g_pt_nodes, pt_node, link); +- +- /* Unclaim the underlying bdev. */ +- spdk_bdev_module_release_bdev(pt_node->base_bdev); +- +- /* Close the underlying bdev on its same opened thread. */ +- if (pt_node->thread && pt_node->thread != spdk_get_thread()) { +- spdk_thread_send_msg(pt_node->thread, _vbdev_passthru_destruct, pt_node->base_desc); +- } else { +- spdk_bdev_close(pt_node->base_desc); +- } +- +- /* Unregister the io_device. */ +- spdk_io_device_unregister(pt_node, _device_unregister_cb); +- +- return 0; +-} +- +-/* Completion callback for IO that were issued from this bdev. The original bdev_io +- * is passed in as an arg so we'll complete that one with the appropriate status +- * and then free the one that this module issued. +- */ +-static void +-_pt_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *orig_io = cb_arg; +- int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx; +- +- /* We setup this value in the submission routine, just showing here that it is +- * passed back to us. +- */ +- if (io_ctx->test != 0x5a) { +- SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n", +- io_ctx->test); +- } +- +- /* Complete the original IO and then free the one that we created here +- * as a result of issuing an IO via submit_request. +- */ +- spdk_bdev_io_complete(orig_io, status); +- spdk_bdev_free_io(bdev_io); +-} +- +-static void +-_pt_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *orig_io = cb_arg; +- int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx; +- +- /* We setup this value in the submission routine, just showing here that it is +- * passed back to us. +- */ +- if (io_ctx->test != 0x5a) { +- SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n", +- io_ctx->test); +- } +- +- /* Complete the original IO and then free the one that we created here +- * as a result of issuing an IO via submit_request. +- */ +- spdk_bdev_io_set_buf(orig_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len); +- spdk_bdev_io_complete(orig_io, status); +- spdk_bdev_free_io(bdev_io); +-} +- +-static void +-vbdev_passthru_resubmit_io(void *arg) +-{ +- struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; +- +- vbdev_passthru_submit_request(io_ctx->ch, bdev_io); +-} +- +-static void +-vbdev_passthru_queue_io(struct spdk_bdev_io *bdev_io) +-{ +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; +- struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(io_ctx->ch); +- int rc; +- +- io_ctx->bdev_io_wait.bdev = bdev_io->bdev; +- io_ctx->bdev_io_wait.cb_fn = vbdev_passthru_resubmit_io; +- io_ctx->bdev_io_wait.cb_arg = bdev_io; +- +- /* Queue the IO using the channel of the base device. 
*/ +- rc = spdk_bdev_queue_io_wait(bdev_io->bdev, pt_ch->base_ch, &io_ctx->bdev_io_wait); +- if (rc != 0) { +- SPDK_ERRLOG("Queue io failed in vbdev_passthru_queue_io, rc=%d.\n", rc); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-/* Callback for getting a buf from the bdev pool in the event that the caller passed +- * in NULL, we need to own the buffer so it doesn't get freed by another vbdev module +- * beneath us before we're done with it. That won't happen in this example but it could +- * if this example were used as a template for something more complex. +- */ +-static void +-pt_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, +- pt_bdev); +- struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; +- int rc; +- +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- if (bdev_io->u.bdev.ext_opts) { +- rc = spdk_bdev_readv_blocks_ext(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, _pt_complete_io, +- bdev_io, bdev_io->u.bdev.ext_opts); +- } else { +- rc = spdk_bdev_readv_blocks_with_md(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- } +- +- if (rc != 0) { +- if (rc == -ENOMEM) { +- SPDK_ERRLOG("No memory, start to queue io for passthru.\n"); +- io_ctx->ch = ch; +- vbdev_passthru_queue_io(bdev_io); +- } else { +- SPDK_ERRLOG("ERROR on bdev_io submission!\n"); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- } +-} +- +-/* Called when someone above submits IO to this pt vbdev. We're simply passing it on here +- * via SPDK IO calls which in turn allocate another bdev IO and call our cpl callback provided +- * below along with the original bdev_io so that we can complete it once this IO completes. +- */ +-static void +-vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, pt_bdev); +- struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); +- struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; +- int rc = 0; +- +- /* Setup a per IO context value; we don't do anything with it in the vbdev other +- * than confirm we get the same thing back in the completion callback just to +- * demonstrate. 
+- */ +- io_ctx->test = 0x5a; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, pt_read_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- if (bdev_io->u.bdev.ext_opts) { +- rc = spdk_bdev_writev_blocks_ext(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, _pt_complete_io, +- bdev_io, bdev_io->u.bdev.ext_opts); +- } else { +- rc = spdk_bdev_writev_blocks_with_md(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- } +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- rc = spdk_bdev_write_zeroes_blocks(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- rc = spdk_bdev_unmap_blocks(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- rc = spdk_bdev_flush_blocks(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_RESET: +- rc = spdk_bdev_reset(pt_node->base_desc, pt_ch->base_ch, +- _pt_complete_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_ZCOPY: +- rc = spdk_bdev_zcopy_start(pt_node->base_desc, pt_ch->base_ch, NULL, 0, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate, +- _pt_complete_zcopy_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_ABORT: +- rc = spdk_bdev_abort(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.abort.bio_to_abort, +- _pt_complete_io, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- rc = spdk_bdev_copy_blocks(pt_node->base_desc, pt_ch->base_ch, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.copy.src_offset_blocks, +- bdev_io->u.bdev.num_blocks, +- _pt_complete_io, bdev_io); +- break; +- default: +- SPDK_ERRLOG("passthru: unknown I/O type %d\n", bdev_io->type); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- if (rc != 0) { +- if (rc == -ENOMEM) { +- SPDK_ERRLOG("No memory, start to queue io for passthru.\n"); +- io_ctx->ch = ch; +- vbdev_passthru_queue_io(bdev_io); +- } else { +- SPDK_ERRLOG("ERROR on bdev_io submission!\n"); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- } +-} +- +-/* We'll just call the base bdev and let it answer however if we were more +- * restrictive for some reason (or less) we could get the response back +- * and modify according to our purposes. +- */ +-static bool +-vbdev_passthru_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; +- +- return spdk_bdev_io_type_supported(pt_node->base_bdev, io_type); +-} +- +-/* We supplied this as an entry point for upper layers who want to communicate to this +- * bdev. This is how they get a channel. We are passed the same context we provided when +- * we created our PT vbdev in examine() which, for this bdev, is the address of one of +- * our context nodes. From here we'll ask the SPDK channel code to fill out our channel +- * struct and we'll keep it in our PT node. 
+- */ +-static struct spdk_io_channel * +-vbdev_passthru_get_io_channel(void *ctx) +-{ +- struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; +- struct spdk_io_channel *pt_ch = NULL; +- +- /* The IO channel code will allocate a channel for us which consists of +- * the SPDK channel structure plus the size of our pt_io_channel struct +- * that we passed in when we registered our IO device. It will then call +- * our channel create callback to populate any elements that we need to +- * update. +- */ +- pt_ch = spdk_get_io_channel(pt_node); +- +- return pt_ch; +-} +- +-/* This is the output for bdev_get_bdevs() for this vbdev */ +-static int +-vbdev_passthru_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; +- +- spdk_json_write_name(w, "passthru"); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); +- spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-/* This is used to generate JSON that can configure this module to its current state. */ +-static int +-vbdev_passthru_config_json(struct spdk_json_write_ctx *w) +-{ +- struct vbdev_passthru *pt_node; +- +- TAILQ_FOREACH(pt_node, &g_pt_nodes, link) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_passthru_create"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); +- spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- } +- return 0; +-} +- +-/* We provide this callback for the SPDK channel code to create a channel using +- * the channel struct we provided in our module get_io_channel() entry point. Here +- * we get and save off an underlying base channel of the device below us so that +- * we can communicate with the base bdev on a per channel basis. If we needed +- * our own poller for this vbdev, we'd register it here. +- */ +-static int +-pt_bdev_ch_create_cb(void *io_device, void *ctx_buf) +-{ +- struct pt_io_channel *pt_ch = ctx_buf; +- struct vbdev_passthru *pt_node = io_device; +- +- pt_ch->base_ch = spdk_bdev_get_io_channel(pt_node->base_desc); +- +- return 0; +-} +- +-/* We provide this callback for the SPDK channel code to destroy a channel +- * created with our create callback. We just need to undo anything we did +- * when we created. If this bdev used its own poller, we'd unregister it here. +- */ +-static void +-pt_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct pt_io_channel *pt_ch = ctx_buf; +- +- spdk_put_io_channel(pt_ch->base_ch); +-} +- +-/* Create the passthru association from the bdev and vbdev name and insert +- * on the global list. 
*/ +-static int +-vbdev_passthru_insert_name(const char *bdev_name, const char *vbdev_name) +-{ +- struct bdev_names *name; +- +- TAILQ_FOREACH(name, &g_bdev_names, link) { +- if (strcmp(vbdev_name, name->vbdev_name) == 0) { +- SPDK_ERRLOG("passthru bdev %s already exists\n", vbdev_name); +- return -EEXIST; +- } +- } +- +- name = calloc(1, sizeof(struct bdev_names)); +- if (!name) { +- SPDK_ERRLOG("could not allocate bdev_names\n"); +- return -ENOMEM; +- } +- +- name->bdev_name = strdup(bdev_name); +- if (!name->bdev_name) { +- SPDK_ERRLOG("could not allocate name->bdev_name\n"); +- free(name); +- return -ENOMEM; +- } +- +- name->vbdev_name = strdup(vbdev_name); +- if (!name->vbdev_name) { +- SPDK_ERRLOG("could not allocate name->vbdev_name\n"); +- free(name->bdev_name); +- free(name); +- return -ENOMEM; +- } +- +- TAILQ_INSERT_TAIL(&g_bdev_names, name, link); +- +- return 0; +-} +- +-/* On init, just perform bdev module specific initialization. */ +-static int +-vbdev_passthru_init(void) +-{ +- return 0; +-} +- +-/* Called when the entire module is being torn down. */ +-static void +-vbdev_passthru_finish(void) +-{ +- struct bdev_names *name; +- +- while ((name = TAILQ_FIRST(&g_bdev_names))) { +- TAILQ_REMOVE(&g_bdev_names, name, link); +- free(name->bdev_name); +- free(name->vbdev_name); +- free(name); +- } +-} +- +-/* During init we'll be asked how much memory we'd like passed to us +- * in bev_io structures as context. Here's where we specify how +- * much context we want per IO. +- */ +-static int +-vbdev_passthru_get_ctx_size(void) +-{ +- return sizeof(struct passthru_bdev_io); +-} +- +-/* Where vbdev_passthru_config_json() is used to generate per module JSON config data, this +- * function is called to output any per bdev specific methods. For the PT module, there are +- * none. +- */ +-static void +-vbdev_passthru_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- /* No config per bdev needed */ +-} +- +-static int +-vbdev_passthru_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) +-{ +- struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; +- +- /* Passthru bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */ +- return spdk_bdev_get_memory_domains(pt_node->base_bdev, domains, array_size); +-} +- +-/* When we register our bdev this is how we specify our entry points. */ +-static const struct spdk_bdev_fn_table vbdev_passthru_fn_table = { +- .destruct = vbdev_passthru_destruct, +- .submit_request = vbdev_passthru_submit_request, +- .io_type_supported = vbdev_passthru_io_type_supported, +- .get_io_channel = vbdev_passthru_get_io_channel, +- .dump_info_json = vbdev_passthru_dump_info_json, +- .write_config_json = vbdev_passthru_write_config_json, +- .get_memory_domains = vbdev_passthru_get_memory_domains, +-}; +- +-static void +-vbdev_passthru_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find) +-{ +- struct vbdev_passthru *pt_node, *tmp; +- +- TAILQ_FOREACH_SAFE(pt_node, &g_pt_nodes, link, tmp) { +- if (bdev_find == pt_node->base_bdev) { +- spdk_bdev_unregister(&pt_node->pt_bdev, NULL, NULL); +- } +- } +-} +- +-/* Called when the underlying base bdev triggers asynchronous event such as bdev removal. 
*/ +-static void +-vbdev_passthru_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- vbdev_passthru_base_bdev_hotremove_cb(bdev); +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-/* Create and register the passthru vbdev if we find it in our list of bdev names. +- * This can be called either by the examine path or RPC method. +- */ +-static int +-vbdev_passthru_register(const char *bdev_name) +-{ +- struct bdev_names *name; +- struct vbdev_passthru *pt_node; +- struct spdk_bdev *bdev; +- int rc = 0; +- +- /* Check our list of names from config versus this bdev and if +- * there's a match, create the pt_node & bdev accordingly. +- */ +- TAILQ_FOREACH(name, &g_bdev_names, link) { +- if (strcmp(name->bdev_name, bdev_name) != 0) { +- continue; +- } +- +- SPDK_NOTICELOG("Match on %s\n", bdev_name); +- pt_node = calloc(1, sizeof(struct vbdev_passthru)); +- if (!pt_node) { +- rc = -ENOMEM; +- SPDK_ERRLOG("could not allocate pt_node\n"); +- break; +- } +- +- pt_node->pt_bdev.name = strdup(name->vbdev_name); +- if (!pt_node->pt_bdev.name) { +- rc = -ENOMEM; +- SPDK_ERRLOG("could not allocate pt_bdev name\n"); +- free(pt_node); +- break; +- } +- pt_node->pt_bdev.product_name = "passthru"; +- +- /* The base bdev that we're attaching to. */ +- rc = spdk_bdev_open_ext(bdev_name, true, vbdev_passthru_base_bdev_event_cb, +- NULL, &pt_node->base_desc); +- if (rc) { +- if (rc != -ENODEV) { +- SPDK_ERRLOG("could not open bdev %s\n", bdev_name); +- } +- free(pt_node->pt_bdev.name); +- free(pt_node); +- break; +- } +- SPDK_NOTICELOG("base bdev opened\n"); +- +- bdev = spdk_bdev_desc_get_bdev(pt_node->base_desc); +- pt_node->base_bdev = bdev; +- +- /* Copy some properties from the underlying base bdev. */ +- pt_node->pt_bdev.write_cache = bdev->write_cache; +- pt_node->pt_bdev.required_alignment = bdev->required_alignment; +- pt_node->pt_bdev.optimal_io_boundary = bdev->optimal_io_boundary; +- pt_node->pt_bdev.blocklen = bdev->blocklen; +- pt_node->pt_bdev.blockcnt = bdev->blockcnt; +- +- pt_node->pt_bdev.md_interleave = bdev->md_interleave; +- pt_node->pt_bdev.md_len = bdev->md_len; +- pt_node->pt_bdev.dif_type = bdev->dif_type; +- pt_node->pt_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md; +- pt_node->pt_bdev.dif_check_flags = bdev->dif_check_flags; +- +- /* This is the context that is passed to us when the bdev +- * layer calls in so we'll save our pt_bdev node here. 
+- */ +- pt_node->pt_bdev.ctxt = pt_node; +- pt_node->pt_bdev.fn_table = &vbdev_passthru_fn_table; +- pt_node->pt_bdev.module = &passthru_if; +- TAILQ_INSERT_TAIL(&g_pt_nodes, pt_node, link); +- +- spdk_io_device_register(pt_node, pt_bdev_ch_create_cb, pt_bdev_ch_destroy_cb, +- sizeof(struct pt_io_channel), +- name->vbdev_name); +- SPDK_NOTICELOG("io_device created at: 0x%p\n", pt_node); +- +- /* Save the thread where the base device is opened */ +- pt_node->thread = spdk_get_thread(); +- +- rc = spdk_bdev_module_claim_bdev(bdev, pt_node->base_desc, pt_node->pt_bdev.module); +- if (rc) { +- SPDK_ERRLOG("could not claim bdev %s\n", bdev_name); +- spdk_bdev_close(pt_node->base_desc); +- TAILQ_REMOVE(&g_pt_nodes, pt_node, link); +- spdk_io_device_unregister(pt_node, NULL); +- free(pt_node->pt_bdev.name); +- free(pt_node); +- break; +- } +- SPDK_NOTICELOG("bdev claimed\n"); +- +- rc = spdk_bdev_register(&pt_node->pt_bdev); +- if (rc) { +- SPDK_ERRLOG("could not register pt_bdev\n"); +- spdk_bdev_module_release_bdev(&pt_node->pt_bdev); +- spdk_bdev_close(pt_node->base_desc); +- TAILQ_REMOVE(&g_pt_nodes, pt_node, link); +- spdk_io_device_unregister(pt_node, NULL); +- free(pt_node->pt_bdev.name); +- free(pt_node); +- break; +- } +- SPDK_NOTICELOG("pt_bdev registered\n"); +- SPDK_NOTICELOG("created pt_bdev for: %s\n", name->vbdev_name); +- } +- +- return rc; +-} +- +-/* Create the passthru disk from the given bdev and vbdev name. */ +-int +-bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name) +-{ +- int rc; +- +- /* Insert the bdev name into our global name list even if it doesn't exist yet, +- * it may show up soon... +- */ +- rc = vbdev_passthru_insert_name(bdev_name, vbdev_name); +- if (rc) { +- return rc; +- } +- +- rc = vbdev_passthru_register(bdev_name); +- if (rc == -ENODEV) { +- /* This is not an error, we tracked the name above and it still +- * may show up later. +- */ +- SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n"); +- rc = 0; +- } +- +- return rc; +-} +- +-void +-bdev_passthru_delete_disk(const char *bdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +-{ +- struct bdev_names *name; +- int rc; +- +- /* Some cleanup happens in the destruct callback. */ +- rc = spdk_bdev_unregister_by_name(bdev_name, &passthru_if, cb_fn, cb_arg); +- if (rc == 0) { +- /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the +- * vbdev does not get re-created if the same bdev is constructed at some other time, +- * unless the underlying bdev was hot-removed. +- */ +- TAILQ_FOREACH(name, &g_bdev_names, link) { +- if (strcmp(name->vbdev_name, bdev_name) == 0) { +- TAILQ_REMOVE(&g_bdev_names, name, link); +- free(name->bdev_name); +- free(name->vbdev_name); +- free(name); +- break; +- } +- } +- } else { +- cb_fn(cb_arg, rc); +- } +-} +- +-/* Because we specified this function in our pt bdev function table when we +- * registered our pt bdev, we'll get this call anytime a new bdev shows up. +- * Here we need to decide if we care about it and if so what to do. We +- * parsed the config file at init so we check the new bdev against the list +- * we built up at that time and if the user configured us to attach to this +- * bdev, here's where we do it. 
+- */ +-static void +-vbdev_passthru_examine(struct spdk_bdev *bdev) +-{ +- vbdev_passthru_register(bdev->name); +- +- spdk_bdev_module_examine_done(&passthru_if); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vbdev_passthru) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++/* ++ * This is a simple example of a virtual block device module that passes IO ++ * down to a bdev (or bdevs) that its configured to attach to. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "vbdev_passthru.h" ++#include "spdk/rpc.h" ++#include "spdk/env.h" ++#include "spdk/endian.h" ++#include "spdk/string.h" ++#include "spdk/thread.h" ++#include "spdk/util.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++ ++ ++static int vbdev_passthru_init(void); ++static int vbdev_passthru_get_ctx_size(void); ++static void vbdev_passthru_examine(struct spdk_bdev *bdev); ++static void vbdev_passthru_finish(void); ++static int vbdev_passthru_config_json(struct spdk_json_write_ctx *w); ++ ++static struct spdk_bdev_module passthru_if = { ++ .name = "passthru", ++ .module_init = vbdev_passthru_init, ++ .get_ctx_size = vbdev_passthru_get_ctx_size, ++ .examine_config = vbdev_passthru_examine, ++ .module_fini = vbdev_passthru_finish, ++ .config_json = vbdev_passthru_config_json ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(passthru, &passthru_if) ++ ++/* List of pt_bdev names and their base bdevs via configuration file. ++ * Used so we can parse the conf once at init and use this list in examine(). ++ */ ++struct bdev_names { ++ char *vbdev_name; ++ char *bdev_name; ++ TAILQ_ENTRY(bdev_names) link; ++}; ++static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names); ++ ++/* List of virtual bdevs and associated info for each. */ ++struct vbdev_passthru { ++ struct spdk_bdev *base_bdev; /* the thing we're attaching to */ ++ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ ++ struct spdk_bdev pt_bdev; /* the PT virtual bdev */ ++ TAILQ_ENTRY(vbdev_passthru) link; ++ struct spdk_thread *thread; /* thread where base device is opened */ ++}; ++static TAILQ_HEAD(, vbdev_passthru) g_pt_nodes = TAILQ_HEAD_INITIALIZER(g_pt_nodes); ++ ++/* The pt vbdev channel struct. It is allocated and freed on my behalf by the io channel code. ++ * If this vbdev needed to implement a poller or a queue for IO, this is where those things ++ * would be defined. This passthru bdev doesn't actually need to allocate a channel, it could ++ * simply pass back the channel of the bdev underneath it but for example purposes we will ++ * present its own to the upper layers. ++ */ ++struct pt_io_channel { ++ struct spdk_io_channel *base_ch; /* IO channel of base device */ ++}; ++ ++/* Just for fun, this pt_bdev module doesn't need it but this is essentially a per IO ++ * context that we get handed by the bdev layer. ++ */ ++struct passthru_bdev_io { ++ uint8_t test; ++ ++ /* bdev related */ ++ struct spdk_io_channel *ch; ++ ++ /* for bdev_io_wait */ ++ struct spdk_bdev_io_wait_entry bdev_io_wait; ++}; ++ ++static void vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); ++ ++ ++/* Callback for unregistering the IO device. */ ++static void ++_device_unregister_cb(void *io_device) ++{ ++ struct vbdev_passthru *pt_node = io_device; ++ ++ /* Done with this pt_node. 
*/ ++ free(pt_node->pt_bdev.name); ++ free(pt_node); ++} ++ ++/* Wrapper for the bdev close operation. */ ++static void ++_vbdev_passthru_destruct(void *ctx) ++{ ++ struct spdk_bdev_desc *desc = ctx; ++ ++ spdk_bdev_close(desc); ++} ++ ++/* Called after we've unregistered following a hot remove callback. ++ * Our finish entry point will be called next. ++ */ ++static int ++vbdev_passthru_destruct(void *ctx) ++{ ++ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; ++ ++ /* It is important to follow this exact sequence of steps for destroying ++ * a vbdev... ++ */ ++ ++ TAILQ_REMOVE(&g_pt_nodes, pt_node, link); ++ ++ /* Unclaim the underlying bdev. */ ++ spdk_bdev_module_release_bdev(pt_node->base_bdev); ++ ++ /* Close the underlying bdev on its same opened thread. */ ++ if (pt_node->thread && pt_node->thread != spdk_get_thread()) { ++ spdk_thread_send_msg(pt_node->thread, _vbdev_passthru_destruct, pt_node->base_desc); ++ } else { ++ spdk_bdev_close(pt_node->base_desc); ++ } ++ ++ /* Unregister the io_device. */ ++ spdk_io_device_unregister(pt_node, _device_unregister_cb); ++ ++ return 0; ++} ++ ++/* Completion callback for IO that were issued from this bdev. The original bdev_io ++ * is passed in as an arg so we'll complete that one with the appropriate status ++ * and then free the one that this module issued. ++ */ ++static void ++_pt_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *orig_io = cb_arg; ++ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx; ++ ++ /* We setup this value in the submission routine, just showing here that it is ++ * passed back to us. ++ */ ++ if (io_ctx->test != 0x5a) { ++ SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n", ++ io_ctx->test); ++ } ++ ++ /* Complete the original IO and then free the one that we created here ++ * as a result of issuing an IO via submit_request. ++ */ ++ spdk_bdev_io_complete(orig_io, status); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static void ++_pt_complete_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *orig_io = cb_arg; ++ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx; ++ ++ /* We setup this value in the submission routine, just showing here that it is ++ * passed back to us. ++ */ ++ if (io_ctx->test != 0x5a) { ++ SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n", ++ io_ctx->test); ++ } ++ ++ /* Complete the original IO and then free the one that we created here ++ * as a result of issuing an IO via submit_request. 
++ */ ++ spdk_bdev_io_set_buf(orig_io, bdev_io->u.bdev.iovs[0].iov_base, bdev_io->u.bdev.iovs[0].iov_len); ++ spdk_bdev_io_complete(orig_io, status); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static void ++vbdev_passthru_resubmit_io(void *arg) ++{ ++ struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; ++ ++ vbdev_passthru_submit_request(io_ctx->ch, bdev_io); ++} ++ ++static void ++vbdev_passthru_queue_io(struct spdk_bdev_io *bdev_io) ++{ ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; ++ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(io_ctx->ch); ++ int rc; ++ ++ io_ctx->bdev_io_wait.bdev = bdev_io->bdev; ++ io_ctx->bdev_io_wait.cb_fn = vbdev_passthru_resubmit_io; ++ io_ctx->bdev_io_wait.cb_arg = bdev_io; ++ ++ /* Queue the IO using the channel of the base device. */ ++ rc = spdk_bdev_queue_io_wait(bdev_io->bdev, pt_ch->base_ch, &io_ctx->bdev_io_wait); ++ if (rc != 0) { ++ SPDK_ERRLOG("Queue io failed in vbdev_passthru_queue_io, rc=%d.\n", rc); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++/* Callback for getting a buf from the bdev pool in the event that the caller passed ++ * in NULL, we need to own the buffer so it doesn't get freed by another vbdev module ++ * beneath us before we're done with it. That won't happen in this example but it could ++ * if this example were used as a template for something more complex. ++ */ ++static void ++pt_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, ++ pt_bdev); ++ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; ++ int rc; ++ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ if (bdev_io->u.bdev.ext_opts) { ++ rc = spdk_bdev_readv_blocks_ext(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, _pt_complete_io, ++ bdev_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ rc = spdk_bdev_readv_blocks_with_md(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ } ++ ++ if (rc != 0) { ++ if (rc == -ENOMEM) { ++ SPDK_ERRLOG("No memory, start to queue io for passthru.\n"); ++ io_ctx->ch = ch; ++ vbdev_passthru_queue_io(bdev_io); ++ } else { ++ SPDK_ERRLOG("ERROR on bdev_io submission!\n"); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ } ++} ++ ++/* Called when someone above submits IO to this pt vbdev. We're simply passing it on here ++ * via SPDK IO calls which in turn allocate another bdev IO and call our cpl callback provided ++ * below along with the original bdev_io so that we can complete it once this IO completes. 
++ */ ++static void ++vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, pt_bdev); ++ struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); ++ struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; ++ int rc = 0; ++ ++ /* Setup a per IO context value; we don't do anything with it in the vbdev other ++ * than confirm we get the same thing back in the completion callback just to ++ * demonstrate. ++ */ ++ io_ctx->test = 0x5a; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, pt_read_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ if (bdev_io->u.bdev.ext_opts) { ++ rc = spdk_bdev_writev_blocks_ext(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, _pt_complete_io, ++ bdev_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ rc = spdk_bdev_writev_blocks_with_md(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ rc = spdk_bdev_write_zeroes_blocks(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ rc = spdk_bdev_unmap_blocks(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ rc = spdk_bdev_flush_blocks(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ rc = spdk_bdev_reset(pt_node->base_desc, pt_ch->base_ch, ++ _pt_complete_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_ZCOPY: ++ rc = spdk_bdev_zcopy_start(pt_node->base_desc, pt_ch->base_ch, NULL, 0, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.zcopy.populate, ++ _pt_complete_zcopy_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_ABORT: ++ rc = spdk_bdev_abort(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.abort.bio_to_abort, ++ _pt_complete_io, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ rc = spdk_bdev_copy_blocks(pt_node->base_desc, pt_ch->base_ch, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.copy.src_offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ _pt_complete_io, bdev_io); ++ break; ++ default: ++ SPDK_ERRLOG("passthru: unknown I/O type %d\n", bdev_io->type); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ if (rc != 0) { ++ if (rc == -ENOMEM) { ++ SPDK_ERRLOG("No memory, start to queue io for passthru.\n"); ++ io_ctx->ch = ch; ++ vbdev_passthru_queue_io(bdev_io); ++ } else { ++ SPDK_ERRLOG("ERROR on bdev_io submission!\n"); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ } ++} ++ ++/* We'll just call the base bdev and let it answer however if we were more ++ * restrictive for some reason (or less) we could get the response back ++ * and modify according to our purposes. 
++ */ ++static bool ++vbdev_passthru_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; ++ ++ return spdk_bdev_io_type_supported(pt_node->base_bdev, io_type); ++} ++ ++/* We supplied this as an entry point for upper layers who want to communicate to this ++ * bdev. This is how they get a channel. We are passed the same context we provided when ++ * we created our PT vbdev in examine() which, for this bdev, is the address of one of ++ * our context nodes. From here we'll ask the SPDK channel code to fill out our channel ++ * struct and we'll keep it in our PT node. ++ */ ++static struct spdk_io_channel * ++vbdev_passthru_get_io_channel(void *ctx) ++{ ++ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; ++ struct spdk_io_channel *pt_ch = NULL; ++ ++ /* The IO channel code will allocate a channel for us which consists of ++ * the SPDK channel structure plus the size of our pt_io_channel struct ++ * that we passed in when we registered our IO device. It will then call ++ * our channel create callback to populate any elements that we need to ++ * update. ++ */ ++ pt_ch = spdk_get_io_channel(pt_node); ++ ++ return pt_ch; ++} ++ ++/* This is the output for bdev_get_bdevs() for this vbdev */ ++static int ++vbdev_passthru_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; ++ ++ spdk_json_write_name(w, "passthru"); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); ++ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++/* This is used to generate JSON that can configure this module to its current state. */ ++static int ++vbdev_passthru_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct vbdev_passthru *pt_node; ++ ++ TAILQ_FOREACH(pt_node, &g_pt_nodes, link) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_passthru_create"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); ++ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&pt_node->pt_bdev)); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ } ++ return 0; ++} ++ ++/* We provide this callback for the SPDK channel code to create a channel using ++ * the channel struct we provided in our module get_io_channel() entry point. Here ++ * we get and save off an underlying base channel of the device below us so that ++ * we can communicate with the base bdev on a per channel basis. If we needed ++ * our own poller for this vbdev, we'd register it here. ++ */ ++static int ++pt_bdev_ch_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct pt_io_channel *pt_ch = ctx_buf; ++ struct vbdev_passthru *pt_node = io_device; ++ ++ pt_ch->base_ch = spdk_bdev_get_io_channel(pt_node->base_desc); ++ ++ return 0; ++} ++ ++/* We provide this callback for the SPDK channel code to destroy a channel ++ * created with our create callback. We just need to undo anything we did ++ * when we created. If this bdev used its own poller, we'd unregister it here. 
++ */ ++static void ++pt_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct pt_io_channel *pt_ch = ctx_buf; ++ ++ spdk_put_io_channel(pt_ch->base_ch); ++} ++ ++/* Create the passthru association from the bdev and vbdev name and insert ++ * on the global list. */ ++static int ++vbdev_passthru_insert_name(const char *bdev_name, const char *vbdev_name) ++{ ++ struct bdev_names *name; ++ ++ TAILQ_FOREACH(name, &g_bdev_names, link) { ++ if (strcmp(vbdev_name, name->vbdev_name) == 0) { ++ SPDK_ERRLOG("passthru bdev %s already exists\n", vbdev_name); ++ return -EEXIST; ++ } ++ } ++ ++ name = calloc(1, sizeof(struct bdev_names)); ++ if (!name) { ++ SPDK_ERRLOG("could not allocate bdev_names\n"); ++ return -ENOMEM; ++ } ++ ++ name->bdev_name = strdup(bdev_name); ++ if (!name->bdev_name) { ++ SPDK_ERRLOG("could not allocate name->bdev_name\n"); ++ free(name); ++ return -ENOMEM; ++ } ++ ++ name->vbdev_name = strdup(vbdev_name); ++ if (!name->vbdev_name) { ++ SPDK_ERRLOG("could not allocate name->vbdev_name\n"); ++ free(name->bdev_name); ++ free(name); ++ return -ENOMEM; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_bdev_names, name, link); ++ ++ return 0; ++} ++ ++/* On init, just perform bdev module specific initialization. */ ++static int ++vbdev_passthru_init(void) ++{ ++ return 0; ++} ++ ++/* Called when the entire module is being torn down. */ ++static void ++vbdev_passthru_finish(void) ++{ ++ struct bdev_names *name; ++ ++ while ((name = TAILQ_FIRST(&g_bdev_names))) { ++ TAILQ_REMOVE(&g_bdev_names, name, link); ++ free(name->bdev_name); ++ free(name->vbdev_name); ++ free(name); ++ } ++} ++ ++/* During init we'll be asked how much memory we'd like passed to us ++ * in bev_io structures as context. Here's where we specify how ++ * much context we want per IO. ++ */ ++static int ++vbdev_passthru_get_ctx_size(void) ++{ ++ return sizeof(struct passthru_bdev_io); ++} ++ ++/* Where vbdev_passthru_config_json() is used to generate per module JSON config data, this ++ * function is called to output any per bdev specific methods. For the PT module, there are ++ * none. ++ */ ++static void ++vbdev_passthru_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ /* No config per bdev needed */ ++} ++ ++static int ++vbdev_passthru_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) ++{ ++ struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; ++ ++ /* Passthru bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */ ++ return spdk_bdev_get_memory_domains(pt_node->base_bdev, domains, array_size); ++} ++ ++/* When we register our bdev this is how we specify our entry points. */ ++static const struct spdk_bdev_fn_table vbdev_passthru_fn_table = { ++ .destruct = vbdev_passthru_destruct, ++ .submit_request = vbdev_passthru_submit_request, ++ .io_type_supported = vbdev_passthru_io_type_supported, ++ .get_io_channel = vbdev_passthru_get_io_channel, ++ .dump_info_json = vbdev_passthru_dump_info_json, ++ .write_config_json = vbdev_passthru_write_config_json, ++ .get_memory_domains = vbdev_passthru_get_memory_domains, ++}; ++ ++static void ++vbdev_passthru_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find) ++{ ++ struct vbdev_passthru *pt_node, *tmp; ++ ++ TAILQ_FOREACH_SAFE(pt_node, &g_pt_nodes, link, tmp) { ++ if (bdev_find == pt_node->base_bdev) { ++ spdk_bdev_unregister(&pt_node->pt_bdev, NULL, NULL); ++ } ++ } ++} ++ ++/* Called when the underlying base bdev triggers asynchronous event such as bdev removal. 
*/ ++static void ++vbdev_passthru_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ vbdev_passthru_base_bdev_hotremove_cb(bdev); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++/* Create and register the passthru vbdev if we find it in our list of bdev names. ++ * This can be called either by the examine path or RPC method. ++ */ ++static int ++vbdev_passthru_register(const char *bdev_name) ++{ ++ struct bdev_names *name; ++ struct vbdev_passthru *pt_node; ++ struct spdk_bdev *bdev; ++ int rc = 0; ++ ++ /* Check our list of names from config versus this bdev and if ++ * there's a match, create the pt_node & bdev accordingly. ++ */ ++ TAILQ_FOREACH(name, &g_bdev_names, link) { ++ if (strcmp(name->bdev_name, bdev_name) != 0) { ++ continue; ++ } ++ ++ SPDK_NOTICELOG("Match on %s\n", bdev_name); ++ pt_node = calloc(1, sizeof(struct vbdev_passthru)); ++ if (!pt_node) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("could not allocate pt_node\n"); ++ break; ++ } ++ ++ pt_node->pt_bdev.name = strdup(name->vbdev_name); ++ if (!pt_node->pt_bdev.name) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("could not allocate pt_bdev name\n"); ++ free(pt_node); ++ break; ++ } ++ pt_node->pt_bdev.product_name = "passthru"; ++ ++ /* The base bdev that we're attaching to. */ ++ rc = spdk_bdev_open_ext(bdev_name, true, vbdev_passthru_base_bdev_event_cb, ++ NULL, &pt_node->base_desc); ++ if (rc) { ++ if (rc != -ENODEV) { ++ SPDK_ERRLOG("could not open bdev %s\n", bdev_name); ++ } ++ free(pt_node->pt_bdev.name); ++ free(pt_node); ++ break; ++ } ++ SPDK_NOTICELOG("base bdev opened\n"); ++ ++ bdev = spdk_bdev_desc_get_bdev(pt_node->base_desc); ++ pt_node->base_bdev = bdev; ++ ++ /* Copy some properties from the underlying base bdev. */ ++ pt_node->pt_bdev.write_cache = bdev->write_cache; ++ pt_node->pt_bdev.required_alignment = bdev->required_alignment; ++ pt_node->pt_bdev.optimal_io_boundary = bdev->optimal_io_boundary; ++ pt_node->pt_bdev.blocklen = bdev->blocklen; ++ pt_node->pt_bdev.blockcnt = bdev->blockcnt; ++ ++ pt_node->pt_bdev.md_interleave = bdev->md_interleave; ++ pt_node->pt_bdev.md_len = bdev->md_len; ++ pt_node->pt_bdev.dif_type = bdev->dif_type; ++ pt_node->pt_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md; ++ pt_node->pt_bdev.dif_check_flags = bdev->dif_check_flags; ++ ++ /* This is the context that is passed to us when the bdev ++ * layer calls in so we'll save our pt_bdev node here. 
++ */ ++ pt_node->pt_bdev.ctxt = pt_node; ++ pt_node->pt_bdev.fn_table = &vbdev_passthru_fn_table; ++ pt_node->pt_bdev.module = &passthru_if; ++ TAILQ_INSERT_TAIL(&g_pt_nodes, pt_node, link); ++ ++ spdk_io_device_register(pt_node, pt_bdev_ch_create_cb, pt_bdev_ch_destroy_cb, ++ sizeof(struct pt_io_channel), ++ name->vbdev_name); ++ SPDK_NOTICELOG("io_device created at: 0x%p\n", pt_node); ++ ++ /* Save the thread where the base device is opened */ ++ pt_node->thread = spdk_get_thread(); ++ ++ rc = spdk_bdev_module_claim_bdev(bdev, pt_node->base_desc, pt_node->pt_bdev.module); ++ if (rc) { ++ SPDK_ERRLOG("could not claim bdev %s\n", bdev_name); ++ spdk_bdev_close(pt_node->base_desc); ++ TAILQ_REMOVE(&g_pt_nodes, pt_node, link); ++ spdk_io_device_unregister(pt_node, NULL); ++ free(pt_node->pt_bdev.name); ++ free(pt_node); ++ break; ++ } ++ SPDK_NOTICELOG("bdev claimed\n"); ++ ++ rc = spdk_bdev_register(&pt_node->pt_bdev); ++ if (rc) { ++ SPDK_ERRLOG("could not register pt_bdev\n"); ++ spdk_bdev_module_release_bdev(&pt_node->pt_bdev); ++ spdk_bdev_close(pt_node->base_desc); ++ TAILQ_REMOVE(&g_pt_nodes, pt_node, link); ++ spdk_io_device_unregister(pt_node, NULL); ++ free(pt_node->pt_bdev.name); ++ free(pt_node); ++ break; ++ } ++ SPDK_NOTICELOG("pt_bdev registered\n"); ++ SPDK_NOTICELOG("created pt_bdev for: %s\n", name->vbdev_name); ++ } ++ ++ return rc; ++} ++ ++/* Create the passthru disk from the given bdev and vbdev name. */ ++int ++bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name) ++{ ++ int rc; ++ ++ /* Insert the bdev name into our global name list even if it doesn't exist yet, ++ * it may show up soon... ++ */ ++ rc = vbdev_passthru_insert_name(bdev_name, vbdev_name); ++ if (rc) { ++ return rc; ++ } ++ ++ rc = vbdev_passthru_register(bdev_name); ++ if (rc == -ENODEV) { ++ /* This is not an error, we tracked the name above and it still ++ * may show up later. ++ */ ++ SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n"); ++ rc = 0; ++ } ++ ++ return rc; ++} ++ ++void ++bdev_passthru_delete_disk(const char *bdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg) ++{ ++ struct bdev_names *name; ++ int rc; ++ ++ /* Some cleanup happens in the destruct callback. */ ++ rc = spdk_bdev_unregister_by_name(bdev_name, &passthru_if, cb_fn, cb_arg); ++ if (rc == 0) { ++ /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the ++ * vbdev does not get re-created if the same bdev is constructed at some other time, ++ * unless the underlying bdev was hot-removed. ++ */ ++ TAILQ_FOREACH(name, &g_bdev_names, link) { ++ if (strcmp(name->vbdev_name, bdev_name) == 0) { ++ TAILQ_REMOVE(&g_bdev_names, name, link); ++ free(name->bdev_name); ++ free(name->vbdev_name); ++ free(name); ++ break; ++ } ++ } ++ } else { ++ cb_fn(cb_arg, rc); ++ } ++} ++ ++/* Because we specified this function in our pt bdev function table when we ++ * registered our pt bdev, we'll get this call anytime a new bdev shows up. ++ * Here we need to decide if we care about it and if so what to do. We ++ * parsed the config file at init so we check the new bdev against the list ++ * we built up at that time and if the user configured us to attach to this ++ * bdev, here's where we do it. 
++ */ ++static void ++vbdev_passthru_examine(struct spdk_bdev *bdev) ++{ ++ vbdev_passthru_register(bdev->name); ++ ++ spdk_bdev_module_examine_done(&passthru_if); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vbdev_passthru) +diff --git a/module/bdev/passthru/vbdev_passthru.h b/module/bdev/passthru/vbdev_passthru.h +index 5450fbb..233b43a 100644 +--- a/module/bdev/passthru/vbdev_passthru.h ++++ b/module/bdev/passthru/vbdev_passthru.h +@@ -1,33 +1,33 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_VBDEV_PASSTHRU_H +-#define SPDK_VBDEV_PASSTHRU_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +- +-/** +- * Create new pass through bdev. +- * +- * \param bdev_name Bdev on which pass through vbdev will be created. +- * \param vbdev_name Name of the pass through bdev. +- * \return 0 on success, other on failure. +- */ +-int bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name); +- +-/** +- * Delete passthru bdev. +- * +- * \param bdev_name Name of the pass through bdev. +- * \param cb_fn Function to call after deletion. +- * \param cb_arg Argument to pass to cb_fn. +- */ +-void bdev_passthru_delete_disk(const char *bdev_name, spdk_bdev_unregister_cb cb_fn, +- void *cb_arg); +- +-#endif /* SPDK_VBDEV_PASSTHRU_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_VBDEV_PASSTHRU_H ++#define SPDK_VBDEV_PASSTHRU_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++ ++/** ++ * Create new pass through bdev. ++ * ++ * \param bdev_name Bdev on which pass through vbdev will be created. ++ * \param vbdev_name Name of the pass through bdev. ++ * \return 0 on success, other on failure. ++ */ ++int bdev_passthru_create_disk(const char *bdev_name, const char *vbdev_name); ++ ++/** ++ * Delete passthru bdev. ++ * ++ * \param bdev_name Name of the pass through bdev. ++ * \param cb_fn Function to call after deletion. ++ * \param cb_arg Argument to pass to cb_fn. ++ */ ++void bdev_passthru_delete_disk(const char *bdev_name, spdk_bdev_unregister_cb cb_fn, ++ void *cb_arg); ++ ++#endif /* SPDK_VBDEV_PASSTHRU_H */ +diff --git a/module/bdev/passthru/vbdev_passthru_rpc.c b/module/bdev/passthru/vbdev_passthru_rpc.c +index 6fad37e..f03c027 100644 +--- a/module/bdev/passthru/vbdev_passthru_rpc.c ++++ b/module/bdev/passthru/vbdev_passthru_rpc.c +@@ -1,112 +1,112 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "vbdev_passthru.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_bdev_passthru_create { +- char *base_bdev_name; +- char *name; +-}; +- +-/* Free the allocated memory resource after the RPC handling. */ +-static void +-free_rpc_bdev_passthru_create(struct rpc_bdev_passthru_create *r) +-{ +- free(r->base_bdev_name); +- free(r->name); +-} +- +-/* Structure to decode the input parameters for this RPC method. 
*/ +-static const struct spdk_json_object_decoder rpc_bdev_passthru_create_decoders[] = { +- {"base_bdev_name", offsetof(struct rpc_bdev_passthru_create, base_bdev_name), spdk_json_decode_string}, +- {"name", offsetof(struct rpc_bdev_passthru_create, name), spdk_json_decode_string}, +-}; +- +-/* Decode the parameters for this RPC method and properly construct the passthru +- * device. Error status returned in the failed cases. +- */ +-static void +-rpc_bdev_passthru_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_passthru_create req = {NULL}; +- struct spdk_json_write_ctx *w; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_passthru_create_decoders, +- SPDK_COUNTOF(rpc_bdev_passthru_create_decoders), +- &req)) { +- SPDK_DEBUGLOG(vbdev_passthru, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_passthru_create_disk(req.base_bdev_name, req.name); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, req.name); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_bdev_passthru_create(&req); +-} +-SPDK_RPC_REGISTER("bdev_passthru_create", rpc_bdev_passthru_create, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_passthru_delete { +- char *name; +-}; +- +-static void +-free_rpc_bdev_passthru_delete(struct rpc_bdev_passthru_delete *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_passthru_delete_decoders[] = { +- {"name", offsetof(struct rpc_bdev_passthru_delete, name), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_passthru_delete_cb(void *cb_arg, int bdeverrno) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (bdeverrno == 0) { +- spdk_jsonrpc_send_bool_response(request, true); +- } else { +- spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); +- } +-} +- +-static void +-rpc_bdev_passthru_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_passthru_delete req = {NULL}; +- +- if (spdk_json_decode_object(params, rpc_bdev_passthru_delete_decoders, +- SPDK_COUNTOF(rpc_bdev_passthru_delete_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev_passthru_delete_disk(req.name, rpc_bdev_passthru_delete_cb, request); +- +-cleanup: +- free_rpc_bdev_passthru_delete(&req); +-} +-SPDK_RPC_REGISTER("bdev_passthru_delete", rpc_bdev_passthru_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "vbdev_passthru.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_bdev_passthru_create { ++ char *base_bdev_name; ++ char *name; ++}; ++ ++/* Free the allocated memory resource after the RPC handling. */ ++static void ++free_rpc_bdev_passthru_create(struct rpc_bdev_passthru_create *r) ++{ ++ free(r->base_bdev_name); ++ free(r->name); ++} ++ ++/* Structure to decode the input parameters for this RPC method. 
*/ ++static const struct spdk_json_object_decoder rpc_bdev_passthru_create_decoders[] = { ++ {"base_bdev_name", offsetof(struct rpc_bdev_passthru_create, base_bdev_name), spdk_json_decode_string}, ++ {"name", offsetof(struct rpc_bdev_passthru_create, name), spdk_json_decode_string}, ++}; ++ ++/* Decode the parameters for this RPC method and properly construct the passthru ++ * device. Error status returned in the failed cases. ++ */ ++static void ++rpc_bdev_passthru_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_passthru_create req = {NULL}; ++ struct spdk_json_write_ctx *w; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_passthru_create_decoders, ++ SPDK_COUNTOF(rpc_bdev_passthru_create_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(vbdev_passthru, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_passthru_create_disk(req.base_bdev_name, req.name); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, req.name); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_bdev_passthru_create(&req); ++} ++SPDK_RPC_REGISTER("bdev_passthru_create", rpc_bdev_passthru_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_passthru_delete { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_passthru_delete(struct rpc_bdev_passthru_delete *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_passthru_delete_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_passthru_delete, name), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_passthru_delete_cb(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (bdeverrno == 0) { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); ++ } ++} ++ ++static void ++rpc_bdev_passthru_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_passthru_delete req = {NULL}; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_passthru_delete_decoders, ++ SPDK_COUNTOF(rpc_bdev_passthru_delete_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev_passthru_delete_disk(req.name, rpc_bdev_passthru_delete_cb, request); ++ ++cleanup: ++ free_rpc_bdev_passthru_delete(&req); ++} ++SPDK_RPC_REGISTER("bdev_passthru_delete", rpc_bdev_passthru_delete, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/pmem/Makefile b/module/bdev/pmem/Makefile +index 430e1c5..c097aba 100644 +--- a/module/bdev/pmem/Makefile ++++ b/module/bdev/pmem/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = bdev_pmem.c bdev_pmem_rpc.c +-LIBNAME = bdev_pmem +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. 
++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = bdev_pmem.c bdev_pmem_rpc.c ++LIBNAME = bdev_pmem ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/pmem/bdev_pmem.c b/module/bdev/pmem/bdev_pmem.c +index 6026fb3..b54c2a4 100644 +--- a/module/bdev/pmem/bdev_pmem.c ++++ b/module/bdev/pmem/bdev_pmem.c +@@ -1,412 +1,412 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/string.h" +-#include "spdk/likely.h" +-#include "spdk/util.h" +-#include "spdk/rpc.h" +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +-#include "spdk/config.h" +- +-#include "bdev_pmem.h" +-#include "libpmemblk.h" +- +-struct pmem_disk { +- struct spdk_bdev disk; +- PMEMblkpool *pool; +- char pmem_file[NAME_MAX]; +- TAILQ_ENTRY(pmem_disk) tailq; +-}; +- +-static TAILQ_HEAD(, pmem_disk) g_pmem_disks = TAILQ_HEAD_INITIALIZER(g_pmem_disks); +- +-static int bdev_pmem_initialize(void); +-static void bdev_pmem_finish(void); +- +-static struct spdk_bdev_module pmem_if = { +- .name = "pmem", +- .module_init = bdev_pmem_initialize, +- .module_fini = bdev_pmem_finish, +- .async_fini = true, +- +-}; +- +-SPDK_BDEV_MODULE_REGISTER(pmem, &pmem_if) +- +-typedef int(*spdk_bdev_pmem_io_request)(PMEMblkpool *pbp, void *buf, long long blockno); +- +-static int +-_bdev_pmem_submit_io_read(PMEMblkpool *pbp, void *buf, long long blockno) +-{ +- return pmemblk_read(pbp, buf, blockno); +-} +- +-static int +-_bdev_pmem_submit_io_write(PMEMblkpool *pbp, void *buf, long long blockno) +-{ +- return pmemblk_write(pbp, buf, blockno); +-} +- +-static int +-bdev_pmem_destruct(void *ctx) +-{ +- struct pmem_disk *pdisk = ctx; +- +- TAILQ_REMOVE(&g_pmem_disks, pdisk, tailq); +- free(pdisk->disk.name); +- pmemblk_close(pdisk->pool); +- free(pdisk); +- +- return 0; +-} +- +-static int +-bdev_pmem_check_iov_len(struct iovec *iovs, int iovcnt, size_t num_blocks, uint32_t block_size) +-{ +- size_t nbytes = num_blocks * block_size; +- int i; +- +- for (i = 0; i < iovcnt; i++) { +- if (spdk_unlikely(iovs[i].iov_base == NULL && iovs[i].iov_len != 0)) { +- return -1; +- } +- +- if (nbytes <= iovs[i].iov_len) { +- return 0; +- } +- +- if (spdk_unlikely(iovs[i].iov_len % block_size != 0)) { +- return -1; +- } +- +- nbytes -= iovs[i].iov_len; +- } +- +- return -1; +-} +- +-static void +-bdev_pmem_submit_io(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, +- struct spdk_io_channel *ch, +- struct iovec *iov, int iovcnt, +- uint64_t offset_blocks, size_t num_blocks, uint32_t block_size, +- spdk_bdev_pmem_io_request fn) +-{ +- int rc; +- size_t nbytes, offset, len; +- enum spdk_bdev_io_status status; +- +- rc = bdev_pmem_check_iov_len(iov, iovcnt, num_blocks, block_size); +- if (rc) { +- status = SPDK_BDEV_IO_STATUS_FAILED; +- goto end; +- } +- +- SPDK_DEBUGLOG(bdev_pmem, "io %lu bytes from offset %#lx\n", +- num_blocks, offset_blocks); +- +- for (nbytes = num_blocks * block_size; nbytes > 0; iov++) { +- len = spdk_min(iov->iov_len, nbytes); +- nbytes -= len; +- +- offset = 0; +- while (offset != len) { +- rc = fn(pdisk->pool, iov->iov_base + offset, offset_blocks); +- if (rc != 0) { +- SPDK_ERRLOG("pmemblk io failed: %d (%s)\n", errno, pmemblk_errormsg()); +- status = SPDK_BDEV_IO_STATUS_FAILED; +- goto end; +- } +- +- offset += block_size; +- offset_blocks++; +- } +- } +- +- assert(num_blocks == 
offset_blocks - bdev_io->u.bdev.offset_blocks); +- status = SPDK_BDEV_IO_STATUS_SUCCESS; +-end: +- +- spdk_bdev_io_complete(bdev_io, status); +-} +- +-static void +-bdev_pmem_write_zeros(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, +- struct spdk_io_channel *ch, uint64_t offset_blocks, +- uint64_t num_blocks, uint32_t block_size) +-{ +- int rc; +- enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- while (num_blocks > 0) { +- rc = pmemblk_set_zero(pdisk->pool, offset_blocks); +- if (rc != 0) { +- SPDK_ERRLOG("pmemblk_set_zero failed: %d (%s)\n", errno, pmemblk_errormsg()); +- status = SPDK_BDEV_IO_STATUS_FAILED; +- break; +- } +- offset_blocks++; +- num_blocks--; +- } +- spdk_bdev_io_complete(bdev_io, status); +-} +- +-static void +-bdev_pmem_io_get_buf_cb(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- bdev_pmem_submit_io(bdev_io, +- bdev_io->bdev->ctxt, +- channel, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- bdev_io->bdev->blocklen, +- _bdev_pmem_submit_io_read); +-} +- +-static void +-bdev_pmem_submit_request(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, bdev_pmem_io_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- bdev_pmem_submit_io(bdev_io, +- bdev_io->bdev->ctxt, +- channel, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- bdev_io->bdev->blocklen, +- _bdev_pmem_submit_io_write); +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- bdev_pmem_write_zeros(bdev_io, +- bdev_io->bdev->ctxt, +- channel, +- bdev_io->u.bdev.offset_blocks, +- bdev_io->u.bdev.num_blocks, +- bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_RESET: +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- break; +- default: +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static bool +-bdev_pmem_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- return true; +- default: +- return false; +- } +-} +- +-static struct spdk_io_channel * +-bdev_pmem_get_io_channel(void *ctx) +-{ +- return spdk_get_io_channel(&g_pmem_disks); +-} +- +-static int +-bdev_pmem_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct pmem_disk *pdisk = ctx; +- +- spdk_json_write_named_object_begin(w, "pmem"); +- spdk_json_write_named_string(w, "pmem_file", pdisk->pmem_file); +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-static int +-bdev_pmem_create_cb(void *io_device, void *ctx_buf) +-{ +- return 0; +-} +- +-static void +-bdev_pmem_destroy_cb(void *io_device, void *ctx_buf) +-{ +-} +- +-static void +-bdev_pmem_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct pmem_disk *disk = bdev->ctxt; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_pmem_create"); +- +- spdk_json_write_named_object_begin(w, 
"params"); +- spdk_json_write_named_string(w, "name", bdev->name); +- spdk_json_write_named_string(w, "pmem_file", disk->pmem_file); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static const struct spdk_bdev_fn_table pmem_fn_table = { +- .destruct = bdev_pmem_destruct, +- .submit_request = bdev_pmem_submit_request, +- .io_type_supported = bdev_pmem_io_type_supported, +- .get_io_channel = bdev_pmem_get_io_channel, +- .dump_info_json = bdev_pmem_dump_info_json, +- .write_config_json = bdev_pmem_write_config_json, +-}; +- +-int +-create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev) +-{ +- uint64_t num_blocks; +- uint32_t block_size; +- struct pmem_disk *pdisk; +- int rc; +- +- *bdev = NULL; +- +- if (name == NULL) { +- SPDK_ERRLOG("Missing name parameter for create_pmem_disk()\n"); +- return -EINVAL; +- } +- +- if (pmemblk_check(pmem_file, 0) != 1) { +- SPDK_ERRLOG("Pool '%s' check failed: %s\n", pmem_file, pmemblk_errormsg()); +- return -EIO; +- } +- +- pdisk = calloc(1, sizeof(*pdisk)); +- if (!pdisk) { +- return -ENOMEM; +- } +- +- snprintf(pdisk->pmem_file, sizeof(pdisk->pmem_file), "%s", pmem_file); +- pdisk->pool = pmemblk_open(pmem_file, 0); +- if (!pdisk->pool) { +- SPDK_ERRLOG("Opening pmem pool '%s' failed: %d\n", pmem_file, errno); +- free(pdisk); +- return -errno; +- } +- +- block_size = pmemblk_bsize(pdisk->pool); +- num_blocks = pmemblk_nblock(pdisk->pool); +- +- if (block_size == 0) { +- SPDK_ERRLOG("Block size must be more than 0 bytes\n"); +- pmemblk_close(pdisk->pool); +- free(pdisk); +- return -EINVAL; +- } +- +- if (num_blocks == 0) { +- SPDK_ERRLOG("Disk must be more than 0 blocks\n"); +- pmemblk_close(pdisk->pool); +- free(pdisk); +- return -EINVAL; +- } +- +- pdisk->disk.name = strdup(name); +- if (!pdisk->disk.name) { +- pmemblk_close(pdisk->pool); +- free(pdisk); +- return -ENOMEM; +- } +- +- pdisk->disk.product_name = "pmemblk disk"; +- pdisk->disk.write_cache = 0; +- pdisk->disk.blocklen = block_size; +- pdisk->disk.blockcnt = num_blocks; +- +- pdisk->disk.ctxt = pdisk; +- pdisk->disk.fn_table = &pmem_fn_table; +- pdisk->disk.module = &pmem_if; +- +- rc = spdk_bdev_register(&pdisk->disk); +- if (rc) { +- pmemblk_close(pdisk->pool); +- free(pdisk->disk.name); +- free(pdisk); +- return rc; +- } +- +- TAILQ_INSERT_TAIL(&g_pmem_disks, pdisk, tailq); +- +- *bdev = &pdisk->disk; +- +- return 0; +-} +- +-void +-delete_pmem_disk(const char *name, spdk_delete_pmem_complete cb_fn, void *cb_arg) +-{ +- int rc; +- +- rc = spdk_bdev_unregister_by_name(name, &pmem_if, cb_fn, cb_arg); +- if (rc != 0) { +- cb_fn(cb_arg, rc); +- } +-} +- +-SPDK_LOG_DEPRECATION_REGISTER(bdev_pmem, "PMDK libpmemblk bdev_pmem integration", "SPDK 23.05", 0); +- +-static int +-bdev_pmem_initialize(void) +-{ +- const char *err = pmemblk_check_version(PMEMBLK_MAJOR_VERSION, PMEMBLK_MINOR_VERSION); +- +- SPDK_LOG_DEPRECATED(bdev_pmem); +- if (err != NULL) { +- SPDK_ERRLOG("Invalid libpmemblk version (expected %d.%d): %s\n", PMEMBLK_MAJOR_VERSION, +- PMEMBLK_MINOR_VERSION, err); +- return -1; +- } +- +-#ifdef SPDK_CONFIG_DEBUG +- setenv("PMEMBLK_LOG_LEVEL", "1", 1); +-#endif +- spdk_io_device_register(&g_pmem_disks, bdev_pmem_create_cb, bdev_pmem_destroy_cb, 0, "pmem_bdev"); +- +- return 0; +- +-} +- +-static void +-bdev_pmem_finish_done(void *io_device) +-{ +- spdk_bdev_module_fini_done(); +-} +- +-static void +-bdev_pmem_finish(void) +-{ +- spdk_io_device_unregister(&g_pmem_disks, bdev_pmem_finish_done); +-} +- 
+-SPDK_LOG_REGISTER_COMPONENT(bdev_pmem) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/string.h" ++#include "spdk/likely.h" ++#include "spdk/util.h" ++#include "spdk/rpc.h" ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++#include "spdk/config.h" ++ ++#include "bdev_pmem.h" ++#include "libpmemblk.h" ++ ++struct pmem_disk { ++ struct spdk_bdev disk; ++ PMEMblkpool *pool; ++ char pmem_file[NAME_MAX]; ++ TAILQ_ENTRY(pmem_disk) tailq; ++}; ++ ++static TAILQ_HEAD(, pmem_disk) g_pmem_disks = TAILQ_HEAD_INITIALIZER(g_pmem_disks); ++ ++static int bdev_pmem_initialize(void); ++static void bdev_pmem_finish(void); ++ ++static struct spdk_bdev_module pmem_if = { ++ .name = "pmem", ++ .module_init = bdev_pmem_initialize, ++ .module_fini = bdev_pmem_finish, ++ .async_fini = true, ++ ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(pmem, &pmem_if) ++ ++typedef int(*spdk_bdev_pmem_io_request)(PMEMblkpool *pbp, void *buf, long long blockno); ++ ++static int ++_bdev_pmem_submit_io_read(PMEMblkpool *pbp, void *buf, long long blockno) ++{ ++ return pmemblk_read(pbp, buf, blockno); ++} ++ ++static int ++_bdev_pmem_submit_io_write(PMEMblkpool *pbp, void *buf, long long blockno) ++{ ++ return pmemblk_write(pbp, buf, blockno); ++} ++ ++static int ++bdev_pmem_destruct(void *ctx) ++{ ++ struct pmem_disk *pdisk = ctx; ++ ++ TAILQ_REMOVE(&g_pmem_disks, pdisk, tailq); ++ free(pdisk->disk.name); ++ pmemblk_close(pdisk->pool); ++ free(pdisk); ++ ++ return 0; ++} ++ ++static int ++bdev_pmem_check_iov_len(struct iovec *iovs, int iovcnt, size_t num_blocks, uint32_t block_size) ++{ ++ size_t nbytes = num_blocks * block_size; ++ int i; ++ ++ for (i = 0; i < iovcnt; i++) { ++ if (spdk_unlikely(iovs[i].iov_base == NULL && iovs[i].iov_len != 0)) { ++ return -1; ++ } ++ ++ if (nbytes <= iovs[i].iov_len) { ++ return 0; ++ } ++ ++ if (spdk_unlikely(iovs[i].iov_len % block_size != 0)) { ++ return -1; ++ } ++ ++ nbytes -= iovs[i].iov_len; ++ } ++ ++ return -1; ++} ++ ++static void ++bdev_pmem_submit_io(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, ++ struct spdk_io_channel *ch, ++ struct iovec *iov, int iovcnt, ++ uint64_t offset_blocks, size_t num_blocks, uint32_t block_size, ++ spdk_bdev_pmem_io_request fn) ++{ ++ int rc; ++ size_t nbytes, offset, len; ++ enum spdk_bdev_io_status status; ++ ++ rc = bdev_pmem_check_iov_len(iov, iovcnt, num_blocks, block_size); ++ if (rc) { ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ goto end; ++ } ++ ++ SPDK_DEBUGLOG(bdev_pmem, "io %lu bytes from offset %#lx\n", ++ num_blocks, offset_blocks); ++ ++ for (nbytes = num_blocks * block_size; nbytes > 0; iov++) { ++ len = spdk_min(iov->iov_len, nbytes); ++ nbytes -= len; ++ ++ offset = 0; ++ while (offset != len) { ++ rc = fn(pdisk->pool, iov->iov_base + offset, offset_blocks); ++ if (rc != 0) { ++ SPDK_ERRLOG("pmemblk io failed: %d (%s)\n", errno, pmemblk_errormsg()); ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ goto end; ++ } ++ ++ offset += block_size; ++ offset_blocks++; ++ } ++ } ++ ++ assert(num_blocks == offset_blocks - bdev_io->u.bdev.offset_blocks); ++ status = SPDK_BDEV_IO_STATUS_SUCCESS; ++end: ++ ++ spdk_bdev_io_complete(bdev_io, status); ++} ++ ++static void ++bdev_pmem_write_zeros(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, ++ struct spdk_io_channel *ch, uint64_t offset_blocks, ++ uint64_t num_blocks, uint32_t block_size) ++{ ++ int rc; ++ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ while (num_blocks > 0) { ++ rc 
= pmemblk_set_zero(pdisk->pool, offset_blocks); ++ if (rc != 0) { ++ SPDK_ERRLOG("pmemblk_set_zero failed: %d (%s)\n", errno, pmemblk_errormsg()); ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ break; ++ } ++ offset_blocks++; ++ num_blocks--; ++ } ++ spdk_bdev_io_complete(bdev_io, status); ++} ++ ++static void ++bdev_pmem_io_get_buf_cb(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ bdev_pmem_submit_io(bdev_io, ++ bdev_io->bdev->ctxt, ++ channel, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->bdev->blocklen, ++ _bdev_pmem_submit_io_read); ++} ++ ++static void ++bdev_pmem_submit_request(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, bdev_pmem_io_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ bdev_pmem_submit_io(bdev_io, ++ bdev_io->bdev->ctxt, ++ channel, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->bdev->blocklen, ++ _bdev_pmem_submit_io_write); ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ bdev_pmem_write_zeros(bdev_io, ++ bdev_io->bdev->ctxt, ++ channel, ++ bdev_io->u.bdev.offset_blocks, ++ bdev_io->u.bdev.num_blocks, ++ bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ break; ++ default: ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static bool ++bdev_pmem_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static struct spdk_io_channel * ++bdev_pmem_get_io_channel(void *ctx) ++{ ++ return spdk_get_io_channel(&g_pmem_disks); ++} ++ ++static int ++bdev_pmem_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct pmem_disk *pdisk = ctx; ++ ++ spdk_json_write_named_object_begin(w, "pmem"); ++ spdk_json_write_named_string(w, "pmem_file", pdisk->pmem_file); ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++static int ++bdev_pmem_create_cb(void *io_device, void *ctx_buf) ++{ ++ return 0; ++} ++ ++static void ++bdev_pmem_destroy_cb(void *io_device, void *ctx_buf) ++{ ++} ++ ++static void ++bdev_pmem_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct pmem_disk *disk = bdev->ctxt; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_pmem_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bdev->name); ++ spdk_json_write_named_string(w, "pmem_file", disk->pmem_file); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static const struct spdk_bdev_fn_table pmem_fn_table = { ++ .destruct = bdev_pmem_destruct, ++ .submit_request = bdev_pmem_submit_request, ++ .io_type_supported = bdev_pmem_io_type_supported, ++ .get_io_channel = bdev_pmem_get_io_channel, ++ 
.dump_info_json = bdev_pmem_dump_info_json, ++ .write_config_json = bdev_pmem_write_config_json, ++}; ++ ++int ++create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev) ++{ ++ uint64_t num_blocks; ++ uint32_t block_size; ++ struct pmem_disk *pdisk; ++ int rc; ++ ++ *bdev = NULL; ++ ++ if (name == NULL) { ++ SPDK_ERRLOG("Missing name parameter for create_pmem_disk()\n"); ++ return -EINVAL; ++ } ++ ++ if (pmemblk_check(pmem_file, 0) != 1) { ++ SPDK_ERRLOG("Pool '%s' check failed: %s\n", pmem_file, pmemblk_errormsg()); ++ return -EIO; ++ } ++ ++ pdisk = calloc(1, sizeof(*pdisk)); ++ if (!pdisk) { ++ return -ENOMEM; ++ } ++ ++ snprintf(pdisk->pmem_file, sizeof(pdisk->pmem_file), "%s", pmem_file); ++ pdisk->pool = pmemblk_open(pmem_file, 0); ++ if (!pdisk->pool) { ++ SPDK_ERRLOG("Opening pmem pool '%s' failed: %d\n", pmem_file, errno); ++ free(pdisk); ++ return -errno; ++ } ++ ++ block_size = pmemblk_bsize(pdisk->pool); ++ num_blocks = pmemblk_nblock(pdisk->pool); ++ ++ if (block_size == 0) { ++ SPDK_ERRLOG("Block size must be more than 0 bytes\n"); ++ pmemblk_close(pdisk->pool); ++ free(pdisk); ++ return -EINVAL; ++ } ++ ++ if (num_blocks == 0) { ++ SPDK_ERRLOG("Disk must be more than 0 blocks\n"); ++ pmemblk_close(pdisk->pool); ++ free(pdisk); ++ return -EINVAL; ++ } ++ ++ pdisk->disk.name = strdup(name); ++ if (!pdisk->disk.name) { ++ pmemblk_close(pdisk->pool); ++ free(pdisk); ++ return -ENOMEM; ++ } ++ ++ pdisk->disk.product_name = "pmemblk disk"; ++ pdisk->disk.write_cache = 0; ++ pdisk->disk.blocklen = block_size; ++ pdisk->disk.blockcnt = num_blocks; ++ ++ pdisk->disk.ctxt = pdisk; ++ pdisk->disk.fn_table = &pmem_fn_table; ++ pdisk->disk.module = &pmem_if; ++ ++ rc = spdk_bdev_register(&pdisk->disk); ++ if (rc) { ++ pmemblk_close(pdisk->pool); ++ free(pdisk->disk.name); ++ free(pdisk); ++ return rc; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_pmem_disks, pdisk, tailq); ++ ++ *bdev = &pdisk->disk; ++ ++ return 0; ++} ++ ++void ++delete_pmem_disk(const char *name, spdk_delete_pmem_complete cb_fn, void *cb_arg) ++{ ++ int rc; ++ ++ rc = spdk_bdev_unregister_by_name(name, &pmem_if, cb_fn, cb_arg); ++ if (rc != 0) { ++ cb_fn(cb_arg, rc); ++ } ++} ++ ++SPDK_LOG_DEPRECATION_REGISTER(bdev_pmem, "PMDK libpmemblk bdev_pmem integration", "SPDK 23.05", 0); ++ ++static int ++bdev_pmem_initialize(void) ++{ ++ const char *err = pmemblk_check_version(PMEMBLK_MAJOR_VERSION, PMEMBLK_MINOR_VERSION); ++ ++ SPDK_LOG_DEPRECATED(bdev_pmem); ++ if (err != NULL) { ++ SPDK_ERRLOG("Invalid libpmemblk version (expected %d.%d): %s\n", PMEMBLK_MAJOR_VERSION, ++ PMEMBLK_MINOR_VERSION, err); ++ return -1; ++ } ++ ++#ifdef SPDK_CONFIG_DEBUG ++ setenv("PMEMBLK_LOG_LEVEL", "1", 1); ++#endif ++ spdk_io_device_register(&g_pmem_disks, bdev_pmem_create_cb, bdev_pmem_destroy_cb, 0, "pmem_bdev"); ++ ++ return 0; ++ ++} ++ ++static void ++bdev_pmem_finish_done(void *io_device) ++{ ++ spdk_bdev_module_fini_done(); ++} ++ ++static void ++bdev_pmem_finish(void) ++{ ++ spdk_io_device_unregister(&g_pmem_disks, bdev_pmem_finish_done); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_pmem) +diff --git a/module/bdev/pmem/bdev_pmem.h b/module/bdev/pmem/bdev_pmem.h +index d6320c8..df1034d 100644 +--- a/module/bdev/pmem/bdev_pmem.h ++++ b/module/bdev/pmem/bdev_pmem.h +@@ -1,36 +1,36 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#ifndef SPDK_BDEV_PMEM_H +-#define SPDK_BDEV_PMEM_H +- +-#include "spdk/bdev.h" +- +-typedef void (*spdk_delete_pmem_complete)(void *cb_arg, int bdeverrno); +- +-/** +- * Create new pmem bdev. +- * +- * \param pmem_file Pointer to pmem pool file. +- * \param name Bdev name. +- * \param bdev output parameter for bdev when operation is successful. +- * \return 0 on success. +- * -EIO if pool check failed +- * -EINVAL if input parameters check failed +- * -ENOMEM if buffer cannot be allocated +- */ +-int create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev); +- +-/** +- * Delete pmem bdev. +- * +- * \param name Name of pmem bdev. +- * \param cb_fn Function to call after deletion. +- * \param cb_arg Argument to pass to cb_fn. +- */ +-void delete_pmem_disk(const char *name, spdk_delete_pmem_complete cb_fn, +- void *cb_arg); +- +-#endif /* SPDK_BDEV_PMEM_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_PMEM_H ++#define SPDK_BDEV_PMEM_H ++ ++#include "spdk/bdev.h" ++ ++typedef void (*spdk_delete_pmem_complete)(void *cb_arg, int bdeverrno); ++ ++/** ++ * Create new pmem bdev. ++ * ++ * \param pmem_file Pointer to pmem pool file. ++ * \param name Bdev name. ++ * \param bdev output parameter for bdev when operation is successful. ++ * \return 0 on success. ++ * -EIO if pool check failed ++ * -EINVAL if input parameters check failed ++ * -ENOMEM if buffer cannot be allocated ++ */ ++int create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev); ++ ++/** ++ * Delete pmem bdev. ++ * ++ * \param name Name of pmem bdev. ++ * \param cb_fn Function to call after deletion. ++ * \param cb_arg Argument to pass to cb_fn. ++ */ ++void delete_pmem_disk(const char *name, spdk_delete_pmem_complete cb_fn, ++ void *cb_arg); ++ ++#endif /* SPDK_BDEV_PMEM_H */ +diff --git a/module/bdev/pmem/bdev_pmem_rpc.c b/module/bdev/pmem/bdev_pmem_rpc.c +index b006c19..1de3949 100644 +--- a/module/bdev/pmem/bdev_pmem_rpc.c ++++ b/module/bdev/pmem/bdev_pmem_rpc.c +@@ -1,293 +1,293 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "bdev_pmem.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "libpmemblk.h" +- +-#include "spdk/log.h" +- +-struct rpc_construct_pmem { +- char *pmem_file; +- char *name; +-}; +- +-static void +-free_rpc_bdev_pmem_create(struct rpc_construct_pmem *req) +-{ +- free(req->pmem_file); +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_construct_pmem_decoders[] = { +- {"pmem_file", offsetof(struct rpc_construct_pmem, pmem_file), spdk_json_decode_string}, +- {"name", offsetof(struct rpc_construct_pmem, name), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_pmem_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_construct_pmem req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_bdev *bdev; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_pmem_decoders, +- SPDK_COUNTOF(rpc_construct_pmem_decoders), +- &req)) { +- SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- rc = create_pmem_disk(req.pmem_file, req.name, &bdev); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, spdk_bdev_get_name(bdev)); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_bdev_pmem_create(&req); +-} +-SPDK_RPC_REGISTER("bdev_pmem_create", rpc_bdev_pmem_create, SPDK_RPC_RUNTIME) +- +-struct rpc_delete_pmem { +- char *name; +-}; +- +-static void +-free_rpc_delete_pmem(struct rpc_delete_pmem *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_delete_pmem_decoders[] = { +- {"name", offsetof(struct rpc_delete_pmem, name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_bdev_pmem_delete_cb(void *cb_arg, int bdeverrno) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (bdeverrno == 0) { +- spdk_jsonrpc_send_bool_response(request, true); +- } else { +- spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); +- } +-} +- +-static void +-rpc_bdev_pmem_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_pmem req = {NULL}; +- +- if (spdk_json_decode_object(params, rpc_delete_pmem_decoders, +- SPDK_COUNTOF(rpc_delete_pmem_decoders), +- &req)) { +- SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- delete_pmem_disk(req.name, _rpc_bdev_pmem_delete_cb, request); +- +-cleanup: +- free_rpc_delete_pmem(&req); +-} +-SPDK_RPC_REGISTER("bdev_pmem_delete", rpc_bdev_pmem_delete, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_pmem_create_pool { +- char *pmem_file; +- uint64_t num_blocks; +- uint32_t block_size; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_pmem_create_pool_decoders[] = { +- {"pmem_file", offsetof(struct rpc_bdev_pmem_create_pool, pmem_file), spdk_json_decode_string}, +- {"num_blocks", offsetof(struct rpc_bdev_pmem_create_pool, num_blocks), spdk_json_decode_uint64}, +- {"block_size", offsetof(struct rpc_bdev_pmem_create_pool, block_size), spdk_json_decode_uint32}, +-}; +- +-static void +-free_rpc_bdev_pmem_create_pool(struct rpc_bdev_pmem_create_pool *req) +-{ +- 
free(req->pmem_file); +-} +- +-static void +-rpc_bdev_pmem_create_pool(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_pmem_create_pool req = {}; +- uint64_t pool_size; +- PMEMblkpool *pbp; +- +- if (spdk_json_decode_object(params, rpc_bdev_pmem_create_pool_decoders, +- SPDK_COUNTOF(rpc_bdev_pmem_create_pool_decoders), +- &req)) { +- SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- /* libpmemblk pool has to contain at least 256 blocks */ +- if (req.num_blocks < 256) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, +- "Pmem pool num_blocks must be at least 256"); +- goto cleanup; +- } +- +- pool_size = req.num_blocks * req.block_size; +- if (pool_size < PMEMBLK_MIN_POOL) { +- spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, +- "Pmem pool size must be at least %ld", PMEMBLK_MIN_POOL); +- goto cleanup; +- } +- +- pbp = pmemblk_create(req.pmem_file, req.block_size, pool_size, 0666); +- if (pbp == NULL) { +- const char *msg = pmemblk_errormsg(); +- +- SPDK_DEBUGLOG(bdev_pmem, "pmemblk_create() failed: %s\n", msg ? msg : "(logs disabled)"); +- spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "pmemblk_create failed: %s", msg ? msg : "(logs disabled)"); +- goto cleanup; +- } +- +- pmemblk_close(pbp); +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_bdev_pmem_create_pool(&req); +-} +-SPDK_RPC_REGISTER("bdev_pmem_create_pool", rpc_bdev_pmem_create_pool, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_pmem_get_pool_info { +- char *pmem_file; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_pmem_get_pool_info_decoders[] = { +- {"pmem_file", offsetof(struct rpc_bdev_pmem_get_pool_info, pmem_file), spdk_json_decode_string}, +-}; +- +-static void +-free_rpc_bdev_pmem_get_pool_info(struct rpc_bdev_pmem_get_pool_info *req) +-{ +- free(req->pmem_file); +-} +- +-static void +-rpc_bdev_pmem_get_pool_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_pmem_get_pool_info req = {}; +- struct spdk_json_write_ctx *w; +- size_t num_blocks, block_size; +- PMEMblkpool *pbp; +- +- if (spdk_json_decode_object(params, rpc_bdev_pmem_get_pool_info_decoders, +- SPDK_COUNTOF(rpc_bdev_pmem_get_pool_info_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- pbp = pmemblk_open(req.pmem_file, 0); +- if (pbp == NULL) { +- const char *msg = pmemblk_errormsg(); +- +- spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "pmemblk_open failed: %s", msg ? msg : "(logs disabled)"); +- goto cleanup; +- } +- +- block_size = pmemblk_bsize(pbp); +- num_blocks = pmemblk_nblock(pbp); +- +- pmemblk_close(pbp); +- +- /* Check pmem pool consistency */ +- if (pmemblk_check(req.pmem_file, block_size) != 1) { +- const char *msg = pmemblk_errormsg(); +- +- spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "pmemblk_check failed: %s", msg ? 
msg : "(logs disabled)"); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_uint64(w, "num_blocks", num_blocks); +- spdk_json_write_named_uint64(w, "block_size", block_size); +- spdk_json_write_object_end(w); +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_bdev_pmem_get_pool_info(&req); +-} +-SPDK_RPC_REGISTER("bdev_pmem_get_pool_info", rpc_bdev_pmem_get_pool_info, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_pmem_delete_pool { +- char *pmem_file; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_pmem_delete_pool_decoders[] = { +- {"pmem_file", offsetof(struct rpc_bdev_pmem_delete_pool, pmem_file), spdk_json_decode_string}, +-}; +- +-static void +-free_rpc_bdev_pmem_delete_pool(struct rpc_bdev_pmem_delete_pool *req) +-{ +- free(req->pmem_file); +-} +- +-static void +-rpc_bdev_pmem_delete_pool(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_pmem_delete_pool req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_pmem_delete_pool_decoders, +- SPDK_COUNTOF(rpc_bdev_pmem_delete_pool_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- /* Check if file is actually pmem pool */ +- rc = pmemblk_check(req.pmem_file, 0); +- if (rc != 1) { +- const char *msg = pmemblk_errormsg(); +- +- spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "pmemblk_check failed: %s", msg ? msg : "(logs disabled)"); +- goto cleanup; +- } +- +- unlink(req.pmem_file); +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_bdev_pmem_delete_pool(&req); +-} +-SPDK_RPC_REGISTER("bdev_pmem_delete_pool", rpc_bdev_pmem_delete_pool, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "bdev_pmem.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "libpmemblk.h" ++ ++#include "spdk/log.h" ++ ++struct rpc_construct_pmem { ++ char *pmem_file; ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_pmem_create(struct rpc_construct_pmem *req) ++{ ++ free(req->pmem_file); ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_construct_pmem_decoders[] = { ++ {"pmem_file", offsetof(struct rpc_construct_pmem, pmem_file), spdk_json_decode_string}, ++ {"name", offsetof(struct rpc_construct_pmem, name), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_pmem_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_construct_pmem req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_bdev *bdev; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_pmem_decoders, ++ SPDK_COUNTOF(rpc_construct_pmem_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ rc = create_pmem_disk(req.pmem_file, req.name, &bdev); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, spdk_bdev_get_name(bdev)); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_bdev_pmem_create(&req); ++} ++SPDK_RPC_REGISTER("bdev_pmem_create", rpc_bdev_pmem_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_pmem { ++ char *name; ++}; ++ ++static void ++free_rpc_delete_pmem(struct rpc_delete_pmem *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_delete_pmem_decoders[] = { ++ {"name", offsetof(struct rpc_delete_pmem, name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_bdev_pmem_delete_cb(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (bdeverrno == 0) { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); ++ } ++} ++ ++static void ++rpc_bdev_pmem_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_pmem req = {NULL}; ++ ++ if (spdk_json_decode_object(params, rpc_delete_pmem_decoders, ++ SPDK_COUNTOF(rpc_delete_pmem_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ delete_pmem_disk(req.name, _rpc_bdev_pmem_delete_cb, request); ++ ++cleanup: ++ free_rpc_delete_pmem(&req); ++} ++SPDK_RPC_REGISTER("bdev_pmem_delete", rpc_bdev_pmem_delete, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_pmem_create_pool { ++ char *pmem_file; ++ uint64_t num_blocks; ++ uint32_t block_size; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_pmem_create_pool_decoders[] = { ++ {"pmem_file", offsetof(struct rpc_bdev_pmem_create_pool, pmem_file), spdk_json_decode_string}, ++ {"num_blocks", offsetof(struct rpc_bdev_pmem_create_pool, num_blocks), spdk_json_decode_uint64}, ++ {"block_size", offsetof(struct rpc_bdev_pmem_create_pool, block_size), spdk_json_decode_uint32}, ++}; ++ ++static void ++free_rpc_bdev_pmem_create_pool(struct rpc_bdev_pmem_create_pool *req) ++{ ++ 
free(req->pmem_file); ++} ++ ++static void ++rpc_bdev_pmem_create_pool(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_pmem_create_pool req = {}; ++ uint64_t pool_size; ++ PMEMblkpool *pbp; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_pmem_create_pool_decoders, ++ SPDK_COUNTOF(rpc_bdev_pmem_create_pool_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(bdev_pmem, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ /* libpmemblk pool has to contain at least 256 blocks */ ++ if (req.num_blocks < 256) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, ++ "Pmem pool num_blocks must be at least 256"); ++ goto cleanup; ++ } ++ ++ pool_size = req.num_blocks * req.block_size; ++ if (pool_size < PMEMBLK_MIN_POOL) { ++ spdk_jsonrpc_send_error_response_fmt(request, -EINVAL, ++ "Pmem pool size must be at least %ld", PMEMBLK_MIN_POOL); ++ goto cleanup; ++ } ++ ++ pbp = pmemblk_create(req.pmem_file, req.block_size, pool_size, 0666); ++ if (pbp == NULL) { ++ const char *msg = pmemblk_errormsg(); ++ ++ SPDK_DEBUGLOG(bdev_pmem, "pmemblk_create() failed: %s\n", msg ? msg : "(logs disabled)"); ++ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "pmemblk_create failed: %s", msg ? msg : "(logs disabled)"); ++ goto cleanup; ++ } ++ ++ pmemblk_close(pbp); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_bdev_pmem_create_pool(&req); ++} ++SPDK_RPC_REGISTER("bdev_pmem_create_pool", rpc_bdev_pmem_create_pool, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_pmem_get_pool_info { ++ char *pmem_file; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_pmem_get_pool_info_decoders[] = { ++ {"pmem_file", offsetof(struct rpc_bdev_pmem_get_pool_info, pmem_file), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_bdev_pmem_get_pool_info(struct rpc_bdev_pmem_get_pool_info *req) ++{ ++ free(req->pmem_file); ++} ++ ++static void ++rpc_bdev_pmem_get_pool_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_pmem_get_pool_info req = {}; ++ struct spdk_json_write_ctx *w; ++ size_t num_blocks, block_size; ++ PMEMblkpool *pbp; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_pmem_get_pool_info_decoders, ++ SPDK_COUNTOF(rpc_bdev_pmem_get_pool_info_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ pbp = pmemblk_open(req.pmem_file, 0); ++ if (pbp == NULL) { ++ const char *msg = pmemblk_errormsg(); ++ ++ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "pmemblk_open failed: %s", msg ? msg : "(logs disabled)"); ++ goto cleanup; ++ } ++ ++ block_size = pmemblk_bsize(pbp); ++ num_blocks = pmemblk_nblock(pbp); ++ ++ pmemblk_close(pbp); ++ ++ /* Check pmem pool consistency */ ++ if (pmemblk_check(req.pmem_file, block_size) != 1) { ++ const char *msg = pmemblk_errormsg(); ++ ++ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "pmemblk_check failed: %s", msg ? 
msg : "(logs disabled)"); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_uint64(w, "num_blocks", num_blocks); ++ spdk_json_write_named_uint64(w, "block_size", block_size); ++ spdk_json_write_object_end(w); ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_bdev_pmem_get_pool_info(&req); ++} ++SPDK_RPC_REGISTER("bdev_pmem_get_pool_info", rpc_bdev_pmem_get_pool_info, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_pmem_delete_pool { ++ char *pmem_file; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_pmem_delete_pool_decoders[] = { ++ {"pmem_file", offsetof(struct rpc_bdev_pmem_delete_pool, pmem_file), spdk_json_decode_string}, ++}; ++ ++static void ++free_rpc_bdev_pmem_delete_pool(struct rpc_bdev_pmem_delete_pool *req) ++{ ++ free(req->pmem_file); ++} ++ ++static void ++rpc_bdev_pmem_delete_pool(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_pmem_delete_pool req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_pmem_delete_pool_decoders, ++ SPDK_COUNTOF(rpc_bdev_pmem_delete_pool_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ /* Check if file is actually pmem pool */ ++ rc = pmemblk_check(req.pmem_file, 0); ++ if (rc != 1) { ++ const char *msg = pmemblk_errormsg(); ++ ++ spdk_jsonrpc_send_error_response_fmt(request, -SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "pmemblk_check failed: %s", msg ? msg : "(logs disabled)"); ++ goto cleanup; ++ } ++ ++ unlink(req.pmem_file); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_bdev_pmem_delete_pool(&req); ++} ++SPDK_RPC_REGISTER("bdev_pmem_delete_pool", rpc_bdev_pmem_delete_pool, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/raid/Makefile b/module/bdev/raid/Makefile +index 162ecee..977fd0f 100644 +--- a/module/bdev/raid/Makefile ++++ b/module/bdev/raid/Makefile +@@ -1,23 +1,23 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +-C_SRCS = bdev_raid.c bdev_raid_rpc.c raid0.c raid1.c concat.c +- +-ifeq ($(CONFIG_RAID5F),y) +-C_SRCS += raid5f.c +-endif +- +-LIBNAME = bdev_raid +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ ++C_SRCS = bdev_raid.c bdev_raid_rpc.c raid0.c raid1.c concat.c ++ ++ifeq ($(CONFIG_RAID5F),y) ++C_SRCS += raid5f.c ++endif ++ ++LIBNAME = bdev_raid ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/raid/bdev_raid.c b/module/bdev/raid/bdev_raid.c +index 7b52e63..82584e7 100644 +--- a/module/bdev/raid/bdev_raid.c ++++ b/module/bdev/raid/bdev_raid.c +@@ -1,1498 +1,1498 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "bdev_raid.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/json.h" +- +-static bool g_shutdown_started = false; +- +-/* List of all raid bdevs */ +-struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list); +- +-static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules); +- +-static struct raid_bdev_module * +-raid_bdev_module_find(enum raid_level level) +-{ +- struct raid_bdev_module *raid_module; +- +- TAILQ_FOREACH(raid_module, &g_raid_modules, link) { +- if (raid_module->level == level) { +- return raid_module; +- } +- } +- +- return NULL; +-} +- +-void +-raid_bdev_module_list_add(struct raid_bdev_module *raid_module) +-{ +- if (raid_bdev_module_find(raid_module->level) != NULL) { +- SPDK_ERRLOG("module for raid level '%s' already registered.\n", +- raid_bdev_level_to_str(raid_module->level)); +- assert(false); +- } else { +- TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link); +- } +-} +- +-/* Function declarations */ +-static void raid_bdev_examine(struct spdk_bdev *bdev); +-static int raid_bdev_init(void); +-static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev, +- raid_bdev_destruct_cb cb_fn, void *cb_arg); +- +-/* +- * brief: +- * raid_bdev_create_cb function is a cb function for raid bdev which creates the +- * hierarchy from raid bdev to base bdev io channels. It will be called per core +- * params: +- * io_device - pointer to raid bdev io device represented by raid_bdev +- * ctx_buf - pointer to context buffer for raid bdev io channel +- * returns: +- * 0 - success +- * non zero - failure +- */ +-static int +-raid_bdev_create_cb(void *io_device, void *ctx_buf) +-{ +- struct raid_bdev *raid_bdev = io_device; +- struct raid_bdev_io_channel *raid_ch = ctx_buf; +- uint8_t i; +- int ret = 0; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_create_cb, %p\n", raid_ch); +- +- assert(raid_bdev != NULL); +- assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); +- +- raid_ch->num_channels = raid_bdev->num_base_bdevs; +- +- raid_ch->base_channel = calloc(raid_ch->num_channels, +- sizeof(struct spdk_io_channel *)); +- if (!raid_ch->base_channel) { +- SPDK_ERRLOG("Unable to allocate base bdevs io channel\n"); +- return -ENOMEM; +- } +- for (i = 0; i < raid_ch->num_channels; i++) { +- /* +- * Get the spdk_io_channel for all the base bdevs. This is used during +- * split logic to send the respective child bdev ios to respective base +- * bdev io channel. +- */ +- raid_ch->base_channel[i] = spdk_bdev_get_io_channel( +- raid_bdev->base_bdev_info[i].desc); +- if (!raid_ch->base_channel[i]) { +- SPDK_ERRLOG("Unable to create io channel for base bdev\n"); +- ret = -ENOMEM; +- break; +- } +- } +- +- if (!ret && raid_bdev->module->get_io_channel) { +- raid_ch->module_channel = raid_bdev->module->get_io_channel(raid_bdev); +- if (!raid_ch->module_channel) { +- SPDK_ERRLOG("Unable to create io channel for raid module\n"); +- ret = -ENOMEM; +- } +- } +- +- if (ret) { +- uint8_t j; +- +- for (j = 0; j < i; j++) { +- spdk_put_io_channel(raid_ch->base_channel[j]); +- } +- free(raid_ch->base_channel); +- raid_ch->base_channel = NULL; +- } +- return ret; +-} +- +-/* +- * brief: +- * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the +- * hierarchy from raid bdev to base bdev io channels. 
It will be called per core +- * params: +- * io_device - pointer to raid bdev io device represented by raid_bdev +- * ctx_buf - pointer to context buffer for raid bdev io channel +- * returns: +- * none +- */ +-static void +-raid_bdev_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct raid_bdev_io_channel *raid_ch = ctx_buf; +- uint8_t i; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destroy_cb\n"); +- +- assert(raid_ch != NULL); +- assert(raid_ch->base_channel); +- +- if (raid_ch->module_channel) { +- spdk_put_io_channel(raid_ch->module_channel); +- } +- +- for (i = 0; i < raid_ch->num_channels; i++) { +- /* Free base bdev channels */ +- assert(raid_ch->base_channel[i] != NULL); +- spdk_put_io_channel(raid_ch->base_channel[i]); +- } +- free(raid_ch->base_channel); +- raid_ch->base_channel = NULL; +-} +- +-/* +- * brief: +- * raid_bdev_cleanup is used to cleanup raid_bdev related data +- * structures. +- * params: +- * raid_bdev - pointer to raid_bdev +- * returns: +- * none +- */ +-static void +-raid_bdev_cleanup(struct raid_bdev *raid_bdev) +-{ +- struct raid_base_bdev_info *base_info; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_cleanup, %p name %s, state %s\n", +- raid_bdev, raid_bdev->bdev.name, raid_bdev_state_to_str(raid_bdev->state)); +- assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- assert(base_info->bdev == NULL); +- assert(base_info->desc == NULL); +- free(base_info->name); +- } +- +- TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link); +- free(raid_bdev->base_bdev_info); +-} +- +-static void +-raid_bdev_free(struct raid_bdev *raid_bdev) +-{ +- free(raid_bdev->bdev.name); +- free(raid_bdev); +-} +- +-static void +-raid_bdev_cleanup_and_free(struct raid_bdev *raid_bdev) +-{ +- raid_bdev_cleanup(raid_bdev); +- raid_bdev_free(raid_bdev); +-} +- +-/* +- * brief: +- * free resource of base bdev for raid bdev +- * params: +- * raid_bdev - pointer to raid bdev +- * base_info - raid base bdev info +- * returns: +- * 0 - success +- * non zero - failure +- */ +-static void +-raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, +- struct raid_base_bdev_info *base_info) +-{ +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- free(base_info->name); +- base_info->name = NULL; +- +- if (base_info->bdev == NULL) { +- return; +- } +- +- assert(base_info->desc); +- spdk_bdev_module_release_bdev(base_info->bdev); +- spdk_bdev_close(base_info->desc); +- base_info->desc = NULL; +- base_info->bdev = NULL; +- +- assert(raid_bdev->num_base_bdevs_discovered); +- raid_bdev->num_base_bdevs_discovered--; +-} +- +-static void +-raid_bdev_io_device_unregister_cb(void *io_device) +-{ +- struct raid_bdev *raid_bdev = io_device; +- +- if (raid_bdev->num_base_bdevs_discovered == 0) { +- /* Free raid_bdev when there are no base bdevs left */ +- SPDK_DEBUGLOG(bdev_raid, "raid bdev base bdevs is 0, going to free all in destruct\n"); +- raid_bdev_cleanup(raid_bdev); +- spdk_bdev_destruct_done(&raid_bdev->bdev, 0); +- raid_bdev_free(raid_bdev); +- } else { +- spdk_bdev_destruct_done(&raid_bdev->bdev, 0); +- } +-} +- +-void +-raid_bdev_module_stop_done(struct raid_bdev *raid_bdev) +-{ +- if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { +- spdk_io_device_unregister(raid_bdev, raid_bdev_io_device_unregister_cb); +- } +-} +- +-static void +-_raid_bdev_destruct(void *ctxt) +-{ +- struct raid_bdev *raid_bdev = ctxt; +- struct raid_base_bdev_info *base_info; +- +- 
SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destruct\n"); +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- /* +- * Close all base bdev descriptors for which call has come from below +- * layers. Also close the descriptors if we have started shutdown. +- */ +- if (g_shutdown_started || base_info->remove_scheduled == true) { +- raid_bdev_free_base_bdev_resource(raid_bdev, base_info); +- } +- } +- +- if (g_shutdown_started) { +- raid_bdev->state = RAID_BDEV_STATE_OFFLINE; +- } +- +- if (raid_bdev->module->stop != NULL) { +- if (raid_bdev->module->stop(raid_bdev) == false) { +- return; +- } +- } +- +- raid_bdev_module_stop_done(raid_bdev); +-} +- +-static int +-raid_bdev_destruct(void *ctx) +-{ +- spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_destruct, ctx); +- +- return 1; +-} +- +-void +-raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- +- spdk_bdev_io_complete(bdev_io, status); +-} +- +-/* +- * brief: +- * raid_bdev_io_complete_part - signal the completion of a part of the expected +- * base bdev IOs and complete the raid_io if this is the final expected IO. +- * The caller should first set raid_io->base_bdev_io_remaining. This function +- * will decrement this counter by the value of the 'completed' parameter and +- * complete the raid_io if the counter reaches 0. The caller is free to +- * interpret the 'base_bdev_io_remaining' and 'completed' values as needed, +- * it can represent e.g. blocks or IOs. +- * params: +- * raid_io - pointer to raid_bdev_io +- * completed - the part of the raid_io that has been completed +- * status - status of the base IO +- * returns: +- * true - if the raid_io is completed +- * false - otherwise +- */ +-bool +-raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, +- enum spdk_bdev_io_status status) +-{ +- assert(raid_io->base_bdev_io_remaining >= completed); +- raid_io->base_bdev_io_remaining -= completed; +- +- if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { +- raid_io->base_bdev_io_status = status; +- } +- +- if (raid_io->base_bdev_io_remaining == 0) { +- raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status); +- return true; +- } else { +- return false; +- } +-} +- +-/* +- * brief: +- * raid_bdev_queue_io_wait function processes the IO which failed to submit. +- * It will try to queue the IOs after storing the context to bdev wait queue logic. +- * params: +- * raid_io - pointer to raid_bdev_io +- * bdev - the block device that the IO is submitted to +- * ch - io channel +- * cb_fn - callback when the spdk_bdev_io for bdev becomes available +- * returns: +- * none +- */ +-void +-raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn) +-{ +- raid_io->waitq_entry.bdev = bdev; +- raid_io->waitq_entry.cb_fn = cb_fn; +- raid_io->waitq_entry.cb_arg = raid_io; +- spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry); +-} +- +-static void +-raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- raid_bdev_io_complete_part(raid_io, 1, success ? 
+- SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io); +- +-static void +-_raid_bdev_submit_reset_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- raid_bdev_submit_reset_request(raid_io); +-} +- +-/* +- * brief: +- * raid_bdev_submit_reset_request function submits reset requests +- * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in +- * which case it will queue it for later submission +- * params: +- * raid_io +- * returns: +- * none +- */ +-static void +-raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io) +-{ +- struct raid_bdev *raid_bdev; +- int ret; +- uint8_t i; +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- +- raid_bdev = raid_io->raid_bdev; +- +- if (raid_io->base_bdev_io_remaining == 0) { +- raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; +- } +- +- while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) { +- i = raid_io->base_bdev_io_submitted; +- base_info = &raid_bdev->base_bdev_info[i]; +- base_ch = raid_io->raid_ch->base_channel[i]; +- ret = spdk_bdev_reset(base_info->desc, base_ch, +- raid_base_bdev_reset_complete, raid_io); +- if (ret == 0) { +- raid_io->base_bdev_io_submitted++; +- } else if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid_bdev_submit_reset_request); +- return; +- } else { +- SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); +- assert(false); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- } +-} +- +-/* +- * brief: +- * Callback function to spdk_bdev_io_get_buf. +- * params: +- * ch - pointer to raid bdev io channel +- * bdev_io - pointer to parent bdev_io on raid bdev device +- * success - True if buffer is allocated or false otherwise. +- * returns: +- * none +- */ +-static void +-raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; +- +- if (!success) { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- raid_io->raid_bdev->module->submit_rw_request(raid_io); +-} +- +-/* +- * brief: +- * raid_bdev_submit_request function is the submit_request function pointer of +- * raid bdev function table. This is used to submit the io on raid_bdev to below +- * layers. 
+- * params: +- * ch - pointer to raid bdev io channel +- * bdev_io - pointer to parent bdev_io on raid bdev device +- * returns: +- * none +- */ +-static void +-raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; +- +- raid_io->raid_bdev = bdev_io->bdev->ctxt; +- raid_io->raid_ch = spdk_io_channel_get_ctx(ch); +- raid_io->base_bdev_io_remaining = 0; +- raid_io->base_bdev_io_submitted = 0; +- raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- raid_io->raid_bdev->module->submit_rw_request(raid_io); +- break; +- +- case SPDK_BDEV_IO_TYPE_RESET: +- raid_bdev_submit_reset_request(raid_io); +- break; +- +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- raid_io->raid_bdev->module->submit_null_payload_request(raid_io); +- break; +- +- default: +- SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- break; +- } +-} +- +-/* +- * brief: +- * _raid_bdev_io_type_supported checks whether io_type is supported in +- * all base bdev modules of raid bdev module. If anyone among the base_bdevs +- * doesn't support, the raid device doesn't supports. +- * +- * params: +- * raid_bdev - pointer to raid bdev context +- * io_type - io type +- * returns: +- * true - io_type is supported +- * false - io_type is not supported +- */ +-inline static bool +-_raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type) +-{ +- struct raid_base_bdev_info *base_info; +- +- if (io_type == SPDK_BDEV_IO_TYPE_FLUSH || +- io_type == SPDK_BDEV_IO_TYPE_UNMAP) { +- if (raid_bdev->module->submit_null_payload_request == NULL) { +- return false; +- } +- } +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- if (base_info->bdev == NULL) { +- assert(false); +- continue; +- } +- +- if (spdk_bdev_io_type_supported(base_info->bdev, io_type) == false) { +- return false; +- } +- } +- +- return true; +-} +- +-/* +- * brief: +- * raid_bdev_io_type_supported is the io_supported function for bdev function +- * table which returns whether the particular io type is supported or not by +- * raid bdev module +- * params: +- * ctx - pointer to raid bdev context +- * type - io type +- * returns: +- * true - io_type is supported +- * false - io_type is not supported +- */ +-static bool +-raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return true; +- +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- return _raid_bdev_io_type_supported(ctx, io_type); +- +- default: +- return false; +- } +- +- return false; +-} +- +-/* +- * brief: +- * raid_bdev_get_io_channel is the get_io_channel function table pointer for +- * raid bdev. 
This is used to return the io channel for this raid bdev +- * params: +- * ctxt - pointer to raid_bdev +- * returns: +- * pointer to io channel for raid bdev +- */ +-static struct spdk_io_channel * +-raid_bdev_get_io_channel(void *ctxt) +-{ +- struct raid_bdev *raid_bdev = ctxt; +- +- return spdk_get_io_channel(raid_bdev); +-} +- +-void +-raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w) +-{ +- struct raid_base_bdev_info *base_info; +- +- assert(raid_bdev != NULL); +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); +- spdk_json_write_named_string(w, "state", raid_bdev_state_to_str(raid_bdev->state)); +- spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); +- spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); +- spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); +- spdk_json_write_name(w, "base_bdevs_list"); +- spdk_json_write_array_begin(w); +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- if (base_info->bdev) { +- spdk_json_write_string(w, base_info->bdev->name); +- } else { +- spdk_json_write_null(w); +- } +- } +- spdk_json_write_array_end(w); +-} +- +-/* +- * brief: +- * raid_bdev_dump_info_json is the function table pointer for raid bdev +- * params: +- * ctx - pointer to raid_bdev +- * w - pointer to json context +- * returns: +- * 0 - success +- * non zero - failure +- */ +-static int +-raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct raid_bdev *raid_bdev = ctx; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_dump_config_json\n"); +- +- /* Dump the raid bdev configuration related information */ +- spdk_json_write_named_object_begin(w, "raid"); +- raid_bdev_write_info_json(raid_bdev, w); +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-/* +- * brief: +- * raid_bdev_write_config_json is the function table pointer for raid bdev +- * params: +- * bdev - pointer to spdk_bdev +- * w - pointer to json context +- * returns: +- * none +- */ +-static void +-raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct raid_bdev *raid_bdev = bdev->ctxt; +- struct raid_base_bdev_info *base_info; +- +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_raid_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", bdev->name); +- spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); +- spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); +- +- spdk_json_write_named_array_begin(w, "base_bdevs"); +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- if (base_info->bdev) { +- spdk_json_write_string(w, base_info->bdev->name); +- } +- } +- spdk_json_write_array_end(w); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static int +-raid_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) +-{ +- struct raid_bdev *raid_bdev = ctx; +- struct spdk_bdev *base_bdev; +- uint32_t i; +- int domains_count = 0, rc; +- +- /* First loop to get the number of memory domains */ +- for (i = 0; i < raid_bdev->num_base_bdevs; i++) { +- base_bdev = raid_bdev->base_bdev_info[i].bdev; +- rc = spdk_bdev_get_memory_domains(base_bdev, NULL, 0); 
+- if (rc < 0) { +- return rc; +- } +- domains_count += rc; +- } +- +- if (!domains || array_size < domains_count) { +- return domains_count; +- } +- +- for (i = 0; i < raid_bdev->num_base_bdevs; i++) { +- base_bdev = raid_bdev->base_bdev_info[i].bdev; +- rc = spdk_bdev_get_memory_domains(base_bdev, domains, array_size); +- if (rc < 0) { +- return rc; +- } +- domains += rc; +- array_size -= rc; +- } +- +- return domains_count; +-} +- +-/* g_raid_bdev_fn_table is the function table for raid bdev */ +-static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { +- .destruct = raid_bdev_destruct, +- .submit_request = raid_bdev_submit_request, +- .io_type_supported = raid_bdev_io_type_supported, +- .get_io_channel = raid_bdev_get_io_channel, +- .dump_info_json = raid_bdev_dump_info_json, +- .write_config_json = raid_bdev_write_config_json, +- .get_memory_domains = raid_bdev_get_memory_domains, +-}; +- +-struct raid_bdev * +-raid_bdev_find_by_name(const char *name) +-{ +- struct raid_bdev *raid_bdev; +- +- TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { +- if (strcmp(raid_bdev->bdev.name, name) == 0) { +- return raid_bdev; +- } +- } +- +- return NULL; +-} +- +-static struct { +- const char *name; +- enum raid_level value; +-} g_raid_level_names[] = { +- { "raid0", RAID0 }, +- { "0", RAID0 }, +- { "raid1", RAID1 }, +- { "1", RAID1 }, +- { "raid5f", RAID5F }, +- { "5f", RAID5F }, +- { "concat", CONCAT }, +- { } +-}; +- +-static struct { +- const char *name; +- enum raid_bdev_state value; +-} g_raid_state_names[] = { +- { "online", RAID_BDEV_STATE_ONLINE }, +- { "configuring", RAID_BDEV_STATE_CONFIGURING }, +- { "offline", RAID_BDEV_STATE_OFFLINE }, +- { } +-}; +- +-/* We have to use the typedef in the function declaration to appease astyle. */ +-typedef enum raid_level raid_level_t; +-typedef enum raid_bdev_state raid_bdev_state_t; +- +-raid_level_t +-raid_bdev_str_to_level(const char *str) +-{ +- unsigned int i; +- +- assert(str != NULL); +- +- for (i = 0; g_raid_level_names[i].name != NULL; i++) { +- if (strcasecmp(g_raid_level_names[i].name, str) == 0) { +- return g_raid_level_names[i].value; +- } +- } +- +- return INVALID_RAID_LEVEL; +-} +- +-const char * +-raid_bdev_level_to_str(enum raid_level level) +-{ +- unsigned int i; +- +- for (i = 0; g_raid_level_names[i].name != NULL; i++) { +- if (g_raid_level_names[i].value == level) { +- return g_raid_level_names[i].name; +- } +- } +- +- return ""; +-} +- +-raid_bdev_state_t +-raid_bdev_str_to_state(const char *str) +-{ +- unsigned int i; +- +- assert(str != NULL); +- +- for (i = 0; g_raid_state_names[i].name != NULL; i++) { +- if (strcasecmp(g_raid_state_names[i].name, str) == 0) { +- return g_raid_state_names[i].value; +- } +- } +- +- return RAID_BDEV_STATE_MAX; +-} +- +-const char * +-raid_bdev_state_to_str(enum raid_bdev_state state) +-{ +- unsigned int i; +- +- for (i = 0; g_raid_state_names[i].name != NULL; i++) { +- if (g_raid_state_names[i].value == state) { +- return g_raid_state_names[i].name; +- } +- } +- +- assert(false); +- return ""; +-} +- +-/* +- * brief: +- * raid_bdev_fini_start is called when bdev layer is starting the +- * shutdown process +- * params: +- * none +- * returns: +- * none +- */ +-static void +-raid_bdev_fini_start(void) +-{ +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_fini_start\n"); +- g_shutdown_started = true; +-} +- +-/* +- * brief: +- * raid_bdev_exit is called on raid bdev module exit time by bdev layer +- * params: +- * none +- * returns: +- * none +- */ +-static void +-raid_bdev_exit(void) +-{ +- 
struct raid_bdev *raid_bdev, *tmp; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_exit\n"); +- +- TAILQ_FOREACH_SAFE(raid_bdev, &g_raid_bdev_list, global_link, tmp) { +- raid_bdev_cleanup_and_free(raid_bdev); +- } +-} +- +-/* +- * brief: +- * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid +- * module +- * params: +- * none +- * returns: +- * size of spdk_bdev_io context for raid +- */ +-static int +-raid_bdev_get_ctx_size(void) +-{ +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_get_ctx_size\n"); +- return sizeof(struct raid_bdev_io); +-} +- +-static struct spdk_bdev_module g_raid_if = { +- .name = "raid", +- .module_init = raid_bdev_init, +- .fini_start = raid_bdev_fini_start, +- .module_fini = raid_bdev_exit, +- .get_ctx_size = raid_bdev_get_ctx_size, +- .examine_config = raid_bdev_examine, +- .async_init = false, +- .async_fini = false, +-}; +-SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if) +- +-/* +- * brief: +- * raid_bdev_init is the initialization function for raid bdev module +- * params: +- * none +- * returns: +- * 0 - success +- * non zero - failure +- */ +-static int +-raid_bdev_init(void) +-{ +- return 0; +-} +- +-/* +- * brief: +- * raid_bdev_create allocates raid bdev based on passed configuration +- * params: +- * name - name for raid bdev +- * strip_size - strip size in KB +- * num_base_bdevs - number of base bdevs +- * level - raid level +- * raid_bdev_out - the created raid bdev +- * returns: +- * 0 - success +- * non zero - failure +- */ +-int +-raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, +- enum raid_level level, struct raid_bdev **raid_bdev_out) +-{ +- struct raid_bdev *raid_bdev; +- struct spdk_bdev *raid_bdev_gen; +- struct raid_bdev_module *module; +- uint8_t min_operational; +- +- if (raid_bdev_find_by_name(name) != NULL) { +- SPDK_ERRLOG("Duplicate raid bdev name found: %s\n", name); +- return -EEXIST; +- } +- +- if (level == RAID1) { +- if (strip_size != 0) { +- SPDK_ERRLOG("Strip size is not supported by raid1\n"); +- return -EINVAL; +- } +- } else if (spdk_u32_is_pow2(strip_size) == false) { +- SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size); +- return -EINVAL; +- } +- +- module = raid_bdev_module_find(level); +- if (module == NULL) { +- SPDK_ERRLOG("Unsupported raid level '%d'\n", level); +- return -EINVAL; +- } +- +- assert(module->base_bdevs_min != 0); +- if (num_base_bdevs < module->base_bdevs_min) { +- SPDK_ERRLOG("At least %u base devices required for %s\n", +- module->base_bdevs_min, +- raid_bdev_level_to_str(level)); +- return -EINVAL; +- } +- +- switch (module->base_bdevs_constraint.type) { +- case CONSTRAINT_MAX_BASE_BDEVS_REMOVED: +- min_operational = num_base_bdevs - module->base_bdevs_constraint.value; +- break; +- case CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL: +- min_operational = module->base_bdevs_constraint.value; +- break; +- case CONSTRAINT_UNSET: +- if (module->base_bdevs_constraint.value != 0) { +- SPDK_ERRLOG("Unexpected constraint value '%u' provided for raid bdev '%s'.\n", +- (uint8_t)module->base_bdevs_constraint.value, name); +- return -EINVAL; +- } +- min_operational = num_base_bdevs; +- break; +- default: +- SPDK_ERRLOG("Unrecognised constraint type '%u' in module for raid level '%s'.\n", +- (uint8_t)module->base_bdevs_constraint.type, +- raid_bdev_level_to_str(module->level)); +- return -EINVAL; +- }; +- +- if (min_operational == 0 || min_operational > num_base_bdevs) { +- SPDK_ERRLOG("Wrong constraint value for raid level '%s'.\n", +- raid_bdev_level_to_str(module->level)); 
+- return -EINVAL; +- } +- +- raid_bdev = calloc(1, sizeof(*raid_bdev)); +- if (!raid_bdev) { +- SPDK_ERRLOG("Unable to allocate memory for raid bdev\n"); +- return -ENOMEM; +- } +- +- raid_bdev->module = module; +- raid_bdev->num_base_bdevs = num_base_bdevs; +- raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs, +- sizeof(struct raid_base_bdev_info)); +- if (!raid_bdev->base_bdev_info) { +- SPDK_ERRLOG("Unable able to allocate base bdev info\n"); +- free(raid_bdev); +- return -ENOMEM; +- } +- +- /* strip_size_kb is from the rpc param. strip_size is in blocks and used +- * internally and set later. +- */ +- raid_bdev->strip_size = 0; +- raid_bdev->strip_size_kb = strip_size; +- raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; +- raid_bdev->level = level; +- raid_bdev->min_base_bdevs_operational = min_operational; +- +- raid_bdev_gen = &raid_bdev->bdev; +- +- raid_bdev_gen->name = strdup(name); +- if (!raid_bdev_gen->name) { +- SPDK_ERRLOG("Unable to allocate name for raid\n"); +- free(raid_bdev->base_bdev_info); +- free(raid_bdev); +- return -ENOMEM; +- } +- +- raid_bdev_gen->product_name = "Raid Volume"; +- raid_bdev_gen->ctxt = raid_bdev; +- raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; +- raid_bdev_gen->module = &g_raid_if; +- raid_bdev_gen->write_cache = 0; +- +- TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link); +- +- *raid_bdev_out = raid_bdev; +- +- return 0; +-} +- +-/* +- * brief: +- * Check underlying block devices against support for metadata. Do not configure +- * md support when parameters from block devices are inconsistent. +- * params: +- * raid_bdev - pointer to raid bdev +- * returns: +- * 0 - The raid bdev md parameters were successfully configured. +- * non zero - Failed to configure md. +- */ +-static int +-raid_bdev_configure_md(struct raid_bdev *raid_bdev) +-{ +- struct spdk_bdev *base_bdev; +- uint8_t i; +- +- for (i = 0; i < raid_bdev->num_base_bdevs; i++) { +- base_bdev = raid_bdev->base_bdev_info[i].bdev; +- +- if (i == 0) { +- raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev); +- raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev); +- raid_bdev->bdev.dif_type = spdk_bdev_get_dif_type(base_bdev); +- raid_bdev->bdev.dif_is_head_of_md = spdk_bdev_is_dif_head_of_md(base_bdev); +- raid_bdev->bdev.dif_check_flags = base_bdev->dif_check_flags; +- continue; +- } +- +- if (raid_bdev->bdev.md_len != spdk_bdev_get_md_size(base_bdev) || +- raid_bdev->bdev.md_interleave != spdk_bdev_is_md_interleaved(base_bdev) || +- raid_bdev->bdev.dif_type != spdk_bdev_get_dif_type(base_bdev) || +- raid_bdev->bdev.dif_is_head_of_md != spdk_bdev_is_dif_head_of_md(base_bdev) || +- raid_bdev->bdev.dif_check_flags != base_bdev->dif_check_flags) { +- SPDK_ERRLOG("base bdevs are configured with different metadata formats\n"); +- return -EPERM; +- } +- } +- +- return 0; +-} +- +-/* +- * brief: +- * If raid bdev config is complete, then only register the raid bdev to +- * bdev layer and remove this raid bdev from configuring list and +- * insert the raid bdev to configured list +- * params: +- * raid_bdev - pointer to raid bdev +- * returns: +- * 0 - success +- * non zero - failure +- */ +-static int +-raid_bdev_configure(struct raid_bdev *raid_bdev) +-{ +- uint32_t blocklen = 0; +- struct spdk_bdev *raid_bdev_gen; +- struct raid_base_bdev_info *base_info; +- int rc = 0; +- +- assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); +- assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs); +- +- 
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- assert(base_info->bdev != NULL); +- /* Check blocklen for all base bdevs that it should be same */ +- if (blocklen == 0) { +- blocklen = base_info->bdev->blocklen; +- } else if (blocklen != base_info->bdev->blocklen) { +- /* +- * Assumption is that all the base bdevs for any raid bdev should +- * have same blocklen +- */ +- SPDK_ERRLOG("Blocklen of various bdevs not matching\n"); +- return -EINVAL; +- } +- } +- assert(blocklen > 0); +- +- /* The strip_size_kb is read in from user in KB. Convert to blocks here for +- * internal use. +- */ +- raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen; +- raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); +- raid_bdev->blocklen_shift = spdk_u32log2(blocklen); +- +- raid_bdev_gen = &raid_bdev->bdev; +- raid_bdev_gen->blocklen = blocklen; +- +- rc = raid_bdev_configure_md(raid_bdev); +- if (rc != 0) { +- SPDK_ERRLOG("raid metadata configuration failed\n"); +- return rc; +- } +- +- rc = raid_bdev->module->start(raid_bdev); +- if (rc != 0) { +- SPDK_ERRLOG("raid module startup callback failed\n"); +- return rc; +- } +- raid_bdev->state = RAID_BDEV_STATE_ONLINE; +- SPDK_DEBUGLOG(bdev_raid, "io device register %p\n", raid_bdev); +- SPDK_DEBUGLOG(bdev_raid, "blockcnt %" PRIu64 ", blocklen %u\n", +- raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen); +- spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, +- sizeof(struct raid_bdev_io_channel), +- raid_bdev->bdev.name); +- rc = spdk_bdev_register(raid_bdev_gen); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n"); +- if (raid_bdev->module->stop != NULL) { +- raid_bdev->module->stop(raid_bdev); +- } +- spdk_io_device_unregister(raid_bdev, NULL); +- raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; +- return rc; +- } +- SPDK_DEBUGLOG(bdev_raid, "raid bdev generic %p\n", raid_bdev_gen); +- SPDK_DEBUGLOG(bdev_raid, "raid bdev is created with name %s, raid_bdev %p\n", +- raid_bdev_gen->name, raid_bdev); +- +- return 0; +-} +- +-/* +- * brief: +- * If raid bdev is online and registered, change the bdev state to +- * configuring and unregister this raid device. Queue this raid device +- * in configuring list +- * params: +- * raid_bdev - pointer to raid bdev +- * cb_fn - callback function +- * cb_arg - argument to callback function +- * returns: +- * none +- */ +-static void +-raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, +- void *cb_arg) +-{ +- if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { +- if (cb_fn) { +- cb_fn(cb_arg, 0); +- } +- return; +- } +- +- assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered); +- raid_bdev->state = RAID_BDEV_STATE_OFFLINE; +- assert(raid_bdev->num_base_bdevs_discovered); +- SPDK_DEBUGLOG(bdev_raid, "raid bdev state changing from online to offline\n"); +- +- spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg); +-} +- +-/* +- * brief: +- * raid_bdev_find_by_base_bdev function finds the raid bdev which has +- * claimed the base bdev. +- * params: +- * base_bdev - pointer to base bdev pointer +- * _raid_bdev - Reference to pointer to raid bdev +- * _base_info - Reference to the raid base bdev info. +- * returns: +- * true - if the raid bdev is found. +- * false - if the raid bdev is not found. 
+- */ +-static bool +-raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev, +- struct raid_base_bdev_info **_base_info) +-{ +- struct raid_bdev *raid_bdev; +- struct raid_base_bdev_info *base_info; +- +- TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- if (base_info->bdev == base_bdev) { +- *_raid_bdev = raid_bdev; +- *_base_info = base_info; +- return true; +- } +- } +- } +- +- return false; +-} +- +-/* +- * brief: +- * raid_bdev_remove_base_bdev function is called by below layers when base_bdev +- * is removed. This function checks if this base bdev is part of any raid bdev +- * or not. If yes, it takes necessary action on that particular raid bdev. +- * params: +- * base_bdev - pointer to base bdev which got removed +- * returns: +- * none +- */ +-static void +-raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev) +-{ +- struct raid_bdev *raid_bdev = NULL; +- struct raid_base_bdev_info *base_info; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_remove_base_bdev\n"); +- +- /* Find the raid_bdev which has claimed this base_bdev */ +- if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) { +- SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); +- return; +- } +- +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- assert(base_info->desc); +- base_info->remove_scheduled = true; +- +- if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { +- /* +- * As raid bdev is not registered yet or already unregistered, +- * so cleanup should be done here itself. +- */ +- raid_bdev_free_base_bdev_resource(raid_bdev, base_info); +- if (raid_bdev->num_base_bdevs_discovered == 0) { +- /* There is no base bdev for this raid, so free the raid device. */ +- raid_bdev_cleanup_and_free(raid_bdev); +- return; +- } +- } +- +- raid_bdev_deconfigure(raid_bdev, NULL, NULL); +-} +- +-/* +- * brief: +- * raid_bdev_resize_base_bdev function is called by below layers when base_bdev +- * is resized. This function checks if the smallest size of the base_bdevs is changed. +- * If yes, call module handler to resize the raid_bdev if implemented. +- * params: +- * base_bdev - pointer to base bdev which got resized. +- * returns: +- * none +- */ +-static void +-raid_bdev_resize_base_bdev(struct spdk_bdev *base_bdev) +-{ +- struct raid_bdev *raid_bdev = NULL; +- struct raid_base_bdev_info *base_info; +- +- SPDK_DEBUGLOG(bdev_raid, "raid_bdev_resize_base_bdev\n"); +- +- /* Find the raid_bdev which has claimed this base_bdev */ +- if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) { +- SPDK_ERRLOG("raid_bdev whose base_bdev '%s' not found\n", base_bdev->name); +- return; +- } +- +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- +- SPDK_NOTICELOG("base_bdev '%s' was resized: old size %" PRIu64 ", new size %" PRIu64 "\n", +- base_bdev->name, base_info->blockcnt, base_bdev->blockcnt); +- +- if (raid_bdev->module->resize) { +- raid_bdev->module->resize(raid_bdev); +- } +-} +- +-/* +- * brief: +- * raid_bdev_event_base_bdev function is called by below layers when base_bdev +- * triggers asynchronous event. +- * params: +- * type - event details. +- * bdev - bdev that triggered event. +- * event_ctx - context for event. 
+- * returns: +- * none +- */ +-static void +-raid_bdev_event_base_bdev(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- raid_bdev_remove_base_bdev(bdev); +- break; +- case SPDK_BDEV_EVENT_RESIZE: +- raid_bdev_resize_base_bdev(bdev); +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-/* +- * brief: +- * Deletes the specified raid bdev +- * params: +- * raid_bdev - pointer to raid bdev +- * cb_fn - callback function +- * cb_arg - argument to callback function +- */ +-void +-raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_arg) +-{ +- struct raid_base_bdev_info *base_info; +- +- SPDK_DEBUGLOG(bdev_raid, "delete raid bdev: %s\n", raid_bdev->bdev.name); +- +- if (raid_bdev->destroy_started) { +- SPDK_DEBUGLOG(bdev_raid, "destroying raid bdev %s is already started\n", +- raid_bdev->bdev.name); +- if (cb_fn) { +- cb_fn(cb_arg, -EALREADY); +- } +- return; +- } +- +- raid_bdev->destroy_started = true; +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- base_info->remove_scheduled = true; +- +- if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { +- /* +- * As raid bdev is not registered yet or already unregistered, +- * so cleanup should be done here itself. +- */ +- raid_bdev_free_base_bdev_resource(raid_bdev, base_info); +- } +- } +- +- if (raid_bdev->num_base_bdevs_discovered == 0) { +- /* There is no base bdev for this raid, so free the raid device. */ +- raid_bdev_cleanup_and_free(raid_bdev); +- if (cb_fn) { +- cb_fn(cb_arg, 0); +- } +- } else { +- raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg); +- } +-} +- +-static int +-raid_bdev_configure_base_bdev(struct raid_bdev *raid_bdev, struct raid_base_bdev_info *base_info) +-{ +- struct spdk_bdev_desc *desc; +- struct spdk_bdev *bdev; +- int rc; +- +- assert(spdk_get_thread() == spdk_thread_get_app_thread()); +- assert(base_info->name != NULL); +- assert(base_info->bdev == NULL); +- +- rc = spdk_bdev_open_ext(base_info->name, true, raid_bdev_event_base_bdev, NULL, &desc); +- if (rc != 0) { +- if (rc != -ENODEV) { +- SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", base_info->name); +- } +- return rc; +- } +- +- bdev = spdk_bdev_desc_get_bdev(desc); +- +- rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); +- if (rc != 0) { +- SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); +- spdk_bdev_close(desc); +- return rc; +- } +- +- SPDK_DEBUGLOG(bdev_raid, "bdev %s is claimed\n", bdev->name); +- +- assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); +- +- base_info->bdev = bdev; +- base_info->desc = desc; +- base_info->blockcnt = bdev->blockcnt; +- raid_bdev->num_base_bdevs_discovered++; +- assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); +- +- if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) { +- rc = raid_bdev_configure(raid_bdev); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to configure raid bdev\n"); +- return rc; +- } +- } +- +- return 0; +-} +- +-/* +- * brief: +- * raid_bdev_add_base_device function is the actual function which either adds +- * the nvme base device to existing raid bdev or create a new raid bdev. It also claims +- * the base device and keep the open descriptor. 
+- * params: +- * raid_bdev - pointer to raid bdev +- * name - name of the base bdev +- * slot - position to add base bdev +- * returns: +- * 0 - success +- * non zero - failure +- */ +-int +-raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot) +-{ +- struct raid_base_bdev_info *base_info; +- int rc; +- +- if (slot >= raid_bdev->num_base_bdevs) { +- return -EINVAL; +- } +- +- base_info = &raid_bdev->base_bdev_info[slot]; +- +- if (base_info->name != NULL) { +- SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev '%s'\n", +- slot, raid_bdev->bdev.name, base_info->name); +- return -EBUSY; +- } +- +- base_info->name = strdup(name); +- if (base_info->name == NULL) { +- return -ENOMEM; +- } +- +- rc = raid_bdev_configure_base_bdev(raid_bdev, base_info); +- if (rc != 0) { +- if (rc != -ENODEV) { +- SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", name); +- } +- return rc; +- } +- +- return 0; +-} +- +-/* +- * brief: +- * raid_bdev_examine function is the examine function call by the below layers +- * like bdev_nvme layer. This function will check if this base bdev can be +- * claimed by this raid bdev or not. +- * params: +- * bdev - pointer to base bdev +- * returns: +- * none +- */ +-static void +-raid_bdev_examine(struct spdk_bdev *bdev) +-{ +- struct raid_bdev *raid_bdev; +- struct raid_base_bdev_info *base_info; +- +- TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- if (base_info->bdev == NULL && strcmp(bdev->name, base_info->name) == 0) { +- raid_bdev_configure_base_bdev(raid_bdev, base_info); +- break; +- } +- } +- } +- +- spdk_bdev_module_examine_done(&g_raid_if); +-} +- +-/* Log component for bdev raid bdev module */ +-SPDK_LOG_REGISTER_COMPONENT(bdev_raid) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "bdev_raid.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/json.h" ++ ++static bool g_shutdown_started = false; ++ ++/* List of all raid bdevs */ ++struct raid_all_tailq g_raid_bdev_list = TAILQ_HEAD_INITIALIZER(g_raid_bdev_list); ++ ++static TAILQ_HEAD(, raid_bdev_module) g_raid_modules = TAILQ_HEAD_INITIALIZER(g_raid_modules); ++ ++static struct raid_bdev_module * ++raid_bdev_module_find(enum raid_level level) ++{ ++ struct raid_bdev_module *raid_module; ++ ++ TAILQ_FOREACH(raid_module, &g_raid_modules, link) { ++ if (raid_module->level == level) { ++ return raid_module; ++ } ++ } ++ ++ return NULL; ++} ++ ++void ++raid_bdev_module_list_add(struct raid_bdev_module *raid_module) ++{ ++ if (raid_bdev_module_find(raid_module->level) != NULL) { ++ SPDK_ERRLOG("module for raid level '%s' already registered.\n", ++ raid_bdev_level_to_str(raid_module->level)); ++ assert(false); ++ } else { ++ TAILQ_INSERT_TAIL(&g_raid_modules, raid_module, link); ++ } ++} ++ ++/* Function declarations */ ++static void raid_bdev_examine(struct spdk_bdev *bdev); ++static int raid_bdev_init(void); ++static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev, ++ raid_bdev_destruct_cb cb_fn, void *cb_arg); ++ ++/* ++ * brief: ++ * raid_bdev_create_cb function is a cb function for raid bdev which creates the ++ * hierarchy from raid bdev to base bdev io channels. 
It will be called per core ++ * params: ++ * io_device - pointer to raid bdev io device represented by raid_bdev ++ * ctx_buf - pointer to context buffer for raid bdev io channel ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++static int ++raid_bdev_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct raid_bdev *raid_bdev = io_device; ++ struct raid_bdev_io_channel *raid_ch = ctx_buf; ++ uint8_t i; ++ int ret = 0; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_create_cb, %p\n", raid_ch); ++ ++ assert(raid_bdev != NULL); ++ assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); ++ ++ raid_ch->num_channels = raid_bdev->num_base_bdevs; ++ ++ raid_ch->base_channel = calloc(raid_ch->num_channels, ++ sizeof(struct spdk_io_channel *)); ++ if (!raid_ch->base_channel) { ++ SPDK_ERRLOG("Unable to allocate base bdevs io channel\n"); ++ return -ENOMEM; ++ } ++ for (i = 0; i < raid_ch->num_channels; i++) { ++ /* ++ * Get the spdk_io_channel for all the base bdevs. This is used during ++ * split logic to send the respective child bdev ios to respective base ++ * bdev io channel. ++ */ ++ raid_ch->base_channel[i] = spdk_bdev_get_io_channel( ++ raid_bdev->base_bdev_info[i].desc); ++ if (!raid_ch->base_channel[i]) { ++ SPDK_ERRLOG("Unable to create io channel for base bdev\n"); ++ ret = -ENOMEM; ++ break; ++ } ++ } ++ ++ if (!ret && raid_bdev->module->get_io_channel) { ++ raid_ch->module_channel = raid_bdev->module->get_io_channel(raid_bdev); ++ if (!raid_ch->module_channel) { ++ SPDK_ERRLOG("Unable to create io channel for raid module\n"); ++ ret = -ENOMEM; ++ } ++ } ++ ++ if (ret) { ++ uint8_t j; ++ ++ for (j = 0; j < i; j++) { ++ spdk_put_io_channel(raid_ch->base_channel[j]); ++ } ++ free(raid_ch->base_channel); ++ raid_ch->base_channel = NULL; ++ } ++ return ret; ++} ++ ++/* ++ * brief: ++ * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the ++ * hierarchy from raid bdev to base bdev io channels. It will be called per core ++ * params: ++ * io_device - pointer to raid bdev io device represented by raid_bdev ++ * ctx_buf - pointer to context buffer for raid bdev io channel ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct raid_bdev_io_channel *raid_ch = ctx_buf; ++ uint8_t i; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destroy_cb\n"); ++ ++ assert(raid_ch != NULL); ++ assert(raid_ch->base_channel); ++ ++ if (raid_ch->module_channel) { ++ spdk_put_io_channel(raid_ch->module_channel); ++ } ++ ++ for (i = 0; i < raid_ch->num_channels; i++) { ++ /* Free base bdev channels */ ++ assert(raid_ch->base_channel[i] != NULL); ++ spdk_put_io_channel(raid_ch->base_channel[i]); ++ } ++ free(raid_ch->base_channel); ++ raid_ch->base_channel = NULL; ++} ++ ++/* ++ * brief: ++ * raid_bdev_cleanup is used to cleanup raid_bdev related data ++ * structures. 
++ * params: ++ * raid_bdev - pointer to raid_bdev ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_cleanup(struct raid_bdev *raid_bdev) ++{ ++ struct raid_base_bdev_info *base_info; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_cleanup, %p name %s, state %s\n", ++ raid_bdev, raid_bdev->bdev.name, raid_bdev_state_to_str(raid_bdev->state)); ++ assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ assert(base_info->bdev == NULL); ++ assert(base_info->desc == NULL); ++ free(base_info->name); ++ } ++ ++ TAILQ_REMOVE(&g_raid_bdev_list, raid_bdev, global_link); ++ free(raid_bdev->base_bdev_info); ++} ++ ++static void ++raid_bdev_free(struct raid_bdev *raid_bdev) ++{ ++ free(raid_bdev->bdev.name); ++ free(raid_bdev); ++} ++ ++static void ++raid_bdev_cleanup_and_free(struct raid_bdev *raid_bdev) ++{ ++ raid_bdev_cleanup(raid_bdev); ++ raid_bdev_free(raid_bdev); ++} ++ ++/* ++ * brief: ++ * free resource of base bdev for raid bdev ++ * params: ++ * raid_bdev - pointer to raid bdev ++ * base_info - raid base bdev info ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++static void ++raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, ++ struct raid_base_bdev_info *base_info) ++{ ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ free(base_info->name); ++ base_info->name = NULL; ++ ++ if (base_info->bdev == NULL) { ++ return; ++ } ++ ++ assert(base_info->desc); ++ spdk_bdev_module_release_bdev(base_info->bdev); ++ spdk_bdev_close(base_info->desc); ++ base_info->desc = NULL; ++ base_info->bdev = NULL; ++ ++ assert(raid_bdev->num_base_bdevs_discovered); ++ raid_bdev->num_base_bdevs_discovered--; ++} ++ ++static void ++raid_bdev_io_device_unregister_cb(void *io_device) ++{ ++ struct raid_bdev *raid_bdev = io_device; ++ ++ if (raid_bdev->num_base_bdevs_discovered == 0) { ++ /* Free raid_bdev when there are no base bdevs left */ ++ SPDK_DEBUGLOG(bdev_raid, "raid bdev base bdevs is 0, going to free all in destruct\n"); ++ raid_bdev_cleanup(raid_bdev); ++ spdk_bdev_destruct_done(&raid_bdev->bdev, 0); ++ raid_bdev_free(raid_bdev); ++ } else { ++ spdk_bdev_destruct_done(&raid_bdev->bdev, 0); ++ } ++} ++ ++void ++raid_bdev_module_stop_done(struct raid_bdev *raid_bdev) ++{ ++ if (raid_bdev->state != RAID_BDEV_STATE_CONFIGURING) { ++ spdk_io_device_unregister(raid_bdev, raid_bdev_io_device_unregister_cb); ++ } ++} ++ ++static void ++_raid_bdev_destruct(void *ctxt) ++{ ++ struct raid_bdev *raid_bdev = ctxt; ++ struct raid_base_bdev_info *base_info; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_destruct\n"); ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ /* ++ * Close all base bdev descriptors for which call has come from below ++ * layers. Also close the descriptors if we have started shutdown. 
++ */ ++ if (g_shutdown_started || base_info->remove_scheduled == true) { ++ raid_bdev_free_base_bdev_resource(raid_bdev, base_info); ++ } ++ } ++ ++ if (g_shutdown_started) { ++ raid_bdev->state = RAID_BDEV_STATE_OFFLINE; ++ } ++ ++ if (raid_bdev->module->stop != NULL) { ++ if (raid_bdev->module->stop(raid_bdev) == false) { ++ return; ++ } ++ } ++ ++ raid_bdev_module_stop_done(raid_bdev); ++} ++ ++static int ++raid_bdev_destruct(void *ctx) ++{ ++ spdk_thread_exec_msg(spdk_thread_get_app_thread(), _raid_bdev_destruct, ctx); ++ ++ return 1; ++} ++ ++void ++raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ ++ spdk_bdev_io_complete(bdev_io, status); ++} ++ ++/* ++ * brief: ++ * raid_bdev_io_complete_part - signal the completion of a part of the expected ++ * base bdev IOs and complete the raid_io if this is the final expected IO. ++ * The caller should first set raid_io->base_bdev_io_remaining. This function ++ * will decrement this counter by the value of the 'completed' parameter and ++ * complete the raid_io if the counter reaches 0. The caller is free to ++ * interpret the 'base_bdev_io_remaining' and 'completed' values as needed, ++ * it can represent e.g. blocks or IOs. ++ * params: ++ * raid_io - pointer to raid_bdev_io ++ * completed - the part of the raid_io that has been completed ++ * status - status of the base IO ++ * returns: ++ * true - if the raid_io is completed ++ * false - otherwise ++ */ ++bool ++raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, ++ enum spdk_bdev_io_status status) ++{ ++ assert(raid_io->base_bdev_io_remaining >= completed); ++ raid_io->base_bdev_io_remaining -= completed; ++ ++ if (status != SPDK_BDEV_IO_STATUS_SUCCESS) { ++ raid_io->base_bdev_io_status = status; ++ } ++ ++ if (raid_io->base_bdev_io_remaining == 0) { ++ raid_bdev_io_complete(raid_io, raid_io->base_bdev_io_status); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++/* ++ * brief: ++ * raid_bdev_queue_io_wait function processes the IO which failed to submit. ++ * It will try to queue the IOs after storing the context to bdev wait queue logic. ++ * params: ++ * raid_io - pointer to raid_bdev_io ++ * bdev - the block device that the IO is submitted to ++ * ch - io channel ++ * cb_fn - callback when the spdk_bdev_io for bdev becomes available ++ * returns: ++ * none ++ */ ++void ++raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn) ++{ ++ raid_io->waitq_entry.bdev = bdev; ++ raid_io->waitq_entry.cb_fn = cb_fn; ++ raid_io->waitq_entry.cb_arg = raid_io; ++ spdk_bdev_queue_io_wait(bdev, ch, &raid_io->waitq_entry); ++} ++ ++static void ++raid_base_bdev_reset_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ raid_bdev_io_complete_part(raid_io, 1, success ? 
++ SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io); ++ ++static void ++_raid_bdev_submit_reset_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ raid_bdev_submit_reset_request(raid_io); ++} ++ ++/* ++ * brief: ++ * raid_bdev_submit_reset_request function submits reset requests ++ * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in ++ * which case it will queue it for later submission ++ * params: ++ * raid_io ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_submit_reset_request(struct raid_bdev_io *raid_io) ++{ ++ struct raid_bdev *raid_bdev; ++ int ret; ++ uint8_t i; ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ ++ raid_bdev = raid_io->raid_bdev; ++ ++ if (raid_io->base_bdev_io_remaining == 0) { ++ raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; ++ } ++ ++ while (raid_io->base_bdev_io_submitted < raid_bdev->num_base_bdevs) { ++ i = raid_io->base_bdev_io_submitted; ++ base_info = &raid_bdev->base_bdev_info[i]; ++ base_ch = raid_io->raid_ch->base_channel[i]; ++ ret = spdk_bdev_reset(base_info->desc, base_ch, ++ raid_base_bdev_reset_complete, raid_io); ++ if (ret == 0) { ++ raid_io->base_bdev_io_submitted++; ++ } else if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid_bdev_submit_reset_request); ++ return; ++ } else { ++ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); ++ assert(false); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ } ++} ++ ++/* ++ * brief: ++ * Callback function to spdk_bdev_io_get_buf. ++ * params: ++ * ch - pointer to raid bdev io channel ++ * bdev_io - pointer to parent bdev_io on raid bdev device ++ * success - True if buffer is allocated or false otherwise. ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; ++ ++ if (!success) { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ raid_io->raid_bdev->module->submit_rw_request(raid_io); ++} ++ ++/* ++ * brief: ++ * raid_bdev_submit_request function is the submit_request function pointer of ++ * raid bdev function table. This is used to submit the io on raid_bdev to below ++ * layers. 
++ * params: ++ * ch - pointer to raid bdev io channel ++ * bdev_io - pointer to parent bdev_io on raid bdev device ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; ++ ++ raid_io->raid_bdev = bdev_io->bdev->ctxt; ++ raid_io->raid_ch = spdk_io_channel_get_ctx(ch); ++ raid_io->base_bdev_io_remaining = 0; ++ raid_io->base_bdev_io_submitted = 0; ++ raid_io->base_bdev_io_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, raid_bdev_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ raid_io->raid_bdev->module->submit_rw_request(raid_io); ++ break; ++ ++ case SPDK_BDEV_IO_TYPE_RESET: ++ raid_bdev_submit_reset_request(raid_io); ++ break; ++ ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ raid_io->raid_bdev->module->submit_null_payload_request(raid_io); ++ break; ++ ++ default: ++ SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ break; ++ } ++} ++ ++/* ++ * brief: ++ * _raid_bdev_io_type_supported checks whether io_type is supported in ++ * all base bdev modules of raid bdev module. If anyone among the base_bdevs ++ * doesn't support, the raid device doesn't supports. ++ * ++ * params: ++ * raid_bdev - pointer to raid bdev context ++ * io_type - io type ++ * returns: ++ * true - io_type is supported ++ * false - io_type is not supported ++ */ ++inline static bool ++_raid_bdev_io_type_supported(struct raid_bdev *raid_bdev, enum spdk_bdev_io_type io_type) ++{ ++ struct raid_base_bdev_info *base_info; ++ ++ if (io_type == SPDK_BDEV_IO_TYPE_FLUSH || ++ io_type == SPDK_BDEV_IO_TYPE_UNMAP) { ++ if (raid_bdev->module->submit_null_payload_request == NULL) { ++ return false; ++ } ++ } ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ if (base_info->bdev == NULL) { ++ assert(false); ++ continue; ++ } ++ ++ if (spdk_bdev_io_type_supported(base_info->bdev, io_type) == false) { ++ return false; ++ } ++ } ++ ++ return true; ++} ++ ++/* ++ * brief: ++ * raid_bdev_io_type_supported is the io_supported function for bdev function ++ * table which returns whether the particular io type is supported or not by ++ * raid bdev module ++ * params: ++ * ctx - pointer to raid bdev context ++ * type - io type ++ * returns: ++ * true - io_type is supported ++ * false - io_type is not supported ++ */ ++static bool ++raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return true; ++ ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ return _raid_bdev_io_type_supported(ctx, io_type); ++ ++ default: ++ return false; ++ } ++ ++ return false; ++} ++ ++/* ++ * brief: ++ * raid_bdev_get_io_channel is the get_io_channel function table pointer for ++ * raid bdev. 
This is used to return the io channel for this raid bdev ++ * params: ++ * ctxt - pointer to raid_bdev ++ * returns: ++ * pointer to io channel for raid bdev ++ */ ++static struct spdk_io_channel * ++raid_bdev_get_io_channel(void *ctxt) ++{ ++ struct raid_bdev *raid_bdev = ctxt; ++ ++ return spdk_get_io_channel(raid_bdev); ++} ++ ++void ++raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w) ++{ ++ struct raid_base_bdev_info *base_info; ++ ++ assert(raid_bdev != NULL); ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); ++ spdk_json_write_named_string(w, "state", raid_bdev_state_to_str(raid_bdev->state)); ++ spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); ++ spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); ++ spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); ++ spdk_json_write_name(w, "base_bdevs_list"); ++ spdk_json_write_array_begin(w); ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ if (base_info->bdev) { ++ spdk_json_write_string(w, base_info->bdev->name); ++ } else { ++ spdk_json_write_null(w); ++ } ++ } ++ spdk_json_write_array_end(w); ++} ++ ++/* ++ * brief: ++ * raid_bdev_dump_info_json is the function table pointer for raid bdev ++ * params: ++ * ctx - pointer to raid_bdev ++ * w - pointer to json context ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++static int ++raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct raid_bdev *raid_bdev = ctx; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_dump_config_json\n"); ++ ++ /* Dump the raid bdev configuration related information */ ++ spdk_json_write_named_object_begin(w, "raid"); ++ raid_bdev_write_info_json(raid_bdev, w); ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * raid_bdev_write_config_json is the function table pointer for raid bdev ++ * params: ++ * bdev - pointer to spdk_bdev ++ * w - pointer to json context ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct raid_bdev *raid_bdev = bdev->ctxt; ++ struct raid_base_bdev_info *base_info; ++ ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_raid_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bdev->name); ++ spdk_json_write_named_uint32(w, "strip_size_kb", raid_bdev->strip_size_kb); ++ spdk_json_write_named_string(w, "raid_level", raid_bdev_level_to_str(raid_bdev->level)); ++ ++ spdk_json_write_named_array_begin(w, "base_bdevs"); ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ if (base_info->bdev) { ++ spdk_json_write_string(w, base_info->bdev->name); ++ } ++ } ++ spdk_json_write_array_end(w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static int ++raid_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) ++{ ++ struct raid_bdev *raid_bdev = ctx; ++ struct spdk_bdev *base_bdev; ++ uint32_t i; ++ int domains_count = 0, rc; ++ ++ /* First loop to get the number of memory domains */ ++ for (i = 0; i < raid_bdev->num_base_bdevs; i++) { ++ base_bdev = raid_bdev->base_bdev_info[i].bdev; ++ rc = spdk_bdev_get_memory_domains(base_bdev, NULL, 0); 
++ if (rc < 0) { ++ return rc; ++ } ++ domains_count += rc; ++ } ++ ++ if (!domains || array_size < domains_count) { ++ return domains_count; ++ } ++ ++ for (i = 0; i < raid_bdev->num_base_bdevs; i++) { ++ base_bdev = raid_bdev->base_bdev_info[i].bdev; ++ rc = spdk_bdev_get_memory_domains(base_bdev, domains, array_size); ++ if (rc < 0) { ++ return rc; ++ } ++ domains += rc; ++ array_size -= rc; ++ } ++ ++ return domains_count; ++} ++ ++/* g_raid_bdev_fn_table is the function table for raid bdev */ ++static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { ++ .destruct = raid_bdev_destruct, ++ .submit_request = raid_bdev_submit_request, ++ .io_type_supported = raid_bdev_io_type_supported, ++ .get_io_channel = raid_bdev_get_io_channel, ++ .dump_info_json = raid_bdev_dump_info_json, ++ .write_config_json = raid_bdev_write_config_json, ++ .get_memory_domains = raid_bdev_get_memory_domains, ++}; ++ ++struct raid_bdev * ++raid_bdev_find_by_name(const char *name) ++{ ++ struct raid_bdev *raid_bdev; ++ ++ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { ++ if (strcmp(raid_bdev->bdev.name, name) == 0) { ++ return raid_bdev; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct { ++ const char *name; ++ enum raid_level value; ++} g_raid_level_names[] = { ++ { "raid0", RAID0 }, ++ { "0", RAID0 }, ++ { "raid1", RAID1 }, ++ { "1", RAID1 }, ++ { "raid5f", RAID5F }, ++ { "5f", RAID5F }, ++ { "concat", CONCAT }, ++ { } ++}; ++ ++static struct { ++ const char *name; ++ enum raid_bdev_state value; ++} g_raid_state_names[] = { ++ { "online", RAID_BDEV_STATE_ONLINE }, ++ { "configuring", RAID_BDEV_STATE_CONFIGURING }, ++ { "offline", RAID_BDEV_STATE_OFFLINE }, ++ { } ++}; ++ ++/* We have to use the typedef in the function declaration to appease astyle. */ ++typedef enum raid_level raid_level_t; ++typedef enum raid_bdev_state raid_bdev_state_t; ++ ++raid_level_t ++raid_bdev_str_to_level(const char *str) ++{ ++ unsigned int i; ++ ++ assert(str != NULL); ++ ++ for (i = 0; g_raid_level_names[i].name != NULL; i++) { ++ if (strcasecmp(g_raid_level_names[i].name, str) == 0) { ++ return g_raid_level_names[i].value; ++ } ++ } ++ ++ return INVALID_RAID_LEVEL; ++} ++ ++const char * ++raid_bdev_level_to_str(enum raid_level level) ++{ ++ unsigned int i; ++ ++ for (i = 0; g_raid_level_names[i].name != NULL; i++) { ++ if (g_raid_level_names[i].value == level) { ++ return g_raid_level_names[i].name; ++ } ++ } ++ ++ return ""; ++} ++ ++raid_bdev_state_t ++raid_bdev_str_to_state(const char *str) ++{ ++ unsigned int i; ++ ++ assert(str != NULL); ++ ++ for (i = 0; g_raid_state_names[i].name != NULL; i++) { ++ if (strcasecmp(g_raid_state_names[i].name, str) == 0) { ++ return g_raid_state_names[i].value; ++ } ++ } ++ ++ return RAID_BDEV_STATE_MAX; ++} ++ ++const char * ++raid_bdev_state_to_str(enum raid_bdev_state state) ++{ ++ unsigned int i; ++ ++ for (i = 0; g_raid_state_names[i].name != NULL; i++) { ++ if (g_raid_state_names[i].value == state) { ++ return g_raid_state_names[i].name; ++ } ++ } ++ ++ assert(false); ++ return ""; ++} ++ ++/* ++ * brief: ++ * raid_bdev_fini_start is called when bdev layer is starting the ++ * shutdown process ++ * params: ++ * none ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_fini_start(void) ++{ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_fini_start\n"); ++ g_shutdown_started = true; ++} ++ ++/* ++ * brief: ++ * raid_bdev_exit is called on raid bdev module exit time by bdev layer ++ * params: ++ * none ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_exit(void) ++{ ++ 
struct raid_bdev *raid_bdev, *tmp; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_exit\n"); ++ ++ TAILQ_FOREACH_SAFE(raid_bdev, &g_raid_bdev_list, global_link, tmp) { ++ raid_bdev_cleanup_and_free(raid_bdev); ++ } ++} ++ ++/* ++ * brief: ++ * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid ++ * module ++ * params: ++ * none ++ * returns: ++ * size of spdk_bdev_io context for raid ++ */ ++static int ++raid_bdev_get_ctx_size(void) ++{ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_get_ctx_size\n"); ++ return sizeof(struct raid_bdev_io); ++} ++ ++static struct spdk_bdev_module g_raid_if = { ++ .name = "raid", ++ .module_init = raid_bdev_init, ++ .fini_start = raid_bdev_fini_start, ++ .module_fini = raid_bdev_exit, ++ .get_ctx_size = raid_bdev_get_ctx_size, ++ .examine_config = raid_bdev_examine, ++ .async_init = false, ++ .async_fini = false, ++}; ++SPDK_BDEV_MODULE_REGISTER(raid, &g_raid_if) ++ ++/* ++ * brief: ++ * raid_bdev_init is the initialization function for raid bdev module ++ * params: ++ * none ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++static int ++raid_bdev_init(void) ++{ ++ return 0; ++} ++ ++/* ++ * brief: ++ * raid_bdev_create allocates raid bdev based on passed configuration ++ * params: ++ * name - name for raid bdev ++ * strip_size - strip size in KB ++ * num_base_bdevs - number of base bdevs ++ * level - raid level ++ * raid_bdev_out - the created raid bdev ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++int ++raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, ++ enum raid_level level, struct raid_bdev **raid_bdev_out) ++{ ++ struct raid_bdev *raid_bdev; ++ struct spdk_bdev *raid_bdev_gen; ++ struct raid_bdev_module *module; ++ uint8_t min_operational; ++ ++ if (raid_bdev_find_by_name(name) != NULL) { ++ SPDK_ERRLOG("Duplicate raid bdev name found: %s\n", name); ++ return -EEXIST; ++ } ++ ++ if (level == RAID1) { ++ if (strip_size != 0) { ++ SPDK_ERRLOG("Strip size is not supported by raid1\n"); ++ return -EINVAL; ++ } ++ } else if (spdk_u32_is_pow2(strip_size) == false) { ++ SPDK_ERRLOG("Invalid strip size %" PRIu32 "\n", strip_size); ++ return -EINVAL; ++ } ++ ++ module = raid_bdev_module_find(level); ++ if (module == NULL) { ++ SPDK_ERRLOG("Unsupported raid level '%d'\n", level); ++ return -EINVAL; ++ } ++ ++ assert(module->base_bdevs_min != 0); ++ if (num_base_bdevs < module->base_bdevs_min) { ++ SPDK_ERRLOG("At least %u base devices required for %s\n", ++ module->base_bdevs_min, ++ raid_bdev_level_to_str(level)); ++ return -EINVAL; ++ } ++ ++ switch (module->base_bdevs_constraint.type) { ++ case CONSTRAINT_MAX_BASE_BDEVS_REMOVED: ++ min_operational = num_base_bdevs - module->base_bdevs_constraint.value; ++ break; ++ case CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL: ++ min_operational = module->base_bdevs_constraint.value; ++ break; ++ case CONSTRAINT_UNSET: ++ if (module->base_bdevs_constraint.value != 0) { ++ SPDK_ERRLOG("Unexpected constraint value '%u' provided for raid bdev '%s'.\n", ++ (uint8_t)module->base_bdevs_constraint.value, name); ++ return -EINVAL; ++ } ++ min_operational = num_base_bdevs; ++ break; ++ default: ++ SPDK_ERRLOG("Unrecognised constraint type '%u' in module for raid level '%s'.\n", ++ (uint8_t)module->base_bdevs_constraint.type, ++ raid_bdev_level_to_str(module->level)); ++ return -EINVAL; ++ }; ++ ++ if (min_operational == 0 || min_operational > num_base_bdevs) { ++ SPDK_ERRLOG("Wrong constraint value for raid level '%s'.\n", ++ raid_bdev_level_to_str(module->level)); 
++ return -EINVAL; ++ } ++ ++ raid_bdev = calloc(1, sizeof(*raid_bdev)); ++ if (!raid_bdev) { ++ SPDK_ERRLOG("Unable to allocate memory for raid bdev\n"); ++ return -ENOMEM; ++ } ++ ++ raid_bdev->module = module; ++ raid_bdev->num_base_bdevs = num_base_bdevs; ++ raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs, ++ sizeof(struct raid_base_bdev_info)); ++ if (!raid_bdev->base_bdev_info) { ++ SPDK_ERRLOG("Unable able to allocate base bdev info\n"); ++ free(raid_bdev); ++ return -ENOMEM; ++ } ++ ++ /* strip_size_kb is from the rpc param. strip_size is in blocks and used ++ * internally and set later. ++ */ ++ raid_bdev->strip_size = 0; ++ raid_bdev->strip_size_kb = strip_size; ++ raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; ++ raid_bdev->level = level; ++ raid_bdev->min_base_bdevs_operational = min_operational; ++ ++ raid_bdev_gen = &raid_bdev->bdev; ++ ++ raid_bdev_gen->name = strdup(name); ++ if (!raid_bdev_gen->name) { ++ SPDK_ERRLOG("Unable to allocate name for raid\n"); ++ free(raid_bdev->base_bdev_info); ++ free(raid_bdev); ++ return -ENOMEM; ++ } ++ ++ raid_bdev_gen->product_name = "Raid Volume"; ++ raid_bdev_gen->ctxt = raid_bdev; ++ raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; ++ raid_bdev_gen->module = &g_raid_if; ++ raid_bdev_gen->write_cache = 0; ++ ++ TAILQ_INSERT_TAIL(&g_raid_bdev_list, raid_bdev, global_link); ++ ++ *raid_bdev_out = raid_bdev; ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * Check underlying block devices against support for metadata. Do not configure ++ * md support when parameters from block devices are inconsistent. ++ * params: ++ * raid_bdev - pointer to raid bdev ++ * returns: ++ * 0 - The raid bdev md parameters were successfully configured. ++ * non zero - Failed to configure md. ++ */ ++static int ++raid_bdev_configure_md(struct raid_bdev *raid_bdev) ++{ ++ struct spdk_bdev *base_bdev; ++ uint8_t i; ++ ++ for (i = 0; i < raid_bdev->num_base_bdevs; i++) { ++ base_bdev = raid_bdev->base_bdev_info[i].bdev; ++ ++ if (i == 0) { ++ raid_bdev->bdev.md_len = spdk_bdev_get_md_size(base_bdev); ++ raid_bdev->bdev.md_interleave = spdk_bdev_is_md_interleaved(base_bdev); ++ raid_bdev->bdev.dif_type = spdk_bdev_get_dif_type(base_bdev); ++ raid_bdev->bdev.dif_is_head_of_md = spdk_bdev_is_dif_head_of_md(base_bdev); ++ raid_bdev->bdev.dif_check_flags = base_bdev->dif_check_flags; ++ continue; ++ } ++ ++ if (raid_bdev->bdev.md_len != spdk_bdev_get_md_size(base_bdev) || ++ raid_bdev->bdev.md_interleave != spdk_bdev_is_md_interleaved(base_bdev) || ++ raid_bdev->bdev.dif_type != spdk_bdev_get_dif_type(base_bdev) || ++ raid_bdev->bdev.dif_is_head_of_md != spdk_bdev_is_dif_head_of_md(base_bdev) || ++ raid_bdev->bdev.dif_check_flags != base_bdev->dif_check_flags) { ++ SPDK_ERRLOG("base bdevs are configured with different metadata formats\n"); ++ return -EPERM; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * If raid bdev config is complete, then only register the raid bdev to ++ * bdev layer and remove this raid bdev from configuring list and ++ * insert the raid bdev to configured list ++ * params: ++ * raid_bdev - pointer to raid bdev ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++static int ++raid_bdev_configure(struct raid_bdev *raid_bdev) ++{ ++ uint32_t blocklen = 0; ++ struct spdk_bdev *raid_bdev_gen; ++ struct raid_base_bdev_info *base_info; ++ int rc = 0; ++ ++ assert(raid_bdev->state == RAID_BDEV_STATE_CONFIGURING); ++ assert(raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs); ++ ++ 
RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ assert(base_info->bdev != NULL); ++ /* Check blocklen for all base bdevs that it should be same */ ++ if (blocklen == 0) { ++ blocklen = base_info->bdev->blocklen; ++ } else if (blocklen != base_info->bdev->blocklen) { ++ /* ++ * Assumption is that all the base bdevs for any raid bdev should ++ * have same blocklen ++ */ ++ SPDK_ERRLOG("Blocklen of various bdevs not matching\n"); ++ return -EINVAL; ++ } ++ } ++ assert(blocklen > 0); ++ ++ /* The strip_size_kb is read in from user in KB. Convert to blocks here for ++ * internal use. ++ */ ++ raid_bdev->strip_size = (raid_bdev->strip_size_kb * 1024) / blocklen; ++ raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); ++ raid_bdev->blocklen_shift = spdk_u32log2(blocklen); ++ ++ raid_bdev_gen = &raid_bdev->bdev; ++ raid_bdev_gen->blocklen = blocklen; ++ ++ rc = raid_bdev_configure_md(raid_bdev); ++ if (rc != 0) { ++ SPDK_ERRLOG("raid metadata configuration failed\n"); ++ return rc; ++ } ++ ++ rc = raid_bdev->module->start(raid_bdev); ++ if (rc != 0) { ++ SPDK_ERRLOG("raid module startup callback failed\n"); ++ return rc; ++ } ++ raid_bdev->state = RAID_BDEV_STATE_ONLINE; ++ SPDK_DEBUGLOG(bdev_raid, "io device register %p\n", raid_bdev); ++ SPDK_DEBUGLOG(bdev_raid, "blockcnt %" PRIu64 ", blocklen %u\n", ++ raid_bdev_gen->blockcnt, raid_bdev_gen->blocklen); ++ spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, ++ sizeof(struct raid_bdev_io_channel), ++ raid_bdev->bdev.name); ++ rc = spdk_bdev_register(raid_bdev_gen); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to register raid bdev and stay at configuring state\n"); ++ if (raid_bdev->module->stop != NULL) { ++ raid_bdev->module->stop(raid_bdev); ++ } ++ spdk_io_device_unregister(raid_bdev, NULL); ++ raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; ++ return rc; ++ } ++ SPDK_DEBUGLOG(bdev_raid, "raid bdev generic %p\n", raid_bdev_gen); ++ SPDK_DEBUGLOG(bdev_raid, "raid bdev is created with name %s, raid_bdev %p\n", ++ raid_bdev_gen->name, raid_bdev); ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * If raid bdev is online and registered, change the bdev state to ++ * configuring and unregister this raid device. Queue this raid device ++ * in configuring list ++ * params: ++ * raid_bdev - pointer to raid bdev ++ * cb_fn - callback function ++ * cb_arg - argument to callback function ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_deconfigure(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, ++ void *cb_arg) ++{ ++ if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { ++ if (cb_fn) { ++ cb_fn(cb_arg, 0); ++ } ++ return; ++ } ++ ++ assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered); ++ raid_bdev->state = RAID_BDEV_STATE_OFFLINE; ++ assert(raid_bdev->num_base_bdevs_discovered); ++ SPDK_DEBUGLOG(bdev_raid, "raid bdev state changing from online to offline\n"); ++ ++ spdk_bdev_unregister(&raid_bdev->bdev, cb_fn, cb_arg); ++} ++ ++/* ++ * brief: ++ * raid_bdev_find_by_base_bdev function finds the raid bdev which has ++ * claimed the base bdev. ++ * params: ++ * base_bdev - pointer to base bdev pointer ++ * _raid_bdev - Reference to pointer to raid bdev ++ * _base_info - Reference to the raid base bdev info. ++ * returns: ++ * true - if the raid bdev is found. ++ * false - if the raid bdev is not found. 
++ */ ++static bool ++raid_bdev_find_by_base_bdev(struct spdk_bdev *base_bdev, struct raid_bdev **_raid_bdev, ++ struct raid_base_bdev_info **_base_info) ++{ ++ struct raid_bdev *raid_bdev; ++ struct raid_base_bdev_info *base_info; ++ ++ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ if (base_info->bdev == base_bdev) { ++ *_raid_bdev = raid_bdev; ++ *_base_info = base_info; ++ return true; ++ } ++ } ++ } ++ ++ return false; ++} ++ ++/* ++ * brief: ++ * raid_bdev_remove_base_bdev function is called by below layers when base_bdev ++ * is removed. This function checks if this base bdev is part of any raid bdev ++ * or not. If yes, it takes necessary action on that particular raid bdev. ++ * params: ++ * base_bdev - pointer to base bdev which got removed ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev) ++{ ++ struct raid_bdev *raid_bdev = NULL; ++ struct raid_base_bdev_info *base_info; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_remove_base_bdev\n"); ++ ++ /* Find the raid_bdev which has claimed this base_bdev */ ++ if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) { ++ SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); ++ return; ++ } ++ ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ assert(base_info->desc); ++ base_info->remove_scheduled = true; ++ ++ if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { ++ /* ++ * As raid bdev is not registered yet or already unregistered, ++ * so cleanup should be done here itself. ++ */ ++ raid_bdev_free_base_bdev_resource(raid_bdev, base_info); ++ if (raid_bdev->num_base_bdevs_discovered == 0) { ++ /* There is no base bdev for this raid, so free the raid device. */ ++ raid_bdev_cleanup_and_free(raid_bdev); ++ return; ++ } ++ } ++ ++ raid_bdev_deconfigure(raid_bdev, NULL, NULL); ++} ++ ++/* ++ * brief: ++ * raid_bdev_resize_base_bdev function is called by below layers when base_bdev ++ * is resized. This function checks if the smallest size of the base_bdevs is changed. ++ * If yes, call module handler to resize the raid_bdev if implemented. ++ * params: ++ * base_bdev - pointer to base bdev which got resized. ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_resize_base_bdev(struct spdk_bdev *base_bdev) ++{ ++ struct raid_bdev *raid_bdev = NULL; ++ struct raid_base_bdev_info *base_info; ++ ++ SPDK_DEBUGLOG(bdev_raid, "raid_bdev_resize_base_bdev\n"); ++ ++ /* Find the raid_bdev which has claimed this base_bdev */ ++ if (!raid_bdev_find_by_base_bdev(base_bdev, &raid_bdev, &base_info)) { ++ SPDK_ERRLOG("raid_bdev whose base_bdev '%s' not found\n", base_bdev->name); ++ return; ++ } ++ ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ ++ SPDK_NOTICELOG("base_bdev '%s' was resized: old size %" PRIu64 ", new size %" PRIu64 "\n", ++ base_bdev->name, base_info->blockcnt, base_bdev->blockcnt); ++ ++ if (raid_bdev->module->resize) { ++ raid_bdev->module->resize(raid_bdev); ++ } ++} ++ ++/* ++ * brief: ++ * raid_bdev_event_base_bdev function is called by below layers when base_bdev ++ * triggers asynchronous event. ++ * params: ++ * type - event details. ++ * bdev - bdev that triggered event. ++ * event_ctx - context for event. 
++ * returns: ++ * none ++ */ ++static void ++raid_bdev_event_base_bdev(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ raid_bdev_remove_base_bdev(bdev); ++ break; ++ case SPDK_BDEV_EVENT_RESIZE: ++ raid_bdev_resize_base_bdev(bdev); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++/* ++ * brief: ++ * Deletes the specified raid bdev ++ * params: ++ * raid_bdev - pointer to raid bdev ++ * cb_fn - callback function ++ * cb_arg - argument to callback function ++ */ ++void ++raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_arg) ++{ ++ struct raid_base_bdev_info *base_info; ++ ++ SPDK_DEBUGLOG(bdev_raid, "delete raid bdev: %s\n", raid_bdev->bdev.name); ++ ++ if (raid_bdev->destroy_started) { ++ SPDK_DEBUGLOG(bdev_raid, "destroying raid bdev %s is already started\n", ++ raid_bdev->bdev.name); ++ if (cb_fn) { ++ cb_fn(cb_arg, -EALREADY); ++ } ++ return; ++ } ++ ++ raid_bdev->destroy_started = true; ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ base_info->remove_scheduled = true; ++ ++ if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { ++ /* ++ * As raid bdev is not registered yet or already unregistered, ++ * so cleanup should be done here itself. ++ */ ++ raid_bdev_free_base_bdev_resource(raid_bdev, base_info); ++ } ++ } ++ ++ if (raid_bdev->num_base_bdevs_discovered == 0) { ++ /* There is no base bdev for this raid, so free the raid device. */ ++ raid_bdev_cleanup_and_free(raid_bdev); ++ if (cb_fn) { ++ cb_fn(cb_arg, 0); ++ } ++ } else { ++ raid_bdev_deconfigure(raid_bdev, cb_fn, cb_arg); ++ } ++} ++ ++static int ++raid_bdev_configure_base_bdev(struct raid_bdev *raid_bdev, struct raid_base_bdev_info *base_info) ++{ ++ struct spdk_bdev_desc *desc; ++ struct spdk_bdev *bdev; ++ int rc; ++ ++ assert(spdk_get_thread() == spdk_thread_get_app_thread()); ++ assert(base_info->name != NULL); ++ assert(base_info->bdev == NULL); ++ ++ rc = spdk_bdev_open_ext(base_info->name, true, raid_bdev_event_base_bdev, NULL, &desc); ++ if (rc != 0) { ++ if (rc != -ENODEV) { ++ SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", base_info->name); ++ } ++ return rc; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ ++ rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); ++ if (rc != 0) { ++ SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); ++ spdk_bdev_close(desc); ++ return rc; ++ } ++ ++ SPDK_DEBUGLOG(bdev_raid, "bdev %s is claimed\n", bdev->name); ++ ++ assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); ++ ++ base_info->bdev = bdev; ++ base_info->desc = desc; ++ base_info->blockcnt = bdev->blockcnt; ++ raid_bdev->num_base_bdevs_discovered++; ++ assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); ++ ++ if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) { ++ rc = raid_bdev_configure(raid_bdev); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to configure raid bdev\n"); ++ return rc; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * raid_bdev_add_base_device function is the actual function which either adds ++ * the nvme base device to existing raid bdev or create a new raid bdev. It also claims ++ * the base device and keep the open descriptor. 
++ * params: ++ * raid_bdev - pointer to raid bdev ++ * name - name of the base bdev ++ * slot - position to add base bdev ++ * returns: ++ * 0 - success ++ * non zero - failure ++ */ ++int ++raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot) ++{ ++ struct raid_base_bdev_info *base_info; ++ int rc; ++ ++ if (slot >= raid_bdev->num_base_bdevs) { ++ return -EINVAL; ++ } ++ ++ base_info = &raid_bdev->base_bdev_info[slot]; ++ ++ if (base_info->name != NULL) { ++ SPDK_ERRLOG("Slot %u on raid bdev '%s' already assigned to bdev '%s'\n", ++ slot, raid_bdev->bdev.name, base_info->name); ++ return -EBUSY; ++ } ++ ++ base_info->name = strdup(name); ++ if (base_info->name == NULL) { ++ return -ENOMEM; ++ } ++ ++ rc = raid_bdev_configure_base_bdev(raid_bdev, base_info); ++ if (rc != 0) { ++ if (rc != -ENODEV) { ++ SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", name); ++ } ++ return rc; ++ } ++ ++ return 0; ++} ++ ++/* ++ * brief: ++ * raid_bdev_examine function is the examine function call by the below layers ++ * like bdev_nvme layer. This function will check if this base bdev can be ++ * claimed by this raid bdev or not. ++ * params: ++ * bdev - pointer to base bdev ++ * returns: ++ * none ++ */ ++static void ++raid_bdev_examine(struct spdk_bdev *bdev) ++{ ++ struct raid_bdev *raid_bdev; ++ struct raid_base_bdev_info *base_info; ++ ++ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ if (base_info->bdev == NULL && strcmp(bdev->name, base_info->name) == 0) { ++ raid_bdev_configure_base_bdev(raid_bdev, base_info); ++ break; ++ } ++ } ++ } ++ ++ spdk_bdev_module_examine_done(&g_raid_if); ++} ++ ++/* Log component for bdev raid bdev module */ ++SPDK_LOG_REGISTER_COMPONENT(bdev_raid) +diff --git a/module/bdev/raid/bdev_raid.h b/module/bdev/raid/bdev_raid.h +index a140bb7..6737a51 100644 +--- a/module/bdev/raid/bdev_raid.h ++++ b/module/bdev/raid/bdev_raid.h +@@ -1,262 +1,262 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_BDEV_RAID_INTERNAL_H +-#define SPDK_BDEV_RAID_INTERNAL_H +- +-#include "spdk/bdev_module.h" +- +-enum raid_level { +- INVALID_RAID_LEVEL = -1, +- RAID0 = 0, +- RAID1 = 1, +- RAID5F = 95, /* 0x5f */ +- CONCAT = 99, +-}; +- +-/* +- * Raid state describes the state of the raid. This raid bdev can be either in +- * configured list or configuring list +- */ +-enum raid_bdev_state { +- /* raid bdev is ready and is seen by upper layers */ +- RAID_BDEV_STATE_ONLINE, +- +- /* +- * raid bdev is configuring, not all underlying bdevs are present. +- * And can't be seen by upper layers. +- */ +- RAID_BDEV_STATE_CONFIGURING, +- +- /* +- * In offline state, raid bdev layer will complete all incoming commands without +- * submitting to underlying base nvme bdevs +- */ +- RAID_BDEV_STATE_OFFLINE, +- +- /* raid bdev state max, new states should be added before this */ +- RAID_BDEV_STATE_MAX +-}; +- +-/* +- * raid_base_bdev_info contains information for the base bdevs which are part of some +- * raid. This structure contains the per base bdev information. 
Whatever is +- * required per base device for raid bdev will be kept here +- */ +-struct raid_base_bdev_info { +- /* name of the bdev */ +- char *name; +- +- /* pointer to base spdk bdev */ +- struct spdk_bdev *bdev; +- +- /* pointer to base bdev descriptor opened by raid bdev */ +- struct spdk_bdev_desc *desc; +- +- /* +- * When underlying base device calls the hot plug function on drive removal, +- * this flag will be set and later after doing some processing, base device +- * descriptor will be closed +- */ +- bool remove_scheduled; +- +- /* Hold the number of blocks to know how large the base bdev is resized. */ +- uint64_t blockcnt; +-}; +- +-/* +- * raid_bdev_io is the context part of bdev_io. It contains the information +- * related to bdev_io for a raid bdev +- */ +-struct raid_bdev_io { +- /* The raid bdev associated with this IO */ +- struct raid_bdev *raid_bdev; +- +- /* WaitQ entry, used only in waitq logic */ +- struct spdk_bdev_io_wait_entry waitq_entry; +- +- /* Context of the original channel for this IO */ +- struct raid_bdev_io_channel *raid_ch; +- +- /* Used for tracking progress on io requests sent to member disks. */ +- uint64_t base_bdev_io_remaining; +- uint8_t base_bdev_io_submitted; +- uint8_t base_bdev_io_status; +- +- /* Private data for the raid module */ +- void *module_private; +-}; +- +-/* +- * raid_bdev is the single entity structure which contains SPDK block device +- * and the information related to any raid bdev either configured or +- * in configuring list. io device is created on this. +- */ +-struct raid_bdev { +- /* raid bdev device, this will get registered in bdev layer */ +- struct spdk_bdev bdev; +- +- /* link of raid bdev to link it to global raid bdev list */ +- TAILQ_ENTRY(raid_bdev) global_link; +- +- /* array of base bdev info */ +- struct raid_base_bdev_info *base_bdev_info; +- +- /* strip size of raid bdev in blocks */ +- uint32_t strip_size; +- +- /* strip size of raid bdev in KB */ +- uint32_t strip_size_kb; +- +- /* strip size bit shift for optimized calculation */ +- uint32_t strip_size_shift; +- +- /* block length bit shift for optimized calculation */ +- uint32_t blocklen_shift; +- +- /* state of raid bdev */ +- enum raid_bdev_state state; +- +- /* number of base bdevs comprising raid bdev */ +- uint8_t num_base_bdevs; +- +- /* number of base bdevs discovered */ +- uint8_t num_base_bdevs_discovered; +- +- /* minimum number of viable base bdevs that are required by array to operate */ +- uint8_t min_base_bdevs_operational; +- +- /* Raid Level of this raid bdev */ +- enum raid_level level; +- +- /* Set to true if destroy of this raid bdev is started. */ +- bool destroy_started; +- +- /* Module for RAID-level specific operations */ +- struct raid_bdev_module *module; +- +- /* Private data for the raid module */ +- void *module_private; +-}; +- +-#define RAID_FOR_EACH_BASE_BDEV(r, i) \ +- for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++) +- +-/* +- * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It +- * contains the relationship of raid bdev io channel with base bdev io channels. 
+- */ +-struct raid_bdev_io_channel { +- /* Array of IO channels of base bdevs */ +- struct spdk_io_channel **base_channel; +- +- /* Number of IO channels */ +- uint8_t num_channels; +- +- /* Private raid module IO channel */ +- struct spdk_io_channel *module_channel; +-}; +- +-/* TAIL head for raid bdev list */ +-TAILQ_HEAD(raid_all_tailq, raid_bdev); +- +-extern struct raid_all_tailq g_raid_bdev_list; +- +-typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc); +- +-int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, +- enum raid_level level, struct raid_bdev **raid_bdev_out); +-void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx); +-int raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot); +-struct raid_bdev *raid_bdev_find_by_name(const char *name); +-enum raid_level raid_bdev_str_to_level(const char *str); +-const char *raid_bdev_level_to_str(enum raid_level level); +-enum raid_bdev_state raid_bdev_str_to_state(const char *str); +-const char *raid_bdev_state_to_str(enum raid_bdev_state state); +-void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w); +- +-/* +- * RAID module descriptor +- */ +-struct raid_bdev_module { +- /* RAID level implemented by this module */ +- enum raid_level level; +- +- /* Minimum required number of base bdevs. Must be > 0. */ +- uint8_t base_bdevs_min; +- +- /* +- * RAID constraint. Determines number of base bdevs that can be removed +- * without failing the array. +- */ +- struct { +- enum { +- CONSTRAINT_UNSET = 0, +- CONSTRAINT_MAX_BASE_BDEVS_REMOVED, +- CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, +- } type; +- uint8_t value; +- } base_bdevs_constraint; +- +- /* +- * Called when the raid is starting, right before changing the state to +- * online and registering the bdev. Parameters of the bdev like blockcnt +- * should be set here. +- * +- * Non-zero return value will abort the startup process. +- */ +- int (*start)(struct raid_bdev *raid_bdev); +- +- /* +- * Called when the raid is stopping, right before changing the state to +- * offline and unregistering the bdev. Optional. +- * +- * The function should return false if it is asynchronous. Then, after +- * the async operation has completed and the module is fully stopped +- * raid_bdev_module_stop_done() must be called. +- */ +- bool (*stop)(struct raid_bdev *raid_bdev); +- +- /* Handler for R/W requests */ +- void (*submit_rw_request)(struct raid_bdev_io *raid_io); +- +- /* Handler for requests without payload (flush, unmap). Optional. */ +- void (*submit_null_payload_request)(struct raid_bdev_io *raid_io); +- +- /* +- * Called when the bdev's IO channel is created to get the module's private IO channel. +- * Optional. +- */ +- struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev); +- +- /* +- * Called when a base_bdev is resized to resize the raid if the condition +- * is satisfied. 
+- */ +- void (*resize)(struct raid_bdev *raid_bdev); +- +- TAILQ_ENTRY(raid_bdev_module) link; +-}; +- +-void raid_bdev_module_list_add(struct raid_bdev_module *raid_module); +- +-#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line) +-#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line +- +-#define RAID_MODULE_REGISTER(_module) \ +-__attribute__((constructor)) static void \ +-__RAID_MODULE_REGISTER(__LINE__)(void) \ +-{ \ +- raid_bdev_module_list_add(_module); \ +-} +- +-bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, +- enum spdk_bdev_io_status status); +-void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, +- struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn); +-void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status); +-void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev); +- +-#endif /* SPDK_BDEV_RAID_INTERNAL_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_RAID_INTERNAL_H ++#define SPDK_BDEV_RAID_INTERNAL_H ++ ++#include "spdk/bdev_module.h" ++ ++enum raid_level { ++ INVALID_RAID_LEVEL = -1, ++ RAID0 = 0, ++ RAID1 = 1, ++ RAID5F = 95, /* 0x5f */ ++ CONCAT = 99, ++}; ++ ++/* ++ * Raid state describes the state of the raid. This raid bdev can be either in ++ * configured list or configuring list ++ */ ++enum raid_bdev_state { ++ /* raid bdev is ready and is seen by upper layers */ ++ RAID_BDEV_STATE_ONLINE, ++ ++ /* ++ * raid bdev is configuring, not all underlying bdevs are present. ++ * And can't be seen by upper layers. ++ */ ++ RAID_BDEV_STATE_CONFIGURING, ++ ++ /* ++ * In offline state, raid bdev layer will complete all incoming commands without ++ * submitting to underlying base nvme bdevs ++ */ ++ RAID_BDEV_STATE_OFFLINE, ++ ++ /* raid bdev state max, new states should be added before this */ ++ RAID_BDEV_STATE_MAX ++}; ++ ++/* ++ * raid_base_bdev_info contains information for the base bdevs which are part of some ++ * raid. This structure contains the per base bdev information. Whatever is ++ * required per base device for raid bdev will be kept here ++ */ ++struct raid_base_bdev_info { ++ /* name of the bdev */ ++ char *name; ++ ++ /* pointer to base spdk bdev */ ++ struct spdk_bdev *bdev; ++ ++ /* pointer to base bdev descriptor opened by raid bdev */ ++ struct spdk_bdev_desc *desc; ++ ++ /* ++ * When underlying base device calls the hot plug function on drive removal, ++ * this flag will be set and later after doing some processing, base device ++ * descriptor will be closed ++ */ ++ bool remove_scheduled; ++ ++ /* Hold the number of blocks to know how large the base bdev is resized. */ ++ uint64_t blockcnt; ++}; ++ ++/* ++ * raid_bdev_io is the context part of bdev_io. It contains the information ++ * related to bdev_io for a raid bdev ++ */ ++struct raid_bdev_io { ++ /* The raid bdev associated with this IO */ ++ struct raid_bdev *raid_bdev; ++ ++ /* WaitQ entry, used only in waitq logic */ ++ struct spdk_bdev_io_wait_entry waitq_entry; ++ ++ /* Context of the original channel for this IO */ ++ struct raid_bdev_io_channel *raid_ch; ++ ++ /* Used for tracking progress on io requests sent to member disks. 
*/ ++ uint64_t base_bdev_io_remaining; ++ uint8_t base_bdev_io_submitted; ++ uint8_t base_bdev_io_status; ++ ++ /* Private data for the raid module */ ++ void *module_private; ++}; ++ ++/* ++ * raid_bdev is the single entity structure which contains SPDK block device ++ * and the information related to any raid bdev either configured or ++ * in configuring list. io device is created on this. ++ */ ++struct raid_bdev { ++ /* raid bdev device, this will get registered in bdev layer */ ++ struct spdk_bdev bdev; ++ ++ /* link of raid bdev to link it to global raid bdev list */ ++ TAILQ_ENTRY(raid_bdev) global_link; ++ ++ /* array of base bdev info */ ++ struct raid_base_bdev_info *base_bdev_info; ++ ++ /* strip size of raid bdev in blocks */ ++ uint32_t strip_size; ++ ++ /* strip size of raid bdev in KB */ ++ uint32_t strip_size_kb; ++ ++ /* strip size bit shift for optimized calculation */ ++ uint32_t strip_size_shift; ++ ++ /* block length bit shift for optimized calculation */ ++ uint32_t blocklen_shift; ++ ++ /* state of raid bdev */ ++ enum raid_bdev_state state; ++ ++ /* number of base bdevs comprising raid bdev */ ++ uint8_t num_base_bdevs; ++ ++ /* number of base bdevs discovered */ ++ uint8_t num_base_bdevs_discovered; ++ ++ /* minimum number of viable base bdevs that are required by array to operate */ ++ uint8_t min_base_bdevs_operational; ++ ++ /* Raid Level of this raid bdev */ ++ enum raid_level level; ++ ++ /* Set to true if destroy of this raid bdev is started. */ ++ bool destroy_started; ++ ++ /* Module for RAID-level specific operations */ ++ struct raid_bdev_module *module; ++ ++ /* Private data for the raid module */ ++ void *module_private; ++}; ++ ++#define RAID_FOR_EACH_BASE_BDEV(r, i) \ ++ for (i = r->base_bdev_info; i < r->base_bdev_info + r->num_base_bdevs; i++) ++ ++/* ++ * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It ++ * contains the relationship of raid bdev io channel with base bdev io channels. ++ */ ++struct raid_bdev_io_channel { ++ /* Array of IO channels of base bdevs */ ++ struct spdk_io_channel **base_channel; ++ ++ /* Number of IO channels */ ++ uint8_t num_channels; ++ ++ /* Private raid module IO channel */ ++ struct spdk_io_channel *module_channel; ++}; ++ ++/* TAIL head for raid bdev list */ ++TAILQ_HEAD(raid_all_tailq, raid_bdev); ++ ++extern struct raid_all_tailq g_raid_bdev_list; ++ ++typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc); ++ ++int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs, ++ enum raid_level level, struct raid_bdev **raid_bdev_out); ++void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx); ++int raid_bdev_add_base_device(struct raid_bdev *raid_bdev, const char *name, uint8_t slot); ++struct raid_bdev *raid_bdev_find_by_name(const char *name); ++enum raid_level raid_bdev_str_to_level(const char *str); ++const char *raid_bdev_level_to_str(enum raid_level level); ++enum raid_bdev_state raid_bdev_str_to_state(const char *str); ++const char *raid_bdev_state_to_str(enum raid_bdev_state state); ++void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w); ++ ++/* ++ * RAID module descriptor ++ */ ++struct raid_bdev_module { ++ /* RAID level implemented by this module */ ++ enum raid_level level; ++ ++ /* Minimum required number of base bdevs. Must be > 0. */ ++ uint8_t base_bdevs_min; ++ ++ /* ++ * RAID constraint. 
Determines number of base bdevs that can be removed ++ * without failing the array. ++ */ ++ struct { ++ enum { ++ CONSTRAINT_UNSET = 0, ++ CONSTRAINT_MAX_BASE_BDEVS_REMOVED, ++ CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, ++ } type; ++ uint8_t value; ++ } base_bdevs_constraint; ++ ++ /* ++ * Called when the raid is starting, right before changing the state to ++ * online and registering the bdev. Parameters of the bdev like blockcnt ++ * should be set here. ++ * ++ * Non-zero return value will abort the startup process. ++ */ ++ int (*start)(struct raid_bdev *raid_bdev); ++ ++ /* ++ * Called when the raid is stopping, right before changing the state to ++ * offline and unregistering the bdev. Optional. ++ * ++ * The function should return false if it is asynchronous. Then, after ++ * the async operation has completed and the module is fully stopped ++ * raid_bdev_module_stop_done() must be called. ++ */ ++ bool (*stop)(struct raid_bdev *raid_bdev); ++ ++ /* Handler for R/W requests */ ++ void (*submit_rw_request)(struct raid_bdev_io *raid_io); ++ ++ /* Handler for requests without payload (flush, unmap). Optional. */ ++ void (*submit_null_payload_request)(struct raid_bdev_io *raid_io); ++ ++ /* ++ * Called when the bdev's IO channel is created to get the module's private IO channel. ++ * Optional. ++ */ ++ struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev); ++ ++ /* ++ * Called when a base_bdev is resized to resize the raid if the condition ++ * is satisfied. ++ */ ++ void (*resize)(struct raid_bdev *raid_bdev); ++ ++ TAILQ_ENTRY(raid_bdev_module) link; ++}; ++ ++void raid_bdev_module_list_add(struct raid_bdev_module *raid_module); ++ ++#define __RAID_MODULE_REGISTER(line) __RAID_MODULE_REGISTER_(line) ++#define __RAID_MODULE_REGISTER_(line) raid_module_register_##line ++ ++#define RAID_MODULE_REGISTER(_module) \ ++__attribute__((constructor)) static void \ ++__RAID_MODULE_REGISTER(__LINE__)(void) \ ++{ \ ++ raid_bdev_module_list_add(_module); \ ++} ++ ++bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed, ++ enum spdk_bdev_io_status status); ++void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev, ++ struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn); ++void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status); ++void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev); ++ ++#endif /* SPDK_BDEV_RAID_INTERNAL_H */ +diff --git a/module/bdev/raid/bdev_raid_rpc.c b/module/bdev/raid/bdev_raid_rpc.c +index 7def735..3e3aa72 100644 +--- a/module/bdev/raid/bdev_raid_rpc.c ++++ b/module/bdev/raid/bdev_raid_rpc.c +@@ -1,367 +1,367 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/rpc.h" +-#include "spdk/bdev.h" +-#include "bdev_raid.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +-#include "spdk/env.h" +- +-#define RPC_MAX_BASE_BDEVS 255 +- +-/* +- * Input structure for bdev_raid_get_bdevs RPC +- */ +-struct rpc_bdev_raid_get_bdevs { +- /* category - all or online or configuring or offline */ +- char *category; +-}; +- +-/* +- * brief: +- * free_rpc_bdev_raid_get_bdevs function frees RPC bdev_raid_get_bdevs related parameters +- * params: +- * req - pointer to RPC request +- * returns: +- * none +- */ +-static void +-free_rpc_bdev_raid_get_bdevs(struct rpc_bdev_raid_get_bdevs *req) +-{ +- free(req->category); +-} +- +-/* +- * Decoder object for RPC get_raids +- */ +-static const struct spdk_json_object_decoder rpc_bdev_raid_get_bdevs_decoders[] = { +- {"category", offsetof(struct rpc_bdev_raid_get_bdevs, category), spdk_json_decode_string}, +-}; +- +-/* +- * brief: +- * rpc_bdev_raid_get_bdevs function is the RPC for rpc_bdev_raid_get_bdevs. This is used to list +- * all the raid bdev names based on the input category requested. Category should be +- * one of "all", "online", "configuring" or "offline". "all" means all the raids +- * whether they are online or configuring or offline. "online" is the raid bdev which +- * is registered with bdev layer. "configuring" is the raid bdev which does not have +- * full configuration discovered yet. "offline" is the raid bdev which is not +- * registered with bdev as of now and it has encountered any error or user has +- * requested to offline the raid. +- * params: +- * request - pointer to json rpc request +- * params - pointer to request parameters +- * returns: +- * none +- */ +-static void +-rpc_bdev_raid_get_bdevs(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_raid_get_bdevs req = {}; +- struct spdk_json_write_ctx *w; +- struct raid_bdev *raid_bdev; +- enum raid_bdev_state state; +- +- if (spdk_json_decode_object(params, rpc_bdev_raid_get_bdevs_decoders, +- SPDK_COUNTOF(rpc_bdev_raid_get_bdevs_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- state = raid_bdev_str_to_state(req.category); +- if (state == RAID_BDEV_STATE_MAX && strcmp(req.category, "all") != 0) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, spdk_strerror(EINVAL)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- +- /* Get raid bdev list based on the category requested */ +- TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { +- if (raid_bdev->state == state || state == RAID_BDEV_STATE_MAX) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", raid_bdev->bdev.name); +- raid_bdev_write_info_json(raid_bdev, w); +- spdk_json_write_object_end(w); +- } +- } +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_bdev_raid_get_bdevs(&req); +-} +-SPDK_RPC_REGISTER("bdev_raid_get_bdevs", rpc_bdev_raid_get_bdevs, SPDK_RPC_RUNTIME) +- +-/* +- * Base bdevs in RPC bdev_raid_create +- */ +-struct rpc_bdev_raid_create_base_bdevs { +- /* Number of base bdevs */ +- size_t num_base_bdevs; +- +- /* List of base bdevs names */ +- char *base_bdevs[RPC_MAX_BASE_BDEVS]; +-}; +- +-/* +- * Input structure for RPC rpc_bdev_raid_create +- */ +-struct rpc_bdev_raid_create { +- /* Raid bdev name */ +- char *name; +- +- 
/* RAID strip size in KB */ +- uint32_t strip_size_kb; +- +- /* RAID raid level */ +- enum raid_level level; +- +- /* Base bdevs information */ +- struct rpc_bdev_raid_create_base_bdevs base_bdevs; +-}; +- +-/* +- * brief: +- * free_rpc_bdev_raid_create function is to free RPC bdev_raid_create related parameters +- * params: +- * req - pointer to RPC request +- * returns: +- * none +- */ +-static void +-free_rpc_bdev_raid_create(struct rpc_bdev_raid_create *req) +-{ +- size_t i; +- +- free(req->name); +- for (i = 0; i < req->base_bdevs.num_base_bdevs; i++) { +- free(req->base_bdevs.base_bdevs[i]); +- } +-} +- +-/* +- * Decoder function for RPC bdev_raid_create to decode raid level +- */ +-static int +-decode_raid_level(const struct spdk_json_val *val, void *out) +-{ +- int ret; +- char *str = NULL; +- enum raid_level level; +- +- ret = spdk_json_decode_string(val, &str); +- if (ret == 0 && str != NULL) { +- level = raid_bdev_str_to_level(str); +- if (level == INVALID_RAID_LEVEL) { +- ret = -EINVAL; +- } else { +- *(enum raid_level *)out = level; +- } +- } +- +- free(str); +- return ret; +-} +- +-/* +- * Decoder function for RPC bdev_raid_create to decode base bdevs list +- */ +-static int +-decode_base_bdevs(const struct spdk_json_val *val, void *out) +-{ +- struct rpc_bdev_raid_create_base_bdevs *base_bdevs = out; +- return spdk_json_decode_array(val, spdk_json_decode_string, base_bdevs->base_bdevs, +- RPC_MAX_BASE_BDEVS, &base_bdevs->num_base_bdevs, sizeof(char *)); +-} +- +-/* +- * Decoder object for RPC bdev_raid_create +- */ +-static const struct spdk_json_object_decoder rpc_bdev_raid_create_decoders[] = { +- {"name", offsetof(struct rpc_bdev_raid_create, name), spdk_json_decode_string}, +- {"strip_size_kb", offsetof(struct rpc_bdev_raid_create, strip_size_kb), spdk_json_decode_uint32, true}, +- {"raid_level", offsetof(struct rpc_bdev_raid_create, level), decode_raid_level}, +- {"base_bdevs", offsetof(struct rpc_bdev_raid_create, base_bdevs), decode_base_bdevs}, +-}; +- +-/* +- * brief: +- * rpc_bdev_raid_create function is the RPC for creating RAID bdevs. It takes +- * input as raid bdev name, raid level, strip size in KB and list of base bdev names. 
+- * params: +- * request - pointer to json rpc request +- * params - pointer to request parameters +- * returns: +- * none +- */ +-static void +-rpc_bdev_raid_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_raid_create req = {}; +- struct raid_bdev *raid_bdev; +- int rc; +- size_t i; +- +- if (spdk_json_decode_object(params, rpc_bdev_raid_create_decoders, +- SPDK_COUNTOF(rpc_bdev_raid_create_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = raid_bdev_create(req.name, req.strip_size_kb, req.base_bdevs.num_base_bdevs, +- req.level, &raid_bdev); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response_fmt(request, rc, +- "Failed to create RAID bdev %s: %s", +- req.name, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- for (i = 0; i < req.base_bdevs.num_base_bdevs; i++) { +- const char *base_bdev_name = req.base_bdevs.base_bdevs[i]; +- +- rc = raid_bdev_add_base_device(raid_bdev, base_bdev_name, i); +- if (rc == -ENODEV) { +- SPDK_DEBUGLOG(bdev_raid, "base bdev %s doesn't exist now\n", base_bdev_name); +- } else if (rc != 0) { +- raid_bdev_delete(raid_bdev, NULL, NULL); +- spdk_jsonrpc_send_error_response_fmt(request, rc, +- "Failed to add base bdev %s to RAID bdev %s: %s", +- base_bdev_name, req.name, +- spdk_strerror(-rc)); +- goto cleanup; +- } +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_bdev_raid_create(&req); +-} +-SPDK_RPC_REGISTER("bdev_raid_create", rpc_bdev_raid_create, SPDK_RPC_RUNTIME) +- +-/* +- * Input structure for RPC deleting a raid bdev +- */ +-struct rpc_bdev_raid_delete { +- /* raid bdev name */ +- char *name; +-}; +- +-/* +- * brief: +- * free_rpc_bdev_raid_delete function is used to free RPC bdev_raid_delete related parameters +- * params: +- * req - pointer to RPC request +- * params: +- * none +- */ +-static void +-free_rpc_bdev_raid_delete(struct rpc_bdev_raid_delete *req) +-{ +- free(req->name); +-} +- +-/* +- * Decoder object for RPC raid_bdev_delete +- */ +-static const struct spdk_json_object_decoder rpc_bdev_raid_delete_decoders[] = { +- {"name", offsetof(struct rpc_bdev_raid_delete, name), spdk_json_decode_string}, +-}; +- +-struct rpc_bdev_raid_delete_ctx { +- struct rpc_bdev_raid_delete req; +- struct spdk_jsonrpc_request *request; +-}; +- +-/* +- * brief: +- * params: +- * cb_arg - pointer to the callback context. +- * rc - return code of the deletion of the raid bdev. +- * returns: +- * none +- */ +-static void +-bdev_raid_delete_done(void *cb_arg, int rc) +-{ +- struct rpc_bdev_raid_delete_ctx *ctx = cb_arg; +- struct spdk_jsonrpc_request *request = ctx->request; +- +- if (rc != 0) { +- SPDK_ERRLOG("Failed to delete raid bdev %s (%d): %s\n", +- ctx->req.name, rc, spdk_strerror(-rc)); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- spdk_strerror(-rc)); +- goto exit; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-exit: +- free_rpc_bdev_raid_delete(&ctx->req); +- free(ctx); +-} +- +-/* +- * brief: +- * rpc_bdev_raid_delete function is the RPC for deleting a raid bdev. It takes raid +- * name as input and delete that raid bdev including freeing the base bdev +- * resources. 
+- * params: +- * request - pointer to json rpc request +- * params - pointer to request parameters +- * returns: +- * none +- */ +-static void +-rpc_bdev_raid_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_raid_delete_ctx *ctx; +- struct raid_bdev *raid_bdev; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_raid_delete_decoders, +- SPDK_COUNTOF(rpc_bdev_raid_delete_decoders), +- &ctx->req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- raid_bdev = raid_bdev_find_by_name(ctx->req.name); +- if (raid_bdev == NULL) { +- spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, +- "raid bdev %s not found", +- ctx->req.name); +- goto cleanup; +- } +- +- ctx->request = request; +- +- raid_bdev_delete(raid_bdev, bdev_raid_delete_done, ctx); +- +- return; +- +-cleanup: +- free_rpc_bdev_raid_delete(&ctx->req); +- free(ctx); +-} +-SPDK_RPC_REGISTER("bdev_raid_delete", rpc_bdev_raid_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/rpc.h" ++#include "spdk/bdev.h" ++#include "bdev_raid.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++#include "spdk/env.h" ++ ++#define RPC_MAX_BASE_BDEVS 255 ++ ++/* ++ * Input structure for bdev_raid_get_bdevs RPC ++ */ ++struct rpc_bdev_raid_get_bdevs { ++ /* category - all or online or configuring or offline */ ++ char *category; ++}; ++ ++/* ++ * brief: ++ * free_rpc_bdev_raid_get_bdevs function frees RPC bdev_raid_get_bdevs related parameters ++ * params: ++ * req - pointer to RPC request ++ * returns: ++ * none ++ */ ++static void ++free_rpc_bdev_raid_get_bdevs(struct rpc_bdev_raid_get_bdevs *req) ++{ ++ free(req->category); ++} ++ ++/* ++ * Decoder object for RPC get_raids ++ */ ++static const struct spdk_json_object_decoder rpc_bdev_raid_get_bdevs_decoders[] = { ++ {"category", offsetof(struct rpc_bdev_raid_get_bdevs, category), spdk_json_decode_string}, ++}; ++ ++/* ++ * brief: ++ * rpc_bdev_raid_get_bdevs function is the RPC for rpc_bdev_raid_get_bdevs. This is used to list ++ * all the raid bdev names based on the input category requested. Category should be ++ * one of "all", "online", "configuring" or "offline". "all" means all the raids ++ * whether they are online or configuring or offline. "online" is the raid bdev which ++ * is registered with bdev layer. "configuring" is the raid bdev which does not have ++ * full configuration discovered yet. "offline" is the raid bdev which is not ++ * registered with bdev as of now and it has encountered any error or user has ++ * requested to offline the raid. 
++ * params: ++ * request - pointer to json rpc request ++ * params - pointer to request parameters ++ * returns: ++ * none ++ */ ++static void ++rpc_bdev_raid_get_bdevs(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_raid_get_bdevs req = {}; ++ struct spdk_json_write_ctx *w; ++ struct raid_bdev *raid_bdev; ++ enum raid_bdev_state state; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_raid_get_bdevs_decoders, ++ SPDK_COUNTOF(rpc_bdev_raid_get_bdevs_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ state = raid_bdev_str_to_state(req.category); ++ if (state == RAID_BDEV_STATE_MAX && strcmp(req.category, "all") != 0) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, spdk_strerror(EINVAL)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ /* Get raid bdev list based on the category requested */ ++ TAILQ_FOREACH(raid_bdev, &g_raid_bdev_list, global_link) { ++ if (raid_bdev->state == state || state == RAID_BDEV_STATE_MAX) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", raid_bdev->bdev.name); ++ raid_bdev_write_info_json(raid_bdev, w); ++ spdk_json_write_object_end(w); ++ } ++ } ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_bdev_raid_get_bdevs(&req); ++} ++SPDK_RPC_REGISTER("bdev_raid_get_bdevs", rpc_bdev_raid_get_bdevs, SPDK_RPC_RUNTIME) ++ ++/* ++ * Base bdevs in RPC bdev_raid_create ++ */ ++struct rpc_bdev_raid_create_base_bdevs { ++ /* Number of base bdevs */ ++ size_t num_base_bdevs; ++ ++ /* List of base bdevs names */ ++ char *base_bdevs[RPC_MAX_BASE_BDEVS]; ++}; ++ ++/* ++ * Input structure for RPC rpc_bdev_raid_create ++ */ ++struct rpc_bdev_raid_create { ++ /* Raid bdev name */ ++ char *name; ++ ++ /* RAID strip size in KB */ ++ uint32_t strip_size_kb; ++ ++ /* RAID raid level */ ++ enum raid_level level; ++ ++ /* Base bdevs information */ ++ struct rpc_bdev_raid_create_base_bdevs base_bdevs; ++}; ++ ++/* ++ * brief: ++ * free_rpc_bdev_raid_create function is to free RPC bdev_raid_create related parameters ++ * params: ++ * req - pointer to RPC request ++ * returns: ++ * none ++ */ ++static void ++free_rpc_bdev_raid_create(struct rpc_bdev_raid_create *req) ++{ ++ size_t i; ++ ++ free(req->name); ++ for (i = 0; i < req->base_bdevs.num_base_bdevs; i++) { ++ free(req->base_bdevs.base_bdevs[i]); ++ } ++} ++ ++/* ++ * Decoder function for RPC bdev_raid_create to decode raid level ++ */ ++static int ++decode_raid_level(const struct spdk_json_val *val, void *out) ++{ ++ int ret; ++ char *str = NULL; ++ enum raid_level level; ++ ++ ret = spdk_json_decode_string(val, &str); ++ if (ret == 0 && str != NULL) { ++ level = raid_bdev_str_to_level(str); ++ if (level == INVALID_RAID_LEVEL) { ++ ret = -EINVAL; ++ } else { ++ *(enum raid_level *)out = level; ++ } ++ } ++ ++ free(str); ++ return ret; ++} ++ ++/* ++ * Decoder function for RPC bdev_raid_create to decode base bdevs list ++ */ ++static int ++decode_base_bdevs(const struct spdk_json_val *val, void *out) ++{ ++ struct rpc_bdev_raid_create_base_bdevs *base_bdevs = out; ++ return spdk_json_decode_array(val, spdk_json_decode_string, base_bdevs->base_bdevs, ++ RPC_MAX_BASE_BDEVS, &base_bdevs->num_base_bdevs, sizeof(char *)); ++} ++ ++/* ++ * Decoder object for RPC bdev_raid_create ++ */ ++static const struct spdk_json_object_decoder 
rpc_bdev_raid_create_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_raid_create, name), spdk_json_decode_string}, ++ {"strip_size_kb", offsetof(struct rpc_bdev_raid_create, strip_size_kb), spdk_json_decode_uint32, true}, ++ {"raid_level", offsetof(struct rpc_bdev_raid_create, level), decode_raid_level}, ++ {"base_bdevs", offsetof(struct rpc_bdev_raid_create, base_bdevs), decode_base_bdevs}, ++}; ++ ++/* ++ * brief: ++ * rpc_bdev_raid_create function is the RPC for creating RAID bdevs. It takes ++ * input as raid bdev name, raid level, strip size in KB and list of base bdev names. ++ * params: ++ * request - pointer to json rpc request ++ * params - pointer to request parameters ++ * returns: ++ * none ++ */ ++static void ++rpc_bdev_raid_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_raid_create req = {}; ++ struct raid_bdev *raid_bdev; ++ int rc; ++ size_t i; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_raid_create_decoders, ++ SPDK_COUNTOF(rpc_bdev_raid_create_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = raid_bdev_create(req.name, req.strip_size_kb, req.base_bdevs.num_base_bdevs, ++ req.level, &raid_bdev); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response_fmt(request, rc, ++ "Failed to create RAID bdev %s: %s", ++ req.name, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ for (i = 0; i < req.base_bdevs.num_base_bdevs; i++) { ++ const char *base_bdev_name = req.base_bdevs.base_bdevs[i]; ++ ++ rc = raid_bdev_add_base_device(raid_bdev, base_bdev_name, i); ++ if (rc == -ENODEV) { ++ SPDK_DEBUGLOG(bdev_raid, "base bdev %s doesn't exist now\n", base_bdev_name); ++ } else if (rc != 0) { ++ raid_bdev_delete(raid_bdev, NULL, NULL); ++ spdk_jsonrpc_send_error_response_fmt(request, rc, ++ "Failed to add base bdev %s to RAID bdev %s: %s", ++ base_bdev_name, req.name, ++ spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_bdev_raid_create(&req); ++} ++SPDK_RPC_REGISTER("bdev_raid_create", rpc_bdev_raid_create, SPDK_RPC_RUNTIME) ++ ++/* ++ * Input structure for RPC deleting a raid bdev ++ */ ++struct rpc_bdev_raid_delete { ++ /* raid bdev name */ ++ char *name; ++}; ++ ++/* ++ * brief: ++ * free_rpc_bdev_raid_delete function is used to free RPC bdev_raid_delete related parameters ++ * params: ++ * req - pointer to RPC request ++ * params: ++ * none ++ */ ++static void ++free_rpc_bdev_raid_delete(struct rpc_bdev_raid_delete *req) ++{ ++ free(req->name); ++} ++ ++/* ++ * Decoder object for RPC raid_bdev_delete ++ */ ++static const struct spdk_json_object_decoder rpc_bdev_raid_delete_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_raid_delete, name), spdk_json_decode_string}, ++}; ++ ++struct rpc_bdev_raid_delete_ctx { ++ struct rpc_bdev_raid_delete req; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++/* ++ * brief: ++ * params: ++ * cb_arg - pointer to the callback context. ++ * rc - return code of the deletion of the raid bdev. 
++ * returns: ++ * none ++ */ ++static void ++bdev_raid_delete_done(void *cb_arg, int rc) ++{ ++ struct rpc_bdev_raid_delete_ctx *ctx = cb_arg; ++ struct spdk_jsonrpc_request *request = ctx->request; ++ ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to delete raid bdev %s (%d): %s\n", ++ ctx->req.name, rc, spdk_strerror(-rc)); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-rc)); ++ goto exit; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++exit: ++ free_rpc_bdev_raid_delete(&ctx->req); ++ free(ctx); ++} ++ ++/* ++ * brief: ++ * rpc_bdev_raid_delete function is the RPC for deleting a raid bdev. It takes raid ++ * name as input and delete that raid bdev including freeing the base bdev ++ * resources. ++ * params: ++ * request - pointer to json rpc request ++ * params - pointer to request parameters ++ * returns: ++ * none ++ */ ++static void ++rpc_bdev_raid_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_raid_delete_ctx *ctx; ++ struct raid_bdev *raid_bdev; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_raid_delete_decoders, ++ SPDK_COUNTOF(rpc_bdev_raid_delete_decoders), ++ &ctx->req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_PARSE_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ raid_bdev = raid_bdev_find_by_name(ctx->req.name); ++ if (raid_bdev == NULL) { ++ spdk_jsonrpc_send_error_response_fmt(request, -ENODEV, ++ "raid bdev %s not found", ++ ctx->req.name); ++ goto cleanup; ++ } ++ ++ ctx->request = request; ++ ++ raid_bdev_delete(raid_bdev, bdev_raid_delete_done, ctx); ++ ++ return; ++ ++cleanup: ++ free_rpc_bdev_raid_delete(&ctx->req); ++ free(ctx); ++} ++SPDK_RPC_REGISTER("bdev_raid_delete", rpc_bdev_raid_delete, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/raid/concat.c b/module/bdev/raid/concat.c +index ca8f1dd..6c69dc0 100644 +--- a/module/bdev/raid/concat.c ++++ b/module/bdev/raid/concat.c +@@ -1,340 +1,340 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * Copyright (c) Peng Yu yupeng0921@gmail.com. +- * All rights reserved. +- */ +- +-#include "bdev_raid.h" +- +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +- +-#include "spdk/log.h" +- +-struct concat_block_range { +- uint64_t start; +- uint64_t length; +-}; +- +-/* +- * brief: +- * concat_bdev_io_completion function is called by lower layers to notify raid +- * module that particular bdev_io is completed. 
+- * params: +- * bdev_io - pointer to bdev io submitted to lower layers, like child io +- * success - bdev_io status +- * cb_arg - function callback context (parent raid_bdev_io) +- * returns: +- * none +- */ +-static void +-concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (success) { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } else { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void concat_submit_rw_request(struct raid_bdev_io *raid_io); +- +-static void +-_concat_submit_rw_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- concat_submit_rw_request(raid_io); +-} +- +-/* +- * brief: +- * concat_submit_rw_request function is used to submit I/O to the correct +- * member disk for concat bdevs. +- * params: +- * raid_io +- * returns: +- * none +- */ +-static void +-concat_submit_rw_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch; +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct concat_block_range *block_range = raid_bdev->module_private; +- uint64_t pd_lba; +- uint64_t pd_blocks; +- int pd_idx; +- int ret = 0; +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- int i; +- +- pd_idx = -1; +- for (i = 0; i < raid_bdev->num_base_bdevs; i++) { +- if (block_range[i].start > bdev_io->u.bdev.offset_blocks) { +- break; +- } +- pd_idx = i; +- } +- assert(pd_idx >= 0); +- assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start); +- pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start; +- pd_blocks = bdev_io->u.bdev.num_blocks; +- base_info = &raid_bdev->base_bdev_info[pd_idx]; +- if (base_info->desc == NULL) { +- SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx); +- assert(0); +- } +- +- /* +- * Submit child io to bdev layer with using base bdev descriptors, base +- * bdev lba, base bdev child io length in blocks, buffer, completion +- * function and function callback context +- */ +- assert(raid_ch != NULL); +- assert(raid_ch->base_channel); +- base_ch = raid_ch->base_channel[pd_idx]; +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, concat_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- concat_bdev_io_completion, raid_io); +- } +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, concat_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- concat_bdev_io_completion, raid_io); +- } +- } else { +- SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type); +- assert(0); +- } +- +- if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- 
_concat_submit_rw_request); +- } else if (ret != 0) { +- SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); +- assert(false); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io); +- +-static void +-_concat_submit_null_payload_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- concat_submit_null_payload_request(raid_io); +-} +- +-static void +-concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- raid_bdev_io_complete_part(raid_io, 1, success ? +- SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +- +- spdk_bdev_free_io(bdev_io); +-} +- +-/* +- * brief: +- * concat_submit_null_payload_request function submits the next batch of +- * io requests with range but without payload, like FLUSH and UNMAP, to member disks; +- * it will submit as many as possible unless one base io request fails with -ENOMEM, +- * in which case it will queue itself for later submission. +- * params: +- * bdev_io - pointer to parent bdev_io on raid bdev device +- * returns: +- * none +- */ +-static void +-concat_submit_null_payload_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io; +- struct raid_bdev *raid_bdev; +- int ret; +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- uint64_t pd_lba; +- uint64_t pd_blocks; +- uint64_t offset_blocks; +- uint64_t num_blocks; +- struct concat_block_range *block_range; +- int i, start_idx, stop_idx; +- +- bdev_io = spdk_bdev_io_from_ctx(raid_io); +- raid_bdev = raid_io->raid_bdev; +- block_range = raid_bdev->module_private; +- +- offset_blocks = bdev_io->u.bdev.offset_blocks; +- num_blocks = bdev_io->u.bdev.num_blocks; +- start_idx = -1; +- stop_idx = -1; +- /* +- * Go through all base bdevs, find the first bdev and the last bdev +- */ +- for (i = 0; i < raid_bdev->num_base_bdevs; i++) { +- /* skip the bdevs before the offset_blocks */ +- if (offset_blocks >= block_range[i].start + block_range[i].length) { +- continue; +- } +- if (start_idx == -1) { +- start_idx = i; +- } else { +- /* +- * The offset_blocks might be at the middle of the first bdev. +- * Besides the first bdev, the offset_blocks should be always +- * at the start of the bdev. 
+- */ +- assert(offset_blocks == block_range[i].start); +- } +- pd_lba = offset_blocks - block_range[i].start; +- pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba); +- offset_blocks += pd_blocks; +- num_blocks -= pd_blocks; +- if (num_blocks == 0) { +- stop_idx = i; +- break; +- } +- } +- assert(start_idx >= 0); +- assert(stop_idx >= 0); +- +- if (raid_io->base_bdev_io_remaining == 0) { +- raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1; +- } +- offset_blocks = bdev_io->u.bdev.offset_blocks; +- num_blocks = bdev_io->u.bdev.num_blocks; +- for (i = start_idx; i <= stop_idx; i++) { +- assert(offset_blocks >= block_range[i].start); +- assert(offset_blocks < block_range[i].start + block_range[i].length); +- pd_lba = offset_blocks - block_range[i].start; +- pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba); +- offset_blocks += pd_blocks; +- num_blocks -= pd_blocks; +- /* +- * Skip the IOs we have submitted +- */ +- if (i < start_idx + raid_io->base_bdev_io_submitted) { +- continue; +- } +- base_info = &raid_bdev->base_bdev_info[i]; +- base_ch = raid_io->raid_ch->base_channel[i]; +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_UNMAP: +- ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch, +- pd_lba, pd_blocks, +- concat_base_io_complete, raid_io); +- break; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- ret = spdk_bdev_flush_blocks(base_info->desc, base_ch, +- pd_lba, pd_blocks, +- concat_base_io_complete, raid_io); +- break; +- default: +- SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type); +- assert(false); +- ret = -EIO; +- } +- if (ret == 0) { +- raid_io->base_bdev_io_submitted++; +- } else if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _concat_submit_null_payload_request); +- return; +- } else { +- SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); +- assert(false); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- } +-} +- +-static int +-concat_start(struct raid_bdev *raid_bdev) +-{ +- uint64_t total_blockcnt = 0; +- struct raid_base_bdev_info *base_info; +- struct concat_block_range *block_range; +- +- block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range)); +- if (!block_range) { +- SPDK_ERRLOG("Can not allocate block_range, num_base_bdevs: %u", +- raid_bdev->num_base_bdevs); +- return -ENOMEM; +- } +- +- int idx = 0; +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- uint64_t strip_cnt = base_info->bdev->blockcnt >> raid_bdev->strip_size_shift; +- uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift; +- +- block_range[idx].start = total_blockcnt; +- block_range[idx].length = pd_block_cnt; +- total_blockcnt += pd_block_cnt; +- idx++; +- } +- +- raid_bdev->module_private = block_range; +- +- SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n", +- total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift); +- raid_bdev->bdev.blockcnt = total_blockcnt; +- +- raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; +- raid_bdev->bdev.split_on_optimal_io_boundary = true; +- +- return 0; +-} +- +-static bool +-concat_stop(struct raid_bdev *raid_bdev) +-{ +- struct concat_block_range *block_range = raid_bdev->module_private; +- +- free(block_range); +- +- return true; +-} +- +-static struct raid_bdev_module g_concat_module = { +- .level = CONCAT, +- .base_bdevs_min = 1, +- .start = concat_start, +- .stop = concat_stop, +- 
.submit_rw_request = concat_submit_rw_request, +- .submit_null_payload_request = concat_submit_null_payload_request, +-}; +-RAID_MODULE_REGISTER(&g_concat_module) +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_concat) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * Copyright (c) Peng Yu yupeng0921@gmail.com. ++ * All rights reserved. ++ */ ++ ++#include "bdev_raid.h" ++ ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "spdk/log.h" ++ ++struct concat_block_range { ++ uint64_t start; ++ uint64_t length; ++}; ++ ++/* ++ * brief: ++ * concat_bdev_io_completion function is called by lower layers to notify raid ++ * module that particular bdev_io is completed. ++ * params: ++ * bdev_io - pointer to bdev io submitted to lower layers, like child io ++ * success - bdev_io status ++ * cb_arg - function callback context (parent raid_bdev_io) ++ * returns: ++ * none ++ */ ++static void ++concat_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (success) { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } else { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void concat_submit_rw_request(struct raid_bdev_io *raid_io); ++ ++static void ++_concat_submit_rw_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ concat_submit_rw_request(raid_io); ++} ++ ++/* ++ * brief: ++ * concat_submit_rw_request function is used to submit I/O to the correct ++ * member disk for concat bdevs. ++ * params: ++ * raid_io ++ * returns: ++ * none ++ */ ++static void ++concat_submit_rw_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch; ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct concat_block_range *block_range = raid_bdev->module_private; ++ uint64_t pd_lba; ++ uint64_t pd_blocks; ++ int pd_idx; ++ int ret = 0; ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ int i; ++ ++ pd_idx = -1; ++ for (i = 0; i < raid_bdev->num_base_bdevs; i++) { ++ if (block_range[i].start > bdev_io->u.bdev.offset_blocks) { ++ break; ++ } ++ pd_idx = i; ++ } ++ assert(pd_idx >= 0); ++ assert(bdev_io->u.bdev.offset_blocks >= block_range[pd_idx].start); ++ pd_lba = bdev_io->u.bdev.offset_blocks - block_range[pd_idx].start; ++ pd_blocks = bdev_io->u.bdev.num_blocks; ++ base_info = &raid_bdev->base_bdev_info[pd_idx]; ++ if (base_info->desc == NULL) { ++ SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx); ++ assert(0); ++ } ++ ++ /* ++ * Submit child io to bdev layer with using base bdev descriptors, base ++ * bdev lba, base bdev child io length in blocks, buffer, completion ++ * function and function callback context ++ */ ++ assert(raid_ch != NULL); ++ assert(raid_ch->base_channel); ++ base_ch = raid_ch->base_channel[pd_idx]; ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, concat_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ 
concat_bdev_io_completion, raid_io); ++ } ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, concat_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ concat_bdev_io_completion, raid_io); ++ } ++ } else { ++ SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type); ++ assert(0); ++ } ++ ++ if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _concat_submit_rw_request); ++ } else if (ret != 0) { ++ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); ++ assert(false); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void concat_submit_null_payload_request(struct raid_bdev_io *raid_io); ++ ++static void ++_concat_submit_null_payload_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ concat_submit_null_payload_request(raid_io); ++} ++ ++static void ++concat_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ raid_bdev_io_complete_part(raid_io, 1, success ? ++ SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++ ++ spdk_bdev_free_io(bdev_io); ++} ++ ++/* ++ * brief: ++ * concat_submit_null_payload_request function submits the next batch of ++ * io requests with range but without payload, like FLUSH and UNMAP, to member disks; ++ * it will submit as many as possible unless one base io request fails with -ENOMEM, ++ * in which case it will queue itself for later submission. ++ * params: ++ * bdev_io - pointer to parent bdev_io on raid bdev device ++ * returns: ++ * none ++ */ ++static void ++concat_submit_null_payload_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io; ++ struct raid_bdev *raid_bdev; ++ int ret; ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ uint64_t pd_lba; ++ uint64_t pd_blocks; ++ uint64_t offset_blocks; ++ uint64_t num_blocks; ++ struct concat_block_range *block_range; ++ int i, start_idx, stop_idx; ++ ++ bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ raid_bdev = raid_io->raid_bdev; ++ block_range = raid_bdev->module_private; ++ ++ offset_blocks = bdev_io->u.bdev.offset_blocks; ++ num_blocks = bdev_io->u.bdev.num_blocks; ++ start_idx = -1; ++ stop_idx = -1; ++ /* ++ * Go through all base bdevs, find the first bdev and the last bdev ++ */ ++ for (i = 0; i < raid_bdev->num_base_bdevs; i++) { ++ /* skip the bdevs before the offset_blocks */ ++ if (offset_blocks >= block_range[i].start + block_range[i].length) { ++ continue; ++ } ++ if (start_idx == -1) { ++ start_idx = i; ++ } else { ++ /* ++ * The offset_blocks might be at the middle of the first bdev. ++ * Besides the first bdev, the offset_blocks should be always ++ * at the start of the bdev. 
++ */ ++ assert(offset_blocks == block_range[i].start); ++ } ++ pd_lba = offset_blocks - block_range[i].start; ++ pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba); ++ offset_blocks += pd_blocks; ++ num_blocks -= pd_blocks; ++ if (num_blocks == 0) { ++ stop_idx = i; ++ break; ++ } ++ } ++ assert(start_idx >= 0); ++ assert(stop_idx >= 0); ++ ++ if (raid_io->base_bdev_io_remaining == 0) { ++ raid_io->base_bdev_io_remaining = stop_idx - start_idx + 1; ++ } ++ offset_blocks = bdev_io->u.bdev.offset_blocks; ++ num_blocks = bdev_io->u.bdev.num_blocks; ++ for (i = start_idx; i <= stop_idx; i++) { ++ assert(offset_blocks >= block_range[i].start); ++ assert(offset_blocks < block_range[i].start + block_range[i].length); ++ pd_lba = offset_blocks - block_range[i].start; ++ pd_blocks = spdk_min(num_blocks, block_range[i].length - pd_lba); ++ offset_blocks += pd_blocks; ++ num_blocks -= pd_blocks; ++ /* ++ * Skip the IOs we have submitted ++ */ ++ if (i < start_idx + raid_io->base_bdev_io_submitted) { ++ continue; ++ } ++ base_info = &raid_bdev->base_bdev_info[i]; ++ base_ch = raid_io->raid_ch->base_channel[i]; ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch, ++ pd_lba, pd_blocks, ++ concat_base_io_complete, raid_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ ret = spdk_bdev_flush_blocks(base_info->desc, base_ch, ++ pd_lba, pd_blocks, ++ concat_base_io_complete, raid_io); ++ break; ++ default: ++ SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type); ++ assert(false); ++ ret = -EIO; ++ } ++ if (ret == 0) { ++ raid_io->base_bdev_io_submitted++; ++ } else if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _concat_submit_null_payload_request); ++ return; ++ } else { ++ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); ++ assert(false); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ } ++} ++ ++static int ++concat_start(struct raid_bdev *raid_bdev) ++{ ++ uint64_t total_blockcnt = 0; ++ struct raid_base_bdev_info *base_info; ++ struct concat_block_range *block_range; ++ ++ block_range = calloc(raid_bdev->num_base_bdevs, sizeof(struct concat_block_range)); ++ if (!block_range) { ++ SPDK_ERRLOG("Can not allocate block_range, num_base_bdevs: %u", ++ raid_bdev->num_base_bdevs); ++ return -ENOMEM; ++ } ++ ++ int idx = 0; ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ uint64_t strip_cnt = base_info->bdev->blockcnt >> raid_bdev->strip_size_shift; ++ uint64_t pd_block_cnt = strip_cnt << raid_bdev->strip_size_shift; ++ ++ block_range[idx].start = total_blockcnt; ++ block_range[idx].length = pd_block_cnt; ++ total_blockcnt += pd_block_cnt; ++ idx++; ++ } ++ ++ raid_bdev->module_private = block_range; ++ ++ SPDK_DEBUGLOG(bdev_concat, "total blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n", ++ total_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift); ++ raid_bdev->bdev.blockcnt = total_blockcnt; ++ ++ raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; ++ raid_bdev->bdev.split_on_optimal_io_boundary = true; ++ ++ return 0; ++} ++ ++static bool ++concat_stop(struct raid_bdev *raid_bdev) ++{ ++ struct concat_block_range *block_range = raid_bdev->module_private; ++ ++ free(block_range); ++ ++ return true; ++} ++ ++static struct raid_bdev_module g_concat_module = { ++ .level = CONCAT, ++ .base_bdevs_min = 1, ++ .start = concat_start, ++ .stop = concat_stop, ++ 
.submit_rw_request = concat_submit_rw_request, ++ .submit_null_payload_request = concat_submit_null_payload_request, ++}; ++RAID_MODULE_REGISTER(&g_concat_module) ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_concat) +diff --git a/module/bdev/raid/raid0.c b/module/bdev/raid/raid0.c +index b42a8f9..7da9e8c 100644 +--- a/module/bdev/raid/raid0.c ++++ b/module/bdev/raid/raid0.c +@@ -1,420 +1,420 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "bdev_raid.h" +- +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +- +-#include "spdk/log.h" +- +-/* +- * brief: +- * raid0_bdev_io_completion function is called by lower layers to notify raid +- * module that particular bdev_io is completed. +- * params: +- * bdev_io - pointer to bdev io submitted to lower layers, like child io +- * success - bdev_io status +- * cb_arg - function callback context (parent raid_bdev_io) +- * returns: +- * none +- */ +-static void +-raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- if (success) { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } else { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void raid0_submit_rw_request(struct raid_bdev_io *raid_io); +- +-static void +-_raid0_submit_rw_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- raid0_submit_rw_request(raid_io); +-} +- +-/* +- * brief: +- * raid0_submit_rw_request function is used to submit I/O to the correct +- * member disk for raid0 bdevs. 
+- * params: +- * raid_io +- * returns: +- * none +- */ +-static void +-raid0_submit_rw_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch; +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- uint64_t pd_strip; +- uint32_t offset_in_strip; +- uint64_t pd_lba; +- uint64_t pd_blocks; +- uint8_t pd_idx; +- int ret = 0; +- uint64_t start_strip; +- uint64_t end_strip; +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- +- start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift; +- end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >> +- raid_bdev->strip_size_shift; +- if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) { +- assert(false); +- SPDK_ERRLOG("I/O spans strip boundary!\n"); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- pd_strip = start_strip / raid_bdev->num_base_bdevs; +- pd_idx = start_strip % raid_bdev->num_base_bdevs; +- offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1); +- pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip; +- pd_blocks = bdev_io->u.bdev.num_blocks; +- base_info = &raid_bdev->base_bdev_info[pd_idx]; +- if (base_info->desc == NULL) { +- SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx); +- assert(0); +- } +- +- /* +- * Submit child io to bdev layer with using base bdev descriptors, base +- * bdev lba, base bdev child io length in blocks, buffer, completion +- * function and function callback context +- */ +- assert(raid_ch != NULL); +- assert(raid_ch->base_channel); +- base_ch = raid_ch->base_channel[pd_idx]; +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, raid0_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- raid0_bdev_io_completion, raid_io); +- } +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, raid0_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- raid0_bdev_io_completion, raid_io); +- } +- } else { +- SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type); +- assert(0); +- } +- +- if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid0_submit_rw_request); +- } else if (ret != 0) { +- SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); +- assert(false); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-/* raid0 IO range */ +-struct raid_bdev_io_range { +- uint64_t strip_size; +- uint64_t start_strip_in_disk; +- uint64_t end_strip_in_disk; +- uint64_t start_offset_in_strip; +- uint64_t end_offset_in_strip; +- uint8_t start_disk; +- uint8_t end_disk; +- uint8_t n_disks_involved; +-}; +- +-static inline void +-_raid0_get_io_range(struct 
raid_bdev_io_range *io_range, +- uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift, +- uint64_t offset_blocks, uint64_t num_blocks) +-{ +- uint64_t start_strip; +- uint64_t end_strip; +- uint64_t total_blocks; +- +- io_range->strip_size = strip_size; +- total_blocks = offset_blocks + num_blocks - (num_blocks > 0); +- +- /* The start and end strip index in raid0 bdev scope */ +- start_strip = offset_blocks >> strip_size_shift; +- end_strip = total_blocks >> strip_size_shift; +- io_range->start_strip_in_disk = start_strip / num_base_bdevs; +- io_range->end_strip_in_disk = end_strip / num_base_bdevs; +- +- /* The first strip may have unaligned start LBA offset. +- * The end strip may have unaligned end LBA offset. +- * Strips between them certainly have aligned offset and length to boundaries. +- */ +- io_range->start_offset_in_strip = offset_blocks % strip_size; +- io_range->end_offset_in_strip = total_blocks % strip_size; +- +- /* The base bdev indexes in which start and end strips are located */ +- io_range->start_disk = start_strip % num_base_bdevs; +- io_range->end_disk = end_strip % num_base_bdevs; +- +- /* Calculate how many base_bdevs are involved in io operation. +- * Number of base bdevs involved is between 1 and num_base_bdevs. +- * It will be 1 if the first strip and last strip are the same one. +- */ +- io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs); +-} +- +-static inline void +-_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx, +- uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk) +-{ +- uint64_t n_strips_in_disk; +- uint64_t start_offset_in_disk; +- uint64_t end_offset_in_disk; +- uint64_t offset_in_disk; +- uint64_t nblocks_in_disk; +- uint64_t start_strip_in_disk; +- uint64_t end_strip_in_disk; +- +- start_strip_in_disk = io_range->start_strip_in_disk; +- if (disk_idx < io_range->start_disk) { +- start_strip_in_disk += 1; +- } +- +- end_strip_in_disk = io_range->end_strip_in_disk; +- if (disk_idx > io_range->end_disk) { +- end_strip_in_disk -= 1; +- } +- +- assert(end_strip_in_disk >= start_strip_in_disk); +- n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1; +- +- if (disk_idx == io_range->start_disk) { +- start_offset_in_disk = io_range->start_offset_in_strip; +- } else { +- start_offset_in_disk = 0; +- } +- +- if (disk_idx == io_range->end_disk) { +- end_offset_in_disk = io_range->end_offset_in_strip; +- } else { +- end_offset_in_disk = io_range->strip_size - 1; +- } +- +- offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size; +- nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size +- + end_offset_in_disk - start_offset_in_disk + 1; +- +- SPDK_DEBUGLOG(bdev_raid0, +- "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 +- ").\n", +- io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk); +- +- *_offset_in_disk = offset_in_disk; +- *_nblocks_in_disk = nblocks_in_disk; +-} +- +-static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io); +- +-static void +-_raid0_submit_null_payload_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- raid0_submit_null_payload_request(raid_io); +-} +- +-static void +-raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- raid_bdev_io_complete_part(raid_io, 1, success ? 
+- SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +- +- spdk_bdev_free_io(bdev_io); +-} +- +-/* +- * brief: +- * raid0_submit_null_payload_request function submits the next batch of +- * io requests with range but without payload, like FLUSH and UNMAP, to member disks; +- * it will submit as many as possible unless one base io request fails with -ENOMEM, +- * in which case it will queue itself for later submission. +- * params: +- * bdev_io - pointer to parent bdev_io on raid bdev device +- * returns: +- * none +- */ +-static void +-raid0_submit_null_payload_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io; +- struct raid_bdev *raid_bdev; +- struct raid_bdev_io_range io_range; +- int ret; +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- +- bdev_io = spdk_bdev_io_from_ctx(raid_io); +- raid_bdev = raid_io->raid_bdev; +- +- _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs, +- raid_bdev->strip_size, raid_bdev->strip_size_shift, +- bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); +- +- if (raid_io->base_bdev_io_remaining == 0) { +- raid_io->base_bdev_io_remaining = io_range.n_disks_involved; +- } +- +- while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) { +- uint8_t disk_idx; +- uint64_t offset_in_disk; +- uint64_t nblocks_in_disk; +- +- /* base_bdev is started from start_disk to end_disk. +- * It is possible that index of start_disk is larger than end_disk's. +- */ +- disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs; +- base_info = &raid_bdev->base_bdev_info[disk_idx]; +- base_ch = raid_io->raid_ch->base_channel[disk_idx]; +- +- _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk); +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_UNMAP: +- ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch, +- offset_in_disk, nblocks_in_disk, +- raid0_base_io_complete, raid_io); +- break; +- +- case SPDK_BDEV_IO_TYPE_FLUSH: +- ret = spdk_bdev_flush_blocks(base_info->desc, base_ch, +- offset_in_disk, nblocks_in_disk, +- raid0_base_io_complete, raid_io); +- break; +- +- default: +- SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type); +- assert(false); +- ret = -EIO; +- } +- +- if (ret == 0) { +- raid_io->base_bdev_io_submitted++; +- } else if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid0_submit_null_payload_request); +- return; +- } else { +- SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); +- assert(false); +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- } +-} +- +-static uint64_t +-raid0_calculate_blockcnt(struct raid_bdev *raid_bdev) +-{ +- uint64_t min_blockcnt = UINT64_MAX; +- struct raid_base_bdev_info *base_info; +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- /* Calculate minimum block count from all base bdevs */ +- min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); +- } +- +- /* +- * Take the minimum block count based approach where total block count +- * of raid bdev is the number of base bdev times the minimum block count +- * of any base bdev. 
+- */ +- SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n", +- min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift); +- +- return ((min_blockcnt >> raid_bdev->strip_size_shift) << +- raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs; +-} +- +-static int +-raid0_start(struct raid_bdev *raid_bdev) +-{ +- raid_bdev->bdev.blockcnt = raid0_calculate_blockcnt(raid_bdev); +- +- if (raid_bdev->num_base_bdevs > 1) { +- raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; +- raid_bdev->bdev.split_on_optimal_io_boundary = true; +- } else { +- /* Do not need to split reads/writes on single bdev RAID modules. */ +- raid_bdev->bdev.optimal_io_boundary = 0; +- raid_bdev->bdev.split_on_optimal_io_boundary = false; +- } +- +- return 0; +-} +- +-static void +-raid0_resize(struct raid_bdev *raid_bdev) +-{ +- uint64_t blockcnt; +- int rc; +- +- blockcnt = raid0_calculate_blockcnt(raid_bdev); +- +- if (blockcnt == raid_bdev->bdev.blockcnt) { +- return; +- } +- +- SPDK_NOTICELOG("raid0 '%s': min blockcount was changed from %" PRIu64 " to %" PRIu64 "\n", +- raid_bdev->bdev.name, +- raid_bdev->bdev.blockcnt, +- blockcnt); +- +- rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to notify blockcount change\n"); +- } +-} +- +-static struct raid_bdev_module g_raid0_module = { +- .level = RAID0, +- .base_bdevs_min = 1, +- .start = raid0_start, +- .submit_rw_request = raid0_submit_rw_request, +- .submit_null_payload_request = raid0_submit_null_payload_request, +- .resize = raid0_resize, +-}; +-RAID_MODULE_REGISTER(&g_raid0_module) +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_raid0) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "bdev_raid.h" ++ ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "spdk/log.h" ++ ++/* ++ * brief: ++ * raid0_bdev_io_completion function is called by lower layers to notify raid ++ * module that particular bdev_io is completed. ++ * params: ++ * bdev_io - pointer to bdev io submitted to lower layers, like child io ++ * success - bdev_io status ++ * cb_arg - function callback context (parent raid_bdev_io) ++ * returns: ++ * none ++ */ ++static void ++raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ if (success) { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } else { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void raid0_submit_rw_request(struct raid_bdev_io *raid_io); ++ ++static void ++_raid0_submit_rw_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ raid0_submit_rw_request(raid_io); ++} ++ ++/* ++ * brief: ++ * raid0_submit_rw_request function is used to submit I/O to the correct ++ * member disk for raid0 bdevs. 
++ * params: ++ * raid_io ++ * returns: ++ * none ++ */ ++static void ++raid0_submit_rw_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch; ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ uint64_t pd_strip; ++ uint32_t offset_in_strip; ++ uint64_t pd_lba; ++ uint64_t pd_blocks; ++ uint8_t pd_idx; ++ int ret = 0; ++ uint64_t start_strip; ++ uint64_t end_strip; ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ ++ start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift; ++ end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >> ++ raid_bdev->strip_size_shift; ++ if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) { ++ assert(false); ++ SPDK_ERRLOG("I/O spans strip boundary!\n"); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ pd_strip = start_strip / raid_bdev->num_base_bdevs; ++ pd_idx = start_strip % raid_bdev->num_base_bdevs; ++ offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1); ++ pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip; ++ pd_blocks = bdev_io->u.bdev.num_blocks; ++ base_info = &raid_bdev->base_bdev_info[pd_idx]; ++ if (base_info->desc == NULL) { ++ SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx); ++ assert(0); ++ } ++ ++ /* ++ * Submit child io to bdev layer with using base bdev descriptors, base ++ * bdev lba, base bdev child io length in blocks, buffer, completion ++ * function and function callback context ++ */ ++ assert(raid_ch != NULL); ++ assert(raid_ch->base_channel); ++ base_ch = raid_ch->base_channel[pd_idx]; ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, raid0_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ raid0_bdev_io_completion, raid_io); ++ } ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, raid0_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ raid0_bdev_io_completion, raid_io); ++ } ++ } else { ++ SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type); ++ assert(0); ++ } ++ ++ if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid0_submit_rw_request); ++ } else if (ret != 0) { ++ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); ++ assert(false); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++/* raid0 IO range */ ++struct raid_bdev_io_range { ++ uint64_t strip_size; ++ uint64_t start_strip_in_disk; ++ uint64_t end_strip_in_disk; ++ uint64_t start_offset_in_strip; ++ uint64_t end_offset_in_strip; ++ uint8_t start_disk; ++ uint8_t end_disk; ++ uint8_t n_disks_involved; ++}; ++ ++static inline void ++_raid0_get_io_range(struct 
raid_bdev_io_range *io_range, ++ uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift, ++ uint64_t offset_blocks, uint64_t num_blocks) ++{ ++ uint64_t start_strip; ++ uint64_t end_strip; ++ uint64_t total_blocks; ++ ++ io_range->strip_size = strip_size; ++ total_blocks = offset_blocks + num_blocks - (num_blocks > 0); ++ ++ /* The start and end strip index in raid0 bdev scope */ ++ start_strip = offset_blocks >> strip_size_shift; ++ end_strip = total_blocks >> strip_size_shift; ++ io_range->start_strip_in_disk = start_strip / num_base_bdevs; ++ io_range->end_strip_in_disk = end_strip / num_base_bdevs; ++ ++ /* The first strip may have unaligned start LBA offset. ++ * The end strip may have unaligned end LBA offset. ++ * Strips between them certainly have aligned offset and length to boundaries. ++ */ ++ io_range->start_offset_in_strip = offset_blocks % strip_size; ++ io_range->end_offset_in_strip = total_blocks % strip_size; ++ ++ /* The base bdev indexes in which start and end strips are located */ ++ io_range->start_disk = start_strip % num_base_bdevs; ++ io_range->end_disk = end_strip % num_base_bdevs; ++ ++ /* Calculate how many base_bdevs are involved in io operation. ++ * Number of base bdevs involved is between 1 and num_base_bdevs. ++ * It will be 1 if the first strip and last strip are the same one. ++ */ ++ io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs); ++} ++ ++static inline void ++_raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx, ++ uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk) ++{ ++ uint64_t n_strips_in_disk; ++ uint64_t start_offset_in_disk; ++ uint64_t end_offset_in_disk; ++ uint64_t offset_in_disk; ++ uint64_t nblocks_in_disk; ++ uint64_t start_strip_in_disk; ++ uint64_t end_strip_in_disk; ++ ++ start_strip_in_disk = io_range->start_strip_in_disk; ++ if (disk_idx < io_range->start_disk) { ++ start_strip_in_disk += 1; ++ } ++ ++ end_strip_in_disk = io_range->end_strip_in_disk; ++ if (disk_idx > io_range->end_disk) { ++ end_strip_in_disk -= 1; ++ } ++ ++ assert(end_strip_in_disk >= start_strip_in_disk); ++ n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1; ++ ++ if (disk_idx == io_range->start_disk) { ++ start_offset_in_disk = io_range->start_offset_in_strip; ++ } else { ++ start_offset_in_disk = 0; ++ } ++ ++ if (disk_idx == io_range->end_disk) { ++ end_offset_in_disk = io_range->end_offset_in_strip; ++ } else { ++ end_offset_in_disk = io_range->strip_size - 1; ++ } ++ ++ offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size; ++ nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size ++ + end_offset_in_disk - start_offset_in_disk + 1; ++ ++ SPDK_DEBUGLOG(bdev_raid0, ++ "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64 ++ ").\n", ++ io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk); ++ ++ *_offset_in_disk = offset_in_disk; ++ *_nblocks_in_disk = nblocks_in_disk; ++} ++ ++static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io); ++ ++static void ++_raid0_submit_null_payload_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ raid0_submit_null_payload_request(raid_io); ++} ++ ++static void ++raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ raid_bdev_io_complete_part(raid_io, 1, success ? 
++ SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++ ++ spdk_bdev_free_io(bdev_io); ++} ++ ++/* ++ * brief: ++ * raid0_submit_null_payload_request function submits the next batch of ++ * io requests with range but without payload, like FLUSH and UNMAP, to member disks; ++ * it will submit as many as possible unless one base io request fails with -ENOMEM, ++ * in which case it will queue itself for later submission. ++ * params: ++ * bdev_io - pointer to parent bdev_io on raid bdev device ++ * returns: ++ * none ++ */ ++static void ++raid0_submit_null_payload_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io; ++ struct raid_bdev *raid_bdev; ++ struct raid_bdev_io_range io_range; ++ int ret; ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ ++ bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ raid_bdev = raid_io->raid_bdev; ++ ++ _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs, ++ raid_bdev->strip_size, raid_bdev->strip_size_shift, ++ bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); ++ ++ if (raid_io->base_bdev_io_remaining == 0) { ++ raid_io->base_bdev_io_remaining = io_range.n_disks_involved; ++ } ++ ++ while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) { ++ uint8_t disk_idx; ++ uint64_t offset_in_disk; ++ uint64_t nblocks_in_disk; ++ ++ /* base_bdev is started from start_disk to end_disk. ++ * It is possible that index of start_disk is larger than end_disk's. ++ */ ++ disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs; ++ base_info = &raid_bdev->base_bdev_info[disk_idx]; ++ base_ch = raid_io->raid_ch->base_channel[disk_idx]; ++ ++ _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk); ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ ret = spdk_bdev_unmap_blocks(base_info->desc, base_ch, ++ offset_in_disk, nblocks_in_disk, ++ raid0_base_io_complete, raid_io); ++ break; ++ ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ ret = spdk_bdev_flush_blocks(base_info->desc, base_ch, ++ offset_in_disk, nblocks_in_disk, ++ raid0_base_io_complete, raid_io); ++ break; ++ ++ default: ++ SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", bdev_io->type); ++ assert(false); ++ ret = -EIO; ++ } ++ ++ if (ret == 0) { ++ raid_io->base_bdev_io_submitted++; ++ } else if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid0_submit_null_payload_request); ++ return; ++ } else { ++ SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n"); ++ assert(false); ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ } ++} ++ ++static uint64_t ++raid0_calculate_blockcnt(struct raid_bdev *raid_bdev) ++{ ++ uint64_t min_blockcnt = UINT64_MAX; ++ struct raid_base_bdev_info *base_info; ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ /* Calculate minimum block count from all base bdevs */ ++ min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); ++ } ++ ++ /* ++ * Take the minimum block count based approach where total block count ++ * of raid bdev is the number of base bdev times the minimum block count ++ * of any base bdev. 
++ */ ++ SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n", ++ min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift); ++ ++ return ((min_blockcnt >> raid_bdev->strip_size_shift) << ++ raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs; ++} ++ ++static int ++raid0_start(struct raid_bdev *raid_bdev) ++{ ++ raid_bdev->bdev.blockcnt = raid0_calculate_blockcnt(raid_bdev); ++ ++ if (raid_bdev->num_base_bdevs > 1) { ++ raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; ++ raid_bdev->bdev.split_on_optimal_io_boundary = true; ++ } else { ++ /* Do not need to split reads/writes on single bdev RAID modules. */ ++ raid_bdev->bdev.optimal_io_boundary = 0; ++ raid_bdev->bdev.split_on_optimal_io_boundary = false; ++ } ++ ++ return 0; ++} ++ ++static void ++raid0_resize(struct raid_bdev *raid_bdev) ++{ ++ uint64_t blockcnt; ++ int rc; ++ ++ blockcnt = raid0_calculate_blockcnt(raid_bdev); ++ ++ if (blockcnt == raid_bdev->bdev.blockcnt) { ++ return; ++ } ++ ++ SPDK_NOTICELOG("raid0 '%s': min blockcount was changed from %" PRIu64 " to %" PRIu64 "\n", ++ raid_bdev->bdev.name, ++ raid_bdev->bdev.blockcnt, ++ blockcnt); ++ ++ rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to notify blockcount change\n"); ++ } ++} ++ ++static struct raid_bdev_module g_raid0_module = { ++ .level = RAID0, ++ .base_bdevs_min = 1, ++ .start = raid0_start, ++ .submit_rw_request = raid0_submit_rw_request, ++ .submit_null_payload_request = raid0_submit_null_payload_request, ++ .resize = raid0_resize, ++}; ++RAID_MODULE_REGISTER(&g_raid0_module) ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_raid0) +diff --git a/module/bdev/raid/raid1.c b/module/bdev/raid/raid1.c +index cfe1e6b..555b8c8 100644 +--- a/module/bdev/raid/raid1.c ++++ b/module/bdev/raid/raid1.c +@@ -1,201 +1,201 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "bdev_raid.h" +- +-#include "spdk/likely.h" +-#include "spdk/log.h" +- +-struct raid1_info { +- /* The parent raid bdev */ +- struct raid_bdev *raid_bdev; +-}; +- +-static void +-raid1_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- raid_bdev_io_complete_part(raid_io, 1, success ? 
+- SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void raid1_submit_rw_request(struct raid_bdev_io *raid_io); +- +-static void +-_raid1_submit_rw_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- raid1_submit_rw_request(raid_io); +-} +- +-static int +-raid1_submit_read_request(struct raid_bdev_io *raid_io) +-{ +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- uint8_t ch_idx = 0; +- struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[ch_idx]; +- struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[ch_idx]; +- uint64_t pd_lba, pd_blocks; +- int ret; +- +- pd_lba = bdev_io->u.bdev.offset_blocks; +- pd_blocks = bdev_io->u.bdev.num_blocks; +- +- raid_io->base_bdev_io_remaining = 1; +- +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, raid1_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- raid1_bdev_io_completion, raid_io); +- } +- +- if (spdk_likely(ret == 0)) { +- raid_io->base_bdev_io_submitted++; +- } else if (spdk_unlikely(ret == -ENOMEM)) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid1_submit_rw_request); +- return 0; +- } +- +- return ret; +-} +- +-static int +-raid1_submit_write_request(struct raid_bdev_io *raid_io) +-{ +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- struct raid_base_bdev_info *base_info; +- struct spdk_io_channel *base_ch; +- uint64_t pd_lba, pd_blocks; +- uint16_t idx = raid_io->base_bdev_io_submitted; +- uint64_t base_bdev_io_not_submitted; +- int ret = 0; +- +- pd_lba = bdev_io->u.bdev.offset_blocks; +- pd_blocks = bdev_io->u.bdev.num_blocks; +- +- if (raid_io->base_bdev_io_submitted == 0) { +- raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; +- } +- +- for (; idx < raid_bdev->num_base_bdevs; idx++) { +- base_info = &raid_bdev->base_bdev_info[idx]; +- base_ch = raid_io->raid_ch->base_channel[idx]; +- +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- pd_lba, pd_blocks, raid1_bdev_io_completion, +- raid_io, bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- pd_lba, pd_blocks, +- raid1_bdev_io_completion, raid_io); +- } +- +- if (spdk_unlikely(ret != 0)) { +- if (spdk_unlikely(ret == -ENOMEM)) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid1_submit_rw_request); +- return 0; +- } +- +- base_bdev_io_not_submitted = raid_bdev->num_base_bdevs - +- raid_io->base_bdev_io_submitted; +- raid_bdev_io_complete_part(raid_io, base_bdev_io_not_submitted, +- SPDK_BDEV_IO_STATUS_FAILED); +- return 0; +- } +- +- raid_io->base_bdev_io_submitted++; +- } +- +- return ret; +-} +- +-static void +-raid1_submit_rw_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- int ret; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- ret = raid1_submit_read_request(raid_io); +- break; +- case 
SPDK_BDEV_IO_TYPE_WRITE: +- ret = raid1_submit_write_request(raid_io); +- break; +- default: +- ret = -EINVAL; +- break; +- } +- +- if (spdk_unlikely(ret != 0)) { +- raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static int +-raid1_start(struct raid_bdev *raid_bdev) +-{ +- uint64_t min_blockcnt = UINT64_MAX; +- struct raid_base_bdev_info *base_info; +- struct raid1_info *r1info; +- +- r1info = calloc(1, sizeof(*r1info)); +- if (!r1info) { +- SPDK_ERRLOG("Failed to allocate RAID1 info device structure\n"); +- return -ENOMEM; +- } +- r1info->raid_bdev = raid_bdev; +- +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); +- } +- +- raid_bdev->bdev.blockcnt = min_blockcnt; +- raid_bdev->module_private = r1info; +- +- return 0; +-} +- +-static bool +-raid1_stop(struct raid_bdev *raid_bdev) +-{ +- struct raid1_info *r1info = raid_bdev->module_private; +- +- free(r1info); +- +- return true; +-} +- +-static struct raid_bdev_module g_raid1_module = { +- .level = RAID1, +- .base_bdevs_min = 2, +- .base_bdevs_constraint = {CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, 1}, +- .start = raid1_start, +- .stop = raid1_stop, +- .submit_rw_request = raid1_submit_rw_request, +-}; +-RAID_MODULE_REGISTER(&g_raid1_module) +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_raid1) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "bdev_raid.h" ++ ++#include "spdk/likely.h" ++#include "spdk/log.h" ++ ++struct raid1_info { ++ /* The parent raid bdev */ ++ struct raid_bdev *raid_bdev; ++}; ++ ++static void ++raid1_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ raid_bdev_io_complete_part(raid_io, 1, success ? 
++ SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void raid1_submit_rw_request(struct raid_bdev_io *raid_io); ++ ++static void ++_raid1_submit_rw_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ raid1_submit_rw_request(raid_io); ++} ++ ++static int ++raid1_submit_read_request(struct raid_bdev_io *raid_io) ++{ ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ uint8_t ch_idx = 0; ++ struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[ch_idx]; ++ struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[ch_idx]; ++ uint64_t pd_lba, pd_blocks; ++ int ret; ++ ++ pd_lba = bdev_io->u.bdev.offset_blocks; ++ pd_blocks = bdev_io->u.bdev.num_blocks; ++ ++ raid_io->base_bdev_io_remaining = 1; ++ ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, raid1_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ raid1_bdev_io_completion, raid_io); ++ } ++ ++ if (spdk_likely(ret == 0)) { ++ raid_io->base_bdev_io_submitted++; ++ } else if (spdk_unlikely(ret == -ENOMEM)) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid1_submit_rw_request); ++ return 0; ++ } ++ ++ return ret; ++} ++ ++static int ++raid1_submit_write_request(struct raid_bdev_io *raid_io) ++{ ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ struct raid_base_bdev_info *base_info; ++ struct spdk_io_channel *base_ch; ++ uint64_t pd_lba, pd_blocks; ++ uint16_t idx = raid_io->base_bdev_io_submitted; ++ uint64_t base_bdev_io_not_submitted; ++ int ret = 0; ++ ++ pd_lba = bdev_io->u.bdev.offset_blocks; ++ pd_blocks = bdev_io->u.bdev.num_blocks; ++ ++ if (raid_io->base_bdev_io_submitted == 0) { ++ raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; ++ } ++ ++ for (; idx < raid_bdev->num_base_bdevs; idx++) { ++ base_info = &raid_bdev->base_bdev_info[idx]; ++ base_ch = raid_io->raid_ch->base_channel[idx]; ++ ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ pd_lba, pd_blocks, raid1_bdev_io_completion, ++ raid_io, bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ pd_lba, pd_blocks, ++ raid1_bdev_io_completion, raid_io); ++ } ++ ++ if (spdk_unlikely(ret != 0)) { ++ if (spdk_unlikely(ret == -ENOMEM)) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid1_submit_rw_request); ++ return 0; ++ } ++ ++ base_bdev_io_not_submitted = raid_bdev->num_base_bdevs - ++ raid_io->base_bdev_io_submitted; ++ raid_bdev_io_complete_part(raid_io, base_bdev_io_not_submitted, ++ SPDK_BDEV_IO_STATUS_FAILED); ++ return 0; ++ } ++ ++ raid_io->base_bdev_io_submitted++; ++ } ++ ++ return ret; ++} ++ ++static void ++raid1_submit_rw_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ int ret; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ ret = raid1_submit_read_request(raid_io); ++ break; ++ case 
SPDK_BDEV_IO_TYPE_WRITE: ++ ret = raid1_submit_write_request(raid_io); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (spdk_unlikely(ret != 0)) { ++ raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static int ++raid1_start(struct raid_bdev *raid_bdev) ++{ ++ uint64_t min_blockcnt = UINT64_MAX; ++ struct raid_base_bdev_info *base_info; ++ struct raid1_info *r1info; ++ ++ r1info = calloc(1, sizeof(*r1info)); ++ if (!r1info) { ++ SPDK_ERRLOG("Failed to allocate RAID1 info device structure\n"); ++ return -ENOMEM; ++ } ++ r1info->raid_bdev = raid_bdev; ++ ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); ++ } ++ ++ raid_bdev->bdev.blockcnt = min_blockcnt; ++ raid_bdev->module_private = r1info; ++ ++ return 0; ++} ++ ++static bool ++raid1_stop(struct raid_bdev *raid_bdev) ++{ ++ struct raid1_info *r1info = raid_bdev->module_private; ++ ++ free(r1info); ++ ++ return true; ++} ++ ++static struct raid_bdev_module g_raid1_module = { ++ .level = RAID1, ++ .base_bdevs_min = 2, ++ .base_bdevs_constraint = {CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL, 1}, ++ .start = raid1_start, ++ .stop = raid1_stop, ++ .submit_rw_request = raid1_submit_rw_request, ++}; ++RAID_MODULE_REGISTER(&g_raid1_module) ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_raid1) +diff --git a/module/bdev/raid/raid5f.c b/module/bdev/raid/raid5f.c +index 6659ab9..243a45c 100644 +--- a/module/bdev/raid/raid5f.c ++++ b/module/bdev/raid/raid5f.c +@@ -1,790 +1,790 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "bdev_raid.h" +- +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/likely.h" +-#include "spdk/log.h" +-#include "spdk/xor.h" +- +-/* Maximum concurrent full stripe writes per io channel */ +-#define RAID5F_MAX_STRIPES 32 +- +-struct chunk { +- /* Corresponds to base_bdev index */ +- uint8_t index; +- +- /* Array of iovecs */ +- struct iovec *iovs; +- +- /* Number of used iovecs */ +- int iovcnt; +- +- /* Total number of available iovecs in the array */ +- int iovcnt_max; +- +- /* Pointer to buffer with I/O metadata */ +- void *md_buf; +- +- /* Shallow copy of IO request parameters */ +- struct spdk_bdev_ext_io_opts ext_opts; +-}; +- +-struct stripe_request { +- struct raid5f_io_channel *r5ch; +- +- /* The associated raid_bdev_io */ +- struct raid_bdev_io *raid_io; +- +- /* The stripe's index in the raid array. 
*/ +- uint64_t stripe_index; +- +- /* The stripe's parity chunk */ +- struct chunk *parity_chunk; +- +- /* Buffer for stripe parity */ +- void *parity_buf; +- +- /* Buffer for stripe io metadata parity */ +- void *parity_md_buf; +- +- TAILQ_ENTRY(stripe_request) link; +- +- /* Array of chunks corresponding to base_bdevs */ +- struct chunk chunks[0]; +-}; +- +-struct raid5f_info { +- /* The parent raid bdev */ +- struct raid_bdev *raid_bdev; +- +- /* Number of data blocks in a stripe (without parity) */ +- uint64_t stripe_blocks; +- +- /* Number of stripes on this array */ +- uint64_t total_stripes; +- +- /* Alignment for buffer allocation */ +- size_t buf_alignment; +-}; +- +-struct raid5f_io_channel { +- /* All available stripe requests on this channel */ +- TAILQ_HEAD(, stripe_request) free_stripe_requests; +- +- /* Array of iovec iterators for each data chunk */ +- struct iov_iter { +- struct iovec *iovs; +- int iovcnt; +- int index; +- size_t offset; +- } *chunk_iov_iters; +- +- /* Array of source buffer pointers for parity calculation */ +- void **chunk_xor_buffers; +- +- /* Array of source buffer pointers for parity calculation of io metadata */ +- void **chunk_xor_md_buffers; +- +- /* Bounce buffers for parity calculation in case of unaligned source buffers */ +- struct iovec *chunk_xor_bounce_buffers; +-}; +- +-#define __CHUNK_IN_RANGE(req, c) \ +- c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs +- +-#define FOR_EACH_CHUNK_FROM(req, c, from) \ +- for (c = from; __CHUNK_IN_RANGE(req, c); c++) +- +-#define FOR_EACH_CHUNK(req, c) \ +- FOR_EACH_CHUNK_FROM(req, c, req->chunks) +- +-#define __NEXT_DATA_CHUNK(req, c) \ +- c == req->parity_chunk ? c+1 : c +- +-#define FOR_EACH_DATA_CHUNK(req, c) \ +- for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \ +- c = __NEXT_DATA_CHUNK(req, c+1)) +- +-static inline struct raid5f_info * +-raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch) +-{ +- return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch)); +-} +- +-static inline struct stripe_request * +-raid5f_chunk_stripe_req(struct chunk *chunk) +-{ +- return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks); +-} +- +-static inline uint8_t +-raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev) +-{ +- return raid_bdev->min_base_bdevs_operational; +-} +- +-static inline uint8_t +-raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index) +-{ +- return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs; +-} +- +-static inline void +-raid5f_stripe_request_release(struct stripe_request *stripe_req) +-{ +- TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link); +-} +- +-static int +-raid5f_xor_stripe(struct stripe_request *stripe_req) +-{ +- struct raid_bdev_io *raid_io = stripe_req->raid_io; +- struct raid5f_io_channel *r5ch = stripe_req->r5ch; +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- size_t remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift; +- uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev); +- void *dest = stripe_req->parity_buf; +- size_t alignment_mask = spdk_xor_get_optimal_alignment() - 1; +- void *raid_md = spdk_bdev_io_get_md_buf(bdev_io); +- uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); +- struct chunk *chunk; +- int ret; +- uint8_t c; +- +- c = 0; +- FOR_EACH_DATA_CHUNK(stripe_req, chunk) { 
+- struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[c]; +- bool aligned = true; +- int i; +- +- for (i = 0; i < chunk->iovcnt; i++) { +- if (((uintptr_t)chunk->iovs[i].iov_base & alignment_mask) || +- (chunk->iovs[i].iov_len & alignment_mask)) { +- aligned = false; +- break; +- } +- } +- +- if (aligned) { +- iov_iter->iovs = chunk->iovs; +- iov_iter->iovcnt = chunk->iovcnt; +- } else { +- iov_iter->iovs = &r5ch->chunk_xor_bounce_buffers[c]; +- iov_iter->iovcnt = 1; +- spdk_iovcpy(chunk->iovs, chunk->iovcnt, iov_iter->iovs, iov_iter->iovcnt); +- } +- +- iov_iter->index = 0; +- iov_iter->offset = 0; +- +- c++; +- } +- +- while (remaining > 0) { +- size_t len = remaining; +- uint8_t i; +- +- for (i = 0; i < n_src; i++) { +- struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; +- struct iovec *iov = &iov_iter->iovs[iov_iter->index]; +- +- len = spdk_min(len, iov->iov_len - iov_iter->offset); +- r5ch->chunk_xor_buffers[i] = iov->iov_base + iov_iter->offset; +- } +- +- assert(len > 0); +- +- ret = spdk_xor_gen(dest, r5ch->chunk_xor_buffers, n_src, len); +- if (spdk_unlikely(ret)) { +- SPDK_ERRLOG("stripe xor failed\n"); +- return ret; +- } +- +- for (i = 0; i < n_src; i++) { +- struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; +- struct iovec *iov = &iov_iter->iovs[iov_iter->index]; +- +- iov_iter->offset += len; +- if (iov_iter->offset == iov->iov_len) { +- iov_iter->offset = 0; +- iov_iter->index++; +- } +- } +- dest += len; +- +- remaining -= len; +- } +- +- if (raid_md != NULL) { +- uint64_t len = raid_bdev->strip_size * raid_md_size; +- c = 0; +- FOR_EACH_DATA_CHUNK(stripe_req, chunk) { +- r5ch->chunk_xor_md_buffers[c] = chunk->md_buf; +- c++; +- } +- ret = spdk_xor_gen(stripe_req->parity_md_buf, r5ch->chunk_xor_md_buffers, n_src, len); +- if (spdk_unlikely(ret)) { +- SPDK_ERRLOG("stripe io metadata xor failed\n"); +- return ret; +- } +- } +- +- return 0; +-} +- +-static void +-raid5f_chunk_write_complete(struct chunk *chunk, enum spdk_bdev_io_status status) +-{ +- struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk); +- +- if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) { +- raid5f_stripe_request_release(stripe_req); +- } +-} +- +-static void +-raid5f_chunk_write_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct chunk *chunk = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- raid5f_chunk_write_complete(chunk, success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req); +- +-static void +-raid5f_chunk_write_retry(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- struct stripe_request *stripe_req = raid_io->module_private; +- +- raid5f_stripe_request_submit_chunks(stripe_req); +-} +- +-static inline void +-copy_ext_io_opts(struct spdk_bdev_ext_io_opts *dst, struct spdk_bdev_ext_io_opts *src) +-{ +- memset(dst, 0, sizeof(*dst)); +- memcpy(dst, src, src->size); +- dst->size = sizeof(*dst); +-} +- +-static int +-raid5f_chunk_write(struct chunk *chunk) +-{ +- struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk); +- struct raid_bdev_io *raid_io = stripe_req->raid_io; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index]; +- struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index]; +- uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift); +- int ret; +- +- if (bdev_io->u.bdev.ext_opts != NULL) { +- copy_ext_io_opts(&chunk->ext_opts, bdev_io->u.bdev.ext_opts); +- chunk->ext_opts.metadata = chunk->md_buf; +- +- ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt, +- base_offset_blocks, raid_bdev->strip_size, raid5f_chunk_write_complete_bdev_io, +- chunk, &chunk->ext_opts); +- } else { +- ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt, +- chunk->md_buf, base_offset_blocks, raid_bdev->strip_size, +- raid5f_chunk_write_complete_bdev_io, chunk); +- } +- +- if (spdk_unlikely(ret)) { +- if (ret == -ENOMEM) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- raid5f_chunk_write_retry); +- } else { +- /* +- * Implicitly complete any I/Os not yet submitted as FAILED. If completing +- * these means there are no more to complete for the stripe request, we can +- * release the stripe request as well. 
+- */ +- uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs - +- raid_io->base_bdev_io_submitted; +- +- if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted, +- SPDK_BDEV_IO_STATUS_FAILED)) { +- raid5f_stripe_request_release(stripe_req); +- } +- } +- } +- +- return ret; +-} +- +-static int +-raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req) +-{ +- struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io); +- const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs; +- int raid_io_iovcnt = bdev_io->u.bdev.iovcnt; +- void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io); +- uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); +- struct chunk *chunk; +- int raid_io_iov_idx = 0; +- size_t raid_io_offset = 0; +- size_t raid_io_iov_offset = 0; +- int i; +- +- FOR_EACH_DATA_CHUNK(stripe_req, chunk) { +- int chunk_iovcnt = 0; +- uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift; +- size_t off = raid_io_iov_offset; +- +- for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) { +- chunk_iovcnt++; +- off += raid_io_iovs[i].iov_len; +- if (off >= raid_io_offset + len) { +- break; +- } +- } +- +- assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt); +- +- if (chunk_iovcnt > chunk->iovcnt_max) { +- struct iovec *iovs = chunk->iovs; +- +- iovs = realloc(iovs, chunk_iovcnt * sizeof(*iovs)); +- if (!iovs) { +- return -ENOMEM; +- } +- chunk->iovs = iovs; +- chunk->iovcnt_max = chunk_iovcnt; +- } +- chunk->iovcnt = chunk_iovcnt; +- +- if (raid_io_md) { +- chunk->md_buf = raid_io_md + +- (raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size; +- } +- +- for (i = 0; i < chunk_iovcnt; i++) { +- struct iovec *chunk_iov = &chunk->iovs[i]; +- const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx]; +- size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset; +- +- chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset; +- chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset); +- raid_io_offset += chunk_iov->iov_len; +- len -= chunk_iov->iov_len; +- +- if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) { +- raid_io_iov_idx++; +- raid_io_iov_offset += raid_io_iov->iov_len; +- } +- } +- +- if (spdk_unlikely(len > 0)) { +- return -EINVAL; +- } +- } +- +- stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->parity_buf; +- stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << +- raid_bdev->blocklen_shift; +- stripe_req->parity_chunk->md_buf = stripe_req->parity_md_buf; +- stripe_req->parity_chunk->iovcnt = 1; +- +- return 0; +-} +- +-static void +-raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req) +-{ +- struct raid_bdev_io *raid_io = stripe_req->raid_io; +- struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted]; +- struct chunk *chunk; +- +- FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) { +- if (spdk_unlikely(raid5f_chunk_write(chunk) != 0)) { +- break; +- } +- raid_io->base_bdev_io_submitted++; +- } +-} +- +-static void +-raid5f_submit_stripe_request(struct stripe_request *stripe_req) +-{ +- if (spdk_unlikely(raid5f_xor_stripe(stripe_req) != 0)) { +- raid_bdev_io_complete(stripe_req->raid_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- raid5f_stripe_request_submit_chunks(stripe_req); +-} +- +-static int +-raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index) +-{ +- struct raid_bdev *raid_bdev = 
raid_io->raid_bdev; +- struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel); +- struct stripe_request *stripe_req; +- int ret; +- +- stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests); +- if (!stripe_req) { +- return -ENOMEM; +- } +- +- stripe_req->stripe_index = stripe_index; +- stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev, +- stripe_req->stripe_index); +- stripe_req->raid_io = raid_io; +- +- ret = raid5f_stripe_request_map_iovecs(stripe_req); +- if (spdk_unlikely(ret)) { +- return ret; +- } +- +- TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); +- +- raid_io->module_private = stripe_req; +- raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; +- +- raid5f_submit_stripe_request(stripe_req); +- +- return 0; +-} +- +-static void +-raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct raid_bdev_io *raid_io = cb_arg; +- +- spdk_bdev_free_io(bdev_io); +- +- raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS : +- SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io); +- +-static void +-_raid5f_submit_rw_request(void *_raid_io) +-{ +- struct raid_bdev_io *raid_io = _raid_io; +- +- raid5f_submit_rw_request(raid_io); +-} +- +-static int +-raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index, +- uint64_t stripe_offset) +-{ +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift; +- uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index); +- uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1; +- struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx]; +- struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx]; +- uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift); +- uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset; +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- int ret; +- +- if (bdev_io->u.bdev.ext_opts != NULL) { +- ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io, +- bdev_io->u.bdev.ext_opts); +- } else { +- ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- base_offset_blocks, bdev_io->u.bdev.num_blocks, +- raid5f_chunk_read_complete, raid_io); +- } +- +- if (spdk_unlikely(ret == -ENOMEM)) { +- raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, +- _raid5f_submit_rw_request); +- return 0; +- } +- +- return ret; +-} +- +-static void +-raid5f_submit_rw_request(struct raid_bdev_io *raid_io) +-{ +- struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); +- struct raid_bdev *raid_bdev = raid_io->raid_bdev; +- struct raid5f_info *r5f_info = raid_bdev->module_private; +- uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks; +- uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks; +- uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks; +- int ret; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size); +- ret = raid5f_submit_read_request(raid_io, stripe_index, 
stripe_offset); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- assert(stripe_offset == 0); +- assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks); +- ret = raid5f_submit_write_request(raid_io, stripe_index); +- break; +- default: +- ret = -EINVAL; +- break; +- } +- +- if (spdk_unlikely(ret)) { +- raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM : +- SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-raid5f_stripe_request_free(struct stripe_request *stripe_req) +-{ +- struct chunk *chunk; +- +- FOR_EACH_CHUNK(stripe_req, chunk) { +- free(chunk->iovs); +- } +- +- spdk_dma_free(stripe_req->parity_buf); +- spdk_dma_free(stripe_req->parity_md_buf); +- +- free(stripe_req); +-} +- +-static struct stripe_request * +-raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch) +-{ +- struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch); +- struct raid_bdev *raid_bdev = r5f_info->raid_bdev; +- uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); +- struct stripe_request *stripe_req; +- struct chunk *chunk; +- +- stripe_req = calloc(1, sizeof(*stripe_req) + +- sizeof(struct chunk) * raid_bdev->num_base_bdevs); +- if (!stripe_req) { +- return NULL; +- } +- +- stripe_req->r5ch = r5ch; +- +- FOR_EACH_CHUNK(stripe_req, chunk) { +- chunk->index = chunk - stripe_req->chunks; +- chunk->iovcnt_max = 4; +- chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0])); +- if (!chunk->iovs) { +- goto err; +- } +- } +- +- stripe_req->parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift, +- r5f_info->buf_alignment, NULL); +- if (!stripe_req->parity_buf) { +- goto err; +- } +- +- if (raid_io_md_size != 0) { +- stripe_req->parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size, +- r5f_info->buf_alignment, NULL); +- if (!stripe_req->parity_md_buf) { +- goto err; +- } +- } +- +- return stripe_req; +-err: +- raid5f_stripe_request_free(stripe_req); +- return NULL; +-} +- +-static void +-raid5f_ioch_destroy(void *io_device, void *ctx_buf) +-{ +- struct raid5f_io_channel *r5ch = ctx_buf; +- struct raid5f_info *r5f_info = io_device; +- struct raid_bdev *raid_bdev = r5f_info->raid_bdev; +- struct stripe_request *stripe_req; +- int i; +- +- while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) { +- TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); +- raid5f_stripe_request_free(stripe_req); +- } +- +- if (r5ch->chunk_xor_bounce_buffers) { +- for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { +- free(r5ch->chunk_xor_bounce_buffers[i].iov_base); +- } +- free(r5ch->chunk_xor_bounce_buffers); +- } +- +- free(r5ch->chunk_xor_buffers); +- free(r5ch->chunk_xor_md_buffers); +- free(r5ch->chunk_iov_iters); +-} +- +-static int +-raid5f_ioch_create(void *io_device, void *ctx_buf) +-{ +- struct raid5f_io_channel *r5ch = ctx_buf; +- struct raid5f_info *r5f_info = io_device; +- struct raid_bdev *raid_bdev = r5f_info->raid_bdev; +- size_t chunk_len = raid_bdev->strip_size << raid_bdev->blocklen_shift; +- int status = 0; +- int i; +- +- TAILQ_INIT(&r5ch->free_stripe_requests); +- +- for (i = 0; i < RAID5F_MAX_STRIPES; i++) { +- struct stripe_request *stripe_req; +- +- stripe_req = raid5f_stripe_request_alloc(r5ch); +- if (!stripe_req) { +- status = -ENOMEM; +- goto out; +- } +- +- TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link); +- } +- +- r5ch->chunk_iov_iters = calloc(raid5f_stripe_data_chunks_num(raid_bdev), +- sizeof(r5ch->chunk_iov_iters[0])); +- if (!r5ch->chunk_iov_iters) { +- 
status = -ENOMEM; +- goto out; +- } +- +- r5ch->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), +- sizeof(r5ch->chunk_xor_buffers[0])); +- if (!r5ch->chunk_xor_buffers) { +- status = -ENOMEM; +- goto out; +- } +- +- r5ch->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), +- sizeof(r5ch->chunk_xor_md_buffers[0])); +- if (!r5ch->chunk_xor_md_buffers) { +- status = -ENOMEM; +- goto out; +- } +- +- r5ch->chunk_xor_bounce_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), +- sizeof(r5ch->chunk_xor_bounce_buffers[0])); +- if (!r5ch->chunk_xor_bounce_buffers) { +- status = -ENOMEM; +- goto out; +- } +- +- for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { +- status = posix_memalign(&r5ch->chunk_xor_bounce_buffers[i].iov_base, +- spdk_xor_get_optimal_alignment(), chunk_len); +- if (status) { +- goto out; +- } +- r5ch->chunk_xor_bounce_buffers[i].iov_len = chunk_len; +- } +-out: +- if (status) { +- SPDK_ERRLOG("Failed to initialize io channel\n"); +- raid5f_ioch_destroy(r5f_info, r5ch); +- } +- return status; +-} +- +-static int +-raid5f_start(struct raid_bdev *raid_bdev) +-{ +- uint64_t min_blockcnt = UINT64_MAX; +- struct raid_base_bdev_info *base_info; +- struct raid5f_info *r5f_info; +- size_t alignment; +- +- r5f_info = calloc(1, sizeof(*r5f_info)); +- if (!r5f_info) { +- SPDK_ERRLOG("Failed to allocate r5f_info\n"); +- return -ENOMEM; +- } +- r5f_info->raid_bdev = raid_bdev; +- +- alignment = spdk_xor_get_optimal_alignment(); +- RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { +- min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); +- alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev)); +- } +- +- r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size; +- r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev); +- r5f_info->buf_alignment = alignment; +- +- raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes; +- raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; +- raid_bdev->bdev.split_on_optimal_io_boundary = true; +- raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks; +- raid_bdev->bdev.split_on_write_unit = true; +- +- raid_bdev->module_private = r5f_info; +- +- spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy, +- sizeof(struct raid5f_io_channel), NULL); +- +- return 0; +-} +- +-static void +-raid5f_io_device_unregister_done(void *io_device) +-{ +- struct raid5f_info *r5f_info = io_device; +- +- raid_bdev_module_stop_done(r5f_info->raid_bdev); +- +- free(r5f_info); +-} +- +-static bool +-raid5f_stop(struct raid_bdev *raid_bdev) +-{ +- struct raid5f_info *r5f_info = raid_bdev->module_private; +- +- spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done); +- +- return false; +-} +- +-static struct spdk_io_channel * +-raid5f_get_io_channel(struct raid_bdev *raid_bdev) +-{ +- struct raid5f_info *r5f_info = raid_bdev->module_private; +- +- return spdk_get_io_channel(r5f_info); +-} +- +-static struct raid_bdev_module g_raid5f_module = { +- .level = RAID5F, +- .base_bdevs_min = 3, +- .base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1}, +- .start = raid5f_start, +- .stop = raid5f_stop, +- .submit_rw_request = raid5f_submit_rw_request, +- .get_io_channel = raid5f_get_io_channel, +-}; +-RAID_MODULE_REGISTER(&g_raid5f_module) +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. 
++ * All rights reserved. ++ */ ++ ++#include "bdev_raid.h" ++ ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/likely.h" ++#include "spdk/log.h" ++#include "spdk/xor.h" ++ ++/* Maximum concurrent full stripe writes per io channel */ ++#define RAID5F_MAX_STRIPES 32 ++ ++struct chunk { ++ /* Corresponds to base_bdev index */ ++ uint8_t index; ++ ++ /* Array of iovecs */ ++ struct iovec *iovs; ++ ++ /* Number of used iovecs */ ++ int iovcnt; ++ ++ /* Total number of available iovecs in the array */ ++ int iovcnt_max; ++ ++ /* Pointer to buffer with I/O metadata */ ++ void *md_buf; ++ ++ /* Shallow copy of IO request parameters */ ++ struct spdk_bdev_ext_io_opts ext_opts; ++}; ++ ++struct stripe_request { ++ struct raid5f_io_channel *r5ch; ++ ++ /* The associated raid_bdev_io */ ++ struct raid_bdev_io *raid_io; ++ ++ /* The stripe's index in the raid array. */ ++ uint64_t stripe_index; ++ ++ /* The stripe's parity chunk */ ++ struct chunk *parity_chunk; ++ ++ /* Buffer for stripe parity */ ++ void *parity_buf; ++ ++ /* Buffer for stripe io metadata parity */ ++ void *parity_md_buf; ++ ++ TAILQ_ENTRY(stripe_request) link; ++ ++ /* Array of chunks corresponding to base_bdevs */ ++ struct chunk chunks[0]; ++}; ++ ++struct raid5f_info { ++ /* The parent raid bdev */ ++ struct raid_bdev *raid_bdev; ++ ++ /* Number of data blocks in a stripe (without parity) */ ++ uint64_t stripe_blocks; ++ ++ /* Number of stripes on this array */ ++ uint64_t total_stripes; ++ ++ /* Alignment for buffer allocation */ ++ size_t buf_alignment; ++}; ++ ++struct raid5f_io_channel { ++ /* All available stripe requests on this channel */ ++ TAILQ_HEAD(, stripe_request) free_stripe_requests; ++ ++ /* Array of iovec iterators for each data chunk */ ++ struct iov_iter { ++ struct iovec *iovs; ++ int iovcnt; ++ int index; ++ size_t offset; ++ } *chunk_iov_iters; ++ ++ /* Array of source buffer pointers for parity calculation */ ++ void **chunk_xor_buffers; ++ ++ /* Array of source buffer pointers for parity calculation of io metadata */ ++ void **chunk_xor_md_buffers; ++ ++ /* Bounce buffers for parity calculation in case of unaligned source buffers */ ++ struct iovec *chunk_xor_bounce_buffers; ++}; ++ ++#define __CHUNK_IN_RANGE(req, c) \ ++ c < req->chunks + raid5f_ch_to_r5f_info(req->r5ch)->raid_bdev->num_base_bdevs ++ ++#define FOR_EACH_CHUNK_FROM(req, c, from) \ ++ for (c = from; __CHUNK_IN_RANGE(req, c); c++) ++ ++#define FOR_EACH_CHUNK(req, c) \ ++ FOR_EACH_CHUNK_FROM(req, c, req->chunks) ++ ++#define __NEXT_DATA_CHUNK(req, c) \ ++ c == req->parity_chunk ? 
c+1 : c ++ ++#define FOR_EACH_DATA_CHUNK(req, c) \ ++ for (c = __NEXT_DATA_CHUNK(req, req->chunks); __CHUNK_IN_RANGE(req, c); \ ++ c = __NEXT_DATA_CHUNK(req, c+1)) ++ ++static inline struct raid5f_info * ++raid5f_ch_to_r5f_info(struct raid5f_io_channel *r5ch) ++{ ++ return spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(r5ch)); ++} ++ ++static inline struct stripe_request * ++raid5f_chunk_stripe_req(struct chunk *chunk) ++{ ++ return SPDK_CONTAINEROF((chunk - chunk->index), struct stripe_request, chunks); ++} ++ ++static inline uint8_t ++raid5f_stripe_data_chunks_num(const struct raid_bdev *raid_bdev) ++{ ++ return raid_bdev->min_base_bdevs_operational; ++} ++ ++static inline uint8_t ++raid5f_stripe_parity_chunk_index(const struct raid_bdev *raid_bdev, uint64_t stripe_index) ++{ ++ return raid5f_stripe_data_chunks_num(raid_bdev) - stripe_index % raid_bdev->num_base_bdevs; ++} ++ ++static inline void ++raid5f_stripe_request_release(struct stripe_request *stripe_req) ++{ ++ TAILQ_INSERT_HEAD(&stripe_req->r5ch->free_stripe_requests, stripe_req, link); ++} ++ ++static int ++raid5f_xor_stripe(struct stripe_request *stripe_req) ++{ ++ struct raid_bdev_io *raid_io = stripe_req->raid_io; ++ struct raid5f_io_channel *r5ch = stripe_req->r5ch; ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ size_t remaining = raid_bdev->strip_size << raid_bdev->blocklen_shift; ++ uint8_t n_src = raid5f_stripe_data_chunks_num(raid_bdev); ++ void *dest = stripe_req->parity_buf; ++ size_t alignment_mask = spdk_xor_get_optimal_alignment() - 1; ++ void *raid_md = spdk_bdev_io_get_md_buf(bdev_io); ++ uint32_t raid_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); ++ struct chunk *chunk; ++ int ret; ++ uint8_t c; ++ ++ c = 0; ++ FOR_EACH_DATA_CHUNK(stripe_req, chunk) { ++ struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[c]; ++ bool aligned = true; ++ int i; ++ ++ for (i = 0; i < chunk->iovcnt; i++) { ++ if (((uintptr_t)chunk->iovs[i].iov_base & alignment_mask) || ++ (chunk->iovs[i].iov_len & alignment_mask)) { ++ aligned = false; ++ break; ++ } ++ } ++ ++ if (aligned) { ++ iov_iter->iovs = chunk->iovs; ++ iov_iter->iovcnt = chunk->iovcnt; ++ } else { ++ iov_iter->iovs = &r5ch->chunk_xor_bounce_buffers[c]; ++ iov_iter->iovcnt = 1; ++ spdk_iovcpy(chunk->iovs, chunk->iovcnt, iov_iter->iovs, iov_iter->iovcnt); ++ } ++ ++ iov_iter->index = 0; ++ iov_iter->offset = 0; ++ ++ c++; ++ } ++ ++ while (remaining > 0) { ++ size_t len = remaining; ++ uint8_t i; ++ ++ for (i = 0; i < n_src; i++) { ++ struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; ++ struct iovec *iov = &iov_iter->iovs[iov_iter->index]; ++ ++ len = spdk_min(len, iov->iov_len - iov_iter->offset); ++ r5ch->chunk_xor_buffers[i] = iov->iov_base + iov_iter->offset; ++ } ++ ++ assert(len > 0); ++ ++ ret = spdk_xor_gen(dest, r5ch->chunk_xor_buffers, n_src, len); ++ if (spdk_unlikely(ret)) { ++ SPDK_ERRLOG("stripe xor failed\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < n_src; i++) { ++ struct iov_iter *iov_iter = &r5ch->chunk_iov_iters[i]; ++ struct iovec *iov = &iov_iter->iovs[iov_iter->index]; ++ ++ iov_iter->offset += len; ++ if (iov_iter->offset == iov->iov_len) { ++ iov_iter->offset = 0; ++ iov_iter->index++; ++ } ++ } ++ dest += len; ++ ++ remaining -= len; ++ } ++ ++ if (raid_md != NULL) { ++ uint64_t len = raid_bdev->strip_size * raid_md_size; ++ c = 0; ++ FOR_EACH_DATA_CHUNK(stripe_req, chunk) { ++ r5ch->chunk_xor_md_buffers[c] = chunk->md_buf; ++ c++; ++ } ++ ret = 
spdk_xor_gen(stripe_req->parity_md_buf, r5ch->chunk_xor_md_buffers, n_src, len); ++ if (spdk_unlikely(ret)) { ++ SPDK_ERRLOG("stripe io metadata xor failed\n"); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++raid5f_chunk_write_complete(struct chunk *chunk, enum spdk_bdev_io_status status) ++{ ++ struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk); ++ ++ if (raid_bdev_io_complete_part(stripe_req->raid_io, 1, status)) { ++ raid5f_stripe_request_release(stripe_req); ++ } ++} ++ ++static void ++raid5f_chunk_write_complete_bdev_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct chunk *chunk = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ raid5f_chunk_write_complete(chunk, success ? SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req); ++ ++static void ++raid5f_chunk_write_retry(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ struct stripe_request *stripe_req = raid_io->module_private; ++ ++ raid5f_stripe_request_submit_chunks(stripe_req); ++} ++ ++static inline void ++copy_ext_io_opts(struct spdk_bdev_ext_io_opts *dst, struct spdk_bdev_ext_io_opts *src) ++{ ++ memset(dst, 0, sizeof(*dst)); ++ memcpy(dst, src, src->size); ++ dst->size = sizeof(*dst); ++} ++ ++static int ++raid5f_chunk_write(struct chunk *chunk) ++{ ++ struct stripe_request *stripe_req = raid5f_chunk_stripe_req(chunk); ++ struct raid_bdev_io *raid_io = stripe_req->raid_io; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk->index]; ++ struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk->index]; ++ uint64_t base_offset_blocks = (stripe_req->stripe_index << raid_bdev->strip_size_shift); ++ int ret; ++ ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ copy_ext_io_opts(&chunk->ext_opts, bdev_io->u.bdev.ext_opts); ++ chunk->ext_opts.metadata = chunk->md_buf; ++ ++ ret = spdk_bdev_writev_blocks_ext(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt, ++ base_offset_blocks, raid_bdev->strip_size, raid5f_chunk_write_complete_bdev_io, ++ chunk, &chunk->ext_opts); ++ } else { ++ ret = spdk_bdev_writev_blocks_with_md(base_info->desc, base_ch, chunk->iovs, chunk->iovcnt, ++ chunk->md_buf, base_offset_blocks, raid_bdev->strip_size, ++ raid5f_chunk_write_complete_bdev_io, chunk); ++ } ++ ++ if (spdk_unlikely(ret)) { ++ if (ret == -ENOMEM) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ raid5f_chunk_write_retry); ++ } else { ++ /* ++ * Implicitly complete any I/Os not yet submitted as FAILED. If completing ++ * these means there are no more to complete for the stripe request, we can ++ * release the stripe request as well. 
++ */ ++ uint64_t base_bdev_io_not_submitted = raid_bdev->num_base_bdevs - ++ raid_io->base_bdev_io_submitted; ++ ++ if (raid_bdev_io_complete_part(stripe_req->raid_io, base_bdev_io_not_submitted, ++ SPDK_BDEV_IO_STATUS_FAILED)) { ++ raid5f_stripe_request_release(stripe_req); ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static int ++raid5f_stripe_request_map_iovecs(struct stripe_request *stripe_req) ++{ ++ struct raid_bdev *raid_bdev = stripe_req->raid_io->raid_bdev; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(stripe_req->raid_io); ++ const struct iovec *raid_io_iovs = bdev_io->u.bdev.iovs; ++ int raid_io_iovcnt = bdev_io->u.bdev.iovcnt; ++ void *raid_io_md = spdk_bdev_io_get_md_buf(bdev_io); ++ uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); ++ struct chunk *chunk; ++ int raid_io_iov_idx = 0; ++ size_t raid_io_offset = 0; ++ size_t raid_io_iov_offset = 0; ++ int i; ++ ++ FOR_EACH_DATA_CHUNK(stripe_req, chunk) { ++ int chunk_iovcnt = 0; ++ uint64_t len = raid_bdev->strip_size << raid_bdev->blocklen_shift; ++ size_t off = raid_io_iov_offset; ++ ++ for (i = raid_io_iov_idx; i < raid_io_iovcnt; i++) { ++ chunk_iovcnt++; ++ off += raid_io_iovs[i].iov_len; ++ if (off >= raid_io_offset + len) { ++ break; ++ } ++ } ++ ++ assert(raid_io_iov_idx + chunk_iovcnt <= raid_io_iovcnt); ++ ++ if (chunk_iovcnt > chunk->iovcnt_max) { ++ struct iovec *iovs = chunk->iovs; ++ ++ iovs = realloc(iovs, chunk_iovcnt * sizeof(*iovs)); ++ if (!iovs) { ++ return -ENOMEM; ++ } ++ chunk->iovs = iovs; ++ chunk->iovcnt_max = chunk_iovcnt; ++ } ++ chunk->iovcnt = chunk_iovcnt; ++ ++ if (raid_io_md) { ++ chunk->md_buf = raid_io_md + ++ (raid_io_offset >> raid_bdev->blocklen_shift) * raid_io_md_size; ++ } ++ ++ for (i = 0; i < chunk_iovcnt; i++) { ++ struct iovec *chunk_iov = &chunk->iovs[i]; ++ const struct iovec *raid_io_iov = &raid_io_iovs[raid_io_iov_idx]; ++ size_t chunk_iov_offset = raid_io_offset - raid_io_iov_offset; ++ ++ chunk_iov->iov_base = raid_io_iov->iov_base + chunk_iov_offset; ++ chunk_iov->iov_len = spdk_min(len, raid_io_iov->iov_len - chunk_iov_offset); ++ raid_io_offset += chunk_iov->iov_len; ++ len -= chunk_iov->iov_len; ++ ++ if (raid_io_offset >= raid_io_iov_offset + raid_io_iov->iov_len) { ++ raid_io_iov_idx++; ++ raid_io_iov_offset += raid_io_iov->iov_len; ++ } ++ } ++ ++ if (spdk_unlikely(len > 0)) { ++ return -EINVAL; ++ } ++ } ++ ++ stripe_req->parity_chunk->iovs[0].iov_base = stripe_req->parity_buf; ++ stripe_req->parity_chunk->iovs[0].iov_len = raid_bdev->strip_size << ++ raid_bdev->blocklen_shift; ++ stripe_req->parity_chunk->md_buf = stripe_req->parity_md_buf; ++ stripe_req->parity_chunk->iovcnt = 1; ++ ++ return 0; ++} ++ ++static void ++raid5f_stripe_request_submit_chunks(struct stripe_request *stripe_req) ++{ ++ struct raid_bdev_io *raid_io = stripe_req->raid_io; ++ struct chunk *start = &stripe_req->chunks[raid_io->base_bdev_io_submitted]; ++ struct chunk *chunk; ++ ++ FOR_EACH_CHUNK_FROM(stripe_req, chunk, start) { ++ if (spdk_unlikely(raid5f_chunk_write(chunk) != 0)) { ++ break; ++ } ++ raid_io->base_bdev_io_submitted++; ++ } ++} ++ ++static void ++raid5f_submit_stripe_request(struct stripe_request *stripe_req) ++{ ++ if (spdk_unlikely(raid5f_xor_stripe(stripe_req) != 0)) { ++ raid_bdev_io_complete(stripe_req->raid_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ raid5f_stripe_request_submit_chunks(stripe_req); ++} ++ ++static int ++raid5f_submit_write_request(struct raid_bdev_io *raid_io, uint64_t stripe_index) ++{ ++ struct raid_bdev *raid_bdev = 
raid_io->raid_bdev; ++ struct raid5f_io_channel *r5ch = spdk_io_channel_get_ctx(raid_io->raid_ch->module_channel); ++ struct stripe_request *stripe_req; ++ int ret; ++ ++ stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests); ++ if (!stripe_req) { ++ return -ENOMEM; ++ } ++ ++ stripe_req->stripe_index = stripe_index; ++ stripe_req->parity_chunk = stripe_req->chunks + raid5f_stripe_parity_chunk_index(raid_bdev, ++ stripe_req->stripe_index); ++ stripe_req->raid_io = raid_io; ++ ++ ret = raid5f_stripe_request_map_iovecs(stripe_req); ++ if (spdk_unlikely(ret)) { ++ return ret; ++ } ++ ++ TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); ++ ++ raid_io->module_private = stripe_req; ++ raid_io->base_bdev_io_remaining = raid_bdev->num_base_bdevs; ++ ++ raid5f_submit_stripe_request(stripe_req); ++ ++ return 0; ++} ++ ++static void ++raid5f_chunk_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct raid_bdev_io *raid_io = cb_arg; ++ ++ spdk_bdev_free_io(bdev_io); ++ ++ raid_bdev_io_complete(raid_io, success ? SPDK_BDEV_IO_STATUS_SUCCESS : ++ SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void raid5f_submit_rw_request(struct raid_bdev_io *raid_io); ++ ++static void ++_raid5f_submit_rw_request(void *_raid_io) ++{ ++ struct raid_bdev_io *raid_io = _raid_io; ++ ++ raid5f_submit_rw_request(raid_io); ++} ++ ++static int ++raid5f_submit_read_request(struct raid_bdev_io *raid_io, uint64_t stripe_index, ++ uint64_t stripe_offset) ++{ ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ uint8_t chunk_data_idx = stripe_offset >> raid_bdev->strip_size_shift; ++ uint8_t p_idx = raid5f_stripe_parity_chunk_index(raid_bdev, stripe_index); ++ uint8_t chunk_idx = chunk_data_idx < p_idx ? chunk_data_idx : chunk_data_idx + 1; ++ struct raid_base_bdev_info *base_info = &raid_bdev->base_bdev_info[chunk_idx]; ++ struct spdk_io_channel *base_ch = raid_io->raid_ch->base_channel[chunk_idx]; ++ uint64_t chunk_offset = stripe_offset - (chunk_data_idx << raid_bdev->strip_size_shift); ++ uint64_t base_offset_blocks = (stripe_index << raid_bdev->strip_size_shift) + chunk_offset; ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ int ret; ++ ++ if (bdev_io->u.bdev.ext_opts != NULL) { ++ ret = spdk_bdev_readv_blocks_ext(base_info->desc, base_ch, bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ base_offset_blocks, bdev_io->u.bdev.num_blocks, raid5f_chunk_read_complete, raid_io, ++ bdev_io->u.bdev.ext_opts); ++ } else { ++ ret = spdk_bdev_readv_blocks_with_md(base_info->desc, base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ base_offset_blocks, bdev_io->u.bdev.num_blocks, ++ raid5f_chunk_read_complete, raid_io); ++ } ++ ++ if (spdk_unlikely(ret == -ENOMEM)) { ++ raid_bdev_queue_io_wait(raid_io, base_info->bdev, base_ch, ++ _raid5f_submit_rw_request); ++ return 0; ++ } ++ ++ return ret; ++} ++ ++static void ++raid5f_submit_rw_request(struct raid_bdev_io *raid_io) ++{ ++ struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(raid_io); ++ struct raid_bdev *raid_bdev = raid_io->raid_bdev; ++ struct raid5f_info *r5f_info = raid_bdev->module_private; ++ uint64_t offset_blocks = bdev_io->u.bdev.offset_blocks; ++ uint64_t stripe_index = offset_blocks / r5f_info->stripe_blocks; ++ uint64_t stripe_offset = offset_blocks % r5f_info->stripe_blocks; ++ int ret; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ assert(bdev_io->u.bdev.num_blocks <= raid_bdev->strip_size); ++ ret = raid5f_submit_read_request(raid_io, stripe_index, 
stripe_offset); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ assert(stripe_offset == 0); ++ assert(bdev_io->u.bdev.num_blocks == r5f_info->stripe_blocks); ++ ret = raid5f_submit_write_request(raid_io, stripe_index); ++ break; ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (spdk_unlikely(ret)) { ++ raid_bdev_io_complete(raid_io, ret == -ENOMEM ? SPDK_BDEV_IO_STATUS_NOMEM : ++ SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++raid5f_stripe_request_free(struct stripe_request *stripe_req) ++{ ++ struct chunk *chunk; ++ ++ FOR_EACH_CHUNK(stripe_req, chunk) { ++ free(chunk->iovs); ++ } ++ ++ spdk_dma_free(stripe_req->parity_buf); ++ spdk_dma_free(stripe_req->parity_md_buf); ++ ++ free(stripe_req); ++} ++ ++static struct stripe_request * ++raid5f_stripe_request_alloc(struct raid5f_io_channel *r5ch) ++{ ++ struct raid5f_info *r5f_info = raid5f_ch_to_r5f_info(r5ch); ++ struct raid_bdev *raid_bdev = r5f_info->raid_bdev; ++ uint32_t raid_io_md_size = spdk_bdev_get_md_size(&raid_bdev->bdev); ++ struct stripe_request *stripe_req; ++ struct chunk *chunk; ++ ++ stripe_req = calloc(1, sizeof(*stripe_req) + ++ sizeof(struct chunk) * raid_bdev->num_base_bdevs); ++ if (!stripe_req) { ++ return NULL; ++ } ++ ++ stripe_req->r5ch = r5ch; ++ ++ FOR_EACH_CHUNK(stripe_req, chunk) { ++ chunk->index = chunk - stripe_req->chunks; ++ chunk->iovcnt_max = 4; ++ chunk->iovs = calloc(chunk->iovcnt_max, sizeof(chunk->iovs[0])); ++ if (!chunk->iovs) { ++ goto err; ++ } ++ } ++ ++ stripe_req->parity_buf = spdk_dma_malloc(raid_bdev->strip_size << raid_bdev->blocklen_shift, ++ r5f_info->buf_alignment, NULL); ++ if (!stripe_req->parity_buf) { ++ goto err; ++ } ++ ++ if (raid_io_md_size != 0) { ++ stripe_req->parity_md_buf = spdk_dma_malloc(raid_bdev->strip_size * raid_io_md_size, ++ r5f_info->buf_alignment, NULL); ++ if (!stripe_req->parity_md_buf) { ++ goto err; ++ } ++ } ++ ++ return stripe_req; ++err: ++ raid5f_stripe_request_free(stripe_req); ++ return NULL; ++} ++ ++static void ++raid5f_ioch_destroy(void *io_device, void *ctx_buf) ++{ ++ struct raid5f_io_channel *r5ch = ctx_buf; ++ struct raid5f_info *r5f_info = io_device; ++ struct raid_bdev *raid_bdev = r5f_info->raid_bdev; ++ struct stripe_request *stripe_req; ++ int i; ++ ++ while ((stripe_req = TAILQ_FIRST(&r5ch->free_stripe_requests))) { ++ TAILQ_REMOVE(&r5ch->free_stripe_requests, stripe_req, link); ++ raid5f_stripe_request_free(stripe_req); ++ } ++ ++ if (r5ch->chunk_xor_bounce_buffers) { ++ for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { ++ free(r5ch->chunk_xor_bounce_buffers[i].iov_base); ++ } ++ free(r5ch->chunk_xor_bounce_buffers); ++ } ++ ++ free(r5ch->chunk_xor_buffers); ++ free(r5ch->chunk_xor_md_buffers); ++ free(r5ch->chunk_iov_iters); ++} ++ ++static int ++raid5f_ioch_create(void *io_device, void *ctx_buf) ++{ ++ struct raid5f_io_channel *r5ch = ctx_buf; ++ struct raid5f_info *r5f_info = io_device; ++ struct raid_bdev *raid_bdev = r5f_info->raid_bdev; ++ size_t chunk_len = raid_bdev->strip_size << raid_bdev->blocklen_shift; ++ int status = 0; ++ int i; ++ ++ TAILQ_INIT(&r5ch->free_stripe_requests); ++ ++ for (i = 0; i < RAID5F_MAX_STRIPES; i++) { ++ struct stripe_request *stripe_req; ++ ++ stripe_req = raid5f_stripe_request_alloc(r5ch); ++ if (!stripe_req) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ TAILQ_INSERT_HEAD(&r5ch->free_stripe_requests, stripe_req, link); ++ } ++ ++ r5ch->chunk_iov_iters = calloc(raid5f_stripe_data_chunks_num(raid_bdev), ++ sizeof(r5ch->chunk_iov_iters[0])); ++ if (!r5ch->chunk_iov_iters) { ++ 
status = -ENOMEM; ++ goto out; ++ } ++ ++ r5ch->chunk_xor_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), ++ sizeof(r5ch->chunk_xor_buffers[0])); ++ if (!r5ch->chunk_xor_buffers) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ r5ch->chunk_xor_md_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), ++ sizeof(r5ch->chunk_xor_md_buffers[0])); ++ if (!r5ch->chunk_xor_md_buffers) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ r5ch->chunk_xor_bounce_buffers = calloc(raid5f_stripe_data_chunks_num(raid_bdev), ++ sizeof(r5ch->chunk_xor_bounce_buffers[0])); ++ if (!r5ch->chunk_xor_bounce_buffers) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < raid5f_stripe_data_chunks_num(raid_bdev); i++) { ++ status = posix_memalign(&r5ch->chunk_xor_bounce_buffers[i].iov_base, ++ spdk_xor_get_optimal_alignment(), chunk_len); ++ if (status) { ++ goto out; ++ } ++ r5ch->chunk_xor_bounce_buffers[i].iov_len = chunk_len; ++ } ++out: ++ if (status) { ++ SPDK_ERRLOG("Failed to initialize io channel\n"); ++ raid5f_ioch_destroy(r5f_info, r5ch); ++ } ++ return status; ++} ++ ++static int ++raid5f_start(struct raid_bdev *raid_bdev) ++{ ++ uint64_t min_blockcnt = UINT64_MAX; ++ struct raid_base_bdev_info *base_info; ++ struct raid5f_info *r5f_info; ++ size_t alignment; ++ ++ r5f_info = calloc(1, sizeof(*r5f_info)); ++ if (!r5f_info) { ++ SPDK_ERRLOG("Failed to allocate r5f_info\n"); ++ return -ENOMEM; ++ } ++ r5f_info->raid_bdev = raid_bdev; ++ ++ alignment = spdk_xor_get_optimal_alignment(); ++ RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) { ++ min_blockcnt = spdk_min(min_blockcnt, base_info->bdev->blockcnt); ++ alignment = spdk_max(alignment, spdk_bdev_get_buf_align(base_info->bdev)); ++ } ++ ++ r5f_info->total_stripes = min_blockcnt / raid_bdev->strip_size; ++ r5f_info->stripe_blocks = raid_bdev->strip_size * raid5f_stripe_data_chunks_num(raid_bdev); ++ r5f_info->buf_alignment = alignment; ++ ++ raid_bdev->bdev.blockcnt = r5f_info->stripe_blocks * r5f_info->total_stripes; ++ raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size; ++ raid_bdev->bdev.split_on_optimal_io_boundary = true; ++ raid_bdev->bdev.write_unit_size = r5f_info->stripe_blocks; ++ raid_bdev->bdev.split_on_write_unit = true; ++ ++ raid_bdev->module_private = r5f_info; ++ ++ spdk_io_device_register(r5f_info, raid5f_ioch_create, raid5f_ioch_destroy, ++ sizeof(struct raid5f_io_channel), NULL); ++ ++ return 0; ++} ++ ++static void ++raid5f_io_device_unregister_done(void *io_device) ++{ ++ struct raid5f_info *r5f_info = io_device; ++ ++ raid_bdev_module_stop_done(r5f_info->raid_bdev); ++ ++ free(r5f_info); ++} ++ ++static bool ++raid5f_stop(struct raid_bdev *raid_bdev) ++{ ++ struct raid5f_info *r5f_info = raid_bdev->module_private; ++ ++ spdk_io_device_unregister(r5f_info, raid5f_io_device_unregister_done); ++ ++ return false; ++} ++ ++static struct spdk_io_channel * ++raid5f_get_io_channel(struct raid_bdev *raid_bdev) ++{ ++ struct raid5f_info *r5f_info = raid_bdev->module_private; ++ ++ return spdk_get_io_channel(r5f_info); ++} ++ ++static struct raid_bdev_module g_raid5f_module = { ++ .level = RAID5F, ++ .base_bdevs_min = 3, ++ .base_bdevs_constraint = {CONSTRAINT_MAX_BASE_BDEVS_REMOVED, 1}, ++ .start = raid5f_start, ++ .stop = raid5f_stop, ++ .submit_rw_request = raid5f_submit_rw_request, ++ .get_io_channel = raid5f_get_io_channel, ++}; ++RAID_MODULE_REGISTER(&g_raid5f_module) ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_raid5f) +diff --git a/module/bdev/rbd/Makefile b/module/bdev/rbd/Makefile +index b74c967..d24eb86 100644 
+--- a/module/bdev/rbd/Makefile ++++ b/module/bdev/rbd/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 5 +-SO_MINOR := 0 +- +-C_SRCS = bdev_rbd.c bdev_rbd_rpc.c +-LIBNAME = bdev_rbd +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 5 ++SO_MINOR := 0 ++ ++C_SRCS = bdev_rbd.c bdev_rbd_rpc.c ++LIBNAME = bdev_rbd ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/rbd/bdev_rbd.c b/module/bdev/rbd/bdev_rbd.c +index 30a500d..c8e119a 100644 +--- a/module/bdev/rbd/bdev_rbd.c ++++ b/module/bdev/rbd/bdev_rbd.c +@@ -1,1358 +1,1358 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "bdev_rbd.h" +- +-#include +-#include +- +-#include "spdk/env.h" +-#include "spdk/bdev.h" +-#include "spdk/thread.h" +-#include "spdk/json.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/likely.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +- +-static int bdev_rbd_count = 0; +- +-struct bdev_rbd { +- struct spdk_bdev disk; +- char *rbd_name; +- char *user_id; +- char *pool_name; +- char **config; +- +- rados_t cluster; +- rados_t *cluster_p; +- char *cluster_name; +- +- rados_ioctx_t io_ctx; +- rbd_image_t image; +- +- rbd_image_info_t info; +- pthread_mutex_t mutex; +- struct spdk_thread *main_td; +- struct spdk_thread *destruct_td; +- uint32_t ch_count; +- struct spdk_io_channel *group_ch; +- +- TAILQ_ENTRY(bdev_rbd) tailq; +- struct spdk_poller *reset_timer; +- struct spdk_bdev_io *reset_bdev_io; +-}; +- +-struct bdev_rbd_io_channel { +- struct bdev_rbd *disk; +-}; +- +-struct bdev_rbd_io { +- struct spdk_thread *submit_td; +- enum spdk_bdev_io_status status; +- rbd_completion_t comp; +- size_t total_len; +-}; +- +-struct bdev_rbd_cluster { +- char *name; +- char *user_id; +- char **config_param; +- char *config_file; +- char *key_file; +- rados_t cluster; +- uint32_t ref; +- STAILQ_ENTRY(bdev_rbd_cluster) link; +-}; +- +-static STAILQ_HEAD(, bdev_rbd_cluster) g_map_bdev_rbd_cluster = STAILQ_HEAD_INITIALIZER( +- g_map_bdev_rbd_cluster); +-static pthread_mutex_t g_map_bdev_rbd_cluster_mutex = PTHREAD_MUTEX_INITIALIZER; +- +-static void +-bdev_rbd_cluster_free(struct bdev_rbd_cluster *entry) +-{ +- assert(entry != NULL); +- +- bdev_rbd_free_config(entry->config_param); +- free(entry->config_file); +- free(entry->key_file); +- free(entry->user_id); +- free(entry->name); +- free(entry); +-} +- +-static void +-bdev_rbd_put_cluster(rados_t **cluster) +-{ +- struct bdev_rbd_cluster *entry; +- +- assert(cluster != NULL); +- +- /* No need go through the map if *cluster equals to NULL */ +- if (*cluster == NULL) { +- return; +- } +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (*cluster != &entry->cluster) { +- continue; +- } +- +- assert(entry->ref > 0); +- entry->ref--; +- *cluster = NULL; +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return; +- } 
+- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- SPDK_ERRLOG("Cannot find the entry for cluster=%p\n", cluster); +-} +- +-static void +-bdev_rbd_free(struct bdev_rbd *rbd) +-{ +- if (!rbd) { +- return; +- } +- +- free(rbd->disk.name); +- free(rbd->rbd_name); +- free(rbd->user_id); +- free(rbd->pool_name); +- bdev_rbd_free_config(rbd->config); +- +- if (rbd->io_ctx) { +- rados_ioctx_destroy(rbd->io_ctx); +- } +- +- if (rbd->cluster_name) { +- bdev_rbd_put_cluster(&rbd->cluster_p); +- free(rbd->cluster_name); +- } else if (rbd->cluster) { +- rados_shutdown(rbd->cluster); +- } +- +- pthread_mutex_destroy(&rbd->mutex); +- free(rbd); +-} +- +-void +-bdev_rbd_free_config(char **config) +-{ +- char **entry; +- +- if (config) { +- for (entry = config; *entry; entry++) { +- free(*entry); +- } +- free(config); +- } +-} +- +-char ** +-bdev_rbd_dup_config(const char *const *config) +-{ +- size_t count; +- char **copy; +- +- if (!config) { +- return NULL; +- } +- for (count = 0; config[count]; count++) {} +- copy = calloc(count + 1, sizeof(*copy)); +- if (!copy) { +- return NULL; +- } +- for (count = 0; config[count]; count++) { +- if (!(copy[count] = strdup(config[count]))) { +- bdev_rbd_free_config(copy); +- return NULL; +- } +- } +- return copy; +-} +- +-static int +-bdev_rados_cluster_init(const char *user_id, const char *const *config, +- rados_t *cluster) +-{ +- int ret; +- +- ret = rados_create(cluster, user_id); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to create rados_t struct\n"); +- return -1; +- } +- +- if (config) { +- const char *const *entry = config; +- while (*entry) { +- ret = rados_conf_set(*cluster, entry[0], entry[1]); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to set %s = %s\n", entry[0], entry[1]); +- rados_shutdown(*cluster); +- return -1; +- } +- entry += 2; +- } +- } else { +- ret = rados_conf_read_file(*cluster, NULL); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to read conf file\n"); +- rados_shutdown(*cluster); +- return -1; +- } +- } +- +- ret = rados_connect(*cluster); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to connect to rbd_pool\n"); +- rados_shutdown(*cluster); +- return -1; +- } +- +- return 0; +-} +- +-static int +-bdev_rbd_get_cluster(const char *cluster_name, rados_t **cluster) +-{ +- struct bdev_rbd_cluster *entry; +- +- if (cluster == NULL) { +- SPDK_ERRLOG("cluster should not be NULL\n"); +- return -1; +- } +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (strcmp(cluster_name, entry->name) == 0) { +- entry->ref++; +- *cluster = &entry->cluster; +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return 0; +- } +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -1; +-} +- +-static int +-bdev_rbd_shared_cluster_init(const char *cluster_name, rados_t **cluster) +-{ +- int ret; +- +- ret = bdev_rbd_get_cluster(cluster_name, cluster); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to create rados_t struct\n"); +- return -1; +- } +- +- return ret; +-} +- +-static void * +-bdev_rbd_cluster_handle(void *arg) +-{ +- void *ret = arg; +- struct bdev_rbd *rbd = arg; +- int rc; +- +- rc = bdev_rados_cluster_init(rbd->user_id, (const char *const *)rbd->config, +- &rbd->cluster); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to create rados cluster for user_id=%s and rbd_pool=%s\n", +- rbd->user_id ? 
rbd->user_id : "admin (the default)", rbd->pool_name); +- ret = NULL; +- } +- +- return ret; +-} +- +-static void * +-bdev_rbd_init_context(void *arg) +-{ +- struct bdev_rbd *rbd = arg; +- int rc; +- +- if (rados_ioctx_create(*(rbd->cluster_p), rbd->pool_name, &rbd->io_ctx) < 0) { +- SPDK_ERRLOG("Failed to create ioctx on rbd=%p\n", rbd); +- return NULL; +- } +- +- rc = rbd_open(rbd->io_ctx, rbd->rbd_name, &rbd->image, NULL); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to open specified rbd device\n"); +- return NULL; +- } +- +- rc = rbd_stat(rbd->image, &rbd->info, sizeof(rbd->info)); +- rbd_close(rbd->image); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to stat specified rbd device\n"); +- return NULL; +- } +- +- return arg; +-} +- +-static int +-bdev_rbd_init(struct bdev_rbd *rbd) +-{ +- int ret = 0; +- +- if (!rbd->cluster_name) { +- rbd->cluster_p = &rbd->cluster; +- /* Cluster should be created in non-SPDK thread to avoid conflict between +- * Rados and SPDK thread */ +- if (spdk_call_unaffinitized(bdev_rbd_cluster_handle, rbd) == NULL) { +- SPDK_ERRLOG("Cannot create the rados object on rbd=%p\n", rbd); +- return -1; +- } +- } else { +- ret = bdev_rbd_shared_cluster_init(rbd->cluster_name, &rbd->cluster_p); +- if (ret < 0) { +- SPDK_ERRLOG("Failed to create rados object for rbd =%p on cluster_name=%s\n", +- rbd, rbd->cluster_name); +- return -1; +- } +- } +- +- if (spdk_call_unaffinitized(bdev_rbd_init_context, rbd) == NULL) { +- SPDK_ERRLOG("Cannot init rbd context for rbd=%p\n", rbd); +- return -1; +- } +- +- return ret; +-} +- +-static void +-bdev_rbd_exit(rbd_image_t image) +-{ +- rbd_flush(image); +- rbd_close(image); +-} +- +-static void +-_bdev_rbd_io_complete(void *_rbd_io) +-{ +- struct bdev_rbd_io *rbd_io = _rbd_io; +- +- spdk_bdev_io_complete(spdk_bdev_io_from_ctx(rbd_io), rbd_io->status); +-} +- +-static void +-bdev_rbd_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +-{ +- struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; +- struct spdk_thread *current_thread = spdk_get_thread(); +- +- rbd_io->status = status; +- assert(rbd_io->submit_td != NULL); +- if (rbd_io->submit_td != current_thread) { +- spdk_thread_send_msg(rbd_io->submit_td, _bdev_rbd_io_complete, rbd_io); +- } else { +- _bdev_rbd_io_complete(rbd_io); +- } +-} +- +-static void +-bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) +-{ +- int io_status; +- struct spdk_bdev_io *bdev_io; +- struct bdev_rbd_io *rbd_io; +- enum spdk_bdev_io_status bio_status; +- +- bdev_io = rbd_aio_get_arg(cb); +- rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; +- io_status = rbd_aio_get_return_value(cb); +- bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- if ((int)rbd_io->total_len != io_status) { +- bio_status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- } else { +- /* For others, 0 means success */ +- if (io_status != 0) { +- bio_status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- } +- +- rbd_aio_release(cb); +- +- bdev_rbd_io_complete(bdev_io, bio_status); +-} +- +-static void +-_bdev_rbd_start_aio(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io, +- struct iovec *iov, int iovcnt, uint64_t offset, size_t len) +-{ +- int ret; +- struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; +- rbd_image_t image = disk->image; +- +- ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, +- &rbd_io->comp); +- if (ret < 0) { +- goto err; +- } +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- rbd_io->total_len = len; +- if 
(spdk_likely(iovcnt == 1)) { +- ret = rbd_aio_read(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); +- } else { +- ret = rbd_aio_readv(image, iov, iovcnt, offset, rbd_io->comp); +- } +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- if (spdk_likely(iovcnt == 1)) { +- ret = rbd_aio_write(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); +- } else { +- ret = rbd_aio_writev(image, iov, iovcnt, offset, rbd_io->comp); +- } +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { +- ret = rbd_aio_discard(image, offset, len, rbd_io->comp); +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { +- ret = rbd_aio_flush(image, rbd_io->comp); +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) { +- ret = rbd_aio_write_zeroes(image, offset, len, rbd_io->comp, /* zero_flags */ 0, /* op_flags */ 0); +- } +- +- if (ret < 0) { +- rbd_aio_release(rbd_io->comp); +- goto err; +- } +- +- return; +- +-err: +- bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void +-bdev_rbd_start_aio(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; +- +- _bdev_rbd_start_aio(disk, +- bdev_io, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +-} +- +-static int bdev_rbd_library_init(void); +-static void bdev_rbd_library_fini(void); +- +-static int +-bdev_rbd_get_ctx_size(void) +-{ +- return sizeof(struct bdev_rbd_io); +-} +- +-static struct spdk_bdev_module rbd_if = { +- .name = "rbd", +- .module_init = bdev_rbd_library_init, +- .module_fini = bdev_rbd_library_fini, +- .get_ctx_size = bdev_rbd_get_ctx_size, +- +-}; +-SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if) +- +-static int bdev_rbd_reset_timer(void *arg); +- +-static void +-bdev_rbd_check_outstanding_ios(struct spdk_bdev *bdev, uint64_t current_qd, +- void *cb_arg, int rc) +-{ +- struct bdev_rbd *disk = cb_arg; +- enum spdk_bdev_io_status bio_status; +- +- if (rc == 0 && current_qd > 0) { +- disk->reset_timer = SPDK_POLLER_REGISTER(bdev_rbd_reset_timer, disk, 1000); +- return; +- } +- +- if (rc != 0) { +- bio_status = SPDK_BDEV_IO_STATUS_FAILED; +- } else { +- bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } +- +- bdev_rbd_io_complete(disk->reset_bdev_io, bio_status); +- disk->reset_bdev_io = NULL; +-} +- +-static int +-bdev_rbd_reset_timer(void *arg) +-{ +- struct bdev_rbd *disk = arg; +- +- spdk_poller_unregister(&disk->reset_timer); +- +- spdk_bdev_get_current_qd(&disk->disk, bdev_rbd_check_outstanding_ios, disk); +- +- return SPDK_POLLER_BUSY; +-} +- +-static void +-bdev_rbd_reset(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; +- +- /* +- * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a +- * poller to wait for in-flight I/O to complete. +- */ +- assert(disk->reset_bdev_io == NULL); +- disk->reset_bdev_io = bdev_io; +- +- bdev_rbd_reset_timer(disk); +-} +- +-static void +-_bdev_rbd_destruct_done(void *io_device) +-{ +- struct bdev_rbd *rbd = io_device; +- +- assert(rbd != NULL); +- assert(rbd->ch_count == 0); +- +- spdk_bdev_destruct_done(&rbd->disk, 0); +- bdev_rbd_free(rbd); +-} +- +-static void +-bdev_rbd_free_cb(void *io_device) +-{ +- struct bdev_rbd *rbd = io_device; +- +- /* The io device has been unregistered. 
Send a message back to the +- * original thread that started the destruct operation, so that the +- * bdev unregister callback is invoked on the same thread that started +- * this whole process. +- */ +- spdk_thread_send_msg(rbd->destruct_td, _bdev_rbd_destruct_done, rbd); +-} +- +-static void +-_bdev_rbd_destruct(void *ctx) +-{ +- struct bdev_rbd *rbd = ctx; +- +- spdk_io_device_unregister(rbd, bdev_rbd_free_cb); +-} +- +-static int +-bdev_rbd_destruct(void *ctx) +-{ +- struct bdev_rbd *rbd = ctx; +- struct spdk_thread *td; +- +- if (rbd->main_td == NULL) { +- td = spdk_get_thread(); +- } else { +- td = rbd->main_td; +- } +- +- /* Start the destruct operation on the rbd bdev's +- * main thread. This guarantees it will only start +- * executing after any messages related to channel +- * deletions have finished completing. *Always* +- * send a message, even if this function gets called +- * from the main thread, in case there are pending +- * channel delete messages in flight to this thread. +- */ +- assert(rbd->destruct_td == NULL); +- rbd->destruct_td = td; +- spdk_thread_send_msg(td, _bdev_rbd_destruct, rbd); +- +- /* Return 1 to indicate the destruct path is asynchronous. */ +- return 1; +-} +- +-static void +-bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; +- +- if (!success) { +- bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- spdk_thread_exec_msg(disk->main_td, bdev_rbd_start_aio, bdev_io); +-} +- +-static void +-bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct spdk_thread *submit_td = spdk_io_channel_get_thread(ch); +- struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; +- struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; +- +- rbd_io->submit_td = submit_td; +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- spdk_thread_exec_msg(disk->main_td, bdev_rbd_start_aio, bdev_io); +- break; +- +- case SPDK_BDEV_IO_TYPE_RESET: +- spdk_thread_exec_msg(disk->main_td, bdev_rbd_reset, bdev_io); +- break; +- +- default: +- SPDK_ERRLOG("Unsupported IO type =%d\n", bdev_io->type); +- bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- break; +- } +-} +- +-static bool +-bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_UNMAP: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_RESET: +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- return true; +- +- default: +- return false; +- } +-} +- +-static void +-bdev_rbd_free_channel_resources(struct bdev_rbd *disk) +-{ +- assert(disk != NULL); +- assert(disk->main_td == spdk_get_thread()); +- assert(disk->ch_count == 0); +- +- spdk_put_io_channel(disk->group_ch); +- if (disk->image) { +- bdev_rbd_exit(disk->image); +- } +- +- disk->main_td = NULL; +- disk->group_ch = NULL; +-} +- +-static void * +-bdev_rbd_handle(void *arg) +-{ +- struct bdev_rbd *disk = arg; +- void *ret = arg; +- +- if (rbd_open(disk->io_ctx, disk->rbd_name, &disk->image, NULL) < 0) { +- SPDK_ERRLOG("Failed to open specified rbd device\n"); +- 
ret = NULL; +- } +- +- return ret; +-} +- +-static int +-_bdev_rbd_create_cb(struct bdev_rbd *disk) +-{ +- disk->group_ch = spdk_get_io_channel(&rbd_if); +- assert(disk->group_ch != NULL); +- +- if (spdk_call_unaffinitized(bdev_rbd_handle, disk) == NULL) { +- bdev_rbd_free_channel_resources(disk); +- return -1; +- } +- +- return 0; +-} +- +-static int +-bdev_rbd_create_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_rbd_io_channel *ch = ctx_buf; +- struct bdev_rbd *disk = io_device; +- int rc; +- +- ch->disk = disk; +- pthread_mutex_lock(&disk->mutex); +- if (disk->ch_count == 0) { +- assert(disk->main_td == NULL); +- rc = _bdev_rbd_create_cb(disk); +- if (rc) { +- SPDK_ERRLOG("Cannot create channel for disk=%p\n", disk); +- pthread_mutex_unlock(&disk->mutex); +- return rc; +- } +- +- disk->main_td = spdk_get_thread(); +- } +- +- disk->ch_count++; +- pthread_mutex_unlock(&disk->mutex); +- +- return 0; +-} +- +-static void +-_bdev_rbd_destroy_cb(void *ctx) +-{ +- struct bdev_rbd *disk = ctx; +- +- pthread_mutex_lock(&disk->mutex); +- assert(disk->ch_count > 0); +- disk->ch_count--; +- +- if (disk->ch_count > 0) { +- /* A new channel was created between when message was sent and this function executed */ +- pthread_mutex_unlock(&disk->mutex); +- return; +- } +- +- bdev_rbd_free_channel_resources(disk); +- pthread_mutex_unlock(&disk->mutex); +-} +- +-static void +-bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_rbd *disk = io_device; +- struct spdk_thread *thread; +- +- pthread_mutex_lock(&disk->mutex); +- assert(disk->ch_count > 0); +- disk->ch_count--; +- if (disk->ch_count == 0) { +- assert(disk->main_td != NULL); +- if (disk->main_td != spdk_get_thread()) { +- /* The final channel was destroyed on a different thread +- * than where the first channel was created. Pass a message +- * to the main thread to unregister the poller. 
*/ +- disk->ch_count++; +- thread = disk->main_td; +- pthread_mutex_unlock(&disk->mutex); +- spdk_thread_send_msg(thread, _bdev_rbd_destroy_cb, disk); +- return; +- } +- +- bdev_rbd_free_channel_resources(disk); +- } +- pthread_mutex_unlock(&disk->mutex); +-} +- +-static struct spdk_io_channel * +-bdev_rbd_get_io_channel(void *ctx) +-{ +- struct bdev_rbd *rbd_bdev = ctx; +- +- return spdk_get_io_channel(rbd_bdev); +-} +- +-static void +-bdev_rbd_cluster_dump_entry(const char *cluster_name, struct spdk_json_write_ctx *w) +-{ +- struct bdev_rbd_cluster *entry; +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (strcmp(cluster_name, entry->name)) { +- continue; +- } +- if (entry->user_id) { +- spdk_json_write_named_string(w, "user_id", entry->user_id); +- } +- +- if (entry->config_param) { +- char **config_entry = entry->config_param; +- +- spdk_json_write_named_object_begin(w, "config_param"); +- while (*config_entry) { +- spdk_json_write_named_string(w, config_entry[0], config_entry[1]); +- config_entry += 2; +- } +- spdk_json_write_object_end(w); +- } +- if (entry->config_file) { +- spdk_json_write_named_string(w, "config_file", entry->config_file); +- } +- if (entry->key_file) { +- spdk_json_write_named_string(w, "key_file", entry->key_file); +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return; +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +-} +- +-static int +-bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct bdev_rbd *rbd_bdev = ctx; +- +- spdk_json_write_named_object_begin(w, "rbd"); +- +- spdk_json_write_named_string(w, "pool_name", rbd_bdev->pool_name); +- +- spdk_json_write_named_string(w, "rbd_name", rbd_bdev->rbd_name); +- +- if (rbd_bdev->cluster_name) { +- bdev_rbd_cluster_dump_entry(rbd_bdev->cluster_name, w); +- goto end; +- } +- +- if (rbd_bdev->user_id) { +- spdk_json_write_named_string(w, "user_id", rbd_bdev->user_id); +- } +- +- if (rbd_bdev->config) { +- char **entry = rbd_bdev->config; +- +- spdk_json_write_named_object_begin(w, "config"); +- while (*entry) { +- spdk_json_write_named_string(w, entry[0], entry[1]); +- entry += 2; +- } +- spdk_json_write_object_end(w); +- } +- +-end: +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-static void +-bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct bdev_rbd *rbd = bdev->ctxt; +- char uuid_str[SPDK_UUID_STRING_LEN]; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_rbd_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", bdev->name); +- spdk_json_write_named_string(w, "pool_name", rbd->pool_name); +- spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name); +- spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); +- if (rbd->user_id) { +- spdk_json_write_named_string(w, "user_id", rbd->user_id); +- } +- +- if (rbd->config) { +- char **entry = rbd->config; +- +- spdk_json_write_named_object_begin(w, "config"); +- while (*entry) { +- spdk_json_write_named_string(w, entry[0], entry[1]); +- entry += 2; +- } +- spdk_json_write_object_end(w); +- } +- +- spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); +- spdk_json_write_named_string(w, "uuid", uuid_str); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static void +-dump_single_cluster_entry(struct bdev_rbd_cluster *entry, struct 
spdk_json_write_ctx *w) +-{ +- assert(entry != NULL); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "cluster_name", entry->name); +- +- if (entry->user_id) { +- spdk_json_write_named_string(w, "user_id", entry->user_id); +- } +- +- if (entry->config_param) { +- char **config_entry = entry->config_param; +- +- spdk_json_write_named_object_begin(w, "config_param"); +- while (*config_entry) { +- spdk_json_write_named_string(w, config_entry[0], config_entry[1]); +- config_entry += 2; +- } +- spdk_json_write_object_end(w); +- } +- if (entry->config_file) { +- spdk_json_write_named_string(w, "config_file", entry->config_file); +- } +- if (entry->key_file) { +- spdk_json_write_named_string(w, "key_file", entry->key_file); +- } +- +- spdk_json_write_object_end(w); +-} +- +-int +-bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, const char *name) +-{ +- struct bdev_rbd_cluster *entry; +- struct spdk_json_write_ctx *w; +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- +- if (STAILQ_EMPTY(&g_map_bdev_rbd_cluster)) { +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -ENOENT; +- } +- +- /* If cluster name is provided */ +- if (name) { +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (strcmp(name, entry->name) == 0) { +- w = spdk_jsonrpc_begin_result(request); +- dump_single_cluster_entry(entry, w); +- spdk_jsonrpc_end_result(request, w); +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return 0; +- } +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -ENOENT; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- dump_single_cluster_entry(entry, w); +- } +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- +- return 0; +-} +- +-static const struct spdk_bdev_fn_table rbd_fn_table = { +- .destruct = bdev_rbd_destruct, +- .submit_request = bdev_rbd_submit_request, +- .io_type_supported = bdev_rbd_io_type_supported, +- .get_io_channel = bdev_rbd_get_io_channel, +- .dump_info_json = bdev_rbd_dump_info_json, +- .write_config_json = bdev_rbd_write_config_json, +-}; +- +-static int +-rbd_register_cluster(const char *name, const char *user_id, const char *const *config_param, +- const char *config_file, const char *key_file) +-{ +- struct bdev_rbd_cluster *entry; +- int rc; +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (strcmp(name, entry->name) == 0) { +- SPDK_ERRLOG("Cluster name=%s already exists\n", name); +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -1; +- } +- } +- +- entry = calloc(1, sizeof(*entry)); +- if (!entry) { +- SPDK_ERRLOG("Cannot allocate an entry for name=%s\n", name); +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -1; +- } +- +- entry->name = strdup(name); +- if (entry->name == NULL) { +- SPDK_ERRLOG("Failed to save the name =%s on entry =%p\n", name, entry); +- goto err_handle; +- } +- +- if (user_id) { +- entry->user_id = strdup(user_id); +- if (entry->user_id == NULL) { +- SPDK_ERRLOG("Failed to save the str =%s on entry =%p\n", user_id, entry); +- goto err_handle; +- } +- } +- +- /* Support specify config_param or config_file separately, or both of them. 
*/ +- if (config_param) { +- entry->config_param = bdev_rbd_dup_config(config_param); +- if (entry->config_param == NULL) { +- SPDK_ERRLOG("Failed to save the config_param=%p on entry = %p\n", config_param, entry); +- goto err_handle; +- } +- } +- +- if (config_file) { +- entry->config_file = strdup(config_file); +- if (entry->config_file == NULL) { +- SPDK_ERRLOG("Failed to save the config_file=%s on entry = %p\n", config_file, entry); +- goto err_handle; +- } +- } +- +- if (key_file) { +- entry->key_file = strdup(key_file); +- if (entry->key_file == NULL) { +- SPDK_ERRLOG("Failed to save the key_file=%s on entry = %p\n", key_file, entry); +- goto err_handle; +- } +- } +- +- rc = rados_create(&entry->cluster, user_id); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to create rados_t struct\n"); +- goto err_handle; +- } +- +- /* Try default location when entry->config_file is NULL, but ignore failure when it is NULL */ +- rc = rados_conf_read_file(entry->cluster, entry->config_file); +- if (entry->config_file && rc < 0) { +- SPDK_ERRLOG("Failed to read conf file %s\n", entry->config_file); +- rados_shutdown(entry->cluster); +- goto err_handle; +- } +- +- if (config_param) { +- const char *const *config_entry = config_param; +- while (*config_entry) { +- rc = rados_conf_set(entry->cluster, config_entry[0], config_entry[1]); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to set %s = %s\n", config_entry[0], config_entry[1]); +- rados_shutdown(entry->cluster); +- goto err_handle; +- } +- config_entry += 2; +- } +- } +- +- if (key_file) { +- rc = rados_conf_set(entry->cluster, "keyring", key_file); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to set keyring = %s\n", key_file); +- rados_shutdown(entry->cluster); +- goto err_handle; +- } +- } +- +- rc = rados_connect(entry->cluster); +- if (rc < 0) { +- SPDK_ERRLOG("Failed to connect to rbd_pool on cluster=%p\n", entry->cluster); +- rados_shutdown(entry->cluster); +- goto err_handle; +- } +- +- STAILQ_INSERT_TAIL(&g_map_bdev_rbd_cluster, entry, link); +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- +- return 0; +- +-err_handle: +- bdev_rbd_cluster_free(entry); +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return -1; +-} +- +-int +-bdev_rbd_unregister_cluster(const char *name) +-{ +- struct bdev_rbd_cluster *entry; +- int rc = 0; +- +- if (name == NULL) { +- return -1; +- } +- +- pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); +- STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { +- if (strcmp(name, entry->name) == 0) { +- if (entry->ref == 0) { +- STAILQ_REMOVE(&g_map_bdev_rbd_cluster, entry, bdev_rbd_cluster, link); +- rados_shutdown(entry->cluster); +- bdev_rbd_cluster_free(entry); +- } else { +- SPDK_ERRLOG("Cluster with name=%p is still used and we cannot delete it\n", +- entry->name); +- rc = -1; +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- return rc; +- } +- } +- +- pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); +- +- SPDK_ERRLOG("Could not find the cluster name =%p\n", name); +- +- return -1; +-} +- +-static void * +-_bdev_rbd_register_cluster(void *arg) +-{ +- struct cluster_register_info *info = arg; +- void *ret = arg; +- int rc; +- +- rc = rbd_register_cluster((const char *)info->name, (const char *)info->user_id, +- (const char *const *)info->config_param, (const char *)info->config_file, +- (const char *)info->key_file); +- if (rc) { +- ret = NULL; +- } +- +- return ret; +-} +- +-int +-bdev_rbd_register_cluster(struct cluster_register_info *info) +-{ +- assert(info != NULL); +- +- /* Rados 
cluster info need to be created in non SPDK-thread to avoid CPU +- * resource contention */ +- if (spdk_call_unaffinitized(_bdev_rbd_register_cluster, info) == NULL) { +- return -1; +- } +- +- return 0; +-} +- +-int +-bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, +- const char *pool_name, +- const char *const *config, +- const char *rbd_name, +- uint32_t block_size, +- const char *cluster_name, +- const struct spdk_uuid *uuid) +-{ +- struct bdev_rbd *rbd; +- int ret; +- +- if ((pool_name == NULL) || (rbd_name == NULL)) { +- return -EINVAL; +- } +- +- rbd = calloc(1, sizeof(struct bdev_rbd)); +- if (rbd == NULL) { +- SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n"); +- return -ENOMEM; +- } +- +- ret = pthread_mutex_init(&rbd->mutex, NULL); +- if (ret) { +- SPDK_ERRLOG("Cannot init mutex on rbd=%p\n", rbd->disk.name); +- free(rbd); +- return ret; +- } +- +- rbd->rbd_name = strdup(rbd_name); +- if (!rbd->rbd_name) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- +- if (user_id) { +- rbd->user_id = strdup(user_id); +- if (!rbd->user_id) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- } +- +- if (cluster_name) { +- rbd->cluster_name = strdup(cluster_name); +- if (!rbd->cluster_name) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- } +- rbd->pool_name = strdup(pool_name); +- if (!rbd->pool_name) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- +- if (config && !(rbd->config = bdev_rbd_dup_config(config))) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- +- ret = bdev_rbd_init(rbd); +- if (ret < 0) { +- bdev_rbd_free(rbd); +- SPDK_ERRLOG("Failed to init rbd device\n"); +- return ret; +- } +- +- if (uuid) { +- rbd->disk.uuid = *uuid; +- } else { +- spdk_uuid_generate(&rbd->disk.uuid); +- } +- +- if (name) { +- rbd->disk.name = strdup(name); +- } else { +- rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count); +- } +- if (!rbd->disk.name) { +- bdev_rbd_free(rbd); +- return -ENOMEM; +- } +- rbd->disk.product_name = "Ceph Rbd Disk"; +- bdev_rbd_count++; +- +- rbd->disk.write_cache = 0; +- rbd->disk.blocklen = block_size; +- rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen; +- rbd->disk.ctxt = rbd; +- rbd->disk.fn_table = &rbd_fn_table; +- rbd->disk.module = &rbd_if; +- +- SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name); +- +- spdk_io_device_register(rbd, bdev_rbd_create_cb, +- bdev_rbd_destroy_cb, +- sizeof(struct bdev_rbd_io_channel), +- rbd_name); +- ret = spdk_bdev_register(&rbd->disk); +- if (ret) { +- spdk_io_device_unregister(rbd, NULL); +- bdev_rbd_free(rbd); +- return ret; +- } +- +- *bdev = &(rbd->disk); +- +- return ret; +-} +- +-void +-bdev_rbd_delete(const char *name, spdk_delete_rbd_complete cb_fn, void *cb_arg) +-{ +- int rc; +- +- rc = spdk_bdev_unregister_by_name(name, &rbd_if, cb_fn, cb_arg); +- if (rc != 0) { +- cb_fn(cb_arg, rc); +- } +-} +- +-static void +-dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +-} +- +-int +-bdev_rbd_resize(const char *name, const uint64_t new_size_in_mb) +-{ +- struct spdk_bdev_desc *desc; +- struct spdk_bdev *bdev; +- struct spdk_io_channel *ch; +- struct bdev_rbd_io_channel *rbd_io_ch; +- int rc = 0; +- uint64_t new_size_in_byte; +- uint64_t current_size_in_mb; +- +- rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc); +- if (rc != 0) { +- return rc; +- } +- +- bdev = spdk_bdev_desc_get_bdev(desc); +- +- if (bdev->module != &rbd_if) { +- rc = -EINVAL; +- goto exit; +- } +- +- current_size_in_mb = bdev->blocklen 
* bdev->blockcnt / (1024 * 1024); +- if (current_size_in_mb > new_size_in_mb) { +- SPDK_ERRLOG("The new bdev size must be larger than current bdev size.\n"); +- rc = -EINVAL; +- goto exit; +- } +- +- ch = bdev_rbd_get_io_channel(bdev); +- rbd_io_ch = spdk_io_channel_get_ctx(ch); +- new_size_in_byte = new_size_in_mb * 1024 * 1024; +- +- rc = rbd_resize(rbd_io_ch->disk->image, new_size_in_byte); +- spdk_put_io_channel(ch); +- if (rc != 0) { +- SPDK_ERRLOG("failed to resize the ceph bdev.\n"); +- goto exit; +- } +- +- rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen); +- if (rc != 0) { +- SPDK_ERRLOG("failed to notify block cnt change.\n"); +- } +- +-exit: +- spdk_bdev_close(desc); +- return rc; +-} +- +-static int +-bdev_rbd_group_create_cb(void *io_device, void *ctx_buf) +-{ +- return 0; +-} +- +-static void +-bdev_rbd_group_destroy_cb(void *io_device, void *ctx_buf) +-{ +-} +- +-static int +-bdev_rbd_library_init(void) +-{ +- spdk_io_device_register(&rbd_if, bdev_rbd_group_create_cb, bdev_rbd_group_destroy_cb, +- 0, "bdev_rbd_poll_groups"); +- return 0; +-} +- +-static void +-bdev_rbd_library_fini(void) +-{ +- spdk_io_device_unregister(&rbd_if, NULL); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(bdev_rbd) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "bdev_rbd.h" ++ ++#include ++#include ++ ++#include "spdk/env.h" ++#include "spdk/bdev.h" ++#include "spdk/thread.h" ++#include "spdk/json.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/likely.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++ ++static int bdev_rbd_count = 0; ++ ++struct bdev_rbd { ++ struct spdk_bdev disk; ++ char *rbd_name; ++ char *user_id; ++ char *pool_name; ++ char **config; ++ ++ rados_t cluster; ++ rados_t *cluster_p; ++ char *cluster_name; ++ ++ rados_ioctx_t io_ctx; ++ rbd_image_t image; ++ ++ rbd_image_info_t info; ++ pthread_mutex_t mutex; ++ struct spdk_thread *main_td; ++ struct spdk_thread *destruct_td; ++ uint32_t ch_count; ++ struct spdk_io_channel *group_ch; ++ ++ TAILQ_ENTRY(bdev_rbd) tailq; ++ struct spdk_poller *reset_timer; ++ struct spdk_bdev_io *reset_bdev_io; ++}; ++ ++struct bdev_rbd_io_channel { ++ struct bdev_rbd *disk; ++}; ++ ++struct bdev_rbd_io { ++ struct spdk_thread *submit_td; ++ enum spdk_bdev_io_status status; ++ rbd_completion_t comp; ++ size_t total_len; ++}; ++ ++struct bdev_rbd_cluster { ++ char *name; ++ char *user_id; ++ char **config_param; ++ char *config_file; ++ char *key_file; ++ rados_t cluster; ++ uint32_t ref; ++ STAILQ_ENTRY(bdev_rbd_cluster) link; ++}; ++ ++static STAILQ_HEAD(, bdev_rbd_cluster) g_map_bdev_rbd_cluster = STAILQ_HEAD_INITIALIZER( ++ g_map_bdev_rbd_cluster); ++static pthread_mutex_t g_map_bdev_rbd_cluster_mutex = PTHREAD_MUTEX_INITIALIZER; ++ ++static void ++bdev_rbd_cluster_free(struct bdev_rbd_cluster *entry) ++{ ++ assert(entry != NULL); ++ ++ bdev_rbd_free_config(entry->config_param); ++ free(entry->config_file); ++ free(entry->key_file); ++ free(entry->user_id); ++ free(entry->name); ++ free(entry); ++} ++ ++static void ++bdev_rbd_put_cluster(rados_t **cluster) ++{ ++ struct bdev_rbd_cluster *entry; ++ ++ assert(cluster != NULL); ++ ++ /* No need go through the map if *cluster equals to NULL */ ++ if (*cluster == NULL) { ++ return; ++ } ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (*cluster != 
&entry->cluster) { ++ continue; ++ } ++ ++ assert(entry->ref > 0); ++ entry->ref--; ++ *cluster = NULL; ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return; ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ SPDK_ERRLOG("Cannot find the entry for cluster=%p\n", cluster); ++} ++ ++static void ++bdev_rbd_free(struct bdev_rbd *rbd) ++{ ++ if (!rbd) { ++ return; ++ } ++ ++ free(rbd->disk.name); ++ free(rbd->rbd_name); ++ free(rbd->user_id); ++ free(rbd->pool_name); ++ bdev_rbd_free_config(rbd->config); ++ ++ if (rbd->io_ctx) { ++ rados_ioctx_destroy(rbd->io_ctx); ++ } ++ ++ if (rbd->cluster_name) { ++ bdev_rbd_put_cluster(&rbd->cluster_p); ++ free(rbd->cluster_name); ++ } else if (rbd->cluster) { ++ rados_shutdown(rbd->cluster); ++ } ++ ++ pthread_mutex_destroy(&rbd->mutex); ++ free(rbd); ++} ++ ++void ++bdev_rbd_free_config(char **config) ++{ ++ char **entry; ++ ++ if (config) { ++ for (entry = config; *entry; entry++) { ++ free(*entry); ++ } ++ free(config); ++ } ++} ++ ++char ** ++bdev_rbd_dup_config(const char *const *config) ++{ ++ size_t count; ++ char **copy; ++ ++ if (!config) { ++ return NULL; ++ } ++ for (count = 0; config[count]; count++) {} ++ copy = calloc(count + 1, sizeof(*copy)); ++ if (!copy) { ++ return NULL; ++ } ++ for (count = 0; config[count]; count++) { ++ if (!(copy[count] = strdup(config[count]))) { ++ bdev_rbd_free_config(copy); ++ return NULL; ++ } ++ } ++ return copy; ++} ++ ++static int ++bdev_rados_cluster_init(const char *user_id, const char *const *config, ++ rados_t *cluster) ++{ ++ int ret; ++ ++ ret = rados_create(cluster, user_id); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to create rados_t struct\n"); ++ return -1; ++ } ++ ++ if (config) { ++ const char *const *entry = config; ++ while (*entry) { ++ ret = rados_conf_set(*cluster, entry[0], entry[1]); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to set %s = %s\n", entry[0], entry[1]); ++ rados_shutdown(*cluster); ++ return -1; ++ } ++ entry += 2; ++ } ++ } else { ++ ret = rados_conf_read_file(*cluster, NULL); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to read conf file\n"); ++ rados_shutdown(*cluster); ++ return -1; ++ } ++ } ++ ++ ret = rados_connect(*cluster); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to connect to rbd_pool\n"); ++ rados_shutdown(*cluster); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_rbd_get_cluster(const char *cluster_name, rados_t **cluster) ++{ ++ struct bdev_rbd_cluster *entry; ++ ++ if (cluster == NULL) { ++ SPDK_ERRLOG("cluster should not be NULL\n"); ++ return -1; ++ } ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (strcmp(cluster_name, entry->name) == 0) { ++ entry->ref++; ++ *cluster = &entry->cluster; ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return 0; ++ } ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -1; ++} ++ ++static int ++bdev_rbd_shared_cluster_init(const char *cluster_name, rados_t **cluster) ++{ ++ int ret; ++ ++ ret = bdev_rbd_get_cluster(cluster_name, cluster); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to create rados_t struct\n"); ++ return -1; ++ } ++ ++ return ret; ++} ++ ++static void * ++bdev_rbd_cluster_handle(void *arg) ++{ ++ void *ret = arg; ++ struct bdev_rbd *rbd = arg; ++ int rc; ++ ++ rc = bdev_rados_cluster_init(rbd->user_id, (const char *const *)rbd->config, ++ &rbd->cluster); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to create rados cluster for user_id=%s and rbd_pool=%s\n", ++ rbd->user_id ? 
rbd->user_id : "admin (the default)", rbd->pool_name); ++ ret = NULL; ++ } ++ ++ return ret; ++} ++ ++static void * ++bdev_rbd_init_context(void *arg) ++{ ++ struct bdev_rbd *rbd = arg; ++ int rc; ++ ++ if (rados_ioctx_create(*(rbd->cluster_p), rbd->pool_name, &rbd->io_ctx) < 0) { ++ SPDK_ERRLOG("Failed to create ioctx on rbd=%p\n", rbd); ++ return NULL; ++ } ++ ++ rc = rbd_open(rbd->io_ctx, rbd->rbd_name, &rbd->image, NULL); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to open specified rbd device\n"); ++ return NULL; ++ } ++ ++ rc = rbd_stat(rbd->image, &rbd->info, sizeof(rbd->info)); ++ rbd_close(rbd->image); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to stat specified rbd device\n"); ++ return NULL; ++ } ++ ++ return arg; ++} ++ ++static int ++bdev_rbd_init(struct bdev_rbd *rbd) ++{ ++ int ret = 0; ++ ++ if (!rbd->cluster_name) { ++ rbd->cluster_p = &rbd->cluster; ++ /* Cluster should be created in non-SPDK thread to avoid conflict between ++ * Rados and SPDK thread */ ++ if (spdk_call_unaffinitized(bdev_rbd_cluster_handle, rbd) == NULL) { ++ SPDK_ERRLOG("Cannot create the rados object on rbd=%p\n", rbd); ++ return -1; ++ } ++ } else { ++ ret = bdev_rbd_shared_cluster_init(rbd->cluster_name, &rbd->cluster_p); ++ if (ret < 0) { ++ SPDK_ERRLOG("Failed to create rados object for rbd =%p on cluster_name=%s\n", ++ rbd, rbd->cluster_name); ++ return -1; ++ } ++ } ++ ++ if (spdk_call_unaffinitized(bdev_rbd_init_context, rbd) == NULL) { ++ SPDK_ERRLOG("Cannot init rbd context for rbd=%p\n", rbd); ++ return -1; ++ } ++ ++ return ret; ++} ++ ++static void ++bdev_rbd_exit(rbd_image_t image) ++{ ++ rbd_flush(image); ++ rbd_close(image); ++} ++ ++static void ++_bdev_rbd_io_complete(void *_rbd_io) ++{ ++ struct bdev_rbd_io *rbd_io = _rbd_io; ++ ++ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(rbd_io), rbd_io->status); ++} ++ ++static void ++bdev_rbd_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) ++{ ++ struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; ++ struct spdk_thread *current_thread = spdk_get_thread(); ++ ++ rbd_io->status = status; ++ assert(rbd_io->submit_td != NULL); ++ if (rbd_io->submit_td != current_thread) { ++ spdk_thread_send_msg(rbd_io->submit_td, _bdev_rbd_io_complete, rbd_io); ++ } else { ++ _bdev_rbd_io_complete(rbd_io); ++ } ++} ++ ++static void ++bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) ++{ ++ int io_status; ++ struct spdk_bdev_io *bdev_io; ++ struct bdev_rbd_io *rbd_io; ++ enum spdk_bdev_io_status bio_status; ++ ++ bdev_io = rbd_aio_get_arg(cb); ++ rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; ++ io_status = rbd_aio_get_return_value(cb); ++ bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ if ((int)rbd_io->total_len != io_status) { ++ bio_status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ } else { ++ /* For others, 0 means success */ ++ if (io_status != 0) { ++ bio_status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ } ++ ++ rbd_aio_release(cb); ++ ++ bdev_rbd_io_complete(bdev_io, bio_status); ++} ++ ++static void ++_bdev_rbd_start_aio(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io, ++ struct iovec *iov, int iovcnt, uint64_t offset, size_t len) ++{ ++ int ret; ++ struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; ++ rbd_image_t image = disk->image; ++ ++ ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, ++ &rbd_io->comp); ++ if (ret < 0) { ++ goto err; ++ } ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ rbd_io->total_len = len; ++ if 
(spdk_likely(iovcnt == 1)) { ++ ret = rbd_aio_read(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); ++ } else { ++ ret = rbd_aio_readv(image, iov, iovcnt, offset, rbd_io->comp); ++ } ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ if (spdk_likely(iovcnt == 1)) { ++ ret = rbd_aio_write(image, offset, iov[0].iov_len, iov[0].iov_base, rbd_io->comp); ++ } else { ++ ret = rbd_aio_writev(image, iov, iovcnt, offset, rbd_io->comp); ++ } ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { ++ ret = rbd_aio_discard(image, offset, len, rbd_io->comp); ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { ++ ret = rbd_aio_flush(image, rbd_io->comp); ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE_ZEROES) { ++ ret = rbd_aio_write_zeroes(image, offset, len, rbd_io->comp, /* zero_flags */ 0, /* op_flags */ 0); ++ } ++ ++ if (ret < 0) { ++ rbd_aio_release(rbd_io->comp); ++ goto err; ++ } ++ ++ return; ++ ++err: ++ bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void ++bdev_rbd_start_aio(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; ++ ++ _bdev_rbd_start_aio(disk, ++ bdev_io, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++} ++ ++static int bdev_rbd_library_init(void); ++static void bdev_rbd_library_fini(void); ++ ++static int ++bdev_rbd_get_ctx_size(void) ++{ ++ return sizeof(struct bdev_rbd_io); ++} ++ ++static struct spdk_bdev_module rbd_if = { ++ .name = "rbd", ++ .module_init = bdev_rbd_library_init, ++ .module_fini = bdev_rbd_library_fini, ++ .get_ctx_size = bdev_rbd_get_ctx_size, ++ ++}; ++SPDK_BDEV_MODULE_REGISTER(rbd, &rbd_if) ++ ++static int bdev_rbd_reset_timer(void *arg); ++ ++static void ++bdev_rbd_check_outstanding_ios(struct spdk_bdev *bdev, uint64_t current_qd, ++ void *cb_arg, int rc) ++{ ++ struct bdev_rbd *disk = cb_arg; ++ enum spdk_bdev_io_status bio_status; ++ ++ if (rc == 0 && current_qd > 0) { ++ disk->reset_timer = SPDK_POLLER_REGISTER(bdev_rbd_reset_timer, disk, 1000); ++ return; ++ } ++ ++ if (rc != 0) { ++ bio_status = SPDK_BDEV_IO_STATUS_FAILED; ++ } else { ++ bio_status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } ++ ++ bdev_rbd_io_complete(disk->reset_bdev_io, bio_status); ++ disk->reset_bdev_io = NULL; ++} ++ ++static int ++bdev_rbd_reset_timer(void *arg) ++{ ++ struct bdev_rbd *disk = arg; ++ ++ spdk_poller_unregister(&disk->reset_timer); ++ ++ spdk_bdev_get_current_qd(&disk->disk, bdev_rbd_check_outstanding_ios, disk); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++static void ++bdev_rbd_reset(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; ++ ++ /* ++ * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a ++ * poller to wait for in-flight I/O to complete. ++ */ ++ assert(disk->reset_bdev_io == NULL); ++ disk->reset_bdev_io = bdev_io; ++ ++ bdev_rbd_reset_timer(disk); ++} ++ ++static void ++_bdev_rbd_destruct_done(void *io_device) ++{ ++ struct bdev_rbd *rbd = io_device; ++ ++ assert(rbd != NULL); ++ assert(rbd->ch_count == 0); ++ ++ spdk_bdev_destruct_done(&rbd->disk, 0); ++ bdev_rbd_free(rbd); ++} ++ ++static void ++bdev_rbd_free_cb(void *io_device) ++{ ++ struct bdev_rbd *rbd = io_device; ++ ++ /* The io device has been unregistered. 
Send a message back to the ++ * original thread that started the destruct operation, so that the ++ * bdev unregister callback is invoked on the same thread that started ++ * this whole process. ++ */ ++ spdk_thread_send_msg(rbd->destruct_td, _bdev_rbd_destruct_done, rbd); ++} ++ ++static void ++_bdev_rbd_destruct(void *ctx) ++{ ++ struct bdev_rbd *rbd = ctx; ++ ++ spdk_io_device_unregister(rbd, bdev_rbd_free_cb); ++} ++ ++static int ++bdev_rbd_destruct(void *ctx) ++{ ++ struct bdev_rbd *rbd = ctx; ++ struct spdk_thread *td; ++ ++ if (rbd->main_td == NULL) { ++ td = spdk_get_thread(); ++ } else { ++ td = rbd->main_td; ++ } ++ ++ /* Start the destruct operation on the rbd bdev's ++ * main thread. This guarantees it will only start ++ * executing after any messages related to channel ++ * deletions have finished completing. *Always* ++ * send a message, even if this function gets called ++ * from the main thread, in case there are pending ++ * channel delete messages in flight to this thread. ++ */ ++ assert(rbd->destruct_td == NULL); ++ rbd->destruct_td = td; ++ spdk_thread_send_msg(td, _bdev_rbd_destruct, rbd); ++ ++ /* Return 1 to indicate the destruct path is asynchronous. */ ++ return 1; ++} ++ ++static void ++bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; ++ ++ if (!success) { ++ bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ spdk_thread_exec_msg(disk->main_td, bdev_rbd_start_aio, bdev_io); ++} ++ ++static void ++bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct spdk_thread *submit_td = spdk_io_channel_get_thread(ch); ++ struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; ++ struct bdev_rbd *disk = (struct bdev_rbd *)bdev_io->bdev->ctxt; ++ ++ rbd_io->submit_td = submit_td; ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ spdk_thread_exec_msg(disk->main_td, bdev_rbd_start_aio, bdev_io); ++ break; ++ ++ case SPDK_BDEV_IO_TYPE_RESET: ++ spdk_thread_exec_msg(disk->main_td, bdev_rbd_reset, bdev_io); ++ break; ++ ++ default: ++ SPDK_ERRLOG("Unsupported IO type =%d\n", bdev_io->type); ++ bdev_rbd_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ break; ++ } ++} ++ ++static bool ++bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ return true; ++ ++ default: ++ return false; ++ } ++} ++ ++static void ++bdev_rbd_free_channel_resources(struct bdev_rbd *disk) ++{ ++ assert(disk != NULL); ++ assert(disk->main_td == spdk_get_thread()); ++ assert(disk->ch_count == 0); ++ ++ spdk_put_io_channel(disk->group_ch); ++ if (disk->image) { ++ bdev_rbd_exit(disk->image); ++ } ++ ++ disk->main_td = NULL; ++ disk->group_ch = NULL; ++} ++ ++static void * ++bdev_rbd_handle(void *arg) ++{ ++ struct bdev_rbd *disk = arg; ++ void *ret = arg; ++ ++ if (rbd_open(disk->io_ctx, disk->rbd_name, &disk->image, NULL) < 0) { ++ SPDK_ERRLOG("Failed to open specified rbd device\n"); ++ 
ret = NULL; ++ } ++ ++ return ret; ++} ++ ++static int ++_bdev_rbd_create_cb(struct bdev_rbd *disk) ++{ ++ disk->group_ch = spdk_get_io_channel(&rbd_if); ++ assert(disk->group_ch != NULL); ++ ++ if (spdk_call_unaffinitized(bdev_rbd_handle, disk) == NULL) { ++ bdev_rbd_free_channel_resources(disk); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_rbd_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_rbd_io_channel *ch = ctx_buf; ++ struct bdev_rbd *disk = io_device; ++ int rc; ++ ++ ch->disk = disk; ++ pthread_mutex_lock(&disk->mutex); ++ if (disk->ch_count == 0) { ++ assert(disk->main_td == NULL); ++ rc = _bdev_rbd_create_cb(disk); ++ if (rc) { ++ SPDK_ERRLOG("Cannot create channel for disk=%p\n", disk); ++ pthread_mutex_unlock(&disk->mutex); ++ return rc; ++ } ++ ++ disk->main_td = spdk_get_thread(); ++ } ++ ++ disk->ch_count++; ++ pthread_mutex_unlock(&disk->mutex); ++ ++ return 0; ++} ++ ++static void ++_bdev_rbd_destroy_cb(void *ctx) ++{ ++ struct bdev_rbd *disk = ctx; ++ ++ pthread_mutex_lock(&disk->mutex); ++ assert(disk->ch_count > 0); ++ disk->ch_count--; ++ ++ if (disk->ch_count > 0) { ++ /* A new channel was created between when message was sent and this function executed */ ++ pthread_mutex_unlock(&disk->mutex); ++ return; ++ } ++ ++ bdev_rbd_free_channel_resources(disk); ++ pthread_mutex_unlock(&disk->mutex); ++} ++ ++static void ++bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_rbd *disk = io_device; ++ struct spdk_thread *thread; ++ ++ pthread_mutex_lock(&disk->mutex); ++ assert(disk->ch_count > 0); ++ disk->ch_count--; ++ if (disk->ch_count == 0) { ++ assert(disk->main_td != NULL); ++ if (disk->main_td != spdk_get_thread()) { ++ /* The final channel was destroyed on a different thread ++ * than where the first channel was created. Pass a message ++ * to the main thread to unregister the poller. 
*/ ++ disk->ch_count++; ++ thread = disk->main_td; ++ pthread_mutex_unlock(&disk->mutex); ++ spdk_thread_send_msg(thread, _bdev_rbd_destroy_cb, disk); ++ return; ++ } ++ ++ bdev_rbd_free_channel_resources(disk); ++ } ++ pthread_mutex_unlock(&disk->mutex); ++} ++ ++static struct spdk_io_channel * ++bdev_rbd_get_io_channel(void *ctx) ++{ ++ struct bdev_rbd *rbd_bdev = ctx; ++ ++ return spdk_get_io_channel(rbd_bdev); ++} ++ ++static void ++bdev_rbd_cluster_dump_entry(const char *cluster_name, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_rbd_cluster *entry; ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (strcmp(cluster_name, entry->name)) { ++ continue; ++ } ++ if (entry->user_id) { ++ spdk_json_write_named_string(w, "user_id", entry->user_id); ++ } ++ ++ if (entry->config_param) { ++ char **config_entry = entry->config_param; ++ ++ spdk_json_write_named_object_begin(w, "config_param"); ++ while (*config_entry) { ++ spdk_json_write_named_string(w, config_entry[0], config_entry[1]); ++ config_entry += 2; ++ } ++ spdk_json_write_object_end(w); ++ } ++ if (entry->config_file) { ++ spdk_json_write_named_string(w, "config_file", entry->config_file); ++ } ++ if (entry->key_file) { ++ spdk_json_write_named_string(w, "key_file", entry->key_file); ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return; ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++} ++ ++static int ++bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_rbd *rbd_bdev = ctx; ++ ++ spdk_json_write_named_object_begin(w, "rbd"); ++ ++ spdk_json_write_named_string(w, "pool_name", rbd_bdev->pool_name); ++ ++ spdk_json_write_named_string(w, "rbd_name", rbd_bdev->rbd_name); ++ ++ if (rbd_bdev->cluster_name) { ++ bdev_rbd_cluster_dump_entry(rbd_bdev->cluster_name, w); ++ goto end; ++ } ++ ++ if (rbd_bdev->user_id) { ++ spdk_json_write_named_string(w, "user_id", rbd_bdev->user_id); ++ } ++ ++ if (rbd_bdev->config) { ++ char **entry = rbd_bdev->config; ++ ++ spdk_json_write_named_object_begin(w, "config"); ++ while (*entry) { ++ spdk_json_write_named_string(w, entry[0], entry[1]); ++ entry += 2; ++ } ++ spdk_json_write_object_end(w); ++ } ++ ++end: ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++static void ++bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_rbd *rbd = bdev->ctxt; ++ char uuid_str[SPDK_UUID_STRING_LEN]; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_rbd_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bdev->name); ++ spdk_json_write_named_string(w, "pool_name", rbd->pool_name); ++ spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name); ++ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); ++ if (rbd->user_id) { ++ spdk_json_write_named_string(w, "user_id", rbd->user_id); ++ } ++ ++ if (rbd->config) { ++ char **entry = rbd->config; ++ ++ spdk_json_write_named_object_begin(w, "config"); ++ while (*entry) { ++ spdk_json_write_named_string(w, entry[0], entry[1]); ++ entry += 2; ++ } ++ spdk_json_write_object_end(w); ++ } ++ ++ spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); ++ spdk_json_write_named_string(w, "uuid", uuid_str); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static void ++dump_single_cluster_entry(struct bdev_rbd_cluster *entry, struct 
spdk_json_write_ctx *w) ++{ ++ assert(entry != NULL); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "cluster_name", entry->name); ++ ++ if (entry->user_id) { ++ spdk_json_write_named_string(w, "user_id", entry->user_id); ++ } ++ ++ if (entry->config_param) { ++ char **config_entry = entry->config_param; ++ ++ spdk_json_write_named_object_begin(w, "config_param"); ++ while (*config_entry) { ++ spdk_json_write_named_string(w, config_entry[0], config_entry[1]); ++ config_entry += 2; ++ } ++ spdk_json_write_object_end(w); ++ } ++ if (entry->config_file) { ++ spdk_json_write_named_string(w, "config_file", entry->config_file); ++ } ++ if (entry->key_file) { ++ spdk_json_write_named_string(w, "key_file", entry->key_file); ++ } ++ ++ spdk_json_write_object_end(w); ++} ++ ++int ++bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, const char *name) ++{ ++ struct bdev_rbd_cluster *entry; ++ struct spdk_json_write_ctx *w; ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ ++ if (STAILQ_EMPTY(&g_map_bdev_rbd_cluster)) { ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -ENOENT; ++ } ++ ++ /* If cluster name is provided */ ++ if (name) { ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (strcmp(name, entry->name) == 0) { ++ w = spdk_jsonrpc_begin_result(request); ++ dump_single_cluster_entry(entry, w); ++ spdk_jsonrpc_end_result(request, w); ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return 0; ++ } ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -ENOENT; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ dump_single_cluster_entry(entry, w); ++ } ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ ++ return 0; ++} ++ ++static const struct spdk_bdev_fn_table rbd_fn_table = { ++ .destruct = bdev_rbd_destruct, ++ .submit_request = bdev_rbd_submit_request, ++ .io_type_supported = bdev_rbd_io_type_supported, ++ .get_io_channel = bdev_rbd_get_io_channel, ++ .dump_info_json = bdev_rbd_dump_info_json, ++ .write_config_json = bdev_rbd_write_config_json, ++}; ++ ++static int ++rbd_register_cluster(const char *name, const char *user_id, const char *const *config_param, ++ const char *config_file, const char *key_file) ++{ ++ struct bdev_rbd_cluster *entry; ++ int rc; ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (strcmp(name, entry->name) == 0) { ++ SPDK_ERRLOG("Cluster name=%s already exists\n", name); ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -1; ++ } ++ } ++ ++ entry = calloc(1, sizeof(*entry)); ++ if (!entry) { ++ SPDK_ERRLOG("Cannot allocate an entry for name=%s\n", name); ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -1; ++ } ++ ++ entry->name = strdup(name); ++ if (entry->name == NULL) { ++ SPDK_ERRLOG("Failed to save the name =%s on entry =%p\n", name, entry); ++ goto err_handle; ++ } ++ ++ if (user_id) { ++ entry->user_id = strdup(user_id); ++ if (entry->user_id == NULL) { ++ SPDK_ERRLOG("Failed to save the str =%s on entry =%p\n", user_id, entry); ++ goto err_handle; ++ } ++ } ++ ++ /* Support specify config_param or config_file separately, or both of them. 
*/ ++ if (config_param) { ++ entry->config_param = bdev_rbd_dup_config(config_param); ++ if (entry->config_param == NULL) { ++ SPDK_ERRLOG("Failed to save the config_param=%p on entry = %p\n", config_param, entry); ++ goto err_handle; ++ } ++ } ++ ++ if (config_file) { ++ entry->config_file = strdup(config_file); ++ if (entry->config_file == NULL) { ++ SPDK_ERRLOG("Failed to save the config_file=%s on entry = %p\n", config_file, entry); ++ goto err_handle; ++ } ++ } ++ ++ if (key_file) { ++ entry->key_file = strdup(key_file); ++ if (entry->key_file == NULL) { ++ SPDK_ERRLOG("Failed to save the key_file=%s on entry = %p\n", key_file, entry); ++ goto err_handle; ++ } ++ } ++ ++ rc = rados_create(&entry->cluster, user_id); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to create rados_t struct\n"); ++ goto err_handle; ++ } ++ ++ /* Try default location when entry->config_file is NULL, but ignore failure when it is NULL */ ++ rc = rados_conf_read_file(entry->cluster, entry->config_file); ++ if (entry->config_file && rc < 0) { ++ SPDK_ERRLOG("Failed to read conf file %s\n", entry->config_file); ++ rados_shutdown(entry->cluster); ++ goto err_handle; ++ } ++ ++ if (config_param) { ++ const char *const *config_entry = config_param; ++ while (*config_entry) { ++ rc = rados_conf_set(entry->cluster, config_entry[0], config_entry[1]); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to set %s = %s\n", config_entry[0], config_entry[1]); ++ rados_shutdown(entry->cluster); ++ goto err_handle; ++ } ++ config_entry += 2; ++ } ++ } ++ ++ if (key_file) { ++ rc = rados_conf_set(entry->cluster, "keyring", key_file); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to set keyring = %s\n", key_file); ++ rados_shutdown(entry->cluster); ++ goto err_handle; ++ } ++ } ++ ++ rc = rados_connect(entry->cluster); ++ if (rc < 0) { ++ SPDK_ERRLOG("Failed to connect to rbd_pool on cluster=%p\n", entry->cluster); ++ rados_shutdown(entry->cluster); ++ goto err_handle; ++ } ++ ++ STAILQ_INSERT_TAIL(&g_map_bdev_rbd_cluster, entry, link); ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ ++ return 0; ++ ++err_handle: ++ bdev_rbd_cluster_free(entry); ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return -1; ++} ++ ++int ++bdev_rbd_unregister_cluster(const char *name) ++{ ++ struct bdev_rbd_cluster *entry; ++ int rc = 0; ++ ++ if (name == NULL) { ++ return -1; ++ } ++ ++ pthread_mutex_lock(&g_map_bdev_rbd_cluster_mutex); ++ STAILQ_FOREACH(entry, &g_map_bdev_rbd_cluster, link) { ++ if (strcmp(name, entry->name) == 0) { ++ if (entry->ref == 0) { ++ STAILQ_REMOVE(&g_map_bdev_rbd_cluster, entry, bdev_rbd_cluster, link); ++ rados_shutdown(entry->cluster); ++ bdev_rbd_cluster_free(entry); ++ } else { ++ SPDK_ERRLOG("Cluster with name=%p is still used and we cannot delete it\n", ++ entry->name); ++ rc = -1; ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ return rc; ++ } ++ } ++ ++ pthread_mutex_unlock(&g_map_bdev_rbd_cluster_mutex); ++ ++ SPDK_ERRLOG("Could not find the cluster name =%p\n", name); ++ ++ return -1; ++} ++ ++static void * ++_bdev_rbd_register_cluster(void *arg) ++{ ++ struct cluster_register_info *info = arg; ++ void *ret = arg; ++ int rc; ++ ++ rc = rbd_register_cluster((const char *)info->name, (const char *)info->user_id, ++ (const char *const *)info->config_param, (const char *)info->config_file, ++ (const char *)info->key_file); ++ if (rc) { ++ ret = NULL; ++ } ++ ++ return ret; ++} ++ ++int ++bdev_rbd_register_cluster(struct cluster_register_info *info) ++{ ++ assert(info != NULL); ++ ++ /* Rados 
cluster info need to be created in non SPDK-thread to avoid CPU ++ * resource contention */ ++ if (spdk_call_unaffinitized(_bdev_rbd_register_cluster, info) == NULL) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int ++bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, ++ const char *pool_name, ++ const char *const *config, ++ const char *rbd_name, ++ uint32_t block_size, ++ const char *cluster_name, ++ const struct spdk_uuid *uuid) ++{ ++ struct bdev_rbd *rbd; ++ int ret; ++ ++ if ((pool_name == NULL) || (rbd_name == NULL)) { ++ return -EINVAL; ++ } ++ ++ rbd = calloc(1, sizeof(struct bdev_rbd)); ++ if (rbd == NULL) { ++ SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n"); ++ return -ENOMEM; ++ } ++ ++ ret = pthread_mutex_init(&rbd->mutex, NULL); ++ if (ret) { ++ SPDK_ERRLOG("Cannot init mutex on rbd=%p\n", rbd->disk.name); ++ free(rbd); ++ return ret; ++ } ++ ++ rbd->rbd_name = strdup(rbd_name); ++ if (!rbd->rbd_name) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ ++ if (user_id) { ++ rbd->user_id = strdup(user_id); ++ if (!rbd->user_id) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ } ++ ++ if (cluster_name) { ++ rbd->cluster_name = strdup(cluster_name); ++ if (!rbd->cluster_name) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ } ++ rbd->pool_name = strdup(pool_name); ++ if (!rbd->pool_name) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ ++ if (config && !(rbd->config = bdev_rbd_dup_config(config))) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ ++ ret = bdev_rbd_init(rbd); ++ if (ret < 0) { ++ bdev_rbd_free(rbd); ++ SPDK_ERRLOG("Failed to init rbd device\n"); ++ return ret; ++ } ++ ++ if (uuid) { ++ rbd->disk.uuid = *uuid; ++ } else { ++ spdk_uuid_generate(&rbd->disk.uuid); ++ } ++ ++ if (name) { ++ rbd->disk.name = strdup(name); ++ } else { ++ rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count); ++ } ++ if (!rbd->disk.name) { ++ bdev_rbd_free(rbd); ++ return -ENOMEM; ++ } ++ rbd->disk.product_name = "Ceph Rbd Disk"; ++ bdev_rbd_count++; ++ ++ rbd->disk.write_cache = 0; ++ rbd->disk.blocklen = block_size; ++ rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen; ++ rbd->disk.ctxt = rbd; ++ rbd->disk.fn_table = &rbd_fn_table; ++ rbd->disk.module = &rbd_if; ++ ++ SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name); ++ ++ spdk_io_device_register(rbd, bdev_rbd_create_cb, ++ bdev_rbd_destroy_cb, ++ sizeof(struct bdev_rbd_io_channel), ++ rbd_name); ++ ret = spdk_bdev_register(&rbd->disk); ++ if (ret) { ++ spdk_io_device_unregister(rbd, NULL); ++ bdev_rbd_free(rbd); ++ return ret; ++ } ++ ++ *bdev = &(rbd->disk); ++ ++ return ret; ++} ++ ++void ++bdev_rbd_delete(const char *name, spdk_delete_rbd_complete cb_fn, void *cb_arg) ++{ ++ int rc; ++ ++ rc = spdk_bdev_unregister_by_name(name, &rbd_if, cb_fn, cb_arg); ++ if (rc != 0) { ++ cb_fn(cb_arg, rc); ++ } ++} ++ ++static void ++dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++} ++ ++int ++bdev_rbd_resize(const char *name, const uint64_t new_size_in_mb) ++{ ++ struct spdk_bdev_desc *desc; ++ struct spdk_bdev *bdev; ++ struct spdk_io_channel *ch; ++ struct bdev_rbd_io_channel *rbd_io_ch; ++ int rc = 0; ++ uint64_t new_size_in_byte; ++ uint64_t current_size_in_mb; ++ ++ rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ ++ if (bdev->module != &rbd_if) { ++ rc = -EINVAL; ++ goto exit; ++ } ++ ++ current_size_in_mb = bdev->blocklen 
* bdev->blockcnt / (1024 * 1024); ++ if (current_size_in_mb > new_size_in_mb) { ++ SPDK_ERRLOG("The new bdev size must be larger than current bdev size.\n"); ++ rc = -EINVAL; ++ goto exit; ++ } ++ ++ ch = bdev_rbd_get_io_channel(bdev); ++ rbd_io_ch = spdk_io_channel_get_ctx(ch); ++ new_size_in_byte = new_size_in_mb * 1024 * 1024; ++ ++ rc = rbd_resize(rbd_io_ch->disk->image, new_size_in_byte); ++ spdk_put_io_channel(ch); ++ if (rc != 0) { ++ SPDK_ERRLOG("failed to resize the ceph bdev.\n"); ++ goto exit; ++ } ++ ++ rc = spdk_bdev_notify_blockcnt_change(bdev, new_size_in_byte / bdev->blocklen); ++ if (rc != 0) { ++ SPDK_ERRLOG("failed to notify block cnt change.\n"); ++ } ++ ++exit: ++ spdk_bdev_close(desc); ++ return rc; ++} ++ ++static int ++bdev_rbd_group_create_cb(void *io_device, void *ctx_buf) ++{ ++ return 0; ++} ++ ++static void ++bdev_rbd_group_destroy_cb(void *io_device, void *ctx_buf) ++{ ++} ++ ++static int ++bdev_rbd_library_init(void) ++{ ++ spdk_io_device_register(&rbd_if, bdev_rbd_group_create_cb, bdev_rbd_group_destroy_cb, ++ 0, "bdev_rbd_poll_groups"); ++ return 0; ++} ++ ++static void ++bdev_rbd_library_fini(void) ++{ ++ spdk_io_device_unregister(&rbd_if, NULL); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(bdev_rbd) +diff --git a/module/bdev/rbd/bdev_rbd.h b/module/bdev/rbd/bdev_rbd.h +index 7ce21d7..942f0f0 100644 +--- a/module/bdev/rbd/bdev_rbd.h ++++ b/module/bdev/rbd/bdev_rbd.h +@@ -1,72 +1,72 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_BDEV_RBD_H +-#define SPDK_BDEV_RBD_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/rpc.h" +- +-struct cluster_register_info { +- char *name; +- char *user_id; +- char **config_param; +- char *config_file; +- char *key_file; +-}; +- +-void bdev_rbd_free_config(char **config); +-char **bdev_rbd_dup_config(const char *const *config); +- +-typedef void (*spdk_delete_rbd_complete)(void *cb_arg, int bdeverrno); +- +-int bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, +- const char *pool_name, +- const char *const *config, +- const char *rbd_name, uint32_t block_size, const char *cluster_name, const struct spdk_uuid *uuid); +-/** +- * Delete rbd bdev. +- * +- * \param name Name of rbd bdev. +- * \param cb_fn Function to call after deletion. +- * \param cb_arg Argument to pass to cb_fn. +- */ +-void bdev_rbd_delete(const char *name, spdk_delete_rbd_complete cb_fn, +- void *cb_arg); +- +-/** +- * Resize rbd bdev. +- * +- * \param bdev Name of rbd bdev. +- * \param new_size_in_mb The new size in MiB for this bdev. +- */ +-int bdev_rbd_resize(const char *name, const uint64_t new_size_in_mb); +- +-/** +- * Create a Rados cluster. +- * +- * \param info the info to register the Rados cluster object +- */ +-int bdev_rbd_register_cluster(struct cluster_register_info *info); +- +-/** +- * Delete a registered cluster. +- * +- * \param name the name of the cluster to be deleted. +- */ +-int bdev_rbd_unregister_cluster(const char *name); +- +-/** +- * Show the cluster info of a given name. If given name is empty, +- * the info of every registered cluster name will be showed. +- * +- * \param request the json request. +- * \param name the name of the cluster. +- */ +-int bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, const char *name); +- +-#endif /* SPDK_BDEV_RBD_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#ifndef SPDK_BDEV_RBD_H ++#define SPDK_BDEV_RBD_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/rpc.h" ++ ++struct cluster_register_info { ++ char *name; ++ char *user_id; ++ char **config_param; ++ char *config_file; ++ char *key_file; ++}; ++ ++void bdev_rbd_free_config(char **config); ++char **bdev_rbd_dup_config(const char *const *config); ++ ++typedef void (*spdk_delete_rbd_complete)(void *cb_arg, int bdeverrno); ++ ++int bdev_rbd_create(struct spdk_bdev **bdev, const char *name, const char *user_id, ++ const char *pool_name, ++ const char *const *config, ++ const char *rbd_name, uint32_t block_size, const char *cluster_name, const struct spdk_uuid *uuid); ++/** ++ * Delete rbd bdev. ++ * ++ * \param name Name of rbd bdev. ++ * \param cb_fn Function to call after deletion. ++ * \param cb_arg Argument to pass to cb_fn. ++ */ ++void bdev_rbd_delete(const char *name, spdk_delete_rbd_complete cb_fn, ++ void *cb_arg); ++ ++/** ++ * Resize rbd bdev. ++ * ++ * \param bdev Name of rbd bdev. ++ * \param new_size_in_mb The new size in MiB for this bdev. ++ */ ++int bdev_rbd_resize(const char *name, const uint64_t new_size_in_mb); ++ ++/** ++ * Create a Rados cluster. ++ * ++ * \param info the info to register the Rados cluster object ++ */ ++int bdev_rbd_register_cluster(struct cluster_register_info *info); ++ ++/** ++ * Delete a registered cluster. ++ * ++ * \param name the name of the cluster to be deleted. ++ */ ++int bdev_rbd_unregister_cluster(const char *name); ++ ++/** ++ * Show the cluster info of a given name. If given name is empty, ++ * the info of every registered cluster name will be showed. ++ * ++ * \param request the json request. ++ * \param name the name of the cluster. ++ */ ++int bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, const char *name); ++ ++#endif /* SPDK_BDEV_RBD_H */ +diff --git a/module/bdev/rbd/bdev_rbd_rpc.c b/module/bdev/rbd/bdev_rbd_rpc.c +index 4f8f7cf..d416d3e 100644 +--- a/module/bdev/rbd/bdev_rbd_rpc.c ++++ b/module/bdev/rbd/bdev_rbd_rpc.c +@@ -1,354 +1,354 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "bdev_rbd.h" +-#include "spdk/util.h" +-#include "spdk/uuid.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +- +-struct rpc_create_rbd { +- char *name; +- char *user_id; +- char *pool_name; +- char *rbd_name; +- uint32_t block_size; +- char **config; +- char *cluster_name; +- char *uuid; +-}; +- +-static void +-free_rpc_create_rbd(struct rpc_create_rbd *req) +-{ +- free(req->name); +- free(req->user_id); +- free(req->pool_name); +- free(req->rbd_name); +- bdev_rbd_free_config(req->config); +- free(req->cluster_name); +- free(req->uuid); +-} +- +-static int +-bdev_rbd_decode_config(const struct spdk_json_val *values, void *out) +-{ +- char ***map = out; +- char **entry; +- uint32_t i; +- +- if (values->type == SPDK_JSON_VAL_NULL) { +- /* treated like empty object: empty config */ +- *map = calloc(1, sizeof(**map)); +- if (!*map) { +- return -1; +- } +- return 0; +- } +- +- if (values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { +- return -1; +- } +- +- *map = calloc(values->len + 1, sizeof(**map)); +- if (!*map) { +- return -1; +- } +- +- for (i = 0, entry = *map; i < values->len;) { +- const struct spdk_json_val *name = &values[i + 1]; +- const struct spdk_json_val *v = &values[i + 2]; +- /* Here we catch errors like invalid types. 
*/ +- if (!(entry[0] = spdk_json_strdup(name)) || +- !(entry[1] = spdk_json_strdup(v))) { +- bdev_rbd_free_config(*map); +- *map = NULL; +- return -1; +- } +- i += 1 + spdk_json_val_len(v); +- entry += 2; +- } +- +- return 0; +-} +- +-static const struct spdk_json_object_decoder rpc_create_rbd_decoders[] = { +- {"name", offsetof(struct rpc_create_rbd, name), spdk_json_decode_string, true}, +- {"user_id", offsetof(struct rpc_create_rbd, user_id), spdk_json_decode_string, true}, +- {"pool_name", offsetof(struct rpc_create_rbd, pool_name), spdk_json_decode_string}, +- {"rbd_name", offsetof(struct rpc_create_rbd, rbd_name), spdk_json_decode_string}, +- {"block_size", offsetof(struct rpc_create_rbd, block_size), spdk_json_decode_uint32}, +- {"config", offsetof(struct rpc_create_rbd, config), bdev_rbd_decode_config, true}, +- {"cluster_name", offsetof(struct rpc_create_rbd, cluster_name), spdk_json_decode_string, true}, +- {"uuid", offsetof(struct rpc_create_rbd, uuid), spdk_json_decode_string, true} +-}; +- +-static void +-rpc_bdev_rbd_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_create_rbd req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_bdev *bdev; +- int rc = 0; +- struct spdk_uuid *uuid = NULL; +- struct spdk_uuid decoded_uuid; +- +- if (spdk_json_decode_object(params, rpc_create_rbd_decoders, +- SPDK_COUNTOF(rpc_create_rbd_decoders), +- &req)) { +- SPDK_DEBUGLOG(bdev_rbd, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- if (req.uuid) { +- if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, +- "Failed to parse bdev UUID"); +- goto cleanup; +- } +- uuid = &decoded_uuid; +- } +- +- rc = bdev_rbd_create(&bdev, req.name, req.user_id, req.pool_name, +- (const char *const *)req.config, +- req.rbd_name, +- req.block_size, req.cluster_name, uuid); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, spdk_bdev_get_name(bdev)); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_create_rbd(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_create", rpc_bdev_rbd_create, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_rbd_delete { +- char *name; +-}; +- +-static void +-free_rpc_bdev_rbd_delete(struct rpc_bdev_rbd_delete *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_rbd_delete_decoders[] = { +- {"name", offsetof(struct rpc_bdev_rbd_delete, name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_bdev_rbd_delete_cb(void *cb_arg, int bdeverrno) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (bdeverrno == 0) { +- spdk_jsonrpc_send_bool_response(request, true); +- } else { +- spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); +- } +-} +- +-static void +-rpc_bdev_rbd_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_rbd_delete req = {NULL}; +- +- if (spdk_json_decode_object(params, rpc_bdev_rbd_delete_decoders, +- SPDK_COUNTOF(rpc_bdev_rbd_delete_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev_rbd_delete(req.name, _rpc_bdev_rbd_delete_cb, request); +- +-cleanup: +- 
free_rpc_bdev_rbd_delete(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_delete", rpc_bdev_rbd_delete, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_rbd_resize { +- char *name; +- uint64_t new_size; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_rbd_resize_decoders[] = { +- {"name", offsetof(struct rpc_bdev_rbd_resize, name), spdk_json_decode_string}, +- {"new_size", offsetof(struct rpc_bdev_rbd_resize, new_size), spdk_json_decode_uint64} +-}; +- +-static void +-free_rpc_bdev_rbd_resize(struct rpc_bdev_rbd_resize *req) +-{ +- free(req->name); +-} +- +-static void +-rpc_bdev_rbd_resize(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_rbd_resize req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_rbd_resize_decoders, +- SPDK_COUNTOF(rpc_bdev_rbd_resize_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_rbd_resize(req.name, req.new_size); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-cleanup: +- free_rpc_bdev_rbd_resize(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_resize", rpc_bdev_rbd_resize, SPDK_RPC_RUNTIME) +- +-static void +-free_rpc_register_cluster(struct cluster_register_info *req) +-{ +- free(req->name); +- free(req->user_id); +- bdev_rbd_free_config(req->config_param); +- free(req->config_file); +- free(req->key_file); +-} +- +-static const struct spdk_json_object_decoder rpc_register_cluster_decoders[] = { +- {"name", offsetof(struct cluster_register_info, name), spdk_json_decode_string, true}, +- {"user_id", offsetof(struct cluster_register_info, user_id), spdk_json_decode_string, true}, +- {"config_param", offsetof(struct cluster_register_info, config_param), bdev_rbd_decode_config, true}, +- {"config_file", offsetof(struct cluster_register_info, config_file), spdk_json_decode_string, true}, +- {"key_file", offsetof(struct cluster_register_info, key_file), spdk_json_decode_string, true} +-}; +- +-static void +-rpc_bdev_rbd_register_cluster(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct cluster_register_info req = {}; +- int rc = 0; +- struct spdk_json_write_ctx *w; +- +- if (spdk_json_decode_object(params, rpc_register_cluster_decoders, +- SPDK_COUNTOF(rpc_register_cluster_decoders), +- &req)) { +- SPDK_DEBUGLOG(bdev_rbd, "spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_rbd_register_cluster(&req); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, req.name); +- spdk_jsonrpc_end_result(request, w); +-cleanup: +- free_rpc_register_cluster(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_register_cluster", rpc_bdev_rbd_register_cluster, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_rbd_unregister_cluster { +- char *name; +-}; +- +-static void +-free_rpc_bdev_cluster_unregister(struct rpc_bdev_rbd_unregister_cluster *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_rbd_unregister_cluster_decoders[] = { +- {"name", offsetof(struct rpc_bdev_rbd_unregister_cluster, name), spdk_json_decode_string}, +-}; +- +-static void 
+-rpc_bdev_rbd_unregister_cluster(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_rbd_unregister_cluster req = {NULL}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_rbd_unregister_cluster_decoders, +- SPDK_COUNTOF(rpc_bdev_rbd_unregister_cluster_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_rbd_unregister_cluster(req.name); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- +-cleanup: +- free_rpc_bdev_cluster_unregister(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_unregister_cluster", rpc_bdev_rbd_unregister_cluster, SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_rbd_get_cluster_info { +- char *name; +-}; +- +-static void +-free_rpc_bdev_rbd_get_cluster_info(struct rpc_bdev_rbd_get_cluster_info *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_bdev_rbd_get_cluster_info_decoders[] = { +- {"name", offsetof(struct rpc_bdev_rbd_get_cluster_info, name), spdk_json_decode_string, true}, +-}; +- +-static void +-rpc_bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_rbd_get_cluster_info req = {NULL}; +- int rc; +- +- if (params && spdk_json_decode_object(params, rpc_bdev_rbd_get_cluster_info_decoders, +- SPDK_COUNTOF(rpc_bdev_rbd_get_cluster_info_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_rbd_get_clusters_info(request, req.name); +- if (rc) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto cleanup; +- } +- +-cleanup: +- free_rpc_bdev_rbd_get_cluster_info(&req); +-} +-SPDK_RPC_REGISTER("bdev_rbd_get_clusters_info", rpc_bdev_rbd_get_clusters_info, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "bdev_rbd.h" ++#include "spdk/util.h" ++#include "spdk/uuid.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++ ++struct rpc_create_rbd { ++ char *name; ++ char *user_id; ++ char *pool_name; ++ char *rbd_name; ++ uint32_t block_size; ++ char **config; ++ char *cluster_name; ++ char *uuid; ++}; ++ ++static void ++free_rpc_create_rbd(struct rpc_create_rbd *req) ++{ ++ free(req->name); ++ free(req->user_id); ++ free(req->pool_name); ++ free(req->rbd_name); ++ bdev_rbd_free_config(req->config); ++ free(req->cluster_name); ++ free(req->uuid); ++} ++ ++static int ++bdev_rbd_decode_config(const struct spdk_json_val *values, void *out) ++{ ++ char ***map = out; ++ char **entry; ++ uint32_t i; ++ ++ if (values->type == SPDK_JSON_VAL_NULL) { ++ /* treated like empty object: empty config */ ++ *map = calloc(1, sizeof(**map)); ++ if (!*map) { ++ return -1; ++ } ++ return 0; ++ } ++ ++ if (values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { ++ return -1; ++ } ++ ++ *map = calloc(values->len + 1, sizeof(**map)); ++ if (!*map) { ++ return -1; ++ } ++ ++ for (i = 0, entry = *map; i < values->len;) { ++ const struct spdk_json_val *name = &values[i + 1]; ++ const struct spdk_json_val *v = &values[i + 2]; ++ /* Here we catch errors like invalid types. 
*/ ++ if (!(entry[0] = spdk_json_strdup(name)) || ++ !(entry[1] = spdk_json_strdup(v))) { ++ bdev_rbd_free_config(*map); ++ *map = NULL; ++ return -1; ++ } ++ i += 1 + spdk_json_val_len(v); ++ entry += 2; ++ } ++ ++ return 0; ++} ++ ++static const struct spdk_json_object_decoder rpc_create_rbd_decoders[] = { ++ {"name", offsetof(struct rpc_create_rbd, name), spdk_json_decode_string, true}, ++ {"user_id", offsetof(struct rpc_create_rbd, user_id), spdk_json_decode_string, true}, ++ {"pool_name", offsetof(struct rpc_create_rbd, pool_name), spdk_json_decode_string}, ++ {"rbd_name", offsetof(struct rpc_create_rbd, rbd_name), spdk_json_decode_string}, ++ {"block_size", offsetof(struct rpc_create_rbd, block_size), spdk_json_decode_uint32}, ++ {"config", offsetof(struct rpc_create_rbd, config), bdev_rbd_decode_config, true}, ++ {"cluster_name", offsetof(struct rpc_create_rbd, cluster_name), spdk_json_decode_string, true}, ++ {"uuid", offsetof(struct rpc_create_rbd, uuid), spdk_json_decode_string, true} ++}; ++ ++static void ++rpc_bdev_rbd_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_create_rbd req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_bdev *bdev; ++ int rc = 0; ++ struct spdk_uuid *uuid = NULL; ++ struct spdk_uuid decoded_uuid; ++ ++ if (spdk_json_decode_object(params, rpc_create_rbd_decoders, ++ SPDK_COUNTOF(rpc_create_rbd_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(bdev_rbd, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ if (req.uuid) { ++ if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, ++ "Failed to parse bdev UUID"); ++ goto cleanup; ++ } ++ uuid = &decoded_uuid; ++ } ++ ++ rc = bdev_rbd_create(&bdev, req.name, req.user_id, req.pool_name, ++ (const char *const *)req.config, ++ req.rbd_name, ++ req.block_size, req.cluster_name, uuid); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, spdk_bdev_get_name(bdev)); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_create_rbd(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_create", rpc_bdev_rbd_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_rbd_delete { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_rbd_delete(struct rpc_bdev_rbd_delete *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_rbd_delete_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_rbd_delete, name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_bdev_rbd_delete_cb(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (bdeverrno == 0) { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); ++ } ++} ++ ++static void ++rpc_bdev_rbd_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_rbd_delete req = {NULL}; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_rbd_delete_decoders, ++ SPDK_COUNTOF(rpc_bdev_rbd_delete_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev_rbd_delete(req.name, _rpc_bdev_rbd_delete_cb, request); ++ ++cleanup: ++ 
free_rpc_bdev_rbd_delete(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_delete", rpc_bdev_rbd_delete, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_rbd_resize { ++ char *name; ++ uint64_t new_size; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_rbd_resize_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_rbd_resize, name), spdk_json_decode_string}, ++ {"new_size", offsetof(struct rpc_bdev_rbd_resize, new_size), spdk_json_decode_uint64} ++}; ++ ++static void ++free_rpc_bdev_rbd_resize(struct rpc_bdev_rbd_resize *req) ++{ ++ free(req->name); ++} ++ ++static void ++rpc_bdev_rbd_resize(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_rbd_resize req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_rbd_resize_decoders, ++ SPDK_COUNTOF(rpc_bdev_rbd_resize_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_rbd_resize(req.name, req.new_size); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++cleanup: ++ free_rpc_bdev_rbd_resize(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_resize", rpc_bdev_rbd_resize, SPDK_RPC_RUNTIME) ++ ++static void ++free_rpc_register_cluster(struct cluster_register_info *req) ++{ ++ free(req->name); ++ free(req->user_id); ++ bdev_rbd_free_config(req->config_param); ++ free(req->config_file); ++ free(req->key_file); ++} ++ ++static const struct spdk_json_object_decoder rpc_register_cluster_decoders[] = { ++ {"name", offsetof(struct cluster_register_info, name), spdk_json_decode_string, true}, ++ {"user_id", offsetof(struct cluster_register_info, user_id), spdk_json_decode_string, true}, ++ {"config_param", offsetof(struct cluster_register_info, config_param), bdev_rbd_decode_config, true}, ++ {"config_file", offsetof(struct cluster_register_info, config_file), spdk_json_decode_string, true}, ++ {"key_file", offsetof(struct cluster_register_info, key_file), spdk_json_decode_string, true} ++}; ++ ++static void ++rpc_bdev_rbd_register_cluster(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct cluster_register_info req = {}; ++ int rc = 0; ++ struct spdk_json_write_ctx *w; ++ ++ if (spdk_json_decode_object(params, rpc_register_cluster_decoders, ++ SPDK_COUNTOF(rpc_register_cluster_decoders), ++ &req)) { ++ SPDK_DEBUGLOG(bdev_rbd, "spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_rbd_register_cluster(&req); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, req.name); ++ spdk_jsonrpc_end_result(request, w); ++cleanup: ++ free_rpc_register_cluster(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_register_cluster", rpc_bdev_rbd_register_cluster, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_rbd_unregister_cluster { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_cluster_unregister(struct rpc_bdev_rbd_unregister_cluster *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_rbd_unregister_cluster_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_rbd_unregister_cluster, name), spdk_json_decode_string}, ++}; ++ ++static void 
++rpc_bdev_rbd_unregister_cluster(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_rbd_unregister_cluster req = {NULL}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_rbd_unregister_cluster_decoders, ++ SPDK_COUNTOF(rpc_bdev_rbd_unregister_cluster_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_rbd_unregister_cluster(req.name); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ ++cleanup: ++ free_rpc_bdev_cluster_unregister(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_unregister_cluster", rpc_bdev_rbd_unregister_cluster, SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_rbd_get_cluster_info { ++ char *name; ++}; ++ ++static void ++free_rpc_bdev_rbd_get_cluster_info(struct rpc_bdev_rbd_get_cluster_info *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_bdev_rbd_get_cluster_info_decoders[] = { ++ {"name", offsetof(struct rpc_bdev_rbd_get_cluster_info, name), spdk_json_decode_string, true}, ++}; ++ ++static void ++rpc_bdev_rbd_get_clusters_info(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_rbd_get_cluster_info req = {NULL}; ++ int rc; ++ ++ if (params && spdk_json_decode_object(params, rpc_bdev_rbd_get_cluster_info_decoders, ++ SPDK_COUNTOF(rpc_bdev_rbd_get_cluster_info_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_rbd_get_clusters_info(request, req.name); ++ if (rc) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++cleanup: ++ free_rpc_bdev_rbd_get_cluster_info(&req); ++} ++SPDK_RPC_REGISTER("bdev_rbd_get_clusters_info", rpc_bdev_rbd_get_clusters_info, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/split/Makefile b/module/bdev/split/Makefile +index a714564..37a40cf 100644 +--- a/module/bdev/split/Makefile ++++ b/module/bdev/split/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = vbdev_split.c vbdev_split_rpc.c +-LIBNAME = bdev_split +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = vbdev_split.c vbdev_split_rpc.c ++LIBNAME = bdev_split ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/split/vbdev_split.c b/module/bdev/split/vbdev_split.c +index 44c5d7d..9cbc744 100644 +--- a/module/bdev/split/vbdev_split.c ++++ b/module/bdev/split/vbdev_split.c +@@ -1,495 +1,495 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- */ +- +-/* +- * This is a simple example of a virtual block device that takes a single +- * bdev and slices it into multiple smaller bdevs. 
+- */ +- +-#include "vbdev_split.h" +- +-#include "spdk/rpc.h" +-#include "spdk/endian.h" +-#include "spdk/string.h" +-#include "spdk/thread.h" +-#include "spdk/util.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +- +-struct spdk_vbdev_split_config { +- char *base_bdev; +- unsigned split_count; +- uint64_t split_size_mb; +- +- SPDK_BDEV_PART_TAILQ splits; +- struct spdk_bdev_part_base *split_base; +- +- TAILQ_ENTRY(spdk_vbdev_split_config) tailq; +-}; +- +-static TAILQ_HEAD(, spdk_vbdev_split_config) g_split_config = TAILQ_HEAD_INITIALIZER( +- g_split_config); +- +-struct vbdev_split_channel { +- struct spdk_bdev_part_channel part_ch; +-}; +- +-struct vbdev_split_bdev_io { +- struct spdk_io_channel *ch; +- struct spdk_bdev_io *bdev_io; +- +- /* for bdev_io_wait */ +- struct spdk_bdev_io_wait_entry bdev_io_wait; +-}; +- +-static void vbdev_split_del_config(struct spdk_vbdev_split_config *cfg); +- +-static int vbdev_split_init(void); +-static void vbdev_split_fini(void); +-static void vbdev_split_examine(struct spdk_bdev *bdev); +-static int vbdev_split_config_json(struct spdk_json_write_ctx *w); +-static int vbdev_split_get_ctx_size(void); +- +-static void _vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); +- +-static struct spdk_bdev_module split_if = { +- .name = "split", +- .module_init = vbdev_split_init, +- .module_fini = vbdev_split_fini, +- .get_ctx_size = vbdev_split_get_ctx_size, +- .examine_config = vbdev_split_examine, +- .config_json = vbdev_split_config_json, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(split, &split_if) +- +-static void +-vbdev_split_base_free(void *ctx) +-{ +- struct spdk_vbdev_split_config *cfg = ctx; +- +- vbdev_split_del_config(cfg); +-} +- +-static int +-_vbdev_split_destruct(void *ctx) +-{ +- struct spdk_bdev_part *part = ctx; +- +- return spdk_bdev_part_free(part); +-} +- +-static void +-vbdev_split_base_bdev_hotremove_cb(void *_part_base) +-{ +- struct spdk_bdev_part_base *part_base = _part_base; +- struct spdk_vbdev_split_config *cfg = spdk_bdev_part_base_get_ctx(part_base); +- +- spdk_bdev_part_base_hotremove(part_base, &cfg->splits); +-} +- +-static void +-vbdev_split_resubmit_io(void *arg) +-{ +- struct vbdev_split_bdev_io *split_io = (struct vbdev_split_bdev_io *)arg; +- +- _vbdev_split_submit_request(split_io->ch, split_io->bdev_io); +-} +- +-static void +-vbdev_split_queue_io(struct vbdev_split_bdev_io *split_io) +-{ +- struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(split_io->ch); +- int rc; +- +- split_io->bdev_io_wait.bdev = split_io->bdev_io->bdev; +- split_io->bdev_io_wait.cb_fn = vbdev_split_resubmit_io; +- split_io->bdev_io_wait.cb_arg = split_io; +- +- rc = spdk_bdev_queue_io_wait(split_io->bdev_io->bdev, +- ch->part_ch.base_ch, &split_io->bdev_io_wait); +- if (rc != 0) { +- SPDK_ERRLOG("Queue io failed in vbdev_split_queue_io, rc=%d\n", rc); +- spdk_bdev_io_complete(split_io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +-{ +- struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(_ch); +- struct vbdev_split_bdev_io *io_ctx = (struct vbdev_split_bdev_io *)bdev_io->driver_ctx; +- int rc; +- +- rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); +- if (rc) { +- if (rc == -ENOMEM) { +- SPDK_DEBUGLOG(vbdev_split, "split: no memory, queue io.\n"); +- io_ctx->ch = _ch; +- io_ctx->bdev_io = bdev_io; +- vbdev_split_queue_io(io_ctx); +- } else { +- SPDK_ERRLOG("split: error on io 
submission, rc=%d.\n", rc); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- } +-} +- +-static void +-vbdev_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- _vbdev_split_submit_request(ch, bdev_io); +-} +- +-static void +-vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, vbdev_split_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- default: +- _vbdev_split_submit_request(_ch, bdev_io); +- break; +- } +-} +- +-static int +-vbdev_split_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct spdk_bdev_part *part = ctx; +- struct spdk_bdev *split_base_bdev = spdk_bdev_part_get_base_bdev(part); +- uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(part); +- +- spdk_json_write_named_object_begin(w, "split"); +- +- spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(split_base_bdev)); +- spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks); +- +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-static void +-vbdev_split_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- /* No config per bdev needed */ +-} +- +-static struct spdk_bdev_fn_table vbdev_split_fn_table = { +- .destruct = _vbdev_split_destruct, +- .submit_request = vbdev_split_submit_request, +- .dump_info_json = vbdev_split_dump_info_json, +- .write_config_json = vbdev_split_write_config_json +-}; +- +-static int +-vbdev_split_create(struct spdk_vbdev_split_config *cfg) +-{ +- uint64_t split_size_blocks, offset_blocks; +- uint64_t split_count, max_split_count; +- uint64_t mb = 1024 * 1024; +- uint64_t i; +- int rc; +- char *name; +- struct spdk_bdev *base_bdev; +- struct bdev_part_tailq *split_base_tailq; +- +- assert(cfg->split_count > 0); +- +- TAILQ_INIT(&cfg->splits); +- rc = spdk_bdev_part_base_construct_ext(cfg->base_bdev, +- vbdev_split_base_bdev_hotremove_cb, +- &split_if, &vbdev_split_fn_table, +- &cfg->splits, vbdev_split_base_free, cfg, +- sizeof(struct vbdev_split_channel), +- NULL, NULL, &cfg->split_base); +- if (rc != 0) { +- if (rc != -ENODEV) { +- SPDK_ERRLOG("Cannot construct bdev part base\n"); +- } +- return rc; +- } +- +- base_bdev = spdk_bdev_part_base_get_bdev(cfg->split_base); +- +- if (cfg->split_size_mb) { +- if (((cfg->split_size_mb * mb) % base_bdev->blocklen) != 0) { +- SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size " +- "%" PRIu32 "\n", +- cfg->split_size_mb, base_bdev->blocklen); +- rc = -EINVAL; +- goto err; +- } +- split_size_blocks = (cfg->split_size_mb * mb) / base_bdev->blocklen; +- SPDK_DEBUGLOG(vbdev_split, "Split size %" PRIu64 " MB specified by user\n", +- cfg->split_size_mb); +- } else { +- split_size_blocks = base_bdev->blockcnt / cfg->split_count; +- SPDK_DEBUGLOG(vbdev_split, "Split size not specified by user\n"); +- } +- +- max_split_count = base_bdev->blockcnt / split_size_blocks; +- split_count = cfg->split_count; +- if (split_count > max_split_count) { +- SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count " +- "%" PRIu64 " - clamping\n", split_count, max_split_count); +- split_count = max_split_count; +- } +- +- SPDK_DEBUGLOG(vbdev_split, "base_bdev: %s split_count: %" PRIu64 +- " split_size_blocks: %" PRIu64 "\n", +- 
cfg->base_bdev, split_count, split_size_blocks); +- +- offset_blocks = 0; +- for (i = 0; i < split_count; i++) { +- struct spdk_bdev_part *d; +- +- d = calloc(1, sizeof(*d)); +- if (d == NULL) { +- SPDK_ERRLOG("could not allocate bdev part\n"); +- rc = -ENOMEM; +- goto err; +- } +- +- name = spdk_sprintf_alloc("%sp%" PRIu64, cfg->base_bdev, i); +- if (!name) { +- SPDK_ERRLOG("could not allocate name\n"); +- free(d); +- rc = -ENOMEM; +- goto err; +- } +- +- rc = spdk_bdev_part_construct(d, cfg->split_base, name, offset_blocks, split_size_blocks, +- "Split Disk"); +- free(name); +- if (rc) { +- SPDK_ERRLOG("could not construct bdev part\n"); +- /* spdk_bdev_part_construct will free name if it fails */ +- free(d); +- rc = -ENOMEM; +- goto err; +- } +- +- offset_blocks += split_size_blocks; +- } +- +- return 0; +-err: +- split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); +- spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); +- spdk_bdev_part_base_free(cfg->split_base); +- return rc; +-} +- +-static void +-vbdev_split_del_config(struct spdk_vbdev_split_config *cfg) +-{ +- TAILQ_REMOVE(&g_split_config, cfg, tailq); +- free(cfg->base_bdev); +- free(cfg); +-} +- +-static void +-vbdev_split_destruct_config(struct spdk_vbdev_split_config *cfg) +-{ +- struct bdev_part_tailq *split_base_tailq; +- +- if (cfg->split_base != NULL) { +- split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); +- spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); +- } else { +- vbdev_split_del_config(cfg); +- } +-} +- +-static void +-vbdev_split_clear_config(void) +-{ +- struct spdk_vbdev_split_config *cfg, *tmp_cfg; +- +- TAILQ_FOREACH_SAFE(cfg, &g_split_config, tailq, tmp_cfg) { +- vbdev_split_destruct_config(cfg); +- } +-} +- +-static struct spdk_vbdev_split_config * +-vbdev_split_config_find_by_base_name(const char *base_bdev_name) +-{ +- struct spdk_vbdev_split_config *cfg; +- +- TAILQ_FOREACH(cfg, &g_split_config, tailq) { +- if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { +- return cfg; +- } +- } +- +- return NULL; +-} +- +-static int +-vbdev_split_add_config(const char *base_bdev_name, unsigned split_count, uint64_t split_size, +- struct spdk_vbdev_split_config **config) +-{ +- struct spdk_vbdev_split_config *cfg; +- assert(base_bdev_name); +- +- if (base_bdev_name == NULL) { +- SPDK_ERRLOG("Split bdev config: no base bdev provided."); +- return -EINVAL; +- } +- +- if (split_count == 0) { +- SPDK_ERRLOG("Split bdev config: split_count can't be 0."); +- return -EINVAL; +- } +- +- /* Check if we already have 'base_bdev_name' registered in config */ +- cfg = vbdev_split_config_find_by_base_name(base_bdev_name); +- if (cfg) { +- SPDK_ERRLOG("Split bdev config for base bdev '%s' already exist.", base_bdev_name); +- return -EEXIST; +- } +- +- cfg = calloc(1, sizeof(*cfg)); +- if (!cfg) { +- SPDK_ERRLOG("calloc(): Out of memory"); +- return -ENOMEM; +- } +- +- cfg->base_bdev = strdup(base_bdev_name); +- if (!cfg->base_bdev) { +- SPDK_ERRLOG("strdup(): Out of memory"); +- free(cfg); +- return -ENOMEM; +- } +- +- cfg->split_count = split_count; +- cfg->split_size_mb = split_size; +- TAILQ_INSERT_TAIL(&g_split_config, cfg, tailq); +- if (config) { +- *config = cfg; +- } +- +- return 0; +-} +- +-static int +-vbdev_split_init(void) +-{ +- return 0; +-} +- +-static void +-vbdev_split_fini(void) +-{ +- vbdev_split_clear_config(); +-} +- +-static void +-vbdev_split_examine(struct spdk_bdev *bdev) +-{ +- struct spdk_vbdev_split_config *cfg = 
vbdev_split_config_find_by_base_name(bdev->name); +- +- if (cfg != NULL) { +- assert(cfg->split_base == NULL); +- +- if (vbdev_split_create(cfg)) { +- SPDK_ERRLOG("could not split bdev %s\n", bdev->name); +- } +- } +- spdk_bdev_module_examine_done(&split_if); +-} +- +-static int +-vbdev_split_config_json(struct spdk_json_write_ctx *w) +-{ +- struct spdk_vbdev_split_config *cfg; +- +- TAILQ_FOREACH(cfg, &g_split_config, tailq) { +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_split_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "base_bdev", cfg->base_bdev); +- spdk_json_write_named_uint32(w, "split_count", cfg->split_count); +- spdk_json_write_named_uint64(w, "split_size_mb", cfg->split_size_mb); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +- } +- +- return 0; +-} +- +-int +-create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb) +-{ +- int rc; +- struct spdk_vbdev_split_config *cfg; +- +- rc = vbdev_split_add_config(base_bdev_name, split_count, split_size_mb, &cfg); +- if (rc) { +- return rc; +- } +- +- rc = vbdev_split_create(cfg); +- if (rc == -ENODEV) { +- /* It is ok if base bdev does not exist yet. */ +- rc = 0; +- } +- +- return rc; +-} +- +-int +-vbdev_split_destruct(const char *base_bdev_name) +-{ +- struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(base_bdev_name); +- +- if (!cfg) { +- SPDK_ERRLOG("Split configuration for '%s' not found\n", base_bdev_name); +- return -ENOENT; +- } +- +- vbdev_split_destruct_config(cfg); +- return 0; +-} +- +-struct spdk_bdev_part_base * +-vbdev_split_get_part_base(struct spdk_bdev *bdev) +-{ +- struct spdk_vbdev_split_config *cfg; +- +- cfg = vbdev_split_config_find_by_base_name(spdk_bdev_get_name(bdev)); +- +- if (cfg == NULL) { +- return NULL; +- } +- +- return cfg->split_base; +-} +- +-/* +- * During init we'll be asked how much memory we'd like passed to us +- * in bev_io structures as context. Here's where we specify how +- * much context we want per IO. +- */ +-static int +-vbdev_split_get_ctx_size(void) +-{ +- return sizeof(struct vbdev_split_bdev_io); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vbdev_split) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++/* ++ * This is a simple example of a virtual block device that takes a single ++ * bdev and slices it into multiple smaller bdevs. 
++ */ ++ ++#include "vbdev_split.h" ++ ++#include "spdk/rpc.h" ++#include "spdk/endian.h" ++#include "spdk/string.h" ++#include "spdk/thread.h" ++#include "spdk/util.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++ ++struct spdk_vbdev_split_config { ++ char *base_bdev; ++ unsigned split_count; ++ uint64_t split_size_mb; ++ ++ SPDK_BDEV_PART_TAILQ splits; ++ struct spdk_bdev_part_base *split_base; ++ ++ TAILQ_ENTRY(spdk_vbdev_split_config) tailq; ++}; ++ ++static TAILQ_HEAD(, spdk_vbdev_split_config) g_split_config = TAILQ_HEAD_INITIALIZER( ++ g_split_config); ++ ++struct vbdev_split_channel { ++ struct spdk_bdev_part_channel part_ch; ++}; ++ ++struct vbdev_split_bdev_io { ++ struct spdk_io_channel *ch; ++ struct spdk_bdev_io *bdev_io; ++ ++ /* for bdev_io_wait */ ++ struct spdk_bdev_io_wait_entry bdev_io_wait; ++}; ++ ++static void vbdev_split_del_config(struct spdk_vbdev_split_config *cfg); ++ ++static int vbdev_split_init(void); ++static void vbdev_split_fini(void); ++static void vbdev_split_examine(struct spdk_bdev *bdev); ++static int vbdev_split_config_json(struct spdk_json_write_ctx *w); ++static int vbdev_split_get_ctx_size(void); ++ ++static void _vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); ++ ++static struct spdk_bdev_module split_if = { ++ .name = "split", ++ .module_init = vbdev_split_init, ++ .module_fini = vbdev_split_fini, ++ .get_ctx_size = vbdev_split_get_ctx_size, ++ .examine_config = vbdev_split_examine, ++ .config_json = vbdev_split_config_json, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(split, &split_if) ++ ++static void ++vbdev_split_base_free(void *ctx) ++{ ++ struct spdk_vbdev_split_config *cfg = ctx; ++ ++ vbdev_split_del_config(cfg); ++} ++ ++static int ++_vbdev_split_destruct(void *ctx) ++{ ++ struct spdk_bdev_part *part = ctx; ++ ++ return spdk_bdev_part_free(part); ++} ++ ++static void ++vbdev_split_base_bdev_hotremove_cb(void *_part_base) ++{ ++ struct spdk_bdev_part_base *part_base = _part_base; ++ struct spdk_vbdev_split_config *cfg = spdk_bdev_part_base_get_ctx(part_base); ++ ++ spdk_bdev_part_base_hotremove(part_base, &cfg->splits); ++} ++ ++static void ++vbdev_split_resubmit_io(void *arg) ++{ ++ struct vbdev_split_bdev_io *split_io = (struct vbdev_split_bdev_io *)arg; ++ ++ _vbdev_split_submit_request(split_io->ch, split_io->bdev_io); ++} ++ ++static void ++vbdev_split_queue_io(struct vbdev_split_bdev_io *split_io) ++{ ++ struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(split_io->ch); ++ int rc; ++ ++ split_io->bdev_io_wait.bdev = split_io->bdev_io->bdev; ++ split_io->bdev_io_wait.cb_fn = vbdev_split_resubmit_io; ++ split_io->bdev_io_wait.cb_arg = split_io; ++ ++ rc = spdk_bdev_queue_io_wait(split_io->bdev_io->bdev, ++ ch->part_ch.base_ch, &split_io->bdev_io_wait); ++ if (rc != 0) { ++ SPDK_ERRLOG("Queue io failed in vbdev_split_queue_io, rc=%d\n", rc); ++ spdk_bdev_io_complete(split_io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++_vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(_ch); ++ struct vbdev_split_bdev_io *io_ctx = (struct vbdev_split_bdev_io *)bdev_io->driver_ctx; ++ int rc; ++ ++ rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); ++ if (rc) { ++ if (rc == -ENOMEM) { ++ SPDK_DEBUGLOG(vbdev_split, "split: no memory, queue io.\n"); ++ io_ctx->ch = _ch; ++ io_ctx->bdev_io = bdev_io; ++ vbdev_split_queue_io(io_ctx); ++ } else { ++ SPDK_ERRLOG("split: error on io 
submission, rc=%d.\n", rc); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ } ++} ++ ++static void ++vbdev_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ _vbdev_split_submit_request(ch, bdev_io); ++} ++ ++static void ++vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, vbdev_split_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ default: ++ _vbdev_split_submit_request(_ch, bdev_io); ++ break; ++ } ++} ++ ++static int ++vbdev_split_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct spdk_bdev_part *part = ctx; ++ struct spdk_bdev *split_base_bdev = spdk_bdev_part_get_base_bdev(part); ++ uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(part); ++ ++ spdk_json_write_named_object_begin(w, "split"); ++ ++ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(split_base_bdev)); ++ spdk_json_write_named_uint64(w, "offset_blocks", offset_blocks); ++ ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++static void ++vbdev_split_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ /* No config per bdev needed */ ++} ++ ++static struct spdk_bdev_fn_table vbdev_split_fn_table = { ++ .destruct = _vbdev_split_destruct, ++ .submit_request = vbdev_split_submit_request, ++ .dump_info_json = vbdev_split_dump_info_json, ++ .write_config_json = vbdev_split_write_config_json ++}; ++ ++static int ++vbdev_split_create(struct spdk_vbdev_split_config *cfg) ++{ ++ uint64_t split_size_blocks, offset_blocks; ++ uint64_t split_count, max_split_count; ++ uint64_t mb = 1024 * 1024; ++ uint64_t i; ++ int rc; ++ char *name; ++ struct spdk_bdev *base_bdev; ++ struct bdev_part_tailq *split_base_tailq; ++ ++ assert(cfg->split_count > 0); ++ ++ TAILQ_INIT(&cfg->splits); ++ rc = spdk_bdev_part_base_construct_ext(cfg->base_bdev, ++ vbdev_split_base_bdev_hotremove_cb, ++ &split_if, &vbdev_split_fn_table, ++ &cfg->splits, vbdev_split_base_free, cfg, ++ sizeof(struct vbdev_split_channel), ++ NULL, NULL, &cfg->split_base); ++ if (rc != 0) { ++ if (rc != -ENODEV) { ++ SPDK_ERRLOG("Cannot construct bdev part base\n"); ++ } ++ return rc; ++ } ++ ++ base_bdev = spdk_bdev_part_base_get_bdev(cfg->split_base); ++ ++ if (cfg->split_size_mb) { ++ if (((cfg->split_size_mb * mb) % base_bdev->blocklen) != 0) { ++ SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size " ++ "%" PRIu32 "\n", ++ cfg->split_size_mb, base_bdev->blocklen); ++ rc = -EINVAL; ++ goto err; ++ } ++ split_size_blocks = (cfg->split_size_mb * mb) / base_bdev->blocklen; ++ SPDK_DEBUGLOG(vbdev_split, "Split size %" PRIu64 " MB specified by user\n", ++ cfg->split_size_mb); ++ } else { ++ split_size_blocks = base_bdev->blockcnt / cfg->split_count; ++ SPDK_DEBUGLOG(vbdev_split, "Split size not specified by user\n"); ++ } ++ ++ max_split_count = base_bdev->blockcnt / split_size_blocks; ++ split_count = cfg->split_count; ++ if (split_count > max_split_count) { ++ SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count " ++ "%" PRIu64 " - clamping\n", split_count, max_split_count); ++ split_count = max_split_count; ++ } ++ ++ SPDK_DEBUGLOG(vbdev_split, "base_bdev: %s split_count: %" PRIu64 ++ " split_size_blocks: %" PRIu64 "\n", ++ 
cfg->base_bdev, split_count, split_size_blocks); ++ ++ offset_blocks = 0; ++ for (i = 0; i < split_count; i++) { ++ struct spdk_bdev_part *d; ++ ++ d = calloc(1, sizeof(*d)); ++ if (d == NULL) { ++ SPDK_ERRLOG("could not allocate bdev part\n"); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ name = spdk_sprintf_alloc("%sp%" PRIu64, cfg->base_bdev, i); ++ if (!name) { ++ SPDK_ERRLOG("could not allocate name\n"); ++ free(d); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ rc = spdk_bdev_part_construct(d, cfg->split_base, name, offset_blocks, split_size_blocks, ++ "Split Disk"); ++ free(name); ++ if (rc) { ++ SPDK_ERRLOG("could not construct bdev part\n"); ++ /* spdk_bdev_part_construct will free name if it fails */ ++ free(d); ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ offset_blocks += split_size_blocks; ++ } ++ ++ return 0; ++err: ++ split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); ++ spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); ++ spdk_bdev_part_base_free(cfg->split_base); ++ return rc; ++} ++ ++static void ++vbdev_split_del_config(struct spdk_vbdev_split_config *cfg) ++{ ++ TAILQ_REMOVE(&g_split_config, cfg, tailq); ++ free(cfg->base_bdev); ++ free(cfg); ++} ++ ++static void ++vbdev_split_destruct_config(struct spdk_vbdev_split_config *cfg) ++{ ++ struct bdev_part_tailq *split_base_tailq; ++ ++ if (cfg->split_base != NULL) { ++ split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); ++ spdk_bdev_part_base_hotremove(cfg->split_base, split_base_tailq); ++ } else { ++ vbdev_split_del_config(cfg); ++ } ++} ++ ++static void ++vbdev_split_clear_config(void) ++{ ++ struct spdk_vbdev_split_config *cfg, *tmp_cfg; ++ ++ TAILQ_FOREACH_SAFE(cfg, &g_split_config, tailq, tmp_cfg) { ++ vbdev_split_destruct_config(cfg); ++ } ++} ++ ++static struct spdk_vbdev_split_config * ++vbdev_split_config_find_by_base_name(const char *base_bdev_name) ++{ ++ struct spdk_vbdev_split_config *cfg; ++ ++ TAILQ_FOREACH(cfg, &g_split_config, tailq) { ++ if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { ++ return cfg; ++ } ++ } ++ ++ return NULL; ++} ++ ++static int ++vbdev_split_add_config(const char *base_bdev_name, unsigned split_count, uint64_t split_size, ++ struct spdk_vbdev_split_config **config) ++{ ++ struct spdk_vbdev_split_config *cfg; ++ assert(base_bdev_name); ++ ++ if (base_bdev_name == NULL) { ++ SPDK_ERRLOG("Split bdev config: no base bdev provided."); ++ return -EINVAL; ++ } ++ ++ if (split_count == 0) { ++ SPDK_ERRLOG("Split bdev config: split_count can't be 0."); ++ return -EINVAL; ++ } ++ ++ /* Check if we already have 'base_bdev_name' registered in config */ ++ cfg = vbdev_split_config_find_by_base_name(base_bdev_name); ++ if (cfg) { ++ SPDK_ERRLOG("Split bdev config for base bdev '%s' already exist.", base_bdev_name); ++ return -EEXIST; ++ } ++ ++ cfg = calloc(1, sizeof(*cfg)); ++ if (!cfg) { ++ SPDK_ERRLOG("calloc(): Out of memory"); ++ return -ENOMEM; ++ } ++ ++ cfg->base_bdev = strdup(base_bdev_name); ++ if (!cfg->base_bdev) { ++ SPDK_ERRLOG("strdup(): Out of memory"); ++ free(cfg); ++ return -ENOMEM; ++ } ++ ++ cfg->split_count = split_count; ++ cfg->split_size_mb = split_size; ++ TAILQ_INSERT_TAIL(&g_split_config, cfg, tailq); ++ if (config) { ++ *config = cfg; ++ } ++ ++ return 0; ++} ++ ++static int ++vbdev_split_init(void) ++{ ++ return 0; ++} ++ ++static void ++vbdev_split_fini(void) ++{ ++ vbdev_split_clear_config(); ++} ++ ++static void ++vbdev_split_examine(struct spdk_bdev *bdev) ++{ ++ struct spdk_vbdev_split_config *cfg = 
vbdev_split_config_find_by_base_name(bdev->name); ++ ++ if (cfg != NULL) { ++ assert(cfg->split_base == NULL); ++ ++ if (vbdev_split_create(cfg)) { ++ SPDK_ERRLOG("could not split bdev %s\n", bdev->name); ++ } ++ } ++ spdk_bdev_module_examine_done(&split_if); ++} ++ ++static int ++vbdev_split_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct spdk_vbdev_split_config *cfg; ++ ++ TAILQ_FOREACH(cfg, &g_split_config, tailq) { ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_split_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "base_bdev", cfg->base_bdev); ++ spdk_json_write_named_uint32(w, "split_count", cfg->split_count); ++ spdk_json_write_named_uint64(w, "split_size_mb", cfg->split_size_mb); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ } ++ ++ return 0; ++} ++ ++int ++create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb) ++{ ++ int rc; ++ struct spdk_vbdev_split_config *cfg; ++ ++ rc = vbdev_split_add_config(base_bdev_name, split_count, split_size_mb, &cfg); ++ if (rc) { ++ return rc; ++ } ++ ++ rc = vbdev_split_create(cfg); ++ if (rc == -ENODEV) { ++ /* It is ok if base bdev does not exist yet. */ ++ rc = 0; ++ } ++ ++ return rc; ++} ++ ++int ++vbdev_split_destruct(const char *base_bdev_name) ++{ ++ struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(base_bdev_name); ++ ++ if (!cfg) { ++ SPDK_ERRLOG("Split configuration for '%s' not found\n", base_bdev_name); ++ return -ENOENT; ++ } ++ ++ vbdev_split_destruct_config(cfg); ++ return 0; ++} ++ ++struct spdk_bdev_part_base * ++vbdev_split_get_part_base(struct spdk_bdev *bdev) ++{ ++ struct spdk_vbdev_split_config *cfg; ++ ++ cfg = vbdev_split_config_find_by_base_name(spdk_bdev_get_name(bdev)); ++ ++ if (cfg == NULL) { ++ return NULL; ++ } ++ ++ return cfg->split_base; ++} ++ ++/* ++ * During init we'll be asked how much memory we'd like passed to us ++ * in bev_io structures as context. Here's where we specify how ++ * much context we want per IO. ++ */ ++static int ++vbdev_split_get_ctx_size(void) ++{ ++ return sizeof(struct vbdev_split_bdev_io); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vbdev_split) +diff --git a/module/bdev/split/vbdev_split.h b/module/bdev/split/vbdev_split.h +index ff4fdab..41a6716 100644 +--- a/module/bdev/split/vbdev_split.h ++++ b/module/bdev/split/vbdev_split.h +@@ -1,40 +1,40 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_VBDEV_SPLIT_H +-#define SPDK_VBDEV_SPLIT_H +- +-#include "spdk/bdev_module.h" +- +-/** +- * Add given disk name to split config. If bdev with \c base_bdev_name name +- * exist the split bdevs will be created right away, if not the split bdevs will +- * be created when base bdev became be available (during examination process). +- * +- * \param base_bdev_name Base bdev name +- * \param split_count number of splits to be created. +- * \param split_size_mb size of each bdev. If 0 use base bdev size / split_count +- * \return value >= 0 - number of splits create. Negative errno code on error. +- */ +-int create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb); +- +-/** +- * Remove all created split bdevs and split config. +- * +- * \param base_bdev_name base bdev name +- * \return 0 on success or negative errno value. 
+- */ +-int vbdev_split_destruct(const char *base_bdev_name); +- +-/** +- * Get the spdk_bdev_part_base associated with the given split base_bdev. +- * +- * \param base_bdev Bdev to get the part_base from +- * \return pointer to the associated spdk_bdev_part_base +- * \return NULL if the base_bdev is not being split by the split module +- */ +-struct spdk_bdev_part_base *vbdev_split_get_part_base(struct spdk_bdev *base_bdev); +- +-#endif /* SPDK_VBDEV_SPLIT_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_VBDEV_SPLIT_H ++#define SPDK_VBDEV_SPLIT_H ++ ++#include "spdk/bdev_module.h" ++ ++/** ++ * Add given disk name to split config. If bdev with \c base_bdev_name name ++ * exist the split bdevs will be created right away, if not the split bdevs will ++ * be created when base bdev became be available (during examination process). ++ * ++ * \param base_bdev_name Base bdev name ++ * \param split_count number of splits to be created. ++ * \param split_size_mb size of each bdev. If 0 use base bdev size / split_count ++ * \return value >= 0 - number of splits create. Negative errno code on error. ++ */ ++int create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb); ++ ++/** ++ * Remove all created split bdevs and split config. ++ * ++ * \param base_bdev_name base bdev name ++ * \return 0 on success or negative errno value. ++ */ ++int vbdev_split_destruct(const char *base_bdev_name); ++ ++/** ++ * Get the spdk_bdev_part_base associated with the given split base_bdev. ++ * ++ * \param base_bdev Bdev to get the part_base from ++ * \return pointer to the associated spdk_bdev_part_base ++ * \return NULL if the base_bdev is not being split by the split module ++ */ ++struct spdk_bdev_part_base *vbdev_split_get_part_base(struct spdk_bdev *base_bdev); ++ ++#endif /* SPDK_VBDEV_SPLIT_H */ +diff --git a/module/bdev/split/vbdev_split_rpc.c b/module/bdev/split/vbdev_split_rpc.c +index e8af8ed..9fd40f7 100644 +--- a/module/bdev/split/vbdev_split_rpc.c ++++ b/module/bdev/split/vbdev_split_rpc.c +@@ -1,122 +1,122 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +- +-#include "vbdev_split.h" +-#include "spdk/log.h" +- +-struct rpc_construct_split { +- char *base_bdev; +- uint32_t split_count; +- uint64_t split_size_mb; +-}; +- +-static const struct spdk_json_object_decoder rpc_construct_split_decoders[] = { +- {"base_bdev", offsetof(struct rpc_construct_split, base_bdev), spdk_json_decode_string}, +- {"split_count", offsetof(struct rpc_construct_split, split_count), spdk_json_decode_uint32}, +- {"split_size_mb", offsetof(struct rpc_construct_split, split_size_mb), spdk_json_decode_uint64, true}, +-}; +- +-static void +-dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +-} +- +-static void +-rpc_bdev_split_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_construct_split req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_bdev_desc *base_desc; +- struct spdk_bdev *base_bdev; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_split_decoders, +- SPDK_COUNTOF(rpc_construct_split_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = create_vbdev_split(req.base_bdev, req.split_count, req.split_size_mb); +- if (rc < 0) { +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Failed to create %"PRIu32" split bdevs from '%s': %s", +- req.split_count, req.base_bdev, spdk_strerror(-rc)); +- goto out; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_array_begin(w); +- +- rc = spdk_bdev_open_ext(req.base_bdev, false, dummy_bdev_event_cb, NULL, &base_desc); +- if (rc == 0) { +- struct spdk_bdev_part_base *split_base; +- struct bdev_part_tailq *split_base_tailq; +- struct spdk_bdev_part *split_part; +- struct spdk_bdev *split_bdev; +- +- base_bdev = spdk_bdev_desc_get_bdev(base_desc); +- +- split_base = vbdev_split_get_part_base(base_bdev); +- +- assert(split_base != NULL); +- +- split_base_tailq = spdk_bdev_part_base_get_tailq(split_base); +- TAILQ_FOREACH(split_part, split_base_tailq, tailq) { +- split_bdev = spdk_bdev_part_get_bdev(split_part); +- spdk_json_write_string(w, spdk_bdev_get_name(split_bdev)); +- } +- +- spdk_bdev_close(base_desc); +- } +- +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(request, w); +- +-out: +- free(req.base_bdev); +-} +-SPDK_RPC_REGISTER("bdev_split_create", rpc_bdev_split_create, SPDK_RPC_RUNTIME) +- +-struct rpc_delete_split { +- char *base_bdev; +-}; +- +-static const struct spdk_json_object_decoder rpc_delete_split_decoders[] = { +- {"base_bdev", offsetof(struct rpc_delete_split, base_bdev), spdk_json_decode_string}, +-}; +- +-static void +-rpc_bdev_split_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_split req = {}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_delete_split_decoders, +- SPDK_COUNTOF(rpc_delete_split_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +- goto out; +- } +- +- rc = vbdev_split_destruct(req.base_bdev); +- if (rc < 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, 
true); +-out: +- free(req.base_bdev); +-} +-SPDK_RPC_REGISTER("bdev_split_delete", rpc_bdev_split_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++ ++#include "vbdev_split.h" ++#include "spdk/log.h" ++ ++struct rpc_construct_split { ++ char *base_bdev; ++ uint32_t split_count; ++ uint64_t split_size_mb; ++}; ++ ++static const struct spdk_json_object_decoder rpc_construct_split_decoders[] = { ++ {"base_bdev", offsetof(struct rpc_construct_split, base_bdev), spdk_json_decode_string}, ++ {"split_count", offsetof(struct rpc_construct_split, split_count), spdk_json_decode_uint32}, ++ {"split_size_mb", offsetof(struct rpc_construct_split, split_size_mb), spdk_json_decode_uint64, true}, ++}; ++ ++static void ++dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++} ++ ++static void ++rpc_bdev_split_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_construct_split req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_bdev_desc *base_desc; ++ struct spdk_bdev *base_bdev; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_split_decoders, ++ SPDK_COUNTOF(rpc_construct_split_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } ++ ++ rc = create_vbdev_split(req.base_bdev, req.split_count, req.split_size_mb); ++ if (rc < 0) { ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Failed to create %"PRIu32" split bdevs from '%s': %s", ++ req.split_count, req.base_bdev, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_array_begin(w); ++ ++ rc = spdk_bdev_open_ext(req.base_bdev, false, dummy_bdev_event_cb, NULL, &base_desc); ++ if (rc == 0) { ++ struct spdk_bdev_part_base *split_base; ++ struct bdev_part_tailq *split_base_tailq; ++ struct spdk_bdev_part *split_part; ++ struct spdk_bdev *split_bdev; ++ ++ base_bdev = spdk_bdev_desc_get_bdev(base_desc); ++ ++ split_base = vbdev_split_get_part_base(base_bdev); ++ ++ assert(split_base != NULL); ++ ++ split_base_tailq = spdk_bdev_part_base_get_tailq(split_base); ++ TAILQ_FOREACH(split_part, split_base_tailq, tailq) { ++ split_bdev = spdk_bdev_part_get_bdev(split_part); ++ spdk_json_write_string(w, spdk_bdev_get_name(split_bdev)); ++ } ++ ++ spdk_bdev_close(base_desc); ++ } ++ ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(request, w); ++ ++out: ++ free(req.base_bdev); ++} ++SPDK_RPC_REGISTER("bdev_split_create", rpc_bdev_split_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_split { ++ char *base_bdev; ++}; ++ ++static const struct spdk_json_object_decoder rpc_delete_split_decoders[] = { ++ {"base_bdev", offsetof(struct rpc_delete_split, base_bdev), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_bdev_split_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_split req = {}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_delete_split_decoders, ++ SPDK_COUNTOF(rpc_delete_split_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); ++ goto out; ++ } 
++ ++ rc = vbdev_split_destruct(req.base_bdev); ++ if (rc < 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++out: ++ free(req.base_bdev); ++} ++SPDK_RPC_REGISTER("bdev_split_delete", rpc_bdev_split_delete, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/uring/Makefile b/module/bdev/uring/Makefile +index 6e41143..350275b 100644 +--- a/module/bdev/uring/Makefile ++++ b/module/bdev/uring/Makefile +@@ -1,23 +1,23 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = bdev_uring.c bdev_uring_rpc.c +-LIBNAME = bdev_uring +-LOCAL_SYS_LIBS = -luring +- +-ifneq ($(strip $(CONFIG_URING_PATH)),) +-CFLAGS += -I$(CONFIG_URING_PATH) +-LDFLAGS += -L$(CONFIG_URING_PATH) +-endif +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = bdev_uring.c bdev_uring_rpc.c ++LIBNAME = bdev_uring ++LOCAL_SYS_LIBS = -luring ++ ++ifneq ($(strip $(CONFIG_URING_PATH)),) ++CFLAGS += -I$(CONFIG_URING_PATH) ++LDFLAGS += -L$(CONFIG_URING_PATH) ++endif ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/uring/bdev_uring.c b/module/bdev/uring/bdev_uring.c +index 3512e01..80103d2 100644 +--- a/module/bdev/uring/bdev_uring.c ++++ b/module/bdev/uring/bdev_uring.c +@@ -1,886 +1,886 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "bdev_uring.h" +- +-#include "spdk/stdinc.h" +-#include "spdk/config.h" +-#include "spdk/barrier.h" +-#include "spdk/bdev.h" +-#include "spdk/env.h" +-#include "spdk/fd.h" +-#include "spdk/likely.h" +-#include "spdk/thread.h" +-#include "spdk/json.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +- +-#include "spdk/log.h" +-#include "spdk_internal/uring.h" +- +-#ifdef SPDK_CONFIG_URING_ZNS +-#include +-#define SECTOR_SHIFT 9 +-#endif +- +-struct bdev_uring_zoned_dev { +- uint64_t num_zones; +- uint32_t zone_shift; +- uint32_t lba_shift; +-}; +- +-struct bdev_uring_io_channel { +- struct bdev_uring_group_channel *group_ch; +-}; +- +-struct bdev_uring_group_channel { +- uint64_t io_inflight; +- uint64_t io_pending; +- struct spdk_poller *poller; +- struct io_uring uring; +-}; +- +-struct bdev_uring_task { +- uint64_t len; +- struct bdev_uring_io_channel *ch; +- TAILQ_ENTRY(bdev_uring_task) link; +-}; +- +-struct bdev_uring { +- struct spdk_bdev bdev; +- struct bdev_uring_zoned_dev zd; +- char *filename; +- int fd; +- TAILQ_ENTRY(bdev_uring) link; +-}; +- +-static int bdev_uring_init(void); +-static void bdev_uring_fini(void); +-static void uring_free_bdev(struct bdev_uring *uring); +-static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head); +- +-#define SPDK_URING_QUEUE_DEPTH 512 +-#define MAX_EVENTS_PER_POLL 32 +- +-static int +-bdev_uring_get_ctx_size(void) +-{ +- return sizeof(struct bdev_uring_task); +-} +- +-static struct spdk_bdev_module uring_if = { +- .name = "uring", +- .module_init = bdev_uring_init, +- .module_fini = bdev_uring_fini, +- .get_ctx_size = bdev_uring_get_ctx_size, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) +- +-static int +-bdev_uring_open(struct bdev_uring *bdev) +-{ +- int fd; +- +- fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); +- if (fd < 0) { +- /* Try without O_DIRECT for non-disk files */ +- fd = open(bdev->filename, O_RDWR | O_NOATIME); +- if (fd < 0) { +- SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", +- bdev->filename, errno, spdk_strerror(errno)); +- bdev->fd = -1; +- return -1; +- } +- } +- +- bdev->fd = fd; +- +- return 0; +-} +- +-static int +-bdev_uring_close(struct bdev_uring *bdev) +-{ +- int rc; +- +- if (bdev->fd == -1) { +- return 0; +- } +- +- rc = close(bdev->fd); +- if (rc < 0) { +- SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", +- bdev->fd, errno, spdk_strerror(errno)); +- return -1; +- } +- +- bdev->fd = -1; +- +- return 0; +-} +- +-static int64_t +-bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, +- struct bdev_uring_task *uring_task, +- struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +-{ +- struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); +- struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; +- struct io_uring_sqe *sqe; +- +- sqe = io_uring_get_sqe(&group_ch->uring); +- if (!sqe) { +- SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n"); +- return -ENOMEM; +- } +- +- io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); +- io_uring_sqe_set_data(sqe, uring_task); +- uring_task->len = nbytes; +- uring_task->ch = uring_ch; +- +- SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n", +- iovcnt, nbytes, offset); +- +- group_ch->io_pending++; +- return nbytes; +-} +- +-static int64_t +-bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, +- struct bdev_uring_task *uring_task, +- struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) 
+-{ +- struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); +- struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; +- struct io_uring_sqe *sqe; +- +- sqe = io_uring_get_sqe(&group_ch->uring); +- if (!sqe) { +- SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n"); +- return -ENOMEM; +- } +- +- io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); +- io_uring_sqe_set_data(sqe, uring_task); +- uring_task->len = nbytes; +- uring_task->ch = uring_ch; +- +- SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n", +- iovcnt, nbytes, offset); +- +- group_ch->io_pending++; +- return nbytes; +-} +- +-static int +-bdev_uring_destruct(void *ctx) +-{ +- struct bdev_uring *uring = ctx; +- int rc = 0; +- +- TAILQ_REMOVE(&g_uring_bdev_head, uring, link); +- rc = bdev_uring_close(uring); +- if (rc < 0) { +- SPDK_ERRLOG("bdev_uring_close() failed\n"); +- } +- spdk_io_device_unregister(uring, NULL); +- uring_free_bdev(uring); +- return rc; +-} +- +-static int +-bdev_uring_reap(struct io_uring *ring, int max) +-{ +- int i, count, ret; +- struct io_uring_cqe *cqe; +- struct bdev_uring_task *uring_task; +- enum spdk_bdev_io_status status; +- +- count = 0; +- for (i = 0; i < max; i++) { +- ret = io_uring_peek_cqe(ring, &cqe); +- if (ret != 0) { +- return ret; +- } +- +- if (cqe == NULL) { +- return count; +- } +- +- uring_task = (struct bdev_uring_task *)cqe->user_data; +- if (cqe->res != (signed)uring_task->len) { +- status = SPDK_BDEV_IO_STATUS_FAILED; +- } else { +- status = SPDK_BDEV_IO_STATUS_SUCCESS; +- } +- +- uring_task->ch->group_ch->io_inflight--; +- io_uring_cqe_seen(ring, cqe); +- spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); +- count++; +- } +- +- return count; +-} +- +-static int +-bdev_uring_group_poll(void *arg) +-{ +- struct bdev_uring_group_channel *group_ch = arg; +- int to_complete, to_submit; +- int count, ret; +- +- to_submit = group_ch->io_pending; +- +- if (to_submit > 0) { +- /* If there are I/O to submit, use io_uring_submit here. +- * It will automatically call spdk_io_uring_enter appropriately. 
*/ +- ret = io_uring_submit(&group_ch->uring); +- if (ret < 0) { +- return SPDK_POLLER_BUSY; +- } +- +- group_ch->io_pending = 0; +- group_ch->io_inflight += to_submit; +- } +- +- to_complete = group_ch->io_inflight; +- count = 0; +- if (to_complete > 0) { +- count = bdev_uring_reap(&group_ch->uring, to_complete); +- } +- +- if (count + to_submit > 0) { +- return SPDK_POLLER_BUSY; +- } else { +- return SPDK_POLLER_IDLE; +- } +-} +- +-static void +-bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- int64_t ret = 0; +- +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, +- ch, +- (struct bdev_uring_task *)bdev_io->driver_ctx, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, +- bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, +- ch, +- (struct bdev_uring_task *)bdev_io->driver_ctx, +- bdev_io->u.bdev.iovs, +- bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, +- bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); +- break; +- default: +- SPDK_ERRLOG("Wrong io type\n"); +- break; +- } +- +- if (ret == -ENOMEM) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- } +-} +- +-#ifdef SPDK_CONFIG_URING_ZNS +-static int +-bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len) +-{ +- char *path = NULL; +- char *device = NULL; +- FILE *file; +- int ret = 0; +- +- device = basename(devname); +- path = spdk_sprintf_alloc("/sys/block/%s/%s", device, attr); +- if (!path) { +- return -EINVAL; +- } +- +- file = fopen(path, "r"); +- if (!file) { +- free(path); +- return -ENOENT; +- } +- +- if (!fgets(str, str_len, file)) { +- ret = -EINVAL; +- goto close; +- } +- +- spdk_str_chomp(str); +- +-close: +- free(path); +- fclose(file); +- return ret; +-} +- +-static int +-bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val) +-{ +- char str[128]; +- int ret; +- +- ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str)); +- if (ret) { +- return ret; +- } +- +- *val = spdk_strtol(str, 10); +- +- return 0; +-} +- +-static int +-bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep) +-{ +- switch (zones_rep->type) { +- case BLK_ZONE_TYPE_CONVENTIONAL: +- zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV; +- break; +- case BLK_ZONE_TYPE_SEQWRITE_REQ: +- zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; +- break; +- case BLK_ZONE_TYPE_SEQWRITE_PREF: +- zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP; +- break; +- default: +- SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type); +- return -EIO; +- } +- return 0; +-} +- +-static int +-bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep) +-{ +- switch (zones_rep->cond) { +- case BLK_ZONE_COND_EMPTY: +- zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY; +- break; +- case BLK_ZONE_COND_IMP_OPEN: +- zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; +- break; +- case BLK_ZONE_COND_EXP_OPEN: +- zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; +- break; +- case BLK_ZONE_COND_CLOSED: +- zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED; +- break; +- case BLK_ZONE_COND_READONLY: +- 
zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; +- break; +- case BLK_ZONE_COND_FULL: +- zone_info->state = SPDK_BDEV_ZONE_STATE_FULL; +- break; +- case BLK_ZONE_COND_OFFLINE: +- zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; +- break; +- case BLK_ZONE_COND_NOT_WP: +- zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP; +- break; +- default: +- SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond); +- return -EIO; +- } +- return 0; +-} +- +-static int +-bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_uring *uring; +- struct blk_zone_range range; +- long unsigned zone_mgmt_op; +- uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; +- +- uring = (struct bdev_uring *)bdev_io->bdev->ctxt; +- +- switch (bdev_io->u.zone_mgmt.zone_action) { +- case SPDK_BDEV_ZONE_RESET: +- zone_mgmt_op = BLKRESETZONE; +- break; +- case SPDK_BDEV_ZONE_OPEN: +- zone_mgmt_op = BLKOPENZONE; +- break; +- case SPDK_BDEV_ZONE_CLOSE: +- zone_mgmt_op = BLKCLOSEZONE; +- break; +- case SPDK_BDEV_ZONE_FINISH: +- zone_mgmt_op = BLKFINISHZONE; +- break; +- default: +- return -EINVAL; +- } +- +- range.sector = (zone_id << uring->zd.lba_shift); +- range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift); +- +- if (ioctl(uring->fd, zone_mgmt_op, &range)) { +- SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n", +- bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno)); +- return -EINVAL; +- } +- +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- +- return 0; +-} +- +-static int +-bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_uring *uring; +- struct blk_zone *zones; +- struct blk_zone_report *rep; +- struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf; +- size_t repsize; +- uint32_t i, shift; +- uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones; +- uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; +- +- uring = (struct bdev_uring *)bdev_io->bdev->ctxt; +- shift = uring->zd.lba_shift; +- +- if ((num_zones > uring->zd.num_zones) || !num_zones) { +- return -EINVAL; +- } +- +- repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones); +- rep = (struct blk_zone_report *)malloc(repsize); +- if (!rep) { +- return -ENOMEM; +- } +- +- zones = (struct blk_zone *)(rep + 1); +- +- while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) { +- memset(rep, 0, repsize); +- rep->sector = zone_id; +- rep->nr_zones = num_zones; +- +- if (ioctl(uring->fd, BLKREPORTZONE, rep)) { +- SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n", +- errno, strerror(errno)); +- free(rep); +- return -EINVAL; +- } +- +- if (!rep->nr_zones) { +- break; +- } +- +- for (i = 0; i < rep->nr_zones; i++) { +- zone_info->zone_id = ((zones + i)->start >> shift); +- zone_info->write_pointer = ((zones + i)->wp >> shift); +- zone_info->capacity = ((zones + i)->capacity >> shift); +- +- bdev_uring_fill_zone_state(zone_info, zones + i); +- bdev_uring_fill_zone_type(zone_info, zones + i); +- +- zone_id = ((zones + i)->start + (zones + i)->len) >> shift; +- zone_info++; +- num_zones--; +- } +- } +- +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- free(rep); +- return 0; +-} +- +-static int +-bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename) +-{ +- char str[128]; +- long int val = 0; +- uint32_t zinfo; +- int retval = -1; +- +- uring->bdev.zoned = false; +- +- /* Check if this is a zoned block device */ +- if (bdev_uring_read_sysfs_attr(filename, 
"queue/zoned", str, sizeof(str))) { +- SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno); +- } else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) { +- /* Only host-aware & host-managed zns devices */ +- uring->bdev.zoned = true; +- +- if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) { +- SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno)); +- goto err_ret; +- } +- uring->zd.num_zones = zinfo; +- +- if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) { +- SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno)); +- goto err_ret; +- } +- +- uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT; +- uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift); +- uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift); +- +- if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) { +- SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno)); +- goto err_ret; +- } +- uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val; +- +- if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) { +- SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno)); +- goto err_ret; +- } +- uring->bdev.max_active_zones = (uint32_t)val; +- retval = 0; +- } else { +- retval = 0; /* queue/zoned=none */ +- } +- +-err_ret: +- return retval; +-} +-#else +-/* No support for zoned devices */ +-static int +-bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io) +-{ +- return -1; +-} +- +-static int +-bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io) +-{ +- return -1; +-} +- +-static int +-bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename) +-{ +- return 0; +-} +-#endif +- +-static int +-_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: +- return bdev_uring_zone_get_info(bdev_io); +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +- return bdev_uring_zone_management_op(bdev_io); +- /* Read and write operations must be performed on buffers aligned to +- * bdev->required_alignment. If user specified unaligned buffers, +- * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. 
*/ +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- return 0; +- default: +- return -1; +- } +-} +- +-static void +-bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- if (_bdev_uring_submit_request(ch, bdev_io) < 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static bool +-bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +-#ifdef SPDK_CONFIG_URING_ZNS +- case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +-#endif +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return true; +- default: +- return false; +- } +-} +- +-static int +-bdev_uring_create_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_uring_io_channel *ch = ctx_buf; +- +- ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); +- +- return 0; +-} +- +-static void +-bdev_uring_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_uring_io_channel *ch = ctx_buf; +- +- spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); +-} +- +-static struct spdk_io_channel * +-bdev_uring_get_io_channel(void *ctx) +-{ +- struct bdev_uring *uring = ctx; +- +- return spdk_get_io_channel(uring); +-} +- +-static int +-bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct bdev_uring *uring = ctx; +- +- spdk_json_write_named_object_begin(w, "uring"); +- +- spdk_json_write_named_string(w, "filename", uring->filename); +- +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-static void +-bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct bdev_uring *uring = bdev->ctxt; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_uring_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", bdev->name); +- spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); +- spdk_json_write_named_string(w, "filename", uring->filename); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static const struct spdk_bdev_fn_table uring_fn_table = { +- .destruct = bdev_uring_destruct, +- .submit_request = bdev_uring_submit_request, +- .io_type_supported = bdev_uring_io_type_supported, +- .get_io_channel = bdev_uring_get_io_channel, +- .dump_info_json = bdev_uring_dump_info_json, +- .write_config_json = bdev_uring_write_json_config, +-}; +- +-static void +-uring_free_bdev(struct bdev_uring *uring) +-{ +- if (uring == NULL) { +- return; +- } +- free(uring->filename); +- free(uring->bdev.name); +- free(uring); +-} +- +-static int +-bdev_uring_group_create_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_uring_group_channel *ch = ctx_buf; +- +- /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only +- * local devices but also devices attached from remote target */ +- if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { +- SPDK_ERRLOG("uring I/O context setup failure\n"); +- return -1; +- } +- +- ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); +- return 0; +-} +- +-static void +-bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_uring_group_channel *ch = ctx_buf; +- +- io_uring_queue_exit(&ch->uring); +- +- spdk_poller_unregister(&ch->poller); +-} +- 
+-struct spdk_bdev * +-create_uring_bdev(const char *name, const char *filename, uint32_t block_size) +-{ +- struct bdev_uring *uring; +- uint32_t detected_block_size; +- uint64_t bdev_size; +- int rc; +- +- uring = calloc(1, sizeof(*uring)); +- if (!uring) { +- SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); +- return NULL; +- } +- +- uring->filename = strdup(filename); +- if (!uring->filename) { +- goto error_return; +- } +- +- if (bdev_uring_open(uring)) { +- SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); +- goto error_return; +- } +- +- bdev_size = spdk_fd_get_size(uring->fd); +- +- uring->bdev.name = strdup(name); +- if (!uring->bdev.name) { +- goto error_return; +- } +- uring->bdev.product_name = "URING bdev"; +- uring->bdev.module = &uring_if; +- +- uring->bdev.write_cache = 0; +- +- detected_block_size = spdk_fd_get_blocklen(uring->fd); +- if (block_size == 0) { +- /* User did not specify block size - use autodetected block size. */ +- if (detected_block_size == 0) { +- SPDK_ERRLOG("Block size could not be auto-detected\n"); +- goto error_return; +- } +- block_size = detected_block_size; +- } else { +- if (block_size < detected_block_size) { +- SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " +- "auto-detected block size %" PRIu32 "\n", +- block_size, detected_block_size); +- goto error_return; +- } else if (detected_block_size != 0 && block_size != detected_block_size) { +- SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " +- "auto-detected block size %" PRIu32 "\n", +- block_size, detected_block_size); +- } +- } +- +- if (block_size < 512) { +- SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); +- goto error_return; +- } +- +- if (!spdk_u32_is_pow2(block_size)) { +- SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); +- goto error_return; +- } +- +- uring->bdev.blocklen = block_size; +- uring->bdev.required_alignment = spdk_u32log2(block_size); +- +- rc = bdev_uring_check_zoned_support(uring, name, filename); +- if (rc) { +- goto error_return; +- } +- +- if (bdev_size % uring->bdev.blocklen != 0) { +- SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", +- bdev_size, uring->bdev.blocklen); +- goto error_return; +- } +- +- uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; +- uring->bdev.ctxt = uring; +- +- uring->bdev.fn_table = &uring_fn_table; +- +- spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, +- sizeof(struct bdev_uring_io_channel), +- uring->bdev.name); +- rc = spdk_bdev_register(&uring->bdev); +- if (rc) { +- spdk_io_device_unregister(uring, NULL); +- goto error_return; +- } +- +- TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); +- return &uring->bdev; +- +-error_return: +- bdev_uring_close(uring); +- uring_free_bdev(uring); +- return NULL; +-} +- +-struct delete_uring_bdev_ctx { +- spdk_delete_uring_complete cb_fn; +- void *cb_arg; +-}; +- +-static void +-uring_bdev_unregister_cb(void *arg, int bdeverrno) +-{ +- struct delete_uring_bdev_ctx *ctx = arg; +- +- ctx->cb_fn(ctx->cb_arg, bdeverrno); +- free(ctx); +-} +- +-void +-delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg) +-{ +- struct delete_uring_bdev_ctx *ctx; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- cb_fn(cb_arg, -ENOMEM); +- return; +- } +- +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- rc = spdk_bdev_unregister_by_name(name, 
&uring_if, uring_bdev_unregister_cb, ctx); +- if (rc != 0) { +- uring_bdev_unregister_cb(ctx, rc); +- } +-} +- +-static int +-bdev_uring_init(void) +-{ +- spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, +- sizeof(struct bdev_uring_group_channel), "uring_module"); +- +- return 0; +-} +- +-static void +-bdev_uring_fini(void) +-{ +- spdk_io_device_unregister(&uring_if, NULL); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(uring) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "bdev_uring.h" ++ ++#include "spdk/stdinc.h" ++#include "spdk/config.h" ++#include "spdk/barrier.h" ++#include "spdk/bdev.h" ++#include "spdk/env.h" ++#include "spdk/fd.h" ++#include "spdk/likely.h" ++#include "spdk/thread.h" ++#include "spdk/json.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++ ++#include "spdk/log.h" ++#include "spdk_internal/uring.h" ++ ++#ifdef SPDK_CONFIG_URING_ZNS ++#include ++#define SECTOR_SHIFT 9 ++#endif ++ ++struct bdev_uring_zoned_dev { ++ uint64_t num_zones; ++ uint32_t zone_shift; ++ uint32_t lba_shift; ++}; ++ ++struct bdev_uring_io_channel { ++ struct bdev_uring_group_channel *group_ch; ++}; ++ ++struct bdev_uring_group_channel { ++ uint64_t io_inflight; ++ uint64_t io_pending; ++ struct spdk_poller *poller; ++ struct io_uring uring; ++}; ++ ++struct bdev_uring_task { ++ uint64_t len; ++ struct bdev_uring_io_channel *ch; ++ TAILQ_ENTRY(bdev_uring_task) link; ++}; ++ ++struct bdev_uring { ++ struct spdk_bdev bdev; ++ struct bdev_uring_zoned_dev zd; ++ char *filename; ++ int fd; ++ TAILQ_ENTRY(bdev_uring) link; ++}; ++ ++static int bdev_uring_init(void); ++static void bdev_uring_fini(void); ++static void uring_free_bdev(struct bdev_uring *uring); ++static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head); ++ ++#define SPDK_URING_QUEUE_DEPTH 512 ++#define MAX_EVENTS_PER_POLL 32 ++ ++static int ++bdev_uring_get_ctx_size(void) ++{ ++ return sizeof(struct bdev_uring_task); ++} ++ ++static struct spdk_bdev_module uring_if = { ++ .name = "uring", ++ .module_init = bdev_uring_init, ++ .module_fini = bdev_uring_fini, ++ .get_ctx_size = bdev_uring_get_ctx_size, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(uring, &uring_if) ++ ++static int ++bdev_uring_open(struct bdev_uring *bdev) ++{ ++ int fd; ++ ++ fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME); ++ if (fd < 0) { ++ /* Try without O_DIRECT for non-disk files */ ++ fd = open(bdev->filename, O_RDWR | O_NOATIME); ++ if (fd < 0) { ++ SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", ++ bdev->filename, errno, spdk_strerror(errno)); ++ bdev->fd = -1; ++ return -1; ++ } ++ } ++ ++ bdev->fd = fd; ++ ++ return 0; ++} ++ ++static int ++bdev_uring_close(struct bdev_uring *bdev) ++{ ++ int rc; ++ ++ if (bdev->fd == -1) { ++ return 0; ++ } ++ ++ rc = close(bdev->fd); ++ if (rc < 0) { ++ SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", ++ bdev->fd, errno, spdk_strerror(errno)); ++ return -1; ++ } ++ ++ bdev->fd = -1; ++ ++ return 0; ++} ++ ++static int64_t ++bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch, ++ struct bdev_uring_task *uring_task, ++ struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) ++{ ++ struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); ++ struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; ++ struct io_uring_sqe *sqe; ++ ++ sqe = io_uring_get_sqe(&group_ch->uring); ++ if (!sqe) { ++ SPDK_DEBUGLOG(uring, 
"get sqe failed as out of resource\n"); ++ return -ENOMEM; ++ } ++ ++ io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset); ++ io_uring_sqe_set_data(sqe, uring_task); ++ uring_task->len = nbytes; ++ uring_task->ch = uring_ch; ++ ++ SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n", ++ iovcnt, nbytes, offset); ++ ++ group_ch->io_pending++; ++ return nbytes; ++} ++ ++static int64_t ++bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch, ++ struct bdev_uring_task *uring_task, ++ struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset) ++{ ++ struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch); ++ struct bdev_uring_group_channel *group_ch = uring_ch->group_ch; ++ struct io_uring_sqe *sqe; ++ ++ sqe = io_uring_get_sqe(&group_ch->uring); ++ if (!sqe) { ++ SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n"); ++ return -ENOMEM; ++ } ++ ++ io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset); ++ io_uring_sqe_set_data(sqe, uring_task); ++ uring_task->len = nbytes; ++ uring_task->ch = uring_ch; ++ ++ SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n", ++ iovcnt, nbytes, offset); ++ ++ group_ch->io_pending++; ++ return nbytes; ++} ++ ++static int ++bdev_uring_destruct(void *ctx) ++{ ++ struct bdev_uring *uring = ctx; ++ int rc = 0; ++ ++ TAILQ_REMOVE(&g_uring_bdev_head, uring, link); ++ rc = bdev_uring_close(uring); ++ if (rc < 0) { ++ SPDK_ERRLOG("bdev_uring_close() failed\n"); ++ } ++ spdk_io_device_unregister(uring, NULL); ++ uring_free_bdev(uring); ++ return rc; ++} ++ ++static int ++bdev_uring_reap(struct io_uring *ring, int max) ++{ ++ int i, count, ret; ++ struct io_uring_cqe *cqe; ++ struct bdev_uring_task *uring_task; ++ enum spdk_bdev_io_status status; ++ ++ count = 0; ++ for (i = 0; i < max; i++) { ++ ret = io_uring_peek_cqe(ring, &cqe); ++ if (ret != 0) { ++ return ret; ++ } ++ ++ if (cqe == NULL) { ++ return count; ++ } ++ ++ uring_task = (struct bdev_uring_task *)cqe->user_data; ++ if (cqe->res != (signed)uring_task->len) { ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ } else { ++ status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ } ++ ++ uring_task->ch->group_ch->io_inflight--; ++ io_uring_cqe_seen(ring, cqe); ++ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status); ++ count++; ++ } ++ ++ return count; ++} ++ ++static int ++bdev_uring_group_poll(void *arg) ++{ ++ struct bdev_uring_group_channel *group_ch = arg; ++ int to_complete, to_submit; ++ int count, ret; ++ ++ to_submit = group_ch->io_pending; ++ ++ if (to_submit > 0) { ++ /* If there are I/O to submit, use io_uring_submit here. ++ * It will automatically call spdk_io_uring_enter appropriately. 
*/ ++ ret = io_uring_submit(&group_ch->uring); ++ if (ret < 0) { ++ return SPDK_POLLER_BUSY; ++ } ++ ++ group_ch->io_pending = 0; ++ group_ch->io_inflight += to_submit; ++ } ++ ++ to_complete = group_ch->io_inflight; ++ count = 0; ++ if (to_complete > 0) { ++ count = bdev_uring_reap(&group_ch->uring, to_complete); ++ } ++ ++ if (count + to_submit > 0) { ++ return SPDK_POLLER_BUSY; ++ } else { ++ return SPDK_POLLER_IDLE; ++ } ++} ++ ++static void ++bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ int64_t ret = 0; ++ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt, ++ ch, ++ (struct bdev_uring_task *)bdev_io->driver_ctx, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, ++ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt, ++ ch, ++ (struct bdev_uring_task *)bdev_io->driver_ctx, ++ bdev_io->u.bdev.iovs, ++ bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, ++ bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); ++ break; ++ default: ++ SPDK_ERRLOG("Wrong io type\n"); ++ break; ++ } ++ ++ if (ret == -ENOMEM) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ } ++} ++ ++#ifdef SPDK_CONFIG_URING_ZNS ++static int ++bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len) ++{ ++ char *path = NULL; ++ char *device = NULL; ++ FILE *file; ++ int ret = 0; ++ ++ device = basename(devname); ++ path = spdk_sprintf_alloc("/sys/block/%s/%s", device, attr); ++ if (!path) { ++ return -EINVAL; ++ } ++ ++ file = fopen(path, "r"); ++ if (!file) { ++ free(path); ++ return -ENOENT; ++ } ++ ++ if (!fgets(str, str_len, file)) { ++ ret = -EINVAL; ++ goto close; ++ } ++ ++ spdk_str_chomp(str); ++ ++close: ++ free(path); ++ fclose(file); ++ return ret; ++} ++ ++static int ++bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val) ++{ ++ char str[128]; ++ int ret; ++ ++ ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str)); ++ if (ret) { ++ return ret; ++ } ++ ++ *val = spdk_strtol(str, 10); ++ ++ return 0; ++} ++ ++static int ++bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep) ++{ ++ switch (zones_rep->type) { ++ case BLK_ZONE_TYPE_CONVENTIONAL: ++ zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV; ++ break; ++ case BLK_ZONE_TYPE_SEQWRITE_REQ: ++ zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; ++ break; ++ case BLK_ZONE_TYPE_SEQWRITE_PREF: ++ zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP; ++ break; ++ default: ++ SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int ++bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep) ++{ ++ switch (zones_rep->cond) { ++ case BLK_ZONE_COND_EMPTY: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY; ++ break; ++ case BLK_ZONE_COND_IMP_OPEN: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; ++ break; ++ case BLK_ZONE_COND_EXP_OPEN: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; ++ break; ++ case BLK_ZONE_COND_CLOSED: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED; ++ break; ++ case BLK_ZONE_COND_READONLY: ++ 
zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; ++ break; ++ case BLK_ZONE_COND_FULL: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_FULL; ++ break; ++ case BLK_ZONE_COND_OFFLINE: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; ++ break; ++ case BLK_ZONE_COND_NOT_WP: ++ zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP; ++ break; ++ default: ++ SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond); ++ return -EIO; ++ } ++ return 0; ++} ++ ++static int ++bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_uring *uring; ++ struct blk_zone_range range; ++ long unsigned zone_mgmt_op; ++ uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; ++ ++ uring = (struct bdev_uring *)bdev_io->bdev->ctxt; ++ ++ switch (bdev_io->u.zone_mgmt.zone_action) { ++ case SPDK_BDEV_ZONE_RESET: ++ zone_mgmt_op = BLKRESETZONE; ++ break; ++ case SPDK_BDEV_ZONE_OPEN: ++ zone_mgmt_op = BLKOPENZONE; ++ break; ++ case SPDK_BDEV_ZONE_CLOSE: ++ zone_mgmt_op = BLKCLOSEZONE; ++ break; ++ case SPDK_BDEV_ZONE_FINISH: ++ zone_mgmt_op = BLKFINISHZONE; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ range.sector = (zone_id << uring->zd.lba_shift); ++ range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift); ++ ++ if (ioctl(uring->fd, zone_mgmt_op, &range)) { ++ SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n", ++ bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno)); ++ return -EINVAL; ++ } ++ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ ++ return 0; ++} ++ ++static int ++bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_uring *uring; ++ struct blk_zone *zones; ++ struct blk_zone_report *rep; ++ struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf; ++ size_t repsize; ++ uint32_t i, shift; ++ uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones; ++ uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; ++ ++ uring = (struct bdev_uring *)bdev_io->bdev->ctxt; ++ shift = uring->zd.lba_shift; ++ ++ if ((num_zones > uring->zd.num_zones) || !num_zones) { ++ return -EINVAL; ++ } ++ ++ repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones); ++ rep = (struct blk_zone_report *)malloc(repsize); ++ if (!rep) { ++ return -ENOMEM; ++ } ++ ++ zones = (struct blk_zone *)(rep + 1); ++ ++ while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) { ++ memset(rep, 0, repsize); ++ rep->sector = zone_id; ++ rep->nr_zones = num_zones; ++ ++ if (ioctl(uring->fd, BLKREPORTZONE, rep)) { ++ SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n", ++ errno, strerror(errno)); ++ free(rep); ++ return -EINVAL; ++ } ++ ++ if (!rep->nr_zones) { ++ break; ++ } ++ ++ for (i = 0; i < rep->nr_zones; i++) { ++ zone_info->zone_id = ((zones + i)->start >> shift); ++ zone_info->write_pointer = ((zones + i)->wp >> shift); ++ zone_info->capacity = ((zones + i)->capacity >> shift); ++ ++ bdev_uring_fill_zone_state(zone_info, zones + i); ++ bdev_uring_fill_zone_type(zone_info, zones + i); ++ ++ zone_id = ((zones + i)->start + (zones + i)->len) >> shift; ++ zone_info++; ++ num_zones--; ++ } ++ } ++ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ free(rep); ++ return 0; ++} ++ ++static int ++bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename) ++{ ++ char str[128]; ++ long int val = 0; ++ uint32_t zinfo; ++ int retval = -1; ++ ++ uring->bdev.zoned = false; ++ ++ /* Check if this is a zoned block device */ ++ if (bdev_uring_read_sysfs_attr(filename, 
"queue/zoned", str, sizeof(str))) { ++ SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno); ++ } else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) { ++ /* Only host-aware & host-managed zns devices */ ++ uring->bdev.zoned = true; ++ ++ if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) { ++ SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno)); ++ goto err_ret; ++ } ++ uring->zd.num_zones = zinfo; ++ ++ if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) { ++ SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno)); ++ goto err_ret; ++ } ++ ++ uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT; ++ uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift); ++ uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift); ++ ++ if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) { ++ SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno)); ++ goto err_ret; ++ } ++ uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val; ++ ++ if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) { ++ SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno)); ++ goto err_ret; ++ } ++ uring->bdev.max_active_zones = (uint32_t)val; ++ retval = 0; ++ } else { ++ retval = 0; /* queue/zoned=none */ ++ } ++ ++err_ret: ++ return retval; ++} ++#else ++/* No support for zoned devices */ ++static int ++bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io) ++{ ++ return -1; ++} ++ ++static int ++bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io) ++{ ++ return -1; ++} ++ ++static int ++bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename) ++{ ++ return 0; ++} ++#endif ++ ++static int ++_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: ++ return bdev_uring_zone_get_info(bdev_io); ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++ return bdev_uring_zone_management_op(bdev_io); ++ /* Read and write operations must be performed on buffers aligned to ++ * bdev->required_alignment. If user specified unaligned buffers, ++ * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. 
*/ ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ return 0; ++ default: ++ return -1; ++ } ++} ++ ++static void ++bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ if (_bdev_uring_submit_request(ch, bdev_io) < 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static bool ++bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++#ifdef SPDK_CONFIG_URING_ZNS ++ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++#endif ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static int ++bdev_uring_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_uring_io_channel *ch = ctx_buf; ++ ++ ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if)); ++ ++ return 0; ++} ++ ++static void ++bdev_uring_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_uring_io_channel *ch = ctx_buf; ++ ++ spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch)); ++} ++ ++static struct spdk_io_channel * ++bdev_uring_get_io_channel(void *ctx) ++{ ++ struct bdev_uring *uring = ctx; ++ ++ return spdk_get_io_channel(uring); ++} ++ ++static int ++bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_uring *uring = ctx; ++ ++ spdk_json_write_named_object_begin(w, "uring"); ++ ++ spdk_json_write_named_string(w, "filename", uring->filename); ++ ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++static void ++bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_uring *uring = bdev->ctxt; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_uring_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bdev->name); ++ spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); ++ spdk_json_write_named_string(w, "filename", uring->filename); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static const struct spdk_bdev_fn_table uring_fn_table = { ++ .destruct = bdev_uring_destruct, ++ .submit_request = bdev_uring_submit_request, ++ .io_type_supported = bdev_uring_io_type_supported, ++ .get_io_channel = bdev_uring_get_io_channel, ++ .dump_info_json = bdev_uring_dump_info_json, ++ .write_config_json = bdev_uring_write_json_config, ++}; ++ ++static void ++uring_free_bdev(struct bdev_uring *uring) ++{ ++ if (uring == NULL) { ++ return; ++ } ++ free(uring->filename); ++ free(uring->bdev.name); ++ free(uring); ++} ++ ++static int ++bdev_uring_group_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_uring_group_channel *ch = ctx_buf; ++ ++ /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only ++ * local devices but also devices attached from remote target */ ++ if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) { ++ SPDK_ERRLOG("uring I/O context setup failure\n"); ++ return -1; ++ } ++ ++ ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0); ++ return 0; ++} ++ ++static void ++bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_uring_group_channel *ch = ctx_buf; ++ ++ io_uring_queue_exit(&ch->uring); ++ ++ spdk_poller_unregister(&ch->poller); ++} ++ 
++struct spdk_bdev * ++create_uring_bdev(const char *name, const char *filename, uint32_t block_size) ++{ ++ struct bdev_uring *uring; ++ uint32_t detected_block_size; ++ uint64_t bdev_size; ++ int rc; ++ ++ uring = calloc(1, sizeof(*uring)); ++ if (!uring) { ++ SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n"); ++ return NULL; ++ } ++ ++ uring->filename = strdup(filename); ++ if (!uring->filename) { ++ goto error_return; ++ } ++ ++ if (bdev_uring_open(uring)) { ++ SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno); ++ goto error_return; ++ } ++ ++ bdev_size = spdk_fd_get_size(uring->fd); ++ ++ uring->bdev.name = strdup(name); ++ if (!uring->bdev.name) { ++ goto error_return; ++ } ++ uring->bdev.product_name = "URING bdev"; ++ uring->bdev.module = &uring_if; ++ ++ uring->bdev.write_cache = 0; ++ ++ detected_block_size = spdk_fd_get_blocklen(uring->fd); ++ if (block_size == 0) { ++ /* User did not specify block size - use autodetected block size. */ ++ if (detected_block_size == 0) { ++ SPDK_ERRLOG("Block size could not be auto-detected\n"); ++ goto error_return; ++ } ++ block_size = detected_block_size; ++ } else { ++ if (block_size < detected_block_size) { ++ SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " ++ "auto-detected block size %" PRIu32 "\n", ++ block_size, detected_block_size); ++ goto error_return; ++ } else if (detected_block_size != 0 && block_size != detected_block_size) { ++ SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " ++ "auto-detected block size %" PRIu32 "\n", ++ block_size, detected_block_size); ++ } ++ } ++ ++ if (block_size < 512) { ++ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); ++ goto error_return; ++ } ++ ++ if (!spdk_u32_is_pow2(block_size)) { ++ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); ++ goto error_return; ++ } ++ ++ uring->bdev.blocklen = block_size; ++ uring->bdev.required_alignment = spdk_u32log2(block_size); ++ ++ rc = bdev_uring_check_zoned_support(uring, name, filename); ++ if (rc) { ++ goto error_return; ++ } ++ ++ if (bdev_size % uring->bdev.blocklen != 0) { ++ SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", ++ bdev_size, uring->bdev.blocklen); ++ goto error_return; ++ } ++ ++ uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen; ++ uring->bdev.ctxt = uring; ++ ++ uring->bdev.fn_table = &uring_fn_table; ++ ++ spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb, ++ sizeof(struct bdev_uring_io_channel), ++ uring->bdev.name); ++ rc = spdk_bdev_register(&uring->bdev); ++ if (rc) { ++ spdk_io_device_unregister(uring, NULL); ++ goto error_return; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link); ++ return &uring->bdev; ++ ++error_return: ++ bdev_uring_close(uring); ++ uring_free_bdev(uring); ++ return NULL; ++} ++ ++struct delete_uring_bdev_ctx { ++ spdk_delete_uring_complete cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++uring_bdev_unregister_cb(void *arg, int bdeverrno) ++{ ++ struct delete_uring_bdev_ctx *ctx = arg; ++ ++ ctx->cb_fn(ctx->cb_arg, bdeverrno); ++ free(ctx); ++} ++ ++void ++delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg) ++{ ++ struct delete_uring_bdev_ctx *ctx; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ cb_fn(cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ rc = spdk_bdev_unregister_by_name(name, 
&uring_if, uring_bdev_unregister_cb, ctx); ++ if (rc != 0) { ++ uring_bdev_unregister_cb(ctx, rc); ++ } ++} ++ ++static int ++bdev_uring_init(void) ++{ ++ spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb, ++ sizeof(struct bdev_uring_group_channel), "uring_module"); ++ ++ return 0; ++} ++ ++static void ++bdev_uring_fini(void) ++{ ++ spdk_io_device_unregister(&uring_if, NULL); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(uring) +diff --git a/module/bdev/uring/bdev_uring.h b/module/bdev/uring/bdev_uring.h +index 177cab8..afcff5b 100644 +--- a/module/bdev/uring/bdev_uring.h ++++ b/module/bdev/uring/bdev_uring.h +@@ -1,22 +1,22 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_BDEV_URING_H +-#define SPDK_BDEV_URING_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/queue.h" +-#include "spdk/bdev.h" +- +-#include "spdk/bdev_module.h" +- +-typedef void (*spdk_delete_uring_complete)(void *cb_arg, int bdeverrno); +- +-struct spdk_bdev *create_uring_bdev(const char *name, const char *filename, uint32_t block_size); +- +-void delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg); +- +-#endif /* SPDK_BDEV_URING_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_URING_H ++#define SPDK_BDEV_URING_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/queue.h" ++#include "spdk/bdev.h" ++ ++#include "spdk/bdev_module.h" ++ ++typedef void (*spdk_delete_uring_complete)(void *cb_arg, int bdeverrno); ++ ++struct spdk_bdev *create_uring_bdev(const char *name, const char *filename, uint32_t block_size); ++ ++void delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg); ++ ++#endif /* SPDK_BDEV_URING_H */ +diff --git a/module/bdev/uring/bdev_uring_rpc.c b/module/bdev/uring/bdev_uring_rpc.c +index fefee33..26cac80 100644 +--- a/module/bdev/uring/bdev_uring_rpc.c ++++ b/module/bdev/uring/bdev_uring_rpc.c +@@ -1,117 +1,117 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "bdev_uring.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_create_uring { +- char *name; +- char *filename; +- uint32_t block_size; +-}; +- +-/* Free the allocated memory resource after the RPC handling. */ +-static void +-free_rpc_create_uring(struct rpc_create_uring *r) +-{ +- free(r->name); +- free(r->filename); +-} +- +-/* Structure to decode the input parameters for this RPC method. */ +-static const struct spdk_json_object_decoder rpc_create_uring_decoders[] = { +- {"name", offsetof(struct rpc_create_uring, name), spdk_json_decode_string}, +- {"filename", offsetof(struct rpc_create_uring, filename), spdk_json_decode_string}, +- {"block_size", offsetof(struct rpc_create_uring, block_size), spdk_json_decode_uint32, true}, +-}; +- +-/* Decode the parameters for this RPC method and properly create the uring +- * device. Error status returned in the failed cases. 
+- */ +-static void +-rpc_bdev_uring_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_create_uring req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_bdev *bdev; +- +- if (spdk_json_decode_object(params, rpc_create_uring_decoders, +- SPDK_COUNTOF(rpc_create_uring_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev = create_uring_bdev(req.name, req.filename, req.block_size); +- if (!bdev) { +- SPDK_ERRLOG("Unable to create URING bdev from file %s\n", req.filename); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Unable to create URING bdev."); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, req.name); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_create_uring(&req); +-} +-SPDK_RPC_REGISTER("bdev_uring_create", rpc_bdev_uring_create, SPDK_RPC_RUNTIME) +- +-struct rpc_delete_uring { +- char *name; +-}; +- +-static void +-free_rpc_delete_uring(struct rpc_delete_uring *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_delete_uring_decoders[] = { +- {"name", offsetof(struct rpc_delete_uring, name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_bdev_uring_delete_cb(void *cb_arg, int bdeverrno) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- if (bdeverrno == 0) { +- spdk_jsonrpc_send_bool_response(request, true); +- } else { +- spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); +- } +- +-} +- +-static void +-rpc_bdev_uring_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_uring req = {NULL}; +- +- if (spdk_json_decode_object(params, rpc_delete_uring_decoders, +- SPDK_COUNTOF(rpc_delete_uring_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- delete_uring_bdev(req.name, _rpc_bdev_uring_delete_cb, request); +- +-cleanup: +- free_rpc_delete_uring(&req); +-} +-SPDK_RPC_REGISTER("bdev_uring_delete", rpc_bdev_uring_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "bdev_uring.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_create_uring { ++ char *name; ++ char *filename; ++ uint32_t block_size; ++}; ++ ++/* Free the allocated memory resource after the RPC handling. */ ++static void ++free_rpc_create_uring(struct rpc_create_uring *r) ++{ ++ free(r->name); ++ free(r->filename); ++} ++ ++/* Structure to decode the input parameters for this RPC method. */ ++static const struct spdk_json_object_decoder rpc_create_uring_decoders[] = { ++ {"name", offsetof(struct rpc_create_uring, name), spdk_json_decode_string}, ++ {"filename", offsetof(struct rpc_create_uring, filename), spdk_json_decode_string}, ++ {"block_size", offsetof(struct rpc_create_uring, block_size), spdk_json_decode_uint32, true}, ++}; ++ ++/* Decode the parameters for this RPC method and properly create the uring ++ * device. Error status returned in the failed cases. 
++ */ ++static void ++rpc_bdev_uring_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_create_uring req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_bdev *bdev; ++ ++ if (spdk_json_decode_object(params, rpc_create_uring_decoders, ++ SPDK_COUNTOF(rpc_create_uring_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev = create_uring_bdev(req.name, req.filename, req.block_size); ++ if (!bdev) { ++ SPDK_ERRLOG("Unable to create URING bdev from file %s\n", req.filename); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Unable to create URING bdev."); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, req.name); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_create_uring(&req); ++} ++SPDK_RPC_REGISTER("bdev_uring_create", rpc_bdev_uring_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_uring { ++ char *name; ++}; ++ ++static void ++free_rpc_delete_uring(struct rpc_delete_uring *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_delete_uring_decoders[] = { ++ {"name", offsetof(struct rpc_delete_uring, name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_bdev_uring_delete_cb(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ if (bdeverrno == 0) { ++ spdk_jsonrpc_send_bool_response(request, true); ++ } else { ++ spdk_jsonrpc_send_error_response(request, bdeverrno, spdk_strerror(-bdeverrno)); ++ } ++ ++} ++ ++static void ++rpc_bdev_uring_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_uring req = {NULL}; ++ ++ if (spdk_json_decode_object(params, rpc_delete_uring_decoders, ++ SPDK_COUNTOF(rpc_delete_uring_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ delete_uring_bdev(req.name, _rpc_bdev_uring_delete_cb, request); ++ ++cleanup: ++ free_rpc_delete_uring(&req); ++} ++SPDK_RPC_REGISTER("bdev_uring_delete", rpc_bdev_uring_delete, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/virtio/Makefile b/module/bdev/virtio/Makefile +index 608f1a0..3524b47 100644 +--- a/module/bdev/virtio/Makefile ++++ b/module/bdev/virtio/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = bdev_virtio_scsi.c bdev_virtio_blk.c bdev_virtio_rpc.c +-LIBNAME = bdev_virtio +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = bdev_virtio_scsi.c bdev_virtio_blk.c bdev_virtio_rpc.c ++LIBNAME = bdev_virtio ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/virtio/bdev_virtio.h b/module/bdev/virtio/bdev_virtio.h +index 69cfade..60f6115 100644 +--- a/module/bdev/virtio/bdev_virtio.h ++++ b/module/bdev/virtio/bdev_virtio.h +@@ -1,178 +1,178 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_BDEV_VIRTIO_H +-#define SPDK_BDEV_VIRTIO_H +- +-#include "spdk/bdev.h" +-#include "spdk/env.h" +- +-/** +- * Callback for creating virtio bdevs. +- * +- * \param ctx opaque context set by the user +- * \param errnum error code. 0 on success, negative errno on error. +- * \param bdevs contiguous array of created bdevs +- * \param bdev_cnt number of bdevs in the `bdevs` array +- */ +-typedef void (*bdev_virtio_create_cb)(void *ctx, int errnum, +- struct spdk_bdev **bdevs, size_t bdev_cnt); +- +-/** +- * Callback for removing virtio devices. +- * +- * \param ctx opaque context set by the user +- * \param errnum error code. 0 on success, negative errno on error. +- */ +-typedef void (*bdev_virtio_remove_cb)(void *ctx, int errnum); +- +-/** +- * Connect to a vhost-user Unix domain socket and create a Virtio SCSI device. +- * If the connection is successful, the device will be automatically scanned. +- * The scan consists of probing the targets on the device and will result in +- * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently +- * only one LUN per target is detected - LUN0. Note that the bdev creation is +- * run asynchronously in the background. After it's finished, the `cb_fn` +- * callback is called. +- * +- * \param name name for the virtio device. It will be inherited by all created +- * bdevs, which are named in the following format: t +- * \param path path to the socket +- * \param num_queues max number of request virtqueues to use. `vdev` will be +- * started successfully even if the host device supports less queues than requested. +- * \param queue_size depth of each queue +- * \param cb_fn function to be called after scanning all targets on the virtio +- * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. +- * \param cb_arg argument for the `cb_fn` +- * \return zero on success (device scan is started) or negative error code. +- * In case of error the \c cb_fn is not called. +- */ +-int bdev_virtio_user_scsi_dev_create(const char *name, const char *path, +- unsigned num_queues, unsigned queue_size, +- bdev_virtio_create_cb cb_fn, void *cb_arg); +- +-/** +- * Connect to a vfio-user Unix domain socket and create a Virtio SCSI device. +- * If the connection is successful, the device will be automatically scanned. +- * The scan consists of probing the targets on the device and will result in +- * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently +- * only one LUN per target is detected - LUN0. Note that the bdev creation is +- * run asynchronously in the background. After it's finished, the `cb_fn` +- * callback is called. +- * +- * \param name name for the virtio device. It will be inherited by all created +- * bdevs, which are named in the following format: t +- * \param path path to the socket +- * \param cb_fn function to be called after scanning all targets on the virtio +- * device. 
It's optional, can be NULL. See \c bdev_virtio_create_cb. +- * \param cb_arg argument for the `cb_fn` +- * \return zero on success (device scan is started) or negative error code. +- * In case of error the \c cb_fn is not called. +- */ +-int bdev_vfio_user_scsi_dev_create(const char *base_name, const char *path, +- bdev_virtio_create_cb cb_fn, void *cb_arg); +- +-/** +- * Attach virtio-pci device. This creates a Virtio SCSI device with the same +- * capabilities as the vhost-user equivalent. The device will be automatically +- * scanned for exposed SCSI targets. This will result in creating possibly multiple +- * Virtio SCSI bdevs - one for each target. Currently only one LUN per target is +- * detected - LUN0. Note that the bdev creation is run asynchronously in the +- * background. After it's finished, the `cb_fn` callback is called. +- * +- * \param name name for the virtio device. It will be inherited by all created +- * bdevs, which are named in the following format: t +- * \param pci_addr PCI address of the device to attach +- * \param cb_fn function to be called after scanning all targets on the virtio +- * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. +- * \param cb_arg argument for the `cb_fn` +- * \return zero on success (device scan is started) or negative error code. +- * In case of error the \c cb_fn is not called. +- */ +-int bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, +- bdev_virtio_create_cb cb_fn, void *cb_arg); +- +-/** +- * Remove a Virtio device with given name. This will destroy all bdevs exposed +- * by this device. +- * +- * \param name virtio device name +- * \param cb_fn function to be called after scanning all targets on the virtio +- * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. Possible +- * error codes are: +- * * ENODEV - couldn't find device with given name +- * * EBUSY - device is already being removed +- * \param cb_arg argument for the `cb_fn` +- * \return zero on success or -ENODEV if scsi dev does not exist +- */ +-int bdev_virtio_scsi_dev_remove(const char *name, +- bdev_virtio_remove_cb cb_fn, void *cb_arg); +- +-/** +- * Remove a Virtio device with given name. +- * +- * \param bdev virtio blk device bdev +- * \param cb_fn function to be called after removing bdev +- * \param cb_arg argument for the `cb_fn` +- * \return zero on success, -ENODEV if bdev with 'name' does not exist or +- * -EINVAL if bdev with 'name' is not a virtio blk device. +- */ +-int bdev_virtio_blk_dev_remove(const char *name, +- bdev_virtio_remove_cb cb_fn, void *cb_arg); +- +-/** +- * List all created Virtio-SCSI devices. +- * +- * \param write_ctx JSON context to write into +- */ +-void bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *write_ctx); +- +-/** +- * Connect to a vhost-user Unix domain socket and create a Virtio BLK bdev. +- * +- * \param name name for the virtio bdev +- * \param path path to the socket +- * \param num_queues max number of request virtqueues to use. `vdev` will be +- * started successfully even if the host device supports less queues than requested. +- * \param queue_size depth of each queue +- * \return virtio-blk bdev or NULL +- */ +-struct spdk_bdev *bdev_virtio_user_blk_dev_create(const char *name, const char *path, +- unsigned num_queues, unsigned queue_size); +- +-/** +- * Connect to a vfio-user Unix domain socket and create a Virtio BLK bdev. 
+- * +- * \param name name for the virtio bdev +- * \param path path to the socket +- * \return virtio-blk bdev or NULL +- */ +-struct spdk_bdev * +-bdev_virtio_vfio_user_blk_dev_create(const char *name, const char *path); +- +-/** +- * Attach virtio-pci device. This creates a Virtio BLK device with the same +- * capabilities as the vhost-user equivalent. +- * +- * \param name name for the virtio device. It will be inherited by all created +- * bdevs, which are named in the following format: t +- * \param pci_addr PCI address of the device to attach +- * \return virtio-blk bdev or NULL +- */ +-struct spdk_bdev *bdev_virtio_pci_blk_dev_create(const char *name, +- struct spdk_pci_addr *pci_addr); +- +-/** +- * Enable/Disable the virtio blk hotplug monitor or +- * change the monitor period time +- * +- * \param enabled True means to enable the hotplug monitor and the monitor +- * period time is period_us. False means to disable the hotplug monitor +- * \param period_us The period time of the hotplug monitor in us +- * \return 0 for success otherwise failure +- */ +-int bdev_virtio_pci_blk_set_hotplug(bool enabled, uint64_t period_us); +- +-#endif /* SPDK_BDEV_VIRTIO_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_VIRTIO_H ++#define SPDK_BDEV_VIRTIO_H ++ ++#include "spdk/bdev.h" ++#include "spdk/env.h" ++ ++/** ++ * Callback for creating virtio bdevs. ++ * ++ * \param ctx opaque context set by the user ++ * \param errnum error code. 0 on success, negative errno on error. ++ * \param bdevs contiguous array of created bdevs ++ * \param bdev_cnt number of bdevs in the `bdevs` array ++ */ ++typedef void (*bdev_virtio_create_cb)(void *ctx, int errnum, ++ struct spdk_bdev **bdevs, size_t bdev_cnt); ++ ++/** ++ * Callback for removing virtio devices. ++ * ++ * \param ctx opaque context set by the user ++ * \param errnum error code. 0 on success, negative errno on error. ++ */ ++typedef void (*bdev_virtio_remove_cb)(void *ctx, int errnum); ++ ++/** ++ * Connect to a vhost-user Unix domain socket and create a Virtio SCSI device. ++ * If the connection is successful, the device will be automatically scanned. ++ * The scan consists of probing the targets on the device and will result in ++ * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently ++ * only one LUN per target is detected - LUN0. Note that the bdev creation is ++ * run asynchronously in the background. After it's finished, the `cb_fn` ++ * callback is called. ++ * ++ * \param name name for the virtio device. It will be inherited by all created ++ * bdevs, which are named in the following format: t ++ * \param path path to the socket ++ * \param num_queues max number of request virtqueues to use. `vdev` will be ++ * started successfully even if the host device supports less queues than requested. ++ * \param queue_size depth of each queue ++ * \param cb_fn function to be called after scanning all targets on the virtio ++ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. ++ * \param cb_arg argument for the `cb_fn` ++ * \return zero on success (device scan is started) or negative error code. ++ * In case of error the \c cb_fn is not called. ++ */ ++int bdev_virtio_user_scsi_dev_create(const char *name, const char *path, ++ unsigned num_queues, unsigned queue_size, ++ bdev_virtio_create_cb cb_fn, void *cb_arg); ++ ++/** ++ * Connect to a vfio-user Unix domain socket and create a Virtio SCSI device. 
++ * If the connection is successful, the device will be automatically scanned. ++ * The scan consists of probing the targets on the device and will result in ++ * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently ++ * only one LUN per target is detected - LUN0. Note that the bdev creation is ++ * run asynchronously in the background. After it's finished, the `cb_fn` ++ * callback is called. ++ * ++ * \param name name for the virtio device. It will be inherited by all created ++ * bdevs, which are named in the following format: t ++ * \param path path to the socket ++ * \param cb_fn function to be called after scanning all targets on the virtio ++ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. ++ * \param cb_arg argument for the `cb_fn` ++ * \return zero on success (device scan is started) or negative error code. ++ * In case of error the \c cb_fn is not called. ++ */ ++int bdev_vfio_user_scsi_dev_create(const char *base_name, const char *path, ++ bdev_virtio_create_cb cb_fn, void *cb_arg); ++ ++/** ++ * Attach virtio-pci device. This creates a Virtio SCSI device with the same ++ * capabilities as the vhost-user equivalent. The device will be automatically ++ * scanned for exposed SCSI targets. This will result in creating possibly multiple ++ * Virtio SCSI bdevs - one for each target. Currently only one LUN per target is ++ * detected - LUN0. Note that the bdev creation is run asynchronously in the ++ * background. After it's finished, the `cb_fn` callback is called. ++ * ++ * \param name name for the virtio device. It will be inherited by all created ++ * bdevs, which are named in the following format: t ++ * \param pci_addr PCI address of the device to attach ++ * \param cb_fn function to be called after scanning all targets on the virtio ++ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. ++ * \param cb_arg argument for the `cb_fn` ++ * \return zero on success (device scan is started) or negative error code. ++ * In case of error the \c cb_fn is not called. ++ */ ++int bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, ++ bdev_virtio_create_cb cb_fn, void *cb_arg); ++ ++/** ++ * Remove a Virtio device with given name. This will destroy all bdevs exposed ++ * by this device. ++ * ++ * \param name virtio device name ++ * \param cb_fn function to be called after scanning all targets on the virtio ++ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. Possible ++ * error codes are: ++ * * ENODEV - couldn't find device with given name ++ * * EBUSY - device is already being removed ++ * \param cb_arg argument for the `cb_fn` ++ * \return zero on success or -ENODEV if scsi dev does not exist ++ */ ++int bdev_virtio_scsi_dev_remove(const char *name, ++ bdev_virtio_remove_cb cb_fn, void *cb_arg); ++ ++/** ++ * Remove a Virtio device with given name. ++ * ++ * \param bdev virtio blk device bdev ++ * \param cb_fn function to be called after removing bdev ++ * \param cb_arg argument for the `cb_fn` ++ * \return zero on success, -ENODEV if bdev with 'name' does not exist or ++ * -EINVAL if bdev with 'name' is not a virtio blk device. ++ */ ++int bdev_virtio_blk_dev_remove(const char *name, ++ bdev_virtio_remove_cb cb_fn, void *cb_arg); ++ ++/** ++ * List all created Virtio-SCSI devices. 
++ * ++ * \param write_ctx JSON context to write into ++ */ ++void bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *write_ctx); ++ ++/** ++ * Connect to a vhost-user Unix domain socket and create a Virtio BLK bdev. ++ * ++ * \param name name for the virtio bdev ++ * \param path path to the socket ++ * \param num_queues max number of request virtqueues to use. `vdev` will be ++ * started successfully even if the host device supports less queues than requested. ++ * \param queue_size depth of each queue ++ * \return virtio-blk bdev or NULL ++ */ ++struct spdk_bdev *bdev_virtio_user_blk_dev_create(const char *name, const char *path, ++ unsigned num_queues, unsigned queue_size); ++ ++/** ++ * Connect to a vfio-user Unix domain socket and create a Virtio BLK bdev. ++ * ++ * \param name name for the virtio bdev ++ * \param path path to the socket ++ * \return virtio-blk bdev or NULL ++ */ ++struct spdk_bdev * ++bdev_virtio_vfio_user_blk_dev_create(const char *name, const char *path); ++ ++/** ++ * Attach virtio-pci device. This creates a Virtio BLK device with the same ++ * capabilities as the vhost-user equivalent. ++ * ++ * \param name name for the virtio device. It will be inherited by all created ++ * bdevs, which are named in the following format: t ++ * \param pci_addr PCI address of the device to attach ++ * \return virtio-blk bdev or NULL ++ */ ++struct spdk_bdev *bdev_virtio_pci_blk_dev_create(const char *name, ++ struct spdk_pci_addr *pci_addr); ++ ++/** ++ * Enable/Disable the virtio blk hotplug monitor or ++ * change the monitor period time ++ * ++ * \param enabled True means to enable the hotplug monitor and the monitor ++ * period time is period_us. False means to disable the hotplug monitor ++ * \param period_us The period time of the hotplug monitor in us ++ * \return 0 for success otherwise failure ++ */ ++int bdev_virtio_pci_blk_set_hotplug(bool enabled, uint64_t period_us); ++ ++#endif /* SPDK_BDEV_VIRTIO_H */ +diff --git a/module/bdev/virtio/bdev_virtio_blk.c b/module/bdev/virtio/bdev_virtio_blk.c +index 728a557..9c53b10 100644 +--- a/module/bdev/virtio/bdev_virtio_blk.c ++++ b/module/bdev/virtio/bdev_virtio_blk.c +@@ -1,791 +1,791 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/endian.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/json.h" +- +-#include "spdk_internal/assert.h" +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +-#include "spdk_internal/virtio.h" +-#include "spdk_internal/vhost_user.h" +- +-#include +-#include +- +-#include "bdev_virtio.h" +- +-struct virtio_blk_dev { +- struct virtio_dev vdev; +- struct spdk_bdev bdev; +- bool readonly; +- bool unmap; +-}; +- +-struct virtio_blk_io_ctx { +- struct iovec iov_req; +- struct iovec iov_resp; +- struct iovec iov_unmap; +- struct virtio_blk_outhdr req; +- struct virtio_blk_discard_write_zeroes unmap; +- uint8_t resp; +-}; +- +-struct bdev_virtio_blk_io_channel { +- struct virtio_dev *vdev; +- +- /** Virtqueue exclusively assigned to this channel. */ +- struct virtqueue *vq; +- +- /** Virtio response poller. */ +- struct spdk_poller *poller; +-}; +- +-/* Features desired/implemented by this driver. 
*/ +-#define VIRTIO_BLK_DEV_SUPPORTED_FEATURES \ +- (1ULL << VIRTIO_BLK_F_SIZE_MAX | \ +- 1ULL << VIRTIO_BLK_F_SEG_MAX | \ +- 1ULL << VIRTIO_BLK_F_BLK_SIZE | \ +- 1ULL << VIRTIO_BLK_F_TOPOLOGY | \ +- 1ULL << VIRTIO_BLK_F_MQ | \ +- 1ULL << VIRTIO_BLK_F_RO | \ +- 1ULL << VIRTIO_BLK_F_DISCARD | \ +- 1ULL << VIRTIO_RING_F_EVENT_IDX) +- +-/* 10 sec for max poll period */ +-#define VIRTIO_BLK_HOTPLUG_POLL_PERIOD_MAX 10000000ULL +-/* Default poll period is 100ms */ +-#define VIRTIO_BLK_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL +- +-static struct spdk_poller *g_blk_hotplug_poller = NULL; +-static int g_blk_hotplug_fd = -1; +- +-static int bdev_virtio_initialize(void); +-static int bdev_virtio_blk_get_ctx_size(void); +- +-static struct spdk_bdev_module virtio_blk_if = { +- .name = "virtio_blk", +- .module_init = bdev_virtio_initialize, +- .get_ctx_size = bdev_virtio_blk_get_ctx_size, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(virtio_blk, &virtio_blk_if) +- +-static int bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf); +-static void bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf); +- +-static struct virtio_blk_io_ctx * +-bdev_virtio_blk_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_blk_outhdr *req; +- uint8_t *resp; +- struct virtio_blk_discard_write_zeroes *desc; +- +- struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; +- +- req = &io_ctx->req; +- resp = &io_ctx->resp; +- desc = &io_ctx->unmap; +- +- io_ctx->iov_req.iov_base = req; +- io_ctx->iov_req.iov_len = sizeof(*req); +- +- io_ctx->iov_resp.iov_base = resp; +- io_ctx->iov_resp.iov_len = sizeof(*resp); +- +- io_ctx->iov_unmap.iov_base = desc; +- io_ctx->iov_unmap.iov_len = sizeof(*desc); +- +- memset(req, 0, sizeof(*req)); +- return io_ctx; +-} +- +-static void +-bdev_virtio_blk_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_virtio_blk_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); +- struct virtqueue *vq = virtio_channel->vq; +- struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; +- int rc; +- +- rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); +- if (rc == -ENOMEM) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- return; +- } else if (rc != 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { +- virtqueue_req_add_iovs(vq, &io_ctx->iov_unmap, 1, SPDK_VIRTIO_DESC_RO); +- } else { +- virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->type == SPDK_BDEV_IO_TYPE_READ ? 
+- SPDK_VIRTIO_DESC_WR : SPDK_VIRTIO_DESC_RO); +- } +- virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); +- +- virtqueue_req_flush(vq); +-} +- +-static void +-bdev_virtio_command(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_blk_io_ctx *io_ctx = bdev_virtio_blk_init_io_vreq(ch, bdev_io); +- struct virtio_blk_outhdr *req = &io_ctx->req; +- struct virtio_blk_discard_write_zeroes *desc = &io_ctx->unmap; +- +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- req->type = VIRTIO_BLK_T_IN; +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { +- req->type = VIRTIO_BLK_T_OUT; +- } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { +- req->type = VIRTIO_BLK_T_DISCARD; +- desc->sector = bdev_io->u.bdev.offset_blocks * +- spdk_bdev_get_block_size(bdev_io->bdev) / 512; +- desc->num_sectors = bdev_io->u.bdev.num_blocks * +- spdk_bdev_get_block_size(bdev_io->bdev) / 512; +- desc->flags = 0; +- } +- +- req->sector = bdev_io->u.bdev.offset_blocks * +- spdk_bdev_get_block_size(bdev_io->bdev) / 512; +- +- bdev_virtio_blk_send_io(ch, bdev_io); +-} +- +-static void +-bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- bdev_virtio_command(ch, bdev_io); +-} +- +-static int +-_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_blk_dev *bvdev = bdev_io->bdev->ctxt; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- return 0; +- case SPDK_BDEV_IO_TYPE_WRITE: +- if (bvdev->readonly) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } else { +- bdev_virtio_command(ch, bdev_io); +- } +- return 0; +- case SPDK_BDEV_IO_TYPE_RESET: +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- if (bvdev->unmap) { +- bdev_virtio_command(ch, bdev_io); +- } else { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- return 0; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- default: +- return -1; +- } +- +- SPDK_UNREACHABLE(); +-} +- +-static void +-bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static bool +-bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- struct virtio_blk_dev *bvdev = ctx; +- +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_RESET: +- return true; +- case SPDK_BDEV_IO_TYPE_WRITE: +- return !bvdev->readonly; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- return bvdev->unmap; +- case SPDK_BDEV_IO_TYPE_FLUSH: +- default: +- return false; +- } +-} +- +-static struct spdk_io_channel * +-bdev_virtio_get_io_channel(void *ctx) +-{ +- struct virtio_blk_dev *bvdev = ctx; +- +- return spdk_get_io_channel(bvdev); +-} +- +-static void +-virtio_blk_dev_unregister_cb(void *io_device) +-{ +- struct virtio_blk_dev *bvdev = io_device; +- struct virtio_dev *vdev = &bvdev->vdev; +- +- virtio_dev_stop(vdev); +- virtio_dev_destruct(vdev); +- spdk_bdev_destruct_done(&bvdev->bdev, 0); +- free(bvdev); +-} +- +-static int +-bdev_virtio_disk_destruct(void *ctx) +-{ +- struct virtio_blk_dev *bvdev = ctx; +- +- spdk_io_device_unregister(bvdev, 
virtio_blk_dev_unregister_cb); +- return 1; +-} +- +-int +-bdev_virtio_blk_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) +-{ +- return spdk_bdev_unregister_by_name(name, &virtio_blk_if, cb_fn, cb_arg); +-} +- +-static int +-bdev_virtio_dump_json_config(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct virtio_blk_dev *bvdev = ctx; +- +- virtio_dev_dump_json_info(&bvdev->vdev, w); +- return 0; +-} +- +-static void +-bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- struct virtio_blk_dev *bvdev = bdev->ctxt; +- +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", bvdev->vdev.name); +- spdk_json_write_named_string(w, "dev_type", "blk"); +- +- /* Write transport specific parameters. */ +- bvdev->vdev.backend_ops->write_json_config(&bvdev->vdev, w); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +-} +- +-static const struct spdk_bdev_fn_table virtio_fn_table = { +- .destruct = bdev_virtio_disk_destruct, +- .submit_request = bdev_virtio_submit_request, +- .io_type_supported = bdev_virtio_io_type_supported, +- .get_io_channel = bdev_virtio_get_io_channel, +- .dump_info_json = bdev_virtio_dump_json_config, +- .write_config_json = bdev_virtio_write_config_json, +-}; +- +-static void +-bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; +- +- spdk_bdev_io_complete(bdev_io, io_ctx->resp == VIRTIO_BLK_S_OK ? +- SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static int +-bdev_virtio_poll(void *arg) +-{ +- struct bdev_virtio_blk_io_channel *ch = arg; +- void *io[32]; +- uint32_t io_len[32]; +- uint16_t i, cnt; +- +- cnt = virtio_recv_pkts(ch->vq, io, io_len, SPDK_COUNTOF(io)); +- for (i = 0; i < cnt; ++i) { +- bdev_virtio_io_cpl(io[i]); +- } +- +- return cnt; +-} +- +-static int +-bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf) +-{ +- struct virtio_blk_dev *bvdev = io_device; +- struct virtio_dev *vdev = &bvdev->vdev; +- struct bdev_virtio_blk_io_channel *ch = ctx_buf; +- struct virtqueue *vq; +- int32_t queue_idx; +- +- queue_idx = virtio_dev_find_and_acquire_queue(vdev, 0); +- if (queue_idx < 0) { +- SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); +- return -1; +- } +- +- vq = vdev->vqs[queue_idx]; +- +- ch->vdev = vdev; +- ch->vq = vq; +- +- ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); +- return 0; +-} +- +-static void +-bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct virtio_blk_dev *bvdev = io_device; +- struct virtio_dev *vdev = &bvdev->vdev; +- struct bdev_virtio_blk_io_channel *ch = ctx_buf; +- struct virtqueue *vq = ch->vq; +- +- spdk_poller_unregister(&ch->poller); +- virtio_dev_release_queue(vdev, vq->vq_queue_index); +-} +- +-static int +-virtio_blk_dev_init(struct virtio_blk_dev *bvdev, uint16_t max_queues) +-{ +- struct virtio_dev *vdev = &bvdev->vdev; +- struct spdk_bdev *bdev = &bvdev->bdev; +- uint64_t capacity, num_blocks; +- uint32_t block_size, size_max, seg_max; +- uint16_t host_max_queues; +- int rc; +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) { +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, blk_size), +- &block_size, sizeof(block_size)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", 
vdev->name, spdk_strerror(-rc)); +- return rc; +- } +- +- if (block_size == 0 || block_size % 512 != 0) { +- SPDK_ERRLOG("%s: invalid block size (%"PRIu32"). Must be " +- "a multiple of 512.\n", vdev->name, block_size); +- return -EIO; +- } +- } else { +- block_size = 512; +- } +- +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, capacity), +- &capacity, sizeof(capacity)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- return rc; +- } +- +- /* `capacity` is a number of 512-byte sectors. */ +- num_blocks = capacity * 512 / block_size; +- if (num_blocks == 0) { +- SPDK_ERRLOG("%s: size too small (size: %"PRIu64", blocksize: %"PRIu32").\n", +- vdev->name, capacity * 512, block_size); +- return -EIO; +- } +- +- if ((capacity * 512) % block_size != 0) { +- SPDK_WARNLOG("%s: size has been rounded down to the nearest block size boundary. " +- "(block size: %"PRIu32", previous size: %"PRIu64", new size: %"PRIu64")\n", +- vdev->name, block_size, capacity * 512, num_blocks * block_size); +- } +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), +- &host_max_queues, sizeof(host_max_queues)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- return rc; +- } +- } else { +- host_max_queues = 1; +- } +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_SIZE_MAX)) { +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, size_max), +- &size_max, sizeof(size_max)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- return rc; +- } +- +- if (spdk_unlikely(size_max < block_size)) { +- SPDK_WARNLOG("%s: minimum segment size is set to block size %u forcefully.\n", +- vdev->name, block_size); +- size_max = block_size; +- } +- +- bdev->max_segment_size = size_max; +- } +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_SEG_MAX)) { +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, seg_max), +- &seg_max, sizeof(seg_max)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- return rc; +- } +- +- if (spdk_unlikely(seg_max == 0)) { +- SPDK_ERRLOG("%s: virtio blk SEG_MAX can't be 0\n", vdev->name); +- return -EINVAL; +- } +- +- bdev->max_num_segments = seg_max; +- } +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_RO)) { +- bvdev->readonly = true; +- } +- +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { +- bvdev->unmap = true; +- } +- +- if (max_queues == 0) { +- SPDK_ERRLOG("%s: requested 0 request queues (%"PRIu16" available).\n", +- vdev->name, host_max_queues); +- return -EINVAL; +- } +- +- if (max_queues > host_max_queues) { +- SPDK_WARNLOG("%s: requested %"PRIu16" request queues " +- "but only %"PRIu16" available.\n", +- vdev->name, max_queues, host_max_queues); +- max_queues = host_max_queues; +- } +- +- /* bdev is tied with the virtio device; we can reuse the name */ +- bdev->name = vdev->name; +- rc = virtio_dev_start(vdev, max_queues, 0); +- if (rc != 0) { +- return rc; +- } +- +- bdev->product_name = "VirtioBlk Disk"; +- bdev->write_cache = 0; +- bdev->blocklen = block_size; +- bdev->blockcnt = num_blocks; +- +- bdev->ctxt = bvdev; +- bdev->fn_table = &virtio_fn_table; +- bdev->module = &virtio_blk_if; +- +- spdk_io_device_register(bvdev, bdev_virtio_blk_ch_create_cb, +- bdev_virtio_blk_ch_destroy_cb, +- sizeof(struct bdev_virtio_blk_io_channel), 
+- vdev->name); +- +- rc = spdk_bdev_register(bdev); +- if (rc) { +- SPDK_ERRLOG("Failed to register bdev name=%s\n", bdev->name); +- spdk_io_device_unregister(bvdev, NULL); +- virtio_dev_stop(vdev); +- return rc; +- } +- +- return 0; +-} +- +-static struct virtio_blk_dev * +-virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) +-{ +- static int pci_dev_counter = 0; +- struct virtio_blk_dev *bvdev; +- struct virtio_dev *vdev; +- char *default_name = NULL; +- uint16_t num_queues; +- int rc; +- +- bvdev = calloc(1, sizeof(*bvdev)); +- if (bvdev == NULL) { +- SPDK_ERRLOG("virtio device calloc failed\n"); +- return NULL; +- } +- vdev = &bvdev->vdev; +- +- if (name == NULL) { +- default_name = spdk_sprintf_alloc("VirtioBlk%"PRIu32, pci_dev_counter++); +- if (default_name == NULL) { +- free(vdev); +- return NULL; +- } +- name = default_name; +- } +- +- rc = virtio_pci_dev_init(vdev, name, pci_ctx); +- free(default_name); +- +- if (rc != 0) { +- free(bvdev); +- return NULL; +- } +- +- rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); +- if (rc != 0) { +- goto fail; +- } +- +- /* TODO: add a way to limit usable virtqueues */ +- if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), +- &num_queues, sizeof(num_queues)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- goto fail; +- } +- } else { +- num_queues = 1; +- } +- +- rc = virtio_blk_dev_init(bvdev, num_queues); +- if (rc != 0) { +- goto fail; +- } +- +- return bvdev; +- +-fail: +- vdev->ctx = NULL; +- virtio_dev_destruct(vdev); +- free(bvdev); +- return NULL; +-} +- +-static struct virtio_blk_dev * +-virtio_user_blk_dev_create(const char *name, const char *path, +- uint16_t num_queues, uint32_t queue_size) +-{ +- struct virtio_blk_dev *bvdev; +- uint64_t feature_bits; +- int rc; +- +- bvdev = calloc(1, sizeof(*bvdev)); +- if (bvdev == NULL) { +- SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); +- return NULL; +- } +- +- rc = virtio_user_dev_init(&bvdev->vdev, name, path, queue_size); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); +- free(bvdev); +- return NULL; +- } +- +- feature_bits = VIRTIO_BLK_DEV_SUPPORTED_FEATURES; +- feature_bits |= (1ULL << VHOST_USER_F_PROTOCOL_FEATURES); +- rc = virtio_dev_reset(&bvdev->vdev, feature_bits); +- if (rc != 0) { +- virtio_dev_destruct(&bvdev->vdev); +- free(bvdev); +- return NULL; +- } +- +- rc = virtio_blk_dev_init(bvdev, num_queues); +- if (rc != 0) { +- virtio_dev_destruct(&bvdev->vdev); +- free(bvdev); +- return NULL; +- } +- +- return bvdev; +-} +- +-struct bdev_virtio_pci_dev_create_ctx { +- const char *name; +- struct virtio_blk_dev *ret; +-}; +- +-static int +-bdev_virtio_pci_blk_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +-{ +- struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; +- +- create_ctx->ret = virtio_pci_blk_dev_create(create_ctx->name, pci_ctx); +- if (create_ctx->ret == NULL) { +- return -1; +- } +- +- return 0; +-} +- +-struct spdk_bdev * +-bdev_virtio_pci_blk_dev_create(const char *name, struct spdk_pci_addr *pci_addr) +-{ +- struct bdev_virtio_pci_dev_create_ctx create_ctx; +- +- create_ctx.name = name; +- create_ctx.ret = NULL; +- +- virtio_pci_dev_attach(bdev_virtio_pci_blk_dev_create_cb, &create_ctx, +- VIRTIO_ID_BLOCK, pci_addr); +- +- if (create_ctx.ret == NULL) { +- return NULL; +- } +- +- return &create_ctx.ret->bdev; +-} +- 
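(Illustrative aside, not part of the patch: as a usage sketch for the public API declared in bdev_virtio.h earlier in this patch, the snippet below shows how an application might attach one vhost-user virtio-blk bdev. The bdev name, socket path, and queue sizing are invented placeholders, not values taken from this patch.)

	#include "bdev_virtio.h"
	#include "spdk/log.h"

	/* Hypothetical helper: attach a single virtio-blk bdev over vhost-user. */
	static int
	attach_example_virtio_blk(void)
	{
		struct spdk_bdev *bdev;

		bdev = bdev_virtio_user_blk_dev_create("VirtioBlk0", "/var/tmp/vhost.0",
						       1 /* num_queues */, 128 /* queue_size */);
		if (bdev == NULL) {
			SPDK_ERRLOG("Failed to create virtio-blk bdev over /var/tmp/vhost.0\n");
			return -1;
		}

		return 0;
	}
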
+-static int +-bdev_virtio_pci_blk_monitor(void *arg) +-{ +- const char *vdev_name; +- struct bdev_virtio_pci_dev_create_ctx create_ctx; +- +- while ((vdev_name = virtio_pci_dev_event_process(g_blk_hotplug_fd, VIRTIO_ID_BLOCK)) != NULL) { +- bdev_virtio_blk_dev_remove(vdev_name, NULL, NULL); +- } +- +- /* Enumerate virtio pci_blk device */ +- memset(&create_ctx, 0, sizeof(create_ctx)); +- virtio_pci_dev_enumerate(bdev_virtio_pci_blk_dev_create_cb, &create_ctx, +- VIRTIO_ID_BLOCK); +- +- return SPDK_POLLER_BUSY; +-} +- +-int +-bdev_virtio_pci_blk_set_hotplug(bool enabled, uint64_t period_us) +-{ +- if (enabled == true && !spdk_process_is_primary()) { +- return -EPERM; +- } +- +- if (g_blk_hotplug_poller) { +- close(g_blk_hotplug_fd); +- spdk_poller_unregister(&g_blk_hotplug_poller); +- } +- +- if (!enabled) { +- return 0; +- } +- +- g_blk_hotplug_fd = spdk_pci_event_listen(); +- if (g_blk_hotplug_fd < 0) { +- return g_blk_hotplug_fd; +- } +- +- period_us = period_us ? period_us : VIRTIO_BLK_HOTPLUG_POLL_PERIOD_DEFAULT; +- period_us = spdk_min(period_us, VIRTIO_BLK_HOTPLUG_POLL_PERIOD_MAX); +- g_blk_hotplug_poller = spdk_poller_register(bdev_virtio_pci_blk_monitor, NULL, period_us); +- if (!g_blk_hotplug_poller) { +- close(g_blk_hotplug_fd); +- return -1; +- } +- +- return 0; +-} +- +-static int +-bdev_virtio_initialize(void) +-{ +- return 0; +-} +- +-struct spdk_bdev * +-bdev_virtio_user_blk_dev_create(const char *name, const char *path, +- unsigned num_queues, unsigned queue_size) +-{ +- struct virtio_blk_dev *bvdev; +- +- bvdev = virtio_user_blk_dev_create(name, path, num_queues, queue_size); +- if (bvdev == NULL) { +- return NULL; +- } +- +- return &bvdev->bdev; +-} +- +-struct spdk_bdev * +-bdev_virtio_vfio_user_blk_dev_create(const char *name, const char *path) +-{ +- struct virtio_blk_dev *bvdev; +- uint16_t num_queues = 0; +- int rc; +- +- bvdev = calloc(1, sizeof(*bvdev)); +- if (bvdev == NULL) { +- SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); +- return NULL; +- } +- +- rc = virtio_vfio_user_dev_init(&bvdev->vdev, name, path); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create %s as virtio device\n", path); +- free(bvdev); +- return NULL; +- } +- +- rc = virtio_dev_reset(&bvdev->vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to reset %s as virtio device\n", path); +- virtio_dev_destruct(&bvdev->vdev); +- free(bvdev); +- return NULL; +- } +- +- if (virtio_dev_has_feature(&bvdev->vdev, VIRTIO_BLK_F_MQ)) { +- rc = virtio_dev_read_dev_config(&bvdev->vdev, offsetof(struct virtio_blk_config, num_queues), +- &num_queues, sizeof(num_queues)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", name, spdk_strerror(-rc)); +- virtio_dev_destruct(&bvdev->vdev); +- free(bvdev); +- return NULL; +- } +- } else { +- num_queues = 1; +- } +- +- rc = virtio_blk_dev_init(bvdev, num_queues); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to initialize %s as virtio device\n", path); +- virtio_dev_destruct(&bvdev->vdev); +- free(bvdev); +- return NULL; +- } +- +- return &bvdev->bdev; +-} +- +-static int +-bdev_virtio_blk_get_ctx_size(void) +-{ +- return sizeof(struct virtio_blk_io_ctx); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(virtio_blk) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/endian.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/json.h" ++ ++#include "spdk_internal/assert.h" ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++#include "spdk_internal/virtio.h" ++#include "spdk_internal/vhost_user.h" ++ ++#include ++#include ++ ++#include "bdev_virtio.h" ++ ++struct virtio_blk_dev { ++ struct virtio_dev vdev; ++ struct spdk_bdev bdev; ++ bool readonly; ++ bool unmap; ++}; ++ ++struct virtio_blk_io_ctx { ++ struct iovec iov_req; ++ struct iovec iov_resp; ++ struct iovec iov_unmap; ++ struct virtio_blk_outhdr req; ++ struct virtio_blk_discard_write_zeroes unmap; ++ uint8_t resp; ++}; ++ ++struct bdev_virtio_blk_io_channel { ++ struct virtio_dev *vdev; ++ ++ /** Virtqueue exclusively assigned to this channel. */ ++ struct virtqueue *vq; ++ ++ /** Virtio response poller. */ ++ struct spdk_poller *poller; ++}; ++ ++/* Features desired/implemented by this driver. */ ++#define VIRTIO_BLK_DEV_SUPPORTED_FEATURES \ ++ (1ULL << VIRTIO_BLK_F_SIZE_MAX | \ ++ 1ULL << VIRTIO_BLK_F_SEG_MAX | \ ++ 1ULL << VIRTIO_BLK_F_BLK_SIZE | \ ++ 1ULL << VIRTIO_BLK_F_TOPOLOGY | \ ++ 1ULL << VIRTIO_BLK_F_MQ | \ ++ 1ULL << VIRTIO_BLK_F_RO | \ ++ 1ULL << VIRTIO_BLK_F_DISCARD | \ ++ 1ULL << VIRTIO_RING_F_EVENT_IDX) ++ ++/* 10 sec for max poll period */ ++#define VIRTIO_BLK_HOTPLUG_POLL_PERIOD_MAX 10000000ULL ++/* Default poll period is 100ms */ ++#define VIRTIO_BLK_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL ++ ++static struct spdk_poller *g_blk_hotplug_poller = NULL; ++static int g_blk_hotplug_fd = -1; ++ ++static int bdev_virtio_initialize(void); ++static int bdev_virtio_blk_get_ctx_size(void); ++ ++static struct spdk_bdev_module virtio_blk_if = { ++ .name = "virtio_blk", ++ .module_init = bdev_virtio_initialize, ++ .get_ctx_size = bdev_virtio_blk_get_ctx_size, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(virtio_blk, &virtio_blk_if) ++ ++static int bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf); ++static void bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf); ++ ++static struct virtio_blk_io_ctx * ++bdev_virtio_blk_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_blk_outhdr *req; ++ uint8_t *resp; ++ struct virtio_blk_discard_write_zeroes *desc; ++ ++ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; ++ ++ req = &io_ctx->req; ++ resp = &io_ctx->resp; ++ desc = &io_ctx->unmap; ++ ++ io_ctx->iov_req.iov_base = req; ++ io_ctx->iov_req.iov_len = sizeof(*req); ++ ++ io_ctx->iov_resp.iov_base = resp; ++ io_ctx->iov_resp.iov_len = sizeof(*resp); ++ ++ io_ctx->iov_unmap.iov_base = desc; ++ io_ctx->iov_unmap.iov_len = sizeof(*desc); ++ ++ memset(req, 0, sizeof(*req)); ++ return io_ctx; ++} ++ ++static void ++bdev_virtio_blk_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_virtio_blk_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); ++ struct virtqueue *vq = virtio_channel->vq; ++ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; ++ int rc; ++ ++ rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); ++ if (rc == -ENOMEM) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ return; ++ } else if (rc != 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, 
SPDK_VIRTIO_DESC_RO); ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_unmap, 1, SPDK_VIRTIO_DESC_RO); ++ } else { ++ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->type == SPDK_BDEV_IO_TYPE_READ ? ++ SPDK_VIRTIO_DESC_WR : SPDK_VIRTIO_DESC_RO); ++ } ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); ++ ++ virtqueue_req_flush(vq); ++} ++ ++static void ++bdev_virtio_command(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_blk_io_ctx *io_ctx = bdev_virtio_blk_init_io_vreq(ch, bdev_io); ++ struct virtio_blk_outhdr *req = &io_ctx->req; ++ struct virtio_blk_discard_write_zeroes *desc = &io_ctx->unmap; ++ ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ req->type = VIRTIO_BLK_T_IN; ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { ++ req->type = VIRTIO_BLK_T_OUT; ++ } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_UNMAP) { ++ req->type = VIRTIO_BLK_T_DISCARD; ++ desc->sector = bdev_io->u.bdev.offset_blocks * ++ spdk_bdev_get_block_size(bdev_io->bdev) / 512; ++ desc->num_sectors = bdev_io->u.bdev.num_blocks * ++ spdk_bdev_get_block_size(bdev_io->bdev) / 512; ++ desc->flags = 0; ++ } ++ ++ req->sector = bdev_io->u.bdev.offset_blocks * ++ spdk_bdev_get_block_size(bdev_io->bdev) / 512; ++ ++ bdev_virtio_blk_send_io(ch, bdev_io); ++} ++ ++static void ++bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ bdev_virtio_command(ch, bdev_io); ++} ++ ++static int ++_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_blk_dev *bvdev = bdev_io->bdev->ctxt; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ if (bvdev->readonly) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } else { ++ bdev_virtio_command(ch, bdev_io); ++ } ++ return 0; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ if (bvdev->unmap) { ++ bdev_virtio_command(ch, bdev_io); ++ } else { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ return 0; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ default: ++ return -1; ++ } ++ ++ SPDK_UNREACHABLE(); ++} ++ ++static void ++bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static bool ++bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ struct virtio_blk_dev *bvdev = ctx; ++ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ return true; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return !bvdev->readonly; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ return bvdev->unmap; ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ default: ++ return false; ++ } ++} ++ ++static struct spdk_io_channel * ++bdev_virtio_get_io_channel(void *ctx) ++{ ++ struct virtio_blk_dev *bvdev = ctx; ++ ++ return spdk_get_io_channel(bvdev); ++} ++ ++static void ++virtio_blk_dev_unregister_cb(void *io_device) ++{ ++ struct virtio_blk_dev *bvdev = io_device; ++ struct virtio_dev *vdev 
= &bvdev->vdev; ++ ++ virtio_dev_stop(vdev); ++ virtio_dev_destruct(vdev); ++ spdk_bdev_destruct_done(&bvdev->bdev, 0); ++ free(bvdev); ++} ++ ++static int ++bdev_virtio_disk_destruct(void *ctx) ++{ ++ struct virtio_blk_dev *bvdev = ctx; ++ ++ spdk_io_device_unregister(bvdev, virtio_blk_dev_unregister_cb); ++ return 1; ++} ++ ++int ++bdev_virtio_blk_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) ++{ ++ return spdk_bdev_unregister_by_name(name, &virtio_blk_if, cb_fn, cb_arg); ++} ++ ++static int ++bdev_virtio_dump_json_config(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct virtio_blk_dev *bvdev = ctx; ++ ++ virtio_dev_dump_json_info(&bvdev->vdev, w); ++ return 0; ++} ++ ++static void ++bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ struct virtio_blk_dev *bvdev = bdev->ctxt; ++ ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", bvdev->vdev.name); ++ spdk_json_write_named_string(w, "dev_type", "blk"); ++ ++ /* Write transport specific parameters. */ ++ bvdev->vdev.backend_ops->write_json_config(&bvdev->vdev, w); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++} ++ ++static const struct spdk_bdev_fn_table virtio_fn_table = { ++ .destruct = bdev_virtio_disk_destruct, ++ .submit_request = bdev_virtio_submit_request, ++ .io_type_supported = bdev_virtio_io_type_supported, ++ .get_io_channel = bdev_virtio_get_io_channel, ++ .dump_info_json = bdev_virtio_dump_json_config, ++ .write_config_json = bdev_virtio_write_config_json, ++}; ++ ++static void ++bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; ++ ++ spdk_bdev_io_complete(bdev_io, io_ctx->resp == VIRTIO_BLK_S_OK ? 
++ SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static int ++bdev_virtio_poll(void *arg) ++{ ++ struct bdev_virtio_blk_io_channel *ch = arg; ++ void *io[32]; ++ uint32_t io_len[32]; ++ uint16_t i, cnt; ++ ++ cnt = virtio_recv_pkts(ch->vq, io, io_len, SPDK_COUNTOF(io)); ++ for (i = 0; i < cnt; ++i) { ++ bdev_virtio_io_cpl(io[i]); ++ } ++ ++ return cnt; ++} ++ ++static int ++bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct virtio_blk_dev *bvdev = io_device; ++ struct virtio_dev *vdev = &bvdev->vdev; ++ struct bdev_virtio_blk_io_channel *ch = ctx_buf; ++ struct virtqueue *vq; ++ int32_t queue_idx; ++ ++ queue_idx = virtio_dev_find_and_acquire_queue(vdev, 0); ++ if (queue_idx < 0) { ++ SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); ++ return -1; ++ } ++ ++ vq = vdev->vqs[queue_idx]; ++ ++ ch->vdev = vdev; ++ ch->vq = vq; ++ ++ ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); ++ return 0; ++} ++ ++static void ++bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct virtio_blk_dev *bvdev = io_device; ++ struct virtio_dev *vdev = &bvdev->vdev; ++ struct bdev_virtio_blk_io_channel *ch = ctx_buf; ++ struct virtqueue *vq = ch->vq; ++ ++ spdk_poller_unregister(&ch->poller); ++ virtio_dev_release_queue(vdev, vq->vq_queue_index); ++} ++ ++static int ++virtio_blk_dev_init(struct virtio_blk_dev *bvdev, uint16_t max_queues) ++{ ++ struct virtio_dev *vdev = &bvdev->vdev; ++ struct spdk_bdev *bdev = &bvdev->bdev; ++ uint64_t capacity, num_blocks; ++ uint32_t block_size, size_max, seg_max; ++ uint16_t host_max_queues; ++ int rc; ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) { ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, blk_size), ++ &block_size, sizeof(block_size)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ if (block_size == 0 || block_size % 512 != 0) { ++ SPDK_ERRLOG("%s: invalid block size (%"PRIu32"). Must be " ++ "a multiple of 512.\n", vdev->name, block_size); ++ return -EIO; ++ } ++ } else { ++ block_size = 512; ++ } ++ ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, capacity), ++ &capacity, sizeof(capacity)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ /* `capacity` is a number of 512-byte sectors. */ ++ num_blocks = capacity * 512 / block_size; ++ if (num_blocks == 0) { ++ SPDK_ERRLOG("%s: size too small (size: %"PRIu64", blocksize: %"PRIu32").\n", ++ vdev->name, capacity * 512, block_size); ++ return -EIO; ++ } ++ ++ if ((capacity * 512) % block_size != 0) { ++ SPDK_WARNLOG("%s: size has been rounded down to the nearest block size boundary. 
" ++ "(block size: %"PRIu32", previous size: %"PRIu64", new size: %"PRIu64")\n", ++ vdev->name, block_size, capacity * 512, num_blocks * block_size); ++ } ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), ++ &host_max_queues, sizeof(host_max_queues)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ return rc; ++ } ++ } else { ++ host_max_queues = 1; ++ } ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_SIZE_MAX)) { ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, size_max), ++ &size_max, sizeof(size_max)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ if (spdk_unlikely(size_max < block_size)) { ++ SPDK_WARNLOG("%s: minimum segment size is set to block size %u forcefully.\n", ++ vdev->name, block_size); ++ size_max = block_size; ++ } ++ ++ bdev->max_segment_size = size_max; ++ } ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_SEG_MAX)) { ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, seg_max), ++ &seg_max, sizeof(seg_max)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ return rc; ++ } ++ ++ if (spdk_unlikely(seg_max == 0)) { ++ SPDK_ERRLOG("%s: virtio blk SEG_MAX can't be 0\n", vdev->name); ++ return -EINVAL; ++ } ++ ++ bdev->max_num_segments = seg_max; ++ } ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_RO)) { ++ bvdev->readonly = true; ++ } ++ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { ++ bvdev->unmap = true; ++ } ++ ++ if (max_queues == 0) { ++ SPDK_ERRLOG("%s: requested 0 request queues (%"PRIu16" available).\n", ++ vdev->name, host_max_queues); ++ return -EINVAL; ++ } ++ ++ if (max_queues > host_max_queues) { ++ SPDK_WARNLOG("%s: requested %"PRIu16" request queues " ++ "but only %"PRIu16" available.\n", ++ vdev->name, max_queues, host_max_queues); ++ max_queues = host_max_queues; ++ } ++ ++ /* bdev is tied with the virtio device; we can reuse the name */ ++ bdev->name = vdev->name; ++ rc = virtio_dev_start(vdev, max_queues, 0); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ bdev->product_name = "VirtioBlk Disk"; ++ bdev->write_cache = 0; ++ bdev->blocklen = block_size; ++ bdev->blockcnt = num_blocks; ++ ++ bdev->ctxt = bvdev; ++ bdev->fn_table = &virtio_fn_table; ++ bdev->module = &virtio_blk_if; ++ ++ spdk_io_device_register(bvdev, bdev_virtio_blk_ch_create_cb, ++ bdev_virtio_blk_ch_destroy_cb, ++ sizeof(struct bdev_virtio_blk_io_channel), ++ vdev->name); ++ ++ rc = spdk_bdev_register(bdev); ++ if (rc) { ++ SPDK_ERRLOG("Failed to register bdev name=%s\n", bdev->name); ++ spdk_io_device_unregister(bvdev, NULL); ++ virtio_dev_stop(vdev); ++ return rc; ++ } ++ ++ return 0; ++} ++ ++static struct virtio_blk_dev * ++virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) ++{ ++ static int pci_dev_counter = 0; ++ struct virtio_blk_dev *bvdev; ++ struct virtio_dev *vdev; ++ char *default_name = NULL; ++ uint16_t num_queues; ++ int rc; ++ ++ bvdev = calloc(1, sizeof(*bvdev)); ++ if (bvdev == NULL) { ++ SPDK_ERRLOG("virtio device calloc failed\n"); ++ return NULL; ++ } ++ vdev = &bvdev->vdev; ++ ++ if (name == NULL) { ++ default_name = spdk_sprintf_alloc("VirtioBlk%"PRIu32, pci_dev_counter++); ++ if (default_name == NULL) { ++ free(vdev); ++ return NULL; ++ } ++ name = default_name; ++ } ++ ++ rc = virtio_pci_dev_init(vdev, name, 
pci_ctx); ++ free(default_name); ++ ++ if (rc != 0) { ++ free(bvdev); ++ return NULL; ++ } ++ ++ rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); ++ if (rc != 0) { ++ goto fail; ++ } ++ ++ /* TODO: add a way to limit usable virtqueues */ ++ if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), ++ &num_queues, sizeof(num_queues)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ goto fail; ++ } ++ } else { ++ num_queues = 1; ++ } ++ ++ rc = virtio_blk_dev_init(bvdev, num_queues); ++ if (rc != 0) { ++ goto fail; ++ } ++ ++ return bvdev; ++ ++fail: ++ vdev->ctx = NULL; ++ virtio_dev_destruct(vdev); ++ free(bvdev); ++ return NULL; ++} ++ ++static struct virtio_blk_dev * ++virtio_user_blk_dev_create(const char *name, const char *path, ++ uint16_t num_queues, uint32_t queue_size) ++{ ++ struct virtio_blk_dev *bvdev; ++ uint64_t feature_bits; ++ int rc; ++ ++ bvdev = calloc(1, sizeof(*bvdev)); ++ if (bvdev == NULL) { ++ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); ++ return NULL; ++ } ++ ++ rc = virtio_user_dev_init(&bvdev->vdev, name, path, queue_size); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); ++ free(bvdev); ++ return NULL; ++ } ++ ++ feature_bits = VIRTIO_BLK_DEV_SUPPORTED_FEATURES; ++ feature_bits |= (1ULL << VHOST_USER_F_PROTOCOL_FEATURES); ++ rc = virtio_dev_reset(&bvdev->vdev, feature_bits); ++ if (rc != 0) { ++ virtio_dev_destruct(&bvdev->vdev); ++ free(bvdev); ++ return NULL; ++ } ++ ++ rc = virtio_blk_dev_init(bvdev, num_queues); ++ if (rc != 0) { ++ virtio_dev_destruct(&bvdev->vdev); ++ free(bvdev); ++ return NULL; ++ } ++ ++ return bvdev; ++} ++ ++struct bdev_virtio_pci_dev_create_ctx { ++ const char *name; ++ struct virtio_blk_dev *ret; ++}; ++ ++static int ++bdev_virtio_pci_blk_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) ++{ ++ struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; ++ ++ create_ctx->ret = virtio_pci_blk_dev_create(create_ctx->name, pci_ctx); ++ if (create_ctx->ret == NULL) { ++ return -1; ++ } ++ ++ return 0; ++} ++ ++struct spdk_bdev * ++bdev_virtio_pci_blk_dev_create(const char *name, struct spdk_pci_addr *pci_addr) ++{ ++ struct bdev_virtio_pci_dev_create_ctx create_ctx; ++ ++ create_ctx.name = name; ++ create_ctx.ret = NULL; ++ ++ virtio_pci_dev_attach(bdev_virtio_pci_blk_dev_create_cb, &create_ctx, ++ VIRTIO_ID_BLOCK, pci_addr); ++ ++ if (create_ctx.ret == NULL) { ++ return NULL; ++ } ++ ++ return &create_ctx.ret->bdev; ++} ++ ++static int ++bdev_virtio_pci_blk_monitor(void *arg) ++{ ++ const char *vdev_name; ++ struct bdev_virtio_pci_dev_create_ctx create_ctx; ++ ++ while ((vdev_name = virtio_pci_dev_event_process(g_blk_hotplug_fd, VIRTIO_ID_BLOCK)) != NULL) { ++ bdev_virtio_blk_dev_remove(vdev_name, NULL, NULL); ++ } ++ ++ /* Enumerate virtio pci_blk device */ ++ memset(&create_ctx, 0, sizeof(create_ctx)); ++ virtio_pci_dev_enumerate(bdev_virtio_pci_blk_dev_create_cb, &create_ctx, ++ VIRTIO_ID_BLOCK); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++bdev_virtio_pci_blk_set_hotplug(bool enabled, uint64_t period_us) ++{ ++ if (enabled == true && !spdk_process_is_primary()) { ++ return -EPERM; ++ } ++ ++ if (g_blk_hotplug_poller) { ++ close(g_blk_hotplug_fd); ++ spdk_poller_unregister(&g_blk_hotplug_poller); ++ } ++ ++ if (!enabled) { ++ return 0; ++ } ++ ++ g_blk_hotplug_fd = spdk_pci_event_listen(); ++ if (g_blk_hotplug_fd < 0) { 
++ return g_blk_hotplug_fd; ++ } ++ ++ period_us = period_us ? period_us : VIRTIO_BLK_HOTPLUG_POLL_PERIOD_DEFAULT; ++ period_us = spdk_min(period_us, VIRTIO_BLK_HOTPLUG_POLL_PERIOD_MAX); ++ g_blk_hotplug_poller = spdk_poller_register(bdev_virtio_pci_blk_monitor, NULL, period_us); ++ if (!g_blk_hotplug_poller) { ++ close(g_blk_hotplug_fd); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_virtio_initialize(void) ++{ ++ return 0; ++} ++ ++struct spdk_bdev * ++bdev_virtio_user_blk_dev_create(const char *name, const char *path, ++ unsigned num_queues, unsigned queue_size) ++{ ++ struct virtio_blk_dev *bvdev; ++ ++ bvdev = virtio_user_blk_dev_create(name, path, num_queues, queue_size); ++ if (bvdev == NULL) { ++ return NULL; ++ } ++ ++ return &bvdev->bdev; ++} ++ ++struct spdk_bdev * ++bdev_virtio_vfio_user_blk_dev_create(const char *name, const char *path) ++{ ++ struct virtio_blk_dev *bvdev; ++ uint16_t num_queues = 0; ++ int rc; ++ ++ bvdev = calloc(1, sizeof(*bvdev)); ++ if (bvdev == NULL) { ++ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); ++ return NULL; ++ } ++ ++ rc = virtio_vfio_user_dev_init(&bvdev->vdev, name, path); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create %s as virtio device\n", path); ++ free(bvdev); ++ return NULL; ++ } ++ ++ rc = virtio_dev_reset(&bvdev->vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to reset %s as virtio device\n", path); ++ virtio_dev_destruct(&bvdev->vdev); ++ free(bvdev); ++ return NULL; ++ } ++ ++ if (virtio_dev_has_feature(&bvdev->vdev, VIRTIO_BLK_F_MQ)) { ++ rc = virtio_dev_read_dev_config(&bvdev->vdev, offsetof(struct virtio_blk_config, num_queues), ++ &num_queues, sizeof(num_queues)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", name, spdk_strerror(-rc)); ++ virtio_dev_destruct(&bvdev->vdev); ++ free(bvdev); ++ return NULL; ++ } ++ } else { ++ num_queues = 1; ++ } ++ ++ rc = virtio_blk_dev_init(bvdev, num_queues); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to initialize %s as virtio device\n", path); ++ virtio_dev_destruct(&bvdev->vdev); ++ free(bvdev); ++ return NULL; ++ } ++ ++ return &bvdev->bdev; ++} ++ ++static int ++bdev_virtio_blk_get_ctx_size(void) ++{ ++ return sizeof(struct virtio_blk_io_ctx); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(virtio_blk) +diff --git a/module/bdev/virtio/bdev_virtio_rpc.c b/module/bdev/virtio/bdev_virtio_rpc.c +index 810785c..54295c9 100644 +--- a/module/bdev/virtio/bdev_virtio_rpc.c ++++ b/module/bdev/virtio/bdev_virtio_rpc.c +@@ -1,274 +1,274 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/log.h" +-#include "spdk/thread.h" +- +-#include "bdev_virtio.h" +- +-#define SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT 1 +-#define SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE 512 +- +-struct rpc_bdev_virtio_blk_hotplug { +- bool enabled; +- uint64_t period_us; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_virtio_blk_hotplug_decoders[] = { +- {"enable", offsetof(struct rpc_bdev_virtio_blk_hotplug, enabled), spdk_json_decode_bool, false}, +- {"period_us", offsetof(struct rpc_bdev_virtio_blk_hotplug, period_us), spdk_json_decode_uint64, true}, +-}; +- +-static void +-rpc_bdev_virtio_blk_set_hotplug(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_virtio_blk_hotplug req = {false, 0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_bdev_virtio_blk_hotplug_decoders, +- SPDK_COUNTOF(rpc_bdev_virtio_blk_hotplug_decoders), &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = bdev_virtio_pci_blk_set_hotplug(req.enabled, req.period_us); +- if (rc) { +- goto invalid; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +- return; +-invalid: +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("bdev_virtio_blk_set_hotplug", rpc_bdev_virtio_blk_set_hotplug, SPDK_RPC_RUNTIME) +- +-struct rpc_remove_virtio_dev { +- char *name; +-}; +- +-static const struct spdk_json_object_decoder rpc_remove_virtio_dev[] = { +- {"name", offsetof(struct rpc_remove_virtio_dev, name), spdk_json_decode_string }, +-}; +- +-static void +-rpc_bdev_virtio_detach_controller_cb(void *ctx, int errnum) +-{ +- struct spdk_jsonrpc_request *request = ctx; +- +- if (errnum != 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-errnum)); +- return; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +- +-static void +-rpc_bdev_virtio_detach_controller(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_remove_virtio_dev req = {NULL}; +- int rc = 0; +- +- if (spdk_json_decode_object(params, rpc_remove_virtio_dev, +- SPDK_COUNTOF(rpc_remove_virtio_dev), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = bdev_virtio_blk_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); +- if (rc == -ENODEV) { +- rc = bdev_virtio_scsi_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); +- } +- +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- } +- +-cleanup: +- free(req.name); +-} +-SPDK_RPC_REGISTER("bdev_virtio_detach_controller", +- rpc_bdev_virtio_detach_controller, SPDK_RPC_RUNTIME) +- +-static void +-rpc_bdev_virtio_scsi_get_devices(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct spdk_json_write_ctx *w; +- +- if (params != NULL) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "bdev_virtio_scsi_get_devices requires no parameters"); +- return; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- bdev_virtio_scsi_dev_list(w); +- spdk_jsonrpc_end_result(request, w); +-} +-SPDK_RPC_REGISTER("bdev_virtio_scsi_get_devices", +- rpc_bdev_virtio_scsi_get_devices, 
SPDK_RPC_RUNTIME) +- +-struct rpc_bdev_virtio_attach_controller_ctx { +- char *name; +- char *trtype; +- char *traddr; +- char *dev_type; +- uint32_t vq_count; +- uint32_t vq_size; +- struct spdk_jsonrpc_request *request; +-}; +- +-static const struct spdk_json_object_decoder rpc_bdev_virtio_attach_controller_ctx[] = { +- {"name", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, name), spdk_json_decode_string }, +- {"trtype", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, trtype), spdk_json_decode_string }, +- {"traddr", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, traddr), spdk_json_decode_string }, +- {"dev_type", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, dev_type), spdk_json_decode_string }, +- {"vq_count", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_count), spdk_json_decode_uint32, true }, +- {"vq_size", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_size), spdk_json_decode_uint32, true }, +-}; +- +-static void +-free_rpc_bdev_virtio_attach_controller_ctx(struct rpc_bdev_virtio_attach_controller_ctx *req) +-{ +- free(req->name); +- free(req->trtype); +- free(req->traddr); +- free(req->dev_type); +- free(req); +-} +- +-static void +-rpc_create_virtio_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt) +-{ +- struct rpc_bdev_virtio_attach_controller_ctx *req = ctx; +- struct spdk_json_write_ctx *w; +- size_t i; +- +- if (result) { +- spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-result)); +- free_rpc_bdev_virtio_attach_controller_ctx(req); +- return; +- } +- +- w = spdk_jsonrpc_begin_result(req->request); +- spdk_json_write_array_begin(w); +- +- for (i = 0; i < cnt; i++) { +- spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i])); +- } +- +- spdk_json_write_array_end(w); +- spdk_jsonrpc_end_result(req->request, w); +- +- free_rpc_bdev_virtio_attach_controller_ctx(ctx); +-} +- +-static void +-rpc_bdev_virtio_attach_controller(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_bdev_virtio_attach_controller_ctx *req; +- struct spdk_bdev *bdev = NULL; +- struct spdk_pci_addr pci_addr; +- int rc = 0; +- +- req = calloc(1, sizeof(*req)); +- if (!req) { +- SPDK_ERRLOG("calloc() failed\n"); +- spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_bdev_virtio_attach_controller_ctx, +- SPDK_COUNTOF(rpc_bdev_virtio_attach_controller_ctx), +- req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- if (strcmp(req->trtype, "pci") == 0) { +- if (req->vq_count != 0 || req->vq_size != 0) { +- SPDK_ERRLOG("VQ count or size is not allowed for PCI transport type\n"); +- spdk_jsonrpc_send_error_response(request, EINVAL, +- "vq_count or vq_size is not allowed for PCI transport type."); +- goto cleanup; +- } +- +- if (spdk_pci_addr_parse(&pci_addr, req->traddr) != 0) { +- SPDK_ERRLOG("Invalid PCI address '%s'\n", req->traddr); +- spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid PCI address '%s'", req->traddr); +- goto cleanup; +- } +- } else if (strcmp(req->trtype, "user") == 0) { +- req->vq_count = req->vq_count == 0 ? SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT : req->vq_count; +- req->vq_size = req->vq_size == 0 ? 
SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE : req->vq_size; +- } else if (strcmp(req->trtype, "vfio-user") == 0) { +- if (req->vq_count != 0 || req->vq_size != 0) { +- SPDK_ERRLOG("VQ count or size is not allowed for vfio-user transport type\n"); +- spdk_jsonrpc_send_error_response(request, EINVAL, +- "vq_count or vq_size is not allowed for vfio-user transport type."); +- goto cleanup; +- } +- } else { +- SPDK_ERRLOG("Invalid trtype '%s'\n", req->trtype); +- spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid trtype '%s'", req->trtype); +- goto cleanup; +- } +- +- req->request = request; +- if (strcmp(req->dev_type, "blk") == 0) { +- if (strcmp(req->trtype, "pci") == 0) { +- bdev = bdev_virtio_pci_blk_dev_create(req->name, &pci_addr); +- } else if (strcmp(req->trtype, "user") == 0) { +- bdev = bdev_virtio_user_blk_dev_create(req->name, req->traddr, req->vq_count, req->vq_size); +- } else if (strcmp(req->trtype, "vfio-user") == 0) { +- bdev = bdev_virtio_vfio_user_blk_dev_create(req->name, req->traddr); +- } +- +- /* Virtio blk doesn't use callback so call it manually to send result. */ +- rc = bdev ? 0 : -EINVAL; +- rpc_create_virtio_dev_cb(req, rc, &bdev, bdev ? 1 : 0); +- } else if (strcmp(req->dev_type, "scsi") == 0) { +- if (strcmp(req->trtype, "pci") == 0) { +- rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, rpc_create_virtio_dev_cb, req); +- } else if (strcmp(req->trtype, "user") == 0) { +- rc = bdev_virtio_user_scsi_dev_create(req->name, req->traddr, req->vq_count, req->vq_size, +- rpc_create_virtio_dev_cb, req); +- } else if (strcmp(req->trtype, "vfio-user") == 0) { +- rc = bdev_vfio_user_scsi_dev_create(req->name, req->traddr, rpc_create_virtio_dev_cb, req); +- } +- +- if (rc < 0) { +- /* In case of error callback is not called so do it manually to send result. */ +- rpc_create_virtio_dev_cb(req, rc, NULL, 0); +- } +- } else { +- SPDK_ERRLOG("Invalid dev_type '%s'\n", req->dev_type); +- spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid dev_type '%s'", req->dev_type); +- goto cleanup; +- } +- +- return; +- +-cleanup: +- free_rpc_bdev_virtio_attach_controller_ctx(req); +-} +-SPDK_RPC_REGISTER("bdev_virtio_attach_controller", +- rpc_bdev_virtio_attach_controller, SPDK_RPC_RUNTIME); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/log.h" ++#include "spdk/thread.h" ++ ++#include "bdev_virtio.h" ++ ++#define SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT 1 ++#define SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE 512 ++ ++struct rpc_bdev_virtio_blk_hotplug { ++ bool enabled; ++ uint64_t period_us; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_virtio_blk_hotplug_decoders[] = { ++ {"enable", offsetof(struct rpc_bdev_virtio_blk_hotplug, enabled), spdk_json_decode_bool, false}, ++ {"period_us", offsetof(struct rpc_bdev_virtio_blk_hotplug, period_us), spdk_json_decode_uint64, true}, ++}; ++ ++static void ++rpc_bdev_virtio_blk_set_hotplug(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_virtio_blk_hotplug req = {false, 0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_bdev_virtio_blk_hotplug_decoders, ++ SPDK_COUNTOF(rpc_bdev_virtio_blk_hotplug_decoders), &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = bdev_virtio_pci_blk_set_hotplug(req.enabled, req.period_us); ++ if (rc) { ++ goto invalid; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++invalid: ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("bdev_virtio_blk_set_hotplug", rpc_bdev_virtio_blk_set_hotplug, SPDK_RPC_RUNTIME) ++ ++struct rpc_remove_virtio_dev { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder rpc_remove_virtio_dev[] = { ++ {"name", offsetof(struct rpc_remove_virtio_dev, name), spdk_json_decode_string }, ++}; ++ ++static void ++rpc_bdev_virtio_detach_controller_cb(void *ctx, int errnum) ++{ ++ struct spdk_jsonrpc_request *request = ctx; ++ ++ if (errnum != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-errnum)); ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++ ++static void ++rpc_bdev_virtio_detach_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_remove_virtio_dev req = {NULL}; ++ int rc = 0; ++ ++ if (spdk_json_decode_object(params, rpc_remove_virtio_dev, ++ SPDK_COUNTOF(rpc_remove_virtio_dev), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = bdev_virtio_blk_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); ++ if (rc == -ENODEV) { ++ rc = bdev_virtio_scsi_dev_remove(req.name, rpc_bdev_virtio_detach_controller_cb, request); ++ } ++ ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ } ++ ++cleanup: ++ free(req.name); ++} ++SPDK_RPC_REGISTER("bdev_virtio_detach_controller", ++ rpc_bdev_virtio_detach_controller, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_bdev_virtio_scsi_get_devices(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_json_write_ctx *w; ++ ++ if (params != NULL) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "bdev_virtio_scsi_get_devices requires no parameters"); ++ return; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ bdev_virtio_scsi_dev_list(w); ++ spdk_jsonrpc_end_result(request, w); ++} ++SPDK_RPC_REGISTER("bdev_virtio_scsi_get_devices", ++ rpc_bdev_virtio_scsi_get_devices, 
SPDK_RPC_RUNTIME) ++ ++struct rpc_bdev_virtio_attach_controller_ctx { ++ char *name; ++ char *trtype; ++ char *traddr; ++ char *dev_type; ++ uint32_t vq_count; ++ uint32_t vq_size; ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static const struct spdk_json_object_decoder rpc_bdev_virtio_attach_controller_ctx[] = { ++ {"name", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, name), spdk_json_decode_string }, ++ {"trtype", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, trtype), spdk_json_decode_string }, ++ {"traddr", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, traddr), spdk_json_decode_string }, ++ {"dev_type", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, dev_type), spdk_json_decode_string }, ++ {"vq_count", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_count), spdk_json_decode_uint32, true }, ++ {"vq_size", offsetof(struct rpc_bdev_virtio_attach_controller_ctx, vq_size), spdk_json_decode_uint32, true }, ++}; ++ ++static void ++free_rpc_bdev_virtio_attach_controller_ctx(struct rpc_bdev_virtio_attach_controller_ctx *req) ++{ ++ free(req->name); ++ free(req->trtype); ++ free(req->traddr); ++ free(req->dev_type); ++ free(req); ++} ++ ++static void ++rpc_create_virtio_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt) ++{ ++ struct rpc_bdev_virtio_attach_controller_ctx *req = ctx; ++ struct spdk_json_write_ctx *w; ++ size_t i; ++ ++ if (result) { ++ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-result)); ++ free_rpc_bdev_virtio_attach_controller_ctx(req); ++ return; ++ } ++ ++ w = spdk_jsonrpc_begin_result(req->request); ++ spdk_json_write_array_begin(w); ++ ++ for (i = 0; i < cnt; i++) { ++ spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i])); ++ } ++ ++ spdk_json_write_array_end(w); ++ spdk_jsonrpc_end_result(req->request, w); ++ ++ free_rpc_bdev_virtio_attach_controller_ctx(ctx); ++} ++ ++static void ++rpc_bdev_virtio_attach_controller(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_bdev_virtio_attach_controller_ctx *req; ++ struct spdk_bdev *bdev = NULL; ++ struct spdk_pci_addr pci_addr; ++ int rc = 0; ++ ++ req = calloc(1, sizeof(*req)); ++ if (!req) { ++ SPDK_ERRLOG("calloc() failed\n"); ++ spdk_jsonrpc_send_error_response(request, -ENOMEM, spdk_strerror(ENOMEM)); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_bdev_virtio_attach_controller_ctx, ++ SPDK_COUNTOF(rpc_bdev_virtio_attach_controller_ctx), ++ req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ if (strcmp(req->trtype, "pci") == 0) { ++ if (req->vq_count != 0 || req->vq_size != 0) { ++ SPDK_ERRLOG("VQ count or size is not allowed for PCI transport type\n"); ++ spdk_jsonrpc_send_error_response(request, EINVAL, ++ "vq_count or vq_size is not allowed for PCI transport type."); ++ goto cleanup; ++ } ++ ++ if (spdk_pci_addr_parse(&pci_addr, req->traddr) != 0) { ++ SPDK_ERRLOG("Invalid PCI address '%s'\n", req->traddr); ++ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid PCI address '%s'", req->traddr); ++ goto cleanup; ++ } ++ } else if (strcmp(req->trtype, "user") == 0) { ++ req->vq_count = req->vq_count == 0 ? SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT : req->vq_count; ++ req->vq_size = req->vq_size == 0 ? 
SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE : req->vq_size; ++ } else if (strcmp(req->trtype, "vfio-user") == 0) { ++ if (req->vq_count != 0 || req->vq_size != 0) { ++ SPDK_ERRLOG("VQ count or size is not allowed for vfio-user transport type\n"); ++ spdk_jsonrpc_send_error_response(request, EINVAL, ++ "vq_count or vq_size is not allowed for vfio-user transport type."); ++ goto cleanup; ++ } ++ } else { ++ SPDK_ERRLOG("Invalid trtype '%s'\n", req->trtype); ++ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid trtype '%s'", req->trtype); ++ goto cleanup; ++ } ++ ++ req->request = request; ++ if (strcmp(req->dev_type, "blk") == 0) { ++ if (strcmp(req->trtype, "pci") == 0) { ++ bdev = bdev_virtio_pci_blk_dev_create(req->name, &pci_addr); ++ } else if (strcmp(req->trtype, "user") == 0) { ++ bdev = bdev_virtio_user_blk_dev_create(req->name, req->traddr, req->vq_count, req->vq_size); ++ } else if (strcmp(req->trtype, "vfio-user") == 0) { ++ bdev = bdev_virtio_vfio_user_blk_dev_create(req->name, req->traddr); ++ } ++ ++ /* Virtio blk doesn't use callback so call it manually to send result. */ ++ rc = bdev ? 0 : -EINVAL; ++ rpc_create_virtio_dev_cb(req, rc, &bdev, bdev ? 1 : 0); ++ } else if (strcmp(req->dev_type, "scsi") == 0) { ++ if (strcmp(req->trtype, "pci") == 0) { ++ rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, rpc_create_virtio_dev_cb, req); ++ } else if (strcmp(req->trtype, "user") == 0) { ++ rc = bdev_virtio_user_scsi_dev_create(req->name, req->traddr, req->vq_count, req->vq_size, ++ rpc_create_virtio_dev_cb, req); ++ } else if (strcmp(req->trtype, "vfio-user") == 0) { ++ rc = bdev_vfio_user_scsi_dev_create(req->name, req->traddr, rpc_create_virtio_dev_cb, req); ++ } ++ ++ if (rc < 0) { ++ /* In case of error callback is not called so do it manually to send result. */ ++ rpc_create_virtio_dev_cb(req, rc, NULL, 0); ++ } ++ } else { ++ SPDK_ERRLOG("Invalid dev_type '%s'\n", req->dev_type); ++ spdk_jsonrpc_send_error_response_fmt(request, EINVAL, "Invalid dev_type '%s'", req->dev_type); ++ goto cleanup; ++ } ++ ++ return; ++ ++cleanup: ++ free_rpc_bdev_virtio_attach_controller_ctx(req); ++} ++SPDK_RPC_REGISTER("bdev_virtio_attach_controller", ++ rpc_bdev_virtio_attach_controller, SPDK_RPC_RUNTIME); +diff --git a/module/bdev/virtio/bdev_virtio_scsi.c b/module/bdev/virtio/bdev_virtio_scsi.c +index 322e219..cd8f043 100644 +--- a/module/bdev/virtio/bdev_virtio_scsi.c ++++ b/module/bdev/virtio/bdev_virtio_scsi.c +@@ -1,1948 +1,1948 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/endian.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +-#include "spdk/scsi_spec.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/json.h" +- +-#include "spdk/bdev_module.h" +-#include "spdk/log.h" +-#include "spdk_internal/virtio.h" +-#include "spdk_internal/vhost_user.h" +- +-#include +-#include +- +-#include "bdev_virtio.h" +- +-#define BDEV_VIRTIO_MAX_TARGET 64 +-#define BDEV_VIRTIO_SCAN_PAYLOAD_SIZE 256 +-#define MGMT_POLL_PERIOD_US (1000 * 5) +-#define CTRLQ_RING_SIZE 16 +-#define SCAN_REQUEST_RETRIES 5 +- +-/* Number of non-request queues - eventq and controlq */ +-#define SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED 2 +- +-#define VIRTIO_SCSI_EVENTQ_BUFFER_COUNT 16 +- +-#define VIRTIO_SCSI_CONTROLQ 0 +-#define VIRTIO_SCSI_EVENTQ 1 +-#define VIRTIO_SCSI_REQUESTQ 2 +- +-static int bdev_virtio_initialize(void); +-static void bdev_virtio_finish(void); +- +-struct virtio_scsi_dev { +- /* Generic virtio device data. */ +- struct virtio_dev vdev; +- +- /** Detected SCSI LUNs */ +- TAILQ_HEAD(, virtio_scsi_disk) luns; +- +- /** Context for the SCSI target scan. */ +- struct virtio_scsi_scan_base *scan_ctx; +- +- /** Controlq poller. */ +- struct spdk_poller *mgmt_poller; +- +- /** Controlq messages to be sent. */ +- struct spdk_ring *ctrlq_ring; +- +- /** Buffers for the eventq. */ +- struct virtio_scsi_eventq_io *eventq_ios; +- +- /** Device marked for removal. */ +- bool removed; +- +- /** Callback to be called after vdev removal. */ +- bdev_virtio_remove_cb remove_cb; +- +- /** Context for the `remove_cb`. */ +- void *remove_ctx; +- +- TAILQ_ENTRY(virtio_scsi_dev) tailq; +-}; +- +-struct virtio_scsi_io_ctx { +- struct iovec iov_req; +- struct iovec iov_resp; +- union { +- struct virtio_scsi_cmd_req req; +- struct virtio_scsi_ctrl_tmf_req tmf_req; +- }; +- union { +- struct virtio_scsi_cmd_resp resp; +- struct virtio_scsi_ctrl_tmf_resp tmf_resp; +- }; +-}; +- +-struct virtio_scsi_eventq_io { +- struct iovec iov; +- struct virtio_scsi_event ev; +-}; +- +-struct virtio_scsi_scan_info { +- uint64_t num_blocks; +- uint32_t block_size; +- uint8_t target; +- bool unmap_supported; +- TAILQ_ENTRY(virtio_scsi_scan_info) tailq; +-}; +- +-struct virtio_scsi_scan_base { +- struct virtio_scsi_dev *svdev; +- +- /** I/O channel used for the scan I/O. */ +- struct bdev_virtio_io_channel *channel; +- +- bdev_virtio_create_cb cb_fn; +- void *cb_arg; +- +- /** Scan all targets on the device. */ +- bool full_scan; +- +- /** Start a full rescan after receiving next scan I/O response. */ +- bool restart; +- +- /** Additional targets to be (re)scanned. */ +- TAILQ_HEAD(, virtio_scsi_scan_info) scan_queue; +- +- /** Remaining attempts for sending the current request. */ +- unsigned retries; +- +- /** If set, the last scan I/O needs to be resent */ +- bool needs_resend; +- +- struct virtio_scsi_io_ctx io_ctx; +- struct iovec iov; +- uint8_t payload[BDEV_VIRTIO_SCAN_PAYLOAD_SIZE]; +- +- /** Scan results for the current target. */ +- struct virtio_scsi_scan_info info; +-}; +- +-struct virtio_scsi_disk { +- struct spdk_bdev bdev; +- struct virtio_scsi_dev *svdev; +- struct virtio_scsi_scan_info info; +- +- /** Descriptor opened just to be notified of external bdev hotremove. */ +- struct spdk_bdev_desc *notify_desc; +- +- /** Disk marked for removal. 
*/ +- bool removed; +- TAILQ_ENTRY(virtio_scsi_disk) link; +-}; +- +-struct bdev_virtio_io_channel { +- struct virtio_scsi_dev *svdev; +- +- /** Virtqueue exclusively assigned to this channel. */ +- struct virtqueue *vq; +- +- /** Virtio response poller. */ +- struct spdk_poller *poller; +-}; +- +-static TAILQ_HEAD(, virtio_scsi_dev) g_virtio_scsi_devs = +- TAILQ_HEAD_INITIALIZER(g_virtio_scsi_devs); +- +-static pthread_mutex_t g_virtio_scsi_mutex = PTHREAD_MUTEX_INITIALIZER; +- +-/** Module finish in progress */ +-static bool g_bdev_virtio_finish = false; +- +-/* Features desired/implemented by this driver. */ +-#define VIRTIO_SCSI_DEV_SUPPORTED_FEATURES \ +- (1ULL << VIRTIO_SCSI_F_INOUT | \ +- 1ULL << VIRTIO_SCSI_F_HOTPLUG | \ +- 1ULL << VIRTIO_RING_F_EVENT_IDX) +- +-static void virtio_scsi_dev_unregister_cb(void *io_device); +-static void virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, +- bdev_virtio_remove_cb cb_fn, void *cb_arg); +-static int bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf); +-static void bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf); +-static void process_scan_resp(struct virtio_scsi_scan_base *base); +-static int bdev_virtio_mgmt_poll(void *arg); +- +-static int +-virtio_scsi_dev_send_eventq_io(struct virtqueue *vq, struct virtio_scsi_eventq_io *io) +-{ +- int rc; +- +- rc = virtqueue_req_start(vq, io, 1); +- if (rc != 0) { +- return -1; +- } +- +- virtqueue_req_add_iovs(vq, &io->iov, 1, SPDK_VIRTIO_DESC_WR); +- virtqueue_req_flush(vq); +- +- return 0; +-} +- +-static int +-virtio_scsi_dev_init(struct virtio_scsi_dev *svdev, uint16_t max_queues, uint64_t feature_bits) +-{ +- struct virtio_dev *vdev = &svdev->vdev; +- struct spdk_ring *ctrlq_ring; +- struct virtio_scsi_eventq_io *eventq_io; +- struct virtqueue *eventq; +- uint16_t i, num_events; +- int rc; +- +- rc = virtio_dev_reset(vdev, feature_bits); +- if (rc != 0) { +- return rc; +- } +- +- rc = virtio_dev_start(vdev, max_queues, SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED); +- if (rc != 0) { +- return rc; +- } +- +- ctrlq_ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, CTRLQ_RING_SIZE, +- SPDK_ENV_SOCKET_ID_ANY); +- if (ctrlq_ring == NULL) { +- SPDK_ERRLOG("Failed to allocate send ring for the controlq.\n"); +- return -1; +- } +- +- rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_CONTROLQ); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to acquire the controlq.\n"); +- spdk_ring_free(ctrlq_ring); +- return -1; +- } +- +- rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_EVENTQ); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to acquire the eventq.\n"); +- virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); +- spdk_ring_free(ctrlq_ring); +- return -1; +- } +- +- eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; +- num_events = spdk_min(eventq->vq_nentries, VIRTIO_SCSI_EVENTQ_BUFFER_COUNT); +- svdev->eventq_ios = spdk_zmalloc(sizeof(*svdev->eventq_ios) * num_events, +- 0, NULL, SPDK_ENV_LCORE_ID_ANY, +- SPDK_MALLOC_DMA); +- if (svdev->eventq_ios == NULL) { +- SPDK_ERRLOG("cannot allocate memory for %"PRIu16" eventq buffers\n", +- num_events); +- virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); +- virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); +- spdk_ring_free(ctrlq_ring); +- return -1; +- } +- +- for (i = 0; i < num_events; i++) { +- eventq_io = &svdev->eventq_ios[i]; +- eventq_io->iov.iov_base = &eventq_io->ev; +- eventq_io->iov.iov_len = sizeof(eventq_io->ev); +- virtio_scsi_dev_send_eventq_io(eventq, eventq_io); +- } +- +- svdev->ctrlq_ring = ctrlq_ring; +- +- svdev->mgmt_poller = 
SPDK_POLLER_REGISTER(bdev_virtio_mgmt_poll, svdev, +- MGMT_POLL_PERIOD_US); +- +- TAILQ_INIT(&svdev->luns); +- svdev->scan_ctx = NULL; +- svdev->removed = false; +- svdev->remove_cb = NULL; +- svdev->remove_ctx = NULL; +- +- spdk_io_device_register(svdev, bdev_virtio_scsi_ch_create_cb, +- bdev_virtio_scsi_ch_destroy_cb, +- sizeof(struct bdev_virtio_io_channel), +- svdev->vdev.name); +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- TAILQ_INSERT_TAIL(&g_virtio_scsi_devs, svdev, tailq); +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- return 0; +-} +- +-static struct virtio_scsi_dev * +-virtio_pci_scsi_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) +-{ +- static int pci_dev_counter = 0; +- struct virtio_scsi_dev *svdev; +- struct virtio_dev *vdev; +- char *default_name = NULL; +- uint32_t num_queues; +- int rc; +- +- svdev = calloc(1, sizeof(*svdev)); +- if (svdev == NULL) { +- SPDK_ERRLOG("virtio device calloc failed\n"); +- return NULL; +- } +- +- vdev = &svdev->vdev; +- if (name == NULL) { +- default_name = spdk_sprintf_alloc("VirtioScsi%"PRIu32, pci_dev_counter++); +- if (default_name == NULL) { +- free(vdev); +- return NULL; +- } +- name = default_name; +- } +- +- rc = virtio_pci_dev_init(vdev, name, pci_ctx); +- free(default_name); +- +- if (rc != 0) { +- free(svdev); +- return NULL; +- } +- +- rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_scsi_config, num_queues), +- &num_queues, sizeof(num_queues)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); +- goto fail; +- } +- +- rc = virtio_scsi_dev_init(svdev, num_queues, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); +- if (rc != 0) { +- goto fail; +- } +- +- return svdev; +- +-fail: +- vdev->ctx = NULL; +- virtio_dev_destruct(vdev); +- free(svdev); +- return NULL; +-} +- +-static struct virtio_scsi_dev * +-virtio_user_scsi_dev_create(const char *name, const char *path, +- uint16_t num_queues, uint32_t queue_size) +-{ +- struct virtio_scsi_dev *svdev; +- struct virtio_dev *vdev; +- uint64_t feature_bits; +- int rc; +- +- svdev = calloc(1, sizeof(*svdev)); +- if (svdev == NULL) { +- SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); +- return NULL; +- } +- +- vdev = &svdev->vdev; +- rc = virtio_user_dev_init(vdev, name, path, queue_size); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); +- free(svdev); +- return NULL; +- } +- +- feature_bits = VIRTIO_SCSI_DEV_SUPPORTED_FEATURES; +- feature_bits |= (1ULL << VHOST_USER_F_PROTOCOL_FEATURES); +- rc = virtio_scsi_dev_init(svdev, num_queues + SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED, feature_bits); +- if (rc != 0) { +- virtio_dev_destruct(vdev); +- free(svdev); +- return NULL; +- } +- +- return svdev; +-} +- +-static struct virtio_scsi_disk * +-virtio_scsi_dev_get_disk_by_id(struct virtio_scsi_dev *svdev, uint8_t target_id) +-{ +- struct virtio_scsi_disk *disk; +- +- TAILQ_FOREACH(disk, &svdev->luns, link) { +- if (disk->info.target == target_id) { +- return disk; +- } +- } +- +- return NULL; +-} +- +-static int virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, +- bdev_virtio_create_cb cb_fn, void *cb_arg); +-static int send_scan_io(struct virtio_scsi_scan_base *base); +-static void _virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target); +-static int _virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc); +-static void _virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum); +-static int virtio_scsi_dev_scan_tgt(struct 
virtio_scsi_dev *svdev, uint8_t target); +- +-static int +-bdev_virtio_get_ctx_size(void) +-{ +- return sizeof(struct virtio_scsi_io_ctx); +-} +- +-static int +-bdev_virtio_scsi_config_json(struct spdk_json_write_ctx *w) +-{ +- struct virtio_scsi_dev *svdev; +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", svdev->vdev.name); +- spdk_json_write_named_string(w, "dev_type", "scsi"); +- +- /* Write transport specific parameters. */ +- svdev->vdev.backend_ops->write_json_config(&svdev->vdev, w); +- +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +- +- } +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- +- return 0; +-} +- +- +-static struct spdk_bdev_module virtio_scsi_if = { +- .name = "virtio_scsi", +- .module_init = bdev_virtio_initialize, +- .module_fini = bdev_virtio_finish, +- .get_ctx_size = bdev_virtio_get_ctx_size, +- .config_json = bdev_virtio_scsi_config_json, +- .async_fini = true, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(virtio_scsi, &virtio_scsi_if) +- +-static struct virtio_scsi_io_ctx * +-bdev_virtio_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_cmd_req *req; +- struct virtio_scsi_cmd_resp *resp; +- struct virtio_scsi_disk *disk = (struct virtio_scsi_disk *)bdev_io->bdev; +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- +- req = &io_ctx->req; +- resp = &io_ctx->resp; +- +- io_ctx->iov_req.iov_base = req; +- io_ctx->iov_req.iov_len = sizeof(*req); +- +- io_ctx->iov_resp.iov_base = resp; +- io_ctx->iov_resp.iov_len = sizeof(*resp); +- +- memset(req, 0, sizeof(*req)); +- req->lun[0] = 1; +- req->lun[1] = disk->info.target; +- +- return io_ctx; +-} +- +-static struct virtio_scsi_io_ctx * +-bdev_virtio_init_tmf_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_ctrl_tmf_req *tmf_req; +- struct virtio_scsi_ctrl_tmf_resp *tmf_resp; +- struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- +- tmf_req = &io_ctx->tmf_req; +- tmf_resp = &io_ctx->tmf_resp; +- +- io_ctx->iov_req.iov_base = tmf_req; +- io_ctx->iov_req.iov_len = sizeof(*tmf_req); +- io_ctx->iov_resp.iov_base = tmf_resp; +- io_ctx->iov_resp.iov_len = sizeof(*tmf_resp); +- +- memset(tmf_req, 0, sizeof(*tmf_req)); +- tmf_req->lun[0] = 1; +- tmf_req->lun[1] = disk->info.target; +- +- return io_ctx; +-} +- +-static void +-bdev_virtio_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_virtio_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); +- struct virtqueue *vq = virtio_channel->vq; +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- int rc; +- +- rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); +- if (rc == -ENOMEM) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- return; +- } else if (rc != 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); +- if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { +- virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); 
+- virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- SPDK_VIRTIO_DESC_WR); +- } else { +- virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- SPDK_VIRTIO_DESC_RO); +- virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); +- } +- +- virtqueue_req_flush(vq); +-} +- +-static void +-bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); +- struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); +- struct virtio_scsi_cmd_req *req = &io_ctx->req; +- bool is_write = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE; +- +- if (disk->info.num_blocks > (1ULL << 32)) { +- req->cdb[0] = is_write ? SPDK_SBC_WRITE_16 : SPDK_SBC_READ_16; +- to_be64(&req->cdb[2], bdev_io->u.bdev.offset_blocks); +- to_be32(&req->cdb[10], bdev_io->u.bdev.num_blocks); +- } else { +- req->cdb[0] = is_write ? SPDK_SBC_WRITE_10 : SPDK_SBC_READ_10; +- to_be32(&req->cdb[2], bdev_io->u.bdev.offset_blocks); +- to_be16(&req->cdb[7], bdev_io->u.bdev.num_blocks); +- } +- +- bdev_virtio_send_io(ch, bdev_io); +-} +- +-static void +-bdev_virtio_reset(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_virtio_io_channel *virtio_ch = spdk_io_channel_get_ctx(ch); +- struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_tmf_vreq(ch, bdev_io); +- struct virtio_scsi_ctrl_tmf_req *tmf_req = &io_ctx->tmf_req; +- struct virtio_scsi_dev *svdev = virtio_ch->svdev; +- size_t enqueued_count; +- +- tmf_req->type = VIRTIO_SCSI_T_TMF; +- tmf_req->subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET; +- +- enqueued_count = spdk_ring_enqueue(svdev->ctrlq_ring, (void **)&bdev_io, 1, NULL); +- if (spdk_likely(enqueued_count == 1)) { +- return; +- } else { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- } +-} +- +-static void +-bdev_virtio_unmap(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); +- struct virtio_scsi_cmd_req *req = &io_ctx->req; +- struct spdk_scsi_unmap_bdesc *desc, *first_desc; +- uint8_t *buf; +- uint64_t offset_blocks, num_blocks; +- uint16_t cmd_len; +- +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- buf = bdev_io->u.bdev.iovs[0].iov_base; +- +- offset_blocks = bdev_io->u.bdev.offset_blocks; +- num_blocks = bdev_io->u.bdev.num_blocks; +- +- /* (n-1) * 16-byte descriptors */ +- first_desc = desc = (struct spdk_scsi_unmap_bdesc *)&buf[8]; +- while (num_blocks > UINT32_MAX) { +- to_be64(&desc->lba, offset_blocks); +- to_be32(&desc->block_count, UINT32_MAX); +- memset(&desc->reserved, 0, sizeof(desc->reserved)); +- offset_blocks += UINT32_MAX; +- num_blocks -= UINT32_MAX; +- desc++; +- } +- +- /* The last descriptor with block_count <= UINT32_MAX */ +- to_be64(&desc->lba, offset_blocks); +- to_be32(&desc->block_count, num_blocks); +- memset(&desc->reserved, 0, sizeof(desc->reserved)); +- +- /* 8-byte header + n * 16-byte block descriptor */ +- cmd_len = 8 + (desc - first_desc + 1) * sizeof(struct spdk_scsi_unmap_bdesc); +- +- req->cdb[0] = SPDK_SBC_UNMAP; +- to_be16(&req->cdb[7], cmd_len); +- +- /* 8-byte header */ +- to_be16(&buf[0], cmd_len - 2); /* total length (excluding the length field) */ +- to_be16(&buf[2], cmd_len - 8); /* length of block descriptors */ +- memset(&buf[4], 0, 4); /* reserved */ +- +- bdev_virtio_send_io(ch, bdev_io); +-} +- 
+-static void +-bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, +- bool success) +-{ +- if (!success) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- bdev_virtio_rw(ch, bdev_io); +-} +- +-static int +-_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- return 0; +- case SPDK_BDEV_IO_TYPE_WRITE: +- bdev_virtio_rw(ch, bdev_io); +- return 0; +- case SPDK_BDEV_IO_TYPE_RESET: +- bdev_virtio_reset(ch, bdev_io); +- return 0; +- case SPDK_BDEV_IO_TYPE_UNMAP: { +- uint64_t buf_len = 8 /* header size */ + +- (bdev_io->u.bdev.num_blocks + UINT32_MAX - 1) / +- UINT32_MAX * sizeof(struct spdk_scsi_unmap_bdesc); +- +- if (!disk->info.unmap_supported) { +- return -1; +- } +- +- if (buf_len > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { +- SPDK_ERRLOG("Trying to UNMAP too many blocks: %"PRIu64"\n", +- bdev_io->u.bdev.num_blocks); +- return -1; +- } +- spdk_bdev_io_get_buf(bdev_io, bdev_virtio_unmap, buf_len); +- return 0; +- } +- case SPDK_BDEV_IO_TYPE_FLUSH: +- default: +- return -1; +- } +- return 0; +-} +- +-static void +-bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static bool +-bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- struct virtio_scsi_disk *disk = ctx; +- +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_FLUSH: +- case SPDK_BDEV_IO_TYPE_RESET: +- return true; +- +- case SPDK_BDEV_IO_TYPE_UNMAP: +- return disk->info.unmap_supported; +- +- default: +- return false; +- } +-} +- +-static struct spdk_io_channel * +-bdev_virtio_get_io_channel(void *ctx) +-{ +- struct virtio_scsi_disk *disk = ctx; +- +- return spdk_get_io_channel(disk->svdev); +-} +- +-static int +-bdev_virtio_disk_destruct(void *ctx) +-{ +- struct virtio_scsi_disk *disk = ctx; +- struct virtio_scsi_dev *svdev = disk->svdev; +- +- TAILQ_REMOVE(&svdev->luns, disk, link); +- free(disk->bdev.name); +- free(disk); +- +- if (svdev->removed && TAILQ_EMPTY(&svdev->luns)) { +- spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); +- } +- +- return 0; +-} +- +-static int +-bdev_virtio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct virtio_scsi_disk *disk = ctx; +- +- virtio_dev_dump_json_info(&disk->svdev->vdev, w); +- return 0; +-} +- +-static void +-bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +-{ +- /* SCSI targets and LUNS are discovered during scan process so nothing +- * to save here. 
+- */ +-} +- +-static const struct spdk_bdev_fn_table virtio_fn_table = { +- .destruct = bdev_virtio_disk_destruct, +- .submit_request = bdev_virtio_submit_request, +- .io_type_supported = bdev_virtio_io_type_supported, +- .get_io_channel = bdev_virtio_get_io_channel, +- .dump_info_json = bdev_virtio_dump_info_json, +- .write_config_json = bdev_virtio_write_config_json, +-}; +- +-static void +-get_scsi_status(struct virtio_scsi_cmd_resp *resp, int *sk, int *asc, int *ascq) +-{ +- /* see spdk_scsi_task_build_sense_data() for sense data details */ +- *sk = 0; +- *asc = 0; +- *ascq = 0; +- +- if (resp->sense_len < 3) { +- return; +- } +- +- *sk = resp->sense[2] & 0xf; +- +- if (resp->sense_len < 13) { +- return; +- } +- +- *asc = resp->sense[12]; +- +- if (resp->sense_len < 14) { +- return; +- } +- +- *ascq = resp->sense[13]; +-} +- +-static void +-bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- int sk, asc, ascq; +- +- get_scsi_status(&io_ctx->resp, &sk, &asc, &ascq); +- spdk_bdev_io_complete_scsi_status(bdev_io, io_ctx->resp.status, sk, asc, ascq); +-} +- +-static int +-bdev_virtio_poll(void *arg) +-{ +- struct bdev_virtio_io_channel *ch = arg; +- struct virtio_scsi_dev *svdev = ch->svdev; +- struct virtio_scsi_scan_base *scan_ctx = svdev->scan_ctx; +- void *io[32]; +- uint32_t io_len[32]; +- uint16_t i, cnt; +- int rc; +- +- cnt = virtio_recv_pkts(ch->vq, (void **)io, io_len, SPDK_COUNTOF(io)); +- for (i = 0; i < cnt; ++i) { +- if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) { +- if (svdev->removed) { +- _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); +- return SPDK_POLLER_BUSY; +- } +- +- if (scan_ctx->restart) { +- scan_ctx->restart = false; +- scan_ctx->full_scan = true; +- _virtio_scsi_dev_scan_tgt(scan_ctx, 0); +- continue; +- } +- +- process_scan_resp(scan_ctx); +- continue; +- } +- +- bdev_virtio_io_cpl(io[i]); +- } +- +- /* scan_ctx could have been freed while processing completions above, so +- * we need to re-read the value again here into the local variable before +- * using it. 
+- */ +- scan_ctx = svdev->scan_ctx; +- if (spdk_unlikely(scan_ctx && scan_ctx->needs_resend)) { +- if (svdev->removed) { +- _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); +- return SPDK_POLLER_BUSY; +- } else if (cnt == 0) { +- return SPDK_POLLER_IDLE; +- } +- +- rc = send_scan_io(scan_ctx); +- if (rc != 0) { +- assert(scan_ctx->retries > 0); +- scan_ctx->retries--; +- if (scan_ctx->retries == 0) { +- SPDK_ERRLOG("Target scan failed unrecoverably with rc = %d.\n", rc); +- _virtio_scsi_dev_scan_finish(scan_ctx, rc); +- } +- } +- } +- +- return cnt; +-} +- +-static void +-bdev_virtio_tmf_cpl_cb(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- +- if (io_ctx->tmf_resp.response == VIRTIO_SCSI_S_OK) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- } else { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +-} +- +-static void +-bdev_virtio_tmf_cpl(struct spdk_bdev_io *bdev_io) +-{ +- spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_virtio_tmf_cpl_cb, bdev_io); +-} +- +-static void +-bdev_virtio_eventq_io_cpl(struct virtio_scsi_dev *svdev, struct virtio_scsi_eventq_io *io) +-{ +- struct virtio_scsi_event *ev = &io->ev; +- struct virtio_scsi_disk *disk; +- +- if (ev->lun[0] != 1) { +- SPDK_WARNLOG("Received an event with invalid data layout.\n"); +- goto out; +- } +- +- if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) { +- ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED; +- virtio_scsi_dev_scan(svdev, NULL, NULL); +- } +- +- switch (ev->event) { +- case VIRTIO_SCSI_T_NO_EVENT: +- break; +- case VIRTIO_SCSI_T_TRANSPORT_RESET: +- switch (ev->reason) { +- case VIRTIO_SCSI_EVT_RESET_RESCAN: +- virtio_scsi_dev_scan_tgt(svdev, ev->lun[1]); +- break; +- case VIRTIO_SCSI_EVT_RESET_REMOVED: +- disk = virtio_scsi_dev_get_disk_by_id(svdev, ev->lun[1]); +- if (disk != NULL) { +- spdk_bdev_unregister(&disk->bdev, NULL, NULL); +- } +- break; +- default: +- break; +- } +- break; +- default: +- break; +- } +- +-out: +- virtio_scsi_dev_send_eventq_io(svdev->vdev.vqs[VIRTIO_SCSI_EVENTQ], io); +-} +- +-static void +-bdev_virtio_tmf_abort_nomem_cb(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +-} +- +-static void +-bdev_virtio_tmf_abort_ioerr_cb(void *ctx) +-{ +- struct spdk_bdev_io *bdev_io = ctx; +- +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +-} +- +-static void +-bdev_virtio_tmf_abort(struct spdk_bdev_io *bdev_io, int status) +-{ +- spdk_msg_fn fn; +- +- if (status == -ENOMEM) { +- fn = bdev_virtio_tmf_abort_nomem_cb; +- } else { +- fn = bdev_virtio_tmf_abort_ioerr_cb; +- } +- +- spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), fn, bdev_io); +-} +- +-static int +-bdev_virtio_send_tmf_io(struct virtqueue *ctrlq, struct spdk_bdev_io *bdev_io) +-{ +- struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; +- int rc; +- +- rc = virtqueue_req_start(ctrlq, bdev_io, 2); +- if (rc != 0) { +- return rc; +- } +- +- virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); +- virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); +- +- virtqueue_req_flush(ctrlq); +- return 0; +-} +- +-static int +-bdev_virtio_mgmt_poll(void *arg) +-{ +- struct virtio_scsi_dev *svdev = arg; +- struct virtio_dev *vdev = &svdev->vdev; +- struct virtqueue *eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; +- struct virtqueue *ctrlq = 
vdev->vqs[VIRTIO_SCSI_CONTROLQ]; +- struct spdk_ring *send_ring = svdev->ctrlq_ring; +- void *io[16]; +- uint32_t io_len[16]; +- uint16_t i, cnt; +- int rc; +- int total = 0; +- +- cnt = spdk_ring_dequeue(send_ring, io, SPDK_COUNTOF(io)); +- total += cnt; +- for (i = 0; i < cnt; ++i) { +- rc = bdev_virtio_send_tmf_io(ctrlq, io[i]); +- if (rc != 0) { +- bdev_virtio_tmf_abort(io[i], rc); +- } +- } +- +- cnt = virtio_recv_pkts(ctrlq, io, io_len, SPDK_COUNTOF(io)); +- total += cnt; +- for (i = 0; i < cnt; ++i) { +- bdev_virtio_tmf_cpl(io[i]); +- } +- +- cnt = virtio_recv_pkts(eventq, io, io_len, SPDK_COUNTOF(io)); +- total += cnt; +- for (i = 0; i < cnt; ++i) { +- bdev_virtio_eventq_io_cpl(svdev, io[i]); +- } +- +- return total; +-} +- +-static int +-bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf) +-{ +- struct virtio_scsi_dev *svdev = io_device; +- struct virtio_dev *vdev = &svdev->vdev; +- struct bdev_virtio_io_channel *ch = ctx_buf; +- struct virtqueue *vq; +- int32_t queue_idx; +- +- queue_idx = virtio_dev_find_and_acquire_queue(vdev, VIRTIO_SCSI_REQUESTQ); +- if (queue_idx < 0) { +- SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); +- return -1; +- } +- +- vq = vdev->vqs[queue_idx]; +- +- ch->svdev = svdev; +- ch->vq = vq; +- +- ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); +- +- return 0; +-} +- +-static void +-bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_virtio_io_channel *ch = ctx_buf; +- struct virtio_scsi_dev *svdev = ch->svdev; +- struct virtio_dev *vdev = &svdev->vdev; +- struct virtqueue *vq = ch->vq; +- +- spdk_poller_unregister(&ch->poller); +- virtio_dev_release_queue(vdev, vq->vq_queue_index); +-} +- +-static void +-_virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum) +-{ +- struct virtio_scsi_dev *svdev = base->svdev; +- size_t bdevs_cnt; +- struct spdk_bdev *bdevs[BDEV_VIRTIO_MAX_TARGET]; +- struct virtio_scsi_disk *disk; +- struct virtio_scsi_scan_info *tgt, *next_tgt; +- +- spdk_put_io_channel(spdk_io_channel_from_ctx(base->channel)); +- base->svdev->scan_ctx = NULL; +- +- TAILQ_FOREACH_SAFE(tgt, &base->scan_queue, tailq, next_tgt) { +- TAILQ_REMOVE(&base->scan_queue, tgt, tailq); +- free(tgt); +- } +- +- if (base->cb_fn == NULL) { +- spdk_free(base); +- return; +- } +- +- bdevs_cnt = 0; +- if (errnum == 0) { +- TAILQ_FOREACH(disk, &svdev->luns, link) { +- bdevs[bdevs_cnt] = &disk->bdev; +- bdevs_cnt++; +- } +- } +- +- base->cb_fn(base->cb_arg, errnum, bdevs, bdevs_cnt); +- spdk_free(base); +-} +- +-static int +-send_scan_io(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_io_ctx *io_ctx = &base->io_ctx; +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct virtqueue *vq = base->channel->vq; +- int payload_iov_cnt = base->iov.iov_len > 0 ? 
1 : 0; +- int rc; +- +- req->lun[0] = 1; +- req->lun[1] = base->info.target; +- +- rc = virtqueue_req_start(vq, io_ctx, 2 + payload_iov_cnt); +- if (rc != 0) { +- base->needs_resend = true; +- return -1; +- } +- +- virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); +- virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); +- virtqueue_req_add_iovs(vq, &base->iov, payload_iov_cnt, SPDK_VIRTIO_DESC_WR); +- +- virtqueue_req_flush(vq); +- return 0; +-} +- +-static int +-send_inquiry(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct spdk_scsi_cdb_inquiry *cdb; +- +- memset(req, 0, sizeof(*req)); +- +- base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; +- cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; +- cdb->opcode = SPDK_SPC_INQUIRY; +- to_be16(cdb->alloc_len, BDEV_VIRTIO_SCAN_PAYLOAD_SIZE); +- +- return send_scan_io(base); +-} +- +-static int +-send_inquiry_vpd(struct virtio_scsi_scan_base *base, uint8_t page_code) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; +- +- memset(req, 0, sizeof(*req)); +- +- base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; +- inquiry_cdb->opcode = SPDK_SPC_INQUIRY; +- inquiry_cdb->evpd = 1; +- inquiry_cdb->page_code = page_code; +- to_be16(inquiry_cdb->alloc_len, base->iov.iov_len); +- +- return send_scan_io(base); +-} +- +-static int +-send_read_cap_10(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- +- memset(req, 0, sizeof(*req)); +- +- base->iov.iov_len = 8; +- req->cdb[0] = SPDK_SBC_READ_CAPACITY_10; +- +- return send_scan_io(base); +-} +- +-static int +-send_read_cap_16(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- +- memset(req, 0, sizeof(*req)); +- +- base->iov.iov_len = 32; +- req->cdb[0] = SPDK_SPC_SERVICE_ACTION_IN_16; +- req->cdb[1] = SPDK_SBC_SAI_READ_CAPACITY_16; +- to_be32(&req->cdb[10], base->iov.iov_len); +- +- return send_scan_io(base); +-} +- +-static int +-send_test_unit_ready(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- +- memset(req, 0, sizeof(*req)); +- req->cdb[0] = SPDK_SPC_TEST_UNIT_READY; +- base->iov.iov_len = 0; +- +- return send_scan_io(base); +-} +- +-static int +-send_start_stop_unit(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- +- memset(req, 0, sizeof(*req)); +- req->cdb[0] = SPDK_SBC_START_STOP_UNIT; +- req->cdb[4] = SPDK_SBC_START_STOP_UNIT_START_BIT; +- base->iov.iov_len = 0; +- +- return send_scan_io(base); +-} +- +-static int +-process_scan_start_stop_unit(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- +- if (resp->status == SPDK_SCSI_STATUS_GOOD) { +- return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); +- } +- +- return -1; +-} +- +-static int +-process_scan_test_unit_ready(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- int sk, asc, ascq; +- +- get_scsi_status(resp, &sk, &asc, &ascq); +- +- /* check response, get VPD if spun up otherwise send SSU */ +- if (resp->status == SPDK_SCSI_STATUS_GOOD) { +- return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); +- } else if (resp->response == VIRTIO_SCSI_S_OK && +- resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && +- sk == SPDK_SCSI_SENSE_UNIT_ATTENTION 
&& +- asc == SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY) { +- return send_start_stop_unit(base); +- } else { +- return -1; +- } +-} +- +-static int +-process_scan_inquiry_standard(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- struct spdk_scsi_cdb_inquiry_data *inquiry_data = +- (struct spdk_scsi_cdb_inquiry_data *)base->payload; +- +- if (resp->status != SPDK_SCSI_STATUS_GOOD) { +- return -1; +- } +- +- /* check to make sure its a supported device */ +- if (inquiry_data->peripheral_device_type != SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK || +- inquiry_data->peripheral_qualifier != SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED) { +- SPDK_WARNLOG("Unsupported peripheral device type 0x%02x (qualifier 0x%02x)\n", +- inquiry_data->peripheral_device_type, +- inquiry_data->peripheral_qualifier); +- return -1; +- } +- +- return send_test_unit_ready(base); +-} +- +-static int +-process_scan_inquiry_vpd_supported_vpd_pages(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- bool block_provisioning_page_supported = false; +- +- if (resp->status == SPDK_SCSI_STATUS_GOOD) { +- const uint8_t *vpd_data = base->payload; +- const uint8_t *supported_vpd_pages = vpd_data + 4; +- uint16_t page_length; +- uint16_t num_supported_pages; +- uint16_t i; +- +- page_length = from_be16(vpd_data + 2); +- num_supported_pages = spdk_min(page_length, base->iov.iov_len - 4); +- +- for (i = 0; i < num_supported_pages; i++) { +- if (supported_vpd_pages[i] == SPDK_SPC_VPD_BLOCK_THIN_PROVISION) { +- block_provisioning_page_supported = true; +- break; +- } +- } +- } +- +- if (block_provisioning_page_supported) { +- return send_inquiry_vpd(base, SPDK_SPC_VPD_BLOCK_THIN_PROVISION); +- } else { +- return send_read_cap_10(base); +- } +-} +- +-static int +-process_scan_inquiry_vpd_block_thin_provision(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- +- base->info.unmap_supported = false; +- +- if (resp->status == SPDK_SCSI_STATUS_GOOD) { +- uint8_t *vpd_data = base->payload; +- +- base->info.unmap_supported = !!(vpd_data[5] & SPDK_SCSI_UNMAP_LBPU); +- } +- +- SPDK_INFOLOG(virtio, "Target %u: unmap supported = %d\n", +- base->info.target, (int)base->info.unmap_supported); +- +- return send_read_cap_10(base); +-} +- +-static int +-process_scan_inquiry(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; +- +- if ((inquiry_cdb->evpd & 1) == 0) { +- return process_scan_inquiry_standard(base); +- } +- +- switch (inquiry_cdb->page_code) { +- case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: +- return process_scan_inquiry_vpd_supported_vpd_pages(base); +- case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: +- return process_scan_inquiry_vpd_block_thin_provision(base); +- default: +- SPDK_DEBUGLOG(virtio, "Unexpected VPD page 0x%02x\n", inquiry_cdb->page_code); +- return -1; +- } +-} +- +-static void +-bdev_virtio_disk_notify_remove(struct virtio_scsi_disk *disk) +-{ +- disk->removed = true; +- spdk_bdev_close(disk->notify_desc); +-} +- +-static void +-bdev_virtio_disk_notify_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- bdev_virtio_disk_notify_remove(event_ctx); +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-/* To be called only from the 
thread performing target scan */ +-static int +-virtio_scsi_dev_add_tgt(struct virtio_scsi_dev *svdev, struct virtio_scsi_scan_info *info) +-{ +- struct virtio_scsi_disk *disk; +- struct spdk_bdev *bdev; +- int rc; +- +- TAILQ_FOREACH(disk, &svdev->luns, link) { +- if (disk->info.target == info->target) { +- /* Target is already attached and param change is not supported */ +- return 0; +- } +- } +- +- if (info->block_size == 0 || info->num_blocks == 0) { +- SPDK_ERRLOG("%s: invalid target %u: bs=%"PRIu32" blocks=%"PRIu64"\n", +- svdev->vdev.name, info->target, info->block_size, info->num_blocks); +- return -EINVAL; +- } +- +- disk = calloc(1, sizeof(*disk)); +- if (disk == NULL) { +- SPDK_ERRLOG("could not allocate disk\n"); +- return -ENOMEM; +- } +- +- disk->svdev = svdev; +- memcpy(&disk->info, info, sizeof(*info)); +- +- bdev = &disk->bdev; +- bdev->name = spdk_sprintf_alloc("%st%"PRIu8, svdev->vdev.name, info->target); +- if (bdev->name == NULL) { +- SPDK_ERRLOG("Couldn't alloc memory for the bdev name.\n"); +- free(disk); +- return -ENOMEM; +- } +- +- bdev->product_name = "Virtio SCSI Disk"; +- bdev->write_cache = 0; +- bdev->blocklen = disk->info.block_size; +- bdev->blockcnt = disk->info.num_blocks; +- +- bdev->ctxt = disk; +- bdev->fn_table = &virtio_fn_table; +- bdev->module = &virtio_scsi_if; +- +- rc = spdk_bdev_register(&disk->bdev); +- if (rc) { +- SPDK_ERRLOG("Failed to register bdev name=%s\n", disk->bdev.name); +- free(bdev->name); +- free(disk); +- return rc; +- } +- +- rc = spdk_bdev_open_ext(bdev->name, false, bdev_virtio_disk_notify_event_cb, +- disk, &disk->notify_desc); +- if (rc) { +- assert(false); +- } +- +- TAILQ_INSERT_TAIL(&svdev->luns, disk, link); +- return 0; +-} +- +-static int +-process_read_cap_10(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- uint64_t max_block; +- uint32_t block_size; +- uint8_t target_id = req->lun[1]; +- int rc; +- +- if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { +- SPDK_ERRLOG("READ CAPACITY (10) failed for target %"PRIu8".\n", target_id); +- return -1; +- } +- +- block_size = from_be32(base->payload + 4); +- max_block = from_be32(base->payload); +- +- if (max_block == 0xffffffff) { +- return send_read_cap_16(base); +- } +- +- base->info.num_blocks = (uint64_t)max_block + 1; +- base->info.block_size = block_size; +- +- rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); +- if (rc != 0) { +- return rc; +- } +- +- return _virtio_scsi_dev_scan_next(base, 0); +-} +- +-static int +-process_read_cap_16(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- uint8_t target_id = req->lun[1]; +- int rc; +- +- if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { +- SPDK_ERRLOG("READ CAPACITY (16) failed for target %"PRIu8".\n", target_id); +- return -1; +- } +- +- base->info.num_blocks = from_be64(base->payload) + 1; +- base->info.block_size = from_be32(base->payload + 8); +- rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); +- if (rc != 0) { +- return rc; +- } +- +- return _virtio_scsi_dev_scan_next(base, 0); +-} +- +-static void +-process_scan_resp(struct virtio_scsi_scan_base *base) +-{ +- struct virtio_scsi_cmd_req *req = &base->io_ctx.req; +- struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; +- int rc, sk, asc, ascq; +- uint8_t target_id; +- +- if 
(base->io_ctx.iov_req.iov_len < sizeof(struct virtio_scsi_cmd_req) || +- base->io_ctx.iov_resp.iov_len < sizeof(struct virtio_scsi_cmd_resp)) { +- SPDK_ERRLOG("Received target scan message with invalid length.\n"); +- _virtio_scsi_dev_scan_next(base, -EIO); +- return; +- } +- +- get_scsi_status(resp, &sk, &asc, &ascq); +- target_id = req->lun[1]; +- +- if (resp->response == VIRTIO_SCSI_S_BAD_TARGET || +- resp->response == VIRTIO_SCSI_S_INCORRECT_LUN) { +- _virtio_scsi_dev_scan_next(base, -ENODEV); +- return; +- } +- +- if (resp->response != VIRTIO_SCSI_S_OK || +- (resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && +- sk != SPDK_SCSI_SENSE_ILLEGAL_REQUEST)) { +- assert(base->retries > 0); +- base->retries--; +- if (base->retries == 0) { +- SPDK_NOTICELOG("Target %"PRIu8" is present, but unavailable.\n", target_id); +- SPDK_LOGDUMP(virtio, "CDB", req->cdb, sizeof(req->cdb)); +- SPDK_LOGDUMP(virtio, "SENSE DATA", resp->sense, sizeof(resp->sense)); +- _virtio_scsi_dev_scan_next(base, -EBUSY); +- return; +- } +- +- /* resend the same request */ +- rc = send_scan_io(base); +- if (rc != 0) { +- /* Let response poller do the resend */ +- } +- return; +- } +- +- base->retries = SCAN_REQUEST_RETRIES; +- +- switch (req->cdb[0]) { +- case SPDK_SPC_INQUIRY: +- rc = process_scan_inquiry(base); +- break; +- case SPDK_SPC_TEST_UNIT_READY: +- rc = process_scan_test_unit_ready(base); +- break; +- case SPDK_SBC_START_STOP_UNIT: +- rc = process_scan_start_stop_unit(base); +- break; +- case SPDK_SBC_READ_CAPACITY_10: +- rc = process_read_cap_10(base); +- break; +- case SPDK_SPC_SERVICE_ACTION_IN_16: +- rc = process_read_cap_16(base); +- break; +- default: +- SPDK_ERRLOG("Received invalid target scan message: cdb[0] = %"PRIu8".\n", req->cdb[0]); +- rc = -1; +- break; +- } +- +- if (rc != 0) { +- if (base->needs_resend) { +- return; /* Let response poller do the resend */ +- } +- +- _virtio_scsi_dev_scan_next(base, rc); +- } +-} +- +-static int +-_virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc) +-{ +- struct virtio_scsi_scan_info *next; +- struct virtio_scsi_disk *disk; +- uint8_t target_id; +- +- if (base->full_scan) { +- if (rc != 0) { +- disk = virtio_scsi_dev_get_disk_by_id(base->svdev, +- base->info.target); +- if (disk != NULL) { +- spdk_bdev_unregister(&disk->bdev, NULL, NULL); +- } +- } +- +- target_id = base->info.target + 1; +- if (target_id < BDEV_VIRTIO_MAX_TARGET) { +- _virtio_scsi_dev_scan_tgt(base, target_id); +- return 0; +- } +- +- base->full_scan = false; +- } +- +- next = TAILQ_FIRST(&base->scan_queue); +- if (next == NULL) { +- _virtio_scsi_dev_scan_finish(base, 0); +- return 0; +- } +- +- TAILQ_REMOVE(&base->scan_queue, next, tailq); +- target_id = next->target; +- free(next); +- +- _virtio_scsi_dev_scan_tgt(base, target_id); +- return 0; +-} +- +-static int +-_virtio_scsi_dev_scan_init(struct virtio_scsi_dev *svdev) +-{ +- struct virtio_scsi_scan_base *base; +- struct spdk_io_channel *io_ch; +- struct virtio_scsi_io_ctx *io_ctx; +- struct virtio_scsi_cmd_req *req; +- struct virtio_scsi_cmd_resp *resp; +- +- io_ch = spdk_get_io_channel(svdev); +- if (io_ch == NULL) { +- return -EBUSY; +- } +- +- base = spdk_zmalloc(sizeof(*base), 64, NULL, +- SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); +- if (base == NULL) { +- SPDK_ERRLOG("couldn't allocate memory for scsi target scan.\n"); +- return -ENOMEM; +- } +- +- base->svdev = svdev; +- +- base->channel = spdk_io_channel_get_ctx(io_ch); +- TAILQ_INIT(&base->scan_queue); +- svdev->scan_ctx = base; +- +- base->iov.iov_base = 
base->payload; +- io_ctx = &base->io_ctx; +- req = &io_ctx->req; +- resp = &io_ctx->resp; +- io_ctx->iov_req.iov_base = req; +- io_ctx->iov_req.iov_len = sizeof(*req); +- io_ctx->iov_resp.iov_base = resp; +- io_ctx->iov_resp.iov_len = sizeof(*resp); +- +- base->retries = SCAN_REQUEST_RETRIES; +- return 0; +-} +- +-static void +-_virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target) +-{ +- int rc; +- +- memset(&base->info, 0, sizeof(base->info)); +- base->info.target = target; +- +- rc = send_inquiry(base); +- if (rc) { +- /* Let response poller do the resend */ +- } +-} +- +-static int +-virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, bdev_virtio_create_cb cb_fn, +- void *cb_arg) +-{ +- struct virtio_scsi_scan_base *base; +- struct virtio_scsi_scan_info *tgt, *next_tgt; +- int rc; +- +- if (svdev->scan_ctx) { +- if (svdev->scan_ctx->full_scan) { +- return -EEXIST; +- } +- +- /* We're about to start a full rescan, so there's no need +- * to scan particular targets afterwards. +- */ +- TAILQ_FOREACH_SAFE(tgt, &svdev->scan_ctx->scan_queue, tailq, next_tgt) { +- TAILQ_REMOVE(&svdev->scan_ctx->scan_queue, tgt, tailq); +- free(tgt); +- } +- +- svdev->scan_ctx->cb_fn = cb_fn; +- svdev->scan_ctx->cb_arg = cb_arg; +- svdev->scan_ctx->restart = true; +- return 0; +- } +- +- rc = _virtio_scsi_dev_scan_init(svdev); +- if (rc != 0) { +- return rc; +- } +- +- base = svdev->scan_ctx; +- base->cb_fn = cb_fn; +- base->cb_arg = cb_arg; +- base->full_scan = true; +- +- _virtio_scsi_dev_scan_tgt(base, 0); +- return 0; +-} +- +-static int +-virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target) +-{ +- struct virtio_scsi_scan_base *base; +- struct virtio_scsi_scan_info *info; +- int rc; +- +- base = svdev->scan_ctx; +- if (base) { +- info = calloc(1, sizeof(*info)); +- if (info == NULL) { +- SPDK_ERRLOG("calloc failed\n"); +- return -ENOMEM; +- } +- +- info->target = target; +- TAILQ_INSERT_TAIL(&base->scan_queue, info, tailq); +- return 0; +- } +- +- rc = _virtio_scsi_dev_scan_init(svdev); +- if (rc != 0) { +- return rc; +- } +- +- base = svdev->scan_ctx; +- base->full_scan = true; +- _virtio_scsi_dev_scan_tgt(base, target); +- return 0; +-} +- +-static int +-bdev_virtio_initialize(void) +-{ +- return 0; +-} +- +-static void +-_virtio_scsi_dev_unregister_cb(void *io_device) +-{ +- struct virtio_scsi_dev *svdev = io_device; +- struct virtio_dev *vdev = &svdev->vdev; +- bool finish_module; +- bdev_virtio_remove_cb remove_cb; +- void *remove_ctx; +- +- assert(spdk_ring_count(svdev->ctrlq_ring) == 0); +- spdk_ring_free(svdev->ctrlq_ring); +- spdk_poller_unregister(&svdev->mgmt_poller); +- +- virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); +- virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); +- +- virtio_dev_stop(vdev); +- virtio_dev_destruct(vdev); +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- TAILQ_REMOVE(&g_virtio_scsi_devs, svdev, tailq); +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- +- remove_cb = svdev->remove_cb; +- remove_ctx = svdev->remove_ctx; +- spdk_free(svdev->eventq_ios); +- free(svdev); +- +- if (remove_cb) { +- remove_cb(remove_ctx, 0); +- } +- +- finish_module = TAILQ_EMPTY(&g_virtio_scsi_devs); +- +- if (g_bdev_virtio_finish && finish_module) { +- spdk_bdev_module_fini_done(); +- } +-} +- +-static void +-virtio_scsi_dev_unregister_cb(void *io_device) +-{ +- struct virtio_scsi_dev *svdev = io_device; +- struct spdk_thread *thread; +- +- thread = virtio_dev_queue_get_thread(&svdev->vdev, VIRTIO_SCSI_CONTROLQ); +- 
spdk_thread_send_msg(thread, _virtio_scsi_dev_unregister_cb, io_device); +-} +- +-static void +-virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, +- bdev_virtio_remove_cb cb_fn, void *cb_arg) +-{ +- struct virtio_scsi_disk *disk, *disk_tmp; +- bool do_remove = true; +- +- if (svdev->removed) { +- if (cb_fn) { +- cb_fn(cb_arg, -EBUSY); +- } +- return; +- } +- +- svdev->remove_cb = cb_fn; +- svdev->remove_ctx = cb_arg; +- svdev->removed = true; +- +- if (svdev->scan_ctx) { +- /* The removal will continue after we receive a pending scan I/O. */ +- return; +- } +- +- TAILQ_FOREACH_SAFE(disk, &svdev->luns, link, disk_tmp) { +- if (!disk->removed) { +- spdk_bdev_unregister(&disk->bdev, NULL, NULL); +- } +- do_remove = false; +- } +- +- if (do_remove) { +- spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); +- } +-} +- +-static void +-bdev_virtio_finish(void) +-{ +- struct virtio_scsi_dev *svdev, *next; +- +- g_bdev_virtio_finish = true; +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- spdk_bdev_module_fini_done(); +- return; +- } +- +- /* Defer module finish until all controllers are removed. */ +- TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next) { +- virtio_scsi_dev_remove(svdev, NULL, NULL); +- } +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +-} +- +-int +-bdev_virtio_user_scsi_dev_create(const char *base_name, const char *path, +- unsigned num_queues, unsigned queue_size, +- bdev_virtio_create_cb cb_fn, void *cb_arg) +-{ +- struct virtio_scsi_dev *svdev; +- int rc; +- +- svdev = virtio_user_scsi_dev_create(base_name, path, num_queues, queue_size); +- if (svdev == NULL) { +- return -1; +- } +- +- rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); +- if (rc) { +- virtio_scsi_dev_remove(svdev, NULL, NULL); +- } +- +- return rc; +-} +- +-int +-bdev_vfio_user_scsi_dev_create(const char *base_name, const char *path, +- bdev_virtio_create_cb cb_fn, void *cb_arg) +-{ +- struct virtio_scsi_dev *svdev; +- uint32_t num_queues = 0; +- int rc; +- +- svdev = calloc(1, sizeof(*svdev)); +- if (svdev == NULL) { +- SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", base_name, path); +- return -ENOMEM; +- } +- +- rc = virtio_vfio_user_dev_init(&svdev->vdev, base_name, path); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to create %s as virtio device\n", path); +- free(svdev); +- return -EFAULT; +- } +- +- rc = virtio_dev_read_dev_config(&svdev->vdev, offsetof(struct virtio_scsi_config, num_queues), +- &num_queues, sizeof(num_queues)); +- if (rc) { +- SPDK_ERRLOG("%s: config read failed: %s\n", base_name, spdk_strerror(-rc)); +- virtio_dev_destruct(&svdev->vdev); +- free(svdev); +- return rc; +- } +- +- if (num_queues < SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED) { +- SPDK_ERRLOG("%s: invalid num_queues %u\n", base_name, num_queues); +- virtio_dev_destruct(&svdev->vdev); +- free(svdev); +- return -EINVAL; +- } +- +- rc = virtio_scsi_dev_init(svdev, num_queues, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); +- if (rc != 0) { +- virtio_dev_destruct(&svdev->vdev); +- free(svdev); +- return -EFAULT; +- } +- +- rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); +- if (rc) { +- virtio_scsi_dev_remove(svdev, NULL, NULL); +- } +- +- return rc; +-} +- +-struct bdev_virtio_pci_dev_create_ctx { +- const char *name; +- bdev_virtio_create_cb cb_fn; +- void *cb_arg; +-}; +- +-static int +-bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +-{ +- struct virtio_scsi_dev *svdev; +- struct 
bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; +- int rc; +- +- svdev = virtio_pci_scsi_dev_create(create_ctx->name, pci_ctx); +- if (svdev == NULL) { +- return -1; +- } +- +- rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg); +- if (rc) { +- svdev->vdev.ctx = NULL; +- virtio_scsi_dev_remove(svdev, NULL, NULL); +- } +- +- return rc; +-} +- +-int +-bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, +- bdev_virtio_create_cb cb_fn, void *cb_arg) +-{ +- struct bdev_virtio_pci_dev_create_ctx create_ctx; +- +- create_ctx.name = name; +- create_ctx.cb_fn = cb_fn; +- create_ctx.cb_arg = cb_arg; +- +- return virtio_pci_dev_attach(bdev_virtio_pci_scsi_dev_create_cb, &create_ctx, +- VIRTIO_ID_SCSI, pci_addr); +-} +- +-int +-bdev_virtio_scsi_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) +-{ +- struct virtio_scsi_dev *svdev; +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { +- if (strcmp(svdev->vdev.name, name) == 0) { +- break; +- } +- } +- +- if (svdev == NULL) { +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- SPDK_ERRLOG("Cannot find Virtio-SCSI device named '%s'\n", name); +- return -ENODEV; +- } +- +- virtio_scsi_dev_remove(svdev, cb_fn, cb_arg); +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- +- return 0; +-} +- +-void +-bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *w) +-{ +- struct virtio_scsi_dev *svdev; +- +- spdk_json_write_array_begin(w); +- +- pthread_mutex_lock(&g_virtio_scsi_mutex); +- TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "name", svdev->vdev.name); +- +- virtio_dev_dump_json_info(&svdev->vdev, w); +- +- spdk_json_write_object_end(w); +- } +- pthread_mutex_unlock(&g_virtio_scsi_mutex); +- +- spdk_json_write_array_end(w); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(virtio) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/endian.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++#include "spdk/scsi_spec.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/json.h" ++ ++#include "spdk/bdev_module.h" ++#include "spdk/log.h" ++#include "spdk_internal/virtio.h" ++#include "spdk_internal/vhost_user.h" ++ ++#include ++#include ++ ++#include "bdev_virtio.h" ++ ++#define BDEV_VIRTIO_MAX_TARGET 64 ++#define BDEV_VIRTIO_SCAN_PAYLOAD_SIZE 256 ++#define MGMT_POLL_PERIOD_US (1000 * 5) ++#define CTRLQ_RING_SIZE 16 ++#define SCAN_REQUEST_RETRIES 5 ++ ++/* Number of non-request queues - eventq and controlq */ ++#define SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED 2 ++ ++#define VIRTIO_SCSI_EVENTQ_BUFFER_COUNT 16 ++ ++#define VIRTIO_SCSI_CONTROLQ 0 ++#define VIRTIO_SCSI_EVENTQ 1 ++#define VIRTIO_SCSI_REQUESTQ 2 ++ ++static int bdev_virtio_initialize(void); ++static void bdev_virtio_finish(void); ++ ++struct virtio_scsi_dev { ++ /* Generic virtio device data. */ ++ struct virtio_dev vdev; ++ ++ /** Detected SCSI LUNs */ ++ TAILQ_HEAD(, virtio_scsi_disk) luns; ++ ++ /** Context for the SCSI target scan. */ ++ struct virtio_scsi_scan_base *scan_ctx; ++ ++ /** Controlq poller. */ ++ struct spdk_poller *mgmt_poller; ++ ++ /** Controlq messages to be sent. */ ++ struct spdk_ring *ctrlq_ring; ++ ++ /** Buffers for the eventq. */ ++ struct virtio_scsi_eventq_io *eventq_ios; ++ ++ /** Device marked for removal. 
*/ ++ bool removed; ++ ++ /** Callback to be called after vdev removal. */ ++ bdev_virtio_remove_cb remove_cb; ++ ++ /** Context for the `remove_cb`. */ ++ void *remove_ctx; ++ ++ TAILQ_ENTRY(virtio_scsi_dev) tailq; ++}; ++ ++struct virtio_scsi_io_ctx { ++ struct iovec iov_req; ++ struct iovec iov_resp; ++ union { ++ struct virtio_scsi_cmd_req req; ++ struct virtio_scsi_ctrl_tmf_req tmf_req; ++ }; ++ union { ++ struct virtio_scsi_cmd_resp resp; ++ struct virtio_scsi_ctrl_tmf_resp tmf_resp; ++ }; ++}; ++ ++struct virtio_scsi_eventq_io { ++ struct iovec iov; ++ struct virtio_scsi_event ev; ++}; ++ ++struct virtio_scsi_scan_info { ++ uint64_t num_blocks; ++ uint32_t block_size; ++ uint8_t target; ++ bool unmap_supported; ++ TAILQ_ENTRY(virtio_scsi_scan_info) tailq; ++}; ++ ++struct virtio_scsi_scan_base { ++ struct virtio_scsi_dev *svdev; ++ ++ /** I/O channel used for the scan I/O. */ ++ struct bdev_virtio_io_channel *channel; ++ ++ bdev_virtio_create_cb cb_fn; ++ void *cb_arg; ++ ++ /** Scan all targets on the device. */ ++ bool full_scan; ++ ++ /** Start a full rescan after receiving next scan I/O response. */ ++ bool restart; ++ ++ /** Additional targets to be (re)scanned. */ ++ TAILQ_HEAD(, virtio_scsi_scan_info) scan_queue; ++ ++ /** Remaining attempts for sending the current request. */ ++ unsigned retries; ++ ++ /** If set, the last scan I/O needs to be resent */ ++ bool needs_resend; ++ ++ struct virtio_scsi_io_ctx io_ctx; ++ struct iovec iov; ++ uint8_t payload[BDEV_VIRTIO_SCAN_PAYLOAD_SIZE]; ++ ++ /** Scan results for the current target. */ ++ struct virtio_scsi_scan_info info; ++}; ++ ++struct virtio_scsi_disk { ++ struct spdk_bdev bdev; ++ struct virtio_scsi_dev *svdev; ++ struct virtio_scsi_scan_info info; ++ ++ /** Descriptor opened just to be notified of external bdev hotremove. */ ++ struct spdk_bdev_desc *notify_desc; ++ ++ /** Disk marked for removal. */ ++ bool removed; ++ TAILQ_ENTRY(virtio_scsi_disk) link; ++}; ++ ++struct bdev_virtio_io_channel { ++ struct virtio_scsi_dev *svdev; ++ ++ /** Virtqueue exclusively assigned to this channel. */ ++ struct virtqueue *vq; ++ ++ /** Virtio response poller. */ ++ struct spdk_poller *poller; ++}; ++ ++static TAILQ_HEAD(, virtio_scsi_dev) g_virtio_scsi_devs = ++ TAILQ_HEAD_INITIALIZER(g_virtio_scsi_devs); ++ ++static pthread_mutex_t g_virtio_scsi_mutex = PTHREAD_MUTEX_INITIALIZER; ++ ++/** Module finish in progress */ ++static bool g_bdev_virtio_finish = false; ++ ++/* Features desired/implemented by this driver. 
*/ ++#define VIRTIO_SCSI_DEV_SUPPORTED_FEATURES \ ++ (1ULL << VIRTIO_SCSI_F_INOUT | \ ++ 1ULL << VIRTIO_SCSI_F_HOTPLUG | \ ++ 1ULL << VIRTIO_RING_F_EVENT_IDX) ++ ++static void virtio_scsi_dev_unregister_cb(void *io_device); ++static void virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, ++ bdev_virtio_remove_cb cb_fn, void *cb_arg); ++static int bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf); ++static void bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf); ++static void process_scan_resp(struct virtio_scsi_scan_base *base); ++static int bdev_virtio_mgmt_poll(void *arg); ++ ++static int ++virtio_scsi_dev_send_eventq_io(struct virtqueue *vq, struct virtio_scsi_eventq_io *io) ++{ ++ int rc; ++ ++ rc = virtqueue_req_start(vq, io, 1); ++ if (rc != 0) { ++ return -1; ++ } ++ ++ virtqueue_req_add_iovs(vq, &io->iov, 1, SPDK_VIRTIO_DESC_WR); ++ virtqueue_req_flush(vq); ++ ++ return 0; ++} ++ ++static int ++virtio_scsi_dev_init(struct virtio_scsi_dev *svdev, uint16_t max_queues, uint64_t feature_bits) ++{ ++ struct virtio_dev *vdev = &svdev->vdev; ++ struct spdk_ring *ctrlq_ring; ++ struct virtio_scsi_eventq_io *eventq_io; ++ struct virtqueue *eventq; ++ uint16_t i, num_events; ++ int rc; ++ ++ rc = virtio_dev_reset(vdev, feature_bits); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ rc = virtio_dev_start(vdev, max_queues, SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ ctrlq_ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, CTRLQ_RING_SIZE, ++ SPDK_ENV_SOCKET_ID_ANY); ++ if (ctrlq_ring == NULL) { ++ SPDK_ERRLOG("Failed to allocate send ring for the controlq.\n"); ++ return -1; ++ } ++ ++ rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_CONTROLQ); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to acquire the controlq.\n"); ++ spdk_ring_free(ctrlq_ring); ++ return -1; ++ } ++ ++ rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_EVENTQ); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to acquire the eventq.\n"); ++ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); ++ spdk_ring_free(ctrlq_ring); ++ return -1; ++ } ++ ++ eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; ++ num_events = spdk_min(eventq->vq_nentries, VIRTIO_SCSI_EVENTQ_BUFFER_COUNT); ++ svdev->eventq_ios = spdk_zmalloc(sizeof(*svdev->eventq_ios) * num_events, ++ 0, NULL, SPDK_ENV_LCORE_ID_ANY, ++ SPDK_MALLOC_DMA); ++ if (svdev->eventq_ios == NULL) { ++ SPDK_ERRLOG("cannot allocate memory for %"PRIu16" eventq buffers\n", ++ num_events); ++ virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); ++ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); ++ spdk_ring_free(ctrlq_ring); ++ return -1; ++ } ++ ++ for (i = 0; i < num_events; i++) { ++ eventq_io = &svdev->eventq_ios[i]; ++ eventq_io->iov.iov_base = &eventq_io->ev; ++ eventq_io->iov.iov_len = sizeof(eventq_io->ev); ++ virtio_scsi_dev_send_eventq_io(eventq, eventq_io); ++ } ++ ++ svdev->ctrlq_ring = ctrlq_ring; ++ ++ svdev->mgmt_poller = SPDK_POLLER_REGISTER(bdev_virtio_mgmt_poll, svdev, ++ MGMT_POLL_PERIOD_US); ++ ++ TAILQ_INIT(&svdev->luns); ++ svdev->scan_ctx = NULL; ++ svdev->removed = false; ++ svdev->remove_cb = NULL; ++ svdev->remove_ctx = NULL; ++ ++ spdk_io_device_register(svdev, bdev_virtio_scsi_ch_create_cb, ++ bdev_virtio_scsi_ch_destroy_cb, ++ sizeof(struct bdev_virtio_io_channel), ++ svdev->vdev.name); ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ TAILQ_INSERT_TAIL(&g_virtio_scsi_devs, svdev, tailq); ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ return 0; ++} ++ ++static struct virtio_scsi_dev * ++virtio_pci_scsi_dev_create(const char 
*name, struct virtio_pci_ctx *pci_ctx) ++{ ++ static int pci_dev_counter = 0; ++ struct virtio_scsi_dev *svdev; ++ struct virtio_dev *vdev; ++ char *default_name = NULL; ++ uint32_t num_queues; ++ int rc; ++ ++ svdev = calloc(1, sizeof(*svdev)); ++ if (svdev == NULL) { ++ SPDK_ERRLOG("virtio device calloc failed\n"); ++ return NULL; ++ } ++ ++ vdev = &svdev->vdev; ++ if (name == NULL) { ++ default_name = spdk_sprintf_alloc("VirtioScsi%"PRIu32, pci_dev_counter++); ++ if (default_name == NULL) { ++ free(vdev); ++ return NULL; ++ } ++ name = default_name; ++ } ++ ++ rc = virtio_pci_dev_init(vdev, name, pci_ctx); ++ free(default_name); ++ ++ if (rc != 0) { ++ free(svdev); ++ return NULL; ++ } ++ ++ rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_scsi_config, num_queues), ++ &num_queues, sizeof(num_queues)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); ++ goto fail; ++ } ++ ++ rc = virtio_scsi_dev_init(svdev, num_queues, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); ++ if (rc != 0) { ++ goto fail; ++ } ++ ++ return svdev; ++ ++fail: ++ vdev->ctx = NULL; ++ virtio_dev_destruct(vdev); ++ free(svdev); ++ return NULL; ++} ++ ++static struct virtio_scsi_dev * ++virtio_user_scsi_dev_create(const char *name, const char *path, ++ uint16_t num_queues, uint32_t queue_size) ++{ ++ struct virtio_scsi_dev *svdev; ++ struct virtio_dev *vdev; ++ uint64_t feature_bits; ++ int rc; ++ ++ svdev = calloc(1, sizeof(*svdev)); ++ if (svdev == NULL) { ++ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); ++ return NULL; ++ } ++ ++ vdev = &svdev->vdev; ++ rc = virtio_user_dev_init(vdev, name, path, queue_size); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); ++ free(svdev); ++ return NULL; ++ } ++ ++ feature_bits = VIRTIO_SCSI_DEV_SUPPORTED_FEATURES; ++ feature_bits |= (1ULL << VHOST_USER_F_PROTOCOL_FEATURES); ++ rc = virtio_scsi_dev_init(svdev, num_queues + SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED, feature_bits); ++ if (rc != 0) { ++ virtio_dev_destruct(vdev); ++ free(svdev); ++ return NULL; ++ } ++ ++ return svdev; ++} ++ ++static struct virtio_scsi_disk * ++virtio_scsi_dev_get_disk_by_id(struct virtio_scsi_dev *svdev, uint8_t target_id) ++{ ++ struct virtio_scsi_disk *disk; ++ ++ TAILQ_FOREACH(disk, &svdev->luns, link) { ++ if (disk->info.target == target_id) { ++ return disk; ++ } ++ } ++ ++ return NULL; ++} ++ ++static int virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, ++ bdev_virtio_create_cb cb_fn, void *cb_arg); ++static int send_scan_io(struct virtio_scsi_scan_base *base); ++static void _virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target); ++static int _virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc); ++static void _virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum); ++static int virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target); ++ ++static int ++bdev_virtio_get_ctx_size(void) ++{ ++ return sizeof(struct virtio_scsi_io_ctx); ++} ++ ++static int ++bdev_virtio_scsi_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct virtio_scsi_dev *svdev; ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_virtio_attach_controller"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", svdev->vdev.name); ++ spdk_json_write_named_string(w, 
"dev_type", "scsi"); ++ ++ /* Write transport specific parameters. */ ++ svdev->vdev.backend_ops->write_json_config(&svdev->vdev, w); ++ ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ ++ } ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ ++ return 0; ++} ++ ++ ++static struct spdk_bdev_module virtio_scsi_if = { ++ .name = "virtio_scsi", ++ .module_init = bdev_virtio_initialize, ++ .module_fini = bdev_virtio_finish, ++ .get_ctx_size = bdev_virtio_get_ctx_size, ++ .config_json = bdev_virtio_scsi_config_json, ++ .async_fini = true, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(virtio_scsi, &virtio_scsi_if) ++ ++static struct virtio_scsi_io_ctx * ++bdev_virtio_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_cmd_req *req; ++ struct virtio_scsi_cmd_resp *resp; ++ struct virtio_scsi_disk *disk = (struct virtio_scsi_disk *)bdev_io->bdev; ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ ++ req = &io_ctx->req; ++ resp = &io_ctx->resp; ++ ++ io_ctx->iov_req.iov_base = req; ++ io_ctx->iov_req.iov_len = sizeof(*req); ++ ++ io_ctx->iov_resp.iov_base = resp; ++ io_ctx->iov_resp.iov_len = sizeof(*resp); ++ ++ memset(req, 0, sizeof(*req)); ++ req->lun[0] = 1; ++ req->lun[1] = disk->info.target; ++ ++ return io_ctx; ++} ++ ++static struct virtio_scsi_io_ctx * ++bdev_virtio_init_tmf_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_ctrl_tmf_req *tmf_req; ++ struct virtio_scsi_ctrl_tmf_resp *tmf_resp; ++ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ ++ tmf_req = &io_ctx->tmf_req; ++ tmf_resp = &io_ctx->tmf_resp; ++ ++ io_ctx->iov_req.iov_base = tmf_req; ++ io_ctx->iov_req.iov_len = sizeof(*tmf_req); ++ io_ctx->iov_resp.iov_base = tmf_resp; ++ io_ctx->iov_resp.iov_len = sizeof(*tmf_resp); ++ ++ memset(tmf_req, 0, sizeof(*tmf_req)); ++ tmf_req->lun[0] = 1; ++ tmf_req->lun[1] = disk->info.target; ++ ++ return io_ctx; ++} ++ ++static void ++bdev_virtio_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_virtio_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); ++ struct virtqueue *vq = virtio_channel->vq; ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ int rc; ++ ++ rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); ++ if (rc == -ENOMEM) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ return; ++ } else if (rc != 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); ++ if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); ++ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ SPDK_VIRTIO_DESC_WR); ++ } else { ++ virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ SPDK_VIRTIO_DESC_RO); ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); ++ } ++ ++ virtqueue_req_flush(vq); ++} ++ ++static void ++bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); ++ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); ++ struct virtio_scsi_cmd_req 
*req = &io_ctx->req; ++ bool is_write = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE; ++ ++ if (disk->info.num_blocks > (1ULL << 32)) { ++ req->cdb[0] = is_write ? SPDK_SBC_WRITE_16 : SPDK_SBC_READ_16; ++ to_be64(&req->cdb[2], bdev_io->u.bdev.offset_blocks); ++ to_be32(&req->cdb[10], bdev_io->u.bdev.num_blocks); ++ } else { ++ req->cdb[0] = is_write ? SPDK_SBC_WRITE_10 : SPDK_SBC_READ_10; ++ to_be32(&req->cdb[2], bdev_io->u.bdev.offset_blocks); ++ to_be16(&req->cdb[7], bdev_io->u.bdev.num_blocks); ++ } ++ ++ bdev_virtio_send_io(ch, bdev_io); ++} ++ ++static void ++bdev_virtio_reset(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_virtio_io_channel *virtio_ch = spdk_io_channel_get_ctx(ch); ++ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_tmf_vreq(ch, bdev_io); ++ struct virtio_scsi_ctrl_tmf_req *tmf_req = &io_ctx->tmf_req; ++ struct virtio_scsi_dev *svdev = virtio_ch->svdev; ++ size_t enqueued_count; ++ ++ tmf_req->type = VIRTIO_SCSI_T_TMF; ++ tmf_req->subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET; ++ ++ enqueued_count = spdk_ring_enqueue(svdev->ctrlq_ring, (void **)&bdev_io, 1, NULL); ++ if (spdk_likely(enqueued_count == 1)) { ++ return; ++ } else { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ } ++} ++ ++static void ++bdev_virtio_unmap(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); ++ struct virtio_scsi_cmd_req *req = &io_ctx->req; ++ struct spdk_scsi_unmap_bdesc *desc, *first_desc; ++ uint8_t *buf; ++ uint64_t offset_blocks, num_blocks; ++ uint16_t cmd_len; ++ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ buf = bdev_io->u.bdev.iovs[0].iov_base; ++ ++ offset_blocks = bdev_io->u.bdev.offset_blocks; ++ num_blocks = bdev_io->u.bdev.num_blocks; ++ ++ /* (n-1) * 16-byte descriptors */ ++ first_desc = desc = (struct spdk_scsi_unmap_bdesc *)&buf[8]; ++ while (num_blocks > UINT32_MAX) { ++ to_be64(&desc->lba, offset_blocks); ++ to_be32(&desc->block_count, UINT32_MAX); ++ memset(&desc->reserved, 0, sizeof(desc->reserved)); ++ offset_blocks += UINT32_MAX; ++ num_blocks -= UINT32_MAX; ++ desc++; ++ } ++ ++ /* The last descriptor with block_count <= UINT32_MAX */ ++ to_be64(&desc->lba, offset_blocks); ++ to_be32(&desc->block_count, num_blocks); ++ memset(&desc->reserved, 0, sizeof(desc->reserved)); ++ ++ /* 8-byte header + n * 16-byte block descriptor */ ++ cmd_len = 8 + (desc - first_desc + 1) * sizeof(struct spdk_scsi_unmap_bdesc); ++ ++ req->cdb[0] = SPDK_SBC_UNMAP; ++ to_be16(&req->cdb[7], cmd_len); ++ ++ /* 8-byte header */ ++ to_be16(&buf[0], cmd_len - 2); /* total length (excluding the length field) */ ++ to_be16(&buf[2], cmd_len - 8); /* length of block descriptors */ ++ memset(&buf[4], 0, 4); /* reserved */ ++ ++ bdev_virtio_send_io(ch, bdev_io); ++} ++ ++static void ++bdev_virtio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, ++ bool success) ++{ ++ if (!success) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ bdev_virtio_rw(ch, bdev_io); ++} ++ ++static int ++_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * 
bdev_io->bdev->blocklen); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ bdev_virtio_rw(ch, bdev_io); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_RESET: ++ bdev_virtio_reset(ch, bdev_io); ++ return 0; ++ case SPDK_BDEV_IO_TYPE_UNMAP: { ++ uint64_t buf_len = 8 /* header size */ + ++ (bdev_io->u.bdev.num_blocks + UINT32_MAX - 1) / ++ UINT32_MAX * sizeof(struct spdk_scsi_unmap_bdesc); ++ ++ if (!disk->info.unmap_supported) { ++ return -1; ++ } ++ ++ if (buf_len > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { ++ SPDK_ERRLOG("Trying to UNMAP too many blocks: %"PRIu64"\n", ++ bdev_io->u.bdev.num_blocks); ++ return -1; ++ } ++ spdk_bdev_io_get_buf(bdev_io, bdev_virtio_unmap, buf_len); ++ return 0; ++ } ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ default: ++ return -1; ++ } ++ return 0; ++} ++ ++static void ++bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static bool ++bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ struct virtio_scsi_disk *disk = ctx; ++ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_FLUSH: ++ case SPDK_BDEV_IO_TYPE_RESET: ++ return true; ++ ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ return disk->info.unmap_supported; ++ ++ default: ++ return false; ++ } ++} ++ ++static struct spdk_io_channel * ++bdev_virtio_get_io_channel(void *ctx) ++{ ++ struct virtio_scsi_disk *disk = ctx; ++ ++ return spdk_get_io_channel(disk->svdev); ++} ++ ++static int ++bdev_virtio_disk_destruct(void *ctx) ++{ ++ struct virtio_scsi_disk *disk = ctx; ++ struct virtio_scsi_dev *svdev = disk->svdev; ++ ++ TAILQ_REMOVE(&svdev->luns, disk, link); ++ free(disk->bdev.name); ++ free(disk); ++ ++ if (svdev->removed && TAILQ_EMPTY(&svdev->luns)) { ++ spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); ++ } ++ ++ return 0; ++} ++ ++static int ++bdev_virtio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct virtio_scsi_disk *disk = ctx; ++ ++ virtio_dev_dump_json_info(&disk->svdev->vdev, w); ++ return 0; ++} ++ ++static void ++bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) ++{ ++ /* SCSI targets and LUNS are discovered during scan process so nothing ++ * to save here. 
++ */ ++} ++ ++static const struct spdk_bdev_fn_table virtio_fn_table = { ++ .destruct = bdev_virtio_disk_destruct, ++ .submit_request = bdev_virtio_submit_request, ++ .io_type_supported = bdev_virtio_io_type_supported, ++ .get_io_channel = bdev_virtio_get_io_channel, ++ .dump_info_json = bdev_virtio_dump_info_json, ++ .write_config_json = bdev_virtio_write_config_json, ++}; ++ ++static void ++get_scsi_status(struct virtio_scsi_cmd_resp *resp, int *sk, int *asc, int *ascq) ++{ ++ /* see spdk_scsi_task_build_sense_data() for sense data details */ ++ *sk = 0; ++ *asc = 0; ++ *ascq = 0; ++ ++ if (resp->sense_len < 3) { ++ return; ++ } ++ ++ *sk = resp->sense[2] & 0xf; ++ ++ if (resp->sense_len < 13) { ++ return; ++ } ++ ++ *asc = resp->sense[12]; ++ ++ if (resp->sense_len < 14) { ++ return; ++ } ++ ++ *ascq = resp->sense[13]; ++} ++ ++static void ++bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ int sk, asc, ascq; ++ ++ get_scsi_status(&io_ctx->resp, &sk, &asc, &ascq); ++ spdk_bdev_io_complete_scsi_status(bdev_io, io_ctx->resp.status, sk, asc, ascq); ++} ++ ++static int ++bdev_virtio_poll(void *arg) ++{ ++ struct bdev_virtio_io_channel *ch = arg; ++ struct virtio_scsi_dev *svdev = ch->svdev; ++ struct virtio_scsi_scan_base *scan_ctx = svdev->scan_ctx; ++ void *io[32]; ++ uint32_t io_len[32]; ++ uint16_t i, cnt; ++ int rc; ++ ++ cnt = virtio_recv_pkts(ch->vq, (void **)io, io_len, SPDK_COUNTOF(io)); ++ for (i = 0; i < cnt; ++i) { ++ if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) { ++ if (svdev->removed) { ++ _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ if (scan_ctx->restart) { ++ scan_ctx->restart = false; ++ scan_ctx->full_scan = true; ++ _virtio_scsi_dev_scan_tgt(scan_ctx, 0); ++ continue; ++ } ++ ++ process_scan_resp(scan_ctx); ++ continue; ++ } ++ ++ bdev_virtio_io_cpl(io[i]); ++ } ++ ++ /* scan_ctx could have been freed while processing completions above, so ++ * we need to re-read the value again here into the local variable before ++ * using it. 
++ */ ++ scan_ctx = svdev->scan_ctx; ++ if (spdk_unlikely(scan_ctx && scan_ctx->needs_resend)) { ++ if (svdev->removed) { ++ _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); ++ return SPDK_POLLER_BUSY; ++ } else if (cnt == 0) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ rc = send_scan_io(scan_ctx); ++ if (rc != 0) { ++ assert(scan_ctx->retries > 0); ++ scan_ctx->retries--; ++ if (scan_ctx->retries == 0) { ++ SPDK_ERRLOG("Target scan failed unrecoverably with rc = %d.\n", rc); ++ _virtio_scsi_dev_scan_finish(scan_ctx, rc); ++ } ++ } ++ } ++ ++ return cnt; ++} ++ ++static void ++bdev_virtio_tmf_cpl_cb(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ ++ if (io_ctx->tmf_resp.response == VIRTIO_SCSI_S_OK) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ } else { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++} ++ ++static void ++bdev_virtio_tmf_cpl(struct spdk_bdev_io *bdev_io) ++{ ++ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_virtio_tmf_cpl_cb, bdev_io); ++} ++ ++static void ++bdev_virtio_eventq_io_cpl(struct virtio_scsi_dev *svdev, struct virtio_scsi_eventq_io *io) ++{ ++ struct virtio_scsi_event *ev = &io->ev; ++ struct virtio_scsi_disk *disk; ++ ++ if (ev->lun[0] != 1) { ++ SPDK_WARNLOG("Received an event with invalid data layout.\n"); ++ goto out; ++ } ++ ++ if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) { ++ ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED; ++ virtio_scsi_dev_scan(svdev, NULL, NULL); ++ } ++ ++ switch (ev->event) { ++ case VIRTIO_SCSI_T_NO_EVENT: ++ break; ++ case VIRTIO_SCSI_T_TRANSPORT_RESET: ++ switch (ev->reason) { ++ case VIRTIO_SCSI_EVT_RESET_RESCAN: ++ virtio_scsi_dev_scan_tgt(svdev, ev->lun[1]); ++ break; ++ case VIRTIO_SCSI_EVT_RESET_REMOVED: ++ disk = virtio_scsi_dev_get_disk_by_id(svdev, ev->lun[1]); ++ if (disk != NULL) { ++ spdk_bdev_unregister(&disk->bdev, NULL, NULL); ++ } ++ break; ++ default: ++ break; ++ } ++ break; ++ default: ++ break; ++ } ++ ++out: ++ virtio_scsi_dev_send_eventq_io(svdev->vdev.vqs[VIRTIO_SCSI_EVENTQ], io); ++} ++ ++static void ++bdev_virtio_tmf_abort_nomem_cb(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++} ++ ++static void ++bdev_virtio_tmf_abort_ioerr_cb(void *ctx) ++{ ++ struct spdk_bdev_io *bdev_io = ctx; ++ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++} ++ ++static void ++bdev_virtio_tmf_abort(struct spdk_bdev_io *bdev_io, int status) ++{ ++ spdk_msg_fn fn; ++ ++ if (status == -ENOMEM) { ++ fn = bdev_virtio_tmf_abort_nomem_cb; ++ } else { ++ fn = bdev_virtio_tmf_abort_ioerr_cb; ++ } ++ ++ spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), fn, bdev_io); ++} ++ ++static int ++bdev_virtio_send_tmf_io(struct virtqueue *ctrlq, struct spdk_bdev_io *bdev_io) ++{ ++ struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; ++ int rc; ++ ++ rc = virtqueue_req_start(ctrlq, bdev_io, 2); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); ++ virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); ++ ++ virtqueue_req_flush(ctrlq); ++ return 0; ++} ++ ++static int ++bdev_virtio_mgmt_poll(void *arg) ++{ ++ struct virtio_scsi_dev *svdev = arg; ++ struct virtio_dev *vdev = &svdev->vdev; ++ struct virtqueue *eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; ++ struct virtqueue *ctrlq = 
vdev->vqs[VIRTIO_SCSI_CONTROLQ]; ++ struct spdk_ring *send_ring = svdev->ctrlq_ring; ++ void *io[16]; ++ uint32_t io_len[16]; ++ uint16_t i, cnt; ++ int rc; ++ int total = 0; ++ ++ cnt = spdk_ring_dequeue(send_ring, io, SPDK_COUNTOF(io)); ++ total += cnt; ++ for (i = 0; i < cnt; ++i) { ++ rc = bdev_virtio_send_tmf_io(ctrlq, io[i]); ++ if (rc != 0) { ++ bdev_virtio_tmf_abort(io[i], rc); ++ } ++ } ++ ++ cnt = virtio_recv_pkts(ctrlq, io, io_len, SPDK_COUNTOF(io)); ++ total += cnt; ++ for (i = 0; i < cnt; ++i) { ++ bdev_virtio_tmf_cpl(io[i]); ++ } ++ ++ cnt = virtio_recv_pkts(eventq, io, io_len, SPDK_COUNTOF(io)); ++ total += cnt; ++ for (i = 0; i < cnt; ++i) { ++ bdev_virtio_eventq_io_cpl(svdev, io[i]); ++ } ++ ++ return total; ++} ++ ++static int ++bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct virtio_scsi_dev *svdev = io_device; ++ struct virtio_dev *vdev = &svdev->vdev; ++ struct bdev_virtio_io_channel *ch = ctx_buf; ++ struct virtqueue *vq; ++ int32_t queue_idx; ++ ++ queue_idx = virtio_dev_find_and_acquire_queue(vdev, VIRTIO_SCSI_REQUESTQ); ++ if (queue_idx < 0) { ++ SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); ++ return -1; ++ } ++ ++ vq = vdev->vqs[queue_idx]; ++ ++ ch->svdev = svdev; ++ ch->vq = vq; ++ ++ ch->poller = SPDK_POLLER_REGISTER(bdev_virtio_poll, ch, 0); ++ ++ return 0; ++} ++ ++static void ++bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_virtio_io_channel *ch = ctx_buf; ++ struct virtio_scsi_dev *svdev = ch->svdev; ++ struct virtio_dev *vdev = &svdev->vdev; ++ struct virtqueue *vq = ch->vq; ++ ++ spdk_poller_unregister(&ch->poller); ++ virtio_dev_release_queue(vdev, vq->vq_queue_index); ++} ++ ++static void ++_virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum) ++{ ++ struct virtio_scsi_dev *svdev = base->svdev; ++ size_t bdevs_cnt; ++ struct spdk_bdev *bdevs[BDEV_VIRTIO_MAX_TARGET]; ++ struct virtio_scsi_disk *disk; ++ struct virtio_scsi_scan_info *tgt, *next_tgt; ++ ++ spdk_put_io_channel(spdk_io_channel_from_ctx(base->channel)); ++ base->svdev->scan_ctx = NULL; ++ ++ TAILQ_FOREACH_SAFE(tgt, &base->scan_queue, tailq, next_tgt) { ++ TAILQ_REMOVE(&base->scan_queue, tgt, tailq); ++ free(tgt); ++ } ++ ++ if (base->cb_fn == NULL) { ++ spdk_free(base); ++ return; ++ } ++ ++ bdevs_cnt = 0; ++ if (errnum == 0) { ++ TAILQ_FOREACH(disk, &svdev->luns, link) { ++ bdevs[bdevs_cnt] = &disk->bdev; ++ bdevs_cnt++; ++ } ++ } ++ ++ base->cb_fn(base->cb_arg, errnum, bdevs, bdevs_cnt); ++ spdk_free(base); ++} ++ ++static int ++send_scan_io(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_io_ctx *io_ctx = &base->io_ctx; ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct virtqueue *vq = base->channel->vq; ++ int payload_iov_cnt = base->iov.iov_len > 0 ? 
1 : 0; ++ int rc; ++ ++ req->lun[0] = 1; ++ req->lun[1] = base->info.target; ++ ++ rc = virtqueue_req_start(vq, io_ctx, 2 + payload_iov_cnt); ++ if (rc != 0) { ++ base->needs_resend = true; ++ return -1; ++ } ++ ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); ++ virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); ++ virtqueue_req_add_iovs(vq, &base->iov, payload_iov_cnt, SPDK_VIRTIO_DESC_WR); ++ ++ virtqueue_req_flush(vq); ++ return 0; ++} ++ ++static int ++send_inquiry(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct spdk_scsi_cdb_inquiry *cdb; ++ ++ memset(req, 0, sizeof(*req)); ++ ++ base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; ++ cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; ++ cdb->opcode = SPDK_SPC_INQUIRY; ++ to_be16(cdb->alloc_len, BDEV_VIRTIO_SCAN_PAYLOAD_SIZE); ++ ++ return send_scan_io(base); ++} ++ ++static int ++send_inquiry_vpd(struct virtio_scsi_scan_base *base, uint8_t page_code) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; ++ ++ memset(req, 0, sizeof(*req)); ++ ++ base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; ++ inquiry_cdb->opcode = SPDK_SPC_INQUIRY; ++ inquiry_cdb->evpd = 1; ++ inquiry_cdb->page_code = page_code; ++ to_be16(inquiry_cdb->alloc_len, base->iov.iov_len); ++ ++ return send_scan_io(base); ++} ++ ++static int ++send_read_cap_10(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ ++ memset(req, 0, sizeof(*req)); ++ ++ base->iov.iov_len = 8; ++ req->cdb[0] = SPDK_SBC_READ_CAPACITY_10; ++ ++ return send_scan_io(base); ++} ++ ++static int ++send_read_cap_16(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ ++ memset(req, 0, sizeof(*req)); ++ ++ base->iov.iov_len = 32; ++ req->cdb[0] = SPDK_SPC_SERVICE_ACTION_IN_16; ++ req->cdb[1] = SPDK_SBC_SAI_READ_CAPACITY_16; ++ to_be32(&req->cdb[10], base->iov.iov_len); ++ ++ return send_scan_io(base); ++} ++ ++static int ++send_test_unit_ready(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ ++ memset(req, 0, sizeof(*req)); ++ req->cdb[0] = SPDK_SPC_TEST_UNIT_READY; ++ base->iov.iov_len = 0; ++ ++ return send_scan_io(base); ++} ++ ++static int ++send_start_stop_unit(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ ++ memset(req, 0, sizeof(*req)); ++ req->cdb[0] = SPDK_SBC_START_STOP_UNIT; ++ req->cdb[4] = SPDK_SBC_START_STOP_UNIT_START_BIT; ++ base->iov.iov_len = 0; ++ ++ return send_scan_io(base); ++} ++ ++static int ++process_scan_start_stop_unit(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ ++ if (resp->status == SPDK_SCSI_STATUS_GOOD) { ++ return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); ++ } ++ ++ return -1; ++} ++ ++static int ++process_scan_test_unit_ready(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ int sk, asc, ascq; ++ ++ get_scsi_status(resp, &sk, &asc, &ascq); ++ ++ /* check response, get VPD if spun up otherwise send SSU */ ++ if (resp->status == SPDK_SCSI_STATUS_GOOD) { ++ return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); ++ } else if (resp->response == VIRTIO_SCSI_S_OK && ++ resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && ++ sk == SPDK_SCSI_SENSE_UNIT_ATTENTION 
&& ++ asc == SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY) { ++ return send_start_stop_unit(base); ++ } else { ++ return -1; ++ } ++} ++ ++static int ++process_scan_inquiry_standard(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ struct spdk_scsi_cdb_inquiry_data *inquiry_data = ++ (struct spdk_scsi_cdb_inquiry_data *)base->payload; ++ ++ if (resp->status != SPDK_SCSI_STATUS_GOOD) { ++ return -1; ++ } ++ ++ /* check to make sure its a supported device */ ++ if (inquiry_data->peripheral_device_type != SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK || ++ inquiry_data->peripheral_qualifier != SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED) { ++ SPDK_WARNLOG("Unsupported peripheral device type 0x%02x (qualifier 0x%02x)\n", ++ inquiry_data->peripheral_device_type, ++ inquiry_data->peripheral_qualifier); ++ return -1; ++ } ++ ++ return send_test_unit_ready(base); ++} ++ ++static int ++process_scan_inquiry_vpd_supported_vpd_pages(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ bool block_provisioning_page_supported = false; ++ ++ if (resp->status == SPDK_SCSI_STATUS_GOOD) { ++ const uint8_t *vpd_data = base->payload; ++ const uint8_t *supported_vpd_pages = vpd_data + 4; ++ uint16_t page_length; ++ uint16_t num_supported_pages; ++ uint16_t i; ++ ++ page_length = from_be16(vpd_data + 2); ++ num_supported_pages = spdk_min(page_length, base->iov.iov_len - 4); ++ ++ for (i = 0; i < num_supported_pages; i++) { ++ if (supported_vpd_pages[i] == SPDK_SPC_VPD_BLOCK_THIN_PROVISION) { ++ block_provisioning_page_supported = true; ++ break; ++ } ++ } ++ } ++ ++ if (block_provisioning_page_supported) { ++ return send_inquiry_vpd(base, SPDK_SPC_VPD_BLOCK_THIN_PROVISION); ++ } else { ++ return send_read_cap_10(base); ++ } ++} ++ ++static int ++process_scan_inquiry_vpd_block_thin_provision(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ ++ base->info.unmap_supported = false; ++ ++ if (resp->status == SPDK_SCSI_STATUS_GOOD) { ++ uint8_t *vpd_data = base->payload; ++ ++ base->info.unmap_supported = !!(vpd_data[5] & SPDK_SCSI_UNMAP_LBPU); ++ } ++ ++ SPDK_INFOLOG(virtio, "Target %u: unmap supported = %d\n", ++ base->info.target, (int)base->info.unmap_supported); ++ ++ return send_read_cap_10(base); ++} ++ ++static int ++process_scan_inquiry(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; ++ ++ if ((inquiry_cdb->evpd & 1) == 0) { ++ return process_scan_inquiry_standard(base); ++ } ++ ++ switch (inquiry_cdb->page_code) { ++ case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: ++ return process_scan_inquiry_vpd_supported_vpd_pages(base); ++ case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: ++ return process_scan_inquiry_vpd_block_thin_provision(base); ++ default: ++ SPDK_DEBUGLOG(virtio, "Unexpected VPD page 0x%02x\n", inquiry_cdb->page_code); ++ return -1; ++ } ++} ++ ++static void ++bdev_virtio_disk_notify_remove(struct virtio_scsi_disk *disk) ++{ ++ disk->removed = true; ++ spdk_bdev_close(disk->notify_desc); ++} ++ ++static void ++bdev_virtio_disk_notify_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ bdev_virtio_disk_notify_remove(event_ctx); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++/* To be called only from the 
thread performing target scan */ ++static int ++virtio_scsi_dev_add_tgt(struct virtio_scsi_dev *svdev, struct virtio_scsi_scan_info *info) ++{ ++ struct virtio_scsi_disk *disk; ++ struct spdk_bdev *bdev; ++ int rc; ++ ++ TAILQ_FOREACH(disk, &svdev->luns, link) { ++ if (disk->info.target == info->target) { ++ /* Target is already attached and param change is not supported */ ++ return 0; ++ } ++ } ++ ++ if (info->block_size == 0 || info->num_blocks == 0) { ++ SPDK_ERRLOG("%s: invalid target %u: bs=%"PRIu32" blocks=%"PRIu64"\n", ++ svdev->vdev.name, info->target, info->block_size, info->num_blocks); ++ return -EINVAL; ++ } ++ ++ disk = calloc(1, sizeof(*disk)); ++ if (disk == NULL) { ++ SPDK_ERRLOG("could not allocate disk\n"); ++ return -ENOMEM; ++ } ++ ++ disk->svdev = svdev; ++ memcpy(&disk->info, info, sizeof(*info)); ++ ++ bdev = &disk->bdev; ++ bdev->name = spdk_sprintf_alloc("%st%"PRIu8, svdev->vdev.name, info->target); ++ if (bdev->name == NULL) { ++ SPDK_ERRLOG("Couldn't alloc memory for the bdev name.\n"); ++ free(disk); ++ return -ENOMEM; ++ } ++ ++ bdev->product_name = "Virtio SCSI Disk"; ++ bdev->write_cache = 0; ++ bdev->blocklen = disk->info.block_size; ++ bdev->blockcnt = disk->info.num_blocks; ++ ++ bdev->ctxt = disk; ++ bdev->fn_table = &virtio_fn_table; ++ bdev->module = &virtio_scsi_if; ++ ++ rc = spdk_bdev_register(&disk->bdev); ++ if (rc) { ++ SPDK_ERRLOG("Failed to register bdev name=%s\n", disk->bdev.name); ++ free(bdev->name); ++ free(disk); ++ return rc; ++ } ++ ++ rc = spdk_bdev_open_ext(bdev->name, false, bdev_virtio_disk_notify_event_cb, ++ disk, &disk->notify_desc); ++ if (rc) { ++ assert(false); ++ } ++ ++ TAILQ_INSERT_TAIL(&svdev->luns, disk, link); ++ return 0; ++} ++ ++static int ++process_read_cap_10(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ uint64_t max_block; ++ uint32_t block_size; ++ uint8_t target_id = req->lun[1]; ++ int rc; ++ ++ if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { ++ SPDK_ERRLOG("READ CAPACITY (10) failed for target %"PRIu8".\n", target_id); ++ return -1; ++ } ++ ++ block_size = from_be32(base->payload + 4); ++ max_block = from_be32(base->payload); ++ ++ if (max_block == 0xffffffff) { ++ return send_read_cap_16(base); ++ } ++ ++ base->info.num_blocks = (uint64_t)max_block + 1; ++ base->info.block_size = block_size; ++ ++ rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return _virtio_scsi_dev_scan_next(base, 0); ++} ++ ++static int ++process_read_cap_16(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ uint8_t target_id = req->lun[1]; ++ int rc; ++ ++ if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { ++ SPDK_ERRLOG("READ CAPACITY (16) failed for target %"PRIu8".\n", target_id); ++ return -1; ++ } ++ ++ base->info.num_blocks = from_be64(base->payload) + 1; ++ base->info.block_size = from_be32(base->payload + 8); ++ rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ return _virtio_scsi_dev_scan_next(base, 0); ++} ++ ++static void ++process_scan_resp(struct virtio_scsi_scan_base *base) ++{ ++ struct virtio_scsi_cmd_req *req = &base->io_ctx.req; ++ struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; ++ int rc, sk, asc, ascq; ++ uint8_t target_id; ++ ++ if 
(base->io_ctx.iov_req.iov_len < sizeof(struct virtio_scsi_cmd_req) || ++ base->io_ctx.iov_resp.iov_len < sizeof(struct virtio_scsi_cmd_resp)) { ++ SPDK_ERRLOG("Received target scan message with invalid length.\n"); ++ _virtio_scsi_dev_scan_next(base, -EIO); ++ return; ++ } ++ ++ get_scsi_status(resp, &sk, &asc, &ascq); ++ target_id = req->lun[1]; ++ ++ if (resp->response == VIRTIO_SCSI_S_BAD_TARGET || ++ resp->response == VIRTIO_SCSI_S_INCORRECT_LUN) { ++ _virtio_scsi_dev_scan_next(base, -ENODEV); ++ return; ++ } ++ ++ if (resp->response != VIRTIO_SCSI_S_OK || ++ (resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && ++ sk != SPDK_SCSI_SENSE_ILLEGAL_REQUEST)) { ++ assert(base->retries > 0); ++ base->retries--; ++ if (base->retries == 0) { ++ SPDK_NOTICELOG("Target %"PRIu8" is present, but unavailable.\n", target_id); ++ SPDK_LOGDUMP(virtio, "CDB", req->cdb, sizeof(req->cdb)); ++ SPDK_LOGDUMP(virtio, "SENSE DATA", resp->sense, sizeof(resp->sense)); ++ _virtio_scsi_dev_scan_next(base, -EBUSY); ++ return; ++ } ++ ++ /* resend the same request */ ++ rc = send_scan_io(base); ++ if (rc != 0) { ++ /* Let response poller do the resend */ ++ } ++ return; ++ } ++ ++ base->retries = SCAN_REQUEST_RETRIES; ++ ++ switch (req->cdb[0]) { ++ case SPDK_SPC_INQUIRY: ++ rc = process_scan_inquiry(base); ++ break; ++ case SPDK_SPC_TEST_UNIT_READY: ++ rc = process_scan_test_unit_ready(base); ++ break; ++ case SPDK_SBC_START_STOP_UNIT: ++ rc = process_scan_start_stop_unit(base); ++ break; ++ case SPDK_SBC_READ_CAPACITY_10: ++ rc = process_read_cap_10(base); ++ break; ++ case SPDK_SPC_SERVICE_ACTION_IN_16: ++ rc = process_read_cap_16(base); ++ break; ++ default: ++ SPDK_ERRLOG("Received invalid target scan message: cdb[0] = %"PRIu8".\n", req->cdb[0]); ++ rc = -1; ++ break; ++ } ++ ++ if (rc != 0) { ++ if (base->needs_resend) { ++ return; /* Let response poller do the resend */ ++ } ++ ++ _virtio_scsi_dev_scan_next(base, rc); ++ } ++} ++ ++static int ++_virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc) ++{ ++ struct virtio_scsi_scan_info *next; ++ struct virtio_scsi_disk *disk; ++ uint8_t target_id; ++ ++ if (base->full_scan) { ++ if (rc != 0) { ++ disk = virtio_scsi_dev_get_disk_by_id(base->svdev, ++ base->info.target); ++ if (disk != NULL) { ++ spdk_bdev_unregister(&disk->bdev, NULL, NULL); ++ } ++ } ++ ++ target_id = base->info.target + 1; ++ if (target_id < BDEV_VIRTIO_MAX_TARGET) { ++ _virtio_scsi_dev_scan_tgt(base, target_id); ++ return 0; ++ } ++ ++ base->full_scan = false; ++ } ++ ++ next = TAILQ_FIRST(&base->scan_queue); ++ if (next == NULL) { ++ _virtio_scsi_dev_scan_finish(base, 0); ++ return 0; ++ } ++ ++ TAILQ_REMOVE(&base->scan_queue, next, tailq); ++ target_id = next->target; ++ free(next); ++ ++ _virtio_scsi_dev_scan_tgt(base, target_id); ++ return 0; ++} ++ ++static int ++_virtio_scsi_dev_scan_init(struct virtio_scsi_dev *svdev) ++{ ++ struct virtio_scsi_scan_base *base; ++ struct spdk_io_channel *io_ch; ++ struct virtio_scsi_io_ctx *io_ctx; ++ struct virtio_scsi_cmd_req *req; ++ struct virtio_scsi_cmd_resp *resp; ++ ++ io_ch = spdk_get_io_channel(svdev); ++ if (io_ch == NULL) { ++ return -EBUSY; ++ } ++ ++ base = spdk_zmalloc(sizeof(*base), 64, NULL, ++ SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); ++ if (base == NULL) { ++ SPDK_ERRLOG("couldn't allocate memory for scsi target scan.\n"); ++ return -ENOMEM; ++ } ++ ++ base->svdev = svdev; ++ ++ base->channel = spdk_io_channel_get_ctx(io_ch); ++ TAILQ_INIT(&base->scan_queue); ++ svdev->scan_ctx = base; ++ ++ base->iov.iov_base = 
base->payload; ++ io_ctx = &base->io_ctx; ++ req = &io_ctx->req; ++ resp = &io_ctx->resp; ++ io_ctx->iov_req.iov_base = req; ++ io_ctx->iov_req.iov_len = sizeof(*req); ++ io_ctx->iov_resp.iov_base = resp; ++ io_ctx->iov_resp.iov_len = sizeof(*resp); ++ ++ base->retries = SCAN_REQUEST_RETRIES; ++ return 0; ++} ++ ++static void ++_virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target) ++{ ++ int rc; ++ ++ memset(&base->info, 0, sizeof(base->info)); ++ base->info.target = target; ++ ++ rc = send_inquiry(base); ++ if (rc) { ++ /* Let response poller do the resend */ ++ } ++} ++ ++static int ++virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, bdev_virtio_create_cb cb_fn, ++ void *cb_arg) ++{ ++ struct virtio_scsi_scan_base *base; ++ struct virtio_scsi_scan_info *tgt, *next_tgt; ++ int rc; ++ ++ if (svdev->scan_ctx) { ++ if (svdev->scan_ctx->full_scan) { ++ return -EEXIST; ++ } ++ ++ /* We're about to start a full rescan, so there's no need ++ * to scan particular targets afterwards. ++ */ ++ TAILQ_FOREACH_SAFE(tgt, &svdev->scan_ctx->scan_queue, tailq, next_tgt) { ++ TAILQ_REMOVE(&svdev->scan_ctx->scan_queue, tgt, tailq); ++ free(tgt); ++ } ++ ++ svdev->scan_ctx->cb_fn = cb_fn; ++ svdev->scan_ctx->cb_arg = cb_arg; ++ svdev->scan_ctx->restart = true; ++ return 0; ++ } ++ ++ rc = _virtio_scsi_dev_scan_init(svdev); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ base = svdev->scan_ctx; ++ base->cb_fn = cb_fn; ++ base->cb_arg = cb_arg; ++ base->full_scan = true; ++ ++ _virtio_scsi_dev_scan_tgt(base, 0); ++ return 0; ++} ++ ++static int ++virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target) ++{ ++ struct virtio_scsi_scan_base *base; ++ struct virtio_scsi_scan_info *info; ++ int rc; ++ ++ base = svdev->scan_ctx; ++ if (base) { ++ info = calloc(1, sizeof(*info)); ++ if (info == NULL) { ++ SPDK_ERRLOG("calloc failed\n"); ++ return -ENOMEM; ++ } ++ ++ info->target = target; ++ TAILQ_INSERT_TAIL(&base->scan_queue, info, tailq); ++ return 0; ++ } ++ ++ rc = _virtio_scsi_dev_scan_init(svdev); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ base = svdev->scan_ctx; ++ base->full_scan = true; ++ _virtio_scsi_dev_scan_tgt(base, target); ++ return 0; ++} ++ ++static int ++bdev_virtio_initialize(void) ++{ ++ return 0; ++} ++ ++static void ++_virtio_scsi_dev_unregister_cb(void *io_device) ++{ ++ struct virtio_scsi_dev *svdev = io_device; ++ struct virtio_dev *vdev = &svdev->vdev; ++ bool finish_module; ++ bdev_virtio_remove_cb remove_cb; ++ void *remove_ctx; ++ ++ assert(spdk_ring_count(svdev->ctrlq_ring) == 0); ++ spdk_ring_free(svdev->ctrlq_ring); ++ spdk_poller_unregister(&svdev->mgmt_poller); ++ ++ virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); ++ virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); ++ ++ virtio_dev_stop(vdev); ++ virtio_dev_destruct(vdev); ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ TAILQ_REMOVE(&g_virtio_scsi_devs, svdev, tailq); ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ ++ remove_cb = svdev->remove_cb; ++ remove_ctx = svdev->remove_ctx; ++ spdk_free(svdev->eventq_ios); ++ free(svdev); ++ ++ if (remove_cb) { ++ remove_cb(remove_ctx, 0); ++ } ++ ++ finish_module = TAILQ_EMPTY(&g_virtio_scsi_devs); ++ ++ if (g_bdev_virtio_finish && finish_module) { ++ spdk_bdev_module_fini_done(); ++ } ++} ++ ++static void ++virtio_scsi_dev_unregister_cb(void *io_device) ++{ ++ struct virtio_scsi_dev *svdev = io_device; ++ struct spdk_thread *thread; ++ ++ thread = virtio_dev_queue_get_thread(&svdev->vdev, VIRTIO_SCSI_CONTROLQ); ++ 
spdk_thread_send_msg(thread, _virtio_scsi_dev_unregister_cb, io_device); ++} ++ ++static void ++virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, ++ bdev_virtio_remove_cb cb_fn, void *cb_arg) ++{ ++ struct virtio_scsi_disk *disk, *disk_tmp; ++ bool do_remove = true; ++ ++ if (svdev->removed) { ++ if (cb_fn) { ++ cb_fn(cb_arg, -EBUSY); ++ } ++ return; ++ } ++ ++ svdev->remove_cb = cb_fn; ++ svdev->remove_ctx = cb_arg; ++ svdev->removed = true; ++ ++ if (svdev->scan_ctx) { ++ /* The removal will continue after we receive a pending scan I/O. */ ++ return; ++ } ++ ++ TAILQ_FOREACH_SAFE(disk, &svdev->luns, link, disk_tmp) { ++ if (!disk->removed) { ++ spdk_bdev_unregister(&disk->bdev, NULL, NULL); ++ } ++ do_remove = false; ++ } ++ ++ if (do_remove) { ++ spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); ++ } ++} ++ ++static void ++bdev_virtio_finish(void) ++{ ++ struct virtio_scsi_dev *svdev, *next; ++ ++ g_bdev_virtio_finish = true; ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ spdk_bdev_module_fini_done(); ++ return; ++ } ++ ++ /* Defer module finish until all controllers are removed. */ ++ TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next) { ++ virtio_scsi_dev_remove(svdev, NULL, NULL); ++ } ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++} ++ ++int ++bdev_virtio_user_scsi_dev_create(const char *base_name, const char *path, ++ unsigned num_queues, unsigned queue_size, ++ bdev_virtio_create_cb cb_fn, void *cb_arg) ++{ ++ struct virtio_scsi_dev *svdev; ++ int rc; ++ ++ svdev = virtio_user_scsi_dev_create(base_name, path, num_queues, queue_size); ++ if (svdev == NULL) { ++ return -1; ++ } ++ ++ rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); ++ if (rc) { ++ virtio_scsi_dev_remove(svdev, NULL, NULL); ++ } ++ ++ return rc; ++} ++ ++int ++bdev_vfio_user_scsi_dev_create(const char *base_name, const char *path, ++ bdev_virtio_create_cb cb_fn, void *cb_arg) ++{ ++ struct virtio_scsi_dev *svdev; ++ uint32_t num_queues = 0; ++ int rc; ++ ++ svdev = calloc(1, sizeof(*svdev)); ++ if (svdev == NULL) { ++ SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", base_name, path); ++ return -ENOMEM; ++ } ++ ++ rc = virtio_vfio_user_dev_init(&svdev->vdev, base_name, path); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to create %s as virtio device\n", path); ++ free(svdev); ++ return -EFAULT; ++ } ++ ++ rc = virtio_dev_read_dev_config(&svdev->vdev, offsetof(struct virtio_scsi_config, num_queues), ++ &num_queues, sizeof(num_queues)); ++ if (rc) { ++ SPDK_ERRLOG("%s: config read failed: %s\n", base_name, spdk_strerror(-rc)); ++ virtio_dev_destruct(&svdev->vdev); ++ free(svdev); ++ return rc; ++ } ++ ++ if (num_queues < SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED) { ++ SPDK_ERRLOG("%s: invalid num_queues %u\n", base_name, num_queues); ++ virtio_dev_destruct(&svdev->vdev); ++ free(svdev); ++ return -EINVAL; ++ } ++ ++ rc = virtio_scsi_dev_init(svdev, num_queues, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); ++ if (rc != 0) { ++ virtio_dev_destruct(&svdev->vdev); ++ free(svdev); ++ return -EFAULT; ++ } ++ ++ rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); ++ if (rc) { ++ virtio_scsi_dev_remove(svdev, NULL, NULL); ++ } ++ ++ return rc; ++} ++ ++struct bdev_virtio_pci_dev_create_ctx { ++ const char *name; ++ bdev_virtio_create_cb cb_fn; ++ void *cb_arg; ++}; ++ ++static int ++bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) ++{ ++ struct virtio_scsi_dev *svdev; ++ struct 
bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; ++ int rc; ++ ++ svdev = virtio_pci_scsi_dev_create(create_ctx->name, pci_ctx); ++ if (svdev == NULL) { ++ return -1; ++ } ++ ++ rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg); ++ if (rc) { ++ svdev->vdev.ctx = NULL; ++ virtio_scsi_dev_remove(svdev, NULL, NULL); ++ } ++ ++ return rc; ++} ++ ++int ++bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, ++ bdev_virtio_create_cb cb_fn, void *cb_arg) ++{ ++ struct bdev_virtio_pci_dev_create_ctx create_ctx; ++ ++ create_ctx.name = name; ++ create_ctx.cb_fn = cb_fn; ++ create_ctx.cb_arg = cb_arg; ++ ++ return virtio_pci_dev_attach(bdev_virtio_pci_scsi_dev_create_cb, &create_ctx, ++ VIRTIO_ID_SCSI, pci_addr); ++} ++ ++int ++bdev_virtio_scsi_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) ++{ ++ struct virtio_scsi_dev *svdev; ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { ++ if (strcmp(svdev->vdev.name, name) == 0) { ++ break; ++ } ++ } ++ ++ if (svdev == NULL) { ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ SPDK_ERRLOG("Cannot find Virtio-SCSI device named '%s'\n", name); ++ return -ENODEV; ++ } ++ ++ virtio_scsi_dev_remove(svdev, cb_fn, cb_arg); ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ ++ return 0; ++} ++ ++void ++bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *w) ++{ ++ struct virtio_scsi_dev *svdev; ++ ++ spdk_json_write_array_begin(w); ++ ++ pthread_mutex_lock(&g_virtio_scsi_mutex); ++ TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "name", svdev->vdev.name); ++ ++ virtio_dev_dump_json_info(&svdev->vdev, w); ++ ++ spdk_json_write_object_end(w); ++ } ++ pthread_mutex_unlock(&g_virtio_scsi_mutex); ++ ++ spdk_json_write_array_end(w); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(virtio) +diff --git a/module/bdev/xnvme/Makefile b/module/bdev/xnvme/Makefile +index a44d27a..e7aa739 100644 +--- a/module/bdev/xnvme/Makefile ++++ b/module/bdev/xnvme/Makefile +@@ -1,19 +1,19 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (c) Samsung Electronics Co., Ltd. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = bdev_xnvme.c bdev_xnvme_rpc.c +-LIBNAME = bdev_xnvme +- +-CFLAGS += -I$(SPDK_ROOT_DIR)/xnvme/include +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (c) Samsung Electronics Co., Ltd. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = bdev_xnvme.c bdev_xnvme_rpc.c ++LIBNAME = bdev_xnvme ++ ++CFLAGS += -I$(SPDK_ROOT_DIR)/xnvme/include ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/xnvme/bdev_xnvme.c b/module/bdev/xnvme/bdev_xnvme.c +index 399288a..4ad52d0 100644 +--- a/module/bdev/xnvme/bdev_xnvme.c ++++ b/module/bdev/xnvme/bdev_xnvme.c +@@ -1,484 +1,484 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * Copyright (c) Samsung Electronics Co., Ltd. +- * All rights reserved. 
+- */ +- +-#include "libxnvme.h" +-#include "libxnvme_pp.h" +- +-#include "bdev_xnvme.h" +- +-#include "spdk/stdinc.h" +- +-#include "spdk/barrier.h" +-#include "spdk/bdev.h" +-#include "spdk/env.h" +-#include "spdk/fd.h" +-#include "spdk/likely.h" +-#include "spdk/thread.h" +-#include "spdk/json.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +- +-#include "spdk/log.h" +- +-struct bdev_xnvme_io_channel { +- struct xnvme_queue *queue; +- struct spdk_poller *poller; +-}; +- +-struct bdev_xnvme_task { +- struct bdev_xnvme_io_channel *ch; +- TAILQ_ENTRY(bdev_xnvme_task) link; +-}; +- +-struct bdev_xnvme { +- struct spdk_bdev bdev; +- char *filename; +- char *io_mechanism; +- struct xnvme_dev *dev; +- uint32_t nsid; +- bool conserve_cpu; +- +- TAILQ_ENTRY(bdev_xnvme) link; +-}; +- +-static int bdev_xnvme_init(void); +-static void bdev_xnvme_fini(void); +-static void bdev_xnvme_free(struct bdev_xnvme *xnvme); +-static TAILQ_HEAD(, bdev_xnvme) g_xnvme_bdev_head = TAILQ_HEAD_INITIALIZER(g_xnvme_bdev_head); +- +-static int +-bdev_xnvme_get_ctx_size(void) +-{ +- return sizeof(struct bdev_xnvme_task); +-} +- +-static int +-bdev_xnvme_config_json(struct spdk_json_write_ctx *w) +-{ +- struct bdev_xnvme *xnvme; +- +- TAILQ_FOREACH(xnvme, &g_xnvme_bdev_head, link) { +- spdk_json_write_object_begin(w); +- +- spdk_json_write_named_string(w, "method", "bdev_xnvme_create"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", xnvme->bdev.name); +- spdk_json_write_named_string(w, "filename", xnvme->filename); +- spdk_json_write_named_string(w, "io_mechanism", xnvme->io_mechanism); +- spdk_json_write_named_bool(w, "conserve_cpu", xnvme->conserve_cpu); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +- } +- +- return 0; +-} +- +-static struct spdk_bdev_module xnvme_if = { +- .name = "xnvme", +- .module_init = bdev_xnvme_init, +- .module_fini = bdev_xnvme_fini, +- .get_ctx_size = bdev_xnvme_get_ctx_size, +- .config_json = bdev_xnvme_config_json, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(xnvme, &xnvme_if) +- +-static struct spdk_io_channel * +-bdev_xnvme_get_io_channel(void *ctx) +-{ +- struct bdev_xnvme *xnvme = ctx; +- +- return spdk_get_io_channel(xnvme); +-} +- +-static bool +-bdev_xnvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- return true; +- default: +- return false; +- } +-} +- +-static void +-bdev_xnvme_destruct_cb(void *io_device) +-{ +- struct bdev_xnvme *xnvme = io_device; +- +- TAILQ_REMOVE(&g_xnvme_bdev_head, xnvme, link); +- bdev_xnvme_free(xnvme); +-} +- +-static int +-bdev_xnvme_destruct(void *ctx) +-{ +- struct bdev_xnvme *xnvme = ctx; +- +- spdk_io_device_unregister(xnvme, bdev_xnvme_destruct_cb); +- +- return 0; +-} +- +-static void +-bdev_xnvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) +-{ +- struct bdev_xnvme_task *xnvme_task = (struct bdev_xnvme_task *)bdev_io->driver_ctx; +- struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev_io->bdev->ctxt; +- struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch); +- struct xnvme_cmd_ctx *ctx = xnvme_queue_get_cmd_ctx(xnvme_ch->queue); +- int err; +- +- if (!success) { +- xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- SPDK_DEBUGLOG(xnvme, "bdev_io : %p, iov_cnt : %d, bdev_xnvme_task : %p\n", +- bdev_io, bdev_io->u.bdev.iovcnt, (struct 
bdev_xnvme_task *)bdev_io->driver_ctx); +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_READ: +- ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ; +- ctx->cmd.common.nsid = xnvme->nsid; +- ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1; +- ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks; +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE; +- ctx->cmd.common.nsid = xnvme->nsid; +- ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1; +- ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks; +- break; +- +- default: +- SPDK_ERRLOG("Wrong io type\n"); +- +- xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +- +- xnvme_task->ch = xnvme_ch; +- ctx->async.cb_arg = xnvme_task; +- +- err = xnvme_cmd_passv(ctx, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.num_blocks * xnvme->bdev.blocklen, NULL, 0, 0); +- +- switch (err) { +- /* Submission success! */ +- case 0: +- SPDK_DEBUGLOG(xnvme, "io_channel : %p, iovcnt:%d, nblks: %lu off: %#lx\n", +- xnvme_ch, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.offset_blocks); +- return; +- +- /* Submission failed: queue is full or no memory => Queue the I/O in bdev layer */ +- case -EBUSY: +- case -EAGAIN: +- case -ENOMEM: +- SPDK_WARNLOG("Start to queue I/O for xnvme bdev\n"); +- +- xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- return; +- +- /* Submission failed: unexpected error, put the command-context back in the queue */ +- default: +- SPDK_ERRLOG("bdev_xnvme_cmd_passv : Submission failed: unexpected error\n"); +- +- xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- return; +- } +-} +- +-static void +-bdev_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- switch (bdev_io->type) { +- /* Read and write operations must be performed on buffers aligned to +- * bdev->required_alignment. If user specified unaligned buffers, +- * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. 
*/ +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_WRITE: +- spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb, +- bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); +- break; +- +- default: +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- break; +- } +-} +- +-static const struct spdk_bdev_fn_table xnvme_fn_table = { +- .destruct = bdev_xnvme_destruct, +- .submit_request = bdev_xnvme_submit_request, +- .io_type_supported = bdev_xnvme_io_type_supported, +- .get_io_channel = bdev_xnvme_get_io_channel, +-}; +- +-static void +-bdev_xnvme_free(struct bdev_xnvme *xnvme) +-{ +- assert(xnvme != NULL); +- +- xnvme_dev_close(xnvme->dev); +- free(xnvme->io_mechanism); +- free(xnvme->filename); +- free(xnvme->bdev.name); +- free(xnvme); +-} +- +-static void +-bdev_xnvme_cmd_cb(struct xnvme_cmd_ctx *ctx, void *cb_arg) +-{ +- struct bdev_xnvme_task *xnvme_task = ctx->async.cb_arg; +- enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; +- +- SPDK_DEBUGLOG(xnvme, "xnvme_task : %p\n", xnvme_task); +- +- if (xnvme_cmd_ctx_cpl_status(ctx)) { +- SPDK_ERRLOG("xNVMe I/O Failed\n"); +- xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF); +- status = SPDK_BDEV_IO_STATUS_FAILED; +- } +- +- spdk_bdev_io_complete(spdk_bdev_io_from_ctx(xnvme_task), status); +- +- /* Completed: Put the command- context back in the queue */ +- xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); +-} +- +-static int +-bdev_xnvme_poll(void *arg) +-{ +- struct bdev_xnvme_io_channel *ch = arg; +- int rc; +- +- rc = xnvme_queue_poke(ch->queue, 0); +- if (rc < 0) { +- SPDK_ERRLOG("xnvme_queue_poke failure rc : %d\n", rc); +- return SPDK_POLLER_BUSY; +- } +- +- return xnvme_queue_get_outstanding(ch->queue) ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +-} +- +-static int +-bdev_xnvme_queue_create_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_xnvme *xnvme = io_device; +- struct bdev_xnvme_io_channel *ch = ctx_buf; +- int rc; +- int qd = 512; +- +- rc = xnvme_queue_init(xnvme->dev, qd, 0, &ch->queue); +- if (rc) { +- SPDK_ERRLOG("xnvme_queue_init failure: %d\n", rc); +- return 1; +- } +- +- xnvme_queue_set_cb(ch->queue, bdev_xnvme_cmd_cb, ch); +- +- ch->poller = SPDK_POLLER_REGISTER(bdev_xnvme_poll, ch, 0); +- +- return 0; +-} +- +-static void +-bdev_xnvme_queue_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct bdev_xnvme_io_channel *ch = ctx_buf; +- +- spdk_poller_unregister(&ch->poller); +- +- xnvme_queue_term(ch->queue); +-} +- +-struct spdk_bdev * +-create_xnvme_bdev(const char *name, const char *filename, const char *io_mechanism, +- bool conserve_cpu) +-{ +- struct bdev_xnvme *xnvme; +- uint32_t block_size; +- uint64_t bdev_size; +- int rc; +- struct xnvme_opts opts = xnvme_opts_default(); +- +- xnvme = calloc(1, sizeof(*xnvme)); +- if (!xnvme) { +- SPDK_ERRLOG("Unable to allocate enough memory for xNVMe backend\n"); +- return NULL; +- } +- +- opts.direct = 1; +- opts.async = io_mechanism; +- if (!opts.async) { +- goto error_return; +- } +- xnvme->io_mechanism = strdup(io_mechanism); +- if (!xnvme->io_mechanism) { +- goto error_return; +- } +- +- if (!conserve_cpu) { +- if (!strcmp(xnvme->io_mechanism, "libaio")) { +- opts.poll_io = 1; +- } else if (!strcmp(xnvme->io_mechanism, "io_uring")) { +- opts.poll_io = 1; +- } else if (!strcmp(xnvme->io_mechanism, "io_uring_cmd")) { +- opts.poll_sq = 1; +- } +- } +- +- xnvme->filename = strdup(filename); +- if (!xnvme->filename) { +- goto error_return; +- } +- +- xnvme->dev = xnvme_dev_open(xnvme->filename, &opts); +- if (!xnvme->dev) { +- SPDK_ERRLOG("Unable to open 
xNVMe device %s\n", filename); +- goto error_return; +- } +- +- xnvme->nsid = xnvme_dev_get_nsid(xnvme->dev); +- +- bdev_size = xnvme_dev_get_geo(xnvme->dev)->tbytes; +- block_size = xnvme_dev_get_geo(xnvme->dev)->nbytes; +- +- xnvme->bdev.name = strdup(name); +- if (!xnvme->bdev.name) { +- goto error_return; +- } +- +- xnvme->bdev.product_name = "xNVMe bdev"; +- xnvme->bdev.module = &xnvme_if; +- +- xnvme->bdev.write_cache = 0; +- +- if (block_size == 0) { +- SPDK_ERRLOG("Block size could not be auto-detected\n"); +- goto error_return; +- } +- +- if (block_size < 512) { +- SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); +- goto error_return; +- } +- +- if (!spdk_u32_is_pow2(block_size)) { +- SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); +- goto error_return; +- } +- +- SPDK_DEBUGLOG(xnvme, "bdev_name : %s, bdev_size : %lu, block_size : %d\n", +- xnvme->bdev.name, bdev_size, block_size); +- +- xnvme->bdev.blocklen = block_size; +- xnvme->bdev.required_alignment = spdk_u32log2(block_size); +- +- if (bdev_size % xnvme->bdev.blocklen != 0) { +- SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", +- bdev_size, xnvme->bdev.blocklen); +- goto error_return; +- } +- +- xnvme->bdev.blockcnt = bdev_size / xnvme->bdev.blocklen; +- xnvme->bdev.ctxt = xnvme; +- +- xnvme->bdev.fn_table = &xnvme_fn_table; +- +- spdk_io_device_register(xnvme, bdev_xnvme_queue_create_cb, bdev_xnvme_queue_destroy_cb, +- sizeof(struct bdev_xnvme_io_channel), +- xnvme->bdev.name); +- rc = spdk_bdev_register(&xnvme->bdev); +- if (rc) { +- spdk_io_device_unregister(xnvme, NULL); +- goto error_return; +- } +- +- TAILQ_INSERT_TAIL(&g_xnvme_bdev_head, xnvme, link); +- +- return &xnvme->bdev; +- +-error_return: +- bdev_xnvme_free(xnvme); +- return NULL; +-} +- +-struct delete_xnvme_bdev_ctx { +- struct bdev_xnvme *xnvme; +- spdk_delete_xnvme_complete cb_fn; +- void *cb_arg; +-}; +- +-static void +-xnvme_bdev_unregister_cb(void *arg, int bdeverrno) +-{ +- struct delete_xnvme_bdev_ctx *ctx = arg; +- +- ctx->cb_fn(ctx->cb_arg, bdeverrno); +- free(ctx); +-} +- +-void +-delete_xnvme_bdev(struct spdk_bdev *bdev, spdk_delete_xnvme_complete cb_fn, void *cb_arg) +-{ +- struct delete_xnvme_bdev_ctx *ctx; +- struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev->ctxt; +- +- if (!bdev || bdev->module != &xnvme_if) { +- cb_fn(cb_arg, -ENODEV); +- return; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- cb_fn(cb_arg, -ENOMEM); +- return; +- } +- +- ctx->xnvme = xnvme; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- spdk_bdev_unregister(bdev, xnvme_bdev_unregister_cb, ctx); +-} +- +-static int +-bdev_xnvme_module_create_cb(void *io_device, void *ctx_buf) +-{ +- return 0; +-} +- +-static void +-bdev_xnvme_module_destroy_cb(void *io_device, void *ctx_buf) +-{ +-} +- +-static int +-bdev_xnvme_init(void) +-{ +- spdk_io_device_register(&xnvme_if, bdev_xnvme_module_create_cb, bdev_xnvme_module_destroy_cb, +- 0, "xnvme_module"); +- +- return 0; +-} +- +-static void +-bdev_xnvme_fini(void) +-{ +- spdk_io_device_unregister(&xnvme_if, NULL); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(xnvme) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * Copyright (c) Samsung Electronics Co., Ltd. ++ * All rights reserved. 
++ */ ++ ++#include "libxnvme.h" ++#include "libxnvme_pp.h" ++ ++#include "bdev_xnvme.h" ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/barrier.h" ++#include "spdk/bdev.h" ++#include "spdk/env.h" ++#include "spdk/fd.h" ++#include "spdk/likely.h" ++#include "spdk/thread.h" ++#include "spdk/json.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++ ++#include "spdk/log.h" ++ ++struct bdev_xnvme_io_channel { ++ struct xnvme_queue *queue; ++ struct spdk_poller *poller; ++}; ++ ++struct bdev_xnvme_task { ++ struct bdev_xnvme_io_channel *ch; ++ TAILQ_ENTRY(bdev_xnvme_task) link; ++}; ++ ++struct bdev_xnvme { ++ struct spdk_bdev bdev; ++ char *filename; ++ char *io_mechanism; ++ struct xnvme_dev *dev; ++ uint32_t nsid; ++ bool conserve_cpu; ++ ++ TAILQ_ENTRY(bdev_xnvme) link; ++}; ++ ++static int bdev_xnvme_init(void); ++static void bdev_xnvme_fini(void); ++static void bdev_xnvme_free(struct bdev_xnvme *xnvme); ++static TAILQ_HEAD(, bdev_xnvme) g_xnvme_bdev_head = TAILQ_HEAD_INITIALIZER(g_xnvme_bdev_head); ++ ++static int ++bdev_xnvme_get_ctx_size(void) ++{ ++ return sizeof(struct bdev_xnvme_task); ++} ++ ++static int ++bdev_xnvme_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct bdev_xnvme *xnvme; ++ ++ TAILQ_FOREACH(xnvme, &g_xnvme_bdev_head, link) { ++ spdk_json_write_object_begin(w); ++ ++ spdk_json_write_named_string(w, "method", "bdev_xnvme_create"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", xnvme->bdev.name); ++ spdk_json_write_named_string(w, "filename", xnvme->filename); ++ spdk_json_write_named_string(w, "io_mechanism", xnvme->io_mechanism); ++ spdk_json_write_named_bool(w, "conserve_cpu", xnvme->conserve_cpu); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ } ++ ++ return 0; ++} ++ ++static struct spdk_bdev_module xnvme_if = { ++ .name = "xnvme", ++ .module_init = bdev_xnvme_init, ++ .module_fini = bdev_xnvme_fini, ++ .get_ctx_size = bdev_xnvme_get_ctx_size, ++ .config_json = bdev_xnvme_config_json, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(xnvme, &xnvme_if) ++ ++static struct spdk_io_channel * ++bdev_xnvme_get_io_channel(void *ctx) ++{ ++ struct bdev_xnvme *xnvme = ctx; ++ ++ return spdk_get_io_channel(xnvme); ++} ++ ++static bool ++bdev_xnvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void ++bdev_xnvme_destruct_cb(void *io_device) ++{ ++ struct bdev_xnvme *xnvme = io_device; ++ ++ TAILQ_REMOVE(&g_xnvme_bdev_head, xnvme, link); ++ bdev_xnvme_free(xnvme); ++} ++ ++static int ++bdev_xnvme_destruct(void *ctx) ++{ ++ struct bdev_xnvme *xnvme = ctx; ++ ++ spdk_io_device_unregister(xnvme, bdev_xnvme_destruct_cb); ++ ++ return 0; ++} ++ ++static void ++bdev_xnvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) ++{ ++ struct bdev_xnvme_task *xnvme_task = (struct bdev_xnvme_task *)bdev_io->driver_ctx; ++ struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev_io->bdev->ctxt; ++ struct bdev_xnvme_io_channel *xnvme_ch = spdk_io_channel_get_ctx(ch); ++ struct xnvme_cmd_ctx *ctx = xnvme_queue_get_cmd_ctx(xnvme_ch->queue); ++ int err; ++ ++ if (!success) { ++ xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ SPDK_DEBUGLOG(xnvme, "bdev_io : %p, iov_cnt : %d, bdev_xnvme_task : %p\n", ++ bdev_io, bdev_io->u.bdev.iovcnt, (struct 
bdev_xnvme_task *)bdev_io->driver_ctx); ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ; ++ ctx->cmd.common.nsid = xnvme->nsid; ++ ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1; ++ ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks; ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE; ++ ctx->cmd.common.nsid = xnvme->nsid; ++ ctx->cmd.nvm.nlb = bdev_io->u.bdev.num_blocks - 1; ++ ctx->cmd.nvm.slba = bdev_io->u.bdev.offset_blocks; ++ break; ++ ++ default: ++ SPDK_ERRLOG("Wrong io type\n"); ++ ++ xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++ ++ xnvme_task->ch = xnvme_ch; ++ ctx->async.cb_arg = xnvme_task; ++ ++ err = xnvme_cmd_passv(ctx, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.num_blocks * xnvme->bdev.blocklen, NULL, 0, 0); ++ ++ switch (err) { ++ /* Submission success! */ ++ case 0: ++ SPDK_DEBUGLOG(xnvme, "io_channel : %p, iovcnt:%d, nblks: %lu off: %#lx\n", ++ xnvme_ch, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.num_blocks, bdev_io->u.bdev.offset_blocks); ++ return; ++ ++ /* Submission failed: queue is full or no memory => Queue the I/O in bdev layer */ ++ case -EBUSY: ++ case -EAGAIN: ++ case -ENOMEM: ++ SPDK_WARNLOG("Start to queue I/O for xnvme bdev\n"); ++ ++ xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ return; ++ ++ /* Submission failed: unexpected error, put the command-context back in the queue */ ++ default: ++ SPDK_ERRLOG("bdev_xnvme_cmd_passv : Submission failed: unexpected error\n"); ++ ++ xnvme_queue_put_cmd_ctx(xnvme_ch->queue, ctx); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ return; ++ } ++} ++ ++static void ++bdev_xnvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ switch (bdev_io->type) { ++ /* Read and write operations must be performed on buffers aligned to ++ * bdev->required_alignment. If user specified unaligned buffers, ++ * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. 
*/ ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ spdk_bdev_io_get_buf(bdev_io, bdev_xnvme_get_buf_cb, ++ bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); ++ break; ++ ++ default: ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ break; ++ } ++} ++ ++static const struct spdk_bdev_fn_table xnvme_fn_table = { ++ .destruct = bdev_xnvme_destruct, ++ .submit_request = bdev_xnvme_submit_request, ++ .io_type_supported = bdev_xnvme_io_type_supported, ++ .get_io_channel = bdev_xnvme_get_io_channel, ++}; ++ ++static void ++bdev_xnvme_free(struct bdev_xnvme *xnvme) ++{ ++ assert(xnvme != NULL); ++ ++ xnvme_dev_close(xnvme->dev); ++ free(xnvme->io_mechanism); ++ free(xnvme->filename); ++ free(xnvme->bdev.name); ++ free(xnvme); ++} ++ ++static void ++bdev_xnvme_cmd_cb(struct xnvme_cmd_ctx *ctx, void *cb_arg) ++{ ++ struct bdev_xnvme_task *xnvme_task = ctx->async.cb_arg; ++ enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; ++ ++ SPDK_DEBUGLOG(xnvme, "xnvme_task : %p\n", xnvme_task); ++ ++ if (xnvme_cmd_ctx_cpl_status(ctx)) { ++ SPDK_ERRLOG("xNVMe I/O Failed\n"); ++ xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF); ++ status = SPDK_BDEV_IO_STATUS_FAILED; ++ } ++ ++ spdk_bdev_io_complete(spdk_bdev_io_from_ctx(xnvme_task), status); ++ ++ /* Completed: Put the command- context back in the queue */ ++ xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx); ++} ++ ++static int ++bdev_xnvme_poll(void *arg) ++{ ++ struct bdev_xnvme_io_channel *ch = arg; ++ int rc; ++ ++ rc = xnvme_queue_poke(ch->queue, 0); ++ if (rc < 0) { ++ SPDK_ERRLOG("xnvme_queue_poke failure rc : %d\n", rc); ++ return SPDK_POLLER_BUSY; ++ } ++ ++ return xnvme_queue_get_outstanding(ch->queue) ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; ++} ++ ++static int ++bdev_xnvme_queue_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_xnvme *xnvme = io_device; ++ struct bdev_xnvme_io_channel *ch = ctx_buf; ++ int rc; ++ int qd = 512; ++ ++ rc = xnvme_queue_init(xnvme->dev, qd, 0, &ch->queue); ++ if (rc) { ++ SPDK_ERRLOG("xnvme_queue_init failure: %d\n", rc); ++ return 1; ++ } ++ ++ xnvme_queue_set_cb(ch->queue, bdev_xnvme_cmd_cb, ch); ++ ++ ch->poller = SPDK_POLLER_REGISTER(bdev_xnvme_poll, ch, 0); ++ ++ return 0; ++} ++ ++static void ++bdev_xnvme_queue_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct bdev_xnvme_io_channel *ch = ctx_buf; ++ ++ spdk_poller_unregister(&ch->poller); ++ ++ xnvme_queue_term(ch->queue); ++} ++ ++struct spdk_bdev * ++create_xnvme_bdev(const char *name, const char *filename, const char *io_mechanism, ++ bool conserve_cpu) ++{ ++ struct bdev_xnvme *xnvme; ++ uint32_t block_size; ++ uint64_t bdev_size; ++ int rc; ++ struct xnvme_opts opts = xnvme_opts_default(); ++ ++ xnvme = calloc(1, sizeof(*xnvme)); ++ if (!xnvme) { ++ SPDK_ERRLOG("Unable to allocate enough memory for xNVMe backend\n"); ++ return NULL; ++ } ++ ++ opts.direct = 1; ++ opts.async = io_mechanism; ++ if (!opts.async) { ++ goto error_return; ++ } ++ xnvme->io_mechanism = strdup(io_mechanism); ++ if (!xnvme->io_mechanism) { ++ goto error_return; ++ } ++ ++ if (!conserve_cpu) { ++ if (!strcmp(xnvme->io_mechanism, "libaio")) { ++ opts.poll_io = 1; ++ } else if (!strcmp(xnvme->io_mechanism, "io_uring")) { ++ opts.poll_io = 1; ++ } else if (!strcmp(xnvme->io_mechanism, "io_uring_cmd")) { ++ opts.poll_sq = 1; ++ } ++ } ++ ++ xnvme->filename = strdup(filename); ++ if (!xnvme->filename) { ++ goto error_return; ++ } ++ ++ xnvme->dev = xnvme_dev_open(xnvme->filename, &opts); ++ if (!xnvme->dev) { ++ SPDK_ERRLOG("Unable to open 
xNVMe device %s\n", filename); ++ goto error_return; ++ } ++ ++ xnvme->nsid = xnvme_dev_get_nsid(xnvme->dev); ++ ++ bdev_size = xnvme_dev_get_geo(xnvme->dev)->tbytes; ++ block_size = xnvme_dev_get_geo(xnvme->dev)->nbytes; ++ ++ xnvme->bdev.name = strdup(name); ++ if (!xnvme->bdev.name) { ++ goto error_return; ++ } ++ ++ xnvme->bdev.product_name = "xNVMe bdev"; ++ xnvme->bdev.module = &xnvme_if; ++ ++ xnvme->bdev.write_cache = 0; ++ ++ if (block_size == 0) { ++ SPDK_ERRLOG("Block size could not be auto-detected\n"); ++ goto error_return; ++ } ++ ++ if (block_size < 512) { ++ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); ++ goto error_return; ++ } ++ ++ if (!spdk_u32_is_pow2(block_size)) { ++ SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); ++ goto error_return; ++ } ++ ++ SPDK_DEBUGLOG(xnvme, "bdev_name : %s, bdev_size : %lu, block_size : %d\n", ++ xnvme->bdev.name, bdev_size, block_size); ++ ++ xnvme->bdev.blocklen = block_size; ++ xnvme->bdev.required_alignment = spdk_u32log2(block_size); ++ ++ if (bdev_size % xnvme->bdev.blocklen != 0) { ++ SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", ++ bdev_size, xnvme->bdev.blocklen); ++ goto error_return; ++ } ++ ++ xnvme->bdev.blockcnt = bdev_size / xnvme->bdev.blocklen; ++ xnvme->bdev.ctxt = xnvme; ++ ++ xnvme->bdev.fn_table = &xnvme_fn_table; ++ ++ spdk_io_device_register(xnvme, bdev_xnvme_queue_create_cb, bdev_xnvme_queue_destroy_cb, ++ sizeof(struct bdev_xnvme_io_channel), ++ xnvme->bdev.name); ++ rc = spdk_bdev_register(&xnvme->bdev); ++ if (rc) { ++ spdk_io_device_unregister(xnvme, NULL); ++ goto error_return; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_xnvme_bdev_head, xnvme, link); ++ ++ return &xnvme->bdev; ++ ++error_return: ++ bdev_xnvme_free(xnvme); ++ return NULL; ++} ++ ++struct delete_xnvme_bdev_ctx { ++ struct bdev_xnvme *xnvme; ++ spdk_delete_xnvme_complete cb_fn; ++ void *cb_arg; ++}; ++ ++static void ++xnvme_bdev_unregister_cb(void *arg, int bdeverrno) ++{ ++ struct delete_xnvme_bdev_ctx *ctx = arg; ++ ++ ctx->cb_fn(ctx->cb_arg, bdeverrno); ++ free(ctx); ++} ++ ++void ++delete_xnvme_bdev(struct spdk_bdev *bdev, spdk_delete_xnvme_complete cb_fn, void *cb_arg) ++{ ++ struct delete_xnvme_bdev_ctx *ctx; ++ struct bdev_xnvme *xnvme = (struct bdev_xnvme *)bdev->ctxt; ++ ++ if (!bdev || bdev->module != &xnvme_if) { ++ cb_fn(cb_arg, -ENODEV); ++ return; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ cb_fn(cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->xnvme = xnvme; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ spdk_bdev_unregister(bdev, xnvme_bdev_unregister_cb, ctx); ++} ++ ++static int ++bdev_xnvme_module_create_cb(void *io_device, void *ctx_buf) ++{ ++ return 0; ++} ++ ++static void ++bdev_xnvme_module_destroy_cb(void *io_device, void *ctx_buf) ++{ ++} ++ ++static int ++bdev_xnvme_init(void) ++{ ++ spdk_io_device_register(&xnvme_if, bdev_xnvme_module_create_cb, bdev_xnvme_module_destroy_cb, ++ 0, "xnvme_module"); ++ ++ return 0; ++} ++ ++static void ++bdev_xnvme_fini(void) ++{ ++ spdk_io_device_unregister(&xnvme_if, NULL); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(xnvme) +diff --git a/module/bdev/xnvme/bdev_xnvme.h b/module/bdev/xnvme/bdev_xnvme.h +index dbe2a6f..39b6dd5 100644 +--- a/module/bdev/xnvme/bdev_xnvme.h ++++ b/module/bdev/xnvme/bdev_xnvme.h +@@ -1,23 +1,23 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (c) Samsung Electronics Co., Ltd. +- * All rights reserved. 
+- */ +- +-#ifndef SPDK_BDEV_XNVME_H +-#define SPDK_BDEV_XNVME_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/queue.h" +-#include "spdk/bdev.h" +- +-#include "spdk/bdev_module.h" +- +-typedef void (*spdk_delete_xnvme_complete)(void *cb_arg, int bdeverrno); +- +-struct spdk_bdev *create_xnvme_bdev(const char *name, const char *filename, +- const char *io_mechanism, bool conserve_cpu); +- +-void delete_xnvme_bdev(struct spdk_bdev *bdev, spdk_delete_xnvme_complete cb_fn, void *cb_arg); +- +-#endif /* SPDK_BDEV_XNVME_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (c) Samsung Electronics Co., Ltd. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BDEV_XNVME_H ++#define SPDK_BDEV_XNVME_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/queue.h" ++#include "spdk/bdev.h" ++ ++#include "spdk/bdev_module.h" ++ ++typedef void (*spdk_delete_xnvme_complete)(void *cb_arg, int bdeverrno); ++ ++struct spdk_bdev *create_xnvme_bdev(const char *name, const char *filename, ++ const char *io_mechanism, bool conserve_cpu); ++ ++void delete_xnvme_bdev(struct spdk_bdev *bdev, spdk_delete_xnvme_complete cb_fn, void *cb_arg); ++ ++#endif /* SPDK_BDEV_XNVME_H */ +diff --git a/module/bdev/xnvme/bdev_xnvme_rpc.c b/module/bdev/xnvme/bdev_xnvme_rpc.c +index f061a9b..54d1feb 100644 +--- a/module/bdev/xnvme/bdev_xnvme_rpc.c ++++ b/module/bdev/xnvme/bdev_xnvme_rpc.c +@@ -1,137 +1,137 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (c) Samsung Electronics Co., Ltd. +- * All rights reserved. +- */ +- +-#include "bdev_xnvme.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/log.h" +- +-/* Structure to hold the parameters for this RPC method. */ +-struct rpc_create_xnvme { +- char *name; +- char *filename; +- char *io_mechanism; +- bool conserve_cpu; +-}; +- +-/* Free the allocated memory resource after the RPC handling. */ +-static void +-free_rpc_create_xnvme(struct rpc_create_xnvme *r) +-{ +- free(r->name); +- free(r->filename); +- free(r->io_mechanism); +-} +- +-/* Structure to decode the input parameters for this RPC method. */ +-static const struct spdk_json_object_decoder rpc_create_xnvme_decoders[] = { +- {"name", offsetof(struct rpc_create_xnvme, name), spdk_json_decode_string}, +- {"filename", offsetof(struct rpc_create_xnvme, filename), spdk_json_decode_string}, +- {"io_mechanism", offsetof(struct rpc_create_xnvme, io_mechanism), spdk_json_decode_string}, +- {"conserve_cpu", offsetof(struct rpc_create_xnvme, conserve_cpu), spdk_json_decode_bool, true}, +-}; +- +-static void +-dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) +-{ +-} +- +-/* Decode the parameters for this RPC method and properly create the xnvme +- * device. Error status returned in the failed cases. 
+- */ +-static void +-rpc_bdev_xnvme_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_create_xnvme req = {}; +- struct spdk_json_write_ctx *w; +- struct spdk_bdev *bdev; +- +- if (spdk_json_decode_object(params, rpc_create_xnvme_decoders, +- SPDK_COUNTOF(rpc_create_xnvme_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- bdev = create_xnvme_bdev(req.name, req.filename, req.io_mechanism, req.conserve_cpu); +- if (!bdev) { +- SPDK_ERRLOG("Unable to create xNVMe bdev from file %s\n", req.filename); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Unable to create xNVMe bdev."); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, req.name); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_create_xnvme(&req); +-} +-SPDK_RPC_REGISTER("bdev_xnvme_create", rpc_bdev_xnvme_create, SPDK_RPC_RUNTIME) +- +-struct rpc_delete_xnvme { +- char *name; +-}; +- +-static void +-free_rpc_delete_xnvme(struct rpc_delete_xnvme *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_delete_xnvme_decoders[] = { +- {"name", offsetof(struct rpc_delete_xnvme, name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_bdev_xnvme_delete_cb(void *cb_arg, int bdeverrno) +-{ +- struct spdk_jsonrpc_request *request = cb_arg; +- +- spdk_jsonrpc_send_bool_response(request, bdeverrno == 0); +-} +- +-static void +-rpc_bdev_xnvme_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_xnvme req = {NULL}; +- struct spdk_bdev_desc *desc; +- struct spdk_bdev *bdev = NULL; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_delete_xnvme_decoders, +- SPDK_COUNTOF(rpc_delete_xnvme_decoders), +- &req)) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- goto cleanup; +- } +- +- rc = spdk_bdev_open_ext(req.name, false, dummy_bdev_event_cb, NULL, &desc); +- if (rc == 0) { +- bdev = spdk_bdev_desc_get_bdev(desc); +- spdk_bdev_close(desc); +- } else { +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- if (bdev == NULL) { +- spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); +- goto cleanup; +- } +- +- delete_xnvme_bdev(bdev, _rpc_bdev_xnvme_delete_cb, request); +- +-cleanup: +- free_rpc_delete_xnvme(&req); +-} +-SPDK_RPC_REGISTER("bdev_xnvme_delete", rpc_bdev_xnvme_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (c) Samsung Electronics Co., Ltd. ++ * All rights reserved. ++ */ ++ ++#include "bdev_xnvme.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/log.h" ++ ++/* Structure to hold the parameters for this RPC method. */ ++struct rpc_create_xnvme { ++ char *name; ++ char *filename; ++ char *io_mechanism; ++ bool conserve_cpu; ++}; ++ ++/* Free the allocated memory resource after the RPC handling. */ ++static void ++free_rpc_create_xnvme(struct rpc_create_xnvme *r) ++{ ++ free(r->name); ++ free(r->filename); ++ free(r->io_mechanism); ++} ++ ++/* Structure to decode the input parameters for this RPC method. 
*/ ++static const struct spdk_json_object_decoder rpc_create_xnvme_decoders[] = { ++ {"name", offsetof(struct rpc_create_xnvme, name), spdk_json_decode_string}, ++ {"filename", offsetof(struct rpc_create_xnvme, filename), spdk_json_decode_string}, ++ {"io_mechanism", offsetof(struct rpc_create_xnvme, io_mechanism), spdk_json_decode_string}, ++ {"conserve_cpu", offsetof(struct rpc_create_xnvme, conserve_cpu), spdk_json_decode_bool, true}, ++}; ++ ++static void ++dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) ++{ ++} ++ ++/* Decode the parameters for this RPC method and properly create the xnvme ++ * device. Error status returned in the failed cases. ++ */ ++static void ++rpc_bdev_xnvme_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_create_xnvme req = {}; ++ struct spdk_json_write_ctx *w; ++ struct spdk_bdev *bdev; ++ ++ if (spdk_json_decode_object(params, rpc_create_xnvme_decoders, ++ SPDK_COUNTOF(rpc_create_xnvme_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ bdev = create_xnvme_bdev(req.name, req.filename, req.io_mechanism, req.conserve_cpu); ++ if (!bdev) { ++ SPDK_ERRLOG("Unable to create xNVMe bdev from file %s\n", req.filename); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Unable to create xNVMe bdev."); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, req.name); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_create_xnvme(&req); ++} ++SPDK_RPC_REGISTER("bdev_xnvme_create", rpc_bdev_xnvme_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_xnvme { ++ char *name; ++}; ++ ++static void ++free_rpc_delete_xnvme(struct rpc_delete_xnvme *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_delete_xnvme_decoders[] = { ++ {"name", offsetof(struct rpc_delete_xnvme, name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_bdev_xnvme_delete_cb(void *cb_arg, int bdeverrno) ++{ ++ struct spdk_jsonrpc_request *request = cb_arg; ++ ++ spdk_jsonrpc_send_bool_response(request, bdeverrno == 0); ++} ++ ++static void ++rpc_bdev_xnvme_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_xnvme req = {NULL}; ++ struct spdk_bdev_desc *desc; ++ struct spdk_bdev *bdev = NULL; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_delete_xnvme_decoders, ++ SPDK_COUNTOF(rpc_delete_xnvme_decoders), ++ &req)) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ goto cleanup; ++ } ++ ++ rc = spdk_bdev_open_ext(req.name, false, dummy_bdev_event_cb, NULL, &desc); ++ if (rc == 0) { ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ spdk_bdev_close(desc); ++ } else { ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ if (bdev == NULL) { ++ spdk_jsonrpc_send_error_response(request, -ENODEV, spdk_strerror(ENODEV)); ++ goto cleanup; ++ } ++ ++ delete_xnvme_bdev(bdev, _rpc_bdev_xnvme_delete_cb, request); ++ ++cleanup: ++ free_rpc_delete_xnvme(&req); ++} ++SPDK_RPC_REGISTER("bdev_xnvme_delete", rpc_bdev_xnvme_delete, SPDK_RPC_RUNTIME) +diff --git a/module/bdev/zone_block/Makefile b/module/bdev/zone_block/Makefile +index f023852..fb6f570 100644 +--- 
a/module/bdev/zone_block/Makefile ++++ b/module/bdev/zone_block/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = vbdev_zone_block.c vbdev_zone_block_rpc.c +-LIBNAME = bdev_zone_block +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = vbdev_zone_block.c vbdev_zone_block_rpc.c ++LIBNAME = bdev_zone_block ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/bdev/zone_block/vbdev_zone_block.c b/module/bdev/zone_block/vbdev_zone_block.c +index 70fd735..72d5656 100644 +--- a/module/bdev/zone_block/vbdev_zone_block.c ++++ b/module/bdev/zone_block/vbdev_zone_block.c +@@ -1,899 +1,899 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "vbdev_zone_block.h" +- +-#include "spdk/config.h" +-#include "spdk/nvme.h" +-#include "spdk/bdev_zone.h" +- +-#include "spdk/log.h" +- +-static int zone_block_init(void); +-static int zone_block_get_ctx_size(void); +-static void zone_block_finish(void); +-static int zone_block_config_json(struct spdk_json_write_ctx *w); +-static void zone_block_examine(struct spdk_bdev *bdev); +- +-static struct spdk_bdev_module bdev_zoned_if = { +- .name = "bdev_zoned_block", +- .module_init = zone_block_init, +- .module_fini = zone_block_finish, +- .config_json = zone_block_config_json, +- .examine_config = zone_block_examine, +- .get_ctx_size = zone_block_get_ctx_size, +-}; +- +-SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if) +- +-/* List of block vbdev names and their base bdevs via configuration file. +- * Used so we can parse the conf once at init and use this list in examine(). +- */ +-struct bdev_zone_block_config { +- char *vbdev_name; +- char *bdev_name; +- uint64_t zone_capacity; +- uint64_t optimal_open_zones; +- TAILQ_ENTRY(bdev_zone_block_config) link; +-}; +-static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs); +- +-struct block_zone { +- struct spdk_bdev_zone_info zone_info; +- pthread_spinlock_t lock; +-}; +- +-/* List of block vbdevs and associated info for each. 
*/ +-struct bdev_zone_block { +- struct spdk_bdev bdev; /* the block zoned bdev */ +- struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ +- struct block_zone *zones; /* array of zones */ +- uint64_t num_zones; /* number of zones */ +- uint64_t zone_capacity; /* zone capacity */ +- uint64_t zone_shift; /* log2 of zone_size */ +- TAILQ_ENTRY(bdev_zone_block) link; +- struct spdk_thread *thread; /* thread where base device is opened */ +-}; +-static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes); +- +-struct zone_block_io_channel { +- struct spdk_io_channel *base_ch; /* IO channel of base device */ +-}; +- +-struct zone_block_io { +- /* vbdev to which IO was issued */ +- struct bdev_zone_block *bdev_zone_block; +-}; +- +-static int +-zone_block_init(void) +-{ +- return 0; +-} +- +-static void +-zone_block_remove_config(struct bdev_zone_block_config *name) +-{ +- TAILQ_REMOVE(&g_bdev_configs, name, link); +- free(name->bdev_name); +- free(name->vbdev_name); +- free(name); +-} +- +-static void +-zone_block_finish(void) +-{ +- struct bdev_zone_block_config *name; +- +- while ((name = TAILQ_FIRST(&g_bdev_configs))) { +- zone_block_remove_config(name); +- } +-} +- +-static int +-zone_block_get_ctx_size(void) +-{ +- return sizeof(struct zone_block_io); +-} +- +-static int +-zone_block_config_json(struct spdk_json_write_ctx *w) +-{ +- struct bdev_zone_block *bdev_node; +- struct spdk_bdev *base_bdev = NULL; +- +- TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) { +- base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "bdev_zone_block_create"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); +- spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); +- spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); +- spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- } +- +- return 0; +-} +- +-/* Callback for unregistering the IO device. */ +-static void +-_device_unregister_cb(void *io_device) +-{ +- struct bdev_zone_block *bdev_node = io_device; +- uint64_t i; +- +- free(bdev_node->bdev.name); +- for (i = 0; i < bdev_node->num_zones; i++) { +- pthread_spin_destroy(&bdev_node->zones[i].lock); +- } +- free(bdev_node->zones); +- free(bdev_node); +-} +- +-static void +-_zone_block_destruct(void *ctx) +-{ +- struct spdk_bdev_desc *desc = ctx; +- +- spdk_bdev_close(desc); +-} +- +-static int +-zone_block_destruct(void *ctx) +-{ +- struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; +- +- TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); +- +- /* Unclaim the underlying bdev. */ +- spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc)); +- +- /* Close the underlying bdev on its same opened thread. */ +- if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) { +- spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc); +- } else { +- spdk_bdev_close(bdev_node->base_desc); +- } +- +- /* Unregister the io_device. 
*/ +- spdk_io_device_unregister(bdev_node, _device_unregister_cb); +- +- return 0; +-} +- +-static struct block_zone * +-zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba) +-{ +- size_t index = lba >> bdev_node->zone_shift; +- +- if (index >= bdev_node->num_zones) { +- return NULL; +- } +- +- return &bdev_node->zones[index]; +-} +- +-static struct block_zone * +-zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba) +-{ +- struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba); +- +- if (zone && zone->zone_info.zone_id == start_lba) { +- return zone; +- } else { +- return NULL; +- } +-} +- +-static int +-zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io) +-{ +- struct block_zone *zone; +- struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf; +- uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; +- size_t i; +- +- /* User can request info for more zones than exist, need to check both internal and user +- * boundaries +- */ +- for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) { +- zone = zone_block_get_zone_by_slba(bdev_node, zone_id); +- if (!zone) { +- return -EINVAL; +- } +- memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info)); +- } +- +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +-} +- +-static int +-zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +-{ +- pthread_spin_lock(&zone->lock); +- +- switch (zone->zone_info.state) { +- case SPDK_BDEV_ZONE_STATE_EMPTY: +- case SPDK_BDEV_ZONE_STATE_OPEN: +- case SPDK_BDEV_ZONE_STATE_CLOSED: +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; +- pthread_spin_unlock(&zone->lock); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +- default: +- pthread_spin_unlock(&zone->lock); +- return -EINVAL; +- } +-} +- +-static void +-_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *orig_io = cb_arg; +- int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; +- +- /* Complete the original IO and then free the one that we created here +- * as a result of issuing an IO via submit_request. 
+- */ +- spdk_bdev_io_complete(orig_io, status); +- spdk_bdev_free_io(bdev_io); +-} +- +-static int +-zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, +- struct block_zone *zone, struct spdk_bdev_io *bdev_io) +-{ +- pthread_spin_lock(&zone->lock); +- +- switch (zone->zone_info.state) { +- case SPDK_BDEV_ZONE_STATE_EMPTY: +- pthread_spin_unlock(&zone->lock); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +- case SPDK_BDEV_ZONE_STATE_OPEN: +- case SPDK_BDEV_ZONE_STATE_FULL: +- case SPDK_BDEV_ZONE_STATE_CLOSED: +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY; +- zone->zone_info.write_pointer = zone->zone_info.zone_id; +- pthread_spin_unlock(&zone->lock); +- +- /* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */ +- if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc), +- SPDK_BDEV_IO_TYPE_UNMAP)) { +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +- } +- +- return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch, +- zone->zone_info.zone_id, zone->zone_info.capacity, +- _zone_block_complete_unmap, bdev_io); +- default: +- pthread_spin_unlock(&zone->lock); +- return -EINVAL; +- } +-} +- +-static int +-zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +-{ +- pthread_spin_lock(&zone->lock); +- +- switch (zone->zone_info.state) { +- case SPDK_BDEV_ZONE_STATE_OPEN: +- case SPDK_BDEV_ZONE_STATE_CLOSED: +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED; +- pthread_spin_unlock(&zone->lock); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +- default: +- pthread_spin_unlock(&zone->lock); +- return -EINVAL; +- } +-} +- +-static int +-zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) +-{ +- pthread_spin_lock(&zone->lock); +- +- zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; +- +- pthread_spin_unlock(&zone->lock); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +- return 0; +-} +- +-static int +-zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, +- struct spdk_bdev_io *bdev_io) +-{ +- struct block_zone *zone; +- +- zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id); +- if (!zone) { +- return -EINVAL; +- } +- +- switch (bdev_io->u.zone_mgmt.zone_action) { +- case SPDK_BDEV_ZONE_RESET: +- return zone_block_reset_zone(bdev_node, ch, zone, bdev_io); +- case SPDK_BDEV_ZONE_OPEN: +- return zone_block_open_zone(zone, bdev_io); +- case SPDK_BDEV_ZONE_CLOSE: +- return zone_block_close_zone(zone, bdev_io); +- case SPDK_BDEV_ZONE_FINISH: +- return zone_block_finish_zone(zone, bdev_io); +- default: +- return -EINVAL; +- } +-} +- +-static void +-_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *orig_io = cb_arg; +- int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; +- +- if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) { +- orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks; +- } +- +- /* Complete the original IO and then free the one that we created here +- * as a result of issuing an IO via submit_request. 
+- */ +- spdk_bdev_io_complete(orig_io, status); +- spdk_bdev_free_io(bdev_io); +-} +- +-static int +-zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, +- struct spdk_bdev_io *bdev_io) +-{ +- struct block_zone *zone; +- uint64_t len = bdev_io->u.bdev.num_blocks; +- uint64_t lba = bdev_io->u.bdev.offset_blocks; +- uint64_t num_blocks_left, wp; +- int rc = 0; +- bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND; +- +- if (is_append) { +- zone = zone_block_get_zone_by_slba(bdev_node, lba); +- } else { +- zone = zone_block_get_zone_containing_lba(bdev_node, lba); +- } +- if (!zone) { +- SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba); +- return -EINVAL; +- } +- +- pthread_spin_lock(&zone->lock); +- +- switch (zone->zone_info.state) { +- case SPDK_BDEV_ZONE_STATE_OPEN: +- case SPDK_BDEV_ZONE_STATE_EMPTY: +- case SPDK_BDEV_ZONE_STATE_CLOSED: +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; +- break; +- default: +- SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state); +- rc = -EINVAL; +- goto write_fail; +- } +- +- wp = zone->zone_info.write_pointer; +- if (is_append) { +- lba = wp; +- } else { +- if (lba != wp) { +- SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n", +- lba, wp); +- rc = -EINVAL; +- goto write_fail; +- } +- } +- +- num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp; +- if (len > num_blocks_left) { +- SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64 +- ")\n", lba, len, wp); +- rc = -EINVAL; +- goto write_fail; +- } +- +- zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks; +- assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity); +- if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) { +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; +- } +- pthread_spin_unlock(&zone->lock); +- +- rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- lba, bdev_io->u.bdev.num_blocks, +- _zone_block_complete_write, bdev_io); +- +- return rc; +- +-write_fail: +- pthread_spin_unlock(&zone->lock); +- return rc; +-} +- +-static void +-_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct spdk_bdev_io *orig_io = cb_arg; +- int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; +- +- /* Complete the original IO and then free the one that we created here +- * as a result of issuing an IO via submit_request. 
+- */ +- spdk_bdev_io_complete(orig_io, status); +- spdk_bdev_free_io(bdev_io); +-} +- +-static int +-zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, +- struct spdk_bdev_io *bdev_io) +-{ +- struct block_zone *zone; +- uint64_t len = bdev_io->u.bdev.num_blocks; +- uint64_t lba = bdev_io->u.bdev.offset_blocks; +- int rc; +- +- zone = zone_block_get_zone_containing_lba(bdev_node, lba); +- if (!zone) { +- SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba); +- return -EINVAL; +- } +- +- if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) { +- SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len); +- return -EINVAL; +- } +- +- rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch, +- bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, +- bdev_io->u.bdev.md_buf, +- lba, len, +- _zone_block_complete_read, bdev_io); +- +- return rc; +-} +- +-static void +-zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +-{ +- struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev); +- struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch); +- int rc = 0; +- +- switch (bdev_io->type) { +- case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: +- rc = zone_block_get_zone_info(bdev_node, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +- rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_ZONE_APPEND: +- rc = zone_block_write(bdev_node, dev_ch, bdev_io); +- break; +- case SPDK_BDEV_IO_TYPE_READ: +- rc = zone_block_read(bdev_node, dev_ch, bdev_io); +- break; +- default: +- SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type); +- rc = -ENOTSUP; +- break; +- } +- +- if (rc != 0) { +- if (rc == -ENOMEM) { +- SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n"); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +- } else { +- SPDK_ERRLOG("ERROR on bdev_io submission!\n"); +- spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +- } +- } +-} +- +-static bool +-zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +-{ +- switch (io_type) { +- case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: +- case SPDK_BDEV_IO_TYPE_WRITE: +- case SPDK_BDEV_IO_TYPE_READ: +- case SPDK_BDEV_IO_TYPE_ZONE_APPEND: +- return true; +- default: +- return false; +- } +-} +- +-static struct spdk_io_channel * +-zone_block_get_io_channel(void *ctx) +-{ +- struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; +- +- return spdk_get_io_channel(bdev_node); +-} +- +-static int +-zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +-{ +- struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; +- struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); +- +- spdk_json_write_name(w, "zoned_block"); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); +- spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); +- spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); +- spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); +- spdk_json_write_object_end(w); +- +- return 0; +-} +- +-/* When we register our vbdev this is how we specify our entry points. 
*/ +-static const struct spdk_bdev_fn_table zone_block_fn_table = { +- .destruct = zone_block_destruct, +- .submit_request = zone_block_submit_request, +- .io_type_supported = zone_block_io_type_supported, +- .get_io_channel = zone_block_get_io_channel, +- .dump_info_json = zone_block_dump_info_json, +-}; +- +-static void +-zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find) +-{ +- struct bdev_zone_block *bdev_node, *tmp; +- +- TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) { +- if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) { +- spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL); +- } +- } +-} +- +-static void +-zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- zone_block_base_bdev_hotremove_cb(bdev); +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-static int +-_zone_block_ch_create_cb(void *io_device, void *ctx_buf) +-{ +- struct zone_block_io_channel *bdev_ch = ctx_buf; +- struct bdev_zone_block *bdev_node = io_device; +- +- bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc); +- if (!bdev_ch->base_ch) { +- return -ENOMEM; +- } +- +- return 0; +-} +- +-static void +-_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf) +-{ +- struct zone_block_io_channel *bdev_ch = ctx_buf; +- +- spdk_put_io_channel(bdev_ch->base_ch); +-} +- +-static int +-zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, +- uint64_t optimal_open_zones) +-{ +- struct bdev_zone_block_config *name; +- +- TAILQ_FOREACH(name, &g_bdev_configs, link) { +- if (strcmp(vbdev_name, name->vbdev_name) == 0) { +- SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name); +- return -EEXIST; +- } +- if (strcmp(bdev_name, name->bdev_name) == 0) { +- SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name); +- return -EEXIST; +- } +- } +- +- name = calloc(1, sizeof(*name)); +- if (!name) { +- SPDK_ERRLOG("could not allocate bdev_names\n"); +- return -ENOMEM; +- } +- +- name->bdev_name = strdup(bdev_name); +- if (!name->bdev_name) { +- SPDK_ERRLOG("could not allocate name->bdev_name\n"); +- free(name); +- return -ENOMEM; +- } +- +- name->vbdev_name = strdup(vbdev_name); +- if (!name->vbdev_name) { +- SPDK_ERRLOG("could not allocate name->vbdev_name\n"); +- free(name->bdev_name); +- free(name); +- return -ENOMEM; +- } +- +- name->zone_capacity = zone_capacity; +- name->optimal_open_zones = optimal_open_zones; +- +- TAILQ_INSERT_TAIL(&g_bdev_configs, name, link); +- +- return 0; +-} +- +-static int +-zone_block_init_zone_info(struct bdev_zone_block *bdev_node) +-{ +- size_t i; +- struct block_zone *zone; +- int rc = 0; +- +- for (i = 0; i < bdev_node->num_zones; i++) { +- zone = &bdev_node->zones[i]; +- zone->zone_info.zone_id = bdev_node->bdev.zone_size * i; +- zone->zone_info.capacity = bdev_node->zone_capacity; +- zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; +- zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; +- zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR; +- if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) { +- SPDK_ERRLOG("pthread_spin_init() failed\n"); +- rc = -ENOMEM; +- break; +- } +- } +- +- if (rc) { +- for (; i > 0; i--) { +- pthread_spin_destroy(&bdev_node->zones[i - 1].lock); +- } +- } +- +- return rc; +-} +- +-static int +-zone_block_register(const char *base_bdev_name) +-{ +- struct 
spdk_bdev_desc *base_desc; +- struct spdk_bdev *base_bdev; +- struct bdev_zone_block_config *name, *tmp; +- struct bdev_zone_block *bdev_node; +- uint64_t zone_size; +- int rc = 0; +- +- /* Check our list of names from config versus this bdev and if +- * there's a match, create the bdev_node & bdev accordingly. +- */ +- TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) { +- if (strcmp(name->bdev_name, base_bdev_name) != 0) { +- continue; +- } +- +- rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb, +- NULL, &base_desc); +- if (rc == -ENODEV) { +- return -ENODEV; +- } else if (rc) { +- SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name); +- goto free_config; +- } +- +- base_bdev = spdk_bdev_desc_get_bdev(base_desc); +- +- if (spdk_bdev_is_zoned(base_bdev)) { +- SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name); +- rc = -EEXIST; +- goto zone_exist; +- } +- +- bdev_node = calloc(1, sizeof(struct bdev_zone_block)); +- if (!bdev_node) { +- rc = -ENOMEM; +- SPDK_ERRLOG("could not allocate bdev_node\n"); +- goto zone_exist; +- } +- +- bdev_node->base_desc = base_desc; +- +- /* The base bdev that we're attaching to. */ +- bdev_node->bdev.name = strdup(name->vbdev_name); +- if (!bdev_node->bdev.name) { +- rc = -ENOMEM; +- SPDK_ERRLOG("could not allocate bdev_node name\n"); +- goto strdup_failed; +- } +- +- zone_size = spdk_align64pow2(name->zone_capacity); +- if (zone_size == 0) { +- rc = -EINVAL; +- SPDK_ERRLOG("invalid zone size\n"); +- goto roundup_failed; +- } +- +- bdev_node->zone_shift = spdk_u64log2(zone_size); +- bdev_node->num_zones = base_bdev->blockcnt / zone_size; +- +- bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone)); +- if (!bdev_node->zones) { +- rc = -ENOMEM; +- SPDK_ERRLOG("could not allocate zones\n"); +- goto calloc_failed; +- } +- +- bdev_node->bdev.product_name = "zone_block"; +- +- /* Copy some properties from the underlying base bdev. 
*/ +- bdev_node->bdev.write_cache = base_bdev->write_cache; +- bdev_node->bdev.required_alignment = base_bdev->required_alignment; +- bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary; +- +- bdev_node->bdev.blocklen = base_bdev->blocklen; +- bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size; +- +- if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) { +- SPDK_DEBUGLOG(vbdev_zone_block, +- "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n", +- base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity); +- } +- +- bdev_node->bdev.write_unit_size = base_bdev->write_unit_size; +- +- bdev_node->bdev.md_interleave = base_bdev->md_interleave; +- bdev_node->bdev.md_len = base_bdev->md_len; +- bdev_node->bdev.dif_type = base_bdev->dif_type; +- bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md; +- bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags; +- +- bdev_node->bdev.zoned = true; +- bdev_node->bdev.ctxt = bdev_node; +- bdev_node->bdev.fn_table = &zone_block_fn_table; +- bdev_node->bdev.module = &bdev_zoned_if; +- +- /* bdev specific info */ +- bdev_node->bdev.zone_size = zone_size; +- +- bdev_node->zone_capacity = name->zone_capacity; +- bdev_node->bdev.optimal_open_zones = name->optimal_open_zones; +- bdev_node->bdev.max_open_zones = 0; +- rc = zone_block_init_zone_info(bdev_node); +- if (rc) { +- SPDK_ERRLOG("could not init zone info\n"); +- goto zone_info_failed; +- } +- +- TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link); +- +- spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb, +- sizeof(struct zone_block_io_channel), +- name->vbdev_name); +- +- /* Save the thread where the base device is opened */ +- bdev_node->thread = spdk_get_thread(); +- +- rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module); +- if (rc) { +- SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name); +- goto claim_failed; +- } +- +- rc = spdk_bdev_register(&bdev_node->bdev); +- if (rc) { +- SPDK_ERRLOG("could not register zoned bdev\n"); +- goto register_failed; +- } +- } +- +- return rc; +- +-register_failed: +- spdk_bdev_module_release_bdev(&bdev_node->bdev); +-claim_failed: +- TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); +- spdk_io_device_unregister(bdev_node, NULL); +-zone_info_failed: +- free(bdev_node->zones); +-calloc_failed: +-roundup_failed: +- free(bdev_node->bdev.name); +-strdup_failed: +- free(bdev_node); +-zone_exist: +- spdk_bdev_close(base_desc); +-free_config: +- zone_block_remove_config(name); +- return rc; +-} +- +-int +-vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, +- uint64_t optimal_open_zones) +-{ +- int rc = 0; +- +- if (zone_capacity == 0) { +- SPDK_ERRLOG("Zone capacity can't be 0\n"); +- return -EINVAL; +- } +- +- if (optimal_open_zones == 0) { +- SPDK_ERRLOG("Optimal open zones can't be 0\n"); +- return -EINVAL; +- } +- +- /* Insert the bdev into our global name list even if it doesn't exist yet, +- * it may show up soon... +- */ +- rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones); +- if (rc) { +- return rc; +- } +- +- rc = zone_block_register(bdev_name); +- if (rc == -ENODEV) { +- /* This is not an error, even though the bdev is not present at this time it may +- * still show up later. 
+- */ +- rc = 0; +- } +- return rc; +-} +- +-void +-vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +-{ +- struct bdev_zone_block_config *name_node; +- int rc; +- +- rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg); +- if (rc == 0) { +- TAILQ_FOREACH(name_node, &g_bdev_configs, link) { +- if (strcmp(name_node->vbdev_name, name) == 0) { +- zone_block_remove_config(name_node); +- break; +- } +- } +- } else { +- cb_fn(cb_arg, rc); +- } +-} +- +-static void +-zone_block_examine(struct spdk_bdev *bdev) +-{ +- zone_block_register(bdev->name); +- +- spdk_bdev_module_examine_done(&bdev_zoned_if); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "vbdev_zone_block.h" ++ ++#include "spdk/config.h" ++#include "spdk/nvme.h" ++#include "spdk/bdev_zone.h" ++ ++#include "spdk/log.h" ++ ++static int zone_block_init(void); ++static int zone_block_get_ctx_size(void); ++static void zone_block_finish(void); ++static int zone_block_config_json(struct spdk_json_write_ctx *w); ++static void zone_block_examine(struct spdk_bdev *bdev); ++ ++static struct spdk_bdev_module bdev_zoned_if = { ++ .name = "bdev_zoned_block", ++ .module_init = zone_block_init, ++ .module_fini = zone_block_finish, ++ .config_json = zone_block_config_json, ++ .examine_config = zone_block_examine, ++ .get_ctx_size = zone_block_get_ctx_size, ++}; ++ ++SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if) ++ ++/* List of block vbdev names and their base bdevs via configuration file. ++ * Used so we can parse the conf once at init and use this list in examine(). ++ */ ++struct bdev_zone_block_config { ++ char *vbdev_name; ++ char *bdev_name; ++ uint64_t zone_capacity; ++ uint64_t optimal_open_zones; ++ TAILQ_ENTRY(bdev_zone_block_config) link; ++}; ++static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs); ++ ++struct block_zone { ++ struct spdk_bdev_zone_info zone_info; ++ pthread_spinlock_t lock; ++}; ++ ++/* List of block vbdevs and associated info for each. 
*/ ++struct bdev_zone_block { ++ struct spdk_bdev bdev; /* the block zoned bdev */ ++ struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ ++ struct block_zone *zones; /* array of zones */ ++ uint64_t num_zones; /* number of zones */ ++ uint64_t zone_capacity; /* zone capacity */ ++ uint64_t zone_shift; /* log2 of zone_size */ ++ TAILQ_ENTRY(bdev_zone_block) link; ++ struct spdk_thread *thread; /* thread where base device is opened */ ++}; ++static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes); ++ ++struct zone_block_io_channel { ++ struct spdk_io_channel *base_ch; /* IO channel of base device */ ++}; ++ ++struct zone_block_io { ++ /* vbdev to which IO was issued */ ++ struct bdev_zone_block *bdev_zone_block; ++}; ++ ++static int ++zone_block_init(void) ++{ ++ return 0; ++} ++ ++static void ++zone_block_remove_config(struct bdev_zone_block_config *name) ++{ ++ TAILQ_REMOVE(&g_bdev_configs, name, link); ++ free(name->bdev_name); ++ free(name->vbdev_name); ++ free(name); ++} ++ ++static void ++zone_block_finish(void) ++{ ++ struct bdev_zone_block_config *name; ++ ++ while ((name = TAILQ_FIRST(&g_bdev_configs))) { ++ zone_block_remove_config(name); ++ } ++} ++ ++static int ++zone_block_get_ctx_size(void) ++{ ++ return sizeof(struct zone_block_io); ++} ++ ++static int ++zone_block_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct bdev_zone_block *bdev_node; ++ struct spdk_bdev *base_bdev = NULL; ++ ++ TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) { ++ base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "bdev_zone_block_create"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); ++ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); ++ spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); ++ spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ } ++ ++ return 0; ++} ++ ++/* Callback for unregistering the IO device. */ ++static void ++_device_unregister_cb(void *io_device) ++{ ++ struct bdev_zone_block *bdev_node = io_device; ++ uint64_t i; ++ ++ free(bdev_node->bdev.name); ++ for (i = 0; i < bdev_node->num_zones; i++) { ++ pthread_spin_destroy(&bdev_node->zones[i].lock); ++ } ++ free(bdev_node->zones); ++ free(bdev_node); ++} ++ ++static void ++_zone_block_destruct(void *ctx) ++{ ++ struct spdk_bdev_desc *desc = ctx; ++ ++ spdk_bdev_close(desc); ++} ++ ++static int ++zone_block_destruct(void *ctx) ++{ ++ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; ++ ++ TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); ++ ++ /* Unclaim the underlying bdev. */ ++ spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc)); ++ ++ /* Close the underlying bdev on its same opened thread. */ ++ if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) { ++ spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc); ++ } else { ++ spdk_bdev_close(bdev_node->base_desc); ++ } ++ ++ /* Unregister the io_device. 
*/ ++ spdk_io_device_unregister(bdev_node, _device_unregister_cb); ++ ++ return 0; ++} ++ ++static struct block_zone * ++zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba) ++{ ++ size_t index = lba >> bdev_node->zone_shift; ++ ++ if (index >= bdev_node->num_zones) { ++ return NULL; ++ } ++ ++ return &bdev_node->zones[index]; ++} ++ ++static struct block_zone * ++zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba) ++{ ++ struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba); ++ ++ if (zone && zone->zone_info.zone_id == start_lba) { ++ return zone; ++ } else { ++ return NULL; ++ } ++} ++ ++static int ++zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io) ++{ ++ struct block_zone *zone; ++ struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf; ++ uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; ++ size_t i; ++ ++ /* User can request info for more zones than exist, need to check both internal and user ++ * boundaries ++ */ ++ for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) { ++ zone = zone_block_get_zone_by_slba(bdev_node, zone_id); ++ if (!zone) { ++ return -EINVAL; ++ } ++ memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info)); ++ } ++ ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++} ++ ++static int ++zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) ++{ ++ pthread_spin_lock(&zone->lock); ++ ++ switch (zone->zone_info.state) { ++ case SPDK_BDEV_ZONE_STATE_EMPTY: ++ case SPDK_BDEV_ZONE_STATE_OPEN: ++ case SPDK_BDEV_ZONE_STATE_CLOSED: ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; ++ pthread_spin_unlock(&zone->lock); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++ default: ++ pthread_spin_unlock(&zone->lock); ++ return -EINVAL; ++ } ++} ++ ++static void ++_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *orig_io = cb_arg; ++ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; ++ ++ /* Complete the original IO and then free the one that we created here ++ * as a result of issuing an IO via submit_request. 
++ */ ++ spdk_bdev_io_complete(orig_io, status); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static int ++zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, ++ struct block_zone *zone, struct spdk_bdev_io *bdev_io) ++{ ++ pthread_spin_lock(&zone->lock); ++ ++ switch (zone->zone_info.state) { ++ case SPDK_BDEV_ZONE_STATE_EMPTY: ++ pthread_spin_unlock(&zone->lock); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++ case SPDK_BDEV_ZONE_STATE_OPEN: ++ case SPDK_BDEV_ZONE_STATE_FULL: ++ case SPDK_BDEV_ZONE_STATE_CLOSED: ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY; ++ zone->zone_info.write_pointer = zone->zone_info.zone_id; ++ pthread_spin_unlock(&zone->lock); ++ ++ /* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */ ++ if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc), ++ SPDK_BDEV_IO_TYPE_UNMAP)) { ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++ } ++ ++ return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch, ++ zone->zone_info.zone_id, zone->zone_info.capacity, ++ _zone_block_complete_unmap, bdev_io); ++ default: ++ pthread_spin_unlock(&zone->lock); ++ return -EINVAL; ++ } ++} ++ ++static int ++zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) ++{ ++ pthread_spin_lock(&zone->lock); ++ ++ switch (zone->zone_info.state) { ++ case SPDK_BDEV_ZONE_STATE_OPEN: ++ case SPDK_BDEV_ZONE_STATE_CLOSED: ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED; ++ pthread_spin_unlock(&zone->lock); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++ default: ++ pthread_spin_unlock(&zone->lock); ++ return -EINVAL; ++ } ++} ++ ++static int ++zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io) ++{ ++ pthread_spin_lock(&zone->lock); ++ ++ zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; ++ ++ pthread_spin_unlock(&zone->lock); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); ++ return 0; ++} ++ ++static int ++zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, ++ struct spdk_bdev_io *bdev_io) ++{ ++ struct block_zone *zone; ++ ++ zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id); ++ if (!zone) { ++ return -EINVAL; ++ } ++ ++ switch (bdev_io->u.zone_mgmt.zone_action) { ++ case SPDK_BDEV_ZONE_RESET: ++ return zone_block_reset_zone(bdev_node, ch, zone, bdev_io); ++ case SPDK_BDEV_ZONE_OPEN: ++ return zone_block_open_zone(zone, bdev_io); ++ case SPDK_BDEV_ZONE_CLOSE: ++ return zone_block_close_zone(zone, bdev_io); ++ case SPDK_BDEV_ZONE_FINISH: ++ return zone_block_finish_zone(zone, bdev_io); ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void ++_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *orig_io = cb_arg; ++ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; ++ ++ if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) { ++ orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks; ++ } ++ ++ /* Complete the original IO and then free the one that we created here ++ * as a result of issuing an IO via submit_request. 
++ */ ++ spdk_bdev_io_complete(orig_io, status); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static int ++zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, ++ struct spdk_bdev_io *bdev_io) ++{ ++ struct block_zone *zone; ++ uint64_t len = bdev_io->u.bdev.num_blocks; ++ uint64_t lba = bdev_io->u.bdev.offset_blocks; ++ uint64_t num_blocks_left, wp; ++ int rc = 0; ++ bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND; ++ ++ if (is_append) { ++ zone = zone_block_get_zone_by_slba(bdev_node, lba); ++ } else { ++ zone = zone_block_get_zone_containing_lba(bdev_node, lba); ++ } ++ if (!zone) { ++ SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba); ++ return -EINVAL; ++ } ++ ++ pthread_spin_lock(&zone->lock); ++ ++ switch (zone->zone_info.state) { ++ case SPDK_BDEV_ZONE_STATE_OPEN: ++ case SPDK_BDEV_ZONE_STATE_EMPTY: ++ case SPDK_BDEV_ZONE_STATE_CLOSED: ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN; ++ break; ++ default: ++ SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state); ++ rc = -EINVAL; ++ goto write_fail; ++ } ++ ++ wp = zone->zone_info.write_pointer; ++ if (is_append) { ++ lba = wp; ++ } else { ++ if (lba != wp) { ++ SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n", ++ lba, wp); ++ rc = -EINVAL; ++ goto write_fail; ++ } ++ } ++ ++ num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp; ++ if (len > num_blocks_left) { ++ SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64 ++ ")\n", lba, len, wp); ++ rc = -EINVAL; ++ goto write_fail; ++ } ++ ++ zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks; ++ assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity); ++ if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) { ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; ++ } ++ pthread_spin_unlock(&zone->lock); ++ ++ rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ lba, bdev_io->u.bdev.num_blocks, ++ _zone_block_complete_write, bdev_io); ++ ++ return rc; ++ ++write_fail: ++ pthread_spin_unlock(&zone->lock); ++ return rc; ++} ++ ++static void ++_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct spdk_bdev_io *orig_io = cb_arg; ++ int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; ++ ++ /* Complete the original IO and then free the one that we created here ++ * as a result of issuing an IO via submit_request. 
++ */ ++ spdk_bdev_io_complete(orig_io, status); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static int ++zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch, ++ struct spdk_bdev_io *bdev_io) ++{ ++ struct block_zone *zone; ++ uint64_t len = bdev_io->u.bdev.num_blocks; ++ uint64_t lba = bdev_io->u.bdev.offset_blocks; ++ int rc; ++ ++ zone = zone_block_get_zone_containing_lba(bdev_node, lba); ++ if (!zone) { ++ SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba); ++ return -EINVAL; ++ } ++ ++ if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) { ++ SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len); ++ return -EINVAL; ++ } ++ ++ rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch, ++ bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, ++ bdev_io->u.bdev.md_buf, ++ lba, len, ++ _zone_block_complete_read, bdev_io); ++ ++ return rc; ++} ++ ++static void ++zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) ++{ ++ struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev); ++ struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch); ++ int rc = 0; ++ ++ switch (bdev_io->type) { ++ case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: ++ rc = zone_block_get_zone_info(bdev_node, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++ rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_ZONE_APPEND: ++ rc = zone_block_write(bdev_node, dev_ch, bdev_io); ++ break; ++ case SPDK_BDEV_IO_TYPE_READ: ++ rc = zone_block_read(bdev_node, dev_ch, bdev_io); ++ break; ++ default: ++ SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type); ++ rc = -ENOTSUP; ++ break; ++ } ++ ++ if (rc != 0) { ++ if (rc == -ENOMEM) { ++ SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n"); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); ++ } else { ++ SPDK_ERRLOG("ERROR on bdev_io submission!\n"); ++ spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); ++ } ++ } ++} ++ ++static bool ++zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) ++{ ++ switch (io_type) { ++ case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ case SPDK_BDEV_IO_TYPE_READ: ++ case SPDK_BDEV_IO_TYPE_ZONE_APPEND: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static struct spdk_io_channel * ++zone_block_get_io_channel(void *ctx) ++{ ++ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; ++ ++ return spdk_get_io_channel(bdev_node); ++} ++ ++static int ++zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) ++{ ++ struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx; ++ struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc); ++ ++ spdk_json_write_name(w, "zoned_block"); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev)); ++ spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev)); ++ spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity); ++ spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones); ++ spdk_json_write_object_end(w); ++ ++ return 0; ++} ++ ++/* When we register our vbdev this is how we specify our entry points. 
*/ ++static const struct spdk_bdev_fn_table zone_block_fn_table = { ++ .destruct = zone_block_destruct, ++ .submit_request = zone_block_submit_request, ++ .io_type_supported = zone_block_io_type_supported, ++ .get_io_channel = zone_block_get_io_channel, ++ .dump_info_json = zone_block_dump_info_json, ++}; ++ ++static void ++zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find) ++{ ++ struct bdev_zone_block *bdev_node, *tmp; ++ ++ TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) { ++ if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) { ++ spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL); ++ } ++ } ++} ++ ++static void ++zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ zone_block_base_bdev_hotremove_cb(bdev); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++static int ++_zone_block_ch_create_cb(void *io_device, void *ctx_buf) ++{ ++ struct zone_block_io_channel *bdev_ch = ctx_buf; ++ struct bdev_zone_block *bdev_node = io_device; ++ ++ bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc); ++ if (!bdev_ch->base_ch) { ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static void ++_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf) ++{ ++ struct zone_block_io_channel *bdev_ch = ctx_buf; ++ ++ spdk_put_io_channel(bdev_ch->base_ch); ++} ++ ++static int ++zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, ++ uint64_t optimal_open_zones) ++{ ++ struct bdev_zone_block_config *name; ++ ++ TAILQ_FOREACH(name, &g_bdev_configs, link) { ++ if (strcmp(vbdev_name, name->vbdev_name) == 0) { ++ SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name); ++ return -EEXIST; ++ } ++ if (strcmp(bdev_name, name->bdev_name) == 0) { ++ SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name); ++ return -EEXIST; ++ } ++ } ++ ++ name = calloc(1, sizeof(*name)); ++ if (!name) { ++ SPDK_ERRLOG("could not allocate bdev_names\n"); ++ return -ENOMEM; ++ } ++ ++ name->bdev_name = strdup(bdev_name); ++ if (!name->bdev_name) { ++ SPDK_ERRLOG("could not allocate name->bdev_name\n"); ++ free(name); ++ return -ENOMEM; ++ } ++ ++ name->vbdev_name = strdup(vbdev_name); ++ if (!name->vbdev_name) { ++ SPDK_ERRLOG("could not allocate name->vbdev_name\n"); ++ free(name->bdev_name); ++ free(name); ++ return -ENOMEM; ++ } ++ ++ name->zone_capacity = zone_capacity; ++ name->optimal_open_zones = optimal_open_zones; ++ ++ TAILQ_INSERT_TAIL(&g_bdev_configs, name, link); ++ ++ return 0; ++} ++ ++static int ++zone_block_init_zone_info(struct bdev_zone_block *bdev_node) ++{ ++ size_t i; ++ struct block_zone *zone; ++ int rc = 0; ++ ++ for (i = 0; i < bdev_node->num_zones; i++) { ++ zone = &bdev_node->zones[i]; ++ zone->zone_info.zone_id = bdev_node->bdev.zone_size * i; ++ zone->zone_info.capacity = bdev_node->zone_capacity; ++ zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity; ++ zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL; ++ zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR; ++ if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) { ++ SPDK_ERRLOG("pthread_spin_init() failed\n"); ++ rc = -ENOMEM; ++ break; ++ } ++ } ++ ++ if (rc) { ++ for (; i > 0; i--) { ++ pthread_spin_destroy(&bdev_node->zones[i - 1].lock); ++ } ++ } ++ ++ return rc; ++} ++ ++static int ++zone_block_register(const char *base_bdev_name) ++{ ++ struct 
spdk_bdev_desc *base_desc; ++ struct spdk_bdev *base_bdev; ++ struct bdev_zone_block_config *name, *tmp; ++ struct bdev_zone_block *bdev_node; ++ uint64_t zone_size; ++ int rc = 0; ++ ++ /* Check our list of names from config versus this bdev and if ++ * there's a match, create the bdev_node & bdev accordingly. ++ */ ++ TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) { ++ if (strcmp(name->bdev_name, base_bdev_name) != 0) { ++ continue; ++ } ++ ++ rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb, ++ NULL, &base_desc); ++ if (rc == -ENODEV) { ++ return -ENODEV; ++ } else if (rc) { ++ SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name); ++ goto free_config; ++ } ++ ++ base_bdev = spdk_bdev_desc_get_bdev(base_desc); ++ ++ if (spdk_bdev_is_zoned(base_bdev)) { ++ SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name); ++ rc = -EEXIST; ++ goto zone_exist; ++ } ++ ++ bdev_node = calloc(1, sizeof(struct bdev_zone_block)); ++ if (!bdev_node) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("could not allocate bdev_node\n"); ++ goto zone_exist; ++ } ++ ++ bdev_node->base_desc = base_desc; ++ ++ /* The base bdev that we're attaching to. */ ++ bdev_node->bdev.name = strdup(name->vbdev_name); ++ if (!bdev_node->bdev.name) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("could not allocate bdev_node name\n"); ++ goto strdup_failed; ++ } ++ ++ zone_size = spdk_align64pow2(name->zone_capacity); ++ if (zone_size == 0) { ++ rc = -EINVAL; ++ SPDK_ERRLOG("invalid zone size\n"); ++ goto roundup_failed; ++ } ++ ++ bdev_node->zone_shift = spdk_u64log2(zone_size); ++ bdev_node->num_zones = base_bdev->blockcnt / zone_size; ++ ++ bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone)); ++ if (!bdev_node->zones) { ++ rc = -ENOMEM; ++ SPDK_ERRLOG("could not allocate zones\n"); ++ goto calloc_failed; ++ } ++ ++ bdev_node->bdev.product_name = "zone_block"; ++ ++ /* Copy some properties from the underlying base bdev. 
*/ ++ bdev_node->bdev.write_cache = base_bdev->write_cache; ++ bdev_node->bdev.required_alignment = base_bdev->required_alignment; ++ bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary; ++ ++ bdev_node->bdev.blocklen = base_bdev->blocklen; ++ bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size; ++ ++ if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) { ++ SPDK_DEBUGLOG(vbdev_zone_block, ++ "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n", ++ base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity); ++ } ++ ++ bdev_node->bdev.write_unit_size = base_bdev->write_unit_size; ++ ++ bdev_node->bdev.md_interleave = base_bdev->md_interleave; ++ bdev_node->bdev.md_len = base_bdev->md_len; ++ bdev_node->bdev.dif_type = base_bdev->dif_type; ++ bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md; ++ bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags; ++ ++ bdev_node->bdev.zoned = true; ++ bdev_node->bdev.ctxt = bdev_node; ++ bdev_node->bdev.fn_table = &zone_block_fn_table; ++ bdev_node->bdev.module = &bdev_zoned_if; ++ ++ /* bdev specific info */ ++ bdev_node->bdev.zone_size = zone_size; ++ ++ bdev_node->zone_capacity = name->zone_capacity; ++ bdev_node->bdev.optimal_open_zones = name->optimal_open_zones; ++ bdev_node->bdev.max_open_zones = 0; ++ rc = zone_block_init_zone_info(bdev_node); ++ if (rc) { ++ SPDK_ERRLOG("could not init zone info\n"); ++ goto zone_info_failed; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link); ++ ++ spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb, ++ sizeof(struct zone_block_io_channel), ++ name->vbdev_name); ++ ++ /* Save the thread where the base device is opened */ ++ bdev_node->thread = spdk_get_thread(); ++ ++ rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module); ++ if (rc) { ++ SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name); ++ goto claim_failed; ++ } ++ ++ rc = spdk_bdev_register(&bdev_node->bdev); ++ if (rc) { ++ SPDK_ERRLOG("could not register zoned bdev\n"); ++ goto register_failed; ++ } ++ } ++ ++ return rc; ++ ++register_failed: ++ spdk_bdev_module_release_bdev(&bdev_node->bdev); ++claim_failed: ++ TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link); ++ spdk_io_device_unregister(bdev_node, NULL); ++zone_info_failed: ++ free(bdev_node->zones); ++calloc_failed: ++roundup_failed: ++ free(bdev_node->bdev.name); ++strdup_failed: ++ free(bdev_node); ++zone_exist: ++ spdk_bdev_close(base_desc); ++free_config: ++ zone_block_remove_config(name); ++ return rc; ++} ++ ++int ++vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity, ++ uint64_t optimal_open_zones) ++{ ++ int rc = 0; ++ ++ if (zone_capacity == 0) { ++ SPDK_ERRLOG("Zone capacity can't be 0\n"); ++ return -EINVAL; ++ } ++ ++ if (optimal_open_zones == 0) { ++ SPDK_ERRLOG("Optimal open zones can't be 0\n"); ++ return -EINVAL; ++ } ++ ++ /* Insert the bdev into our global name list even if it doesn't exist yet, ++ * it may show up soon... ++ */ ++ rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones); ++ if (rc) { ++ return rc; ++ } ++ ++ rc = zone_block_register(bdev_name); ++ if (rc == -ENODEV) { ++ /* This is not an error, even though the bdev is not present at this time it may ++ * still show up later. 
++ */ ++ rc = 0; ++ } ++ return rc; ++} ++ ++void ++vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg) ++{ ++ struct bdev_zone_block_config *name_node; ++ int rc; ++ ++ rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg); ++ if (rc == 0) { ++ TAILQ_FOREACH(name_node, &g_bdev_configs, link) { ++ if (strcmp(name_node->vbdev_name, name) == 0) { ++ zone_block_remove_config(name_node); ++ break; ++ } ++ } ++ } else { ++ cb_fn(cb_arg, rc); ++ } ++} ++ ++static void ++zone_block_examine(struct spdk_bdev *bdev) ++{ ++ zone_block_register(bdev->name); ++ ++ spdk_bdev_module_examine_done(&bdev_zoned_if); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block) +diff --git a/module/bdev/zone_block/vbdev_zone_block.h b/module/bdev/zone_block/vbdev_zone_block.h +index f5c8c16..4aeabd1 100644 +--- a/module/bdev/zone_block/vbdev_zone_block.h ++++ b/module/bdev/zone_block/vbdev_zone_block.h +@@ -1,19 +1,19 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_VBDEV_ZONE_BLOCK_H +-#define SPDK_VBDEV_ZONE_BLOCK_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +- +-int vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, +- uint64_t zone_capacity, uint64_t optimal_open_zones); +- +-void vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg); +- +-#endif /* SPDK_VBDEV_ZONE_BLOCK_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_VBDEV_ZONE_BLOCK_H ++#define SPDK_VBDEV_ZONE_BLOCK_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++ ++int vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, ++ uint64_t zone_capacity, uint64_t optimal_open_zones); ++ ++void vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg); ++ ++#endif /* SPDK_VBDEV_ZONE_BLOCK_H */ +diff --git a/module/bdev/zone_block/vbdev_zone_block_rpc.c b/module/bdev/zone_block/vbdev_zone_block_rpc.c +index 59b5ca0..2700805 100644 +--- a/module/bdev/zone_block/vbdev_zone_block_rpc.c ++++ b/module/bdev/zone_block/vbdev_zone_block_rpc.c +@@ -1,115 +1,115 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "vbdev_zone_block.h" +- +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk/rpc.h" +- +-#include "spdk/log.h" +- +-struct rpc_construct_zone_block { +- char *name; +- char *base_bdev; +- uint64_t zone_capacity; +- uint64_t optimal_open_zones; +-}; +- +-static void +-free_rpc_construct_zone_block(struct rpc_construct_zone_block *req) +-{ +- free(req->name); +- free(req->base_bdev); +-} +- +-static const struct spdk_json_object_decoder rpc_construct_zone_block_decoders[] = { +- {"name", offsetof(struct rpc_construct_zone_block, name), spdk_json_decode_string}, +- {"base_bdev", offsetof(struct rpc_construct_zone_block, base_bdev), spdk_json_decode_string}, +- {"zone_capacity", offsetof(struct rpc_construct_zone_block, zone_capacity), spdk_json_decode_uint64}, +- {"optimal_open_zones", offsetof(struct rpc_construct_zone_block, optimal_open_zones), spdk_json_decode_uint64}, +-}; +- +-static void +-rpc_zone_block_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_construct_zone_block req = {}; +- struct spdk_json_write_ctx *w; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_zone_block_decoders, +- SPDK_COUNTOF(rpc_construct_zone_block_decoders), +- &req)) { +- SPDK_ERRLOG("Failed to decode block create parameters"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto cleanup; +- } +- +- rc = vbdev_zone_block_create(req.base_bdev, req.name, req.zone_capacity, +- req.optimal_open_zones); +- if (rc) { +- SPDK_ERRLOG("Failed to create block zoned vbdev: %s", spdk_strerror(-rc)); +- spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Failed to create block zoned vbdev: %s", +- spdk_strerror(-rc)); +- goto cleanup; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_string(w, req.name); +- spdk_jsonrpc_end_result(request, w); +- +-cleanup: +- free_rpc_construct_zone_block(&req); +-} +-SPDK_RPC_REGISTER("bdev_zone_block_create", rpc_zone_block_create, SPDK_RPC_RUNTIME) +- +-struct rpc_delete_zone_block { +- char *name; +-}; +- +-static void +-free_rpc_delete_zone_block(struct rpc_delete_zone_block *req) +-{ +- free(req->name); +-} +- +-static const struct spdk_json_object_decoder rpc_delete_zone_block_decoders[] = { +- {"name", offsetof(struct rpc_delete_zone_block, name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_delete_zone_block_cb(void *cb_ctx, int rc) +-{ +- struct spdk_jsonrpc_request *request = cb_ctx; +- +- spdk_jsonrpc_send_bool_response(request, rc == 0); +-} +- +-static void +-rpc_zone_block_delete(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_zone_block attrs = {}; +- +- if (spdk_json_decode_object(params, rpc_delete_zone_block_decoders, +- SPDK_COUNTOF(rpc_delete_zone_block_decoders), +- &attrs)) { +- SPDK_ERRLOG("Failed to decode block delete parameters"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- goto cleanup; +- } +- +- vbdev_zone_block_delete(attrs.name, _rpc_delete_zone_block_cb, request); +- +-cleanup: +- free_rpc_delete_zone_block(&attrs); +-} +-SPDK_RPC_REGISTER("bdev_zone_block_delete", rpc_zone_block_delete, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "vbdev_zone_block.h" ++ ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++ ++#include "spdk/log.h" ++ ++struct rpc_construct_zone_block { ++ char *name; ++ char *base_bdev; ++ uint64_t zone_capacity; ++ uint64_t optimal_open_zones; ++}; ++ ++static void ++free_rpc_construct_zone_block(struct rpc_construct_zone_block *req) ++{ ++ free(req->name); ++ free(req->base_bdev); ++} ++ ++static const struct spdk_json_object_decoder rpc_construct_zone_block_decoders[] = { ++ {"name", offsetof(struct rpc_construct_zone_block, name), spdk_json_decode_string}, ++ {"base_bdev", offsetof(struct rpc_construct_zone_block, base_bdev), spdk_json_decode_string}, ++ {"zone_capacity", offsetof(struct rpc_construct_zone_block, zone_capacity), spdk_json_decode_uint64}, ++ {"optimal_open_zones", offsetof(struct rpc_construct_zone_block, optimal_open_zones), spdk_json_decode_uint64}, ++}; ++ ++static void ++rpc_zone_block_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_construct_zone_block req = {}; ++ struct spdk_json_write_ctx *w; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_zone_block_decoders, ++ SPDK_COUNTOF(rpc_construct_zone_block_decoders), ++ &req)) { ++ SPDK_ERRLOG("Failed to decode block create parameters"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto cleanup; ++ } ++ ++ rc = vbdev_zone_block_create(req.base_bdev, req.name, req.zone_capacity, ++ req.optimal_open_zones); ++ if (rc) { ++ SPDK_ERRLOG("Failed to create block zoned vbdev: %s", spdk_strerror(-rc)); ++ spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Failed to create block zoned vbdev: %s", ++ spdk_strerror(-rc)); ++ goto cleanup; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_string(w, req.name); ++ spdk_jsonrpc_end_result(request, w); ++ ++cleanup: ++ free_rpc_construct_zone_block(&req); ++} ++SPDK_RPC_REGISTER("bdev_zone_block_create", rpc_zone_block_create, SPDK_RPC_RUNTIME) ++ ++struct rpc_delete_zone_block { ++ char *name; ++}; ++ ++static void ++free_rpc_delete_zone_block(struct rpc_delete_zone_block *req) ++{ ++ free(req->name); ++} ++ ++static const struct spdk_json_object_decoder rpc_delete_zone_block_decoders[] = { ++ {"name", offsetof(struct rpc_delete_zone_block, name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_delete_zone_block_cb(void *cb_ctx, int rc) ++{ ++ struct spdk_jsonrpc_request *request = cb_ctx; ++ ++ spdk_jsonrpc_send_bool_response(request, rc == 0); ++} ++ ++static void ++rpc_zone_block_delete(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_zone_block attrs = {}; ++ ++ if (spdk_json_decode_object(params, rpc_delete_zone_block_decoders, ++ SPDK_COUNTOF(rpc_delete_zone_block_decoders), ++ &attrs)) { ++ SPDK_ERRLOG("Failed to decode block delete parameters"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ goto cleanup; ++ } ++ ++ vbdev_zone_block_delete(attrs.name, _rpc_delete_zone_block_cb, request); ++ ++cleanup: ++ free_rpc_delete_zone_block(&attrs); ++} ++SPDK_RPC_REGISTER("bdev_zone_block_delete", rpc_zone_block_delete, SPDK_RPC_RUNTIME) +diff --git a/module/blob/Makefile b/module/blob/Makefile +index 17aa8dd..e8ad484 100644 +--- a/module/blob/Makefile ++++ b/module/blob/Makefile +@@ -1,16 +1,16 @@ +-# 
SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y = bdev +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y = bdev ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/blob/bdev/Makefile b/module/blob/bdev/Makefile +index bffa660..ed4caca 100644 +--- a/module/blob/bdev/Makefile ++++ b/module/blob/bdev/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 9 +-SO_MINOR := 0 +- +-C_SRCS = blob_bdev.c +-LIBNAME = blob_bdev +- +-SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob_bdev.map) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 9 ++SO_MINOR := 0 ++ ++C_SRCS = blob_bdev.c ++LIBNAME = blob_bdev ++ ++SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blob_bdev.map) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/blob/bdev/blob_bdev.c b/module/blob/bdev/blob_bdev.c +index 44e7b48..91a6a4c 100644 +--- a/module/blob/bdev/blob_bdev.c ++++ b/module/blob/bdev/blob_bdev.c +@@ -1,452 +1,452 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/blob_bdev.h" +-#include "spdk/blob.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +-#include "spdk/endian.h" +-#define __SPDK_BDEV_MODULE_ONLY +-#include "spdk/bdev_module.h" +- +-struct blob_bdev { +- struct spdk_bs_dev bs_dev; +- struct spdk_bdev *bdev; +- struct spdk_bdev_desc *desc; +- bool claimed; +-}; +- +-struct blob_resubmit { +- struct spdk_bdev_io_wait_entry bdev_io_wait; +- enum spdk_bdev_io_type io_type; +- struct spdk_bs_dev *dev; +- struct spdk_io_channel *channel; +- void *payload; +- int iovcnt; +- uint64_t lba; +- uint64_t src_lba; +- uint32_t lba_count; +- struct spdk_bs_dev_cb_args *cb_args; +- struct spdk_blob_ext_io_opts *ext_io_opts; +-}; +-static void bdev_blob_resubmit(void *); +- +-static inline struct spdk_bdev_desc * +-__get_desc(struct spdk_bs_dev *dev) +-{ +- return ((struct blob_bdev *)dev)->desc; +-} +- +-static inline struct spdk_bdev * +-__get_bdev(struct spdk_bs_dev *dev) +-{ +- return ((struct blob_bdev *)dev)->bdev; +-} +- +-static void +-bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg) +-{ +- struct spdk_bs_dev_cb_args *cb_args = arg; +- int bserrno; +- +- if (success) { +- bserrno = 0; +- } else { +- bserrno = -EIO; +- } +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); +- spdk_bdev_free_io(bdev_io); +-} +- +-static void +-bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, +- int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count, +- enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args, +- struct spdk_blob_ext_io_opts *ext_io_opts) +-{ +- int rc; +- struct spdk_bdev *bdev = __get_bdev(dev); +- struct blob_resubmit *ctx; +- +- ctx = calloc(1, sizeof(struct blob_resubmit)); +- +- if (ctx == NULL) { +- SPDK_ERRLOG("Not enough memory to queue io\n"); +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -ENOMEM); +- return; +- } +- +- ctx->io_type = io_type; +- ctx->dev = dev; +- ctx->channel = channel; +- ctx->payload = payload; +- ctx->iovcnt = iovcnt; +- ctx->lba = lba; +- ctx->src_lba = src_lba; +- ctx->lba_count = lba_count; +- ctx->cb_args = cb_args; +- ctx->bdev_io_wait.bdev = bdev; +- ctx->bdev_io_wait.cb_fn = bdev_blob_resubmit; +- ctx->bdev_io_wait.cb_arg = ctx; +- ctx->ext_io_opts = ext_io_opts; +- +- rc = spdk_bdev_queue_io_wait(bdev, channel, &ctx->bdev_io_wait); +- if (rc != 0) { +- SPDK_ERRLOG("Queue io failed, rc=%d\n", rc); +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- free(ctx); +- assert(false); +- } +-} +- +-static void +-bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba, +- lba_count, bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, +- lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba, +- lba_count, bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, +- lba_count, 
SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, +- struct iovec *iov, int iovcnt, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba, +- lba_count, bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, +- lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, +- struct iovec *iov, int iovcnt, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba, +- lba_count, bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, +- lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, +- struct iovec *iov, int iovcnt, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args, +- struct spdk_blob_ext_io_opts *io_opts) +-{ +- struct spdk_bdev_ext_io_opts *bdev_io_opts = NULL; +- int rc; +- +- if (io_opts) { +- /* bdev ext API requires ext_io_opts to be allocated by the user, we don't have enough context to allocate +- * bdev ext_opts structure here. Also blob and bdev ext_opts are not API/ABI compatible, so we can't use the given +- * io_opts. Restore ext_opts passed by the user of this bs_dev */ +- bdev_io_opts = io_opts->user_ctx; +- assert(bdev_io_opts); +- } +- +- rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, +- bdev_blob_io_complete, cb_args, bdev_io_opts); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, +- io_opts); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, +- struct iovec *iov, int iovcnt, +- uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args, +- struct spdk_blob_ext_io_opts *io_opts) +-{ +- struct spdk_bdev_ext_io_opts *bdev_io_opts = NULL; +- int rc; +- +- if (io_opts) { +- /* bdev ext API requires ext_io_opts to be allocated by the user, we don't have enough context to allocate +- * bdev ext_opts structure here. Also blob and bdev ext_opts are not API/ABI compatible, so we can't use the given +- * io_opts. 
Restore ext_opts passed by the user of this bs_dev */ +- bdev_io_opts = io_opts->user_ctx; +- assert(bdev_io_opts); +- } +- +- rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, +- bdev_blob_io_complete, cb_args, bdev_io_opts); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, +- io_opts); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, +- uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba, +- lba_count, bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, +- lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, +- uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +-{ +- struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; +- int rc; +- +- if (spdk_bdev_io_type_supported(blob_bdev->bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { +- rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count, +- bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, +- lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +- } else { +- /* +- * If the device doesn't support unmap, immediately complete +- * the request. Blobstore does not rely on unmap zeroing +- * data. 
+- */ +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +- } +-} +- +-static void +-bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, +- uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count, +- struct spdk_bs_dev_cb_args *cb_args) +-{ +- int rc; +- +- rc = spdk_bdev_copy_blocks(__get_desc(dev), channel, +- dst_lba, src_lba, lba_count, +- bdev_blob_io_complete, cb_args); +- if (rc == -ENOMEM) { +- bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba, +- lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL); +- } else if (rc != 0) { +- cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); +- } +-} +- +-static void +-bdev_blob_resubmit(void *arg) +-{ +- struct blob_resubmit *ctx = (struct blob_resubmit *) arg; +- +- switch (ctx->io_type) { +- case SPDK_BDEV_IO_TYPE_READ: +- if (ctx->iovcnt > 0) { +- bdev_blob_readv_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt, +- ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts); +- } else { +- bdev_blob_read(ctx->dev, ctx->channel, ctx->payload, +- ctx->lba, ctx->lba_count, ctx->cb_args); +- } +- break; +- case SPDK_BDEV_IO_TYPE_WRITE: +- if (ctx->iovcnt > 0) { +- bdev_blob_writev_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt, +- ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts); +- } else { +- bdev_blob_write(ctx->dev, ctx->channel, ctx->payload, +- ctx->lba, ctx->lba_count, ctx->cb_args); +- } +- break; +- case SPDK_BDEV_IO_TYPE_UNMAP: +- bdev_blob_unmap(ctx->dev, ctx->channel, +- ctx->lba, ctx->lba_count, ctx->cb_args); +- break; +- case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: +- bdev_blob_write_zeroes(ctx->dev, ctx->channel, +- ctx->lba, ctx->lba_count, ctx->cb_args); +- break; +- case SPDK_BDEV_IO_TYPE_COPY: +- bdev_blob_copy(ctx->dev, ctx->channel, +- ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args); +- break; +- default: +- SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type); +- assert(false); +- break; +- } +- free(ctx); +-} +- +-int +-spdk_bs_bdev_claim(struct spdk_bs_dev *bs_dev, struct spdk_bdev_module *module) +-{ +- struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; +- int rc; +- +- rc = spdk_bdev_module_claim_bdev(blob_bdev->bdev, NULL, module); +- if (rc != 0) { +- SPDK_ERRLOG("could not claim bs dev\n"); +- return rc; +- } +- +- blob_bdev->claimed = true; +- +- return rc; +-} +- +-static struct spdk_io_channel * +-bdev_blob_create_channel(struct spdk_bs_dev *dev) +-{ +- struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; +- +- return spdk_bdev_get_io_channel(blob_bdev->desc); +-} +- +-static void +-bdev_blob_destroy_channel(struct spdk_bs_dev *dev, struct spdk_io_channel *channel) +-{ +- spdk_put_io_channel(channel); +-} +- +-static void +-bdev_blob_destroy(struct spdk_bs_dev *bs_dev) +-{ +- struct spdk_bdev_desc *desc = __get_desc(bs_dev); +- struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; +- +- if (blob_bdev->claimed) { +- spdk_bdev_module_release_bdev(blob_bdev->bdev); +- } +- +- spdk_bdev_close(desc); +- free(bs_dev); +-} +- +-static struct spdk_bdev * +-bdev_blob_get_base_bdev(struct spdk_bs_dev *bs_dev) +-{ +- return __get_bdev(bs_dev); +-} +- +-static bool +-bdev_blob_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count) +-{ +- return false; +-} +- +-static bool +-bdev_blob_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba) +-{ +- *base_lba = lba; +- return true; +-} +- +-static void +-blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc) +-{ +- struct spdk_bdev *bdev; 
+- +- bdev = spdk_bdev_desc_get_bdev(desc); +- assert(bdev != NULL); +- +- b->bdev = bdev; +- b->desc = desc; +- b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev); +- b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev); +- b->bs_dev.create_channel = bdev_blob_create_channel; +- b->bs_dev.destroy_channel = bdev_blob_destroy_channel; +- b->bs_dev.destroy = bdev_blob_destroy; +- b->bs_dev.read = bdev_blob_read; +- b->bs_dev.write = bdev_blob_write; +- b->bs_dev.readv = bdev_blob_readv; +- b->bs_dev.writev = bdev_blob_writev; +- b->bs_dev.readv_ext = bdev_blob_readv_ext; +- b->bs_dev.writev_ext = bdev_blob_writev_ext; +- b->bs_dev.write_zeroes = bdev_blob_write_zeroes; +- b->bs_dev.unmap = bdev_blob_unmap; +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { +- b->bs_dev.copy = bdev_blob_copy; +- } +- b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev; +- b->bs_dev.is_zeroes = bdev_blob_is_zeroes; +- b->bs_dev.translate_lba = bdev_blob_translate_lba; +-} +- +-int +-spdk_bdev_create_bs_dev_ext(const char *bdev_name, spdk_bdev_event_cb_t event_cb, +- void *event_ctx, struct spdk_bs_dev **_bs_dev) +-{ +- struct blob_bdev *b; +- struct spdk_bdev_desc *desc; +- int rc; +- +- b = calloc(1, sizeof(*b)); +- +- if (b == NULL) { +- SPDK_ERRLOG("could not allocate blob_bdev\n"); +- return -ENOMEM; +- } +- +- rc = spdk_bdev_open_ext(bdev_name, true, event_cb, event_ctx, &desc); +- if (rc != 0) { +- free(b); +- return rc; +- } +- +- blob_bdev_init(b, desc); +- +- *_bs_dev = &b->bs_dev; +- +- return 0; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/blob_bdev.h" ++#include "spdk/blob.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++#include "spdk/endian.h" ++#define __SPDK_BDEV_MODULE_ONLY ++#include "spdk/bdev_module.h" ++ ++struct blob_bdev { ++ struct spdk_bs_dev bs_dev; ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_desc *desc; ++ bool claimed; ++}; ++ ++struct blob_resubmit { ++ struct spdk_bdev_io_wait_entry bdev_io_wait; ++ enum spdk_bdev_io_type io_type; ++ struct spdk_bs_dev *dev; ++ struct spdk_io_channel *channel; ++ void *payload; ++ int iovcnt; ++ uint64_t lba; ++ uint64_t src_lba; ++ uint32_t lba_count; ++ struct spdk_bs_dev_cb_args *cb_args; ++ struct spdk_blob_ext_io_opts *ext_io_opts; ++}; ++static void bdev_blob_resubmit(void *); ++ ++static inline struct spdk_bdev_desc * ++__get_desc(struct spdk_bs_dev *dev) ++{ ++ return ((struct blob_bdev *)dev)->desc; ++} ++ ++static inline struct spdk_bdev * ++__get_bdev(struct spdk_bs_dev *dev) ++{ ++ return ((struct blob_bdev *)dev)->bdev; ++} ++ ++static void ++bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg) ++{ ++ struct spdk_bs_dev_cb_args *cb_args = arg; ++ int bserrno; ++ ++ if (success) { ++ bserrno = 0; ++ } else { ++ bserrno = -EIO; ++ } ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); ++ spdk_bdev_free_io(bdev_io); ++} ++ ++static void ++bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, ++ int iovcnt, uint64_t lba, uint64_t src_lba, uint32_t lba_count, ++ enum spdk_bdev_io_type io_type, struct spdk_bs_dev_cb_args *cb_args, ++ struct spdk_blob_ext_io_opts *ext_io_opts) ++{ ++ int rc; ++ struct spdk_bdev *bdev = __get_bdev(dev); ++ struct blob_resubmit *ctx; ++ ++ ctx = calloc(1, sizeof(struct blob_resubmit)); ++ ++ if (ctx == NULL) { ++ 
SPDK_ERRLOG("Not enough memory to queue io\n"); ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -ENOMEM); ++ return; ++ } ++ ++ ctx->io_type = io_type; ++ ctx->dev = dev; ++ ctx->channel = channel; ++ ctx->payload = payload; ++ ctx->iovcnt = iovcnt; ++ ctx->lba = lba; ++ ctx->src_lba = src_lba; ++ ctx->lba_count = lba_count; ++ ctx->cb_args = cb_args; ++ ctx->bdev_io_wait.bdev = bdev; ++ ctx->bdev_io_wait.cb_fn = bdev_blob_resubmit; ++ ctx->bdev_io_wait.cb_arg = ctx; ++ ctx->ext_io_opts = ext_io_opts; ++ ++ rc = spdk_bdev_queue_io_wait(bdev, channel, &ctx->bdev_io_wait); ++ if (rc != 0) { ++ SPDK_ERRLOG("Queue io failed, rc=%d\n", rc); ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ free(ctx); ++ assert(false); ++ } ++} ++ ++static void ++bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba, ++ lba_count, bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba, ++ lba_count, bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, payload, 0, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, ++ struct iovec *iov, int iovcnt, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba, ++ lba_count, bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, ++ struct iovec *iov, int iovcnt, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba, ++ lba_count, bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_readv_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, ++ struct iovec *iov, int iovcnt, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args, ++ struct spdk_blob_ext_io_opts *io_opts) ++{ ++ struct spdk_bdev_ext_io_opts *bdev_io_opts = NULL; ++ int rc; ++ ++ if (io_opts) { ++ /* bdev ext API requires ext_io_opts to be allocated by the user, we don't have enough context to allocate ++ * bdev ext_opts structure here. Also blob and bdev ext_opts are not API/ABI compatible, so we can't use the given ++ * io_opts. 
Restore ext_opts passed by the user of this bs_dev */ ++ bdev_io_opts = io_opts->user_ctx; ++ assert(bdev_io_opts); ++ } ++ ++ rc = spdk_bdev_readv_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, ++ bdev_blob_io_complete, cb_args, bdev_io_opts); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args, ++ io_opts); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_writev_ext(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, ++ struct iovec *iov, int iovcnt, ++ uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args, ++ struct spdk_blob_ext_io_opts *io_opts) ++{ ++ struct spdk_bdev_ext_io_opts *bdev_io_opts = NULL; ++ int rc; ++ ++ if (io_opts) { ++ /* bdev ext API requires ext_io_opts to be allocated by the user, we don't have enough context to allocate ++ * bdev ext_opts structure here. Also blob and bdev ext_opts are not API/ABI compatible, so we can't use the given ++ * io_opts. Restore ext_opts passed by the user of this bs_dev */ ++ bdev_io_opts = io_opts->user_ctx; ++ assert(bdev_io_opts); ++ } ++ ++ rc = spdk_bdev_writev_blocks_ext(__get_desc(dev), channel, iov, iovcnt, lba, lba_count, ++ bdev_blob_io_complete, cb_args, bdev_io_opts); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, 0, lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args, ++ io_opts); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, ++ uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba, ++ lba_count, bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, ++ uint64_t lba_count, struct spdk_bs_dev_cb_args *cb_args) ++{ ++ struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; ++ int rc; ++ ++ if (spdk_bdev_io_type_supported(blob_bdev->bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { ++ rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count, ++ bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, NULL, 0, lba, 0, ++ lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++ } else { ++ /* ++ * If the device doesn't support unmap, immediately complete ++ * the request. Blobstore does not rely on unmap zeroing ++ * data. 
++ */ ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); ++ } ++} ++ ++static void ++bdev_blob_copy(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, ++ uint64_t dst_lba, uint64_t src_lba, uint64_t lba_count, ++ struct spdk_bs_dev_cb_args *cb_args) ++{ ++ int rc; ++ ++ rc = spdk_bdev_copy_blocks(__get_desc(dev), channel, ++ dst_lba, src_lba, lba_count, ++ bdev_blob_io_complete, cb_args); ++ if (rc == -ENOMEM) { ++ bdev_blob_queue_io(dev, channel, NULL, 0, dst_lba, src_lba, ++ lba_count, SPDK_BDEV_IO_TYPE_COPY, cb_args, NULL); ++ } else if (rc != 0) { ++ cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); ++ } ++} ++ ++static void ++bdev_blob_resubmit(void *arg) ++{ ++ struct blob_resubmit *ctx = (struct blob_resubmit *) arg; ++ ++ switch (ctx->io_type) { ++ case SPDK_BDEV_IO_TYPE_READ: ++ if (ctx->iovcnt > 0) { ++ bdev_blob_readv_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt, ++ ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts); ++ } else { ++ bdev_blob_read(ctx->dev, ctx->channel, ctx->payload, ++ ctx->lba, ctx->lba_count, ctx->cb_args); ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE: ++ if (ctx->iovcnt > 0) { ++ bdev_blob_writev_ext(ctx->dev, ctx->channel, (struct iovec *) ctx->payload, ctx->iovcnt, ++ ctx->lba, ctx->lba_count, ctx->cb_args, ctx->ext_io_opts); ++ } else { ++ bdev_blob_write(ctx->dev, ctx->channel, ctx->payload, ++ ctx->lba, ctx->lba_count, ctx->cb_args); ++ } ++ break; ++ case SPDK_BDEV_IO_TYPE_UNMAP: ++ bdev_blob_unmap(ctx->dev, ctx->channel, ++ ctx->lba, ctx->lba_count, ctx->cb_args); ++ break; ++ case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: ++ bdev_blob_write_zeroes(ctx->dev, ctx->channel, ++ ctx->lba, ctx->lba_count, ctx->cb_args); ++ break; ++ case SPDK_BDEV_IO_TYPE_COPY: ++ bdev_blob_copy(ctx->dev, ctx->channel, ++ ctx->lba, ctx->src_lba, ctx->lba_count, ctx->cb_args); ++ break; ++ default: ++ SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type); ++ assert(false); ++ break; ++ } ++ free(ctx); ++} ++ ++int ++spdk_bs_bdev_claim(struct spdk_bs_dev *bs_dev, struct spdk_bdev_module *module) ++{ ++ struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; ++ int rc; ++ ++ rc = spdk_bdev_module_claim_bdev(blob_bdev->bdev, NULL, module); ++ if (rc != 0) { ++ SPDK_ERRLOG("could not claim bs dev\n"); ++ return rc; ++ } ++ ++ blob_bdev->claimed = true; ++ ++ return rc; ++} ++ ++static struct spdk_io_channel * ++bdev_blob_create_channel(struct spdk_bs_dev *dev) ++{ ++ struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; ++ ++ return spdk_bdev_get_io_channel(blob_bdev->desc); ++} ++ ++static void ++bdev_blob_destroy_channel(struct spdk_bs_dev *dev, struct spdk_io_channel *channel) ++{ ++ spdk_put_io_channel(channel); ++} ++ ++static void ++bdev_blob_destroy(struct spdk_bs_dev *bs_dev) ++{ ++ struct spdk_bdev_desc *desc = __get_desc(bs_dev); ++ struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; ++ ++ if (blob_bdev->claimed) { ++ spdk_bdev_module_release_bdev(blob_bdev->bdev); ++ } ++ ++ spdk_bdev_close(desc); ++ free(bs_dev); ++} ++ ++static struct spdk_bdev * ++bdev_blob_get_base_bdev(struct spdk_bs_dev *bs_dev) ++{ ++ return __get_bdev(bs_dev); ++} ++ ++static bool ++bdev_blob_is_zeroes(struct spdk_bs_dev *dev, uint64_t lba, uint64_t lba_count) ++{ ++ return false; ++} ++ ++static bool ++bdev_blob_translate_lba(struct spdk_bs_dev *dev, uint64_t lba, uint64_t *base_lba) ++{ ++ *base_lba = lba; ++ return true; ++} ++ ++static void ++blob_bdev_init(struct blob_bdev *b, struct spdk_bdev_desc *desc) ++{ ++ struct spdk_bdev *bdev; 
++ ++ bdev = spdk_bdev_desc_get_bdev(desc); ++ assert(bdev != NULL); ++ ++ b->bdev = bdev; ++ b->desc = desc; ++ b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev); ++ b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev); ++ b->bs_dev.create_channel = bdev_blob_create_channel; ++ b->bs_dev.destroy_channel = bdev_blob_destroy_channel; ++ b->bs_dev.destroy = bdev_blob_destroy; ++ b->bs_dev.read = bdev_blob_read; ++ b->bs_dev.write = bdev_blob_write; ++ b->bs_dev.readv = bdev_blob_readv; ++ b->bs_dev.writev = bdev_blob_writev; ++ b->bs_dev.readv_ext = bdev_blob_readv_ext; ++ b->bs_dev.writev_ext = bdev_blob_writev_ext; ++ b->bs_dev.write_zeroes = bdev_blob_write_zeroes; ++ b->bs_dev.unmap = bdev_blob_unmap; ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { ++ b->bs_dev.copy = bdev_blob_copy; ++ } ++ b->bs_dev.get_base_bdev = bdev_blob_get_base_bdev; ++ b->bs_dev.is_zeroes = bdev_blob_is_zeroes; ++ b->bs_dev.translate_lba = bdev_blob_translate_lba; ++} ++ ++int ++spdk_bdev_create_bs_dev_ext(const char *bdev_name, spdk_bdev_event_cb_t event_cb, ++ void *event_ctx, struct spdk_bs_dev **_bs_dev) ++{ ++ struct blob_bdev *b; ++ struct spdk_bdev_desc *desc; ++ int rc; ++ ++ b = calloc(1, sizeof(*b)); ++ ++ if (b == NULL) { ++ SPDK_ERRLOG("could not allocate blob_bdev\n"); ++ return -ENOMEM; ++ } ++ ++ rc = spdk_bdev_open_ext(bdev_name, true, event_cb, event_ctx, &desc); ++ if (rc != 0) { ++ free(b); ++ return rc; ++ } ++ ++ blob_bdev_init(b, desc); ++ ++ *_bs_dev = &b->bs_dev; ++ ++ return 0; ++} +diff --git a/module/blob/bdev/spdk_blob_bdev.map b/module/blob/bdev/spdk_blob_bdev.map +index 4edd699..8429bef 100644 +--- a/module/blob/bdev/spdk_blob_bdev.map ++++ b/module/blob/bdev/spdk_blob_bdev.map +@@ -1,10 +1,10 @@ +-{ +- global: +- +- # public functions +- spdk_bdev_create_bs_dev; +- spdk_bdev_create_bs_dev_ext; +- spdk_bs_bdev_claim; +- +- local: *; +-}; ++{ ++ global: ++ ++ # public functions ++ spdk_bdev_create_bs_dev; ++ spdk_bdev_create_bs_dev_ext; ++ spdk_bs_bdev_claim; ++ ++ local: *; ++}; +diff --git a/module/blobfs/Makefile b/module/blobfs/Makefile +index 17aa8dd..e8ad484 100644 +--- a/module/blobfs/Makefile ++++ b/module/blobfs/Makefile +@@ -1,16 +1,16 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y = bdev +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y = bdev ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/blobfs/bdev/Makefile b/module/blobfs/bdev/Makefile +index 0cc7981..4c8e377 100644 +--- a/module/blobfs/bdev/Makefile ++++ b/module/blobfs/bdev/Makefile +@@ -1,23 +1,23 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = blobfs_bdev.c blobfs_bdev_rpc.c +- +-# libfuse3 is required by blobfs_fuse.c +-ifeq ($(CONFIG_FUSE),y) +-C_SRCS += blobfs_fuse.c +-endif +- +-LIBNAME = blobfs_bdev +- +-SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs_bdev.map) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = blobfs_bdev.c blobfs_bdev_rpc.c ++ ++# libfuse3 is required by blobfs_fuse.c ++ifeq ($(CONFIG_FUSE),y) ++C_SRCS += blobfs_fuse.c ++endif ++ ++LIBNAME = blobfs_bdev ++ ++SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_blobfs_bdev.map) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/blobfs/bdev/blobfs_bdev.c b/module/blobfs/bdev/blobfs_bdev.c +index 9f78fd2..2c9db96 100644 +--- a/module/blobfs/bdev/blobfs_bdev.c ++++ b/module/blobfs/bdev/blobfs_bdev.c +@@ -1,310 +1,310 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/blobfs.h" +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +-#include "spdk/event.h" +-#include "spdk/blob_bdev.h" +-#include "spdk/blobfs_bdev.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +- +-#include "blobfs_fuse.h" +- +-/* Dummy bdev module used to to claim bdevs. */ +-static struct spdk_bdev_module blobfs_bdev_module = { +- .name = "blobfs", +-}; +- +-static void +-blobfs_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- SPDK_WARNLOG("Async event(%d) is triggered in bdev %s\n", type, spdk_bdev_get_name(bdev)); +-} +- +-struct blobfs_bdev_operation_ctx { +- const char *bdev_name; +- struct spdk_filesystem *fs; +- +- /* If cb_fn is already called in other function, not _blobfs_bdev_unload_cb. +- * cb_fn should be set NULL after its being called, in order to avoid repeated +- * calling in _blobfs_bdev_unload_cb. +- */ +- spdk_blobfs_bdev_op_complete cb_fn; +- void *cb_arg; +- +- /* Variables for mount operation */ +- const char *mountpoint; +- struct spdk_thread *fs_loading_thread; +- +- /* Used in bdev_event_cb to do some proper operations on blobfs_fuse for +- * asynchronous event of the backend bdev. 
+- */ +- struct spdk_blobfs_fuse *bfuse; +-}; +- +-static void +-_blobfs_bdev_unload_cb(void *_ctx, int fserrno) +-{ +- struct blobfs_bdev_operation_ctx *ctx = _ctx; +- +- if (fserrno) { +- SPDK_ERRLOG("Failed to unload blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno); +- } +- +- if (ctx->cb_fn) { +- ctx->cb_fn(ctx->cb_arg, fserrno); +- } +- +- free(ctx); +-} +- +-static void +-blobfs_bdev_unload(void *_ctx) +-{ +- struct blobfs_bdev_operation_ctx *ctx = _ctx; +- +- spdk_fs_unload(ctx->fs, _blobfs_bdev_unload_cb, ctx); +-} +- +-static void +-blobfs_bdev_load_cb_to_unload(void *_ctx, struct spdk_filesystem *fs, int fserrno) +-{ +- struct blobfs_bdev_operation_ctx *ctx = _ctx; +- +- if (fserrno) { +- ctx->cb_fn(ctx->cb_arg, fserrno); +- free(ctx); +- return; +- } +- +- ctx->fs = fs; +- spdk_thread_send_msg(spdk_get_thread(), blobfs_bdev_unload, ctx); +-} +- +-void +-spdk_blobfs_bdev_detect(const char *bdev_name, +- spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) +-{ +- struct blobfs_bdev_operation_ctx *ctx; +- struct spdk_bs_dev *bs_dev; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to allocate ctx.\n"); +- cb_fn(cb_arg, -ENOMEM); +- +- return; +- } +- +- ctx->bdev_name = bdev_name; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_event_cb, NULL, &bs_dev); +- if (rc != 0) { +- SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)", +- bdev_name); +- +- goto invalid; +- } +- +- spdk_fs_load(bs_dev, NULL, blobfs_bdev_load_cb_to_unload, ctx); +- +- return; +- +-invalid: +- free(ctx); +- +- cb_fn(cb_arg, rc); +-} +- +-void +-spdk_blobfs_bdev_create(const char *bdev_name, uint32_t cluster_sz, +- spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) +-{ +- struct blobfs_bdev_operation_ctx *ctx; +- struct spdk_blobfs_opts blobfs_opt; +- struct spdk_bs_dev *bs_dev; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to allocate ctx.\n"); +- cb_fn(cb_arg, -ENOMEM); +- +- return; +- } +- +- ctx->bdev_name = bdev_name; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_event_cb, NULL, &bs_dev); +- if (rc) { +- SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)\n", +- bdev_name); +- +- goto invalid; +- } +- +- rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module); +- if (rc) { +- SPDK_INFOLOG(blobfs_bdev, "Blobfs base bdev already claimed by another bdev\n"); +- bs_dev->destroy(bs_dev); +- +- goto invalid; +- } +- +- spdk_fs_opts_init(&blobfs_opt); +- if (cluster_sz) { +- blobfs_opt.cluster_sz = cluster_sz; +- } +- +- spdk_fs_init(bs_dev, &blobfs_opt, NULL, blobfs_bdev_load_cb_to_unload, ctx); +- +- return; +- +-invalid: +- free(ctx); +- +- cb_fn(cb_arg, rc); +-} +-SPDK_LOG_REGISTER_COMPONENT(blobfs_bdev) +-#ifdef SPDK_CONFIG_FUSE +- +-static void +-blobfs_bdev_unmount(void *arg) +-{ +- struct blobfs_bdev_operation_ctx *ctx = arg; +- +- /* Keep blobfs unloaded in a same spdk thread with spdk_fs_load */ +- spdk_thread_send_msg(ctx->fs_loading_thread, blobfs_bdev_unload, ctx); +-} +- +-static void +-_blobfs_bdev_mount_fuse_start(void *_ctx) +-{ +- struct blobfs_bdev_operation_ctx *ctx = _ctx; +- spdk_blobfs_bdev_op_complete cb_fn = ctx->cb_fn; +- int rc; +- +- /* Since function of ctx->cb_fn will be called in this function, set +- * ctx->cb_fn to be NULL, in order to avoid repeated calling in unload_cb. 
+- */ +- ctx->cb_fn = NULL; +- +- rc = blobfs_fuse_start(ctx->bdev_name, ctx->mountpoint, ctx->fs, +- blobfs_bdev_unmount, ctx, &ctx->bfuse); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to mount blobfs on bdev %s to %s\n", ctx->bdev_name, ctx->mountpoint); +- +- /* Return failure state back */ +- cb_fn(ctx->cb_arg, rc); +- +- blobfs_bdev_unmount(ctx); +- +- return; +- } +- +- cb_fn(ctx->cb_arg, 0); +-} +- +-static void +-_blobfs_bdev_mount_load_cb(void *_ctx, struct spdk_filesystem *fs, int fserrno) +-{ +- struct blobfs_bdev_operation_ctx *ctx = _ctx; +- +- if (fserrno) { +- SPDK_ERRLOG("Failed to load blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno); +- +- ctx->cb_fn(ctx->cb_arg, fserrno); +- free(ctx); +- return; +- } +- +- ctx->fs = fs; +- ctx->fs_loading_thread = spdk_get_thread(); +- +- spdk_thread_send_msg(spdk_get_thread(), _blobfs_bdev_mount_fuse_start, ctx); +-} +- +-static void +-blobfs_bdev_fuse_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- struct blobfs_bdev_operation_ctx *ctx = event_ctx; +- +- SPDK_WARNLOG("Async event(%d) is triggered in bdev %s\n", type, spdk_bdev_get_name(bdev)); +- +- if (type == SPDK_BDEV_EVENT_REMOVE) { +- blobfs_fuse_stop(ctx->bfuse); +- } +-} +- +-void +-spdk_blobfs_bdev_mount(const char *bdev_name, const char *mountpoint, +- spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) +-{ +- struct blobfs_bdev_operation_ctx *ctx; +- struct spdk_bs_dev *bs_dev; +- int rc; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (ctx == NULL) { +- SPDK_ERRLOG("Failed to allocate ctx.\n"); +- cb_fn(cb_arg, -ENOMEM); +- +- return; +- } +- +- ctx->bdev_name = bdev_name; +- ctx->mountpoint = mountpoint; +- ctx->cb_fn = cb_fn; +- ctx->cb_arg = cb_arg; +- +- rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_fuse_event_cb, ctx, &bs_dev); +- if (rc != 0) { +- SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)", +- bdev_name); +- +- goto invalid; +- } +- +- rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module); +- if (rc != 0) { +- SPDK_INFOLOG(blobfs_bdev, "Blobfs base bdev already claimed by another bdev\n"); +- bs_dev->destroy(bs_dev); +- +- goto invalid; +- } +- +- spdk_fs_load(bs_dev, blobfs_fuse_send_request, _blobfs_bdev_mount_load_cb, ctx); +- +- return; +- +-invalid: +- free(ctx); +- +- cb_fn(cb_arg, rc); +-} +- +-#else /* SPDK_CONFIG_FUSE */ +- +-void +-spdk_blobfs_bdev_mount(const char *bdev_name, const char *mountpoint, +- spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) +-{ +- SPDK_ERRLOG("spdk_blobfs_bdev_mount() is unsupported\n"); +- cb_fn(cb_arg, -ENOTSUP); +-} +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/blobfs.h" ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++#include "spdk/event.h" ++#include "spdk/blob_bdev.h" ++#include "spdk/blobfs_bdev.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++ ++#include "blobfs_fuse.h" ++ ++/* Dummy bdev module used to to claim bdevs. 
*/ ++static struct spdk_bdev_module blobfs_bdev_module = { ++ .name = "blobfs", ++}; ++ ++static void ++blobfs_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ SPDK_WARNLOG("Async event(%d) is triggered in bdev %s\n", type, spdk_bdev_get_name(bdev)); ++} ++ ++struct blobfs_bdev_operation_ctx { ++ const char *bdev_name; ++ struct spdk_filesystem *fs; ++ ++ /* If cb_fn is already called in other function, not _blobfs_bdev_unload_cb. ++ * cb_fn should be set NULL after its being called, in order to avoid repeated ++ * calling in _blobfs_bdev_unload_cb. ++ */ ++ spdk_blobfs_bdev_op_complete cb_fn; ++ void *cb_arg; ++ ++ /* Variables for mount operation */ ++ const char *mountpoint; ++ struct spdk_thread *fs_loading_thread; ++ ++ /* Used in bdev_event_cb to do some proper operations on blobfs_fuse for ++ * asynchronous event of the backend bdev. ++ */ ++ struct spdk_blobfs_fuse *bfuse; ++}; ++ ++static void ++_blobfs_bdev_unload_cb(void *_ctx, int fserrno) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = _ctx; ++ ++ if (fserrno) { ++ SPDK_ERRLOG("Failed to unload blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno); ++ } ++ ++ if (ctx->cb_fn) { ++ ctx->cb_fn(ctx->cb_arg, fserrno); ++ } ++ ++ free(ctx); ++} ++ ++static void ++blobfs_bdev_unload(void *_ctx) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = _ctx; ++ ++ spdk_fs_unload(ctx->fs, _blobfs_bdev_unload_cb, ctx); ++} ++ ++static void ++blobfs_bdev_load_cb_to_unload(void *_ctx, struct spdk_filesystem *fs, int fserrno) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = _ctx; ++ ++ if (fserrno) { ++ ctx->cb_fn(ctx->cb_arg, fserrno); ++ free(ctx); ++ return; ++ } ++ ++ ctx->fs = fs; ++ spdk_thread_send_msg(spdk_get_thread(), blobfs_bdev_unload, ctx); ++} ++ ++void ++spdk_blobfs_bdev_detect(const char *bdev_name, ++ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) ++{ ++ struct blobfs_bdev_operation_ctx *ctx; ++ struct spdk_bs_dev *bs_dev; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to allocate ctx.\n"); ++ cb_fn(cb_arg, -ENOMEM); ++ ++ return; ++ } ++ ++ ctx->bdev_name = bdev_name; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_event_cb, NULL, &bs_dev); ++ if (rc != 0) { ++ SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)", ++ bdev_name); ++ ++ goto invalid; ++ } ++ ++ spdk_fs_load(bs_dev, NULL, blobfs_bdev_load_cb_to_unload, ctx); ++ ++ return; ++ ++invalid: ++ free(ctx); ++ ++ cb_fn(cb_arg, rc); ++} ++ ++void ++spdk_blobfs_bdev_create(const char *bdev_name, uint32_t cluster_sz, ++ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) ++{ ++ struct blobfs_bdev_operation_ctx *ctx; ++ struct spdk_blobfs_opts blobfs_opt; ++ struct spdk_bs_dev *bs_dev; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to allocate ctx.\n"); ++ cb_fn(cb_arg, -ENOMEM); ++ ++ return; ++ } ++ ++ ctx->bdev_name = bdev_name; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_event_cb, NULL, &bs_dev); ++ if (rc) { ++ SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)\n", ++ bdev_name); ++ ++ goto invalid; ++ } ++ ++ rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module); ++ if (rc) { ++ SPDK_INFOLOG(blobfs_bdev, "Blobfs base bdev already claimed by another bdev\n"); ++ bs_dev->destroy(bs_dev); ++ ++ goto invalid; ++ } ++ ++ spdk_fs_opts_init(&blobfs_opt); 
++ if (cluster_sz) { ++ blobfs_opt.cluster_sz = cluster_sz; ++ } ++ ++ spdk_fs_init(bs_dev, &blobfs_opt, NULL, blobfs_bdev_load_cb_to_unload, ctx); ++ ++ return; ++ ++invalid: ++ free(ctx); ++ ++ cb_fn(cb_arg, rc); ++} ++SPDK_LOG_REGISTER_COMPONENT(blobfs_bdev) ++#ifdef SPDK_CONFIG_FUSE ++ ++static void ++blobfs_bdev_unmount(void *arg) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = arg; ++ ++ /* Keep blobfs unloaded in a same spdk thread with spdk_fs_load */ ++ spdk_thread_send_msg(ctx->fs_loading_thread, blobfs_bdev_unload, ctx); ++} ++ ++static void ++_blobfs_bdev_mount_fuse_start(void *_ctx) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = _ctx; ++ spdk_blobfs_bdev_op_complete cb_fn = ctx->cb_fn; ++ int rc; ++ ++ /* Since function of ctx->cb_fn will be called in this function, set ++ * ctx->cb_fn to be NULL, in order to avoid repeated calling in unload_cb. ++ */ ++ ctx->cb_fn = NULL; ++ ++ rc = blobfs_fuse_start(ctx->bdev_name, ctx->mountpoint, ctx->fs, ++ blobfs_bdev_unmount, ctx, &ctx->bfuse); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to mount blobfs on bdev %s to %s\n", ctx->bdev_name, ctx->mountpoint); ++ ++ /* Return failure state back */ ++ cb_fn(ctx->cb_arg, rc); ++ ++ blobfs_bdev_unmount(ctx); ++ ++ return; ++ } ++ ++ cb_fn(ctx->cb_arg, 0); ++} ++ ++static void ++_blobfs_bdev_mount_load_cb(void *_ctx, struct spdk_filesystem *fs, int fserrno) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = _ctx; ++ ++ if (fserrno) { ++ SPDK_ERRLOG("Failed to load blobfs on bdev %s: errno %d\n", ctx->bdev_name, fserrno); ++ ++ ctx->cb_fn(ctx->cb_arg, fserrno); ++ free(ctx); ++ return; ++ } ++ ++ ctx->fs = fs; ++ ctx->fs_loading_thread = spdk_get_thread(); ++ ++ spdk_thread_send_msg(spdk_get_thread(), _blobfs_bdev_mount_fuse_start, ctx); ++} ++ ++static void ++blobfs_bdev_fuse_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ struct blobfs_bdev_operation_ctx *ctx = event_ctx; ++ ++ SPDK_WARNLOG("Async event(%d) is triggered in bdev %s\n", type, spdk_bdev_get_name(bdev)); ++ ++ if (type == SPDK_BDEV_EVENT_REMOVE) { ++ blobfs_fuse_stop(ctx->bfuse); ++ } ++} ++ ++void ++spdk_blobfs_bdev_mount(const char *bdev_name, const char *mountpoint, ++ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) ++{ ++ struct blobfs_bdev_operation_ctx *ctx; ++ struct spdk_bs_dev *bs_dev; ++ int rc; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (ctx == NULL) { ++ SPDK_ERRLOG("Failed to allocate ctx.\n"); ++ cb_fn(cb_arg, -ENOMEM); ++ ++ return; ++ } ++ ++ ctx->bdev_name = bdev_name; ++ ctx->mountpoint = mountpoint; ++ ctx->cb_fn = cb_fn; ++ ctx->cb_arg = cb_arg; ++ ++ rc = spdk_bdev_create_bs_dev_ext(bdev_name, blobfs_bdev_fuse_event_cb, ctx, &bs_dev); ++ if (rc != 0) { ++ SPDK_INFOLOG(blobfs_bdev, "Failed to create a blobstore block device from bdev (%s)", ++ bdev_name); ++ ++ goto invalid; ++ } ++ ++ rc = spdk_bs_bdev_claim(bs_dev, &blobfs_bdev_module); ++ if (rc != 0) { ++ SPDK_INFOLOG(blobfs_bdev, "Blobfs base bdev already claimed by another bdev\n"); ++ bs_dev->destroy(bs_dev); ++ ++ goto invalid; ++ } ++ ++ spdk_fs_load(bs_dev, blobfs_fuse_send_request, _blobfs_bdev_mount_load_cb, ctx); ++ ++ return; ++ ++invalid: ++ free(ctx); ++ ++ cb_fn(cb_arg, rc); ++} ++ ++#else /* SPDK_CONFIG_FUSE */ ++ ++void ++spdk_blobfs_bdev_mount(const char *bdev_name, const char *mountpoint, ++ spdk_blobfs_bdev_op_complete cb_fn, void *cb_arg) ++{ ++ SPDK_ERRLOG("spdk_blobfs_bdev_mount() is unsupported\n"); ++ cb_fn(cb_arg, -ENOTSUP); ++} ++ ++#endif +diff --git a/module/blobfs/bdev/blobfs_bdev_rpc.c 
b/module/blobfs/bdev/blobfs_bdev_rpc.c +index 0bd5d26..ee7bc30 100644 +--- a/module/blobfs/bdev/blobfs_bdev_rpc.c ++++ b/module/blobfs/bdev/blobfs_bdev_rpc.c +@@ -1,302 +1,302 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/blobfs.h" +-#include "spdk/bdev.h" +-#include "spdk/event.h" +-#include "spdk/blob_bdev.h" +-#include "spdk/blobfs_bdev.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/rpc.h" +-#include "spdk/util.h" +- +-#ifndef PAGE_SIZE +-#define PAGE_SIZE 4096 +-#endif +- +-#define MIN_CLUSTER_SZ (1024 * 1024) +- +-struct rpc_blobfs_set_cache_size { +- uint64_t size_in_mb; +-}; +- +-static const struct spdk_json_object_decoder rpc_blobfs_set_cache_size_decoders[] = { +- {"size_in_mb", offsetof(struct rpc_blobfs_set_cache_size, size_in_mb), spdk_json_decode_uint64}, +-}; +- +-static void +-rpc_blobfs_set_cache_size(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_blobfs_set_cache_size req; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_blobfs_set_cache_size_decoders, +- SPDK_COUNTOF(rpc_blobfs_set_cache_size_decoders), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- +- return; +- } +- +- if (req.size_in_mb == 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- +- return; +- } +- +- rc = spdk_fs_set_cache_size(req.size_in_mb); +- +- spdk_jsonrpc_send_bool_response(request, rc == 0); +-} +- +-SPDK_RPC_REGISTER("blobfs_set_cache_size", rpc_blobfs_set_cache_size, +- SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) +- +-struct rpc_blobfs_detect { +- char *bdev_name; +- +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-free_rpc_blobfs_detect(struct rpc_blobfs_detect *req) +-{ +- free(req->bdev_name); +- free(req); +-} +- +-static const struct spdk_json_object_decoder rpc_blobfs_detect_decoders[] = { +- {"bdev_name", offsetof(struct rpc_blobfs_detect, bdev_name), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_blobfs_detect_done(void *cb_arg, int fserrno) +-{ +- struct rpc_blobfs_detect *req = cb_arg; +- bool existed = true; +- +- if (fserrno == -EILSEQ) { +- /* There is no blobfs existing on bdev */ +- existed = false; +- } else if (fserrno != 0) { +- spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- spdk_strerror(-fserrno)); +- +- return; +- } +- +- spdk_jsonrpc_send_bool_response(req->request, existed); +- +- free_rpc_blobfs_detect(req); +-} +- +-static void +-rpc_blobfs_detect(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_blobfs_detect *req; +- +- req = calloc(1, sizeof(*req)); +- if (req == NULL) { +- SPDK_ERRLOG("could not allocate rpc_blobfs_detect request.\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_blobfs_detect_decoders, +- SPDK_COUNTOF(rpc_blobfs_detect_decoders), +- req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- +- free_rpc_blobfs_detect(req); +- +- return; +- } +- +- req->request = request; +- spdk_blobfs_bdev_detect(req->bdev_name, 
_rpc_blobfs_detect_done, req); +-} +- +-SPDK_RPC_REGISTER("blobfs_detect", rpc_blobfs_detect, SPDK_RPC_RUNTIME) +- +-struct rpc_blobfs_create { +- char *bdev_name; +- uint64_t cluster_sz; +- +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-free_rpc_blobfs_create(struct rpc_blobfs_create *req) +-{ +- free(req->bdev_name); +- free(req); +-} +- +-static int +-rpc_decode_cluster_sz(const struct spdk_json_val *val, void *out) +-{ +- uint64_t *cluster_sz = out; +- char *sz_str = NULL; +- bool has_prefix; +- int rc; +- +- rc = spdk_json_decode_string(val, &sz_str); +- if (rc) { +- SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n"); +- return -EINVAL; +- } +- +- rc = spdk_parse_capacity(sz_str, cluster_sz, &has_prefix); +- free(sz_str); +- +- if (rc || *cluster_sz % PAGE_SIZE != 0 || *cluster_sz < MIN_CLUSTER_SZ) { +- SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n"); +- return -EINVAL; +- } +- +- SPDK_DEBUGLOG(blobfs_bdev_rpc, "cluster_sz of blobfs: %" PRId64 "\n", *cluster_sz); +- return 0; +-} +- +-static const struct spdk_json_object_decoder rpc_blobfs_create_decoders[] = { +- {"bdev_name", offsetof(struct rpc_blobfs_create, bdev_name), spdk_json_decode_string}, +- {"cluster_sz", offsetof(struct rpc_blobfs_create, cluster_sz), rpc_decode_cluster_sz, true}, +-}; +- +-static void +-_rpc_blobfs_create_done(void *cb_arg, int fserrno) +-{ +- struct rpc_blobfs_create *req = cb_arg; +- +- if (fserrno != 0) { +- spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- spdk_strerror(-fserrno)); +- +- return; +- } +- +- spdk_jsonrpc_send_bool_response(req->request, true); +- +- free_rpc_blobfs_create(req); +-} +- +-static void +-rpc_blobfs_create(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_blobfs_create *req; +- +- req = calloc(1, sizeof(*req)); +- if (req == NULL) { +- SPDK_ERRLOG("could not allocate rpc_blobfs_create request.\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_blobfs_create_decoders, +- SPDK_COUNTOF(rpc_blobfs_create_decoders), +- req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- +- free_rpc_blobfs_create(req); +- +- return; +- } +- +- req->request = request; +- spdk_blobfs_bdev_create(req->bdev_name, req->cluster_sz, _rpc_blobfs_create_done, req); +-} +- +-SPDK_RPC_REGISTER("blobfs_create", rpc_blobfs_create, SPDK_RPC_RUNTIME) +- +-SPDK_LOG_REGISTER_COMPONENT(blobfs_bdev_rpc) +-#ifdef SPDK_CONFIG_FUSE +- +-struct rpc_blobfs_mount { +- char *bdev_name; +- char *mountpoint; +- +- struct spdk_jsonrpc_request *request; +-}; +- +-static void +-free_rpc_blobfs_mount(struct rpc_blobfs_mount *req) +-{ +- free(req->bdev_name); +- free(req->mountpoint); +- free(req); +-} +- +-static const struct spdk_json_object_decoder rpc_blobfs_mount_decoders[] = { +- {"bdev_name", offsetof(struct rpc_blobfs_mount, bdev_name), spdk_json_decode_string}, +- {"mountpoint", offsetof(struct rpc_blobfs_mount, mountpoint), spdk_json_decode_string}, +-}; +- +-static void +-_rpc_blobfs_mount_done(void *cb_arg, int fserrno) +-{ +- struct rpc_blobfs_mount *req = cb_arg; +- +- if (fserrno == -EILSEQ) { +- /* There is no blobfs existing on bdev */ +- spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "No blobfs detected on given bdev"); +- +- 
return; +- } else if (fserrno != 0) { +- spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- spdk_strerror(-fserrno)); +- +- return; +- } +- +- spdk_jsonrpc_send_bool_response(req->request, true); +- +- free_rpc_blobfs_mount(req); +-} +- +-static void +-rpc_blobfs_mount(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_blobfs_mount *req; +- +- req = calloc(1, sizeof(*req)); +- if (req == NULL) { +- SPDK_ERRLOG("could not allocate rpc_blobfs_mount request.\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); +- return; +- } +- +- if (spdk_json_decode_object(params, rpc_blobfs_mount_decoders, +- SPDK_COUNTOF(rpc_blobfs_mount_decoders), +- req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- +- free_rpc_blobfs_mount(req); +- +- return; +- } +- +- req->request = request; +- spdk_blobfs_bdev_mount(req->bdev_name, req->mountpoint, _rpc_blobfs_mount_done, req); +-} +- +-SPDK_RPC_REGISTER("blobfs_mount", rpc_blobfs_mount, SPDK_RPC_RUNTIME) +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/blobfs.h" ++#include "spdk/bdev.h" ++#include "spdk/event.h" ++#include "spdk/blob_bdev.h" ++#include "spdk/blobfs_bdev.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++ ++#ifndef PAGE_SIZE ++#define PAGE_SIZE 4096 ++#endif ++ ++#define MIN_CLUSTER_SZ (1024 * 1024) ++ ++struct rpc_blobfs_set_cache_size { ++ uint64_t size_in_mb; ++}; ++ ++static const struct spdk_json_object_decoder rpc_blobfs_set_cache_size_decoders[] = { ++ {"size_in_mb", offsetof(struct rpc_blobfs_set_cache_size, size_in_mb), spdk_json_decode_uint64}, ++}; ++ ++static void ++rpc_blobfs_set_cache_size(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_blobfs_set_cache_size req; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_blobfs_set_cache_size_decoders, ++ SPDK_COUNTOF(rpc_blobfs_set_cache_size_decoders), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ ++ return; ++ } ++ ++ if (req.size_in_mb == 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ ++ return; ++ } ++ ++ rc = spdk_fs_set_cache_size(req.size_in_mb); ++ ++ spdk_jsonrpc_send_bool_response(request, rc == 0); ++} ++ ++SPDK_RPC_REGISTER("blobfs_set_cache_size", rpc_blobfs_set_cache_size, ++ SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) ++ ++struct rpc_blobfs_detect { ++ char *bdev_name; ++ ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++free_rpc_blobfs_detect(struct rpc_blobfs_detect *req) ++{ ++ free(req->bdev_name); ++ free(req); ++} ++ ++static const struct spdk_json_object_decoder rpc_blobfs_detect_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_blobfs_detect, bdev_name), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_blobfs_detect_done(void *cb_arg, int fserrno) ++{ ++ struct rpc_blobfs_detect *req = cb_arg; ++ bool existed = true; ++ ++ if (fserrno == -EILSEQ) { ++ /* There is no blobfs existing on bdev */ ++ existed = false; ++ } else if (fserrno != 0) { ++ 
spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-fserrno)); ++ ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(req->request, existed); ++ ++ free_rpc_blobfs_detect(req); ++} ++ ++static void ++rpc_blobfs_detect(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_blobfs_detect *req; ++ ++ req = calloc(1, sizeof(*req)); ++ if (req == NULL) { ++ SPDK_ERRLOG("could not allocate rpc_blobfs_detect request.\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_blobfs_detect_decoders, ++ SPDK_COUNTOF(rpc_blobfs_detect_decoders), ++ req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ ++ free_rpc_blobfs_detect(req); ++ ++ return; ++ } ++ ++ req->request = request; ++ spdk_blobfs_bdev_detect(req->bdev_name, _rpc_blobfs_detect_done, req); ++} ++ ++SPDK_RPC_REGISTER("blobfs_detect", rpc_blobfs_detect, SPDK_RPC_RUNTIME) ++ ++struct rpc_blobfs_create { ++ char *bdev_name; ++ uint64_t cluster_sz; ++ ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++free_rpc_blobfs_create(struct rpc_blobfs_create *req) ++{ ++ free(req->bdev_name); ++ free(req); ++} ++ ++static int ++rpc_decode_cluster_sz(const struct spdk_json_val *val, void *out) ++{ ++ uint64_t *cluster_sz = out; ++ char *sz_str = NULL; ++ bool has_prefix; ++ int rc; ++ ++ rc = spdk_json_decode_string(val, &sz_str); ++ if (rc) { ++ SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n"); ++ return -EINVAL; ++ } ++ ++ rc = spdk_parse_capacity(sz_str, cluster_sz, &has_prefix); ++ free(sz_str); ++ ++ if (rc || *cluster_sz % PAGE_SIZE != 0 || *cluster_sz < MIN_CLUSTER_SZ) { ++ SPDK_NOTICELOG("Invalid parameter value: cluster_sz\n"); ++ return -EINVAL; ++ } ++ ++ SPDK_DEBUGLOG(blobfs_bdev_rpc, "cluster_sz of blobfs: %" PRId64 "\n", *cluster_sz); ++ return 0; ++} ++ ++static const struct spdk_json_object_decoder rpc_blobfs_create_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_blobfs_create, bdev_name), spdk_json_decode_string}, ++ {"cluster_sz", offsetof(struct rpc_blobfs_create, cluster_sz), rpc_decode_cluster_sz, true}, ++}; ++ ++static void ++_rpc_blobfs_create_done(void *cb_arg, int fserrno) ++{ ++ struct rpc_blobfs_create *req = cb_arg; ++ ++ if (fserrno != 0) { ++ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-fserrno)); ++ ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(req->request, true); ++ ++ free_rpc_blobfs_create(req); ++} ++ ++static void ++rpc_blobfs_create(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_blobfs_create *req; ++ ++ req = calloc(1, sizeof(*req)); ++ if (req == NULL) { ++ SPDK_ERRLOG("could not allocate rpc_blobfs_create request.\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_blobfs_create_decoders, ++ SPDK_COUNTOF(rpc_blobfs_create_decoders), ++ req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ ++ free_rpc_blobfs_create(req); ++ ++ return; ++ } ++ ++ req->request = request; ++ spdk_blobfs_bdev_create(req->bdev_name, 
req->cluster_sz, _rpc_blobfs_create_done, req); ++} ++ ++SPDK_RPC_REGISTER("blobfs_create", rpc_blobfs_create, SPDK_RPC_RUNTIME) ++ ++SPDK_LOG_REGISTER_COMPONENT(blobfs_bdev_rpc) ++#ifdef SPDK_CONFIG_FUSE ++ ++struct rpc_blobfs_mount { ++ char *bdev_name; ++ char *mountpoint; ++ ++ struct spdk_jsonrpc_request *request; ++}; ++ ++static void ++free_rpc_blobfs_mount(struct rpc_blobfs_mount *req) ++{ ++ free(req->bdev_name); ++ free(req->mountpoint); ++ free(req); ++} ++ ++static const struct spdk_json_object_decoder rpc_blobfs_mount_decoders[] = { ++ {"bdev_name", offsetof(struct rpc_blobfs_mount, bdev_name), spdk_json_decode_string}, ++ {"mountpoint", offsetof(struct rpc_blobfs_mount, mountpoint), spdk_json_decode_string}, ++}; ++ ++static void ++_rpc_blobfs_mount_done(void *cb_arg, int fserrno) ++{ ++ struct rpc_blobfs_mount *req = cb_arg; ++ ++ if (fserrno == -EILSEQ) { ++ /* There is no blobfs existing on bdev */ ++ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "No blobfs detected on given bdev"); ++ ++ return; ++ } else if (fserrno != 0) { ++ spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ spdk_strerror(-fserrno)); ++ ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(req->request, true); ++ ++ free_rpc_blobfs_mount(req); ++} ++ ++static void ++rpc_blobfs_mount(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_blobfs_mount *req; ++ ++ req = calloc(1, sizeof(*req)); ++ if (req == NULL) { ++ SPDK_ERRLOG("could not allocate rpc_blobfs_mount request.\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); ++ return; ++ } ++ ++ if (spdk_json_decode_object(params, rpc_blobfs_mount_decoders, ++ SPDK_COUNTOF(rpc_blobfs_mount_decoders), ++ req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ ++ free_rpc_blobfs_mount(req); ++ ++ return; ++ } ++ ++ req->request = request; ++ spdk_blobfs_bdev_mount(req->bdev_name, req->mountpoint, _rpc_blobfs_mount_done, req); ++} ++ ++SPDK_RPC_REGISTER("blobfs_mount", rpc_blobfs_mount, SPDK_RPC_RUNTIME) ++ ++#endif +diff --git a/module/blobfs/bdev/blobfs_fuse.c b/module/blobfs/bdev/blobfs_fuse.c +index 45f6f76..3717c69 100644 +--- a/module/blobfs/bdev/blobfs_fuse.c ++++ b/module/blobfs/bdev/blobfs_fuse.c +@@ -1,336 +1,336 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/log.h" +-#include "spdk/env.h" +-#include "spdk/event.h" +-#include "spdk/thread.h" +-#include "spdk/string.h" +-#include "spdk/blobfs.h" +- +-#include "blobfs_fuse.h" +- +-#define FUSE_USE_VERSION 30 +-#include "fuse3/fuse.h" +-#include "fuse3/fuse_lowlevel.h" +- +-struct spdk_blobfs_fuse { +- char *bdev_name; +- char *mountpoint; +- struct spdk_fs_thread_ctx *channel; +- struct spdk_filesystem *fs; +- +- struct fuse *fuse_handle; +- pthread_t fuse_tid; +- +- blobfs_fuse_unmount_cb cb_fn; +- void *cb_arg; +-}; +- +-/* Each thread serves one blobfs */ +-static __thread struct spdk_blobfs_fuse *thd_bfuse; +- +-static void +-blobfs_fuse_free(struct spdk_blobfs_fuse *bfuse) +-{ +- if (bfuse == NULL) { +- return; +- } +- +- free(bfuse->bdev_name); +- free(bfuse->mountpoint); +- free(bfuse); +-} +- +-static void +-__call_fn(void *arg1, void *arg2) +-{ +- fs_request_fn fn; +- +- fn = (fs_request_fn)arg1; +- fn(arg2); +-} +- +-void +-blobfs_fuse_send_request(fs_request_fn fn, void *arg) +-{ +- struct spdk_event *event; +- +- event = spdk_event_allocate(0, __call_fn, (void *)fn, arg); +- spdk_event_call(event); +-} +- +-static int +-fuse_getattr(const char *path, struct stat *stbuf, struct fuse_file_info *fi) +-{ +- struct spdk_file_stat stat; +- int rc; +- +- if (!strcmp(path, "/")) { +- stbuf->st_mode = S_IFDIR | 0755; +- stbuf->st_nlink = 2; +- return 0; +- } +- +- rc = spdk_fs_file_stat(thd_bfuse->fs, thd_bfuse->channel, path, &stat); +- if (rc == 0) { +- stbuf->st_mode = S_IFREG | 0644; +- stbuf->st_nlink = 1; +- stbuf->st_size = stat.size; +- } +- +- return rc; +-} +- +-static int +-fuse_readdir(const char *path, void *buf, fuse_fill_dir_t filler, +- off_t offset, struct fuse_file_info *fi, +- enum fuse_readdir_flags flags) +-{ +- struct spdk_file *file; +- const char *filename; +- spdk_fs_iter iter; +- +- filler(buf, ".", NULL, 0, 0); +- filler(buf, "..", NULL, 0, 0); +- +- iter = spdk_fs_iter_first(thd_bfuse->fs); +- while (iter != NULL) { +- file = spdk_fs_iter_get_file(iter); +- iter = spdk_fs_iter_next(iter); +- filename = spdk_file_get_name(file); +- filler(buf, &filename[1], NULL, 0, 0); +- } +- +- return 0; +-} +- +-static int +-fuse_mknod(const char *path, mode_t mode, dev_t rdev) +-{ +- return spdk_fs_create_file(thd_bfuse->fs, thd_bfuse->channel, path); +-} +- +-static int +-fuse_unlink(const char *path) +-{ +- return spdk_fs_delete_file(thd_bfuse->fs, thd_bfuse->channel, path); +-} +- +-static int +-fuse_truncate(const char *path, off_t size, struct fuse_file_info *fi) +-{ +- struct spdk_file *file; +- int rc; +- +- rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file); +- if (rc != 0) { +- return -rc; +- } +- +- rc = spdk_file_truncate(file, thd_bfuse->channel, size); +- if (rc != 0) { +- return -rc; +- } +- +- spdk_file_close(file, thd_bfuse->channel); +- +- return 0; +-} +- +-static int +-fuse_utimens(const char *path, const struct timespec tv[2], struct fuse_file_info *fi) +-{ +- return 0; +-} +- +-static int +-fuse_open(const char *path, struct fuse_file_info *info) +-{ +- struct spdk_file *file; +- int rc; +- +- rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file); +- if (rc != 0) { +- return -rc; +- } +- +- info->fh = (uintptr_t)file; +- return 0; +-} +- +-static int +-fuse_release(const char *path, struct fuse_file_info *info) +-{ +- struct spdk_file *file = (struct spdk_file *)info->fh; +- +- return spdk_file_close(file, thd_bfuse->channel); +-} +- +-static int 
+-fuse_read(const char *path, char *buf, size_t len, off_t offset, struct fuse_file_info *info) +-{ +- struct spdk_file *file = (struct spdk_file *)info->fh; +- +- return spdk_file_read(file, thd_bfuse->channel, buf, offset, len); +-} +- +-static int +-fuse_write(const char *path, const char *buf, size_t len, off_t offset, +- struct fuse_file_info *info) +-{ +- struct spdk_file *file = (struct spdk_file *)info->fh; +- int rc; +- +- rc = spdk_file_write(file, thd_bfuse->channel, (void *)buf, offset, len); +- if (rc == 0) { +- return len; +- } else { +- return rc; +- } +-} +- +-static int +-fuse_flush(const char *path, struct fuse_file_info *info) +-{ +- return 0; +-} +- +-static int +-fuse_fsync(const char *path, int datasync, struct fuse_file_info *info) +-{ +- return 0; +-} +- +-static int +-fuse_rename(const char *old_path, const char *new_path, unsigned int flags) +-{ +- return spdk_fs_rename_file(thd_bfuse->fs, thd_bfuse->channel, old_path, new_path); +-} +- +-static struct fuse_operations spdk_fuse_oper = { +- .getattr = fuse_getattr, +- .readdir = fuse_readdir, +- .mknod = fuse_mknod, +- .unlink = fuse_unlink, +- .truncate = fuse_truncate, +- .utimens = fuse_utimens, +- .open = fuse_open, +- .release = fuse_release, +- .read = fuse_read, +- .write = fuse_write, +- .flush = fuse_flush, +- .fsync = fuse_fsync, +- .rename = fuse_rename, +-}; +- +-static void * +-fuse_loop_new_thread(void *arg) +-{ +- struct spdk_blobfs_fuse *bfuse = arg; +- +- spdk_unaffinitize_thread(); +- +- thd_bfuse = bfuse; +- SPDK_NOTICELOG("Start to loop blobfs on bdev %s mounted at %s\n", bfuse->bdev_name, +- bfuse->mountpoint); +- +- bfuse->channel = spdk_fs_alloc_thread_ctx(bfuse->fs); +- +- fuse_loop(bfuse->fuse_handle); +- fuse_unmount(bfuse->fuse_handle); +- fuse_destroy(bfuse->fuse_handle); +- SPDK_NOTICELOG("Blobfs on bdev %s unmounted from %s\n", bfuse->bdev_name, bfuse->mountpoint); +- +- spdk_fs_free_thread_ctx(bfuse->channel); +- +- bfuse->cb_fn(bfuse->cb_arg); +- +- blobfs_fuse_free(bfuse); +- +- pthread_exit(NULL); +-} +- +-int +-blobfs_fuse_start(const char *bdev_name, const char *mountpoint, struct spdk_filesystem *fs, +- blobfs_fuse_unmount_cb cb_fn, void *cb_arg, struct spdk_blobfs_fuse **_bfuse) +-{ +- /* Set argv[1] as bdev_name in order to show bdev_name as the mounting source */ +- char *argv[1] = {(char *)bdev_name}; +- struct fuse_args args = FUSE_ARGS_INIT(1, argv); +- struct fuse_cmdline_opts opts = {}; +- struct fuse *fuse_handle; +- struct spdk_blobfs_fuse *bfuse; +- pthread_t tid; +- int rc; +- +- bfuse = (struct spdk_blobfs_fuse *)calloc(1, sizeof(*bfuse)); +- if (bfuse == NULL) { +- return -ENOMEM; +- } +- +- bfuse->bdev_name = strdup(bdev_name); +- bfuse->mountpoint = strdup(mountpoint); +- if (!bfuse->bdev_name || !bfuse->mountpoint) { +- rc = -ENOMEM; +- goto err; +- } +- bfuse->fs = fs; +- bfuse->cb_fn = cb_fn; +- bfuse->cb_arg = cb_arg; +- +- rc = fuse_parse_cmdline(&args, &opts); +- assert(rc == 0); +- +- fuse_handle = fuse_new(&args, &spdk_fuse_oper, sizeof(spdk_fuse_oper), NULL); +- fuse_opt_free_args(&args); +- if (fuse_handle == NULL) { +- SPDK_ERRLOG("could not create fuse handle!\n"); +- rc = -1; +- goto err; +- } +- bfuse->fuse_handle = fuse_handle; +- +- rc = fuse_mount(bfuse->fuse_handle, bfuse->mountpoint); +- if (rc != 0) { +- SPDK_ERRLOG("could not mount fuse handle\n"); +- rc = -1; +- goto err; +- } +- +- rc = pthread_create(&tid, NULL, fuse_loop_new_thread, bfuse); +- if (rc != 0) { +- SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc)); +- rc = -rc; 
+- goto err; +- } +- bfuse->fuse_tid = tid; +- +- rc = pthread_detach(tid); +- if (rc != 0) { +- SPDK_ERRLOG("could not detach thread for fuse loop thread: %s\n", spdk_strerror(rc)); +- rc = -rc; +- goto err; +- } +- +- *_bfuse = bfuse; +- return 0; +- +-err: +- blobfs_fuse_free(bfuse); +- +- return rc; +-} +- +-void +-blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse) +-{ +- if (bfuse) { +- fuse_session_exit(fuse_get_session(bfuse->fuse_handle)); +- pthread_kill(bfuse->fuse_tid, SIGINT); +- } +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/log.h" ++#include "spdk/env.h" ++#include "spdk/event.h" ++#include "spdk/thread.h" ++#include "spdk/string.h" ++#include "spdk/blobfs.h" ++ ++#include "blobfs_fuse.h" ++ ++#define FUSE_USE_VERSION 30 ++#include "fuse3/fuse.h" ++#include "fuse3/fuse_lowlevel.h" ++ ++struct spdk_blobfs_fuse { ++ char *bdev_name; ++ char *mountpoint; ++ struct spdk_fs_thread_ctx *channel; ++ struct spdk_filesystem *fs; ++ ++ struct fuse *fuse_handle; ++ pthread_t fuse_tid; ++ ++ blobfs_fuse_unmount_cb cb_fn; ++ void *cb_arg; ++}; ++ ++/* Each thread serves one blobfs */ ++static __thread struct spdk_blobfs_fuse *thd_bfuse; ++ ++static void ++blobfs_fuse_free(struct spdk_blobfs_fuse *bfuse) ++{ ++ if (bfuse == NULL) { ++ return; ++ } ++ ++ free(bfuse->bdev_name); ++ free(bfuse->mountpoint); ++ free(bfuse); ++} ++ ++static void ++__call_fn(void *arg1, void *arg2) ++{ ++ fs_request_fn fn; ++ ++ fn = (fs_request_fn)arg1; ++ fn(arg2); ++} ++ ++void ++blobfs_fuse_send_request(fs_request_fn fn, void *arg) ++{ ++ struct spdk_event *event; ++ ++ event = spdk_event_allocate(0, __call_fn, (void *)fn, arg); ++ spdk_event_call(event); ++} ++ ++static int ++fuse_getattr(const char *path, struct stat *stbuf, struct fuse_file_info *fi) ++{ ++ struct spdk_file_stat stat; ++ int rc; ++ ++ if (!strcmp(path, "/")) { ++ stbuf->st_mode = S_IFDIR | 0755; ++ stbuf->st_nlink = 2; ++ return 0; ++ } ++ ++ rc = spdk_fs_file_stat(thd_bfuse->fs, thd_bfuse->channel, path, &stat); ++ if (rc == 0) { ++ stbuf->st_mode = S_IFREG | 0644; ++ stbuf->st_nlink = 1; ++ stbuf->st_size = stat.size; ++ } ++ ++ return rc; ++} ++ ++static int ++fuse_readdir(const char *path, void *buf, fuse_fill_dir_t filler, ++ off_t offset, struct fuse_file_info *fi, ++ enum fuse_readdir_flags flags) ++{ ++ struct spdk_file *file; ++ const char *filename; ++ spdk_fs_iter iter; ++ ++ filler(buf, ".", NULL, 0, 0); ++ filler(buf, "..", NULL, 0, 0); ++ ++ iter = spdk_fs_iter_first(thd_bfuse->fs); ++ while (iter != NULL) { ++ file = spdk_fs_iter_get_file(iter); ++ iter = spdk_fs_iter_next(iter); ++ filename = spdk_file_get_name(file); ++ filler(buf, &filename[1], NULL, 0, 0); ++ } ++ ++ return 0; ++} ++ ++static int ++fuse_mknod(const char *path, mode_t mode, dev_t rdev) ++{ ++ return spdk_fs_create_file(thd_bfuse->fs, thd_bfuse->channel, path); ++} ++ ++static int ++fuse_unlink(const char *path) ++{ ++ return spdk_fs_delete_file(thd_bfuse->fs, thd_bfuse->channel, path); ++} ++ ++static int ++fuse_truncate(const char *path, off_t size, struct fuse_file_info *fi) ++{ ++ struct spdk_file *file; ++ int rc; ++ ++ rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file); ++ if (rc != 0) { ++ return -rc; ++ } ++ ++ rc = spdk_file_truncate(file, thd_bfuse->channel, size); ++ if (rc != 0) { ++ return -rc; ++ } ++ ++ spdk_file_close(file, thd_bfuse->channel); ++ ++ return 0; ++} ++ ++static int 
++fuse_utimens(const char *path, const struct timespec tv[2], struct fuse_file_info *fi) ++{ ++ return 0; ++} ++ ++static int ++fuse_open(const char *path, struct fuse_file_info *info) ++{ ++ struct spdk_file *file; ++ int rc; ++ ++ rc = spdk_fs_open_file(thd_bfuse->fs, thd_bfuse->channel, path, 0, &file); ++ if (rc != 0) { ++ return -rc; ++ } ++ ++ info->fh = (uintptr_t)file; ++ return 0; ++} ++ ++static int ++fuse_release(const char *path, struct fuse_file_info *info) ++{ ++ struct spdk_file *file = (struct spdk_file *)info->fh; ++ ++ return spdk_file_close(file, thd_bfuse->channel); ++} ++ ++static int ++fuse_read(const char *path, char *buf, size_t len, off_t offset, struct fuse_file_info *info) ++{ ++ struct spdk_file *file = (struct spdk_file *)info->fh; ++ ++ return spdk_file_read(file, thd_bfuse->channel, buf, offset, len); ++} ++ ++static int ++fuse_write(const char *path, const char *buf, size_t len, off_t offset, ++ struct fuse_file_info *info) ++{ ++ struct spdk_file *file = (struct spdk_file *)info->fh; ++ int rc; ++ ++ rc = spdk_file_write(file, thd_bfuse->channel, (void *)buf, offset, len); ++ if (rc == 0) { ++ return len; ++ } else { ++ return rc; ++ } ++} ++ ++static int ++fuse_flush(const char *path, struct fuse_file_info *info) ++{ ++ return 0; ++} ++ ++static int ++fuse_fsync(const char *path, int datasync, struct fuse_file_info *info) ++{ ++ return 0; ++} ++ ++static int ++fuse_rename(const char *old_path, const char *new_path, unsigned int flags) ++{ ++ return spdk_fs_rename_file(thd_bfuse->fs, thd_bfuse->channel, old_path, new_path); ++} ++ ++static struct fuse_operations spdk_fuse_oper = { ++ .getattr = fuse_getattr, ++ .readdir = fuse_readdir, ++ .mknod = fuse_mknod, ++ .unlink = fuse_unlink, ++ .truncate = fuse_truncate, ++ .utimens = fuse_utimens, ++ .open = fuse_open, ++ .release = fuse_release, ++ .read = fuse_read, ++ .write = fuse_write, ++ .flush = fuse_flush, ++ .fsync = fuse_fsync, ++ .rename = fuse_rename, ++}; ++ ++static void * ++fuse_loop_new_thread(void *arg) ++{ ++ struct spdk_blobfs_fuse *bfuse = arg; ++ ++ spdk_unaffinitize_thread(); ++ ++ thd_bfuse = bfuse; ++ SPDK_NOTICELOG("Start to loop blobfs on bdev %s mounted at %s\n", bfuse->bdev_name, ++ bfuse->mountpoint); ++ ++ bfuse->channel = spdk_fs_alloc_thread_ctx(bfuse->fs); ++ ++ fuse_loop(bfuse->fuse_handle); ++ fuse_unmount(bfuse->fuse_handle); ++ fuse_destroy(bfuse->fuse_handle); ++ SPDK_NOTICELOG("Blobfs on bdev %s unmounted from %s\n", bfuse->bdev_name, bfuse->mountpoint); ++ ++ spdk_fs_free_thread_ctx(bfuse->channel); ++ ++ bfuse->cb_fn(bfuse->cb_arg); ++ ++ blobfs_fuse_free(bfuse); ++ ++ pthread_exit(NULL); ++} ++ ++int ++blobfs_fuse_start(const char *bdev_name, const char *mountpoint, struct spdk_filesystem *fs, ++ blobfs_fuse_unmount_cb cb_fn, void *cb_arg, struct spdk_blobfs_fuse **_bfuse) ++{ ++ /* Set argv[1] as bdev_name in order to show bdev_name as the mounting source */ ++ char *argv[1] = {(char *)bdev_name}; ++ struct fuse_args args = FUSE_ARGS_INIT(1, argv); ++ struct fuse_cmdline_opts opts = {}; ++ struct fuse *fuse_handle; ++ struct spdk_blobfs_fuse *bfuse; ++ pthread_t tid; ++ int rc; ++ ++ bfuse = (struct spdk_blobfs_fuse *)calloc(1, sizeof(*bfuse)); ++ if (bfuse == NULL) { ++ return -ENOMEM; ++ } ++ ++ bfuse->bdev_name = strdup(bdev_name); ++ bfuse->mountpoint = strdup(mountpoint); ++ if (!bfuse->bdev_name || !bfuse->mountpoint) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ bfuse->fs = fs; ++ bfuse->cb_fn = cb_fn; ++ bfuse->cb_arg = cb_arg; ++ ++ rc = fuse_parse_cmdline(&args, 
&opts); ++ assert(rc == 0); ++ ++ fuse_handle = fuse_new(&args, &spdk_fuse_oper, sizeof(spdk_fuse_oper), NULL); ++ fuse_opt_free_args(&args); ++ if (fuse_handle == NULL) { ++ SPDK_ERRLOG("could not create fuse handle!\n"); ++ rc = -1; ++ goto err; ++ } ++ bfuse->fuse_handle = fuse_handle; ++ ++ rc = fuse_mount(bfuse->fuse_handle, bfuse->mountpoint); ++ if (rc != 0) { ++ SPDK_ERRLOG("could not mount fuse handle\n"); ++ rc = -1; ++ goto err; ++ } ++ ++ rc = pthread_create(&tid, NULL, fuse_loop_new_thread, bfuse); ++ if (rc != 0) { ++ SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc)); ++ rc = -rc; ++ goto err; ++ } ++ bfuse->fuse_tid = tid; ++ ++ rc = pthread_detach(tid); ++ if (rc != 0) { ++ SPDK_ERRLOG("could not detach thread for fuse loop thread: %s\n", spdk_strerror(rc)); ++ rc = -rc; ++ goto err; ++ } ++ ++ *_bfuse = bfuse; ++ return 0; ++ ++err: ++ blobfs_fuse_free(bfuse); ++ ++ return rc; ++} ++ ++void ++blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse) ++{ ++ if (bfuse) { ++ fuse_session_exit(fuse_get_session(bfuse->fuse_handle)); ++ pthread_kill(bfuse->fuse_tid, SIGINT); ++ } ++} +diff --git a/module/blobfs/bdev/blobfs_fuse.h b/module/blobfs/bdev/blobfs_fuse.h +index aa48e36..dadd680 100644 +--- a/module/blobfs/bdev/blobfs_fuse.h ++++ b/module/blobfs/bdev/blobfs_fuse.h +@@ -1,24 +1,24 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef SPDK_BLOBFS_FUSE_H +-#define SPDK_BLOBFS_FUSE_H +- +-#include "spdk/stdinc.h" +-#include "spdk/blobfs.h" +- +-struct spdk_blobfs_fuse; +- +-void blobfs_fuse_send_request(fs_request_fn fn, void *arg); +- +-typedef void (*blobfs_fuse_unmount_cb)(void *arg); +- +-int blobfs_fuse_start(const char *bdev_name, const char *mountpoint, +- struct spdk_filesystem *fs, blobfs_fuse_unmount_cb cb_fn, +- void *cb_arg, struct spdk_blobfs_fuse **bfuse); +- +-void blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse); +- +-#endif /* SPDK_BLOBFS_FUSE_H */ ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#ifndef SPDK_BLOBFS_FUSE_H ++#define SPDK_BLOBFS_FUSE_H ++ ++#include "spdk/stdinc.h" ++#include "spdk/blobfs.h" ++ ++struct spdk_blobfs_fuse; ++ ++void blobfs_fuse_send_request(fs_request_fn fn, void *arg); ++ ++typedef void (*blobfs_fuse_unmount_cb)(void *arg); ++ ++int blobfs_fuse_start(const char *bdev_name, const char *mountpoint, ++ struct spdk_filesystem *fs, blobfs_fuse_unmount_cb cb_fn, ++ void *cb_arg, struct spdk_blobfs_fuse **bfuse); ++ ++void blobfs_fuse_stop(struct spdk_blobfs_fuse *bfuse); ++ ++#endif /* SPDK_BLOBFS_FUSE_H */ +diff --git a/module/blobfs/bdev/spdk_blobfs_bdev.map b/module/blobfs/bdev/spdk_blobfs_bdev.map +index e3d461c..c51ba13 100644 +--- a/module/blobfs/bdev/spdk_blobfs_bdev.map ++++ b/module/blobfs/bdev/spdk_blobfs_bdev.map +@@ -1,8 +1,8 @@ +-{ +- global: +- spdk_blobfs_bdev_detect; +- spdk_blobfs_bdev_create; +- spdk_blobfs_bdev_mount; +- +- local: *; +-}; ++{ ++ global: ++ spdk_blobfs_bdev_detect; ++ spdk_blobfs_bdev_create; ++ spdk_blobfs_bdev_mount; ++ ++ local: *; ++}; +diff --git a/module/env_dpdk/Makefile b/module/env_dpdk/Makefile +index fcd3a9b..7f39dd0 100644 +--- a/module/env_dpdk/Makefile ++++ b/module/env_dpdk/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = env_dpdk_rpc.c +-LIBNAME = env_dpdk_rpc +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = env_dpdk_rpc.c ++LIBNAME = env_dpdk_rpc ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/env_dpdk/env_dpdk_rpc.c b/module/env_dpdk/env_dpdk_rpc.c +index 0e2cbcb..80cf93d 100644 +--- a/module/env_dpdk/env_dpdk_rpc.c ++++ b/module/env_dpdk/env_dpdk_rpc.c +@@ -1,40 +1,40 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. All rights reserved. +- * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/rpc.h" +-#include "spdk/env_dpdk.h" +-#include "spdk/log.h" +- +-static void +-rpc_env_dpdk_get_mem_stats(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- FILE *file = NULL; +- struct spdk_json_write_ctx *w; +- char default_filename[] = "/tmp/spdk_mem_dump.txt"; +- +- if (params != NULL) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "env_dpdk_get_mem_stats doesn't accept any parameters.\n"); +- } +- +- file = fopen(default_filename, "w"); +- if (!file) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Unable to open file for writing.\n"); +- return; +- } +- +- spdk_env_dpdk_dump_mem_stats(file); +- fclose(file); +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "filename", default_filename); +- spdk_json_write_object_end(w); +- spdk_jsonrpc_end_result(request, w); +-} +-SPDK_RPC_REGISTER("env_dpdk_get_mem_stats", rpc_env_dpdk_get_mem_stats, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. All rights reserved. ++ * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/rpc.h" ++#include "spdk/env_dpdk.h" ++#include "spdk/log.h" ++ ++static void ++rpc_env_dpdk_get_mem_stats(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ FILE *file = NULL; ++ struct spdk_json_write_ctx *w; ++ char default_filename[] = "/tmp/spdk_mem_dump.txt"; ++ ++ if (params != NULL) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "env_dpdk_get_mem_stats doesn't accept any parameters.\n"); ++ } ++ ++ file = fopen(default_filename, "w"); ++ if (!file) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Unable to open file for writing.\n"); ++ return; ++ } ++ ++ spdk_env_dpdk_dump_mem_stats(file); ++ fclose(file); ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "filename", default_filename); ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++} ++SPDK_RPC_REGISTER("env_dpdk_get_mem_stats", rpc_env_dpdk_get_mem_stats, SPDK_RPC_RUNTIME) +diff --git a/module/event/Makefile b/module/event/Makefile +index 493093b..6c53b37 100644 +--- a/module/event/Makefile ++++ b/module/event/Makefile +@@ -1,16 +1,16 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y = subsystems +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y = subsystems ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/event/subsystems/Makefile b/module/event/subsystems/Makefile +index 8242329..3a7be5b 100644 +--- a/module/event/subsystems/Makefile ++++ b/module/event/subsystems/Makefile +@@ -1,39 +1,41 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y += bdev accel scheduler iscsi nvmf scsi vmd sock iobuf +- +-ifeq ($(OS),Linux) +-DIRS-y += nbd +-ifeq ($(CONFIG_UBLK),y) +-DIRS-y += ublk +-endif +-endif +- +-DIRS-$(CONFIG_VHOST) += vhost_blk vhost_scsi +-DIRS-$(CONFIG_VFIO_USER) += vfu_tgt +- +-# These dependencies are not based specifically on symbols, but rather +-# the subsystem dependency tree defined within the event subsystem C files +-# themselves. Should that tree change, these dependencies should change +-# accordingly. +-DEPDIRS-accel := iobuf +-DEPDIRS-bdev := accel vmd sock iobuf +-DEPDIRS-iscsi := scsi +-DEPDIRS-nbd := bdev +-DEPDIRS-ublk := bdev +-DEPDIRS-nvmf := bdev +-DEPDIRS-scsi := bdev +-DEPDIRS-vhost_scsi := scsi +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y += bdev accel scheduler iscsi nvmf scsi vmd sock iobuf ++ ++ifeq ($(OS),Linux) ++DIRS-y += nbd ++ifeq ($(CONFIG_UBLK),y) ++DIRS-y += ublk ++endif ++endif ++ ++DIRS-$(CONFIG_VHOST) += vhost_blk vhost_scsi ++DIRS-$(CONFIG_SSAM) += ssam ++DIRS-$(CONFIG_VFIO_USER) += vfu_tgt ++ ++# These dependencies are not based specifically on symbols, but rather ++# the subsystem dependency tree defined within the event subsystem C files ++# themselves. Should that tree change, these dependencies should change ++# accordingly. ++DEPDIRS-accel := iobuf ++DEPDIRS-bdev := accel vmd sock iobuf ++DEPDIRS-iscsi := scsi ++DEPDIRS-nbd := bdev ++DEPDIRS-ublk := bdev ++DEPDIRS-nvmf := bdev ++DEPDIRS-scsi := bdev ++DEPDIRS-vhost_scsi := scsi ++DEPDIRS-ssam := scsi ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/event/subsystems/accel/Makefile b/module/event/subsystems/accel/Makefile +index fb1b730..58705e5 100644 +--- a/module/event/subsystems/accel/Makefile ++++ b/module/event/subsystems/accel/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = accel.c +-LIBNAME = event_accel +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = accel.c ++LIBNAME = event_accel ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/accel/accel.c b/module/event/subsystems/accel/accel.c +index b2aca4d..af2cc7d 100644 +--- a/module/event/subsystems/accel/accel.c ++++ b/module/event/subsystems/accel/accel.c +@@ -1,43 +1,43 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2020 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/accel.h" +- +-#include "spdk_internal/init.h" +-#include "spdk/env.h" +- +-static void +-accel_subsystem_initialize(void) +-{ +- int rc; +- +- rc = spdk_accel_initialize(); +- +- spdk_subsystem_init_next(rc); +-} +- +-static void +-accel_subsystem_finish_done(void *cb_arg) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-accel_subsystem_finish(void) +-{ +- spdk_accel_finish(accel_subsystem_finish_done, NULL); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_accel = { +- .name = "accel", +- .init = accel_subsystem_initialize, +- .fini = accel_subsystem_finish, +- .write_config_json = spdk_accel_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_accel); +-SPDK_SUBSYSTEM_DEPEND(accel, iobuf) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2020 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/accel.h" ++ ++#include "spdk_internal/init.h" ++#include "spdk/env.h" ++ ++static void ++accel_subsystem_initialize(void) ++{ ++ int rc; ++ ++ rc = spdk_accel_initialize(); ++ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++accel_subsystem_finish_done(void *cb_arg) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++accel_subsystem_finish(void) ++{ ++ spdk_accel_finish(accel_subsystem_finish_done, NULL); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_accel = { ++ .name = "accel", ++ .init = accel_subsystem_initialize, ++ .fini = accel_subsystem_finish, ++ .write_config_json = spdk_accel_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_accel); ++SPDK_SUBSYSTEM_DEPEND(accel, iobuf) +diff --git a/module/event/subsystems/bdev/Makefile b/module/event/subsystems/bdev/Makefile +index 5338858..846e9b8 100644 +--- a/module/event/subsystems/bdev/Makefile ++++ b/module/event/subsystems/bdev/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = bdev.c +-LIBNAME = event_bdev +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = bdev.c ++LIBNAME = event_bdev ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/bdev/bdev.c b/module/event/subsystems/bdev/bdev.c +index fa02859..1b4c0db 100644 +--- a/module/event/subsystems/bdev/bdev.c ++++ b/module/event/subsystems/bdev/bdev.c +@@ -1,56 +1,56 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/bdev.h" +-#include "spdk/env.h" +-#include "spdk/thread.h" +- +-#include "spdk_internal/init.h" +-#include "spdk/env.h" +- +-static void +-bdev_initialize_complete(void *cb_arg, int rc) +-{ +- spdk_subsystem_init_next(rc); +-} +- +-static void +-bdev_subsystem_initialize(void) +-{ +- spdk_bdev_initialize(bdev_initialize_complete, NULL); +-} +- +-static void +-bdev_subsystem_finish_done(void *cb_arg) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-bdev_subsystem_finish(void) +-{ +- spdk_bdev_finish(bdev_subsystem_finish_done, NULL); +-} +- +-static void +-bdev_subsystem_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_bdev_subsystem_config_json(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_bdev = { +- .name = "bdev", +- .init = bdev_subsystem_initialize, +- .fini = bdev_subsystem_finish, +- .write_config_json = bdev_subsystem_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev); +-SPDK_SUBSYSTEM_DEPEND(bdev, accel) +-SPDK_SUBSYSTEM_DEPEND(bdev, vmd) +-SPDK_SUBSYSTEM_DEPEND(bdev, sock) +-SPDK_SUBSYSTEM_DEPEND(bdev, iobuf) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/env.h" ++#include "spdk/thread.h" ++ ++#include "spdk_internal/init.h" ++#include "spdk/env.h" ++ ++static void ++bdev_initialize_complete(void *cb_arg, int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++bdev_subsystem_initialize(void) ++{ ++ spdk_bdev_initialize(bdev_initialize_complete, NULL); ++} ++ ++static void ++bdev_subsystem_finish_done(void *cb_arg) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++bdev_subsystem_finish(void) ++{ ++ spdk_bdev_finish(bdev_subsystem_finish_done, NULL); ++} ++ ++static void ++bdev_subsystem_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_bdev_subsystem_config_json(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_bdev = { ++ .name = "bdev", ++ .init = bdev_subsystem_initialize, ++ .fini = bdev_subsystem_finish, ++ .write_config_json = bdev_subsystem_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev); ++SPDK_SUBSYSTEM_DEPEND(bdev, accel) ++SPDK_SUBSYSTEM_DEPEND(bdev, vmd) ++SPDK_SUBSYSTEM_DEPEND(bdev, sock) ++SPDK_SUBSYSTEM_DEPEND(bdev, iobuf) +diff --git a/module/event/subsystems/iobuf/Makefile b/module/event/subsystems/iobuf/Makefile +index 96a89cc..ba34d66 100644 +--- a/module/event/subsystems/iobuf/Makefile ++++ b/module/event/subsystems/iobuf/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = iobuf.c iobuf_rpc.c +-LIBNAME = event_iobuf +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = iobuf.c iobuf_rpc.c ++LIBNAME = event_iobuf ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/iobuf/iobuf.c b/module/event/subsystems/iobuf/iobuf.c +index 785f855..7ae7e02 100644 +--- a/module/event/subsystems/iobuf/iobuf.c ++++ b/module/event/subsystems/iobuf/iobuf.c +@@ -1,106 +1,106 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/bdev.h" +-#include "spdk/log.h" +-#include "spdk/thread.h" +-#include "spdk_internal/init.h" +- +-int iobuf_set_opts(struct spdk_iobuf_opts *opts); +- +-static struct spdk_iobuf_opts g_opts; +-static bool g_opts_set; +- +-int +-iobuf_set_opts(struct spdk_iobuf_opts *opts) +-{ +- int rc; +- +- rc = spdk_iobuf_set_opts(opts); +- if (rc != 0) { +- return rc; +- } +- +- g_opts = *opts; +- g_opts_set = true; +- +- return 0; +-} +- +-static void +-iobuf_subsystem_initialize(void) +-{ +- int rc; +- +- if (g_opts_set) { +- /* We want to allow users to keep using bdev layer's spdk_bdev_opts to specify the +- * sizes of the pools, but want to have iobuf_set_opts to take precedence over what +- * was set by the spdk_bdev_opts. So, reset the opts here in case bdev layer set +- * them after iobuf_set_opts. 
+- */ +- rc = spdk_iobuf_set_opts(&g_opts); +- if (rc != 0) { +- /* This should never happen, we've already validated these options */ +- assert(0); +- goto finish; +- } +- } +- +- rc = spdk_iobuf_initialize(); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to initialize iobuf\n"); +- } +-finish: +- spdk_subsystem_init_next(rc); +-} +- +-static void +-iobuf_finish_cb(void *ctx) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-iobuf_subsystem_finish(void) +-{ +- spdk_iobuf_finish(iobuf_finish_cb, NULL); +-} +- +-static void +-iobuf_write_config_json(struct spdk_json_write_ctx *w) +-{ +- struct spdk_iobuf_opts opts; +- +- spdk_iobuf_get_opts(&opts); +- +- spdk_json_write_array_begin(w); +- /* Make sure we don't override the options from spdk_bdev_opts, unless iobuf_set_options +- * has been executed +- */ +- if (g_opts_set) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "iobuf_set_options"); +- +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_uint64(w, "small_pool_count", opts.small_pool_count); +- spdk_json_write_named_uint64(w, "large_pool_count", opts.large_pool_count); +- spdk_json_write_named_uint32(w, "small_bufsize", opts.small_bufsize); +- spdk_json_write_named_uint32(w, "large_bufsize", opts.large_bufsize); +- spdk_json_write_object_end(w); +- +- spdk_json_write_object_end(w); +- } +- spdk_json_write_array_end(w); +-} +- +-static struct spdk_subsystem g_subsystem_iobuf = { +- .name = "iobuf", +- .init = iobuf_subsystem_initialize, +- .fini = iobuf_subsystem_finish, +- .write_config_json = iobuf_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_subsystem_iobuf); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/bdev.h" ++#include "spdk/log.h" ++#include "spdk/thread.h" ++#include "spdk_internal/init.h" ++ ++int iobuf_set_opts(struct spdk_iobuf_opts *opts); ++ ++static struct spdk_iobuf_opts g_opts; ++static bool g_opts_set; ++ ++int ++iobuf_set_opts(struct spdk_iobuf_opts *opts) ++{ ++ int rc; ++ ++ rc = spdk_iobuf_set_opts(opts); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ g_opts = *opts; ++ g_opts_set = true; ++ ++ return 0; ++} ++ ++static void ++iobuf_subsystem_initialize(void) ++{ ++ int rc; ++ ++ if (g_opts_set) { ++ /* We want to allow users to keep using bdev layer's spdk_bdev_opts to specify the ++ * sizes of the pools, but want to have iobuf_set_opts to take precedence over what ++ * was set by the spdk_bdev_opts. So, reset the opts here in case bdev layer set ++ * them after iobuf_set_opts. 
++ */ ++ rc = spdk_iobuf_set_opts(&g_opts); ++ if (rc != 0) { ++ /* This should never happen, we've already validated these options */ ++ assert(0); ++ goto finish; ++ } ++ } ++ ++ rc = spdk_iobuf_initialize(); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to initialize iobuf\n"); ++ } ++finish: ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++iobuf_finish_cb(void *ctx) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++iobuf_subsystem_finish(void) ++{ ++ spdk_iobuf_finish(iobuf_finish_cb, NULL); ++} ++ ++static void ++iobuf_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct spdk_iobuf_opts opts; ++ ++ spdk_iobuf_get_opts(&opts); ++ ++ spdk_json_write_array_begin(w); ++ /* Make sure we don't override the options from spdk_bdev_opts, unless iobuf_set_options ++ * has been executed ++ */ ++ if (g_opts_set) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "iobuf_set_options"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_uint64(w, "small_pool_count", opts.small_pool_count); ++ spdk_json_write_named_uint64(w, "large_pool_count", opts.large_pool_count); ++ spdk_json_write_named_uint32(w, "small_bufsize", opts.small_bufsize); ++ spdk_json_write_named_uint32(w, "large_bufsize", opts.large_bufsize); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_object_end(w); ++ } ++ spdk_json_write_array_end(w); ++} ++ ++static struct spdk_subsystem g_subsystem_iobuf = { ++ .name = "iobuf", ++ .init = iobuf_subsystem_initialize, ++ .fini = iobuf_subsystem_finish, ++ .write_config_json = iobuf_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_subsystem_iobuf); +diff --git a/module/event/subsystems/iobuf/iobuf_rpc.c b/module/event/subsystems/iobuf/iobuf_rpc.c +index 6186c05..beea30e 100644 +--- a/module/event/subsystems/iobuf/iobuf_rpc.c ++++ b/module/event/subsystems/iobuf/iobuf_rpc.c +@@ -1,44 +1,44 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/thread.h" +-#include "spdk/rpc.h" +-#include "spdk/string.h" +-#include "spdk_internal/init.h" +- +-int iobuf_set_opts(struct spdk_iobuf_opts *opts); +- +-static const struct spdk_json_object_decoder rpc_iobuf_set_options_decoders[] = { +- {"small_pool_count", offsetof(struct spdk_iobuf_opts, small_pool_count), spdk_json_decode_uint64, true}, +- {"large_pool_count", offsetof(struct spdk_iobuf_opts, large_pool_count), spdk_json_decode_uint64, true}, +- {"small_bufsize", offsetof(struct spdk_iobuf_opts, small_bufsize), spdk_json_decode_uint32, true}, +- {"large_bufsize", offsetof(struct spdk_iobuf_opts, large_bufsize), spdk_json_decode_uint32, true}, +-}; +- +-static void +-rpc_iobuf_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +-{ +- struct spdk_iobuf_opts opts; +- int rc; +- +- spdk_iobuf_get_opts(&opts); +- rc = spdk_json_decode_object(params, rpc_iobuf_set_options_decoders, +- SPDK_COUNTOF(rpc_iobuf_set_options_decoders), &opts); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "spdk_json_decode_object failed"); +- return; +- } +- +- rc = iobuf_set_opts(&opts); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- return; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("iobuf_set_options", rpc_iobuf_set_options, SPDK_RPC_STARTUP) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/thread.h" ++#include "spdk/rpc.h" ++#include "spdk/string.h" ++#include "spdk_internal/init.h" ++ ++int iobuf_set_opts(struct spdk_iobuf_opts *opts); ++ ++static const struct spdk_json_object_decoder rpc_iobuf_set_options_decoders[] = { ++ {"small_pool_count", offsetof(struct spdk_iobuf_opts, small_pool_count), spdk_json_decode_uint64, true}, ++ {"large_pool_count", offsetof(struct spdk_iobuf_opts, large_pool_count), spdk_json_decode_uint64, true}, ++ {"small_bufsize", offsetof(struct spdk_iobuf_opts, small_bufsize), spdk_json_decode_uint32, true}, ++ {"large_bufsize", offsetof(struct spdk_iobuf_opts, large_bufsize), spdk_json_decode_uint32, true}, ++}; ++ ++static void ++rpc_iobuf_set_options(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ struct spdk_iobuf_opts opts; ++ int rc; ++ ++ spdk_iobuf_get_opts(&opts); ++ rc = spdk_json_decode_object(params, rpc_iobuf_set_options_decoders, ++ SPDK_COUNTOF(rpc_iobuf_set_options_decoders), &opts); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "spdk_json_decode_object failed"); ++ return; ++ } ++ ++ rc = iobuf_set_opts(&opts); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ return; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("iobuf_set_options", rpc_iobuf_set_options, SPDK_RPC_STARTUP) +diff --git a/module/event/subsystems/iscsi/Makefile b/module/event/subsystems/iscsi/Makefile +index 2c091c3..ae000c2 100644 +--- a/module/event/subsystems/iscsi/Makefile ++++ b/module/event/subsystems/iscsi/Makefile +@@ -1,18 +1,18 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
+-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-CFLAGS += -I$(SPDK_ROOT_DIR)/lib +-C_SRCS = iscsi.c +-LIBNAME = event_iscsi +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++CFLAGS += -I$(SPDK_ROOT_DIR)/lib ++C_SRCS = iscsi.c ++LIBNAME = event_iscsi ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/iscsi/iscsi.c b/module/event/subsystems/iscsi/iscsi.c +index ef66c22..dfa93a8 100644 +--- a/module/event/subsystems/iscsi/iscsi.c ++++ b/module/event/subsystems/iscsi/iscsi.c +@@ -1,51 +1,51 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "iscsi/iscsi.h" +- +-#include "spdk_internal/init.h" +- +-static void +-iscsi_subsystem_init_complete(void *cb_arg, int rc) +-{ +- spdk_subsystem_init_next(rc); +-} +- +-static void +-iscsi_subsystem_init(void) +-{ +- spdk_iscsi_init(iscsi_subsystem_init_complete, NULL); +-} +- +-static void +-iscsi_subsystem_fini_done(void *arg) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-iscsi_subsystem_fini(void) +-{ +- spdk_iscsi_fini(iscsi_subsystem_fini_done, NULL); +-} +- +-static void +-iscsi_subsystem_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_iscsi_config_json(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_iscsi = { +- .name = "iscsi", +- .init = iscsi_subsystem_init, +- .fini = iscsi_subsystem_fini, +- .write_config_json = iscsi_subsystem_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_iscsi); +-SPDK_SUBSYSTEM_DEPEND(iscsi, scsi) +-SPDK_SUBSYSTEM_DEPEND(iscsi, sock) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "iscsi/iscsi.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++iscsi_subsystem_init_complete(void *cb_arg, int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++iscsi_subsystem_init(void) ++{ ++ spdk_iscsi_init(iscsi_subsystem_init_complete, NULL); ++} ++ ++static void ++iscsi_subsystem_fini_done(void *arg) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++iscsi_subsystem_fini(void) ++{ ++ spdk_iscsi_fini(iscsi_subsystem_fini_done, NULL); ++} ++ ++static void ++iscsi_subsystem_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_iscsi_config_json(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_iscsi = { ++ .name = "iscsi", ++ .init = iscsi_subsystem_init, ++ .fini = iscsi_subsystem_fini, ++ .write_config_json = iscsi_subsystem_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_iscsi); ++SPDK_SUBSYSTEM_DEPEND(iscsi, scsi) ++SPDK_SUBSYSTEM_DEPEND(iscsi, sock) +diff --git a/module/event/subsystems/nbd/Makefile b/module/event/subsystems/nbd/Makefile +index c2fe23a..593b227 100644 +--- a/module/event/subsystems/nbd/Makefile ++++ b/module/event/subsystems/nbd/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
+-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = nbd.c +-LIBNAME = event_nbd +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = nbd.c ++LIBNAME = event_nbd ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/nbd/nbd.c b/module/event/subsystems/nbd/nbd.c +index a482ad7..9ca2d5e 100644 +--- a/module/event/subsystems/nbd/nbd.c ++++ b/module/event/subsystems/nbd/nbd.c +@@ -1,48 +1,48 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2017 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/nbd.h" +- +-#include "spdk_internal/init.h" +- +-static void +-nbd_subsystem_init(void) +-{ +- int rc; +- +- rc = spdk_nbd_init(); +- +- spdk_subsystem_init_next(rc); +-} +- +-static void +-nbd_subsystem_fini_done(void *arg) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-nbd_subsystem_fini(void) +-{ +- spdk_nbd_fini(nbd_subsystem_fini_done, NULL); +-} +- +-static void +-nbd_subsystem_write_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_nbd_write_config_json(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_nbd = { +- .name = "nbd", +- .init = nbd_subsystem_init, +- .fini = nbd_subsystem_fini, +- .write_config_json = nbd_subsystem_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nbd); +-SPDK_SUBSYSTEM_DEPEND(nbd, bdev) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2017 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/nbd.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++nbd_subsystem_init(void) ++{ ++ int rc; ++ ++ rc = spdk_nbd_init(); ++ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++nbd_subsystem_fini_done(void *arg) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++nbd_subsystem_fini(void) ++{ ++ spdk_nbd_fini(nbd_subsystem_fini_done, NULL); ++} ++ ++static void ++nbd_subsystem_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_nbd_write_config_json(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_nbd = { ++ .name = "nbd", ++ .init = nbd_subsystem_init, ++ .fini = nbd_subsystem_fini, ++ .write_config_json = nbd_subsystem_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nbd); ++SPDK_SUBSYSTEM_DEPEND(nbd, bdev) +diff --git a/module/event/subsystems/nvmf/Makefile b/module/event/subsystems/nvmf/Makefile +index 7d8fcfa..5b2afb5 100644 +--- a/module/event/subsystems/nvmf/Makefile ++++ b/module/event/subsystems/nvmf/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = nvmf_rpc.c nvmf_tgt.c +-LIBNAME = event_nvmf +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = nvmf_rpc.c nvmf_tgt.c ++LIBNAME = event_nvmf ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/nvmf/event_nvmf.h b/module/event/subsystems/nvmf/event_nvmf.h +index 82e614d..0bf118b 100644 +--- a/module/event/subsystems/nvmf/event_nvmf.h ++++ b/module/event/subsystems/nvmf/event_nvmf.h +@@ -1,36 +1,36 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#ifndef NVMF_TGT_H +-#define NVMF_TGT_H +- +-#include "spdk/stdinc.h" +- +-#include "spdk/nvmf.h" +-#include "spdk/queue.h" +- +-#include "spdk_internal/init.h" +-#include "spdk/log.h" +- +-struct spdk_nvmf_admin_passthru_conf { +- bool identify_ctrlr; +-}; +- +-struct spdk_nvmf_tgt_conf { +- struct spdk_nvmf_admin_passthru_conf admin_passthru; +- enum spdk_nvmf_tgt_discovery_filter discovery_filter; +-}; +- +-extern struct spdk_nvmf_tgt_conf g_spdk_nvmf_tgt_conf; +- +-extern uint32_t g_spdk_nvmf_tgt_max_subsystems; +-extern uint16_t g_spdk_nvmf_tgt_crdt[3]; +- +-extern struct spdk_nvmf_tgt *g_spdk_nvmf_tgt; +- +-extern struct spdk_cpuset *g_poll_groups_mask; +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#ifndef NVMF_TGT_H ++#define NVMF_TGT_H ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/nvmf.h" ++#include "spdk/queue.h" ++ ++#include "spdk_internal/init.h" ++#include "spdk/log.h" ++ ++struct spdk_nvmf_admin_passthru_conf { ++ bool identify_ctrlr; ++}; ++ ++struct spdk_nvmf_tgt_conf { ++ struct spdk_nvmf_admin_passthru_conf admin_passthru; ++ enum spdk_nvmf_tgt_discovery_filter discovery_filter; ++}; ++ ++extern struct spdk_nvmf_tgt_conf g_spdk_nvmf_tgt_conf; ++ ++extern uint32_t g_spdk_nvmf_tgt_max_subsystems; ++extern uint16_t g_spdk_nvmf_tgt_crdt[3]; ++ ++extern struct spdk_nvmf_tgt *g_spdk_nvmf_tgt; ++ ++extern struct spdk_cpuset *g_poll_groups_mask; ++ ++#endif +diff --git a/module/event/subsystems/nvmf/nvmf_rpc.c b/module/event/subsystems/nvmf/nvmf_rpc.c +index 66a4d2b..fb931d1 100644 +--- a/module/event/subsystems/nvmf/nvmf_rpc.c ++++ b/module/event/subsystems/nvmf/nvmf_rpc.c +@@ -1,230 +1,230 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. All rights reserved. +- * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+- */ +- +-#include "event_nvmf.h" +- +-#include "spdk/rpc.h" +-#include "spdk/util.h" +-#include "spdk/cpuset.h" +- +-static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_opts_decoder[] = { +- {"max_subsystems", 0, spdk_json_decode_uint32, true} +-}; +- +-static void +-rpc_nvmf_set_max_subsystems(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- uint32_t max_subsystems = 0; +- +- if (g_spdk_nvmf_tgt_max_subsystems != 0) { +- SPDK_ERRLOG("this RPC must not be called more than once.\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "Must not call more than once"); +- return; +- } +- +- if (params != NULL) { +- if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_opts_decoder, +- SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_opts_decoder), &max_subsystems)) { +- SPDK_ERRLOG("spdk_json_decode_object() failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- return; +- } +- } +- +- g_spdk_nvmf_tgt_max_subsystems = max_subsystems; +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("nvmf_set_max_subsystems", rpc_nvmf_set_max_subsystems, +- SPDK_RPC_STARTUP) +- +-static const struct spdk_json_object_decoder admin_passthru_decoder[] = { +- {"identify_ctrlr", offsetof(struct spdk_nvmf_admin_passthru_conf, identify_ctrlr), spdk_json_decode_bool} +-}; +- +-static int +-decode_admin_passthru(const struct spdk_json_val *val, void *out) +-{ +- struct spdk_nvmf_admin_passthru_conf *req = (struct spdk_nvmf_admin_passthru_conf *)out; +- +- if (spdk_json_decode_object(val, admin_passthru_decoder, +- SPDK_COUNTOF(admin_passthru_decoder), +- req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- return -1; +- } +- +- return 0; +-} +- +-static int +-decode_discovery_filter(const struct spdk_json_val *val, void *out) +-{ +- enum spdk_nvmf_tgt_discovery_filter *_filter = (enum spdk_nvmf_tgt_discovery_filter *)out; +- enum spdk_nvmf_tgt_discovery_filter filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY; +- char *tokens = spdk_json_strdup(val); +- char *tok; +- int rc = -EINVAL; +- bool all_specified = false; +- +- if (!tokens) { +- return -ENOMEM; +- } +- +- tok = strtok(tokens, ","); +- while (tok) { +- if (strncmp(tok, "match_any", 9) == 0) { +- if (filter != SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY) { +- goto out; +- } +- filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY; +- all_specified = true; +- } else { +- if (all_specified) { +- goto out; +- } +- if (strncmp(tok, "transport", 9) == 0) { +- filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_TYPE; +- } else if (strncmp(tok, "address", 7) == 0) { +- filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_ADDRESS; +- } else if (strncmp(tok, "svcid", 5) == 0) { +- filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_SVCID; +- } else { +- SPDK_ERRLOG("Invalid value %s\n", tok); +- goto out; +- } +- } +- +- tok = strtok(NULL, ","); +- } +- +- rc = 0; +- *_filter = filter; +- +-out: +- free(tokens); +- +- return rc; +-} +- +-static int +-nvmf_is_subset_of_env_core_mask(const struct spdk_cpuset *set) +-{ +- uint32_t i, tmp_counter = 0; +- +- SPDK_ENV_FOREACH_CORE(i) { +- if (spdk_cpuset_get_cpu(set, i)) { +- ++tmp_counter; +- } +- } +- return spdk_cpuset_count(set) - tmp_counter; +-} +- +-static int +-nvmf_decode_poll_groups_mask(const struct spdk_json_val *val, void *out) +-{ +- char *mask = spdk_json_strdup(val); +- int ret = -1; +- +- if (mask == NULL) { +- return -1; +- } +- +- if (!(g_poll_groups_mask = 
spdk_cpuset_alloc())) { +- SPDK_ERRLOG("Unable to allocate a poll groups mask object in nvmf_decode_poll_groups_mask.\n"); +- free(mask); +- return -1; +- } +- +- ret = spdk_cpuset_parse(g_poll_groups_mask, mask); +- free(mask); +- if (ret == 0) { +- if (nvmf_is_subset_of_env_core_mask(g_poll_groups_mask) == 0) { +- return 0; +- } else { +- SPDK_ERRLOG("Poll groups cpumask 0x%s is out of range\n", spdk_cpuset_fmt(g_poll_groups_mask)); +- } +- } else { +- SPDK_ERRLOG("Invalid cpumask\n"); +- } +- +- spdk_cpuset_free(g_poll_groups_mask); +- g_poll_groups_mask = NULL; +- return -1; +-} +- +-static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_conf_decoder[] = { +- {"admin_cmd_passthru", offsetof(struct spdk_nvmf_tgt_conf, admin_passthru), decode_admin_passthru, true}, +- {"poll_groups_mask", 0, nvmf_decode_poll_groups_mask, true}, +- {"discovery_filter", offsetof(struct spdk_nvmf_tgt_conf, discovery_filter), decode_discovery_filter, true} +-}; +- +-static void +-rpc_nvmf_set_config(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct spdk_nvmf_tgt_conf conf; +- +- memcpy(&conf, &g_spdk_nvmf_tgt_conf, sizeof(conf)); +- +- if (params != NULL) { +- if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_conf_decoder, +- SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_conf_decoder), &conf)) { +- SPDK_ERRLOG("spdk_json_decode_object() failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- return; +- } +- } +- +- memcpy(&g_spdk_nvmf_tgt_conf, &conf, sizeof(conf)); +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("nvmf_set_config", rpc_nvmf_set_config, SPDK_RPC_STARTUP) +- +-struct nvmf_rpc_set_crdt { +- uint16_t crdt1; +- uint16_t crdt2; +- uint16_t crdt3; +-}; +- +-static const struct spdk_json_object_decoder rpc_set_crdt_opts_decoders[] = { +- {"crdt1", offsetof(struct nvmf_rpc_set_crdt, crdt1), spdk_json_decode_uint16, true}, +- {"crdt2", offsetof(struct nvmf_rpc_set_crdt, crdt2), spdk_json_decode_uint16, true}, +- {"crdt3", offsetof(struct nvmf_rpc_set_crdt, crdt3), spdk_json_decode_uint16, true}, +-}; +- +-static void +-rpc_nvmf_set_crdt(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct nvmf_rpc_set_crdt rpc_set_crdt; +- +- rpc_set_crdt.crdt1 = 0; +- rpc_set_crdt.crdt2 = 0; +- rpc_set_crdt.crdt3 = 0; +- +- if (params != NULL) { +- if (spdk_json_decode_object(params, rpc_set_crdt_opts_decoders, +- SPDK_COUNTOF(rpc_set_crdt_opts_decoders), &rpc_set_crdt)) { +- SPDK_ERRLOG("spdk_json_decode_object() failed\n"); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- "Invalid parameters"); +- return; +- } +- } +- +- g_spdk_nvmf_tgt_crdt[0] = rpc_set_crdt.crdt1; +- g_spdk_nvmf_tgt_crdt[1] = rpc_set_crdt.crdt2; +- g_spdk_nvmf_tgt_crdt[2] = rpc_set_crdt.crdt3; +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("nvmf_set_crdt", rpc_nvmf_set_crdt, SPDK_RPC_STARTUP) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. All rights reserved. ++ * Copyright (c) 2018-2019 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++ */ ++ ++#include "event_nvmf.h" ++ ++#include "spdk/rpc.h" ++#include "spdk/util.h" ++#include "spdk/cpuset.h" ++ ++static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_opts_decoder[] = { ++ {"max_subsystems", 0, spdk_json_decode_uint32, true} ++}; ++ ++static void ++rpc_nvmf_set_max_subsystems(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ uint32_t max_subsystems = 0; ++ ++ if (g_spdk_nvmf_tgt_max_subsystems != 0) { ++ SPDK_ERRLOG("this RPC must not be called more than once.\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "Must not call more than once"); ++ return; ++ } ++ ++ if (params != NULL) { ++ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_opts_decoder, ++ SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_opts_decoder), &max_subsystems)) { ++ SPDK_ERRLOG("spdk_json_decode_object() failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ return; ++ } ++ } ++ ++ g_spdk_nvmf_tgt_max_subsystems = max_subsystems; ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("nvmf_set_max_subsystems", rpc_nvmf_set_max_subsystems, ++ SPDK_RPC_STARTUP) ++ ++static const struct spdk_json_object_decoder admin_passthru_decoder[] = { ++ {"identify_ctrlr", offsetof(struct spdk_nvmf_admin_passthru_conf, identify_ctrlr), spdk_json_decode_bool} ++}; ++ ++static int ++decode_admin_passthru(const struct spdk_json_val *val, void *out) ++{ ++ struct spdk_nvmf_admin_passthru_conf *req = (struct spdk_nvmf_admin_passthru_conf *)out; ++ ++ if (spdk_json_decode_object(val, admin_passthru_decoder, ++ SPDK_COUNTOF(admin_passthru_decoder), ++ req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int ++decode_discovery_filter(const struct spdk_json_val *val, void *out) ++{ ++ enum spdk_nvmf_tgt_discovery_filter *_filter = (enum spdk_nvmf_tgt_discovery_filter *)out; ++ enum spdk_nvmf_tgt_discovery_filter filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY; ++ char *tokens = spdk_json_strdup(val); ++ char *tok; ++ int rc = -EINVAL; ++ bool all_specified = false; ++ ++ if (!tokens) { ++ return -ENOMEM; ++ } ++ ++ tok = strtok(tokens, ","); ++ while (tok) { ++ if (strncmp(tok, "match_any", 9) == 0) { ++ if (filter != SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY) { ++ goto out; ++ } ++ filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY; ++ all_specified = true; ++ } else { ++ if (all_specified) { ++ goto out; ++ } ++ if (strncmp(tok, "transport", 9) == 0) { ++ filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_TYPE; ++ } else if (strncmp(tok, "address", 7) == 0) { ++ filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_ADDRESS; ++ } else if (strncmp(tok, "svcid", 5) == 0) { ++ filter |= SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_SVCID; ++ } else { ++ SPDK_ERRLOG("Invalid value %s\n", tok); ++ goto out; ++ } ++ } ++ ++ tok = strtok(NULL, ","); ++ } ++ ++ rc = 0; ++ *_filter = filter; ++ ++out: ++ free(tokens); ++ ++ return rc; ++} ++ ++static int ++nvmf_is_subset_of_env_core_mask(const struct spdk_cpuset *set) ++{ ++ uint32_t i, tmp_counter = 0; ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ if (spdk_cpuset_get_cpu(set, i)) { ++ ++tmp_counter; ++ } ++ } ++ return spdk_cpuset_count(set) - tmp_counter; ++} ++ ++static int ++nvmf_decode_poll_groups_mask(const struct spdk_json_val *val, void *out) ++{ ++ char *mask = spdk_json_strdup(val); ++ int ret = -1; ++ ++ if (mask == NULL) { ++ return -1; ++ } ++ ++ if (!(g_poll_groups_mask = 
spdk_cpuset_alloc())) { ++ SPDK_ERRLOG("Unable to allocate a poll groups mask object in nvmf_decode_poll_groups_mask.\n"); ++ free(mask); ++ return -1; ++ } ++ ++ ret = spdk_cpuset_parse(g_poll_groups_mask, mask); ++ free(mask); ++ if (ret == 0) { ++ if (nvmf_is_subset_of_env_core_mask(g_poll_groups_mask) == 0) { ++ return 0; ++ } else { ++ SPDK_ERRLOG("Poll groups cpumask 0x%s is out of range\n", spdk_cpuset_fmt(g_poll_groups_mask)); ++ } ++ } else { ++ SPDK_ERRLOG("Invalid cpumask\n"); ++ } ++ ++ spdk_cpuset_free(g_poll_groups_mask); ++ g_poll_groups_mask = NULL; ++ return -1; ++} ++ ++static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_conf_decoder[] = { ++ {"admin_cmd_passthru", offsetof(struct spdk_nvmf_tgt_conf, admin_passthru), decode_admin_passthru, true}, ++ {"poll_groups_mask", 0, nvmf_decode_poll_groups_mask, true}, ++ {"discovery_filter", offsetof(struct spdk_nvmf_tgt_conf, discovery_filter), decode_discovery_filter, true} ++}; ++ ++static void ++rpc_nvmf_set_config(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct spdk_nvmf_tgt_conf conf; ++ ++ memcpy(&conf, &g_spdk_nvmf_tgt_conf, sizeof(conf)); ++ ++ if (params != NULL) { ++ if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_conf_decoder, ++ SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_conf_decoder), &conf)) { ++ SPDK_ERRLOG("spdk_json_decode_object() failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ return; ++ } ++ } ++ ++ memcpy(&g_spdk_nvmf_tgt_conf, &conf, sizeof(conf)); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("nvmf_set_config", rpc_nvmf_set_config, SPDK_RPC_STARTUP) ++ ++struct nvmf_rpc_set_crdt { ++ uint16_t crdt1; ++ uint16_t crdt2; ++ uint16_t crdt3; ++}; ++ ++static const struct spdk_json_object_decoder rpc_set_crdt_opts_decoders[] = { ++ {"crdt1", offsetof(struct nvmf_rpc_set_crdt, crdt1), spdk_json_decode_uint16, true}, ++ {"crdt2", offsetof(struct nvmf_rpc_set_crdt, crdt2), spdk_json_decode_uint16, true}, ++ {"crdt3", offsetof(struct nvmf_rpc_set_crdt, crdt3), spdk_json_decode_uint16, true}, ++}; ++ ++static void ++rpc_nvmf_set_crdt(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct nvmf_rpc_set_crdt rpc_set_crdt; ++ ++ rpc_set_crdt.crdt1 = 0; ++ rpc_set_crdt.crdt2 = 0; ++ rpc_set_crdt.crdt3 = 0; ++ ++ if (params != NULL) { ++ if (spdk_json_decode_object(params, rpc_set_crdt_opts_decoders, ++ SPDK_COUNTOF(rpc_set_crdt_opts_decoders), &rpc_set_crdt)) { ++ SPDK_ERRLOG("spdk_json_decode_object() failed\n"); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ "Invalid parameters"); ++ return; ++ } ++ } ++ ++ g_spdk_nvmf_tgt_crdt[0] = rpc_set_crdt.crdt1; ++ g_spdk_nvmf_tgt_crdt[1] = rpc_set_crdt.crdt2; ++ g_spdk_nvmf_tgt_crdt[2] = rpc_set_crdt.crdt3; ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("nvmf_set_crdt", rpc_nvmf_set_crdt, SPDK_RPC_STARTUP) +diff --git a/module/event/subsystems/nvmf/nvmf_tgt.c b/module/event/subsystems/nvmf/nvmf_tgt.c +index 2f23d0e..9263896 100644 +--- a/module/event/subsystems/nvmf/nvmf_tgt.c ++++ b/module/event/subsystems/nvmf/nvmf_tgt.c +@@ -1,552 +1,552 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+- */ +- +-#include "event_nvmf.h" +- +-#include "spdk/bdev.h" +-#include "spdk/thread.h" +-#include "spdk/log.h" +-#include "spdk/nvme.h" +-#include "spdk/nvmf_cmd.h" +-#include "spdk_internal/usdt.h" +- +-enum nvmf_tgt_state { +- NVMF_TGT_INIT_NONE = 0, +- NVMF_TGT_INIT_CREATE_TARGET, +- NVMF_TGT_INIT_CREATE_POLL_GROUPS, +- NVMF_TGT_INIT_START_SUBSYSTEMS, +- NVMF_TGT_RUNNING, +- NVMF_TGT_FINI_STOP_SUBSYSTEMS, +- NVMF_TGT_FINI_DESTROY_SUBSYSTEMS, +- NVMF_TGT_FINI_DESTROY_POLL_GROUPS, +- NVMF_TGT_FINI_DESTROY_TARGET, +- NVMF_TGT_STOPPED, +- NVMF_TGT_ERROR, +-}; +- +-struct nvmf_tgt_poll_group { +- struct spdk_nvmf_poll_group *group; +- struct spdk_thread *thread; +- TAILQ_ENTRY(nvmf_tgt_poll_group) link; +-}; +- +-struct spdk_nvmf_tgt_conf g_spdk_nvmf_tgt_conf = { +- .admin_passthru.identify_ctrlr = false +-}; +- +-struct spdk_cpuset *g_poll_groups_mask = NULL; +-struct spdk_nvmf_tgt *g_spdk_nvmf_tgt = NULL; +-uint32_t g_spdk_nvmf_tgt_max_subsystems = 0; +-uint16_t g_spdk_nvmf_tgt_crdt[3] = {0, 0, 0}; +- +-static enum nvmf_tgt_state g_tgt_state; +- +-static struct spdk_thread *g_tgt_init_thread = NULL; +-static struct spdk_thread *g_tgt_fini_thread = NULL; +- +-static TAILQ_HEAD(, nvmf_tgt_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups); +-static size_t g_num_poll_groups = 0; +- +-static void nvmf_tgt_advance_state(void); +- +-static void +-nvmf_shutdown_cb(void *arg1) +-{ +- /* Still in initialization state, defer shutdown operation */ +- if (g_tgt_state < NVMF_TGT_RUNNING) { +- spdk_thread_send_msg(spdk_get_thread(), nvmf_shutdown_cb, NULL); +- return; +- } else if (g_tgt_state != NVMF_TGT_RUNNING && g_tgt_state != NVMF_TGT_ERROR) { +- /* Already in Shutdown status, ignore the signal */ +- return; +- } +- +- if (g_tgt_state == NVMF_TGT_ERROR) { +- /* Parse configuration error */ +- g_tgt_state = NVMF_TGT_FINI_DESTROY_TARGET; +- } else { +- g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; +- } +- nvmf_tgt_advance_state(); +-} +- +-static void +-nvmf_subsystem_fini(void) +-{ +- nvmf_shutdown_cb(NULL); +-} +- +-static void +-_nvmf_tgt_destroy_poll_group_done(void *ctx) +-{ +- assert(g_num_poll_groups > 0); +- +- if (--g_num_poll_groups == 0) { +- g_tgt_state = NVMF_TGT_FINI_DESTROY_TARGET; +- nvmf_tgt_advance_state(); +- } +-} +- +-static void +-nvmf_tgt_destroy_poll_group_done(void *cb_arg, int status) +-{ +- struct nvmf_tgt_poll_group *pg = cb_arg; +- +- free(pg); +- +- spdk_thread_send_msg(g_tgt_fini_thread, _nvmf_tgt_destroy_poll_group_done, NULL); +- +- spdk_thread_exit(spdk_get_thread()); +-} +- +-static void +-nvmf_tgt_destroy_poll_group(void *ctx) +-{ +- struct nvmf_tgt_poll_group *pg = ctx; +- +- spdk_nvmf_poll_group_destroy(pg->group, nvmf_tgt_destroy_poll_group_done, pg); +-} +- +-static void +-nvmf_tgt_destroy_poll_groups(void) +-{ +- struct nvmf_tgt_poll_group *pg, *tpg; +- +- g_tgt_fini_thread = spdk_get_thread(); +- assert(g_tgt_fini_thread != NULL); +- +- TAILQ_FOREACH_SAFE(pg, &g_poll_groups, link, tpg) { +- TAILQ_REMOVE(&g_poll_groups, pg, link); +- spdk_thread_send_msg(pg->thread, nvmf_tgt_destroy_poll_group, pg); +- } +-} +- +-static uint32_t +-nvmf_get_cpuset_count(void) +-{ +- if (g_poll_groups_mask) { +- return spdk_cpuset_count(g_poll_groups_mask); +- } else { +- return spdk_env_get_core_count(); +- } +-} +- +-static void +-nvmf_tgt_create_poll_group_done(void *ctx) +-{ +- struct nvmf_tgt_poll_group *pg = ctx; +- +- assert(pg); +- +- if (!pg->group) { +- SPDK_ERRLOG("Failed to create nvmf poll group\n"); +- /* Change the state to error but wait for 
completions from all other threads */ +- g_tgt_state = NVMF_TGT_ERROR; +- } +- +- TAILQ_INSERT_TAIL(&g_poll_groups, pg, link); +- +- assert(g_num_poll_groups < nvmf_get_cpuset_count()); +- +- if (++g_num_poll_groups == nvmf_get_cpuset_count()) { +- if (g_tgt_state != NVMF_TGT_ERROR) { +- g_tgt_state = NVMF_TGT_INIT_START_SUBSYSTEMS; +- } +- nvmf_tgt_advance_state(); +- } +-} +- +-static void +-nvmf_tgt_create_poll_group(void *ctx) +-{ +- struct nvmf_tgt_poll_group *pg; +- +- pg = calloc(1, sizeof(*pg)); +- if (!pg) { +- SPDK_ERRLOG("Not enough memory to allocate poll groups\n"); +- g_tgt_state = NVMF_TGT_ERROR; +- nvmf_tgt_advance_state(); +- return; +- } +- +- pg->thread = spdk_get_thread(); +- pg->group = spdk_nvmf_poll_group_create(g_spdk_nvmf_tgt); +- +- spdk_thread_send_msg(g_tgt_init_thread, nvmf_tgt_create_poll_group_done, pg); +-} +- +-static void +-nvmf_tgt_create_poll_groups(void) +-{ +- uint32_t cpu, count = 0; +- char thread_name[32]; +- struct spdk_thread *thread; +- +- g_tgt_init_thread = spdk_get_thread(); +- assert(g_tgt_init_thread != NULL); +- +- SPDK_ENV_FOREACH_CORE(cpu) { +- if (g_poll_groups_mask && !spdk_cpuset_get_cpu(g_poll_groups_mask, cpu)) { +- continue; +- } +- snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", count++); +- +- thread = spdk_thread_create(thread_name, g_poll_groups_mask); +- assert(thread != NULL); +- +- spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL); +- } +-} +- +-static void +-nvmf_tgt_subsystem_started(struct spdk_nvmf_subsystem *subsystem, +- void *cb_arg, int status) +-{ +- subsystem = spdk_nvmf_subsystem_get_next(subsystem); +- int rc; +- +- if (subsystem) { +- rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); +- if (rc) { +- g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; +- SPDK_ERRLOG("Unable to start NVMe-oF subsystem. Stopping app.\n"); +- nvmf_tgt_advance_state(); +- } +- return; +- } +- +- g_tgt_state = NVMF_TGT_RUNNING; +- nvmf_tgt_advance_state(); +-} +- +-static void +-nvmf_tgt_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, +- void *cb_arg, int status) +-{ +- subsystem = spdk_nvmf_subsystem_get_next(subsystem); +- int rc; +- +- if (subsystem) { +- rc = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); +- if (rc) { +- SPDK_ERRLOG("Unable to stop NVMe-oF subsystem %s with rc %d, Trying others.\n", +- spdk_nvmf_subsystem_get_nqn(subsystem), rc); +- nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); +- } +- return; +- } +- +- g_tgt_state = NVMF_TGT_FINI_DESTROY_SUBSYSTEMS; +- nvmf_tgt_advance_state(); +-} +- +-static void +-_nvmf_tgt_subsystem_destroy(void *cb_arg) +-{ +- struct spdk_nvmf_subsystem *subsystem, *next_subsystem; +- int rc; +- +- subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); +- +- while (subsystem != NULL) { +- next_subsystem = spdk_nvmf_subsystem_get_next(subsystem); +- rc = spdk_nvmf_subsystem_destroy(subsystem, _nvmf_tgt_subsystem_destroy, NULL); +- if (rc) { +- if (rc == -EINPROGRESS) { +- /* If ret is -EINPROGRESS, nvmf_tgt_subsystem_destroyed will be called when subsystem +- * is destroyed, _nvmf_tgt_subsystem_destroy will continue to destroy other subsystems if any */ +- return; +- } else { +- SPDK_ERRLOG("Unable to destroy subsystem %s, rc %d. 
Trying others.\n", +- spdk_nvmf_subsystem_get_nqn(subsystem), rc); +- } +- } +- subsystem = next_subsystem; +- } +- +- g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS; +- nvmf_tgt_advance_state(); +-} +- +-static void +-nvmf_tgt_destroy_done(void *ctx, int status) +-{ +- g_tgt_state = NVMF_TGT_STOPPED; +- +- nvmf_tgt_advance_state(); +-} +- +-static int +-nvmf_add_discovery_subsystem(void) +-{ +- struct spdk_nvmf_subsystem *subsystem; +- +- subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, SPDK_NVMF_DISCOVERY_NQN, +- SPDK_NVMF_SUBTYPE_DISCOVERY, 0); +- if (subsystem == NULL) { +- SPDK_ERRLOG("Failed creating discovery nvmf library subsystem\n"); +- return -1; +- } +- +- spdk_nvmf_subsystem_set_allow_any_host(subsystem, true); +- +- return 0; +-} +- +-static int +-nvmf_tgt_create_target(void) +-{ +- struct spdk_nvmf_target_opts opts = { +- .name = "nvmf_tgt" +- }; +- +- opts.max_subsystems = g_spdk_nvmf_tgt_max_subsystems; +- opts.crdt[0] = g_spdk_nvmf_tgt_crdt[0]; +- opts.crdt[1] = g_spdk_nvmf_tgt_crdt[1]; +- opts.crdt[2] = g_spdk_nvmf_tgt_crdt[2]; +- opts.discovery_filter = g_spdk_nvmf_tgt_conf.discovery_filter; +- g_spdk_nvmf_tgt = spdk_nvmf_tgt_create(&opts); +- if (!g_spdk_nvmf_tgt) { +- SPDK_ERRLOG("spdk_nvmf_tgt_create() failed\n"); +- return -1; +- } +- +- if (nvmf_add_discovery_subsystem() != 0) { +- SPDK_ERRLOG("nvmf_add_discovery_subsystem failed\n"); +- return -1; +- } +- +- return 0; +-} +- +-static void +-fixup_identify_ctrlr(struct spdk_nvmf_request *req) +-{ +- uint32_t length; +- int rc; +- struct spdk_nvme_ctrlr_data *nvme_cdata; +- struct spdk_nvme_ctrlr_data nvmf_cdata = {}; +- struct spdk_nvmf_ctrlr *ctrlr = spdk_nvmf_request_get_ctrlr(req); +- struct spdk_nvme_cpl *rsp = spdk_nvmf_request_get_response(req); +- +- /* This is the identify data from the NVMe drive */ +- spdk_nvmf_request_get_data(req, (void **)&nvme_cdata, &length); +- +- /* Get the NVMF identify data */ +- rc = spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, &nvmf_cdata); +- if (rc != SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { +- rsp->status.sct = SPDK_NVME_SCT_GENERIC; +- rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; +- return; +- } +- +- /* Fixup NVMF identify data with NVMe identify data */ +- +- /* Serial Number (SN) */ +- memcpy(&nvmf_cdata.sn[0], &nvme_cdata->sn[0], sizeof(nvmf_cdata.sn)); +- /* Model Number (MN) */ +- memcpy(&nvmf_cdata.mn[0], &nvme_cdata->mn[0], sizeof(nvmf_cdata.mn)); +- /* Firmware Revision (FR) */ +- memcpy(&nvmf_cdata.fr[0], &nvme_cdata->fr[0], sizeof(nvmf_cdata.fr)); +- /* IEEE OUI Identifier (IEEE) */ +- memcpy(&nvmf_cdata.ieee[0], &nvme_cdata->ieee[0], sizeof(nvmf_cdata.ieee)); +- /* FRU Globally Unique Identifier (FGUID) */ +- +- /* Copy the fixed up data back to the response */ +- memcpy(nvme_cdata, &nvmf_cdata, length); +-} +- +-static int +-nvmf_custom_identify_hdlr(struct spdk_nvmf_request *req) +-{ +- struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req); +- struct spdk_bdev *bdev; +- struct spdk_bdev_desc *desc; +- struct spdk_io_channel *ch; +- struct spdk_nvmf_subsystem *subsys; +- int rc; +- +- if (cmd->cdw10_bits.identify.cns != SPDK_NVME_IDENTIFY_CTRLR) { +- return -1; /* continue */ +- } +- +- subsys = spdk_nvmf_request_get_subsystem(req); +- if (subsys == NULL) { +- return -1; +- } +- +- /* Only procss this request if it has exactly one namespace */ +- if (spdk_nvmf_subsystem_get_max_nsid(subsys) != 1) { +- return -1; +- } +- +- /* Forward to first namespace if it supports NVME admin commands */ +- rc = spdk_nvmf_request_get_bdev(1, req, &bdev, &desc, 
&ch); +- if (rc) { +- /* No bdev found for this namespace. Continue. */ +- return -1; +- } +- +- if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) { +- return -1; +- } +- +- return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, fixup_identify_ctrlr); +-} +- +-static void +-nvmf_tgt_advance_state(void) +-{ +- enum nvmf_tgt_state prev_state; +- int rc = -1; +- int ret; +- +- do { +- SPDK_DTRACE_PROBE1(nvmf_tgt_state, g_tgt_state); +- prev_state = g_tgt_state; +- +- switch (g_tgt_state) { +- case NVMF_TGT_INIT_NONE: { +- g_tgt_state = NVMF_TGT_INIT_CREATE_TARGET; +- break; +- } +- case NVMF_TGT_INIT_CREATE_TARGET: +- ret = nvmf_tgt_create_target(); +- g_tgt_state = (ret == 0) ? NVMF_TGT_INIT_CREATE_POLL_GROUPS : NVMF_TGT_ERROR; +- break; +- case NVMF_TGT_INIT_CREATE_POLL_GROUPS: +- if (g_spdk_nvmf_tgt_conf.admin_passthru.identify_ctrlr) { +- SPDK_NOTICELOG("Custom identify ctrlr handler enabled\n"); +- spdk_nvmf_set_custom_admin_cmd_hdlr(SPDK_NVME_OPC_IDENTIFY, nvmf_custom_identify_hdlr); +- } +- /* Create poll group threads, and send a message to each thread +- * and create a poll group. +- */ +- nvmf_tgt_create_poll_groups(); +- break; +- case NVMF_TGT_INIT_START_SUBSYSTEMS: { +- struct spdk_nvmf_subsystem *subsystem; +- +- subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); +- +- if (subsystem) { +- ret = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); +- if (ret) { +- SPDK_ERRLOG("Unable to start NVMe-oF subsystem. Stopping app.\n"); +- g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; +- } +- } else { +- g_tgt_state = NVMF_TGT_RUNNING; +- } +- break; +- } +- case NVMF_TGT_RUNNING: +- spdk_subsystem_init_next(0); +- break; +- case NVMF_TGT_FINI_STOP_SUBSYSTEMS: { +- struct spdk_nvmf_subsystem *subsystem; +- +- subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); +- +- if (subsystem) { +- ret = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); +- if (ret) { +- nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); +- } +- } else { +- g_tgt_state = NVMF_TGT_FINI_DESTROY_SUBSYSTEMS; +- } +- break; +- } +- case NVMF_TGT_FINI_DESTROY_SUBSYSTEMS: +- _nvmf_tgt_subsystem_destroy(NULL); +- /* Function above can be asynchronous, it will call nvmf_tgt_advance_state() once done. 
+- * So just return here */ +- return; +- case NVMF_TGT_FINI_DESTROY_POLL_GROUPS: +- /* Send a message to each poll group thread, and terminate the thread */ +- nvmf_tgt_destroy_poll_groups(); +- break; +- case NVMF_TGT_FINI_DESTROY_TARGET: +- spdk_nvmf_tgt_destroy(g_spdk_nvmf_tgt, nvmf_tgt_destroy_done, NULL); +- break; +- case NVMF_TGT_STOPPED: +- spdk_subsystem_fini_next(); +- return; +- case NVMF_TGT_ERROR: +- spdk_subsystem_init_next(rc); +- return; +- } +- +- } while (g_tgt_state != prev_state); +-} +- +-static void +-nvmf_subsystem_init(void) +-{ +- g_tgt_state = NVMF_TGT_INIT_NONE; +- nvmf_tgt_advance_state(); +-} +- +-static void +-nvmf_subsystem_dump_discover_filter(struct spdk_json_write_ctx *w) +-{ +- static char const *const answers[] = { +- "match_any", +- "transport", +- "address", +- "transport,address", +- "svcid", +- "transport,svcid", +- "address,svcid", +- "transport,address,svcid" +- }; +- +- if ((g_spdk_nvmf_tgt_conf.discovery_filter & ~(SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_TYPE | +- SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_ADDRESS | +- SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_SVCID)) != 0) { +- SPDK_ERRLOG("Incorrect discovery filter %d\n", g_spdk_nvmf_tgt_conf.discovery_filter); +- assert(0); +- return; +- } +- +- spdk_json_write_named_string(w, "discovery_filter", answers[g_spdk_nvmf_tgt_conf.discovery_filter]); +-} +- +-static void +-nvmf_subsystem_write_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_json_write_array_begin(w); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "nvmf_set_config"); +- +- spdk_json_write_named_object_begin(w, "params"); +- nvmf_subsystem_dump_discover_filter(w); +- spdk_json_write_named_object_begin(w, "admin_cmd_passthru"); +- spdk_json_write_named_bool(w, "identify_ctrlr", +- g_spdk_nvmf_tgt_conf.admin_passthru.identify_ctrlr); +- spdk_json_write_object_end(w); +- if (g_poll_groups_mask) { +- spdk_json_write_named_string(w, "poll_groups_mask", spdk_cpuset_fmt(g_poll_groups_mask)); +- } +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- +- spdk_nvmf_tgt_write_config_json(w, g_spdk_nvmf_tgt); +- spdk_json_write_array_end(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_nvmf = { +- .name = "nvmf", +- .init = nvmf_subsystem_init, +- .fini = nvmf_subsystem_fini, +- .write_config_json = nvmf_subsystem_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf) +-SPDK_SUBSYSTEM_DEPEND(nvmf, bdev) +-SPDK_SUBSYSTEM_DEPEND(nvmf, sock) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++ */ ++ ++#include "event_nvmf.h" ++ ++#include "spdk/bdev.h" ++#include "spdk/thread.h" ++#include "spdk/log.h" ++#include "spdk/nvme.h" ++#include "spdk/nvmf_cmd.h" ++#include "spdk_internal/usdt.h" ++ ++enum nvmf_tgt_state { ++ NVMF_TGT_INIT_NONE = 0, ++ NVMF_TGT_INIT_CREATE_TARGET, ++ NVMF_TGT_INIT_CREATE_POLL_GROUPS, ++ NVMF_TGT_INIT_START_SUBSYSTEMS, ++ NVMF_TGT_RUNNING, ++ NVMF_TGT_FINI_STOP_SUBSYSTEMS, ++ NVMF_TGT_FINI_DESTROY_SUBSYSTEMS, ++ NVMF_TGT_FINI_DESTROY_POLL_GROUPS, ++ NVMF_TGT_FINI_DESTROY_TARGET, ++ NVMF_TGT_STOPPED, ++ NVMF_TGT_ERROR, ++}; ++ ++struct nvmf_tgt_poll_group { ++ struct spdk_nvmf_poll_group *group; ++ struct spdk_thread *thread; ++ TAILQ_ENTRY(nvmf_tgt_poll_group) link; ++}; ++ ++struct spdk_nvmf_tgt_conf g_spdk_nvmf_tgt_conf = { ++ .admin_passthru.identify_ctrlr = false ++}; ++ ++struct spdk_cpuset *g_poll_groups_mask = NULL; ++struct spdk_nvmf_tgt *g_spdk_nvmf_tgt = NULL; ++uint32_t g_spdk_nvmf_tgt_max_subsystems = 0; ++uint16_t g_spdk_nvmf_tgt_crdt[3] = {0, 0, 0}; ++ ++static enum nvmf_tgt_state g_tgt_state; ++ ++static struct spdk_thread *g_tgt_init_thread = NULL; ++static struct spdk_thread *g_tgt_fini_thread = NULL; ++ ++static TAILQ_HEAD(, nvmf_tgt_poll_group) g_poll_groups = TAILQ_HEAD_INITIALIZER(g_poll_groups); ++static size_t g_num_poll_groups = 0; ++ ++static void nvmf_tgt_advance_state(void); ++ ++static void ++nvmf_shutdown_cb(void *arg1) ++{ ++ /* Still in initialization state, defer shutdown operation */ ++ if (g_tgt_state < NVMF_TGT_RUNNING) { ++ spdk_thread_send_msg(spdk_get_thread(), nvmf_shutdown_cb, NULL); ++ return; ++ } else if (g_tgt_state != NVMF_TGT_RUNNING && g_tgt_state != NVMF_TGT_ERROR) { ++ /* Already in Shutdown status, ignore the signal */ ++ return; ++ } ++ ++ if (g_tgt_state == NVMF_TGT_ERROR) { ++ /* Parse configuration error */ ++ g_tgt_state = NVMF_TGT_FINI_DESTROY_TARGET; ++ } else { ++ g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; ++ } ++ nvmf_tgt_advance_state(); ++} ++ ++static void ++nvmf_subsystem_fini(void) ++{ ++ nvmf_shutdown_cb(NULL); ++} ++ ++static void ++_nvmf_tgt_destroy_poll_group_done(void *ctx) ++{ ++ assert(g_num_poll_groups > 0); ++ ++ if (--g_num_poll_groups == 0) { ++ g_tgt_state = NVMF_TGT_FINI_DESTROY_TARGET; ++ nvmf_tgt_advance_state(); ++ } ++} ++ ++static void ++nvmf_tgt_destroy_poll_group_done(void *cb_arg, int status) ++{ ++ struct nvmf_tgt_poll_group *pg = cb_arg; ++ ++ free(pg); ++ ++ spdk_thread_send_msg(g_tgt_fini_thread, _nvmf_tgt_destroy_poll_group_done, NULL); ++ ++ spdk_thread_exit(spdk_get_thread()); ++} ++ ++static void ++nvmf_tgt_destroy_poll_group(void *ctx) ++{ ++ struct nvmf_tgt_poll_group *pg = ctx; ++ ++ spdk_nvmf_poll_group_destroy(pg->group, nvmf_tgt_destroy_poll_group_done, pg); ++} ++ ++static void ++nvmf_tgt_destroy_poll_groups(void) ++{ ++ struct nvmf_tgt_poll_group *pg, *tpg; ++ ++ g_tgt_fini_thread = spdk_get_thread(); ++ assert(g_tgt_fini_thread != NULL); ++ ++ TAILQ_FOREACH_SAFE(pg, &g_poll_groups, link, tpg) { ++ TAILQ_REMOVE(&g_poll_groups, pg, link); ++ spdk_thread_send_msg(pg->thread, nvmf_tgt_destroy_poll_group, pg); ++ } ++} ++ ++static uint32_t ++nvmf_get_cpuset_count(void) ++{ ++ if (g_poll_groups_mask) { ++ return spdk_cpuset_count(g_poll_groups_mask); ++ } else { ++ return spdk_env_get_core_count(); ++ } ++} ++ ++static void ++nvmf_tgt_create_poll_group_done(void *ctx) ++{ ++ struct nvmf_tgt_poll_group *pg = ctx; ++ ++ assert(pg); ++ ++ if (!pg->group) { ++ SPDK_ERRLOG("Failed to create nvmf poll group\n"); ++ /* Change the state to error but wait for 
completions from all other threads */ ++ g_tgt_state = NVMF_TGT_ERROR; ++ } ++ ++ TAILQ_INSERT_TAIL(&g_poll_groups, pg, link); ++ ++ assert(g_num_poll_groups < nvmf_get_cpuset_count()); ++ ++ if (++g_num_poll_groups == nvmf_get_cpuset_count()) { ++ if (g_tgt_state != NVMF_TGT_ERROR) { ++ g_tgt_state = NVMF_TGT_INIT_START_SUBSYSTEMS; ++ } ++ nvmf_tgt_advance_state(); ++ } ++} ++ ++static void ++nvmf_tgt_create_poll_group(void *ctx) ++{ ++ struct nvmf_tgt_poll_group *pg; ++ ++ pg = calloc(1, sizeof(*pg)); ++ if (!pg) { ++ SPDK_ERRLOG("Not enough memory to allocate poll groups\n"); ++ g_tgt_state = NVMF_TGT_ERROR; ++ nvmf_tgt_advance_state(); ++ return; ++ } ++ ++ pg->thread = spdk_get_thread(); ++ pg->group = spdk_nvmf_poll_group_create(g_spdk_nvmf_tgt); ++ ++ spdk_thread_send_msg(g_tgt_init_thread, nvmf_tgt_create_poll_group_done, pg); ++} ++ ++static void ++nvmf_tgt_create_poll_groups(void) ++{ ++ uint32_t cpu, count = 0; ++ char thread_name[32]; ++ struct spdk_thread *thread; ++ ++ g_tgt_init_thread = spdk_get_thread(); ++ assert(g_tgt_init_thread != NULL); ++ ++ SPDK_ENV_FOREACH_CORE(cpu) { ++ if (g_poll_groups_mask && !spdk_cpuset_get_cpu(g_poll_groups_mask, cpu)) { ++ continue; ++ } ++ snprintf(thread_name, sizeof(thread_name), "nvmf_tgt_poll_group_%u", count++); ++ ++ thread = spdk_thread_create(thread_name, g_poll_groups_mask); ++ assert(thread != NULL); ++ ++ spdk_thread_send_msg(thread, nvmf_tgt_create_poll_group, NULL); ++ } ++} ++ ++static void ++nvmf_tgt_subsystem_started(struct spdk_nvmf_subsystem *subsystem, ++ void *cb_arg, int status) ++{ ++ subsystem = spdk_nvmf_subsystem_get_next(subsystem); ++ int rc; ++ ++ if (subsystem) { ++ rc = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); ++ if (rc) { ++ g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; ++ SPDK_ERRLOG("Unable to start NVMe-oF subsystem. Stopping app.\n"); ++ nvmf_tgt_advance_state(); ++ } ++ return; ++ } ++ ++ g_tgt_state = NVMF_TGT_RUNNING; ++ nvmf_tgt_advance_state(); ++} ++ ++static void ++nvmf_tgt_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, ++ void *cb_arg, int status) ++{ ++ subsystem = spdk_nvmf_subsystem_get_next(subsystem); ++ int rc; ++ ++ if (subsystem) { ++ rc = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); ++ if (rc) { ++ SPDK_ERRLOG("Unable to stop NVMe-oF subsystem %s with rc %d, Trying others.\n", ++ spdk_nvmf_subsystem_get_nqn(subsystem), rc); ++ nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); ++ } ++ return; ++ } ++ ++ g_tgt_state = NVMF_TGT_FINI_DESTROY_SUBSYSTEMS; ++ nvmf_tgt_advance_state(); ++} ++ ++static void ++_nvmf_tgt_subsystem_destroy(void *cb_arg) ++{ ++ struct spdk_nvmf_subsystem *subsystem, *next_subsystem; ++ int rc; ++ ++ subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); ++ ++ while (subsystem != NULL) { ++ next_subsystem = spdk_nvmf_subsystem_get_next(subsystem); ++ rc = spdk_nvmf_subsystem_destroy(subsystem, _nvmf_tgt_subsystem_destroy, NULL); ++ if (rc) { ++ if (rc == -EINPROGRESS) { ++ /* If ret is -EINPROGRESS, nvmf_tgt_subsystem_destroyed will be called when subsystem ++ * is destroyed, _nvmf_tgt_subsystem_destroy will continue to destroy other subsystems if any */ ++ return; ++ } else { ++ SPDK_ERRLOG("Unable to destroy subsystem %s, rc %d. 
Trying others.\n", ++ spdk_nvmf_subsystem_get_nqn(subsystem), rc); ++ } ++ } ++ subsystem = next_subsystem; ++ } ++ ++ g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS; ++ nvmf_tgt_advance_state(); ++} ++ ++static void ++nvmf_tgt_destroy_done(void *ctx, int status) ++{ ++ g_tgt_state = NVMF_TGT_STOPPED; ++ ++ nvmf_tgt_advance_state(); ++} ++ ++static int ++nvmf_add_discovery_subsystem(void) ++{ ++ struct spdk_nvmf_subsystem *subsystem; ++ ++ subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, SPDK_NVMF_DISCOVERY_NQN, ++ SPDK_NVMF_SUBTYPE_DISCOVERY, 0); ++ if (subsystem == NULL) { ++ SPDK_ERRLOG("Failed creating discovery nvmf library subsystem\n"); ++ return -1; ++ } ++ ++ spdk_nvmf_subsystem_set_allow_any_host(subsystem, true); ++ ++ return 0; ++} ++ ++static int ++nvmf_tgt_create_target(void) ++{ ++ struct spdk_nvmf_target_opts opts = { ++ .name = "nvmf_tgt" ++ }; ++ ++ opts.max_subsystems = g_spdk_nvmf_tgt_max_subsystems; ++ opts.crdt[0] = g_spdk_nvmf_tgt_crdt[0]; ++ opts.crdt[1] = g_spdk_nvmf_tgt_crdt[1]; ++ opts.crdt[2] = g_spdk_nvmf_tgt_crdt[2]; ++ opts.discovery_filter = g_spdk_nvmf_tgt_conf.discovery_filter; ++ g_spdk_nvmf_tgt = spdk_nvmf_tgt_create(&opts); ++ if (!g_spdk_nvmf_tgt) { ++ SPDK_ERRLOG("spdk_nvmf_tgt_create() failed\n"); ++ return -1; ++ } ++ ++ if (nvmf_add_discovery_subsystem() != 0) { ++ SPDK_ERRLOG("nvmf_add_discovery_subsystem failed\n"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void ++fixup_identify_ctrlr(struct spdk_nvmf_request *req) ++{ ++ uint32_t length; ++ int rc; ++ struct spdk_nvme_ctrlr_data *nvme_cdata; ++ struct spdk_nvme_ctrlr_data nvmf_cdata = {}; ++ struct spdk_nvmf_ctrlr *ctrlr = spdk_nvmf_request_get_ctrlr(req); ++ struct spdk_nvme_cpl *rsp = spdk_nvmf_request_get_response(req); ++ ++ /* This is the identify data from the NVMe drive */ ++ spdk_nvmf_request_get_data(req, (void **)&nvme_cdata, &length); ++ ++ /* Get the NVMF identify data */ ++ rc = spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, &nvmf_cdata); ++ if (rc != SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { ++ rsp->status.sct = SPDK_NVME_SCT_GENERIC; ++ rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; ++ return; ++ } ++ ++ /* Fixup NVMF identify data with NVMe identify data */ ++ ++ /* Serial Number (SN) */ ++ memcpy(&nvmf_cdata.sn[0], &nvme_cdata->sn[0], sizeof(nvmf_cdata.sn)); ++ /* Model Number (MN) */ ++ memcpy(&nvmf_cdata.mn[0], &nvme_cdata->mn[0], sizeof(nvmf_cdata.mn)); ++ /* Firmware Revision (FR) */ ++ memcpy(&nvmf_cdata.fr[0], &nvme_cdata->fr[0], sizeof(nvmf_cdata.fr)); ++ /* IEEE OUI Identifier (IEEE) */ ++ memcpy(&nvmf_cdata.ieee[0], &nvme_cdata->ieee[0], sizeof(nvmf_cdata.ieee)); ++ /* FRU Globally Unique Identifier (FGUID) */ ++ ++ /* Copy the fixed up data back to the response */ ++ memcpy(nvme_cdata, &nvmf_cdata, length); ++} ++ ++static int ++nvmf_custom_identify_hdlr(struct spdk_nvmf_request *req) ++{ ++ struct spdk_nvme_cmd *cmd = spdk_nvmf_request_get_cmd(req); ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_desc *desc; ++ struct spdk_io_channel *ch; ++ struct spdk_nvmf_subsystem *subsys; ++ int rc; ++ ++ if (cmd->cdw10_bits.identify.cns != SPDK_NVME_IDENTIFY_CTRLR) { ++ return -1; /* continue */ ++ } ++ ++ subsys = spdk_nvmf_request_get_subsystem(req); ++ if (subsys == NULL) { ++ return -1; ++ } ++ ++ /* Only procss this request if it has exactly one namespace */ ++ if (spdk_nvmf_subsystem_get_max_nsid(subsys) != 1) { ++ return -1; ++ } ++ ++ /* Forward to first namespace if it supports NVME admin commands */ ++ rc = spdk_nvmf_request_get_bdev(1, req, &bdev, &desc, 
&ch); ++ if (rc) { ++ /* No bdev found for this namespace. Continue. */ ++ return -1; ++ } ++ ++ if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)) { ++ return -1; ++ } ++ ++ return spdk_nvmf_bdev_ctrlr_nvme_passthru_admin(bdev, desc, ch, req, fixup_identify_ctrlr); ++} ++ ++static void ++nvmf_tgt_advance_state(void) ++{ ++ enum nvmf_tgt_state prev_state; ++ int rc = -1; ++ int ret; ++ ++ do { ++ SPDK_DTRACE_PROBE1(nvmf_tgt_state, g_tgt_state); ++ prev_state = g_tgt_state; ++ ++ switch (g_tgt_state) { ++ case NVMF_TGT_INIT_NONE: { ++ g_tgt_state = NVMF_TGT_INIT_CREATE_TARGET; ++ break; ++ } ++ case NVMF_TGT_INIT_CREATE_TARGET: ++ ret = nvmf_tgt_create_target(); ++ g_tgt_state = (ret == 0) ? NVMF_TGT_INIT_CREATE_POLL_GROUPS : NVMF_TGT_ERROR; ++ break; ++ case NVMF_TGT_INIT_CREATE_POLL_GROUPS: ++ if (g_spdk_nvmf_tgt_conf.admin_passthru.identify_ctrlr) { ++ SPDK_NOTICELOG("Custom identify ctrlr handler enabled\n"); ++ spdk_nvmf_set_custom_admin_cmd_hdlr(SPDK_NVME_OPC_IDENTIFY, nvmf_custom_identify_hdlr); ++ } ++ /* Create poll group threads, and send a message to each thread ++ * and create a poll group. ++ */ ++ nvmf_tgt_create_poll_groups(); ++ break; ++ case NVMF_TGT_INIT_START_SUBSYSTEMS: { ++ struct spdk_nvmf_subsystem *subsystem; ++ ++ subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); ++ ++ if (subsystem) { ++ ret = spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); ++ if (ret) { ++ SPDK_ERRLOG("Unable to start NVMe-oF subsystem. Stopping app.\n"); ++ g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; ++ } ++ } else { ++ g_tgt_state = NVMF_TGT_RUNNING; ++ } ++ break; ++ } ++ case NVMF_TGT_RUNNING: ++ spdk_subsystem_init_next(0); ++ break; ++ case NVMF_TGT_FINI_STOP_SUBSYSTEMS: { ++ struct spdk_nvmf_subsystem *subsystem; ++ ++ subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); ++ ++ if (subsystem) { ++ ret = spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); ++ if (ret) { ++ nvmf_tgt_subsystem_stopped(subsystem, NULL, 0); ++ } ++ } else { ++ g_tgt_state = NVMF_TGT_FINI_DESTROY_SUBSYSTEMS; ++ } ++ break; ++ } ++ case NVMF_TGT_FINI_DESTROY_SUBSYSTEMS: ++ _nvmf_tgt_subsystem_destroy(NULL); ++ /* Function above can be asynchronous, it will call nvmf_tgt_advance_state() once done. 
++ * So just return here */ ++ return; ++ case NVMF_TGT_FINI_DESTROY_POLL_GROUPS: ++ /* Send a message to each poll group thread, and terminate the thread */ ++ nvmf_tgt_destroy_poll_groups(); ++ break; ++ case NVMF_TGT_FINI_DESTROY_TARGET: ++ spdk_nvmf_tgt_destroy(g_spdk_nvmf_tgt, nvmf_tgt_destroy_done, NULL); ++ break; ++ case NVMF_TGT_STOPPED: ++ spdk_subsystem_fini_next(); ++ return; ++ case NVMF_TGT_ERROR: ++ spdk_subsystem_init_next(rc); ++ return; ++ } ++ ++ } while (g_tgt_state != prev_state); ++} ++ ++static void ++nvmf_subsystem_init(void) ++{ ++ g_tgt_state = NVMF_TGT_INIT_NONE; ++ nvmf_tgt_advance_state(); ++} ++ ++static void ++nvmf_subsystem_dump_discover_filter(struct spdk_json_write_ctx *w) ++{ ++ static char const *const answers[] = { ++ "match_any", ++ "transport", ++ "address", ++ "transport,address", ++ "svcid", ++ "transport,svcid", ++ "address,svcid", ++ "transport,address,svcid" ++ }; ++ ++ if ((g_spdk_nvmf_tgt_conf.discovery_filter & ~(SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_TYPE | ++ SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_ADDRESS | ++ SPDK_NVMF_TGT_DISCOVERY_MATCH_TRANSPORT_SVCID)) != 0) { ++ SPDK_ERRLOG("Incorrect discovery filter %d\n", g_spdk_nvmf_tgt_conf.discovery_filter); ++ assert(0); ++ return; ++ } ++ ++ spdk_json_write_named_string(w, "discovery_filter", answers[g_spdk_nvmf_tgt_conf.discovery_filter]); ++} ++ ++static void ++nvmf_subsystem_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_json_write_array_begin(w); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "nvmf_set_config"); ++ ++ spdk_json_write_named_object_begin(w, "params"); ++ nvmf_subsystem_dump_discover_filter(w); ++ spdk_json_write_named_object_begin(w, "admin_cmd_passthru"); ++ spdk_json_write_named_bool(w, "identify_ctrlr", ++ g_spdk_nvmf_tgt_conf.admin_passthru.identify_ctrlr); ++ spdk_json_write_object_end(w); ++ if (g_poll_groups_mask) { ++ spdk_json_write_named_string(w, "poll_groups_mask", spdk_cpuset_fmt(g_poll_groups_mask)); ++ } ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ ++ spdk_nvmf_tgt_write_config_json(w, g_spdk_nvmf_tgt); ++ spdk_json_write_array_end(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_nvmf = { ++ .name = "nvmf", ++ .init = nvmf_subsystem_init, ++ .fini = nvmf_subsystem_fini, ++ .write_config_json = nvmf_subsystem_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf) ++SPDK_SUBSYSTEM_DEPEND(nvmf, bdev) ++SPDK_SUBSYSTEM_DEPEND(nvmf, sock) +diff --git a/module/event/subsystems/scheduler/Makefile b/module/event/subsystems/scheduler/Makefile +index f52d679..a61a0a1 100644 +--- a/module/event/subsystems/scheduler/Makefile ++++ b/module/event/subsystems/scheduler/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 2 +-SO_MINOR := 0 +- +-C_SRCS = scheduler.c +-LIBNAME = event_scheduler +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 2 ++SO_MINOR := 0 ++ ++C_SRCS = scheduler.c ++LIBNAME = event_scheduler ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/scheduler/scheduler.c b/module/event/subsystems/scheduler/scheduler.c +index 3cc714e..468c956 100644 +--- a/module/event/subsystems/scheduler/scheduler.c ++++ b/module/event/subsystems/scheduler/scheduler.c +@@ -1,73 +1,73 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/env.h" +-#include "spdk/scheduler.h" +- +-#include "spdk_internal/event.h" +-#include "spdk_internal/init.h" +- +-static void +-scheduler_subsystem_init(void) +-{ +- int rc = 0; +- +- /* Set the defaults */ +- if (spdk_scheduler_get() == NULL) { +- rc = spdk_scheduler_set("static"); +- } +- +- spdk_subsystem_init_next(rc); +-} +- +-static void +-scheduler_subsystem_fini(void) +-{ +- spdk_scheduler_set_period(0); +- spdk_scheduler_set(NULL); +- +- spdk_subsystem_fini_next(); +-} +- +-static void +-scheduler_write_config_json(struct spdk_json_write_ctx *w) +-{ +- struct spdk_scheduler *scheduler; +- uint64_t scheduler_period; +- +- assert(w != NULL); +- +- scheduler = spdk_scheduler_get(); +- if (scheduler == NULL) { +- SPDK_ERRLOG("Unable to get scheduler info\n"); +- return; +- } +- +- scheduler_period = spdk_scheduler_get_period(); +- +- spdk_json_write_array_begin(w); +- +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "framework_set_scheduler"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_named_string(w, "name", scheduler->name); +- if (scheduler_period != 0) { +- spdk_json_write_named_uint32(w, "period", scheduler_period); +- } +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- +- spdk_json_write_array_end(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_scheduler = { +- .name = "scheduler", +- .init = scheduler_subsystem_init, +- .fini = scheduler_subsystem_fini, +- .write_config_json = scheduler_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scheduler); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/env.h" ++#include "spdk/scheduler.h" ++ ++#include "spdk_internal/event.h" ++#include "spdk_internal/init.h" ++ ++static void ++scheduler_subsystem_init(void) ++{ ++ int rc = 0; ++ ++ /* Set the defaults */ ++ if (spdk_scheduler_get() == NULL) { ++ rc = spdk_scheduler_set("static"); ++ } ++ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++scheduler_subsystem_fini(void) ++{ ++ spdk_scheduler_set_period(0); ++ spdk_scheduler_set(NULL); ++ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++scheduler_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ struct spdk_scheduler *scheduler; ++ uint64_t scheduler_period; ++ ++ assert(w != NULL); ++ ++ scheduler = spdk_scheduler_get(); ++ if (scheduler == NULL) { ++ SPDK_ERRLOG("Unable to get scheduler info\n"); ++ return; ++ } ++ ++ scheduler_period = spdk_scheduler_get_period(); ++ ++ spdk_json_write_array_begin(w); ++ ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "framework_set_scheduler"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_named_string(w, "name", scheduler->name); ++ if (scheduler_period != 0) { ++ spdk_json_write_named_uint32(w, "period", scheduler_period); ++ } ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ ++ spdk_json_write_array_end(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_scheduler = { ++ .name = "scheduler", ++ .init = scheduler_subsystem_init, ++ .fini = scheduler_subsystem_fini, ++ .write_config_json = scheduler_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scheduler); +diff --git a/module/event/subsystems/scsi/Makefile b/module/event/subsystems/scsi/Makefile +index fa7356d..d04491b 100644 +--- a/module/event/subsystems/scsi/Makefile ++++ b/module/event/subsystems/scsi/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = scsi.c +-LIBNAME = event_scsi +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = scsi.c ++LIBNAME = event_scsi ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/scsi/scsi.c b/module/event/subsystems/scsi/scsi.c +index bc524d2..869bccd 100644 +--- a/module/event/subsystems/scsi/scsi.c ++++ b/module/event/subsystems/scsi/scsi.c +@@ -1,36 +1,36 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2016 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/scsi.h" +- +-#include "spdk_internal/init.h" +- +-static void +-scsi_subsystem_init(void) +-{ +- int rc; +- +- rc = spdk_scsi_init(); +- +- spdk_subsystem_init_next(rc); +-} +- +-static void +-scsi_subsystem_fini(void) +-{ +- spdk_scsi_fini(); +- spdk_subsystem_fini_next(); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_scsi = { +- .name = "scsi", +- .init = scsi_subsystem_init, +- .fini = scsi_subsystem_fini, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scsi); +-SPDK_SUBSYSTEM_DEPEND(scsi, bdev) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2016 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/scsi.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++scsi_subsystem_init(void) ++{ ++ int rc; ++ ++ rc = spdk_scsi_init(); ++ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++scsi_subsystem_fini(void) ++{ ++ spdk_scsi_fini(); ++ spdk_subsystem_fini_next(); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_scsi = { ++ .name = "scsi", ++ .init = scsi_subsystem_init, ++ .fini = scsi_subsystem_fini, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scsi); ++SPDK_SUBSYSTEM_DEPEND(scsi, bdev) +diff --git a/module/event/subsystems/sock/Makefile b/module/event/subsystems/sock/Makefile +index e6f1240..ed67cc4 100644 +--- a/module/event/subsystems/sock/Makefile ++++ b/module/event/subsystems/sock/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation. +-# Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 3 +-SO_MINOR := 0 +- +-C_SRCS = sock.c +-LIBNAME = event_sock +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation. ++# Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 3 ++SO_MINOR := 0 ++ ++C_SRCS = sock.c ++LIBNAME = event_sock ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/sock/sock.c b/module/event/subsystems/sock/sock.c +index 643c694..304d466 100644 +--- a/module/event/subsystems/sock/sock.c ++++ b/module/event/subsystems/sock/sock.c +@@ -1,35 +1,35 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/sock.h" +-#include "spdk_internal/init.h" +- +-static void +-sock_subsystem_init(void) +-{ +- spdk_subsystem_init_next(0); +-} +- +-static void +-sock_subsystem_fini(void) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-sock_subsystem_write_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_sock_write_config_json(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_sock = { +- .name = "sock", +- .init = sock_subsystem_init, +- .fini = sock_subsystem_fini, +- .write_config_json = sock_subsystem_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_sock); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/sock.h" ++#include "spdk_internal/init.h" ++ ++static void ++sock_subsystem_init(void) ++{ ++ spdk_subsystem_init_next(0); ++} ++ ++static void ++sock_subsystem_fini(void) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++sock_subsystem_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_sock_write_config_json(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_sock = { ++ .name = "sock", ++ .init = sock_subsystem_init, ++ .fini = sock_subsystem_fini, ++ .write_config_json = sock_subsystem_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_sock); +diff --git a/module/event/subsystems/ssam/Makefile b/module/event/subsystems/ssam/Makefile +new file mode 100644 +index 0000000..70ca55c +--- /dev/null ++++ b/module/event/subsystems/ssam/Makefile +@@ -0,0 +1,44 @@ ++# ++# BSD LICENSE ++# ++# Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions ++# are met: ++# ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in ++# the documentation and/or other materials provided with the ++# distribution. ++# * Neither the name of Intel Corporation nor the names of its ++# contributors may be used to endorse or promote products derived ++# from this software without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 3 ++SO_MINOR := 0 ++ ++C_SRCS = ssam.c ++LIBNAME = event_ssam ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/ssam/ssam.c b/module/event/subsystems/ssam/ssam.c +new file mode 100644 +index 0000000..02c0090 +--- /dev/null ++++ b/module/event/subsystems/ssam/ssam.c +@@ -0,0 +1,72 @@ ++/*- ++ * BSD LICENSE ++ * ++ * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * * Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. 
++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in ++ * the documentation and/or other materials provided with the ++ * distribution. ++ * * Neither the name of Intel Corporation nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/ssam.h" ++ ++#include "spdk_internal/event.h" ++#include "spdk_internal/init.h" ++ ++static void ++ssam_subsystem_init_done(int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++ssam_subsystem_init(void) ++{ ++ spdk_ssam_subsystem_init(ssam_subsystem_init_done); ++} ++ ++static void ++ssam_subsystem_fini_done(void) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++ssam_subsystem_fini(void) ++{ ++ spdk_ssam_subsystem_fini(ssam_subsystem_fini_done); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_ssam = { ++ .name = SSAM_SERVER_NAME, ++ .init = ssam_subsystem_init, ++ .fini = ssam_subsystem_fini, ++ .write_config_json = spdk_ssam_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_ssam); ++SPDK_SUBSYSTEM_DEPEND(ssam, scsi) +diff --git a/module/event/subsystems/ublk/Makefile b/module/event/subsystems/ublk/Makefile +index 4990458..a5a188c 100644 +--- a/module/event/subsystems/ublk/Makefile ++++ b/module/event/subsystems/ublk/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = ublk.c +-LIBNAME = event_ublk +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = ublk.c ++LIBNAME = event_ublk ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/ublk/ublk.c b/module/event/subsystems/ublk/ublk.c +index 720004d..0b39f88 100644 +--- a/module/event/subsystems/ublk/ublk.c ++++ b/module/event/subsystems/ublk/ublk.c +@@ -1,48 +1,48 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/ublk.h" +-#include "spdk_internal/init.h" +- +-static void +-ublk_subsystem_init(void) +-{ +- spdk_ublk_init(); +- spdk_subsystem_init_next(0); +-} +- +-static void +-ublk_subsystem_fini_done(void *arg) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-ublk_subsystem_fini(void) +-{ +- int rc; +- +- rc = spdk_ublk_fini(ublk_subsystem_fini_done, NULL); +- if (rc != 0) { +- ublk_subsystem_fini_done(NULL); +- } +-} +- +-static void +-ublk_subsystem_write_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_ublk_write_config_json(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_ublk = { +- .name = "ublk", +- .init = ublk_subsystem_init, +- .fini = ublk_subsystem_fini, +- .write_config_json = ublk_subsystem_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_ublk); +-SPDK_SUBSYSTEM_DEPEND(ublk, bdev) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/ublk.h" ++#include "spdk_internal/init.h" ++ ++static void ++ublk_subsystem_init(void) ++{ ++ spdk_ublk_init(); ++ spdk_subsystem_init_next(0); ++} ++ ++static void ++ublk_subsystem_fini_done(void *arg) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++ublk_subsystem_fini(void) ++{ ++ int rc; ++ ++ rc = spdk_ublk_fini(ublk_subsystem_fini_done, NULL); ++ if (rc != 0) { ++ ublk_subsystem_fini_done(NULL); ++ } ++} ++ ++static void ++ublk_subsystem_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_ublk_write_config_json(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_ublk = { ++ .name = "ublk", ++ .init = ublk_subsystem_init, ++ .fini = ublk_subsystem_fini, ++ .write_config_json = ublk_subsystem_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_ublk); ++SPDK_SUBSYSTEM_DEPEND(ublk, bdev) +diff --git a/module/event/subsystems/vfu_tgt/Makefile b/module/event/subsystems/vfu_tgt/Makefile +index d6bd968..8eb237d 100644 +--- a/module/event/subsystems/vfu_tgt/Makefile ++++ b/module/event/subsystems/vfu_tgt/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = vfu_tgt.c +-LIBNAME = event_vfu_tgt +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = vfu_tgt.c ++LIBNAME = event_vfu_tgt ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/vfu_tgt/vfu_tgt.c b/module/event/subsystems/vfu_tgt/vfu_tgt.c +index ef3a687..e870dda 100644 +--- a/module/event/subsystems/vfu_tgt/vfu_tgt.c ++++ b/module/event/subsystems/vfu_tgt/vfu_tgt.c +@@ -1,41 +1,41 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/vfu_target.h" +- +-#include "spdk_internal/init.h" +- +-static void +-vfu_subsystem_init_done(int rc) +-{ +- spdk_subsystem_init_next(rc); +-} +- +-static void +-vfu_target_subsystem_init(void) +-{ +- spdk_vfu_init(vfu_subsystem_init_done); +-} +- +-static void +-vfu_target_subsystem_fini_done(void) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-vfu_target_subsystem_fini(void) +-{ +- spdk_vfu_fini(vfu_target_subsystem_fini_done); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_vfu_target = { +- .name = "vfio_user_target", +- .init = vfu_target_subsystem_init, +- .fini = vfu_target_subsystem_fini, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vfu_target); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/vfu_target.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++vfu_subsystem_init_done(int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++vfu_target_subsystem_init(void) ++{ ++ spdk_vfu_init(vfu_subsystem_init_done); ++} ++ ++static void ++vfu_target_subsystem_fini_done(void) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++vfu_target_subsystem_fini(void) ++{ ++ spdk_vfu_fini(vfu_target_subsystem_fini_done); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_vfu_target = { ++ .name = "vfio_user_target", ++ .init = vfu_target_subsystem_init, ++ .fini = vfu_target_subsystem_fini, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vfu_target); +diff --git a/module/event/subsystems/vhost_blk/Makefile b/module/event/subsystems/vhost_blk/Makefile +index 3e831b7..0f0b2d1 100644 +--- a/module/event/subsystems/vhost_blk/Makefile ++++ b/module/event/subsystems/vhost_blk/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = vhost_blk.c +-LIBNAME = event_vhost_blk +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = vhost_blk.c ++LIBNAME = event_vhost_blk ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/vhost_blk/vhost_blk.c b/module/event/subsystems/vhost_blk/vhost_blk.c +index 8a661af..5d8c0e9 100644 +--- a/module/event/subsystems/vhost_blk/vhost_blk.c ++++ b/module/event/subsystems/vhost_blk/vhost_blk.c +@@ -1,44 +1,44 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/vhost.h" +- +-#include "spdk_internal/init.h" +- +-static void +-vhost_blk_subsystem_init_done(int rc) +-{ +- spdk_subsystem_init_next(rc); +-} +- +-static void +-vhost_blk_subsystem_init(void) +-{ +- spdk_vhost_blk_init(vhost_blk_subsystem_init_done); +-} +- +-static void +-vhost_blk_subsystem_fini_done(void) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-vhost_blk_subsystem_fini(void) +-{ +- spdk_vhost_blk_fini(vhost_blk_subsystem_fini_done); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_vhost_blk = { +- .name = "vhost_blk", +- .init = vhost_blk_subsystem_init, +- .fini = vhost_blk_subsystem_fini, +- .write_config_json = spdk_vhost_blk_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost_blk); +-SPDK_SUBSYSTEM_DEPEND(vhost_blk, bdev) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/vhost.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++vhost_blk_subsystem_init_done(int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++vhost_blk_subsystem_init(void) ++{ ++ spdk_vhost_blk_init(vhost_blk_subsystem_init_done); ++} ++ ++static void ++vhost_blk_subsystem_fini_done(void) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++vhost_blk_subsystem_fini(void) ++{ ++ spdk_vhost_blk_fini(vhost_blk_subsystem_fini_done); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_vhost_blk = { ++ .name = "vhost_blk", ++ .init = vhost_blk_subsystem_init, ++ .fini = vhost_blk_subsystem_fini, ++ .write_config_json = spdk_vhost_blk_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost_blk); ++SPDK_SUBSYSTEM_DEPEND(vhost_blk, bdev) +diff --git a/module/event/subsystems/vhost_scsi/Makefile b/module/event/subsystems/vhost_scsi/Makefile +index fc2ad51..5722952 100644 +--- a/module/event/subsystems/vhost_scsi/Makefile ++++ b/module/event/subsystems/vhost_scsi/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = vhost_scsi.c +-LIBNAME = event_vhost_scsi +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = vhost_scsi.c ++LIBNAME = event_vhost_scsi ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/vhost_scsi/vhost_scsi.c b/module/event/subsystems/vhost_scsi/vhost_scsi.c +index 0b6900a..bdbea5e 100644 +--- a/module/event/subsystems/vhost_scsi/vhost_scsi.c ++++ b/module/event/subsystems/vhost_scsi/vhost_scsi.c +@@ -1,44 +1,44 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/vhost.h" +- +-#include "spdk_internal/init.h" +- +-static void +-vhost_scsi_subsystem_init_done(int rc) +-{ +- spdk_subsystem_init_next(rc); +-} +- +-static void +-vhost_scsi_subsystem_init(void) +-{ +- spdk_vhost_scsi_init(vhost_scsi_subsystem_init_done); +-} +- +-static void +-vhost_scsi_subsystem_fini_done(void) +-{ +- spdk_subsystem_fini_next(); +-} +- +-static void +-vhost_scsi_subsystem_fini(void) +-{ +- spdk_vhost_scsi_fini(vhost_scsi_subsystem_fini_done); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_vhost_scsi = { +- .name = "vhost_scsi", +- .init = vhost_scsi_subsystem_init, +- .fini = vhost_scsi_subsystem_fini, +- .write_config_json = spdk_vhost_scsi_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost_scsi); +-SPDK_SUBSYSTEM_DEPEND(vhost_scsi, scsi) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/vhost.h" ++ ++#include "spdk_internal/init.h" ++ ++static void ++vhost_scsi_subsystem_init_done(int rc) ++{ ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++vhost_scsi_subsystem_init(void) ++{ ++ spdk_vhost_scsi_init(vhost_scsi_subsystem_init_done); ++} ++ ++static void ++vhost_scsi_subsystem_fini_done(void) ++{ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++vhost_scsi_subsystem_fini(void) ++{ ++ spdk_vhost_scsi_fini(vhost_scsi_subsystem_fini_done); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_vhost_scsi = { ++ .name = "vhost_scsi", ++ .init = vhost_scsi_subsystem_init, ++ .fini = vhost_scsi_subsystem_fini, ++ .write_config_json = spdk_vhost_scsi_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost_scsi); ++SPDK_SUBSYSTEM_DEPEND(vhost_scsi, scsi) +diff --git a/module/event/subsystems/vmd/Makefile b/module/event/subsystems/vmd/Makefile +index 5fa0fbe..9836108 100644 +--- a/module/event/subsystems/vmd/Makefile ++++ b/module/event/subsystems/vmd/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-C_SRCS = vmd.c vmd_rpc.c +-LIBNAME = event_vmd +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++C_SRCS = vmd.c vmd_rpc.c ++LIBNAME = event_vmd ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/event/subsystems/vmd/event_vmd.h b/module/event/subsystems/vmd/event_vmd.h +index 9bea645..25bf25a 100644 +--- a/module/event/subsystems/vmd/event_vmd.h ++++ b/module/event/subsystems/vmd/event_vmd.h +@@ -1,12 +1,12 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef EVENT_VMD_H +-#define EVENT_VMD_H +- +-void vmd_subsystem_enable(void); +-bool vmd_subsystem_is_enabled(void); +- +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#ifndef EVENT_VMD_H ++#define EVENT_VMD_H ++ ++void vmd_subsystem_enable(void); ++bool vmd_subsystem_is_enabled(void); ++ ++#endif +diff --git a/module/event/subsystems/vmd/vmd.c b/module/event/subsystems/vmd/vmd.c +index 5a71b5f..fa0c605 100644 +--- a/module/event/subsystems/vmd/vmd.c ++++ b/module/event/subsystems/vmd/vmd.c +@@ -1,99 +1,99 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#include "spdk/json.h" +-#include "spdk/thread.h" +-#include "spdk/likely.h" +-#include "spdk/log.h" +- +-#include "spdk/vmd.h" +- +-#include "spdk_internal/init.h" +-#include "event_vmd.h" +- +-static struct spdk_poller *g_hotplug_poller; +-static bool g_enabled; +- +-void +-vmd_subsystem_enable(void) +-{ +- g_enabled = true; +-} +- +-bool +-vmd_subsystem_is_enabled(void) +-{ +- return g_enabled; +-} +- +-static int +-vmd_hotplug_monitor(void *ctx) +-{ +- return spdk_vmd_hotplug_monitor(); +-} +- +-static void +-vmd_subsystem_init(void) +-{ +- int rc = 0; +- +- if (!g_enabled) { +- goto out; +- } +- +- rc = spdk_vmd_init(); +- if (spdk_likely(rc != 0)) { +- SPDK_ERRLOG("Failed to initialize the VMD library\n"); +- goto out; +- } +- +- assert(g_hotplug_poller == NULL); +- +- g_hotplug_poller = SPDK_POLLER_REGISTER(vmd_hotplug_monitor, NULL, 1000000ULL); +- if (g_hotplug_poller == NULL) { +- SPDK_ERRLOG("Failed to register hotplug monitor poller\n"); +- rc = -ENOMEM; +- goto out; +- } +-out: +- spdk_subsystem_init_next(rc); +-} +- +-static void +-vmd_subsystem_fini(void) +-{ +- spdk_poller_unregister(&g_hotplug_poller); +- +- spdk_vmd_fini(); +- +- spdk_subsystem_fini_next(); +-} +- +-static void +-vmd_write_config_json(struct spdk_json_write_ctx *w) +-{ +- spdk_json_write_array_begin(w); +- +- if (g_enabled) { +- spdk_json_write_object_begin(w); +- spdk_json_write_named_string(w, "method", "vmd_enable"); +- spdk_json_write_named_object_begin(w, "params"); +- spdk_json_write_object_end(w); +- spdk_json_write_object_end(w); +- } +- +- spdk_json_write_array_end(w); +-} +- +-static struct spdk_subsystem g_spdk_subsystem_vmd = { +- .name = "vmd", +- .init = vmd_subsystem_init, +- .fini = vmd_subsystem_fini, +- .write_config_json = vmd_write_config_json, +-}; +- +-SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vmd); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++ ++#include "spdk/json.h" ++#include "spdk/thread.h" ++#include "spdk/likely.h" ++#include "spdk/log.h" ++ ++#include "spdk/vmd.h" ++ ++#include "spdk_internal/init.h" ++#include "event_vmd.h" ++ ++static struct spdk_poller *g_hotplug_poller; ++static bool g_enabled; ++ ++void ++vmd_subsystem_enable(void) ++{ ++ g_enabled = true; ++} ++ ++bool ++vmd_subsystem_is_enabled(void) ++{ ++ return g_enabled; ++} ++ ++static int ++vmd_hotplug_monitor(void *ctx) ++{ ++ return spdk_vmd_hotplug_monitor(); ++} ++ ++static void ++vmd_subsystem_init(void) ++{ ++ int rc = 0; ++ ++ if (!g_enabled) { ++ goto out; ++ } ++ ++ rc = spdk_vmd_init(); ++ if (spdk_likely(rc != 0)) { ++ SPDK_ERRLOG("Failed to initialize the VMD library\n"); ++ goto out; ++ } ++ ++ assert(g_hotplug_poller == NULL); ++ ++ g_hotplug_poller = SPDK_POLLER_REGISTER(vmd_hotplug_monitor, NULL, 1000000ULL); ++ if (g_hotplug_poller == NULL) { ++ SPDK_ERRLOG("Failed to register hotplug monitor poller\n"); ++ rc = -ENOMEM; ++ goto out; ++ } ++out: ++ spdk_subsystem_init_next(rc); ++} ++ ++static void ++vmd_subsystem_fini(void) ++{ ++ spdk_poller_unregister(&g_hotplug_poller); ++ ++ spdk_vmd_fini(); ++ ++ spdk_subsystem_fini_next(); ++} ++ ++static void ++vmd_write_config_json(struct spdk_json_write_ctx *w) ++{ ++ spdk_json_write_array_begin(w); ++ ++ if (g_enabled) { ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_string(w, "method", "vmd_enable"); ++ spdk_json_write_named_object_begin(w, "params"); ++ spdk_json_write_object_end(w); ++ spdk_json_write_object_end(w); ++ } ++ ++ spdk_json_write_array_end(w); ++} ++ ++static struct spdk_subsystem g_spdk_subsystem_vmd = { ++ .name = "vmd", ++ .init = vmd_subsystem_init, ++ .fini = vmd_subsystem_fini, ++ .write_config_json = vmd_write_config_json, ++}; ++ ++SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vmd); +diff --git a/module/event/subsystems/vmd/vmd_rpc.c b/module/event/subsystems/vmd/vmd_rpc.c +index 507b58f..50308b3 100644 +--- a/module/event/subsystems/vmd/vmd_rpc.c ++++ b/module/event/subsystems/vmd/vmd_rpc.c +@@ -1,96 +1,96 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/vmd.h" +- +-#include "spdk/env.h" +-#include "spdk/rpc.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +- +-#include "spdk/log.h" +-#include "event_vmd.h" +- +-static void +-rpc_vmd_enable(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +-{ +- vmd_subsystem_enable(); +- +- spdk_jsonrpc_send_bool_response(request, true); +-} +-SPDK_RPC_REGISTER("vmd_enable", rpc_vmd_enable, SPDK_RPC_STARTUP) +-SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vmd_enable, enable_vmd) +- +-struct rpc_vmd_remove_device { +- char *addr; +-}; +- +-static const struct spdk_json_object_decoder rpc_vmd_remove_device_decoders[] = { +- {"addr", offsetof(struct rpc_vmd_remove_device, addr), spdk_json_decode_string}, +-}; +- +-static void +-rpc_vmd_remove_device(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +-{ +- struct rpc_vmd_remove_device req = {}; +- struct spdk_pci_addr addr; +- int rc; +- +- if (!vmd_subsystem_is_enabled()) { +- spdk_jsonrpc_send_error_response(request, -EPERM, "VMD subsystem is disabled"); +- return; +- } +- +- rc = spdk_json_decode_object(params, rpc_vmd_remove_device_decoders, +- SPDK_COUNTOF(rpc_vmd_remove_device_decoders), +- &req); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, +- "spdk_json_decode_object failed"); +- return; +- } +- +- rc = spdk_pci_addr_parse(&addr, req.addr); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse PCI address"); +- goto out; +- } +- +- rc = spdk_vmd_remove_device(&addr); +- if (rc != 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- goto out; +- } +- +- spdk_jsonrpc_send_bool_response(request, true); +-out: +- free(req.addr); +-} +-SPDK_RPC_REGISTER("vmd_remove_device", rpc_vmd_remove_device, SPDK_RPC_RUNTIME) +- +-static void +-rpc_vmd_rescan(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +-{ +- struct spdk_json_write_ctx *w; +- int rc; +- +- if (!vmd_subsystem_is_enabled()) { +- spdk_jsonrpc_send_error_response(request, -EPERM, "VMD subsystem is disabled"); +- return; +- } +- +- rc = spdk_vmd_rescan(); +- if (rc < 0) { +- spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); +- return; +- } +- +- w = spdk_jsonrpc_begin_result(request); +- spdk_json_write_object_begin(w); +- spdk_json_write_named_uint32(w, "count", (uint32_t)rc); +- spdk_json_write_object_end(w); +- spdk_jsonrpc_end_result(request, w); +-} +-SPDK_RPC_REGISTER("vmd_rescan", rpc_vmd_rescan, SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/vmd.h" ++ ++#include "spdk/env.h" ++#include "spdk/rpc.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "spdk/log.h" ++#include "event_vmd.h" ++ ++static void ++rpc_vmd_enable(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ vmd_subsystem_enable(); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++} ++SPDK_RPC_REGISTER("vmd_enable", rpc_vmd_enable, SPDK_RPC_STARTUP) ++SPDK_RPC_REGISTER_ALIAS_DEPRECATED(vmd_enable, enable_vmd) ++ ++struct rpc_vmd_remove_device { ++ char *addr; ++}; ++ ++static const struct spdk_json_object_decoder rpc_vmd_remove_device_decoders[] = { ++ {"addr", offsetof(struct rpc_vmd_remove_device, addr), spdk_json_decode_string}, ++}; ++ ++static void ++rpc_vmd_remove_device(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ struct rpc_vmd_remove_device req = {}; ++ struct spdk_pci_addr addr; ++ int rc; ++ ++ if (!vmd_subsystem_is_enabled()) { ++ spdk_jsonrpc_send_error_response(request, -EPERM, "VMD subsystem is disabled"); ++ return; ++ } ++ ++ rc = spdk_json_decode_object(params, rpc_vmd_remove_device_decoders, ++ SPDK_COUNTOF(rpc_vmd_remove_device_decoders), ++ &req); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, ++ "spdk_json_decode_object failed"); ++ return; ++ } ++ ++ rc = spdk_pci_addr_parse(&addr, req.addr); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, -EINVAL, "Failed to parse PCI address"); ++ goto out; ++ } ++ ++ rc = spdk_vmd_remove_device(&addr); ++ if (rc != 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ goto out; ++ } ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++out: ++ free(req.addr); ++} ++SPDK_RPC_REGISTER("vmd_remove_device", rpc_vmd_remove_device, SPDK_RPC_RUNTIME) ++ ++static void ++rpc_vmd_rescan(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) ++{ ++ struct spdk_json_write_ctx *w; ++ int rc; ++ ++ if (!vmd_subsystem_is_enabled()) { ++ spdk_jsonrpc_send_error_response(request, -EPERM, "VMD subsystem is disabled"); ++ return; ++ } ++ ++ rc = spdk_vmd_rescan(); ++ if (rc < 0) { ++ spdk_jsonrpc_send_error_response(request, rc, spdk_strerror(-rc)); ++ return; ++ } ++ ++ w = spdk_jsonrpc_begin_result(request); ++ spdk_json_write_object_begin(w); ++ spdk_json_write_named_uint32(w, "count", (uint32_t)rc); ++ spdk_json_write_object_end(w); ++ spdk_jsonrpc_end_result(request, w); ++} ++SPDK_RPC_REGISTER("vmd_rescan", rpc_vmd_rescan, SPDK_RPC_RUNTIME) +diff --git a/module/scheduler/Makefile b/module/scheduler/Makefile +index d973ce0..48e8c4e 100644 +--- a/module/scheduler/Makefile ++++ b/module/scheduler/Makefile +@@ -1,24 +1,24 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y = dynamic +- +-# When DPDK rte_power is missing, do not compile schedulers +-# and governors based on it. +-ifeq (y,$(DPDK_POWER)) +-DIRS-y += dpdk_governor gscheduler +-else +-$(warning Skipping building dpdk_governor and gscheduler, due to missing rte_power) +-endif +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y = dynamic ++ ++# When DPDK rte_power is missing, do not compile schedulers ++# and governors based on it. ++ifeq (y,$(DPDK_POWER)) ++DIRS-y += dpdk_governor gscheduler ++else ++$(warning Skipping building dpdk_governor and gscheduler, due to missing rte_power) ++endif ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/scheduler/dpdk_governor/Makefile b/module/scheduler/dpdk_governor/Makefile +index 07717a8..7aef856 100644 +--- a/module/scheduler/dpdk_governor/Makefile ++++ b/module/scheduler/dpdk_governor/Makefile +@@ -1,19 +1,19 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 2 +-SO_MINOR := 0 +- +-CFLAGS += $(ENV_CFLAGS) +- +-LIBNAME = scheduler_dpdk_governor +-C_SRCS = dpdk_governor.c +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 2 ++SO_MINOR := 0 ++ ++CFLAGS += $(ENV_CFLAGS) ++ ++LIBNAME = scheduler_dpdk_governor ++C_SRCS = dpdk_governor.c ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/scheduler/dpdk_governor/dpdk_governor.c b/module/scheduler/dpdk_governor/dpdk_governor.c +index 2e1ff2e..5db49d0 100644 +--- a/module/scheduler/dpdk_governor/dpdk_governor.c ++++ b/module/scheduler/dpdk_governor/dpdk_governor.c +@@ -1,163 +1,163 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2020 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/log.h" +-#include "spdk/env.h" +-#include "spdk/event.h" +-#include "spdk/scheduler.h" +- +-#include "spdk_internal/event.h" +- +-#include +- +-static uint32_t +-_get_core_curr_freq(uint32_t lcore_id) +-{ +- const uint32_t MAX_CORE_FREQ_NUM = 64; +- uint32_t freqs[MAX_CORE_FREQ_NUM]; +- uint32_t freq_index; +- int rc; +- +- rc = rte_power_freqs(lcore_id, freqs, MAX_CORE_FREQ_NUM); +- if (!rc) { +- SPDK_ERRLOG("Unable to get current core frequency array for core %d\n.", lcore_id); +- +- return 0; +- } +- freq_index = rte_power_get_freq(lcore_id); +- if (freq_index >= MAX_CORE_FREQ_NUM) { +- SPDK_ERRLOG("Unable to get current core frequency for core %d\n.", lcore_id); +- +- return 0; +- } +- +- return freqs[freq_index]; +-} +- +-static int +-_core_freq_up(uint32_t lcore_id) +-{ +- return rte_power_freq_up(lcore_id); +-} +- +-static int +-_core_freq_down(uint32_t lcore_id) +-{ +- return rte_power_freq_down(lcore_id); +-} +- +-static int +-_set_core_freq_max(uint32_t lcore_id) +-{ +- return rte_power_freq_max(lcore_id); +-} +- +-static int +-_set_core_freq_min(uint32_t lcore_id) +-{ +- return rte_power_freq_min(lcore_id); +-} +- +-static int +-_get_core_capabilities(uint32_t lcore_id, struct spdk_governor_capabilities *capabilities) +-{ +- struct rte_power_core_capabilities caps; +- int rc; +- +- rc = rte_power_get_capabilities(lcore_id, &caps); +- if (rc != 0) { +- return rc; +- } +- +- capabilities->priority = caps.priority == 0 ? 
false : true; +- +- return 0; +-} +- +-static int +-_init_core(uint32_t lcore_id) +-{ +- struct rte_power_core_capabilities caps; +- int rc; +- +- rc = rte_power_init(lcore_id); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to initialize on core%d\n", lcore_id); +- return rc; +- } +- +- rc = rte_power_get_capabilities(lcore_id, &caps); +- if (rc != 0) { +- SPDK_ERRLOG("Failed retrieve capabilities of core%d\n", lcore_id); +- return rc; +- } +- +- if (caps.turbo) { +- rc = rte_power_freq_enable_turbo(lcore_id); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to set turbo on core%d\n", lcore_id); +- return rc; +- } +- } +- +- return rc; +-} +- +-static int +-_init(void) +-{ +- uint32_t i, j; +- int rc = 0; +- +- SPDK_ENV_FOREACH_CORE(i) { +- rc = _init_core(i); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to initialize on core%d\n", i); +- break; +- } +- } +- +- if (rc == 0) { +- return rc; +- } +- +- /* When initialization of a core failed, deinitialize prior cores. */ +- SPDK_ENV_FOREACH_CORE(j) { +- if (j >= i) { +- break; +- } +- if (rte_power_exit(j) != 0) { +- SPDK_ERRLOG("Failed to deinitialize on core%d\n", j); +- } +- } +- return rc; +-} +- +-static void +-_deinit(void) +-{ +- uint32_t i; +- +- SPDK_ENV_FOREACH_CORE(i) { +- if (rte_power_exit(i) != 0) { +- SPDK_ERRLOG("Failed to deinitialize on core%d\n", i); +- } +- } +-} +- +-static struct spdk_governor dpdk_governor = { +- .name = "dpdk_governor", +- .get_core_curr_freq = _get_core_curr_freq, +- .core_freq_up = _core_freq_up, +- .core_freq_down = _core_freq_down, +- .set_core_freq_max = _set_core_freq_max, +- .set_core_freq_min = _set_core_freq_min, +- .get_core_capabilities = _get_core_capabilities, +- .init = _init, +- .deinit = _deinit, +-}; +- +-SPDK_GOVERNOR_REGISTER(dpdk_governor); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2020 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/log.h" ++#include "spdk/env.h" ++#include "spdk/event.h" ++#include "spdk/scheduler.h" ++ ++#include "spdk_internal/event.h" ++ ++#include ++ ++static uint32_t ++_get_core_curr_freq(uint32_t lcore_id) ++{ ++ const uint32_t MAX_CORE_FREQ_NUM = 64; ++ uint32_t freqs[MAX_CORE_FREQ_NUM]; ++ uint32_t freq_index; ++ int rc; ++ ++ rc = rte_power_freqs(lcore_id, freqs, MAX_CORE_FREQ_NUM); ++ if (!rc) { ++ SPDK_ERRLOG("Unable to get current core frequency array for core %d\n.", lcore_id); ++ ++ return 0; ++ } ++ freq_index = rte_power_get_freq(lcore_id); ++ if (freq_index >= MAX_CORE_FREQ_NUM) { ++ SPDK_ERRLOG("Unable to get current core frequency for core %d\n.", lcore_id); ++ ++ return 0; ++ } ++ ++ return freqs[freq_index]; ++} ++ ++static int ++_core_freq_up(uint32_t lcore_id) ++{ ++ return rte_power_freq_up(lcore_id); ++} ++ ++static int ++_core_freq_down(uint32_t lcore_id) ++{ ++ return rte_power_freq_down(lcore_id); ++} ++ ++static int ++_set_core_freq_max(uint32_t lcore_id) ++{ ++ return rte_power_freq_max(lcore_id); ++} ++ ++static int ++_set_core_freq_min(uint32_t lcore_id) ++{ ++ return rte_power_freq_min(lcore_id); ++} ++ ++static int ++_get_core_capabilities(uint32_t lcore_id, struct spdk_governor_capabilities *capabilities) ++{ ++ struct rte_power_core_capabilities caps; ++ int rc; ++ ++ rc = rte_power_get_capabilities(lcore_id, &caps); ++ if (rc != 0) { ++ return rc; ++ } ++ ++ capabilities->priority = caps.priority == 0 ? 
false : true; ++ ++ return 0; ++} ++ ++static int ++_init_core(uint32_t lcore_id) ++{ ++ struct rte_power_core_capabilities caps; ++ int rc; ++ ++ rc = rte_power_init(lcore_id); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to initialize on core%d\n", lcore_id); ++ return rc; ++ } ++ ++ rc = rte_power_get_capabilities(lcore_id, &caps); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed retrieve capabilities of core%d\n", lcore_id); ++ return rc; ++ } ++ ++ if (caps.turbo) { ++ rc = rte_power_freq_enable_turbo(lcore_id); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to set turbo on core%d\n", lcore_id); ++ return rc; ++ } ++ } ++ ++ return rc; ++} ++ ++static int ++_init(void) ++{ ++ uint32_t i, j; ++ int rc = 0; ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ rc = _init_core(i); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to initialize on core%d\n", i); ++ break; ++ } ++ } ++ ++ if (rc == 0) { ++ return rc; ++ } ++ ++ /* When initialization of a core failed, deinitialize prior cores. */ ++ SPDK_ENV_FOREACH_CORE(j) { ++ if (j >= i) { ++ break; ++ } ++ if (rte_power_exit(j) != 0) { ++ SPDK_ERRLOG("Failed to deinitialize on core%d\n", j); ++ } ++ } ++ return rc; ++} ++ ++static void ++_deinit(void) ++{ ++ uint32_t i; ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ if (rte_power_exit(i) != 0) { ++ SPDK_ERRLOG("Failed to deinitialize on core%d\n", i); ++ } ++ } ++} ++ ++static struct spdk_governor dpdk_governor = { ++ .name = "dpdk_governor", ++ .get_core_curr_freq = _get_core_curr_freq, ++ .core_freq_up = _core_freq_up, ++ .core_freq_down = _core_freq_down, ++ .set_core_freq_max = _set_core_freq_max, ++ .set_core_freq_min = _set_core_freq_min, ++ .get_core_capabilities = _get_core_capabilities, ++ .init = _init, ++ .deinit = _deinit, ++}; ++ ++SPDK_GOVERNOR_REGISTER(dpdk_governor); +diff --git a/module/scheduler/dynamic/Makefile b/module/scheduler/dynamic/Makefile +index 81815d8..01068fe 100644 +--- a/module/scheduler/dynamic/Makefile ++++ b/module/scheduler/dynamic/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 2 +-SO_MINOR := 0 +- +-LIBNAME = scheduler_dynamic +-C_SRCS = scheduler_dynamic.c +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 2 ++SO_MINOR := 0 ++ ++LIBNAME = scheduler_dynamic ++C_SRCS = scheduler_dynamic.c ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/scheduler/dynamic/scheduler_dynamic.c b/module/scheduler/dynamic/scheduler_dynamic.c +index 25ead07..9d3534b 100644 +--- a/module/scheduler/dynamic/scheduler_dynamic.c ++++ b/module/scheduler/dynamic/scheduler_dynamic.c +@@ -1,412 +1,412 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/likely.h" +-#include "spdk/event.h" +-#include "spdk/log.h" +-#include "spdk/env.h" +- +-#include "spdk/thread.h" +-#include "spdk_internal/event.h" +-#include "spdk/scheduler.h" +-#include "spdk_internal/usdt.h" +- +-static uint32_t g_main_lcore; +- +-struct core_stats { +- uint64_t busy; +- uint64_t idle; +- uint32_t thread_count; +-}; +- +-static struct core_stats *g_cores; +- +-uint8_t g_scheduler_load_limit = 20; +-uint8_t g_scheduler_core_limit = 80; +-uint8_t g_scheduler_core_busy = 95; +- +-static uint8_t +-_busy_pct(uint64_t busy, uint64_t idle) +-{ +- if ((busy + idle) == 0) { +- return 0; +- } +- +- return busy * 100 / (busy + idle); +-} +- +-static uint8_t +-_get_thread_load(struct spdk_scheduler_thread_info *thread_info) +-{ +- uint64_t busy, idle; +- +- busy = thread_info->current_stats.busy_tsc; +- idle = thread_info->current_stats.idle_tsc; +- +- /* return percentage of time thread was busy */ +- return _busy_pct(busy, idle); +-} +- +-typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info); +- +-static void +-_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn) +-{ +- struct spdk_scheduler_core_info *core; +- uint32_t i, j; +- +- SPDK_ENV_FOREACH_CORE(i) { +- core = &cores_info[i]; +- for (j = 0; j < core->threads_count; j++) { +- fn(&core->thread_infos[j]); +- } +- } +-} +- +-static void +-_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core) +-{ +- struct core_stats *dst = &g_cores[dst_core]; +- struct core_stats *src = &g_cores[thread_info->lcore]; +- uint64_t busy_tsc = thread_info->current_stats.busy_tsc; +- uint8_t busy_pct = _busy_pct(src->busy, src->idle); +- uint64_t tsc; +- +- SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core); +- +- if (src == dst) { +- /* Don't modify stats if thread is already on that core. */ +- return; +- } +- +- dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc); +- dst->idle -= spdk_min(dst->idle, busy_tsc); +- dst->thread_count++; +- +- /* Adjust busy/idle from core as if thread was not present on it. +- * Core load will reflect the sum of all remaining threads on it. */ +- src->busy -= spdk_min(src->busy, busy_tsc); +- src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc); +- +- if (busy_pct >= g_scheduler_core_busy && +- _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) { +- /* This core was so busy that we cannot assume all of busy_tsc +- * consumed by the moved thread will now be idle_tsc - it's +- * very possible the remaining threads will use these cycles +- * as busy_tsc. +- * +- * So make sure we don't drop the updated estimate below +- * g_scheduler_core_limit, so that other cores can't +- * move threads to this core during this scheduling +- * period. +- */ +- tsc = src->busy + src->idle; +- src->busy = tsc * g_scheduler_core_limit / 100; +- src->idle = tsc - src->busy; +- } +- assert(src->thread_count > 0); +- src->thread_count--; +- +- thread_info->lcore = dst_core; +-} +- +-static bool +-_is_core_at_limit(uint32_t core_id) +-{ +- struct core_stats *core = &g_cores[core_id]; +- uint64_t busy, idle; +- +- /* Core with no or single thread cannot be over the limit. */ +- if (core->thread_count <= 1) { +- return false; +- } +- +- busy = core->busy; +- idle = core->idle; +- +- /* No work was done, exit before possible division by 0. 
*/ +- if (busy == 0) { +- return false; +- } +- +- /* Work done was less than the limit */ +- if (_busy_pct(busy, idle) < g_scheduler_core_limit) { +- return false; +- } +- +- return true; +-} +- +-static bool +-_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core) +-{ +- struct core_stats *dst = &g_cores[dst_core]; +- uint64_t new_busy_tsc, new_idle_tsc; +- +- /* Thread can always fit on the core it's currently on. */ +- if (thread_info->lcore == dst_core) { +- return true; +- } +- +- /* Reactors in interrupt mode do not update stats, +- * a thread can always fit into reactor in interrupt mode. */ +- if (dst->busy + dst->idle == 0) { +- return true; +- } +- +- /* Core has no threads. */ +- if (dst->thread_count == 0) { +- return true; +- } +- +- /* Core doesn't have enough idle_tsc to take this thread. */ +- if (dst->idle < thread_info->current_stats.busy_tsc) { +- return false; +- } +- +- new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc; +- new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc; +- +- /* Core cannot fit this thread if it would put it over the +- * g_scheduler_core_limit. */ +- return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit; +-} +- +-static uint32_t +-_find_optimal_core(struct spdk_scheduler_thread_info *thread_info) +-{ +- uint32_t i; +- uint32_t current_lcore = thread_info->lcore; +- uint32_t least_busy_lcore = thread_info->lcore; +- struct spdk_thread *thread; +- struct spdk_cpuset *cpumask; +- bool core_at_limit = _is_core_at_limit(current_lcore); +- +- thread = spdk_thread_get_by_id(thread_info->thread_id); +- if (thread == NULL) { +- return current_lcore; +- } +- cpumask = spdk_thread_get_cpumask(thread); +- +- /* Find a core that can fit the thread. */ +- SPDK_ENV_FOREACH_CORE(i) { +- /* Ignore cores outside cpumask. */ +- if (!spdk_cpuset_get_cpu(cpumask, i)) { +- continue; +- } +- +- /* Search for least busy core. */ +- if (g_cores[i].busy < g_cores[least_busy_lcore].busy) { +- least_busy_lcore = i; +- } +- +- /* Skip cores that cannot fit the thread and current one. */ +- if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) { +- continue; +- } +- if (i == g_main_lcore) { +- /* First consider g_main_lcore, consolidate threads on main lcore if possible. */ +- return i; +- } else if (i < current_lcore && current_lcore != g_main_lcore) { +- /* Lower core id was found, move to consolidate threads on lowest core ids. */ +- return i; +- } else if (core_at_limit) { +- /* When core is over the limit, any core id is better than current one. */ +- return i; +- } +- } +- +- /* For cores over the limit, place the thread on least busy core +- * to balance threads. */ +- if (core_at_limit) { +- return least_busy_lcore; +- } +- +- /* If no better core is found, remain on the same one. 
*/ +- return current_lcore; +-} +- +-static int +-init(void) +-{ +- g_main_lcore = spdk_env_get_current_core(); +- +- if (spdk_governor_set("dpdk_governor") != 0) { +- SPDK_NOTICELOG("Unable to initialize dpdk governor\n"); +- } +- +- g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats)); +- if (g_cores == NULL) { +- SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n"); +- return -ENOMEM; +- } +- +- if (spdk_scheduler_get_period() == 0) { +- /* set default scheduling period to one second */ +- spdk_scheduler_set_period(SPDK_SEC_TO_USEC); +- } +- +- return 0; +-} +- +-static void +-deinit(void) +-{ +- free(g_cores); +- g_cores = NULL; +- spdk_governor_set(NULL); +-} +- +-static void +-_balance_idle(struct spdk_scheduler_thread_info *thread_info) +-{ +- if (_get_thread_load(thread_info) >= g_scheduler_load_limit) { +- return; +- } +- /* This thread is idle, move it to the main core. */ +- _move_thread(thread_info, g_main_lcore); +-} +- +-static void +-_balance_active(struct spdk_scheduler_thread_info *thread_info) +-{ +- uint32_t target_lcore; +- +- if (_get_thread_load(thread_info) < g_scheduler_load_limit) { +- return; +- } +- +- /* This thread is active. */ +- target_lcore = _find_optimal_core(thread_info); +- _move_thread(thread_info, target_lcore); +-} +- +-static void +-balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count) +-{ +- struct spdk_reactor *reactor; +- struct spdk_governor *governor; +- struct spdk_scheduler_core_info *core; +- struct core_stats *main_core; +- uint32_t i; +- int rc; +- bool busy_threads_present = false; +- +- SPDK_DTRACE_PROBE1(dynsched_balance, cores_count); +- +- SPDK_ENV_FOREACH_CORE(i) { +- g_cores[i].thread_count = cores_info[i].threads_count; +- g_cores[i].busy = cores_info[i].current_busy_tsc; +- g_cores[i].idle = cores_info[i].current_idle_tsc; +- SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]); +- } +- main_core = &g_cores[g_main_lcore]; +- +- /* Distribute threads in two passes, to make sure updated core stats are considered on each pass. +- * 1) Move all idle threads to main core. */ +- _foreach_thread(cores_info, _balance_idle); +- /* 2) Distribute active threads across all cores. */ +- _foreach_thread(cores_info, _balance_active); +- +- /* Switch unused cores to interrupt mode and switch cores to polled mode +- * if they will be used after rebalancing */ +- SPDK_ENV_FOREACH_CORE(i) { +- reactor = spdk_reactor_get(i); +- core = &cores_info[i]; +- /* We can switch mode only if reactor already does not have any threads */ +- if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) { +- core->interrupt_mode = true; +- } else if (g_cores[i].thread_count != 0) { +- core->interrupt_mode = false; +- if (i != g_main_lcore) { +- /* If a thread is present on non g_main_lcore, +- * it has to be busy. 
*/ +- busy_threads_present = true; +- } +- } +- } +- +- governor = spdk_governor_get(); +- if (governor == NULL) { +- return; +- } +- +- /* Change main core frequency if needed */ +- if (busy_threads_present) { +- rc = governor->set_core_freq_max(g_main_lcore); +- if (rc < 0) { +- SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore); +- } +- } else if (main_core->busy > main_core->idle) { +- rc = governor->core_freq_up(g_main_lcore); +- if (rc < 0) { +- SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore); +- } +- } else { +- rc = governor->core_freq_down(g_main_lcore); +- if (rc < 0) { +- SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore); +- } +- } +-} +- +-struct json_scheduler_opts { +- uint8_t load_limit; +- uint8_t core_limit; +- uint8_t core_busy; +-}; +- +-static const struct spdk_json_object_decoder sched_decoders[] = { +- {"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true}, +- {"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true}, +- {"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true}, +-}; +- +-static int +-set_opts(const struct spdk_json_val *opts) +-{ +- struct json_scheduler_opts scheduler_opts; +- +- scheduler_opts.load_limit = g_scheduler_load_limit; +- scheduler_opts.core_limit = g_scheduler_core_limit; +- scheduler_opts.core_busy = g_scheduler_core_busy; +- +- if (opts != NULL) { +- if (spdk_json_decode_object_relaxed(opts, sched_decoders, +- SPDK_COUNTOF(sched_decoders), &scheduler_opts)) { +- SPDK_ERRLOG("Decoding scheduler opts JSON failed\n"); +- return -1; +- } +- } +- +- SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit); +- g_scheduler_load_limit = scheduler_opts.load_limit; +- SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit); +- g_scheduler_core_limit = scheduler_opts.core_limit; +- SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy); +- g_scheduler_core_busy = scheduler_opts.core_busy; +- +- return 0; +-} +- +-static void +-get_opts(struct spdk_json_write_ctx *ctx) +-{ +- spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit); +- spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit); +- spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy); +-} +- +-static struct spdk_scheduler scheduler_dynamic = { +- .name = "dynamic", +- .init = init, +- .deinit = deinit, +- .balance = balance, +- .set_opts = set_opts, +- .get_opts = get_opts, +-}; +- +-SPDK_SCHEDULER_REGISTER(scheduler_dynamic); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/likely.h" ++#include "spdk/event.h" ++#include "spdk/log.h" ++#include "spdk/env.h" ++ ++#include "spdk/thread.h" ++#include "spdk_internal/event.h" ++#include "spdk/scheduler.h" ++#include "spdk_internal/usdt.h" ++ ++static uint32_t g_main_lcore; ++ ++struct core_stats { ++ uint64_t busy; ++ uint64_t idle; ++ uint32_t thread_count; ++}; ++ ++static struct core_stats *g_cores; ++ ++uint8_t g_scheduler_load_limit = 20; ++uint8_t g_scheduler_core_limit = 80; ++uint8_t g_scheduler_core_busy = 95; ++ ++static uint8_t ++_busy_pct(uint64_t busy, uint64_t idle) ++{ ++ if ((busy + idle) == 0) { ++ return 0; ++ } ++ ++ return busy * 100 / (busy + idle); ++} ++ ++static uint8_t ++_get_thread_load(struct spdk_scheduler_thread_info *thread_info) ++{ ++ uint64_t busy, idle; ++ ++ busy = thread_info->current_stats.busy_tsc; ++ idle = thread_info->current_stats.idle_tsc; ++ ++ /* return percentage of time thread was busy */ ++ return _busy_pct(busy, idle); ++} ++ ++typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info); ++ ++static void ++_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn) ++{ ++ struct spdk_scheduler_core_info *core; ++ uint32_t i, j; ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ core = &cores_info[i]; ++ for (j = 0; j < core->threads_count; j++) { ++ fn(&core->thread_infos[j]); ++ } ++ } ++} ++ ++static void ++_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core) ++{ ++ struct core_stats *dst = &g_cores[dst_core]; ++ struct core_stats *src = &g_cores[thread_info->lcore]; ++ uint64_t busy_tsc = thread_info->current_stats.busy_tsc; ++ uint8_t busy_pct = _busy_pct(src->busy, src->idle); ++ uint64_t tsc; ++ ++ SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core); ++ ++ if (src == dst) { ++ /* Don't modify stats if thread is already on that core. */ ++ return; ++ } ++ ++ dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc); ++ dst->idle -= spdk_min(dst->idle, busy_tsc); ++ dst->thread_count++; ++ ++ /* Adjust busy/idle from core as if thread was not present on it. ++ * Core load will reflect the sum of all remaining threads on it. */ ++ src->busy -= spdk_min(src->busy, busy_tsc); ++ src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc); ++ ++ if (busy_pct >= g_scheduler_core_busy && ++ _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) { ++ /* This core was so busy that we cannot assume all of busy_tsc ++ * consumed by the moved thread will now be idle_tsc - it's ++ * very possible the remaining threads will use these cycles ++ * as busy_tsc. ++ * ++ * So make sure we don't drop the updated estimate below ++ * g_scheduler_core_limit, so that other cores can't ++ * move threads to this core during this scheduling ++ * period. ++ */ ++ tsc = src->busy + src->idle; ++ src->busy = tsc * g_scheduler_core_limit / 100; ++ src->idle = tsc - src->busy; ++ } ++ assert(src->thread_count > 0); ++ src->thread_count--; ++ ++ thread_info->lcore = dst_core; ++} ++ ++static bool ++_is_core_at_limit(uint32_t core_id) ++{ ++ struct core_stats *core = &g_cores[core_id]; ++ uint64_t busy, idle; ++ ++ /* Core with no or single thread cannot be over the limit. */ ++ if (core->thread_count <= 1) { ++ return false; ++ } ++ ++ busy = core->busy; ++ idle = core->idle; ++ ++ /* No work was done, exit before possible division by 0. 
*/ ++ if (busy == 0) { ++ return false; ++ } ++ ++ /* Work done was less than the limit */ ++ if (_busy_pct(busy, idle) < g_scheduler_core_limit) { ++ return false; ++ } ++ ++ return true; ++} ++ ++static bool ++_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core) ++{ ++ struct core_stats *dst = &g_cores[dst_core]; ++ uint64_t new_busy_tsc, new_idle_tsc; ++ ++ /* Thread can always fit on the core it's currently on. */ ++ if (thread_info->lcore == dst_core) { ++ return true; ++ } ++ ++ /* Reactors in interrupt mode do not update stats, ++ * a thread can always fit into reactor in interrupt mode. */ ++ if (dst->busy + dst->idle == 0) { ++ return true; ++ } ++ ++ /* Core has no threads. */ ++ if (dst->thread_count == 0) { ++ return true; ++ } ++ ++ /* Core doesn't have enough idle_tsc to take this thread. */ ++ if (dst->idle < thread_info->current_stats.busy_tsc) { ++ return false; ++ } ++ ++ new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc; ++ new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc; ++ ++ /* Core cannot fit this thread if it would put it over the ++ * g_scheduler_core_limit. */ ++ return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit; ++} ++ ++static uint32_t ++_find_optimal_core(struct spdk_scheduler_thread_info *thread_info) ++{ ++ uint32_t i; ++ uint32_t current_lcore = thread_info->lcore; ++ uint32_t least_busy_lcore = thread_info->lcore; ++ struct spdk_thread *thread; ++ struct spdk_cpuset *cpumask; ++ bool core_at_limit = _is_core_at_limit(current_lcore); ++ ++ thread = spdk_thread_get_by_id(thread_info->thread_id); ++ if (thread == NULL) { ++ return current_lcore; ++ } ++ cpumask = spdk_thread_get_cpumask(thread); ++ ++ /* Find a core that can fit the thread. */ ++ SPDK_ENV_FOREACH_CORE(i) { ++ /* Ignore cores outside cpumask. */ ++ if (!spdk_cpuset_get_cpu(cpumask, i)) { ++ continue; ++ } ++ ++ /* Search for least busy core. */ ++ if (g_cores[i].busy < g_cores[least_busy_lcore].busy) { ++ least_busy_lcore = i; ++ } ++ ++ /* Skip cores that cannot fit the thread and current one. */ ++ if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) { ++ continue; ++ } ++ if (i == g_main_lcore) { ++ /* First consider g_main_lcore, consolidate threads on main lcore if possible. */ ++ return i; ++ } else if (i < current_lcore && current_lcore != g_main_lcore) { ++ /* Lower core id was found, move to consolidate threads on lowest core ids. */ ++ return i; ++ } else if (core_at_limit) { ++ /* When core is over the limit, any core id is better than current one. */ ++ return i; ++ } ++ } ++ ++ /* For cores over the limit, place the thread on least busy core ++ * to balance threads. */ ++ if (core_at_limit) { ++ return least_busy_lcore; ++ } ++ ++ /* If no better core is found, remain on the same one. 
*/ ++ return current_lcore; ++} ++ ++static int ++init(void) ++{ ++ g_main_lcore = spdk_env_get_current_core(); ++ ++ if (spdk_governor_set("dpdk_governor") != 0) { ++ SPDK_NOTICELOG("Unable to initialize dpdk governor\n"); ++ } ++ ++ g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats)); ++ if (g_cores == NULL) { ++ SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n"); ++ return -ENOMEM; ++ } ++ ++ if (spdk_scheduler_get_period() == 0) { ++ /* set default scheduling period to one second */ ++ spdk_scheduler_set_period(SPDK_SEC_TO_USEC); ++ } ++ ++ return 0; ++} ++ ++static void ++deinit(void) ++{ ++ free(g_cores); ++ g_cores = NULL; ++ spdk_governor_set(NULL); ++} ++ ++static void ++_balance_idle(struct spdk_scheduler_thread_info *thread_info) ++{ ++ if (_get_thread_load(thread_info) >= g_scheduler_load_limit) { ++ return; ++ } ++ /* This thread is idle, move it to the main core. */ ++ _move_thread(thread_info, g_main_lcore); ++} ++ ++static void ++_balance_active(struct spdk_scheduler_thread_info *thread_info) ++{ ++ uint32_t target_lcore; ++ ++ if (_get_thread_load(thread_info) < g_scheduler_load_limit) { ++ return; ++ } ++ ++ /* This thread is active. */ ++ target_lcore = _find_optimal_core(thread_info); ++ _move_thread(thread_info, target_lcore); ++} ++ ++static void ++balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count) ++{ ++ struct spdk_reactor *reactor; ++ struct spdk_governor *governor; ++ struct spdk_scheduler_core_info *core; ++ struct core_stats *main_core; ++ uint32_t i; ++ int rc; ++ bool busy_threads_present = false; ++ ++ SPDK_DTRACE_PROBE1(dynsched_balance, cores_count); ++ ++ SPDK_ENV_FOREACH_CORE(i) { ++ g_cores[i].thread_count = cores_info[i].threads_count; ++ g_cores[i].busy = cores_info[i].current_busy_tsc; ++ g_cores[i].idle = cores_info[i].current_idle_tsc; ++ SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]); ++ } ++ main_core = &g_cores[g_main_lcore]; ++ ++ /* Distribute threads in two passes, to make sure updated core stats are considered on each pass. ++ * 1) Move all idle threads to main core. */ ++ _foreach_thread(cores_info, _balance_idle); ++ /* 2) Distribute active threads across all cores. */ ++ _foreach_thread(cores_info, _balance_active); ++ ++ /* Switch unused cores to interrupt mode and switch cores to polled mode ++ * if they will be used after rebalancing */ ++ SPDK_ENV_FOREACH_CORE(i) { ++ reactor = spdk_reactor_get(i); ++ core = &cores_info[i]; ++ /* We can switch mode only if reactor already does not have any threads */ ++ if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) { ++ core->interrupt_mode = true; ++ } else if (g_cores[i].thread_count != 0) { ++ core->interrupt_mode = false; ++ if (i != g_main_lcore) { ++ /* If a thread is present on non g_main_lcore, ++ * it has to be busy. 
*/ ++ busy_threads_present = true; ++ } ++ } ++ } ++ ++ governor = spdk_governor_get(); ++ if (governor == NULL) { ++ return; ++ } ++ ++ /* Change main core frequency if needed */ ++ if (busy_threads_present) { ++ rc = governor->set_core_freq_max(g_main_lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore); ++ } ++ } else if (main_core->busy > main_core->idle) { ++ rc = governor->core_freq_up(g_main_lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore); ++ } ++ } else { ++ rc = governor->core_freq_down(g_main_lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore); ++ } ++ } ++} ++ ++struct json_scheduler_opts { ++ uint8_t load_limit; ++ uint8_t core_limit; ++ uint8_t core_busy; ++}; ++ ++static const struct spdk_json_object_decoder sched_decoders[] = { ++ {"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true}, ++ {"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true}, ++ {"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true}, ++}; ++ ++static int ++set_opts(const struct spdk_json_val *opts) ++{ ++ struct json_scheduler_opts scheduler_opts; ++ ++ scheduler_opts.load_limit = g_scheduler_load_limit; ++ scheduler_opts.core_limit = g_scheduler_core_limit; ++ scheduler_opts.core_busy = g_scheduler_core_busy; ++ ++ if (opts != NULL) { ++ if (spdk_json_decode_object_relaxed(opts, sched_decoders, ++ SPDK_COUNTOF(sched_decoders), &scheduler_opts)) { ++ SPDK_ERRLOG("Decoding scheduler opts JSON failed\n"); ++ return -1; ++ } ++ } ++ ++ SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit); ++ g_scheduler_load_limit = scheduler_opts.load_limit; ++ SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit); ++ g_scheduler_core_limit = scheduler_opts.core_limit; ++ SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy); ++ g_scheduler_core_busy = scheduler_opts.core_busy; ++ ++ return 0; ++} ++ ++static void ++get_opts(struct spdk_json_write_ctx *ctx) ++{ ++ spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit); ++ spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit); ++ spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy); ++} ++ ++static struct spdk_scheduler scheduler_dynamic = { ++ .name = "dynamic", ++ .init = init, ++ .deinit = deinit, ++ .balance = balance, ++ .set_opts = set_opts, ++ .get_opts = get_opts, ++}; ++ ++SPDK_SCHEDULER_REGISTER(scheduler_dynamic); +diff --git a/module/scheduler/gscheduler/Makefile b/module/scheduler/gscheduler/Makefile +index 20d71e6..31732b5 100644 +--- a/module/scheduler/gscheduler/Makefile ++++ b/module/scheduler/gscheduler/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 2 +-SO_MINOR := 0 +- +-LIBNAME = scheduler_gscheduler +-C_SRCS = gscheduler.c +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 2 ++SO_MINOR := 0 ++ ++LIBNAME = scheduler_gscheduler ++C_SRCS = gscheduler.c ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/scheduler/gscheduler/gscheduler.c b/module/scheduler/gscheduler/gscheduler.c +index 446619b..550616b 100644 +--- a/module/scheduler/gscheduler/gscheduler.c ++++ b/module/scheduler/gscheduler/gscheduler.c +@@ -1,81 +1,81 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. +- * All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/likely.h" +- +-#include "spdk_internal/event.h" +-#include "spdk/thread.h" +- +-#include "spdk/log.h" +-#include "spdk/env.h" +-#include "spdk/scheduler.h" +- +-static int +-init(void) +-{ +- return spdk_governor_set("dpdk_governor"); +-} +- +-static void +-deinit(void) +-{ +- spdk_governor_set(NULL); +-} +- +-static void +-balance(struct spdk_scheduler_core_info *cores, uint32_t core_count) +-{ +- struct spdk_governor *governor; +- struct spdk_scheduler_core_info *core; +- struct spdk_governor_capabilities capabilities; +- uint32_t i; +- int rc; +- +- governor = spdk_governor_get(); +- assert(governor != NULL); +- +- /* Gather active/idle statistics */ +- SPDK_ENV_FOREACH_CORE(i) { +- core = &cores[i]; +- +- rc = governor->get_core_capabilities(core->lcore, &capabilities); +- if (rc < 0) { +- SPDK_ERRLOG("failed to get capabilities for core: %u\n", core->lcore); +- return; +- } +- +- if (core->current_busy_tsc < (core->current_idle_tsc / 1000)) { +- rc = governor->set_core_freq_min(core->lcore); +- if (rc < 0) { +- SPDK_ERRLOG("setting to minimal frequency for core %u failed\n", core->lcore); +- } +- } else if (core->current_idle_tsc > core->current_busy_tsc) { +- rc = governor->core_freq_down(core->lcore); +- if (rc < 0) { +- SPDK_ERRLOG("lowering frequency for core %u failed\n", core->lcore); +- } +- } else if (core->current_idle_tsc < (core->current_busy_tsc / 1000)) { +- rc = governor->set_core_freq_max(core->lcore); +- if (rc < 0) { +- SPDK_ERRLOG("setting to maximal frequency for core %u failed\n", core->lcore); +- } +- } else { +- rc = governor->core_freq_up(core->lcore); +- if (rc < 0) { +- SPDK_ERRLOG("increasing frequency for core %u failed\n", core->lcore); +- } +- } +- } +-} +- +-static struct spdk_scheduler gscheduler = { +- .name = "gscheduler", +- .init = init, +- .deinit = deinit, +- .balance = balance, +-}; +- +-SPDK_SCHEDULER_REGISTER(gscheduler); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/likely.h" ++ ++#include "spdk_internal/event.h" ++#include "spdk/thread.h" ++ ++#include "spdk/log.h" ++#include "spdk/env.h" ++#include "spdk/scheduler.h" ++ ++static int ++init(void) ++{ ++ return spdk_governor_set("dpdk_governor"); ++} ++ ++static void ++deinit(void) ++{ ++ spdk_governor_set(NULL); ++} ++ ++static void ++balance(struct spdk_scheduler_core_info *cores, uint32_t core_count) ++{ ++ struct spdk_governor *governor; ++ struct spdk_scheduler_core_info *core; ++ struct spdk_governor_capabilities capabilities; ++ uint32_t i; ++ int rc; ++ ++ governor = spdk_governor_get(); ++ assert(governor != NULL); ++ ++ /* Gather active/idle statistics */ ++ SPDK_ENV_FOREACH_CORE(i) { ++ core = &cores[i]; ++ ++ rc = governor->get_core_capabilities(core->lcore, &capabilities); ++ if (rc < 0) { ++ SPDK_ERRLOG("failed to get capabilities for core: %u\n", core->lcore); ++ return; ++ } ++ ++ if (core->current_busy_tsc < (core->current_idle_tsc / 1000)) { ++ rc = governor->set_core_freq_min(core->lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("setting to minimal frequency for core %u failed\n", core->lcore); ++ } ++ } else if (core->current_idle_tsc > core->current_busy_tsc) { ++ rc = governor->core_freq_down(core->lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("lowering frequency for core %u failed\n", core->lcore); ++ } ++ } else if (core->current_idle_tsc < (core->current_busy_tsc / 1000)) { ++ rc = governor->set_core_freq_max(core->lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("setting to maximal frequency for core %u failed\n", core->lcore); ++ } ++ } else { ++ rc = governor->core_freq_up(core->lcore); ++ if (rc < 0) { ++ SPDK_ERRLOG("increasing frequency for core %u failed\n", core->lcore); ++ } ++ } ++ } ++} ++ ++static struct spdk_scheduler gscheduler = { ++ .name = "gscheduler", ++ .init = init, ++ .deinit = deinit, ++ .balance = balance, ++}; ++ ++SPDK_SCHEDULER_REGISTER(gscheduler); +diff --git a/module/sock/Makefile b/module/sock/Makefile +index af62886..998cec9 100644 +--- a/module/sock/Makefile ++++ b/module/sock/Makefile +@@ -1,19 +1,19 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-DIRS-y = posix +-ifeq ($(OS), Linux) +-DIRS-$(CONFIG_URING) += uring +-endif +- +-.PHONY: all clean $(DIRS-y) +- +-all: $(DIRS-y) +-clean: $(DIRS-y) +- +-include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++DIRS-y = posix ++ifeq ($(OS), Linux) ++DIRS-$(CONFIG_URING) += uring ++endif ++ ++.PHONY: all clean $(DIRS-y) ++ ++all: $(DIRS-y) ++clean: $(DIRS-y) ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk +diff --git a/module/sock/posix/Makefile b/module/sock/posix/Makefile +index e08c8ba..63f3a5c 100644 +--- a/module/sock/posix/Makefile ++++ b/module/sock/posix/Makefile +@@ -1,18 +1,18 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 4 +-SO_MINOR := 0 +- +-LIBNAME = sock_posix +-C_SRCS = posix.c +-LOCAL_SYS_LIBS = -lssl +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 4 ++SO_MINOR := 0 ++ ++LIBNAME = sock_posix ++C_SRCS = posix.c ++LOCAL_SYS_LIBS = -lssl ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/sock/posix/posix.c b/module/sock/posix/posix.c +index af064f5..7227c6e 100644 +--- a/module/sock/posix/posix.c ++++ b/module/sock/posix/posix.c +@@ -1,2093 +1,2093 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2018 Intel Corporation. All rights reserved. +- * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. +- * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +- */ +- +-#include "spdk/stdinc.h" +- +-#if defined(__FreeBSD__) +-#include +-#define SPDK_KEVENT +-#else +-#include +-#define SPDK_EPOLL +-#endif +- +-#if defined(__linux__) +-#include +-#endif +- +-#include "spdk/env.h" +-#include "spdk/log.h" +-#include "spdk/pipe.h" +-#include "spdk/sock.h" +-#include "spdk/util.h" +-#include "spdk/string.h" +-#include "spdk_internal/sock.h" +-#include "../sock_kernel.h" +- +-#include "openssl/crypto.h" +-#include "openssl/err.h" +-#include "openssl/ssl.h" +- +-#define MAX_TMPBUF 1024 +-#define PORTNUMLEN 32 +- +-#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) +-#define SPDK_ZEROCOPY +-#endif +- +-struct spdk_posix_sock { +- struct spdk_sock base; +- int fd; +- +- uint32_t sendmsg_idx; +- +- struct spdk_pipe *recv_pipe; +- void *recv_buf; +- int recv_buf_sz; +- bool pipe_has_data; +- bool socket_has_data; +- bool zcopy; +- +- int placement_id; +- +- SSL_CTX *ctx; +- SSL *ssl; +- +- TAILQ_ENTRY(spdk_posix_sock) link; +-}; +- +-TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); +- +-struct spdk_posix_sock_group_impl { +- struct spdk_sock_group_impl base; +- int fd; +- struct spdk_has_data_list socks_with_data; +- int placement_id; +-}; +- +-static struct spdk_sock_impl_opts g_spdk_posix_sock_impl_opts = { +- .recv_buf_size = MIN_SO_RCVBUF_SIZE, +- .send_buf_size = MIN_SO_SNDBUF_SIZE, +- .enable_recv_pipe = true, +- .enable_quickack = false, +- .enable_placement_id = PLACEMENT_NONE, +- .enable_zerocopy_send_server = true, +- .enable_zerocopy_send_client = false, +- .zerocopy_threshold = 0, +- .tls_version = 0, +- .enable_ktls = false, +- .psk_key = NULL, +- .psk_identity = NULL +-}; +- +-static struct spdk_sock_map g_map = { +- .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), +- .mtx = PTHREAD_MUTEX_INITIALIZER +-}; +- +-__attribute((destructor)) static void +-posix_sock_map_cleanup(void) +-{ +- spdk_sock_map_cleanup(&g_map); +-} +- +-#define __posix_sock(sock) (struct spdk_posix_sock *)sock +-#define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group +- +-static void +-posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, +- size_t len) +-{ +-#define FIELD_OK(field) \ +- offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len +- +-#define SET_FIELD(field) \ +- if (FIELD_OK(field)) { \ +- dest->field = src->field; \ +- } +- +- SET_FIELD(recv_buf_size); +- SET_FIELD(send_buf_size); 
+- SET_FIELD(enable_recv_pipe); +- SET_FIELD(enable_zerocopy_send); +- SET_FIELD(enable_quickack); +- SET_FIELD(enable_placement_id); +- SET_FIELD(enable_zerocopy_send_server); +- SET_FIELD(enable_zerocopy_send_client); +- SET_FIELD(zerocopy_threshold); +- SET_FIELD(tls_version); +- SET_FIELD(enable_ktls); +- SET_FIELD(psk_key); +- SET_FIELD(psk_identity); +- +-#undef SET_FIELD +-#undef FIELD_OK +-} +- +-static int +-posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) +-{ +- if (!opts || !len) { +- errno = EINVAL; +- return -1; +- } +- +- assert(sizeof(*opts) >= *len); +- memset(opts, 0, *len); +- +- posix_sock_copy_impl_opts(opts, &g_spdk_posix_sock_impl_opts, *len); +- *len = spdk_min(*len, sizeof(g_spdk_posix_sock_impl_opts)); +- +- return 0; +-} +- +-static int +-posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) +-{ +- if (!opts) { +- errno = EINVAL; +- return -1; +- } +- +- assert(sizeof(*opts) >= len); +- posix_sock_copy_impl_opts(&g_spdk_posix_sock_impl_opts, opts, len); +- +- return 0; +-} +- +-static void +-posix_opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest) +-{ +- /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ +- memcpy(dest, &g_spdk_posix_sock_impl_opts, sizeof(*dest)); +- +- if (opts->impl_opts != NULL) { +- assert(sizeof(*dest) >= opts->impl_opts_size); +- posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); +- } +-} +- +-static int +-posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, +- char *caddr, int clen, uint16_t *cport) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return -1; +- } +- +- switch (sa.ss_family) { +- case AF_UNIX: +- /* Acceptable connection types that don't have IPs */ +- return 0; +- case AF_INET: +- case AF_INET6: +- /* Code below will get IP addresses */ +- break; +- default: +- /* Unsupported socket family */ +- return -1; +- } +- +- rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); +- if (rc != 0) { +- SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); +- return -1; +- } +- +- if (sport) { +- if (sa.ss_family == AF_INET) { +- *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); +- } else if (sa.ss_family == AF_INET6) { +- *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); +- } +- } +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); +- return -1; +- } +- +- rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); +- if (rc != 0) { +- SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); +- return -1; +- } +- +- if (cport) { +- if (sa.ss_family == AF_INET) { +- *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); +- } else if (sa.ss_family == AF_INET6) { +- *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); +- } +- } +- +- return 0; +-} +- +-enum posix_sock_create_type { +- SPDK_SOCK_CREATE_LISTEN, +- SPDK_SOCK_CREATE_CONNECT, +-}; +- +-static int +-posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) +-{ +- uint8_t *new_buf; +- struct spdk_pipe *new_pipe; +- struct iovec siov[2]; +- struct 
iovec diov[2]; +- int sbytes; +- ssize_t bytes; +- +- if (sock->recv_buf_sz == sz) { +- return 0; +- } +- +- /* If the new size is 0, just free the pipe */ +- if (sz == 0) { +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- sock->recv_pipe = NULL; +- sock->recv_buf = NULL; +- return 0; +- } else if (sz < MIN_SOCK_PIPE_SIZE) { +- SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); +- return -1; +- } +- +- /* Round up to next 64 byte multiple */ +- new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t)); +- if (!new_buf) { +- SPDK_ERRLOG("socket recv buf allocation failed\n"); +- return -ENOMEM; +- } +- +- new_pipe = spdk_pipe_create(new_buf, sz + 1); +- if (new_pipe == NULL) { +- SPDK_ERRLOG("socket pipe allocation failed\n"); +- free(new_buf); +- return -ENOMEM; +- } +- +- if (sock->recv_pipe != NULL) { +- /* Pull all of the data out of the old pipe */ +- sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); +- if (sbytes > sz) { +- /* Too much data to fit into the new pipe size */ +- spdk_pipe_destroy(new_pipe); +- free(new_buf); +- return -EINVAL; +- } +- +- sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); +- assert(sbytes == sz); +- +- bytes = spdk_iovcpy(siov, 2, diov, 2); +- spdk_pipe_writer_advance(new_pipe, bytes); +- +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- } +- +- sock->recv_buf_sz = sz; +- sock->recv_buf = new_buf; +- sock->recv_pipe = new_pipe; +- +- return 0; +-} +- +-static int +-posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int min_size; +- int rc; +- +- assert(sock != NULL); +- +- if (_sock->impl_opts.enable_recv_pipe) { +- rc = posix_sock_alloc_pipe(sock, sz); +- if (rc) { +- return rc; +- } +- } +- +- /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and +- * g_spdk_posix_sock_impl_opts.recv_buf_size. */ +- min_size = spdk_max(MIN_SO_RCVBUF_SIZE, g_spdk_posix_sock_impl_opts.recv_buf_size); +- +- if (sz < min_size) { +- sz = min_size; +- } +- +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); +- if (rc < 0) { +- return rc; +- } +- +- _sock->impl_opts.recv_buf_size = sz; +- +- return 0; +-} +- +-static int +-posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int min_size; +- int rc; +- +- assert(sock != NULL); +- +- /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and +- * g_spdk_posix_sock_impl_opts.send_buf_size. 
*/ +- min_size = spdk_max(MIN_SO_SNDBUF_SIZE, g_spdk_posix_sock_impl_opts.send_buf_size); +- +- if (sz < min_size) { +- sz = min_size; +- } +- +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); +- if (rc < 0) { +- return rc; +- } +- +- _sock->impl_opts.send_buf_size = sz; +- +- return 0; +-} +- +-static void +-posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) +-{ +-#if defined(SPDK_ZEROCOPY) || defined(__linux__) +- int flag; +- int rc; +-#endif +- +-#if defined(SPDK_ZEROCOPY) +- flag = 1; +- +- if (enable_zero_copy) { +- /* Try to turn on zero copy sends */ +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); +- if (rc == 0) { +- sock->zcopy = true; +- } +- } +-#endif +- +-#if defined(__linux__) +- flag = 1; +- +- if (sock->base.impl_opts.enable_quickack) { +- rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); +- if (rc != 0) { +- SPDK_ERRLOG("quickack was failed to set\n"); +- } +- } +- +- spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, +- &sock->placement_id); +- +- if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) { +- /* Save placement_id */ +- spdk_sock_map_insert(&g_map, sock->placement_id, NULL); +- } +-#endif +-} +- +-static struct spdk_posix_sock * +-posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) +-{ +- struct spdk_posix_sock *sock; +- +- sock = calloc(1, sizeof(*sock)); +- if (sock == NULL) { +- SPDK_ERRLOG("sock allocation failed\n"); +- return NULL; +- } +- +- sock->fd = fd; +- memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); +- posix_sock_init(sock, enable_zero_copy); +- +- return sock; +-} +- +-static int +-posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts, +- struct spdk_sock_impl_opts *impl_opts) +-{ +- int fd; +- int val = 1; +- int rc, sz; +-#if defined(__linux__) +- int to; +-#endif +- +- fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); +- if (fd < 0) { +- /* error */ +- return -1; +- } +- +- sz = impl_opts->recv_buf_size; +- rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); +- if (rc) { +- /* Not fatal */ +- } +- +- sz = impl_opts->send_buf_size; +- rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); +- if (rc) { +- /* Not fatal */ +- } +- +- rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- /* error */ +- return -1; +- } +- rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- /* error */ +- return -1; +- } +- +-#if defined(SO_PRIORITY) +- if (opts->priority) { +- rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); +- if (rc != 0) { +- close(fd); +- /* error */ +- return -1; +- } +- } +-#endif +- +- if (res->ai_family == AF_INET6) { +- rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- /* error */ +- return -1; +- } +- } +- +- if (opts->ack_timeout) { +-#if defined(__linux__) +- to = opts->ack_timeout; +- rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); +- if (rc != 0) { +- close(fd); +- /* error */ +- return -1; +- } +-#else +- SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); +-#endif +- } +- +- return fd; +-} +- +-static unsigned int +-posix_sock_tls_psk_server_cb(SSL *ssl, +- const char *id, +- unsigned char *psk, +- unsigned int max_psk_len) +-{ +- long key_len; +- unsigned char *default_psk; +- struct spdk_sock_impl_opts *impl_opts; +- +- 
impl_opts = SSL_get_app_data(ssl); +- +- if (impl_opts->psk_key == NULL) { +- SPDK_ERRLOG("PSK is not set\n"); +- goto err; +- } +- SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity)); +- if (id == NULL) { +- SPDK_ERRLOG("Received empty PSK ID\n"); +- goto err; +- } +- SPDK_DEBUGLOG(sock_posix, "Received PSK ID '%s'\n", id); +- if (strcmp(impl_opts->psk_identity, id) != 0) { +- SPDK_ERRLOG("Unknown Client's PSK ID\n"); +- goto err; +- } +- +- SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK KEY %u\n", max_psk_len); +- default_psk = OPENSSL_hexstr2buf(impl_opts->psk_key, &key_len); +- if (default_psk == NULL) { +- SPDK_ERRLOG("Could not unhexlify PSK\n"); +- goto err; +- } +- if (key_len > max_psk_len) { +- SPDK_ERRLOG("Insufficient buffer size to copy PSK\n"); +- OPENSSL_free(default_psk); +- goto err; +- } +- +- memcpy(psk, default_psk, key_len); +- OPENSSL_free(default_psk); +- +- return key_len; +- +-err: +- return 0; +-} +- +-static unsigned int +-posix_sock_tls_psk_client_cb(SSL *ssl, const char *hint, +- char *identity, +- unsigned int max_identity_len, +- unsigned char *psk, +- unsigned int max_psk_len) +-{ +- long key_len; +- unsigned char *default_psk; +- struct spdk_sock_impl_opts *impl_opts; +- +- impl_opts = SSL_get_app_data(ssl); +- +- if (hint) { +- SPDK_DEBUGLOG(sock_posix, "Received PSK identity hint '%s'\n", hint); +- } +- +- if (impl_opts->psk_key == NULL) { +- SPDK_ERRLOG("PSK is not set\n"); +- goto err; +- } +- default_psk = OPENSSL_hexstr2buf(impl_opts->psk_key, &key_len); +- if (default_psk == NULL) { +- SPDK_ERRLOG("Could not unhexlify PSK\n"); +- goto err; +- } +- if ((strlen(impl_opts->psk_identity) + 1 > max_identity_len) +- || (key_len > max_psk_len)) { +- OPENSSL_free(default_psk); +- SPDK_ERRLOG("PSK ID or Key buffer is not sufficient\n"); +- goto err; +- } +- spdk_strcpy_pad(identity, impl_opts->psk_identity, strlen(impl_opts->psk_identity), 0); +- SPDK_DEBUGLOG(sock_posix, "Sending PSK identity '%s'\n", identity); +- +- memcpy(psk, default_psk, key_len); +- SPDK_DEBUGLOG(sock_posix, "Provided out-of-band (OOB) PSK for TLS1.3 client\n"); +- OPENSSL_free(default_psk); +- +- return key_len; +- +-err: +- return 0; +-} +- +-static SSL_CTX * +-posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts, +- struct spdk_sock_impl_opts *impl_opts) +-{ +- SSL_CTX *ctx; +- int tls_version = 0; +- bool ktls_enabled = false; +-#ifdef SSL_OP_ENABLE_KTLS +- long options; +-#endif +- +- SSL_library_init(); +- OpenSSL_add_all_algorithms(); +- SSL_load_error_strings(); +- /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ +- ctx = SSL_CTX_new(method); +- if (!ctx) { +- SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); +- return NULL; +- } +- SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); +- +- switch (impl_opts->tls_version) { +- case 0: +- /* auto-negotioation */ +- break; +- case SPDK_TLS_VERSION_1_1: +- tls_version = TLS1_1_VERSION; +- break; +- case SPDK_TLS_VERSION_1_2: +- tls_version = TLS1_2_VERSION; +- break; +- case SPDK_TLS_VERSION_1_3: +- tls_version = TLS1_3_VERSION; +- break; +- default: +- SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version); +- goto err; +- } +- +- if (tls_version) { +- SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version, +- tls_version); +- if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { +- SPDK_ERRLOG("Unable to set Min TLS version to 
'%d'='0x%X\n", impl_opts->tls_version, tls_version); +- goto err; +- } +- if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { +- SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); +- goto err; +- } +- } +- if (impl_opts->enable_ktls) { +- SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); +-#ifdef SSL_OP_ENABLE_KTLS +- options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); +- ktls_enabled = options & SSL_OP_ENABLE_KTLS; +-#else +- ktls_enabled = false; +-#endif +- if (!ktls_enabled) { +- SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); +- goto err; +- } +- } +- +- return ctx; +- +-err: +- SSL_CTX_free(ctx); +- return NULL; +-} +- +-static SSL * +-ssl_sock_connect_loop(SSL_CTX *ctx, int fd, struct spdk_sock_impl_opts *impl_opts) +-{ +- int rc; +- SSL *ssl; +- int ssl_get_error; +- +- ssl = SSL_new(ctx); +- if (!ssl) { +- SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); +- return NULL; +- } +- SSL_set_fd(ssl, fd); +- SSL_set_app_data(ssl, impl_opts); +- SSL_set_psk_client_callback(ssl, posix_sock_tls_psk_client_cb); +- SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- while ((rc = SSL_connect(ssl)) != 1) { +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- ssl_get_error = SSL_get_error(ssl, rc); +- SPDK_DEBUGLOG(sock_posix, "SSL_connect failed %d = SSL_connect(%p), %d = SSL_get_error(%p, %d)\n", +- rc, ssl, ssl_get_error, ssl, rc); +- switch (ssl_get_error) { +- case SSL_ERROR_WANT_READ: +- case SSL_ERROR_WANT_WRITE: +- continue; +- default: +- break; +- } +- SPDK_ERRLOG("SSL_connect() failed, errno = %d\n", errno); +- SSL_free(ssl); +- return NULL; +- } +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", +- SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); +- return ssl; +-} +- +-static SSL * +-ssl_sock_accept_loop(SSL_CTX *ctx, int fd, struct spdk_sock_impl_opts *impl_opts) +-{ +- int rc; +- SSL *ssl; +- int ssl_get_error; +- +- ssl = SSL_new(ctx); +- if (!ssl) { +- SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); +- return NULL; +- } +- SSL_set_fd(ssl, fd); +- SSL_set_app_data(ssl, impl_opts); +- SSL_set_psk_server_callback(ssl, posix_sock_tls_psk_server_cb); +- SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- while ((rc = SSL_accept(ssl)) != 1) { +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- ssl_get_error = SSL_get_error(ssl, rc); +- SPDK_DEBUGLOG(sock_posix, "SSL_accept failed %d = SSL_accept(%p), %d = SSL_get_error(%p, %d)\n", rc, +- ssl, ssl_get_error, ssl, rc); +- switch (ssl_get_error) { +- case SSL_ERROR_WANT_READ: +- case SSL_ERROR_WANT_WRITE: +- continue; +- default: +- break; +- } +- SPDK_ERRLOG("SSL_accept() failed, errno = %d\n", errno); +- SSL_free(ssl); +- return NULL; +- } +- SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); +- SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", +- SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); +- return ssl; +-} +- +-static ssize_t 
+-SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) +-{ +- int i, rc = 0; +- ssize_t total = 0; +- +- for (i = 0; i < iovcnt; i++) { +- rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); +- +- if (rc > 0) { +- total += rc; +- } +- if (rc != (int)iov[i].iov_len) { +- break; +- } +- } +- if (total > 0) { +- errno = 0; +- return total; +- } +- switch (SSL_get_error(ssl, rc)) { +- case SSL_ERROR_ZERO_RETURN: +- errno = ENOTCONN; +- return 0; +- case SSL_ERROR_WANT_READ: +- case SSL_ERROR_WANT_WRITE: +- case SSL_ERROR_WANT_CONNECT: +- case SSL_ERROR_WANT_ACCEPT: +- case SSL_ERROR_WANT_X509_LOOKUP: +- case SSL_ERROR_WANT_ASYNC: +- case SSL_ERROR_WANT_ASYNC_JOB: +- case SSL_ERROR_WANT_CLIENT_HELLO_CB: +- errno = EAGAIN; +- return -1; +- case SSL_ERROR_SYSCALL: +- case SSL_ERROR_SSL: +- errno = ENOTCONN; +- return -1; +- default: +- errno = ENOTCONN; +- return -1; +- } +-} +- +-static ssize_t +-SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) +-{ +- int i, rc = 0; +- ssize_t total = 0; +- +- for (i = 0; i < iovcnt; i++) { +- rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); +- +- if (rc > 0) { +- total += rc; +- } +- if (rc != (int)iov[i].iov_len) { +- break; +- } +- } +- if (total > 0) { +- errno = 0; +- return total; +- } +- switch (SSL_get_error(ssl, rc)) { +- case SSL_ERROR_ZERO_RETURN: +- errno = ENOTCONN; +- return 0; +- case SSL_ERROR_WANT_READ: +- case SSL_ERROR_WANT_WRITE: +- case SSL_ERROR_WANT_CONNECT: +- case SSL_ERROR_WANT_ACCEPT: +- case SSL_ERROR_WANT_X509_LOOKUP: +- case SSL_ERROR_WANT_ASYNC: +- case SSL_ERROR_WANT_ASYNC_JOB: +- case SSL_ERROR_WANT_CLIENT_HELLO_CB: +- errno = EAGAIN; +- return -1; +- case SSL_ERROR_SYSCALL: +- case SSL_ERROR_SSL: +- errno = ENOTCONN; +- return -1; +- default: +- errno = ENOTCONN; +- return -1; +- } +-} +- +-static struct spdk_sock * +-posix_sock_create(const char *ip, int port, +- enum posix_sock_create_type type, +- struct spdk_sock_opts *opts, +- bool enable_ssl) +-{ +- struct spdk_posix_sock *sock; +- struct spdk_sock_impl_opts impl_opts; +- char buf[MAX_TMPBUF]; +- char portnum[PORTNUMLEN]; +- char *p; +- struct addrinfo hints, *res, *res0; +- int fd, flag; +- int rc; +- bool enable_zcopy_user_opts = true; +- bool enable_zcopy_impl_opts = true; +- SSL_CTX *ctx = 0; +- SSL *ssl = 0; +- +- assert(opts != NULL); +- posix_opts_get_impl_opts(opts, &impl_opts); +- +- if (ip == NULL) { +- return NULL; +- } +- if (ip[0] == '[') { +- snprintf(buf, sizeof(buf), "%s", ip + 1); +- p = strchr(buf, ']'); +- if (p != NULL) { +- *p = '\0'; +- } +- ip = (const char *) &buf[0]; +- } +- +- snprintf(portnum, sizeof portnum, "%d", port); +- memset(&hints, 0, sizeof hints); +- hints.ai_family = PF_UNSPEC; +- hints.ai_socktype = SOCK_STREAM; +- hints.ai_flags = AI_NUMERICSERV; +- hints.ai_flags |= AI_PASSIVE; +- hints.ai_flags |= AI_NUMERICHOST; +- rc = getaddrinfo(ip, portnum, &hints, &res0); +- if (rc != 0) { +- SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); +- return NULL; +- } +- +- /* try listen */ +- fd = -1; +- for (res = res0; res != NULL; res = res->ai_next) { +-retry: +- fd = posix_fd_create(res, opts, &impl_opts); +- if (fd < 0) { +- continue; +- } +- if (type == SPDK_SOCK_CREATE_LISTEN) { +- rc = bind(fd, res->ai_addr, res->ai_addrlen); +- if (rc != 0) { +- SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); +- switch (errno) { +- case EINTR: +- /* interrupted? */ +- close(fd); +- goto retry; +- case EADDRNOTAVAIL: +- SPDK_ERRLOG("IP address %s not available. 
" +- "Verify IP address in config file " +- "and make sure setup script is " +- "run before starting spdk app.\n", ip); +- /* FALLTHROUGH */ +- default: +- /* try next family */ +- close(fd); +- fd = -1; +- continue; +- } +- } +- /* bind OK */ +- rc = listen(fd, 512); +- if (rc != 0) { +- SPDK_ERRLOG("listen() failed, errno = %d\n", errno); +- close(fd); +- fd = -1; +- break; +- } +- enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; +- } else if (type == SPDK_SOCK_CREATE_CONNECT) { +- rc = connect(fd, res->ai_addr, res->ai_addrlen); +- if (rc != 0) { +- SPDK_ERRLOG("connect() failed, errno = %d\n", errno); +- /* try next family */ +- close(fd); +- fd = -1; +- continue; +- } +- enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; +- if (enable_ssl) { +- ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts); +- if (!ctx) { +- SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); +- close(fd); +- fd = -1; +- break; +- } +- ssl = ssl_sock_connect_loop(ctx, fd, &impl_opts); +- if (!ssl) { +- SPDK_ERRLOG("ssl_sock_connect_loop() failed, errno = %d\n", errno); +- close(fd); +- fd = -1; +- SSL_CTX_free(ctx); +- break; +- } +- } +- } +- +- flag = fcntl(fd, F_GETFL); +- if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { +- SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); +- SSL_free(ssl); +- SSL_CTX_free(ctx); +- close(fd); +- fd = -1; +- break; +- } +- break; +- } +- freeaddrinfo(res0); +- +- if (fd < 0) { +- return NULL; +- } +- +- /* Only enable zero copy for non-loopback and non-ssl sockets. */ +- enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl; +- +- sock = posix_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); +- if (sock == NULL) { +- SPDK_ERRLOG("sock allocation failed\n"); +- SSL_free(ssl); +- SSL_CTX_free(ctx); +- close(fd); +- return NULL; +- } +- +- if (ctx) { +- sock->ctx = ctx; +- } +- +- if (ssl) { +- sock->ssl = ssl; +- } +- +- return &sock->base; +-} +- +-static struct spdk_sock * +-posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); +-} +- +-static struct spdk_sock * +-posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); +-} +- +-static struct spdk_sock * +-_posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc, fd; +- struct spdk_posix_sock *new_sock; +- int flag; +- SSL_CTX *ctx = 0; +- SSL *ssl = 0; +- +- memset(&sa, 0, sizeof(sa)); +- salen = sizeof(sa); +- +- assert(sock != NULL); +- +- rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); +- +- if (rc == -1) { +- return NULL; +- } +- +- fd = rc; +- +- flag = fcntl(fd, F_GETFL); +- if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { +- SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); +- close(fd); +- return NULL; +- } +- +-#if defined(SO_PRIORITY) +- /* The priority is not inherited, so call this function again */ +- if (sock->base.opts.priority) { +- rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); +- if (rc != 0) { +- close(fd); +- return NULL; +- } +- } +-#endif +- +- /* Establish SSL connection */ +- if (enable_ssl) { +- ctx = 
posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts); +- if (!ctx) { +- SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); +- close(fd); +- return NULL; +- } +- ssl = ssl_sock_accept_loop(ctx, fd, &sock->base.impl_opts); +- if (!ssl) { +- SPDK_ERRLOG("ssl_sock_accept_loop() failed, errno = %d\n", errno); +- close(fd); +- SSL_CTX_free(ctx); +- return NULL; +- } +- } +- +- /* Inherit the zero copy feature from the listen socket */ +- new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); +- if (new_sock == NULL) { +- close(fd); +- SSL_free(ssl); +- SSL_CTX_free(ctx); +- return NULL; +- } +- +- if (ctx) { +- new_sock->ctx = ctx; +- } +- +- if (ssl) { +- new_sock->ssl = ssl; +- } +- +- return &new_sock->base; +-} +- +-static struct spdk_sock * +-posix_sock_accept(struct spdk_sock *_sock) +-{ +- return _posix_sock_accept(_sock, false); +-} +- +-static int +-posix_sock_close(struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- +- assert(TAILQ_EMPTY(&_sock->pending_reqs)); +- +- if (sock->ssl != NULL) { +- SSL_shutdown(sock->ssl); +- } +- +- /* If the socket fails to close, the best choice is to +- * leak the fd but continue to free the rest of the sock +- * memory. */ +- close(sock->fd); +- +- SSL_free(sock->ssl); +- SSL_CTX_free(sock->ctx); +- +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- free(sock); +- +- return 0; +-} +- +-#ifdef SPDK_ZEROCOPY +-static int +-_sock_check_zcopy(struct spdk_sock *sock) +-{ +- struct spdk_posix_sock *psock = __posix_sock(sock); +- struct msghdr msgh = {}; +- uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; +- ssize_t rc; +- struct sock_extended_err *serr; +- struct cmsghdr *cm; +- uint32_t idx; +- struct spdk_sock_request *req, *treq; +- bool found; +- +- msgh.msg_control = buf; +- msgh.msg_controllen = sizeof(buf); +- +- while (true) { +- rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); +- +- if (rc < 0) { +- if (errno == EWOULDBLOCK || errno == EAGAIN) { +- return 0; +- } +- +- if (!TAILQ_EMPTY(&sock->pending_reqs)) { +- SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); +- } else { +- SPDK_WARNLOG("Recvmsg yielded an error!\n"); +- } +- return 0; +- } +- +- cm = CMSG_FIRSTHDR(&msgh); +- if (!(cm && +- ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || +- (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { +- SPDK_WARNLOG("Unexpected cmsg level or type!\n"); +- return 0; +- } +- +- serr = (struct sock_extended_err *)CMSG_DATA(cm); +- if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { +- SPDK_WARNLOG("Unexpected extended error origin\n"); +- return 0; +- } +- +- /* Most of the time, the pending_reqs array is in the exact +- * order we need such that all of the requests to complete are +- * in order, in the front. It is guaranteed that all requests +- * belonging to the same sendmsg call are sequential, so once +- * we encounter one match we can stop looping as soon as a +- * non-match is found. +- */ +- for (idx = serr->ee_info; idx <= serr->ee_data; idx++) { +- found = false; +- TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { +- if (!req->internal.is_zcopy) { +- /* This wasn't a zcopy request. 
It was just waiting in line to complete */ +- rc = spdk_sock_request_put(sock, req, 0); +- if (rc < 0) { +- return rc; +- } +- } else if (req->internal.offset == idx) { +- found = true; +- rc = spdk_sock_request_put(sock, req, 0); +- if (rc < 0) { +- return rc; +- } +- } else if (found) { +- break; +- } +- } +- } +- } +- +- return 0; +-} +-#endif +- +-static int +-_sock_flush(struct spdk_sock *sock) +-{ +- struct spdk_posix_sock *psock = __posix_sock(sock); +- struct msghdr msg = {}; +- int flags; +- struct iovec iovs[IOV_BATCH_SIZE]; +- int iovcnt; +- int retval; +- struct spdk_sock_request *req; +- int i; +- ssize_t rc, sent; +- unsigned int offset; +- size_t len; +- bool is_zcopy = false; +- +- /* Can't flush from within a callback or we end up with recursive calls */ +- if (sock->cb_cnt > 0) { +- errno = EAGAIN; +- return -1; +- } +- +-#ifdef SPDK_ZEROCOPY +- if (psock->zcopy) { +- flags = MSG_ZEROCOPY | MSG_NOSIGNAL; +- } else +-#endif +- { +- flags = MSG_NOSIGNAL; +- } +- +- iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); +- if (iovcnt == 0) { +- return 0; +- } +- +-#ifdef SPDK_ZEROCOPY +- is_zcopy = flags & MSG_ZEROCOPY; +-#endif +- +- /* Perform the vectored write */ +- msg.msg_iov = iovs; +- msg.msg_iovlen = iovcnt; +- +- if (psock->ssl) { +- rc = SSL_writev(psock->ssl, iovs, iovcnt); +- } else { +- rc = sendmsg(psock->fd, &msg, flags); +- } +- if (rc <= 0) { +- if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { +- errno = EAGAIN; +- } +- return -1; +- } +- +- sent = rc; +- +- if (is_zcopy) { +- /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the +- * req->internal.offset, so sendmsg_idx should not be zero */ +- if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { +- psock->sendmsg_idx = 1; +- } else { +- psock->sendmsg_idx++; +- } +- } +- +- /* Consume the requests that were actually written */ +- req = TAILQ_FIRST(&sock->queued_reqs); +- while (req) { +- offset = req->internal.offset; +- +- /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ +- req->internal.is_zcopy = is_zcopy; +- +- for (i = 0; i < req->iovcnt; i++) { +- /* Advance by the offset first */ +- if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { +- offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; +- continue; +- } +- +- /* Calculate the remaining length of this element */ +- len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; +- +- if (len > (size_t)rc) { +- /* This element was partially sent. */ +- req->internal.offset += rc; +- return sent; +- } +- +- offset = 0; +- req->internal.offset += len; +- rc -= len; +- } +- +- /* Handled a full request. */ +- spdk_sock_request_pend(sock, req); +- +- if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { +- /* The sendmsg syscall above isn't currently asynchronous, +- * so it's already done. */ +- retval = spdk_sock_request_put(sock, req, 0); +- if (retval) { +- break; +- } +- } else { +- /* Re-use the offset field to hold the sendmsg call index. The +- * index is 0 based, so subtract one here because we've already +- * incremented above. 
*/ +- req->internal.offset = psock->sendmsg_idx - 1; +- } +- +- if (rc == 0) { +- break; +- } +- +- req = TAILQ_FIRST(&sock->queued_reqs); +- } +- +- return sent; +-} +- +-static int +-posix_sock_flush(struct spdk_sock *sock) +-{ +-#ifdef SPDK_ZEROCOPY +- struct spdk_posix_sock *psock = __posix_sock(sock); +- +- if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { +- _sock_check_zcopy(sock); +- } +-#endif +- +- return _sock_flush(sock); +-} +- +-static ssize_t +-posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) +-{ +- struct iovec siov[2]; +- int sbytes; +- ssize_t bytes; +- struct spdk_posix_sock_group_impl *group; +- +- sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); +- if (sbytes < 0) { +- errno = EINVAL; +- return -1; +- } else if (sbytes == 0) { +- errno = EAGAIN; +- return -1; +- } +- +- bytes = spdk_iovcpy(siov, 2, diov, diovcnt); +- +- if (bytes == 0) { +- /* The only way this happens is if diov is 0 length */ +- errno = EINVAL; +- return -1; +- } +- +- spdk_pipe_reader_advance(sock->recv_pipe, bytes); +- +- /* If we drained the pipe, mark it appropriately */ +- if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { +- assert(sock->pipe_has_data == true); +- +- group = __posix_group_impl(sock->base.group_impl); +- if (group && !sock->socket_has_data) { +- TAILQ_REMOVE(&group->socks_with_data, sock, link); +- } +- +- sock->pipe_has_data = false; +- } +- +- return bytes; +-} +- +-static inline ssize_t +-posix_sock_read(struct spdk_posix_sock *sock) +-{ +- struct iovec iov[2]; +- int bytes_avail, bytes_recvd; +- struct spdk_posix_sock_group_impl *group; +- +- bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); +- +- if (bytes_avail <= 0) { +- return bytes_avail; +- } +- +- if (sock->ssl) { +- bytes_recvd = SSL_readv(sock->ssl, iov, 2); +- } else { +- bytes_recvd = readv(sock->fd, iov, 2); +- } +- +- assert(sock->pipe_has_data == false); +- +- if (bytes_recvd <= 0) { +- /* Errors count as draining the socket data */ +- if (sock->base.group_impl && sock->socket_has_data) { +- group = __posix_group_impl(sock->base.group_impl); +- TAILQ_REMOVE(&group->socks_with_data, sock, link); +- } +- +- sock->socket_has_data = false; +- +- return bytes_recvd; +- } +- +- spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); +- +-#if DEBUG +- if (sock->base.group_impl) { +- assert(sock->socket_has_data == true); +- } +-#endif +- +- sock->pipe_has_data = true; +- if (bytes_recvd < bytes_avail) { +- /* We drained the kernel socket entirely. 
*/ +- sock->socket_has_data = false; +- } +- +- return bytes_recvd; +-} +- +-static ssize_t +-posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); +- int rc, i; +- size_t len; +- +- if (sock->recv_pipe == NULL) { +- assert(sock->pipe_has_data == false); +- if (group && sock->socket_has_data) { +- sock->socket_has_data = false; +- TAILQ_REMOVE(&group->socks_with_data, sock, link); +- } +- if (sock->ssl) { +- return SSL_readv(sock->ssl, iov, iovcnt); +- } else { +- return readv(sock->fd, iov, iovcnt); +- } +- } +- +- /* If the socket is not in a group, we must assume it always has +- * data waiting for us because it is not epolled */ +- if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { +- /* If the user is receiving a sufficiently large amount of data, +- * receive directly to their buffers. */ +- len = 0; +- for (i = 0; i < iovcnt; i++) { +- len += iov[i].iov_len; +- } +- +- if (len >= MIN_SOCK_PIPE_SIZE) { +- /* TODO: Should this detect if kernel socket is drained? */ +- if (sock->ssl) { +- return SSL_readv(sock->ssl, iov, iovcnt); +- } else { +- return readv(sock->fd, iov, iovcnt); +- } +- } +- +- /* Otherwise, do a big read into our pipe */ +- rc = posix_sock_read(sock); +- if (rc <= 0) { +- return rc; +- } +- } +- +- return posix_sock_recv_from_pipe(sock, iov, iovcnt); +-} +- +-static ssize_t +-posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) +-{ +- struct iovec iov[1]; +- +- iov[0].iov_base = buf; +- iov[0].iov_len = len; +- +- return posix_sock_readv(sock, iov, 1); +-} +- +-static void +-posix_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req) +-{ +- req->cb_fn(req->cb_arg, -ENOTSUP); +-} +- +-static ssize_t +-posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int rc; +- +- /* In order to process a writev, we need to flush any asynchronous writes +- * first. */ +- rc = _sock_flush(_sock); +- if (rc < 0) { +- return rc; +- } +- +- if (!TAILQ_EMPTY(&_sock->queued_reqs)) { +- /* We weren't able to flush all requests */ +- errno = EAGAIN; +- return -1; +- } +- +- if (sock->ssl) { +- return SSL_writev(sock->ssl, iov, iovcnt); +- } else { +- return writev(sock->fd, iov, iovcnt); +- } +-} +- +-static void +-posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) +-{ +- int rc; +- +- spdk_sock_request_queue(sock, req); +- +- /* If there are a sufficient number queued, just flush them out immediately. 
*/ +- if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { +- rc = _sock_flush(sock); +- if (rc < 0 && errno != EAGAIN) { +- spdk_sock_abort_requests(sock); +- } +- } +-} +- +-static int +-posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int val; +- int rc; +- +- assert(sock != NULL); +- +- val = nbytes; +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); +- if (rc != 0) { +- return -1; +- } +- return 0; +-} +- +-static bool +-posix_sock_is_ipv6(struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return false; +- } +- +- return (sa.ss_family == AF_INET6); +-} +- +-static bool +-posix_sock_is_ipv4(struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return false; +- } +- +- return (sa.ss_family == AF_INET); +-} +- +-static bool +-posix_sock_is_connected(struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- uint8_t byte; +- int rc; +- +- rc = recv(sock->fd, &byte, 1, MSG_PEEK); +- if (rc == 0) { +- return false; +- } +- +- if (rc < 0) { +- if (errno == EAGAIN || errno == EWOULDBLOCK) { +- return true; +- } +- +- return false; +- } +- +- return true; +-} +- +-static struct spdk_sock_group_impl * +-posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) +-{ +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- struct spdk_sock_group_impl *group_impl; +- +- if (sock->placement_id != -1) { +- spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); +- return group_impl; +- } +- +- return NULL; +-} +- +-static struct spdk_sock_group_impl * +-posix_sock_group_impl_create(void) +-{ +- struct spdk_posix_sock_group_impl *group_impl; +- int fd; +- +-#if defined(SPDK_EPOLL) +- fd = epoll_create1(0); +-#elif defined(SPDK_KEVENT) +- fd = kqueue(); +-#endif +- if (fd == -1) { +- return NULL; +- } +- +- group_impl = calloc(1, sizeof(*group_impl)); +- if (group_impl == NULL) { +- SPDK_ERRLOG("group_impl allocation failed\n"); +- close(fd); +- return NULL; +- } +- +- group_impl->fd = fd; +- TAILQ_INIT(&group_impl->socks_with_data); +- group_impl->placement_id = -1; +- +- if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { +- spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); +- group_impl->placement_id = spdk_env_get_current_core(); +- } +- +- return &group_impl->base; +-} +- +-static void +-posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, +- int placement_id) +-{ +-#if defined(SO_MARK) +- int rc; +- +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, +- &placement_id, sizeof(placement_id)); +- if (rc != 0) { +- /* Not fatal */ +- SPDK_ERRLOG("Error setting SO_MARK\n"); +- return; +- } +- +- rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); +- if (rc != 0) { +- /* Not fatal */ +- SPDK_ERRLOG("Failed to insert 
sock group into map: %d\n", rc); +- return; +- } +- +- sock->placement_id = placement_id; +-#endif +-} +- +-static void +-posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); +- +- if (group->placement_id == -1) { +- group->placement_id = spdk_sock_map_find_free(&g_map); +- +- /* If a free placement id is found, update existing sockets in this group */ +- if (group->placement_id != -1) { +- struct spdk_sock *sock, *tmp; +- +- TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { +- posix_sock_mark(group, __posix_sock(sock), group->placement_id); +- } +- } +- } +- +- if (group->placement_id != -1) { +- /* +- * group placement id is already determined for this poll group. +- * Mark socket with group's placement id. +- */ +- posix_sock_mark(group, __posix_sock(_sock), group->placement_id); +- } +-} +- +-static int +-posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int rc; +- +-#if defined(SPDK_EPOLL) +- struct epoll_event event; +- +- memset(&event, 0, sizeof(event)); +- /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ +- event.events = EPOLLIN | EPOLLERR; +- event.data.ptr = sock; +- +- rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); +-#elif defined(SPDK_KEVENT) +- struct kevent event; +- struct timespec ts = {0}; +- +- EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); +- +- rc = kevent(group->fd, &event, 1, NULL, 0, &ts); +-#endif +- +- if (rc != 0) { +- return rc; +- } +- +- /* switched from another polling group due to scheduling */ +- if (spdk_unlikely(sock->recv_pipe != NULL && +- (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { +- sock->pipe_has_data = true; +- sock->socket_has_data = false; +- TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); +- } +- +- if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) { +- posix_sock_update_mark(_group, _sock); +- } else if (sock->placement_id != -1) { +- rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); +- /* Do not treat this as an error. The system will continue running. */ +- } +- } +- +- return rc; +-} +- +-static int +-posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +-{ +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); +- struct spdk_posix_sock *sock = __posix_sock(_sock); +- int rc; +- +- if (sock->pipe_has_data || sock->socket_has_data) { +- TAILQ_REMOVE(&group->socks_with_data, sock, link); +- sock->pipe_has_data = false; +- sock->socket_has_data = false; +- } +- +- if (sock->placement_id != -1) { +- spdk_sock_map_release(&g_map, sock->placement_id); +- } +- +-#if defined(SPDK_EPOLL) +- struct epoll_event event; +- +- /* Event parameter is ignored but some old kernel version still require it. 
*/ +- rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); +-#elif defined(SPDK_KEVENT) +- struct kevent event; +- struct timespec ts = {0}; +- +- EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); +- +- rc = kevent(group->fd, &event, 1, NULL, 0, &ts); +- if (rc == 0 && event.flags & EV_ERROR) { +- rc = -1; +- errno = event.data; +- } +-#endif +- +- spdk_sock_abort_requests(_sock); +- +- return rc; +-} +- +-static int +-posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, +- struct spdk_sock **socks) +-{ +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); +- struct spdk_sock *sock, *tmp; +- int num_events, i, rc; +- struct spdk_posix_sock *psock, *ptmp; +-#if defined(SPDK_EPOLL) +- struct epoll_event events[MAX_EVENTS_PER_POLL]; +-#elif defined(SPDK_KEVENT) +- struct kevent events[MAX_EVENTS_PER_POLL]; +- struct timespec ts = {0}; +-#endif +- +-#ifdef SPDK_ZEROCOPY +- /* When all of the following conditions are met +- * - non-blocking socket +- * - zero copy is enabled +- * - interrupts suppressed (i.e. busy polling) +- * - the NIC tx queue is full at the time sendmsg() is called +- * - epoll_wait determines there is an EPOLLIN event for the socket +- * then we can get into a situation where data we've sent is queued +- * up in the kernel network stack, but interrupts have been suppressed +- * because other traffic is flowing so the kernel misses the signal +- * to flush the software tx queue. If there wasn't incoming data +- * pending on the socket, then epoll_wait would have been sufficient +- * to kick off the send operation, but since there is a pending event +- * epoll_wait does not trigger the necessary operation. +- * +- * We deal with this by checking for all of the above conditions and +- * additionally looking for EPOLLIN events that were not consumed from +- * the last poll loop. We take this to mean that the upper layer is +- * unable to consume them because it is blocked waiting for resources +- * to free up, and those resources are most likely freed in response +- * to a pending asynchronous write completing. +- * +- * Additionally, sockets that have the same placement_id actually share +- * an underlying hardware queue. That means polling one of them is +- * equivalent to polling all of them. As a quick mechanism to avoid +- * making extra poll() calls, stash the last placement_id during the loop +- * and only poll if it's not the same. The overwhelmingly common case +- * is that all sockets in this list have the same placement_id because +- * SPDK is intentionally grouping sockets by that value, so even +- * though this won't stop all extra calls to poll(), it's very fast +- * and will catch all of them in practice. +- */ +- int last_placement_id = -1; +- +- TAILQ_FOREACH(psock, &group->socks_with_data, link) { +- if (psock->zcopy && psock->placement_id >= 0 && +- psock->placement_id != last_placement_id) { +- struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; +- +- poll(&pfd, 1, 0); +- last_placement_id = psock->placement_id; +- } +- } +-#endif +- +- /* This must be a TAILQ_FOREACH_SAFE because while flushing, +- * a completion callback could remove the sock from the +- * group. 
*/ +- TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { +- rc = _sock_flush(sock); +- if (rc < 0 && errno != EAGAIN) { +- spdk_sock_abort_requests(sock); +- } +- } +- +- assert(max_events > 0); +- +-#if defined(SPDK_EPOLL) +- num_events = epoll_wait(group->fd, events, max_events, 0); +-#elif defined(SPDK_KEVENT) +- num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); +-#endif +- +- if (num_events == -1) { +- return -1; +- } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { +- sock = TAILQ_FIRST(&_group->socks); +- psock = __posix_sock(sock); +- /* poll() is called here to busy poll the queue associated with +- * first socket in list and potentially reap incoming data. +- */ +- if (sock->opts.priority) { +- struct pollfd pfd = {0, 0, 0}; +- +- pfd.fd = psock->fd; +- pfd.events = POLLIN | POLLERR; +- poll(&pfd, 1, 0); +- } +- } +- +- for (i = 0; i < num_events; i++) { +-#if defined(SPDK_EPOLL) +- sock = events[i].data.ptr; +- psock = __posix_sock(sock); +- +-#ifdef SPDK_ZEROCOPY +- if (events[i].events & EPOLLERR) { +- rc = _sock_check_zcopy(sock); +- /* If the socket was closed or removed from +- * the group in response to a send ack, don't +- * add it to the array here. */ +- if (rc || sock->cb_fn == NULL) { +- continue; +- } +- } +-#endif +- if ((events[i].events & EPOLLIN) == 0) { +- continue; +- } +- +-#elif defined(SPDK_KEVENT) +- sock = events[i].udata; +- psock = __posix_sock(sock); +-#endif +- +- /* If the socket is not already in the list, add it now */ +- if (!psock->socket_has_data && !psock->pipe_has_data) { +- TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); +- } +- psock->socket_has_data = true; +- } +- +- num_events = 0; +- +- TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { +- if (num_events == max_events) { +- break; +- } +- +- /* If the socket's cb_fn is NULL, just remove it from the +- * list and do not add it to socks array */ +- if (spdk_unlikely(psock->base.cb_fn == NULL)) { +- psock->socket_has_data = false; +- psock->pipe_has_data = false; +- TAILQ_REMOVE(&group->socks_with_data, psock, link); +- continue; +- } +- +- socks[num_events++] = &psock->base; +- } +- +- /* Cycle the has_data list so that each time we poll things aren't +- * in the same order. Say we have 6 sockets in the list, named as follows: +- * A B C D E F +- * And all 6 sockets had epoll events, but max_events is only 3. That means +- * psock currently points at D. We want to rearrange the list to the following: +- * D E F A B C +- * +- * The variables below are named according to this example to make it easier to +- * follow the swaps. 
+- */ +- if (psock != NULL) { +- struct spdk_posix_sock *pa, *pc, *pd, *pf; +- +- /* Capture pointers to the elements we need */ +- pd = psock; +- pc = TAILQ_PREV(pd, spdk_has_data_list, link); +- pa = TAILQ_FIRST(&group->socks_with_data); +- pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); +- +- /* Break the link between C and D */ +- pc->link.tqe_next = NULL; +- +- /* Connect F to A */ +- pf->link.tqe_next = pa; +- pa->link.tqe_prev = &pf->link.tqe_next; +- +- /* Fix up the list first/last pointers */ +- group->socks_with_data.tqh_first = pd; +- group->socks_with_data.tqh_last = &pc->link.tqe_next; +- +- /* D is in front of the list, make tqe prev pointer point to the head of list */ +- pd->link.tqe_prev = &group->socks_with_data.tqh_first; +- } +- +- return num_events; +-} +- +-static int +-posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) +-{ +- struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); +- int rc; +- +- if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { +- spdk_sock_map_release(&g_map, spdk_env_get_current_core()); +- } +- +- rc = close(group->fd); +- free(group); +- return rc; +-} +- +-static struct spdk_net_impl g_posix_net_impl = { +- .name = "posix", +- .getaddr = posix_sock_getaddr, +- .connect = posix_sock_connect, +- .listen = posix_sock_listen, +- .accept = posix_sock_accept, +- .close = posix_sock_close, +- .recv = posix_sock_recv, +- .readv = posix_sock_readv, +- .readv_async = posix_sock_readv_async, +- .writev = posix_sock_writev, +- .writev_async = posix_sock_writev_async, +- .flush = posix_sock_flush, +- .set_recvlowat = posix_sock_set_recvlowat, +- .set_recvbuf = posix_sock_set_recvbuf, +- .set_sendbuf = posix_sock_set_sendbuf, +- .is_ipv6 = posix_sock_is_ipv6, +- .is_ipv4 = posix_sock_is_ipv4, +- .is_connected = posix_sock_is_connected, +- .group_impl_get_optimal = posix_sock_group_impl_get_optimal, +- .group_impl_create = posix_sock_group_impl_create, +- .group_impl_add_sock = posix_sock_group_impl_add_sock, +- .group_impl_remove_sock = posix_sock_group_impl_remove_sock, +- .group_impl_poll = posix_sock_group_impl_poll, +- .group_impl_close = posix_sock_group_impl_close, +- .get_opts = posix_sock_impl_get_opts, +- .set_opts = posix_sock_impl_set_opts, +-}; +- +-SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY + 1); +- +-static struct spdk_sock * +-ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); +-} +- +-static struct spdk_sock * +-ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); +-} +- +-static struct spdk_sock * +-ssl_sock_accept(struct spdk_sock *_sock) +-{ +- return _posix_sock_accept(_sock, true); +-} +- +-static struct spdk_net_impl g_ssl_net_impl = { +- .name = "ssl", +- .getaddr = posix_sock_getaddr, +- .connect = ssl_sock_connect, +- .listen = ssl_sock_listen, +- .accept = ssl_sock_accept, +- .close = posix_sock_close, +- .recv = posix_sock_recv, +- .readv = posix_sock_readv, +- .writev = posix_sock_writev, +- .writev_async = posix_sock_writev_async, +- .flush = posix_sock_flush, +- .set_recvlowat = posix_sock_set_recvlowat, +- .set_recvbuf = posix_sock_set_recvbuf, +- .set_sendbuf = posix_sock_set_sendbuf, +- .is_ipv6 = posix_sock_is_ipv6, +- .is_ipv4 = posix_sock_is_ipv4, +- .is_connected = posix_sock_is_connected, +- .group_impl_get_optimal = 
posix_sock_group_impl_get_optimal, +- .group_impl_create = posix_sock_group_impl_create, +- .group_impl_add_sock = posix_sock_group_impl_add_sock, +- .group_impl_remove_sock = posix_sock_group_impl_remove_sock, +- .group_impl_poll = posix_sock_group_impl_poll, +- .group_impl_close = posix_sock_group_impl_close, +- .get_opts = posix_sock_impl_get_opts, +- .set_opts = posix_sock_impl_set_opts, +-}; +- +-SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl, DEFAULT_SOCK_PRIORITY); +-SPDK_LOG_REGISTER_COMPONENT(sock_posix) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2018 Intel Corporation. All rights reserved. ++ * Copyright (c) 2020, 2021 Mellanox Technologies LTD. All rights reserved. ++ * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++ ++#if defined(__FreeBSD__) ++#include ++#define SPDK_KEVENT ++#else ++#include ++#define SPDK_EPOLL ++#endif ++ ++#if defined(__linux__) ++#include ++#endif ++ ++#include "spdk/env.h" ++#include "spdk/log.h" ++#include "spdk/pipe.h" ++#include "spdk/sock.h" ++#include "spdk/util.h" ++#include "spdk/string.h" ++#include "spdk_internal/sock.h" ++#include "../sock_kernel.h" ++ ++#include "openssl/crypto.h" ++#include "openssl/err.h" ++#include "openssl/ssl.h" ++ ++#define MAX_TMPBUF 1024 ++#define PORTNUMLEN 32 ++ ++#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) ++#define SPDK_ZEROCOPY ++#endif ++ ++struct spdk_posix_sock { ++ struct spdk_sock base; ++ int fd; ++ ++ uint32_t sendmsg_idx; ++ ++ struct spdk_pipe *recv_pipe; ++ void *recv_buf; ++ int recv_buf_sz; ++ bool pipe_has_data; ++ bool socket_has_data; ++ bool zcopy; ++ ++ int placement_id; ++ ++ SSL_CTX *ctx; ++ SSL *ssl; ++ ++ TAILQ_ENTRY(spdk_posix_sock) link; ++}; ++ ++TAILQ_HEAD(spdk_has_data_list, spdk_posix_sock); ++ ++struct spdk_posix_sock_group_impl { ++ struct spdk_sock_group_impl base; ++ int fd; ++ struct spdk_has_data_list socks_with_data; ++ int placement_id; ++}; ++ ++static struct spdk_sock_impl_opts g_spdk_posix_sock_impl_opts = { ++ .recv_buf_size = MIN_SO_RCVBUF_SIZE, ++ .send_buf_size = MIN_SO_SNDBUF_SIZE, ++ .enable_recv_pipe = true, ++ .enable_quickack = false, ++ .enable_placement_id = PLACEMENT_NONE, ++ .enable_zerocopy_send_server = true, ++ .enable_zerocopy_send_client = false, ++ .zerocopy_threshold = 0, ++ .tls_version = 0, ++ .enable_ktls = false, ++ .psk_key = NULL, ++ .psk_identity = NULL ++}; ++ ++static struct spdk_sock_map g_map = { ++ .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), ++ .mtx = PTHREAD_MUTEX_INITIALIZER ++}; ++ ++__attribute((destructor)) static void ++posix_sock_map_cleanup(void) ++{ ++ spdk_sock_map_cleanup(&g_map); ++} ++ ++#define __posix_sock(sock) (struct spdk_posix_sock *)sock ++#define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group ++ ++static void ++posix_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, ++ size_t len) ++{ ++#define FIELD_OK(field) \ ++ offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len ++ ++#define SET_FIELD(field) \ ++ if (FIELD_OK(field)) { \ ++ dest->field = src->field; \ ++ } ++ ++ SET_FIELD(recv_buf_size); ++ SET_FIELD(send_buf_size); ++ SET_FIELD(enable_recv_pipe); ++ SET_FIELD(enable_zerocopy_send); ++ SET_FIELD(enable_quickack); ++ SET_FIELD(enable_placement_id); ++ SET_FIELD(enable_zerocopy_send_server); ++ SET_FIELD(enable_zerocopy_send_client); ++ SET_FIELD(zerocopy_threshold); ++ SET_FIELD(tls_version); ++ SET_FIELD(enable_ktls); ++ SET_FIELD(psk_key); ++ 
SET_FIELD(psk_identity); ++ ++#undef SET_FIELD ++#undef FIELD_OK ++} ++ ++static int ++posix_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) ++{ ++ if (!opts || !len) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ assert(sizeof(*opts) >= *len); ++ memset(opts, 0, *len); ++ ++ posix_sock_copy_impl_opts(opts, &g_spdk_posix_sock_impl_opts, *len); ++ *len = spdk_min(*len, sizeof(g_spdk_posix_sock_impl_opts)); ++ ++ return 0; ++} ++ ++static int ++posix_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) ++{ ++ if (!opts) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ assert(sizeof(*opts) >= len); ++ posix_sock_copy_impl_opts(&g_spdk_posix_sock_impl_opts, opts, len); ++ ++ return 0; ++} ++ ++static void ++posix_opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest) ++{ ++ /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ ++ memcpy(dest, &g_spdk_posix_sock_impl_opts, sizeof(*dest)); ++ ++ if (opts->impl_opts != NULL) { ++ assert(sizeof(*dest) >= opts->impl_opts_size); ++ posix_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); ++ } ++} ++ ++static int ++posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, ++ char *caddr, int clen, uint16_t *cport) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ switch (sa.ss_family) { ++ case AF_UNIX: ++ /* Acceptable connection types that don't have IPs */ ++ return 0; ++ case AF_INET: ++ case AF_INET6: ++ /* Code below will get IP addresses */ ++ break; ++ default: ++ /* Unsupported socket family */ ++ return -1; ++ } ++ ++ rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ if (sport) { ++ if (sa.ss_family == AF_INET) { ++ *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); ++ } else if (sa.ss_family == AF_INET6) { ++ *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); ++ } ++ } ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ if (cport) { ++ if (sa.ss_family == AF_INET) { ++ *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); ++ } else if (sa.ss_family == AF_INET6) { ++ *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); ++ } ++ } ++ ++ return 0; ++} ++ ++enum posix_sock_create_type { ++ SPDK_SOCK_CREATE_LISTEN, ++ SPDK_SOCK_CREATE_CONNECT, ++}; ++ ++static int ++posix_sock_alloc_pipe(struct spdk_posix_sock *sock, int sz) ++{ ++ uint8_t *new_buf; ++ struct spdk_pipe *new_pipe; ++ struct iovec siov[2]; ++ struct iovec diov[2]; ++ int sbytes; ++ ssize_t bytes; ++ ++ if (sock->recv_buf_sz == sz) { ++ return 0; ++ } ++ ++ /* If the new size is 0, just free the pipe */ ++ if (sz == 0) { ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ sock->recv_pipe = NULL; ++ sock->recv_buf = NULL; ++ return 0; ++ } else if (sz < 
MIN_SOCK_PIPE_SIZE) { ++ SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); ++ return -1; ++ } ++ ++ /* Round up to next 64 byte multiple */ ++ new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t)); ++ if (!new_buf) { ++ SPDK_ERRLOG("socket recv buf allocation failed\n"); ++ return -ENOMEM; ++ } ++ ++ new_pipe = spdk_pipe_create(new_buf, sz + 1); ++ if (new_pipe == NULL) { ++ SPDK_ERRLOG("socket pipe allocation failed\n"); ++ free(new_buf); ++ return -ENOMEM; ++ } ++ ++ if (sock->recv_pipe != NULL) { ++ /* Pull all of the data out of the old pipe */ ++ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); ++ if (sbytes > sz) { ++ /* Too much data to fit into the new pipe size */ ++ spdk_pipe_destroy(new_pipe); ++ free(new_buf); ++ return -EINVAL; ++ } ++ ++ sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); ++ assert(sbytes == sz); ++ ++ bytes = spdk_iovcpy(siov, 2, diov, 2); ++ spdk_pipe_writer_advance(new_pipe, bytes); ++ ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ } ++ ++ sock->recv_buf_sz = sz; ++ sock->recv_buf = new_buf; ++ sock->recv_pipe = new_pipe; ++ ++ return 0; ++} ++ ++static int ++posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int min_size; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ if (_sock->impl_opts.enable_recv_pipe) { ++ rc = posix_sock_alloc_pipe(sock, sz); ++ if (rc) { ++ return rc; ++ } ++ } ++ ++ /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and ++ * g_spdk_posix_sock_impl_opts.recv_buf_size. */ ++ min_size = spdk_max(MIN_SO_RCVBUF_SIZE, g_spdk_posix_sock_impl_opts.recv_buf_size); ++ ++ if (sz < min_size) { ++ sz = min_size; ++ } ++ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); ++ if (rc < 0) { ++ return rc; ++ } ++ ++ _sock->impl_opts.recv_buf_size = sz; ++ ++ return 0; ++} ++ ++static int ++posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int min_size; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and ++ * g_spdk_posix_sock_impl_opts.send_buf_size. 
*/ ++ min_size = spdk_max(MIN_SO_SNDBUF_SIZE, g_spdk_posix_sock_impl_opts.send_buf_size); ++ ++ if (sz < min_size) { ++ sz = min_size; ++ } ++ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); ++ if (rc < 0) { ++ return rc; ++ } ++ ++ _sock->impl_opts.send_buf_size = sz; ++ ++ return 0; ++} ++ ++static void ++posix_sock_init(struct spdk_posix_sock *sock, bool enable_zero_copy) ++{ ++#if defined(SPDK_ZEROCOPY) || defined(__linux__) ++ int flag; ++ int rc; ++#endif ++ ++#if defined(SPDK_ZEROCOPY) ++ flag = 1; ++ ++ if (enable_zero_copy) { ++ /* Try to turn on zero copy sends */ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); ++ if (rc == 0) { ++ sock->zcopy = true; ++ } ++ } ++#endif ++ ++#if defined(__linux__) ++ flag = 1; ++ ++ if (sock->base.impl_opts.enable_quickack) { ++ rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); ++ if (rc != 0) { ++ SPDK_ERRLOG("quickack was failed to set\n"); ++ } ++ } ++ ++ spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, ++ &sock->placement_id); ++ ++ if (sock->base.impl_opts.enable_placement_id == PLACEMENT_MARK) { ++ /* Save placement_id */ ++ spdk_sock_map_insert(&g_map, sock->placement_id, NULL); ++ } ++#endif ++} ++ ++static struct spdk_posix_sock * ++posix_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) ++{ ++ struct spdk_posix_sock *sock; ++ ++ sock = calloc(1, sizeof(*sock)); ++ if (sock == NULL) { ++ SPDK_ERRLOG("sock allocation failed\n"); ++ return NULL; ++ } ++ ++ sock->fd = fd; ++ memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); ++ posix_sock_init(sock, enable_zero_copy); ++ ++ return sock; ++} ++ ++static int ++posix_fd_create(struct addrinfo *res, struct spdk_sock_opts *opts, ++ struct spdk_sock_impl_opts *impl_opts) ++{ ++ int fd; ++ int val = 1; ++ int rc, sz; ++#if defined(__linux__) ++ int to; ++#endif ++ ++ fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); ++ if (fd < 0) { ++ /* error */ ++ return -1; ++ } ++ ++ sz = impl_opts->recv_buf_size; ++ rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); ++ if (rc) { ++ /* Not fatal */ ++ } ++ ++ sz = impl_opts->send_buf_size; ++ rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); ++ if (rc) { ++ /* Not fatal */ ++ } ++ ++ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ /* error */ ++ return -1; ++ } ++ rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ /* error */ ++ return -1; ++ } ++ ++#if defined(SO_PRIORITY) ++ if (opts->priority) { ++ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ /* error */ ++ return -1; ++ } ++ } ++#endif ++ ++ if (res->ai_family == AF_INET6) { ++ rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ /* error */ ++ return -1; ++ } ++ } ++ ++ if (opts->ack_timeout) { ++#if defined(__linux__) ++ to = opts->ack_timeout; ++ rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &to, sizeof(to)); ++ if (rc != 0) { ++ close(fd); ++ /* error */ ++ return -1; ++ } ++#else ++ SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); ++#endif ++ } ++ ++ return fd; ++} ++ ++static unsigned int ++posix_sock_tls_psk_server_cb(SSL *ssl, ++ const char *id, ++ unsigned char *psk, ++ unsigned int max_psk_len) ++{ ++ long key_len; ++ unsigned char *default_psk; ++ struct spdk_sock_impl_opts *impl_opts; ++ ++ 
impl_opts = SSL_get_app_data(ssl); ++ ++ if (impl_opts->psk_key == NULL) { ++ SPDK_ERRLOG("PSK is not set\n"); ++ goto err; ++ } ++ SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK ID %lu\n", strlen(impl_opts->psk_identity)); ++ if (id == NULL) { ++ SPDK_ERRLOG("Received empty PSK ID\n"); ++ goto err; ++ } ++ SPDK_DEBUGLOG(sock_posix, "Received PSK ID '%s'\n", id); ++ if (strcmp(impl_opts->psk_identity, id) != 0) { ++ SPDK_ERRLOG("Unknown Client's PSK ID\n"); ++ goto err; ++ } ++ ++ SPDK_DEBUGLOG(sock_posix, "Length of Client's PSK KEY %u\n", max_psk_len); ++ default_psk = OPENSSL_hexstr2buf(impl_opts->psk_key, &key_len); ++ if (default_psk == NULL) { ++ SPDK_ERRLOG("Could not unhexlify PSK\n"); ++ goto err; ++ } ++ if (key_len > max_psk_len) { ++ SPDK_ERRLOG("Insufficient buffer size to copy PSK\n"); ++ OPENSSL_free(default_psk); ++ goto err; ++ } ++ ++ memcpy(psk, default_psk, key_len); ++ OPENSSL_free(default_psk); ++ ++ return key_len; ++ ++err: ++ return 0; ++} ++ ++static unsigned int ++posix_sock_tls_psk_client_cb(SSL *ssl, const char *hint, ++ char *identity, ++ unsigned int max_identity_len, ++ unsigned char *psk, ++ unsigned int max_psk_len) ++{ ++ long key_len; ++ unsigned char *default_psk; ++ struct spdk_sock_impl_opts *impl_opts; ++ ++ impl_opts = SSL_get_app_data(ssl); ++ ++ if (hint) { ++ SPDK_DEBUGLOG(sock_posix, "Received PSK identity hint '%s'\n", hint); ++ } ++ ++ if (impl_opts->psk_key == NULL) { ++ SPDK_ERRLOG("PSK is not set\n"); ++ goto err; ++ } ++ default_psk = OPENSSL_hexstr2buf(impl_opts->psk_key, &key_len); ++ if (default_psk == NULL) { ++ SPDK_ERRLOG("Could not unhexlify PSK\n"); ++ goto err; ++ } ++ if ((strlen(impl_opts->psk_identity) + 1 > max_identity_len) ++ || (key_len > max_psk_len)) { ++ OPENSSL_free(default_psk); ++ SPDK_ERRLOG("PSK ID or Key buffer is not sufficient\n"); ++ goto err; ++ } ++ spdk_strcpy_pad(identity, impl_opts->psk_identity, strlen(impl_opts->psk_identity), 0); ++ SPDK_DEBUGLOG(sock_posix, "Sending PSK identity '%s'\n", identity); ++ ++ memcpy(psk, default_psk, key_len); ++ SPDK_DEBUGLOG(sock_posix, "Provided out-of-band (OOB) PSK for TLS1.3 client\n"); ++ OPENSSL_free(default_psk); ++ ++ return key_len; ++ ++err: ++ return 0; ++} ++ ++static SSL_CTX * ++posix_sock_create_ssl_context(const SSL_METHOD *method, struct spdk_sock_opts *opts, ++ struct spdk_sock_impl_opts *impl_opts) ++{ ++ SSL_CTX *ctx; ++ int tls_version = 0; ++ bool ktls_enabled = false; ++#ifdef SSL_OP_ENABLE_KTLS ++ long options; ++#endif ++ ++ SSL_library_init(); ++ OpenSSL_add_all_algorithms(); ++ SSL_load_error_strings(); ++ /* Produce a SSL CTX in SSL V2 and V3 standards compliant way */ ++ ctx = SSL_CTX_new(method); ++ if (!ctx) { ++ SPDK_ERRLOG("SSL_CTX_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); ++ return NULL; ++ } ++ SPDK_DEBUGLOG(sock_posix, "SSL context created\n"); ++ ++ switch (impl_opts->tls_version) { ++ case 0: ++ /* auto-negotioation */ ++ break; ++ case SPDK_TLS_VERSION_1_1: ++ tls_version = TLS1_1_VERSION; ++ break; ++ case SPDK_TLS_VERSION_1_2: ++ tls_version = TLS1_2_VERSION; ++ break; ++ case SPDK_TLS_VERSION_1_3: ++ tls_version = TLS1_3_VERSION; ++ break; ++ default: ++ SPDK_ERRLOG("Incorrect TLS version provided: %d\n", impl_opts->tls_version); ++ goto err; ++ } ++ ++ if (tls_version) { ++ SPDK_DEBUGLOG(sock_posix, "Hardening TLS version to '%d'='0x%X'\n", impl_opts->tls_version, ++ tls_version); ++ if (!SSL_CTX_set_min_proto_version(ctx, tls_version)) { ++ SPDK_ERRLOG("Unable to set Min TLS version to 
'%d'='0x%X\n", impl_opts->tls_version, tls_version); ++ goto err; ++ } ++ if (!SSL_CTX_set_max_proto_version(ctx, tls_version)) { ++ SPDK_ERRLOG("Unable to set Max TLS version to '%d'='0x%X\n", impl_opts->tls_version, tls_version); ++ goto err; ++ } ++ } ++ if (impl_opts->enable_ktls) { ++ SPDK_DEBUGLOG(sock_posix, "Enabling kTLS offload\n"); ++#ifdef SSL_OP_ENABLE_KTLS ++ options = SSL_CTX_set_options(ctx, SSL_OP_ENABLE_KTLS); ++ ktls_enabled = options & SSL_OP_ENABLE_KTLS; ++#else ++ ktls_enabled = false; ++#endif ++ if (!ktls_enabled) { ++ SPDK_ERRLOG("Unable to set kTLS offload via SSL_CTX_set_options(). Configure openssl with 'enable-ktls'\n"); ++ goto err; ++ } ++ } ++ ++ return ctx; ++ ++err: ++ SSL_CTX_free(ctx); ++ return NULL; ++} ++ ++static SSL * ++ssl_sock_connect_loop(SSL_CTX *ctx, int fd, struct spdk_sock_impl_opts *impl_opts) ++{ ++ int rc; ++ SSL *ssl; ++ int ssl_get_error; ++ ++ ssl = SSL_new(ctx); ++ if (!ssl) { ++ SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); ++ return NULL; ++ } ++ SSL_set_fd(ssl, fd); ++ SSL_set_app_data(ssl, impl_opts); ++ SSL_set_psk_client_callback(ssl, posix_sock_tls_psk_client_cb); ++ SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ while ((rc = SSL_connect(ssl)) != 1) { ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ ssl_get_error = SSL_get_error(ssl, rc); ++ SPDK_DEBUGLOG(sock_posix, "SSL_connect failed %d = SSL_connect(%p), %d = SSL_get_error(%p, %d)\n", ++ rc, ssl, ssl_get_error, ssl, rc); ++ switch (ssl_get_error) { ++ case SSL_ERROR_WANT_READ: ++ case SSL_ERROR_WANT_WRITE: ++ continue; ++ default: ++ break; ++ } ++ SPDK_ERRLOG("SSL_connect() failed, errno = %d\n", errno); ++ SSL_free(ssl); ++ return NULL; ++ } ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", ++ SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); ++ return ssl; ++} ++ ++static SSL * ++ssl_sock_accept_loop(SSL_CTX *ctx, int fd, struct spdk_sock_impl_opts *impl_opts) ++{ ++ int rc; ++ SSL *ssl; ++ int ssl_get_error; ++ ++ ssl = SSL_new(ctx); ++ if (!ssl) { ++ SPDK_ERRLOG("SSL_new() failed, msg = %s\n", ERR_error_string(ERR_peek_last_error(), NULL)); ++ return NULL; ++ } ++ SSL_set_fd(ssl, fd); ++ SSL_set_app_data(ssl, impl_opts); ++ SSL_set_psk_server_callback(ssl, posix_sock_tls_psk_server_cb); ++ SPDK_DEBUGLOG(sock_posix, "SSL object creation finished: %p\n", ssl); ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ while ((rc = SSL_accept(ssl)) != 1) { ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ ssl_get_error = SSL_get_error(ssl, rc); ++ SPDK_DEBUGLOG(sock_posix, "SSL_accept failed %d = SSL_accept(%p), %d = SSL_get_error(%p, %d)\n", rc, ++ ssl, ssl_get_error, ssl, rc); ++ switch (ssl_get_error) { ++ case SSL_ERROR_WANT_READ: ++ case SSL_ERROR_WANT_WRITE: ++ continue; ++ default: ++ break; ++ } ++ SPDK_ERRLOG("SSL_accept() failed, errno = %d\n", errno); ++ SSL_free(ssl); ++ return NULL; ++ } ++ SPDK_DEBUGLOG(sock_posix, "%s = SSL_state_string_long(%p)\n", SSL_state_string_long(ssl), ssl); ++ SPDK_DEBUGLOG(sock_posix, "Negotiated Cipher suite:%s\n", ++ SSL_CIPHER_get_name(SSL_get_current_cipher(ssl))); ++ return ssl; ++} ++ ++static ssize_t 
++SSL_readv(SSL *ssl, const struct iovec *iov, int iovcnt) ++{ ++ int i, rc = 0; ++ ssize_t total = 0; ++ ++ for (i = 0; i < iovcnt; i++) { ++ rc = SSL_read(ssl, iov[i].iov_base, iov[i].iov_len); ++ ++ if (rc > 0) { ++ total += rc; ++ } ++ if (rc != (int)iov[i].iov_len) { ++ break; ++ } ++ } ++ if (total > 0) { ++ errno = 0; ++ return total; ++ } ++ switch (SSL_get_error(ssl, rc)) { ++ case SSL_ERROR_ZERO_RETURN: ++ errno = ENOTCONN; ++ return 0; ++ case SSL_ERROR_WANT_READ: ++ case SSL_ERROR_WANT_WRITE: ++ case SSL_ERROR_WANT_CONNECT: ++ case SSL_ERROR_WANT_ACCEPT: ++ case SSL_ERROR_WANT_X509_LOOKUP: ++ case SSL_ERROR_WANT_ASYNC: ++ case SSL_ERROR_WANT_ASYNC_JOB: ++ case SSL_ERROR_WANT_CLIENT_HELLO_CB: ++ errno = EAGAIN; ++ return -1; ++ case SSL_ERROR_SYSCALL: ++ case SSL_ERROR_SSL: ++ errno = ENOTCONN; ++ return -1; ++ default: ++ errno = ENOTCONN; ++ return -1; ++ } ++} ++ ++static ssize_t ++SSL_writev(SSL *ssl, struct iovec *iov, int iovcnt) ++{ ++ int i, rc = 0; ++ ssize_t total = 0; ++ ++ for (i = 0; i < iovcnt; i++) { ++ rc = SSL_write(ssl, iov[i].iov_base, iov[i].iov_len); ++ ++ if (rc > 0) { ++ total += rc; ++ } ++ if (rc != (int)iov[i].iov_len) { ++ break; ++ } ++ } ++ if (total > 0) { ++ errno = 0; ++ return total; ++ } ++ switch (SSL_get_error(ssl, rc)) { ++ case SSL_ERROR_ZERO_RETURN: ++ errno = ENOTCONN; ++ return 0; ++ case SSL_ERROR_WANT_READ: ++ case SSL_ERROR_WANT_WRITE: ++ case SSL_ERROR_WANT_CONNECT: ++ case SSL_ERROR_WANT_ACCEPT: ++ case SSL_ERROR_WANT_X509_LOOKUP: ++ case SSL_ERROR_WANT_ASYNC: ++ case SSL_ERROR_WANT_ASYNC_JOB: ++ case SSL_ERROR_WANT_CLIENT_HELLO_CB: ++ errno = EAGAIN; ++ return -1; ++ case SSL_ERROR_SYSCALL: ++ case SSL_ERROR_SSL: ++ errno = ENOTCONN; ++ return -1; ++ default: ++ errno = ENOTCONN; ++ return -1; ++ } ++} ++ ++static struct spdk_sock * ++posix_sock_create(const char *ip, int port, ++ enum posix_sock_create_type type, ++ struct spdk_sock_opts *opts, ++ bool enable_ssl) ++{ ++ struct spdk_posix_sock *sock; ++ struct spdk_sock_impl_opts impl_opts; ++ char buf[MAX_TMPBUF]; ++ char portnum[PORTNUMLEN]; ++ char *p; ++ struct addrinfo hints, *res, *res0; ++ int fd, flag; ++ int rc; ++ bool enable_zcopy_user_opts = true; ++ bool enable_zcopy_impl_opts = true; ++ SSL_CTX *ctx = 0; ++ SSL *ssl = 0; ++ ++ assert(opts != NULL); ++ posix_opts_get_impl_opts(opts, &impl_opts); ++ ++ if (ip == NULL) { ++ return NULL; ++ } ++ if (ip[0] == '[') { ++ snprintf(buf, sizeof(buf), "%s", ip + 1); ++ p = strchr(buf, ']'); ++ if (p != NULL) { ++ *p = '\0'; ++ } ++ ip = (const char *) &buf[0]; ++ } ++ ++ snprintf(portnum, sizeof portnum, "%d", port); ++ memset(&hints, 0, sizeof hints); ++ hints.ai_family = PF_UNSPEC; ++ hints.ai_socktype = SOCK_STREAM; ++ hints.ai_flags = AI_NUMERICSERV; ++ hints.ai_flags |= AI_PASSIVE; ++ hints.ai_flags |= AI_NUMERICHOST; ++ rc = getaddrinfo(ip, portnum, &hints, &res0); ++ if (rc != 0) { ++ SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); ++ return NULL; ++ } ++ ++ /* try listen */ ++ fd = -1; ++ for (res = res0; res != NULL; res = res->ai_next) { ++retry: ++ fd = posix_fd_create(res, opts, &impl_opts); ++ if (fd < 0) { ++ continue; ++ } ++ if (type == SPDK_SOCK_CREATE_LISTEN) { ++ rc = bind(fd, res->ai_addr, res->ai_addrlen); ++ if (rc != 0) { ++ SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); ++ switch (errno) { ++ case EINTR: ++ /* interrupted? */ ++ close(fd); ++ goto retry; ++ case EADDRNOTAVAIL: ++ SPDK_ERRLOG("IP address %s not available. 
" ++ "Verify IP address in config file " ++ "and make sure setup script is " ++ "run before starting spdk app.\n", ip); ++ /* FALLTHROUGH */ ++ default: ++ /* try next family */ ++ close(fd); ++ fd = -1; ++ continue; ++ } ++ } ++ /* bind OK */ ++ rc = listen(fd, 512); ++ if (rc != 0) { ++ SPDK_ERRLOG("listen() failed, errno = %d\n", errno); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; ++ } else if (type == SPDK_SOCK_CREATE_CONNECT) { ++ rc = connect(fd, res->ai_addr, res->ai_addrlen); ++ if (rc != 0) { ++ SPDK_ERRLOG("connect() failed, errno = %d\n", errno); ++ /* try next family */ ++ close(fd); ++ fd = -1; ++ continue; ++ } ++ enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; ++ if (enable_ssl) { ++ ctx = posix_sock_create_ssl_context(TLS_client_method(), opts, &impl_opts); ++ if (!ctx) { ++ SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ ssl = ssl_sock_connect_loop(ctx, fd, &impl_opts); ++ if (!ssl) { ++ SPDK_ERRLOG("ssl_sock_connect_loop() failed, errno = %d\n", errno); ++ close(fd); ++ fd = -1; ++ SSL_CTX_free(ctx); ++ break; ++ } ++ } ++ } ++ ++ flag = fcntl(fd, F_GETFL); ++ if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { ++ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); ++ SSL_free(ssl); ++ SSL_CTX_free(ctx); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ break; ++ } ++ freeaddrinfo(res0); ++ ++ if (fd < 0) { ++ return NULL; ++ } ++ ++ /* Only enable zero copy for non-loopback and non-ssl sockets. */ ++ enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd) && !enable_ssl; ++ ++ sock = posix_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); ++ if (sock == NULL) { ++ SPDK_ERRLOG("sock allocation failed\n"); ++ SSL_free(ssl); ++ SSL_CTX_free(ctx); ++ close(fd); ++ return NULL; ++ } ++ ++ if (ctx) { ++ sock->ctx = ctx; ++ } ++ ++ if (ssl) { ++ sock->ssl = ssl; ++ } ++ ++ return &sock->base; ++} ++ ++static struct spdk_sock * ++posix_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, false); ++} ++ ++static struct spdk_sock * ++posix_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, false); ++} ++ ++static struct spdk_sock * ++_posix_sock_accept(struct spdk_sock *_sock, bool enable_ssl) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc, fd; ++ struct spdk_posix_sock *new_sock; ++ int flag; ++ SSL_CTX *ctx = 0; ++ SSL *ssl = 0; ++ ++ memset(&sa, 0, sizeof(sa)); ++ salen = sizeof(sa); ++ ++ assert(sock != NULL); ++ ++ rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); ++ ++ if (rc == -1) { ++ return NULL; ++ } ++ ++ fd = rc; ++ ++ flag = fcntl(fd, F_GETFL); ++ if ((!(flag & O_NONBLOCK)) && (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0)) { ++ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); ++ close(fd); ++ return NULL; ++ } ++ ++#if defined(SO_PRIORITY) ++ /* The priority is not inherited, so call this function again */ ++ if (sock->base.opts.priority) { ++ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); ++ if (rc != 0) { ++ close(fd); ++ return NULL; ++ } ++ } ++#endif ++ ++ /* Establish SSL connection */ ++ if (enable_ssl) { ++ ctx = 
posix_sock_create_ssl_context(TLS_server_method(), &sock->base.opts, &sock->base.impl_opts); ++ if (!ctx) { ++ SPDK_ERRLOG("posix_sock_create_ssl_context() failed, errno = %d\n", errno); ++ close(fd); ++ return NULL; ++ } ++ ssl = ssl_sock_accept_loop(ctx, fd, &sock->base.impl_opts); ++ if (!ssl) { ++ SPDK_ERRLOG("ssl_sock_accept_loop() failed, errno = %d\n", errno); ++ close(fd); ++ SSL_CTX_free(ctx); ++ return NULL; ++ } ++ } ++ ++ /* Inherit the zero copy feature from the listen socket */ ++ new_sock = posix_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); ++ if (new_sock == NULL) { ++ close(fd); ++ SSL_free(ssl); ++ SSL_CTX_free(ctx); ++ return NULL; ++ } ++ ++ if (ctx) { ++ new_sock->ctx = ctx; ++ } ++ ++ if (ssl) { ++ new_sock->ssl = ssl; ++ } ++ ++ return &new_sock->base; ++} ++ ++static struct spdk_sock * ++posix_sock_accept(struct spdk_sock *_sock) ++{ ++ return _posix_sock_accept(_sock, false); ++} ++ ++static int ++posix_sock_close(struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ ++ assert(TAILQ_EMPTY(&_sock->pending_reqs)); ++ ++ if (sock->ssl != NULL) { ++ SSL_shutdown(sock->ssl); ++ } ++ ++ /* If the socket fails to close, the best choice is to ++ * leak the fd but continue to free the rest of the sock ++ * memory. */ ++ close(sock->fd); ++ ++ SSL_free(sock->ssl); ++ SSL_CTX_free(sock->ctx); ++ ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ free(sock); ++ ++ return 0; ++} ++ ++#ifdef SPDK_ZEROCOPY ++static int ++_sock_check_zcopy(struct spdk_sock *sock) ++{ ++ struct spdk_posix_sock *psock = __posix_sock(sock); ++ struct msghdr msgh = {}; ++ uint8_t buf[sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)]; ++ ssize_t rc; ++ struct sock_extended_err *serr; ++ struct cmsghdr *cm; ++ uint32_t idx; ++ struct spdk_sock_request *req, *treq; ++ bool found; ++ ++ msgh.msg_control = buf; ++ msgh.msg_controllen = sizeof(buf); ++ ++ while (true) { ++ rc = recvmsg(psock->fd, &msgh, MSG_ERRQUEUE); ++ ++ if (rc < 0) { ++ if (errno == EWOULDBLOCK || errno == EAGAIN) { ++ return 0; ++ } ++ ++ if (!TAILQ_EMPTY(&sock->pending_reqs)) { ++ SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries\n"); ++ } else { ++ SPDK_WARNLOG("Recvmsg yielded an error!\n"); ++ } ++ return 0; ++ } ++ ++ cm = CMSG_FIRSTHDR(&msgh); ++ if (!(cm && ++ ((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || ++ (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR)))) { ++ SPDK_WARNLOG("Unexpected cmsg level or type!\n"); ++ return 0; ++ } ++ ++ serr = (struct sock_extended_err *)CMSG_DATA(cm); ++ if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { ++ SPDK_WARNLOG("Unexpected extended error origin\n"); ++ return 0; ++ } ++ ++ /* Most of the time, the pending_reqs array is in the exact ++ * order we need such that all of the requests to complete are ++ * in order, in the front. It is guaranteed that all requests ++ * belonging to the same sendmsg call are sequential, so once ++ * we encounter one match we can stop looping as soon as a ++ * non-match is found. ++ */ ++ for (idx = serr->ee_info; idx <= serr->ee_data; idx++) { ++ found = false; ++ TAILQ_FOREACH_SAFE(req, &sock->pending_reqs, internal.link, treq) { ++ if (!req->internal.is_zcopy) { ++ /* This wasn't a zcopy request. 
It was just waiting in line to complete */ ++ rc = spdk_sock_request_put(sock, req, 0); ++ if (rc < 0) { ++ return rc; ++ } ++ } else if (req->internal.offset == idx) { ++ found = true; ++ rc = spdk_sock_request_put(sock, req, 0); ++ if (rc < 0) { ++ return rc; ++ } ++ } else if (found) { ++ break; ++ } ++ } ++ } ++ } ++ ++ return 0; ++} ++#endif ++ ++static int ++_sock_flush(struct spdk_sock *sock) ++{ ++ struct spdk_posix_sock *psock = __posix_sock(sock); ++ struct msghdr msg = {}; ++ int flags; ++ struct iovec iovs[IOV_BATCH_SIZE]; ++ int iovcnt; ++ int retval; ++ struct spdk_sock_request *req; ++ int i; ++ ssize_t rc, sent; ++ unsigned int offset; ++ size_t len; ++ bool is_zcopy = false; ++ ++ /* Can't flush from within a callback or we end up with recursive calls */ ++ if (sock->cb_cnt > 0) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++#ifdef SPDK_ZEROCOPY ++ if (psock->zcopy) { ++ flags = MSG_ZEROCOPY | MSG_NOSIGNAL; ++ } else ++#endif ++ { ++ flags = MSG_NOSIGNAL; ++ } ++ ++ iovcnt = spdk_sock_prep_reqs(sock, iovs, 0, NULL, &flags); ++ if (iovcnt == 0) { ++ return 0; ++ } ++ ++#ifdef SPDK_ZEROCOPY ++ is_zcopy = flags & MSG_ZEROCOPY; ++#endif ++ ++ /* Perform the vectored write */ ++ msg.msg_iov = iovs; ++ msg.msg_iovlen = iovcnt; ++ ++ if (psock->ssl) { ++ rc = SSL_writev(psock->ssl, iovs, iovcnt); ++ } else { ++ rc = sendmsg(psock->fd, &msg, flags); ++ } ++ if (rc <= 0) { ++ if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && psock->zcopy)) { ++ errno = EAGAIN; ++ } ++ return -1; ++ } ++ ++ sent = rc; ++ ++ if (is_zcopy) { ++ /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the ++ * req->internal.offset, so sendmsg_idx should not be zero */ ++ if (spdk_unlikely(psock->sendmsg_idx == UINT32_MAX)) { ++ psock->sendmsg_idx = 1; ++ } else { ++ psock->sendmsg_idx++; ++ } ++ } ++ ++ /* Consume the requests that were actually written */ ++ req = TAILQ_FIRST(&sock->queued_reqs); ++ while (req) { ++ offset = req->internal.offset; ++ ++ /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ ++ req->internal.is_zcopy = is_zcopy; ++ ++ for (i = 0; i < req->iovcnt; i++) { ++ /* Advance by the offset first */ ++ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { ++ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; ++ continue; ++ } ++ ++ /* Calculate the remaining length of this element */ ++ len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; ++ ++ if (len > (size_t)rc) { ++ /* This element was partially sent. */ ++ req->internal.offset += rc; ++ return sent; ++ } ++ ++ offset = 0; ++ req->internal.offset += len; ++ rc -= len; ++ } ++ ++ /* Handled a full request. */ ++ spdk_sock_request_pend(sock, req); ++ ++ if (!req->internal.is_zcopy && req == TAILQ_FIRST(&sock->pending_reqs)) { ++ /* The sendmsg syscall above isn't currently asynchronous, ++ * so it's already done. */ ++ retval = spdk_sock_request_put(sock, req, 0); ++ if (retval) { ++ break; ++ } ++ } else { ++ /* Re-use the offset field to hold the sendmsg call index. The ++ * index is 0 based, so subtract one here because we've already ++ * incremented above. 
*/ ++ req->internal.offset = psock->sendmsg_idx - 1; ++ } ++ ++ if (rc == 0) { ++ break; ++ } ++ ++ req = TAILQ_FIRST(&sock->queued_reqs); ++ } ++ ++ return sent; ++} ++ ++static int ++posix_sock_flush(struct spdk_sock *sock) ++{ ++#ifdef SPDK_ZEROCOPY ++ struct spdk_posix_sock *psock = __posix_sock(sock); ++ ++ if (psock->zcopy && !TAILQ_EMPTY(&sock->pending_reqs)) { ++ _sock_check_zcopy(sock); ++ } ++#endif ++ ++ return _sock_flush(sock); ++} ++ ++static ssize_t ++posix_sock_recv_from_pipe(struct spdk_posix_sock *sock, struct iovec *diov, int diovcnt) ++{ ++ struct iovec siov[2]; ++ int sbytes; ++ ssize_t bytes; ++ struct spdk_posix_sock_group_impl *group; ++ ++ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); ++ if (sbytes < 0) { ++ errno = EINVAL; ++ return -1; ++ } else if (sbytes == 0) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ bytes = spdk_iovcpy(siov, 2, diov, diovcnt); ++ ++ if (bytes == 0) { ++ /* The only way this happens is if diov is 0 length */ ++ errno = EINVAL; ++ return -1; ++ } ++ ++ spdk_pipe_reader_advance(sock->recv_pipe, bytes); ++ ++ /* If we drained the pipe, mark it appropriately */ ++ if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { ++ assert(sock->pipe_has_data == true); ++ ++ group = __posix_group_impl(sock->base.group_impl); ++ if (group && !sock->socket_has_data) { ++ TAILQ_REMOVE(&group->socks_with_data, sock, link); ++ } ++ ++ sock->pipe_has_data = false; ++ } ++ ++ return bytes; ++} ++ ++static inline ssize_t ++posix_sock_read(struct spdk_posix_sock *sock) ++{ ++ struct iovec iov[2]; ++ int bytes_avail, bytes_recvd; ++ struct spdk_posix_sock_group_impl *group; ++ ++ bytes_avail = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); ++ ++ if (bytes_avail <= 0) { ++ return bytes_avail; ++ } ++ ++ if (sock->ssl) { ++ bytes_recvd = SSL_readv(sock->ssl, iov, 2); ++ } else { ++ bytes_recvd = readv(sock->fd, iov, 2); ++ } ++ ++ assert(sock->pipe_has_data == false); ++ ++ if (bytes_recvd <= 0) { ++ /* Errors count as draining the socket data */ ++ if (sock->base.group_impl && sock->socket_has_data) { ++ group = __posix_group_impl(sock->base.group_impl); ++ TAILQ_REMOVE(&group->socks_with_data, sock, link); ++ } ++ ++ sock->socket_has_data = false; ++ ++ return bytes_recvd; ++ } ++ ++ spdk_pipe_writer_advance(sock->recv_pipe, bytes_recvd); ++ ++#if DEBUG ++ if (sock->base.group_impl) { ++ assert(sock->socket_has_data == true); ++ } ++#endif ++ ++ sock->pipe_has_data = true; ++ if (bytes_recvd < bytes_avail) { ++ /* We drained the kernel socket entirely. 
*/ ++ sock->socket_has_data = false; ++ } ++ ++ return bytes_recvd; ++} ++ ++static ssize_t ++posix_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(sock->base.group_impl); ++ int rc, i; ++ size_t len; ++ ++ if (sock->recv_pipe == NULL) { ++ assert(sock->pipe_has_data == false); ++ if (group && sock->socket_has_data) { ++ sock->socket_has_data = false; ++ TAILQ_REMOVE(&group->socks_with_data, sock, link); ++ } ++ if (sock->ssl) { ++ return SSL_readv(sock->ssl, iov, iovcnt); ++ } else { ++ return readv(sock->fd, iov, iovcnt); ++ } ++ } ++ ++ /* If the socket is not in a group, we must assume it always has ++ * data waiting for us because it is not epolled */ ++ if (!sock->pipe_has_data && (group == NULL || sock->socket_has_data)) { ++ /* If the user is receiving a sufficiently large amount of data, ++ * receive directly to their buffers. */ ++ len = 0; ++ for (i = 0; i < iovcnt; i++) { ++ len += iov[i].iov_len; ++ } ++ ++ if (len >= MIN_SOCK_PIPE_SIZE) { ++ /* TODO: Should this detect if kernel socket is drained? */ ++ if (sock->ssl) { ++ return SSL_readv(sock->ssl, iov, iovcnt); ++ } else { ++ return readv(sock->fd, iov, iovcnt); ++ } ++ } ++ ++ /* Otherwise, do a big read into our pipe */ ++ rc = posix_sock_read(sock); ++ if (rc <= 0) { ++ return rc; ++ } ++ } ++ ++ return posix_sock_recv_from_pipe(sock, iov, iovcnt); ++} ++ ++static ssize_t ++posix_sock_recv(struct spdk_sock *sock, void *buf, size_t len) ++{ ++ struct iovec iov[1]; ++ ++ iov[0].iov_base = buf; ++ iov[0].iov_len = len; ++ ++ return posix_sock_readv(sock, iov, 1); ++} ++ ++static void ++posix_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req) ++{ ++ req->cb_fn(req->cb_arg, -ENOTSUP); ++} ++ ++static ssize_t ++posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int rc; ++ ++ /* In order to process a writev, we need to flush any asynchronous writes ++ * first. */ ++ rc = _sock_flush(_sock); ++ if (rc < 0) { ++ return rc; ++ } ++ ++ if (!TAILQ_EMPTY(&_sock->queued_reqs)) { ++ /* We weren't able to flush all requests */ ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ if (sock->ssl) { ++ return SSL_writev(sock->ssl, iov, iovcnt); ++ } else { ++ return writev(sock->fd, iov, iovcnt); ++ } ++} ++ ++static void ++posix_sock_writev_async(struct spdk_sock *sock, struct spdk_sock_request *req) ++{ ++ int rc; ++ ++ spdk_sock_request_queue(sock, req); ++ ++ /* If there are a sufficient number queued, just flush them out immediately. 
*/ ++ if (sock->queued_iovcnt >= IOV_BATCH_SIZE) { ++ rc = _sock_flush(sock); ++ if (rc < 0 && errno != EAGAIN) { ++ spdk_sock_abort_requests(sock); ++ } ++ } ++} ++ ++static int ++posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int val; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ val = nbytes; ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); ++ if (rc != 0) { ++ return -1; ++ } ++ return 0; ++} ++ ++static bool ++posix_sock_is_ipv6(struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return false; ++ } ++ ++ return (sa.ss_family == AF_INET6); ++} ++ ++static bool ++posix_sock_is_ipv4(struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return false; ++ } ++ ++ return (sa.ss_family == AF_INET); ++} ++ ++static bool ++posix_sock_is_connected(struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ uint8_t byte; ++ int rc; ++ ++ rc = recv(sock->fd, &byte, 1, MSG_PEEK); ++ if (rc == 0) { ++ return false; ++ } ++ ++ if (rc < 0) { ++ if (errno == EAGAIN || errno == EWOULDBLOCK) { ++ return true; ++ } ++ ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct spdk_sock_group_impl * ++posix_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) ++{ ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ struct spdk_sock_group_impl *group_impl; ++ ++ if (sock->placement_id != -1) { ++ spdk_sock_map_lookup(&g_map, sock->placement_id, &group_impl, hint); ++ return group_impl; ++ } ++ ++ return NULL; ++} ++ ++static struct spdk_sock_group_impl * ++posix_sock_group_impl_create(void) ++{ ++ struct spdk_posix_sock_group_impl *group_impl; ++ int fd; ++ ++#if defined(SPDK_EPOLL) ++ fd = epoll_create1(0); ++#elif defined(SPDK_KEVENT) ++ fd = kqueue(); ++#endif ++ if (fd == -1) { ++ return NULL; ++ } ++ ++ group_impl = calloc(1, sizeof(*group_impl)); ++ if (group_impl == NULL) { ++ SPDK_ERRLOG("group_impl allocation failed\n"); ++ close(fd); ++ return NULL; ++ } ++ ++ group_impl->fd = fd; ++ TAILQ_INIT(&group_impl->socks_with_data); ++ group_impl->placement_id = -1; ++ ++ if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { ++ spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); ++ group_impl->placement_id = spdk_env_get_current_core(); ++ } ++ ++ return &group_impl->base; ++} ++ ++static void ++posix_sock_mark(struct spdk_posix_sock_group_impl *group, struct spdk_posix_sock *sock, ++ int placement_id) ++{ ++#if defined(SO_MARK) ++ int rc; ++ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_MARK, ++ &placement_id, sizeof(placement_id)); ++ if (rc != 0) { ++ /* Not fatal */ ++ SPDK_ERRLOG("Error setting SO_MARK\n"); ++ return; ++ } ++ ++ rc = spdk_sock_map_insert(&g_map, placement_id, &group->base); ++ if (rc != 0) { ++ /* Not fatal */ ++ SPDK_ERRLOG("Failed to insert 
sock group into map: %d\n", rc); ++ return; ++ } ++ ++ sock->placement_id = placement_id; ++#endif ++} ++ ++static void ++posix_sock_update_mark(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); ++ ++ if (group->placement_id == -1) { ++ group->placement_id = spdk_sock_map_find_free(&g_map); ++ ++ /* If a free placement id is found, update existing sockets in this group */ ++ if (group->placement_id != -1) { ++ struct spdk_sock *sock, *tmp; ++ ++ TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { ++ posix_sock_mark(group, __posix_sock(sock), group->placement_id); ++ } ++ } ++ } ++ ++ if (group->placement_id != -1) { ++ /* ++ * group placement id is already determined for this poll group. ++ * Mark socket with group's placement id. ++ */ ++ posix_sock_mark(group, __posix_sock(_sock), group->placement_id); ++ } ++} ++ ++static int ++posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int rc; ++ ++#if defined(SPDK_EPOLL) ++ struct epoll_event event; ++ ++ memset(&event, 0, sizeof(event)); ++ /* EPOLLERR is always on even if we don't set it, but be explicit for clarity */ ++ event.events = EPOLLIN | EPOLLERR; ++ event.data.ptr = sock; ++ ++ rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); ++#elif defined(SPDK_KEVENT) ++ struct kevent event; ++ struct timespec ts = {0}; ++ ++ EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); ++ ++ rc = kevent(group->fd, &event, 1, NULL, 0, &ts); ++#endif ++ ++ if (rc != 0) { ++ return rc; ++ } ++ ++ /* switched from another polling group due to scheduling */ ++ if (spdk_unlikely(sock->recv_pipe != NULL && ++ (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { ++ sock->pipe_has_data = true; ++ sock->socket_has_data = false; ++ TAILQ_INSERT_TAIL(&group->socks_with_data, sock, link); ++ } ++ ++ if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_MARK) { ++ posix_sock_update_mark(_group, _sock); ++ } else if (sock->placement_id != -1) { ++ rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to insert sock group into map: %d\n", rc); ++ /* Do not treat this as an error. The system will continue running. */ ++ } ++ } ++ ++ return rc; ++} ++ ++static int ++posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) ++{ ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); ++ struct spdk_posix_sock *sock = __posix_sock(_sock); ++ int rc; ++ ++ if (sock->pipe_has_data || sock->socket_has_data) { ++ TAILQ_REMOVE(&group->socks_with_data, sock, link); ++ sock->pipe_has_data = false; ++ sock->socket_has_data = false; ++ } ++ ++ if (sock->placement_id != -1) { ++ spdk_sock_map_release(&g_map, sock->placement_id); ++ } ++ ++#if defined(SPDK_EPOLL) ++ struct epoll_event event; ++ ++ /* Event parameter is ignored but some old kernel version still require it. 
*/ ++ rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); ++#elif defined(SPDK_KEVENT) ++ struct kevent event; ++ struct timespec ts = {0}; ++ ++ EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); ++ ++ rc = kevent(group->fd, &event, 1, NULL, 0, &ts); ++ if (rc == 0 && event.flags & EV_ERROR) { ++ rc = -1; ++ errno = event.data; ++ } ++#endif ++ ++ spdk_sock_abort_requests(_sock); ++ ++ return rc; ++} ++ ++static int ++posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, ++ struct spdk_sock **socks) ++{ ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); ++ struct spdk_sock *sock, *tmp; ++ int num_events, i, rc; ++ struct spdk_posix_sock *psock, *ptmp; ++#if defined(SPDK_EPOLL) ++ struct epoll_event events[MAX_EVENTS_PER_POLL]; ++#elif defined(SPDK_KEVENT) ++ struct kevent events[MAX_EVENTS_PER_POLL]; ++ struct timespec ts = {0}; ++#endif ++ ++#ifdef SPDK_ZEROCOPY ++ /* When all of the following conditions are met ++ * - non-blocking socket ++ * - zero copy is enabled ++ * - interrupts suppressed (i.e. busy polling) ++ * - the NIC tx queue is full at the time sendmsg() is called ++ * - epoll_wait determines there is an EPOLLIN event for the socket ++ * then we can get into a situation where data we've sent is queued ++ * up in the kernel network stack, but interrupts have been suppressed ++ * because other traffic is flowing so the kernel misses the signal ++ * to flush the software tx queue. If there wasn't incoming data ++ * pending on the socket, then epoll_wait would have been sufficient ++ * to kick off the send operation, but since there is a pending event ++ * epoll_wait does not trigger the necessary operation. ++ * ++ * We deal with this by checking for all of the above conditions and ++ * additionally looking for EPOLLIN events that were not consumed from ++ * the last poll loop. We take this to mean that the upper layer is ++ * unable to consume them because it is blocked waiting for resources ++ * to free up, and those resources are most likely freed in response ++ * to a pending asynchronous write completing. ++ * ++ * Additionally, sockets that have the same placement_id actually share ++ * an underlying hardware queue. That means polling one of them is ++ * equivalent to polling all of them. As a quick mechanism to avoid ++ * making extra poll() calls, stash the last placement_id during the loop ++ * and only poll if it's not the same. The overwhelmingly common case ++ * is that all sockets in this list have the same placement_id because ++ * SPDK is intentionally grouping sockets by that value, so even ++ * though this won't stop all extra calls to poll(), it's very fast ++ * and will catch all of them in practice. ++ */ ++ int last_placement_id = -1; ++ ++ TAILQ_FOREACH(psock, &group->socks_with_data, link) { ++ if (psock->zcopy && psock->placement_id >= 0 && ++ psock->placement_id != last_placement_id) { ++ struct pollfd pfd = {psock->fd, POLLIN | POLLERR, 0}; ++ ++ poll(&pfd, 1, 0); ++ last_placement_id = psock->placement_id; ++ } ++ } ++#endif ++ ++ /* This must be a TAILQ_FOREACH_SAFE because while flushing, ++ * a completion callback could remove the sock from the ++ * group. 
*/ ++ TAILQ_FOREACH_SAFE(sock, &_group->socks, link, tmp) { ++ rc = _sock_flush(sock); ++ if (rc < 0 && errno != EAGAIN) { ++ spdk_sock_abort_requests(sock); ++ } ++ } ++ ++ assert(max_events > 0); ++ ++#if defined(SPDK_EPOLL) ++ num_events = epoll_wait(group->fd, events, max_events, 0); ++#elif defined(SPDK_KEVENT) ++ num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); ++#endif ++ ++ if (num_events == -1) { ++ return -1; ++ } else if (num_events == 0 && !TAILQ_EMPTY(&_group->socks)) { ++ sock = TAILQ_FIRST(&_group->socks); ++ psock = __posix_sock(sock); ++ /* poll() is called here to busy poll the queue associated with ++ * first socket in list and potentially reap incoming data. ++ */ ++ if (sock->opts.priority) { ++ struct pollfd pfd = {0, 0, 0}; ++ ++ pfd.fd = psock->fd; ++ pfd.events = POLLIN | POLLERR; ++ poll(&pfd, 1, 0); ++ } ++ } ++ ++ for (i = 0; i < num_events; i++) { ++#if defined(SPDK_EPOLL) ++ sock = events[i].data.ptr; ++ psock = __posix_sock(sock); ++ ++#ifdef SPDK_ZEROCOPY ++ if (events[i].events & EPOLLERR) { ++ rc = _sock_check_zcopy(sock); ++ /* If the socket was closed or removed from ++ * the group in response to a send ack, don't ++ * add it to the array here. */ ++ if (rc || sock->cb_fn == NULL) { ++ continue; ++ } ++ } ++#endif ++ if ((events[i].events & EPOLLIN) == 0) { ++ continue; ++ } ++ ++#elif defined(SPDK_KEVENT) ++ sock = events[i].udata; ++ psock = __posix_sock(sock); ++#endif ++ ++ /* If the socket is not already in the list, add it now */ ++ if (!psock->socket_has_data && !psock->pipe_has_data) { ++ TAILQ_INSERT_TAIL(&group->socks_with_data, psock, link); ++ } ++ psock->socket_has_data = true; ++ } ++ ++ num_events = 0; ++ ++ TAILQ_FOREACH_SAFE(psock, &group->socks_with_data, link, ptmp) { ++ if (num_events == max_events) { ++ break; ++ } ++ ++ /* If the socket's cb_fn is NULL, just remove it from the ++ * list and do not add it to socks array */ ++ if (spdk_unlikely(psock->base.cb_fn == NULL)) { ++ psock->socket_has_data = false; ++ psock->pipe_has_data = false; ++ TAILQ_REMOVE(&group->socks_with_data, psock, link); ++ continue; ++ } ++ ++ socks[num_events++] = &psock->base; ++ } ++ ++ /* Cycle the has_data list so that each time we poll things aren't ++ * in the same order. Say we have 6 sockets in the list, named as follows: ++ * A B C D E F ++ * And all 6 sockets had epoll events, but max_events is only 3. That means ++ * psock currently points at D. We want to rearrange the list to the following: ++ * D E F A B C ++ * ++ * The variables below are named according to this example to make it easier to ++ * follow the swaps. 
++ */ ++ if (psock != NULL) { ++ struct spdk_posix_sock *pa, *pc, *pd, *pf; ++ ++ /* Capture pointers to the elements we need */ ++ pd = psock; ++ pc = TAILQ_PREV(pd, spdk_has_data_list, link); ++ pa = TAILQ_FIRST(&group->socks_with_data); ++ pf = TAILQ_LAST(&group->socks_with_data, spdk_has_data_list); ++ ++ /* Break the link between C and D */ ++ pc->link.tqe_next = NULL; ++ ++ /* Connect F to A */ ++ pf->link.tqe_next = pa; ++ pa->link.tqe_prev = &pf->link.tqe_next; ++ ++ /* Fix up the list first/last pointers */ ++ group->socks_with_data.tqh_first = pd; ++ group->socks_with_data.tqh_last = &pc->link.tqe_next; ++ ++ /* D is in front of the list, make tqe prev pointer point to the head of list */ ++ pd->link.tqe_prev = &group->socks_with_data.tqh_first; ++ } ++ ++ return num_events; ++} ++ ++static int ++posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) ++{ ++ struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); ++ int rc; ++ ++ if (g_spdk_posix_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { ++ spdk_sock_map_release(&g_map, spdk_env_get_current_core()); ++ } ++ ++ rc = close(group->fd); ++ free(group); ++ return rc; ++} ++ ++static struct spdk_net_impl g_posix_net_impl = { ++ .name = "posix", ++ .getaddr = posix_sock_getaddr, ++ .connect = posix_sock_connect, ++ .listen = posix_sock_listen, ++ .accept = posix_sock_accept, ++ .close = posix_sock_close, ++ .recv = posix_sock_recv, ++ .readv = posix_sock_readv, ++ .readv_async = posix_sock_readv_async, ++ .writev = posix_sock_writev, ++ .writev_async = posix_sock_writev_async, ++ .flush = posix_sock_flush, ++ .set_recvlowat = posix_sock_set_recvlowat, ++ .set_recvbuf = posix_sock_set_recvbuf, ++ .set_sendbuf = posix_sock_set_sendbuf, ++ .is_ipv6 = posix_sock_is_ipv6, ++ .is_ipv4 = posix_sock_is_ipv4, ++ .is_connected = posix_sock_is_connected, ++ .group_impl_get_optimal = posix_sock_group_impl_get_optimal, ++ .group_impl_create = posix_sock_group_impl_create, ++ .group_impl_add_sock = posix_sock_group_impl_add_sock, ++ .group_impl_remove_sock = posix_sock_group_impl_remove_sock, ++ .group_impl_poll = posix_sock_group_impl_poll, ++ .group_impl_close = posix_sock_group_impl_close, ++ .get_opts = posix_sock_impl_get_opts, ++ .set_opts = posix_sock_impl_set_opts, ++}; ++ ++SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl, DEFAULT_SOCK_PRIORITY + 1); ++ ++static struct spdk_sock * ++ssl_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts, true); ++} ++ ++static struct spdk_sock * ++ssl_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts, true); ++} ++ ++static struct spdk_sock * ++ssl_sock_accept(struct spdk_sock *_sock) ++{ ++ return _posix_sock_accept(_sock, true); ++} ++ ++static struct spdk_net_impl g_ssl_net_impl = { ++ .name = "ssl", ++ .getaddr = posix_sock_getaddr, ++ .connect = ssl_sock_connect, ++ .listen = ssl_sock_listen, ++ .accept = ssl_sock_accept, ++ .close = posix_sock_close, ++ .recv = posix_sock_recv, ++ .readv = posix_sock_readv, ++ .writev = posix_sock_writev, ++ .writev_async = posix_sock_writev_async, ++ .flush = posix_sock_flush, ++ .set_recvlowat = posix_sock_set_recvlowat, ++ .set_recvbuf = posix_sock_set_recvbuf, ++ .set_sendbuf = posix_sock_set_sendbuf, ++ .is_ipv6 = posix_sock_is_ipv6, ++ .is_ipv4 = posix_sock_is_ipv4, ++ .is_connected = posix_sock_is_connected, ++ .group_impl_get_optimal = 
posix_sock_group_impl_get_optimal, ++ .group_impl_create = posix_sock_group_impl_create, ++ .group_impl_add_sock = posix_sock_group_impl_add_sock, ++ .group_impl_remove_sock = posix_sock_group_impl_remove_sock, ++ .group_impl_poll = posix_sock_group_impl_poll, ++ .group_impl_close = posix_sock_group_impl_close, ++ .get_opts = posix_sock_impl_get_opts, ++ .set_opts = posix_sock_impl_set_opts, ++}; ++ ++SPDK_NET_IMPL_REGISTER(ssl, &g_ssl_net_impl, DEFAULT_SOCK_PRIORITY); ++SPDK_LOG_REGISTER_COMPONENT(sock_posix) +diff --git a/module/sock/sock_kernel.h b/module/sock/sock_kernel.h +index a9fc874..db512c8 100644 +--- a/module/sock/sock_kernel.h ++++ b/module/sock/sock_kernel.h +@@ -1,81 +1,81 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2021 Intel Corporation. All rights reserved. +- */ +- +-static int +-get_addr_str(struct sockaddr *sa, char *host, size_t hlen) +-{ +- const char *result = NULL; +- +- if (sa == NULL || host == NULL) { +- return -1; +- } +- +- switch (sa->sa_family) { +- case AF_INET: +- result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr), +- host, hlen); +- break; +- case AF_INET6: +- result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr), +- host, hlen); +- break; +- default: +- break; +- } +- +- if (result != NULL) { +- return 0; +- } else { +- return -1; +- } +-} +- +-static bool +-sock_is_loopback(int fd) +-{ +- struct ifaddrs *addrs, *tmp; +- struct sockaddr_storage sa = {}; +- socklen_t salen; +- struct ifreq ifr = {}; +- char ip_addr[256], ip_addr_tmp[256]; +- int rc; +- bool is_loopback = false; +- +- salen = sizeof(sa); +- rc = getsockname(fd, (struct sockaddr *)&sa, &salen); +- if (rc != 0) { +- return is_loopback; +- } +- +- memset(ip_addr, 0, sizeof(ip_addr)); +- rc = get_addr_str((struct sockaddr *)&sa, ip_addr, sizeof(ip_addr)); +- if (rc != 0) { +- return is_loopback; +- } +- +- getifaddrs(&addrs); +- for (tmp = addrs; tmp != NULL; tmp = tmp->ifa_next) { +- if (tmp->ifa_addr && (tmp->ifa_flags & IFF_UP) && +- (tmp->ifa_addr->sa_family == sa.ss_family)) { +- memset(ip_addr_tmp, 0, sizeof(ip_addr_tmp)); +- rc = get_addr_str(tmp->ifa_addr, ip_addr_tmp, sizeof(ip_addr_tmp)); +- if (rc != 0) { +- continue; +- } +- +- if (strncmp(ip_addr, ip_addr_tmp, sizeof(ip_addr)) == 0) { +- memcpy(ifr.ifr_name, tmp->ifa_name, sizeof(ifr.ifr_name)); +- ioctl(fd, SIOCGIFFLAGS, &ifr); +- if (ifr.ifr_flags & IFF_LOOPBACK) { +- is_loopback = true; +- } +- goto end; +- } +- } +- } +- +-end: +- freeifaddrs(addrs); +- return is_loopback; +-} ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2021 Intel Corporation. All rights reserved. 
++ */ ++ ++static int ++get_addr_str(struct sockaddr *sa, char *host, size_t hlen) ++{ ++ const char *result = NULL; ++ ++ if (sa == NULL || host == NULL) { ++ return -1; ++ } ++ ++ switch (sa->sa_family) { ++ case AF_INET: ++ result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr), ++ host, hlen); ++ break; ++ case AF_INET6: ++ result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr), ++ host, hlen); ++ break; ++ default: ++ break; ++ } ++ ++ if (result != NULL) { ++ return 0; ++ } else { ++ return -1; ++ } ++} ++ ++static bool ++sock_is_loopback(int fd) ++{ ++ struct ifaddrs *addrs, *tmp; ++ struct sockaddr_storage sa = {}; ++ socklen_t salen; ++ struct ifreq ifr = {}; ++ char ip_addr[256], ip_addr_tmp[256]; ++ int rc; ++ bool is_loopback = false; ++ ++ salen = sizeof(sa); ++ rc = getsockname(fd, (struct sockaddr *)&sa, &salen); ++ if (rc != 0) { ++ return is_loopback; ++ } ++ ++ memset(ip_addr, 0, sizeof(ip_addr)); ++ rc = get_addr_str((struct sockaddr *)&sa, ip_addr, sizeof(ip_addr)); ++ if (rc != 0) { ++ return is_loopback; ++ } ++ ++ getifaddrs(&addrs); ++ for (tmp = addrs; tmp != NULL; tmp = tmp->ifa_next) { ++ if (tmp->ifa_addr && (tmp->ifa_flags & IFF_UP) && ++ (tmp->ifa_addr->sa_family == sa.ss_family)) { ++ memset(ip_addr_tmp, 0, sizeof(ip_addr_tmp)); ++ rc = get_addr_str(tmp->ifa_addr, ip_addr_tmp, sizeof(ip_addr_tmp)); ++ if (rc != 0) { ++ continue; ++ } ++ ++ if (strncmp(ip_addr, ip_addr_tmp, sizeof(ip_addr)) == 0) { ++ memcpy(ifr.ifr_name, tmp->ifa_name, sizeof(ifr.ifr_name)); ++ ioctl(fd, SIOCGIFFLAGS, &ifr); ++ if (ifr.ifr_flags & IFF_LOOPBACK) { ++ is_loopback = true; ++ } ++ goto end; ++ } ++ } ++ } ++ ++end: ++ freeifaddrs(addrs); ++ return is_loopback; ++} +diff --git a/module/sock/uring/Makefile b/module/sock/uring/Makefile +index c506d27..d6fbbe2 100644 +--- a/module/sock/uring/Makefile ++++ b/module/sock/uring/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2015 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 3 +-SO_MINOR := 0 +- +-LIBNAME = sock_uring +-C_SRCS = uring.c +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2015 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 3 ++SO_MINOR := 0 ++ ++LIBNAME = sock_uring ++C_SRCS = uring.c ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/sock/uring/uring.c b/module/sock/uring/uring.c +index 60515cd..4415fec 100644 +--- a/module/sock/uring/uring.c ++++ b/module/sock/uring/uring.c +@@ -1,1684 +1,1684 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2019 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/stdinc.h" +-#include "spdk/config.h" +- +-#include +-#include +-#include +- +-#include "spdk/barrier.h" +-#include "spdk/env.h" +-#include "spdk/log.h" +-#include "spdk/pipe.h" +-#include "spdk/sock.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +- +-#include "spdk_internal/sock.h" +-#include "spdk_internal/assert.h" +-#include "../sock_kernel.h" +- +-#define MAX_TMPBUF 1024 +-#define PORTNUMLEN 32 +-#define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096 +-#define SPDK_SOCK_CMG_INFO_SIZE (sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)) +- +-enum spdk_sock_task_type { +- SPDK_SOCK_TASK_POLLIN = 0, +- SPDK_SOCK_TASK_ERRQUEUE, +- SPDK_SOCK_TASK_WRITE, +- SPDK_SOCK_TASK_CANCEL, +-}; +- +-#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) +-#define SPDK_ZEROCOPY +-#endif +- +-enum spdk_uring_sock_task_status { +- SPDK_URING_SOCK_TASK_NOT_IN_USE = 0, +- SPDK_URING_SOCK_TASK_IN_PROCESS, +-}; +- +-struct spdk_uring_task { +- enum spdk_uring_sock_task_status status; +- enum spdk_sock_task_type type; +- struct spdk_uring_sock *sock; +- struct msghdr msg; +- struct iovec iovs[IOV_BATCH_SIZE]; +- int iov_cnt; +- struct spdk_sock_request *last_req; +- bool is_zcopy; +- STAILQ_ENTRY(spdk_uring_task) link; +-}; +- +-struct spdk_uring_sock { +- struct spdk_sock base; +- int fd; +- uint32_t sendmsg_idx; +- struct spdk_uring_sock_group_impl *group; +- struct spdk_uring_task write_task; +- struct spdk_uring_task errqueue_task; +- struct spdk_uring_task pollin_task; +- struct spdk_uring_task cancel_task; +- struct spdk_pipe *recv_pipe; +- void *recv_buf; +- int recv_buf_sz; +- bool zcopy; +- bool pending_recv; +- int zcopy_send_flags; +- int connection_status; +- int placement_id; +- uint8_t buf[SPDK_SOCK_CMG_INFO_SIZE]; +- TAILQ_ENTRY(spdk_uring_sock) link; +-}; +- +-TAILQ_HEAD(pending_recv_list, spdk_uring_sock); +- +-struct spdk_uring_sock_group_impl { +- struct spdk_sock_group_impl base; +- struct io_uring uring; +- uint32_t io_inflight; +- uint32_t io_queued; +- uint32_t io_avail; +- struct pending_recv_list pending_recv; +-}; +- +-static struct spdk_sock_impl_opts g_spdk_uring_sock_impl_opts = { +- .recv_buf_size = MIN_SO_RCVBUF_SIZE, +- .send_buf_size = MIN_SO_SNDBUF_SIZE, +- .enable_recv_pipe = true, +- .enable_quickack = false, +- .enable_placement_id = PLACEMENT_NONE, +- .enable_zerocopy_send_server = false, +- .enable_zerocopy_send_client = false, +- .zerocopy_threshold = 0, +- .tls_version = 0, +- .enable_ktls = false, +- .psk_key = NULL, +- .psk_identity = NULL +-}; +- +-static struct spdk_sock_map g_map = { +- .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), +- .mtx = PTHREAD_MUTEX_INITIALIZER +-}; +- +-__attribute((destructor)) static void +-uring_sock_map_cleanup(void) +-{ +- spdk_sock_map_cleanup(&g_map); +-} +- +-#define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct spdk_sock_request))) +- +-#define __uring_sock(sock) (struct spdk_uring_sock *)sock +-#define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group +- +-static void +-uring_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, +- size_t len) +-{ +-#define FIELD_OK(field) \ +- offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len +- +-#define SET_FIELD(field) \ +- if (FIELD_OK(field)) { \ +- dest->field = src->field; \ +- } +- +- SET_FIELD(recv_buf_size); +- SET_FIELD(send_buf_size); +- SET_FIELD(enable_recv_pipe); +- SET_FIELD(enable_quickack); +- SET_FIELD(enable_placement_id); +- 
SET_FIELD(enable_zerocopy_send_server); +- SET_FIELD(enable_zerocopy_send_client); +- SET_FIELD(zerocopy_threshold); +- SET_FIELD(tls_version); +- SET_FIELD(enable_ktls); +- SET_FIELD(psk_key); +- SET_FIELD(psk_identity); +- +-#undef SET_FIELD +-#undef FIELD_OK +-} +- +-static int +-uring_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) +-{ +- if (!opts || !len) { +- errno = EINVAL; +- return -1; +- } +- +- assert(sizeof(*opts) >= *len); +- memset(opts, 0, *len); +- +- uring_sock_copy_impl_opts(opts, &g_spdk_uring_sock_impl_opts, *len); +- *len = spdk_min(*len, sizeof(g_spdk_uring_sock_impl_opts)); +- +- return 0; +-} +- +-static int +-uring_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) +-{ +- if (!opts) { +- errno = EINVAL; +- return -1; +- } +- +- assert(sizeof(*opts) >= len); +- uring_sock_copy_impl_opts(&g_spdk_uring_sock_impl_opts, opts, len); +- +- return 0; +-} +- +-static void +-uring_opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest) +-{ +- /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ +- memcpy(dest, &g_spdk_uring_sock_impl_opts, sizeof(*dest)); +- +- if (opts->impl_opts != NULL) { +- assert(sizeof(*dest) >= opts->impl_opts_size); +- uring_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); +- } +-} +- +-static int +-uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, +- char *caddr, int clen, uint16_t *cport) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return -1; +- } +- +- switch (sa.ss_family) { +- case AF_UNIX: +- /* Acceptable connection types that don't have IPs */ +- return 0; +- case AF_INET: +- case AF_INET6: +- /* Code below will get IP addresses */ +- break; +- default: +- /* Unsupported socket family */ +- return -1; +- } +- +- rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); +- if (rc != 0) { +- SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); +- return -1; +- } +- +- if (sport) { +- if (sa.ss_family == AF_INET) { +- *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); +- } else if (sa.ss_family == AF_INET6) { +- *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); +- } +- } +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); +- return -1; +- } +- +- rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); +- if (rc != 0) { +- SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); +- return -1; +- } +- +- if (cport) { +- if (sa.ss_family == AF_INET) { +- *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); +- } else if (sa.ss_family == AF_INET6) { +- *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); +- } +- } +- +- return 0; +-} +- +-enum uring_sock_create_type { +- SPDK_SOCK_CREATE_LISTEN, +- SPDK_SOCK_CREATE_CONNECT, +-}; +- +-static int +-uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz) +-{ +- uint8_t *new_buf; +- struct spdk_pipe *new_pipe; +- struct iovec siov[2]; +- struct iovec diov[2]; +- int sbytes; +- ssize_t bytes; +- +- if (sock->recv_buf_sz == sz) { +- return 0; +- } +- +- /* If the new size is 0, just 
free the pipe */ +- if (sz == 0) { +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- sock->recv_pipe = NULL; +- sock->recv_buf = NULL; +- return 0; +- } else if (sz < MIN_SOCK_PIPE_SIZE) { +- SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); +- return -1; +- } +- +- /* Round up to next 64 byte multiple */ +- new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t)); +- if (!new_buf) { +- SPDK_ERRLOG("socket recv buf allocation failed\n"); +- return -ENOMEM; +- } +- +- new_pipe = spdk_pipe_create(new_buf, sz + 1); +- if (new_pipe == NULL) { +- SPDK_ERRLOG("socket pipe allocation failed\n"); +- free(new_buf); +- return -ENOMEM; +- } +- +- if (sock->recv_pipe != NULL) { +- /* Pull all of the data out of the old pipe */ +- sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); +- if (sbytes > sz) { +- /* Too much data to fit into the new pipe size */ +- spdk_pipe_destroy(new_pipe); +- free(new_buf); +- return -EINVAL; +- } +- +- sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); +- assert(sbytes == sz); +- +- bytes = spdk_iovcpy(siov, 2, diov, 2); +- spdk_pipe_writer_advance(new_pipe, bytes); +- +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- } +- +- sock->recv_buf_sz = sz; +- sock->recv_buf = new_buf; +- sock->recv_pipe = new_pipe; +- +- return 0; +-} +- +-static int +-uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- int min_size; +- int rc; +- +- assert(sock != NULL); +- +- if (_sock->impl_opts.enable_recv_pipe) { +- rc = uring_sock_alloc_pipe(sock, sz); +- if (rc) { +- SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock); +- return rc; +- } +- } +- +- /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and +- * g_spdk_uring_sock_impl_opts.recv_buf_size. */ +- min_size = spdk_max(MIN_SO_RCVBUF_SIZE, g_spdk_uring_sock_impl_opts.recv_buf_size); +- +- if (sz < min_size) { +- sz = min_size; +- } +- +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); +- if (rc < 0) { +- return rc; +- } +- +- _sock->impl_opts.recv_buf_size = sz; +- +- return 0; +-} +- +-static int +-uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- int min_size; +- int rc; +- +- assert(sock != NULL); +- +- /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and +- * g_spdk_uring_sock_impl_opts.seend_buf_size. 
*/ +- min_size = spdk_max(MIN_SO_SNDBUF_SIZE, g_spdk_uring_sock_impl_opts.send_buf_size); +- +- if (sz < min_size) { +- sz = min_size; +- } +- +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); +- if (rc < 0) { +- return rc; +- } +- +- _sock->impl_opts.send_buf_size = sz; +- +- return 0; +-} +- +-static struct spdk_uring_sock * +-uring_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) +-{ +- struct spdk_uring_sock *sock; +-#if defined(__linux__) +- int flag; +- int rc; +-#endif +- +- sock = calloc(1, sizeof(*sock)); +- if (sock == NULL) { +- SPDK_ERRLOG("sock allocation failed\n"); +- return NULL; +- } +- +- sock->fd = fd; +- memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); +- +-#if defined(__linux__) +- flag = 1; +- +- if (sock->base.impl_opts.enable_quickack) { +- rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); +- if (rc != 0) { +- SPDK_ERRLOG("quickack was failed to set\n"); +- } +- } +- +- spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, +- &sock->placement_id); +-#ifdef SPDK_ZEROCOPY +- /* Try to turn on zero copy sends */ +- flag = 1; +- +- if (enable_zero_copy) { +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); +- if (rc == 0) { +- sock->zcopy = true; +- sock->zcopy_send_flags = MSG_ZEROCOPY; +- } +- } +-#endif +-#endif +- +- return sock; +-} +- +-static struct spdk_sock * +-uring_sock_create(const char *ip, int port, +- enum uring_sock_create_type type, +- struct spdk_sock_opts *opts) +-{ +- struct spdk_uring_sock *sock; +- struct spdk_sock_impl_opts impl_opts; +- char buf[MAX_TMPBUF]; +- char portnum[PORTNUMLEN]; +- char *p; +- struct addrinfo hints, *res, *res0; +- int fd, flag; +- int val = 1; +- int rc; +- bool enable_zcopy_impl_opts = false; +- bool enable_zcopy_user_opts = true; +- +- assert(opts != NULL); +- uring_opts_get_impl_opts(opts, &impl_opts); +- +- if (ip == NULL) { +- return NULL; +- } +- if (ip[0] == '[') { +- snprintf(buf, sizeof(buf), "%s", ip + 1); +- p = strchr(buf, ']'); +- if (p != NULL) { +- *p = '\0'; +- } +- ip = (const char *) &buf[0]; +- } +- +- snprintf(portnum, sizeof portnum, "%d", port); +- memset(&hints, 0, sizeof hints); +- hints.ai_family = PF_UNSPEC; +- hints.ai_socktype = SOCK_STREAM; +- hints.ai_flags = AI_NUMERICSERV; +- hints.ai_flags |= AI_PASSIVE; +- hints.ai_flags |= AI_NUMERICHOST; +- rc = getaddrinfo(ip, portnum, &hints, &res0); +- if (rc != 0) { +- SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); +- return NULL; +- } +- +- /* try listen */ +- fd = -1; +- for (res = res0; res != NULL; res = res->ai_next) { +-retry: +- fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); +- if (fd < 0) { +- /* error */ +- continue; +- } +- +- val = impl_opts.recv_buf_size; +- rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val); +- if (rc) { +- /* Not fatal */ +- } +- +- val = impl_opts.send_buf_size; +- rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val); +- if (rc) { +- /* Not fatal */ +- } +- +- rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- fd = -1; +- /* error */ +- continue; +- } +- rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- fd = -1; +- /* error */ +- continue; +- } +- +- if (opts->ack_timeout) { +-#if defined(__linux__) +- val = opts->ack_timeout; +- rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &val, sizeof val); +- if (rc != 0) { +- 
close(fd); +- fd = -1; +- /* error */ +- continue; +- } +-#else +- SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); +-#endif +- } +- +- +- +-#if defined(SO_PRIORITY) +- if (opts != NULL && opts->priority) { +- rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); +- if (rc != 0) { +- close(fd); +- fd = -1; +- /* error */ +- continue; +- } +- } +-#endif +- if (res->ai_family == AF_INET6) { +- rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); +- if (rc != 0) { +- close(fd); +- fd = -1; +- /* error */ +- continue; +- } +- } +- +- if (type == SPDK_SOCK_CREATE_LISTEN) { +- rc = bind(fd, res->ai_addr, res->ai_addrlen); +- if (rc != 0) { +- SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); +- switch (errno) { +- case EINTR: +- /* interrupted? */ +- close(fd); +- goto retry; +- case EADDRNOTAVAIL: +- SPDK_ERRLOG("IP address %s not available. " +- "Verify IP address in config file " +- "and make sure setup script is " +- "run before starting spdk app.\n", ip); +- /* FALLTHROUGH */ +- default: +- /* try next family */ +- close(fd); +- fd = -1; +- continue; +- } +- } +- /* bind OK */ +- rc = listen(fd, 512); +- if (rc != 0) { +- SPDK_ERRLOG("listen() failed, errno = %d\n", errno); +- close(fd); +- fd = -1; +- break; +- } +- +- flag = fcntl(fd, F_GETFL); +- if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { +- SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); +- close(fd); +- fd = -1; +- break; +- } +- +- enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; +- } else if (type == SPDK_SOCK_CREATE_CONNECT) { +- rc = connect(fd, res->ai_addr, res->ai_addrlen); +- if (rc != 0) { +- SPDK_ERRLOG("connect() failed, errno = %d\n", errno); +- /* try next family */ +- close(fd); +- fd = -1; +- continue; +- } +- +- flag = fcntl(fd, F_GETFL); +- if (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0) { +- SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno); +- close(fd); +- fd = -1; +- break; +- } +- +- enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; +- } +- break; +- } +- freeaddrinfo(res0); +- +- if (fd < 0) { +- return NULL; +- } +- +- enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd); +- sock = uring_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); +- if (sock == NULL) { +- SPDK_ERRLOG("sock allocation failed\n"); +- close(fd); +- return NULL; +- } +- +- return &sock->base; +-} +- +-static struct spdk_sock * +-uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts); +-} +- +-static struct spdk_sock * +-uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) +-{ +- return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts); +-} +- +-static struct spdk_sock * +-uring_sock_accept(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc, fd; +- struct spdk_uring_sock *new_sock; +- int flag; +- +- memset(&sa, 0, sizeof(sa)); +- salen = sizeof(sa); +- +- assert(sock != NULL); +- +- rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); +- +- if (rc == -1) { +- return NULL; +- } +- +- fd = rc; +- +- flag = fcntl(fd, F_GETFL); +- if ((flag & O_NONBLOCK) && (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0)) { +- SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno); +- close(fd); +- return NULL; +- } +- +-#if 
defined(SO_PRIORITY) +- /* The priority is not inherited, so call this function again */ +- if (sock->base.opts.priority) { +- rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); +- if (rc != 0) { +- close(fd); +- return NULL; +- } +- } +-#endif +- +- new_sock = uring_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); +- if (new_sock == NULL) { +- close(fd); +- return NULL; +- } +- +- return &new_sock->base; +-} +- +-static int +-uring_sock_close(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- +- assert(TAILQ_EMPTY(&_sock->pending_reqs)); +- assert(sock->group == NULL); +- +- /* If the socket fails to close, the best choice is to +- * leak the fd but continue to free the rest of the sock +- * memory. */ +- close(sock->fd); +- +- spdk_pipe_destroy(sock->recv_pipe); +- free(sock->recv_buf); +- free(sock); +- +- return 0; +-} +- +-static ssize_t +-uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt) +-{ +- struct iovec siov[2]; +- int sbytes; +- ssize_t bytes; +- struct spdk_uring_sock_group_impl *group; +- +- sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); +- if (sbytes < 0) { +- errno = EINVAL; +- return -1; +- } else if (sbytes == 0) { +- errno = EAGAIN; +- return -1; +- } +- +- bytes = spdk_iovcpy(siov, 2, diov, diovcnt); +- +- if (bytes == 0) { +- /* The only way this happens is if diov is 0 length */ +- errno = EINVAL; +- return -1; +- } +- +- spdk_pipe_reader_advance(sock->recv_pipe, bytes); +- +- /* If we drained the pipe, take it off the level-triggered list */ +- if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { +- group = __uring_group_impl(sock->base.group_impl); +- TAILQ_REMOVE(&group->pending_recv, sock, link); +- sock->pending_recv = false; +- } +- +- return bytes; +-} +- +-static inline ssize_t +-sock_readv(int fd, struct iovec *iov, int iovcnt) +-{ +- struct msghdr msg = { +- .msg_iov = iov, +- .msg_iovlen = iovcnt, +- }; +- +- return recvmsg(fd, &msg, MSG_DONTWAIT); +-} +- +-static inline ssize_t +-uring_sock_read(struct spdk_uring_sock *sock) +-{ +- struct iovec iov[2]; +- int bytes; +- struct spdk_uring_sock_group_impl *group; +- +- bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); +- +- if (bytes > 0) { +- bytes = sock_readv(sock->fd, iov, 2); +- if (bytes > 0) { +- spdk_pipe_writer_advance(sock->recv_pipe, bytes); +- if (sock->base.group_impl) { +- group = __uring_group_impl(sock->base.group_impl); +- TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); +- sock->pending_recv = true; +- } +- } +- } +- +- return bytes; +-} +- +-static ssize_t +-uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- int rc, i; +- size_t len; +- +- if (sock->recv_pipe == NULL) { +- return sock_readv(sock->fd, iov, iovcnt); +- } +- +- len = 0; +- for (i = 0; i < iovcnt; i++) { +- len += iov[i].iov_len; +- } +- +- if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { +- /* If the user is receiving a sufficiently large amount of data, +- * receive directly to their buffers. 
*/ +- if (len >= MIN_SOCK_PIPE_SIZE) { +- return sock_readv(sock->fd, iov, iovcnt); +- } +- +- /* Otherwise, do a big read into our pipe */ +- rc = uring_sock_read(sock); +- if (rc <= 0) { +- return rc; +- } +- } +- +- return uring_sock_recv_from_pipe(sock, iov, iovcnt); +-} +- +-static ssize_t +-uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len) +-{ +- struct iovec iov[1]; +- +- iov[0].iov_base = buf; +- iov[0].iov_len = len; +- +- return uring_sock_readv(sock, iov, 1); +-} +- +-static ssize_t +-uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct msghdr msg = { +- .msg_iov = iov, +- .msg_iovlen = iovcnt, +- }; +- +- if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { +- errno = EAGAIN; +- return -1; +- } +- +- return sendmsg(sock->fd, &msg, MSG_DONTWAIT); +-} +- +-static ssize_t +-sock_request_advance_offset(struct spdk_sock_request *req, ssize_t rc) +-{ +- unsigned int offset; +- size_t len; +- int i; +- +- offset = req->internal.offset; +- for (i = 0; i < req->iovcnt; i++) { +- /* Advance by the offset first */ +- if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { +- offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; +- continue; +- } +- +- /* Calculate the remaining length of this element */ +- len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; +- +- if (len > (size_t)rc) { +- req->internal.offset += rc; +- return -1; +- } +- +- offset = 0; +- req->internal.offset += len; +- rc -= len; +- } +- +- return rc; +-} +- +-static int +-sock_complete_write_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_sock_request *req; +- int retval; +- +- if (is_zcopy) { +- /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the +- * req->internal.offset, so sendmsg_idx should not be zero */ +- if (spdk_unlikely(sock->sendmsg_idx == UINT32_MAX)) { +- sock->sendmsg_idx = 1; +- } else { +- sock->sendmsg_idx++; +- } +- } +- +- /* Consume the requests that were actually written */ +- req = TAILQ_FIRST(&_sock->queued_reqs); +- while (req) { +- /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ +- req->internal.is_zcopy = is_zcopy; +- +- rc = sock_request_advance_offset(req, rc); +- if (rc < 0) { +- /* This element was partially sent. */ +- return 0; +- } +- +- /* Handled a full request. */ +- spdk_sock_request_pend(_sock, req); +- +- if (!req->internal.is_zcopy && req == TAILQ_FIRST(&_sock->pending_reqs)) { +- retval = spdk_sock_request_put(_sock, req, 0); +- if (retval) { +- return retval; +- } +- } else { +- /* Re-use the offset field to hold the sendmsg call index. The +- * index is 0 based, so subtract one here because we've already +- * incremented above. 
*/ +- req->internal.offset = sock->sendmsg_idx - 1; +- } +- +- if (rc == 0) { +- break; +- } +- +- req = TAILQ_FIRST(&_sock->queued_reqs); +- } +- +- return 0; +-} +- +-#ifdef SPDK_ZEROCOPY +-static int +-_sock_check_zcopy(struct spdk_sock *_sock, int status) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- ssize_t rc; +- struct sock_extended_err *serr; +- struct cmsghdr *cm; +- uint32_t idx; +- struct spdk_sock_request *req, *treq; +- bool found; +- +- assert(sock->zcopy == true); +- if (spdk_unlikely(status) < 0) { +- if (!TAILQ_EMPTY(&_sock->pending_reqs)) { +- SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries, status =%d\n", +- status); +- } else { +- SPDK_WARNLOG("Recvmsg yielded an error!\n"); +- } +- return 0; +- } +- +- cm = CMSG_FIRSTHDR(&sock->errqueue_task.msg); +- if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || +- (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR))) { +- SPDK_WARNLOG("Unexpected cmsg level or type!\n"); +- return 0; +- } +- +- serr = (struct sock_extended_err *)CMSG_DATA(cm); +- if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { +- SPDK_WARNLOG("Unexpected extended error origin\n"); +- return 0; +- } +- +- /* Most of the time, the pending_reqs array is in the exact +- * order we need such that all of the requests to complete are +- * in order, in the front. It is guaranteed that all requests +- * belonging to the same sendmsg call are sequential, so once +- * we encounter one match we can stop looping as soon as a +- * non-match is found. +- */ +- for (idx = serr->ee_info; idx <= serr->ee_data; idx++) { +- found = false; +- TAILQ_FOREACH_SAFE(req, &_sock->pending_reqs, internal.link, treq) { +- if (!req->internal.is_zcopy) { +- /* This wasn't a zcopy request. It was just waiting in line to complete */ +- rc = spdk_sock_request_put(_sock, req, 0); +- if (rc < 0) { +- return rc; +- } +- } else if (req->internal.offset == idx) { +- found = true; +- rc = spdk_sock_request_put(_sock, req, 0); +- if (rc < 0) { +- return rc; +- } +- } else if (found) { +- break; +- } +- } +- } +- +- return 0; +-} +- +-static void +-_sock_prep_errqueue(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_task *task = &sock->errqueue_task; +- struct io_uring_sqe *sqe; +- +- if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { +- return; +- } +- +- assert(sock->group != NULL); +- sock->group->io_queued++; +- +- sqe = io_uring_get_sqe(&sock->group->uring); +- io_uring_prep_recvmsg(sqe, sock->fd, &task->msg, MSG_ERRQUEUE); +- io_uring_sqe_set_data(sqe, task); +- task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; +-} +- +-#endif +- +-static void +-_sock_flush(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_task *task = &sock->write_task; +- uint32_t iovcnt; +- struct io_uring_sqe *sqe; +- int flags; +- +- if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { +- return; +- } +- +-#ifdef SPDK_ZEROCOPY +- if (sock->zcopy) { +- flags = MSG_DONTWAIT | sock->zcopy_send_flags; +- } else +-#endif +- { +- flags = MSG_DONTWAIT; +- } +- +- iovcnt = spdk_sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req, &flags); +- if (!iovcnt) { +- return; +- } +- +- task->iov_cnt = iovcnt; +- assert(sock->group != NULL); +- task->msg.msg_iov = task->iovs; +- task->msg.msg_iovlen = task->iov_cnt; +-#ifdef SPDK_ZEROCOPY +- task->is_zcopy = (flags & MSG_ZEROCOPY) ? 
true : false; +-#endif +- sock->group->io_queued++; +- +- sqe = io_uring_get_sqe(&sock->group->uring); +- io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, flags); +- io_uring_sqe_set_data(sqe, task); +- task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; +-} +- +-static void +-_sock_prep_pollin(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_task *task = &sock->pollin_task; +- struct io_uring_sqe *sqe; +- +- /* Do not prepare pollin event */ +- if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS || (sock->pending_recv && !sock->zcopy)) { +- return; +- } +- +- assert(sock->group != NULL); +- sock->group->io_queued++; +- +- sqe = io_uring_get_sqe(&sock->group->uring); +- io_uring_prep_poll_add(sqe, sock->fd, POLLIN | POLLERR); +- io_uring_sqe_set_data(sqe, task); +- task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; +-} +- +-static void +-_sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_task *task = &sock->cancel_task; +- struct io_uring_sqe *sqe; +- +- if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { +- return; +- } +- +- assert(sock->group != NULL); +- sock->group->io_queued++; +- +- sqe = io_uring_get_sqe(&sock->group->uring); +- io_uring_prep_cancel(sqe, user_data, 0); +- io_uring_sqe_set_data(sqe, task); +- task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; +-} +- +-static int +-sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events, +- struct spdk_sock **socks) +-{ +- int i, count, ret; +- struct io_uring_cqe *cqe; +- struct spdk_uring_sock *sock, *tmp; +- struct spdk_uring_task *task; +- int status; +- bool is_zcopy; +- +- for (i = 0; i < max; i++) { +- ret = io_uring_peek_cqe(&group->uring, &cqe); +- if (ret != 0) { +- break; +- } +- +- if (cqe == NULL) { +- break; +- } +- +- task = (struct spdk_uring_task *)cqe->user_data; +- assert(task != NULL); +- sock = task->sock; +- assert(sock != NULL); +- assert(sock->group != NULL); +- assert(sock->group == group); +- sock->group->io_inflight--; +- sock->group->io_avail++; +- status = cqe->res; +- io_uring_cqe_seen(&group->uring, cqe); +- +- task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE; +- +- if (spdk_unlikely(status <= 0)) { +- if (status == -EAGAIN || status == -EWOULDBLOCK || (status == -ENOBUFS && sock->zcopy)) { +- continue; +- } +- } +- +- switch (task->type) { +- case SPDK_SOCK_TASK_POLLIN: +-#ifdef SPDK_ZEROCOPY +- if ((status & POLLERR) == POLLERR) { +- _sock_prep_errqueue(&sock->base); +- } +-#endif +- if ((status & POLLIN) == POLLIN) { +- if (sock->base.cb_fn != NULL && +- sock->pending_recv == false) { +- sock->pending_recv = true; +- TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); +- } +- } +- break; +- case SPDK_SOCK_TASK_WRITE: +- task->last_req = NULL; +- task->iov_cnt = 0; +- is_zcopy = task->is_zcopy; +- task->is_zcopy = false; +- if (spdk_unlikely(status) < 0) { +- sock->connection_status = status; +- spdk_sock_abort_requests(&sock->base); +- } else { +- sock_complete_write_reqs(&sock->base, status, is_zcopy); +- } +- +- break; +-#ifdef SPDK_ZEROCOPY +- case SPDK_SOCK_TASK_ERRQUEUE: +- if (spdk_unlikely(status == -ECANCELED)) { +- sock->connection_status = status; +- break; +- } +- _sock_check_zcopy(&sock->base, status); +- break; +-#endif +- case SPDK_SOCK_TASK_CANCEL: +- /* Do nothing */ +- break; +- default: +- SPDK_UNREACHABLE(); +- } +- } +- +- if (!socks) { +- return 0; +- } +- count = 0; +- 
TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) { +- if (count == max_read_events) { +- break; +- } +- +- if (spdk_unlikely(sock->base.cb_fn == NULL) || +- (sock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0)) { +- sock->pending_recv = false; +- TAILQ_REMOVE(&group->pending_recv, sock, link); +- if (spdk_unlikely(sock->base.cb_fn == NULL)) { +- /* If the socket's cb_fn is NULL, do not add it to socks array */ +- continue; +- } +- } +- +- socks[count++] = &sock->base; +- } +- +- +- /* Cycle the pending_recv list so that each time we poll things aren't +- * in the same order. Say we have 6 sockets in the list, named as follows: +- * A B C D E F +- * And all 6 sockets had the poll events, but max_events is only 3. That means +- * psock currently points at D. We want to rearrange the list to the following: +- * D E F A B C +- * +- * The variables below are named according to this example to make it easier to +- * follow the swaps. +- */ +- if (sock != NULL) { +- struct spdk_uring_sock *ua, *uc, *ud, *uf; +- +- /* Capture pointers to the elements we need */ +- ud = sock; +- +- ua = TAILQ_FIRST(&group->pending_recv); +- if (ua == ud) { +- goto end; +- } +- +- uf = TAILQ_LAST(&group->pending_recv, pending_recv_list); +- if (uf == ud) { +- TAILQ_REMOVE(&group->pending_recv, ud, link); +- TAILQ_INSERT_HEAD(&group->pending_recv, ud, link); +- goto end; +- } +- +- uc = TAILQ_PREV(ud, pending_recv_list, link); +- assert(uc != NULL); +- +- /* Break the link between C and D */ +- uc->link.tqe_next = NULL; +- +- /* Connect F to A */ +- uf->link.tqe_next = ua; +- ua->link.tqe_prev = &uf->link.tqe_next; +- +- /* Fix up the list first/last pointers */ +- group->pending_recv.tqh_first = ud; +- group->pending_recv.tqh_last = &uc->link.tqe_next; +- +- /* D is in front of the list, make tqe prev pointer point to the head of list */ +- ud->link.tqe_prev = &group->pending_recv.tqh_first; +- } +- +-end: +- return count; +-} +- +-static int uring_sock_flush(struct spdk_sock *_sock); +- +-static void +-uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- int rc; +- +- if (spdk_unlikely(sock->connection_status)) { +- req->cb_fn(req->cb_arg, sock->connection_status); +- return; +- } +- +- spdk_sock_request_queue(_sock, req); +- +- if (!sock->group) { +- if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) { +- rc = uring_sock_flush(_sock); +- if (rc < 0 && errno != EAGAIN) { +- spdk_sock_abort_requests(_sock); +- } +- } +- } +-} +- +-static void +-uring_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req) +-{ +- req->cb_fn(req->cb_arg, -ENOTSUP); +-} +- +-static int +-uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- int val; +- int rc; +- +- assert(sock != NULL); +- +- val = nbytes; +- rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); +- if (rc != 0) { +- return -1; +- } +- return 0; +-} +- +-static bool +-uring_sock_is_ipv6(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return false; +- } +- +- return (sa.ss_family == AF_INET6); +-} +- +-static bool 
+-uring_sock_is_ipv4(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct sockaddr_storage sa; +- socklen_t salen; +- int rc; +- +- assert(sock != NULL); +- +- memset(&sa, 0, sizeof sa); +- salen = sizeof sa; +- rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); +- if (rc != 0) { +- SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); +- return false; +- } +- +- return (sa.ss_family == AF_INET); +-} +- +-static bool +-uring_sock_is_connected(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- uint8_t byte; +- int rc; +- +- rc = recv(sock->fd, &byte, 1, MSG_PEEK | MSG_DONTWAIT); +- if (rc == 0) { +- return false; +- } +- +- if (rc < 0) { +- if (errno == EAGAIN || errno == EWOULDBLOCK) { +- return true; +- } +- +- return false; +- } +- +- return true; +-} +- +-static struct spdk_sock_group_impl * +-uring_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_sock_group_impl *group; +- +- if (sock->placement_id != -1) { +- spdk_sock_map_lookup(&g_map, sock->placement_id, &group, hint); +- return group; +- } +- +- return NULL; +-} +- +-static struct spdk_sock_group_impl * +-uring_sock_group_impl_create(void) +-{ +- struct spdk_uring_sock_group_impl *group_impl; +- +- group_impl = calloc(1, sizeof(*group_impl)); +- if (group_impl == NULL) { +- SPDK_ERRLOG("group_impl allocation failed\n"); +- return NULL; +- } +- +- group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH; +- +- if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) { +- SPDK_ERRLOG("uring I/O context setup failure\n"); +- free(group_impl); +- return NULL; +- } +- +- TAILQ_INIT(&group_impl->pending_recv); +- +- if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { +- spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); +- } +- +- return &group_impl->base; +-} +- +-static int +-uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, +- struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); +- int rc; +- +- sock->group = group; +- sock->write_task.sock = sock; +- sock->write_task.type = SPDK_SOCK_TASK_WRITE; +- +- sock->pollin_task.sock = sock; +- sock->pollin_task.type = SPDK_SOCK_TASK_POLLIN; +- +- sock->errqueue_task.sock = sock; +- sock->errqueue_task.type = SPDK_SOCK_TASK_ERRQUEUE; +- sock->errqueue_task.msg.msg_control = sock->buf; +- sock->errqueue_task.msg.msg_controllen = sizeof(sock->buf); +- +- sock->cancel_task.sock = sock; +- sock->cancel_task.type = SPDK_SOCK_TASK_CANCEL; +- +- /* switched from another polling group due to scheduling */ +- if (spdk_unlikely(sock->recv_pipe != NULL && +- (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { +- assert(sock->pending_recv == false); +- sock->pending_recv = true; +- TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); +- } +- +- if (sock->placement_id != -1) { +- rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); +- if (rc != 0) { +- SPDK_ERRLOG("Failed to insert sock group into map: %d", rc); +- /* Do not treat this as an error. The system will continue running. 
*/ +- } +- } +- +- return 0; +-} +- +-static int +-uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, +- struct spdk_sock **socks) +-{ +- struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); +- int count, ret; +- int to_complete, to_submit; +- struct spdk_sock *_sock, *tmp; +- struct spdk_uring_sock *sock; +- +- if (spdk_likely(socks)) { +- TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) { +- sock = __uring_sock(_sock); +- if (spdk_unlikely(sock->connection_status)) { +- continue; +- } +- _sock_flush(_sock); +- _sock_prep_pollin(_sock); +- } +- } +- +- to_submit = group->io_queued; +- +- /* For network I/O, it cannot be set with O_DIRECT, so we do not need to call spdk_io_uring_enter */ +- if (to_submit > 0) { +- /* If there are I/O to submit, use io_uring_submit here. +- * It will automatically call io_uring_enter appropriately. */ +- ret = io_uring_submit(&group->uring); +- if (ret < 0) { +- return 1; +- } +- group->io_queued = 0; +- group->io_inflight += to_submit; +- group->io_avail -= to_submit; +- } +- +- count = 0; +- to_complete = group->io_inflight; +- if (to_complete > 0 || !TAILQ_EMPTY(&group->pending_recv)) { +- count = sock_uring_group_reap(group, to_complete, max_events, socks); +- } +- +- return count; +-} +- +-static int +-uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, +- struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); +- +- if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { +- _sock_prep_cancel_task(_sock, &sock->write_task); +- /* Since spdk_sock_group_remove_sock is not asynchronous interface, so +- * currently can use a while loop here. */ +- while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || +- (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { +- uring_sock_group_impl_poll(_group, 32, NULL); +- } +- } +- +- if (sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { +- _sock_prep_cancel_task(_sock, &sock->pollin_task); +- /* Since spdk_sock_group_remove_sock is not asynchronous interface, so +- * currently can use a while loop here. */ +- while ((sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || +- (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { +- uring_sock_group_impl_poll(_group, 32, NULL); +- } +- } +- +- if (sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { +- _sock_prep_cancel_task(_sock, &sock->errqueue_task); +- /* Since spdk_sock_group_remove_sock is not asynchronous interface, so +- * currently can use a while loop here. 
*/ +- while ((sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || +- (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { +- uring_sock_group_impl_poll(_group, 32, NULL); +- } +- } +- +- /* Make sure the cancelling the tasks above didn't cause sending new requests */ +- assert(sock->write_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); +- assert(sock->pollin_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); +- assert(sock->errqueue_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); +- +- if (sock->pending_recv) { +- TAILQ_REMOVE(&group->pending_recv, sock, link); +- sock->pending_recv = false; +- } +- assert(sock->pending_recv == false); +- +- if (sock->placement_id != -1) { +- spdk_sock_map_release(&g_map, sock->placement_id); +- } +- +- sock->group = NULL; +- return 0; +-} +- +-static int +-uring_sock_group_impl_close(struct spdk_sock_group_impl *_group) +-{ +- struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); +- +- /* try to reap all the active I/O */ +- while (group->io_inflight) { +- uring_sock_group_impl_poll(_group, 32, NULL); +- } +- assert(group->io_inflight == 0); +- assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH); +- +- io_uring_queue_exit(&group->uring); +- +- if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { +- spdk_sock_map_release(&g_map, spdk_env_get_current_core()); +- } +- +- free(group); +- return 0; +-} +- +-static int +-uring_sock_flush(struct spdk_sock *_sock) +-{ +- struct spdk_uring_sock *sock = __uring_sock(_sock); +- struct msghdr msg = {}; +- struct iovec iovs[IOV_BATCH_SIZE]; +- int iovcnt; +- ssize_t rc; +- int flags = sock->zcopy_send_flags; +- int retval; +- bool is_zcopy = false; +- +- /* Can't flush from within a callback or we end up with recursive calls */ +- if (_sock->cb_cnt > 0) { +- errno = EAGAIN; +- return -1; +- } +- +- /* Can't flush while a write is already outstanding */ +- if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { +- errno = EAGAIN; +- return -1; +- } +- +- /* Gather an iov */ +- iovcnt = spdk_sock_prep_reqs(_sock, iovs, 0, NULL, &flags); +- if (iovcnt == 0) { +- /* Nothing to send */ +- return 0; +- } +- +- /* Perform the vectored write */ +- msg.msg_iov = iovs; +- msg.msg_iovlen = iovcnt; +- rc = sendmsg(sock->fd, &msg, flags | MSG_DONTWAIT); +- if (rc <= 0) { +- if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && sock->zcopy)) { +- errno = EAGAIN; +- } +- return -1; +- } +- +-#ifdef SPDK_ZEROCOPY +- is_zcopy = flags & MSG_ZEROCOPY; +-#endif +- retval = sock_complete_write_reqs(_sock, rc, is_zcopy); +- if (retval < 0) { +- /* if the socket is closed, return to avoid heap-use-after-free error */ +- errno = ENOTCONN; +- return -1; +- } +- +-#ifdef SPDK_ZEROCOPY +- if (sock->zcopy && !TAILQ_EMPTY(&_sock->pending_reqs)) { +- _sock_check_zcopy(_sock, 0); +- } +-#endif +- +- return rc; +-} +- +-static struct spdk_net_impl g_uring_net_impl = { +- .name = "uring", +- .getaddr = uring_sock_getaddr, +- .connect = uring_sock_connect, +- .listen = uring_sock_listen, +- .accept = uring_sock_accept, +- .close = uring_sock_close, +- .recv = uring_sock_recv, +- .readv = uring_sock_readv, +- .readv_async = uring_sock_readv_async, +- .writev = uring_sock_writev, +- .writev_async = uring_sock_writev_async, +- .flush = uring_sock_flush, +- .set_recvlowat = uring_sock_set_recvlowat, +- .set_recvbuf = uring_sock_set_recvbuf, +- .set_sendbuf = uring_sock_set_sendbuf, +- .is_ipv6 = uring_sock_is_ipv6, +- .is_ipv4 = uring_sock_is_ipv4, +- 
.is_connected = uring_sock_is_connected, +- .group_impl_get_optimal = uring_sock_group_impl_get_optimal, +- .group_impl_create = uring_sock_group_impl_create, +- .group_impl_add_sock = uring_sock_group_impl_add_sock, +- .group_impl_remove_sock = uring_sock_group_impl_remove_sock, +- .group_impl_poll = uring_sock_group_impl_poll, +- .group_impl_close = uring_sock_group_impl_close, +- .get_opts = uring_sock_impl_get_opts, +- .set_opts = uring_sock_impl_set_opts, +-}; +- +-SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 2); ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2019 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/stdinc.h" ++#include "spdk/config.h" ++ ++#include ++#include ++#include ++ ++#include "spdk/barrier.h" ++#include "spdk/env.h" ++#include "spdk/log.h" ++#include "spdk/pipe.h" ++#include "spdk/sock.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++ ++#include "spdk_internal/sock.h" ++#include "spdk_internal/assert.h" ++#include "../sock_kernel.h" ++ ++#define MAX_TMPBUF 1024 ++#define PORTNUMLEN 32 ++#define SPDK_SOCK_GROUP_QUEUE_DEPTH 4096 ++#define SPDK_SOCK_CMG_INFO_SIZE (sizeof(struct cmsghdr) + sizeof(struct sock_extended_err)) ++ ++enum spdk_sock_task_type { ++ SPDK_SOCK_TASK_POLLIN = 0, ++ SPDK_SOCK_TASK_ERRQUEUE, ++ SPDK_SOCK_TASK_WRITE, ++ SPDK_SOCK_TASK_CANCEL, ++}; ++ ++#if defined(SO_ZEROCOPY) && defined(MSG_ZEROCOPY) ++#define SPDK_ZEROCOPY ++#endif ++ ++enum spdk_uring_sock_task_status { ++ SPDK_URING_SOCK_TASK_NOT_IN_USE = 0, ++ SPDK_URING_SOCK_TASK_IN_PROCESS, ++}; ++ ++struct spdk_uring_task { ++ enum spdk_uring_sock_task_status status; ++ enum spdk_sock_task_type type; ++ struct spdk_uring_sock *sock; ++ struct msghdr msg; ++ struct iovec iovs[IOV_BATCH_SIZE]; ++ int iov_cnt; ++ struct spdk_sock_request *last_req; ++ bool is_zcopy; ++ STAILQ_ENTRY(spdk_uring_task) link; ++}; ++ ++struct spdk_uring_sock { ++ struct spdk_sock base; ++ int fd; ++ uint32_t sendmsg_idx; ++ struct spdk_uring_sock_group_impl *group; ++ struct spdk_uring_task write_task; ++ struct spdk_uring_task errqueue_task; ++ struct spdk_uring_task pollin_task; ++ struct spdk_uring_task cancel_task; ++ struct spdk_pipe *recv_pipe; ++ void *recv_buf; ++ int recv_buf_sz; ++ bool zcopy; ++ bool pending_recv; ++ int zcopy_send_flags; ++ int connection_status; ++ int placement_id; ++ uint8_t buf[SPDK_SOCK_CMG_INFO_SIZE]; ++ TAILQ_ENTRY(spdk_uring_sock) link; ++}; ++ ++TAILQ_HEAD(pending_recv_list, spdk_uring_sock); ++ ++struct spdk_uring_sock_group_impl { ++ struct spdk_sock_group_impl base; ++ struct io_uring uring; ++ uint32_t io_inflight; ++ uint32_t io_queued; ++ uint32_t io_avail; ++ struct pending_recv_list pending_recv; ++}; ++ ++static struct spdk_sock_impl_opts g_spdk_uring_sock_impl_opts = { ++ .recv_buf_size = MIN_SO_RCVBUF_SIZE, ++ .send_buf_size = MIN_SO_SNDBUF_SIZE, ++ .enable_recv_pipe = true, ++ .enable_quickack = false, ++ .enable_placement_id = PLACEMENT_NONE, ++ .enable_zerocopy_send_server = false, ++ .enable_zerocopy_send_client = false, ++ .zerocopy_threshold = 0, ++ .tls_version = 0, ++ .enable_ktls = false, ++ .psk_key = NULL, ++ .psk_identity = NULL ++}; ++ ++static struct spdk_sock_map g_map = { ++ .entries = STAILQ_HEAD_INITIALIZER(g_map.entries), ++ .mtx = PTHREAD_MUTEX_INITIALIZER ++}; ++ ++__attribute((destructor)) static void ++uring_sock_map_cleanup(void) ++{ ++ spdk_sock_map_cleanup(&g_map); ++} ++ ++#define SPDK_URING_SOCK_REQUEST_IOV(req) ((struct iovec *)((uint8_t *)req + sizeof(struct 
spdk_sock_request))) ++ ++#define __uring_sock(sock) (struct spdk_uring_sock *)sock ++#define __uring_group_impl(group) (struct spdk_uring_sock_group_impl *)group ++ ++static void ++uring_sock_copy_impl_opts(struct spdk_sock_impl_opts *dest, const struct spdk_sock_impl_opts *src, ++ size_t len) ++{ ++#define FIELD_OK(field) \ ++ offsetof(struct spdk_sock_impl_opts, field) + sizeof(src->field) <= len ++ ++#define SET_FIELD(field) \ ++ if (FIELD_OK(field)) { \ ++ dest->field = src->field; \ ++ } ++ ++ SET_FIELD(recv_buf_size); ++ SET_FIELD(send_buf_size); ++ SET_FIELD(enable_recv_pipe); ++ SET_FIELD(enable_quickack); ++ SET_FIELD(enable_placement_id); ++ SET_FIELD(enable_zerocopy_send_server); ++ SET_FIELD(enable_zerocopy_send_client); ++ SET_FIELD(zerocopy_threshold); ++ SET_FIELD(tls_version); ++ SET_FIELD(enable_ktls); ++ SET_FIELD(psk_key); ++ SET_FIELD(psk_identity); ++ ++#undef SET_FIELD ++#undef FIELD_OK ++} ++ ++static int ++uring_sock_impl_get_opts(struct spdk_sock_impl_opts *opts, size_t *len) ++{ ++ if (!opts || !len) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ assert(sizeof(*opts) >= *len); ++ memset(opts, 0, *len); ++ ++ uring_sock_copy_impl_opts(opts, &g_spdk_uring_sock_impl_opts, *len); ++ *len = spdk_min(*len, sizeof(g_spdk_uring_sock_impl_opts)); ++ ++ return 0; ++} ++ ++static int ++uring_sock_impl_set_opts(const struct spdk_sock_impl_opts *opts, size_t len) ++{ ++ if (!opts) { ++ errno = EINVAL; ++ return -1; ++ } ++ ++ assert(sizeof(*opts) >= len); ++ uring_sock_copy_impl_opts(&g_spdk_uring_sock_impl_opts, opts, len); ++ ++ return 0; ++} ++ ++static void ++uring_opts_get_impl_opts(const struct spdk_sock_opts *opts, struct spdk_sock_impl_opts *dest) ++{ ++ /* Copy the default impl_opts first to cover cases when user's impl_opts is smaller */ ++ memcpy(dest, &g_spdk_uring_sock_impl_opts, sizeof(*dest)); ++ ++ if (opts->impl_opts != NULL) { ++ assert(sizeof(*dest) >= opts->impl_opts_size); ++ uring_sock_copy_impl_opts(dest, opts->impl_opts, opts->impl_opts_size); ++ } ++} ++ ++static int ++uring_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, ++ char *caddr, int clen, uint16_t *cport) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ switch (sa.ss_family) { ++ case AF_UNIX: ++ /* Acceptable connection types that don't have IPs */ ++ return 0; ++ case AF_INET: ++ case AF_INET6: ++ /* Code below will get IP addresses */ ++ break; ++ default: ++ /* Unsupported socket family */ ++ return -1; ++ } ++ ++ rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ if (sport) { ++ if (sa.ss_family == AF_INET) { ++ *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); ++ } else if (sa.ss_family == AF_INET6) { ++ *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); ++ } ++ } ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); ++ return -1; ++ } ++ ++ rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); ++ return 
-1; ++ } ++ ++ if (cport) { ++ if (sa.ss_family == AF_INET) { ++ *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); ++ } else if (sa.ss_family == AF_INET6) { ++ *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); ++ } ++ } ++ ++ return 0; ++} ++ ++enum uring_sock_create_type { ++ SPDK_SOCK_CREATE_LISTEN, ++ SPDK_SOCK_CREATE_CONNECT, ++}; ++ ++static int ++uring_sock_alloc_pipe(struct spdk_uring_sock *sock, int sz) ++{ ++ uint8_t *new_buf; ++ struct spdk_pipe *new_pipe; ++ struct iovec siov[2]; ++ struct iovec diov[2]; ++ int sbytes; ++ ssize_t bytes; ++ ++ if (sock->recv_buf_sz == sz) { ++ return 0; ++ } ++ ++ /* If the new size is 0, just free the pipe */ ++ if (sz == 0) { ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ sock->recv_pipe = NULL; ++ sock->recv_buf = NULL; ++ return 0; ++ } else if (sz < MIN_SOCK_PIPE_SIZE) { ++ SPDK_ERRLOG("The size of the pipe must be larger than %d\n", MIN_SOCK_PIPE_SIZE); ++ return -1; ++ } ++ ++ /* Round up to next 64 byte multiple */ ++ new_buf = calloc(SPDK_ALIGN_CEIL(sz + 1, 64), sizeof(uint8_t)); ++ if (!new_buf) { ++ SPDK_ERRLOG("socket recv buf allocation failed\n"); ++ return -ENOMEM; ++ } ++ ++ new_pipe = spdk_pipe_create(new_buf, sz + 1); ++ if (new_pipe == NULL) { ++ SPDK_ERRLOG("socket pipe allocation failed\n"); ++ free(new_buf); ++ return -ENOMEM; ++ } ++ ++ if (sock->recv_pipe != NULL) { ++ /* Pull all of the data out of the old pipe */ ++ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); ++ if (sbytes > sz) { ++ /* Too much data to fit into the new pipe size */ ++ spdk_pipe_destroy(new_pipe); ++ free(new_buf); ++ return -EINVAL; ++ } ++ ++ sbytes = spdk_pipe_writer_get_buffer(new_pipe, sz, diov); ++ assert(sbytes == sz); ++ ++ bytes = spdk_iovcpy(siov, 2, diov, 2); ++ spdk_pipe_writer_advance(new_pipe, bytes); ++ ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ } ++ ++ sock->recv_buf_sz = sz; ++ sock->recv_buf = new_buf; ++ sock->recv_pipe = new_pipe; ++ ++ return 0; ++} ++ ++static int ++uring_sock_set_recvbuf(struct spdk_sock *_sock, int sz) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ int min_size; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ if (_sock->impl_opts.enable_recv_pipe) { ++ rc = uring_sock_alloc_pipe(sock, sz); ++ if (rc) { ++ SPDK_ERRLOG("unable to allocate sufficient recvbuf with sz=%d on sock=%p\n", sz, _sock); ++ return rc; ++ } ++ } ++ ++ /* Set kernel buffer size to be at least MIN_SO_RCVBUF_SIZE and ++ * g_spdk_uring_sock_impl_opts.recv_buf_size. */ ++ min_size = spdk_max(MIN_SO_RCVBUF_SIZE, g_spdk_uring_sock_impl_opts.recv_buf_size); ++ ++ if (sz < min_size) { ++ sz = min_size; ++ } ++ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)); ++ if (rc < 0) { ++ return rc; ++ } ++ ++ _sock->impl_opts.recv_buf_size = sz; ++ ++ return 0; ++} ++ ++static int ++uring_sock_set_sendbuf(struct spdk_sock *_sock, int sz) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ int min_size; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ /* Set kernel buffer size to be at least MIN_SO_SNDBUF_SIZE and ++ * g_spdk_uring_sock_impl_opts.seend_buf_size. 
*/ ++ min_size = spdk_max(MIN_SO_SNDBUF_SIZE, g_spdk_uring_sock_impl_opts.send_buf_size); ++ ++ if (sz < min_size) { ++ sz = min_size; ++ } ++ ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, &sz, sizeof(sz)); ++ if (rc < 0) { ++ return rc; ++ } ++ ++ _sock->impl_opts.send_buf_size = sz; ++ ++ return 0; ++} ++ ++static struct spdk_uring_sock * ++uring_sock_alloc(int fd, struct spdk_sock_impl_opts *impl_opts, bool enable_zero_copy) ++{ ++ struct spdk_uring_sock *sock; ++#if defined(__linux__) ++ int flag; ++ int rc; ++#endif ++ ++ sock = calloc(1, sizeof(*sock)); ++ if (sock == NULL) { ++ SPDK_ERRLOG("sock allocation failed\n"); ++ return NULL; ++ } ++ ++ sock->fd = fd; ++ memcpy(&sock->base.impl_opts, impl_opts, sizeof(*impl_opts)); ++ ++#if defined(__linux__) ++ flag = 1; ++ ++ if (sock->base.impl_opts.enable_quickack) { ++ rc = setsockopt(sock->fd, IPPROTO_TCP, TCP_QUICKACK, &flag, sizeof(flag)); ++ if (rc != 0) { ++ SPDK_ERRLOG("quickack was failed to set\n"); ++ } ++ } ++ ++ spdk_sock_get_placement_id(sock->fd, sock->base.impl_opts.enable_placement_id, ++ &sock->placement_id); ++#ifdef SPDK_ZEROCOPY ++ /* Try to turn on zero copy sends */ ++ flag = 1; ++ ++ if (enable_zero_copy) { ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_ZEROCOPY, &flag, sizeof(flag)); ++ if (rc == 0) { ++ sock->zcopy = true; ++ sock->zcopy_send_flags = MSG_ZEROCOPY; ++ } ++ } ++#endif ++#endif ++ ++ return sock; ++} ++ ++static struct spdk_sock * ++uring_sock_create(const char *ip, int port, ++ enum uring_sock_create_type type, ++ struct spdk_sock_opts *opts) ++{ ++ struct spdk_uring_sock *sock; ++ struct spdk_sock_impl_opts impl_opts; ++ char buf[MAX_TMPBUF]; ++ char portnum[PORTNUMLEN]; ++ char *p; ++ struct addrinfo hints, *res, *res0; ++ int fd, flag; ++ int val = 1; ++ int rc; ++ bool enable_zcopy_impl_opts = false; ++ bool enable_zcopy_user_opts = true; ++ ++ assert(opts != NULL); ++ uring_opts_get_impl_opts(opts, &impl_opts); ++ ++ if (ip == NULL) { ++ return NULL; ++ } ++ if (ip[0] == '[') { ++ snprintf(buf, sizeof(buf), "%s", ip + 1); ++ p = strchr(buf, ']'); ++ if (p != NULL) { ++ *p = '\0'; ++ } ++ ip = (const char *) &buf[0]; ++ } ++ ++ snprintf(portnum, sizeof portnum, "%d", port); ++ memset(&hints, 0, sizeof hints); ++ hints.ai_family = PF_UNSPEC; ++ hints.ai_socktype = SOCK_STREAM; ++ hints.ai_flags = AI_NUMERICSERV; ++ hints.ai_flags |= AI_PASSIVE; ++ hints.ai_flags |= AI_NUMERICHOST; ++ rc = getaddrinfo(ip, portnum, &hints, &res0); ++ if (rc != 0) { ++ SPDK_ERRLOG("getaddrinfo() failed %s (%d)\n", gai_strerror(rc), rc); ++ return NULL; ++ } ++ ++ /* try listen */ ++ fd = -1; ++ for (res = res0; res != NULL; res = res->ai_next) { ++retry: ++ fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); ++ if (fd < 0) { ++ /* error */ ++ continue; ++ } ++ ++ val = impl_opts.recv_buf_size; ++ rc = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof val); ++ if (rc) { ++ /* Not fatal */ ++ } ++ ++ val = impl_opts.send_buf_size; ++ rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof val); ++ if (rc) { ++ /* Not fatal */ ++ } ++ ++ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ fd = -1; ++ /* error */ ++ continue; ++ } ++ rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ fd = -1; ++ /* error */ ++ continue; ++ } ++ ++ if (opts->ack_timeout) { ++#if defined(__linux__) ++ val = opts->ack_timeout; ++ rc = setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &val, sizeof val); ++ if (rc != 0) { ++ 
close(fd); ++ fd = -1; ++ /* error */ ++ continue; ++ } ++#else ++ SPDK_WARNLOG("TCP_USER_TIMEOUT is not supported.\n"); ++#endif ++ } ++ ++ ++ ++#if defined(SO_PRIORITY) ++ if (opts != NULL && opts->priority) { ++ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opts->priority, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ fd = -1; ++ /* error */ ++ continue; ++ } ++ } ++#endif ++ if (res->ai_family == AF_INET6) { ++ rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); ++ if (rc != 0) { ++ close(fd); ++ fd = -1; ++ /* error */ ++ continue; ++ } ++ } ++ ++ if (type == SPDK_SOCK_CREATE_LISTEN) { ++ rc = bind(fd, res->ai_addr, res->ai_addrlen); ++ if (rc != 0) { ++ SPDK_ERRLOG("bind() failed at port %d, errno = %d\n", port, errno); ++ switch (errno) { ++ case EINTR: ++ /* interrupted? */ ++ close(fd); ++ goto retry; ++ case EADDRNOTAVAIL: ++ SPDK_ERRLOG("IP address %s not available. " ++ "Verify IP address in config file " ++ "and make sure setup script is " ++ "run before starting spdk app.\n", ip); ++ /* FALLTHROUGH */ ++ default: ++ /* try next family */ ++ close(fd); ++ fd = -1; ++ continue; ++ } ++ } ++ /* bind OK */ ++ rc = listen(fd, 512); ++ if (rc != 0) { ++ SPDK_ERRLOG("listen() failed, errno = %d\n", errno); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ ++ flag = fcntl(fd, F_GETFL); ++ if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { ++ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ ++ enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_server; ++ } else if (type == SPDK_SOCK_CREATE_CONNECT) { ++ rc = connect(fd, res->ai_addr, res->ai_addrlen); ++ if (rc != 0) { ++ SPDK_ERRLOG("connect() failed, errno = %d\n", errno); ++ /* try next family */ ++ close(fd); ++ fd = -1; ++ continue; ++ } ++ ++ flag = fcntl(fd, F_GETFL); ++ if (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0) { ++ SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno); ++ close(fd); ++ fd = -1; ++ break; ++ } ++ ++ enable_zcopy_impl_opts = impl_opts.enable_zerocopy_send_client; ++ } ++ break; ++ } ++ freeaddrinfo(res0); ++ ++ if (fd < 0) { ++ return NULL; ++ } ++ ++ enable_zcopy_user_opts = opts->zcopy && !sock_is_loopback(fd); ++ sock = uring_sock_alloc(fd, &impl_opts, enable_zcopy_user_opts && enable_zcopy_impl_opts); ++ if (sock == NULL) { ++ SPDK_ERRLOG("sock allocation failed\n"); ++ close(fd); ++ return NULL; ++ } ++ ++ return &sock->base; ++} ++ ++static struct spdk_sock * ++uring_sock_listen(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return uring_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN, opts); ++} ++ ++static struct spdk_sock * ++uring_sock_connect(const char *ip, int port, struct spdk_sock_opts *opts) ++{ ++ return uring_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT, opts); ++} ++ ++static struct spdk_sock * ++uring_sock_accept(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc, fd; ++ struct spdk_uring_sock *new_sock; ++ int flag; ++ ++ memset(&sa, 0, sizeof(sa)); ++ salen = sizeof(sa); ++ ++ assert(sock != NULL); ++ ++ rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); ++ ++ if (rc == -1) { ++ return NULL; ++ } ++ ++ fd = rc; ++ ++ flag = fcntl(fd, F_GETFL); ++ if ((flag & O_NONBLOCK) && (fcntl(fd, F_SETFL, flag & ~O_NONBLOCK) < 0)) { ++ SPDK_ERRLOG("fcntl can't set blocking mode for socket, fd: %d (%d)\n", fd, errno); ++ close(fd); ++ return NULL; ++ } ++ ++#if 
defined(SO_PRIORITY) ++ /* The priority is not inherited, so call this function again */ ++ if (sock->base.opts.priority) { ++ rc = setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &sock->base.opts.priority, sizeof(int)); ++ if (rc != 0) { ++ close(fd); ++ return NULL; ++ } ++ } ++#endif ++ ++ new_sock = uring_sock_alloc(fd, &sock->base.impl_opts, sock->zcopy); ++ if (new_sock == NULL) { ++ close(fd); ++ return NULL; ++ } ++ ++ return &new_sock->base; ++} ++ ++static int ++uring_sock_close(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ ++ assert(TAILQ_EMPTY(&_sock->pending_reqs)); ++ assert(sock->group == NULL); ++ ++ /* If the socket fails to close, the best choice is to ++ * leak the fd but continue to free the rest of the sock ++ * memory. */ ++ close(sock->fd); ++ ++ spdk_pipe_destroy(sock->recv_pipe); ++ free(sock->recv_buf); ++ free(sock); ++ ++ return 0; ++} ++ ++static ssize_t ++uring_sock_recv_from_pipe(struct spdk_uring_sock *sock, struct iovec *diov, int diovcnt) ++{ ++ struct iovec siov[2]; ++ int sbytes; ++ ssize_t bytes; ++ struct spdk_uring_sock_group_impl *group; ++ ++ sbytes = spdk_pipe_reader_get_buffer(sock->recv_pipe, sock->recv_buf_sz, siov); ++ if (sbytes < 0) { ++ errno = EINVAL; ++ return -1; ++ } else if (sbytes == 0) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ bytes = spdk_iovcpy(siov, 2, diov, diovcnt); ++ ++ if (bytes == 0) { ++ /* The only way this happens is if diov is 0 length */ ++ errno = EINVAL; ++ return -1; ++ } ++ ++ spdk_pipe_reader_advance(sock->recv_pipe, bytes); ++ ++ /* If we drained the pipe, take it off the level-triggered list */ ++ if (sock->base.group_impl && spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { ++ group = __uring_group_impl(sock->base.group_impl); ++ TAILQ_REMOVE(&group->pending_recv, sock, link); ++ sock->pending_recv = false; ++ } ++ ++ return bytes; ++} ++ ++static inline ssize_t ++sock_readv(int fd, struct iovec *iov, int iovcnt) ++{ ++ struct msghdr msg = { ++ .msg_iov = iov, ++ .msg_iovlen = iovcnt, ++ }; ++ ++ return recvmsg(fd, &msg, MSG_DONTWAIT); ++} ++ ++static inline ssize_t ++uring_sock_read(struct spdk_uring_sock *sock) ++{ ++ struct iovec iov[2]; ++ int bytes; ++ struct spdk_uring_sock_group_impl *group; ++ ++ bytes = spdk_pipe_writer_get_buffer(sock->recv_pipe, sock->recv_buf_sz, iov); ++ ++ if (bytes > 0) { ++ bytes = sock_readv(sock->fd, iov, 2); ++ if (bytes > 0) { ++ spdk_pipe_writer_advance(sock->recv_pipe, bytes); ++ if (sock->base.group_impl) { ++ group = __uring_group_impl(sock->base.group_impl); ++ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); ++ sock->pending_recv = true; ++ } ++ } ++ } ++ ++ return bytes; ++} ++ ++static ssize_t ++uring_sock_readv(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ int rc, i; ++ size_t len; ++ ++ if (sock->recv_pipe == NULL) { ++ return sock_readv(sock->fd, iov, iovcnt); ++ } ++ ++ len = 0; ++ for (i = 0; i < iovcnt; i++) { ++ len += iov[i].iov_len; ++ } ++ ++ if (spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0) { ++ /* If the user is receiving a sufficiently large amount of data, ++ * receive directly to their buffers. 
*/ ++ if (len >= MIN_SOCK_PIPE_SIZE) { ++ return sock_readv(sock->fd, iov, iovcnt); ++ } ++ ++ /* Otherwise, do a big read into our pipe */ ++ rc = uring_sock_read(sock); ++ if (rc <= 0) { ++ return rc; ++ } ++ } ++ ++ return uring_sock_recv_from_pipe(sock, iov, iovcnt); ++} ++ ++static ssize_t ++uring_sock_recv(struct spdk_sock *sock, void *buf, size_t len) ++{ ++ struct iovec iov[1]; ++ ++ iov[0].iov_base = buf; ++ iov[0].iov_len = len; ++ ++ return uring_sock_readv(sock, iov, 1); ++} ++ ++static ssize_t ++uring_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct msghdr msg = { ++ .msg_iov = iov, ++ .msg_iovlen = iovcnt, ++ }; ++ ++ if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ return sendmsg(sock->fd, &msg, MSG_DONTWAIT); ++} ++ ++static ssize_t ++sock_request_advance_offset(struct spdk_sock_request *req, ssize_t rc) ++{ ++ unsigned int offset; ++ size_t len; ++ int i; ++ ++ offset = req->internal.offset; ++ for (i = 0; i < req->iovcnt; i++) { ++ /* Advance by the offset first */ ++ if (offset >= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len) { ++ offset -= SPDK_SOCK_REQUEST_IOV(req, i)->iov_len; ++ continue; ++ } ++ ++ /* Calculate the remaining length of this element */ ++ len = SPDK_SOCK_REQUEST_IOV(req, i)->iov_len - offset; ++ ++ if (len > (size_t)rc) { ++ req->internal.offset += rc; ++ return -1; ++ } ++ ++ offset = 0; ++ req->internal.offset += len; ++ rc -= len; ++ } ++ ++ return rc; ++} ++ ++static int ++sock_complete_write_reqs(struct spdk_sock *_sock, ssize_t rc, bool is_zcopy) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_sock_request *req; ++ int retval; ++ ++ if (is_zcopy) { ++ /* Handling overflow case, because we use psock->sendmsg_idx - 1 for the ++ * req->internal.offset, so sendmsg_idx should not be zero */ ++ if (spdk_unlikely(sock->sendmsg_idx == UINT32_MAX)) { ++ sock->sendmsg_idx = 1; ++ } else { ++ sock->sendmsg_idx++; ++ } ++ } ++ ++ /* Consume the requests that were actually written */ ++ req = TAILQ_FIRST(&_sock->queued_reqs); ++ while (req) { ++ /* req->internal.is_zcopy is true when the whole req or part of it is sent with zerocopy */ ++ req->internal.is_zcopy = is_zcopy; ++ ++ rc = sock_request_advance_offset(req, rc); ++ if (rc < 0) { ++ /* This element was partially sent. */ ++ return 0; ++ } ++ ++ /* Handled a full request. */ ++ spdk_sock_request_pend(_sock, req); ++ ++ if (!req->internal.is_zcopy && req == TAILQ_FIRST(&_sock->pending_reqs)) { ++ retval = spdk_sock_request_put(_sock, req, 0); ++ if (retval) { ++ return retval; ++ } ++ } else { ++ /* Re-use the offset field to hold the sendmsg call index. The ++ * index is 0 based, so subtract one here because we've already ++ * incremented above. 
*/ ++ req->internal.offset = sock->sendmsg_idx - 1; ++ } ++ ++ if (rc == 0) { ++ break; ++ } ++ ++ req = TAILQ_FIRST(&_sock->queued_reqs); ++ } ++ ++ return 0; ++} ++ ++#ifdef SPDK_ZEROCOPY ++static int ++_sock_check_zcopy(struct spdk_sock *_sock, int status) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ ssize_t rc; ++ struct sock_extended_err *serr; ++ struct cmsghdr *cm; ++ uint32_t idx; ++ struct spdk_sock_request *req, *treq; ++ bool found; ++ ++ assert(sock->zcopy == true); ++ if (spdk_unlikely(status) < 0) { ++ if (!TAILQ_EMPTY(&_sock->pending_reqs)) { ++ SPDK_ERRLOG("Attempting to receive from ERRQUEUE yielded error, but pending list still has orphaned entries, status =%d\n", ++ status); ++ } else { ++ SPDK_WARNLOG("Recvmsg yielded an error!\n"); ++ } ++ return 0; ++ } ++ ++ cm = CMSG_FIRSTHDR(&sock->errqueue_task.msg); ++ if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || ++ (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR))) { ++ SPDK_WARNLOG("Unexpected cmsg level or type!\n"); ++ return 0; ++ } ++ ++ serr = (struct sock_extended_err *)CMSG_DATA(cm); ++ if (serr->ee_errno != 0 || serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) { ++ SPDK_WARNLOG("Unexpected extended error origin\n"); ++ return 0; ++ } ++ ++ /* Most of the time, the pending_reqs array is in the exact ++ * order we need such that all of the requests to complete are ++ * in order, in the front. It is guaranteed that all requests ++ * belonging to the same sendmsg call are sequential, so once ++ * we encounter one match we can stop looping as soon as a ++ * non-match is found. ++ */ ++ for (idx = serr->ee_info; idx <= serr->ee_data; idx++) { ++ found = false; ++ TAILQ_FOREACH_SAFE(req, &_sock->pending_reqs, internal.link, treq) { ++ if (!req->internal.is_zcopy) { ++ /* This wasn't a zcopy request. It was just waiting in line to complete */ ++ rc = spdk_sock_request_put(_sock, req, 0); ++ if (rc < 0) { ++ return rc; ++ } ++ } else if (req->internal.offset == idx) { ++ found = true; ++ rc = spdk_sock_request_put(_sock, req, 0); ++ if (rc < 0) { ++ return rc; ++ } ++ } else if (found) { ++ break; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++_sock_prep_errqueue(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_task *task = &sock->errqueue_task; ++ struct io_uring_sqe *sqe; ++ ++ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { ++ return; ++ } ++ ++ assert(sock->group != NULL); ++ sock->group->io_queued++; ++ ++ sqe = io_uring_get_sqe(&sock->group->uring); ++ io_uring_prep_recvmsg(sqe, sock->fd, &task->msg, MSG_ERRQUEUE); ++ io_uring_sqe_set_data(sqe, task); ++ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; ++} ++ ++#endif ++ ++static void ++_sock_flush(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_task *task = &sock->write_task; ++ uint32_t iovcnt; ++ struct io_uring_sqe *sqe; ++ int flags; ++ ++ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { ++ return; ++ } ++ ++#ifdef SPDK_ZEROCOPY ++ if (sock->zcopy) { ++ flags = MSG_DONTWAIT | sock->zcopy_send_flags; ++ } else ++#endif ++ { ++ flags = MSG_DONTWAIT; ++ } ++ ++ iovcnt = spdk_sock_prep_reqs(&sock->base, task->iovs, task->iov_cnt, &task->last_req, &flags); ++ if (!iovcnt) { ++ return; ++ } ++ ++ task->iov_cnt = iovcnt; ++ assert(sock->group != NULL); ++ task->msg.msg_iov = task->iovs; ++ task->msg.msg_iovlen = task->iov_cnt; ++#ifdef SPDK_ZEROCOPY ++ task->is_zcopy = (flags & MSG_ZEROCOPY) ? 
true : false; ++#endif ++ sock->group->io_queued++; ++ ++ sqe = io_uring_get_sqe(&sock->group->uring); ++ io_uring_prep_sendmsg(sqe, sock->fd, &sock->write_task.msg, flags); ++ io_uring_sqe_set_data(sqe, task); ++ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; ++} ++ ++static void ++_sock_prep_pollin(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_task *task = &sock->pollin_task; ++ struct io_uring_sqe *sqe; ++ ++ /* Do not prepare pollin event */ ++ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS || (sock->pending_recv && !sock->zcopy)) { ++ return; ++ } ++ ++ assert(sock->group != NULL); ++ sock->group->io_queued++; ++ ++ sqe = io_uring_get_sqe(&sock->group->uring); ++ io_uring_prep_poll_add(sqe, sock->fd, POLLIN | POLLERR); ++ io_uring_sqe_set_data(sqe, task); ++ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; ++} ++ ++static void ++_sock_prep_cancel_task(struct spdk_sock *_sock, void *user_data) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_task *task = &sock->cancel_task; ++ struct io_uring_sqe *sqe; ++ ++ if (task->status == SPDK_URING_SOCK_TASK_IN_PROCESS) { ++ return; ++ } ++ ++ assert(sock->group != NULL); ++ sock->group->io_queued++; ++ ++ sqe = io_uring_get_sqe(&sock->group->uring); ++ io_uring_prep_cancel(sqe, user_data, 0); ++ io_uring_sqe_set_data(sqe, task); ++ task->status = SPDK_URING_SOCK_TASK_IN_PROCESS; ++} ++ ++static int ++sock_uring_group_reap(struct spdk_uring_sock_group_impl *group, int max, int max_read_events, ++ struct spdk_sock **socks) ++{ ++ int i, count, ret; ++ struct io_uring_cqe *cqe; ++ struct spdk_uring_sock *sock, *tmp; ++ struct spdk_uring_task *task; ++ int status; ++ bool is_zcopy; ++ ++ for (i = 0; i < max; i++) { ++ ret = io_uring_peek_cqe(&group->uring, &cqe); ++ if (ret != 0) { ++ break; ++ } ++ ++ if (cqe == NULL) { ++ break; ++ } ++ ++ task = (struct spdk_uring_task *)cqe->user_data; ++ assert(task != NULL); ++ sock = task->sock; ++ assert(sock != NULL); ++ assert(sock->group != NULL); ++ assert(sock->group == group); ++ sock->group->io_inflight--; ++ sock->group->io_avail++; ++ status = cqe->res; ++ io_uring_cqe_seen(&group->uring, cqe); ++ ++ task->status = SPDK_URING_SOCK_TASK_NOT_IN_USE; ++ ++ if (spdk_unlikely(status <= 0)) { ++ if (status == -EAGAIN || status == -EWOULDBLOCK || (status == -ENOBUFS && sock->zcopy)) { ++ continue; ++ } ++ } ++ ++ switch (task->type) { ++ case SPDK_SOCK_TASK_POLLIN: ++#ifdef SPDK_ZEROCOPY ++ if ((status & POLLERR) == POLLERR) { ++ _sock_prep_errqueue(&sock->base); ++ } ++#endif ++ if ((status & POLLIN) == POLLIN) { ++ if (sock->base.cb_fn != NULL && ++ sock->pending_recv == false) { ++ sock->pending_recv = true; ++ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); ++ } ++ } ++ break; ++ case SPDK_SOCK_TASK_WRITE: ++ task->last_req = NULL; ++ task->iov_cnt = 0; ++ is_zcopy = task->is_zcopy; ++ task->is_zcopy = false; ++ if (spdk_unlikely(status) < 0) { ++ sock->connection_status = status; ++ spdk_sock_abort_requests(&sock->base); ++ } else { ++ sock_complete_write_reqs(&sock->base, status, is_zcopy); ++ } ++ ++ break; ++#ifdef SPDK_ZEROCOPY ++ case SPDK_SOCK_TASK_ERRQUEUE: ++ if (spdk_unlikely(status == -ECANCELED)) { ++ sock->connection_status = status; ++ break; ++ } ++ _sock_check_zcopy(&sock->base, status); ++ break; ++#endif ++ case SPDK_SOCK_TASK_CANCEL: ++ /* Do nothing */ ++ break; ++ default: ++ SPDK_UNREACHABLE(); ++ } ++ } ++ ++ if (!socks) { ++ return 0; ++ } ++ count = 0; ++ 
TAILQ_FOREACH_SAFE(sock, &group->pending_recv, link, tmp) { ++ if (count == max_read_events) { ++ break; ++ } ++ ++ if (spdk_unlikely(sock->base.cb_fn == NULL) || ++ (sock->recv_pipe == NULL || spdk_pipe_reader_bytes_available(sock->recv_pipe) == 0)) { ++ sock->pending_recv = false; ++ TAILQ_REMOVE(&group->pending_recv, sock, link); ++ if (spdk_unlikely(sock->base.cb_fn == NULL)) { ++ /* If the socket's cb_fn is NULL, do not add it to socks array */ ++ continue; ++ } ++ } ++ ++ socks[count++] = &sock->base; ++ } ++ ++ ++ /* Cycle the pending_recv list so that each time we poll things aren't ++ * in the same order. Say we have 6 sockets in the list, named as follows: ++ * A B C D E F ++ * And all 6 sockets had the poll events, but max_events is only 3. That means ++ * psock currently points at D. We want to rearrange the list to the following: ++ * D E F A B C ++ * ++ * The variables below are named according to this example to make it easier to ++ * follow the swaps. ++ */ ++ if (sock != NULL) { ++ struct spdk_uring_sock *ua, *uc, *ud, *uf; ++ ++ /* Capture pointers to the elements we need */ ++ ud = sock; ++ ++ ua = TAILQ_FIRST(&group->pending_recv); ++ if (ua == ud) { ++ goto end; ++ } ++ ++ uf = TAILQ_LAST(&group->pending_recv, pending_recv_list); ++ if (uf == ud) { ++ TAILQ_REMOVE(&group->pending_recv, ud, link); ++ TAILQ_INSERT_HEAD(&group->pending_recv, ud, link); ++ goto end; ++ } ++ ++ uc = TAILQ_PREV(ud, pending_recv_list, link); ++ assert(uc != NULL); ++ ++ /* Break the link between C and D */ ++ uc->link.tqe_next = NULL; ++ ++ /* Connect F to A */ ++ uf->link.tqe_next = ua; ++ ua->link.tqe_prev = &uf->link.tqe_next; ++ ++ /* Fix up the list first/last pointers */ ++ group->pending_recv.tqh_first = ud; ++ group->pending_recv.tqh_last = &uc->link.tqe_next; ++ ++ /* D is in front of the list, make tqe prev pointer point to the head of list */ ++ ud->link.tqe_prev = &group->pending_recv.tqh_first; ++ } ++ ++end: ++ return count; ++} ++ ++static int uring_sock_flush(struct spdk_sock *_sock); ++ ++static void ++uring_sock_writev_async(struct spdk_sock *_sock, struct spdk_sock_request *req) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ int rc; ++ ++ if (spdk_unlikely(sock->connection_status)) { ++ req->cb_fn(req->cb_arg, sock->connection_status); ++ return; ++ } ++ ++ spdk_sock_request_queue(_sock, req); ++ ++ if (!sock->group) { ++ if (_sock->queued_iovcnt >= IOV_BATCH_SIZE) { ++ rc = uring_sock_flush(_sock); ++ if (rc < 0 && errno != EAGAIN) { ++ spdk_sock_abort_requests(_sock); ++ } ++ } ++ } ++} ++ ++static void ++uring_sock_readv_async(struct spdk_sock *sock, struct spdk_sock_request *req) ++{ ++ req->cb_fn(req->cb_arg, -ENOTSUP); ++} ++ ++static int ++uring_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ int val; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ val = nbytes; ++ rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); ++ if (rc != 0) { ++ return -1; ++ } ++ return 0; ++} ++ ++static bool ++uring_sock_is_ipv6(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return false; ++ } ++ ++ return (sa.ss_family == AF_INET6); ++} ++ ++static bool 
++uring_sock_is_ipv4(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct sockaddr_storage sa; ++ socklen_t salen; ++ int rc; ++ ++ assert(sock != NULL); ++ ++ memset(&sa, 0, sizeof sa); ++ salen = sizeof sa; ++ rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); ++ if (rc != 0) { ++ SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); ++ return false; ++ } ++ ++ return (sa.ss_family == AF_INET); ++} ++ ++static bool ++uring_sock_is_connected(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ uint8_t byte; ++ int rc; ++ ++ rc = recv(sock->fd, &byte, 1, MSG_PEEK | MSG_DONTWAIT); ++ if (rc == 0) { ++ return false; ++ } ++ ++ if (rc < 0) { ++ if (errno == EAGAIN || errno == EWOULDBLOCK) { ++ return true; ++ } ++ ++ return false; ++ } ++ ++ return true; ++} ++ ++static struct spdk_sock_group_impl * ++uring_sock_group_impl_get_optimal(struct spdk_sock *_sock, struct spdk_sock_group_impl *hint) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_sock_group_impl *group; ++ ++ if (sock->placement_id != -1) { ++ spdk_sock_map_lookup(&g_map, sock->placement_id, &group, hint); ++ return group; ++ } ++ ++ return NULL; ++} ++ ++static struct spdk_sock_group_impl * ++uring_sock_group_impl_create(void) ++{ ++ struct spdk_uring_sock_group_impl *group_impl; ++ ++ group_impl = calloc(1, sizeof(*group_impl)); ++ if (group_impl == NULL) { ++ SPDK_ERRLOG("group_impl allocation failed\n"); ++ return NULL; ++ } ++ ++ group_impl->io_avail = SPDK_SOCK_GROUP_QUEUE_DEPTH; ++ ++ if (io_uring_queue_init(SPDK_SOCK_GROUP_QUEUE_DEPTH, &group_impl->uring, 0) < 0) { ++ SPDK_ERRLOG("uring I/O context setup failure\n"); ++ free(group_impl); ++ return NULL; ++ } ++ ++ TAILQ_INIT(&group_impl->pending_recv); ++ ++ if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { ++ spdk_sock_map_insert(&g_map, spdk_env_get_current_core(), &group_impl->base); ++ } ++ ++ return &group_impl->base; ++} ++ ++static int ++uring_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, ++ struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); ++ int rc; ++ ++ sock->group = group; ++ sock->write_task.sock = sock; ++ sock->write_task.type = SPDK_SOCK_TASK_WRITE; ++ ++ sock->pollin_task.sock = sock; ++ sock->pollin_task.type = SPDK_SOCK_TASK_POLLIN; ++ ++ sock->errqueue_task.sock = sock; ++ sock->errqueue_task.type = SPDK_SOCK_TASK_ERRQUEUE; ++ sock->errqueue_task.msg.msg_control = sock->buf; ++ sock->errqueue_task.msg.msg_controllen = sizeof(sock->buf); ++ ++ sock->cancel_task.sock = sock; ++ sock->cancel_task.type = SPDK_SOCK_TASK_CANCEL; ++ ++ /* switched from another polling group due to scheduling */ ++ if (spdk_unlikely(sock->recv_pipe != NULL && ++ (spdk_pipe_reader_bytes_available(sock->recv_pipe) > 0))) { ++ assert(sock->pending_recv == false); ++ sock->pending_recv = true; ++ TAILQ_INSERT_TAIL(&group->pending_recv, sock, link); ++ } ++ ++ if (sock->placement_id != -1) { ++ rc = spdk_sock_map_insert(&g_map, sock->placement_id, &group->base); ++ if (rc != 0) { ++ SPDK_ERRLOG("Failed to insert sock group into map: %d", rc); ++ /* Do not treat this as an error. The system will continue running. 
*/ ++ } ++ } ++ ++ return 0; ++} ++ ++static int ++uring_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, ++ struct spdk_sock **socks) ++{ ++ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); ++ int count, ret; ++ int to_complete, to_submit; ++ struct spdk_sock *_sock, *tmp; ++ struct spdk_uring_sock *sock; ++ ++ if (spdk_likely(socks)) { ++ TAILQ_FOREACH_SAFE(_sock, &group->base.socks, link, tmp) { ++ sock = __uring_sock(_sock); ++ if (spdk_unlikely(sock->connection_status)) { ++ continue; ++ } ++ _sock_flush(_sock); ++ _sock_prep_pollin(_sock); ++ } ++ } ++ ++ to_submit = group->io_queued; ++ ++ /* For network I/O, it cannot be set with O_DIRECT, so we do not need to call spdk_io_uring_enter */ ++ if (to_submit > 0) { ++ /* If there are I/O to submit, use io_uring_submit here. ++ * It will automatically call io_uring_enter appropriately. */ ++ ret = io_uring_submit(&group->uring); ++ if (ret < 0) { ++ return 1; ++ } ++ group->io_queued = 0; ++ group->io_inflight += to_submit; ++ group->io_avail -= to_submit; ++ } ++ ++ count = 0; ++ to_complete = group->io_inflight; ++ if (to_complete > 0 || !TAILQ_EMPTY(&group->pending_recv)) { ++ count = sock_uring_group_reap(group, to_complete, max_events, socks); ++ } ++ ++ return count; ++} ++ ++static int ++uring_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, ++ struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); ++ ++ if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { ++ _sock_prep_cancel_task(_sock, &sock->write_task); ++ /* Since spdk_sock_group_remove_sock is not asynchronous interface, so ++ * currently can use a while loop here. */ ++ while ((sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || ++ (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { ++ uring_sock_group_impl_poll(_group, 32, NULL); ++ } ++ } ++ ++ if (sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { ++ _sock_prep_cancel_task(_sock, &sock->pollin_task); ++ /* Since spdk_sock_group_remove_sock is not asynchronous interface, so ++ * currently can use a while loop here. */ ++ while ((sock->pollin_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || ++ (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { ++ uring_sock_group_impl_poll(_group, 32, NULL); ++ } ++ } ++ ++ if (sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { ++ _sock_prep_cancel_task(_sock, &sock->errqueue_task); ++ /* Since spdk_sock_group_remove_sock is not asynchronous interface, so ++ * currently can use a while loop here. 
*/ ++ while ((sock->errqueue_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) || ++ (sock->cancel_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE)) { ++ uring_sock_group_impl_poll(_group, 32, NULL); ++ } ++ } ++ ++ /* Make sure the cancelling the tasks above didn't cause sending new requests */ ++ assert(sock->write_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); ++ assert(sock->pollin_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); ++ assert(sock->errqueue_task.status == SPDK_URING_SOCK_TASK_NOT_IN_USE); ++ ++ if (sock->pending_recv) { ++ TAILQ_REMOVE(&group->pending_recv, sock, link); ++ sock->pending_recv = false; ++ } ++ assert(sock->pending_recv == false); ++ ++ if (sock->placement_id != -1) { ++ spdk_sock_map_release(&g_map, sock->placement_id); ++ } ++ ++ sock->group = NULL; ++ return 0; ++} ++ ++static int ++uring_sock_group_impl_close(struct spdk_sock_group_impl *_group) ++{ ++ struct spdk_uring_sock_group_impl *group = __uring_group_impl(_group); ++ ++ /* try to reap all the active I/O */ ++ while (group->io_inflight) { ++ uring_sock_group_impl_poll(_group, 32, NULL); ++ } ++ assert(group->io_inflight == 0); ++ assert(group->io_avail == SPDK_SOCK_GROUP_QUEUE_DEPTH); ++ ++ io_uring_queue_exit(&group->uring); ++ ++ if (g_spdk_uring_sock_impl_opts.enable_placement_id == PLACEMENT_CPU) { ++ spdk_sock_map_release(&g_map, spdk_env_get_current_core()); ++ } ++ ++ free(group); ++ return 0; ++} ++ ++static int ++uring_sock_flush(struct spdk_sock *_sock) ++{ ++ struct spdk_uring_sock *sock = __uring_sock(_sock); ++ struct msghdr msg = {}; ++ struct iovec iovs[IOV_BATCH_SIZE]; ++ int iovcnt; ++ ssize_t rc; ++ int flags = sock->zcopy_send_flags; ++ int retval; ++ bool is_zcopy = false; ++ ++ /* Can't flush from within a callback or we end up with recursive calls */ ++ if (_sock->cb_cnt > 0) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ /* Can't flush while a write is already outstanding */ ++ if (sock->write_task.status != SPDK_URING_SOCK_TASK_NOT_IN_USE) { ++ errno = EAGAIN; ++ return -1; ++ } ++ ++ /* Gather an iov */ ++ iovcnt = spdk_sock_prep_reqs(_sock, iovs, 0, NULL, &flags); ++ if (iovcnt == 0) { ++ /* Nothing to send */ ++ return 0; ++ } ++ ++ /* Perform the vectored write */ ++ msg.msg_iov = iovs; ++ msg.msg_iovlen = iovcnt; ++ rc = sendmsg(sock->fd, &msg, flags | MSG_DONTWAIT); ++ if (rc <= 0) { ++ if (rc == 0 || errno == EAGAIN || errno == EWOULDBLOCK || (errno == ENOBUFS && sock->zcopy)) { ++ errno = EAGAIN; ++ } ++ return -1; ++ } ++ ++#ifdef SPDK_ZEROCOPY ++ is_zcopy = flags & MSG_ZEROCOPY; ++#endif ++ retval = sock_complete_write_reqs(_sock, rc, is_zcopy); ++ if (retval < 0) { ++ /* if the socket is closed, return to avoid heap-use-after-free error */ ++ errno = ENOTCONN; ++ return -1; ++ } ++ ++#ifdef SPDK_ZEROCOPY ++ if (sock->zcopy && !TAILQ_EMPTY(&_sock->pending_reqs)) { ++ _sock_check_zcopy(_sock, 0); ++ } ++#endif ++ ++ return rc; ++} ++ ++static struct spdk_net_impl g_uring_net_impl = { ++ .name = "uring", ++ .getaddr = uring_sock_getaddr, ++ .connect = uring_sock_connect, ++ .listen = uring_sock_listen, ++ .accept = uring_sock_accept, ++ .close = uring_sock_close, ++ .recv = uring_sock_recv, ++ .readv = uring_sock_readv, ++ .readv_async = uring_sock_readv_async, ++ .writev = uring_sock_writev, ++ .writev_async = uring_sock_writev_async, ++ .flush = uring_sock_flush, ++ .set_recvlowat = uring_sock_set_recvlowat, ++ .set_recvbuf = uring_sock_set_recvbuf, ++ .set_sendbuf = uring_sock_set_sendbuf, ++ .is_ipv6 = uring_sock_is_ipv6, ++ .is_ipv4 = uring_sock_is_ipv4, ++ 
.is_connected = uring_sock_is_connected, ++ .group_impl_get_optimal = uring_sock_group_impl_get_optimal, ++ .group_impl_create = uring_sock_group_impl_create, ++ .group_impl_add_sock = uring_sock_group_impl_add_sock, ++ .group_impl_remove_sock = uring_sock_group_impl_remove_sock, ++ .group_impl_poll = uring_sock_group_impl_poll, ++ .group_impl_close = uring_sock_group_impl_close, ++ .get_opts = uring_sock_impl_get_opts, ++ .set_opts = uring_sock_impl_set_opts, ++}; ++ ++SPDK_NET_IMPL_REGISTER(uring, &g_uring_net_impl, DEFAULT_SOCK_PRIORITY + 2); +diff --git a/module/vfu_device/Makefile b/module/vfu_device/Makefile +index 73a3905..856befa 100644 +--- a/module/vfu_device/Makefile ++++ b/module/vfu_device/Makefile +@@ -1,17 +1,17 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-SO_VER := 1 +-SO_MINOR := 0 +- +-C_SRCS = vfu_virtio.c vfu_virtio_blk.c vfu_virtio_scsi.c vfu_virtio_rpc.c +-LIBNAME = vfu_device +- +-SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map +- +-include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++SO_VER := 1 ++SO_MINOR := 0 ++ ++C_SRCS = vfu_virtio.c vfu_virtio_blk.c vfu_virtio_scsi.c vfu_virtio_rpc.c ++LIBNAME = vfu_device ++ ++SPDK_MAP_FILE = $(SPDK_ROOT_DIR)/mk/spdk_blank.map ++ ++include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk +diff --git a/module/vfu_device/vfu_virtio.c b/module/vfu_device/vfu_virtio.c +index 96e219a..4b5e8ec 100644 +--- a/module/vfu_device/vfu_virtio.c ++++ b/module/vfu_device/vfu_virtio.c +@@ -1,1778 +1,1778 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-/* +- * virtio over vfio-user common library +- */ +-#include "spdk/env.h" +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +-#include "spdk/stdinc.h" +-#include "spdk/assert.h" +-#include "spdk/barrier.h" +-#include "spdk/thread.h" +-#include "spdk/memory.h" +-#include "spdk/util.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/likely.h" +- +-#include "vfu_virtio_internal.h" +- +-static int vfu_virtio_dev_start(struct vfu_virtio_dev *dev); +-static int vfu_virtio_dev_stop(struct vfu_virtio_dev *dev); +- +-static inline void +-vfu_virtio_unmap_q(struct vfu_virtio_dev *dev, struct q_mapping *mapping) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- +- if (mapping->addr != NULL) { +- spdk_vfu_unmap_sg(virtio_endpoint->endpoint, mapping->sg, +- &mapping->iov, 1); +- mapping->addr = NULL; +- mapping->len = 0; +- } +-} +- +-static inline int +-vfu_virtio_map_q(struct vfu_virtio_dev *dev, struct q_mapping *mapping, uint64_t phys_addr, +- uint64_t len) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- void *addr; +- +- if (!mapping->addr && len && phys_addr) { +- addr = spdk_vfu_map_one(virtio_endpoint->endpoint, phys_addr, len, +- mapping->sg, &mapping->iov, PROT_READ | PROT_WRITE); +- if (addr == NULL) { +- return -EINVAL; +- } +- mapping->phys_addr = phys_addr; +- mapping->len = len; +- mapping->addr = addr; +- } +- +- return 0; +-} +- +-static int +-virtio_dev_map_vq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- int ret; +- uint64_t phys_addr, len; +- +- if (!vq->enabled || (vq->q_state == VFU_VQ_ACTIVE)) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: try to map vq %u\n", dev->name, vq->id); +- +- len = virtio_queue_desc_size(dev, vq); +- phys_addr = ((((uint64_t)vq->desc_hi) << 32) | vq->desc_lo); +- ret = vfu_virtio_map_q(dev, &vq->desc, phys_addr, len); +- if (ret) { +- SPDK_DEBUGLOG(vfu_virtio, "Error to map descs\n"); +- return ret; +- } +- +- len = virtio_queue_avail_size(dev, vq); +- phys_addr = ((((uint64_t)vq->avail_hi) << 32) | vq->avail_lo); +- ret = vfu_virtio_map_q(dev, &vq->avail, phys_addr, len); +- if (ret) { +- vfu_virtio_unmap_q(dev, &vq->desc); +- SPDK_DEBUGLOG(vfu_virtio, "Error to map available ring\n"); +- return ret; +- } +- +- len = virtio_queue_used_size(dev, vq); +- phys_addr = ((((uint64_t)vq->used_hi) << 32) | vq->used_lo); +- ret = vfu_virtio_map_q(dev, &vq->used, phys_addr, len); +- if (ret) { +- vfu_virtio_unmap_q(dev, &vq->desc); +- vfu_virtio_unmap_q(dev, &vq->avail); +- SPDK_DEBUGLOG(vfu_virtio, "Error to map used ring\n"); +- return ret; +- } +- +- /* We're running with polling mode */ +- if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { +- vq->used.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; +- } else { +- vq->used.used->flags = VRING_USED_F_NO_NOTIFY; +- } +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: map vq %u successfully\n", dev->name, vq->id); +- vq->q_state = VFU_VQ_ACTIVE; +- +- return 0; +-} +- +-static void +-virtio_dev_unmap_vq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- SPDK_DEBUGLOG(vfu_virtio, "%s: unmap vq %u\n", dev->name, vq->id); +- vq->q_state = VFU_VQ_INACTIVE; +- +- vfu_virtio_unmap_q(dev, &vq->desc); +- vfu_virtio_unmap_q(dev, &vq->avail); +- vfu_virtio_unmap_q(dev, &vq->used); +-} +- +-static bool +-vfu_virtio_vq_should_unmap(struct vfu_virtio_vq *vq, void *map_start, void *map_end) +-{ +- /* always do unmap when stopping the device */ +- if (!map_start || !map_end) { +- return true; +- 
} +- +- if (vq->desc.addr >= map_start && vq->desc.addr < map_end) { +- return true; +- } +- +- if (vq->avail.addr >= map_start && vq->avail.addr < map_end) { +- return true; +- } +- +- if (vq->used.addr >= map_start && vq->used.addr < map_end) { +- return true; +- } +- +- return false; +-} +- +-static void +-vfu_virtio_dev_unmap_vqs(struct vfu_virtio_dev *dev, void *map_start, void *map_end) +-{ +- uint32_t i; +- struct vfu_virtio_vq *vq; +- +- for (i = 0; i < dev->num_queues; i++) { +- vq = &dev->vqs[i]; +- if (!vq->enabled) { +- continue; +- } +- +- if (!vfu_virtio_vq_should_unmap(vq, map_start, map_end)) { +- continue; +- } +- virtio_dev_unmap_vq(dev, vq); +- } +-} +- +-/* This function is used to notify VM that the device +- * configuration space has been changed. +- */ +-void +-vfu_virtio_notify_config(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct spdk_vfu_endpoint *endpoint = virtio_endpoint->endpoint; +- +- if (virtio_endpoint->dev == NULL) { +- return; +- } +- +- virtio_endpoint->dev->cfg.isr = 1; +- virtio_endpoint->dev->cfg.config_generation++; +- +- vfu_irq_trigger(spdk_vfu_get_vfu_ctx(endpoint), virtio_endpoint->dev->cfg.msix_config); +-} +- +-static void +-vfu_virtio_dev_reset(struct vfu_virtio_dev *dev) +-{ +- uint32_t i; +- struct vfu_virtio_vq *vq; +- +- SPDK_DEBUGLOG(vfu_virtio, "device %s resetting\n", dev->name); +- +- for (i = 0; i < dev->num_queues; i++) { +- vq = &dev->vqs[i]; +- +- vq->q_state = VFU_VQ_CREATED; +- vq->vector = 0; +- vq->enabled = false; +- vq->last_avail_idx = 0; +- vq->last_used_idx = 0; +- +- vq->packed.packed_ring = false; +- vq->packed.avail_phase = 0; +- vq->packed.used_phase = 0; +- } +- +- memset(&dev->cfg, 0, sizeof(struct virtio_pci_cfg)); +-} +- +-static int +-virtio_dev_set_status(struct vfu_virtio_dev *dev, uint8_t status) +-{ +- int ret = 0; +- +- SPDK_DEBUGLOG(vfu_virtio, "device current status %x, set status %x\n", dev->cfg.device_status, +- status); +- +- if (!(virtio_dev_is_started(dev))) { +- if (status & VIRTIO_CONFIG_S_DRIVER_OK) { +- ret = vfu_virtio_dev_start(dev); +- } +- } else { +- if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { +- ret = vfu_virtio_dev_stop(dev); +- } +- } +- +- if (ret) { +- SPDK_ERRLOG("Failed to start/stop device\n"); +- return ret; +- } +- +- dev->cfg.device_status = status; +- +- if (status == 0) { +- vfu_virtio_dev_reset(dev); +- } +- +- return 0; +-} +- +-static int +-virtio_dev_set_features(struct vfu_virtio_dev *dev, uint64_t features) +-{ +- if (dev->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) { +- SPDK_ERRLOG("Feature negotiation has finished\n"); +- return -EINVAL; +- } +- +- if (features & ~dev->host_features) { +- SPDK_ERRLOG("Host features 0x%"PRIx64", guest features 0x%"PRIx64"\n", +- dev->host_features, features); +- return -ENOTSUP; +- } +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: negotiated features 0x%"PRIx64"\n", dev->name, +- features); +- dev->cfg.guest_features = features; +- +- return 0; +-} +- +-static int +-virtio_dev_enable_vq(struct vfu_virtio_dev *dev, uint16_t qid) +-{ +- struct vfu_virtio_vq *vq; +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: enable vq %u\n", dev->name, qid); +- +- vq = &dev->vqs[qid]; +- if (vq->enabled) { +- SPDK_ERRLOG("Queue %u is enabled\n", qid); +- return -EINVAL; +- } +- vq->enabled = true; +- +- if (virtio_dev_map_vq(dev, vq)) { +- SPDK_ERRLOG("Queue %u failed to map\n", qid); +- return 0; +- } +- +- vq->avail.avail->idx = 0; +- vq->last_avail_idx = 0; +- vq->used.used->idx = 0; +- vq->last_used_idx = 0; +- +- if (virtio_guest_has_feature(dev, 
VIRTIO_F_RING_PACKED)) { +- SPDK_DEBUGLOG(vfu_virtio, "%s: vq %u PACKED RING ENABLED\n", dev->name, qid); +- vq->packed.packed_ring = true; +- vq->packed.avail_phase = true; +- vq->packed.used_phase = true; +- } +- +- return 0; +-} +- +-static int +-virtio_dev_disable_vq(struct vfu_virtio_dev *dev, uint16_t qid) +-{ +- struct vfu_virtio_vq *vq; +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: disable vq %u\n", dev->name, qid); +- +- vq = &dev->vqs[qid]; +- if (!vq->enabled) { +- SPDK_NOTICELOG("Queue %u isn't enabled\n", qid); +- return 0; +- } +- +- virtio_dev_unmap_vq(dev, vq); +- +- vq->q_state = VFU_VQ_CREATED; +- vq->vector = 0; +- vq->enabled = false; +- vq->last_avail_idx = 0; +- vq->last_used_idx = 0; +- vq->packed.packed_ring = false; +- vq->packed.avail_phase = 0; +- vq->packed.used_phase = 0; +- +- return 0; +-} +- +-static int +-virtio_dev_split_get_avail_reqs(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, +- uint16_t *reqs, uint16_t max_reqs) +-{ +- uint16_t count, i, avail_idx, last_idx; +- +- last_idx = vq->last_avail_idx; +- avail_idx = vq->avail.avail->idx; +- +- spdk_smp_rmb(); +- +- count = avail_idx - last_idx; +- if (count == 0) { +- return 0; +- } +- +- count = spdk_min(count, max_reqs); +- vq->last_avail_idx += count; +- +- for (i = 0; i < count; i++) { +- reqs[i] = vq->avail.avail->ring[(last_idx + i) & (vq->qsize - 1)]; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_io, +- "AVAIL: vq %u last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", +- vq->id, last_idx, avail_idx, count); +- +- return count; +-} +- +-static int +-virtio_vring_split_desc_get_next(struct vring_desc **desc, +- struct vring_desc *desc_table, +- uint32_t desc_table_size) +-{ +- struct vring_desc *old_desc = *desc; +- uint16_t next_idx; +- +- if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { +- *desc = NULL; +- return 0; +- } +- +- next_idx = old_desc->next; +- if (spdk_unlikely(next_idx >= desc_table_size)) { +- *desc = NULL; +- return -1; +- } +- +- *desc = &desc_table[next_idx]; +- return 0; +-} +- +-static inline void * +-virtio_vring_desc_to_iov(struct vfu_virtio_dev *dev, struct vring_desc *desc, +- dma_sg_t *sg, struct iovec *iov) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- +- return spdk_vfu_map_one(virtio_endpoint->endpoint, desc->addr, desc->len, +- sg, iov, PROT_READ | PROT_WRITE); +-} +- +-static int +-virtio_split_vring_get_desc(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, uint16_t desc_idx, +- struct vring_desc **desc, struct vring_desc **desc_table, +- uint32_t *desc_table_size, +- dma_sg_t *sg, struct iovec *iov) +-{ +- *desc = &vq->desc.desc[desc_idx]; +- +- if (virtio_vring_split_desc_is_indirect(*desc)) { +- *desc_table_size = (*desc)->len / sizeof(struct vring_desc); +- *desc_table = virtio_vring_desc_to_iov(dev, *desc, sg, iov); +- *desc = *desc_table; +- if (*desc == NULL) { +- return -EINVAL; +- } +- return 0; +- } +- +- *desc_table = vq->desc.desc; +- *desc_table_size = vq->qsize; +- +- return 0; +-} +- +-static inline dma_sg_t * +-virtio_req_to_sg_t(struct vfu_virtio_req *req, uint32_t iovcnt) +-{ +- return (dma_sg_t *)(req->sg + iovcnt * dma_sg_size()); +-} +- +-static inline struct vfu_virtio_req * +-vfu_virtio_dev_get_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_req *req; +- +- req = STAILQ_FIRST(&vq->free_reqs); +- if (req == NULL) { +- return NULL; +- } +- STAILQ_REMOVE_HEAD(&vq->free_reqs, link); +- +- req->iovcnt = 0; +- req->used_len = 0; +- req->payload_size = 0; +- req->req_idx 
= 0; +- req->buffer_id = 0; +- req->num_descs = 0; +- +- return req; +-} +- +-void +-vfu_virtio_dev_put_req(struct vfu_virtio_req *req) +-{ +- struct vfu_virtio_dev *dev = req->dev; +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); +- +- if (req->indirect_iov->iov_base) { +- vfu_sgl_put(vfu_ctx, req->indirect_sg, req->indirect_iov, 1); +- req->indirect_iov->iov_base = NULL; +- req->indirect_iov->iov_len = 0; +- } +- +- if (req->iovcnt) { +- vfu_sgl_put(vfu_ctx, virtio_req_to_sg_t(req, 0), req->iovs, req->iovcnt); +- req->iovcnt = 0; +- } +- +- STAILQ_INSERT_HEAD(&req->vq->free_reqs, req, link); +-} +- +-void +-vfu_virtio_finish_req(struct vfu_virtio_req *req) +-{ +- struct vfu_virtio_dev *dev = req->dev; +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- +- assert(virtio_endpoint->io_outstanding); +- virtio_endpoint->io_outstanding--; +- +- if (!virtio_guest_has_feature(req->dev, VIRTIO_F_RING_PACKED)) { +- virtio_vq_used_ring_split_enqueue(req->vq, req->req_idx, req->used_len); +- } else { +- virtio_vq_used_ring_packed_enqueue(req->vq, req->buffer_id, req->num_descs, req->used_len); +- } +- +- vfu_virtio_dev_put_req(req); +-} +- +-static inline void +-vfu_virtio_dev_free_reqs(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_dev *dev) +-{ +- struct vfu_virtio_req *req; +- struct vfu_virtio_vq *vq; +- uint32_t i; +- +- for (i = 0; i < dev->num_queues; i++) { +- vq = &dev->vqs[i]; +- while (!STAILQ_EMPTY(&vq->free_reqs)) { +- req = STAILQ_FIRST(&vq->free_reqs); +- STAILQ_REMOVE_HEAD(&vq->free_reqs, link); +- vfu_virtio_vq_free_req(virtio_endpoint, vq, req); +- } +- } +-} +- +-static int +-virtio_dev_split_iovs_setup(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, +- uint16_t desc_idx, struct vfu_virtio_req *req) +-{ +- struct vring_desc *desc, *desc_table; +- uint32_t desc_table_size, len = 0; +- uint32_t desc_handled_cnt = 0; +- int rc; +- +- rc = virtio_split_vring_get_desc(dev, vq, desc_idx, &desc, +- &desc_table, &desc_table_size, +- req->indirect_sg, req->indirect_iov); +- if (spdk_unlikely(rc)) { +- SPDK_ERRLOG("Invalid descriptor at index %"PRIu16".\n", desc_idx); +- return rc; +- } +- +- assert(req->iovcnt == 0); +- +- while (true) { +- if (spdk_unlikely(!virtio_vring_desc_to_iov(dev, desc, virtio_req_to_sg_t(req, req->iovcnt), +- &req->iovs[req->iovcnt]))) { +- return -EINVAL; +- } +- req->desc_writeable[req->iovcnt] = false; +- if (virtio_vring_split_desc_is_wr(desc)) { +- req->desc_writeable[req->iovcnt] = true; +- } +- +- req->iovcnt++; +- len += desc->len; +- +- rc = virtio_vring_split_desc_get_next(&desc, desc_table, desc_table_size); +- if (spdk_unlikely(rc)) { +- return rc; +- } else if (desc == NULL) { +- break; +- } +- +- desc_handled_cnt++; +- if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { +- return -EINVAL; +- } +- } +- +- req->payload_size = len; +- +- return 0; +-} +- +-void +-virtio_vq_used_ring_split_enqueue(struct vfu_virtio_vq *vq, uint16_t req_idx, uint32_t used_len) +-{ +- uint16_t last_idx = vq->last_used_idx & (vq->qsize - 1); +- +- SPDK_DEBUGLOG(vfu_virtio_io, +- "Queue %u - USED RING: last_idx=%"PRIu16" req_idx=%"PRIu16" used_len=%"PRIu32"\n", +- vq->id, last_idx, req_idx, used_len); +- +- vq->used.used->ring[last_idx].id = req_idx; +- vq->used.used->ring[last_idx].len = used_len; +- vq->last_used_idx++; +- +- spdk_smp_wmb(); +- +- *(volatile uint16_t *)&vq->used.used->idx = vq->last_used_idx; +- +- 
vq->used_req_cnt++; +-} +- +-void +-virtio_vq_used_ring_packed_enqueue(struct vfu_virtio_vq *vq, uint16_t buffer_id, uint32_t num_descs, +- uint32_t used_len) +-{ +- struct vring_packed_desc *desc = &vq->desc.desc_packed[vq->last_used_idx]; +- +- SPDK_DEBUGLOG(vfu_virtio_io, +- "Queue %u - USED RING: buffer_id=%"PRIu16" num_descs=%u used_len=%"PRIu32"\n", +- vq->id, buffer_id, num_descs, used_len); +- +- if (spdk_unlikely(virtio_vring_packed_is_used(desc, vq->packed.used_phase))) { +- SPDK_ERRLOG("descriptor has been used before\n"); +- return; +- } +- +- /* In used desc addr is unused and len specifies the buffer length +- * that has been written to by the device. +- */ +- desc->addr = 0; +- desc->len = used_len; +- +- /* This bit specifies whether any data has been written by the device */ +- if (used_len != 0) { +- desc->flags |= VRING_DESC_F_WRITE; +- } +- +- /* Buffer ID is included in the last descriptor in the list. +- * The driver needs to keep track of the size of the list corresponding +- * to each buffer ID. +- */ +- desc->id = buffer_id; +- +- /* A device MUST NOT make the descriptor used before buffer_id is +- * written to the descriptor. +- */ +- spdk_smp_wmb(); +- +- /* To mark a desc as used, the device sets the F_USED bit in flags to match +- * the internal Device ring wrap counter. It also sets the F_AVAIL bit to +- * match the same value. +- */ +- if (vq->packed.used_phase) { +- desc->flags |= (1 << VRING_PACKED_DESC_F_AVAIL); +- desc->flags |= (1 << VRING_PACKED_DESC_F_USED); +- } else { +- desc->flags &= ~(1 << VRING_PACKED_DESC_F_AVAIL); +- desc->flags &= ~(1 << VRING_PACKED_DESC_F_USED); +- } +- +- vq->last_used_idx += num_descs; +- if (vq->last_used_idx >= vq->qsize) { +- vq->last_used_idx -= vq->qsize; +- vq->packed.used_phase = !vq->packed.used_phase; +- } +- +- vq->used_req_cnt++; +-} +- +-static int +-vfu_virtio_vq_post_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); +- +- vq->used_req_cnt = 0; +- +- if (spdk_vfu_endpoint_msix_enabled(virtio_endpoint->endpoint)) { +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: Queue %u post MSIX IV %u\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- vq->id, vq->vector); +- return vfu_irq_trigger(vfu_ctx, vq->vector); +- } else { +- if (!spdk_vfu_endpoint_intx_enabled(virtio_endpoint->endpoint)) { +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: IRQ disabled\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint)); +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: Queue %u post ISR\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), vq->id); +- dev->cfg.isr = 1; +- return vfu_irq_trigger(vfu_ctx, 0); +- } +-} +- +-void +-vfu_virtio_vq_flush_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- uint32_t delay_us; +- +- if (vq->used_req_cnt == 0) { +- return; +- } +- +- /* No need to notify client */ +- if (virtio_queue_event_is_suppressed(dev, vq)) { +- return; +- } +- +- /* Interrupt coalescing disabled */ +- if (!virtio_endpoint->coalescing_delay_us) { +- vfu_virtio_vq_post_irq(dev, vq); +- return; +- } +- +- /* No need for event right now */ +- if (spdk_get_ticks() < vq->next_event_time) { +- return; +- } +- +- vfu_virtio_vq_post_irq(dev, vq); +- +- delay_us = virtio_endpoint->coalescing_delay_us; +- vq->next_event_time = spdk_get_ticks() + delay_us * spdk_get_ticks_hz() / 
(1000000ULL); +-} +- +-int +-vfu_virito_dev_process_split_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- struct vfu_virtio_req *req; +- uint16_t reqs_idx[VIRTIO_DEV_VRING_MAX_REQS]; +- uint16_t reqs_cnt, i; +- int ret; +- +- reqs_cnt = virtio_dev_split_get_avail_reqs(dev, vq, reqs_idx, VIRTIO_DEV_VRING_MAX_REQS); +- if (!reqs_cnt) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: get %u descriptors\n", dev->name, reqs_cnt); +- +- for (i = 0; i < reqs_cnt; i++) { +- req = vfu_virtio_dev_get_req(virtio_endpoint, vq); +- if (spdk_unlikely(!req)) { +- SPDK_ERRLOG("Error to get request\n"); +- /* TODO: address the error case */ +- return -EIO; +- } +- +- req->req_idx = reqs_idx[i]; +- ret = virtio_dev_split_iovs_setup(dev, vq, req->req_idx, req); +- if (spdk_unlikely(ret)) { +- /* let the device to response this error */ +- SPDK_ERRLOG("Split vring setup failed with index %u\n", i); +- } +- +- assert(virtio_endpoint->virtio_ops.exec_request); +- virtio_endpoint->io_outstanding++; +- virtio_endpoint->virtio_ops.exec_request(virtio_endpoint, vq, req); +- } +- +- return i; +-} +- +-struct vfu_virtio_req * +-virito_dev_split_ring_get_next_avail_req(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- struct vfu_virtio_req *req; +- uint16_t reqs_idx[VIRTIO_DEV_VRING_MAX_REQS]; +- uint16_t reqs_cnt; +- int ret; +- +- reqs_cnt = virtio_dev_split_get_avail_reqs(dev, vq, reqs_idx, 1); +- if (!reqs_cnt) { +- return NULL; +- } +- assert(reqs_cnt == 1); +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: get 1 descriptors\n", dev->name); +- +- req = vfu_virtio_dev_get_req(virtio_endpoint, vq); +- if (!req) { +- SPDK_ERRLOG("Error to get request\n"); +- return NULL; +- } +- +- req->req_idx = reqs_idx[0]; +- ret = virtio_dev_split_iovs_setup(dev, vq, req->req_idx, req); +- if (ret) { +- SPDK_ERRLOG("Split vring setup failed\n"); +- vfu_virtio_dev_put_req(req); +- return NULL; +- } +- +- return req; +-} +- +-static inline void * +-virtio_vring_packed_desc_to_iov(struct vfu_virtio_dev *dev, struct vring_packed_desc *desc, +- dma_sg_t *sg, struct iovec *iov) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- +- return spdk_vfu_map_one(virtio_endpoint->endpoint, desc->addr, desc->len, +- sg, iov, PROT_READ | PROT_WRITE); +-} +- +-static int +-virtio_dev_packed_iovs_setup(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, +- uint16_t last_avail_idx, +- struct vring_packed_desc *current_desc, struct vfu_virtio_req *req) +-{ +- struct vring_packed_desc *desc, *desc_table = NULL; +- uint16_t new_idx, num_descs, desc_table_size = 0; +- uint32_t len = 0; +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: last avail idx %u, req %p\n", dev->name, last_avail_idx, req); +- +- desc = NULL; +- num_descs = 1; +- if (virtio_vring_packed_desc_is_indirect(current_desc)) { +- req->buffer_id = current_desc->id; +- desc_table = virtio_vring_packed_desc_to_iov(dev, current_desc, req->indirect_sg, +- req->indirect_iov); +- if (spdk_unlikely(desc_table == NULL)) { +- SPDK_ERRLOG("Map Indirect Desc to IOV failed\n"); +- return -EINVAL; +- } +- desc_table_size = current_desc->len / sizeof(struct vring_packed_desc); +- desc = desc_table; +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: indirect desc %p, desc size %u, req %p\n", +- dev->name, desc_table, desc_table_size, req); +- } else { +- desc = current_desc; +- } +- +- assert(req->iovcnt == 0); +- /* Map descs to 
IOVs */ +- new_idx = last_avail_idx; +- while (1) { +- assert(desc != NULL); +- if (spdk_unlikely(req->iovcnt == VIRTIO_DEV_MAX_IOVS)) { +- SPDK_ERRLOG("Max IOVs in request reached (iovcnt = %d).\n", req->iovcnt); +- return -EINVAL; +- } +- +- if (spdk_unlikely(!virtio_vring_packed_desc_to_iov(dev, desc, virtio_req_to_sg_t(req, req->iovcnt), +- &req->iovs[req->iovcnt]))) { +- SPDK_ERRLOG("Map Desc to IOV failed (iovcnt = %d).\n", req->iovcnt); +- return -EINVAL; +- } +- req->desc_writeable[req->iovcnt] = false; +- if (virtio_vring_packed_desc_is_wr(desc)) { +- req->desc_writeable[req->iovcnt] = true; +- } +- +- req->iovcnt++; +- len += desc->len; +- +- /* get next desc */ +- if (desc_table) { +- if (req->iovcnt < desc_table_size) { +- desc = &desc_table[req->iovcnt]; +- } else { +- desc = NULL; +- } +- } else { +- if ((desc->flags & VRING_DESC_F_NEXT) == 0) { +- req->buffer_id = desc->id; +- desc = NULL; +- } else { +- new_idx = (new_idx + 1) % vq->qsize; +- desc = &vq->desc.desc_packed[new_idx]; +- num_descs++; +- req->buffer_id = desc->id; +- } +- } +- +- if (desc == NULL) { +- break; +- } +- } +- +- req->num_descs = num_descs; +- vq->last_avail_idx = (new_idx + 1) % vq->qsize; +- if (vq->last_avail_idx < last_avail_idx) { +- vq->packed.avail_phase = !vq->packed.avail_phase; +- } +- +- req->payload_size = len; +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: req %p, iovcnt %u, num_descs %u\n", +- dev->name, req, req->iovcnt, num_descs); +- return 0; +-} +- +-int +-vfu_virito_dev_process_packed_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- struct vring_packed_desc *desc; +- int ret; +- struct vfu_virtio_req *req; +- uint16_t i, max_reqs; +- +- max_reqs = VIRTIO_DEV_VRING_MAX_REQS; +- for (i = 0; i < max_reqs; i++) { +- desc = &vq->desc.desc_packed[vq->last_avail_idx]; +- if (!virtio_vring_packed_is_avail(desc, vq->packed.avail_phase)) { +- return i; +- } +- +- req = vfu_virtio_dev_get_req(virtio_endpoint, vq); +- if (spdk_unlikely(!req)) { +- SPDK_ERRLOG("Error to get request\n"); +- /* TODO: address the error case */ +- assert(false); +- return -EIO; +- } +- +- ret = virtio_dev_packed_iovs_setup(dev, vq, vq->last_avail_idx, desc, req); +- if (spdk_unlikely(ret)) { +- /* let the device to response the error */ +- SPDK_ERRLOG("virtio_dev_packed_iovs_setup failed\n"); +- } +- +- assert(virtio_endpoint->virtio_ops.exec_request); +- virtio_endpoint->io_outstanding++; +- virtio_endpoint->virtio_ops.exec_request(virtio_endpoint, vq, req); +- } +- +- return i; +-} +- +-struct vfu_virtio_req * +-virito_dev_packed_ring_get_next_avail_req(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- struct vring_packed_desc *desc; +- int ret; +- struct vfu_virtio_req *req; +- +- desc = &vq->desc.desc_packed[vq->last_avail_idx]; +- if (!virtio_vring_packed_is_avail(desc, vq->packed.avail_phase)) { +- return NULL; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_io, "%s: get 1 descriptors\n", dev->name); +- +- req = vfu_virtio_dev_get_req(virtio_endpoint, vq); +- if (!req) { +- SPDK_ERRLOG("Error to get request\n"); +- return NULL; +- } +- +- ret = virtio_dev_packed_iovs_setup(dev, vq, vq->last_avail_idx, desc, req); +- if (ret) { +- SPDK_ERRLOG("virtio_dev_packed_iovs_setup failed\n"); +- vfu_virtio_dev_put_req(req); +- return NULL; +- } +- +- return req; +-} +- +-static int +-virtio_vfu_pci_common_cfg(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- 
size_t count, loff_t pos, bool is_write) +-{ +- struct vfu_virtio_dev *dev = virtio_endpoint->dev; +- uint32_t offset, value = 0; +- int ret; +- +- assert(count <= 4); +- offset = pos - VIRTIO_PCI_COMMON_CFG_OFFSET; +- +- if (is_write) { +- memcpy(&value, buf, count); +- switch (offset) { +- case VIRTIO_PCI_COMMON_DFSELECT: +- dev->cfg.host_feature_select = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_DFSELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_GFSELECT: +- dev->cfg.guest_feature_select = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GFSELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_GF: +- assert(dev->cfg.guest_feature_select <= 1); +- if (dev->cfg.guest_feature_select) { +- dev->cfg.guest_feat_hi = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GF_HI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } else { +- dev->cfg.guest_feat_lo = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GF_LO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } +- +- ret = virtio_dev_set_features(dev, +- (((uint64_t)dev->cfg.guest_feat_hi << 32) | dev->cfg.guest_feat_lo)); +- if (ret) { +- return ret; +- } +- break; +- case VIRTIO_PCI_COMMON_MSIX: +- dev->cfg.msix_config = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_MSIX with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_STATUS: +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_STATUS with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- ret = virtio_dev_set_status(dev, value); +- if (ret) { +- return ret; +- } +- break; +- case VIRTIO_PCI_COMMON_Q_SELECT: +- if (value < VIRTIO_DEV_MAX_VQS) { +- dev->cfg.queue_select = value; +- } +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_SELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_Q_SIZE: +- dev->vqs[dev->cfg.queue_select].qsize = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_SIZE with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_Q_MSIX: +- dev->vqs[dev->cfg.queue_select].vector = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_MSIX with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_Q_ENABLE: +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_ENABLE with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- if (value == 1) { +- ret = virtio_dev_enable_vq(dev, dev->cfg.queue_select); +- if (ret) { +- return ret; +- } +- } else { +- ret = virtio_dev_disable_vq(dev, dev->cfg.queue_select); +- if (ret) { +- return ret; +- } +- } +- break; +- case VIRTIO_PCI_COMMON_Q_DESCLO: +- dev->vqs[dev->cfg.queue_select].desc_lo = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_DESCLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_DESCHI: +- dev->vqs[dev->cfg.queue_select].desc_hi = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_DESCHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case 
VIRTIO_PCI_COMMON_Q_AVAILLO: +- dev->vqs[dev->cfg.queue_select].avail_lo = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_AVAILLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_AVAILHI: +- dev->vqs[dev->cfg.queue_select].avail_hi = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_AVAILHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_USEDLO: +- dev->vqs[dev->cfg.queue_select].used_lo = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_USEDLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_USEDHI: +- dev->vqs[dev->cfg.queue_select].used_hi = value; +- SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_USEDHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- +- default: +- SPDK_ERRLOG("%s: WRITE UNSUPPORTED offset 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), offset); +- errno = EIO; +- return -1; +- } +- } else { +- switch (offset) { +- case VIRTIO_PCI_COMMON_DFSELECT: +- value = dev->cfg.host_feature_select; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_DFSELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_DF: +- assert(dev->cfg.host_feature_select <= 1); +- if (dev->cfg.host_feature_select) { +- value = dev->host_features >> 32; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_DF_HI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } else { +- value = dev->host_features; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_DF_LO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } +- break; +- case VIRTIO_PCI_COMMON_GFSELECT: +- value = dev->cfg.guest_feature_select; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GFSELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_GF: +- assert(dev->cfg.guest_feature_select <= 1); +- if (dev->cfg.guest_feature_select) { +- value = dev->cfg.guest_feat_hi; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GF_HI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } else { +- value = dev->cfg.guest_feat_lo; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GF_LO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- } +- break; +- case VIRTIO_PCI_COMMON_MSIX: +- value = dev->cfg.msix_config; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_MSIX with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_NUMQ: +- value = dev->num_queues; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_NUMQ with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_STATUS: +- value = dev->cfg.device_status; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_STATUS with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_CFGGENERATION: +- value = dev->cfg.config_generation; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_CFGGENERATION with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- 
value); +- break; +- case VIRTIO_PCI_COMMON_Q_NOFF: +- value = dev->cfg.queue_select; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_Q_NOFF with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_Q_SELECT: +- value = dev->cfg.queue_select; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_Q_SELECT with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- value); +- break; +- case VIRTIO_PCI_COMMON_Q_SIZE: +- value = dev->vqs[dev->cfg.queue_select].qsize; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_SIZE with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_MSIX: +- value = dev->vqs[dev->cfg.queue_select].vector; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_MSIX with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_ENABLE: +- value = dev->vqs[dev->cfg.queue_select].enabled; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_ENABLE with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_DESCLO: +- value = dev->vqs[dev->cfg.queue_select].desc_lo; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_DESCLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_DESCHI: +- value = dev->vqs[dev->cfg.queue_select].desc_hi; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_DESCHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_AVAILLO: +- value = dev->vqs[dev->cfg.queue_select].avail_lo; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_AVAILLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_AVAILHI: +- value = dev->vqs[dev->cfg.queue_select].avail_hi; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_AVAILHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_USEDLO: +- value = dev->vqs[dev->cfg.queue_select].used_lo; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_USEDLO with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- case VIRTIO_PCI_COMMON_Q_USEDHI: +- value = dev->vqs[dev->cfg.queue_select].used_hi; +- SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_USEDHI with 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); +- break; +- default: +- SPDK_ERRLOG("%s: READ UNSUPPORTED offset 0x%x\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), offset); +- errno = EIO; +- return -1; +- } +- memcpy(buf, &value, count); +- } +- +- return count; +-} +- +-static int +-virtio_vfu_device_specific_cfg(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- size_t count, loff_t pos, bool is_write) +-{ +- loff_t offset; +- int ret = -1; +- +- assert(count <= 8); +- offset = pos - VIRTIO_PCI_SPECIFIC_CFG_OFFSET; +- if (!is_write) { +- if (virtio_endpoint->virtio_ops.get_config) { +- ret = virtio_endpoint->virtio_ops.get_config(virtio_endpoint, buf, offset, count); +- } +- } else { +- if 
(virtio_endpoint->virtio_ops.set_config) { +- ret = virtio_endpoint->virtio_ops.set_config(virtio_endpoint, buf, offset, count); +- } +- } +- +- if (ret < 0) { +- return ret; +- } +- +- return count; +-} +- +-static int +-virtio_vfu_pci_isr(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- size_t count, bool is_write) +-{ +- uint8_t *isr; +- +- if (count != 1) { +- SPDK_ERRLOG("ISR register is 1 byte\n"); +- errno = EIO; +- return -1; +- } +- +- isr = buf; +- +- if (!is_write) { +- SPDK_DEBUGLOG(vfu_virtio, "READ PCI ISR\n"); +- /* Read-Acknowledge Clear */ +- *isr = virtio_endpoint->dev->cfg.isr; +- virtio_endpoint->dev->cfg.isr = 0; +- } else { +- SPDK_ERRLOG("ISR register is RO\n"); +- errno = EIO; +- return -1; +- } +- +- return count; +-} +- +-static ssize_t +-virtio_vfu_access_bar4(vfu_ctx_t *vfu_ctx, char *buf, size_t count, +- loff_t pos, +- bool is_write) +-{ +- struct spdk_vfu_endpoint *endpoint = vfu_get_private(vfu_ctx); +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- uint64_t start, end; +- +- start = pos; +- end = start + count; +- SPDK_DEBUGLOG(vfu_virtio, "%s: %s bar4 0x%"PRIX64"-0x%"PRIX64", len = %lu\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- is_write ? "write" : "read", start, end - 1, count); +- +- if (end < VIRTIO_PCI_COMMON_CFG_OFFSET + VIRTIO_PCI_COMMON_CFG_LENGTH) { +- /* virtio PCI common configuration */ +- return virtio_vfu_pci_common_cfg(virtio_endpoint, buf, count, pos, is_write); +- } else if (start >= VIRTIO_PCI_ISR_ACCESS_OFFSET && +- end < VIRTIO_PCI_ISR_ACCESS_OFFSET + VIRTIO_PCI_ISR_ACCESS_LENGTH) { +- /* ISR access */ +- return virtio_vfu_pci_isr(virtio_endpoint, buf, count, is_write); +- } else if (start >= VIRTIO_PCI_SPECIFIC_CFG_OFFSET && +- end < VIRTIO_PCI_SPECIFIC_CFG_OFFSET + VIRTIO_PCI_SPECIFIC_CFG_LENGTH) { +- /* Device specific configuration */ +- return virtio_vfu_device_specific_cfg(virtio_endpoint, buf, count, pos, is_write); +- } else if (start >= VIRTIO_PCI_NOTIFICATIONS_OFFSET && +- end < VIRTIO_PCI_NOTIFICATIONS_OFFSET + VIRTIO_PCI_NOTIFICATIONS_LENGTH) { +- /* Notifications */ +- /* Sparse mmap region by default, there are no MMIO R/W messages */ +- assert(false); +- return count; +- } else { +- assert(false); +- } +- +- return 0; +-} +- +-int +-vfu_virtio_post_memory_add(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct vfu_virtio_dev *dev = virtio_endpoint->dev; +- uint32_t i; +- +- if (!dev) { +- return 0; +- } +- +- for (i = 0; i < dev->num_queues; i++) { +- /* Try to remap VQs if necessary */ +- virtio_dev_map_vq(dev, &dev->vqs[i]); +- } +- +- return 0; +-} +- +-int +-vfu_virtio_pre_memory_remove(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- +- if (virtio_endpoint->dev != NULL) { +- vfu_virtio_dev_unmap_vqs(virtio_endpoint->dev, map_start, map_end); +- } +- +- return 0; +-} +- +-int +-vfu_virtio_pci_reset_cb(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- +- if (virtio_endpoint->dev) { +- vfu_virtio_dev_stop(virtio_endpoint->dev); +- vfu_virtio_dev_reset(virtio_endpoint->dev); +- } +- +- return 0; +-} +- +-static ssize_t +-access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, +- bool is_write) +-{ +- struct 
spdk_vfu_endpoint *endpoint = vfu_get_private(vfu_ctx); +- void *pci_config = spdk_vfu_endpoint_get_pci_config(endpoint); +- +- SPDK_DEBUGLOG(vfu_virtio, +- "%s: PCI_CFG %s %#lx-%#lx\n", +- spdk_vfu_get_endpoint_id(endpoint), is_write ? "write" : "read", +- offset, offset + count); +- +- if (is_write) { +- SPDK_ERRLOG("write %#lx-%#lx not supported\n", +- offset, offset + count); +- errno = EINVAL; +- return -1; +- } +- +- if (offset + count > 0x1000) { +- SPDK_ERRLOG("access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", +- offset, count, 0x1000); +- errno = ERANGE; +- return -1; +- } +- +- memcpy(buf, ((unsigned char *)pci_config) + offset, count); +- return count; +-} +- +-static int +-vfu_virtio_dev_start(struct vfu_virtio_dev *dev) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- int ret = 0; +- +- SPDK_DEBUGLOG(vfu_virtio, "start %s\n", dev->name); +- +- if (virtio_dev_is_started(dev)) { +- SPDK_ERRLOG("Device %s is already started\n", dev->name); +- return -EFAULT; +- } +- +- if (virtio_endpoint->virtio_ops.start_device) { +- virtio_endpoint->io_outstanding = 0; +- ret = virtio_endpoint->virtio_ops.start_device(virtio_endpoint); +- } +- +- SPDK_DEBUGLOG(vfu_virtio, "%s is started with ret %d\n", dev->name, ret); +- +- return ret; +-} +- +-static int +-vfu_virtio_dev_stop(struct vfu_virtio_dev *dev) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; +- int ret = 0; +- +- SPDK_DEBUGLOG(vfu_virtio, "stop %s\n", dev->name); +- +- if (!virtio_dev_is_started(dev)) { +- SPDK_DEBUGLOG(vfu_virtio, "%s isn't started\n", dev->name); +- return 0; +- } +- +- if (virtio_endpoint->virtio_ops.stop_device) { +- ret = virtio_endpoint->virtio_ops.stop_device(virtio_endpoint); +- assert(ret == 0); +- } +- +- /* Unmap all VQs */ +- vfu_virtio_dev_unmap_vqs(dev, NULL, NULL); +- +- return ret; +-} +- +-int +-vfu_virtio_detach_device(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct vfu_virtio_dev *dev = virtio_endpoint->dev; +- +- if (virtio_endpoint->dev == NULL) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio, "detach device %s\n", dev->name); +- +- vfu_virtio_dev_stop(dev); +- vfu_virtio_dev_free_reqs(virtio_endpoint, dev); +- virtio_endpoint->dev = NULL; +- free(dev); +- +- return 0; +-} +- +-int +-vfu_virtio_attach_device(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- uint64_t supported_features = 0; +- struct vfu_virtio_dev *dev; +- struct vfu_virtio_vq *vq; +- struct vfu_virtio_req *req; +- uint32_t i, j; +- int ret = 0; +- +- dev = calloc(1, sizeof(*dev) + virtio_endpoint->num_queues * 3 * dma_sg_size()); +- if (dev == NULL) { +- return -ENOMEM; +- } +- +- dev->num_queues = virtio_endpoint->num_queues; +- for (i = 0; i < dev->num_queues; i++) { +- vq = &dev->vqs[i]; +- vq->id = i; +- vq->qsize = virtio_endpoint->qsize; +- vq->avail.sg = (dma_sg_t *)(dev->sg + i * dma_sg_size() * 3); +- vq->used.sg = (dma_sg_t *)((uint8_t *)vq->avail.sg + dma_sg_size()); +- vq->desc.sg = (dma_sg_t *)((uint8_t *)vq->used.sg + dma_sg_size()); +- +- STAILQ_INIT(&vq->free_reqs); +- for (j = 0; j <= vq->qsize; j++) { +- req = vfu_virtio_vq_alloc_req(virtio_endpoint, vq); +- if (!req) { +- SPDK_ERRLOG("Error to allocate req\n"); +- ret = -ENOMEM; +- goto out; +- } +- req->indirect_iov = &req->iovs[VIRTIO_DEV_MAX_IOVS]; +- req->indirect_sg = virtio_req_to_sg_t(req, 
VIRTIO_DEV_MAX_IOVS); +- req->dev = dev; +- req->vq = vq; +- STAILQ_INSERT_TAIL(&vq->free_reqs, req, link); +- } +- } +- +- if (virtio_endpoint->virtio_ops.get_device_features) { +- supported_features = virtio_endpoint->virtio_ops.get_device_features(virtio_endpoint); +- } +- dev->host_features = supported_features; +- +- snprintf(dev->name, SPDK_VFU_MAX_NAME_LEN, "%s", +- spdk_vfu_get_endpoint_name(virtio_endpoint->endpoint)); +- virtio_endpoint->dev = dev; +- dev->virtio_endpoint = virtio_endpoint; +- virtio_endpoint->thread = spdk_get_thread(); +- return 0; +- +-out: +- vfu_virtio_dev_free_reqs(virtio_endpoint, dev); +- return ret; +-} +- +-int +-vfu_virtio_endpoint_setup(struct vfu_virtio_endpoint *virtio_endpoint, +- struct spdk_vfu_endpoint *endpoint, +- char *basename, const char *endpoint_name, +- struct vfu_virtio_ops *ops) +-{ +- char path[PATH_MAX] = ""; +- int ret; +- +- if (!ops) { +- return -EINVAL; +- } +- +- ret = snprintf(path, PATH_MAX, "%s%s_bar4", basename, endpoint_name); +- if (ret < 0 || ret >= PATH_MAX) { +- SPDK_ERRLOG("%s: error to get socket path: %s.\n", basename, spdk_strerror(errno)); +- return -EINVAL; +- } +- +- ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); +- if (ret == -1) { +- SPDK_ERRLOG("%s: failed to open device memory at %s.\n", +- path, spdk_strerror(errno)); +- return ret; +- } +- unlink(path); +- +- virtio_endpoint->devmem_fd = ret; +- ret = ftruncate(virtio_endpoint->devmem_fd, VIRTIO_PCI_BAR4_LENGTH); +- if (ret != 0) { +- SPDK_ERRLOG("%s: error to ftruncate file %s.\n", path, +- spdk_strerror(errno)); +- close(virtio_endpoint->devmem_fd); +- return ret; +- } +- +- virtio_endpoint->doorbells = mmap(NULL, VIRTIO_PCI_NOTIFICATIONS_LENGTH, PROT_READ | PROT_WRITE, +- MAP_SHARED, +- virtio_endpoint->devmem_fd, VIRTIO_PCI_NOTIFICATIONS_OFFSET); +- if (virtio_endpoint->doorbells == MAP_FAILED) { +- SPDK_ERRLOG("%s: error to mmap file %s.\n", path, spdk_strerror(errno)); +- close(virtio_endpoint->devmem_fd); +- return -EFAULT; +- } +- virtio_endpoint->endpoint = endpoint; +- virtio_endpoint->virtio_ops = *ops; +- virtio_endpoint->num_queues = VIRTIO_DEV_MAX_VQS; +- virtio_endpoint->qsize = VIRTIO_VQ_DEFAULT_SIZE; +- +- SPDK_DEBUGLOG(vfu_virtio, "mmap file %s, devmem_fd %d\n", path, virtio_endpoint->devmem_fd); +- return 0; +-} +- +-int +-vfu_virtio_endpoint_destruct(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- if (virtio_endpoint->doorbells) { +- munmap((void *)virtio_endpoint->doorbells, VIRTIO_PCI_NOTIFICATIONS_LENGTH); +- } +- +- if (virtio_endpoint->devmem_fd) { +- close(virtio_endpoint->devmem_fd); +- } +- +- return 0; +-} +- +-static int +-vfu_virtio_quiesce_poll(void *ctx) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = ctx; +- vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); +- +- if (virtio_endpoint->io_outstanding) { +- return SPDK_POLLER_IDLE; +- } +- +- spdk_poller_unregister(&virtio_endpoint->quiesce_poller); +- virtio_endpoint->quiesce_in_progress = false; +- vfu_device_quiesced(vfu_ctx, 0); +- +- return SPDK_POLLER_BUSY; +-} +- +-int +-vfu_virtio_quiesce_cb(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- +- if (virtio_endpoint->quiesce_in_progress) { +- return -EBUSY; +- } +- +- if (!virtio_endpoint->io_outstanding) { +- return 0; +- } +- +- virtio_endpoint->quiesce_in_progress = true; +- virtio_endpoint->quiesce_poller = SPDK_POLLER_REGISTER(vfu_virtio_quiesce_poll, virtio_endpoint, +- 10); +- +- return -EBUSY; 
+-} +- +-static struct spdk_vfu_pci_device vfu_virtio_device_info = { +- .id = { +- .vid = SPDK_PCI_VID_VIRTIO, +- /* Realize when calling get device information */ +- .did = 0x0, +- .ssvid = SPDK_PCI_VID_VIRTIO, +- .ssid = 0x0, +- }, +- +- .class = { +- /* 0x01, mass storage controller */ +- .bcc = 0x01, +- /* 0x00, SCSI controller */ +- .scc = 0x00, +- /* 0x00, SCSI controller - vendor specific interface */ +- .pi = 0x00, +- }, +- +- .pmcap = { +- .hdr.id = PCI_CAP_ID_PM, +- .pmcs.nsfrst = 0x1, +- }, +- +- .pxcap = { +- .hdr.id = PCI_CAP_ID_EXP, +- .pxcaps.ver = 0x2, +- .pxdcap = {.rer = 0x1, .flrc = 0x1}, +- .pxdcap2.ctds = 0x1, +- }, +- +- .msixcap = { +- .hdr.id = PCI_CAP_ID_MSIX, +- .mxc.ts = VIRTIO_DEV_MAX_VQS - 1, +- .mtab = {.tbir = 0x1, .to = 0x0}, +- .mpba = {.pbir = 0x2, .pbao = 0x0}, +- }, +- +- .nr_vendor_caps = 4, +- +- .intr_ipin = 0x1, +- .nr_int_irqs = 0x1, +- .nr_msix_irqs = VIRTIO_DEV_MAX_VQS, +- +- .regions = { +- /* BAR0 */ +- {0}, +- /* BAR1 */ +- { +- .access_cb = NULL, +- .offset = 0, +- .fd = -1, +- .len = 0x1000, +- .flags = VFU_REGION_FLAG_RW, +- .nr_sparse_mmaps = 0, +- }, +- /* BAR2 */ +- { +- .access_cb = NULL, +- .offset = 0, +- .fd = -1, +- .len = 0x1000, +- .flags = VFU_REGION_FLAG_RW, +- .nr_sparse_mmaps = 0, +- }, +- /* BAR3 */ +- {0}, +- /* BAR4 */ +- { +- .access_cb = virtio_vfu_access_bar4, +- .offset = 0, +- .fd = -1, +- .len = VIRTIO_PCI_BAR4_LENGTH, +- .flags = VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, +- .nr_sparse_mmaps = 1, +- .mmaps = { +- { +- .offset = VIRTIO_PCI_NOTIFICATIONS_OFFSET, +- .len = VIRTIO_PCI_NOTIFICATIONS_LENGTH, +- }, +- }, +- }, +- /* BAR5 */ +- {0}, +- /* BAR6 */ +- {0}, +- /* ROM */ +- {0}, +- /* PCI Config */ +- { +- .access_cb = access_pci_config, +- .offset = 0, +- .fd = -1, +- .len = 0x1000, +- .flags = VFU_REGION_FLAG_RW, +- .nr_sparse_mmaps = 0, +- }, +- }, +-}; +- +-void +-vfu_virtio_get_device_info(struct vfu_virtio_endpoint *virtio_endpoint, +- struct spdk_vfu_pci_device *device_info) +-{ +- memcpy(device_info, &vfu_virtio_device_info, sizeof(*device_info)); +- +- /* BAR4 Region FD */ +- device_info->regions[VFU_PCI_DEV_BAR4_REGION_IDX].fd = virtio_endpoint->devmem_fd; +- SPDK_DEBUGLOG(vfu_virtio, "%s: get device information, fd %d\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- virtio_endpoint->devmem_fd); +-} +- +-static struct virtio_pci_cap common_cap = { +- .cap_vndr = PCI_CAP_ID_VNDR, +- .cap_len = sizeof(common_cap), +- .cfg_type = VIRTIO_PCI_CAP_COMMON_CFG, +- .bar = 4, +- .offset = VIRTIO_PCI_COMMON_CFG_OFFSET, +- .length = VIRTIO_PCI_COMMON_CFG_LENGTH, +-}; +- +-static struct virtio_pci_cap isr_cap = { +- .cap_vndr = PCI_CAP_ID_VNDR, +- .cap_len = sizeof(isr_cap), +- .cfg_type = VIRTIO_PCI_CAP_ISR_CFG, +- .bar = 4, +- .offset = VIRTIO_PCI_ISR_ACCESS_OFFSET, +- .length = VIRTIO_PCI_ISR_ACCESS_LENGTH, +-}; +- +-static struct virtio_pci_cap dev_cap = { +- .cap_vndr = PCI_CAP_ID_VNDR, +- .cap_len = sizeof(dev_cap), +- .cfg_type = VIRTIO_PCI_CAP_DEVICE_CFG, +- .bar = 4, +- .offset = VIRTIO_PCI_SPECIFIC_CFG_OFFSET, +- .length = VIRTIO_PCI_SPECIFIC_CFG_LENGTH, +-}; +- +-static struct virtio_pci_notify_cap notify_cap = { +- .cap = { +- .cap_vndr = PCI_CAP_ID_VNDR, +- .cap_len = sizeof(notify_cap), +- .cfg_type = VIRTIO_PCI_CAP_NOTIFY_CFG, +- .bar = 4, +- .offset = VIRTIO_PCI_NOTIFICATIONS_OFFSET, +- .length = VIRTIO_PCI_NOTIFICATIONS_LENGTH, +- }, +- .notify_off_multiplier = 4, +-}; +- +-uint16_t +-vfu_virtio_get_vendor_capability(struct spdk_vfu_endpoint *endpoint, char *buf, +- uint16_t buf_len, 
+- uint16_t idx) +-{ +- uint16_t len; +- +- SPDK_DEBUGLOG(vfu_virtio, "%s: get vendor capability, idx %u\n", +- spdk_vfu_get_endpoint_id(endpoint), idx); +- +- switch (idx) { +- case 0: +- assert(buf_len > sizeof(common_cap)); +- memcpy(buf, &common_cap, sizeof(common_cap)); +- len = sizeof(common_cap); +- break; +- case 1: +- assert(buf_len > sizeof(isr_cap)); +- memcpy(buf, &isr_cap, sizeof(isr_cap)); +- len = sizeof(isr_cap); +- break; +- case 2: +- assert(buf_len > sizeof(dev_cap)); +- memcpy(buf, &dev_cap, sizeof(dev_cap)); +- len = sizeof(dev_cap); +- break; +- case 3: +- assert(buf_len > sizeof(notify_cap)); +- memcpy(buf, ¬ify_cap, sizeof(notify_cap)); +- len = sizeof(notify_cap); +- break; +- default: +- return 0; +- } +- +- return len; +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vfu_virtio) +-SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_io) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++/* ++ * virtio over vfio-user common library ++ */ ++#include "spdk/env.h" ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++#include "spdk/stdinc.h" ++#include "spdk/assert.h" ++#include "spdk/barrier.h" ++#include "spdk/thread.h" ++#include "spdk/memory.h" ++#include "spdk/util.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/likely.h" ++ ++#include "vfu_virtio_internal.h" ++ ++static int vfu_virtio_dev_start(struct vfu_virtio_dev *dev); ++static int vfu_virtio_dev_stop(struct vfu_virtio_dev *dev); ++ ++static inline void ++vfu_virtio_unmap_q(struct vfu_virtio_dev *dev, struct q_mapping *mapping) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ ++ if (mapping->addr != NULL) { ++ spdk_vfu_unmap_sg(virtio_endpoint->endpoint, mapping->sg, ++ &mapping->iov, 1); ++ mapping->addr = NULL; ++ mapping->len = 0; ++ } ++} ++ ++static inline int ++vfu_virtio_map_q(struct vfu_virtio_dev *dev, struct q_mapping *mapping, uint64_t phys_addr, ++ uint64_t len) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ void *addr; ++ ++ if (!mapping->addr && len && phys_addr) { ++ addr = spdk_vfu_map_one(virtio_endpoint->endpoint, phys_addr, len, ++ mapping->sg, &mapping->iov, PROT_READ | PROT_WRITE); ++ if (addr == NULL) { ++ return -EINVAL; ++ } ++ mapping->phys_addr = phys_addr; ++ mapping->len = len; ++ mapping->addr = addr; ++ } ++ ++ return 0; ++} ++ ++static int ++virtio_dev_map_vq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ int ret; ++ uint64_t phys_addr, len; ++ ++ if (!vq->enabled || (vq->q_state == VFU_VQ_ACTIVE)) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: try to map vq %u\n", dev->name, vq->id); ++ ++ len = virtio_queue_desc_size(dev, vq); ++ phys_addr = ((((uint64_t)vq->desc_hi) << 32) | vq->desc_lo); ++ ret = vfu_virtio_map_q(dev, &vq->desc, phys_addr, len); ++ if (ret) { ++ SPDK_DEBUGLOG(vfu_virtio, "Error to map descs\n"); ++ return ret; ++ } ++ ++ len = virtio_queue_avail_size(dev, vq); ++ phys_addr = ((((uint64_t)vq->avail_hi) << 32) | vq->avail_lo); ++ ret = vfu_virtio_map_q(dev, &vq->avail, phys_addr, len); ++ if (ret) { ++ vfu_virtio_unmap_q(dev, &vq->desc); ++ SPDK_DEBUGLOG(vfu_virtio, "Error to map available ring\n"); ++ return ret; ++ } ++ ++ len = virtio_queue_used_size(dev, vq); ++ phys_addr = ((((uint64_t)vq->used_hi) << 32) | vq->used_lo); ++ ret = vfu_virtio_map_q(dev, &vq->used, phys_addr, len); ++ if (ret) { ++ vfu_virtio_unmap_q(dev, &vq->desc); ++ vfu_virtio_unmap_q(dev, &vq->avail); ++ SPDK_DEBUGLOG(vfu_virtio, 
"Error to map used ring\n"); ++ return ret; ++ } ++ ++ /* We're running with polling mode */ ++ if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { ++ vq->used.device_event->flags = VRING_PACKED_EVENT_FLAG_DISABLE; ++ } else { ++ vq->used.used->flags = VRING_USED_F_NO_NOTIFY; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: map vq %u successfully\n", dev->name, vq->id); ++ vq->q_state = VFU_VQ_ACTIVE; ++ ++ return 0; ++} ++ ++static void ++virtio_dev_unmap_vq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: unmap vq %u\n", dev->name, vq->id); ++ vq->q_state = VFU_VQ_INACTIVE; ++ ++ vfu_virtio_unmap_q(dev, &vq->desc); ++ vfu_virtio_unmap_q(dev, &vq->avail); ++ vfu_virtio_unmap_q(dev, &vq->used); ++} ++ ++static bool ++vfu_virtio_vq_should_unmap(struct vfu_virtio_vq *vq, void *map_start, void *map_end) ++{ ++ /* always do unmap when stopping the device */ ++ if (!map_start || !map_end) { ++ return true; ++ } ++ ++ if (vq->desc.addr >= map_start && vq->desc.addr < map_end) { ++ return true; ++ } ++ ++ if (vq->avail.addr >= map_start && vq->avail.addr < map_end) { ++ return true; ++ } ++ ++ if (vq->used.addr >= map_start && vq->used.addr < map_end) { ++ return true; ++ } ++ ++ return false; ++} ++ ++static void ++vfu_virtio_dev_unmap_vqs(struct vfu_virtio_dev *dev, void *map_start, void *map_end) ++{ ++ uint32_t i; ++ struct vfu_virtio_vq *vq; ++ ++ for (i = 0; i < dev->num_queues; i++) { ++ vq = &dev->vqs[i]; ++ if (!vq->enabled) { ++ continue; ++ } ++ ++ if (!vfu_virtio_vq_should_unmap(vq, map_start, map_end)) { ++ continue; ++ } ++ virtio_dev_unmap_vq(dev, vq); ++ } ++} ++ ++/* This function is used to notify VM that the device ++ * configuration space has been changed. ++ */ ++void ++vfu_virtio_notify_config(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct spdk_vfu_endpoint *endpoint = virtio_endpoint->endpoint; ++ ++ if (virtio_endpoint->dev == NULL) { ++ return; ++ } ++ ++ virtio_endpoint->dev->cfg.isr = 1; ++ virtio_endpoint->dev->cfg.config_generation++; ++ ++ vfu_irq_trigger(spdk_vfu_get_vfu_ctx(endpoint), virtio_endpoint->dev->cfg.msix_config); ++} ++ ++static void ++vfu_virtio_dev_reset(struct vfu_virtio_dev *dev) ++{ ++ uint32_t i; ++ struct vfu_virtio_vq *vq; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "device %s resetting\n", dev->name); ++ ++ for (i = 0; i < dev->num_queues; i++) { ++ vq = &dev->vqs[i]; ++ ++ vq->q_state = VFU_VQ_CREATED; ++ vq->vector = 0; ++ vq->enabled = false; ++ vq->last_avail_idx = 0; ++ vq->last_used_idx = 0; ++ ++ vq->packed.packed_ring = false; ++ vq->packed.avail_phase = 0; ++ vq->packed.used_phase = 0; ++ } ++ ++ memset(&dev->cfg, 0, sizeof(struct virtio_pci_cfg)); ++} ++ ++static int ++virtio_dev_set_status(struct vfu_virtio_dev *dev, uint8_t status) ++{ ++ int ret = 0; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "device current status %x, set status %x\n", dev->cfg.device_status, ++ status); ++ ++ if (!(virtio_dev_is_started(dev))) { ++ if (status & VIRTIO_CONFIG_S_DRIVER_OK) { ++ ret = vfu_virtio_dev_start(dev); ++ } ++ } else { ++ if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) { ++ ret = vfu_virtio_dev_stop(dev); ++ } ++ } ++ ++ if (ret) { ++ SPDK_ERRLOG("Failed to start/stop device\n"); ++ return ret; ++ } ++ ++ dev->cfg.device_status = status; ++ ++ if (status == 0) { ++ vfu_virtio_dev_reset(dev); ++ } ++ ++ return 0; ++} ++ ++static int ++virtio_dev_set_features(struct vfu_virtio_dev *dev, uint64_t features) ++{ ++ if (dev->cfg.device_status & VIRTIO_CONFIG_S_FEATURES_OK) { ++ SPDK_ERRLOG("Feature negotiation has 
finished\n"); ++ return -EINVAL; ++ } ++ ++ if (features & ~dev->host_features) { ++ SPDK_ERRLOG("Host features 0x%"PRIx64", guest features 0x%"PRIx64"\n", ++ dev->host_features, features); ++ return -ENOTSUP; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: negotiated features 0x%"PRIx64"\n", dev->name, ++ features); ++ dev->cfg.guest_features = features; ++ ++ return 0; ++} ++ ++static int ++virtio_dev_enable_vq(struct vfu_virtio_dev *dev, uint16_t qid) ++{ ++ struct vfu_virtio_vq *vq; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: enable vq %u\n", dev->name, qid); ++ ++ vq = &dev->vqs[qid]; ++ if (vq->enabled) { ++ SPDK_ERRLOG("Queue %u is enabled\n", qid); ++ return -EINVAL; ++ } ++ vq->enabled = true; ++ ++ if (virtio_dev_map_vq(dev, vq)) { ++ SPDK_ERRLOG("Queue %u failed to map\n", qid); ++ return 0; ++ } ++ ++ vq->avail.avail->idx = 0; ++ vq->last_avail_idx = 0; ++ vq->used.used->idx = 0; ++ vq->last_used_idx = 0; ++ ++ if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { ++ SPDK_DEBUGLOG(vfu_virtio, "%s: vq %u PACKED RING ENABLED\n", dev->name, qid); ++ vq->packed.packed_ring = true; ++ vq->packed.avail_phase = true; ++ vq->packed.used_phase = true; ++ } ++ ++ return 0; ++} ++ ++static int ++virtio_dev_disable_vq(struct vfu_virtio_dev *dev, uint16_t qid) ++{ ++ struct vfu_virtio_vq *vq; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: disable vq %u\n", dev->name, qid); ++ ++ vq = &dev->vqs[qid]; ++ if (!vq->enabled) { ++ SPDK_NOTICELOG("Queue %u isn't enabled\n", qid); ++ return 0; ++ } ++ ++ virtio_dev_unmap_vq(dev, vq); ++ ++ vq->q_state = VFU_VQ_CREATED; ++ vq->vector = 0; ++ vq->enabled = false; ++ vq->last_avail_idx = 0; ++ vq->last_used_idx = 0; ++ vq->packed.packed_ring = false; ++ vq->packed.avail_phase = 0; ++ vq->packed.used_phase = 0; ++ ++ return 0; ++} ++ ++static int ++virtio_dev_split_get_avail_reqs(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, ++ uint16_t *reqs, uint16_t max_reqs) ++{ ++ uint16_t count, i, avail_idx, last_idx; ++ ++ last_idx = vq->last_avail_idx; ++ avail_idx = vq->avail.avail->idx; ++ ++ spdk_smp_rmb(); ++ ++ count = avail_idx - last_idx; ++ if (count == 0) { ++ return 0; ++ } ++ ++ count = spdk_min(count, max_reqs); ++ vq->last_avail_idx += count; ++ ++ for (i = 0; i < count; i++) { ++ reqs[i] = vq->avail.avail->ring[(last_idx + i) & (vq->qsize - 1)]; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, ++ "AVAIL: vq %u last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", ++ vq->id, last_idx, avail_idx, count); ++ ++ return count; ++} ++ ++static int ++virtio_vring_split_desc_get_next(struct vring_desc **desc, ++ struct vring_desc *desc_table, ++ uint32_t desc_table_size) ++{ ++ struct vring_desc *old_desc = *desc; ++ uint16_t next_idx; ++ ++ if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { ++ *desc = NULL; ++ return 0; ++ } ++ ++ next_idx = old_desc->next; ++ if (spdk_unlikely(next_idx >= desc_table_size)) { ++ *desc = NULL; ++ return -1; ++ } ++ ++ *desc = &desc_table[next_idx]; ++ return 0; ++} ++ ++static inline void * ++virtio_vring_desc_to_iov(struct vfu_virtio_dev *dev, struct vring_desc *desc, ++ dma_sg_t *sg, struct iovec *iov) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ ++ return spdk_vfu_map_one(virtio_endpoint->endpoint, desc->addr, desc->len, ++ sg, iov, PROT_READ | PROT_WRITE); ++} ++ ++static int ++virtio_split_vring_get_desc(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, uint16_t desc_idx, ++ struct vring_desc **desc, struct vring_desc **desc_table, ++ uint32_t *desc_table_size, ++ dma_sg_t *sg, struct iovec 
*iov) ++{ ++ *desc = &vq->desc.desc[desc_idx]; ++ ++ if (virtio_vring_split_desc_is_indirect(*desc)) { ++ *desc_table_size = (*desc)->len / sizeof(struct vring_desc); ++ *desc_table = virtio_vring_desc_to_iov(dev, *desc, sg, iov); ++ *desc = *desc_table; ++ if (*desc == NULL) { ++ return -EINVAL; ++ } ++ return 0; ++ } ++ ++ *desc_table = vq->desc.desc; ++ *desc_table_size = vq->qsize; ++ ++ return 0; ++} ++ ++static inline dma_sg_t * ++virtio_req_to_sg_t(struct vfu_virtio_req *req, uint32_t iovcnt) ++{ ++ return (dma_sg_t *)(req->sg + iovcnt * dma_sg_size()); ++} ++ ++static inline struct vfu_virtio_req * ++vfu_virtio_dev_get_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_req *req; ++ ++ req = STAILQ_FIRST(&vq->free_reqs); ++ if (req == NULL) { ++ return NULL; ++ } ++ STAILQ_REMOVE_HEAD(&vq->free_reqs, link); ++ ++ req->iovcnt = 0; ++ req->used_len = 0; ++ req->payload_size = 0; ++ req->req_idx = 0; ++ req->buffer_id = 0; ++ req->num_descs = 0; ++ ++ return req; ++} ++ ++void ++vfu_virtio_dev_put_req(struct vfu_virtio_req *req) ++{ ++ struct vfu_virtio_dev *dev = req->dev; ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); ++ ++ if (req->indirect_iov->iov_base) { ++ vfu_sgl_put(vfu_ctx, req->indirect_sg, req->indirect_iov, 1); ++ req->indirect_iov->iov_base = NULL; ++ req->indirect_iov->iov_len = 0; ++ } ++ ++ if (req->iovcnt) { ++ vfu_sgl_put(vfu_ctx, virtio_req_to_sg_t(req, 0), req->iovs, req->iovcnt); ++ req->iovcnt = 0; ++ } ++ ++ STAILQ_INSERT_HEAD(&req->vq->free_reqs, req, link); ++} ++ ++void ++vfu_virtio_finish_req(struct vfu_virtio_req *req) ++{ ++ struct vfu_virtio_dev *dev = req->dev; ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ ++ assert(virtio_endpoint->io_outstanding); ++ virtio_endpoint->io_outstanding--; ++ ++ if (!virtio_guest_has_feature(req->dev, VIRTIO_F_RING_PACKED)) { ++ virtio_vq_used_ring_split_enqueue(req->vq, req->req_idx, req->used_len); ++ } else { ++ virtio_vq_used_ring_packed_enqueue(req->vq, req->buffer_id, req->num_descs, req->used_len); ++ } ++ ++ vfu_virtio_dev_put_req(req); ++} ++ ++static inline void ++vfu_virtio_dev_free_reqs(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_dev *dev) ++{ ++ struct vfu_virtio_req *req; ++ struct vfu_virtio_vq *vq; ++ uint32_t i; ++ ++ for (i = 0; i < dev->num_queues; i++) { ++ vq = &dev->vqs[i]; ++ while (!STAILQ_EMPTY(&vq->free_reqs)) { ++ req = STAILQ_FIRST(&vq->free_reqs); ++ STAILQ_REMOVE_HEAD(&vq->free_reqs, link); ++ vfu_virtio_vq_free_req(virtio_endpoint, vq, req); ++ } ++ } ++} ++ ++static int ++virtio_dev_split_iovs_setup(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, ++ uint16_t desc_idx, struct vfu_virtio_req *req) ++{ ++ struct vring_desc *desc, *desc_table; ++ uint32_t desc_table_size, len = 0; ++ uint32_t desc_handled_cnt = 0; ++ int rc; ++ ++ rc = virtio_split_vring_get_desc(dev, vq, desc_idx, &desc, ++ &desc_table, &desc_table_size, ++ req->indirect_sg, req->indirect_iov); ++ if (spdk_unlikely(rc)) { ++ SPDK_ERRLOG("Invalid descriptor at index %"PRIu16".\n", desc_idx); ++ return rc; ++ } ++ ++ assert(req->iovcnt == 0); ++ ++ while (true) { ++ if (spdk_unlikely(!virtio_vring_desc_to_iov(dev, desc, virtio_req_to_sg_t(req, req->iovcnt), ++ &req->iovs[req->iovcnt]))) { ++ return -EINVAL; ++ } ++ req->desc_writeable[req->iovcnt] = false; ++ if (virtio_vring_split_desc_is_wr(desc)) { ++ 
req->desc_writeable[req->iovcnt] = true; ++ } ++ ++ req->iovcnt++; ++ len += desc->len; ++ ++ rc = virtio_vring_split_desc_get_next(&desc, desc_table, desc_table_size); ++ if (spdk_unlikely(rc)) { ++ return rc; ++ } else if (desc == NULL) { ++ break; ++ } ++ ++ desc_handled_cnt++; ++ if (spdk_unlikely(desc_handled_cnt > desc_table_size)) { ++ return -EINVAL; ++ } ++ } ++ ++ req->payload_size = len; ++ ++ return 0; ++} ++ ++void ++virtio_vq_used_ring_split_enqueue(struct vfu_virtio_vq *vq, uint16_t req_idx, uint32_t used_len) ++{ ++ uint16_t last_idx = vq->last_used_idx & (vq->qsize - 1); ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, ++ "Queue %u - USED RING: last_idx=%"PRIu16" req_idx=%"PRIu16" used_len=%"PRIu32"\n", ++ vq->id, last_idx, req_idx, used_len); ++ ++ vq->used.used->ring[last_idx].id = req_idx; ++ vq->used.used->ring[last_idx].len = used_len; ++ vq->last_used_idx++; ++ ++ spdk_smp_wmb(); ++ ++ *(volatile uint16_t *)&vq->used.used->idx = vq->last_used_idx; ++ ++ vq->used_req_cnt++; ++} ++ ++void ++virtio_vq_used_ring_packed_enqueue(struct vfu_virtio_vq *vq, uint16_t buffer_id, uint32_t num_descs, ++ uint32_t used_len) ++{ ++ struct vring_packed_desc *desc = &vq->desc.desc_packed[vq->last_used_idx]; ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, ++ "Queue %u - USED RING: buffer_id=%"PRIu16" num_descs=%u used_len=%"PRIu32"\n", ++ vq->id, buffer_id, num_descs, used_len); ++ ++ if (spdk_unlikely(virtio_vring_packed_is_used(desc, vq->packed.used_phase))) { ++ SPDK_ERRLOG("descriptor has been used before\n"); ++ return; ++ } ++ ++ /* In used desc addr is unused and len specifies the buffer length ++ * that has been written to by the device. ++ */ ++ desc->addr = 0; ++ desc->len = used_len; ++ ++ /* This bit specifies whether any data has been written by the device */ ++ if (used_len != 0) { ++ desc->flags |= VRING_DESC_F_WRITE; ++ } ++ ++ /* Buffer ID is included in the last descriptor in the list. ++ * The driver needs to keep track of the size of the list corresponding ++ * to each buffer ID. ++ */ ++ desc->id = buffer_id; ++ ++ /* A device MUST NOT make the descriptor used before buffer_id is ++ * written to the descriptor. ++ */ ++ spdk_smp_wmb(); ++ ++ /* To mark a desc as used, the device sets the F_USED bit in flags to match ++ * the internal Device ring wrap counter. It also sets the F_AVAIL bit to ++ * match the same value. 
++ */ ++ if (vq->packed.used_phase) { ++ desc->flags |= (1 << VRING_PACKED_DESC_F_AVAIL); ++ desc->flags |= (1 << VRING_PACKED_DESC_F_USED); ++ } else { ++ desc->flags &= ~(1 << VRING_PACKED_DESC_F_AVAIL); ++ desc->flags &= ~(1 << VRING_PACKED_DESC_F_USED); ++ } ++ ++ vq->last_used_idx += num_descs; ++ if (vq->last_used_idx >= vq->qsize) { ++ vq->last_used_idx -= vq->qsize; ++ vq->packed.used_phase = !vq->packed.used_phase; ++ } ++ ++ vq->used_req_cnt++; ++} ++ ++static int ++vfu_virtio_vq_post_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); ++ ++ vq->used_req_cnt = 0; ++ ++ if (spdk_vfu_endpoint_msix_enabled(virtio_endpoint->endpoint)) { ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: Queue %u post MSIX IV %u\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ vq->id, vq->vector); ++ return vfu_irq_trigger(vfu_ctx, vq->vector); ++ } else { ++ if (!spdk_vfu_endpoint_intx_enabled(virtio_endpoint->endpoint)) { ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: IRQ disabled\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint)); ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: Queue %u post ISR\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), vq->id); ++ dev->cfg.isr = 1; ++ return vfu_irq_trigger(vfu_ctx, 0); ++ } ++} ++ ++void ++vfu_virtio_vq_flush_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ uint32_t delay_us; ++ ++ if (vq->used_req_cnt == 0) { ++ return; ++ } ++ ++ /* No need to notify client */ ++ if (virtio_queue_event_is_suppressed(dev, vq)) { ++ return; ++ } ++ ++ /* Interrupt coalescing disabled */ ++ if (!virtio_endpoint->coalescing_delay_us) { ++ vfu_virtio_vq_post_irq(dev, vq); ++ return; ++ } ++ ++ /* No need for event right now */ ++ if (spdk_get_ticks() < vq->next_event_time) { ++ return; ++ } ++ ++ vfu_virtio_vq_post_irq(dev, vq); ++ ++ delay_us = virtio_endpoint->coalescing_delay_us; ++ vq->next_event_time = spdk_get_ticks() + delay_us * spdk_get_ticks_hz() / (1000000ULL); ++} ++ ++int ++vfu_virito_dev_process_split_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ struct vfu_virtio_req *req; ++ uint16_t reqs_idx[VIRTIO_DEV_VRING_MAX_REQS]; ++ uint16_t reqs_cnt, i; ++ int ret; ++ ++ reqs_cnt = virtio_dev_split_get_avail_reqs(dev, vq, reqs_idx, VIRTIO_DEV_VRING_MAX_REQS); ++ if (!reqs_cnt) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: get %u descriptors\n", dev->name, reqs_cnt); ++ ++ for (i = 0; i < reqs_cnt; i++) { ++ req = vfu_virtio_dev_get_req(virtio_endpoint, vq); ++ if (spdk_unlikely(!req)) { ++ SPDK_ERRLOG("Error to get request\n"); ++ /* TODO: address the error case */ ++ return -EIO; ++ } ++ ++ req->req_idx = reqs_idx[i]; ++ ret = virtio_dev_split_iovs_setup(dev, vq, req->req_idx, req); ++ if (spdk_unlikely(ret)) { ++ /* let the device to response this error */ ++ SPDK_ERRLOG("Split vring setup failed with index %u\n", i); ++ } ++ ++ assert(virtio_endpoint->virtio_ops.exec_request); ++ virtio_endpoint->io_outstanding++; ++ virtio_endpoint->virtio_ops.exec_request(virtio_endpoint, vq, req); ++ } ++ ++ return i; ++} ++ ++struct vfu_virtio_req * ++virito_dev_split_ring_get_next_avail_req(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; 
++ struct vfu_virtio_req *req; ++ uint16_t reqs_idx[VIRTIO_DEV_VRING_MAX_REQS]; ++ uint16_t reqs_cnt; ++ int ret; ++ ++ reqs_cnt = virtio_dev_split_get_avail_reqs(dev, vq, reqs_idx, 1); ++ if (!reqs_cnt) { ++ return NULL; ++ } ++ assert(reqs_cnt == 1); ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: get 1 descriptors\n", dev->name); ++ ++ req = vfu_virtio_dev_get_req(virtio_endpoint, vq); ++ if (!req) { ++ SPDK_ERRLOG("Error to get request\n"); ++ return NULL; ++ } ++ ++ req->req_idx = reqs_idx[0]; ++ ret = virtio_dev_split_iovs_setup(dev, vq, req->req_idx, req); ++ if (ret) { ++ SPDK_ERRLOG("Split vring setup failed\n"); ++ vfu_virtio_dev_put_req(req); ++ return NULL; ++ } ++ ++ return req; ++} ++ ++static inline void * ++virtio_vring_packed_desc_to_iov(struct vfu_virtio_dev *dev, struct vring_packed_desc *desc, ++ dma_sg_t *sg, struct iovec *iov) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ ++ return spdk_vfu_map_one(virtio_endpoint->endpoint, desc->addr, desc->len, ++ sg, iov, PROT_READ | PROT_WRITE); ++} ++ ++static int ++virtio_dev_packed_iovs_setup(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq, ++ uint16_t last_avail_idx, ++ struct vring_packed_desc *current_desc, struct vfu_virtio_req *req) ++{ ++ struct vring_packed_desc *desc, *desc_table = NULL; ++ uint16_t new_idx, num_descs, desc_table_size = 0; ++ uint32_t len = 0; ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: last avail idx %u, req %p\n", dev->name, last_avail_idx, req); ++ ++ desc = NULL; ++ num_descs = 1; ++ if (virtio_vring_packed_desc_is_indirect(current_desc)) { ++ req->buffer_id = current_desc->id; ++ desc_table = virtio_vring_packed_desc_to_iov(dev, current_desc, req->indirect_sg, ++ req->indirect_iov); ++ if (spdk_unlikely(desc_table == NULL)) { ++ SPDK_ERRLOG("Map Indirect Desc to IOV failed\n"); ++ return -EINVAL; ++ } ++ desc_table_size = current_desc->len / sizeof(struct vring_packed_desc); ++ desc = desc_table; ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: indirect desc %p, desc size %u, req %p\n", ++ dev->name, desc_table, desc_table_size, req); ++ } else { ++ desc = current_desc; ++ } ++ ++ assert(req->iovcnt == 0); ++ /* Map descs to IOVs */ ++ new_idx = last_avail_idx; ++ while (1) { ++ assert(desc != NULL); ++ if (spdk_unlikely(req->iovcnt == VIRTIO_DEV_MAX_IOVS)) { ++ SPDK_ERRLOG("Max IOVs in request reached (iovcnt = %d).\n", req->iovcnt); ++ return -EINVAL; ++ } ++ ++ if (spdk_unlikely(!virtio_vring_packed_desc_to_iov(dev, desc, virtio_req_to_sg_t(req, req->iovcnt), ++ &req->iovs[req->iovcnt]))) { ++ SPDK_ERRLOG("Map Desc to IOV failed (iovcnt = %d).\n", req->iovcnt); ++ return -EINVAL; ++ } ++ req->desc_writeable[req->iovcnt] = false; ++ if (virtio_vring_packed_desc_is_wr(desc)) { ++ req->desc_writeable[req->iovcnt] = true; ++ } ++ ++ req->iovcnt++; ++ len += desc->len; ++ ++ /* get next desc */ ++ if (desc_table) { ++ if (req->iovcnt < desc_table_size) { ++ desc = &desc_table[req->iovcnt]; ++ } else { ++ desc = NULL; ++ } ++ } else { ++ if ((desc->flags & VRING_DESC_F_NEXT) == 0) { ++ req->buffer_id = desc->id; ++ desc = NULL; ++ } else { ++ new_idx = (new_idx + 1) % vq->qsize; ++ desc = &vq->desc.desc_packed[new_idx]; ++ num_descs++; ++ req->buffer_id = desc->id; ++ } ++ } ++ ++ if (desc == NULL) { ++ break; ++ } ++ } ++ ++ req->num_descs = num_descs; ++ vq->last_avail_idx = (new_idx + 1) % vq->qsize; ++ if (vq->last_avail_idx < last_avail_idx) { ++ vq->packed.avail_phase = !vq->packed.avail_phase; ++ } ++ ++ req->payload_size = len; ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: req %p, 
iovcnt %u, num_descs %u\n", ++ dev->name, req, req->iovcnt, num_descs); ++ return 0; ++} ++ ++int ++vfu_virito_dev_process_packed_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ struct vring_packed_desc *desc; ++ int ret; ++ struct vfu_virtio_req *req; ++ uint16_t i, max_reqs; ++ ++ max_reqs = VIRTIO_DEV_VRING_MAX_REQS; ++ for (i = 0; i < max_reqs; i++) { ++ desc = &vq->desc.desc_packed[vq->last_avail_idx]; ++ if (!virtio_vring_packed_is_avail(desc, vq->packed.avail_phase)) { ++ return i; ++ } ++ ++ req = vfu_virtio_dev_get_req(virtio_endpoint, vq); ++ if (spdk_unlikely(!req)) { ++ SPDK_ERRLOG("Error to get request\n"); ++ /* TODO: address the error case */ ++ assert(false); ++ return -EIO; ++ } ++ ++ ret = virtio_dev_packed_iovs_setup(dev, vq, vq->last_avail_idx, desc, req); ++ if (spdk_unlikely(ret)) { ++ /* let the device to response the error */ ++ SPDK_ERRLOG("virtio_dev_packed_iovs_setup failed\n"); ++ } ++ ++ assert(virtio_endpoint->virtio_ops.exec_request); ++ virtio_endpoint->io_outstanding++; ++ virtio_endpoint->virtio_ops.exec_request(virtio_endpoint, vq, req); ++ } ++ ++ return i; ++} ++ ++struct vfu_virtio_req * ++virito_dev_packed_ring_get_next_avail_req(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ struct vring_packed_desc *desc; ++ int ret; ++ struct vfu_virtio_req *req; ++ ++ desc = &vq->desc.desc_packed[vq->last_avail_idx]; ++ if (!virtio_vring_packed_is_avail(desc, vq->packed.avail_phase)) { ++ return NULL; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_io, "%s: get 1 descriptors\n", dev->name); ++ ++ req = vfu_virtio_dev_get_req(virtio_endpoint, vq); ++ if (!req) { ++ SPDK_ERRLOG("Error to get request\n"); ++ return NULL; ++ } ++ ++ ret = virtio_dev_packed_iovs_setup(dev, vq, vq->last_avail_idx, desc, req); ++ if (ret) { ++ SPDK_ERRLOG("virtio_dev_packed_iovs_setup failed\n"); ++ vfu_virtio_dev_put_req(req); ++ return NULL; ++ } ++ ++ return req; ++} ++ ++static int ++virtio_vfu_pci_common_cfg(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ size_t count, loff_t pos, bool is_write) ++{ ++ struct vfu_virtio_dev *dev = virtio_endpoint->dev; ++ uint32_t offset, value = 0; ++ int ret; ++ ++ assert(count <= 4); ++ offset = pos - VIRTIO_PCI_COMMON_CFG_OFFSET; ++ ++ if (is_write) { ++ memcpy(&value, buf, count); ++ switch (offset) { ++ case VIRTIO_PCI_COMMON_DFSELECT: ++ dev->cfg.host_feature_select = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_DFSELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_GFSELECT: ++ dev->cfg.guest_feature_select = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GFSELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_GF: ++ assert(dev->cfg.guest_feature_select <= 1); ++ if (dev->cfg.guest_feature_select) { ++ dev->cfg.guest_feat_hi = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GF_HI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } else { ++ dev->cfg.guest_feat_lo = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_GF_LO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } ++ ++ ret = virtio_dev_set_features(dev, ++ (((uint64_t)dev->cfg.guest_feat_hi << 32) | dev->cfg.guest_feat_lo)); ++ if (ret) { ++ return ret; ++ } ++ break; 
++ case VIRTIO_PCI_COMMON_MSIX: ++ dev->cfg.msix_config = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_MSIX with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_STATUS: ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_STATUS with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ ret = virtio_dev_set_status(dev, value); ++ if (ret) { ++ return ret; ++ } ++ break; ++ case VIRTIO_PCI_COMMON_Q_SELECT: ++ if (value < VIRTIO_DEV_MAX_VQS) { ++ dev->cfg.queue_select = value; ++ } ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_SELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_SIZE: ++ dev->vqs[dev->cfg.queue_select].qsize = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_SIZE with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_MSIX: ++ dev->vqs[dev->cfg.queue_select].vector = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_MSIX with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_ENABLE: ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE PCI_COMMON_Q_ENABLE with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ if (value == 1) { ++ ret = virtio_dev_enable_vq(dev, dev->cfg.queue_select); ++ if (ret) { ++ return ret; ++ } ++ } else { ++ ret = virtio_dev_disable_vq(dev, dev->cfg.queue_select); ++ if (ret) { ++ return ret; ++ } ++ } ++ break; ++ case VIRTIO_PCI_COMMON_Q_DESCLO: ++ dev->vqs[dev->cfg.queue_select].desc_lo = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_DESCLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_DESCHI: ++ dev->vqs[dev->cfg.queue_select].desc_hi = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_DESCHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_AVAILLO: ++ dev->vqs[dev->cfg.queue_select].avail_lo = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_AVAILLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_AVAILHI: ++ dev->vqs[dev->cfg.queue_select].avail_hi = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_AVAILHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_USEDLO: ++ dev->vqs[dev->cfg.queue_select].used_lo = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_USEDLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_USEDHI: ++ dev->vqs[dev->cfg.queue_select].used_hi = value; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: WRITE queue %u PCI_COMMON_Q_USEDHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ ++ default: ++ SPDK_ERRLOG("%s: WRITE UNSUPPORTED offset 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), offset); ++ errno = EIO; ++ return -1; ++ } ++ } else { ++ switch (offset) { ++ case VIRTIO_PCI_COMMON_DFSELECT: ++ value = dev->cfg.host_feature_select; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ 
PCI_COMMON_DFSELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_DF: ++ assert(dev->cfg.host_feature_select <= 1); ++ if (dev->cfg.host_feature_select) { ++ value = dev->host_features >> 32; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_DF_HI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } else { ++ value = dev->host_features; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_DF_LO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } ++ break; ++ case VIRTIO_PCI_COMMON_GFSELECT: ++ value = dev->cfg.guest_feature_select; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GFSELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_GF: ++ assert(dev->cfg.guest_feature_select <= 1); ++ if (dev->cfg.guest_feature_select) { ++ value = dev->cfg.guest_feat_hi; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GF_HI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } else { ++ value = dev->cfg.guest_feat_lo; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_GF_LO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ } ++ break; ++ case VIRTIO_PCI_COMMON_MSIX: ++ value = dev->cfg.msix_config; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_MSIX with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_NUMQ: ++ value = dev->num_queues; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_NUMQ with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_STATUS: ++ value = dev->cfg.device_status; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_STATUS with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_CFGGENERATION: ++ value = dev->cfg.config_generation; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_CFGGENERATION with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_NOFF: ++ value = dev->cfg.queue_select; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_Q_NOFF with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_SELECT: ++ value = dev->cfg.queue_select; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ PCI_COMMON_Q_SELECT with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_SIZE: ++ value = dev->vqs[dev->cfg.queue_select].qsize; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_SIZE with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_MSIX: ++ value = dev->vqs[dev->cfg.queue_select].vector; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_MSIX with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_ENABLE: ++ value = dev->vqs[dev->cfg.queue_select].enabled; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_ENABLE with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_DESCLO: ++ value = dev->vqs[dev->cfg.queue_select].desc_lo; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u 
PCI_COMMON_Q_DESCLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_DESCHI: ++ value = dev->vqs[dev->cfg.queue_select].desc_hi; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_DESCHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_AVAILLO: ++ value = dev->vqs[dev->cfg.queue_select].avail_lo; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_AVAILLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_AVAILHI: ++ value = dev->vqs[dev->cfg.queue_select].avail_hi; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_AVAILHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_USEDLO: ++ value = dev->vqs[dev->cfg.queue_select].used_lo; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_USEDLO with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ case VIRTIO_PCI_COMMON_Q_USEDHI: ++ value = dev->vqs[dev->cfg.queue_select].used_hi; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: READ queue %u PCI_COMMON_Q_USEDHI with 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), dev->cfg.queue_select, value); ++ break; ++ default: ++ SPDK_ERRLOG("%s: READ UNSUPPORTED offset 0x%x\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), offset); ++ errno = EIO; ++ return -1; ++ } ++ memcpy(buf, &value, count); ++ } ++ ++ return count; ++} ++ ++static int ++virtio_vfu_device_specific_cfg(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ size_t count, loff_t pos, bool is_write) ++{ ++ loff_t offset; ++ int ret = -1; ++ ++ assert(count <= 8); ++ offset = pos - VIRTIO_PCI_SPECIFIC_CFG_OFFSET; ++ if (!is_write) { ++ if (virtio_endpoint->virtio_ops.get_config) { ++ ret = virtio_endpoint->virtio_ops.get_config(virtio_endpoint, buf, offset, count); ++ } ++ } else { ++ if (virtio_endpoint->virtio_ops.set_config) { ++ ret = virtio_endpoint->virtio_ops.set_config(virtio_endpoint, buf, offset, count); ++ } ++ } ++ ++ if (ret < 0) { ++ return ret; ++ } ++ ++ return count; ++} ++ ++static int ++virtio_vfu_pci_isr(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ size_t count, bool is_write) ++{ ++ uint8_t *isr; ++ ++ if (count != 1) { ++ SPDK_ERRLOG("ISR register is 1 byte\n"); ++ errno = EIO; ++ return -1; ++ } ++ ++ isr = buf; ++ ++ if (!is_write) { ++ SPDK_DEBUGLOG(vfu_virtio, "READ PCI ISR\n"); ++ /* Read-Acknowledge Clear */ ++ *isr = virtio_endpoint->dev->cfg.isr; ++ virtio_endpoint->dev->cfg.isr = 0; ++ } else { ++ SPDK_ERRLOG("ISR register is RO\n"); ++ errno = EIO; ++ return -1; ++ } ++ ++ return count; ++} ++ ++static ssize_t ++virtio_vfu_access_bar4(vfu_ctx_t *vfu_ctx, char *buf, size_t count, ++ loff_t pos, ++ bool is_write) ++{ ++ struct spdk_vfu_endpoint *endpoint = vfu_get_private(vfu_ctx); ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ uint64_t start, end; ++ ++ start = pos; ++ end = start + count; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: %s bar4 0x%"PRIX64"-0x%"PRIX64", len = %lu\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ is_write ? 
"write" : "read", start, end - 1, count); ++ ++ if (end < VIRTIO_PCI_COMMON_CFG_OFFSET + VIRTIO_PCI_COMMON_CFG_LENGTH) { ++ /* virtio PCI common configuration */ ++ return virtio_vfu_pci_common_cfg(virtio_endpoint, buf, count, pos, is_write); ++ } else if (start >= VIRTIO_PCI_ISR_ACCESS_OFFSET && ++ end < VIRTIO_PCI_ISR_ACCESS_OFFSET + VIRTIO_PCI_ISR_ACCESS_LENGTH) { ++ /* ISR access */ ++ return virtio_vfu_pci_isr(virtio_endpoint, buf, count, is_write); ++ } else if (start >= VIRTIO_PCI_SPECIFIC_CFG_OFFSET && ++ end < VIRTIO_PCI_SPECIFIC_CFG_OFFSET + VIRTIO_PCI_SPECIFIC_CFG_LENGTH) { ++ /* Device specific configuration */ ++ return virtio_vfu_device_specific_cfg(virtio_endpoint, buf, count, pos, is_write); ++ } else if (start >= VIRTIO_PCI_NOTIFICATIONS_OFFSET && ++ end < VIRTIO_PCI_NOTIFICATIONS_OFFSET + VIRTIO_PCI_NOTIFICATIONS_LENGTH) { ++ /* Notifications */ ++ /* Sparse mmap region by default, there are no MMIO R/W messages */ ++ assert(false); ++ return count; ++ } else { ++ assert(false); ++ } ++ ++ return 0; ++} ++ ++int ++vfu_virtio_post_memory_add(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct vfu_virtio_dev *dev = virtio_endpoint->dev; ++ uint32_t i; ++ ++ if (!dev) { ++ return 0; ++ } ++ ++ for (i = 0; i < dev->num_queues; i++) { ++ /* Try to remap VQs if necessary */ ++ virtio_dev_map_vq(dev, &dev->vqs[i]); ++ } ++ ++ return 0; ++} ++ ++int ++vfu_virtio_pre_memory_remove(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ ++ if (virtio_endpoint->dev != NULL) { ++ vfu_virtio_dev_unmap_vqs(virtio_endpoint->dev, map_start, map_end); ++ } ++ ++ return 0; ++} ++ ++int ++vfu_virtio_pci_reset_cb(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ ++ if (virtio_endpoint->dev) { ++ vfu_virtio_dev_stop(virtio_endpoint->dev); ++ vfu_virtio_dev_reset(virtio_endpoint->dev); ++ } ++ ++ return 0; ++} ++ ++static ssize_t ++access_pci_config(vfu_ctx_t *vfu_ctx, char *buf, size_t count, loff_t offset, ++ bool is_write) ++{ ++ struct spdk_vfu_endpoint *endpoint = vfu_get_private(vfu_ctx); ++ void *pci_config = spdk_vfu_endpoint_get_pci_config(endpoint); ++ ++ SPDK_DEBUGLOG(vfu_virtio, ++ "%s: PCI_CFG %s %#lx-%#lx\n", ++ spdk_vfu_get_endpoint_id(endpoint), is_write ? 
"write" : "read", ++ offset, offset + count); ++ ++ if (is_write) { ++ SPDK_ERRLOG("write %#lx-%#lx not supported\n", ++ offset, offset + count); ++ errno = EINVAL; ++ return -1; ++ } ++ ++ if (offset + count > 0x1000) { ++ SPDK_ERRLOG("access past end of extended PCI configuration space, want=%ld+%ld, max=%d\n", ++ offset, count, 0x1000); ++ errno = ERANGE; ++ return -1; ++ } ++ ++ memcpy(buf, ((unsigned char *)pci_config) + offset, count); ++ return count; ++} ++ ++static int ++vfu_virtio_dev_start(struct vfu_virtio_dev *dev) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ int ret = 0; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "start %s\n", dev->name); ++ ++ if (virtio_dev_is_started(dev)) { ++ SPDK_ERRLOG("Device %s is already started\n", dev->name); ++ return -EFAULT; ++ } ++ ++ if (virtio_endpoint->virtio_ops.start_device) { ++ virtio_endpoint->io_outstanding = 0; ++ ret = virtio_endpoint->virtio_ops.start_device(virtio_endpoint); ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s is started with ret %d\n", dev->name, ret); ++ ++ return ret; ++} ++ ++static int ++vfu_virtio_dev_stop(struct vfu_virtio_dev *dev) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = dev->virtio_endpoint; ++ int ret = 0; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "stop %s\n", dev->name); ++ ++ if (!virtio_dev_is_started(dev)) { ++ SPDK_DEBUGLOG(vfu_virtio, "%s isn't started\n", dev->name); ++ return 0; ++ } ++ ++ if (virtio_endpoint->virtio_ops.stop_device) { ++ ret = virtio_endpoint->virtio_ops.stop_device(virtio_endpoint); ++ assert(ret == 0); ++ } ++ ++ /* Unmap all VQs */ ++ vfu_virtio_dev_unmap_vqs(dev, NULL, NULL); ++ ++ return ret; ++} ++ ++int ++vfu_virtio_detach_device(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct vfu_virtio_dev *dev = virtio_endpoint->dev; ++ ++ if (virtio_endpoint->dev == NULL) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio, "detach device %s\n", dev->name); ++ ++ vfu_virtio_dev_stop(dev); ++ vfu_virtio_dev_free_reqs(virtio_endpoint, dev); ++ virtio_endpoint->dev = NULL; ++ free(dev); ++ ++ return 0; ++} ++ ++int ++vfu_virtio_attach_device(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ uint64_t supported_features = 0; ++ struct vfu_virtio_dev *dev; ++ struct vfu_virtio_vq *vq; ++ struct vfu_virtio_req *req; ++ uint32_t i, j; ++ int ret = 0; ++ ++ dev = calloc(1, sizeof(*dev) + virtio_endpoint->num_queues * 3 * dma_sg_size()); ++ if (dev == NULL) { ++ return -ENOMEM; ++ } ++ ++ dev->num_queues = virtio_endpoint->num_queues; ++ for (i = 0; i < dev->num_queues; i++) { ++ vq = &dev->vqs[i]; ++ vq->id = i; ++ vq->qsize = virtio_endpoint->qsize; ++ vq->avail.sg = (dma_sg_t *)(dev->sg + i * dma_sg_size() * 3); ++ vq->used.sg = (dma_sg_t *)((uint8_t *)vq->avail.sg + dma_sg_size()); ++ vq->desc.sg = (dma_sg_t *)((uint8_t *)vq->used.sg + dma_sg_size()); ++ ++ STAILQ_INIT(&vq->free_reqs); ++ for (j = 0; j <= vq->qsize; j++) { ++ req = vfu_virtio_vq_alloc_req(virtio_endpoint, vq); ++ if (!req) { ++ SPDK_ERRLOG("Error to allocate req\n"); ++ ret = -ENOMEM; ++ goto out; ++ } ++ req->indirect_iov = &req->iovs[VIRTIO_DEV_MAX_IOVS]; ++ req->indirect_sg = virtio_req_to_sg_t(req, VIRTIO_DEV_MAX_IOVS); ++ req->dev = dev; ++ req->vq = vq; ++ STAILQ_INSERT_TAIL(&vq->free_reqs, req, link); ++ } ++ } ++ ++ if (virtio_endpoint->virtio_ops.get_device_features) { ++ supported_features = 
virtio_endpoint->virtio_ops.get_device_features(virtio_endpoint); ++ } ++ dev->host_features = supported_features; ++ ++ snprintf(dev->name, SPDK_VFU_MAX_NAME_LEN, "%s", ++ spdk_vfu_get_endpoint_name(virtio_endpoint->endpoint)); ++ virtio_endpoint->dev = dev; ++ dev->virtio_endpoint = virtio_endpoint; ++ virtio_endpoint->thread = spdk_get_thread(); ++ return 0; ++ ++out: ++ vfu_virtio_dev_free_reqs(virtio_endpoint, dev); ++ return ret; ++} ++ ++int ++vfu_virtio_endpoint_setup(struct vfu_virtio_endpoint *virtio_endpoint, ++ struct spdk_vfu_endpoint *endpoint, ++ char *basename, const char *endpoint_name, ++ struct vfu_virtio_ops *ops) ++{ ++ char path[PATH_MAX] = ""; ++ int ret; ++ ++ if (!ops) { ++ return -EINVAL; ++ } ++ ++ ret = snprintf(path, PATH_MAX, "%s%s_bar4", basename, endpoint_name); ++ if (ret < 0 || ret >= PATH_MAX) { ++ SPDK_ERRLOG("%s: error to get socket path: %s.\n", basename, spdk_strerror(errno)); ++ return -EINVAL; ++ } ++ ++ ret = open(path, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); ++ if (ret == -1) { ++ SPDK_ERRLOG("%s: failed to open device memory at %s.\n", ++ path, spdk_strerror(errno)); ++ return ret; ++ } ++ unlink(path); ++ ++ virtio_endpoint->devmem_fd = ret; ++ ret = ftruncate(virtio_endpoint->devmem_fd, VIRTIO_PCI_BAR4_LENGTH); ++ if (ret != 0) { ++ SPDK_ERRLOG("%s: error to ftruncate file %s.\n", path, ++ spdk_strerror(errno)); ++ close(virtio_endpoint->devmem_fd); ++ return ret; ++ } ++ ++ virtio_endpoint->doorbells = mmap(NULL, VIRTIO_PCI_NOTIFICATIONS_LENGTH, PROT_READ | PROT_WRITE, ++ MAP_SHARED, ++ virtio_endpoint->devmem_fd, VIRTIO_PCI_NOTIFICATIONS_OFFSET); ++ if (virtio_endpoint->doorbells == MAP_FAILED) { ++ SPDK_ERRLOG("%s: error to mmap file %s.\n", path, spdk_strerror(errno)); ++ close(virtio_endpoint->devmem_fd); ++ return -EFAULT; ++ } ++ virtio_endpoint->endpoint = endpoint; ++ virtio_endpoint->virtio_ops = *ops; ++ virtio_endpoint->num_queues = VIRTIO_DEV_MAX_VQS; ++ virtio_endpoint->qsize = VIRTIO_VQ_DEFAULT_SIZE; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "mmap file %s, devmem_fd %d\n", path, virtio_endpoint->devmem_fd); ++ return 0; ++} ++ ++int ++vfu_virtio_endpoint_destruct(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ if (virtio_endpoint->doorbells) { ++ munmap((void *)virtio_endpoint->doorbells, VIRTIO_PCI_NOTIFICATIONS_LENGTH); ++ } ++ ++ if (virtio_endpoint->devmem_fd) { ++ close(virtio_endpoint->devmem_fd); ++ } ++ ++ return 0; ++} ++ ++static int ++vfu_virtio_quiesce_poll(void *ctx) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = ctx; ++ vfu_ctx_t *vfu_ctx = spdk_vfu_get_vfu_ctx(virtio_endpoint->endpoint); ++ ++ if (virtio_endpoint->io_outstanding) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ spdk_poller_unregister(&virtio_endpoint->quiesce_poller); ++ virtio_endpoint->quiesce_in_progress = false; ++ vfu_device_quiesced(vfu_ctx, 0); ++ ++ return SPDK_POLLER_BUSY; ++} ++ ++int ++vfu_virtio_quiesce_cb(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ ++ if (virtio_endpoint->quiesce_in_progress) { ++ return -EBUSY; ++ } ++ ++ if (!virtio_endpoint->io_outstanding) { ++ return 0; ++ } ++ ++ virtio_endpoint->quiesce_in_progress = true; ++ virtio_endpoint->quiesce_poller = SPDK_POLLER_REGISTER(vfu_virtio_quiesce_poll, virtio_endpoint, ++ 10); ++ ++ return -EBUSY; ++} ++ ++static struct spdk_vfu_pci_device vfu_virtio_device_info = { ++ .id = { ++ .vid = SPDK_PCI_VID_VIRTIO, ++ /* Realize when calling get device information */ ++ .did = 0x0, ++ .ssvid = 
SPDK_PCI_VID_VIRTIO, ++ .ssid = 0x0, ++ }, ++ ++ .class = { ++ /* 0x01, mass storage controller */ ++ .bcc = 0x01, ++ /* 0x00, SCSI controller */ ++ .scc = 0x00, ++ /* 0x00, SCSI controller - vendor specific interface */ ++ .pi = 0x00, ++ }, ++ ++ .pmcap = { ++ .hdr.id = PCI_CAP_ID_PM, ++ .pmcs.nsfrst = 0x1, ++ }, ++ ++ .pxcap = { ++ .hdr.id = PCI_CAP_ID_EXP, ++ .pxcaps.ver = 0x2, ++ .pxdcap = {.rer = 0x1, .flrc = 0x1}, ++ .pxdcap2.ctds = 0x1, ++ }, ++ ++ .msixcap = { ++ .hdr.id = PCI_CAP_ID_MSIX, ++ .mxc.ts = VIRTIO_DEV_MAX_VQS - 1, ++ .mtab = {.tbir = 0x1, .to = 0x0}, ++ .mpba = {.pbir = 0x2, .pbao = 0x0}, ++ }, ++ ++ .nr_vendor_caps = 4, ++ ++ .intr_ipin = 0x1, ++ .nr_int_irqs = 0x1, ++ .nr_msix_irqs = VIRTIO_DEV_MAX_VQS, ++ ++ .regions = { ++ /* BAR0 */ ++ {0}, ++ /* BAR1 */ ++ { ++ .access_cb = NULL, ++ .offset = 0, ++ .fd = -1, ++ .len = 0x1000, ++ .flags = VFU_REGION_FLAG_RW, ++ .nr_sparse_mmaps = 0, ++ }, ++ /* BAR2 */ ++ { ++ .access_cb = NULL, ++ .offset = 0, ++ .fd = -1, ++ .len = 0x1000, ++ .flags = VFU_REGION_FLAG_RW, ++ .nr_sparse_mmaps = 0, ++ }, ++ /* BAR3 */ ++ {0}, ++ /* BAR4 */ ++ { ++ .access_cb = virtio_vfu_access_bar4, ++ .offset = 0, ++ .fd = -1, ++ .len = VIRTIO_PCI_BAR4_LENGTH, ++ .flags = VFU_REGION_FLAG_RW | VFU_REGION_FLAG_MEM, ++ .nr_sparse_mmaps = 1, ++ .mmaps = { ++ { ++ .offset = VIRTIO_PCI_NOTIFICATIONS_OFFSET, ++ .len = VIRTIO_PCI_NOTIFICATIONS_LENGTH, ++ }, ++ }, ++ }, ++ /* BAR5 */ ++ {0}, ++ /* BAR6 */ ++ {0}, ++ /* ROM */ ++ {0}, ++ /* PCI Config */ ++ { ++ .access_cb = access_pci_config, ++ .offset = 0, ++ .fd = -1, ++ .len = 0x1000, ++ .flags = VFU_REGION_FLAG_RW, ++ .nr_sparse_mmaps = 0, ++ }, ++ }, ++}; ++ ++void ++vfu_virtio_get_device_info(struct vfu_virtio_endpoint *virtio_endpoint, ++ struct spdk_vfu_pci_device *device_info) ++{ ++ memcpy(device_info, &vfu_virtio_device_info, sizeof(*device_info)); ++ ++ /* BAR4 Region FD */ ++ device_info->regions[VFU_PCI_DEV_BAR4_REGION_IDX].fd = virtio_endpoint->devmem_fd; ++ SPDK_DEBUGLOG(vfu_virtio, "%s: get device information, fd %d\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ virtio_endpoint->devmem_fd); ++} ++ ++static struct virtio_pci_cap common_cap = { ++ .cap_vndr = PCI_CAP_ID_VNDR, ++ .cap_len = sizeof(common_cap), ++ .cfg_type = VIRTIO_PCI_CAP_COMMON_CFG, ++ .bar = 4, ++ .offset = VIRTIO_PCI_COMMON_CFG_OFFSET, ++ .length = VIRTIO_PCI_COMMON_CFG_LENGTH, ++}; ++ ++static struct virtio_pci_cap isr_cap = { ++ .cap_vndr = PCI_CAP_ID_VNDR, ++ .cap_len = sizeof(isr_cap), ++ .cfg_type = VIRTIO_PCI_CAP_ISR_CFG, ++ .bar = 4, ++ .offset = VIRTIO_PCI_ISR_ACCESS_OFFSET, ++ .length = VIRTIO_PCI_ISR_ACCESS_LENGTH, ++}; ++ ++static struct virtio_pci_cap dev_cap = { ++ .cap_vndr = PCI_CAP_ID_VNDR, ++ .cap_len = sizeof(dev_cap), ++ .cfg_type = VIRTIO_PCI_CAP_DEVICE_CFG, ++ .bar = 4, ++ .offset = VIRTIO_PCI_SPECIFIC_CFG_OFFSET, ++ .length = VIRTIO_PCI_SPECIFIC_CFG_LENGTH, ++}; ++ ++static struct virtio_pci_notify_cap notify_cap = { ++ .cap = { ++ .cap_vndr = PCI_CAP_ID_VNDR, ++ .cap_len = sizeof(notify_cap), ++ .cfg_type = VIRTIO_PCI_CAP_NOTIFY_CFG, ++ .bar = 4, ++ .offset = VIRTIO_PCI_NOTIFICATIONS_OFFSET, ++ .length = VIRTIO_PCI_NOTIFICATIONS_LENGTH, ++ }, ++ .notify_off_multiplier = 4, ++}; ++ ++uint16_t ++vfu_virtio_get_vendor_capability(struct spdk_vfu_endpoint *endpoint, char *buf, ++ uint16_t buf_len, ++ uint16_t idx) ++{ ++ uint16_t len; ++ ++ SPDK_DEBUGLOG(vfu_virtio, "%s: get vendor capability, idx %u\n", ++ spdk_vfu_get_endpoint_id(endpoint), idx); ++ ++ switch (idx) { ++ case 0: ++ 
assert(buf_len > sizeof(common_cap)); ++ memcpy(buf, &common_cap, sizeof(common_cap)); ++ len = sizeof(common_cap); ++ break; ++ case 1: ++ assert(buf_len > sizeof(isr_cap)); ++ memcpy(buf, &isr_cap, sizeof(isr_cap)); ++ len = sizeof(isr_cap); ++ break; ++ case 2: ++ assert(buf_len > sizeof(dev_cap)); ++ memcpy(buf, &dev_cap, sizeof(dev_cap)); ++ len = sizeof(dev_cap); ++ break; ++ case 3: ++ assert(buf_len > sizeof(notify_cap)); ++ memcpy(buf, ¬ify_cap, sizeof(notify_cap)); ++ len = sizeof(notify_cap); ++ break; ++ default: ++ return 0; ++ } ++ ++ return len; ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vfu_virtio) ++SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_io) +diff --git a/module/vfu_device/vfu_virtio_blk.c b/module/vfu_device/vfu_virtio_blk.c +index d8b52da..46a7144 100644 +--- a/module/vfu_device/vfu_virtio_blk.c ++++ b/module/vfu_device/vfu_virtio_blk.c +@@ -1,615 +1,615 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. +- */ +- +-/* +- * virtio-blk over vfio-user transport +- */ +-#include +- +-#include "spdk/env.h" +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +-#include "spdk/stdinc.h" +-#include "spdk/assert.h" +-#include "spdk/barrier.h" +-#include "spdk/thread.h" +-#include "spdk/memory.h" +-#include "spdk/util.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/likely.h" +-#include "spdk/pci_ids.h" +- +-#include "vfu_virtio_internal.h" +- +-#define VIRTIO_BLK_SUPPORTED_FEATURES ((1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ +- (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ +- (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ +- (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ +- (1ULL << VIRTIO_BLK_F_MQ)) +- +-struct virtio_blk_endpoint { +- struct vfu_virtio_endpoint virtio; +- +- /* virtio_blk specific configurations */ +- struct spdk_thread *init_thread; +- struct spdk_bdev *bdev; +- struct spdk_bdev_desc *bdev_desc; +- struct spdk_io_channel *io_channel; +- struct virtio_blk_config blk_cfg; +- +- /* virtio_blk ring process poller */ +- struct spdk_poller *ring_poller; +-}; +- +-struct virtio_blk_req { +- volatile uint8_t *status; +- struct virtio_blk_endpoint *endpoint; +- /* KEEP req at last */ +- struct vfu_virtio_req req; +-}; +- +-static inline struct virtio_blk_endpoint * +-to_blk_endpoint(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- return SPDK_CONTAINEROF(virtio_endpoint, struct virtio_blk_endpoint, virtio); +-} +- +-static inline struct virtio_blk_req * +-to_blk_request(struct vfu_virtio_req *request) +-{ +- return SPDK_CONTAINEROF(request, struct virtio_blk_req, req); +-} +- +-static int +-vfu_virtio_blk_vring_poll(void *ctx) +-{ +- struct virtio_blk_endpoint *blk_endpoint = ctx; +- struct vfu_virtio_dev *dev = blk_endpoint->virtio.dev; +- struct vfu_virtio_vq *vq; +- uint32_t i, count = 0; +- +- if (spdk_unlikely(!virtio_dev_is_started(dev))) { +- return SPDK_POLLER_IDLE; +- } +- +- if (spdk_unlikely(blk_endpoint->virtio.quiesce_in_progress)) { +- return SPDK_POLLER_IDLE; +- } +- +- for (i = 0; i < dev->num_queues; i++) { +- vq = &dev->vqs[i]; +- if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { +- continue; +- } +- +- vfu_virtio_vq_flush_irq(dev, vq); +- +- if (vq->packed.packed_ring) { +- /* packed vring */ +- count += vfu_virito_dev_process_packed_ring(dev, vq); +- } else { +- /* split vring */ +- count += vfu_virito_dev_process_split_ring(dev, vq); +- } +- } +- +- return count ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +-} +- +-static int +-virtio_blk_start(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- +- if (blk_endpoint->ring_poller) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "starting %s\n", virtio_endpoint->dev->name); +- blk_endpoint->io_channel = spdk_bdev_get_io_channel(blk_endpoint->bdev_desc); +- blk_endpoint->ring_poller = SPDK_POLLER_REGISTER(vfu_virtio_blk_vring_poll, blk_endpoint, 0); +- +- return 0; +-} +- +-static void +-_virtio_blk_stop_msg(void *ctx) +-{ +- struct virtio_blk_endpoint *blk_endpoint = ctx; +- +- spdk_poller_unregister(&blk_endpoint->ring_poller); +- spdk_put_io_channel(blk_endpoint->io_channel); +- blk_endpoint->io_channel = NULL; +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "%s is stopped\n", +- spdk_vfu_get_endpoint_id(blk_endpoint->virtio.endpoint)); +-} +- +-static int +-virtio_blk_stop(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- +- if (!blk_endpoint->io_channel) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "%s stopping\n", spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint)); +- spdk_thread_send_msg(virtio_endpoint->thread, _virtio_blk_stop_msg, blk_endpoint); +- return 0; +-} +- +-static void +-virtio_blk_req_finish(struct virtio_blk_req *blk_req, uint8_t status) +-{ +- struct vfu_virtio_req *req = &blk_req->req; +- +- if (spdk_likely(blk_req->status)) { +- *blk_req->status = status; +- blk_req->status = NULL; +- } +- +- vfu_virtio_finish_req(req); +-} +- +-static void +-blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +-{ +- struct virtio_blk_req *blk_req = cb_arg; +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "IO done status %u\n", success); +- +- spdk_bdev_free_io(bdev_io); +- virtio_blk_req_finish(blk_req, success ? 
VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); +-} +- +-static int +-virtio_blk_process_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req) +-{ +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- struct virtio_blk_req *blk_req = to_blk_request(req); +- const struct virtio_blk_outhdr *hdr; +- struct virtio_blk_discard_write_zeroes *desc; +- struct iovec *iov; +- uint16_t iovcnt; +- uint64_t flush_bytes; +- uint32_t type; +- uint32_t payload_len; +- int ret; +- +- blk_req->endpoint = blk_endpoint; +- +- iov = &req->iovs[0]; +- if (spdk_unlikely(iov->iov_len != sizeof(*hdr))) { +- SPDK_ERRLOG("Invalid virtio_blk header length %lu\n", iov->iov_len); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -EINVAL; +- } +- hdr = iov->iov_base; +- +- iov = &req->iovs[req->iovcnt - 1]; +- if (spdk_unlikely(iov->iov_len != 1)) { +- SPDK_ERRLOG("Invalid virtio_blk response length %lu\n", iov->iov_len); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -EINVAL; +- } +- blk_req->status = iov->iov_base; +- +- payload_len = req->payload_size; +- payload_len -= sizeof(*hdr) + 1; +- iovcnt = req->iovcnt - 2; +- +- type = hdr->type; +- /* Legacy type isn't supported */ +- type &= ~VIRTIO_BLK_T_BARRIER; +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "%s: type %u, iovcnt %u, payload_len %u\n", +- spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), +- type, iovcnt, payload_len); +- +- if (spdk_unlikely(blk_endpoint->bdev_desc == NULL)) { +- SPDK_ERRLOG("Bdev has been removed\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return 0; +- } +- +- switch (type) { +- case VIRTIO_BLK_T_IN: +- case VIRTIO_BLK_T_OUT: +- if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { +- SPDK_ERRLOG("Invalid payload length %u\n", payload_len); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -EINVAL; +- } +- if (type == VIRTIO_BLK_T_IN) { +- req->used_len = payload_len + 1; +- ret = spdk_bdev_readv(blk_endpoint->bdev_desc, blk_endpoint->io_channel, +- &req->iovs[1], iovcnt, hdr->sector * 512, +- payload_len, blk_request_complete_cb, blk_req); +- } else { +- req->used_len = 1; +- ret = spdk_bdev_writev(blk_endpoint->bdev_desc, blk_endpoint->io_channel, +- &req->iovs[1], iovcnt, hdr->sector * 512, +- payload_len, blk_request_complete_cb, blk_req); +- } +- if (ret) { +- SPDK_ERRLOG("R/W error\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return ret; +- } +- break; +- case VIRTIO_BLK_T_DISCARD: +- desc = req->iovs[1].iov_base; +- if (payload_len != sizeof(*desc)) { +- SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return -EINVAL; +- } +- +- if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { +- SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -EINVAL; +- } +- +- ret = spdk_bdev_unmap(blk_endpoint->bdev_desc, blk_endpoint->io_channel, +- desc->sector * 512, desc->num_sectors * 512, +- blk_request_complete_cb, blk_req); +- if (ret) { +- SPDK_ERRLOG("UNMAP error\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return ret; +- } +- break; +- case VIRTIO_BLK_T_WRITE_ZEROES: +- desc = req->iovs[1].iov_base; +- if (payload_len != sizeof(*desc)) { +- SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return -1; +- } +- +- /* Unmap 
this range, SPDK doesn't support it, kernel will enable this flag by default +- * without checking unmap feature is negotiated or not, the flag isn't mandatory, so +- * just print a warning. +- */ +- if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { +- SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n", +- (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512); +- } +- +- ret = spdk_bdev_write_zeroes(blk_endpoint->bdev_desc, blk_endpoint->io_channel, +- desc->sector * 512, desc->num_sectors * 512, +- blk_request_complete_cb, blk_req); +- if (ret) { +- SPDK_ERRLOG("WRITE ZEROES error\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return ret; +- } +- break; +- case VIRTIO_BLK_T_FLUSH: +- flush_bytes = spdk_bdev_get_num_blocks(blk_endpoint->bdev) * spdk_bdev_get_block_size( +- blk_endpoint->bdev); +- if (hdr->sector != 0) { +- SPDK_NOTICELOG("sector must be zero for flush command\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return -EINVAL; +- } +- ret = spdk_bdev_flush(blk_endpoint->bdev_desc, blk_endpoint->io_channel, +- 0, flush_bytes, +- blk_request_complete_cb, blk_req); +- if (ret) { +- SPDK_ERRLOG("FLUSH error\n"); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); +- return ret; +- } +- break; +- case VIRTIO_BLK_T_GET_ID: +- if (!iovcnt || !payload_len) { +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -EINVAL; +- } +- req->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, req->iovs[1].iov_len); +- spdk_strcpy_pad(req->iovs[1].iov_base, spdk_bdev_get_name(blk_endpoint->bdev), +- req->used_len, ' '); +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_OK); +- break; +- default: +- virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); +- return -ENOTSUP; +- } +- +- return 0; +-} +- +-static void +-virtio_blk_update_config(struct virtio_blk_config *blk_cfg, struct spdk_bdev *bdev, +- uint16_t num_queues) +-{ +- memset(blk_cfg, 0, sizeof(*blk_cfg)); +- +- if (!bdev) { +- return; +- } +- +- blk_cfg->blk_size = spdk_bdev_get_block_size(bdev); +- blk_cfg->capacity = (blk_cfg->blk_size * spdk_bdev_get_num_blocks(bdev)) / 512; +- /* minimum I/O size in blocks */ +- blk_cfg->min_io_size = 1; +- blk_cfg->num_queues = num_queues; +- +- if (spdk_bdev_get_buf_align(bdev) > 1) { +- blk_cfg->size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; +- blk_cfg->seg_max = spdk_min(VIRTIO_DEV_MAX_IOVS - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1); +- } else { +- blk_cfg->size_max = 131072; +- /* -2 for REQ and RESP and -1 for region boundary splitting */ +- blk_cfg->seg_max = VIRTIO_DEV_MAX_IOVS - 2 - 1; +- } +- +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { +- /* 16MiB, expressed in 512 Bytes */ +- blk_cfg->max_discard_sectors = 32768; +- blk_cfg->max_discard_seg = 1; +- blk_cfg->discard_sector_alignment = blk_cfg->blk_size / 512; +- } +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { +- blk_cfg->max_write_zeroes_sectors = 32768; +- blk_cfg->max_write_zeroes_seg = 1; +- } +-} +- +-static void +-_vfu_virtio_blk_bdev_close(void *arg1) +-{ +- struct spdk_bdev_desc *bdev_desc = arg1; +- +- spdk_bdev_close(bdev_desc); +-} +- +-static void +-bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, +- void *event_ctx) +-{ +- struct virtio_blk_endpoint *blk_endpoint = event_ctx; +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "Bdev event: type %d, name %s\n", type, bdev->name); +- +- switch (type) { +- case SPDK_BDEV_EVENT_REMOVE: +- SPDK_NOTICELOG("bdev name (%s) received 
event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); +- virtio_blk_update_config(&blk_endpoint->blk_cfg, NULL, 0); +- +- if (blk_endpoint->io_channel) { +- spdk_thread_send_msg(blk_endpoint->virtio.thread, _virtio_blk_stop_msg, blk_endpoint); +- } +- +- if (blk_endpoint->bdev_desc) { +- spdk_thread_send_msg(blk_endpoint->init_thread, _vfu_virtio_blk_bdev_close, +- blk_endpoint->bdev_desc); +- blk_endpoint->bdev_desc = NULL; +- } +- break; +- case SPDK_BDEV_EVENT_RESIZE: +- SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); +- virtio_blk_update_config(&blk_endpoint->blk_cfg, blk_endpoint->bdev, +- blk_endpoint->virtio.num_queues); +- vfu_virtio_notify_config(&blk_endpoint->virtio); +- break; +- default: +- SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); +- break; +- } +-} +- +-static uint64_t +-virtio_blk_get_supported_features(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- uint64_t features; +- struct spdk_bdev *bdev; +- +- features = VIRTIO_BLK_SUPPORTED_FEATURES | VIRTIO_HOST_SUPPORTED_FEATURES; +- +- if (!virtio_endpoint->packed_ring) { +- features &= ~(1ULL << VIRTIO_F_RING_PACKED); +- } +- bdev = blk_endpoint->bdev; +- +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { +- features |= (1ULL << VIRTIO_BLK_F_DISCARD); +- } +- +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { +- features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); +- } +- +- if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { +- features |= (1ULL << VIRTIO_BLK_F_FLUSH); +- } +- +- return features; +-} +- +-static int +-virtio_blk_get_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- uint64_t offset, uint64_t count) +-{ +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- uint8_t *blk_cfg; +- uint64_t len; +- +- if (offset >= sizeof(struct virtio_blk_config)) { +- return -EINVAL; +- } +- len = spdk_min(sizeof(struct virtio_blk_config) - offset, count); +- +- blk_cfg = (uint8_t *)&blk_endpoint->blk_cfg; +- memcpy(buf, blk_cfg + offset, len); +- +- return 0; +-} +- +-static struct vfu_virtio_req * +-virtio_blk_alloc_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) +-{ +- struct virtio_blk_req *blk_req; +- +- blk_req = calloc(1, sizeof(*blk_req) + dma_sg_size() * (VIRTIO_DEV_MAX_IOVS + 1)); +- if (!blk_req) { +- return NULL; +- } +- +- return &blk_req->req; +-} +- +-static void +-virtio_blk_free_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req) +-{ +- struct virtio_blk_req *blk_req = to_blk_request(req); +- +- free(blk_req); +-} +- +-struct vfu_virtio_ops virtio_blk_ops = { +- .get_device_features = virtio_blk_get_supported_features, +- .alloc_req = virtio_blk_alloc_req, +- .free_req = virtio_blk_free_req, +- .exec_request = virtio_blk_process_req, +- .get_config = virtio_blk_get_device_specific_config, +- .start_device = virtio_blk_start, +- .stop_device = virtio_blk_stop, +-}; +- +-int +-vfu_virtio_blk_add_bdev(const char *name, const char *bdev_name, +- uint16_t num_queues, uint16_t qsize, bool packed_ring) +-{ +- struct spdk_vfu_endpoint *endpoint; +- struct vfu_virtio_endpoint *virtio_endpoint; +- struct virtio_blk_endpoint *blk_endpoint; +- int ret; +- +- endpoint = spdk_vfu_get_endpoint_by_name(name); +- if (!endpoint) { +- SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); +- return -ENOENT; +- } +- 
+- virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- blk_endpoint = to_blk_endpoint(virtio_endpoint); +- +- if (blk_endpoint->bdev_desc) { +- SPDK_ERRLOG("%s: block device already exists\n", spdk_vfu_get_endpoint_id(endpoint)); +- return -EEXIST; +- } +- +- if (num_queues && (num_queues <= VIRTIO_DEV_MAX_VQS)) { +- blk_endpoint->virtio.num_queues = num_queues; +- } +- if (qsize && (qsize <= VIRTIO_VQ_MAX_SIZE)) { +- blk_endpoint->virtio.qsize = qsize; +- } +- blk_endpoint->virtio.packed_ring = packed_ring; +- +- SPDK_DEBUGLOG(vfu_virtio_blk, "%s: add block device %s, num_queues %u, qsize %u, packed ring %s\n", +- spdk_vfu_get_endpoint_id(endpoint), +- bdev_name, blk_endpoint->virtio.num_queues, blk_endpoint->virtio.qsize, +- packed_ring ? "enabled" : "disabled"); +- +- ret = spdk_bdev_open_ext(bdev_name, true, bdev_event_cb, blk_endpoint, +- &blk_endpoint->bdev_desc); +- if (ret != 0) { +- SPDK_ERRLOG("%s could not open bdev '%s', error=%d\n", +- name, bdev_name, ret); +- return ret; +- } +- blk_endpoint->bdev = spdk_bdev_desc_get_bdev(blk_endpoint->bdev_desc); +- virtio_blk_update_config(&blk_endpoint->blk_cfg, blk_endpoint->bdev, +- blk_endpoint->virtio.num_queues); +- blk_endpoint->init_thread = spdk_get_thread(); +- +- return 0; +-} +- +-static int +-vfu_virtio_blk_endpoint_destruct(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- +- if (blk_endpoint->bdev_desc) { +- spdk_thread_send_msg(blk_endpoint->init_thread, _vfu_virtio_blk_bdev_close, +- blk_endpoint->bdev_desc); +- blk_endpoint->bdev_desc = NULL; +- } +- +- vfu_virtio_endpoint_destruct(&blk_endpoint->virtio); +- free(blk_endpoint); +- +- return 0; +-} +- +-static void * +-vfu_virtio_blk_endpoint_init(struct spdk_vfu_endpoint *endpoint, +- char *basename, const char *endpoint_name) +-{ +- struct virtio_blk_endpoint *blk_endpoint; +- int ret; +- +- blk_endpoint = calloc(1, sizeof(*blk_endpoint)); +- if (!blk_endpoint) { +- return NULL; +- } +- +- ret = vfu_virtio_endpoint_setup(&blk_endpoint->virtio, endpoint, basename, endpoint_name, +- &virtio_blk_ops); +- if (ret) { +- SPDK_ERRLOG("Error to setup endpoint %s\n", endpoint_name); +- free(blk_endpoint); +- return NULL; +- } +- +- return (void *)&blk_endpoint->virtio; +-} +- +-static int +-vfu_virtio_blk_get_device_info(struct spdk_vfu_endpoint *endpoint, +- struct spdk_vfu_pci_device *device_info) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); +- +- vfu_virtio_get_device_info(&blk_endpoint->virtio, device_info); +- /* Fill Device ID */ +- device_info->id.did = PCI_DEVICE_ID_VIRTIO_BLK_MODERN; +- +- return 0; +-} +- +-struct spdk_vfu_endpoint_ops vfu_virtio_blk_ops = { +- .name = "virtio_blk", +- .init = vfu_virtio_blk_endpoint_init, +- .get_device_info = vfu_virtio_blk_get_device_info, +- .get_vendor_capability = vfu_virtio_get_vendor_capability, +- .post_memory_add = vfu_virtio_post_memory_add, +- .pre_memory_remove = vfu_virtio_pre_memory_remove, +- .reset_device = vfu_virtio_pci_reset_cb, +- .quiesce_device = vfu_virtio_quiesce_cb, +- .destruct = vfu_virtio_blk_endpoint_destruct, +- .attach_device = vfu_virtio_attach_device, +- .detach_device = vfu_virtio_detach_device, +-}; +- +-static void +-__attribute__((constructor)) _vfu_virtio_blk_pci_model_register(void) +-{ +- 
spdk_vfu_register_endpoint_ops(&vfu_virtio_blk_ops); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_blk) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++/* ++ * virtio-blk over vfio-user transport ++ */ ++#include ++ ++#include "spdk/env.h" ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++#include "spdk/stdinc.h" ++#include "spdk/assert.h" ++#include "spdk/barrier.h" ++#include "spdk/thread.h" ++#include "spdk/memory.h" ++#include "spdk/util.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/likely.h" ++#include "spdk/pci_ids.h" ++ ++#include "vfu_virtio_internal.h" ++ ++#define VIRTIO_BLK_SUPPORTED_FEATURES ((1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ ++ (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ ++ (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ ++ (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ ++ (1ULL << VIRTIO_BLK_F_MQ)) ++ ++struct virtio_blk_endpoint { ++ struct vfu_virtio_endpoint virtio; ++ ++ /* virtio_blk specific configurations */ ++ struct spdk_thread *init_thread; ++ struct spdk_bdev *bdev; ++ struct spdk_bdev_desc *bdev_desc; ++ struct spdk_io_channel *io_channel; ++ struct virtio_blk_config blk_cfg; ++ ++ /* virtio_blk ring process poller */ ++ struct spdk_poller *ring_poller; ++}; ++ ++struct virtio_blk_req { ++ volatile uint8_t *status; ++ struct virtio_blk_endpoint *endpoint; ++ /* KEEP req at last */ ++ struct vfu_virtio_req req; ++}; ++ ++static inline struct virtio_blk_endpoint * ++to_blk_endpoint(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ return SPDK_CONTAINEROF(virtio_endpoint, struct virtio_blk_endpoint, virtio); ++} ++ ++static inline struct virtio_blk_req * ++to_blk_request(struct vfu_virtio_req *request) ++{ ++ return SPDK_CONTAINEROF(request, struct virtio_blk_req, req); ++} ++ ++static int ++vfu_virtio_blk_vring_poll(void *ctx) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = ctx; ++ struct vfu_virtio_dev *dev = blk_endpoint->virtio.dev; ++ struct vfu_virtio_vq *vq; ++ uint32_t i, count = 0; ++ ++ if (spdk_unlikely(!virtio_dev_is_started(dev))) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ if (spdk_unlikely(blk_endpoint->virtio.quiesce_in_progress)) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ for (i = 0; i < dev->num_queues; i++) { ++ vq = &dev->vqs[i]; ++ if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { ++ continue; ++ } ++ ++ vfu_virtio_vq_flush_irq(dev, vq); ++ ++ if (vq->packed.packed_ring) { ++ /* packed vring */ ++ count += vfu_virito_dev_process_packed_ring(dev, vq); ++ } else { ++ /* split vring */ ++ count += vfu_virito_dev_process_split_ring(dev, vq); ++ } ++ } ++ ++ return count ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; ++} ++ ++static int ++virtio_blk_start(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ ++ if (blk_endpoint->ring_poller) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "starting %s\n", virtio_endpoint->dev->name); ++ blk_endpoint->io_channel = spdk_bdev_get_io_channel(blk_endpoint->bdev_desc); ++ blk_endpoint->ring_poller = SPDK_POLLER_REGISTER(vfu_virtio_blk_vring_poll, blk_endpoint, 0); ++ ++ return 0; ++} ++ ++static void ++_virtio_blk_stop_msg(void *ctx) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = ctx; ++ ++ spdk_poller_unregister(&blk_endpoint->ring_poller); ++ spdk_put_io_channel(blk_endpoint->io_channel); ++ blk_endpoint->io_channel = NULL; ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "%s is stopped\n", ++ spdk_vfu_get_endpoint_id(blk_endpoint->virtio.endpoint)); ++} ++ ++static int ++virtio_blk_stop(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ ++ if (!blk_endpoint->io_channel) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "%s stopping\n", spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint)); ++ spdk_thread_send_msg(virtio_endpoint->thread, _virtio_blk_stop_msg, blk_endpoint); ++ return 0; ++} ++ ++static void ++virtio_blk_req_finish(struct virtio_blk_req *blk_req, uint8_t status) ++{ ++ struct vfu_virtio_req *req = &blk_req->req; ++ ++ if (spdk_likely(blk_req->status)) { ++ *blk_req->status = status; ++ blk_req->status = NULL; ++ } ++ ++ vfu_virtio_finish_req(req); ++} ++ ++static void ++blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) ++{ ++ struct virtio_blk_req *blk_req = cb_arg; ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "IO done status %u\n", success); ++ ++ spdk_bdev_free_io(bdev_io); ++ virtio_blk_req_finish(blk_req, success ? 
VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); ++} ++ ++static int ++virtio_blk_process_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ struct virtio_blk_req *blk_req = to_blk_request(req); ++ const struct virtio_blk_outhdr *hdr; ++ struct virtio_blk_discard_write_zeroes *desc; ++ struct iovec *iov; ++ uint16_t iovcnt; ++ uint64_t flush_bytes; ++ uint32_t type; ++ uint32_t payload_len; ++ int ret; ++ ++ blk_req->endpoint = blk_endpoint; ++ ++ iov = &req->iovs[0]; ++ if (spdk_unlikely(iov->iov_len != sizeof(*hdr))) { ++ SPDK_ERRLOG("Invalid virtio_blk header length %lu\n", iov->iov_len); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -EINVAL; ++ } ++ hdr = iov->iov_base; ++ ++ iov = &req->iovs[req->iovcnt - 1]; ++ if (spdk_unlikely(iov->iov_len != 1)) { ++ SPDK_ERRLOG("Invalid virtio_blk response length %lu\n", iov->iov_len); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -EINVAL; ++ } ++ blk_req->status = iov->iov_base; ++ ++ payload_len = req->payload_size; ++ payload_len -= sizeof(*hdr) + 1; ++ iovcnt = req->iovcnt - 2; ++ ++ type = hdr->type; ++ /* Legacy type isn't supported */ ++ type &= ~VIRTIO_BLK_T_BARRIER; ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "%s: type %u, iovcnt %u, payload_len %u\n", ++ spdk_vfu_get_endpoint_id(virtio_endpoint->endpoint), ++ type, iovcnt, payload_len); ++ ++ if (spdk_unlikely(blk_endpoint->bdev_desc == NULL)) { ++ SPDK_ERRLOG("Bdev has been removed\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return 0; ++ } ++ ++ switch (type) { ++ case VIRTIO_BLK_T_IN: ++ case VIRTIO_BLK_T_OUT: ++ if (spdk_unlikely(payload_len == 0 || (payload_len & (512 - 1)) != 0)) { ++ SPDK_ERRLOG("Invalid payload length %u\n", payload_len); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -EINVAL; ++ } ++ if (type == VIRTIO_BLK_T_IN) { ++ req->used_len = payload_len + 1; ++ ret = spdk_bdev_readv(blk_endpoint->bdev_desc, blk_endpoint->io_channel, ++ &req->iovs[1], iovcnt, hdr->sector * 512, ++ payload_len, blk_request_complete_cb, blk_req); ++ } else { ++ req->used_len = 1; ++ ret = spdk_bdev_writev(blk_endpoint->bdev_desc, blk_endpoint->io_channel, ++ &req->iovs[1], iovcnt, hdr->sector * 512, ++ payload_len, blk_request_complete_cb, blk_req); ++ } ++ if (ret) { ++ SPDK_ERRLOG("R/W error\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return ret; ++ } ++ break; ++ case VIRTIO_BLK_T_DISCARD: ++ desc = req->iovs[1].iov_base; ++ if (payload_len != sizeof(*desc)) { ++ SPDK_NOTICELOG("Invalid discard payload size: %u\n", payload_len); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return -EINVAL; ++ } ++ ++ if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { ++ SPDK_ERRLOG("UNMAP flag is only used for WRITE ZEROES command\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -EINVAL; ++ } ++ ++ ret = spdk_bdev_unmap(blk_endpoint->bdev_desc, blk_endpoint->io_channel, ++ desc->sector * 512, desc->num_sectors * 512, ++ blk_request_complete_cb, blk_req); ++ if (ret) { ++ SPDK_ERRLOG("UNMAP error\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return ret; ++ } ++ break; ++ case VIRTIO_BLK_T_WRITE_ZEROES: ++ desc = req->iovs[1].iov_base; ++ if (payload_len != sizeof(*desc)) { ++ SPDK_NOTICELOG("Invalid write zeroes payload size: %u\n", payload_len); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return -1; ++ } ++ ++ /* Unmap 
this range, SPDK doesn't support it, kernel will enable this flag by default ++ * without checking unmap feature is negotiated or not, the flag isn't mandatory, so ++ * just print a warning. ++ */ ++ if (desc->flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { ++ SPDK_WARNLOG("Ignore the unmap flag for WRITE ZEROES from %"PRIx64", len %"PRIx64"\n", ++ (uint64_t)desc->sector * 512, (uint64_t)desc->num_sectors * 512); ++ } ++ ++ ret = spdk_bdev_write_zeroes(blk_endpoint->bdev_desc, blk_endpoint->io_channel, ++ desc->sector * 512, desc->num_sectors * 512, ++ blk_request_complete_cb, blk_req); ++ if (ret) { ++ SPDK_ERRLOG("WRITE ZEROES error\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return ret; ++ } ++ break; ++ case VIRTIO_BLK_T_FLUSH: ++ flush_bytes = spdk_bdev_get_num_blocks(blk_endpoint->bdev) * spdk_bdev_get_block_size( ++ blk_endpoint->bdev); ++ if (hdr->sector != 0) { ++ SPDK_NOTICELOG("sector must be zero for flush command\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return -EINVAL; ++ } ++ ret = spdk_bdev_flush(blk_endpoint->bdev_desc, blk_endpoint->io_channel, ++ 0, flush_bytes, ++ blk_request_complete_cb, blk_req); ++ if (ret) { ++ SPDK_ERRLOG("FLUSH error\n"); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_IOERR); ++ return ret; ++ } ++ break; ++ case VIRTIO_BLK_T_GET_ID: ++ if (!iovcnt || !payload_len) { ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -EINVAL; ++ } ++ req->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, req->iovs[1].iov_len); ++ spdk_strcpy_pad(req->iovs[1].iov_base, spdk_bdev_get_name(blk_endpoint->bdev), ++ req->used_len, ' '); ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_OK); ++ break; ++ default: ++ virtio_blk_req_finish(blk_req, VIRTIO_BLK_S_UNSUPP); ++ return -ENOTSUP; ++ } ++ ++ return 0; ++} ++ ++static void ++virtio_blk_update_config(struct virtio_blk_config *blk_cfg, struct spdk_bdev *bdev, ++ uint16_t num_queues) ++{ ++ memset(blk_cfg, 0, sizeof(*blk_cfg)); ++ ++ if (!bdev) { ++ return; ++ } ++ ++ blk_cfg->blk_size = spdk_bdev_get_block_size(bdev); ++ blk_cfg->capacity = (blk_cfg->blk_size * spdk_bdev_get_num_blocks(bdev)) / 512; ++ /* minimum I/O size in blocks */ ++ blk_cfg->min_io_size = 1; ++ blk_cfg->num_queues = num_queues; ++ ++ if (spdk_bdev_get_buf_align(bdev) > 1) { ++ blk_cfg->size_max = SPDK_BDEV_LARGE_BUF_MAX_SIZE; ++ blk_cfg->seg_max = spdk_min(VIRTIO_DEV_MAX_IOVS - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1); ++ } else { ++ blk_cfg->size_max = 131072; ++ /* -2 for REQ and RESP and -1 for region boundary splitting */ ++ blk_cfg->seg_max = VIRTIO_DEV_MAX_IOVS - 2 - 1; ++ } ++ ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { ++ /* 16MiB, expressed in 512 Bytes */ ++ blk_cfg->max_discard_sectors = 32768; ++ blk_cfg->max_discard_seg = 1; ++ blk_cfg->discard_sector_alignment = blk_cfg->blk_size / 512; ++ } ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { ++ blk_cfg->max_write_zeroes_sectors = 32768; ++ blk_cfg->max_write_zeroes_seg = 1; ++ } ++} ++ ++static void ++_vfu_virtio_blk_bdev_close(void *arg1) ++{ ++ struct spdk_bdev_desc *bdev_desc = arg1; ++ ++ spdk_bdev_close(bdev_desc); ++} ++ ++static void ++bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, ++ void *event_ctx) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = event_ctx; ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "Bdev event: type %d, name %s\n", type, bdev->name); ++ ++ switch (type) { ++ case SPDK_BDEV_EVENT_REMOVE: ++ SPDK_NOTICELOG("bdev name (%s) received 
event(SPDK_BDEV_EVENT_REMOVE)\n", bdev->name); ++ virtio_blk_update_config(&blk_endpoint->blk_cfg, NULL, 0); ++ ++ if (blk_endpoint->io_channel) { ++ spdk_thread_send_msg(blk_endpoint->virtio.thread, _virtio_blk_stop_msg, blk_endpoint); ++ } ++ ++ if (blk_endpoint->bdev_desc) { ++ spdk_thread_send_msg(blk_endpoint->init_thread, _vfu_virtio_blk_bdev_close, ++ blk_endpoint->bdev_desc); ++ blk_endpoint->bdev_desc = NULL; ++ } ++ break; ++ case SPDK_BDEV_EVENT_RESIZE: ++ SPDK_NOTICELOG("bdev name (%s) received event(SPDK_BDEV_EVENT_RESIZE)\n", bdev->name); ++ virtio_blk_update_config(&blk_endpoint->blk_cfg, blk_endpoint->bdev, ++ blk_endpoint->virtio.num_queues); ++ vfu_virtio_notify_config(&blk_endpoint->virtio); ++ break; ++ default: ++ SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type); ++ break; ++ } ++} ++ ++static uint64_t ++virtio_blk_get_supported_features(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ uint64_t features; ++ struct spdk_bdev *bdev; ++ ++ features = VIRTIO_BLK_SUPPORTED_FEATURES | VIRTIO_HOST_SUPPORTED_FEATURES; ++ ++ if (!virtio_endpoint->packed_ring) { ++ features &= ~(1ULL << VIRTIO_F_RING_PACKED); ++ } ++ bdev = blk_endpoint->bdev; ++ ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { ++ features |= (1ULL << VIRTIO_BLK_F_DISCARD); ++ } ++ ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { ++ features |= (1ULL << VIRTIO_BLK_F_WRITE_ZEROES); ++ } ++ ++ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { ++ features |= (1ULL << VIRTIO_BLK_F_FLUSH); ++ } ++ ++ return features; ++} ++ ++static int ++virtio_blk_get_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ uint64_t offset, uint64_t count) ++{ ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ uint8_t *blk_cfg; ++ uint64_t len; ++ ++ if (offset >= sizeof(struct virtio_blk_config)) { ++ return -EINVAL; ++ } ++ len = spdk_min(sizeof(struct virtio_blk_config) - offset, count); ++ ++ blk_cfg = (uint8_t *)&blk_endpoint->blk_cfg; ++ memcpy(buf, blk_cfg + offset, len); ++ ++ return 0; ++} ++ ++static struct vfu_virtio_req * ++virtio_blk_alloc_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) ++{ ++ struct virtio_blk_req *blk_req; ++ ++ blk_req = calloc(1, sizeof(*blk_req) + dma_sg_size() * (VIRTIO_DEV_MAX_IOVS + 1)); ++ if (!blk_req) { ++ return NULL; ++ } ++ ++ return &blk_req->req; ++} ++ ++static void ++virtio_blk_free_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req) ++{ ++ struct virtio_blk_req *blk_req = to_blk_request(req); ++ ++ free(blk_req); ++} ++ ++struct vfu_virtio_ops virtio_blk_ops = { ++ .get_device_features = virtio_blk_get_supported_features, ++ .alloc_req = virtio_blk_alloc_req, ++ .free_req = virtio_blk_free_req, ++ .exec_request = virtio_blk_process_req, ++ .get_config = virtio_blk_get_device_specific_config, ++ .start_device = virtio_blk_start, ++ .stop_device = virtio_blk_stop, ++}; ++ ++int ++vfu_virtio_blk_add_bdev(const char *name, const char *bdev_name, ++ uint16_t num_queues, uint16_t qsize, bool packed_ring) ++{ ++ struct spdk_vfu_endpoint *endpoint; ++ struct vfu_virtio_endpoint *virtio_endpoint; ++ struct virtio_blk_endpoint *blk_endpoint; ++ int ret; ++ ++ endpoint = spdk_vfu_get_endpoint_by_name(name); ++ if (!endpoint) { ++ SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); ++ return -ENOENT; ++ } ++ 
++ virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ ++ if (blk_endpoint->bdev_desc) { ++ SPDK_ERRLOG("%s: block device already exists\n", spdk_vfu_get_endpoint_id(endpoint)); ++ return -EEXIST; ++ } ++ ++ if (num_queues && (num_queues <= VIRTIO_DEV_MAX_VQS)) { ++ blk_endpoint->virtio.num_queues = num_queues; ++ } ++ if (qsize && (qsize <= VIRTIO_VQ_MAX_SIZE)) { ++ blk_endpoint->virtio.qsize = qsize; ++ } ++ blk_endpoint->virtio.packed_ring = packed_ring; ++ ++ SPDK_DEBUGLOG(vfu_virtio_blk, "%s: add block device %s, num_queues %u, qsize %u, packed ring %s\n", ++ spdk_vfu_get_endpoint_id(endpoint), ++ bdev_name, blk_endpoint->virtio.num_queues, blk_endpoint->virtio.qsize, ++ packed_ring ? "enabled" : "disabled"); ++ ++ ret = spdk_bdev_open_ext(bdev_name, true, bdev_event_cb, blk_endpoint, ++ &blk_endpoint->bdev_desc); ++ if (ret != 0) { ++ SPDK_ERRLOG("%s could not open bdev '%s', error=%d\n", ++ name, bdev_name, ret); ++ return ret; ++ } ++ blk_endpoint->bdev = spdk_bdev_desc_get_bdev(blk_endpoint->bdev_desc); ++ virtio_blk_update_config(&blk_endpoint->blk_cfg, blk_endpoint->bdev, ++ blk_endpoint->virtio.num_queues); ++ blk_endpoint->init_thread = spdk_get_thread(); ++ ++ return 0; ++} ++ ++static int ++vfu_virtio_blk_endpoint_destruct(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ ++ if (blk_endpoint->bdev_desc) { ++ spdk_thread_send_msg(blk_endpoint->init_thread, _vfu_virtio_blk_bdev_close, ++ blk_endpoint->bdev_desc); ++ blk_endpoint->bdev_desc = NULL; ++ } ++ ++ vfu_virtio_endpoint_destruct(&blk_endpoint->virtio); ++ free(blk_endpoint); ++ ++ return 0; ++} ++ ++static void * ++vfu_virtio_blk_endpoint_init(struct spdk_vfu_endpoint *endpoint, ++ char *basename, const char *endpoint_name) ++{ ++ struct virtio_blk_endpoint *blk_endpoint; ++ int ret; ++ ++ blk_endpoint = calloc(1, sizeof(*blk_endpoint)); ++ if (!blk_endpoint) { ++ return NULL; ++ } ++ ++ ret = vfu_virtio_endpoint_setup(&blk_endpoint->virtio, endpoint, basename, endpoint_name, ++ &virtio_blk_ops); ++ if (ret) { ++ SPDK_ERRLOG("Error to setup endpoint %s\n", endpoint_name); ++ free(blk_endpoint); ++ return NULL; ++ } ++ ++ return (void *)&blk_endpoint->virtio; ++} ++ ++static int ++vfu_virtio_blk_get_device_info(struct spdk_vfu_endpoint *endpoint, ++ struct spdk_vfu_pci_device *device_info) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct virtio_blk_endpoint *blk_endpoint = to_blk_endpoint(virtio_endpoint); ++ ++ vfu_virtio_get_device_info(&blk_endpoint->virtio, device_info); ++ /* Fill Device ID */ ++ device_info->id.did = PCI_DEVICE_ID_VIRTIO_BLK_MODERN; ++ ++ return 0; ++} ++ ++struct spdk_vfu_endpoint_ops vfu_virtio_blk_ops = { ++ .name = "virtio_blk", ++ .init = vfu_virtio_blk_endpoint_init, ++ .get_device_info = vfu_virtio_blk_get_device_info, ++ .get_vendor_capability = vfu_virtio_get_vendor_capability, ++ .post_memory_add = vfu_virtio_post_memory_add, ++ .pre_memory_remove = vfu_virtio_pre_memory_remove, ++ .reset_device = vfu_virtio_pci_reset_cb, ++ .quiesce_device = vfu_virtio_quiesce_cb, ++ .destruct = vfu_virtio_blk_endpoint_destruct, ++ .attach_device = vfu_virtio_attach_device, ++ .detach_device = vfu_virtio_detach_device, ++}; ++ ++static void ++__attribute__((constructor)) _vfu_virtio_blk_pci_model_register(void) ++{ ++ 
spdk_vfu_register_endpoint_ops(&vfu_virtio_blk_ops); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_blk) +diff --git a/module/vfu_device/vfu_virtio_internal.h b/module/vfu_device/vfu_virtio_internal.h +index 992cf72..43aa213 100644 +--- a/module/vfu_device/vfu_virtio_internal.h ++++ b/module/vfu_device/vfu_virtio_internal.h +@@ -1,408 +1,408 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. +- */ +- +-#ifndef _VFU_VIRTIO_INTERNAL_H +-#define _VFU_VIRTIO_INTERNAL_H +- +-#include +-#include +-#include +- +-#include "spdk/vfu_target.h" +- +-#define VIRTIO_HOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_F_VERSION_1) | \ +- (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ +- (1ULL << VIRTIO_F_RING_PACKED)) +- +-/* virtio device layout: +- * +- * region 1: MSI-X Table +- * region 2: MSI-X PBA +- * region 4: virtio modern memory 64bits BAR +- * Common configuration 0x0 - 0x1000 +- * ISR access 0x1000 - 0x2000 +- * Device specific configuration 0x2000 - 0x3000 +- * Notifications 0x3000 - 0x4000 +- */ +-#define VIRTIO_PCI_COMMON_CFG_OFFSET (0x0) +-#define VIRTIO_PCI_COMMON_CFG_LENGTH (0x1000) +-#define VIRTIO_PCI_ISR_ACCESS_OFFSET (VIRTIO_PCI_COMMON_CFG_OFFSET + VIRTIO_PCI_COMMON_CFG_LENGTH) +-#define VIRTIO_PCI_ISR_ACCESS_LENGTH (0x1000) +-#define VIRTIO_PCI_SPECIFIC_CFG_OFFSET (VIRTIO_PCI_ISR_ACCESS_OFFSET + VIRTIO_PCI_ISR_ACCESS_LENGTH) +-#define VIRTIO_PCI_SPECIFIC_CFG_LENGTH (0x1000) +-#define VIRTIO_PCI_NOTIFICATIONS_OFFSET (VIRTIO_PCI_SPECIFIC_CFG_OFFSET + VIRTIO_PCI_SPECIFIC_CFG_LENGTH) +-#define VIRTIO_PCI_NOTIFICATIONS_LENGTH (0x1000) +- +-#define VIRTIO_PCI_BAR4_LENGTH (VIRTIO_PCI_NOTIFICATIONS_OFFSET + VIRTIO_PCI_NOTIFICATIONS_LENGTH) +- +-#define VIRTIO_DEV_MAX_IOVS (129) +-/* Maximum number of requests which can be processed one time */ +-#define VIRTIO_DEV_VRING_MAX_REQS (32) +-/* Maximum number of queues can be supported by virtio device */ +-#define VIRTIO_DEV_MAX_VQS (64) +-/* Default queue size */ +-#define VIRTIO_VQ_DEFAULT_SIZE (128) +-/* Maximum queue size */ +-#define VIRTIO_VQ_MAX_SIZE (1024) +- +-struct vfu_virtio_endpoint; +-struct vfu_virtio_req; +- +-struct virtio_pci_cfg { +- /* Common PCI configuration */ +- uint32_t guest_feat_lo; +- uint32_t guest_feat_hi; +- +- /* Negotiated feature bits */ +- uint64_t guest_features; +- +- uint32_t host_feature_select; +- uint32_t guest_feature_select; +- +- uint16_t msix_config; +- uint8_t device_status; +- uint8_t config_generation; +- uint16_t queue_select; +- +- /* ISR access */ +- uint8_t isr; +-}; +- +-enum vfu_vq_state { +- VFU_VQ_CREATED = 0, +- VFU_VQ_ACTIVE, +- VFU_VQ_INACTIVE, +-}; +- +-struct q_mapping { +- /* iov of local process mapping. */ +- struct iovec iov; +- /* Stored sg, needed for unmap. 
*/ +- dma_sg_t *sg; +- /* physical address */ +- uint64_t phys_addr; +- /* virtual address */ +- union { +- void *addr; +- +- struct vring_desc *desc; +- struct vring_packed_desc *desc_packed; +- +- struct vring_avail *avail; +- struct vring_packed_desc_event *driver_event; +- +- struct vring_used *used; +- struct vring_packed_desc_event *device_event; +- }; +- /* size in bytes */ +- uint64_t len; +-}; +- +-struct vfu_virtio_vq { +- /* Read Only */ +- uint16_t id; +- uint16_t qsize; +- +- bool enabled; +- uint16_t vector; +- +- enum vfu_vq_state q_state; +- STAILQ_HEAD(, vfu_virtio_req) free_reqs; +- +- uint32_t desc_lo; +- uint32_t desc_hi; +- uint32_t avail_lo; +- uint32_t avail_hi; +- uint32_t used_lo; +- uint32_t used_hi; +- +- struct q_mapping avail; +- struct q_mapping used; +- struct q_mapping desc; +- +- uint16_t last_avail_idx; +- uint16_t last_used_idx; +- +- struct { +- /* To mark a descriptor as available in packed ring +- * Equal to avail_wrap_counter in spec. +- */ +- uint8_t avail_phase : 1; +- /* To mark a descriptor as used in packed ring +- * Equal to used_wrap_counter in spec. +- */ +- uint8_t used_phase : 1; +- uint8_t padding : 5; +- bool packed_ring : 1; +- } packed; +- +- /* Request count from last event */ +- uint16_t used_req_cnt; +- /* Next time when we need to send event */ +- uint64_t next_event_time; +-}; +- +-struct vfu_virtio_dev { +- char name[SPDK_VFU_MAX_NAME_LEN]; +- /* RO for Guest Driver */ +- uint16_t num_queues; +- /* Supported feature bits by host driver, RO for Guest Driver */ +- uint64_t host_features; +- +- struct virtio_pci_cfg cfg; +- struct vfu_virtio_vq vqs[VIRTIO_DEV_MAX_VQS]; +- +- struct vfu_virtio_endpoint *virtio_endpoint; +- +- /* VIRTIO_DEV_MAX_VQS * 3 worth of dma_sg_size() */ +- uint8_t sg[]; +-}; +- +-struct vfu_virtio_ops { +- uint64_t (*get_device_features)(struct vfu_virtio_endpoint *virtio_endpoint); +- struct vfu_virtio_req *(*alloc_req)(struct vfu_virtio_endpoint *virtio_endpoint, +- struct vfu_virtio_vq *vq); +- void (*free_req)(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req); +- int (*exec_request)(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req); +- int (*get_config)(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, uint64_t offset, +- uint64_t count); +- int (*set_config)(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, uint64_t offset, +- uint64_t count); +- int (*start_device)(struct vfu_virtio_endpoint *virtio_endpoint); +- int (*stop_device)(struct vfu_virtio_endpoint *virtio_endpoint); +-}; +- +-struct vfu_virtio_endpoint { +- struct vfu_virtio_dev *dev; +- int devmem_fd; +- volatile uint32_t *doorbells; +- +- uint16_t num_queues; +- uint16_t qsize; +- bool packed_ring; +- +- uint32_t coalescing_delay_us; +- +- struct spdk_vfu_endpoint *endpoint; +- struct spdk_thread *thread; +- +- struct vfu_virtio_ops virtio_ops; +- +- /* quiesce poller */ +- uint32_t io_outstanding; +- bool quiesce_in_progress; +- struct spdk_poller *quiesce_poller; +-}; +- +-struct vfu_virtio_req { +- struct vfu_virtio_dev *dev; +- struct vfu_virtio_vq *vq; +- +- STAILQ_ENTRY(vfu_virtio_req) link; +- +- uint32_t payload_size; +- uint32_t used_len; +- +- /* split vring */ +- uint16_t req_idx; +- /* packed vring */ +- uint16_t buffer_id; +- uint16_t num_descs; +- +- uint16_t iovcnt; +- struct iovec iovs[VIRTIO_DEV_MAX_IOVS + 1]; +- uint8_t desc_writeable[VIRTIO_DEV_MAX_IOVS + 1]; +- +- struct iovec *indirect_iov; +- dma_sg_t 
*indirect_sg; +- +- /* VIRIO_DEV_MAX_IOVS + 1 worth of dma_sg_size() */ +- uint8_t sg[]; +-}; +- +-static inline bool +-virtio_guest_has_feature(struct vfu_virtio_dev *dev, uint32_t feature_bit) +-{ +- assert(feature_bit <= 64); +- +- return !!(dev->cfg.guest_features & (1ULL << feature_bit)); +-} +- +-static inline uint64_t +-virtio_queue_desc_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- return sizeof(struct vring_desc) * vq->qsize; +-} +- +-static inline uint64_t +-virtio_queue_avail_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- uint16_t event_size; +- +- if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { +- return sizeof(struct vring_packed_desc_event); +- } +- +- event_size = virtio_guest_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; +- return (sizeof(struct vring_avail) + sizeof(uint16_t) * vq->qsize +- + event_size); +-} +- +-static inline uint64_t +-virtio_queue_used_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- uint16_t event_size; +- +- if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { +- return sizeof(struct vring_packed_desc_event); +- } +- +- event_size = virtio_guest_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; +- return (sizeof(struct vring_used) + sizeof(struct vring_used_elem) * vq->qsize +- + event_size); +-} +- +-static inline bool +-virtio_queue_event_is_suppressed(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) +-{ +- bool is_suppressed = false; +- +- if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { +- is_suppressed = vq->avail.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE; +- } else { +- is_suppressed = vq->avail.avail->flags & VRING_AVAIL_F_NO_INTERRUPT; +- +- } +- +- return is_suppressed; +-} +- +-static inline bool +-virtio_dev_is_started(struct vfu_virtio_dev *dev) +-{ +- return !!(dev->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK); +-} +- +-static inline bool +-virtio_vring_split_desc_is_indirect(struct vring_desc *desc) +-{ +- return !!(desc->flags & VRING_DESC_F_INDIRECT); +-} +- +-static inline bool +-virtio_vring_packed_desc_is_indirect(struct vring_packed_desc *desc) +-{ +- return !!(desc->flags & VRING_DESC_F_INDIRECT); +-} +- +-static inline bool +-virtio_vring_split_desc_is_wr(struct vring_desc *desc) +-{ +- return !!(desc->flags & VRING_DESC_F_WRITE); +-} +- +-static inline bool +-virtio_vring_packed_desc_is_wr(struct vring_packed_desc *desc) +-{ +- return !!(desc->flags & VRING_DESC_F_WRITE); +-} +- +-static inline bool +-virtio_vring_packed_is_avail(struct vring_packed_desc *desc, bool avail_phase) +-{ +- bool avail_flag, used_flag; +- uint16_t flags = desc->flags; +- +- avail_flag = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); +- used_flag = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); +- +- /* To mark a desc as available, the driver sets the F_AVAIL bit in flags +- * to match the internal avail wrap counter. It also sets the F_USED bit to +- * match the inverse value but it's not mandatory. +- */ +- return (avail_flag != used_flag) && (avail_flag == avail_phase); +-} +- +-static inline bool +-virtio_vring_packed_is_used(struct vring_packed_desc *desc, bool used_phase) +-{ +- bool avail_flag, used_flag; +- uint16_t flags = desc->flags; +- +- avail_flag = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); +- used_flag = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); +- +- /* When the descriptor is used, two flags in descriptor +- * avail flag and used flag are set to equal +- * and used flag value == used_wrap_counter. 
+- */ +- return (used_flag == avail_flag) && (used_flag == used_phase); +-} +- +-static inline bool +-virtio_req_iov_is_wr(struct vfu_virtio_req *req, uint32_t iov_num) +-{ +- assert(iov_num <= VIRTIO_DEV_MAX_IOVS); +- return req->desc_writeable[iov_num]; +-} +- +-static inline struct vfu_virtio_req * +-vfu_virtio_vq_alloc_req(struct vfu_virtio_endpoint *endpoint, struct vfu_virtio_vq *vq) +-{ +- assert(endpoint->virtio_ops.alloc_req != NULL); +- return endpoint->virtio_ops.alloc_req(endpoint, vq); +-} +- +-static inline void +-vfu_virtio_vq_free_req(struct vfu_virtio_endpoint *endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req) +-{ +- assert(endpoint->virtio_ops.free_req); +- endpoint->virtio_ops.free_req(endpoint, vq, req); +-} +- +-void virtio_vq_used_ring_split_enqueue(struct vfu_virtio_vq *vq, uint16_t req_idx, +- uint32_t used_len); +-void virtio_vq_used_ring_packed_enqueue(struct vfu_virtio_vq *vq, uint16_t buffer_id, +- uint32_t num_descs, uint32_t used_len); +-struct vfu_virtio_req *virito_dev_packed_ring_get_next_avail_req(struct vfu_virtio_dev *dev, +- struct vfu_virtio_vq *vq); +-struct vfu_virtio_req *virito_dev_split_ring_get_next_avail_req(struct vfu_virtio_dev *dev, +- struct vfu_virtio_vq *vq); +- +-int vfu_virtio_quiesce_cb(struct spdk_vfu_endpoint *endpoint); +- +-void vfu_virtio_dev_put_req(struct vfu_virtio_req *req); +-void vfu_virtio_finish_req(struct vfu_virtio_req *req); +-void vfu_virtio_vq_flush_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); +-int vfu_virito_dev_process_packed_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); +-int vfu_virito_dev_process_split_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); +-void vfu_virtio_notify_config(struct vfu_virtio_endpoint *virtio_endpoint); +-int vfu_virtio_endpoint_setup(struct vfu_virtio_endpoint *virtio_endpoint, +- struct spdk_vfu_endpoint *endpoint, +- char *basename, const char *endpoint_name, +- struct vfu_virtio_ops *ops); +-int vfu_virtio_endpoint_destruct(struct vfu_virtio_endpoint *virtio_endpoint); +-void vfu_virtio_get_device_info(struct vfu_virtio_endpoint *virtio_endpoint, +- struct spdk_vfu_pci_device *device_info); +-int vfu_virtio_attach_device(struct spdk_vfu_endpoint *endpoint); +-int vfu_virtio_detach_device(struct spdk_vfu_endpoint *endpoint); +-uint16_t vfu_virtio_get_vendor_capability(struct spdk_vfu_endpoint *endpoint, char *buf, +- uint16_t buf_len, uint16_t idx); +-int vfu_virtio_post_memory_add(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end); +-int vfu_virtio_pre_memory_remove(struct spdk_vfu_endpoint *endpoint, void *map_start, +- void *map_end); +-int vfu_virtio_pci_reset_cb(struct spdk_vfu_endpoint *endpoint); +-int vfu_virtio_blk_add_bdev(const char *name, const char *bdev_name, +- uint16_t num_queues, uint16_t qsize, bool packed_ring); +-/* virtio_scsi */ +-int vfu_virtio_scsi_add_target(const char *name, uint8_t scsi_target_num, +- const char *bdev_name); +-int vfu_virtio_scsi_remove_target(const char *name, uint8_t scsi_target_num); +-int vfu_virtio_scsi_set_options(const char *name, uint16_t num_io_queues, uint16_t qsize, +- bool packed_ring); +-#endif ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. 
++ */ ++ ++#ifndef _VFU_VIRTIO_INTERNAL_H ++#define _VFU_VIRTIO_INTERNAL_H ++ ++#include ++#include ++#include ++ ++#include "spdk/vfu_target.h" ++ ++#define VIRTIO_HOST_SUPPORTED_FEATURES ((1ULL << VIRTIO_F_VERSION_1) | \ ++ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ ++ (1ULL << VIRTIO_F_RING_PACKED)) ++ ++/* virtio device layout: ++ * ++ * region 1: MSI-X Table ++ * region 2: MSI-X PBA ++ * region 4: virtio modern memory 64bits BAR ++ * Common configuration 0x0 - 0x1000 ++ * ISR access 0x1000 - 0x2000 ++ * Device specific configuration 0x2000 - 0x3000 ++ * Notifications 0x3000 - 0x4000 ++ */ ++#define VIRTIO_PCI_COMMON_CFG_OFFSET (0x0) ++#define VIRTIO_PCI_COMMON_CFG_LENGTH (0x1000) ++#define VIRTIO_PCI_ISR_ACCESS_OFFSET (VIRTIO_PCI_COMMON_CFG_OFFSET + VIRTIO_PCI_COMMON_CFG_LENGTH) ++#define VIRTIO_PCI_ISR_ACCESS_LENGTH (0x1000) ++#define VIRTIO_PCI_SPECIFIC_CFG_OFFSET (VIRTIO_PCI_ISR_ACCESS_OFFSET + VIRTIO_PCI_ISR_ACCESS_LENGTH) ++#define VIRTIO_PCI_SPECIFIC_CFG_LENGTH (0x1000) ++#define VIRTIO_PCI_NOTIFICATIONS_OFFSET (VIRTIO_PCI_SPECIFIC_CFG_OFFSET + VIRTIO_PCI_SPECIFIC_CFG_LENGTH) ++#define VIRTIO_PCI_NOTIFICATIONS_LENGTH (0x1000) ++ ++#define VIRTIO_PCI_BAR4_LENGTH (VIRTIO_PCI_NOTIFICATIONS_OFFSET + VIRTIO_PCI_NOTIFICATIONS_LENGTH) ++ ++#define VIRTIO_DEV_MAX_IOVS (129) ++/* Maximum number of requests which can be processed one time */ ++#define VIRTIO_DEV_VRING_MAX_REQS (32) ++/* Maximum number of queues can be supported by virtio device */ ++#define VIRTIO_DEV_MAX_VQS (64) ++/* Default queue size */ ++#define VIRTIO_VQ_DEFAULT_SIZE (128) ++/* Maximum queue size */ ++#define VIRTIO_VQ_MAX_SIZE (1024) ++ ++struct vfu_virtio_endpoint; ++struct vfu_virtio_req; ++ ++struct virtio_pci_cfg { ++ /* Common PCI configuration */ ++ uint32_t guest_feat_lo; ++ uint32_t guest_feat_hi; ++ ++ /* Negotiated feature bits */ ++ uint64_t guest_features; ++ ++ uint32_t host_feature_select; ++ uint32_t guest_feature_select; ++ ++ uint16_t msix_config; ++ uint8_t device_status; ++ uint8_t config_generation; ++ uint16_t queue_select; ++ ++ /* ISR access */ ++ uint8_t isr; ++}; ++ ++enum vfu_vq_state { ++ VFU_VQ_CREATED = 0, ++ VFU_VQ_ACTIVE, ++ VFU_VQ_INACTIVE, ++}; ++ ++struct q_mapping { ++ /* iov of local process mapping. */ ++ struct iovec iov; ++ /* Stored sg, needed for unmap. */ ++ dma_sg_t *sg; ++ /* physical address */ ++ uint64_t phys_addr; ++ /* virtual address */ ++ union { ++ void *addr; ++ ++ struct vring_desc *desc; ++ struct vring_packed_desc *desc_packed; ++ ++ struct vring_avail *avail; ++ struct vring_packed_desc_event *driver_event; ++ ++ struct vring_used *used; ++ struct vring_packed_desc_event *device_event; ++ }; ++ /* size in bytes */ ++ uint64_t len; ++}; ++ ++struct vfu_virtio_vq { ++ /* Read Only */ ++ uint16_t id; ++ uint16_t qsize; ++ ++ bool enabled; ++ uint16_t vector; ++ ++ enum vfu_vq_state q_state; ++ STAILQ_HEAD(, vfu_virtio_req) free_reqs; ++ ++ uint32_t desc_lo; ++ uint32_t desc_hi; ++ uint32_t avail_lo; ++ uint32_t avail_hi; ++ uint32_t used_lo; ++ uint32_t used_hi; ++ ++ struct q_mapping avail; ++ struct q_mapping used; ++ struct q_mapping desc; ++ ++ uint16_t last_avail_idx; ++ uint16_t last_used_idx; ++ ++ struct { ++ /* To mark a descriptor as available in packed ring ++ * Equal to avail_wrap_counter in spec. ++ */ ++ uint8_t avail_phase : 1; ++ /* To mark a descriptor as used in packed ring ++ * Equal to used_wrap_counter in spec. 
++ */ ++ uint8_t used_phase : 1; ++ uint8_t padding : 5; ++ bool packed_ring : 1; ++ } packed; ++ ++ /* Request count from last event */ ++ uint16_t used_req_cnt; ++ /* Next time when we need to send event */ ++ uint64_t next_event_time; ++}; ++ ++struct vfu_virtio_dev { ++ char name[SPDK_VFU_MAX_NAME_LEN]; ++ /* RO for Guest Driver */ ++ uint16_t num_queues; ++ /* Supported feature bits by host driver, RO for Guest Driver */ ++ uint64_t host_features; ++ ++ struct virtio_pci_cfg cfg; ++ struct vfu_virtio_vq vqs[VIRTIO_DEV_MAX_VQS]; ++ ++ struct vfu_virtio_endpoint *virtio_endpoint; ++ ++ /* VIRTIO_DEV_MAX_VQS * 3 worth of dma_sg_size() */ ++ uint8_t sg[]; ++}; ++ ++struct vfu_virtio_ops { ++ uint64_t (*get_device_features)(struct vfu_virtio_endpoint *virtio_endpoint); ++ struct vfu_virtio_req *(*alloc_req)(struct vfu_virtio_endpoint *virtio_endpoint, ++ struct vfu_virtio_vq *vq); ++ void (*free_req)(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req); ++ int (*exec_request)(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req); ++ int (*get_config)(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, uint64_t offset, ++ uint64_t count); ++ int (*set_config)(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, uint64_t offset, ++ uint64_t count); ++ int (*start_device)(struct vfu_virtio_endpoint *virtio_endpoint); ++ int (*stop_device)(struct vfu_virtio_endpoint *virtio_endpoint); ++}; ++ ++struct vfu_virtio_endpoint { ++ struct vfu_virtio_dev *dev; ++ int devmem_fd; ++ volatile uint32_t *doorbells; ++ ++ uint16_t num_queues; ++ uint16_t qsize; ++ bool packed_ring; ++ ++ uint32_t coalescing_delay_us; ++ ++ struct spdk_vfu_endpoint *endpoint; ++ struct spdk_thread *thread; ++ ++ struct vfu_virtio_ops virtio_ops; ++ ++ /* quiesce poller */ ++ uint32_t io_outstanding; ++ bool quiesce_in_progress; ++ struct spdk_poller *quiesce_poller; ++}; ++ ++struct vfu_virtio_req { ++ struct vfu_virtio_dev *dev; ++ struct vfu_virtio_vq *vq; ++ ++ STAILQ_ENTRY(vfu_virtio_req) link; ++ ++ uint32_t payload_size; ++ uint32_t used_len; ++ ++ /* split vring */ ++ uint16_t req_idx; ++ /* packed vring */ ++ uint16_t buffer_id; ++ uint16_t num_descs; ++ ++ uint16_t iovcnt; ++ struct iovec iovs[VIRTIO_DEV_MAX_IOVS + 1]; ++ uint8_t desc_writeable[VIRTIO_DEV_MAX_IOVS + 1]; ++ ++ struct iovec *indirect_iov; ++ dma_sg_t *indirect_sg; ++ ++ /* VIRIO_DEV_MAX_IOVS + 1 worth of dma_sg_size() */ ++ uint8_t sg[]; ++}; ++ ++static inline bool ++virtio_guest_has_feature(struct vfu_virtio_dev *dev, uint32_t feature_bit) ++{ ++ assert(feature_bit <= 64); ++ ++ return !!(dev->cfg.guest_features & (1ULL << feature_bit)); ++} ++ ++static inline uint64_t ++virtio_queue_desc_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ return sizeof(struct vring_desc) * vq->qsize; ++} ++ ++static inline uint64_t ++virtio_queue_avail_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ uint16_t event_size; ++ ++ if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { ++ return sizeof(struct vring_packed_desc_event); ++ } ++ ++ event_size = virtio_guest_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; ++ return (sizeof(struct vring_avail) + sizeof(uint16_t) * vq->qsize ++ + event_size); ++} ++ ++static inline uint64_t ++virtio_queue_used_size(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ uint16_t event_size; ++ ++ if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { ++ return sizeof(struct vring_packed_desc_event); ++ } ++ ++ event_size = virtio_guest_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; ++ return (sizeof(struct vring_used) + sizeof(struct vring_used_elem) * vq->qsize ++ + event_size); ++} ++ ++static inline bool ++virtio_queue_event_is_suppressed(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq) ++{ ++ bool is_suppressed = false; ++ ++ if (virtio_guest_has_feature(dev, VIRTIO_F_RING_PACKED)) { ++ is_suppressed = vq->avail.driver_event->flags & VRING_PACKED_EVENT_FLAG_DISABLE; ++ } else { ++ is_suppressed = vq->avail.avail->flags & VRING_AVAIL_F_NO_INTERRUPT; ++ ++ } ++ ++ return is_suppressed; ++} ++ ++static inline bool ++virtio_dev_is_started(struct vfu_virtio_dev *dev) ++{ ++ return !!(dev->cfg.device_status & VIRTIO_CONFIG_S_DRIVER_OK); ++} ++ ++static inline bool ++virtio_vring_split_desc_is_indirect(struct vring_desc *desc) ++{ ++ return !!(desc->flags & VRING_DESC_F_INDIRECT); ++} ++ ++static inline bool ++virtio_vring_packed_desc_is_indirect(struct vring_packed_desc *desc) ++{ ++ return !!(desc->flags & VRING_DESC_F_INDIRECT); ++} ++ ++static inline bool ++virtio_vring_split_desc_is_wr(struct vring_desc *desc) ++{ ++ return !!(desc->flags & VRING_DESC_F_WRITE); ++} ++ ++static inline bool ++virtio_vring_packed_desc_is_wr(struct vring_packed_desc *desc) ++{ ++ return !!(desc->flags & VRING_DESC_F_WRITE); ++} ++ ++static inline bool ++virtio_vring_packed_is_avail(struct vring_packed_desc *desc, bool avail_phase) ++{ ++ bool avail_flag, used_flag; ++ uint16_t flags = desc->flags; ++ ++ avail_flag = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); ++ used_flag = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); ++ ++ /* To mark a desc as available, the driver sets the F_AVAIL bit in flags ++ * to match the internal avail wrap counter. It also sets the F_USED bit to ++ * match the inverse value but it's not mandatory. ++ */ ++ return (avail_flag != used_flag) && (avail_flag == avail_phase); ++} ++ ++static inline bool ++virtio_vring_packed_is_used(struct vring_packed_desc *desc, bool used_phase) ++{ ++ bool avail_flag, used_flag; ++ uint16_t flags = desc->flags; ++ ++ avail_flag = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); ++ used_flag = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); ++ ++ /* When the descriptor is used, two flags in descriptor ++ * avail flag and used flag are set to equal ++ * and used flag value == used_wrap_counter. 
++ */ ++ return (used_flag == avail_flag) && (used_flag == used_phase); ++} ++ ++static inline bool ++virtio_req_iov_is_wr(struct vfu_virtio_req *req, uint32_t iov_num) ++{ ++ assert(iov_num <= VIRTIO_DEV_MAX_IOVS); ++ return req->desc_writeable[iov_num]; ++} ++ ++static inline struct vfu_virtio_req * ++vfu_virtio_vq_alloc_req(struct vfu_virtio_endpoint *endpoint, struct vfu_virtio_vq *vq) ++{ ++ assert(endpoint->virtio_ops.alloc_req != NULL); ++ return endpoint->virtio_ops.alloc_req(endpoint, vq); ++} ++ ++static inline void ++vfu_virtio_vq_free_req(struct vfu_virtio_endpoint *endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req) ++{ ++ assert(endpoint->virtio_ops.free_req); ++ endpoint->virtio_ops.free_req(endpoint, vq, req); ++} ++ ++void virtio_vq_used_ring_split_enqueue(struct vfu_virtio_vq *vq, uint16_t req_idx, ++ uint32_t used_len); ++void virtio_vq_used_ring_packed_enqueue(struct vfu_virtio_vq *vq, uint16_t buffer_id, ++ uint32_t num_descs, uint32_t used_len); ++struct vfu_virtio_req *virito_dev_packed_ring_get_next_avail_req(struct vfu_virtio_dev *dev, ++ struct vfu_virtio_vq *vq); ++struct vfu_virtio_req *virito_dev_split_ring_get_next_avail_req(struct vfu_virtio_dev *dev, ++ struct vfu_virtio_vq *vq); ++ ++int vfu_virtio_quiesce_cb(struct spdk_vfu_endpoint *endpoint); ++ ++void vfu_virtio_dev_put_req(struct vfu_virtio_req *req); ++void vfu_virtio_finish_req(struct vfu_virtio_req *req); ++void vfu_virtio_vq_flush_irq(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); ++int vfu_virito_dev_process_packed_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); ++int vfu_virito_dev_process_split_ring(struct vfu_virtio_dev *dev, struct vfu_virtio_vq *vq); ++void vfu_virtio_notify_config(struct vfu_virtio_endpoint *virtio_endpoint); ++int vfu_virtio_endpoint_setup(struct vfu_virtio_endpoint *virtio_endpoint, ++ struct spdk_vfu_endpoint *endpoint, ++ char *basename, const char *endpoint_name, ++ struct vfu_virtio_ops *ops); ++int vfu_virtio_endpoint_destruct(struct vfu_virtio_endpoint *virtio_endpoint); ++void vfu_virtio_get_device_info(struct vfu_virtio_endpoint *virtio_endpoint, ++ struct spdk_vfu_pci_device *device_info); ++int vfu_virtio_attach_device(struct spdk_vfu_endpoint *endpoint); ++int vfu_virtio_detach_device(struct spdk_vfu_endpoint *endpoint); ++uint16_t vfu_virtio_get_vendor_capability(struct spdk_vfu_endpoint *endpoint, char *buf, ++ uint16_t buf_len, uint16_t idx); ++int vfu_virtio_post_memory_add(struct spdk_vfu_endpoint *endpoint, void *map_start, void *map_end); ++int vfu_virtio_pre_memory_remove(struct spdk_vfu_endpoint *endpoint, void *map_start, ++ void *map_end); ++int vfu_virtio_pci_reset_cb(struct spdk_vfu_endpoint *endpoint); ++int vfu_virtio_blk_add_bdev(const char *name, const char *bdev_name, ++ uint16_t num_queues, uint16_t qsize, bool packed_ring); ++/* virtio_scsi */ ++int vfu_virtio_scsi_add_target(const char *name, uint8_t scsi_target_num, ++ const char *bdev_name); ++int vfu_virtio_scsi_remove_target(const char *name, uint8_t scsi_target_num); ++int vfu_virtio_scsi_set_options(const char *name, uint16_t num_io_queues, uint16_t qsize, ++ bool packed_ring); ++#endif +diff --git a/module/vfu_device/vfu_virtio_rpc.c b/module/vfu_device/vfu_virtio_rpc.c +index 3179458..b51257e 100644 +--- a/module/vfu_device/vfu_virtio_rpc.c ++++ b/module/vfu_device/vfu_virtio_rpc.c +@@ -1,287 +1,287 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-#include "spdk/bdev.h" +-#include "spdk/log.h" +-#include "spdk/rpc.h" +-#include "spdk/env.h" +-#include "spdk/string.h" +-#include "spdk/util.h" +-#include "spdk/thread.h" +- +-#include "vfu_virtio_internal.h" +- +-struct rpc_delete_vfu_endpoint { +- char *name; +-}; +- +-static const struct spdk_json_object_decoder rpc_delete_vfu_endpoint_decode[] = { +- {"name", offsetof(struct rpc_delete_vfu_endpoint, name), spdk_json_decode_string } +-}; +- +-static void +-free_rpc_delete_vfu_endpoint(struct rpc_delete_vfu_endpoint *req) +-{ +- free(req->name); +-} +- +-static void +-rpc_vfu_virtio_delete_endpoint(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_delete_vfu_endpoint req = {0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_delete_vfu_endpoint_decode, +- SPDK_COUNTOF(rpc_delete_vfu_endpoint_decode), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = spdk_vfu_delete_endpoint(req.name); +- if (rc < 0) { +- goto invalid; +- } +- free_rpc_delete_vfu_endpoint(&req); +- +- spdk_jsonrpc_send_bool_response(request, true); +- return; +- +-invalid: +- free_rpc_delete_vfu_endpoint(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("vfu_virtio_delete_endpoint", rpc_vfu_virtio_delete_endpoint, +- SPDK_RPC_RUNTIME) +- +-struct rpc_vfu_virtio_create_blk { +- char *name; +- char *bdev_name; +- char *cpumask; +- uint16_t num_queues; +- uint16_t qsize; +- bool packed_ring; +-}; +- +-static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_create_blk[] = { +- {"name", offsetof(struct rpc_vfu_virtio_create_blk, name), spdk_json_decode_string }, +- {"bdev_name", offsetof(struct rpc_vfu_virtio_create_blk, bdev_name), spdk_json_decode_string }, +- {"cpumask", offsetof(struct rpc_vfu_virtio_create_blk, cpumask), spdk_json_decode_string, true}, +- {"num_queues", offsetof(struct rpc_vfu_virtio_create_blk, num_queues), spdk_json_decode_uint16, true }, +- {"qsize", offsetof(struct rpc_vfu_virtio_create_blk, qsize), spdk_json_decode_uint16, true }, +- {"packed_ring", offsetof(struct rpc_vfu_virtio_create_blk, packed_ring), spdk_json_decode_bool, true}, +-}; +- +-static void +-free_rpc_vfu_virtio_create_blk(struct rpc_vfu_virtio_create_blk *req) +-{ +- free(req->name); +- free(req->bdev_name); +- free(req->cpumask); +-} +- +-static void +-rpc_vfu_virtio_create_blk_endpoint(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_vfu_virtio_create_blk req = {0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_create_blk, +- SPDK_COUNTOF(rpc_construct_vfu_virtio_create_blk), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = spdk_vfu_create_endpoint(req.name, req.cpumask, "virtio_blk"); +- if (rc) { +- SPDK_ERRLOG("Failed to create virtio_blk endpoint\n"); +- goto invalid; +- } +- +- rc = vfu_virtio_blk_add_bdev(req.name, req.bdev_name, req.num_queues, req.qsize, +- req.packed_ring); +- if (rc < 0) { +- spdk_vfu_delete_endpoint(req.name); +- goto invalid; +- } +- free_rpc_vfu_virtio_create_blk(&req); +- +- spdk_jsonrpc_send_bool_response(request, true); +- return; +- +-invalid: +- free_rpc_vfu_virtio_create_blk(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-rc)); +-} 
+-SPDK_RPC_REGISTER("vfu_virtio_create_blk_endpoint", rpc_vfu_virtio_create_blk_endpoint, +- SPDK_RPC_RUNTIME) +- +-struct rpc_vfu_virtio_scsi { +- char *name; +- uint8_t scsi_target_num; +- char *bdev_name; +-}; +- +-static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_scsi[] = { +- {"name", offsetof(struct rpc_vfu_virtio_scsi, name), spdk_json_decode_string }, +- {"scsi_target_num", offsetof(struct rpc_vfu_virtio_scsi, scsi_target_num), spdk_json_decode_uint8 }, +- {"bdev_name", offsetof(struct rpc_vfu_virtio_scsi, bdev_name), spdk_json_decode_string }, +-}; +- +-static void +-free_rpc_vfu_virtio_scsi(struct rpc_vfu_virtio_scsi *req) +-{ +- free(req->name); +- free(req->bdev_name); +-} +- +-static void +-rpc_vfu_virtio_scsi_add_target(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_vfu_virtio_scsi req = {0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_scsi, +- SPDK_COUNTOF(rpc_construct_vfu_virtio_scsi), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = vfu_virtio_scsi_add_target(req.name, req.scsi_target_num, req.bdev_name);; +- if (rc < 0) { +- goto invalid; +- } +- +- free_rpc_vfu_virtio_scsi(&req); +- spdk_jsonrpc_send_bool_response(request, true); +- return; +- +-invalid: +- free_rpc_vfu_virtio_scsi(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("vfu_virtio_scsi_add_target", rpc_vfu_virtio_scsi_add_target, +- SPDK_RPC_RUNTIME) +- +-struct rpc_vfu_virtio_scsi_remove { +- char *name; +- uint8_t scsi_target_num; +-}; +- +-static const struct spdk_json_object_decoder rpc_remove_vfu_virtio_scsi_target[] = { +- {"name", offsetof(struct rpc_vfu_virtio_scsi_remove, name), spdk_json_decode_string }, +- {"scsi_target_num", offsetof(struct rpc_vfu_virtio_scsi_remove, scsi_target_num), spdk_json_decode_uint8 }, +-}; +- +-static void +-free_rpc_vfu_virtio_scsi_remove(struct rpc_vfu_virtio_scsi_remove *req) +-{ +- free(req->name); +-} +- +-static void +-rpc_vfu_virtio_scsi_remove_target(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_vfu_virtio_scsi_remove req = {0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_remove_vfu_virtio_scsi_target, +- SPDK_COUNTOF(rpc_remove_vfu_virtio_scsi_target), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = vfu_virtio_scsi_remove_target(req.name, req.scsi_target_num); +- if (rc < 0) { +- goto invalid; +- } +- +- free_rpc_vfu_virtio_scsi_remove(&req); +- spdk_jsonrpc_send_bool_response(request, true); +- return; +- +-invalid: +- free_rpc_vfu_virtio_scsi_remove(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("vfu_virtio_scsi_remove_target", rpc_vfu_virtio_scsi_remove_target, +- SPDK_RPC_RUNTIME) +- +-struct rpc_vfu_virtio_create_scsi { +- char *name; +- char *cpumask; +- uint16_t num_io_queues; +- uint16_t qsize; +- bool packed_ring; +-}; +- +-static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_create_scsi[] = { +- {"name", offsetof(struct rpc_vfu_virtio_create_scsi, name), spdk_json_decode_string }, +- {"cpumask", offsetof(struct rpc_vfu_virtio_create_scsi, cpumask), spdk_json_decode_string, true}, +- {"num_io_queues", offsetof(struct rpc_vfu_virtio_create_scsi, num_io_queues), 
spdk_json_decode_uint16, true }, +- {"qsize", offsetof(struct rpc_vfu_virtio_create_scsi, qsize), spdk_json_decode_uint16, true }, +- {"packed_ring", offsetof(struct rpc_vfu_virtio_create_scsi, packed_ring), spdk_json_decode_bool, true}, +-}; +- +-static void +-free_rpc_vfu_virtio_create_scsi(struct rpc_vfu_virtio_create_scsi *req) +-{ +- free(req->name); +- free(req->cpumask); +-} +- +-static void +-rpc_vfu_virtio_create_scsi_endpoint(struct spdk_jsonrpc_request *request, +- const struct spdk_json_val *params) +-{ +- struct rpc_vfu_virtio_create_scsi req = {0}; +- int rc; +- +- if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_create_scsi, +- SPDK_COUNTOF(rpc_construct_vfu_virtio_create_scsi), +- &req)) { +- SPDK_ERRLOG("spdk_json_decode_object failed\n"); +- rc = -EINVAL; +- goto invalid; +- } +- +- rc = spdk_vfu_create_endpoint(req.name, req.cpumask, "virtio_scsi"); +- if (rc) { +- SPDK_ERRLOG("Failed to create virtio_blk endpoint\n"); +- goto invalid; +- } +- +- rc = vfu_virtio_scsi_set_options(req.name, req.num_io_queues, req.qsize, req.packed_ring); +- if (rc < 0) { +- spdk_vfu_delete_endpoint(req.name); +- goto invalid; +- } +- free_rpc_vfu_virtio_create_scsi(&req); +- +- spdk_jsonrpc_send_bool_response(request, true); +- return; +- +-invalid: +- free_rpc_vfu_virtio_create_scsi(&req); +- spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, +- spdk_strerror(-rc)); +-} +-SPDK_RPC_REGISTER("vfu_virtio_create_scsi_endpoint", rpc_vfu_virtio_create_scsi_endpoint, +- SPDK_RPC_RUNTIME) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++#include "spdk/bdev.h" ++#include "spdk/log.h" ++#include "spdk/rpc.h" ++#include "spdk/env.h" ++#include "spdk/string.h" ++#include "spdk/util.h" ++#include "spdk/thread.h" ++ ++#include "vfu_virtio_internal.h" ++ ++struct rpc_delete_vfu_endpoint { ++ char *name; ++}; ++ ++static const struct spdk_json_object_decoder rpc_delete_vfu_endpoint_decode[] = { ++ {"name", offsetof(struct rpc_delete_vfu_endpoint, name), spdk_json_decode_string } ++}; ++ ++static void ++free_rpc_delete_vfu_endpoint(struct rpc_delete_vfu_endpoint *req) ++{ ++ free(req->name); ++} ++ ++static void ++rpc_vfu_virtio_delete_endpoint(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_delete_vfu_endpoint req = {0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_delete_vfu_endpoint_decode, ++ SPDK_COUNTOF(rpc_delete_vfu_endpoint_decode), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_vfu_delete_endpoint(req.name); ++ if (rc < 0) { ++ goto invalid; ++ } ++ free_rpc_delete_vfu_endpoint(&req); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_delete_vfu_endpoint(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("vfu_virtio_delete_endpoint", rpc_vfu_virtio_delete_endpoint, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_vfu_virtio_create_blk { ++ char *name; ++ char *bdev_name; ++ char *cpumask; ++ uint16_t num_queues; ++ uint16_t qsize; ++ bool packed_ring; ++}; ++ ++static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_create_blk[] = { ++ {"name", offsetof(struct rpc_vfu_virtio_create_blk, name), spdk_json_decode_string }, ++ {"bdev_name", offsetof(struct rpc_vfu_virtio_create_blk, bdev_name), spdk_json_decode_string }, ++ {"cpumask", 
offsetof(struct rpc_vfu_virtio_create_blk, cpumask), spdk_json_decode_string, true}, ++ {"num_queues", offsetof(struct rpc_vfu_virtio_create_blk, num_queues), spdk_json_decode_uint16, true }, ++ {"qsize", offsetof(struct rpc_vfu_virtio_create_blk, qsize), spdk_json_decode_uint16, true }, ++ {"packed_ring", offsetof(struct rpc_vfu_virtio_create_blk, packed_ring), spdk_json_decode_bool, true}, ++}; ++ ++static void ++free_rpc_vfu_virtio_create_blk(struct rpc_vfu_virtio_create_blk *req) ++{ ++ free(req->name); ++ free(req->bdev_name); ++ free(req->cpumask); ++} ++ ++static void ++rpc_vfu_virtio_create_blk_endpoint(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_vfu_virtio_create_blk req = {0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_create_blk, ++ SPDK_COUNTOF(rpc_construct_vfu_virtio_create_blk), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_vfu_create_endpoint(req.name, req.cpumask, "virtio_blk"); ++ if (rc) { ++ SPDK_ERRLOG("Failed to create virtio_blk endpoint\n"); ++ goto invalid; ++ } ++ ++ rc = vfu_virtio_blk_add_bdev(req.name, req.bdev_name, req.num_queues, req.qsize, ++ req.packed_ring); ++ if (rc < 0) { ++ spdk_vfu_delete_endpoint(req.name); ++ goto invalid; ++ } ++ free_rpc_vfu_virtio_create_blk(&req); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_vfu_virtio_create_blk(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("vfu_virtio_create_blk_endpoint", rpc_vfu_virtio_create_blk_endpoint, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_vfu_virtio_scsi { ++ char *name; ++ uint8_t scsi_target_num; ++ char *bdev_name; ++}; ++ ++static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_scsi[] = { ++ {"name", offsetof(struct rpc_vfu_virtio_scsi, name), spdk_json_decode_string }, ++ {"scsi_target_num", offsetof(struct rpc_vfu_virtio_scsi, scsi_target_num), spdk_json_decode_uint8 }, ++ {"bdev_name", offsetof(struct rpc_vfu_virtio_scsi, bdev_name), spdk_json_decode_string }, ++}; ++ ++static void ++free_rpc_vfu_virtio_scsi(struct rpc_vfu_virtio_scsi *req) ++{ ++ free(req->name); ++ free(req->bdev_name); ++} ++ ++static void ++rpc_vfu_virtio_scsi_add_target(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_vfu_virtio_scsi req = {0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_scsi, ++ SPDK_COUNTOF(rpc_construct_vfu_virtio_scsi), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = vfu_virtio_scsi_add_target(req.name, req.scsi_target_num, req.bdev_name);; ++ if (rc < 0) { ++ goto invalid; ++ } ++ ++ free_rpc_vfu_virtio_scsi(&req); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_vfu_virtio_scsi(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("vfu_virtio_scsi_add_target", rpc_vfu_virtio_scsi_add_target, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_vfu_virtio_scsi_remove { ++ char *name; ++ uint8_t scsi_target_num; ++}; ++ ++static const struct spdk_json_object_decoder rpc_remove_vfu_virtio_scsi_target[] = { ++ {"name", offsetof(struct rpc_vfu_virtio_scsi_remove, name), spdk_json_decode_string }, ++ {"scsi_target_num", offsetof(struct 
rpc_vfu_virtio_scsi_remove, scsi_target_num), spdk_json_decode_uint8 }, ++}; ++ ++static void ++free_rpc_vfu_virtio_scsi_remove(struct rpc_vfu_virtio_scsi_remove *req) ++{ ++ free(req->name); ++} ++ ++static void ++rpc_vfu_virtio_scsi_remove_target(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_vfu_virtio_scsi_remove req = {0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_remove_vfu_virtio_scsi_target, ++ SPDK_COUNTOF(rpc_remove_vfu_virtio_scsi_target), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = vfu_virtio_scsi_remove_target(req.name, req.scsi_target_num); ++ if (rc < 0) { ++ goto invalid; ++ } ++ ++ free_rpc_vfu_virtio_scsi_remove(&req); ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_vfu_virtio_scsi_remove(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("vfu_virtio_scsi_remove_target", rpc_vfu_virtio_scsi_remove_target, ++ SPDK_RPC_RUNTIME) ++ ++struct rpc_vfu_virtio_create_scsi { ++ char *name; ++ char *cpumask; ++ uint16_t num_io_queues; ++ uint16_t qsize; ++ bool packed_ring; ++}; ++ ++static const struct spdk_json_object_decoder rpc_construct_vfu_virtio_create_scsi[] = { ++ {"name", offsetof(struct rpc_vfu_virtio_create_scsi, name), spdk_json_decode_string }, ++ {"cpumask", offsetof(struct rpc_vfu_virtio_create_scsi, cpumask), spdk_json_decode_string, true}, ++ {"num_io_queues", offsetof(struct rpc_vfu_virtio_create_scsi, num_io_queues), spdk_json_decode_uint16, true }, ++ {"qsize", offsetof(struct rpc_vfu_virtio_create_scsi, qsize), spdk_json_decode_uint16, true }, ++ {"packed_ring", offsetof(struct rpc_vfu_virtio_create_scsi, packed_ring), spdk_json_decode_bool, true}, ++}; ++ ++static void ++free_rpc_vfu_virtio_create_scsi(struct rpc_vfu_virtio_create_scsi *req) ++{ ++ free(req->name); ++ free(req->cpumask); ++} ++ ++static void ++rpc_vfu_virtio_create_scsi_endpoint(struct spdk_jsonrpc_request *request, ++ const struct spdk_json_val *params) ++{ ++ struct rpc_vfu_virtio_create_scsi req = {0}; ++ int rc; ++ ++ if (spdk_json_decode_object(params, rpc_construct_vfu_virtio_create_scsi, ++ SPDK_COUNTOF(rpc_construct_vfu_virtio_create_scsi), ++ &req)) { ++ SPDK_ERRLOG("spdk_json_decode_object failed\n"); ++ rc = -EINVAL; ++ goto invalid; ++ } ++ ++ rc = spdk_vfu_create_endpoint(req.name, req.cpumask, "virtio_scsi"); ++ if (rc) { ++ SPDK_ERRLOG("Failed to create virtio_blk endpoint\n"); ++ goto invalid; ++ } ++ ++ rc = vfu_virtio_scsi_set_options(req.name, req.num_io_queues, req.qsize, req.packed_ring); ++ if (rc < 0) { ++ spdk_vfu_delete_endpoint(req.name); ++ goto invalid; ++ } ++ free_rpc_vfu_virtio_create_scsi(&req); ++ ++ spdk_jsonrpc_send_bool_response(request, true); ++ return; ++ ++invalid: ++ free_rpc_vfu_virtio_create_scsi(&req); ++ spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, ++ spdk_strerror(-rc)); ++} ++SPDK_RPC_REGISTER("vfu_virtio_create_scsi_endpoint", rpc_vfu_virtio_create_scsi_endpoint, ++ SPDK_RPC_RUNTIME) +diff --git a/module/vfu_device/vfu_virtio_scsi.c b/module/vfu_device/vfu_virtio_scsi.c +index 727bf3d..3124a7d 100644 +--- a/module/vfu_device/vfu_virtio_scsi.c ++++ b/module/vfu_device/vfu_virtio_scsi.c +@@ -1,1037 +1,1037 @@ +-/* SPDX-License-Identifier: BSD-3-Clause +- * Copyright (C) 2022 Intel Corporation. +- * All rights reserved. 
+- */ +- +-/* +- * virtio-scsi over vfio-user transport +- */ +-#include +- +-#include "spdk/stdinc.h" +-#include "spdk/env.h" +-#include "spdk/bdev.h" +-#include "spdk/bdev_module.h" +-#include "spdk/assert.h" +-#include "spdk/barrier.h" +-#include "spdk/thread.h" +-#include "spdk/memory.h" +-#include "spdk/util.h" +-#include "spdk/log.h" +-#include "spdk/string.h" +-#include "spdk/likely.h" +-#include "spdk/scsi.h" +-#include "spdk/scsi_spec.h" +-#include "spdk/pci_ids.h" +- +-#include "vfu_virtio_internal.h" +- +-#define VIRTIO_SCSI_SUPPORTED_FEATURES ((1ULL << VIRTIO_SCSI_F_INOUT) | \ +- (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ +- (1ULL << VIRTIO_SCSI_F_CHANGE)) +- +-#define VIRTIO_SCSI_CTRLR_MAX_TARGETS (8) +- +-struct virtio_scsi_target { +- struct spdk_scsi_dev *dev; +-}; +- +-struct virtio_scsi_endpoint { +- struct vfu_virtio_endpoint virtio; +- +- struct virtio_scsi_config scsi_cfg; +- /* virtio_scsi specific configurations */ +- struct virtio_scsi_target targets[VIRTIO_SCSI_CTRLR_MAX_TARGETS]; +- /* virtio_scsi SCSI task and IO ring process poller */ +- struct spdk_poller *ring_poller; +-}; +- +-struct virtio_scsi_req { +- struct spdk_scsi_task scsi; +- union { +- struct virtio_scsi_cmd_req *cmd_req; +- struct virtio_scsi_ctrl_tmf_req *tmf_req; +- }; +- union { +- struct virtio_scsi_cmd_resp *cmd_resp; +- struct virtio_scsi_ctrl_tmf_resp *tmf_resp; +- }; +- struct virtio_scsi_endpoint *endpoint; +- /* KEEP req at last */ +- struct vfu_virtio_req req; +-}; +- +-static inline struct virtio_scsi_endpoint * +-to_scsi_endpoint(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- return SPDK_CONTAINEROF(virtio_endpoint, struct virtio_scsi_endpoint, virtio); +-} +- +-static inline struct virtio_scsi_req * +-to_scsi_request(struct vfu_virtio_req *request) +-{ +- return SPDK_CONTAINEROF(request, struct virtio_scsi_req, req); +-} +- +-static void +-virtio_scsi_req_finish(struct virtio_scsi_req *scsi_req) +-{ +- struct vfu_virtio_req *req = &scsi_req->req; +- +- vfu_virtio_finish_req(req); +-} +- +-static int +-vfu_virtio_scsi_vring_poll(void *ctx) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = ctx; +- struct vfu_virtio_dev *dev = scsi_endpoint->virtio.dev; +- struct vfu_virtio_vq *vq; +- uint32_t i, count = 0; +- +- if (spdk_unlikely(!virtio_dev_is_started(dev))) { +- return SPDK_POLLER_IDLE; +- } +- +- if (spdk_unlikely(scsi_endpoint->virtio.quiesce_in_progress)) { +- return SPDK_POLLER_IDLE; +- } +- +- /* We don't process event queue here */ +- for (i = 0; i < dev->num_queues; i++) { +- if (i == 1) { +- continue; +- } +- +- vq = &dev->vqs[i]; +- if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { +- continue; +- } +- +- vfu_virtio_vq_flush_irq(dev, vq); +- +- if (vq->packed.packed_ring) { +- /* packed vring */ +- count += vfu_virito_dev_process_packed_ring(dev, vq); +- } else { +- /* split vring */ +- count += vfu_virito_dev_process_split_ring(dev, vq); +- } +- } +- +- return count ? 
SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; +-} +- +-static void +-vfu_virtio_scsi_eventq_enqueue(struct virtio_scsi_endpoint *scsi_endpoint, uint8_t scsi_target_num, +- uint32_t event, uint32_t reason) +-{ +- struct vfu_virtio_dev *dev = scsi_endpoint->virtio.dev; +- struct vfu_virtio_req *req = NULL; +- struct virtio_scsi_req *scsi_req; +- struct virtio_scsi_event *desc_ev; +- struct vfu_virtio_vq *vq; +- +- assert(dev != NULL); +- +- if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- return; +- } +- +- if (spdk_unlikely(scsi_endpoint->virtio.quiesce_in_progress)) { +- return; +- } +- +- /* event queue */ +- vq = &dev->vqs[1]; +- if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { +- return; +- } +- +- if (vq->packed.packed_ring) { +- /* packed vring */ +- req = virito_dev_packed_ring_get_next_avail_req(dev, vq); +- } else { +- /* split vring */ +- req = virito_dev_split_ring_get_next_avail_req(dev, vq); +- } +- +- if (!req) { +- return; +- } +- scsi_req = to_scsi_request(req); +- scsi_req->endpoint = scsi_endpoint; +- /* add 1 for scsi event */ +- scsi_endpoint->virtio.io_outstanding++; +- +- assert(req->iovcnt == 1); +- assert(req->iovs[0].iov_len == sizeof(struct virtio_scsi_event)); +- desc_ev = req->iovs[0].iov_base; +- +- desc_ev->event = event; +- desc_ev->lun[0] = 1; +- desc_ev->lun[1] = scsi_target_num; +- /* virtio LUN id 0 can refer either to the entire device +- * or actual LUN 0 (the only supported by vhost for now) +- */ +- desc_ev->lun[2] = 0 >> 8; +- desc_ev->lun[3] = 0 & 0xFF; +- /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) +- * current implementation relies on linux kernel sources +- */ +- memset(&desc_ev->lun[4], 0, 4); +- desc_ev->reason = reason; +- +- req->used_len = sizeof(*desc_ev); +- +- SPDK_DEBUGLOG(vfu_virtio_scsi, "%s: SCSI Target Num %u, Desc %p, Event %u, Reason %u\n", +- spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), scsi_target_num, desc_ev, event, +- reason); +- +- virtio_scsi_req_finish(scsi_req); +- vfu_virtio_vq_flush_irq(dev, vq); +-} +- +-static int +-virtio_scsi_start(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- struct virtio_scsi_target *scsi_target; +- uint8_t i; +- int ret; +- +- if (scsi_endpoint->ring_poller) { +- return 0; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_scsi, "starting %s\n", +- spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint)); +- +- for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { +- scsi_target = &scsi_endpoint->targets[i]; +- if (scsi_target->dev) { +- ret = spdk_scsi_dev_allocate_io_channels(scsi_target->dev); +- if (ret) { +- SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n", +- spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), i); +- continue; +- } +- } +- } +- +- scsi_endpoint->ring_poller = SPDK_POLLER_REGISTER(vfu_virtio_scsi_vring_poll, scsi_endpoint, +- 0); +- +- return 0; +-} +- +-static int +-virtio_scsi_stop(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- struct virtio_scsi_target *scsi_target; +- uint8_t i; +- +- SPDK_DEBUGLOG(vfu_virtio_scsi, "stopping %s\n", +- spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint)); +- +- spdk_poller_unregister(&scsi_endpoint->ring_poller); +- +- for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { +- scsi_target = &scsi_endpoint->targets[i]; +- if (scsi_target->dev) { +- spdk_scsi_dev_free_io_channels(scsi_target->dev); +- 
} +- } +- +- return 0; +-} +- +-static void +-virtio_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +-{ +- struct virtio_scsi_req *scsi_req = SPDK_CONTAINEROF(scsi_task, struct virtio_scsi_req, scsi); +- +- scsi_req->cmd_resp->status = scsi_task->status; +- if (scsi_task->status != SPDK_SCSI_STATUS_GOOD) { +- scsi_req->cmd_resp->sense_len = scsi_task->sense_data_len; +- memcpy(scsi_req->cmd_resp->sense, scsi_task->sense_data, scsi_task->sense_data_len); +- } +- assert(scsi_task->transfer_len == scsi_task->length); +- scsi_req->cmd_resp->resid = scsi_task->length - scsi_task->data_transferred; +- +- virtio_scsi_req_finish(scsi_req); +- spdk_scsi_task_put(scsi_task); +-} +- +-static void +-virtio_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +-{ +- struct virtio_scsi_req *scsi_req = SPDK_CONTAINEROF(scsi_task, struct virtio_scsi_req, scsi); +- +- virtio_scsi_req_finish(scsi_req); +- spdk_scsi_task_put(scsi_task); +-} +- +-static void +-virtio_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +-{ +- +-} +- +-static struct virtio_scsi_target * +-virtio_scsi_cmd_lun_setup(struct virtio_scsi_endpoint *scsi_endpoint, +- struct virtio_scsi_req *scsi_req, __u8 *lun) +-{ +- struct virtio_scsi_target *scsi_target; +- uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; +- +- SPDK_LOGDUMP(vfu_virtio_scsi_data, "LUN", lun, 8); +- +- /* First byte must be 1 and second is target */ +- if (lun[0] != 1 || lun[1] >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- SPDK_DEBUGLOG(vfu_virtio_scsi, "Invalid LUN %u:%u\n", lun[0], lun[1]); +- return NULL; +- } +- +- scsi_target = &scsi_endpoint->targets[lun[1]]; +- if (!scsi_target->dev) { +- SPDK_DEBUGLOG(vfu_virtio_scsi, "SCSI Target num %u doesn't exist\n", lun[1]); +- return NULL; +- } +- +- scsi_req->scsi.target_port = spdk_scsi_dev_find_port_by_id(scsi_target->dev, 0); +- scsi_req->scsi.lun = spdk_scsi_dev_get_lun(scsi_target->dev, lun_id); +- if (scsi_req->scsi.lun == NULL) { +- SPDK_DEBUGLOG(vfu_virtio_scsi, "LUN %u:%u doesn't exist\n", lun[0], lun[1]); +- return NULL; +- } +- SPDK_DEBUGLOG(vfu_virtio_scsi, "Got valid SCSI Target num %u, bdev %s\n", lun[1], +- spdk_scsi_lun_get_bdev_name(scsi_req->scsi.lun)); +- +- return scsi_target; +-} +- +-static int +-virtio_scsi_cmd_data_setup(struct virtio_scsi_req *scsi_req) +-{ +- struct iovec *iov; +- uint32_t iovcnt; +- uint32_t payload_len; +- +- iov = &scsi_req->req.iovs[0]; +- iovcnt = scsi_req->req.iovcnt; +- payload_len = scsi_req->req.payload_size; +- +- if (spdk_unlikely(iov->iov_len < sizeof(struct virtio_scsi_cmd_req))) { +- SPDK_ERRLOG("Invalid virtio_scsi command header length"); +- return -EINVAL; +- } +- if (spdk_unlikely(iovcnt < 2)) { +- SPDK_ERRLOG("Invalid iovcnt %u\n", iovcnt); +- return -EINVAL; +- } +- +- scsi_req->cmd_req = scsi_req->req.iovs[0].iov_base; +- payload_len -= scsi_req->req.iovs[0].iov_len; +- +- /* +- * FROM_DEV (READ): [RO_req][WR_resp][WR_buf0]...[WR_bufN] +- * TO_DEV (WRITE): [RO_req][RO_buf0]...[RO_bufN][WR_resp] +- */ +- if (virtio_req_iov_is_wr(&scsi_req->req, 1)) { +- scsi_req->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV; +- } else { +- scsi_req->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV; +- } +- +- if (scsi_req->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { +- if (scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_cmd_resp)) { +- SPDK_ERRLOG("DIR_FROM_DEV: Invalid virtio_scsi command resp length"); +- return -EINVAL; +- } +- scsi_req->cmd_resp = scsi_req->req.iovs[1].iov_base; +- scsi_req->req.used_len = payload_len; +- scsi_req->scsi.iovs = &scsi_req->req.iovs[2]; 
+- } else { +- if (scsi_req->req.iovs[iovcnt - 1].iov_len < sizeof(struct virtio_scsi_cmd_resp)) { +- SPDK_ERRLOG("DIR_TO_DEV: Invalid virtio_scsi command resp length"); +- return -EINVAL; +- } +- scsi_req->req.used_len = sizeof(struct virtio_scsi_cmd_resp); +- scsi_req->cmd_resp = scsi_req->req.iovs[iovcnt - 1].iov_base; +- scsi_req->scsi.iovs = &scsi_req->req.iovs[1]; +- } +- +- /* -2 for REQ and RESP */ +- iovcnt -= 2; +- if (!iovcnt) { +- scsi_req->scsi.length = 0; +- scsi_req->scsi.transfer_len = 0; +- scsi_req->scsi.iovs[0].iov_len = 0; +- } else { +- assert(payload_len > sizeof(struct virtio_scsi_cmd_resp)); +- payload_len -= sizeof(struct virtio_scsi_cmd_resp); +- scsi_req->scsi.length = payload_len; +- scsi_req->scsi.transfer_len = payload_len; +- } +- scsi_req->scsi.iovcnt = iovcnt; +- scsi_req->scsi.cdb = scsi_req->cmd_req->cdb; +- scsi_req->cmd_resp->response = VIRTIO_SCSI_S_OK; +- +- SPDK_LOGDUMP(vfu_virtio_scsi_data, "CDB=", scsi_req->cmd_req->cdb, VIRTIO_SCSI_CDB_SIZE); +- SPDK_DEBUGLOG(vfu_virtio_scsi, "%s, iovcnt %u, transfer_len %u, used len %u\n", +- scsi_req->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV ? "XFER_FROM_DEV" : "XFER_TO_DEV", +- scsi_req->scsi.iovcnt, payload_len, scsi_req->req.used_len); +- +- return 0; +-} +- +-static int +-virtio_scsi_tmf_cmd_req(struct virtio_scsi_endpoint *scsi_endpoint, +- struct virtio_scsi_req *scsi_req) +-{ +- uint32_t iovcnt; +- struct iovec *iov; +- struct virtio_scsi_ctrl_tmf_req *tmf_req; +- struct virtio_scsi_target *scsi_target; +- +- iov = &scsi_req->req.iovs[0]; +- iovcnt = scsi_req->req.iovcnt; +- tmf_req = iov->iov_base; +- if (spdk_unlikely(iovcnt < 2)) { +- SPDK_ERRLOG("Invalid iovcnt %u\n", iovcnt); +- goto invalid; +- } +- +- memset(&scsi_req->scsi, 0, sizeof(struct spdk_scsi_task)); +- spdk_scsi_task_construct(&scsi_req->scsi, virtio_scsi_task_mgmt_cpl, virtio_scsi_task_free_cb); +- +- switch (tmf_req->type) { +- case VIRTIO_SCSI_T_TMF: +- if (scsi_req->req.iovs[0].iov_len < sizeof(struct virtio_scsi_ctrl_tmf_req) || +- scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_ctrl_tmf_resp)) { +- SPDK_ERRLOG("Invalid size of tmf_req or tmf_resp\n"); +- goto invalid; +- } +- scsi_req->tmf_req = tmf_req; +- scsi_req->tmf_resp = scsi_req->req.iovs[1].iov_base; +- switch (tmf_req->subtype) { +- case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: +- scsi_target = virtio_scsi_cmd_lun_setup(scsi_endpoint, scsi_req, scsi_req->tmf_req->lun); +- if (!scsi_target) { +- scsi_req->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; +- break; +- } +- /* Management task submission */ +- scsi_req->tmf_resp->response = VIRTIO_SCSI_S_OK; +- scsi_req->scsi.function = SPDK_SCSI_TASK_FUNC_LUN_RESET; +- spdk_scsi_dev_queue_mgmt_task(scsi_target->dev, &scsi_req->scsi); +- return 0; +- break; +- default: +- scsi_req->tmf_resp->response = VIRTIO_SCSI_S_FUNCTION_REJECTED; +- break; +- } +- break; +- +- case VIRTIO_SCSI_T_AN_QUERY: +- case VIRTIO_SCSI_T_AN_SUBSCRIBE: +- if (scsi_req->req.iovs[0].iov_len < sizeof(struct virtio_scsi_ctrl_an_req) || +- scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_ctrl_an_resp)) { +- SPDK_ERRLOG("Invalid size of tmf_req or tmf_resp\n"); +- goto invalid; +- } +- scsi_req->req.used_len = sizeof(struct virtio_scsi_ctrl_an_resp); +- /* Do nothing to response byte of virtio_scsi_ctrl_an_resp */ +- goto invalid; +- break; +- default: +- break; +- } +- +-invalid: +- /* invalid request */ +- virtio_scsi_req_finish(scsi_req); +- return -1; +-} +- +-static int +-virtio_scsi_cmd_req(struct virtio_scsi_endpoint *scsi_endpoint, 
struct virtio_scsi_req *scsi_req) +-{ +- int ret; +- struct virtio_scsi_target *scsi_target; +- +- memset(&scsi_req->scsi, 0, sizeof(struct spdk_scsi_task)); +- spdk_scsi_task_construct(&scsi_req->scsi, virtio_scsi_task_cpl, virtio_scsi_task_free_cb); +- +- ret = virtio_scsi_cmd_data_setup(scsi_req); +- if (ret) { +- SPDK_ERRLOG("Error to setup SCSI command, ret %d\n", ret); +- goto invalid; +- } +- +- scsi_target = virtio_scsi_cmd_lun_setup(scsi_endpoint, scsi_req, scsi_req->cmd_req->lun); +- if (!scsi_target) { +- scsi_req->cmd_resp->response = VIRTIO_SCSI_S_BAD_TARGET; +- goto invalid; +- } +- +- spdk_scsi_dev_queue_task(scsi_target->dev, &scsi_req->scsi); +- return 0; +- +-invalid: +- /* invalid request */ +- virtio_scsi_req_finish(scsi_req); +- return ret; +-} +- +-static int +-virtio_scsi_process_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- struct virtio_scsi_req *scsi_req = to_scsi_request(req); +- +- scsi_req->endpoint = scsi_endpoint; +- +- /* SCSI task management command */ +- if (spdk_unlikely(vq->id == 0)) { +- return virtio_scsi_tmf_cmd_req(scsi_endpoint, scsi_req); +- } +- +- /* SCSI command */ +- return virtio_scsi_cmd_req(scsi_endpoint, scsi_req);; +-} +- +-static void +-virtio_scsi_update_config(struct virtio_scsi_endpoint *scsi_endpoint) +-{ +- struct virtio_scsi_config *scsi_cfg; +- +- if (!scsi_endpoint) { +- return; +- } +- +- scsi_cfg = &scsi_endpoint->scsi_cfg; +- +- scsi_cfg->num_queues = scsi_endpoint->virtio.num_queues; +- /* -2 for REQ and RESP and -1 for region boundary splitting */ +- scsi_cfg->seg_max = spdk_min(VIRTIO_DEV_MAX_IOVS - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1); +- /* we can set `max_sectors` and `cmd_per_lun` based on bdevs */ +- scsi_cfg->max_sectors = 131072; +- scsi_cfg->cmd_per_lun = scsi_endpoint->virtio.qsize; +- scsi_cfg->event_info_size = sizeof(struct virtio_scsi_event); +- scsi_cfg->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; +- scsi_cfg->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE; +- scsi_cfg->max_channel = 0; +- scsi_cfg->max_target = VIRTIO_SCSI_CTRLR_MAX_TARGETS; +- scsi_cfg->max_lun = 16383; +-} +- +-static uint64_t +-virtio_scsi_get_supported_features(struct vfu_virtio_endpoint *virtio_endpoint) +-{ +- uint64_t features; +- +- features = VIRTIO_SCSI_SUPPORTED_FEATURES | VIRTIO_HOST_SUPPORTED_FEATURES; +- +- if (!virtio_endpoint->packed_ring) { +- features &= ~(1ULL << VIRTIO_F_RING_PACKED); +- } +- +- return features; +-} +- +-static int +-virtio_scsi_get_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- uint64_t offset, uint64_t count) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- uint8_t *scsi_cfg; +- +- if ((offset + count) > sizeof(struct virtio_scsi_config)) { +- SPDK_ERRLOG("Invalid device specific configuration offset 0x%"PRIx64"\n", offset); +- return -EINVAL; +- } +- +- scsi_cfg = (uint8_t *)&scsi_endpoint->scsi_cfg; +- memcpy(buf, scsi_cfg + offset, count); +- +- return 0; +-} +- +-static int +-virtio_scsi_set_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, +- uint64_t offset, uint64_t count) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- uint32_t value; +- int ret = 0; +- +- if ((offset + count) > sizeof(struct virtio_scsi_config)) { +- SPDK_ERRLOG("Invalid device specific configuration offset 0x%"PRIx64"\n", offset); +- 
return -EINVAL; +- } +- +- switch (offset) { +- case offsetof(struct virtio_scsi_config, sense_size): +- value = *(uint32_t *)buf; +- if (scsi_endpoint->scsi_cfg.sense_size != value) { +- SPDK_ERRLOG("Sense data size set to %u\n", value); +- ret = -ENOTSUP; +- } +- break; +- case offsetof(struct virtio_scsi_config, cdb_size): +- value = *(uint32_t *)buf; +- if (scsi_endpoint->scsi_cfg.cdb_size != value) { +- SPDK_ERRLOG("CDB size set to %u\n", value); +- ret = -ENOTSUP; +- } +- break; +- default: +- SPDK_ERRLOG("Error offset %"PRIu64"\n", offset); +- ret = -EINVAL; +- break; +- } +- +- +- return ret; +-} +- +-static struct vfu_virtio_req * +-virtio_scsi_alloc_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) +-{ +- struct virtio_scsi_req *scsi_req; +- +- scsi_req = calloc(1, sizeof(*scsi_req) + dma_sg_size() * (VIRTIO_DEV_MAX_IOVS + 1)); +- if (!scsi_req) { +- return NULL; +- } +- +- return &scsi_req->req; +-} +- +-static void +-virtio_scsi_free_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, +- struct vfu_virtio_req *req) +-{ +- struct virtio_scsi_req *scsi_req = to_scsi_request(req); +- +- free(scsi_req); +-} +- +-struct vfu_virtio_ops virtio_scsi_ops = { +- .get_device_features = virtio_scsi_get_supported_features, +- .alloc_req = virtio_scsi_alloc_req, +- .free_req = virtio_scsi_free_req, +- .exec_request = virtio_scsi_process_req, +- .get_config = virtio_scsi_get_device_specific_config, +- .set_config = virtio_scsi_set_device_specific_config, +- .start_device = virtio_scsi_start, +- .stop_device = virtio_scsi_stop, +-}; +- +-int +-vfu_virtio_scsi_set_options(const char *name, uint16_t num_io_queues, uint16_t qsize, +- bool packed_ring) +-{ +- struct spdk_vfu_endpoint *endpoint; +- uint32_t num_queues; +- struct vfu_virtio_endpoint *virtio_endpoint; +- struct virtio_scsi_endpoint *scsi_endpoint; +- +- num_queues = num_io_queues + 2; +- +- endpoint = spdk_vfu_get_endpoint_by_name(name); +- if (!endpoint) { +- SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); +- return -ENOENT; +- } +- +- virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- if (virtio_endpoint->dev) { +- SPDK_ERRLOG("Options are not allowed to change in runtime\n"); +- return -EFAULT; +- } +- +- if ((num_queues > 2) && (num_queues <= VIRTIO_DEV_MAX_VQS)) { +- scsi_endpoint->virtio.num_queues = num_queues; +- } else { +- SPDK_NOTICELOG("Number of IO queue %u\n", VIRTIO_DEV_MAX_VQS - 2); +- scsi_endpoint->virtio.num_queues = VIRTIO_DEV_MAX_VQS; +- } +- +- if (qsize && qsize <= VIRTIO_VQ_MAX_SIZE) { +- scsi_endpoint->virtio.qsize = qsize; +- } else { +- SPDK_NOTICELOG("Use queue size %u\n", VIRTIO_VQ_DEFAULT_SIZE); +- scsi_endpoint->virtio.qsize = VIRTIO_VQ_DEFAULT_SIZE; +- } +- scsi_endpoint->virtio.packed_ring = packed_ring; +- +- SPDK_DEBUGLOG(vfu_virtio_scsi, "%s: num_queues %u, qsize %u, packed ring %s\n", +- spdk_vfu_get_endpoint_id(endpoint), +- scsi_endpoint->virtio.num_queues, scsi_endpoint->virtio.qsize, +- packed_ring ? 
"enabled" : "disabled"); +- +- virtio_scsi_update_config(scsi_endpoint); +- +- return 0; +-} +- +-struct virtio_scsi_event_ctx { +- struct virtio_scsi_endpoint *scsi_endpoint; +- struct virtio_scsi_target *scsi_target; +- uint8_t scsi_target_num; +-}; +- +-static uint8_t +-get_scsi_target_num_by_lun(struct virtio_scsi_endpoint *scsi_endpoint, +- const struct spdk_scsi_lun *lun) +-{ +- const struct spdk_scsi_dev *scsi_dev; +- struct virtio_scsi_target *scsi_target; +- uint8_t i; +- +- scsi_dev = spdk_scsi_lun_get_dev(lun); +- for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { +- scsi_target = &scsi_endpoint->targets[i]; +- if (scsi_target->dev == scsi_dev) { +- return i; +- } +- } +- +- return VIRTIO_SCSI_CTRLR_MAX_TARGETS; +-} +- +-static void +-vfu_virtio_scsi_lun_resize_msg(void *ctx) +-{ +- struct virtio_scsi_event_ctx *resize_ctx = ctx; +- struct virtio_scsi_endpoint *scsi_endpoint = resize_ctx->scsi_endpoint; +- uint8_t scsi_target_num = resize_ctx->scsi_target_num; +- +- free(resize_ctx); +- +- if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_CHANGE)) { +- vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, +- VIRTIO_SCSI_T_PARAM_CHANGE, 0x2a | (0x09 << 8)); +- } +-} +- +-static void +-vfu_virtio_scsi_lun_resize(const struct spdk_scsi_lun *lun, void *arg) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = arg; +- uint8_t scsi_target_num; +- struct virtio_scsi_event_ctx *ctx; +- +- scsi_target_num = get_scsi_target_num_by_lun(scsi_endpoint, lun); +- if (scsi_target_num == VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- return; +- } +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- SPDK_ERRLOG("Error to allocate hotplug ctx\n"); +- return; +- } +- ctx->scsi_endpoint = scsi_endpoint; +- ctx->scsi_target_num = scsi_target_num; +- +- spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_resize_msg, ctx); +-} +- +-static void +-vfu_virtio_scsi_lun_hotremove_msg(void *ctx) +-{ +- struct virtio_scsi_event_ctx *hotplug = ctx; +- struct virtio_scsi_endpoint *scsi_endpoint = hotplug->scsi_endpoint; +- struct virtio_scsi_target *scsi_target = hotplug->scsi_target; +- struct spdk_scsi_dev *scsi_dev = scsi_target->dev; +- uint8_t scsi_target_num = hotplug->scsi_target_num; +- +- free(hotplug); +- +- if (!scsi_dev) { +- return; +- } +- scsi_target->dev = NULL; +- spdk_scsi_dev_free_io_channels(scsi_dev); +- spdk_scsi_dev_destruct(scsi_dev, NULL, NULL); +- +- assert(scsi_endpoint->virtio.dev); +- if (!virtio_dev_is_started(scsi_endpoint->virtio.dev)) { +- return; +- } +- +- if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_HOTPLUG)) { +- SPDK_DEBUGLOG(vfu_virtio_scsi, "Target num %u, sending event\n", scsi_target_num); +- vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, +- VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); +- } +-} +- +-static void +-vfu_virtio_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint = arg; +- struct virtio_scsi_target *scsi_target; +- struct virtio_scsi_event_ctx *ctx; +- uint8_t scsi_target_num; +- +- if (!scsi_endpoint->virtio.dev) { +- return; +- } +- +- scsi_target_num = get_scsi_target_num_by_lun(scsi_endpoint, lun); +- if (scsi_target_num == VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- return; +- } +- scsi_target = &scsi_endpoint->targets[scsi_target_num]; +- if (!scsi_target->dev) { +- return; +- } +- +- SPDK_DEBUGLOG(vfu_virtio_scsi, "Removing bdev %s, Target num %u\n", +- spdk_scsi_lun_get_bdev_name(lun), 
scsi_target_num); +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- SPDK_ERRLOG("Error to allocate hotplug ctx\n"); +- return; +- } +- ctx->scsi_endpoint = scsi_endpoint; +- ctx->scsi_target = scsi_target; +- ctx->scsi_target_num = scsi_target_num; +- +- spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_hotremove_msg, ctx); +-} +- +-static void +-vfu_virtio_scsi_lun_hotplug_msg(void *ctx) +-{ +- struct virtio_scsi_event_ctx *hotplug = ctx; +- struct virtio_scsi_endpoint *scsi_endpoint = hotplug->scsi_endpoint; +- struct virtio_scsi_target *scsi_target = hotplug->scsi_target; +- uint8_t scsi_target_num = hotplug->scsi_target_num; +- int ret; +- +- free(hotplug); +- +- assert(scsi_endpoint->virtio.dev); +- if (!virtio_dev_is_started(scsi_endpoint->virtio.dev)) { +- return; +- } +- +- ret = spdk_scsi_dev_allocate_io_channels(scsi_target->dev); +- if (ret) { +- SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n", +- spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), scsi_target_num); +- return; +- } +- +- if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_HOTPLUG)) { +- vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, +- VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); +- } +-} +- +-int +-vfu_virtio_scsi_add_target(const char *name, uint8_t scsi_target_num, const char *bdev_name) +-{ +- struct spdk_vfu_endpoint *endpoint; +- struct vfu_virtio_endpoint *virtio_endpoint; +- struct virtio_scsi_endpoint *scsi_endpoint; +- struct virtio_scsi_target *scsi_target; +- char target_name[SPDK_SCSI_DEV_MAX_NAME]; +- int lun_id_list[1]; +- const char *bdev_names_list[1]; +- +- endpoint = spdk_vfu_get_endpoint_by_name(name); +- if (!endpoint) { +- SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); +- return -ENOENT; +- } +- virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- +- if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- SPDK_ERRLOG("Invalid SCSI target number, maximum SCSI target number is %u\n", +- VIRTIO_SCSI_CTRLR_MAX_TARGETS - 1); +- return -EINVAL; +- } +- scsi_target = &scsi_endpoint->targets[scsi_target_num]; +- if (scsi_target->dev) { +- SPDK_ERRLOG("SCSI Target %u is already occupied\n", scsi_target_num); +- return -EEXIST; +- } +- +- snprintf(target_name, sizeof(target_name), "Target %u", scsi_target_num); +- lun_id_list[0] = 0; +- bdev_names_list[0] = (char *)bdev_name; +- +- scsi_target->dev = spdk_scsi_dev_construct_ext(target_name, bdev_names_list, lun_id_list, 1, +- SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, +- vfu_virtio_scsi_lun_resize, scsi_endpoint, +- vfu_virtio_scsi_lun_hotremove, scsi_endpoint); +- if (!scsi_target->dev) { +- SPDK_ERRLOG("%s: couldn't create SCSI target %u via bdev %s\n", name, scsi_target_num, bdev_name); +- return -EFAULT; +- } +- spdk_scsi_dev_add_port(scsi_target->dev, 0, "vfu-virtio-scsi"); +- +- SPDK_NOTICELOG("%s: added SCSI target %u using bdev '%s'\n", name, scsi_target_num, bdev_name); +- virtio_scsi_update_config(scsi_endpoint); +- +- if (virtio_endpoint->dev) { +- struct virtio_scsi_event_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- SPDK_ERRLOG("Error to allocate hotplug ctx\n"); +- /* This isn't fatal, just skip hotplug notification */ +- } else { +- ctx->scsi_endpoint = scsi_endpoint; +- ctx->scsi_target = scsi_target; +- ctx->scsi_target_num = scsi_target_num; +- spdk_thread_send_msg(virtio_endpoint->thread, vfu_virtio_scsi_lun_hotplug_msg, ctx); +- } +- } +- +- return 
0; +-} +- +-int +-vfu_virtio_scsi_remove_target(const char *name, uint8_t scsi_target_num) +-{ +- struct spdk_vfu_endpoint *endpoint; +- struct vfu_virtio_endpoint *virtio_endpoint; +- struct virtio_scsi_endpoint *scsi_endpoint; +- struct virtio_scsi_target *scsi_target; +- +- endpoint = spdk_vfu_get_endpoint_by_name(name); +- if (!endpoint) { +- SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); +- return -ENOENT; +- } +- virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- +- if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { +- SPDK_ERRLOG("Invalid SCSI target number, maximum SCSI target number is %u\n", +- VIRTIO_SCSI_CTRLR_MAX_TARGETS - 1); +- return -EINVAL; +- } +- scsi_target = &scsi_endpoint->targets[scsi_target_num]; +- if (!scsi_target->dev) { +- SPDK_ERRLOG("SCSI Target %u doesn't exist\n", scsi_target_num); +- return -ENOENT; +- } +- +- SPDK_NOTICELOG("%s: Remove SCSI target num %u\n", name, scsi_target_num); +- +- if (virtio_endpoint->dev) { +- struct virtio_scsi_event_ctx *ctx; +- +- ctx = calloc(1, sizeof(*ctx)); +- if (!ctx) { +- SPDK_ERRLOG("Error to allocate hotplug ctx\n"); +- /* This isn't fatal, just skip hotplug notification */ +- } else { +- ctx->scsi_endpoint = scsi_endpoint; +- ctx->scsi_target = scsi_target; +- ctx->scsi_target_num = scsi_target_num; +- spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_hotremove_msg, ctx); +- } +- } else { +- spdk_scsi_dev_destruct(scsi_target->dev, NULL, NULL); +- scsi_target->dev = NULL; +- } +- +- return 0; +-} +- +-static int +-vfu_virtio_scsi_endpoint_destruct(struct spdk_vfu_endpoint *endpoint) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- struct virtio_scsi_target *scsi_target; +- uint8_t i; +- +- for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { +- scsi_target = &scsi_endpoint->targets[i]; +- if (scsi_target->dev) { +- spdk_scsi_dev_destruct(scsi_target->dev, NULL, NULL); +- } +- } +- +- vfu_virtio_endpoint_destruct(&scsi_endpoint->virtio); +- free(scsi_endpoint); +- +- return 0; +-} +- +-static void * +-vfu_virtio_scsi_endpoint_init(struct spdk_vfu_endpoint *endpoint, +- char *basename, const char *endpoint_name) +-{ +- struct virtio_scsi_endpoint *scsi_endpoint; +- int ret; +- +- scsi_endpoint = calloc(1, sizeof(*scsi_endpoint)); +- if (!scsi_endpoint) { +- return NULL; +- } +- +- ret = vfu_virtio_endpoint_setup(&scsi_endpoint->virtio, endpoint, basename, endpoint_name, +- &virtio_scsi_ops); +- if (ret) { +- SPDK_ERRLOG("Error to setup endpoint %s\n", endpoint_name); +- free(scsi_endpoint); +- return NULL; +- } +- +- virtio_scsi_update_config(scsi_endpoint); +- return (void *)&scsi_endpoint->virtio; +-} +- +-static int +-vfu_virtio_scsi_get_device_info(struct spdk_vfu_endpoint *endpoint, +- struct spdk_vfu_pci_device *device_info) +-{ +- struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); +- struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); +- +- vfu_virtio_get_device_info(&scsi_endpoint->virtio, device_info); +- /* Fill Device ID */ +- device_info->id.did = PCI_DEVICE_ID_VIRTIO_SCSI_MODERN; +- +- return 0; +-} +- +-struct spdk_vfu_endpoint_ops vfu_virtio_scsi_ops = { +- .name = "virtio_scsi", +- .init = vfu_virtio_scsi_endpoint_init, +- .get_device_info = vfu_virtio_scsi_get_device_info, +- .get_vendor_capability = 
vfu_virtio_get_vendor_capability, +- .post_memory_add = vfu_virtio_post_memory_add, +- .pre_memory_remove = vfu_virtio_pre_memory_remove, +- .reset_device = vfu_virtio_pci_reset_cb, +- .quiesce_device = vfu_virtio_quiesce_cb, +- .destruct = vfu_virtio_scsi_endpoint_destruct, +- .attach_device = vfu_virtio_attach_device, +- .detach_device = vfu_virtio_detach_device, +-}; +- +-static void +-__attribute__((constructor)) _vfu_virtio_scsi_pci_model_register(void) +-{ +- spdk_vfu_register_endpoint_ops(&vfu_virtio_scsi_ops); +-} +- +-SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_scsi) +-SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_scsi_data) ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) 2022 Intel Corporation. ++ * All rights reserved. ++ */ ++ ++/* ++ * virtio-scsi over vfio-user transport ++ */ ++#include ++ ++#include "spdk/stdinc.h" ++#include "spdk/env.h" ++#include "spdk/bdev.h" ++#include "spdk/bdev_module.h" ++#include "spdk/assert.h" ++#include "spdk/barrier.h" ++#include "spdk/thread.h" ++#include "spdk/memory.h" ++#include "spdk/util.h" ++#include "spdk/log.h" ++#include "spdk/string.h" ++#include "spdk/likely.h" ++#include "spdk/scsi.h" ++#include "spdk/scsi_spec.h" ++#include "spdk/pci_ids.h" ++ ++#include "vfu_virtio_internal.h" ++ ++#define VIRTIO_SCSI_SUPPORTED_FEATURES ((1ULL << VIRTIO_SCSI_F_INOUT) | \ ++ (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ ++ (1ULL << VIRTIO_SCSI_F_CHANGE)) ++ ++#define VIRTIO_SCSI_CTRLR_MAX_TARGETS (8) ++ ++struct virtio_scsi_target { ++ struct spdk_scsi_dev *dev; ++}; ++ ++struct virtio_scsi_endpoint { ++ struct vfu_virtio_endpoint virtio; ++ ++ struct virtio_scsi_config scsi_cfg; ++ /* virtio_scsi specific configurations */ ++ struct virtio_scsi_target targets[VIRTIO_SCSI_CTRLR_MAX_TARGETS]; ++ /* virtio_scsi SCSI task and IO ring process poller */ ++ struct spdk_poller *ring_poller; ++}; ++ ++struct virtio_scsi_req { ++ struct spdk_scsi_task scsi; ++ union { ++ struct virtio_scsi_cmd_req *cmd_req; ++ struct virtio_scsi_ctrl_tmf_req *tmf_req; ++ }; ++ union { ++ struct virtio_scsi_cmd_resp *cmd_resp; ++ struct virtio_scsi_ctrl_tmf_resp *tmf_resp; ++ }; ++ struct virtio_scsi_endpoint *endpoint; ++ /* KEEP req at last */ ++ struct vfu_virtio_req req; ++}; ++ ++static inline struct virtio_scsi_endpoint * ++to_scsi_endpoint(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ return SPDK_CONTAINEROF(virtio_endpoint, struct virtio_scsi_endpoint, virtio); ++} ++ ++static inline struct virtio_scsi_req * ++to_scsi_request(struct vfu_virtio_req *request) ++{ ++ return SPDK_CONTAINEROF(request, struct virtio_scsi_req, req); ++} ++ ++static void ++virtio_scsi_req_finish(struct virtio_scsi_req *scsi_req) ++{ ++ struct vfu_virtio_req *req = &scsi_req->req; ++ ++ vfu_virtio_finish_req(req); ++} ++ ++static int ++vfu_virtio_scsi_vring_poll(void *ctx) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = ctx; ++ struct vfu_virtio_dev *dev = scsi_endpoint->virtio.dev; ++ struct vfu_virtio_vq *vq; ++ uint32_t i, count = 0; ++ ++ if (spdk_unlikely(!virtio_dev_is_started(dev))) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ if (spdk_unlikely(scsi_endpoint->virtio.quiesce_in_progress)) { ++ return SPDK_POLLER_IDLE; ++ } ++ ++ /* We don't process event queue here */ ++ for (i = 0; i < dev->num_queues; i++) { ++ if (i == 1) { ++ continue; ++ } ++ ++ vq = &dev->vqs[i]; ++ if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { ++ continue; ++ } ++ ++ vfu_virtio_vq_flush_irq(dev, vq); ++ ++ if (vq->packed.packed_ring) { ++ /* packed vring */ ++ count += vfu_virito_dev_process_packed_ring(dev, 
vq); ++ } else { ++ /* split vring */ ++ count += vfu_virito_dev_process_split_ring(dev, vq); ++ } ++ } ++ ++ return count ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; ++} ++ ++static void ++vfu_virtio_scsi_eventq_enqueue(struct virtio_scsi_endpoint *scsi_endpoint, uint8_t scsi_target_num, ++ uint32_t event, uint32_t reason) ++{ ++ struct vfu_virtio_dev *dev = scsi_endpoint->virtio.dev; ++ struct vfu_virtio_req *req = NULL; ++ struct virtio_scsi_req *scsi_req; ++ struct virtio_scsi_event *desc_ev; ++ struct vfu_virtio_vq *vq; ++ ++ assert(dev != NULL); ++ ++ if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ return; ++ } ++ ++ if (spdk_unlikely(scsi_endpoint->virtio.quiesce_in_progress)) { ++ return; ++ } ++ ++ /* event queue */ ++ vq = &dev->vqs[1]; ++ if (!vq->enabled || vq->q_state != VFU_VQ_ACTIVE) { ++ return; ++ } ++ ++ if (vq->packed.packed_ring) { ++ /* packed vring */ ++ req = virito_dev_packed_ring_get_next_avail_req(dev, vq); ++ } else { ++ /* split vring */ ++ req = virito_dev_split_ring_get_next_avail_req(dev, vq); ++ } ++ ++ if (!req) { ++ return; ++ } ++ scsi_req = to_scsi_request(req); ++ scsi_req->endpoint = scsi_endpoint; ++ /* add 1 for scsi event */ ++ scsi_endpoint->virtio.io_outstanding++; ++ ++ assert(req->iovcnt == 1); ++ assert(req->iovs[0].iov_len == sizeof(struct virtio_scsi_event)); ++ desc_ev = req->iovs[0].iov_base; ++ ++ desc_ev->event = event; ++ desc_ev->lun[0] = 1; ++ desc_ev->lun[1] = scsi_target_num; ++ /* virtio LUN id 0 can refer either to the entire device ++ * or actual LUN 0 (the only supported by vhost for now) ++ */ ++ desc_ev->lun[2] = 0 >> 8; ++ desc_ev->lun[3] = 0 & 0xFF; ++ /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) ++ * current implementation relies on linux kernel sources ++ */ ++ memset(&desc_ev->lun[4], 0, 4); ++ desc_ev->reason = reason; ++ ++ req->used_len = sizeof(*desc_ev); ++ ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "%s: SCSI Target Num %u, Desc %p, Event %u, Reason %u\n", ++ spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), scsi_target_num, desc_ev, event, ++ reason); ++ ++ virtio_scsi_req_finish(scsi_req); ++ vfu_virtio_vq_flush_irq(dev, vq); ++} ++ ++static int ++virtio_scsi_start(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ struct virtio_scsi_target *scsi_target; ++ uint8_t i; ++ int ret; ++ ++ if (scsi_endpoint->ring_poller) { ++ return 0; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "starting %s\n", ++ spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint)); ++ ++ for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { ++ scsi_target = &scsi_endpoint->targets[i]; ++ if (scsi_target->dev) { ++ ret = spdk_scsi_dev_allocate_io_channels(scsi_target->dev); ++ if (ret) { ++ SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n", ++ spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), i); ++ continue; ++ } ++ } ++ } ++ ++ scsi_endpoint->ring_poller = SPDK_POLLER_REGISTER(vfu_virtio_scsi_vring_poll, scsi_endpoint, ++ 0); ++ ++ return 0; ++} ++ ++static int ++virtio_scsi_stop(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ struct virtio_scsi_target *scsi_target; ++ uint8_t i; ++ ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "stopping %s\n", ++ spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint)); ++ ++ spdk_poller_unregister(&scsi_endpoint->ring_poller); ++ ++ for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { ++ 
scsi_target = &scsi_endpoint->targets[i]; ++ if (scsi_target->dev) { ++ spdk_scsi_dev_free_io_channels(scsi_target->dev); ++ } ++ } ++ ++ return 0; ++} ++ ++static void ++virtio_scsi_task_cpl(struct spdk_scsi_task *scsi_task) ++{ ++ struct virtio_scsi_req *scsi_req = SPDK_CONTAINEROF(scsi_task, struct virtio_scsi_req, scsi); ++ ++ scsi_req->cmd_resp->status = scsi_task->status; ++ if (scsi_task->status != SPDK_SCSI_STATUS_GOOD) { ++ scsi_req->cmd_resp->sense_len = scsi_task->sense_data_len; ++ memcpy(scsi_req->cmd_resp->sense, scsi_task->sense_data, scsi_task->sense_data_len); ++ } ++ assert(scsi_task->transfer_len == scsi_task->length); ++ scsi_req->cmd_resp->resid = scsi_task->length - scsi_task->data_transferred; ++ ++ virtio_scsi_req_finish(scsi_req); ++ spdk_scsi_task_put(scsi_task); ++} ++ ++static void ++virtio_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) ++{ ++ struct virtio_scsi_req *scsi_req = SPDK_CONTAINEROF(scsi_task, struct virtio_scsi_req, scsi); ++ ++ virtio_scsi_req_finish(scsi_req); ++ spdk_scsi_task_put(scsi_task); ++} ++ ++static void ++virtio_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) ++{ ++ ++} ++ ++static struct virtio_scsi_target * ++virtio_scsi_cmd_lun_setup(struct virtio_scsi_endpoint *scsi_endpoint, ++ struct virtio_scsi_req *scsi_req, __u8 *lun) ++{ ++ struct virtio_scsi_target *scsi_target; ++ uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; ++ ++ SPDK_LOGDUMP(vfu_virtio_scsi_data, "LUN", lun, 8); ++ ++ /* First byte must be 1 and second is target */ ++ if (lun[0] != 1 || lun[1] >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "Invalid LUN %u:%u\n", lun[0], lun[1]); ++ return NULL; ++ } ++ ++ scsi_target = &scsi_endpoint->targets[lun[1]]; ++ if (!scsi_target->dev) { ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "SCSI Target num %u doesn't exist\n", lun[1]); ++ return NULL; ++ } ++ ++ scsi_req->scsi.target_port = spdk_scsi_dev_find_port_by_id(scsi_target->dev, 0); ++ scsi_req->scsi.lun = spdk_scsi_dev_get_lun(scsi_target->dev, lun_id); ++ if (scsi_req->scsi.lun == NULL) { ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "LUN %u:%u doesn't exist\n", lun[0], lun[1]); ++ return NULL; ++ } ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "Got valid SCSI Target num %u, bdev %s\n", lun[1], ++ spdk_scsi_lun_get_bdev_name(scsi_req->scsi.lun)); ++ ++ return scsi_target; ++} ++ ++static int ++virtio_scsi_cmd_data_setup(struct virtio_scsi_req *scsi_req) ++{ ++ struct iovec *iov; ++ uint32_t iovcnt; ++ uint32_t payload_len; ++ ++ iov = &scsi_req->req.iovs[0]; ++ iovcnt = scsi_req->req.iovcnt; ++ payload_len = scsi_req->req.payload_size; ++ ++ if (spdk_unlikely(iov->iov_len < sizeof(struct virtio_scsi_cmd_req))) { ++ SPDK_ERRLOG("Invalid virtio_scsi command header length"); ++ return -EINVAL; ++ } ++ if (spdk_unlikely(iovcnt < 2)) { ++ SPDK_ERRLOG("Invalid iovcnt %u\n", iovcnt); ++ return -EINVAL; ++ } ++ ++ scsi_req->cmd_req = scsi_req->req.iovs[0].iov_base; ++ payload_len -= scsi_req->req.iovs[0].iov_len; ++ ++ /* ++ * FROM_DEV (READ): [RO_req][WR_resp][WR_buf0]...[WR_bufN] ++ * TO_DEV (WRITE): [RO_req][RO_buf0]...[RO_bufN][WR_resp] ++ */ ++ if (virtio_req_iov_is_wr(&scsi_req->req, 1)) { ++ scsi_req->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV; ++ } else { ++ scsi_req->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV; ++ } ++ ++ if (scsi_req->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { ++ if (scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_cmd_resp)) { ++ SPDK_ERRLOG("DIR_FROM_DEV: Invalid virtio_scsi command resp length"); ++ return -EINVAL; ++ } ++ 
scsi_req->cmd_resp = scsi_req->req.iovs[1].iov_base; ++ scsi_req->req.used_len = payload_len; ++ scsi_req->scsi.iovs = &scsi_req->req.iovs[2]; ++ } else { ++ if (scsi_req->req.iovs[iovcnt - 1].iov_len < sizeof(struct virtio_scsi_cmd_resp)) { ++ SPDK_ERRLOG("DIR_TO_DEV: Invalid virtio_scsi command resp length"); ++ return -EINVAL; ++ } ++ scsi_req->req.used_len = sizeof(struct virtio_scsi_cmd_resp); ++ scsi_req->cmd_resp = scsi_req->req.iovs[iovcnt - 1].iov_base; ++ scsi_req->scsi.iovs = &scsi_req->req.iovs[1]; ++ } ++ ++ /* -2 for REQ and RESP */ ++ iovcnt -= 2; ++ if (!iovcnt) { ++ scsi_req->scsi.length = 0; ++ scsi_req->scsi.transfer_len = 0; ++ scsi_req->scsi.iovs[0].iov_len = 0; ++ } else { ++ assert(payload_len > sizeof(struct virtio_scsi_cmd_resp)); ++ payload_len -= sizeof(struct virtio_scsi_cmd_resp); ++ scsi_req->scsi.length = payload_len; ++ scsi_req->scsi.transfer_len = payload_len; ++ } ++ scsi_req->scsi.iovcnt = iovcnt; ++ scsi_req->scsi.cdb = scsi_req->cmd_req->cdb; ++ scsi_req->cmd_resp->response = VIRTIO_SCSI_S_OK; ++ ++ SPDK_LOGDUMP(vfu_virtio_scsi_data, "CDB=", scsi_req->cmd_req->cdb, VIRTIO_SCSI_CDB_SIZE); ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "%s, iovcnt %u, transfer_len %u, used len %u\n", ++ scsi_req->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV ? "XFER_FROM_DEV" : "XFER_TO_DEV", ++ scsi_req->scsi.iovcnt, payload_len, scsi_req->req.used_len); ++ ++ return 0; ++} ++ ++static int ++virtio_scsi_tmf_cmd_req(struct virtio_scsi_endpoint *scsi_endpoint, ++ struct virtio_scsi_req *scsi_req) ++{ ++ uint32_t iovcnt; ++ struct iovec *iov; ++ struct virtio_scsi_ctrl_tmf_req *tmf_req; ++ struct virtio_scsi_target *scsi_target; ++ ++ iov = &scsi_req->req.iovs[0]; ++ iovcnt = scsi_req->req.iovcnt; ++ tmf_req = iov->iov_base; ++ if (spdk_unlikely(iovcnt < 2)) { ++ SPDK_ERRLOG("Invalid iovcnt %u\n", iovcnt); ++ goto invalid; ++ } ++ ++ memset(&scsi_req->scsi, 0, sizeof(struct spdk_scsi_task)); ++ spdk_scsi_task_construct(&scsi_req->scsi, virtio_scsi_task_mgmt_cpl, virtio_scsi_task_free_cb); ++ ++ switch (tmf_req->type) { ++ case VIRTIO_SCSI_T_TMF: ++ if (scsi_req->req.iovs[0].iov_len < sizeof(struct virtio_scsi_ctrl_tmf_req) || ++ scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_ctrl_tmf_resp)) { ++ SPDK_ERRLOG("Invalid size of tmf_req or tmf_resp\n"); ++ goto invalid; ++ } ++ scsi_req->tmf_req = tmf_req; ++ scsi_req->tmf_resp = scsi_req->req.iovs[1].iov_base; ++ switch (tmf_req->subtype) { ++ case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: ++ scsi_target = virtio_scsi_cmd_lun_setup(scsi_endpoint, scsi_req, scsi_req->tmf_req->lun); ++ if (!scsi_target) { ++ scsi_req->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; ++ break; ++ } ++ /* Management task submission */ ++ scsi_req->tmf_resp->response = VIRTIO_SCSI_S_OK; ++ scsi_req->scsi.function = SPDK_SCSI_TASK_FUNC_LUN_RESET; ++ spdk_scsi_dev_queue_mgmt_task(scsi_target->dev, &scsi_req->scsi); ++ return 0; ++ break; ++ default: ++ scsi_req->tmf_resp->response = VIRTIO_SCSI_S_FUNCTION_REJECTED; ++ break; ++ } ++ break; ++ ++ case VIRTIO_SCSI_T_AN_QUERY: ++ case VIRTIO_SCSI_T_AN_SUBSCRIBE: ++ if (scsi_req->req.iovs[0].iov_len < sizeof(struct virtio_scsi_ctrl_an_req) || ++ scsi_req->req.iovs[1].iov_len < sizeof(struct virtio_scsi_ctrl_an_resp)) { ++ SPDK_ERRLOG("Invalid size of tmf_req or tmf_resp\n"); ++ goto invalid; ++ } ++ scsi_req->req.used_len = sizeof(struct virtio_scsi_ctrl_an_resp); ++ /* Do nothing to response byte of virtio_scsi_ctrl_an_resp */ ++ goto invalid; ++ break; ++ default: ++ break; ++ } ++ ++invalid: ++ /* invalid request 
*/ ++ virtio_scsi_req_finish(scsi_req); ++ return -1; ++} ++ ++static int ++virtio_scsi_cmd_req(struct virtio_scsi_endpoint *scsi_endpoint, struct virtio_scsi_req *scsi_req) ++{ ++ int ret; ++ struct virtio_scsi_target *scsi_target; ++ ++ memset(&scsi_req->scsi, 0, sizeof(struct spdk_scsi_task)); ++ spdk_scsi_task_construct(&scsi_req->scsi, virtio_scsi_task_cpl, virtio_scsi_task_free_cb); ++ ++ ret = virtio_scsi_cmd_data_setup(scsi_req); ++ if (ret) { ++ SPDK_ERRLOG("Error to setup SCSI command, ret %d\n", ret); ++ goto invalid; ++ } ++ ++ scsi_target = virtio_scsi_cmd_lun_setup(scsi_endpoint, scsi_req, scsi_req->cmd_req->lun); ++ if (!scsi_target) { ++ scsi_req->cmd_resp->response = VIRTIO_SCSI_S_BAD_TARGET; ++ goto invalid; ++ } ++ ++ spdk_scsi_dev_queue_task(scsi_target->dev, &scsi_req->scsi); ++ return 0; ++ ++invalid: ++ /* invalid request */ ++ virtio_scsi_req_finish(scsi_req); ++ return ret; ++} ++ ++static int ++virtio_scsi_process_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ struct virtio_scsi_req *scsi_req = to_scsi_request(req); ++ ++ scsi_req->endpoint = scsi_endpoint; ++ ++ /* SCSI task management command */ ++ if (spdk_unlikely(vq->id == 0)) { ++ return virtio_scsi_tmf_cmd_req(scsi_endpoint, scsi_req); ++ } ++ ++ /* SCSI command */ ++ return virtio_scsi_cmd_req(scsi_endpoint, scsi_req);; ++} ++ ++static void ++virtio_scsi_update_config(struct virtio_scsi_endpoint *scsi_endpoint) ++{ ++ struct virtio_scsi_config *scsi_cfg; ++ ++ if (!scsi_endpoint) { ++ return; ++ } ++ ++ scsi_cfg = &scsi_endpoint->scsi_cfg; ++ ++ scsi_cfg->num_queues = scsi_endpoint->virtio.num_queues; ++ /* -2 for REQ and RESP and -1 for region boundary splitting */ ++ scsi_cfg->seg_max = spdk_min(VIRTIO_DEV_MAX_IOVS - 2 - 1, SPDK_BDEV_IO_NUM_CHILD_IOV - 2 - 1); ++ /* we can set `max_sectors` and `cmd_per_lun` based on bdevs */ ++ scsi_cfg->max_sectors = 131072; ++ scsi_cfg->cmd_per_lun = scsi_endpoint->virtio.qsize; ++ scsi_cfg->event_info_size = sizeof(struct virtio_scsi_event); ++ scsi_cfg->sense_size = VIRTIO_SCSI_SENSE_DEFAULT_SIZE; ++ scsi_cfg->cdb_size = VIRTIO_SCSI_CDB_DEFAULT_SIZE; ++ scsi_cfg->max_channel = 0; ++ scsi_cfg->max_target = VIRTIO_SCSI_CTRLR_MAX_TARGETS; ++ scsi_cfg->max_lun = 16383; ++} ++ ++static uint64_t ++virtio_scsi_get_supported_features(struct vfu_virtio_endpoint *virtio_endpoint) ++{ ++ uint64_t features; ++ ++ features = VIRTIO_SCSI_SUPPORTED_FEATURES | VIRTIO_HOST_SUPPORTED_FEATURES; ++ ++ if (!virtio_endpoint->packed_ring) { ++ features &= ~(1ULL << VIRTIO_F_RING_PACKED); ++ } ++ ++ return features; ++} ++ ++static int ++virtio_scsi_get_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ uint64_t offset, uint64_t count) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ uint8_t *scsi_cfg; ++ ++ if ((offset + count) > sizeof(struct virtio_scsi_config)) { ++ SPDK_ERRLOG("Invalid device specific configuration offset 0x%"PRIx64"\n", offset); ++ return -EINVAL; ++ } ++ ++ scsi_cfg = (uint8_t *)&scsi_endpoint->scsi_cfg; ++ memcpy(buf, scsi_cfg + offset, count); ++ ++ return 0; ++} ++ ++static int ++virtio_scsi_set_device_specific_config(struct vfu_virtio_endpoint *virtio_endpoint, char *buf, ++ uint64_t offset, uint64_t count) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ uint32_t value; ++ int ret = 0; ++ ++ if ((offset 
+ count) > sizeof(struct virtio_scsi_config)) { ++ SPDK_ERRLOG("Invalid device specific configuration offset 0x%"PRIx64"\n", offset); ++ return -EINVAL; ++ } ++ ++ switch (offset) { ++ case offsetof(struct virtio_scsi_config, sense_size): ++ value = *(uint32_t *)buf; ++ if (scsi_endpoint->scsi_cfg.sense_size != value) { ++ SPDK_ERRLOG("Sense data size set to %u\n", value); ++ ret = -ENOTSUP; ++ } ++ break; ++ case offsetof(struct virtio_scsi_config, cdb_size): ++ value = *(uint32_t *)buf; ++ if (scsi_endpoint->scsi_cfg.cdb_size != value) { ++ SPDK_ERRLOG("CDB size set to %u\n", value); ++ ret = -ENOTSUP; ++ } ++ break; ++ default: ++ SPDK_ERRLOG("Error offset %"PRIu64"\n", offset); ++ ret = -EINVAL; ++ break; ++ } ++ ++ ++ return ret; ++} ++ ++static struct vfu_virtio_req * ++virtio_scsi_alloc_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq) ++{ ++ struct virtio_scsi_req *scsi_req; ++ ++ scsi_req = calloc(1, sizeof(*scsi_req) + dma_sg_size() * (VIRTIO_DEV_MAX_IOVS + 1)); ++ if (!scsi_req) { ++ return NULL; ++ } ++ ++ return &scsi_req->req; ++} ++ ++static void ++virtio_scsi_free_req(struct vfu_virtio_endpoint *virtio_endpoint, struct vfu_virtio_vq *vq, ++ struct vfu_virtio_req *req) ++{ ++ struct virtio_scsi_req *scsi_req = to_scsi_request(req); ++ ++ free(scsi_req); ++} ++ ++struct vfu_virtio_ops virtio_scsi_ops = { ++ .get_device_features = virtio_scsi_get_supported_features, ++ .alloc_req = virtio_scsi_alloc_req, ++ .free_req = virtio_scsi_free_req, ++ .exec_request = virtio_scsi_process_req, ++ .get_config = virtio_scsi_get_device_specific_config, ++ .set_config = virtio_scsi_set_device_specific_config, ++ .start_device = virtio_scsi_start, ++ .stop_device = virtio_scsi_stop, ++}; ++ ++int ++vfu_virtio_scsi_set_options(const char *name, uint16_t num_io_queues, uint16_t qsize, ++ bool packed_ring) ++{ ++ struct spdk_vfu_endpoint *endpoint; ++ uint32_t num_queues; ++ struct vfu_virtio_endpoint *virtio_endpoint; ++ struct virtio_scsi_endpoint *scsi_endpoint; ++ ++ num_queues = num_io_queues + 2; ++ ++ endpoint = spdk_vfu_get_endpoint_by_name(name); ++ if (!endpoint) { ++ SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); ++ return -ENOENT; ++ } ++ ++ virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ if (virtio_endpoint->dev) { ++ SPDK_ERRLOG("Options are not allowed to change in runtime\n"); ++ return -EFAULT; ++ } ++ ++ if ((num_queues > 2) && (num_queues <= VIRTIO_DEV_MAX_VQS)) { ++ scsi_endpoint->virtio.num_queues = num_queues; ++ } else { ++ SPDK_NOTICELOG("Number of IO queue %u\n", VIRTIO_DEV_MAX_VQS - 2); ++ scsi_endpoint->virtio.num_queues = VIRTIO_DEV_MAX_VQS; ++ } ++ ++ if (qsize && qsize <= VIRTIO_VQ_MAX_SIZE) { ++ scsi_endpoint->virtio.qsize = qsize; ++ } else { ++ SPDK_NOTICELOG("Use queue size %u\n", VIRTIO_VQ_DEFAULT_SIZE); ++ scsi_endpoint->virtio.qsize = VIRTIO_VQ_DEFAULT_SIZE; ++ } ++ scsi_endpoint->virtio.packed_ring = packed_ring; ++ ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "%s: num_queues %u, qsize %u, packed ring %s\n", ++ spdk_vfu_get_endpoint_id(endpoint), ++ scsi_endpoint->virtio.num_queues, scsi_endpoint->virtio.qsize, ++ packed_ring ? 
"enabled" : "disabled"); ++ ++ virtio_scsi_update_config(scsi_endpoint); ++ ++ return 0; ++} ++ ++struct virtio_scsi_event_ctx { ++ struct virtio_scsi_endpoint *scsi_endpoint; ++ struct virtio_scsi_target *scsi_target; ++ uint8_t scsi_target_num; ++}; ++ ++static uint8_t ++get_scsi_target_num_by_lun(struct virtio_scsi_endpoint *scsi_endpoint, ++ const struct spdk_scsi_lun *lun) ++{ ++ const struct spdk_scsi_dev *scsi_dev; ++ struct virtio_scsi_target *scsi_target; ++ uint8_t i; ++ ++ scsi_dev = spdk_scsi_lun_get_dev(lun); ++ for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { ++ scsi_target = &scsi_endpoint->targets[i]; ++ if (scsi_target->dev == scsi_dev) { ++ return i; ++ } ++ } ++ ++ return VIRTIO_SCSI_CTRLR_MAX_TARGETS; ++} ++ ++static void ++vfu_virtio_scsi_lun_resize_msg(void *ctx) ++{ ++ struct virtio_scsi_event_ctx *resize_ctx = ctx; ++ struct virtio_scsi_endpoint *scsi_endpoint = resize_ctx->scsi_endpoint; ++ uint8_t scsi_target_num = resize_ctx->scsi_target_num; ++ ++ free(resize_ctx); ++ ++ if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_CHANGE)) { ++ vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, ++ VIRTIO_SCSI_T_PARAM_CHANGE, 0x2a | (0x09 << 8)); ++ } ++} ++ ++static void ++vfu_virtio_scsi_lun_resize(const struct spdk_scsi_lun *lun, void *arg) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = arg; ++ uint8_t scsi_target_num; ++ struct virtio_scsi_event_ctx *ctx; ++ ++ scsi_target_num = get_scsi_target_num_by_lun(scsi_endpoint, lun); ++ if (scsi_target_num == VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ return; ++ } ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("Error to allocate hotplug ctx\n"); ++ return; ++ } ++ ctx->scsi_endpoint = scsi_endpoint; ++ ctx->scsi_target_num = scsi_target_num; ++ ++ spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_resize_msg, ctx); ++} ++ ++static void ++vfu_virtio_scsi_lun_hotremove_msg(void *ctx) ++{ ++ struct virtio_scsi_event_ctx *hotplug = ctx; ++ struct virtio_scsi_endpoint *scsi_endpoint = hotplug->scsi_endpoint; ++ struct virtio_scsi_target *scsi_target = hotplug->scsi_target; ++ struct spdk_scsi_dev *scsi_dev = scsi_target->dev; ++ uint8_t scsi_target_num = hotplug->scsi_target_num; ++ ++ free(hotplug); ++ ++ if (!scsi_dev) { ++ return; ++ } ++ scsi_target->dev = NULL; ++ spdk_scsi_dev_free_io_channels(scsi_dev); ++ spdk_scsi_dev_destruct(scsi_dev, NULL, NULL); ++ ++ assert(scsi_endpoint->virtio.dev); ++ if (!virtio_dev_is_started(scsi_endpoint->virtio.dev)) { ++ return; ++ } ++ ++ if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_HOTPLUG)) { ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "Target num %u, sending event\n", scsi_target_num); ++ vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, ++ VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); ++ } ++} ++ ++static void ++vfu_virtio_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint = arg; ++ struct virtio_scsi_target *scsi_target; ++ struct virtio_scsi_event_ctx *ctx; ++ uint8_t scsi_target_num; ++ ++ if (!scsi_endpoint->virtio.dev) { ++ return; ++ } ++ ++ scsi_target_num = get_scsi_target_num_by_lun(scsi_endpoint, lun); ++ if (scsi_target_num == VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ return; ++ } ++ scsi_target = &scsi_endpoint->targets[scsi_target_num]; ++ if (!scsi_target->dev) { ++ return; ++ } ++ ++ SPDK_DEBUGLOG(vfu_virtio_scsi, "Removing bdev %s, Target num %u\n", ++ spdk_scsi_lun_get_bdev_name(lun), 
scsi_target_num); ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("Error to allocate hotplug ctx\n"); ++ return; ++ } ++ ctx->scsi_endpoint = scsi_endpoint; ++ ctx->scsi_target = scsi_target; ++ ctx->scsi_target_num = scsi_target_num; ++ ++ spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_hotremove_msg, ctx); ++} ++ ++static void ++vfu_virtio_scsi_lun_hotplug_msg(void *ctx) ++{ ++ struct virtio_scsi_event_ctx *hotplug = ctx; ++ struct virtio_scsi_endpoint *scsi_endpoint = hotplug->scsi_endpoint; ++ struct virtio_scsi_target *scsi_target = hotplug->scsi_target; ++ uint8_t scsi_target_num = hotplug->scsi_target_num; ++ int ret; ++ ++ free(hotplug); ++ ++ assert(scsi_endpoint->virtio.dev); ++ if (!virtio_dev_is_started(scsi_endpoint->virtio.dev)) { ++ return; ++ } ++ ++ ret = spdk_scsi_dev_allocate_io_channels(scsi_target->dev); ++ if (ret) { ++ SPDK_ERRLOG("%s: Couldn't allocate io channel for SCSI target %u.\n", ++ spdk_vfu_get_endpoint_name(scsi_endpoint->virtio.endpoint), scsi_target_num); ++ return; ++ } ++ ++ if (virtio_guest_has_feature(scsi_endpoint->virtio.dev, VIRTIO_SCSI_F_HOTPLUG)) { ++ vfu_virtio_scsi_eventq_enqueue(scsi_endpoint, scsi_target_num, ++ VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN); ++ } ++} ++ ++int ++vfu_virtio_scsi_add_target(const char *name, uint8_t scsi_target_num, const char *bdev_name) ++{ ++ struct spdk_vfu_endpoint *endpoint; ++ struct vfu_virtio_endpoint *virtio_endpoint; ++ struct virtio_scsi_endpoint *scsi_endpoint; ++ struct virtio_scsi_target *scsi_target; ++ char target_name[SPDK_SCSI_DEV_MAX_NAME]; ++ int lun_id_list[1]; ++ const char *bdev_names_list[1]; ++ ++ endpoint = spdk_vfu_get_endpoint_by_name(name); ++ if (!endpoint) { ++ SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); ++ return -ENOENT; ++ } ++ virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ ++ if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ SPDK_ERRLOG("Invalid SCSI target number, maximum SCSI target number is %u\n", ++ VIRTIO_SCSI_CTRLR_MAX_TARGETS - 1); ++ return -EINVAL; ++ } ++ scsi_target = &scsi_endpoint->targets[scsi_target_num]; ++ if (scsi_target->dev) { ++ SPDK_ERRLOG("SCSI Target %u is already occupied\n", scsi_target_num); ++ return -EEXIST; ++ } ++ ++ snprintf(target_name, sizeof(target_name), "Target %u", scsi_target_num); ++ lun_id_list[0] = 0; ++ bdev_names_list[0] = (char *)bdev_name; ++ ++ scsi_target->dev = spdk_scsi_dev_construct_ext(target_name, bdev_names_list, lun_id_list, 1, ++ SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, ++ vfu_virtio_scsi_lun_resize, scsi_endpoint, ++ vfu_virtio_scsi_lun_hotremove, scsi_endpoint); ++ if (!scsi_target->dev) { ++ SPDK_ERRLOG("%s: couldn't create SCSI target %u via bdev %s\n", name, scsi_target_num, bdev_name); ++ return -EFAULT; ++ } ++ spdk_scsi_dev_add_port(scsi_target->dev, 0, "vfu-virtio-scsi"); ++ ++ SPDK_NOTICELOG("%s: added SCSI target %u using bdev '%s'\n", name, scsi_target_num, bdev_name); ++ virtio_scsi_update_config(scsi_endpoint); ++ ++ if (virtio_endpoint->dev) { ++ struct virtio_scsi_event_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("Error to allocate hotplug ctx\n"); ++ /* This isn't fatal, just skip hotplug notification */ ++ } else { ++ ctx->scsi_endpoint = scsi_endpoint; ++ ctx->scsi_target = scsi_target; ++ ctx->scsi_target_num = scsi_target_num; ++ spdk_thread_send_msg(virtio_endpoint->thread, vfu_virtio_scsi_lun_hotplug_msg, ctx); ++ } ++ } ++ ++ return 
0; ++} ++ ++int ++vfu_virtio_scsi_remove_target(const char *name, uint8_t scsi_target_num) ++{ ++ struct spdk_vfu_endpoint *endpoint; ++ struct vfu_virtio_endpoint *virtio_endpoint; ++ struct virtio_scsi_endpoint *scsi_endpoint; ++ struct virtio_scsi_target *scsi_target; ++ ++ endpoint = spdk_vfu_get_endpoint_by_name(name); ++ if (!endpoint) { ++ SPDK_ERRLOG("Endpoint %s doesn't exist\n", name); ++ return -ENOENT; ++ } ++ virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ ++ if (scsi_target_num >= VIRTIO_SCSI_CTRLR_MAX_TARGETS) { ++ SPDK_ERRLOG("Invalid SCSI target number, maximum SCSI target number is %u\n", ++ VIRTIO_SCSI_CTRLR_MAX_TARGETS - 1); ++ return -EINVAL; ++ } ++ scsi_target = &scsi_endpoint->targets[scsi_target_num]; ++ if (!scsi_target->dev) { ++ SPDK_ERRLOG("SCSI Target %u doesn't exist\n", scsi_target_num); ++ return -ENOENT; ++ } ++ ++ SPDK_NOTICELOG("%s: Remove SCSI target num %u\n", name, scsi_target_num); ++ ++ if (virtio_endpoint->dev) { ++ struct virtio_scsi_event_ctx *ctx; ++ ++ ctx = calloc(1, sizeof(*ctx)); ++ if (!ctx) { ++ SPDK_ERRLOG("Error to allocate hotplug ctx\n"); ++ /* This isn't fatal, just skip hotplug notification */ ++ } else { ++ ctx->scsi_endpoint = scsi_endpoint; ++ ctx->scsi_target = scsi_target; ++ ctx->scsi_target_num = scsi_target_num; ++ spdk_thread_send_msg(scsi_endpoint->virtio.thread, vfu_virtio_scsi_lun_hotremove_msg, ctx); ++ } ++ } else { ++ spdk_scsi_dev_destruct(scsi_target->dev, NULL, NULL); ++ scsi_target->dev = NULL; ++ } ++ ++ return 0; ++} ++ ++static int ++vfu_virtio_scsi_endpoint_destruct(struct spdk_vfu_endpoint *endpoint) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ struct virtio_scsi_target *scsi_target; ++ uint8_t i; ++ ++ for (i = 0; i < VIRTIO_SCSI_CTRLR_MAX_TARGETS; i++) { ++ scsi_target = &scsi_endpoint->targets[i]; ++ if (scsi_target->dev) { ++ spdk_scsi_dev_destruct(scsi_target->dev, NULL, NULL); ++ } ++ } ++ ++ vfu_virtio_endpoint_destruct(&scsi_endpoint->virtio); ++ free(scsi_endpoint); ++ ++ return 0; ++} ++ ++static void * ++vfu_virtio_scsi_endpoint_init(struct spdk_vfu_endpoint *endpoint, ++ char *basename, const char *endpoint_name) ++{ ++ struct virtio_scsi_endpoint *scsi_endpoint; ++ int ret; ++ ++ scsi_endpoint = calloc(1, sizeof(*scsi_endpoint)); ++ if (!scsi_endpoint) { ++ return NULL; ++ } ++ ++ ret = vfu_virtio_endpoint_setup(&scsi_endpoint->virtio, endpoint, basename, endpoint_name, ++ &virtio_scsi_ops); ++ if (ret) { ++ SPDK_ERRLOG("Error to setup endpoint %s\n", endpoint_name); ++ free(scsi_endpoint); ++ return NULL; ++ } ++ ++ virtio_scsi_update_config(scsi_endpoint); ++ return (void *)&scsi_endpoint->virtio; ++} ++ ++static int ++vfu_virtio_scsi_get_device_info(struct spdk_vfu_endpoint *endpoint, ++ struct spdk_vfu_pci_device *device_info) ++{ ++ struct vfu_virtio_endpoint *virtio_endpoint = spdk_vfu_get_endpoint_private(endpoint); ++ struct virtio_scsi_endpoint *scsi_endpoint = to_scsi_endpoint(virtio_endpoint); ++ ++ vfu_virtio_get_device_info(&scsi_endpoint->virtio, device_info); ++ /* Fill Device ID */ ++ device_info->id.did = PCI_DEVICE_ID_VIRTIO_SCSI_MODERN; ++ ++ return 0; ++} ++ ++struct spdk_vfu_endpoint_ops vfu_virtio_scsi_ops = { ++ .name = "virtio_scsi", ++ .init = vfu_virtio_scsi_endpoint_init, ++ .get_device_info = vfu_virtio_scsi_get_device_info, ++ .get_vendor_capability = 
vfu_virtio_get_vendor_capability, ++ .post_memory_add = vfu_virtio_post_memory_add, ++ .pre_memory_remove = vfu_virtio_pre_memory_remove, ++ .reset_device = vfu_virtio_pci_reset_cb, ++ .quiesce_device = vfu_virtio_quiesce_cb, ++ .destruct = vfu_virtio_scsi_endpoint_destruct, ++ .attach_device = vfu_virtio_attach_device, ++ .detach_device = vfu_virtio_detach_device, ++}; ++ ++static void ++__attribute__((constructor)) _vfu_virtio_scsi_pci_model_register(void) ++{ ++ spdk_vfu_register_endpoint_ops(&vfu_virtio_scsi_ops); ++} ++ ++SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_scsi) ++SPDK_LOG_REGISTER_COMPONENT(vfu_virtio_scsi_data) +diff --git a/proto/Makefile b/proto/Makefile +index 767b93b..e94a935 100644 +--- a/proto/Makefile ++++ b/proto/Makefile +@@ -1,28 +1,28 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +-include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +- +-# Location where the python generated code is placed +-pydir=$(SPDK_ROOT_DIR)/python/spdk/sma/proto +-protodefs = $(wildcard *.proto) +-protopy = $(foreach proto,$(basename $(protodefs)),$(addprefix $(pydir)/$(proto),_pb2.py _pb2_grpc.py)) +- +-all: $(protopy) +- +-clean: +- $(Q)$(RM) $(protopy) +- +-# TODO: we should probably write a proper install rule here instead of just blindly copying all +-# python scripts when building the RPMs +-install: +-uninstall: +- +-$(pydir)/%_pb2.py $(pydir)/%_pb2_grpc.py: %.proto +- $(Q)python3 -m grpc_tools.protoc --proto_path=. --python_out=$(pydir) \ +- --grpc_python_out=$(pydir) $^ +- +-.PHONY: all clean install uninstall ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) ++include $(SPDK_ROOT_DIR)/mk/spdk.common.mk ++ ++# Location where the python generated code is placed ++pydir=$(SPDK_ROOT_DIR)/python/spdk/sma/proto ++protodefs = $(wildcard *.proto) ++protopy = $(foreach proto,$(basename $(protodefs)),$(addprefix $(pydir)/$(proto),_pb2.py _pb2_grpc.py)) ++ ++all: $(protopy) ++ ++clean: ++ $(Q)$(RM) $(protopy) ++ ++# TODO: we should probably write a proper install rule here instead of just blindly copying all ++# python scripts when building the RPMs ++install: ++uninstall: ++ ++$(pydir)/%_pb2.py $(pydir)/%_pb2_grpc.py: %.proto ++ $(Q)python3 -m grpc_tools.protoc --proto_path=. 
--python_out=$(pydir) \ ++ --grpc_python_out=$(pydir) $^ ++ ++.PHONY: all clean install uninstall +diff --git a/proto/nvme.proto b/proto/nvme.proto +index e43d4a4..adbbab9 100644 +--- a/proto/nvme.proto ++++ b/proto/nvme.proto +@@ -1,12 +1,12 @@ +-syntax = "proto3"; +- +-package sma.nvme; +- +-option go_package = "spdk.io/sma/nvme"; +- +-message DeviceParameters { +- // Physical function index +- uint32 physical_id = 1; +- // Virtual function index +- uint32 virtual_id = 2; +-} ++syntax = "proto3"; ++ ++package sma.nvme; ++ ++option go_package = "spdk.io/sma/nvme"; ++ ++message DeviceParameters { ++ // Physical function index ++ uint32 physical_id = 1; ++ // Virtual function index ++ uint32 virtual_id = 2; ++} +diff --git a/proto/nvmf.proto b/proto/nvmf.proto +index 9b864d6..8eb1bf7 100644 +--- a/proto/nvmf.proto ++++ b/proto/nvmf.proto +@@ -1,35 +1,35 @@ +-syntax = "proto3"; +- +-package sma.nvmf; +- +-option go_package = "spdk.io/sma/nvmf"; +- +-// Defines an address of an NVMeoF endpoint +-message Address { +- // Transport type ("rdma" or "tcp") +- string trtype = 1; +- // Transport address (IP) +- string traddr = 2; +- // Transport service identifier (port number) +- string trsvcid = 3; +-} +-// NVMeoF connection using discovery service +-message VolumeDiscoveryParameters { +- // One or more discovery endpoints +- repeated Address discovery_endpoints = 1; +-} +- +-// Describes connection parameters for an NVMeoF volume (namespace) +-message VolumeConnectionParameters { +- // Subsystem that the volume is exposed through. A volume with a given +- // GUID/UUID won't be created if it's attached to a different subsystem. This +- // field is optional and can be left empty. +- string subnqn = 1; +- // Host NQN to use when connecting to the subsystem exposing the volume (and, +- // if using discovery, to the discovery subsystem too). +- string hostnqn = 2; +- oneof connection_params { +- // Connection through discovery service +- VolumeDiscoveryParameters discovery = 3; +- } +-} ++syntax = "proto3"; ++ ++package sma.nvmf; ++ ++option go_package = "spdk.io/sma/nvmf"; ++ ++// Defines an address of an NVMeoF endpoint ++message Address { ++ // Transport type ("rdma" or "tcp") ++ string trtype = 1; ++ // Transport address (IP) ++ string traddr = 2; ++ // Transport service identifier (port number) ++ string trsvcid = 3; ++} ++// NVMeoF connection using discovery service ++message VolumeDiscoveryParameters { ++ // One or more discovery endpoints ++ repeated Address discovery_endpoints = 1; ++} ++ ++// Describes connection parameters for an NVMeoF volume (namespace) ++message VolumeConnectionParameters { ++ // Subsystem that the volume is exposed through. A volume with a given ++ // GUID/UUID won't be created if it's attached to a different subsystem. This ++ // field is optional and can be left empty. ++ string subnqn = 1; ++ // Host NQN to use when connecting to the subsystem exposing the volume (and, ++ // if using discovery, to the discovery subsystem too). 
++ string hostnqn = 2; ++ oneof connection_params { ++ // Connection through discovery service ++ VolumeDiscoveryParameters discovery = 3; ++ } ++} +diff --git a/proto/nvmf_tcp.proto b/proto/nvmf_tcp.proto +index da78708..68f4362 100644 +--- a/proto/nvmf_tcp.proto ++++ b/proto/nvmf_tcp.proto +@@ -1,22 +1,22 @@ +-syntax = "proto3"; +- +-package sma.nvmf_tcp; +- +-option go_package = "spdk.io/sma/nvmf_tcp"; +- +-// Create device NVMe/TCP-specific parameters +-message DeviceParameters { +- // Subsystem NQN +- string subnqn = 1; +- // Address family ("ipv4", "ipv6") +- string adrfam = 2; +- // Transport address +- string traddr = 3; +- // Transport service ID (port number) +- string trsvcid = 4; +- // Allow any host to connect +- bool allow_any_host = 5; +- // List of host NQNs that are allowed to connect to the subsystem (if +- // allow_any_host is false) +- repeated string hosts = 6; +-} ++syntax = "proto3"; ++ ++package sma.nvmf_tcp; ++ ++option go_package = "spdk.io/sma/nvmf_tcp"; ++ ++// Create device NVMe/TCP-specific parameters ++message DeviceParameters { ++ // Subsystem NQN ++ string subnqn = 1; ++ // Address family ("ipv4", "ipv6") ++ string adrfam = 2; ++ // Transport address ++ string traddr = 3; ++ // Transport service ID (port number) ++ string trsvcid = 4; ++ // Allow any host to connect ++ bool allow_any_host = 5; ++ // List of host NQNs that are allowed to connect to the subsystem (if ++ // allow_any_host is false) ++ repeated string hosts = 6; ++} +diff --git a/proto/sma.proto b/proto/sma.proto +index a725bc4..0b61963 100644 +--- a/proto/sma.proto ++++ b/proto/sma.proto +@@ -1,217 +1,217 @@ +-syntax = "proto3"; +- +-import "nvme.proto"; +-import "virtio_blk.proto"; +-import "nvmf_tcp.proto"; +-import "nvmf.proto"; +- +-// This file provides the generic definitions for the Storage Management Agent +-// gRPC calls. All of the methods are supposed to be idempotent. Errors are +-// reported as standard gRPC status codes. 
+- +-package sma; +- +-option go_package = "spdk.io/sma"; +- +-// Enumeration defining types of devices +-enum DeviceType { +- DEVICE_TYPE_INVALID = 0; +- DEVICE_TYPE_NVME = 1; +- DEVICE_TYPE_VIRTIO_BLK = 2; +- DEVICE_TYPE_NVMF_TCP = 3; +-} +- +-// Volume's crypto parameters +-message VolumeCryptoParameters { +- // Key to be used for encryption +- bytes key = 1; +- // Second key (only required by some ciphers) +- bytes key2 = 2; +- enum Cipher { +- AES_CBC = 0; +- AES_XTS = 1; +- } +- // Cipher to use +- Cipher cipher = 3; +- // Tweak mode - determine how nvme LBA is converted into tweak +- enum TweakMode { +- // default for SPDK bdev_crypto +- // Tweak[127:0] = {64'b0, LBA[63:0]} +- TWEAK_MODE_SIMPLE_LBA = 0; +- +- // Tweak[127:0] = {1’b0, ~LBA[62:0], LBA[63:0]} +- TWEAK_MODE_NOT_AND_LBA = 1; +- +- // tweak is derived from nvme LBA that is internally incremented by 1 for every 512 bytes processed +- // so initial lba = (BLOCK_SIZE_IN_BYTES / 512) * LBA +- // Tweak[127:0] = {lba[127:0]} +- TWEAK_MODE_FORCE_512_FULL = 2; +- +- // tweak is derived from nvme LBA that is internally incremented by 1 for every 512 bytes processed +- // so initial lba = (BLOCK_SIZE_IN_BYTES / 512) * LBA +- // Tweak[127:0] = {lba[63:0], 64'b0} +- TWEAK_MODE_FORCE_512_UPPER = 3; +- } +- TweakMode tweak_mode = 4; +-} +- +-// Parameters describing a volume +-message VolumeParameters { +- // Volume GUID/UUID +- bytes volume_id = 1; +- oneof connection_params { +- // NVMeoF volume +- nvmf.VolumeConnectionParameters nvmf = 2; +- } +- // Crypto parameters (optional) +- VolumeCryptoParameters crypto = 3; +-} +- +-// Create device request +-message CreateDeviceRequest { +- // Volume to immediately attach to the created device. This field may be +- // optional for some device types (e.g. NVMe), while it may be required for +- // others (e.g. virtio-blk). +- VolumeParameters volume = 1; +- // Device-specific parameters +- oneof params { +- // NVMe parameters +- nvme.DeviceParameters nvme = 2; +- // Virtio-blk parameters +- virtio_blk.DeviceParameters virtio_blk = 3; +- // NVMe/TCP parameters +- nvmf_tcp.DeviceParameters nvmf_tcp = 4; +- } +-} +- +-// Create device response +-message CreateDeviceResponse { +- // Device handle that can uniquely identify a device within an instance of +- // Storage Management Agent +- string handle = 1; +-} +- +-// Delete device request +-message DeleteDeviceRequest { +- // Device handle +- string handle = 1; +-} +- +-// Delete device response +-message DeleteDeviceResponse {} +- +-// Attach volume request +-message AttachVolumeRequest { +- // Volume parameters +- VolumeParameters volume = 1; +- // Device handle +- string device_handle = 2; +-} +- +-// Attach volume response +-message AttachVolumeResponse {} +- +-// Detach volume request +-message DetachVolumeRequest { +- // Volume GUID/UUID +- bytes volume_id = 1; +- // Device handle +- string device_handle = 2; +-} +- +-// Detach volume response +-message DetachVolumeResponse {} +- +-// QoS limit values. 0 means unlimited, while UINT64_MAX means to leave the +-// current limit value unchanged. If one of the limits isn't supported by a +-// given device/volume, it must be set to 0. 
+-message QosLimit { +- // Read kIOPS +- uint64 rd_iops = 1; +- // Write kIOPS +- uint64 wr_iops = 2; +- // Read/write kIOPS +- uint64 rw_iops = 3; +- // Read bandwidth (MB/s) +- uint64 rd_bandwidth = 4; +- // Write bandwidth (MB/s) +- uint64 wr_bandwidth = 5; +- // Read/write bandwidth (MB/s) +- uint64 rw_bandwidth = 6; +-} +- +-// SetQos request +-message SetQosRequest { +- // Device handle +- string device_handle = 1; +- // GUID/UUID of a volume to configure QoS on. If this parameter is omitted, +- // the QoS will be set up on the whole device (all volumes attached to that +- // device will share QoS settings). Some device types might only support +- // configuring QoS on per-device (volume_id must be empty) or per-volume level +- // (volume_id cannot be empty). This information can be obtained by sending a +- // GetQosCapabilities request. +- bytes volume_id = 2; +- // Maximum allowed IOPS/bandwidth +- QosLimit maximum = 3; +-} +- +-// SetQos response +-message SetQosResponse {} +- +-// Get QoS capabilities request +-message GetQosCapabilitiesRequest { +- // Type of a device to query for QoS capabilities +- DeviceType device_type = 1; +-} +- +-// Get QoS capabilities response +-message GetQosCapabilitiesResponse { +- message QosCapabilities { +- // Read IOPS +- bool rd_iops = 1; +- // Write IOPS +- bool wr_iops = 2; +- // Read/write IOPS +- bool rw_iops = 3; +- // Read bandwidth +- bool rd_bandwidth = 4; +- // Write bandwidth +- bool wr_bandwidth = 5; +- // Read/write bandwidth +- bool rw_bandwidth = 6; +- } +- // Device level maximum QoS limits +- QosCapabilities max_device_caps = 1; +- // Volume level maximum QoS limits +- QosCapabilities max_volume_caps = 2; +-}; +- +-// Storage Management Agent gRPC service definition +-service StorageManagementAgent { +- // Creates a new device. A device is an entity that can be used to expose +- // volumes (e.g. an NVMeoF subsystem). +- rpc CreateDevice (CreateDeviceRequest) +- returns (CreateDeviceResponse) {} +- // Deletes a device. It is only allowed to delete a device with volumes still +- // attached if that device doesn't support attaching volumes through +- // AttachVolume (e.g. virtio-blk). In other cases, it is forbidden and +- // FAILED_PRECONDITION status will be returned. +- rpc DeleteDevice (DeleteDeviceRequest) +- returns (DeleteDeviceResponse) {} +- // Attaches a volume to a specified device making it available through that +- // device (e.g. for NVMeoF this results in adding a namespace to an NVMeoF +- // subsystem). The type of volume doesn't need to match the type of device +- // (e.g. it's perfectly fine to attach an NVMe/TCP volume to a virtio-blk +- // device). +- rpc AttachVolume (AttachVolumeRequest) +- returns (AttachVolumeResponse) {} +- // Detaches a volume from a device +- rpc DetachVolume (DetachVolumeRequest) +- returns (DetachVolumeResponse) {} +- // Configures QoS on a device/volume +- rpc SetQos (SetQosRequest) +- returns (SetQosResponse) {} +- // Returns QoS capabilities of a given device type +- rpc GetQosCapabilities (GetQosCapabilitiesRequest) +- returns (GetQosCapabilitiesResponse) {} +-} ++syntax = "proto3"; ++ ++import "nvme.proto"; ++import "virtio_blk.proto"; ++import "nvmf_tcp.proto"; ++import "nvmf.proto"; ++ ++// This file provides the generic definitions for the Storage Management Agent ++// gRPC calls. All of the methods are supposed to be idempotent. Errors are ++// reported as standard gRPC status codes. 
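The file-level comment above stresses that every Storage Management Agent method is meant to be idempotent and that failures surface as standard gRPC status codes. A minimal client-side sketch of what that looks like, assuming Python stubs generated from this file (the module names sma_pb2/sma_pb2_grpc and the localhost:8080 address are assumptions, not taken from the patch):

    import grpc
    import sma_pb2
    import sma_pb2_grpc

    with grpc.insecure_channel("localhost:8080") as channel:
        stub = sma_pb2_grpc.StorageManagementAgentStub(channel)
        try:
            # Ask which QoS knobs an NVMe/TCP device type supports.
            request = sma_pb2.GetQosCapabilitiesRequest(
                device_type=sma_pb2.DEVICE_TYPE_NVMF_TCP)
            print(stub.GetQosCapabilities(request).max_device_caps)
        except grpc.RpcError as err:
            # Errors arrive as standard gRPC status codes, per the comment above.
            print("SMA call failed:", err.code(), err.details())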
++ ++package sma; ++ ++option go_package = "spdk.io/sma"; ++ ++// Enumeration defining types of devices ++enum DeviceType { ++ DEVICE_TYPE_INVALID = 0; ++ DEVICE_TYPE_NVME = 1; ++ DEVICE_TYPE_VIRTIO_BLK = 2; ++ DEVICE_TYPE_NVMF_TCP = 3; ++} ++ ++// Volume's crypto parameters ++message VolumeCryptoParameters { ++ // Key to be used for encryption ++ bytes key = 1; ++ // Second key (only required by some ciphers) ++ bytes key2 = 2; ++ enum Cipher { ++ AES_CBC = 0; ++ AES_XTS = 1; ++ } ++ // Cipher to use ++ Cipher cipher = 3; ++ // Tweak mode - determine how nvme LBA is converted into tweak ++ enum TweakMode { ++ // default for SPDK bdev_crypto ++ // Tweak[127:0] = {64'b0, LBA[63:0]} ++ TWEAK_MODE_SIMPLE_LBA = 0; ++ ++ // Tweak[127:0] = {1’b0, ~LBA[62:0], LBA[63:0]} ++ TWEAK_MODE_NOT_AND_LBA = 1; ++ ++ // tweak is derived from nvme LBA that is internally incremented by 1 for every 512 bytes processed ++ // so initial lba = (BLOCK_SIZE_IN_BYTES / 512) * LBA ++ // Tweak[127:0] = {lba[127:0]} ++ TWEAK_MODE_FORCE_512_FULL = 2; ++ ++ // tweak is derived from nvme LBA that is internally incremented by 1 for every 512 bytes processed ++ // so initial lba = (BLOCK_SIZE_IN_BYTES / 512) * LBA ++ // Tweak[127:0] = {lba[63:0], 64'b0} ++ TWEAK_MODE_FORCE_512_UPPER = 3; ++ } ++ TweakMode tweak_mode = 4; ++} ++ ++// Parameters describing a volume ++message VolumeParameters { ++ // Volume GUID/UUID ++ bytes volume_id = 1; ++ oneof connection_params { ++ // NVMeoF volume ++ nvmf.VolumeConnectionParameters nvmf = 2; ++ } ++ // Crypto parameters (optional) ++ VolumeCryptoParameters crypto = 3; ++} ++ ++// Create device request ++message CreateDeviceRequest { ++ // Volume to immediately attach to the created device. This field may be ++ // optional for some device types (e.g. NVMe), while it may be required for ++ // others (e.g. virtio-blk). ++ VolumeParameters volume = 1; ++ // Device-specific parameters ++ oneof params { ++ // NVMe parameters ++ nvme.DeviceParameters nvme = 2; ++ // Virtio-blk parameters ++ virtio_blk.DeviceParameters virtio_blk = 3; ++ // NVMe/TCP parameters ++ nvmf_tcp.DeviceParameters nvmf_tcp = 4; ++ } ++} ++ ++// Create device response ++message CreateDeviceResponse { ++ // Device handle that can uniquely identify a device within an instance of ++ // Storage Management Agent ++ string handle = 1; ++} ++ ++// Delete device request ++message DeleteDeviceRequest { ++ // Device handle ++ string handle = 1; ++} ++ ++// Delete device response ++message DeleteDeviceResponse {} ++ ++// Attach volume request ++message AttachVolumeRequest { ++ // Volume parameters ++ VolumeParameters volume = 1; ++ // Device handle ++ string device_handle = 2; ++} ++ ++// Attach volume response ++message AttachVolumeResponse {} ++ ++// Detach volume request ++message DetachVolumeRequest { ++ // Volume GUID/UUID ++ bytes volume_id = 1; ++ // Device handle ++ string device_handle = 2; ++} ++ ++// Detach volume response ++message DetachVolumeResponse {} ++ ++// QoS limit values. 0 means unlimited, while UINT64_MAX means to leave the ++// current limit value unchanged. If one of the limits isn't supported by a ++// given device/volume, it must be set to 0. 
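To make the 0 vs. UINT64_MAX convention above concrete, a small sketch of building a SetQosRequest with the generated Python classes (the sma_pb2 module name is an assumption):

    import sma_pb2

    UNCHANGED = 2**64 - 1  # UINT64_MAX leaves the current limit value as is

    def build_set_qos(device_handle, rd_mbps):
        limit = sma_pb2.QosLimit(
            rd_iops=0,              # 0 = unlimited read IOPS
            wr_iops=UNCHANGED,      # keep whatever write-IOPS limit is in place
            rd_bandwidth=rd_mbps)   # cap read bandwidth in MB/s
        # Leaving volume_id empty applies the limits to the whole device.
        return sma_pb2.SetQosRequest(device_handle=device_handle, maximum=limit)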
++message QosLimit { ++ // Read kIOPS ++ uint64 rd_iops = 1; ++ // Write kIOPS ++ uint64 wr_iops = 2; ++ // Read/write kIOPS ++ uint64 rw_iops = 3; ++ // Read bandwidth (MB/s) ++ uint64 rd_bandwidth = 4; ++ // Write bandwidth (MB/s) ++ uint64 wr_bandwidth = 5; ++ // Read/write bandwidth (MB/s) ++ uint64 rw_bandwidth = 6; ++} ++ ++// SetQos request ++message SetQosRequest { ++ // Device handle ++ string device_handle = 1; ++ // GUID/UUID of a volume to configure QoS on. If this parameter is omitted, ++ // the QoS will be set up on the whole device (all volumes attached to that ++ // device will share QoS settings). Some device types might only support ++ // configuring QoS on per-device (volume_id must be empty) or per-volume level ++ // (volume_id cannot be empty). This information can be obtained by sending a ++ // GetQosCapabilities request. ++ bytes volume_id = 2; ++ // Maximum allowed IOPS/bandwidth ++ QosLimit maximum = 3; ++} ++ ++// SetQos response ++message SetQosResponse {} ++ ++// Get QoS capabilities request ++message GetQosCapabilitiesRequest { ++ // Type of a device to query for QoS capabilities ++ DeviceType device_type = 1; ++} ++ ++// Get QoS capabilities response ++message GetQosCapabilitiesResponse { ++ message QosCapabilities { ++ // Read IOPS ++ bool rd_iops = 1; ++ // Write IOPS ++ bool wr_iops = 2; ++ // Read/write IOPS ++ bool rw_iops = 3; ++ // Read bandwidth ++ bool rd_bandwidth = 4; ++ // Write bandwidth ++ bool wr_bandwidth = 5; ++ // Read/write bandwidth ++ bool rw_bandwidth = 6; ++ } ++ // Device level maximum QoS limits ++ QosCapabilities max_device_caps = 1; ++ // Volume level maximum QoS limits ++ QosCapabilities max_volume_caps = 2; ++}; ++ ++// Storage Management Agent gRPC service definition ++service StorageManagementAgent { ++ // Creates a new device. A device is an entity that can be used to expose ++ // volumes (e.g. an NVMeoF subsystem). ++ rpc CreateDevice (CreateDeviceRequest) ++ returns (CreateDeviceResponse) {} ++ // Deletes a device. It is only allowed to delete a device with volumes still ++ // attached if that device doesn't support attaching volumes through ++ // AttachVolume (e.g. virtio-blk). In other cases, it is forbidden and ++ // FAILED_PRECONDITION status will be returned. ++ rpc DeleteDevice (DeleteDeviceRequest) ++ returns (DeleteDeviceResponse) {} ++ // Attaches a volume to a specified device making it available through that ++ // device (e.g. for NVMeoF this results in adding a namespace to an NVMeoF ++ // subsystem). The type of volume doesn't need to match the type of device ++ // (e.g. it's perfectly fine to attach an NVMe/TCP volume to a virtio-blk ++ // device). 
++ rpc AttachVolume (AttachVolumeRequest) ++ returns (AttachVolumeResponse) {} ++ // Detaches a volume from a device ++ rpc DetachVolume (DetachVolumeRequest) ++ returns (DetachVolumeResponse) {} ++ // Configures QoS on a device/volume ++ rpc SetQos (SetQosRequest) ++ returns (SetQosResponse) {} ++ // Returns QoS capabilities of a given device type ++ rpc GetQosCapabilities (GetQosCapabilitiesRequest) ++ returns (GetQosCapabilitiesResponse) {} ++} +diff --git a/proto/virtio_blk.proto b/proto/virtio_blk.proto +index 82e4f2a..2252cfc 100644 +--- a/proto/virtio_blk.proto ++++ b/proto/virtio_blk.proto +@@ -1,12 +1,12 @@ +-syntax = "proto3"; +- +-package sma.virtio_blk; +- +-option go_package = "spdk.io/sma/virtio_blk"; +- +-message DeviceParameters { +- // Physical function index +- uint32 physical_id = 1; +- // Virtual function index +- uint32 virtual_id = 2; +-} ++syntax = "proto3"; ++ ++package sma.virtio_blk; ++ ++option go_package = "spdk.io/sma/virtio_blk"; ++ ++message DeviceParameters { ++ // Physical function index ++ uint32 physical_id = 1; ++ // Virtual function index ++ uint32 virtual_id = 2; ++} +diff --git a/python/spdk/__init__.py b/python/spdk/__init__.py +index 5e8c056..cf459ed 100644 +--- a/python/spdk/__init__.py ++++ b/python/spdk/__init__.py +@@ -1,3 +1,3 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation. +-# All rights reserved. ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation. ++# All rights reserved. +diff --git a/python/spdk/rpc/__init__.py b/python/spdk/rpc/__init__.py +index 0541544..6e86523 100644 +--- a/python/spdk/rpc/__init__.py ++++ b/python/spdk/rpc/__init__.py +@@ -1,217 +1,218 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +-import json +-import os +-import sys +- +-from io import IOBase as io +- +-from . import accel +-from . import app +-from . import bdev +-from . import blobfs +-from . import compressdev +-from . import env_dpdk +-from . import dsa +-from . import iaa +-from . import ioat +-from . import iscsi +-from . import log +-from . import lvol +-from . import nbd +-from . import ublk +-from . import notify +-from . import nvme +-from . import nvmf +-from . import pmem +-from . import subsystem +-from . import trace +-from . import vhost +-from . import vmd +-from . import sock +-from . import vfio_user +-from . import iobuf +-from . import dpdk_cryptodev +-from . import mlx5 +-from . import client as rpc_client +- +- +-def framework_start_init(client): +- """Start initialization of subsystems""" +- return client.call('framework_start_init') +- +- +-def framework_wait_init(client): +- """Block until subsystems have been initialized""" +- return client.call('framework_wait_init') +- +- +-def framework_disable_cpumask_locks(client): +- """ Disable CPU core lock files.""" +- return client.call('framework_disable_cpumask_locks') +- +- +-def framework_enable_cpumask_locks(client): +- """ Enable CPU core lock files.""" +- return client.call('framework_enable_cpumask_locks') +- +- +-def rpc_get_methods(client, current=None, include_aliases=None): +- """Get list of supported RPC methods. +- Args: +- current: Get list of RPC methods only callable in the current state. +- include_aliases: Include aliases in the list with RPC methods. 
+- """ +- params = {} +- +- if current: +- params['current'] = current +- if include_aliases: +- params['include_aliases'] = include_aliases +- +- return client.call('rpc_get_methods', params) +- +- +-def spdk_get_version(client): +- """Get SPDK version""" +- return client.call('spdk_get_version') +- +- +-def _json_dump(config, fd, indent): +- if indent is None: +- indent = 2 +- elif indent < 0: +- indent = None +- json.dump(config, fd, indent=indent) +- fd.write('\n') +- +- +-def _json_load(j): +- if j == sys.stdin or isinstance(j, io): +- json_conf = json.load(j) +- elif os.path.exists(j): +- with open(j, "r") as j: +- json_conf = json.load(j) +- else: +- json_conf = json.loads(j) +- return json_conf +- +- +-def save_config(client, fd, indent=2): +- """Write current (live) configuration of SPDK subsystems and targets to stdout. +- Args: +- fd: opened file descriptor where data will be saved +- indent: Indent level. Value less than 0 mean compact mode. +- Default indent level is 2. +- """ +- config = { +- 'subsystems': [] +- } +- +- for elem in client.call('framework_get_subsystems'): +- cfg = { +- 'subsystem': elem['subsystem'], +- 'config': client.call('framework_get_config', {"name": elem['subsystem']}) +- } +- config['subsystems'].append(cfg) +- +- _json_dump(config, fd, indent) +- +- +-def load_config(client, fd, include_aliases=False): +- """Configure SPDK subsystems and targets using JSON RPC read from stdin. +- Args: +- fd: opened file descriptor where data will be taken from +- """ +- json_config = _json_load(fd) +- +- # remove subsystems with no config +- subsystems = json_config['subsystems'] +- for subsystem in list(subsystems): +- if not subsystem['config']: +- subsystems.remove(subsystem) +- +- # check if methods in the config file are known +- allowed_methods = client.call('rpc_get_methods', {'include_aliases': include_aliases}) +- if not subsystems and 'framework_start_init' in allowed_methods: +- framework_start_init(client) +- return +- +- for subsystem in list(subsystems): +- config = subsystem['config'] +- for elem in list(config): +- if 'method' not in elem or elem['method'] not in allowed_methods: +- raise rpc_client.JSONRPCException("Unknown method was included in the config file") +- +- while subsystems: +- allowed_methods = client.call('rpc_get_methods', {'current': True, +- 'include_aliases': include_aliases}) +- allowed_found = False +- +- for subsystem in list(subsystems): +- config = subsystem['config'] +- for elem in list(config): +- if 'method' not in elem or elem['method'] not in allowed_methods: +- continue +- +- client.call(**elem) +- config.remove(elem) +- allowed_found = True +- +- if not config: +- subsystems.remove(subsystem) +- +- if 'framework_start_init' in allowed_methods: +- framework_start_init(client) +- allowed_found = True +- +- if not allowed_found: +- break +- +- if subsystems: +- print("Some configs were skipped because the RPC state that can call them passed over.") +- +- +-def save_subsystem_config(client, fd, indent=2, name=None): +- """Write current (live) configuration of SPDK subsystem to stdout. +- Args: +- fd: opened file descriptor where data will be saved +- indent: Indent level. Value less than 0 mean compact mode. +- Default is indent level 2. +- """ +- cfg = { +- 'subsystem': name, +- 'config': client.call('framework_get_config', {"name": name}) +- } +- +- _json_dump(cfg, fd, indent) +- +- +-def load_subsystem_config(client, fd): +- """Configure SPDK subsystem using JSON RPC read from stdin. 
+- Args: +- fd: opened file descriptor where data will be taken from +- """ +- subsystem = _json_load(fd) +- +- if not subsystem['config']: +- return +- +- allowed_methods = client.call('rpc_get_methods') +- config = subsystem['config'] +- for elem in list(config): +- if 'method' not in elem or elem['method'] not in allowed_methods: +- raise rpc_client.JSONRPCException("Unknown method was included in the config file") +- +- allowed_methods = client.call('rpc_get_methods', {'current': True}) +- for elem in list(config): +- if 'method' not in elem or elem['method'] not in allowed_methods: +- continue +- +- client.call(**elem) +- config.remove(elem) +- +- if config: +- print("Some configs were skipped because they cannot be called in the current RPC state.") ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++import json ++import os ++import sys ++ ++from io import IOBase as io ++ ++from . import accel ++from . import app ++from . import bdev ++from . import blobfs ++from . import compressdev ++from . import env_dpdk ++from . import dsa ++from . import iaa ++from . import ioat ++from . import iscsi ++from . import log ++from . import lvol ++from . import nbd ++from . import ublk ++from . import notify ++from . import nvme ++from . import nvmf ++from . import pmem ++from . import subsystem ++from . import trace ++from . import vhost ++from . import ssam ++from . import vmd ++from . import sock ++from . import vfio_user ++from . import iobuf ++from . import dpdk_cryptodev ++from . import mlx5 ++from . import client as rpc_client ++ ++ ++def framework_start_init(client): ++ """Start initialization of subsystems""" ++ return client.call('framework_start_init') ++ ++ ++def framework_wait_init(client): ++ """Block until subsystems have been initialized""" ++ return client.call('framework_wait_init') ++ ++ ++def framework_disable_cpumask_locks(client): ++ """ Disable CPU core lock files.""" ++ return client.call('framework_disable_cpumask_locks') ++ ++ ++def framework_enable_cpumask_locks(client): ++ """ Enable CPU core lock files.""" ++ return client.call('framework_enable_cpumask_locks') ++ ++ ++def rpc_get_methods(client, current=None, include_aliases=None): ++ """Get list of supported RPC methods. ++ Args: ++ current: Get list of RPC methods only callable in the current state. ++ include_aliases: Include aliases in the list with RPC methods. ++ """ ++ params = {} ++ ++ if current: ++ params['current'] = current ++ if include_aliases: ++ params['include_aliases'] = include_aliases ++ ++ return client.call('rpc_get_methods', params) ++ ++ ++def spdk_get_version(client): ++ """Get SPDK version""" ++ return client.call('spdk_get_version') ++ ++ ++def _json_dump(config, fd, indent): ++ if indent is None: ++ indent = 2 ++ elif indent < 0: ++ indent = None ++ json.dump(config, fd, indent=indent) ++ fd.write('\n') ++ ++ ++def _json_load(j): ++ if j == sys.stdin or isinstance(j, io): ++ json_conf = json.load(j) ++ elif os.path.exists(j): ++ with open(j, "r") as j: ++ json_conf = json.load(j) ++ else: ++ json_conf = json.loads(j) ++ return json_conf ++ ++ ++def save_config(client, fd, indent=2): ++ """Write current (live) configuration of SPDK subsystems and targets to stdout. ++ Args: ++ fd: opened file descriptor where data will be saved ++ indent: Indent level. Value less than 0 mean compact mode. ++ Default indent level is 2. 
++ """ ++ config = { ++ 'subsystems': [] ++ } ++ ++ for elem in client.call('framework_get_subsystems'): ++ cfg = { ++ 'subsystem': elem['subsystem'], ++ 'config': client.call('framework_get_config', {"name": elem['subsystem']}) ++ } ++ config['subsystems'].append(cfg) ++ ++ _json_dump(config, fd, indent) ++ ++ ++def load_config(client, fd, include_aliases=False): ++ """Configure SPDK subsystems and targets using JSON RPC read from stdin. ++ Args: ++ fd: opened file descriptor where data will be taken from ++ """ ++ json_config = _json_load(fd) ++ ++ # remove subsystems with no config ++ subsystems = json_config['subsystems'] ++ for subsystem in list(subsystems): ++ if not subsystem['config']: ++ subsystems.remove(subsystem) ++ ++ # check if methods in the config file are known ++ allowed_methods = client.call('rpc_get_methods', {'include_aliases': include_aliases}) ++ if not subsystems and 'framework_start_init' in allowed_methods: ++ framework_start_init(client) ++ return ++ ++ for subsystem in list(subsystems): ++ config = subsystem['config'] ++ for elem in list(config): ++ if 'method' not in elem or elem['method'] not in allowed_methods: ++ raise rpc_client.JSONRPCException("Unknown method was included in the config file") ++ ++ while subsystems: ++ allowed_methods = client.call('rpc_get_methods', {'current': True, ++ 'include_aliases': include_aliases}) ++ allowed_found = False ++ ++ for subsystem in list(subsystems): ++ config = subsystem['config'] ++ for elem in list(config): ++ if 'method' not in elem or elem['method'] not in allowed_methods: ++ continue ++ ++ client.call(**elem) ++ config.remove(elem) ++ allowed_found = True ++ ++ if not config: ++ subsystems.remove(subsystem) ++ ++ if 'framework_start_init' in allowed_methods: ++ framework_start_init(client) ++ allowed_found = True ++ ++ if not allowed_found: ++ break ++ ++ if subsystems: ++ print("Some configs were skipped because the RPC state that can call them passed over.") ++ ++ ++def save_subsystem_config(client, fd, indent=2, name=None): ++ """Write current (live) configuration of SPDK subsystem to stdout. ++ Args: ++ fd: opened file descriptor where data will be saved ++ indent: Indent level. Value less than 0 mean compact mode. ++ Default is indent level 2. ++ """ ++ cfg = { ++ 'subsystem': name, ++ 'config': client.call('framework_get_config', {"name": name}) ++ } ++ ++ _json_dump(cfg, fd, indent) ++ ++ ++def load_subsystem_config(client, fd): ++ """Configure SPDK subsystem using JSON RPC read from stdin. ++ Args: ++ fd: opened file descriptor where data will be taken from ++ """ ++ subsystem = _json_load(fd) ++ ++ if not subsystem['config']: ++ return ++ ++ allowed_methods = client.call('rpc_get_methods') ++ config = subsystem['config'] ++ for elem in list(config): ++ if 'method' not in elem or elem['method'] not in allowed_methods: ++ raise rpc_client.JSONRPCException("Unknown method was included in the config file") ++ ++ allowed_methods = client.call('rpc_get_methods', {'current': True}) ++ for elem in list(config): ++ if 'method' not in elem or elem['method'] not in allowed_methods: ++ continue ++ ++ client.call(**elem) ++ config.remove(elem) ++ ++ if config: ++ print("Some configs were skipped because they cannot be called in the current RPC state.") +diff --git a/python/spdk/rpc/accel.py b/python/spdk/rpc/accel.py +index e4cccc2..7bea2c2 100644 +--- a/python/spdk/rpc/accel.py ++++ b/python/spdk/rpc/accel.py +@@ -1,81 +1,81 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. 
+-# All rights reserved. +-# +- +-from spdk.rpc.helpers import deprecated_alias +- +- +-def accel_get_opc_assignments(client): +- """Get list of opcode name to module assignments. +- """ +- return client.call('accel_get_opc_assignments') +- +- +-@deprecated_alias('accel_get_engine_info') +-def accel_get_module_info(client): +- """Get list of valid module names and their operations. +- """ +- return client.call('accel_get_module_info') +- +- +-def accel_assign_opc(client, opname, module): +- """Manually assign an operation to a module. +- +- Args: +- opname: name of operation +- module: name of module +- """ +- params = { +- 'opname': opname, +- 'module': module, +- } +- +- return client.call('accel_assign_opc', params) +- +- +-def accel_crypto_key_create(client, cipher, key, key2, name): +- """Create Data Encryption Key Identifier. +- +- Args: +- cipher: cipher +- key: key +- key2: key2 +- name: key name +- """ +- params = { +- 'cipher': cipher, +- 'key': key, +- 'name': name, +- } +- if key2 is not None: +- params['key2'] = key2 +- +- return client.call('accel_crypto_key_create', params) +- +- +-def accel_crypto_key_destroy(client, name): +- """Destroy Data Encryption Key. +- +- Args: +- name: key name +- """ +- params = { +- 'name': name +- } +- +- return client.call('accel_crypto_key_destroy', params) +- +- +-def accel_crypto_keys_get(client, key_name): +- """Get a list of the crypto keys. +- +- Args: +- key_name: Get information about a specific key +- """ +- params = {} +- +- if key_name is not None: +- params['key_name'] = key_name +- +- return client.call('accel_crypto_keys_get', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++from spdk.rpc.helpers import deprecated_alias ++ ++ ++def accel_get_opc_assignments(client): ++ """Get list of opcode name to module assignments. ++ """ ++ return client.call('accel_get_opc_assignments') ++ ++ ++@deprecated_alias('accel_get_engine_info') ++def accel_get_module_info(client): ++ """Get list of valid module names and their operations. ++ """ ++ return client.call('accel_get_module_info') ++ ++ ++def accel_assign_opc(client, opname, module): ++ """Manually assign an operation to a module. ++ ++ Args: ++ opname: name of operation ++ module: name of module ++ """ ++ params = { ++ 'opname': opname, ++ 'module': module, ++ } ++ ++ return client.call('accel_assign_opc', params) ++ ++ ++def accel_crypto_key_create(client, cipher, key, key2, name): ++ """Create Data Encryption Key Identifier. ++ ++ Args: ++ cipher: cipher ++ key: key ++ key2: key2 ++ name: key name ++ """ ++ params = { ++ 'cipher': cipher, ++ 'key': key, ++ 'name': name, ++ } ++ if key2 is not None: ++ params['key2'] = key2 ++ ++ return client.call('accel_crypto_key_create', params) ++ ++ ++def accel_crypto_key_destroy(client, name): ++ """Destroy Data Encryption Key. ++ ++ Args: ++ name: key name ++ """ ++ params = { ++ 'name': name ++ } ++ ++ return client.call('accel_crypto_key_destroy', params) ++ ++ ++def accel_crypto_keys_get(client, key_name): ++ """Get a list of the crypto keys. 
++ ++ Args: ++ key_name: Get information about a specific key ++ """ ++ params = {} ++ ++ if key_name is not None: ++ params['key_name'] = key_name ++ ++ return client.call('accel_crypto_keys_get', params) +diff --git a/python/spdk/rpc/app.py b/python/spdk/rpc/app.py +index 319c3ba..7edbf75 100644 +--- a/python/spdk/rpc/app.py ++++ b/python/spdk/rpc/app.py +@@ -1,122 +1,122 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +- +-def spdk_kill_instance(client, sig_name): +- """Send a signal to the SPDK process. +- +- Args: +- sig_name: signal to send ("SIGINT", "SIGTERM", "SIGQUIT", "SIGHUP", or "SIGKILL") +- """ +- params = {'sig_name': sig_name} +- return client.call('spdk_kill_instance', params) +- +- +-def framework_monitor_context_switch(client, enabled=None): +- """Query or set state of context switch monitoring. +- +- Args: +- enabled: True to enable monitoring; False to disable monitoring; None to query (optional) +- +- Returns: +- Current context switch monitoring state (after applying enabled flag). +- """ +- params = {} +- if enabled is not None: +- params['enabled'] = enabled +- return client.call('framework_monitor_context_switch', params) +- +- +-def framework_get_reactors(client): +- """Query list of all reactors. +- +- Returns: +- List of all reactors. +- """ +- return client.call('framework_get_reactors') +- +- +-def framework_set_scheduler(client, name, period=None, load_limit=None, core_limit=None, +- core_busy=None): +- """Select threads scheduler that will be activated and its period. +- +- Args: +- name: Name of a scheduler +- period: Scheduler period in microseconds +- Returns: +- True or False +- """ +- params = {'name': name} +- if period is not None: +- params['period'] = period +- if load_limit is not None: +- params['load_limit'] = load_limit +- if core_limit is not None: +- params['core_limit'] = core_limit +- if core_busy is not None: +- params['core_busy'] = core_busy +- return client.call('framework_set_scheduler', params) +- +- +-def framework_get_scheduler(client): +- """Query currently set scheduler. +- +- Returns: +- Name, period (in microseconds) of currently set scheduler and name of currently set governor. +- """ +- return client.call('framework_get_scheduler') +- +- +-def thread_get_stats(client): +- """Query threads statistics. +- +- Returns: +- Current threads statistics. +- """ +- return client.call('thread_get_stats') +- +- +-def thread_set_cpumask(client, id, cpumask): +- """Set the cpumask of the thread whose ID matches to the specified value. +- +- Args: +- id: thread ID +- cpumask: cpumask for this thread +- +- Returns: +- True or False +- """ +- params = {'id': id, 'cpumask': cpumask} +- return client.call('thread_set_cpumask', params) +- +- +-def log_enable_timestamps(client, enabled): +- """Enable or disable timestamps. +- +- Args: +- value: on or off +- +- Returns: +- None +- """ +- params = {'enabled': enabled} +- return client.call('log_enable_timestamps', params) +- +- +-def thread_get_pollers(client): +- """Query current pollers. +- +- Returns: +- Current pollers. +- """ +- return client.call('thread_get_pollers') +- +- +-def thread_get_io_channels(client): +- """Query current IO channels. +- +- Returns: +- Current IO channels. +- """ +- return client.call('thread_get_io_channels') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. 
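The save_config()/load_config() helpers in python/spdk/rpc/__init__.py above capture or replay a whole application configuration over the JSON-RPC socket. A minimal usage sketch, assuming the JSONRPCClient helper from spdk.rpc.client and the default /var/tmp/spdk.sock socket path:

    import sys
    import spdk.rpc as rpc
    from spdk.rpc.client import JSONRPCClient

    client = JSONRPCClient('/var/tmp/spdk.sock')
    print(rpc.spdk_get_version(client))
    # Dump the live configuration of every subsystem to stdout as JSON.
    rpc.save_config(client, sys.stdout, indent=2)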
++ ++ ++def spdk_kill_instance(client, sig_name): ++ """Send a signal to the SPDK process. ++ ++ Args: ++ sig_name: signal to send ("SIGINT", "SIGTERM", "SIGQUIT", "SIGHUP", or "SIGKILL") ++ """ ++ params = {'sig_name': sig_name} ++ return client.call('spdk_kill_instance', params) ++ ++ ++def framework_monitor_context_switch(client, enabled=None): ++ """Query or set state of context switch monitoring. ++ ++ Args: ++ enabled: True to enable monitoring; False to disable monitoring; None to query (optional) ++ ++ Returns: ++ Current context switch monitoring state (after applying enabled flag). ++ """ ++ params = {} ++ if enabled is not None: ++ params['enabled'] = enabled ++ return client.call('framework_monitor_context_switch', params) ++ ++ ++def framework_get_reactors(client): ++ """Query list of all reactors. ++ ++ Returns: ++ List of all reactors. ++ """ ++ return client.call('framework_get_reactors') ++ ++ ++def framework_set_scheduler(client, name, period=None, load_limit=None, core_limit=None, ++ core_busy=None): ++ """Select threads scheduler that will be activated and its period. ++ ++ Args: ++ name: Name of a scheduler ++ period: Scheduler period in microseconds ++ Returns: ++ True or False ++ """ ++ params = {'name': name} ++ if period is not None: ++ params['period'] = period ++ if load_limit is not None: ++ params['load_limit'] = load_limit ++ if core_limit is not None: ++ params['core_limit'] = core_limit ++ if core_busy is not None: ++ params['core_busy'] = core_busy ++ return client.call('framework_set_scheduler', params) ++ ++ ++def framework_get_scheduler(client): ++ """Query currently set scheduler. ++ ++ Returns: ++ Name, period (in microseconds) of currently set scheduler and name of currently set governor. ++ """ ++ return client.call('framework_get_scheduler') ++ ++ ++def thread_get_stats(client): ++ """Query threads statistics. ++ ++ Returns: ++ Current threads statistics. ++ """ ++ return client.call('thread_get_stats') ++ ++ ++def thread_set_cpumask(client, id, cpumask): ++ """Set the cpumask of the thread whose ID matches to the specified value. ++ ++ Args: ++ id: thread ID ++ cpumask: cpumask for this thread ++ ++ Returns: ++ True or False ++ """ ++ params = {'id': id, 'cpumask': cpumask} ++ return client.call('thread_set_cpumask', params) ++ ++ ++def log_enable_timestamps(client, enabled): ++ """Enable or disable timestamps. ++ ++ Args: ++ value: on or off ++ ++ Returns: ++ None ++ """ ++ params = {'enabled': enabled} ++ return client.call('log_enable_timestamps', params) ++ ++ ++def thread_get_pollers(client): ++ """Query current pollers. ++ ++ Returns: ++ Current pollers. ++ """ ++ return client.call('thread_get_pollers') ++ ++ ++def thread_get_io_channels(client): ++ """Query current IO channels. ++ ++ Returns: ++ Current IO channels. ++ """ ++ return client.call('thread_get_io_channels') +diff --git a/python/spdk/rpc/bdev.py b/python/spdk/rpc/bdev.py +index bbbfcbe..bcaacc9 100644 +--- a/python/spdk/rpc/bdev.py ++++ b/python/spdk/rpc/bdev.py +@@ -1,1787 +1,1787 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +-# Copyright (c) 2022 Dell Inc, or its subsidiaries. +- +- +-def bdev_set_options(client, bdev_io_pool_size=None, bdev_io_cache_size=None, bdev_auto_examine=None, +- small_buf_pool_size=None, large_buf_pool_size=None): +- """Set parameters for the bdev subsystem. 
+- +- Args: +- bdev_io_pool_size: number of bdev_io structures in shared buffer pool (optional) +- bdev_io_cache_size: maximum number of bdev_io structures cached per thread (optional) +- bdev_auto_examine: if set to false, the bdev layer will not examine every disks automatically (optional) +- small_buf_pool_size: maximum number of small buffer (8KB buffer) pool size (optional) +- large_buf_pool_size: maximum number of large buffer (64KB buffer) pool size (optional) +- """ +- params = {} +- +- if bdev_io_pool_size: +- params['bdev_io_pool_size'] = bdev_io_pool_size +- if bdev_io_cache_size: +- params['bdev_io_cache_size'] = bdev_io_cache_size +- if bdev_auto_examine is not None: +- params["bdev_auto_examine"] = bdev_auto_examine +- if small_buf_pool_size: +- params['small_buf_pool_size'] = small_buf_pool_size +- if large_buf_pool_size: +- params['large_buf_pool_size'] = large_buf_pool_size +- return client.call('bdev_set_options', params) +- +- +-def bdev_examine(client, name): +- """Examine a bdev manually. If the bdev does not exist yet when this RPC is called, +- it will be examined when it is created +- +- Args: +- name: name of the bdev +- """ +- params = { +- 'name': name +- } +- return client.call('bdev_examine', params) +- +- +-def bdev_wait_for_examine(client): +- """Report when all bdevs have been examined +- """ +- return client.call('bdev_wait_for_examine') +- +- +-def bdev_compress_create(client, base_bdev_name, pm_path, lb_size): +- """Construct a compress virtual block device. +- +- Args: +- base_bdev_name: name of the underlying base bdev +- pm_path: path to persistent memory +- lb_size: logical block size for the compressed vol in bytes. Must be 4K or 512. +- +- Returns: +- Name of created virtual block device. +- """ +- params = {'base_bdev_name': base_bdev_name, 'pm_path': pm_path} +- +- if lb_size: +- params['lb_size'] = lb_size +- +- return client.call('bdev_compress_create', params) +- +- +-def bdev_compress_delete(client, name): +- """Delete compress virtual block device. +- +- Args: +- name: name of compress vbdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_compress_delete', params) +- +- +-def bdev_compress_get_orphans(client, name=None): +- """Get a list of comp bdevs that do not have a pmem file (aka orphaned). +- +- Args: +- name: comp bdev name to query (optional; if omitted, query all comp bdevs) +- +- Returns: +- List of comp bdev names. +- """ +- params = {} +- if name: +- params['name'] = name +- return client.call('bdev_compress_get_orphans', params) +- +- +-def bdev_crypto_create(client, base_bdev_name, name, crypto_pmd=None, key=None, cipher=None, key2=None, key_name=None): +- """Construct a crypto virtual block device. +- +- Args: +- base_bdev_name: name of the underlying base bdev +- name: name for the crypto vbdev +- crypto_pmd: name of the DPDK crypto driver to use +- key: key +- cipher: crypto algorithm to use +- key2: Optional second part of the key +- key_name: The key name to use in crypto operations +- +- Returns: +- Name of created virtual block device. 
+- """ +- params = {'base_bdev_name': base_bdev_name, 'name': name} +- +- if crypto_pmd is not None: +- params['crypto_pmd'] = crypto_pmd +- if key is not None: +- params['key'] = key +- if key2 is not None: +- params['key2'] = key2 +- if cipher is not None: +- params['cipher'] = cipher +- if key_name is not None: +- params['key_name'] = key_name +- return client.call('bdev_crypto_create', params) +- +- +-def bdev_crypto_delete(client, name): +- """Delete crypto virtual block device. +- +- Args: +- name: name of crypto vbdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_crypto_delete', params) +- +- +-def bdev_ocf_create(client, name, mode, cache_line_size, cache_bdev_name, core_bdev_name): +- """Add an OCF block device +- +- Args: +- name: name of constructed OCF bdev +- mode: OCF cache mode: {'wb', 'wt', 'pt', 'wa', 'wi', 'wo'} +- cache_line_size: OCF cache line size. The unit is KiB: {4, 8, 16, 32, 64} +- cache_bdev_name: name of underlying cache bdev +- core_bdev_name: name of underlying core bdev +- +- Returns: +- Name of created block device +- """ +- params = { +- 'name': name, +- 'mode': mode, +- 'cache_bdev_name': cache_bdev_name, +- 'core_bdev_name': core_bdev_name, +- } +- +- if cache_line_size: +- params['cache_line_size'] = cache_line_size +- +- return client.call('bdev_ocf_create', params) +- +- +-def bdev_ocf_delete(client, name): +- """Delete an OCF device +- +- Args: +- name: name of OCF bdev +- +- """ +- params = {'name': name} +- +- return client.call('bdev_ocf_delete', params) +- +- +-def bdev_ocf_get_stats(client, name): +- """Get statistics of chosen OCF block device +- +- Args: +- name: name of OCF bdev +- +- Returns: +- Statistics as json object +- """ +- params = {'name': name} +- +- return client.call('bdev_ocf_get_stats', params) +- +- +-def bdev_ocf_get_bdevs(client, name=None): +- """Get list of OCF devices including unregistered ones +- +- Args: +- name: name of OCF vbdev or name of cache device or name of core device (optional) +- +- Returns: +- Array of OCF devices with their current status +- """ +- params = None +- if name: +- params = {'name': name} +- return client.call('bdev_ocf_get_bdevs', params) +- +- +-def bdev_ocf_set_cache_mode(client, name, mode): +- """Set cache mode of OCF block device +- +- Args: +- name: name of OCF bdev +- mode: OCF cache mode: {'wb', 'wt', 'pt', 'wa', 'wi', 'wo'} +- +- Returns: +- New cache mode name +- """ +- params = { +- 'name': name, +- 'mode': mode, +- } +- +- return client.call('bdev_ocf_set_cache_mode', params) +- +- +-def bdev_ocf_set_seqcutoff(client, name, policy, threshold, promotion_count): +- """Set sequential cutoff parameters on all cores for the given OCF cache device +- +- Args: +- name: Name of OCF cache bdev +- policy: Sequential cutoff policy +- threshold: Activation threshold [KiB] (optional) +- promotion_count: Promotion request count (optional) +- """ +- params = { +- 'name': name, +- 'policy': policy, +- } +- if threshold: +- params['threshold'] = threshold +- if promotion_count: +- params['promotion_count'] = promotion_count +- +- return client.call('bdev_ocf_set_seqcutoff', params) +- +- +-def bdev_ocf_flush_start(client, name): +- """Start flushing OCF cache device +- +- Args: +- name: name of OCF bdev +- """ +- params = { +- 'name': name, +- } +- +- return client.call('bdev_ocf_flush_start', params) +- +- +-def bdev_ocf_flush_status(client, name): +- """Get flush status of OCF cache device +- +- Args: +- name: name of OCF bdev +- +- Returns: +- Flush status +- """ +- 
params = { +- 'name': name, +- } +- +- return client.call('bdev_ocf_flush_status', params) +- +- +-def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None, +- md_size=None, md_interleave=None, dif_type=None, dif_is_head_of_md=None): +- """Construct a malloc block device. +- +- Args: +- num_blocks: size of block device in blocks +- block_size: Data block size of device; must be a power of 2 and at least 512 +- name: name of block device (optional) +- uuid: UUID of block device (optional) +- optimal_io_boundary: Split on optimal IO boundary, in number of blocks, default 0 (disabled, optional) +- md_size: metadata size of device (0, 8, 16, 32, 64, or 128), default 0 (optional) +- md_interleave: metadata location, interleaved if set, and separated if omitted (optional) +- dif_type: protection information type (optional) +- dif_is_head_of_md: protection information is in the first 8 bytes of metadata (optional) +- +- Returns: +- Name of created block device. +- """ +- params = {'num_blocks': num_blocks, 'block_size': block_size} +- if name: +- params['name'] = name +- if uuid: +- params['uuid'] = uuid +- if optimal_io_boundary: +- params['optimal_io_boundary'] = optimal_io_boundary +- if md_size: +- params['md_size'] = md_size +- if md_interleave: +- params['md_interleave'] = md_interleave +- if dif_type: +- params['dif_type'] = dif_type +- if dif_is_head_of_md: +- params['dif_is_head_of_md'] = dif_is_head_of_md +- +- return client.call('bdev_malloc_create', params) +- +- +-def bdev_malloc_delete(client, name): +- """Delete malloc block device. +- +- Args: +- bdev_name: name of malloc bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_malloc_delete', params) +- +- +-def bdev_null_create(client, num_blocks, block_size, name, uuid=None, md_size=None, +- dif_type=None, dif_is_head_of_md=None): +- """Construct a null block device. +- +- Args: +- num_blocks: size of block device in blocks +- block_size: block size of device; data part size must be a power of 2 and at least 512 +- name: name of block device +- uuid: UUID of block device (optional) +- md_size: metadata size of device (optional) +- dif_type: protection information type (optional) +- dif_is_head_of_md: protection information is in the first 8 bytes of metadata (optional) +- +- Returns: +- Name of created block device. +- """ +- params = {'name': name, 'num_blocks': num_blocks, +- 'block_size': block_size} +- if uuid: +- params['uuid'] = uuid +- if md_size: +- params['md_size'] = md_size +- if dif_type: +- params['dif_type'] = dif_type +- if dif_is_head_of_md: +- params['dif_is_head_of_md'] = dif_is_head_of_md +- return client.call('bdev_null_create', params) +- +- +-def bdev_null_delete(client, name): +- """Remove null bdev from the system. +- +- Args: +- name: name of null bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_null_delete', params) +- +- +-def bdev_null_resize(client, name, new_size): +- """Resize null bdev in the system. +- +- Args: +- name: name of null bdev to resize +- new_size: new bdev size of resize operation. 
The unit is MiB +- """ +- params = { +- 'name': name, +- 'new_size': new_size, +- } +- return client.call('bdev_null_resize', params) +- +- +-def bdev_raid_get_bdevs(client, category): +- """Get list of raid bdevs based on category +- +- Args: +- category: any one of all or online or configuring or offline +- +- Returns: +- List of raid bdev details +- """ +- params = {'category': category} +- return client.call('bdev_raid_get_bdevs', params) +- +- +-def bdev_raid_create(client, name, raid_level, base_bdevs, strip_size=None, strip_size_kb=None): +- """Create raid bdev. Either strip size arg will work but one is required. +- +- Args: +- name: user defined raid bdev name +- strip_size (deprecated): strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc +- strip_size_kb: strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc +- raid_level: raid level of raid bdev, supported values 0 +- base_bdevs: Space separated names of Nvme bdevs in double quotes, like "Nvme0n1 Nvme1n1 Nvme2n1" +- +- Returns: +- None +- """ +- params = {'name': name, 'raid_level': raid_level, 'base_bdevs': base_bdevs} +- +- if strip_size: +- params['strip_size'] = strip_size +- +- if strip_size_kb: +- params['strip_size_kb'] = strip_size_kb +- +- return client.call('bdev_raid_create', params) +- +- +-def bdev_raid_delete(client, name): +- """Delete raid bdev +- +- Args: +- name: raid bdev name +- +- Returns: +- None +- """ +- params = {'name': name} +- return client.call('bdev_raid_delete', params) +- +- +-def bdev_aio_create(client, filename, name, block_size=None, readonly=False): +- """Construct a Linux AIO block device. +- +- Args: +- filename: path to device or file (ex: /dev/sda) +- name: name of block device +- block_size: block size of device (optional; autodetected if omitted) +- readonly: set aio bdev as read-only +- +- Returns: +- Name of created block device. +- """ +- params = {'name': name, +- 'filename': filename} +- +- if block_size: +- params['block_size'] = block_size +- +- if readonly: +- params['readonly'] = readonly +- +- return client.call('bdev_aio_create', params) +- +- +-def bdev_aio_rescan(client, name): +- """Rescan a Linux AIO block device. +- +- Args: +- bdev_name: name of aio bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_aio_rescan', params) +- +- +-def bdev_aio_delete(client, name): +- """Remove aio bdev from the system. +- +- Args: +- bdev_name: name of aio bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_aio_delete', params) +- +- +-def bdev_uring_create(client, filename, name, block_size=None): +- """Create a bdev with Linux io_uring backend. +- +- Args: +- filename: path to device or file (ex: /dev/nvme0n1) +- name: name of bdev +- block_size: block size of device (optional; autodetected if omitted) +- +- Returns: +- Name of created bdev. +- """ +- params = {'name': name, +- 'filename': filename} +- +- if block_size: +- params['block_size'] = block_size +- +- return client.call('bdev_uring_create', params) +- +- +-def bdev_uring_delete(client, name): +- """Delete a uring bdev. +- +- Args: +- name: name of uring bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_uring_delete', params) +- +- +-def bdev_xnvme_create(client, filename, name, io_mechanism, conserve_cpu=None): +- """Create a bdev with xNVMe backend. 
+- +- Args: +- filename: path to device or file (ex: /dev/nvme0n1) +- name: name of xNVMe bdev to create +- io_mechanism: I/O mechanism to use (ex: io_uring, io_uring_cmd, etc.) +- conserve_cpu: Whether or not to conserve CPU when polling (default: False) +- +- Returns: +- Name of created bdev. +- """ +- params = { +- 'name': name, +- 'filename': filename, +- 'io_mechanism': io_mechanism, +- } +- if conserve_cpu: +- params['conserve_cpu'] = conserve_cpu +- +- return client.call('bdev_xnvme_create', params) +- +- +-def bdev_xnvme_delete(client, name): +- """Delete a xNVMe bdev. +- +- Args: +- name: name of xNVMe bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_xnvme_delete', params) +- +- +-def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeout_admin_us=None, +- keep_alive_timeout_ms=None, retry_count=None, arbitration_burst=None, +- low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None, +- nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, +- delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, +- transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, +- fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None, +- transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None): +- """Set options for the bdev nvme. This is startup command. +- +- Args: +- action_on_timeout: action to take on command time out. Valid values are: none, reset, abort (optional) +- timeout_us: Timeout for each command, in microseconds. If 0, don't track timeouts (optional) +- timeout_admin_us: Timeout for each admin command, in microseconds. If 0, treat same as io timeouts (optional) +- keep_alive_timeout_ms: Keep alive timeout period in millisecond, default is 10s (optional) +- retry_count: The number of attempts per I/O when an I/O fails (deprecated) (optional) +- arbitration_burst: The value is expressed as a power of two (optional) +- low_priority_weight: The number of commands that may be executed from the low priority queue at one time (optional) +- medium_priority_weight: The number of commands that may be executed from the medium priority queue at one time (optional) +- high_priority_weight: The number of commands that may be executed from the high priority queue at one time (optional) +- nvme_adminq_poll_period_us: How often the admin queue is polled for asynchronous events in microseconds (optional) +- nvme_ioq_poll_period_us: How often to poll I/O queues for completions in microseconds (optional) +- io_queue_requests: The number of requests allocated for each NVMe I/O queue. Default: 512 (optional) +- delay_cmd_submit: Enable delayed NVMe command submission to allow batching of multiple commands (optional) +- transport_retry_count: The number of attempts per I/O in the transport layer when an I/O fails (optional) +- bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional) +- transport_ack_timeout: Time to wait ack until packet retransmission for RDMA or until closes connection for TCP. +- Range 0-31 where 0 is driver-specific default value (optional) +- ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. +- -1 means infinite reconnect retries. 0 means no reconnect retry. +- If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. 
+- If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. +- This can be overridden by bdev_nvme_attach_controller. (optional) +- reconnect_delay_sec: Time to delay a reconnect retry. +- If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. +- If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. +- If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. +- This can be overridden by bdev_nvme_attach_controller. (optional) +- fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. +- 0 means no such timeout. +- If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than +- ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. +- This can be overridden by bdev_nvme_attach_controller. (optional) +- disable_auto_failback: Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback. +- By default, immediately failback to the preferred I/O path if it is restored. (optional) +- generate_uuids: Enable generation of unique identifiers for NVMe bdevs only if they do not provide UUID themselves. +- These strings are based on device serial number and namespace ID and will always be the same for that device. +- transport_tos: IPv4 Type of Service value. Only applicable for RDMA transports. +- The default is 0 which means no TOS is applied. (optional) +- nvme_error_stat: Enable collecting NVMe error counts. (optional) +- rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional) +- io_path_stat: Enable collection I/O path stat of each io path. (optional) +- +- """ +- params = {} +- +- if action_on_timeout: +- params['action_on_timeout'] = action_on_timeout +- +- if timeout_us is not None: +- params['timeout_us'] = timeout_us +- +- if timeout_admin_us is not None: +- params['timeout_admin_us'] = timeout_admin_us +- +- if keep_alive_timeout_ms is not None: +- params['keep_alive_timeout_ms'] = keep_alive_timeout_ms +- +- if retry_count is not None: +- print("WARNING: retry_count is deprecated, please use transport_retry_count.") +- params['retry_count'] = retry_count +- +- if arbitration_burst is not None: +- params['arbitration_burst'] = arbitration_burst +- +- if low_priority_weight is not None: +- params['low_priority_weight'] = low_priority_weight +- +- if medium_priority_weight is not None: +- params['medium_priority_weight'] = medium_priority_weight +- +- if high_priority_weight is not None: +- params['high_priority_weight'] = high_priority_weight +- +- if nvme_adminq_poll_period_us: +- params['nvme_adminq_poll_period_us'] = nvme_adminq_poll_period_us +- +- if nvme_ioq_poll_period_us is not None: +- params['nvme_ioq_poll_period_us'] = nvme_ioq_poll_period_us +- +- if io_queue_requests is not None: +- params['io_queue_requests'] = io_queue_requests +- +- if delay_cmd_submit is not None: +- params['delay_cmd_submit'] = delay_cmd_submit +- +- if transport_retry_count is not None: +- params['transport_retry_count'] = transport_retry_count +- +- if bdev_retry_count is not None: +- params['bdev_retry_count'] = bdev_retry_count +- +- if transport_ack_timeout is not None: +- params['transport_ack_timeout'] = transport_ack_timeout +- +- if ctrlr_loss_timeout_sec is not None: +- params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec +- +- if reconnect_delay_sec is not None: +- params['reconnect_delay_sec'] = 
reconnect_delay_sec +- +- if fast_io_fail_timeout_sec is not None: +- params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec +- +- if disable_auto_failback is not None: +- params['disable_auto_failback'] = disable_auto_failback +- +- if generate_uuids is not None: +- params['generate_uuids'] = generate_uuids +- +- if transport_tos is not None: +- params['transport_tos'] = transport_tos +- +- if nvme_error_stat is not None: +- params['nvme_error_stat'] = nvme_error_stat +- +- if rdma_srq_size is not None: +- params['rdma_srq_size'] = rdma_srq_size +- +- if io_path_stat is not None: +- params['io_path_stat'] = io_path_stat +- +- return client.call('bdev_nvme_set_options', params) +- +- +-def bdev_nvme_set_hotplug(client, enable, period_us=None): +- """Set options for the bdev nvme. This is startup command. +- +- Args: +- enable: True to enable hotplug, False to disable. +- period_us: how often the hotplug is processed for insert and remove events. Set 0 to reset to default. (optional) +- """ +- params = {'enable': enable} +- +- if period_us: +- params['period_us'] = period_us +- +- return client.call('bdev_nvme_set_hotplug', params) +- +- +-def bdev_nvme_attach_controller(client, name, trtype, traddr, adrfam=None, trsvcid=None, +- priority=None, subnqn=None, hostnqn=None, hostaddr=None, +- hostsvcid=None, prchk_reftag=None, prchk_guard=None, +- hdgst=None, ddgst=None, fabrics_timeout=None, multipath=None, num_io_queues=None, +- ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, +- fast_io_fail_timeout_sec=None, psk=None): +- """Construct block device for each NVMe namespace in the attached controller. +- +- Args: +- name: bdev name prefix; "n" + namespace ID will be appended to create unique names +- trtype: transport type ("PCIe", "RDMA", "FC", "TCP") +- traddr: transport address (PCI BDF or IP address) +- adrfam: address family ("IPv4", "IPv6", "IB", or "FC") +- trsvcid: transport service ID (port number for IP-based addresses) +- priority: transport connection priority (Sock priority for TCP-based transports; optional) +- subnqn: subsystem NQN to connect to (optional) +- hostnqn: NQN to connect from (optional) +- hostaddr: host transport address (IP address for IP-based transports, NULL for PCIe or FC; optional) +- hostsvcid: host transport service ID (port number for IP-based transports, NULL for PCIe or FC; optional) +- prchk_reftag: Enable checking of PI reference tag for I/O processing (optional) +- prchk_guard: Enable checking of PI guard for I/O processing (optional) +- hdgst: Enable TCP header digest (optional) +- ddgst: Enable TCP data digest (optional) +- fabrics_timeout: Fabrics connect timeout in us (optional) +- multipath: The behavior when multiple paths are created ("disable", "failover", or "multipath"; failover if not specified) +- num_io_queues: The number of IO queues to request during initialization. (optional) +- ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. +- -1 means infinite reconnect retries. 0 means no reconnect retry. +- If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. +- If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. +- (optional) +- reconnect_delay_sec: Time to delay a reconnect retry. +- If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. +- If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. 
+- If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. +- (optional) +- fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. +- 0 means no such timeout. +- If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than +- ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. (optional) +- psk: Set PSK and enable TCP SSL socket implementation (optional) +- +- Returns: +- Names of created block devices. +- """ +- params = {'name': name, +- 'trtype': trtype, +- 'traddr': traddr} +- +- if hostnqn: +- params['hostnqn'] = hostnqn +- +- if hostaddr: +- params['hostaddr'] = hostaddr +- +- if hostsvcid: +- params['hostsvcid'] = hostsvcid +- +- if adrfam: +- params['adrfam'] = adrfam +- +- if trsvcid: +- params['trsvcid'] = trsvcid +- +- if priority: +- params['priority'] = priority +- +- if subnqn: +- params['subnqn'] = subnqn +- +- if prchk_reftag: +- params['prchk_reftag'] = prchk_reftag +- +- if prchk_guard: +- params['prchk_guard'] = prchk_guard +- +- if hdgst: +- params['hdgst'] = hdgst +- +- if ddgst: +- params['ddgst'] = ddgst +- +- if fabrics_timeout: +- params['fabrics_connect_timeout_us'] = fabrics_timeout +- +- if multipath: +- params['multipath'] = multipath +- +- if num_io_queues: +- params['num_io_queues'] = num_io_queues +- +- if ctrlr_loss_timeout_sec is not None: +- params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec +- +- if reconnect_delay_sec is not None: +- params['reconnect_delay_sec'] = reconnect_delay_sec +- +- if fast_io_fail_timeout_sec is not None: +- params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec +- +- if psk: +- params['psk'] = psk +- +- return client.call('bdev_nvme_attach_controller', params) +- +- +-def bdev_nvme_detach_controller(client, name, trtype=None, traddr=None, +- adrfam=None, trsvcid=None, subnqn=None, +- hostaddr=None, hostsvcid=None): +- """Detach NVMe controller and delete any associated bdevs. Optionally, +- If all of the transport ID options are specified, only remove that +- transport path from the specified controller. If that is the only +- available path for the controller, this will also result in the +- controller being detached and the associated bdevs being deleted. +- +- Args: +- name: controller name +- trtype: transport type ("PCIe", "RDMA") +- traddr: transport address (PCI BDF or IP address) +- adrfam: address family ("IPv4", "IPv6", "IB", or "FC") +- trsvcid: transport service ID (port number for IP-based addresses) +- subnqn: subsystem NQN to connect to (optional) +- hostaddr: Host address (IP address) +- hostsvcid: transport service ID on host side (port number) +- """ +- +- params = {'name': name} +- +- if trtype: +- params['trtype'] = trtype +- +- if traddr: +- params['traddr'] = traddr +- +- if adrfam: +- params['adrfam'] = adrfam +- +- if trsvcid: +- params['trsvcid'] = trsvcid +- +- if subnqn: +- params['subnqn'] = subnqn +- +- if hostaddr: +- params['hostaddr'] = hostaddr +- +- if hostsvcid: +- params['hostsvcid'] = hostsvcid +- +- return client.call('bdev_nvme_detach_controller', params) +- +- +-def bdev_nvme_reset_controller(client, name): +- """Reset NVMe controller. 
+- +- Args: +- name: controller name +- """ +- +- params = {'name': name} +- +- return client.call('bdev_nvme_reset_controller', params) +- +- +-def bdev_nvme_start_discovery(client, name, trtype, traddr, adrfam=None, trsvcid=None, +- hostnqn=None, wait_for_attach=None, ctrlr_loss_timeout_sec=None, +- reconnect_delay_sec=None, fast_io_fail_timeout_sec=None, +- attach_timeout_ms=None): +- """Start discovery with the specified discovery subsystem +- +- Args: +- name: bdev name prefix; "n" + namespace ID will be appended to create unique names +- trtype: transport type ("PCIe", "RDMA", "FC", "TCP") +- traddr: transport address (PCI BDF or IP address) +- adrfam: address family ("IPv4", "IPv6", "IB", or "FC") +- trsvcid: transport service ID (port number for IP-based addresses) +- hostnqn: NQN to connect from (optional) +- wait_for_attach: Wait to complete RPC until all discovered NVM subsystems have attached (optional) +- ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. +- -1 means infinite reconnect retries. 0 means no reconnect retry. +- If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. +- If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. +- (optional) +- reconnect_delay_sec: Time to delay a reconnect retry. +- If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. +- If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. +- If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. +- (optional) +- fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. +- 0 means no such timeout. +- If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than +- ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. 
(optional) +- attach_timeout_ms: Time to wait until the discovery and all discovered NVM subsystems are attached (optional) +- """ +- params = {'name': name, +- 'trtype': trtype, +- 'traddr': traddr} +- +- if hostnqn: +- params['hostnqn'] = hostnqn +- +- if adrfam: +- params['adrfam'] = adrfam +- +- if trsvcid: +- params['trsvcid'] = trsvcid +- +- if wait_for_attach: +- params['wait_for_attach'] = True +- +- if attach_timeout_ms is not None: +- params['attach_timeout_ms'] = attach_timeout_ms +- +- if ctrlr_loss_timeout_sec is not None: +- params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec +- +- if reconnect_delay_sec is not None: +- params['reconnect_delay_sec'] = reconnect_delay_sec +- +- if fast_io_fail_timeout_sec is not None: +- params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec +- +- return client.call('bdev_nvme_start_discovery', params) +- +- +-def bdev_nvme_stop_discovery(client, name): +- """Stop a previously started discovery service +- +- Args: +- name: name of discovery service to start +- """ +- params = {'name': name} +- +- return client.call('bdev_nvme_stop_discovery', params) +- +- +-def bdev_nvme_get_discovery_info(client): +- """Get information about the automatic discovery +- """ +- return client.call('bdev_nvme_get_discovery_info') +- +- +-def bdev_nvme_get_io_paths(client, name): +- """Display all or the specified NVMe bdev's active I/O paths +- +- Args: +- name: Name of the NVMe bdev (optional) +- +- Returns: +- List of active I/O paths +- """ +- params = {} +- if name: +- params['name'] = name +- return client.call('bdev_nvme_get_io_paths', params) +- +- +-def bdev_nvme_set_preferred_path(client, name, cntlid): +- """Set the preferred I/O path for an NVMe bdev when in multipath mode +- +- Args: +- name: NVMe bdev name +- cntlid: NVMe-oF controller ID +- """ +- +- params = {'name': name, +- 'cntlid': cntlid} +- +- return client.call('bdev_nvme_set_preferred_path', params) +- +- +-def bdev_nvme_set_multipath_policy(client, name, policy, selector, rr_min_io): +- """Set multipath policy of the NVMe bdev +- +- Args: +- name: NVMe bdev name +- policy: Multipath policy (active_passive or active_active) +- selector: Multipath selector (round_robin, queue_depth) +- rr_min_io: Number of IO to route to a path before switching to another one (optional) +- """ +- +- params = {'name': name, +- 'policy': policy} +- if selector: +- params['selector'] = selector +- if rr_min_io: +- params['rr_min_io'] = rr_min_io +- +- return client.call('bdev_nvme_set_multipath_policy', params) +- +- +-def bdev_nvme_get_path_iostat(client, name): +- """Get I/O statistics for IO paths of the block device. +- +- Args: +- name: bdev name to query +- +- Returns: +- I/O statistics for IO paths of the requested block device. +- """ +- params = {'name': name} +- +- return client.call('bdev_nvme_get_path_iostat', params) +- +- +-def bdev_nvme_cuse_register(client, name): +- """Register CUSE devices on NVMe controller. +- +- Args: +- name: Name of the operating NVMe controller +- """ +- params = {'name': name} +- +- return client.call('bdev_nvme_cuse_register', params) +- +- +-def bdev_nvme_cuse_unregister(client, name): +- """Unregister CUSE devices on NVMe controller. +- +- Args: +- name: Name of the operating NVMe controller +- """ +- params = {'name': name} +- +- return client.call('bdev_nvme_cuse_unregister', params) +- +- +-def bdev_zone_block_create(client, name, base_bdev, zone_capacity, optimal_open_zones): +- """Creates a virtual zone device on top of existing non-zoned bdev. 
+- +- Args: +- name: Zone device name +- base_bdev: Base Nvme bdev name +- zone_capacity: Surfaced zone capacity in blocks +- optimal_open_zones: Number of zones required to reach optimal write speed (optional, default: 1) +- +- Returns: +- Name of created block device. +- """ +- params = {'name': name, +- 'base_bdev': base_bdev, +- 'zone_capacity': zone_capacity, +- 'optimal_open_zones': optimal_open_zones} +- +- return client.call('bdev_zone_block_create', params) +- +- +-def bdev_zone_block_delete(client, name): +- """Remove block zone bdev from the system. +- +- Args: +- name: name of block zone bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_zone_block_delete', params) +- +- +-def bdev_rbd_register_cluster(client, name, user=None, config_param=None, config_file=None, key_file=None): +- """Create a Rados Cluster object of the Ceph RBD backend. +- +- Args: +- name: name of Rados Cluster +- user: Ceph user name (optional) +- config_param: map of config keys to values (optional) +- config_file: file path of Ceph configuration file (optional) +- key_file: file path of Ceph key file (optional) +- +- Returns: +- Name of registered Rados Cluster object. +- """ +- params = {'name': name} +- +- if user is not None: +- params['user_id'] = user +- if config_param is not None: +- params['config_param'] = config_param +- if config_file is not None: +- params['config_file'] = config_file +- if key_file is not None: +- params['key_file'] = key_file +- +- return client.call('bdev_rbd_register_cluster', params) +- +- +-def bdev_rbd_unregister_cluster(client, name): +- """Remove Rados cluster object from the system. +- +- Args: +- name: name of Rados cluster object to unregister +- """ +- params = {'name': name} +- return client.call('bdev_rbd_unregister_cluster', params) +- +- +-def bdev_rbd_get_clusters_info(client, name): +- """Get the cluster(s) info +- +- Args: +- name: name of Rados cluster object to query (optional; if omitted, query all clusters) +- +- Returns: +- List of registered Rados cluster information objects. +- """ +- params = {} +- if name: +- params['name'] = name +- return client.call('bdev_rbd_get_clusters_info', params) +- +- +-def bdev_rbd_create(client, pool_name, rbd_name, block_size, name=None, user=None, config=None, cluster_name=None, uuid=None): +- """Create a Ceph RBD block device. +- +- Args: +- pool_name: Ceph RBD pool name +- rbd_name: Ceph RBD image name +- block_size: block size of RBD volume +- name: name of block device (optional) +- user: Ceph user name (optional) +- config: map of config keys to values (optional) +- cluster_name: Name to identify Rados cluster (optional) +- uuid: UUID of block device (optional) +- +- Returns: +- Name of created block device. +- """ +- params = { +- 'pool_name': pool_name, +- 'rbd_name': rbd_name, +- 'block_size': block_size, +- } +- +- if name: +- params['name'] = name +- if user is not None: +- params['user_id'] = user +- if config is not None: +- params['config'] = config +- if cluster_name is not None: +- params['cluster_name'] = cluster_name +- else: +- print("WARNING:bdev_rbd_create should be used with specifying -c to have a cluster name after bdev_rbd_register_cluster.") +- if uuid is not None: +- params['uuid'] = uuid +- +- return client.call('bdev_rbd_create', params) +- +- +-def bdev_rbd_delete(client, name): +- """Remove rbd bdev from the system. 
+- +- Args: +- name: name of rbd bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_rbd_delete', params) +- +- +-def bdev_rbd_resize(client, name, new_size): +- """Resize rbd bdev in the system. +- +- Args: +- name: name of rbd bdev to resize +- new_size: new bdev size of resize operation. The unit is MiB +- """ +- params = { +- 'name': name, +- 'new_size': new_size, +- } +- return client.call('bdev_rbd_resize', params) +- +- +-def bdev_error_create(client, base_name): +- """Construct an error injection block device. +- +- Args: +- base_name: base bdev name +- """ +- params = {'base_name': base_name} +- return client.call('bdev_error_create', params) +- +- +-def bdev_delay_create(client, base_bdev_name, name, avg_read_latency, p99_read_latency, avg_write_latency, p99_write_latency): +- """Construct a delay block device. +- +- Args: +- base_bdev_name: name of the existing bdev +- name: name of block device +- avg_read_latency: complete 99% of read ops with this delay +- p99_read_latency: complete 1% of read ops with this delay +- avg_write_latency: complete 99% of write ops with this delay +- p99_write_latency: complete 1% of write ops with this delay +- +- Returns: +- Name of created block device. +- """ +- params = { +- 'base_bdev_name': base_bdev_name, +- 'name': name, +- 'avg_read_latency': avg_read_latency, +- 'p99_read_latency': p99_read_latency, +- 'avg_write_latency': avg_write_latency, +- 'p99_write_latency': p99_write_latency, +- } +- return client.call('bdev_delay_create', params) +- +- +-def bdev_delay_delete(client, name): +- """Remove delay bdev from the system. +- +- Args: +- name: name of delay bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_delay_delete', params) +- +- +-def bdev_delay_update_latency(client, delay_bdev_name, latency_type, latency_us): +- """Update the latency value for a delay block device +- +- Args: +- delay_bdev_name: name of the delay bdev +- latency_type: 'one of: avg_read, avg_write, p99_read, p99_write. No other values accepted.' +- latency_us: 'new latency value.' +- +- Returns: +- True if successful, or a specific error otherwise. +- """ +- params = { +- 'delay_bdev_name': delay_bdev_name, +- 'latency_type': latency_type, +- 'latency_us': latency_us, +- } +- return client.call('bdev_delay_update_latency', params) +- +- +-def bdev_error_delete(client, name): +- """Remove error bdev from the system. +- +- Args: +- bdev_name: name of error bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_error_delete', params) +- +- +-def bdev_iscsi_set_options(client, timeout_sec): +- """Set options for the bdev iscsi. +- +- Args: +- timeout_sec: Timeout for command, in seconds, if 0, don't track timeout +- """ +- params = {} +- +- if timeout_sec is not None: +- params['timeout_sec'] = timeout_sec +- +- return client.call('bdev_iscsi_set_options', params) +- +- +-def bdev_iscsi_create(client, name, url, initiator_iqn): +- """Construct an iSCSI block device. +- +- Args: +- name: name of block device +- url: iSCSI URL +- initiator_iqn: IQN name to be used by initiator +- +- Returns: +- Name of created block device. +- """ +- params = { +- 'name': name, +- 'url': url, +- 'initiator_iqn': initiator_iqn, +- } +- return client.call('bdev_iscsi_create', params) +- +- +-def bdev_iscsi_delete(client, name): +- """Remove iSCSI bdev from the system. 
+- +- Args: +- bdev_name: name of iSCSI bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_iscsi_delete', params) +- +- +-def bdev_pmem_create(client, pmem_file, name): +- """Construct a libpmemblk block device. +- +- Args: +- pmem_file: path to pmemblk pool file +- name: name of block device +- +- Returns: +- Name of created block device. +- """ +- params = { +- 'pmem_file': pmem_file, +- 'name': name +- } +- return client.call('bdev_pmem_create', params) +- +- +-def bdev_pmem_delete(client, name): +- """Remove pmem bdev from the system. +- +- Args: +- name: name of pmem bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_pmem_delete', params) +- +- +-def bdev_passthru_create(client, base_bdev_name, name): +- """Construct a pass-through block device. +- +- Args: +- base_bdev_name: name of the existing bdev +- name: name of block device +- +- Returns: +- Name of created block device. +- """ +- params = { +- 'base_bdev_name': base_bdev_name, +- 'name': name, +- } +- return client.call('bdev_passthru_create', params) +- +- +-def bdev_passthru_delete(client, name): +- """Remove pass through bdev from the system. +- +- Args: +- name: name of pass through bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_passthru_delete', params) +- +- +-def bdev_opal_create(client, nvme_ctrlr_name, nsid, locking_range_id, range_start, range_length, password): +- """Create opal virtual block devices from a base nvme bdev. +- +- Args: +- nvme_ctrlr_name: name of the nvme ctrlr +- nsid: namespace ID of nvme ctrlr +- locking_range_id: locking range ID corresponding to this virtual bdev +- range_start: start address of this locking range +- range_length: length of this locking range +- password: admin password of base nvme bdev +- +- Returns: +- Name of the new created block devices. +- """ +- params = { +- 'nvme_ctrlr_name': nvme_ctrlr_name, +- 'nsid': nsid, +- 'locking_range_id': locking_range_id, +- 'range_start': range_start, +- 'range_length': range_length, +- 'password': password, +- } +- +- return client.call('bdev_opal_create', params) +- +- +-def bdev_opal_get_info(client, bdev_name, password): +- """Get opal locking range info. +- +- Args: +- bdev_name: name of opal vbdev to get info +- password: admin password +- +- Returns: +- Locking range info. +- """ +- params = { +- 'bdev_name': bdev_name, +- 'password': password, +- } +- +- return client.call('bdev_opal_get_info', params) +- +- +-def bdev_opal_delete(client, bdev_name, password): +- """Delete opal virtual bdev from the system. +- +- Args: +- bdev_name: name of opal vbdev to delete +- password: admin password of base nvme bdev +- """ +- params = { +- 'bdev_name': bdev_name, +- 'password': password, +- } +- +- return client.call('bdev_opal_delete', params) +- +- +-def bdev_opal_new_user(client, bdev_name, admin_password, user_id, user_password): +- """Add a user to opal bdev who can set lock state for this bdev. +- +- Args: +- bdev_name: name of opal vbdev +- admin_password: admin password +- user_id: ID of the user who will be added to this opal bdev +- user_password: password set for this user +- """ +- params = { +- 'bdev_name': bdev_name, +- 'admin_password': admin_password, +- 'user_id': user_id, +- 'user_password': user_password, +- } +- +- return client.call('bdev_opal_new_user', params) +- +- +-def bdev_opal_set_lock_state(client, bdev_name, user_id, password, lock_state): +- """set lock state for an opal bdev. 
+- +- Args: +- bdev_name: name of opal vbdev +- user_id: ID of the user who will set lock state +- password: password of the user +- lock_state: lock state to set +- """ +- params = { +- 'bdev_name': bdev_name, +- 'user_id': user_id, +- 'password': password, +- 'lock_state': lock_state, +- } +- +- return client.call('bdev_opal_set_lock_state', params) +- +- +-def bdev_split_create(client, base_bdev, split_count, split_size_mb=None): +- """Create split block devices from a base bdev. +- +- Args: +- base_bdev: name of bdev to split +- split_count: number of split bdevs to create +- split_size_mb: size of each split volume in MiB (optional) +- +- Returns: +- List of created block devices. +- """ +- params = { +- 'base_bdev': base_bdev, +- 'split_count': split_count, +- } +- if split_size_mb: +- params['split_size_mb'] = split_size_mb +- +- return client.call('bdev_split_create', params) +- +- +-def bdev_split_delete(client, base_bdev): +- """Delete split block devices. +- +- Args: +- base_bdev: name of previously split bdev +- """ +- params = { +- 'base_bdev': base_bdev, +- } +- +- return client.call('bdev_split_delete', params) +- +- +-def bdev_ftl_create(client, name, base_bdev, **kwargs): +- """Construct FTL bdev +- +- Args: +- name: name of the bdev +- base_bdev: name of the base bdev +- kwargs: optional parameters +- """ +- params = {'name': name, +- 'base_bdev': base_bdev} +- for key, value in kwargs.items(): +- if value is not None: +- params[key] = value +- +- return client.call('bdev_ftl_create', params) +- +- +-def bdev_ftl_load(client, name, base_bdev, **kwargs): +- """Load FTL bdev +- +- Args: +- name: name of the bdev +- base_bdev: name of the base bdev +- kwargs: optional parameters +- """ +- params = {'name': name, +- 'base_bdev': base_bdev} +- for key, value in kwargs.items(): +- if value is not None: +- params[key] = value +- +- return client.call('bdev_ftl_load', params) +- +- +-def bdev_ftl_unload(client, name, fast_shutdown): +- """Unload FTL bdev +- +- Args: +- name: name of the bdev +- """ +- params = {'name': name, +- 'fast_shutdown': fast_shutdown} +- +- return client.call('bdev_ftl_unload', params) +- +- +-def bdev_ftl_delete(client, name, fast_shutdown): +- """Delete FTL bdev +- +- Args: +- name: name of the bdev +- """ +- params = {'name': name, +- 'fast_shutdown': fast_shutdown} +- +- return client.call('bdev_ftl_delete', params) +- +- +-def bdev_ftl_unmap(client, name, lba, num_blocks): +- """FTL unmap +- +- Args: +- name: name of the bdev +- lba: starting lba to be unmapped +- num_blocks: number of blocks to unmap +- """ +- params = {'name': name, +- 'lba': lba, +- 'num_blocks': num_blocks} +- +- return client.call('bdev_ftl_unmap', params) +- +- +-def bdev_ftl_get_stats(client, name): +- """get FTL stats +- +- Args: +- name: name of the bdev +- """ +- params = {'name': name} +- +- return client.call('bdev_ftl_get_stats', params) +- +- +-def bdev_get_bdevs(client, name=None, timeout=None): +- """Get information about block devices. +- +- Args: +- name: bdev name to query (optional; if omitted, query all bdevs) +- timeout: time in ms to wait for the bdev with specified name to appear +- +- Returns: +- List of bdev information objects. +- """ +- params = {} +- if name: +- params['name'] = name +- if timeout: +- params['timeout'] = timeout +- return client.call('bdev_get_bdevs', params) +- +- +-def bdev_get_iostat(client, name=None, per_channel=None): +- """Get I/O statistics for block devices. 
+- +- Args: +- name: bdev name to query (optional; if omitted, query all bdevs) +- per_channel: display per channel IO stats for specified bdev +- +- Returns: +- I/O statistics for the requested block devices. +- """ +- params = {} +- if name: +- params['name'] = name +- if per_channel: +- params['per_channel'] = per_channel +- return client.call('bdev_get_iostat', params) +- +- +-def bdev_reset_iostat(client, name=None, mode=None): +- """Reset I/O statistics for block devices. +- +- Args: +- name: bdev name to reset (optional; if omitted, reset all bdevs) +- mode: mode to reset: all, maxmin (optional: if omitted, reset all fields) +- """ +- params = {} +- if name: +- params['name'] = name +- if mode: +- params['mode'] = mode +- +- return client.call('bdev_reset_iostat', params) +- +- +-def bdev_enable_histogram(client, name, enable): +- """Control whether histogram is enabled for specified bdev. +- +- Args: +- bdev_name: name of bdev +- """ +- params = {'name': name, "enable": enable} +- return client.call('bdev_enable_histogram', params) +- +- +-def bdev_get_histogram(client, name): +- """Get histogram for specified bdev. +- +- Args: +- bdev_name: name of bdev +- """ +- params = {'name': name} +- return client.call('bdev_get_histogram', params) +- +- +-def bdev_error_inject_error(client, name, io_type, error_type, num, +- corrupt_offset, corrupt_value): +- """Inject an error via an error bdev. +- +- Args: +- name: name of error bdev +- io_type: one of "clear", "read", "write", "unmap", "flush", or "all" +- error_type: one of "failure", "pending", or "corrupt_data" +- num: number of commands to fail +- corrupt_offset: offset in bytes to xor with corrupt_value +- corrupt_value: value for xor (1-255, 0 is invalid) +- """ +- params = { +- 'name': name, +- 'io_type': io_type, +- 'error_type': error_type, +- } +- +- if num: +- params['num'] = num +- if corrupt_offset: +- params['corrupt_offset'] = corrupt_offset +- if corrupt_value: +- params['corrupt_value'] = corrupt_value +- +- return client.call('bdev_error_inject_error', params) +- +- +-def bdev_set_qd_sampling_period(client, name, period): +- """Enable queue depth tracking on a specified bdev. +- +- Args: +- name: name of a bdev on which to track queue depth. +- period: period (in microseconds) at which to update the queue depth reading. If set to 0, polling will be disabled. +- """ +- +- params = {} +- params['name'] = name +- params['period'] = period +- return client.call('bdev_set_qd_sampling_period', params) +- +- +-def bdev_set_qos_limit( +- client, +- name, +- rw_ios_per_sec=None, +- rw_mbytes_per_sec=None, +- r_mbytes_per_sec=None, +- w_mbytes_per_sec=None): +- """Set QoS rate limit on a block device. +- +- Args: +- name: name of block device +- rw_ios_per_sec: R/W IOs per second limit (>=1000, example: 20000). 0 means unlimited. +- rw_mbytes_per_sec: R/W megabytes per second limit (>=10, example: 100). 0 means unlimited. +- r_mbytes_per_sec: Read megabytes per second limit (>=10, example: 100). 0 means unlimited. +- w_mbytes_per_sec: Write megabytes per second limit (>=10, example: 100). 0 means unlimited. 
+- """ +- params = {} +- params['name'] = name +- if rw_ios_per_sec is not None: +- params['rw_ios_per_sec'] = rw_ios_per_sec +- if rw_mbytes_per_sec is not None: +- params['rw_mbytes_per_sec'] = rw_mbytes_per_sec +- if r_mbytes_per_sec is not None: +- params['r_mbytes_per_sec'] = r_mbytes_per_sec +- if w_mbytes_per_sec is not None: +- params['w_mbytes_per_sec'] = w_mbytes_per_sec +- return client.call('bdev_set_qos_limit', params) +- +- +-def bdev_nvme_apply_firmware(client, bdev_name, filename): +- """Download and commit firmware to NVMe device. +- +- Args: +- bdev_name: name of NVMe block device +- filename: filename of the firmware to download +- """ +- params = { +- 'filename': filename, +- 'bdev_name': bdev_name, +- } +- return client.call('bdev_nvme_apply_firmware', params) +- +- +-def bdev_nvme_get_transport_statistics(client): +- """Get bdev_nvme poll group transport statistics""" +- return client.call('bdev_nvme_get_transport_statistics') +- +- +-def bdev_nvme_get_controller_health_info(client, name): +- """Display health log of the required NVMe bdev controller. +- +- Args: +- name: name of the required NVMe bdev controller +- +- Returns: +- Health log for the requested NVMe bdev controller. +- """ +- params = {} +- params['name'] = name +- return client.call('bdev_nvme_get_controller_health_info', params) +- +- +-def bdev_daos_create(client, num_blocks, block_size, pool, cont, name, oclass=None, uuid=None): +- """Construct DAOS block device. +- +- Args: +- num_blocks: size of block device in blocks +- block_size: block size of device; must be a power of 2 and at least 512 +- name: name of block device (also the name of the backend file on DAOS DFS) +- pool: UUID of DAOS pool +- cont: UUID of DAOS container +- uuid: UUID of block device (optional) +- oclass: DAOS object class (optional) +- +- Returns: +- Name of created block device. +- """ +- params = {'num_blocks': num_blocks, 'block_size': block_size, 'pool': pool, 'cont': cont, 'name': name} +- if uuid: +- params['uuid'] = uuid +- if oclass: +- params['oclass'] = oclass +- return client.call('bdev_daos_create', params) +- +- +-def bdev_daos_delete(client, name): +- """Delete DAOS block device. +- +- Args: +- bdev_name: name of DAOS bdev to delete +- """ +- params = {'name': name} +- return client.call('bdev_daos_delete', params) +- +- +-def bdev_daos_resize(client, name, new_size): +- """Resize DAOS bdev in the system. +- Args: +- name: name of DAOS bdev to resize +- new_size: new bdev size of resize operation. 
The unit is MiB +- """ +- params = { +- 'name': name, +- 'new_size': new_size, +- } +- return client.call('bdev_daos_resize', params) +- +- +-def bdev_nvme_start_mdns_discovery(client, name, svcname, hostnqn=None): +- """Start discovery with mDNS +- +- Args: +- name: bdev name prefix; "n" + unique seqno + namespace ID will be appended to create unique names +- svcname: service to discover ("_nvme-disc._tcp") +- hostnqn: NQN to connect from (optional) +- """ +- params = {'name': name, +- 'svcname': svcname} +- +- if hostnqn: +- params['hostnqn'] = hostnqn +- return client.call('bdev_nvme_start_mdns_discovery', params) +- +- +-def bdev_nvme_stop_mdns_discovery(client, name): +- """Stop a previously started mdns discovery service +- +- Args: +- name: name of the discovery service to stop +- """ +- params = {'name': name} +- +- return client.call('bdev_nvme_stop_mdns_discovery', params) +- +- +-def bdev_nvme_get_mdns_discovery_info(client): +- """Get information about the automatic mdns discovery +- """ +- return client.call('bdev_nvme_get_mdns_discovery_info') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++# Copyright (c) 2022 Dell Inc, or its subsidiaries. ++ ++ ++def bdev_set_options(client, bdev_io_pool_size=None, bdev_io_cache_size=None, bdev_auto_examine=None, ++ small_buf_pool_size=None, large_buf_pool_size=None): ++ """Set parameters for the bdev subsystem. ++ ++ Args: ++ bdev_io_pool_size: number of bdev_io structures in shared buffer pool (optional) ++ bdev_io_cache_size: maximum number of bdev_io structures cached per thread (optional) ++ bdev_auto_examine: if set to false, the bdev layer will not examine every disks automatically (optional) ++ small_buf_pool_size: maximum number of small buffer (8KB buffer) pool size (optional) ++ large_buf_pool_size: maximum number of large buffer (64KB buffer) pool size (optional) ++ """ ++ params = {} ++ ++ if bdev_io_pool_size: ++ params['bdev_io_pool_size'] = bdev_io_pool_size ++ if bdev_io_cache_size: ++ params['bdev_io_cache_size'] = bdev_io_cache_size ++ if bdev_auto_examine is not None: ++ params["bdev_auto_examine"] = bdev_auto_examine ++ if small_buf_pool_size: ++ params['small_buf_pool_size'] = small_buf_pool_size ++ if large_buf_pool_size: ++ params['large_buf_pool_size'] = large_buf_pool_size ++ return client.call('bdev_set_options', params) ++ ++ ++def bdev_examine(client, name): ++ """Examine a bdev manually. If the bdev does not exist yet when this RPC is called, ++ it will be examined when it is created ++ ++ Args: ++ name: name of the bdev ++ """ ++ params = { ++ 'name': name ++ } ++ return client.call('bdev_examine', params) ++ ++ ++def bdev_wait_for_examine(client): ++ """Report when all bdevs have been examined ++ """ ++ return client.call('bdev_wait_for_examine') ++ ++ ++def bdev_compress_create(client, base_bdev_name, pm_path, lb_size): ++ """Construct a compress virtual block device. ++ ++ Args: ++ base_bdev_name: name of the underlying base bdev ++ pm_path: path to persistent memory ++ lb_size: logical block size for the compressed vol in bytes. Must be 4K or 512. ++ ++ Returns: ++ Name of created virtual block device. ++ """ ++ params = {'base_bdev_name': base_bdev_name, 'pm_path': pm_path} ++ ++ if lb_size: ++ params['lb_size'] = lb_size ++ ++ return client.call('bdev_compress_create', params) ++ ++ ++def bdev_compress_delete(client, name): ++ """Delete compress virtual block device. 
++ ++ Args: ++ name: name of compress vbdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_compress_delete', params) ++ ++ ++def bdev_compress_get_orphans(client, name=None): ++ """Get a list of comp bdevs that do not have a pmem file (aka orphaned). ++ ++ Args: ++ name: comp bdev name to query (optional; if omitted, query all comp bdevs) ++ ++ Returns: ++ List of comp bdev names. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ return client.call('bdev_compress_get_orphans', params) ++ ++ ++def bdev_crypto_create(client, base_bdev_name, name, crypto_pmd=None, key=None, cipher=None, key2=None, key_name=None): ++ """Construct a crypto virtual block device. ++ ++ Args: ++ base_bdev_name: name of the underlying base bdev ++ name: name for the crypto vbdev ++ crypto_pmd: name of the DPDK crypto driver to use ++ key: key ++ cipher: crypto algorithm to use ++ key2: Optional second part of the key ++ key_name: The key name to use in crypto operations ++ ++ Returns: ++ Name of created virtual block device. ++ """ ++ params = {'base_bdev_name': base_bdev_name, 'name': name} ++ ++ if crypto_pmd is not None: ++ params['crypto_pmd'] = crypto_pmd ++ if key is not None: ++ params['key'] = key ++ if key2 is not None: ++ params['key2'] = key2 ++ if cipher is not None: ++ params['cipher'] = cipher ++ if key_name is not None: ++ params['key_name'] = key_name ++ return client.call('bdev_crypto_create', params) ++ ++ ++def bdev_crypto_delete(client, name): ++ """Delete crypto virtual block device. ++ ++ Args: ++ name: name of crypto vbdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_crypto_delete', params) ++ ++ ++def bdev_ocf_create(client, name, mode, cache_line_size, cache_bdev_name, core_bdev_name): ++ """Add an OCF block device ++ ++ Args: ++ name: name of constructed OCF bdev ++ mode: OCF cache mode: {'wb', 'wt', 'pt', 'wa', 'wi', 'wo'} ++ cache_line_size: OCF cache line size. 
The unit is KiB: {4, 8, 16, 32, 64} ++ cache_bdev_name: name of underlying cache bdev ++ core_bdev_name: name of underlying core bdev ++ ++ Returns: ++ Name of created block device ++ """ ++ params = { ++ 'name': name, ++ 'mode': mode, ++ 'cache_bdev_name': cache_bdev_name, ++ 'core_bdev_name': core_bdev_name, ++ } ++ ++ if cache_line_size: ++ params['cache_line_size'] = cache_line_size ++ ++ return client.call('bdev_ocf_create', params) ++ ++ ++def bdev_ocf_delete(client, name): ++ """Delete an OCF device ++ ++ Args: ++ name: name of OCF bdev ++ ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_ocf_delete', params) ++ ++ ++def bdev_ocf_get_stats(client, name): ++ """Get statistics of chosen OCF block device ++ ++ Args: ++ name: name of OCF bdev ++ ++ Returns: ++ Statistics as json object ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_ocf_get_stats', params) ++ ++ ++def bdev_ocf_get_bdevs(client, name=None): ++ """Get list of OCF devices including unregistered ones ++ ++ Args: ++ name: name of OCF vbdev or name of cache device or name of core device (optional) ++ ++ Returns: ++ Array of OCF devices with their current status ++ """ ++ params = None ++ if name: ++ params = {'name': name} ++ return client.call('bdev_ocf_get_bdevs', params) ++ ++ ++def bdev_ocf_set_cache_mode(client, name, mode): ++ """Set cache mode of OCF block device ++ ++ Args: ++ name: name of OCF bdev ++ mode: OCF cache mode: {'wb', 'wt', 'pt', 'wa', 'wi', 'wo'} ++ ++ Returns: ++ New cache mode name ++ """ ++ params = { ++ 'name': name, ++ 'mode': mode, ++ } ++ ++ return client.call('bdev_ocf_set_cache_mode', params) ++ ++ ++def bdev_ocf_set_seqcutoff(client, name, policy, threshold, promotion_count): ++ """Set sequential cutoff parameters on all cores for the given OCF cache device ++ ++ Args: ++ name: Name of OCF cache bdev ++ policy: Sequential cutoff policy ++ threshold: Activation threshold [KiB] (optional) ++ promotion_count: Promotion request count (optional) ++ """ ++ params = { ++ 'name': name, ++ 'policy': policy, ++ } ++ if threshold: ++ params['threshold'] = threshold ++ if promotion_count: ++ params['promotion_count'] = promotion_count ++ ++ return client.call('bdev_ocf_set_seqcutoff', params) ++ ++ ++def bdev_ocf_flush_start(client, name): ++ """Start flushing OCF cache device ++ ++ Args: ++ name: name of OCF bdev ++ """ ++ params = { ++ 'name': name, ++ } ++ ++ return client.call('bdev_ocf_flush_start', params) ++ ++ ++def bdev_ocf_flush_status(client, name): ++ """Get flush status of OCF cache device ++ ++ Args: ++ name: name of OCF bdev ++ ++ Returns: ++ Flush status ++ """ ++ params = { ++ 'name': name, ++ } ++ ++ return client.call('bdev_ocf_flush_status', params) ++ ++ ++def bdev_malloc_create(client, num_blocks, block_size, name=None, uuid=None, optimal_io_boundary=None, ++ md_size=None, md_interleave=None, dif_type=None, dif_is_head_of_md=None): ++ """Construct a malloc block device. 
++ ++ Args: ++ num_blocks: size of block device in blocks ++ block_size: Data block size of device; must be a power of 2 and at least 512 ++ name: name of block device (optional) ++ uuid: UUID of block device (optional) ++ optimal_io_boundary: Split on optimal IO boundary, in number of blocks, default 0 (disabled, optional) ++ md_size: metadata size of device (0, 8, 16, 32, 64, or 128), default 0 (optional) ++ md_interleave: metadata location, interleaved if set, and separated if omitted (optional) ++ dif_type: protection information type (optional) ++ dif_is_head_of_md: protection information is in the first 8 bytes of metadata (optional) ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = {'num_blocks': num_blocks, 'block_size': block_size} ++ if name: ++ params['name'] = name ++ if uuid: ++ params['uuid'] = uuid ++ if optimal_io_boundary: ++ params['optimal_io_boundary'] = optimal_io_boundary ++ if md_size: ++ params['md_size'] = md_size ++ if md_interleave: ++ params['md_interleave'] = md_interleave ++ if dif_type: ++ params['dif_type'] = dif_type ++ if dif_is_head_of_md: ++ params['dif_is_head_of_md'] = dif_is_head_of_md ++ ++ return client.call('bdev_malloc_create', params) ++ ++ ++def bdev_malloc_delete(client, name): ++ """Delete malloc block device. ++ ++ Args: ++ bdev_name: name of malloc bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_malloc_delete', params) ++ ++ ++def bdev_null_create(client, num_blocks, block_size, name, uuid=None, md_size=None, ++ dif_type=None, dif_is_head_of_md=None): ++ """Construct a null block device. ++ ++ Args: ++ num_blocks: size of block device in blocks ++ block_size: block size of device; data part size must be a power of 2 and at least 512 ++ name: name of block device ++ uuid: UUID of block device (optional) ++ md_size: metadata size of device (optional) ++ dif_type: protection information type (optional) ++ dif_is_head_of_md: protection information is in the first 8 bytes of metadata (optional) ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = {'name': name, 'num_blocks': num_blocks, ++ 'block_size': block_size} ++ if uuid: ++ params['uuid'] = uuid ++ if md_size: ++ params['md_size'] = md_size ++ if dif_type: ++ params['dif_type'] = dif_type ++ if dif_is_head_of_md: ++ params['dif_is_head_of_md'] = dif_is_head_of_md ++ return client.call('bdev_null_create', params) ++ ++ ++def bdev_null_delete(client, name): ++ """Remove null bdev from the system. ++ ++ Args: ++ name: name of null bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_null_delete', params) ++ ++ ++def bdev_null_resize(client, name, new_size): ++ """Resize null bdev in the system. ++ ++ Args: ++ name: name of null bdev to resize ++ new_size: new bdev size of resize operation. The unit is MiB ++ """ ++ params = { ++ 'name': name, ++ 'new_size': new_size, ++ } ++ return client.call('bdev_null_resize', params) ++ ++ ++def bdev_raid_get_bdevs(client, category): ++ """Get list of raid bdevs based on category ++ ++ Args: ++ category: any one of all or online or configuring or offline ++ ++ Returns: ++ List of raid bdev details ++ """ ++ params = {'category': category} ++ return client.call('bdev_raid_get_bdevs', params) ++ ++ ++def bdev_raid_create(client, name, raid_level, base_bdevs, strip_size=None, strip_size_kb=None): ++ """Create raid bdev. Either strip size arg will work but one is required. 
++ ++ Args: ++ name: user defined raid bdev name ++ strip_size (deprecated): strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc ++ strip_size_kb: strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc ++ raid_level: raid level of raid bdev, supported values 0 ++ base_bdevs: Space separated names of Nvme bdevs in double quotes, like "Nvme0n1 Nvme1n1 Nvme2n1" ++ ++ Returns: ++ None ++ """ ++ params = {'name': name, 'raid_level': raid_level, 'base_bdevs': base_bdevs} ++ ++ if strip_size: ++ params['strip_size'] = strip_size ++ ++ if strip_size_kb: ++ params['strip_size_kb'] = strip_size_kb ++ ++ return client.call('bdev_raid_create', params) ++ ++ ++def bdev_raid_delete(client, name): ++ """Delete raid bdev ++ ++ Args: ++ name: raid bdev name ++ ++ Returns: ++ None ++ """ ++ params = {'name': name} ++ return client.call('bdev_raid_delete', params) ++ ++ ++def bdev_aio_create(client, filename, name, block_size=None, readonly=False): ++ """Construct a Linux AIO block device. ++ ++ Args: ++ filename: path to device or file (ex: /dev/sda) ++ name: name of block device ++ block_size: block size of device (optional; autodetected if omitted) ++ readonly: set aio bdev as read-only ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = {'name': name, ++ 'filename': filename} ++ ++ if block_size: ++ params['block_size'] = block_size ++ ++ if readonly: ++ params['readonly'] = readonly ++ ++ return client.call('bdev_aio_create', params) ++ ++ ++def bdev_aio_rescan(client, name): ++ """Rescan a Linux AIO block device. ++ ++ Args: ++ bdev_name: name of aio bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_aio_rescan', params) ++ ++ ++def bdev_aio_delete(client, name): ++ """Remove aio bdev from the system. ++ ++ Args: ++ bdev_name: name of aio bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_aio_delete', params) ++ ++ ++def bdev_uring_create(client, filename, name, block_size=None): ++ """Create a bdev with Linux io_uring backend. ++ ++ Args: ++ filename: path to device or file (ex: /dev/nvme0n1) ++ name: name of bdev ++ block_size: block size of device (optional; autodetected if omitted) ++ ++ Returns: ++ Name of created bdev. ++ """ ++ params = {'name': name, ++ 'filename': filename} ++ ++ if block_size: ++ params['block_size'] = block_size ++ ++ return client.call('bdev_uring_create', params) ++ ++ ++def bdev_uring_delete(client, name): ++ """Delete a uring bdev. ++ ++ Args: ++ name: name of uring bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_uring_delete', params) ++ ++ ++def bdev_xnvme_create(client, filename, name, io_mechanism, conserve_cpu=None): ++ """Create a bdev with xNVMe backend. ++ ++ Args: ++ filename: path to device or file (ex: /dev/nvme0n1) ++ name: name of xNVMe bdev to create ++ io_mechanism: I/O mechanism to use (ex: io_uring, io_uring_cmd, etc.) ++ conserve_cpu: Whether or not to conserve CPU when polling (default: False) ++ ++ Returns: ++ Name of created bdev. ++ """ ++ params = { ++ 'name': name, ++ 'filename': filename, ++ 'io_mechanism': io_mechanism, ++ } ++ if conserve_cpu: ++ params['conserve_cpu'] = conserve_cpu ++ ++ return client.call('bdev_xnvme_create', params) ++ ++ ++def bdev_xnvme_delete(client, name): ++ """Delete a xNVMe bdev. 
++ ++ Args: ++ name: name of xNVMe bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_xnvme_delete', params) ++ ++ ++def bdev_nvme_set_options(client, action_on_timeout=None, timeout_us=None, timeout_admin_us=None, ++ keep_alive_timeout_ms=None, retry_count=None, arbitration_burst=None, ++ low_priority_weight=None, medium_priority_weight=None, high_priority_weight=None, ++ nvme_adminq_poll_period_us=None, nvme_ioq_poll_period_us=None, io_queue_requests=None, ++ delay_cmd_submit=None, transport_retry_count=None, bdev_retry_count=None, ++ transport_ack_timeout=None, ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, ++ fast_io_fail_timeout_sec=None, disable_auto_failback=None, generate_uuids=None, ++ transport_tos=None, nvme_error_stat=None, rdma_srq_size=None, io_path_stat=None): ++ """Set options for the bdev nvme. This is startup command. ++ ++ Args: ++ action_on_timeout: action to take on command time out. Valid values are: none, reset, abort (optional) ++ timeout_us: Timeout for each command, in microseconds. If 0, don't track timeouts (optional) ++ timeout_admin_us: Timeout for each admin command, in microseconds. If 0, treat same as io timeouts (optional) ++ keep_alive_timeout_ms: Keep alive timeout period in millisecond, default is 10s (optional) ++ retry_count: The number of attempts per I/O when an I/O fails (deprecated) (optional) ++ arbitration_burst: The value is expressed as a power of two (optional) ++ low_priority_weight: The number of commands that may be executed from the low priority queue at one time (optional) ++ medium_priority_weight: The number of commands that may be executed from the medium priority queue at one time (optional) ++ high_priority_weight: The number of commands that may be executed from the high priority queue at one time (optional) ++ nvme_adminq_poll_period_us: How often the admin queue is polled for asynchronous events in microseconds (optional) ++ nvme_ioq_poll_period_us: How often to poll I/O queues for completions in microseconds (optional) ++ io_queue_requests: The number of requests allocated for each NVMe I/O queue. Default: 512 (optional) ++ delay_cmd_submit: Enable delayed NVMe command submission to allow batching of multiple commands (optional) ++ transport_retry_count: The number of attempts per I/O in the transport layer when an I/O fails (optional) ++ bdev_retry_count: The number of attempts per I/O in the bdev layer when an I/O fails. -1 means infinite retries. (optional) ++ transport_ack_timeout: Time to wait ack until packet retransmission for RDMA or until closes connection for TCP. ++ Range 0-31 where 0 is driver-specific default value (optional) ++ ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. ++ -1 means infinite reconnect retries. 0 means no reconnect retry. ++ If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. ++ If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. ++ This can be overridden by bdev_nvme_attach_controller. (optional) ++ reconnect_delay_sec: Time to delay a reconnect retry. ++ If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. ++ If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. ++ If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. ++ This can be overridden by bdev_nvme_attach_controller. 
(optional) ++ fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. ++ 0 means no such timeout. ++ If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than ++ ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. ++ This can be overridden by bdev_nvme_attach_controller. (optional) ++ disable_auto_failback: Disable automatic failback. bdev_nvme_set_preferred_path can be used to do manual failback. ++ By default, immediately failback to the preferred I/O path if it is restored. (optional) ++ generate_uuids: Enable generation of unique identifiers for NVMe bdevs only if they do not provide UUID themselves. ++ These strings are based on device serial number and namespace ID and will always be the same for that device. ++ transport_tos: IPv4 Type of Service value. Only applicable for RDMA transports. ++ The default is 0 which means no TOS is applied. (optional) ++ nvme_error_stat: Enable collecting NVMe error counts. (optional) ++ rdma_srq_size: Set the size of a shared rdma receive queue. Default: 0 (disabled) (optional) ++ io_path_stat: Enable collection I/O path stat of each io path. (optional) ++ ++ """ ++ params = {} ++ ++ if action_on_timeout: ++ params['action_on_timeout'] = action_on_timeout ++ ++ if timeout_us is not None: ++ params['timeout_us'] = timeout_us ++ ++ if timeout_admin_us is not None: ++ params['timeout_admin_us'] = timeout_admin_us ++ ++ if keep_alive_timeout_ms is not None: ++ params['keep_alive_timeout_ms'] = keep_alive_timeout_ms ++ ++ if retry_count is not None: ++ print("WARNING: retry_count is deprecated, please use transport_retry_count.") ++ params['retry_count'] = retry_count ++ ++ if arbitration_burst is not None: ++ params['arbitration_burst'] = arbitration_burst ++ ++ if low_priority_weight is not None: ++ params['low_priority_weight'] = low_priority_weight ++ ++ if medium_priority_weight is not None: ++ params['medium_priority_weight'] = medium_priority_weight ++ ++ if high_priority_weight is not None: ++ params['high_priority_weight'] = high_priority_weight ++ ++ if nvme_adminq_poll_period_us: ++ params['nvme_adminq_poll_period_us'] = nvme_adminq_poll_period_us ++ ++ if nvme_ioq_poll_period_us is not None: ++ params['nvme_ioq_poll_period_us'] = nvme_ioq_poll_period_us ++ ++ if io_queue_requests is not None: ++ params['io_queue_requests'] = io_queue_requests ++ ++ if delay_cmd_submit is not None: ++ params['delay_cmd_submit'] = delay_cmd_submit ++ ++ if transport_retry_count is not None: ++ params['transport_retry_count'] = transport_retry_count ++ ++ if bdev_retry_count is not None: ++ params['bdev_retry_count'] = bdev_retry_count ++ ++ if transport_ack_timeout is not None: ++ params['transport_ack_timeout'] = transport_ack_timeout ++ ++ if ctrlr_loss_timeout_sec is not None: ++ params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec ++ ++ if reconnect_delay_sec is not None: ++ params['reconnect_delay_sec'] = reconnect_delay_sec ++ ++ if fast_io_fail_timeout_sec is not None: ++ params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec ++ ++ if disable_auto_failback is not None: ++ params['disable_auto_failback'] = disable_auto_failback ++ ++ if generate_uuids is not None: ++ params['generate_uuids'] = generate_uuids ++ ++ if transport_tos is not None: ++ params['transport_tos'] = transport_tos ++ ++ if nvme_error_stat is not None: ++ params['nvme_error_stat'] = nvme_error_stat ++ ++ if rdma_srq_size is not None: ++ params['rdma_srq_size'] = rdma_srq_size ++ ++ 
if io_path_stat is not None: ++ params['io_path_stat'] = io_path_stat ++ ++ return client.call('bdev_nvme_set_options', params) ++ ++ ++def bdev_nvme_set_hotplug(client, enable, period_us=None): ++ """Set options for the bdev nvme. This is startup command. ++ ++ Args: ++ enable: True to enable hotplug, False to disable. ++ period_us: how often the hotplug is processed for insert and remove events. Set 0 to reset to default. (optional) ++ """ ++ params = {'enable': enable} ++ ++ if period_us: ++ params['period_us'] = period_us ++ ++ return client.call('bdev_nvme_set_hotplug', params) ++ ++ ++def bdev_nvme_attach_controller(client, name, trtype, traddr, adrfam=None, trsvcid=None, ++ priority=None, subnqn=None, hostnqn=None, hostaddr=None, ++ hostsvcid=None, prchk_reftag=None, prchk_guard=None, ++ hdgst=None, ddgst=None, fabrics_timeout=None, multipath=None, num_io_queues=None, ++ ctrlr_loss_timeout_sec=None, reconnect_delay_sec=None, ++ fast_io_fail_timeout_sec=None, psk=None): ++ """Construct block device for each NVMe namespace in the attached controller. ++ ++ Args: ++ name: bdev name prefix; "n" + namespace ID will be appended to create unique names ++ trtype: transport type ("PCIe", "RDMA", "FC", "TCP") ++ traddr: transport address (PCI BDF or IP address) ++ adrfam: address family ("IPv4", "IPv6", "IB", or "FC") ++ trsvcid: transport service ID (port number for IP-based addresses) ++ priority: transport connection priority (Sock priority for TCP-based transports; optional) ++ subnqn: subsystem NQN to connect to (optional) ++ hostnqn: NQN to connect from (optional) ++ hostaddr: host transport address (IP address for IP-based transports, NULL for PCIe or FC; optional) ++ hostsvcid: host transport service ID (port number for IP-based transports, NULL for PCIe or FC; optional) ++ prchk_reftag: Enable checking of PI reference tag for I/O processing (optional) ++ prchk_guard: Enable checking of PI guard for I/O processing (optional) ++ hdgst: Enable TCP header digest (optional) ++ ddgst: Enable TCP data digest (optional) ++ fabrics_timeout: Fabrics connect timeout in us (optional) ++ multipath: The behavior when multiple paths are created ("disable", "failover", or "multipath"; failover if not specified) ++ num_io_queues: The number of IO queues to request during initialization. (optional) ++ ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. ++ -1 means infinite reconnect retries. 0 means no reconnect retry. ++ If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. ++ If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. ++ (optional) ++ reconnect_delay_sec: Time to delay a reconnect retry. ++ If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. ++ If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. ++ If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. ++ (optional) ++ fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. ++ 0 means no such timeout. ++ If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than ++ ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. (optional) ++ psk: Set PSK and enable TCP SSL socket implementation (optional) ++ ++ Returns: ++ Names of created block devices. 
++ """ ++ params = {'name': name, ++ 'trtype': trtype, ++ 'traddr': traddr} ++ ++ if hostnqn: ++ params['hostnqn'] = hostnqn ++ ++ if hostaddr: ++ params['hostaddr'] = hostaddr ++ ++ if hostsvcid: ++ params['hostsvcid'] = hostsvcid ++ ++ if adrfam: ++ params['adrfam'] = adrfam ++ ++ if trsvcid: ++ params['trsvcid'] = trsvcid ++ ++ if priority: ++ params['priority'] = priority ++ ++ if subnqn: ++ params['subnqn'] = subnqn ++ ++ if prchk_reftag: ++ params['prchk_reftag'] = prchk_reftag ++ ++ if prchk_guard: ++ params['prchk_guard'] = prchk_guard ++ ++ if hdgst: ++ params['hdgst'] = hdgst ++ ++ if ddgst: ++ params['ddgst'] = ddgst ++ ++ if fabrics_timeout: ++ params['fabrics_connect_timeout_us'] = fabrics_timeout ++ ++ if multipath: ++ params['multipath'] = multipath ++ ++ if num_io_queues: ++ params['num_io_queues'] = num_io_queues ++ ++ if ctrlr_loss_timeout_sec is not None: ++ params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec ++ ++ if reconnect_delay_sec is not None: ++ params['reconnect_delay_sec'] = reconnect_delay_sec ++ ++ if fast_io_fail_timeout_sec is not None: ++ params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec ++ ++ if psk: ++ params['psk'] = psk ++ ++ return client.call('bdev_nvme_attach_controller', params) ++ ++ ++def bdev_nvme_detach_controller(client, name, trtype=None, traddr=None, ++ adrfam=None, trsvcid=None, subnqn=None, ++ hostaddr=None, hostsvcid=None): ++ """Detach NVMe controller and delete any associated bdevs. Optionally, ++ If all of the transport ID options are specified, only remove that ++ transport path from the specified controller. If that is the only ++ available path for the controller, this will also result in the ++ controller being detached and the associated bdevs being deleted. ++ ++ Args: ++ name: controller name ++ trtype: transport type ("PCIe", "RDMA") ++ traddr: transport address (PCI BDF or IP address) ++ adrfam: address family ("IPv4", "IPv6", "IB", or "FC") ++ trsvcid: transport service ID (port number for IP-based addresses) ++ subnqn: subsystem NQN to connect to (optional) ++ hostaddr: Host address (IP address) ++ hostsvcid: transport service ID on host side (port number) ++ """ ++ ++ params = {'name': name} ++ ++ if trtype: ++ params['trtype'] = trtype ++ ++ if traddr: ++ params['traddr'] = traddr ++ ++ if adrfam: ++ params['adrfam'] = adrfam ++ ++ if trsvcid: ++ params['trsvcid'] = trsvcid ++ ++ if subnqn: ++ params['subnqn'] = subnqn ++ ++ if hostaddr: ++ params['hostaddr'] = hostaddr ++ ++ if hostsvcid: ++ params['hostsvcid'] = hostsvcid ++ ++ return client.call('bdev_nvme_detach_controller', params) ++ ++ ++def bdev_nvme_reset_controller(client, name): ++ """Reset NVMe controller. 
++ ++ Args: ++ name: controller name ++ """ ++ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_reset_controller', params) ++ ++ ++def bdev_nvme_start_discovery(client, name, trtype, traddr, adrfam=None, trsvcid=None, ++ hostnqn=None, wait_for_attach=None, ctrlr_loss_timeout_sec=None, ++ reconnect_delay_sec=None, fast_io_fail_timeout_sec=None, ++ attach_timeout_ms=None): ++ """Start discovery with the specified discovery subsystem ++ ++ Args: ++ name: bdev name prefix; "n" + namespace ID will be appended to create unique names ++ trtype: transport type ("PCIe", "RDMA", "FC", "TCP") ++ traddr: transport address (PCI BDF or IP address) ++ adrfam: address family ("IPv4", "IPv6", "IB", or "FC") ++ trsvcid: transport service ID (port number for IP-based addresses) ++ hostnqn: NQN to connect from (optional) ++ wait_for_attach: Wait to complete RPC until all discovered NVM subsystems have attached (optional) ++ ctrlr_loss_timeout_sec: Time to wait until ctrlr is reconnected before deleting ctrlr. ++ -1 means infinite reconnect retries. 0 means no reconnect retry. ++ If reconnect_delay_sec is zero, ctrlr_loss_timeout_sec has to be zero. ++ If reconnect_delay_sec is non-zero, ctrlr_loss_timeout_sec has to be -1 or not less than reconnect_delay_sec. ++ (optional) ++ reconnect_delay_sec: Time to delay a reconnect retry. ++ If ctrlr_loss_timeout_sec is zero, reconnect_delay_sec has to be zero. ++ If ctrlr_loss_timeout_sec is -1, reconnect_delay_sec has to be non-zero. ++ If ctrlr_loss_timeout_sec is not -1 or zero, reconnect_sec has to be non-zero and less than ctrlr_loss_timeout_sec. ++ (optional) ++ fail_io_fast_timeout_sec: Time to wait until ctrlr is reconnected before failing I/O to ctrlr. ++ 0 means no such timeout. ++ If fast_io_fail_timeout_sec is not zero, it has to be not less than reconnect_delay_sec and less than ++ ctrlr_loss_timeout_sec if ctrlr_loss_timeout_sec is not -1. 
(optional) ++ attach_timeout_ms: Time to wait until the discovery and all discovered NVM subsystems are attached (optional) ++ """ ++ params = {'name': name, ++ 'trtype': trtype, ++ 'traddr': traddr} ++ ++ if hostnqn: ++ params['hostnqn'] = hostnqn ++ ++ if adrfam: ++ params['adrfam'] = adrfam ++ ++ if trsvcid: ++ params['trsvcid'] = trsvcid ++ ++ if wait_for_attach: ++ params['wait_for_attach'] = True ++ ++ if attach_timeout_ms is not None: ++ params['attach_timeout_ms'] = attach_timeout_ms ++ ++ if ctrlr_loss_timeout_sec is not None: ++ params['ctrlr_loss_timeout_sec'] = ctrlr_loss_timeout_sec ++ ++ if reconnect_delay_sec is not None: ++ params['reconnect_delay_sec'] = reconnect_delay_sec ++ ++ if fast_io_fail_timeout_sec is not None: ++ params['fast_io_fail_timeout_sec'] = fast_io_fail_timeout_sec ++ ++ return client.call('bdev_nvme_start_discovery', params) ++ ++ ++def bdev_nvme_stop_discovery(client, name): ++ """Stop a previously started discovery service ++ ++ Args: ++ name: name of discovery service to start ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_stop_discovery', params) ++ ++ ++def bdev_nvme_get_discovery_info(client): ++ """Get information about the automatic discovery ++ """ ++ return client.call('bdev_nvme_get_discovery_info') ++ ++ ++def bdev_nvme_get_io_paths(client, name): ++ """Display all or the specified NVMe bdev's active I/O paths ++ ++ Args: ++ name: Name of the NVMe bdev (optional) ++ ++ Returns: ++ List of active I/O paths ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ return client.call('bdev_nvme_get_io_paths', params) ++ ++ ++def bdev_nvme_set_preferred_path(client, name, cntlid): ++ """Set the preferred I/O path for an NVMe bdev when in multipath mode ++ ++ Args: ++ name: NVMe bdev name ++ cntlid: NVMe-oF controller ID ++ """ ++ ++ params = {'name': name, ++ 'cntlid': cntlid} ++ ++ return client.call('bdev_nvme_set_preferred_path', params) ++ ++ ++def bdev_nvme_set_multipath_policy(client, name, policy, selector, rr_min_io): ++ """Set multipath policy of the NVMe bdev ++ ++ Args: ++ name: NVMe bdev name ++ policy: Multipath policy (active_passive or active_active) ++ selector: Multipath selector (round_robin, queue_depth) ++ rr_min_io: Number of IO to route to a path before switching to another one (optional) ++ """ ++ ++ params = {'name': name, ++ 'policy': policy} ++ if selector: ++ params['selector'] = selector ++ if rr_min_io: ++ params['rr_min_io'] = rr_min_io ++ ++ return client.call('bdev_nvme_set_multipath_policy', params) ++ ++ ++def bdev_nvme_get_path_iostat(client, name): ++ """Get I/O statistics for IO paths of the block device. ++ ++ Args: ++ name: bdev name to query ++ ++ Returns: ++ I/O statistics for IO paths of the requested block device. ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_get_path_iostat', params) ++ ++ ++def bdev_nvme_cuse_register(client, name): ++ """Register CUSE devices on NVMe controller. ++ ++ Args: ++ name: Name of the operating NVMe controller ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_cuse_register', params) ++ ++ ++def bdev_nvme_cuse_unregister(client, name): ++ """Unregister CUSE devices on NVMe controller. ++ ++ Args: ++ name: Name of the operating NVMe controller ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_cuse_unregister', params) ++ ++ ++def bdev_zone_block_create(client, name, base_bdev, zone_capacity, optimal_open_zones): ++ """Creates a virtual zone device on top of existing non-zoned bdev. 
++ ++ Args: ++ name: Zone device name ++ base_bdev: Base Nvme bdev name ++ zone_capacity: Surfaced zone capacity in blocks ++ optimal_open_zones: Number of zones required to reach optimal write speed (optional, default: 1) ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = {'name': name, ++ 'base_bdev': base_bdev, ++ 'zone_capacity': zone_capacity, ++ 'optimal_open_zones': optimal_open_zones} ++ ++ return client.call('bdev_zone_block_create', params) ++ ++ ++def bdev_zone_block_delete(client, name): ++ """Remove block zone bdev from the system. ++ ++ Args: ++ name: name of block zone bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_zone_block_delete', params) ++ ++ ++def bdev_rbd_register_cluster(client, name, user=None, config_param=None, config_file=None, key_file=None): ++ """Create a Rados Cluster object of the Ceph RBD backend. ++ ++ Args: ++ name: name of Rados Cluster ++ user: Ceph user name (optional) ++ config_param: map of config keys to values (optional) ++ config_file: file path of Ceph configuration file (optional) ++ key_file: file path of Ceph key file (optional) ++ ++ Returns: ++ Name of registered Rados Cluster object. ++ """ ++ params = {'name': name} ++ ++ if user is not None: ++ params['user_id'] = user ++ if config_param is not None: ++ params['config_param'] = config_param ++ if config_file is not None: ++ params['config_file'] = config_file ++ if key_file is not None: ++ params['key_file'] = key_file ++ ++ return client.call('bdev_rbd_register_cluster', params) ++ ++ ++def bdev_rbd_unregister_cluster(client, name): ++ """Remove Rados cluster object from the system. ++ ++ Args: ++ name: name of Rados cluster object to unregister ++ """ ++ params = {'name': name} ++ return client.call('bdev_rbd_unregister_cluster', params) ++ ++ ++def bdev_rbd_get_clusters_info(client, name): ++ """Get the cluster(s) info ++ ++ Args: ++ name: name of Rados cluster object to query (optional; if omitted, query all clusters) ++ ++ Returns: ++ List of registered Rados cluster information objects. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ return client.call('bdev_rbd_get_clusters_info', params) ++ ++ ++def bdev_rbd_create(client, pool_name, rbd_name, block_size, name=None, user=None, config=None, cluster_name=None, uuid=None): ++ """Create a Ceph RBD block device. ++ ++ Args: ++ pool_name: Ceph RBD pool name ++ rbd_name: Ceph RBD image name ++ block_size: block size of RBD volume ++ name: name of block device (optional) ++ user: Ceph user name (optional) ++ config: map of config keys to values (optional) ++ cluster_name: Name to identify Rados cluster (optional) ++ uuid: UUID of block device (optional) ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = { ++ 'pool_name': pool_name, ++ 'rbd_name': rbd_name, ++ 'block_size': block_size, ++ } ++ ++ if name: ++ params['name'] = name ++ if user is not None: ++ params['user_id'] = user ++ if config is not None: ++ params['config'] = config ++ if cluster_name is not None: ++ params['cluster_name'] = cluster_name ++ else: ++ print("WARNING:bdev_rbd_create should be used with specifying -c to have a cluster name after bdev_rbd_register_cluster.") ++ if uuid is not None: ++ params['uuid'] = uuid ++ ++ return client.call('bdev_rbd_create', params) ++ ++ ++def bdev_rbd_delete(client, name): ++ """Remove rbd bdev from the system. 
++ ++ Args: ++ name: name of rbd bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_rbd_delete', params) ++ ++ ++def bdev_rbd_resize(client, name, new_size): ++ """Resize rbd bdev in the system. ++ ++ Args: ++ name: name of rbd bdev to resize ++ new_size: new bdev size of resize operation. The unit is MiB ++ """ ++ params = { ++ 'name': name, ++ 'new_size': new_size, ++ } ++ return client.call('bdev_rbd_resize', params) ++ ++ ++def bdev_error_create(client, base_name): ++ """Construct an error injection block device. ++ ++ Args: ++ base_name: base bdev name ++ """ ++ params = {'base_name': base_name} ++ return client.call('bdev_error_create', params) ++ ++ ++def bdev_delay_create(client, base_bdev_name, name, avg_read_latency, p99_read_latency, avg_write_latency, p99_write_latency): ++ """Construct a delay block device. ++ ++ Args: ++ base_bdev_name: name of the existing bdev ++ name: name of block device ++ avg_read_latency: complete 99% of read ops with this delay ++ p99_read_latency: complete 1% of read ops with this delay ++ avg_write_latency: complete 99% of write ops with this delay ++ p99_write_latency: complete 1% of write ops with this delay ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = { ++ 'base_bdev_name': base_bdev_name, ++ 'name': name, ++ 'avg_read_latency': avg_read_latency, ++ 'p99_read_latency': p99_read_latency, ++ 'avg_write_latency': avg_write_latency, ++ 'p99_write_latency': p99_write_latency, ++ } ++ return client.call('bdev_delay_create', params) ++ ++ ++def bdev_delay_delete(client, name): ++ """Remove delay bdev from the system. ++ ++ Args: ++ name: name of delay bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_delay_delete', params) ++ ++ ++def bdev_delay_update_latency(client, delay_bdev_name, latency_type, latency_us): ++ """Update the latency value for a delay block device ++ ++ Args: ++ delay_bdev_name: name of the delay bdev ++ latency_type: 'one of: avg_read, avg_write, p99_read, p99_write. No other values accepted.' ++ latency_us: 'new latency value.' ++ ++ Returns: ++ True if successful, or a specific error otherwise. ++ """ ++ params = { ++ 'delay_bdev_name': delay_bdev_name, ++ 'latency_type': latency_type, ++ 'latency_us': latency_us, ++ } ++ return client.call('bdev_delay_update_latency', params) ++ ++ ++def bdev_error_delete(client, name): ++ """Remove error bdev from the system. ++ ++ Args: ++ bdev_name: name of error bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_error_delete', params) ++ ++ ++def bdev_iscsi_set_options(client, timeout_sec): ++ """Set options for the bdev iscsi. ++ ++ Args: ++ timeout_sec: Timeout for command, in seconds, if 0, don't track timeout ++ """ ++ params = {} ++ ++ if timeout_sec is not None: ++ params['timeout_sec'] = timeout_sec ++ ++ return client.call('bdev_iscsi_set_options', params) ++ ++ ++def bdev_iscsi_create(client, name, url, initiator_iqn): ++ """Construct an iSCSI block device. ++ ++ Args: ++ name: name of block device ++ url: iSCSI URL ++ initiator_iqn: IQN name to be used by initiator ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = { ++ 'name': name, ++ 'url': url, ++ 'initiator_iqn': initiator_iqn, ++ } ++ return client.call('bdev_iscsi_create', params) ++ ++ ++def bdev_iscsi_delete(client, name): ++ """Remove iSCSI bdev from the system. 
++ ++ Args: ++ bdev_name: name of iSCSI bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_iscsi_delete', params) ++ ++ ++def bdev_pmem_create(client, pmem_file, name): ++ """Construct a libpmemblk block device. ++ ++ Args: ++ pmem_file: path to pmemblk pool file ++ name: name of block device ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = { ++ 'pmem_file': pmem_file, ++ 'name': name ++ } ++ return client.call('bdev_pmem_create', params) ++ ++ ++def bdev_pmem_delete(client, name): ++ """Remove pmem bdev from the system. ++ ++ Args: ++ name: name of pmem bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_pmem_delete', params) ++ ++ ++def bdev_passthru_create(client, base_bdev_name, name): ++ """Construct a pass-through block device. ++ ++ Args: ++ base_bdev_name: name of the existing bdev ++ name: name of block device ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = { ++ 'base_bdev_name': base_bdev_name, ++ 'name': name, ++ } ++ return client.call('bdev_passthru_create', params) ++ ++ ++def bdev_passthru_delete(client, name): ++ """Remove pass through bdev from the system. ++ ++ Args: ++ name: name of pass through bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_passthru_delete', params) ++ ++ ++def bdev_opal_create(client, nvme_ctrlr_name, nsid, locking_range_id, range_start, range_length, password): ++ """Create opal virtual block devices from a base nvme bdev. ++ ++ Args: ++ nvme_ctrlr_name: name of the nvme ctrlr ++ nsid: namespace ID of nvme ctrlr ++ locking_range_id: locking range ID corresponding to this virtual bdev ++ range_start: start address of this locking range ++ range_length: length of this locking range ++ password: admin password of base nvme bdev ++ ++ Returns: ++ Name of the new created block devices. ++ """ ++ params = { ++ 'nvme_ctrlr_name': nvme_ctrlr_name, ++ 'nsid': nsid, ++ 'locking_range_id': locking_range_id, ++ 'range_start': range_start, ++ 'range_length': range_length, ++ 'password': password, ++ } ++ ++ return client.call('bdev_opal_create', params) ++ ++ ++def bdev_opal_get_info(client, bdev_name, password): ++ """Get opal locking range info. ++ ++ Args: ++ bdev_name: name of opal vbdev to get info ++ password: admin password ++ ++ Returns: ++ Locking range info. ++ """ ++ params = { ++ 'bdev_name': bdev_name, ++ 'password': password, ++ } ++ ++ return client.call('bdev_opal_get_info', params) ++ ++ ++def bdev_opal_delete(client, bdev_name, password): ++ """Delete opal virtual bdev from the system. ++ ++ Args: ++ bdev_name: name of opal vbdev to delete ++ password: admin password of base nvme bdev ++ """ ++ params = { ++ 'bdev_name': bdev_name, ++ 'password': password, ++ } ++ ++ return client.call('bdev_opal_delete', params) ++ ++ ++def bdev_opal_new_user(client, bdev_name, admin_password, user_id, user_password): ++ """Add a user to opal bdev who can set lock state for this bdev. ++ ++ Args: ++ bdev_name: name of opal vbdev ++ admin_password: admin password ++ user_id: ID of the user who will be added to this opal bdev ++ user_password: password set for this user ++ """ ++ params = { ++ 'bdev_name': bdev_name, ++ 'admin_password': admin_password, ++ 'user_id': user_id, ++ 'user_password': user_password, ++ } ++ ++ return client.call('bdev_opal_new_user', params) ++ ++ ++def bdev_opal_set_lock_state(client, bdev_name, user_id, password, lock_state): ++ """set lock state for an opal bdev. 
++ ++ Args: ++ bdev_name: name of opal vbdev ++ user_id: ID of the user who will set lock state ++ password: password of the user ++ lock_state: lock state to set ++ """ ++ params = { ++ 'bdev_name': bdev_name, ++ 'user_id': user_id, ++ 'password': password, ++ 'lock_state': lock_state, ++ } ++ ++ return client.call('bdev_opal_set_lock_state', params) ++ ++ ++def bdev_split_create(client, base_bdev, split_count, split_size_mb=None): ++ """Create split block devices from a base bdev. ++ ++ Args: ++ base_bdev: name of bdev to split ++ split_count: number of split bdevs to create ++ split_size_mb: size of each split volume in MiB (optional) ++ ++ Returns: ++ List of created block devices. ++ """ ++ params = { ++ 'base_bdev': base_bdev, ++ 'split_count': split_count, ++ } ++ if split_size_mb: ++ params['split_size_mb'] = split_size_mb ++ ++ return client.call('bdev_split_create', params) ++ ++ ++def bdev_split_delete(client, base_bdev): ++ """Delete split block devices. ++ ++ Args: ++ base_bdev: name of previously split bdev ++ """ ++ params = { ++ 'base_bdev': base_bdev, ++ } ++ ++ return client.call('bdev_split_delete', params) ++ ++ ++def bdev_ftl_create(client, name, base_bdev, **kwargs): ++ """Construct FTL bdev ++ ++ Args: ++ name: name of the bdev ++ base_bdev: name of the base bdev ++ kwargs: optional parameters ++ """ ++ params = {'name': name, ++ 'base_bdev': base_bdev} ++ for key, value in kwargs.items(): ++ if value is not None: ++ params[key] = value ++ ++ return client.call('bdev_ftl_create', params) ++ ++ ++def bdev_ftl_load(client, name, base_bdev, **kwargs): ++ """Load FTL bdev ++ ++ Args: ++ name: name of the bdev ++ base_bdev: name of the base bdev ++ kwargs: optional parameters ++ """ ++ params = {'name': name, ++ 'base_bdev': base_bdev} ++ for key, value in kwargs.items(): ++ if value is not None: ++ params[key] = value ++ ++ return client.call('bdev_ftl_load', params) ++ ++ ++def bdev_ftl_unload(client, name, fast_shutdown): ++ """Unload FTL bdev ++ ++ Args: ++ name: name of the bdev ++ """ ++ params = {'name': name, ++ 'fast_shutdown': fast_shutdown} ++ ++ return client.call('bdev_ftl_unload', params) ++ ++ ++def bdev_ftl_delete(client, name, fast_shutdown): ++ """Delete FTL bdev ++ ++ Args: ++ name: name of the bdev ++ """ ++ params = {'name': name, ++ 'fast_shutdown': fast_shutdown} ++ ++ return client.call('bdev_ftl_delete', params) ++ ++ ++def bdev_ftl_unmap(client, name, lba, num_blocks): ++ """FTL unmap ++ ++ Args: ++ name: name of the bdev ++ lba: starting lba to be unmapped ++ num_blocks: number of blocks to unmap ++ """ ++ params = {'name': name, ++ 'lba': lba, ++ 'num_blocks': num_blocks} ++ ++ return client.call('bdev_ftl_unmap', params) ++ ++ ++def bdev_ftl_get_stats(client, name): ++ """get FTL stats ++ ++ Args: ++ name: name of the bdev ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_ftl_get_stats', params) ++ ++ ++def bdev_get_bdevs(client, name=None, timeout=None): ++ """Get information about block devices. ++ ++ Args: ++ name: bdev name to query (optional; if omitted, query all bdevs) ++ timeout: time in ms to wait for the bdev with specified name to appear ++ ++ Returns: ++ List of bdev information objects. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ if timeout: ++ params['timeout'] = timeout ++ return client.call('bdev_get_bdevs', params) ++ ++ ++def bdev_get_iostat(client, name=None, per_channel=None): ++ """Get I/O statistics for block devices. 
++ ++ Args: ++ name: bdev name to query (optional; if omitted, query all bdevs) ++ per_channel: display per channel IO stats for specified bdev ++ ++ Returns: ++ I/O statistics for the requested block devices. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ if per_channel: ++ params['per_channel'] = per_channel ++ return client.call('bdev_get_iostat', params) ++ ++ ++def bdev_reset_iostat(client, name=None, mode=None): ++ """Reset I/O statistics for block devices. ++ ++ Args: ++ name: bdev name to reset (optional; if omitted, reset all bdevs) ++ mode: mode to reset: all, maxmin (optional: if omitted, reset all fields) ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ if mode: ++ params['mode'] = mode ++ ++ return client.call('bdev_reset_iostat', params) ++ ++ ++def bdev_enable_histogram(client, name, enable): ++ """Control whether histogram is enabled for specified bdev. ++ ++ Args: ++ bdev_name: name of bdev ++ """ ++ params = {'name': name, "enable": enable} ++ return client.call('bdev_enable_histogram', params) ++ ++ ++def bdev_get_histogram(client, name): ++ """Get histogram for specified bdev. ++ ++ Args: ++ bdev_name: name of bdev ++ """ ++ params = {'name': name} ++ return client.call('bdev_get_histogram', params) ++ ++ ++def bdev_error_inject_error(client, name, io_type, error_type, num, ++ corrupt_offset, corrupt_value): ++ """Inject an error via an error bdev. ++ ++ Args: ++ name: name of error bdev ++ io_type: one of "clear", "read", "write", "unmap", "flush", or "all" ++ error_type: one of "failure", "pending", or "corrupt_data" ++ num: number of commands to fail ++ corrupt_offset: offset in bytes to xor with corrupt_value ++ corrupt_value: value for xor (1-255, 0 is invalid) ++ """ ++ params = { ++ 'name': name, ++ 'io_type': io_type, ++ 'error_type': error_type, ++ } ++ ++ if num: ++ params['num'] = num ++ if corrupt_offset: ++ params['corrupt_offset'] = corrupt_offset ++ if corrupt_value: ++ params['corrupt_value'] = corrupt_value ++ ++ return client.call('bdev_error_inject_error', params) ++ ++ ++def bdev_set_qd_sampling_period(client, name, period): ++ """Enable queue depth tracking on a specified bdev. ++ ++ Args: ++ name: name of a bdev on which to track queue depth. ++ period: period (in microseconds) at which to update the queue depth reading. If set to 0, polling will be disabled. ++ """ ++ ++ params = {} ++ params['name'] = name ++ params['period'] = period ++ return client.call('bdev_set_qd_sampling_period', params) ++ ++ ++def bdev_set_qos_limit( ++ client, ++ name, ++ rw_ios_per_sec=None, ++ rw_mbytes_per_sec=None, ++ r_mbytes_per_sec=None, ++ w_mbytes_per_sec=None): ++ """Set QoS rate limit on a block device. ++ ++ Args: ++ name: name of block device ++ rw_ios_per_sec: R/W IOs per second limit (>=1000, example: 20000). 0 means unlimited. ++ rw_mbytes_per_sec: R/W megabytes per second limit (>=10, example: 100). 0 means unlimited. ++ r_mbytes_per_sec: Read megabytes per second limit (>=10, example: 100). 0 means unlimited. ++ w_mbytes_per_sec: Write megabytes per second limit (>=10, example: 100). 0 means unlimited. 
++ """ ++ params = {} ++ params['name'] = name ++ if rw_ios_per_sec is not None: ++ params['rw_ios_per_sec'] = rw_ios_per_sec ++ if rw_mbytes_per_sec is not None: ++ params['rw_mbytes_per_sec'] = rw_mbytes_per_sec ++ if r_mbytes_per_sec is not None: ++ params['r_mbytes_per_sec'] = r_mbytes_per_sec ++ if w_mbytes_per_sec is not None: ++ params['w_mbytes_per_sec'] = w_mbytes_per_sec ++ return client.call('bdev_set_qos_limit', params) ++ ++ ++def bdev_nvme_apply_firmware(client, bdev_name, filename): ++ """Download and commit firmware to NVMe device. ++ ++ Args: ++ bdev_name: name of NVMe block device ++ filename: filename of the firmware to download ++ """ ++ params = { ++ 'filename': filename, ++ 'bdev_name': bdev_name, ++ } ++ return client.call('bdev_nvme_apply_firmware', params) ++ ++ ++def bdev_nvme_get_transport_statistics(client): ++ """Get bdev_nvme poll group transport statistics""" ++ return client.call('bdev_nvme_get_transport_statistics') ++ ++ ++def bdev_nvme_get_controller_health_info(client, name): ++ """Display health log of the required NVMe bdev controller. ++ ++ Args: ++ name: name of the required NVMe bdev controller ++ ++ Returns: ++ Health log for the requested NVMe bdev controller. ++ """ ++ params = {} ++ params['name'] = name ++ return client.call('bdev_nvme_get_controller_health_info', params) ++ ++ ++def bdev_daos_create(client, num_blocks, block_size, pool, cont, name, oclass=None, uuid=None): ++ """Construct DAOS block device. ++ ++ Args: ++ num_blocks: size of block device in blocks ++ block_size: block size of device; must be a power of 2 and at least 512 ++ name: name of block device (also the name of the backend file on DAOS DFS) ++ pool: UUID of DAOS pool ++ cont: UUID of DAOS container ++ uuid: UUID of block device (optional) ++ oclass: DAOS object class (optional) ++ ++ Returns: ++ Name of created block device. ++ """ ++ params = {'num_blocks': num_blocks, 'block_size': block_size, 'pool': pool, 'cont': cont, 'name': name} ++ if uuid: ++ params['uuid'] = uuid ++ if oclass: ++ params['oclass'] = oclass ++ return client.call('bdev_daos_create', params) ++ ++ ++def bdev_daos_delete(client, name): ++ """Delete DAOS block device. ++ ++ Args: ++ bdev_name: name of DAOS bdev to delete ++ """ ++ params = {'name': name} ++ return client.call('bdev_daos_delete', params) ++ ++ ++def bdev_daos_resize(client, name, new_size): ++ """Resize DAOS bdev in the system. ++ Args: ++ name: name of DAOS bdev to resize ++ new_size: new bdev size of resize operation. 
The unit is MiB ++ """ ++ params = { ++ 'name': name, ++ 'new_size': new_size, ++ } ++ return client.call('bdev_daos_resize', params) ++ ++ ++def bdev_nvme_start_mdns_discovery(client, name, svcname, hostnqn=None): ++ """Start discovery with mDNS ++ ++ Args: ++ name: bdev name prefix; "n" + unique seqno + namespace ID will be appended to create unique names ++ svcname: service to discover ("_nvme-disc._tcp") ++ hostnqn: NQN to connect from (optional) ++ """ ++ params = {'name': name, ++ 'svcname': svcname} ++ ++ if hostnqn: ++ params['hostnqn'] = hostnqn ++ return client.call('bdev_nvme_start_mdns_discovery', params) ++ ++ ++def bdev_nvme_stop_mdns_discovery(client, name): ++ """Stop a previously started mdns discovery service ++ ++ Args: ++ name: name of the discovery service to stop ++ """ ++ params = {'name': name} ++ ++ return client.call('bdev_nvme_stop_mdns_discovery', params) ++ ++ ++def bdev_nvme_get_mdns_discovery_info(client): ++ """Get information about the automatic mdns discovery ++ """ ++ return client.call('bdev_nvme_get_mdns_discovery_info') +diff --git a/python/spdk/rpc/blobfs.py b/python/spdk/rpc/blobfs.py +index bacb943..c21a9cc 100644 +--- a/python/spdk/rpc/blobfs.py ++++ b/python/spdk/rpc/blobfs.py +@@ -1,62 +1,62 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2019 Intel Corporation. +-# All rights reserved. +- +- +-def blobfs_detect(client, bdev_name): +- """Detect whether a blobfs exists on bdev. +- +- Args: +- bdev_name: block device name to detect blobfs +- +- Returns: +- True if a blobfs exists on the bdev; False otherwise. +- """ +- params = { +- 'bdev_name': bdev_name +- } +- return client.call('blobfs_detect', params) +- +- +-def blobfs_create(client, bdev_name, cluster_sz=None): +- """Build blobfs on bdev. +- +- Args: +- bdev_name: block device name to build blobfs +- cluster_sz: Size of cluster in bytes (Optional). Must be multiple of 4KB page size. Default and minimal value is 1M. +- """ +- params = { +- 'bdev_name': bdev_name +- } +- if cluster_sz: +- params['cluster_sz'] = cluster_sz +- return client.call('blobfs_create', params) +- +- +-def blobfs_mount(client, bdev_name, mountpoint): +- """Mount blobfs on bdev by FUSE. +- +- Args: +- bdev_name: block device name where the blobfs is +- mountpoint: Mountpoint path in host to mount blobfs +- """ +- params = { +- 'bdev_name': bdev_name, +- 'mountpoint': mountpoint +- } +- return client.call('blobfs_mount', params) +- +- +-def blobfs_set_cache_size(client, size_in_mb): +- """Set cache size for the blobstore filesystem. +- +- Args: +- size_in_mb: Cache size in megabytes +- +- Returns: +- True if cache size is set successfully; False if failed to set. +- """ +- params = { +- 'size_in_mb': size_in_mb +- } +- return client.call('blobfs_set_cache_size', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2019 Intel Corporation. ++# All rights reserved. ++ ++ ++def blobfs_detect(client, bdev_name): ++ """Detect whether a blobfs exists on bdev. ++ ++ Args: ++ bdev_name: block device name to detect blobfs ++ ++ Returns: ++ True if a blobfs exists on the bdev; False otherwise. ++ """ ++ params = { ++ 'bdev_name': bdev_name ++ } ++ return client.call('blobfs_detect', params) ++ ++ ++def blobfs_create(client, bdev_name, cluster_sz=None): ++ """Build blobfs on bdev. ++ ++ Args: ++ bdev_name: block device name to build blobfs ++ cluster_sz: Size of cluster in bytes (Optional). Must be multiple of 4KB page size. Default and minimal value is 1M. 
++ """ ++ params = { ++ 'bdev_name': bdev_name ++ } ++ if cluster_sz: ++ params['cluster_sz'] = cluster_sz ++ return client.call('blobfs_create', params) ++ ++ ++def blobfs_mount(client, bdev_name, mountpoint): ++ """Mount blobfs on bdev by FUSE. ++ ++ Args: ++ bdev_name: block device name where the blobfs is ++ mountpoint: Mountpoint path in host to mount blobfs ++ """ ++ params = { ++ 'bdev_name': bdev_name, ++ 'mountpoint': mountpoint ++ } ++ return client.call('blobfs_mount', params) ++ ++ ++def blobfs_set_cache_size(client, size_in_mb): ++ """Set cache size for the blobstore filesystem. ++ ++ Args: ++ size_in_mb: Cache size in megabytes ++ ++ Returns: ++ True if cache size is set successfully; False if failed to set. ++ """ ++ params = { ++ 'size_in_mb': size_in_mb ++ } ++ return client.call('blobfs_set_cache_size', params) +diff --git a/python/spdk/rpc/client.py b/python/spdk/rpc/client.py +index 652dbed..4c2eb88 100644 +--- a/python/spdk/rpc/client.py ++++ b/python/spdk/rpc/client.py +@@ -1,205 +1,205 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +-import json +-import socket +-import time +-import os +-import logging +-import copy +- +- +-def print_dict(d): +- print(json.dumps(d, indent=2)) +- +- +-def print_json(s): +- print(json.dumps(s, indent=2).strip('"')) +- +- +-def get_addr_type(addr): +- try: +- socket.inet_pton(socket.AF_INET, addr) +- return socket.AF_INET +- except Exception as e: +- pass +- try: +- socket.inet_pton(socket.AF_INET6, addr) +- return socket.AF_INET6 +- except Exception as e: +- pass +- if os.path.exists(addr): +- return socket.AF_UNIX +- return None +- +- +-class JSONRPCException(Exception): +- def __init__(self, message): +- self.message = message +- +- +-class JSONRPCClient(object): +- def __init__(self, addr, port=None, timeout=60.0, **kwargs): +- self.sock = None +- ch = logging.StreamHandler() +- ch.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) +- ch.setLevel(logging.DEBUG) +- self._logger = logging.getLogger("JSONRPCClient(%s)" % addr) +- self._logger.addHandler(ch) +- self.log_set_level(kwargs.get('log_level', logging.ERROR)) +- connect_retries = kwargs.get('conn_retries', 0) +- +- self.timeout = timeout +- self._request_id = 0 +- self._recv_buf = "" +- self._reqs = [] +- +- for i in range(connect_retries): +- try: +- self._connect(addr, port) +- return +- except Exception as e: +- # ignore and retry in 200ms +- time.sleep(0.2) +- +- # try one last time without try/except +- self._connect(addr, port) +- +- def __enter__(self): +- return self +- +- def __exit__(self, exception_type, exception_value, traceback): +- self.close() +- +- def _connect(self, addr, port): +- try: +- addr_type = get_addr_type(addr) +- +- if addr_type == socket.AF_UNIX: +- self._logger.debug("Trying to connect to UNIX socket: %s", addr) +- self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) +- self.sock.connect(addr) +- elif addr_type == socket.AF_INET6: +- self._logger.debug("Trying to connect to IPv6 address addr:%s, port:%i", addr, port) +- for res in socket.getaddrinfo(addr, port, socket.AF_INET6, socket.SOCK_STREAM, socket.SOL_TCP): +- af, socktype, proto, canonname, sa = res +- self.sock = socket.socket(af, socktype, proto) +- self.sock.connect(sa) +- elif addr_type == socket.AF_INET: +- self._logger.debug("Trying to connect to IPv4 address addr:%s, port:%i'", addr, port) +- self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +- self.sock.connect((addr, port)) +- 
else: +- raise socket.error("Invalid or non-existing address: '%s'" % addr) +- except socket.error as ex: +- raise JSONRPCException("Error while connecting to %s\n" +- "Is SPDK application running?\n" +- "Error details: %s" % (addr, ex)) +- +- def get_logger(self): +- return self._logger +- +- """Set logging level +- +- Args: +- lvl: Log level to set as accepted by logger.setLevel +- """ +- def log_set_level(self, lvl): +- self._logger.info("Setting log level to %s", lvl) +- self._logger.setLevel(lvl) +- self._logger.info("Log level set to %s", lvl) +- +- def close(self): +- if getattr(self, "sock", None): +- self.sock.shutdown(socket.SHUT_RDWR) +- self.sock.close() +- self.sock = None +- +- def add_request(self, method, params): +- self._request_id += 1 +- req = { +- 'jsonrpc': '2.0', +- 'method': method, +- 'id': self._request_id +- } +- +- if params: +- req['params'] = copy.deepcopy(params) +- +- self._logger.debug("append request:\n%s\n", json.dumps(req)) +- self._reqs.append(req) +- return self._request_id +- +- def flush(self): +- self._logger.debug("Flushing buffer") +- # TODO: We can drop indent parameter +- reqstr = "\n".join(json.dumps(req, indent=2) for req in self._reqs) +- self._reqs = [] +- self._logger.info("Requests:\n%s\n", reqstr) +- self.sock.sendall(reqstr.encode("utf-8")) +- +- def send(self, method, params=None): +- id = self.add_request(method, params) +- self.flush() +- return id +- +- def decode_one_response(self): +- try: +- self._logger.debug("Trying to decode response '%s'", self._recv_buf) +- buf = self._recv_buf.lstrip() +- obj, idx = json.JSONDecoder().raw_decode(buf) +- self._recv_buf = buf[idx:] +- return obj +- except ValueError: +- self._logger.debug("Partial response") +- return None +- +- def recv(self): +- start_time = time.process_time() +- response = self.decode_one_response() +- while not response: +- try: +- timeout = self.timeout - (time.process_time() - start_time) +- self.sock.settimeout(timeout) +- newdata = self.sock.recv(4096) +- if not newdata: +- self.sock.close() +- self.sock = None +- raise JSONRPCException("Connection closed with partial response:\n%s\n" % self._recv_buf) +- self._recv_buf += newdata.decode("utf-8") +- response = self.decode_one_response() +- except socket.timeout: +- break # throw exception after loop to avoid Python freaking out about nested exceptions +- except ValueError: +- continue # incomplete response; keep buffering +- +- if not response: +- raise JSONRPCException("Timeout while waiting for response:\n%s\n" % self._recv_buf) +- +- self._logger.info("response:\n%s\n", json.dumps(response, indent=2)) +- return response +- +- def call(self, method, params={}): +- self._logger.debug("call('%s')" % method) +- req_id = self.send(method, params) +- try: +- response = self.recv() +- except JSONRPCException as e: +- """ Don't expect response to kill """ +- if not self.sock and method == "spdk_kill_instance": +- self._logger.info("Connection terminated but ignoring since method is '%s'" % method) +- return {} +- else: +- raise e +- +- if 'error' in response: +- params["method"] = method +- params["req_id"] = req_id +- msg = "\n".join(["request:", "%s" % json.dumps(params, indent=2), +- "Got JSON-RPC error response", +- "response:", +- json.dumps(response['error'], indent=2)]) +- raise JSONRPCException(msg) +- +- return response['result'] ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. 
++ ++import json ++import socket ++import time ++import os ++import logging ++import copy ++ ++ ++def print_dict(d): ++ print(json.dumps(d, indent=2)) ++ ++ ++def print_json(s): ++ print(json.dumps(s, indent=2).strip('"')) ++ ++ ++def get_addr_type(addr): ++ try: ++ socket.inet_pton(socket.AF_INET, addr) ++ return socket.AF_INET ++ except Exception as e: ++ pass ++ try: ++ socket.inet_pton(socket.AF_INET6, addr) ++ return socket.AF_INET6 ++ except Exception as e: ++ pass ++ if os.path.exists(addr): ++ return socket.AF_UNIX ++ return None ++ ++ ++class JSONRPCException(Exception): ++ def __init__(self, message): ++ self.message = message ++ ++ ++class JSONRPCClient(object): ++ def __init__(self, addr, port=None, timeout=60.0, **kwargs): ++ self.sock = None ++ ch = logging.StreamHandler() ++ ch.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) ++ ch.setLevel(logging.DEBUG) ++ self._logger = logging.getLogger("JSONRPCClient(%s)" % addr) ++ self._logger.addHandler(ch) ++ self.log_set_level(kwargs.get('log_level', logging.ERROR)) ++ connect_retries = kwargs.get('conn_retries', 0) ++ ++ self.timeout = timeout ++ self._request_id = 0 ++ self._recv_buf = "" ++ self._reqs = [] ++ ++ for i in range(connect_retries): ++ try: ++ self._connect(addr, port) ++ return ++ except Exception as e: ++ # ignore and retry in 200ms ++ time.sleep(0.2) ++ ++ # try one last time without try/except ++ self._connect(addr, port) ++ ++ def __enter__(self): ++ return self ++ ++ def __exit__(self, exception_type, exception_value, traceback): ++ self.close() ++ ++ def _connect(self, addr, port): ++ try: ++ addr_type = get_addr_type(addr) ++ ++ if addr_type == socket.AF_UNIX: ++ self._logger.debug("Trying to connect to UNIX socket: %s", addr) ++ self.sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) ++ self.sock.connect(addr) ++ elif addr_type == socket.AF_INET6: ++ self._logger.debug("Trying to connect to IPv6 address addr:%s, port:%i", addr, port) ++ for res in socket.getaddrinfo(addr, port, socket.AF_INET6, socket.SOCK_STREAM, socket.SOL_TCP): ++ af, socktype, proto, canonname, sa = res ++ self.sock = socket.socket(af, socktype, proto) ++ self.sock.connect(sa) ++ elif addr_type == socket.AF_INET: ++ self._logger.debug("Trying to connect to IPv4 address addr:%s, port:%i'", addr, port) ++ self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) ++ self.sock.connect((addr, port)) ++ else: ++ raise socket.error("Invalid or non-existing address: '%s'" % addr) ++ except socket.error as ex: ++ raise JSONRPCException("Error while connecting to %s\n" ++ "Is SPDK application running?\n" ++ "Error details: %s" % (addr, ex)) ++ ++ def get_logger(self): ++ return self._logger ++ ++ """Set logging level ++ ++ Args: ++ lvl: Log level to set as accepted by logger.setLevel ++ """ ++ def log_set_level(self, lvl): ++ self._logger.info("Setting log level to %s", lvl) ++ self._logger.setLevel(lvl) ++ self._logger.info("Log level set to %s", lvl) ++ ++ def close(self): ++ if getattr(self, "sock", None): ++ self.sock.shutdown(socket.SHUT_RDWR) ++ self.sock.close() ++ self.sock = None ++ ++ def add_request(self, method, params): ++ self._request_id += 1 ++ req = { ++ 'jsonrpc': '2.0', ++ 'method': method, ++ 'id': self._request_id ++ } ++ ++ if params: ++ req['params'] = copy.deepcopy(params) ++ ++ self._logger.debug("append request:\n%s\n", json.dumps(req)) ++ self._reqs.append(req) ++ return self._request_id ++ ++ def flush(self): ++ self._logger.debug("Flushing buffer") ++ # TODO: We can drop indent parameter ++ 
reqstr = "\n".join(json.dumps(req, indent=2) for req in self._reqs) ++ self._reqs = [] ++ self._logger.info("Requests:\n%s\n", reqstr) ++ self.sock.sendall(reqstr.encode("utf-8")) ++ ++ def send(self, method, params=None): ++ id = self.add_request(method, params) ++ self.flush() ++ return id ++ ++ def decode_one_response(self): ++ try: ++ self._logger.debug("Trying to decode response '%s'", self._recv_buf) ++ buf = self._recv_buf.lstrip() ++ obj, idx = json.JSONDecoder().raw_decode(buf) ++ self._recv_buf = buf[idx:] ++ return obj ++ except ValueError: ++ self._logger.debug("Partial response") ++ return None ++ ++ def recv(self): ++ start_time = time.process_time() ++ response = self.decode_one_response() ++ while not response: ++ try: ++ timeout = self.timeout - (time.process_time() - start_time) ++ self.sock.settimeout(timeout) ++ newdata = self.sock.recv(4096) ++ if not newdata: ++ self.sock.close() ++ self.sock = None ++ raise JSONRPCException("Connection closed with partial response:\n%s\n" % self._recv_buf) ++ self._recv_buf += newdata.decode("utf-8") ++ response = self.decode_one_response() ++ except socket.timeout: ++ break # throw exception after loop to avoid Python freaking out about nested exceptions ++ except ValueError: ++ continue # incomplete response; keep buffering ++ ++ if not response: ++ raise JSONRPCException("Timeout while waiting for response:\n%s\n" % self._recv_buf) ++ ++ self._logger.info("response:\n%s\n", json.dumps(response, indent=2)) ++ return response ++ ++ def call(self, method, params={}): ++ self._logger.debug("call('%s')" % method) ++ req_id = self.send(method, params) ++ try: ++ response = self.recv() ++ except JSONRPCException as e: ++ """ Don't expect response to kill """ ++ if not self.sock and method == "spdk_kill_instance": ++ self._logger.info("Connection terminated but ignoring since method is '%s'" % method) ++ return {} ++ else: ++ raise e ++ ++ if 'error' in response: ++ params["method"] = method ++ params["req_id"] = req_id ++ msg = "\n".join(["request:", "%s" % json.dumps(params, indent=2), ++ "Got JSON-RPC error response", ++ "response:", ++ json.dumps(response['error'], indent=2)]) ++ raise JSONRPCException(msg) ++ ++ return response['result'] +diff --git a/python/spdk/rpc/cmd_parser.py b/python/spdk/rpc/cmd_parser.py +index 97694ba..e8144bf 100644 +--- a/python/spdk/rpc/cmd_parser.py ++++ b/python/spdk/rpc/cmd_parser.py +@@ -1,35 +1,35 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation. +-# All rights reserved. +- +-args_global = ['server_addr', 'port', 'timeout', 'verbose', 'dry_run', 'conn_retries', +- 'is_server', 'rpc_plugin', 'called_rpc_name', 'func', 'client'] +- +- +-def strip_globals(kwargs): +- for arg in args_global: +- kwargs.pop(arg, None) +- +- +-def remove_null(kwargs): +- keys = [] +- for key, value in kwargs.items(): +- if value is None: +- keys.append(key) +- +- for key in keys: +- kwargs.pop(key, None) +- +- +-def apply_defaults(kwargs, **defaults): +- for key, value in defaults.items(): +- if key not in kwargs: +- kwargs[key] = value +- +- +-def group_as(kwargs, name, values): +- group = {} +- for arg in values: +- if arg in kwargs and kwargs[arg] is not None: +- group[arg] = kwargs.pop(arg, None) +- kwargs[name] = group ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation. ++# All rights reserved. 
++ ++args_global = ['server_addr', 'port', 'timeout', 'verbose', 'dry_run', 'conn_retries', ++ 'is_server', 'rpc_plugin', 'called_rpc_name', 'func', 'client'] ++ ++ ++def strip_globals(kwargs): ++ for arg in args_global: ++ kwargs.pop(arg, None) ++ ++ ++def remove_null(kwargs): ++ keys = [] ++ for key, value in kwargs.items(): ++ if value is None: ++ keys.append(key) ++ ++ for key in keys: ++ kwargs.pop(key, None) ++ ++ ++def apply_defaults(kwargs, **defaults): ++ for key, value in defaults.items(): ++ if key not in kwargs: ++ kwargs[key] = value ++ ++ ++def group_as(kwargs, name, values): ++ group = {} ++ for arg in values: ++ if arg in kwargs and kwargs[arg] is not None: ++ group[arg] = kwargs.pop(arg, None) ++ kwargs[name] = group +diff --git a/python/spdk/rpc/compressdev.py b/python/spdk/rpc/compressdev.py +index 91076ae..8642393 100644 +--- a/python/spdk/rpc/compressdev.py ++++ b/python/spdk/rpc/compressdev.py +@@ -1,14 +1,14 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +-# +- +-def compressdev_scan_accel_module(client, pmd): +- """Scan and enable compressdev module and set pmd option. +- +- Args: +- pmd: 0 = auto-select, 1 = QAT, 2 = mlx5_pci +- """ +- params = {'pmd': pmd} +- +- return client.call('compressdev_scan_accel_module', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++# ++ ++def compressdev_scan_accel_module(client, pmd): ++ """Scan and enable compressdev module and set pmd option. ++ ++ Args: ++ pmd: 0 = auto-select, 1 = QAT, 2 = mlx5_pci ++ """ ++ params = {'pmd': pmd} ++ ++ return client.call('compressdev_scan_accel_module', params) +diff --git a/python/spdk/rpc/dpdk_cryptodev.py b/python/spdk/rpc/dpdk_cryptodev.py +index 5f5f5de..4813b84 100644 +--- a/python/spdk/rpc/dpdk_cryptodev.py ++++ b/python/spdk/rpc/dpdk_cryptodev.py +@@ -1,25 +1,25 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. +-# All rights reserved. +- +-def dpdk_cryptodev_scan_accel_module(client): +- """Enable dpdk_cryptodev accel module. +- """ +- return client.call('dpdk_cryptodev_scan_accel_module') +- +- +-def dpdk_cryptodev_set_driver(client, driver_name): +- """Set the DPDK cryptodev driver. +- +- Args: +- driver_name: The driver, can be one of crypto_aesni_mb, crypto_qat or mlx5_pci +- """ +- params = {'driver_name': driver_name} +- +- return client.call('dpdk_cryptodev_set_driver', params) +- +- +-def dpdk_cryptodev_get_driver(client): +- """Get the DPDK cryptodev driver. +- """ +- return client.call('dpdk_cryptodev_get_driver') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. ++# All rights reserved. ++ ++def dpdk_cryptodev_scan_accel_module(client): ++ """Enable dpdk_cryptodev accel module. ++ """ ++ return client.call('dpdk_cryptodev_scan_accel_module') ++ ++ ++def dpdk_cryptodev_set_driver(client, driver_name): ++ """Set the DPDK cryptodev driver. ++ ++ Args: ++ driver_name: The driver, can be one of crypto_aesni_mb, crypto_qat or mlx5_pci ++ """ ++ params = {'driver_name': driver_name} ++ ++ return client.call('dpdk_cryptodev_set_driver', params) ++ ++ ++def dpdk_cryptodev_get_driver(client): ++ """Get the DPDK cryptodev driver. 
++ """ ++ return client.call('dpdk_cryptodev_get_driver') +diff --git a/python/spdk/rpc/dsa.py b/python/spdk/rpc/dsa.py +index 1ee5a8e..5fec2b9 100644 +--- a/python/spdk/rpc/dsa.py ++++ b/python/spdk/rpc/dsa.py +@@ -1,19 +1,19 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-from spdk.rpc.helpers import deprecated_alias +- +- +-@deprecated_alias('dsa_scan_accel_engine') +-def dsa_scan_accel_module(client, config_kernel_mode=None): +- """Scan and enable DSA accel module. +- +- Args: +- config_kernel_mode: Use kernel DSA driver. (optional) +- """ +- params = {} +- +- if config_kernel_mode is not None: +- params['config_kernel_mode'] = config_kernel_mode +- return client.call('dsa_scan_accel_module', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++from spdk.rpc.helpers import deprecated_alias ++ ++ ++@deprecated_alias('dsa_scan_accel_engine') ++def dsa_scan_accel_module(client, config_kernel_mode=None): ++ """Scan and enable DSA accel module. ++ ++ Args: ++ config_kernel_mode: Use kernel DSA driver. (optional) ++ """ ++ params = {} ++ ++ if config_kernel_mode is not None: ++ params['config_kernel_mode'] = config_kernel_mode ++ return client.call('dsa_scan_accel_module', params) +diff --git a/python/spdk/rpc/env_dpdk.py b/python/spdk/rpc/env_dpdk.py +index a85c4e8..2dc70e5 100644 +--- a/python/spdk/rpc/env_dpdk.py ++++ b/python/spdk/rpc/env_dpdk.py +@@ -1,13 +1,13 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2019 Intel Corporation. +-# All rights reserved. +- +- +-def env_dpdk_get_mem_stats(client): +- """Dump the applications memory stats to a file. +- +- Returns: +- The path to the file where the stats are written. +- """ +- +- return client.call('env_dpdk_get_mem_stats') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2019 Intel Corporation. ++# All rights reserved. ++ ++ ++def env_dpdk_get_mem_stats(client): ++ """Dump the applications memory stats to a file. ++ ++ Returns: ++ The path to the file where the stats are written. ++ """ ++ ++ return client.call('env_dpdk_get_mem_stats') +diff --git a/python/spdk/rpc/helpers.py b/python/spdk/rpc/helpers.py +index 52aaf61..c59cec2 100644 +--- a/python/spdk/rpc/helpers.py ++++ b/python/spdk/rpc/helpers.py +@@ -1,20 +1,20 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2019 Intel Corporation. +-# All rights reserved. +- +-import sys +- +-deprecated_aliases = {} +- +- +-def deprecated_alias(old_name): +- def wrap(f): +- def old_f(*args, **kwargs): +- ret = f(*args, **kwargs) +- print("{} is deprecated, use {} instead.".format(old_name, f.__name__), file=sys.stderr) +- return ret +- old_f.__name__ = old_name +- deprecated_aliases[old_name] = f.__name__ +- setattr(sys.modules[f.__module__], old_name, old_f) +- return f +- return wrap ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2019 Intel Corporation. ++# All rights reserved. 
++ ++import sys ++ ++deprecated_aliases = {} ++ ++ ++def deprecated_alias(old_name): ++ def wrap(f): ++ def old_f(*args, **kwargs): ++ ret = f(*args, **kwargs) ++ print("{} is deprecated, use {} instead.".format(old_name, f.__name__), file=sys.stderr) ++ return ret ++ old_f.__name__ = old_name ++ deprecated_aliases[old_name] = f.__name__ ++ setattr(sys.modules[f.__module__], old_name, old_f) ++ return f ++ return wrap +diff --git a/python/spdk/rpc/iaa.py b/python/spdk/rpc/iaa.py +index 613ab7b..09f8194 100644 +--- a/python/spdk/rpc/iaa.py ++++ b/python/spdk/rpc/iaa.py +@@ -1,12 +1,12 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-from spdk.rpc.helpers import deprecated_alias +- +- +-@deprecated_alias('iaa_scan_accel_engine') +-def iaa_scan_accel_module(client): +- """Scan and enable IAA accel module. +- """ +- return client.call('iaa_scan_accel_module') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++from spdk.rpc.helpers import deprecated_alias ++ ++ ++@deprecated_alias('iaa_scan_accel_engine') ++def iaa_scan_accel_module(client): ++ """Scan and enable IAA accel module. ++ """ ++ return client.call('iaa_scan_accel_module') +diff --git a/python/spdk/rpc/ioat.py b/python/spdk/rpc/ioat.py +index c76be10..76d1489 100644 +--- a/python/spdk/rpc/ioat.py ++++ b/python/spdk/rpc/ioat.py +@@ -1,12 +1,12 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation. +-# All rights reserved. +- +-from spdk.rpc.helpers import deprecated_alias +- +- +-@deprecated_alias('ioat_scan_accel_engine') +-def ioat_scan_accel_module(client): +- """Enable IOAT accel module. +- """ +- return client.call('ioat_scan_accel_module') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation. ++# All rights reserved. ++ ++from spdk.rpc.helpers import deprecated_alias ++ ++ ++@deprecated_alias('ioat_scan_accel_engine') ++def ioat_scan_accel_module(client): ++ """Enable IOAT accel module. ++ """ ++ return client.call('ioat_scan_accel_module') +diff --git a/python/spdk/rpc/iobuf.py b/python/spdk/rpc/iobuf.py +index 153efe4..34c089e 100644 +--- a/python/spdk/rpc/iobuf.py ++++ b/python/spdk/rpc/iobuf.py +@@ -1,25 +1,25 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-def iobuf_set_options(client, small_pool_count, large_pool_count, small_bufsize, large_bufsize): +- """Set iobuf pool options. +- +- Args: +- small_pool_count: number of small buffers in the global pool +- large_pool_count: number of large buffers in the global pool +- small_bufsize: size of a small buffer +- large_bufsize: size of a large buffer +- """ +- params = {} +- +- if small_pool_count is not None: +- params['small_pool_count'] = small_pool_count +- if large_pool_count is not None: +- params['large_pool_count'] = large_pool_count +- if small_bufsize is not None: +- params['small_bufsize'] = small_bufsize +- if large_bufsize is not None: +- params['large_bufsize'] = large_bufsize +- +- return client.call('iobuf_set_options', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++def iobuf_set_options(client, small_pool_count, large_pool_count, small_bufsize, large_bufsize): ++ """Set iobuf pool options. 
++ ++ Args: ++ small_pool_count: number of small buffers in the global pool ++ large_pool_count: number of large buffers in the global pool ++ small_bufsize: size of a small buffer ++ large_bufsize: size of a large buffer ++ """ ++ params = {} ++ ++ if small_pool_count is not None: ++ params['small_pool_count'] = small_pool_count ++ if large_pool_count is not None: ++ params['large_pool_count'] = large_pool_count ++ if small_bufsize is not None: ++ params['small_bufsize'] = small_bufsize ++ if large_bufsize is not None: ++ params['large_bufsize'] = large_bufsize ++ ++ return client.call('iobuf_set_options', params) +diff --git a/python/spdk/rpc/iscsi.py b/python/spdk/rpc/iscsi.py +index 471f621..a7e2280 100644 +--- a/python/spdk/rpc/iscsi.py ++++ b/python/spdk/rpc/iscsi.py +@@ -1,618 +1,618 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +- +-def iscsi_set_options( +- client, +- auth_file=None, +- node_base=None, +- nop_timeout=None, +- nop_in_interval=None, +- disable_chap=None, +- require_chap=None, +- mutual_chap=None, +- chap_group=None, +- max_sessions=None, +- max_queue_depth=None, +- max_connections_per_session=None, +- default_time2wait=None, +- default_time2retain=None, +- first_burst_length=None, +- immediate_data=None, +- error_recovery_level=None, +- allow_duplicated_isid=None, +- max_large_datain_per_connection=None, +- max_r2t_per_connection=None, +- pdu_pool_size=None, +- immediate_data_pool_size=None, +- data_out_pool_size=None): +- """Set iSCSI target options. +- +- Args: +- auth_file: Path to CHAP shared secret file (optional) +- node_base: Prefix of the name of iSCSI target node (optional) +- nop_timeout: Timeout in seconds to nop-in request to the initiator (optional) +- nop_in_interval: Time interval in secs between nop-in requests by the target (optional) +- disable_chap: CHAP for discovery session should be disabled (optional) +- require_chap: CHAP for discovery session should be required +- mutual_chap: CHAP for discovery session should be mutual +- chap_group: Authentication group ID for discovery session +- max_sessions: Maximum number of sessions in the host +- max_queue_depth: Maximum number of outstanding I/Os per queue +- max_connections_per_session: Negotiated parameter, MaxConnections +- default_time2wait: Negotiated parameter, DefaultTime2Wait +- default_time2retain: Negotiated parameter, DefaultTime2Retain +- first_burst_length: Negotiated parameter, FirstBurstLength +- immediate_data: Negotiated parameter, ImmediateData +- error_recovery_level: Negotiated parameter, ErrorRecoveryLevel +- allow_duplicated_isid: Allow duplicated initiator session ID +- max_large_datain_per_connection: Max number of outstanding split read I/Os per connection (optional) +- max_r2t_per_connection: Max number of outstanding R2Ts per connection (optional) +- pdu_pool_size: Number of PDUs in the pool (optional) +- immediate_data_pool_size: Number of immediate data buffers in the pool (optional) +- data_out_pool_size: Number of data out buffers in the pool (optional) +- +- Returns: +- True or False +- """ +- params = {} +- +- if auth_file: +- params['auth_file'] = auth_file +- if node_base: +- params['node_base'] = node_base +- if nop_timeout: +- params['nop_timeout'] = nop_timeout +- if nop_in_interval: +- params['nop_in_interval'] = nop_in_interval +- if disable_chap: +- params['disable_chap'] = disable_chap +- if require_chap: +- params['require_chap'] = require_chap +- if mutual_chap: +- params['mutual_chap'] = 
mutual_chap +- if chap_group: +- params['chap_group'] = chap_group +- if max_sessions: +- params['max_sessions'] = max_sessions +- if max_queue_depth: +- params['max_queue_depth'] = max_queue_depth +- if max_connections_per_session: +- params['max_connections_per_session'] = max_connections_per_session +- if default_time2wait: +- params['default_time2wait'] = default_time2wait +- if default_time2retain: +- params['default_time2retain'] = default_time2retain +- if first_burst_length: +- params['first_burst_length'] = first_burst_length +- if immediate_data: +- params['immediate_data'] = immediate_data +- if error_recovery_level: +- params['error_recovery_level'] = error_recovery_level +- if allow_duplicated_isid: +- params['allow_duplicated_isid'] = allow_duplicated_isid +- if max_large_datain_per_connection: +- params['max_large_datain_per_connection'] = max_large_datain_per_connection +- if max_r2t_per_connection: +- params['max_r2t_per_connection'] = max_r2t_per_connection +- if pdu_pool_size: +- params['pdu_pool_size'] = pdu_pool_size +- if immediate_data_pool_size: +- params['immediate_data_pool_size'] = immediate_data_pool_size +- if data_out_pool_size: +- params['data_out_pool_size'] = data_out_pool_size +- +- return client.call('iscsi_set_options', params) +- +- +-def iscsi_set_discovery_auth( +- client, +- disable_chap=None, +- require_chap=None, +- mutual_chap=None, +- chap_group=None): +- """Set CHAP authentication for discovery service. +- +- Args: +- disable_chap: CHAP for discovery session should be disabled (optional) +- require_chap: CHAP for discovery session should be required (optional) +- mutual_chap: CHAP for discovery session should be mutual (optional) +- chap_group: Authentication group ID for discovery session (optional) +- +- Returns: +- True or False +- """ +- params = {} +- +- if disable_chap: +- params['disable_chap'] = disable_chap +- if require_chap: +- params['require_chap'] = require_chap +- if mutual_chap: +- params['mutual_chap'] = mutual_chap +- if chap_group: +- params['chap_group'] = chap_group +- +- return client.call('iscsi_set_discovery_auth', params) +- +- +-def iscsi_get_auth_groups(client): +- """Display current authentication group configuration. +- +- Returns: +- List of current authentication group configuration. +- """ +- return client.call('iscsi_get_auth_groups') +- +- +-def iscsi_get_portal_groups(client): +- """Display current portal group configuration. +- +- Returns: +- List of current portal group configuration. +- """ +- return client.call('iscsi_get_portal_groups') +- +- +-def iscsi_get_initiator_groups(client): +- """Display current initiator group configuration. +- +- Returns: +- List of current initiator group configuration. +- """ +- return client.call('iscsi_get_initiator_groups') +- +- +-def iscsi_get_target_nodes(client): +- """Display target nodes. +- +- Returns: +- List of ISCSI target node objects. +- """ +- return client.call('iscsi_get_target_nodes') +- +- +-def iscsi_create_target_node( +- client, +- luns, +- pg_ig_maps, +- name, +- alias_name, +- queue_depth, +- chap_group=None, +- disable_chap=None, +- require_chap=None, +- mutual_chap=None, +- header_digest=None, +- data_digest=None): +- """Add a target node. +- +- Args: +- luns: List of bdev_name_id_pairs, e.g. [{"bdev_name": "Malloc1", "lun_id": 1}] +- pg_ig_maps: List of pg_ig_mappings, e.g. 
[{"pg_tag": pg, "ig_tag": ig}] +- name: Target node name (ASCII) +- alias_name: Target node alias name (ASCII) +- queue_depth: Desired target queue depth +- chap_group: Authentication group ID for this target node +- disable_chap: CHAP authentication should be disabled for this target node +- require_chap: CHAP authentication should be required for this target node +- mutual_chap: CHAP authentication should be mutual/bidirectional +- header_digest: Header Digest should be required for this target node +- data_digest: Data Digest should be required for this target node +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- 'alias_name': alias_name, +- 'pg_ig_maps': pg_ig_maps, +- 'luns': luns, +- 'queue_depth': queue_depth, +- } +- +- if chap_group: +- params['chap_group'] = chap_group +- if disable_chap: +- params['disable_chap'] = disable_chap +- if require_chap: +- params['require_chap'] = require_chap +- if mutual_chap: +- params['mutual_chap'] = mutual_chap +- if header_digest: +- params['header_digest'] = header_digest +- if data_digest: +- params['data_digest'] = data_digest +- return client.call('iscsi_create_target_node', params) +- +- +-def iscsi_target_node_add_lun(client, name, bdev_name, lun_id=None): +- """Add LUN to the target node. +- +- Args: +- name: Target node name (ASCII) +- bdev_name: bdev name +- lun_id: LUN ID (integer >= 0) +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- 'bdev_name': bdev_name, +- } +- if lun_id: +- params['lun_id'] = lun_id +- return client.call('iscsi_target_node_add_lun', params) +- +- +-def iscsi_target_node_set_auth( +- client, +- name, +- chap_group=None, +- disable_chap=None, +- require_chap=None, +- mutual_chap=None): +- """Set CHAP authentication for the target node. +- +- Args: +- name: Target node name (ASCII) +- chap_group: Authentication group ID for this target node +- disable_chap: CHAP authentication should be disabled for this target node +- require_chap: CHAP authentication should be required for this target node +- mutual_chap: CHAP authentication should be mutual/bidirectional +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- } +- +- if chap_group: +- params['chap_group'] = chap_group +- if disable_chap: +- params['disable_chap'] = disable_chap +- if require_chap: +- params['require_chap'] = require_chap +- if mutual_chap: +- params['mutual_chap'] = mutual_chap +- return client.call('iscsi_target_node_set_auth', params) +- +- +-def iscsi_create_auth_group(client, tag, secrets=None): +- """Create authentication group for CHAP authentication. +- +- Args: +- tag: Authentication group tag (unique, integer > 0). +- secrets: Array of secrets objects (optional). +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- +- if secrets: +- params['secrets'] = secrets +- return client.call('iscsi_create_auth_group', params) +- +- +-def iscsi_delete_auth_group(client, tag): +- """Delete an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- return client.call('iscsi_delete_auth_group', params) +- +- +-def iscsi_auth_group_add_secret(client, tag, user, secret, muser=None, msecret=None): +- """Add a secret to an authentication group. 
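These wrappers map one-to-one onto the iscsi_* JSON-RPC methods, so a target node is built by chaining a few of them. A minimal usage sketch, assuming the usual spdk.rpc.client.JSONRPCClient helper, a target listening on the default /var/tmp/spdk.sock socket, and pre-existing Malloc1/Malloc2 bdevs (all of these names are illustrative, not part of this patch):

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import iscsi

client = JSONRPCClient('/var/tmp/spdk.sock')  # assumed default RPC socket

# Expose bdev Malloc1 as LUN 0 of a new target node mapped to PG 1 / IG 1.
iscsi.iscsi_create_target_node(
    client,
    luns=[{"bdev_name": "Malloc1", "lun_id": 0}],
    pg_ig_maps=[{"pg_tag": 1, "ig_tag": 1}],
    name="iqn.2016-06.io.spdk:disk1",
    alias_name="disk1",
    queue_depth=64)

# Attach a second bdev to the same node as LUN 1.
iscsi.iscsi_target_node_add_lun(client, "iqn.2016-06.io.spdk:disk1",
                                bdev_name="Malloc2", lun_id=1)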
+- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- user: User name for one-way CHAP authentication +- secret: Secret for one-way CHAP authentication +- muser: User name for mutual CHAP authentication (optional) +- msecret: Secret for mutual CHAP authentication (optional) +- +- Returns: +- True or False +- """ +- params = {'tag': tag, 'user': user, 'secret': secret} +- +- if muser: +- params['muser'] = muser +- if msecret: +- params['msecret'] = msecret +- return client.call('iscsi_auth_group_add_secret', params) +- +- +-def iscsi_auth_group_remove_secret(client, tag, user): +- """Remove a secret from an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- user: User name for one-way CHAP authentication +- +- Returns: +- True or False +- """ +- params = {'tag': tag, 'user': user} +- return client.call('iscsi_auth_group_remove_secret', params) +- +- +-def iscsi_target_node_remove_pg_ig_maps(client, pg_ig_maps, name): +- """Delete PG-IG maps from the target node. +- +- Args: +- pg_ig_maps: List of pg_ig_mappings, e.g. [{"pg_tag": pg, "ig_tag": ig}] +- name: Target node alias name (ASCII) +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- 'pg_ig_maps': pg_ig_maps, +- } +- return client.call('iscsi_target_node_remove_pg_ig_maps', params) +- +- +-def iscsi_target_node_add_pg_ig_maps(client, pg_ig_maps, name): +- """Add PG-IG maps to the target node. +- +- Args: +- pg_ig_maps: List of pg_ig_mappings, e.g. [{"pg_tag": pg, "ig_tag": ig}] +- name: Target node alias name (ASCII) +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- 'pg_ig_maps': pg_ig_maps, +- } +- return client.call('iscsi_target_node_add_pg_ig_maps', params) +- +- +-def iscsi_target_node_set_redirect(client, name, pg_tag, redirect_host, redirect_port): +- """Update redirect portal of the public portal group for the target node. +- +- Args: +- name: Target node name (ASCII) +- pg_tag: Portal group tag (unique, integer > 0) +- redirect_host: Numeric IP address to which the target node is redirected +- redirect_port: Numeric TCP port to which the target node is redirected +- +- Returns: +- True or False +- """ +- params = { +- 'name': name, +- 'pg_tag': pg_tag +- } +- +- if redirect_host: +- params['redirect_host'] = redirect_host +- if redirect_port: +- params['redirect_port'] = redirect_port +- return client.call('iscsi_target_node_set_redirect', params) +- +- +-def iscsi_target_node_request_logout(client, name, pg_tag): +- """Request connections to the target node to logout. +- +- Args: +- name: Target node name (ASCII) +- pg_tag: Portal group tag (unique, integer > 0) (optional) +- +- Returns: +- True or False +- """ +- params = {'name': name} +- +- if pg_tag: +- params['pg_tag'] = pg_tag +- return client.call('iscsi_target_node_request_logout', params) +- +- +-def iscsi_create_portal_group(client, portals, tag, private, wait): +- """Add a portal group. +- +- Args: +- portals: List of portals, e.g. [{'host': ip, 'port': port}] +- tag: Initiator group tag (unique, integer > 0) +- private: Public (false) or private (true) portal group for login redirection. +- wait: Do not listen on portals until it is allowed explicitly. 
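CHAP credentials are handled through auth groups and then referenced from target nodes. A hedged sketch along the same lines as the previous one; the group tag, user name and secret below are placeholders:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import iscsi

client = JSONRPCClient('/var/tmp/spdk.sock')

# Create auth group 2 and give it a one-way CHAP secret.
iscsi.iscsi_create_auth_group(client, tag=2)
iscsi.iscsi_auth_group_add_secret(client, tag=2, user="initiatoruser",
                                  secret="chapsecret123")

# Require CHAP from group 2 on an existing target node.
iscsi.iscsi_target_node_set_auth(client, "iqn.2016-06.io.spdk:disk1",
                                 chap_group=2, require_chap=True)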
+- +- Returns: +- True or False +- """ +- params = {'tag': tag, 'portals': portals} +- +- if private: +- params['private'] = private +- if wait: +- params['wait'] = wait +- return client.call('iscsi_create_portal_group', params) +- +- +-def iscsi_start_portal_group(client, tag): +- """Start listening on portals if it is not started yet. +- +- Args: +- tag: Portal group tag (unique, integer > 0) +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- return client.call('iscsi_start_portal_group', params) +- +- +-def iscsi_create_initiator_group(client, tag, initiators, netmasks): +- """Add an initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses, e.g. +- ["ANY"] or ["iqn.2016-06.io.spdk:host1","iqn.2016-06.io.spdk:host2"] +- netmasks: List of initiator netmasks, e.g. ["255.255.0.0","255.248.0.0"] +- +- Returns: +- True or False +- """ +- params = {'tag': tag, 'initiators': initiators, 'netmasks': netmasks} +- return client.call('iscsi_create_initiator_group', params) +- +- +-def iscsi_initiator_group_add_initiators( +- client, +- tag, +- initiators=None, +- netmasks=None): +- """Add initiators to an existing initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses, e.g. +- ["ANY"] or ["iqn.2016-06.io.spdk:host1","iqn.2016-06.io.spdk:host2"] +- netmasks: List of initiator netmasks, e.g. ["255.255.0.0","255.248.0.0"] +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- +- if initiators: +- params['initiators'] = initiators +- if netmasks: +- params['netmasks'] = netmasks +- return client.call('iscsi_initiator_group_add_initiators', params) +- +- +-def iscsi_initiator_group_remove_initiators( +- client, tag, initiators=None, netmasks=None): +- """Delete initiators from an existing initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses, e.g. ["127.0.0.1","192.168.200.100"] +- netmasks: List of initiator netmasks, e.g. ["255.255.0.0","255.248.0.0"] +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- +- if initiators: +- params['initiators'] = initiators +- if netmasks: +- params['netmasks'] = netmasks +- return client.call('iscsi_initiator_group_remove_initiators', params) +- +- +-def iscsi_delete_target_node(client, target_node_name): +- """Delete a target node. +- +- Args: +- target_node_name: Target node name to be deleted. Example: iqn.2016-06.io.spdk:disk1. +- +- Returns: +- True or False +- """ +- params = {'name': target_node_name} +- return client.call('iscsi_delete_target_node', params) +- +- +-def iscsi_delete_portal_group(client, tag): +- """Delete a portal group. +- +- Args: +- tag: Portal group tag (unique, integer > 0) +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- return client.call('iscsi_delete_portal_group', params) +- +- +-def iscsi_delete_initiator_group(client, tag): +- """Delete an initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- +- Returns: +- True or False +- """ +- params = {'tag': tag} +- return client.call('iscsi_delete_initiator_group', params) +- +- +-def iscsi_portal_group_set_auth( +- client, +- tag, +- chap_group=None, +- disable_chap=None, +- require_chap=None, +- mutual_chap=None): +- """Set CHAP authentication for discovery sessions specific for the portal group. 
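Portal groups and initiator groups pair up through the PG-IG map helpers above. A sketch with illustrative addresses (the netmask format follows the docstring examples in this file):

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import iscsi

client = JSONRPCClient('/var/tmp/spdk.sock')

# Listen on 10.0.0.1:3260 as portal group 1.
iscsi.iscsi_create_portal_group(client,
                                portals=[{'host': '10.0.0.1', 'port': '3260'}],
                                tag=1, private=False, wait=False)

# Accept any initiator name, restricted by netmask, as initiator group 1.
iscsi.iscsi_create_initiator_group(client, tag=1, initiators=["ANY"],
                                   netmasks=["255.255.0.0"])

# Map the PG/IG pair onto an existing target node.
iscsi.iscsi_target_node_add_pg_ig_maps(
    client, pg_ig_maps=[{"pg_tag": 1, "ig_tag": 1}],
    name="iqn.2016-06.io.spdk:disk1")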
+- +- Args: +- tag: Portal group tag (unique, integer > 0) +- chap_group: Authentication group ID for this portal group +- disable_chap: CHAP authentication should be disabled for this portal group +- require_chap: CHAP authentication should be required for this portal group +- mutual_chap: CHAP authentication should be mutual/bidirectional +- +- Returns: +- True or False +- """ +- params = { +- 'tag': tag, +- } +- +- if chap_group: +- params['chap_group'] = chap_group +- if disable_chap: +- params['disable_chap'] = disable_chap +- if require_chap: +- params['require_chap'] = require_chap +- if mutual_chap: +- params['mutual_chap'] = mutual_chap +- return client.call('iscsi_portal_group_set_auth', params) +- +- +-def iscsi_get_connections(client): +- """Display iSCSI connections. +- +- Returns: +- List of iSCSI connection. +- """ +- return client.call('iscsi_get_connections') +- +- +-def iscsi_get_options(client): +- """Display iSCSI global parameters. +- +- Returns: +- List of iSCSI global parameter. +- """ +- return client.call('iscsi_get_options') +- +- +-def scsi_get_devices(client): +- """Display SCSI devices. +- +- Returns: +- List of SCSI device. +- """ +- return client.call('scsi_get_devices') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++ ++def iscsi_set_options( ++ client, ++ auth_file=None, ++ node_base=None, ++ nop_timeout=None, ++ nop_in_interval=None, ++ disable_chap=None, ++ require_chap=None, ++ mutual_chap=None, ++ chap_group=None, ++ max_sessions=None, ++ max_queue_depth=None, ++ max_connections_per_session=None, ++ default_time2wait=None, ++ default_time2retain=None, ++ first_burst_length=None, ++ immediate_data=None, ++ error_recovery_level=None, ++ allow_duplicated_isid=None, ++ max_large_datain_per_connection=None, ++ max_r2t_per_connection=None, ++ pdu_pool_size=None, ++ immediate_data_pool_size=None, ++ data_out_pool_size=None): ++ """Set iSCSI target options. 
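One behavioural detail worth noting: iscsi_set_options only forwards arguments that evaluate truthy, so explicit 0 or False values are silently dropped rather than sent to the target. A minimal sketch with illustrative values; this RPC is typically issued before the iSCSI subsystem initializes, for example when the app is started with --wait-for-rpc:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import iscsi

client = JSONRPCClient('/var/tmp/spdk.sock')

# Only the truthy keyword arguments end up in the JSON-RPC request.
iscsi.iscsi_set_options(client,
                        node_base="iqn.2016-06.io.spdk",
                        max_sessions=128,
                        max_queue_depth=64,
                        nop_timeout=60,
                        nop_in_interval=30)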
++ ++ Args: ++ auth_file: Path to CHAP shared secret file (optional) ++ node_base: Prefix of the name of iSCSI target node (optional) ++ nop_timeout: Timeout in seconds to nop-in request to the initiator (optional) ++ nop_in_interval: Time interval in secs between nop-in requests by the target (optional) ++ disable_chap: CHAP for discovery session should be disabled (optional) ++ require_chap: CHAP for discovery session should be required ++ mutual_chap: CHAP for discovery session should be mutual ++ chap_group: Authentication group ID for discovery session ++ max_sessions: Maximum number of sessions in the host ++ max_queue_depth: Maximum number of outstanding I/Os per queue ++ max_connections_per_session: Negotiated parameter, MaxConnections ++ default_time2wait: Negotiated parameter, DefaultTime2Wait ++ default_time2retain: Negotiated parameter, DefaultTime2Retain ++ first_burst_length: Negotiated parameter, FirstBurstLength ++ immediate_data: Negotiated parameter, ImmediateData ++ error_recovery_level: Negotiated parameter, ErrorRecoveryLevel ++ allow_duplicated_isid: Allow duplicated initiator session ID ++ max_large_datain_per_connection: Max number of outstanding split read I/Os per connection (optional) ++ max_r2t_per_connection: Max number of outstanding R2Ts per connection (optional) ++ pdu_pool_size: Number of PDUs in the pool (optional) ++ immediate_data_pool_size: Number of immediate data buffers in the pool (optional) ++ data_out_pool_size: Number of data out buffers in the pool (optional) ++ ++ Returns: ++ True or False ++ """ ++ params = {} ++ ++ if auth_file: ++ params['auth_file'] = auth_file ++ if node_base: ++ params['node_base'] = node_base ++ if nop_timeout: ++ params['nop_timeout'] = nop_timeout ++ if nop_in_interval: ++ params['nop_in_interval'] = nop_in_interval ++ if disable_chap: ++ params['disable_chap'] = disable_chap ++ if require_chap: ++ params['require_chap'] = require_chap ++ if mutual_chap: ++ params['mutual_chap'] = mutual_chap ++ if chap_group: ++ params['chap_group'] = chap_group ++ if max_sessions: ++ params['max_sessions'] = max_sessions ++ if max_queue_depth: ++ params['max_queue_depth'] = max_queue_depth ++ if max_connections_per_session: ++ params['max_connections_per_session'] = max_connections_per_session ++ if default_time2wait: ++ params['default_time2wait'] = default_time2wait ++ if default_time2retain: ++ params['default_time2retain'] = default_time2retain ++ if first_burst_length: ++ params['first_burst_length'] = first_burst_length ++ if immediate_data: ++ params['immediate_data'] = immediate_data ++ if error_recovery_level: ++ params['error_recovery_level'] = error_recovery_level ++ if allow_duplicated_isid: ++ params['allow_duplicated_isid'] = allow_duplicated_isid ++ if max_large_datain_per_connection: ++ params['max_large_datain_per_connection'] = max_large_datain_per_connection ++ if max_r2t_per_connection: ++ params['max_r2t_per_connection'] = max_r2t_per_connection ++ if pdu_pool_size: ++ params['pdu_pool_size'] = pdu_pool_size ++ if immediate_data_pool_size: ++ params['immediate_data_pool_size'] = immediate_data_pool_size ++ if data_out_pool_size: ++ params['data_out_pool_size'] = data_out_pool_size ++ ++ return client.call('iscsi_set_options', params) ++ ++ ++def iscsi_set_discovery_auth( ++ client, ++ disable_chap=None, ++ require_chap=None, ++ mutual_chap=None, ++ chap_group=None): ++ """Set CHAP authentication for discovery service. 
++ ++ Args: ++ disable_chap: CHAP for discovery session should be disabled (optional) ++ require_chap: CHAP for discovery session should be required (optional) ++ mutual_chap: CHAP for discovery session should be mutual (optional) ++ chap_group: Authentication group ID for discovery session (optional) ++ ++ Returns: ++ True or False ++ """ ++ params = {} ++ ++ if disable_chap: ++ params['disable_chap'] = disable_chap ++ if require_chap: ++ params['require_chap'] = require_chap ++ if mutual_chap: ++ params['mutual_chap'] = mutual_chap ++ if chap_group: ++ params['chap_group'] = chap_group ++ ++ return client.call('iscsi_set_discovery_auth', params) ++ ++ ++def iscsi_get_auth_groups(client): ++ """Display current authentication group configuration. ++ ++ Returns: ++ List of current authentication group configuration. ++ """ ++ return client.call('iscsi_get_auth_groups') ++ ++ ++def iscsi_get_portal_groups(client): ++ """Display current portal group configuration. ++ ++ Returns: ++ List of current portal group configuration. ++ """ ++ return client.call('iscsi_get_portal_groups') ++ ++ ++def iscsi_get_initiator_groups(client): ++ """Display current initiator group configuration. ++ ++ Returns: ++ List of current initiator group configuration. ++ """ ++ return client.call('iscsi_get_initiator_groups') ++ ++ ++def iscsi_get_target_nodes(client): ++ """Display target nodes. ++ ++ Returns: ++ List of ISCSI target node objects. ++ """ ++ return client.call('iscsi_get_target_nodes') ++ ++ ++def iscsi_create_target_node( ++ client, ++ luns, ++ pg_ig_maps, ++ name, ++ alias_name, ++ queue_depth, ++ chap_group=None, ++ disable_chap=None, ++ require_chap=None, ++ mutual_chap=None, ++ header_digest=None, ++ data_digest=None): ++ """Add a target node. ++ ++ Args: ++ luns: List of bdev_name_id_pairs, e.g. [{"bdev_name": "Malloc1", "lun_id": 1}] ++ pg_ig_maps: List of pg_ig_mappings, e.g. [{"pg_tag": pg, "ig_tag": ig}] ++ name: Target node name (ASCII) ++ alias_name: Target node alias name (ASCII) ++ queue_depth: Desired target queue depth ++ chap_group: Authentication group ID for this target node ++ disable_chap: CHAP authentication should be disabled for this target node ++ require_chap: CHAP authentication should be required for this target node ++ mutual_chap: CHAP authentication should be mutual/bidirectional ++ header_digest: Header Digest should be required for this target node ++ data_digest: Data Digest should be required for this target node ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ 'alias_name': alias_name, ++ 'pg_ig_maps': pg_ig_maps, ++ 'luns': luns, ++ 'queue_depth': queue_depth, ++ } ++ ++ if chap_group: ++ params['chap_group'] = chap_group ++ if disable_chap: ++ params['disable_chap'] = disable_chap ++ if require_chap: ++ params['require_chap'] = require_chap ++ if mutual_chap: ++ params['mutual_chap'] = mutual_chap ++ if header_digest: ++ params['header_digest'] = header_digest ++ if data_digest: ++ params['data_digest'] = data_digest ++ return client.call('iscsi_create_target_node', params) ++ ++ ++def iscsi_target_node_add_lun(client, name, bdev_name, lun_id=None): ++ """Add LUN to the target node. 
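The read-only helpers return the parsed JSON-RPC result directly, which keeps inspection scripts short; for example:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import iscsi

client = JSONRPCClient('/var/tmp/spdk.sock')

# Dump the current configuration objects as returned by the target.
print(iscsi.iscsi_get_target_nodes(client))
print(iscsi.iscsi_get_portal_groups(client))
print(iscsi.iscsi_get_initiator_groups(client))
print(iscsi.iscsi_get_connections(client))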
++ ++ Args: ++ name: Target node name (ASCII) ++ bdev_name: bdev name ++ lun_id: LUN ID (integer >= 0) ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ 'bdev_name': bdev_name, ++ } ++ if lun_id: ++ params['lun_id'] = lun_id ++ return client.call('iscsi_target_node_add_lun', params) ++ ++ ++def iscsi_target_node_set_auth( ++ client, ++ name, ++ chap_group=None, ++ disable_chap=None, ++ require_chap=None, ++ mutual_chap=None): ++ """Set CHAP authentication for the target node. ++ ++ Args: ++ name: Target node name (ASCII) ++ chap_group: Authentication group ID for this target node ++ disable_chap: CHAP authentication should be disabled for this target node ++ require_chap: CHAP authentication should be required for this target node ++ mutual_chap: CHAP authentication should be mutual/bidirectional ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ } ++ ++ if chap_group: ++ params['chap_group'] = chap_group ++ if disable_chap: ++ params['disable_chap'] = disable_chap ++ if require_chap: ++ params['require_chap'] = require_chap ++ if mutual_chap: ++ params['mutual_chap'] = mutual_chap ++ return client.call('iscsi_target_node_set_auth', params) ++ ++ ++def iscsi_create_auth_group(client, tag, secrets=None): ++ """Create authentication group for CHAP authentication. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0). ++ secrets: Array of secrets objects (optional). ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ ++ if secrets: ++ params['secrets'] = secrets ++ return client.call('iscsi_create_auth_group', params) ++ ++ ++def iscsi_delete_auth_group(client, tag): ++ """Delete an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ return client.call('iscsi_delete_auth_group', params) ++ ++ ++def iscsi_auth_group_add_secret(client, tag, user, secret, muser=None, msecret=None): ++ """Add a secret to an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ user: User name for one-way CHAP authentication ++ secret: Secret for one-way CHAP authentication ++ muser: User name for mutual CHAP authentication (optional) ++ msecret: Secret for mutual CHAP authentication (optional) ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag, 'user': user, 'secret': secret} ++ ++ if muser: ++ params['muser'] = muser ++ if msecret: ++ params['msecret'] = msecret ++ return client.call('iscsi_auth_group_add_secret', params) ++ ++ ++def iscsi_auth_group_remove_secret(client, tag, user): ++ """Remove a secret from an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ user: User name for one-way CHAP authentication ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag, 'user': user} ++ return client.call('iscsi_auth_group_remove_secret', params) ++ ++ ++def iscsi_target_node_remove_pg_ig_maps(client, pg_ig_maps, name): ++ """Delete PG-IG maps from the target node. ++ ++ Args: ++ pg_ig_maps: List of pg_ig_mappings, e.g. [{"pg_tag": pg, "ig_tag": ig}] ++ name: Target node alias name (ASCII) ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ 'pg_ig_maps': pg_ig_maps, ++ } ++ return client.call('iscsi_target_node_remove_pg_ig_maps', params) ++ ++ ++def iscsi_target_node_add_pg_ig_maps(client, pg_ig_maps, name): ++ """Add PG-IG maps to the target node. ++ ++ Args: ++ pg_ig_maps: List of pg_ig_mappings, e.g. 
[{"pg_tag": pg, "ig_tag": ig}] ++ name: Target node alias name (ASCII) ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ 'pg_ig_maps': pg_ig_maps, ++ } ++ return client.call('iscsi_target_node_add_pg_ig_maps', params) ++ ++ ++def iscsi_target_node_set_redirect(client, name, pg_tag, redirect_host, redirect_port): ++ """Update redirect portal of the public portal group for the target node. ++ ++ Args: ++ name: Target node name (ASCII) ++ pg_tag: Portal group tag (unique, integer > 0) ++ redirect_host: Numeric IP address to which the target node is redirected ++ redirect_port: Numeric TCP port to which the target node is redirected ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'name': name, ++ 'pg_tag': pg_tag ++ } ++ ++ if redirect_host: ++ params['redirect_host'] = redirect_host ++ if redirect_port: ++ params['redirect_port'] = redirect_port ++ return client.call('iscsi_target_node_set_redirect', params) ++ ++ ++def iscsi_target_node_request_logout(client, name, pg_tag): ++ """Request connections to the target node to logout. ++ ++ Args: ++ name: Target node name (ASCII) ++ pg_tag: Portal group tag (unique, integer > 0) (optional) ++ ++ Returns: ++ True or False ++ """ ++ params = {'name': name} ++ ++ if pg_tag: ++ params['pg_tag'] = pg_tag ++ return client.call('iscsi_target_node_request_logout', params) ++ ++ ++def iscsi_create_portal_group(client, portals, tag, private, wait): ++ """Add a portal group. ++ ++ Args: ++ portals: List of portals, e.g. [{'host': ip, 'port': port}] ++ tag: Initiator group tag (unique, integer > 0) ++ private: Public (false) or private (true) portal group for login redirection. ++ wait: Do not listen on portals until it is allowed explicitly. ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag, 'portals': portals} ++ ++ if private: ++ params['private'] = private ++ if wait: ++ params['wait'] = wait ++ return client.call('iscsi_create_portal_group', params) ++ ++ ++def iscsi_start_portal_group(client, tag): ++ """Start listening on portals if it is not started yet. ++ ++ Args: ++ tag: Portal group tag (unique, integer > 0) ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ return client.call('iscsi_start_portal_group', params) ++ ++ ++def iscsi_create_initiator_group(client, tag, initiators, netmasks): ++ """Add an initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses, e.g. ++ ["ANY"] or ["iqn.2016-06.io.spdk:host1","iqn.2016-06.io.spdk:host2"] ++ netmasks: List of initiator netmasks, e.g. ["255.255.0.0","255.248.0.0"] ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag, 'initiators': initiators, 'netmasks': netmasks} ++ return client.call('iscsi_create_initiator_group', params) ++ ++ ++def iscsi_initiator_group_add_initiators( ++ client, ++ tag, ++ initiators=None, ++ netmasks=None): ++ """Add initiators to an existing initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses, e.g. ++ ["ANY"] or ["iqn.2016-06.io.spdk:host1","iqn.2016-06.io.spdk:host2"] ++ netmasks: List of initiator netmasks, e.g. 
["255.255.0.0","255.248.0.0"] ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ ++ if initiators: ++ params['initiators'] = initiators ++ if netmasks: ++ params['netmasks'] = netmasks ++ return client.call('iscsi_initiator_group_add_initiators', params) ++ ++ ++def iscsi_initiator_group_remove_initiators( ++ client, tag, initiators=None, netmasks=None): ++ """Delete initiators from an existing initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses, e.g. ["127.0.0.1","192.168.200.100"] ++ netmasks: List of initiator netmasks, e.g. ["255.255.0.0","255.248.0.0"] ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ ++ if initiators: ++ params['initiators'] = initiators ++ if netmasks: ++ params['netmasks'] = netmasks ++ return client.call('iscsi_initiator_group_remove_initiators', params) ++ ++ ++def iscsi_delete_target_node(client, target_node_name): ++ """Delete a target node. ++ ++ Args: ++ target_node_name: Target node name to be deleted. Example: iqn.2016-06.io.spdk:disk1. ++ ++ Returns: ++ True or False ++ """ ++ params = {'name': target_node_name} ++ return client.call('iscsi_delete_target_node', params) ++ ++ ++def iscsi_delete_portal_group(client, tag): ++ """Delete a portal group. ++ ++ Args: ++ tag: Portal group tag (unique, integer > 0) ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ return client.call('iscsi_delete_portal_group', params) ++ ++ ++def iscsi_delete_initiator_group(client, tag): ++ """Delete an initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ ++ Returns: ++ True or False ++ """ ++ params = {'tag': tag} ++ return client.call('iscsi_delete_initiator_group', params) ++ ++ ++def iscsi_portal_group_set_auth( ++ client, ++ tag, ++ chap_group=None, ++ disable_chap=None, ++ require_chap=None, ++ mutual_chap=None): ++ """Set CHAP authentication for discovery sessions specific for the portal group. ++ ++ Args: ++ tag: Portal group tag (unique, integer > 0) ++ chap_group: Authentication group ID for this portal group ++ disable_chap: CHAP authentication should be disabled for this portal group ++ require_chap: CHAP authentication should be required for this portal group ++ mutual_chap: CHAP authentication should be mutual/bidirectional ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'tag': tag, ++ } ++ ++ if chap_group: ++ params['chap_group'] = chap_group ++ if disable_chap: ++ params['disable_chap'] = disable_chap ++ if require_chap: ++ params['require_chap'] = require_chap ++ if mutual_chap: ++ params['mutual_chap'] = mutual_chap ++ return client.call('iscsi_portal_group_set_auth', params) ++ ++ ++def iscsi_get_connections(client): ++ """Display iSCSI connections. ++ ++ Returns: ++ List of iSCSI connection. ++ """ ++ return client.call('iscsi_get_connections') ++ ++ ++def iscsi_get_options(client): ++ """Display iSCSI global parameters. ++ ++ Returns: ++ List of iSCSI global parameter. ++ """ ++ return client.call('iscsi_get_options') ++ ++ ++def scsi_get_devices(client): ++ """Display SCSI devices. ++ ++ Returns: ++ List of SCSI device. ++ """ ++ return client.call('scsi_get_devices') +diff --git a/python/spdk/rpc/log.py b/python/spdk/rpc/log.py +index 5f99aff..89f4f09 100644 +--- a/python/spdk/rpc/log.py ++++ b/python/spdk/rpc/log.py +@@ -1,70 +1,70 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. 
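The log wrappers below are equally thin; enabling a debug flag and raising the log level for troubleshooting might look like this (the "nvme" flag and "DEBUG" level strings come from the docstring examples, and debug flags only produce output in builds with debug logging enabled):

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import log

client = JSONRPCClient('/var/tmp/spdk.sock')

log.log_set_flag(client, "nvme")          # enable the "nvme" debug flag
log.log_set_level(client, "DEBUG")        # global log level
log.log_set_print_level(client, "DEBUG")  # level echoed to the console
print(log.log_get_flags(client))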
+- +- +-def log_set_flag(client, flag): +- """Set log flag. +- +- Args: +- flag: log flag we want to set. (for example "nvme") +- """ +- params = {'flag': flag} +- return client.call('log_set_flag', params) +- +- +-def log_clear_flag(client, flag): +- """Clear log flag. +- +- Args: +- flag: log flag we want to clear. (for example "nvme") +- """ +- params = {'flag': flag} +- return client.call('log_clear_flag', params) +- +- +-def log_get_flags(client): +- """Get log flags +- +- Returns: +- List of log flags +- """ +- return client.call('log_get_flags') +- +- +-def log_set_level(client, level): +- """Set log level. +- +- Args: +- level: log level we want to set. (for example "DEBUG") +- """ +- params = {'level': level} +- return client.call('log_set_level', params) +- +- +-def log_get_level(client): +- """Get log level +- +- Returns: +- Current log level +- """ +- return client.call('log_get_level') +- +- +-def log_set_print_level(client, level): +- """Set log print level. +- +- Args: +- level: log print level we want to set. (for example "DEBUG") +- """ +- params = {'level': level} +- return client.call('log_set_print_level', params) +- +- +-def log_get_print_level(client): +- """Get log print level +- +- Returns: +- Current log print level +- """ +- return client.call('log_get_print_level') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++ ++def log_set_flag(client, flag): ++ """Set log flag. ++ ++ Args: ++ flag: log flag we want to set. (for example "nvme") ++ """ ++ params = {'flag': flag} ++ return client.call('log_set_flag', params) ++ ++ ++def log_clear_flag(client, flag): ++ """Clear log flag. ++ ++ Args: ++ flag: log flag we want to clear. (for example "nvme") ++ """ ++ params = {'flag': flag} ++ return client.call('log_clear_flag', params) ++ ++ ++def log_get_flags(client): ++ """Get log flags ++ ++ Returns: ++ List of log flags ++ """ ++ return client.call('log_get_flags') ++ ++ ++def log_set_level(client, level): ++ """Set log level. ++ ++ Args: ++ level: log level we want to set. (for example "DEBUG") ++ """ ++ params = {'level': level} ++ return client.call('log_set_level', params) ++ ++ ++def log_get_level(client): ++ """Get log level ++ ++ Returns: ++ Current log level ++ """ ++ return client.call('log_get_level') ++ ++ ++def log_set_print_level(client, level): ++ """Set log print level. ++ ++ Args: ++ level: log print level we want to set. (for example "DEBUG") ++ """ ++ params = {'level': level} ++ return client.call('log_set_print_level', params) ++ ++ ++def log_get_print_level(client): ++ """Get log print level ++ ++ Returns: ++ Current log print level ++ """ ++ return client.call('log_get_print_level') +diff --git a/python/spdk/rpc/lvol.py b/python/spdk/rpc/lvol.py +index 9d814a2..c04d5b5 100644 +--- a/python/spdk/rpc/lvol.py ++++ b/python/spdk/rpc/lvol.py +@@ -1,238 +1,238 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +- +-def bdev_lvol_create_lvstore(client, bdev_name, lvs_name, cluster_sz=None, +- clear_method=None, num_md_pages_per_cluster_ratio=None): +- """Construct a logical volume store. +- +- Args: +- bdev_name: bdev on which to construct logical volume store +- lvs_name: name of the logical volume store to create +- cluster_sz: cluster size of the logical volume store in bytes (optional) +- clear_method: Change clear method for data region. 
Available: none, unmap, write_zeroes (optional) +- num_md_pages_per_cluster_ratio: metadata pages per cluster (optional) +- +- Returns: +- UUID of created logical volume store. +- """ +- params = {'bdev_name': bdev_name, 'lvs_name': lvs_name} +- if cluster_sz: +- params['cluster_sz'] = cluster_sz +- if clear_method: +- params['clear_method'] = clear_method +- if num_md_pages_per_cluster_ratio: +- params['num_md_pages_per_cluster_ratio'] = num_md_pages_per_cluster_ratio +- return client.call('bdev_lvol_create_lvstore', params) +- +- +-def bdev_lvol_rename_lvstore(client, old_name, new_name): +- """Rename a logical volume store. +- +- Args: +- old_name: existing logical volume store name +- new_name: new logical volume store name +- """ +- params = { +- 'old_name': old_name, +- 'new_name': new_name +- } +- return client.call('bdev_lvol_rename_lvstore', params) +- +- +-def bdev_lvol_grow_lvstore(client, uuid=None, lvs_name=None): +- """Grow the logical volume store to fill the underlying bdev +- +- Args: +- uuid: UUID of logical volume store to resize (optional) +- lvs_name: name of logical volume store to resize (optional) +- """ +- if (uuid and lvs_name): +- raise ValueError("Exactly one of uuid or lvs_name may be specified") +- params = {} +- if uuid: +- params['uuid'] = uuid +- if lvs_name: +- params['lvs_name'] = lvs_name +- return client.call('bdev_lvol_grow_lvstore', params) +- +- +-def bdev_lvol_create(client, lvol_name, size, thin_provision=False, uuid=None, lvs_name=None, clear_method=None): +- """Create a logical volume on a logical volume store. +- +- Args: +- lvol_name: name of logical volume to create +- size: desired size of logical volume in bytes (will be rounded up to a multiple of cluster size) +- thin_provision: True to enable thin provisioning +- uuid: UUID of logical volume store to create logical volume on (optional) +- lvs_name: name of logical volume store to create logical volume on (optional) +- +- Either uuid or lvs_name must be specified, but not both. +- +- Returns: +- Name of created logical volume block device. +- """ +- if (uuid and lvs_name) or (not uuid and not lvs_name): +- raise ValueError("Either uuid or lvs_name must be specified, but not both") +- +- params = {'lvol_name': lvol_name, 'size': size} +- if thin_provision: +- params['thin_provision'] = thin_provision +- if uuid: +- params['uuid'] = uuid +- if lvs_name: +- params['lvs_name'] = lvs_name +- if clear_method: +- params['clear_method'] = clear_method +- return client.call('bdev_lvol_create', params) +- +- +-def bdev_lvol_snapshot(client, lvol_name, snapshot_name): +- """Capture a snapshot of the current state of a logical volume. +- +- Args: +- lvol_name: logical volume to create a snapshot from +- snapshot_name: name for the newly created snapshot +- +- Returns: +- Name of created logical volume snapshot. +- """ +- params = { +- 'lvol_name': lvol_name, +- 'snapshot_name': snapshot_name +- } +- return client.call('bdev_lvol_snapshot', params) +- +- +-def bdev_lvol_clone(client, snapshot_name, clone_name): +- """Create a logical volume based on a snapshot. +- +- Args: +- snapshot_name: snapshot to clone +- clone_name: name of logical volume to create +- +- Returns: +- Name of created logical volume clone. +- """ +- params = { +- 'snapshot_name': snapshot_name, +- 'clone_name': clone_name +- } +- return client.call('bdev_lvol_clone', params) +- +- +-def bdev_lvol_rename(client, old_name, new_name): +- """Rename a logical volume. 
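A typical logical-volume flow chains these calls: create a store on a bdev, carve a thin-provisioned volume out of it, then snapshot and clone it. A hedged sketch; the bdev name, store name and sizes are illustrative:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import lvol

client = JSONRPCClient('/var/tmp/spdk.sock')

lvs_uuid = lvol.bdev_lvol_create_lvstore(client, bdev_name="Malloc0",
                                         lvs_name="lvs0")

# 64 MiB thin-provisioned volume; the size is rounded up to the cluster size.
lv = lvol.bdev_lvol_create(client, lvol_name="lv0", size=64 * 1024 * 1024,
                           thin_provision=True, lvs_name="lvs0")

snap = lvol.bdev_lvol_snapshot(client, lvol_name=lv, snapshot_name="lv0_snap")
clone = lvol.bdev_lvol_clone(client, snapshot_name=snap, clone_name="lv0_clone")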
+- +- Args: +- old_name: existing logical volume name +- new_name: new logical volume name +- """ +- params = { +- 'old_name': old_name, +- 'new_name': new_name +- } +- return client.call('bdev_lvol_rename', params) +- +- +-def bdev_lvol_resize(client, name, size): +- """Resize a logical volume. +- +- Args: +- name: name of logical volume to resize +- size: desired size of logical volume in bytes (will be rounded up to a multiple of cluster size) +- """ +- params = { +- 'name': name, +- 'size': size, +- } +- return client.call('bdev_lvol_resize', params) +- +- +-def bdev_lvol_set_read_only(client, name): +- """Mark logical volume as read only. +- +- Args: +- name: name of logical volume to set as read only +- """ +- params = { +- 'name': name, +- } +- return client.call('bdev_lvol_set_read_only', params) +- +- +-def bdev_lvol_delete(client, name): +- """Destroy a logical volume. +- +- Args: +- name: name of logical volume to destroy +- """ +- params = { +- 'name': name, +- } +- return client.call('bdev_lvol_delete', params) +- +- +-def bdev_lvol_inflate(client, name): +- """Inflate a logical volume. +- +- Args: +- name: name of logical volume to inflate +- """ +- params = { +- 'name': name, +- } +- return client.call('bdev_lvol_inflate', params) +- +- +-def bdev_lvol_decouple_parent(client, name): +- """Decouple parent of a logical volume. +- +- Args: +- name: name of logical volume to decouple parent +- """ +- params = { +- 'name': name, +- } +- return client.call('bdev_lvol_decouple_parent', params) +- +- +-def bdev_lvol_delete_lvstore(client, uuid=None, lvs_name=None): +- """Destroy a logical volume store. +- +- Args: +- uuid: UUID of logical volume store to destroy (optional) +- lvs_name: name of logical volume store to destroy (optional) +- +- Either uuid or lvs_name must be specified, but not both. +- """ +- if (uuid and lvs_name) or (not uuid and not lvs_name): +- raise ValueError("Exactly one of uuid or lvs_name must be specified") +- +- params = {} +- if uuid: +- params['uuid'] = uuid +- if lvs_name: +- params['lvs_name'] = lvs_name +- return client.call('bdev_lvol_delete_lvstore', params) +- +- +-def bdev_lvol_get_lvstores(client, uuid=None, lvs_name=None): +- """List logical volume stores. +- +- Args: +- uuid: UUID of logical volume store to retrieve information about (optional) +- lvs_name: name of logical volume store to retrieve information about (optional) +- +- Either uuid or lvs_name may be specified, but not both. +- If both uuid and lvs_name are omitted, information about all logical volume stores is returned. +- """ +- if (uuid and lvs_name): +- raise ValueError("Exactly one of uuid or lvs_name may be specified") +- params = {} +- if uuid: +- params['uuid'] = uuid +- if lvs_name: +- params['lvs_name'] = lvs_name +- return client.call('bdev_lvol_get_lvstores', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++ ++def bdev_lvol_create_lvstore(client, bdev_name, lvs_name, cluster_sz=None, ++ clear_method=None, num_md_pages_per_cluster_ratio=None): ++ """Construct a logical volume store. ++ ++ Args: ++ bdev_name: bdev on which to construct logical volume store ++ lvs_name: name of the logical volume store to create ++ cluster_sz: cluster size of the logical volume store in bytes (optional) ++ clear_method: Change clear method for data region. 
Available: none, unmap, write_zeroes (optional) ++ num_md_pages_per_cluster_ratio: metadata pages per cluster (optional) ++ ++ Returns: ++ UUID of created logical volume store. ++ """ ++ params = {'bdev_name': bdev_name, 'lvs_name': lvs_name} ++ if cluster_sz: ++ params['cluster_sz'] = cluster_sz ++ if clear_method: ++ params['clear_method'] = clear_method ++ if num_md_pages_per_cluster_ratio: ++ params['num_md_pages_per_cluster_ratio'] = num_md_pages_per_cluster_ratio ++ return client.call('bdev_lvol_create_lvstore', params) ++ ++ ++def bdev_lvol_rename_lvstore(client, old_name, new_name): ++ """Rename a logical volume store. ++ ++ Args: ++ old_name: existing logical volume store name ++ new_name: new logical volume store name ++ """ ++ params = { ++ 'old_name': old_name, ++ 'new_name': new_name ++ } ++ return client.call('bdev_lvol_rename_lvstore', params) ++ ++ ++def bdev_lvol_grow_lvstore(client, uuid=None, lvs_name=None): ++ """Grow the logical volume store to fill the underlying bdev ++ ++ Args: ++ uuid: UUID of logical volume store to resize (optional) ++ lvs_name: name of logical volume store to resize (optional) ++ """ ++ if (uuid and lvs_name): ++ raise ValueError("Exactly one of uuid or lvs_name may be specified") ++ params = {} ++ if uuid: ++ params['uuid'] = uuid ++ if lvs_name: ++ params['lvs_name'] = lvs_name ++ return client.call('bdev_lvol_grow_lvstore', params) ++ ++ ++def bdev_lvol_create(client, lvol_name, size, thin_provision=False, uuid=None, lvs_name=None, clear_method=None): ++ """Create a logical volume on a logical volume store. ++ ++ Args: ++ lvol_name: name of logical volume to create ++ size: desired size of logical volume in bytes (will be rounded up to a multiple of cluster size) ++ thin_provision: True to enable thin provisioning ++ uuid: UUID of logical volume store to create logical volume on (optional) ++ lvs_name: name of logical volume store to create logical volume on (optional) ++ ++ Either uuid or lvs_name must be specified, but not both. ++ ++ Returns: ++ Name of created logical volume block device. ++ """ ++ if (uuid and lvs_name) or (not uuid and not lvs_name): ++ raise ValueError("Either uuid or lvs_name must be specified, but not both") ++ ++ params = {'lvol_name': lvol_name, 'size': size} ++ if thin_provision: ++ params['thin_provision'] = thin_provision ++ if uuid: ++ params['uuid'] = uuid ++ if lvs_name: ++ params['lvs_name'] = lvs_name ++ if clear_method: ++ params['clear_method'] = clear_method ++ return client.call('bdev_lvol_create', params) ++ ++ ++def bdev_lvol_snapshot(client, lvol_name, snapshot_name): ++ """Capture a snapshot of the current state of a logical volume. ++ ++ Args: ++ lvol_name: logical volume to create a snapshot from ++ snapshot_name: name for the newly created snapshot ++ ++ Returns: ++ Name of created logical volume snapshot. ++ """ ++ params = { ++ 'lvol_name': lvol_name, ++ 'snapshot_name': snapshot_name ++ } ++ return client.call('bdev_lvol_snapshot', params) ++ ++ ++def bdev_lvol_clone(client, snapshot_name, clone_name): ++ """Create a logical volume based on a snapshot. ++ ++ Args: ++ snapshot_name: snapshot to clone ++ clone_name: name of logical volume to create ++ ++ Returns: ++ Name of created logical volume clone. ++ """ ++ params = { ++ 'snapshot_name': snapshot_name, ++ 'clone_name': clone_name ++ } ++ return client.call('bdev_lvol_clone', params) ++ ++ ++def bdev_lvol_rename(client, old_name, new_name): ++ """Rename a logical volume. 
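Store-level helpers address a logical volume store either by uuid or by lvs_name, never both; the wrappers raise ValueError if both are supplied, and bdev_lvol_get_lvstores lists every store when neither is given. A short sketch using the name from the earlier example:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import lvol

client = JSONRPCClient('/var/tmp/spdk.sock')

print(lvol.bdev_lvol_get_lvstores(client, lvs_name="lvs0"))  # one store
print(lvol.bdev_lvol_get_lvstores(client))                   # all stores

lvol.bdev_lvol_delete_lvstore(client, lvs_name="lvs0")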
++ ++ Args: ++ old_name: existing logical volume name ++ new_name: new logical volume name ++ """ ++ params = { ++ 'old_name': old_name, ++ 'new_name': new_name ++ } ++ return client.call('bdev_lvol_rename', params) ++ ++ ++def bdev_lvol_resize(client, name, size): ++ """Resize a logical volume. ++ ++ Args: ++ name: name of logical volume to resize ++ size: desired size of logical volume in bytes (will be rounded up to a multiple of cluster size) ++ """ ++ params = { ++ 'name': name, ++ 'size': size, ++ } ++ return client.call('bdev_lvol_resize', params) ++ ++ ++def bdev_lvol_set_read_only(client, name): ++ """Mark logical volume as read only. ++ ++ Args: ++ name: name of logical volume to set as read only ++ """ ++ params = { ++ 'name': name, ++ } ++ return client.call('bdev_lvol_set_read_only', params) ++ ++ ++def bdev_lvol_delete(client, name): ++ """Destroy a logical volume. ++ ++ Args: ++ name: name of logical volume to destroy ++ """ ++ params = { ++ 'name': name, ++ } ++ return client.call('bdev_lvol_delete', params) ++ ++ ++def bdev_lvol_inflate(client, name): ++ """Inflate a logical volume. ++ ++ Args: ++ name: name of logical volume to inflate ++ """ ++ params = { ++ 'name': name, ++ } ++ return client.call('bdev_lvol_inflate', params) ++ ++ ++def bdev_lvol_decouple_parent(client, name): ++ """Decouple parent of a logical volume. ++ ++ Args: ++ name: name of logical volume to decouple parent ++ """ ++ params = { ++ 'name': name, ++ } ++ return client.call('bdev_lvol_decouple_parent', params) ++ ++ ++def bdev_lvol_delete_lvstore(client, uuid=None, lvs_name=None): ++ """Destroy a logical volume store. ++ ++ Args: ++ uuid: UUID of logical volume store to destroy (optional) ++ lvs_name: name of logical volume store to destroy (optional) ++ ++ Either uuid or lvs_name must be specified, but not both. ++ """ ++ if (uuid and lvs_name) or (not uuid and not lvs_name): ++ raise ValueError("Exactly one of uuid or lvs_name must be specified") ++ ++ params = {} ++ if uuid: ++ params['uuid'] = uuid ++ if lvs_name: ++ params['lvs_name'] = lvs_name ++ return client.call('bdev_lvol_delete_lvstore', params) ++ ++ ++def bdev_lvol_get_lvstores(client, uuid=None, lvs_name=None): ++ """List logical volume stores. ++ ++ Args: ++ uuid: UUID of logical volume store to retrieve information about (optional) ++ lvs_name: name of logical volume store to retrieve information about (optional) ++ ++ Either uuid or lvs_name may be specified, but not both. ++ If both uuid and lvs_name are omitted, information about all logical volume stores is returned. ++ """ ++ if (uuid and lvs_name): ++ raise ValueError("Exactly one of uuid or lvs_name may be specified") ++ params = {} ++ if uuid: ++ params['uuid'] = uuid ++ if lvs_name: ++ params['lvs_name'] = lvs_name ++ return client.call('bdev_lvol_get_lvstores', params) +diff --git a/python/spdk/rpc/mlx5.py b/python/spdk/rpc/mlx5.py +index 2c1424d..a8c874c 100644 +--- a/python/spdk/rpc/mlx5.py ++++ b/python/spdk/rpc/mlx5.py +@@ -1,21 +1,21 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. +-# All rights reserved. +- +-from spdk.rpc.helpers import deprecated_alias +- +- +-def mlx5_scan_accel_module(client, qp_size=None, num_requests=None): +- """Enable mlx5 accel module. Scans all mlx5 devices which can perform needed operations +- +- Args: +- qp_size: Qpair size. 
(optional) +- num_requests: size of a global requests pool per mlx5 device (optional) +- """ +- params = {} +- +- if qp_size is not None: +- params['qp_size'] = qp_size +- if num_requests is not None: +- params['num_requests'] = num_requests +- return client.call('mlx5_scan_accel_module', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. ++# All rights reserved. ++ ++from spdk.rpc.helpers import deprecated_alias ++ ++ ++def mlx5_scan_accel_module(client, qp_size=None, num_requests=None): ++ """Enable mlx5 accel module. Scans all mlx5 devices which can perform needed operations ++ ++ Args: ++ qp_size: Qpair size. (optional) ++ num_requests: size of a global requests pool per mlx5 device (optional) ++ """ ++ params = {} ++ ++ if qp_size is not None: ++ params['qp_size'] = qp_size ++ if num_requests is not None: ++ params['num_requests'] = num_requests ++ return client.call('mlx5_scan_accel_module', params) +diff --git a/python/spdk/rpc/nbd.py b/python/spdk/rpc/nbd.py +index 73d17d2..258f83c 100644 +--- a/python/spdk/rpc/nbd.py ++++ b/python/spdk/rpc/nbd.py +@@ -1,24 +1,24 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +- +-def nbd_start_disk(client, bdev_name, nbd_device): +- params = { +- 'bdev_name': bdev_name +- } +- if nbd_device: +- params['nbd_device'] = nbd_device +- return client.call('nbd_start_disk', params) +- +- +-def nbd_stop_disk(client, nbd_device): +- params = {'nbd_device': nbd_device} +- return client.call('nbd_stop_disk', params) +- +- +-def nbd_get_disks(client, nbd_device=None): +- params = {} +- if nbd_device: +- params['nbd_device'] = nbd_device +- return client.call('nbd_get_disks', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++ ++def nbd_start_disk(client, bdev_name, nbd_device): ++ params = { ++ 'bdev_name': bdev_name ++ } ++ if nbd_device: ++ params['nbd_device'] = nbd_device ++ return client.call('nbd_start_disk', params) ++ ++ ++def nbd_stop_disk(client, nbd_device): ++ params = {'nbd_device': nbd_device} ++ return client.call('nbd_stop_disk', params) ++ ++ ++def nbd_get_disks(client, nbd_device=None): ++ params = {} ++ if nbd_device: ++ params['nbd_device'] = nbd_device ++ return client.call('nbd_get_disks', params) +diff --git a/python/spdk/rpc/notify.py b/python/spdk/rpc/notify.py +index 34dfe36..e78d680 100644 +--- a/python/spdk/rpc/notify.py ++++ b/python/spdk/rpc/notify.py +@@ -1,30 +1,30 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +- +-def notify_get_types(client): +- return client.call("notify_get_types") +- +- +-def notify_get_notifications(client, +- id=None, +- max=None): +- """ +- +- Args: +- id First ID to start fetching from +- max Maximum number of notifications to return in response +- +- Return: +- Notifications array +- """ +- +- params = {} +- if id: +- params['id'] = id +- +- if max: +- params['max'] = max +- +- return client.call("notify_get_notifications", params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. 
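The nbd and notify wrappers are similarly small. Exporting a bdev through the kernel NBD driver and polling the notification log might look like the following sketch (device path and bdev name are illustrative, and the nbd kernel module must be loaded for the export to work):

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import nbd, notify

client = JSONRPCClient('/var/tmp/spdk.sock')

# Export bdev Malloc0 as /dev/nbd0.
nbd.nbd_start_disk(client, bdev_name="Malloc0", nbd_device="/dev/nbd0")
print(nbd.nbd_get_disks(client))

# Fetch up to 10 notifications starting from event ID 1.
print(notify.notify_get_notifications(client, id=1, max=10))

nbd.nbd_stop_disk(client, nbd_device="/dev/nbd0")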
++ ++ ++def notify_get_types(client): ++ return client.call("notify_get_types") ++ ++ ++def notify_get_notifications(client, ++ id=None, ++ max=None): ++ """ ++ ++ Args: ++ id First ID to start fetching from ++ max Maximum number of notifications to return in response ++ ++ Return: ++ Notifications array ++ """ ++ ++ params = {} ++ if id: ++ params['id'] = id ++ ++ if max: ++ params['max'] = max ++ ++ return client.call("notify_get_notifications", params) +diff --git a/python/spdk/rpc/nvme.py b/python/spdk/rpc/nvme.py +index b75b9ed..597543a 100644 +--- a/python/spdk/rpc/nvme.py ++++ b/python/spdk/rpc/nvme.py +@@ -1,140 +1,140 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +- +-def bdev_nvme_send_cmd(client, name, cmd_type, data_direction, cmdbuf, +- data=None, metadata=None, +- data_len=None, metadata_len=None, +- timeout_ms=None): +- """Send one NVMe command +- +- Args: +- name: Name of the operating NVMe controller +- cmd_type: Type of nvme cmd. Valid values are: admin, io +- data_direction: Direction of data transfer. Valid values are: c2h, h2c +- cmdbuf: NVMe command encoded by base64 urlsafe +- data: Data transferring to controller from host, encoded by base64 urlsafe +- metadata: metadata transferring to controller from host, encoded by base64 urlsafe +- data_length: Data length required to transfer from controller to host +- metadata_length: Metadata length required to transfer from controller to host +- timeout-ms: Command execution timeout value, in milliseconds, if 0, don't track timeout +- +- Returns: +- NVMe completion queue entry, requested data and metadata, all are encoded by base64 urlsafe. +- """ +- params = {'name': name, +- 'cmd_type': cmd_type, +- 'data_direction': data_direction, +- 'cmdbuf': cmdbuf} +- +- if data: +- params['data'] = data +- if metadata: +- params['metadata'] = metadata +- if data_len: +- params['data_len'] = data_len +- if metadata_len: +- params['metadata_len'] = metadata_len +- if timeout_ms: +- params['timeout_ms'] = timeout_ms +- +- return client.call('bdev_nvme_send_cmd', params) +- +- +-def bdev_nvme_get_controllers(client, name=None): +- """Get information about NVMe controllers. +- +- Args: +- name: NVMe controller name to query (optional; if omitted, query all NVMe controllers) +- +- Returns: +- List of NVMe controller information objects. +- """ +- params = {} +- if name: +- params['name'] = name +- return client.call('bdev_nvme_get_controllers', params) +- +- +-def bdev_nvme_opal_init(client, nvme_ctrlr_name, password): +- """Init nvme opal. Take ownership and activate +- +- Args: +- nvme_ctrlr_name: name of nvme ctrlr +- password: password to init opal +- """ +- params = { +- 'nvme_ctrlr_name': nvme_ctrlr_name, +- 'password': password, +- } +- +- return client.call('bdev_nvme_opal_init', params) +- +- +-def bdev_nvme_opal_revert(client, nvme_ctrlr_name, password): +- """Revert opal to default factory settings. Erase all data. +- +- Args: +- nvme_ctrlr_name: name of nvme ctrlr +- password: password +- """ +- params = { +- 'nvme_ctrlr_name': nvme_ctrlr_name, +- 'password': password, +- } +- +- return client.call('bdev_nvme_opal_revert', params) +- +- +-def bdev_nvme_add_error_injection(client, name, opc, cmd_type, do_not_submit, timeout_in_us, +- err_count, sct, sc): +- """Add error injection +- +- Args: +- name: Name of the operating NVMe controller +- opc: Opcode of the NVMe command +- cmd_type: Type of NVMe command. 
Valid values are: admin, io +- do_not_submit: Do not submit commands to the controller +- timeout_in_us: Wait specified microseconds when do_not_submit is true +- err_count: Number of matching NVMe commands to inject errors +- sct: NVMe status code type +- sc: NVMe status code +- +- Returns: +- True on success, RPC error otherwise +- """ +- params = {'name': name, +- 'opc': opc, +- 'cmd_type': cmd_type} +- +- if do_not_submit: +- params['do_not_submit'] = do_not_submit +- if timeout_in_us: +- params['timeout_in_us'] = timeout_in_us +- if err_count: +- params['err_count'] = err_count +- if sct: +- params['sct'] = sct +- if sc: +- params['sc'] = sc +- +- return client.call('bdev_nvme_add_error_injection', params) +- +- +-def bdev_nvme_remove_error_injection(client, name, opc, cmd_type): +- """Remove error injection +- +- Args: +- name: Name of the operating NVMe controller +- opc: Opcode of the NVMe command +- cmd_type: Type of NVMe command. Valid values are: admin, io +- +- Returns: +- True on success, RPC error otherwise +- """ +- params = {'name': name, +- 'opc': opc, +- 'cmd_type': cmd_type} +- +- return client.call('bdev_nvme_remove_error_injection', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++ ++def bdev_nvme_send_cmd(client, name, cmd_type, data_direction, cmdbuf, ++ data=None, metadata=None, ++ data_len=None, metadata_len=None, ++ timeout_ms=None): ++ """Send one NVMe command ++ ++ Args: ++ name: Name of the operating NVMe controller ++ cmd_type: Type of nvme cmd. Valid values are: admin, io ++ data_direction: Direction of data transfer. Valid values are: c2h, h2c ++ cmdbuf: NVMe command encoded by base64 urlsafe ++ data: Data transferring to controller from host, encoded by base64 urlsafe ++ metadata: metadata transferring to controller from host, encoded by base64 urlsafe ++ data_length: Data length required to transfer from controller to host ++ metadata_length: Metadata length required to transfer from controller to host ++ timeout-ms: Command execution timeout value, in milliseconds, if 0, don't track timeout ++ ++ Returns: ++ NVMe completion queue entry, requested data and metadata, all are encoded by base64 urlsafe. ++ """ ++ params = {'name': name, ++ 'cmd_type': cmd_type, ++ 'data_direction': data_direction, ++ 'cmdbuf': cmdbuf} ++ ++ if data: ++ params['data'] = data ++ if metadata: ++ params['metadata'] = metadata ++ if data_len: ++ params['data_len'] = data_len ++ if metadata_len: ++ params['metadata_len'] = metadata_len ++ if timeout_ms: ++ params['timeout_ms'] = timeout_ms ++ ++ return client.call('bdev_nvme_send_cmd', params) ++ ++ ++def bdev_nvme_get_controllers(client, name=None): ++ """Get information about NVMe controllers. ++ ++ Args: ++ name: NVMe controller name to query (optional; if omitted, query all NVMe controllers) ++ ++ Returns: ++ List of NVMe controller information objects. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ return client.call('bdev_nvme_get_controllers', params) ++ ++ ++def bdev_nvme_opal_init(client, nvme_ctrlr_name, password): ++ """Init nvme opal. Take ownership and activate ++ ++ Args: ++ nvme_ctrlr_name: name of nvme ctrlr ++ password: password to init opal ++ """ ++ params = { ++ 'nvme_ctrlr_name': nvme_ctrlr_name, ++ 'password': password, ++ } ++ ++ return client.call('bdev_nvme_opal_init', params) ++ ++ ++def bdev_nvme_opal_revert(client, nvme_ctrlr_name, password): ++ """Revert opal to default factory settings. Erase all data. 
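For fault-injection testing, bdev_nvme_add_error_injection forces failures on matching commands and bdev_nvme_remove_error_injection clears the rule again. A hedged sketch; the controller name is illustrative, opcode 0x06 is the admin Identify command, and sct=0/sc=0x2 encode a generic "invalid field" completion:

from spdk.rpc.client import JSONRPCClient
from spdk.rpc import nvme

client = JSONRPCClient('/var/tmp/spdk.sock')

# Fail the next two admin Identify commands on controller Nvme0.
nvme.bdev_nvme_add_error_injection(client, name="Nvme0", opc=0x06,
                                   cmd_type="admin", do_not_submit=False,
                                   timeout_in_us=0, err_count=2,
                                   sct=0, sc=0x2)

# ...exercise the workload under test, then remove the rule.
nvme.bdev_nvme_remove_error_injection(client, name="Nvme0", opc=0x06,
                                      cmd_type="admin")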
++ ++ Args: ++ nvme_ctrlr_name: name of nvme ctrlr ++ password: password ++ """ ++ params = { ++ 'nvme_ctrlr_name': nvme_ctrlr_name, ++ 'password': password, ++ } ++ ++ return client.call('bdev_nvme_opal_revert', params) ++ ++ ++def bdev_nvme_add_error_injection(client, name, opc, cmd_type, do_not_submit, timeout_in_us, ++ err_count, sct, sc): ++ """Add error injection ++ ++ Args: ++ name: Name of the operating NVMe controller ++ opc: Opcode of the NVMe command ++ cmd_type: Type of NVMe command. Valid values are: admin, io ++ do_not_submit: Do not submit commands to the controller ++ timeout_in_us: Wait specified microseconds when do_not_submit is true ++ err_count: Number of matching NVMe commands to inject errors ++ sct: NVMe status code type ++ sc: NVMe status code ++ ++ Returns: ++ True on success, RPC error otherwise ++ """ ++ params = {'name': name, ++ 'opc': opc, ++ 'cmd_type': cmd_type} ++ ++ if do_not_submit: ++ params['do_not_submit'] = do_not_submit ++ if timeout_in_us: ++ params['timeout_in_us'] = timeout_in_us ++ if err_count: ++ params['err_count'] = err_count ++ if sct: ++ params['sct'] = sct ++ if sc: ++ params['sc'] = sc ++ ++ return client.call('bdev_nvme_add_error_injection', params) ++ ++ ++def bdev_nvme_remove_error_injection(client, name, opc, cmd_type): ++ """Remove error injection ++ ++ Args: ++ name: Name of the operating NVMe controller ++ opc: Opcode of the NVMe command ++ cmd_type: Type of NVMe command. Valid values are: admin, io ++ ++ Returns: ++ True on success, RPC error otherwise ++ """ ++ params = {'name': name, ++ 'opc': opc, ++ 'cmd_type': cmd_type} ++ ++ return client.call('bdev_nvme_remove_error_injection', params) +diff --git a/python/spdk/rpc/nvmf.py b/python/spdk/rpc/nvmf.py +index 4d4d074..085f319 100644 +--- a/python/spdk/rpc/nvmf.py ++++ b/python/spdk/rpc/nvmf.py +@@ -1,596 +1,596 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +-from .cmd_parser import * +- +- +-def nvmf_set_max_subsystems(client, +- max_subsystems=None): +- """Set NVMe-oF target options. +- +- Args: +- max_subsystems: Maximum number of NVMe-oF subsystems (e.g. 1024) +- +- Returns: +- True or False +- """ +- params = {} +- +- params['max_subsystems'] = max_subsystems +- return client.call('nvmf_set_max_subsystems', params) +- +- +-def nvmf_set_config(client, +- passthru_identify_ctrlr=None, +- poll_groups_mask=None, +- discovery_filter=None): +- """Set NVMe-oF target subsystem configuration. +- +- Args: +- discovery_filter: Set discovery filter (optional), possible values are: `match_any` (default) or +- comma separated values: `transport`, `address`, `svcid` +- +- Returns: +- True or False +- """ +- params = {} +- +- if passthru_identify_ctrlr: +- admin_cmd_passthru = {} +- admin_cmd_passthru['identify_ctrlr'] = passthru_identify_ctrlr +- params['admin_cmd_passthru'] = admin_cmd_passthru +- if poll_groups_mask: +- params['poll_groups_mask'] = poll_groups_mask +- if discovery_filter: +- params['discovery_filter'] = discovery_filter +- +- return client.call('nvmf_set_config', params) +- +- +-def nvmf_create_target(client, +- name, +- max_subsystems=0, +- discovery_filter="match_any"): +- """Create a new NVMe-oF Target. +- +- Args: +- name: Must be unique within the application +- max_subsystems: Maximum number of NVMe-oF subsystems (e.g. 1024). default: 0 (Uses SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS). 
+- discovery_filter: Set discovery filter (optional), possible values are: `match_any` (default) or +- comma separated values: `transport`, `address`, `svcid` +- +- Returns: +- The name of the new target. +- """ +- params = {} +- +- params['name'] = name +- params['max_subsystems'] = max_subsystems +- params['discovery_filter'] = discovery_filter +- return client.call("nvmf_create_target", params) +- +- +-def nvmf_delete_target(client, +- name): +- """Destroy an NVMe-oF Target. +- +- Args: +- name: The name of the target you wish to destroy +- +- Returns: +- True on success or False +- """ +- params = {} +- +- params['name'] = name +- return client.call("nvmf_delete_target", params) +- +- +-def nvmf_get_targets(client): +- """Get a list of all the NVMe-oF targets in this application +- +- Returns: +- An array of target names. +- """ +- +- return client.call("nvmf_get_targets") +- +- +-def nvmf_create_transport(client, **params): +- """NVMf Transport Create options. +- +- Args: +- trtype: Transport type (ex. RDMA) +- max_queue_depth: Max number of outstanding I/O per queue (optional) +- max_io_qpairs_per_ctrlr: Max number of IO qpairs per controller (optional) +- in_capsule_data_size: Maximum in-capsule data size in bytes (optional) +- max_io_size: Maximum I/O data size in bytes (optional) +- io_unit_size: I/O unit size in bytes (optional) +- max_aq_depth: Max size admin queue per controller (optional) +- num_shared_buffers: The number of pooled data buffers available to the transport (optional) +- buf_cache_size: The number of shared buffers to reserve for each poll group (optional) +- zcopy: Use zero-copy operations if the underlying bdev supports them (optional) +- num_cqe: The number of CQ entries to configure CQ size. Only used when no_srq=true - RDMA specific (optional) +- max_srq_depth: Max number of outstanding I/O per shared receive queue - RDMA specific (optional) +- no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional) +- c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional) +- dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional) +- acceptor_backlog: Pending connections allowed at one time - RDMA specific (optional) +- abort_timeout_sec: Abort execution timeout value, in seconds (optional) +- no_wr_batching: Boolean flag to disable work requests batching - RDMA specific (optional) +- control_msg_num: The number of control messages per poll group - TCP specific (optional) +- disable_mappable_bar0: disable client mmap() of BAR0 - VFIO-USER specific (optional) +- disable_adaptive_irq: Disable adaptive interrupt feature - VFIO-USER specific (optional) +- disable_shadow_doorbells: disable shadow doorbell support - VFIO-USER specific (optional) +- acceptor_poll_rate: Acceptor poll period in microseconds (optional) +- Returns: +- True or False +- """ +- +- strip_globals(params) +- apply_defaults(params, no_srq=False, c2h_success=True) +- remove_null(params) +- +- return client.call('nvmf_create_transport', params) +- +- +-def nvmf_get_transports(client, trtype=None, tgt_name=None): +- """Get list of NVMe-oF transports. +- Args: +- trtype: Transport type (optional; if omitted, query all transports). +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- List of NVMe-oF transport objects. 
+- """ +- +- params = {} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- if trtype: +- params['trtype'] = trtype +- +- return client.call('nvmf_get_transports', params) +- +- +-def nvmf_get_subsystems(client, nqn=None, tgt_name=None): +- """Get list of NVMe-oF subsystems. +- Args: +- nqn: Subsystem NQN (optional; if omitted, query all subsystems). +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- List of NVMe-oF subsystem objects. +- """ +- +- params = {} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- if nqn: +- params['nqn'] = nqn +- +- return client.call('nvmf_get_subsystems', params) +- +- +-def nvmf_create_subsystem(client, +- nqn, +- serial_number, +- tgt_name=None, +- model_number='SPDK bdev Controller', +- allow_any_host=False, +- max_namespaces=0, +- ana_reporting=False, +- min_cntlid=1, +- max_cntlid=0xffef): +- """Construct an NVMe over Fabrics target subsystem. +- +- Args: +- nqn: Subsystem NQN. +- tgt_name: name of the parent NVMe-oF target (optional). +- serial_number: Serial number of virtual controller. +- model_number: Model number of virtual controller. +- allow_any_host: Allow any host (True) or enforce allowed host list (False). Default: False. +- max_namespaces: Maximum number of namespaces that can be attached to the subsystem (optional). Default: 0 (Unlimited). +- ana_reporting: Enable ANA reporting feature. Default: False. +- min_cntlid: Minimum controller ID. Default: 1 +- max_cntlid: Maximum controller ID. Default: 0xffef +- +- +- Returns: +- True or False +- """ +- params = { +- 'nqn': nqn, +- } +- +- if serial_number: +- params['serial_number'] = serial_number +- +- if model_number: +- params['model_number'] = model_number +- +- if allow_any_host: +- params['allow_any_host'] = True +- +- if max_namespaces is not None: +- params['max_namespaces'] = max_namespaces +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- if ana_reporting: +- params['ana_reporting'] = ana_reporting +- +- if min_cntlid is not None: +- params['min_cntlid'] = min_cntlid +- +- if max_cntlid is not None: +- params['max_cntlid'] = max_cntlid +- +- return client.call('nvmf_create_subsystem', params) +- +- +-def nvmf_subsystem_add_listener(client, **params): +- +- """Add a new listen address to an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- trtype: Transport type ("RDMA"). +- traddr: Transport address. +- trsvcid: Transport service ID (required for RDMA or TCP). +- tgt_name: name of the parent NVMe-oF target (optional). +- adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). +- +- Returns: +- True or False +- """ +- +- strip_globals(params) +- apply_defaults(params, tgt_name=None) +- group_as(params, 'listen_address', ['trtype', 'traddr', 'trsvcid', 'adrfam']) +- remove_null(params) +- +- if params['nqn'] == 'discovery': +- params['nqn'] = 'nqn.2014-08.org.nvmexpress.discovery' +- +- return client.call('nvmf_subsystem_add_listener', params) +- +- +-def nvmf_subsystem_remove_listener( +- client, +- nqn, +- trtype, +- traddr, +- trsvcid, +- adrfam, +- tgt_name=None): +- """Remove existing listen address from an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- trtype: Transport type ("RDMA"). +- traddr: Transport address. +- trsvcid: Transport service ID. +- tgt_name: name of the parent NVMe-oF target (optional). +- adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). 
+- +- Returns: +- True or False +- """ +- listen_address = {'trtype': trtype, +- 'traddr': traddr} +- +- if trsvcid: +- listen_address['trsvcid'] = trsvcid +- +- if adrfam: +- listen_address['adrfam'] = adrfam +- +- params = {'nqn': nqn, +- 'listen_address': listen_address} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- if params['nqn'] == 'discovery': +- params['nqn'] = 'nqn.2014-08.org.nvmexpress.discovery' +- +- return client.call('nvmf_subsystem_remove_listener', params) +- +- +-def nvmf_subsystem_listener_set_ana_state( +- client, +- nqn, +- ana_state, +- trtype, +- traddr, +- trsvcid, +- adrfam, +- tgt_name=None, +- anagrpid=None): +- """Set ANA state of a listener for an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- ana_state: ANA state to set ("optimized", "non_optimized", or "inaccessible"). +- trtype: Transport type ("RDMA"). +- traddr: Transport address. +- trsvcid: Transport service ID. +- tgt_name: name of the parent NVMe-oF target (optional). +- adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). +- anagrpid: ANA group ID (optional) +- +- Returns: +- True or False +- """ +- listen_address = {'trtype': trtype, +- 'traddr': traddr, +- 'trsvcid': trsvcid} +- +- if adrfam: +- listen_address['adrfam'] = adrfam +- +- params = {'nqn': nqn, +- 'listen_address': listen_address, +- 'ana_state': ana_state} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- if anagrpid: +- params['anagrpid'] = anagrpid +- +- return client.call('nvmf_subsystem_listener_set_ana_state', params) +- +- +-def nvmf_subsystem_add_ns(client, +- nqn, +- bdev_name, +- tgt_name=None, +- ptpl_file=None, +- nsid=None, +- nguid=None, +- eui64=None, +- uuid=None, +- anagrpid=None): +- """Add a namespace to a subsystem. +- +- Args: +- nqn: Subsystem NQN. +- bdev_name: Name of bdev to expose as a namespace. +- tgt_name: name of the parent NVMe-oF target (optional). +- nsid: Namespace ID (optional). +- nguid: 16-byte namespace globally unique identifier in hexadecimal (optional). +- eui64: 8-byte namespace EUI-64 in hexadecimal (e.g. "ABCDEF0123456789") (optional). +- uuid: Namespace UUID (optional). +- anagrpid: ANA group ID (optional). +- +- Returns: +- The namespace ID +- """ +- ns = {'bdev_name': bdev_name} +- +- if ptpl_file: +- ns['ptpl_file'] = ptpl_file +- +- if nsid: +- ns['nsid'] = nsid +- +- if nguid: +- ns['nguid'] = nguid +- +- if eui64: +- ns['eui64'] = eui64 +- +- if uuid: +- ns['uuid'] = uuid +- +- if anagrpid: +- ns['anagrpid'] = anagrpid +- +- params = {'nqn': nqn, +- 'namespace': ns} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_add_ns', params) +- +- +-def nvmf_subsystem_remove_ns(client, nqn, nsid, tgt_name=None): +- """Remove a existing namespace from a subsystem. +- +- Args: +- nqn: Subsystem NQN. +- nsid: Namespace ID. +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- True or False +- """ +- params = {'nqn': nqn, +- 'nsid': nsid} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_remove_ns', params) +- +- +-def nvmf_subsystem_add_host(client, nqn, host, tgt_name=None): +- """Add a host NQN to the list of allowed hosts. +- +- Args: +- nqn: Subsystem NQN. +- host: Host NQN to add to the list of allowed host NQNs +- tgt_name: name of the parent NVMe-oF target (optional). 
+- +- Returns: +- True or False +- """ +- params = {'nqn': nqn, +- 'host': host} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_add_host', params) +- +- +-def nvmf_subsystem_remove_host(client, nqn, host, tgt_name=None): +- """Remove a host NQN from the list of allowed hosts. +- +- Args: +- nqn: Subsystem NQN. +- host: Host NQN to remove to the list of allowed host NQNs +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- True or False +- """ +- params = {'nqn': nqn, +- 'host': host} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_remove_host', params) +- +- +-def nvmf_subsystem_allow_any_host(client, nqn, disable, tgt_name=None): +- """Configure a subsystem to allow any host to connect or to enforce the host NQN list. +- +- Args: +- nqn: Subsystem NQN. +- disable: Allow any host (true) or enforce allowed host list (false). +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- True or False +- """ +- params = {'nqn': nqn, 'allow_any_host': False if disable else True} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_allow_any_host', params) +- +- +-def nvmf_delete_subsystem(client, nqn, tgt_name=None): +- """Delete an existing NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- True or False +- """ +- params = {'nqn': nqn} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_delete_subsystem', params) +- +- +-def nvmf_subsystem_get_controllers(client, nqn, tgt_name=None): +- """Get list of controllers of an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- List of controller objects of an NVMe-oF subsystem. +- """ +- params = {'nqn': nqn} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_get_controllers', params) +- +- +-def nvmf_subsystem_get_qpairs(client, nqn, tgt_name=None): +- """Get list of queue pairs of an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- List of queue pair objects of an NVMe-oF subsystem. +- """ +- params = {'nqn': nqn} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_get_qpairs', params) +- +- +-def nvmf_subsystem_get_listeners(client, nqn, tgt_name=None): +- """Get list of listeners of an NVMe-oF subsystem. +- +- Args: +- nqn: Subsystem NQN. +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- List of listener objects of an NVMe-oF subsystem. +- """ +- params = {'nqn': nqn} +- +- if tgt_name: +- params['tgt_name'] = tgt_name +- +- return client.call('nvmf_subsystem_get_listeners', params) +- +- +-def nvmf_get_stats(client, tgt_name=None): +- """Query NVMf statistics. +- +- Args: +- tgt_name: name of the parent NVMe-oF target (optional). +- +- Returns: +- Current NVMf statistics. 
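Taken together, these nvmf_* wrappers map onto the usual target bring-up order: transport, then subsystem, then namespace, then listener. A hedged sketch of that flow, assuming the spdk.rpc package from this tree is importable, a running nvmf target on the default RPC socket, and placeholder NQN, bdev and address values:

    from spdk.rpc.client import JSONRPCClient   # assumed import path for the RPC client
    from spdk.rpc import nvmf                   # module patched in this hunk

    def setup_tcp_subsystem(client, nqn='nqn.2016-06.io.spdk:cnode1', bdev_name='Malloc0'):
        # 1. transport, 2. subsystem, 3. namespace, 4. listener
        nvmf.nvmf_create_transport(client, trtype='TCP')
        nvmf.nvmf_create_subsystem(client, nqn=nqn,
                                   serial_number='SPDK00000000000001',
                                   allow_any_host=True)
        nvmf.nvmf_subsystem_add_ns(client, nqn=nqn, bdev_name=bdev_name)
        nvmf.nvmf_subsystem_add_listener(client, nqn=nqn, trtype='TCP',
                                         traddr='127.0.0.1', trsvcid='4420',
                                         adrfam='IPv4')

    setup_tcp_subsystem(JSONRPCClient('/var/tmp/spdk.sock'))   # socket path assumed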
+- """ +- +- params = {} +- +- if tgt_name: +- params = { +- 'tgt_name': tgt_name, +- } +- +- return client.call('nvmf_get_stats', params) +- +- +-def nvmf_set_crdt(client, crdt1=None, crdt2=None, crdt3=None): +- """Set the 3 crdt (Command Retry Delay Time) values +- +- Args: +- crdt1: Command Retry Delay Time 1 +- crdt2: Command Retry Delay Time 2 +- crdt3: Command Retry Delay Time 3 +- +- Returns: +- True or False +- """ +- params = {} +- if crdt1 is not None: +- params['crdt1'] = crdt1 +- if crdt2 is not None: +- params['crdt2'] = crdt2 +- if crdt3 is not None: +- params['crdt3'] = crdt3 +- +- return client.call('nvmf_set_crdt', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++from .cmd_parser import * ++ ++ ++def nvmf_set_max_subsystems(client, ++ max_subsystems=None): ++ """Set NVMe-oF target options. ++ ++ Args: ++ max_subsystems: Maximum number of NVMe-oF subsystems (e.g. 1024) ++ ++ Returns: ++ True or False ++ """ ++ params = {} ++ ++ params['max_subsystems'] = max_subsystems ++ return client.call('nvmf_set_max_subsystems', params) ++ ++ ++def nvmf_set_config(client, ++ passthru_identify_ctrlr=None, ++ poll_groups_mask=None, ++ discovery_filter=None): ++ """Set NVMe-oF target subsystem configuration. ++ ++ Args: ++ discovery_filter: Set discovery filter (optional), possible values are: `match_any` (default) or ++ comma separated values: `transport`, `address`, `svcid` ++ ++ Returns: ++ True or False ++ """ ++ params = {} ++ ++ if passthru_identify_ctrlr: ++ admin_cmd_passthru = {} ++ admin_cmd_passthru['identify_ctrlr'] = passthru_identify_ctrlr ++ params['admin_cmd_passthru'] = admin_cmd_passthru ++ if poll_groups_mask: ++ params['poll_groups_mask'] = poll_groups_mask ++ if discovery_filter: ++ params['discovery_filter'] = discovery_filter ++ ++ return client.call('nvmf_set_config', params) ++ ++ ++def nvmf_create_target(client, ++ name, ++ max_subsystems=0, ++ discovery_filter="match_any"): ++ """Create a new NVMe-oF Target. ++ ++ Args: ++ name: Must be unique within the application ++ max_subsystems: Maximum number of NVMe-oF subsystems (e.g. 1024). default: 0 (Uses SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS). ++ discovery_filter: Set discovery filter (optional), possible values are: `match_any` (default) or ++ comma separated values: `transport`, `address`, `svcid` ++ ++ Returns: ++ The name of the new target. ++ """ ++ params = {} ++ ++ params['name'] = name ++ params['max_subsystems'] = max_subsystems ++ params['discovery_filter'] = discovery_filter ++ return client.call("nvmf_create_target", params) ++ ++ ++def nvmf_delete_target(client, ++ name): ++ """Destroy an NVMe-oF Target. ++ ++ Args: ++ name: The name of the target you wish to destroy ++ ++ Returns: ++ True on success or False ++ """ ++ params = {} ++ ++ params['name'] = name ++ return client.call("nvmf_delete_target", params) ++ ++ ++def nvmf_get_targets(client): ++ """Get a list of all the NVMe-oF targets in this application ++ ++ Returns: ++ An array of target names. ++ """ ++ ++ return client.call("nvmf_get_targets") ++ ++ ++def nvmf_create_transport(client, **params): ++ """NVMf Transport Create options. ++ ++ Args: ++ trtype: Transport type (ex. 
RDMA) ++ max_queue_depth: Max number of outstanding I/O per queue (optional) ++ max_io_qpairs_per_ctrlr: Max number of IO qpairs per controller (optional) ++ in_capsule_data_size: Maximum in-capsule data size in bytes (optional) ++ max_io_size: Maximum I/O data size in bytes (optional) ++ io_unit_size: I/O unit size in bytes (optional) ++ max_aq_depth: Max size admin queue per controller (optional) ++ num_shared_buffers: The number of pooled data buffers available to the transport (optional) ++ buf_cache_size: The number of shared buffers to reserve for each poll group (optional) ++ zcopy: Use zero-copy operations if the underlying bdev supports them (optional) ++ num_cqe: The number of CQ entries to configure CQ size. Only used when no_srq=true - RDMA specific (optional) ++ max_srq_depth: Max number of outstanding I/O per shared receive queue - RDMA specific (optional) ++ no_srq: Boolean flag to disable SRQ even for devices that support it - RDMA specific (optional) ++ c2h_success: Boolean flag to disable the C2H success optimization - TCP specific (optional) ++ dif_insert_or_strip: Boolean flag to enable DIF insert/strip for I/O - TCP specific (optional) ++ acceptor_backlog: Pending connections allowed at one time - RDMA specific (optional) ++ abort_timeout_sec: Abort execution timeout value, in seconds (optional) ++ no_wr_batching: Boolean flag to disable work requests batching - RDMA specific (optional) ++ control_msg_num: The number of control messages per poll group - TCP specific (optional) ++ disable_mappable_bar0: disable client mmap() of BAR0 - VFIO-USER specific (optional) ++ disable_adaptive_irq: Disable adaptive interrupt feature - VFIO-USER specific (optional) ++ disable_shadow_doorbells: disable shadow doorbell support - VFIO-USER specific (optional) ++ acceptor_poll_rate: Acceptor poll period in microseconds (optional) ++ Returns: ++ True or False ++ """ ++ ++ strip_globals(params) ++ apply_defaults(params, no_srq=False, c2h_success=True) ++ remove_null(params) ++ ++ return client.call('nvmf_create_transport', params) ++ ++ ++def nvmf_get_transports(client, trtype=None, tgt_name=None): ++ """Get list of NVMe-oF transports. ++ Args: ++ trtype: Transport type (optional; if omitted, query all transports). ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ List of NVMe-oF transport objects. ++ """ ++ ++ params = {} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ if trtype: ++ params['trtype'] = trtype ++ ++ return client.call('nvmf_get_transports', params) ++ ++ ++def nvmf_get_subsystems(client, nqn=None, tgt_name=None): ++ """Get list of NVMe-oF subsystems. ++ Args: ++ nqn: Subsystem NQN (optional; if omitted, query all subsystems). ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ List of NVMe-oF subsystem objects. ++ """ ++ ++ params = {} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ if nqn: ++ params['nqn'] = nqn ++ ++ return client.call('nvmf_get_subsystems', params) ++ ++ ++def nvmf_create_subsystem(client, ++ nqn, ++ serial_number, ++ tgt_name=None, ++ model_number='SPDK bdev Controller', ++ allow_any_host=False, ++ max_namespaces=0, ++ ana_reporting=False, ++ min_cntlid=1, ++ max_cntlid=0xffef): ++ """Construct an NVMe over Fabrics target subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ serial_number: Serial number of virtual controller. ++ model_number: Model number of virtual controller. 
++ allow_any_host: Allow any host (True) or enforce allowed host list (False). Default: False. ++ max_namespaces: Maximum number of namespaces that can be attached to the subsystem (optional). Default: 0 (Unlimited). ++ ana_reporting: Enable ANA reporting feature. Default: False. ++ min_cntlid: Minimum controller ID. Default: 1 ++ max_cntlid: Maximum controller ID. Default: 0xffef ++ ++ ++ Returns: ++ True or False ++ """ ++ params = { ++ 'nqn': nqn, ++ } ++ ++ if serial_number: ++ params['serial_number'] = serial_number ++ ++ if model_number: ++ params['model_number'] = model_number ++ ++ if allow_any_host: ++ params['allow_any_host'] = True ++ ++ if max_namespaces is not None: ++ params['max_namespaces'] = max_namespaces ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ if ana_reporting: ++ params['ana_reporting'] = ana_reporting ++ ++ if min_cntlid is not None: ++ params['min_cntlid'] = min_cntlid ++ ++ if max_cntlid is not None: ++ params['max_cntlid'] = max_cntlid ++ ++ return client.call('nvmf_create_subsystem', params) ++ ++ ++def nvmf_subsystem_add_listener(client, **params): ++ ++ """Add a new listen address to an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ trtype: Transport type ("RDMA"). ++ traddr: Transport address. ++ trsvcid: Transport service ID (required for RDMA or TCP). ++ tgt_name: name of the parent NVMe-oF target (optional). ++ adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). ++ ++ Returns: ++ True or False ++ """ ++ ++ strip_globals(params) ++ apply_defaults(params, tgt_name=None) ++ group_as(params, 'listen_address', ['trtype', 'traddr', 'trsvcid', 'adrfam']) ++ remove_null(params) ++ ++ if params['nqn'] == 'discovery': ++ params['nqn'] = 'nqn.2014-08.org.nvmexpress.discovery' ++ ++ return client.call('nvmf_subsystem_add_listener', params) ++ ++ ++def nvmf_subsystem_remove_listener( ++ client, ++ nqn, ++ trtype, ++ traddr, ++ trsvcid, ++ adrfam, ++ tgt_name=None): ++ """Remove existing listen address from an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ trtype: Transport type ("RDMA"). ++ traddr: Transport address. ++ trsvcid: Transport service ID. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). ++ ++ Returns: ++ True or False ++ """ ++ listen_address = {'trtype': trtype, ++ 'traddr': traddr} ++ ++ if trsvcid: ++ listen_address['trsvcid'] = trsvcid ++ ++ if adrfam: ++ listen_address['adrfam'] = adrfam ++ ++ params = {'nqn': nqn, ++ 'listen_address': listen_address} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ if params['nqn'] == 'discovery': ++ params['nqn'] = 'nqn.2014-08.org.nvmexpress.discovery' ++ ++ return client.call('nvmf_subsystem_remove_listener', params) ++ ++ ++def nvmf_subsystem_listener_set_ana_state( ++ client, ++ nqn, ++ ana_state, ++ trtype, ++ traddr, ++ trsvcid, ++ adrfam, ++ tgt_name=None, ++ anagrpid=None): ++ """Set ANA state of a listener for an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ ana_state: ANA state to set ("optimized", "non_optimized", or "inaccessible"). ++ trtype: Transport type ("RDMA"). ++ traddr: Transport address. ++ trsvcid: Transport service ID. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ adrfam: Address family ("IPv4", "IPv6", "IB", or "FC"). 
++ anagrpid: ANA group ID (optional) ++ ++ Returns: ++ True or False ++ """ ++ listen_address = {'trtype': trtype, ++ 'traddr': traddr, ++ 'trsvcid': trsvcid} ++ ++ if adrfam: ++ listen_address['adrfam'] = adrfam ++ ++ params = {'nqn': nqn, ++ 'listen_address': listen_address, ++ 'ana_state': ana_state} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ if anagrpid: ++ params['anagrpid'] = anagrpid ++ ++ return client.call('nvmf_subsystem_listener_set_ana_state', params) ++ ++ ++def nvmf_subsystem_add_ns(client, ++ nqn, ++ bdev_name, ++ tgt_name=None, ++ ptpl_file=None, ++ nsid=None, ++ nguid=None, ++ eui64=None, ++ uuid=None, ++ anagrpid=None): ++ """Add a namespace to a subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ bdev_name: Name of bdev to expose as a namespace. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ nsid: Namespace ID (optional). ++ nguid: 16-byte namespace globally unique identifier in hexadecimal (optional). ++ eui64: 8-byte namespace EUI-64 in hexadecimal (e.g. "ABCDEF0123456789") (optional). ++ uuid: Namespace UUID (optional). ++ anagrpid: ANA group ID (optional). ++ ++ Returns: ++ The namespace ID ++ """ ++ ns = {'bdev_name': bdev_name} ++ ++ if ptpl_file: ++ ns['ptpl_file'] = ptpl_file ++ ++ if nsid: ++ ns['nsid'] = nsid ++ ++ if nguid: ++ ns['nguid'] = nguid ++ ++ if eui64: ++ ns['eui64'] = eui64 ++ ++ if uuid: ++ ns['uuid'] = uuid ++ ++ if anagrpid: ++ ns['anagrpid'] = anagrpid ++ ++ params = {'nqn': nqn, ++ 'namespace': ns} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_add_ns', params) ++ ++ ++def nvmf_subsystem_remove_ns(client, nqn, nsid, tgt_name=None): ++ """Remove a existing namespace from a subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ nsid: Namespace ID. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ True or False ++ """ ++ params = {'nqn': nqn, ++ 'nsid': nsid} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_remove_ns', params) ++ ++ ++def nvmf_subsystem_add_host(client, nqn, host, tgt_name=None): ++ """Add a host NQN to the list of allowed hosts. ++ ++ Args: ++ nqn: Subsystem NQN. ++ host: Host NQN to add to the list of allowed host NQNs ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ True or False ++ """ ++ params = {'nqn': nqn, ++ 'host': host} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_add_host', params) ++ ++ ++def nvmf_subsystem_remove_host(client, nqn, host, tgt_name=None): ++ """Remove a host NQN from the list of allowed hosts. ++ ++ Args: ++ nqn: Subsystem NQN. ++ host: Host NQN to remove to the list of allowed host NQNs ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ True or False ++ """ ++ params = {'nqn': nqn, ++ 'host': host} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_remove_host', params) ++ ++ ++def nvmf_subsystem_allow_any_host(client, nqn, disable, tgt_name=None): ++ """Configure a subsystem to allow any host to connect or to enforce the host NQN list. ++ ++ Args: ++ nqn: Subsystem NQN. ++ disable: Allow any host (true) or enforce allowed host list (false). ++ tgt_name: name of the parent NVMe-oF target (optional). 
++ ++ Returns: ++ True or False ++ """ ++ params = {'nqn': nqn, 'allow_any_host': False if disable else True} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_allow_any_host', params) ++ ++ ++def nvmf_delete_subsystem(client, nqn, tgt_name=None): ++ """Delete an existing NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ True or False ++ """ ++ params = {'nqn': nqn} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_delete_subsystem', params) ++ ++ ++def nvmf_subsystem_get_controllers(client, nqn, tgt_name=None): ++ """Get list of controllers of an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ List of controller objects of an NVMe-oF subsystem. ++ """ ++ params = {'nqn': nqn} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_get_controllers', params) ++ ++ ++def nvmf_subsystem_get_qpairs(client, nqn, tgt_name=None): ++ """Get list of queue pairs of an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ List of queue pair objects of an NVMe-oF subsystem. ++ """ ++ params = {'nqn': nqn} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_get_qpairs', params) ++ ++ ++def nvmf_subsystem_get_listeners(client, nqn, tgt_name=None): ++ """Get list of listeners of an NVMe-oF subsystem. ++ ++ Args: ++ nqn: Subsystem NQN. ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ List of listener objects of an NVMe-oF subsystem. ++ """ ++ params = {'nqn': nqn} ++ ++ if tgt_name: ++ params['tgt_name'] = tgt_name ++ ++ return client.call('nvmf_subsystem_get_listeners', params) ++ ++ ++def nvmf_get_stats(client, tgt_name=None): ++ """Query NVMf statistics. ++ ++ Args: ++ tgt_name: name of the parent NVMe-oF target (optional). ++ ++ Returns: ++ Current NVMf statistics. ++ """ ++ ++ params = {} ++ ++ if tgt_name: ++ params = { ++ 'tgt_name': tgt_name, ++ } ++ ++ return client.call('nvmf_get_stats', params) ++ ++ ++def nvmf_set_crdt(client, crdt1=None, crdt2=None, crdt3=None): ++ """Set the 3 crdt (Command Retry Delay Time) values ++ ++ Args: ++ crdt1: Command Retry Delay Time 1 ++ crdt2: Command Retry Delay Time 2 ++ crdt3: Command Retry Delay Time 3 ++ ++ Returns: ++ True or False ++ """ ++ params = {} ++ if crdt1 is not None: ++ params['crdt1'] = crdt1 ++ if crdt2 is not None: ++ params['crdt2'] = crdt2 ++ if crdt3 is not None: ++ params['crdt3'] = crdt3 ++ ++ return client.call('nvmf_set_crdt', params) +diff --git a/python/spdk/rpc/pmem.py b/python/spdk/rpc/pmem.py +index 59a01ef..72a3683 100644 +--- a/python/spdk/rpc/pmem.py ++++ b/python/spdk/rpc/pmem.py +@@ -1,34 +1,34 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +- +-def bdev_pmem_create_pool(client, pmem_file, num_blocks, block_size): +- """Create pmem pool at specified path. 
+- Args: +- pmem_file: path at which to create pmem pool +- num_blocks: number of blocks for created pmem pool file +- block_size: block size for pmem pool file +- """ +- params = {'pmem_file': pmem_file, +- 'num_blocks': num_blocks, +- 'block_size': block_size} +- return client.call('bdev_pmem_create_pool', params) +- +- +-def bdev_pmem_get_pool_info(client, pmem_file): +- """Get details about pmem pool. +- Args: +- pmem_file: path to pmem pool +- """ +- params = {'pmem_file': pmem_file} +- return client.call('bdev_pmem_get_pool_info', params) +- +- +-def bdev_pmem_delete_pool(client, pmem_file): +- """Delete pmem pool. +- Args: +- pmem_file: path to pmem pool +- """ +- params = {'pmem_file': pmem_file} +- return client.call('bdev_pmem_delete_pool', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++ ++def bdev_pmem_create_pool(client, pmem_file, num_blocks, block_size): ++ """Create pmem pool at specified path. ++ Args: ++ pmem_file: path at which to create pmem pool ++ num_blocks: number of blocks for created pmem pool file ++ block_size: block size for pmem pool file ++ """ ++ params = {'pmem_file': pmem_file, ++ 'num_blocks': num_blocks, ++ 'block_size': block_size} ++ return client.call('bdev_pmem_create_pool', params) ++ ++ ++def bdev_pmem_get_pool_info(client, pmem_file): ++ """Get details about pmem pool. ++ Args: ++ pmem_file: path to pmem pool ++ """ ++ params = {'pmem_file': pmem_file} ++ return client.call('bdev_pmem_get_pool_info', params) ++ ++ ++def bdev_pmem_delete_pool(client, pmem_file): ++ """Delete pmem pool. ++ Args: ++ pmem_file: path to pmem pool ++ """ ++ params = {'pmem_file': pmem_file} ++ return client.call('bdev_pmem_delete_pool', params) +diff --git a/python/spdk/rpc/sock.py b/python/spdk/rpc/sock.py +index 4065e2c..4e5ac99 100644 +--- a/python/spdk/rpc/sock.py ++++ b/python/spdk/rpc/sock.py +@@ -1,91 +1,91 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation. +-# All rights reserved. +- +- +-def sock_impl_get_options(client, impl_name=None): +- """Get parameters for the socket layer implementation. +- +- Args: +- impl_name: name of socket implementation, e.g. posix +- """ +- params = {} +- +- params['impl_name'] = impl_name +- +- return client.call('sock_impl_get_options', params) +- +- +-def sock_impl_set_options(client, +- impl_name=None, +- recv_buf_size=None, +- send_buf_size=None, +- enable_recv_pipe=None, +- enable_quickack=None, +- enable_placement_id=None, +- enable_zerocopy_send_server=None, +- enable_zerocopy_send_client=None, +- zerocopy_threshold=None, +- tls_version=None, +- enable_ktls=None, +- psk_key=None, +- psk_identity=None): +- """Set parameters for the socket layer implementation. +- +- Args: +- impl_name: name of socket implementation, e.g. posix +- recv_buf_size: size of socket receive buffer in bytes (optional) +- send_buf_size: size of socket send buffer in bytes (optional) +- enable_recv_pipe: enable or disable receive pipe (optional) +- enable_quickack: enable or disable quickack (optional) +- enable_placement_id: option for placement_id. 
0:disable,1:incoming_napi,2:incoming_cpu (optional) +- enable_zerocopy_send_server: enable or disable zerocopy on send for server sockets(optional) +- enable_zerocopy_send_client: enable or disable zerocopy on send for client sockets(optional) +- zerocopy_threshold: set zerocopy_threshold in bytes(optional) +- tls_version: set TLS protocol version (optional) +- enable_ktls: enable or disable Kernel TLS (optional) +- psk_key: set psk_key (optional) +- psk_identity: set psk_identity (optional) +- """ +- params = {} +- +- params['impl_name'] = impl_name +- if recv_buf_size is not None: +- params['recv_buf_size'] = recv_buf_size +- if send_buf_size is not None: +- params['send_buf_size'] = send_buf_size +- if enable_recv_pipe is not None: +- params['enable_recv_pipe'] = enable_recv_pipe +- if enable_quickack is not None: +- params['enable_quickack'] = enable_quickack +- if enable_placement_id is not None: +- params['enable_placement_id'] = enable_placement_id +- if enable_zerocopy_send_server is not None: +- params['enable_zerocopy_send_server'] = enable_zerocopy_send_server +- if enable_zerocopy_send_client is not None: +- params['enable_zerocopy_send_client'] = enable_zerocopy_send_client +- if zerocopy_threshold is not None: +- params['zerocopy_threshold'] = zerocopy_threshold +- if tls_version is not None: +- params['tls_version'] = tls_version +- if enable_ktls is not None: +- params['enable_ktls'] = enable_ktls +- if psk_key is not None: +- params['psk_key'] = psk_key +- if psk_identity is not None: +- params['psk_identity'] = psk_identity +- +- return client.call('sock_impl_set_options', params) +- +- +-def sock_set_default_impl(client, impl_name=None): +- """Set the default socket implementation. +- +- Args: +- impl_name: name of socket implementation, e.g. posix +- """ +- params = {} +- +- params['impl_name'] = impl_name +- +- return client.call('sock_set_default_impl', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation. ++# All rights reserved. ++ ++ ++def sock_impl_get_options(client, impl_name=None): ++ """Get parameters for the socket layer implementation. ++ ++ Args: ++ impl_name: name of socket implementation, e.g. posix ++ """ ++ params = {} ++ ++ params['impl_name'] = impl_name ++ ++ return client.call('sock_impl_get_options', params) ++ ++ ++def sock_impl_set_options(client, ++ impl_name=None, ++ recv_buf_size=None, ++ send_buf_size=None, ++ enable_recv_pipe=None, ++ enable_quickack=None, ++ enable_placement_id=None, ++ enable_zerocopy_send_server=None, ++ enable_zerocopy_send_client=None, ++ zerocopy_threshold=None, ++ tls_version=None, ++ enable_ktls=None, ++ psk_key=None, ++ psk_identity=None): ++ """Set parameters for the socket layer implementation. ++ ++ Args: ++ impl_name: name of socket implementation, e.g. posix ++ recv_buf_size: size of socket receive buffer in bytes (optional) ++ send_buf_size: size of socket send buffer in bytes (optional) ++ enable_recv_pipe: enable or disable receive pipe (optional) ++ enable_quickack: enable or disable quickack (optional) ++ enable_placement_id: option for placement_id. 
0:disable,1:incoming_napi,2:incoming_cpu (optional) ++ enable_zerocopy_send_server: enable or disable zerocopy on send for server sockets(optional) ++ enable_zerocopy_send_client: enable or disable zerocopy on send for client sockets(optional) ++ zerocopy_threshold: set zerocopy_threshold in bytes(optional) ++ tls_version: set TLS protocol version (optional) ++ enable_ktls: enable or disable Kernel TLS (optional) ++ psk_key: set psk_key (optional) ++ psk_identity: set psk_identity (optional) ++ """ ++ params = {} ++ ++ params['impl_name'] = impl_name ++ if recv_buf_size is not None: ++ params['recv_buf_size'] = recv_buf_size ++ if send_buf_size is not None: ++ params['send_buf_size'] = send_buf_size ++ if enable_recv_pipe is not None: ++ params['enable_recv_pipe'] = enable_recv_pipe ++ if enable_quickack is not None: ++ params['enable_quickack'] = enable_quickack ++ if enable_placement_id is not None: ++ params['enable_placement_id'] = enable_placement_id ++ if enable_zerocopy_send_server is not None: ++ params['enable_zerocopy_send_server'] = enable_zerocopy_send_server ++ if enable_zerocopy_send_client is not None: ++ params['enable_zerocopy_send_client'] = enable_zerocopy_send_client ++ if zerocopy_threshold is not None: ++ params['zerocopy_threshold'] = zerocopy_threshold ++ if tls_version is not None: ++ params['tls_version'] = tls_version ++ if enable_ktls is not None: ++ params['enable_ktls'] = enable_ktls ++ if psk_key is not None: ++ params['psk_key'] = psk_key ++ if psk_identity is not None: ++ params['psk_identity'] = psk_identity ++ ++ return client.call('sock_impl_set_options', params) ++ ++ ++def sock_set_default_impl(client, impl_name=None): ++ """Set the default socket implementation. ++ ++ Args: ++ impl_name: name of socket implementation, e.g. posix ++ """ ++ params = {} ++ ++ params['impl_name'] = impl_name ++ ++ return client.call('sock_set_default_impl', params) +diff --git a/python/spdk/rpc/ssam.py b/python/spdk/rpc/ssam.py +new file mode 100644 +index 0000000..014649b +--- /dev/null ++++ b/python/spdk/rpc/ssam.py +@@ -0,0 +1,235 @@ ++from .helpers import deprecated_alias ++from getpass import getuser ++ ++def log_command_info(client, event): ++ """log event info. ++ Args: ++ user_name: event user ++ event: function id of PCI device ++ src_addr: queue number of ssam ++ """ ++ params = { ++ 'user_name': getuser(), ++ 'event': event, ++ 'src_addr': "localhost", ++ } ++ return client.call('log_command_info', params) ++ ++def log_info(func): ++ def wrapper_log_info(arg, *args, **kw): ++ log_command_info(arg.client, func.__name__) ++ return func(arg, *args, **kw) ++ return wrapper_log_info ++ ++def create_blk_controller(client, dev_name, index, readonly=None, serial=None, vqueue=None): ++ """Create ssam BLK controller. ++ Args: ++ dev_name: device name to add to controller ++ index: function id or dbdf of PCI device ++ queues: queue number of ssam ++ readonly: set controller as read-only ++ serial: set volume id ++ vqueue: set virtio queue num ++ """ ++ params = { ++ 'dev_name': dev_name, ++ 'index': index, ++ } ++ if readonly: ++ params['readonly'] = readonly ++ if serial: ++ params['serial'] = serial ++ if vqueue is not None: ++ params['vqueue'] = vqueue ++ return client.call('create_blk_controller', params) ++ ++ ++def get_controllers(client, function_id=None, dbdf=None): ++ """Get information about configured ssam controllers. ++ ++ Args: ++ function_id: function id of PCI device ++ dbdf: dbdf of PCI device ++ ++ Returns: ++ List of ssam controllers. 
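As a usage note for the socket-layer options above, a small sketch; the zerocopy threshold, implementation name and socket path are illustrative, and the client can be any object exposing .call():

    from spdk.rpc.client import JSONRPCClient   # assumed import path
    from spdk.rpc import sock

    client = JSONRPCClient('/var/tmp/spdk.sock')        # RPC socket path assumed
    # Tune the posix implementation, then make it the default for new sockets.
    sock.sock_impl_set_options(client, impl_name='posix',
                               enable_zerocopy_send_client=True,
                               zerocopy_threshold=4096)
    sock.sock_set_default_impl(client, impl_name='posix')
    print(sock.sock_impl_get_options(client, impl_name='posix'))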
++ """ ++ params = {} ++ if function_id is not None: ++ params['function_id'] = function_id ++ if dbdf is not None: ++ params['dbdf'] = dbdf ++ return client.call('get_controllers', params) ++ ++def get_scsi_controllers(client, name=None): ++ """Get information about configured ssam controllers. ++ ++ Args: ++ name: name of scsi controller ++ ++ Returns: ++ List of ssam scsi controllers. ++ """ ++ params = {} ++ if name is not None: ++ params['name'] = name ++ return client.call('get_scsi_controllers', params) ++ ++def delete_controller(client, index): ++ """Delete ssam controller from configuration. ++ Args: ++ index: function id or dbdf of PCI device ++ """ ++ params = {'index': index} ++ return client.call('delete_controller', params) ++ ++def delete_scsi_controller(client, name): ++ """Delete ssam controller from configuration. ++ Args: ++ name: scsi controller name to be delete ++ """ ++ params = {'name': name} ++ return client.call('delete_scsi_controller', params) ++ ++def controller_get_iostat(client, function_id=None, dbdf=None): ++ """Get iostat about configured ssam controllers. ++ ++ Args: ++ function_id: function id of PCI device ++ dbdf: dbdf of PCI device ++ ++ Returns: ++ List of iostat of ssam controllers. ++ """ ++ params = {} ++ if function_id is not None: ++ params['function_id'] = function_id ++ if dbdf is not None: ++ params['dbdf'] = dbdf ++ return client.call('controller_get_iostat', params) ++ ++def controller_clear_iostat(client): ++ """Clear iostat about configured ssam controllers. ++ """ ++ return client.call('controller_clear_iostat') ++ ++def bdev_resize(client, function_id, new_size_in_mb): ++ """Resize bdev in the system. ++ Args: ++ function_id: function id of PCI device ++ new_size_in_mb: new bdev size for resize operation. The unit is MiB ++ """ ++ params = { ++ 'function_id': function_id, ++ 'new_size_in_mb': new_size_in_mb, ++ } ++ return client.call('bdev_resize', params) ++ ++def scsi_bdev_resize(client, name, tgt_id, new_size_in_mb): ++ """Resize scsi bdev in the system. ++ Args: ++ name: controller name of PCI device ++ tgt_id: tgt id of bdev ++ new_size_in_mb: new bdev size for resize operation. The unit is MiB ++ """ ++ params = { ++ 'name': name, ++ 'tgt_id': tgt_id, ++ 'new_size_in_mb': new_size_in_mb, ++ } ++ return client.call('scsi_bdev_resize', params) ++ ++def bdev_aio_resize(client, name, new_size_in_mb): ++ """Resize aio bdev in the system. ++ Args: ++ name: aio bdev name ++ new_size_in_mb: new bdev size for resize operation. The unit is MiB ++ """ ++ params = { ++ 'name': name, ++ 'new_size_in_mb': new_size_in_mb, ++ } ++ return client.call('bdev_aio_resize', params) ++ ++def os_ready(client): ++ """Write ready flag for booting OS. ++ ++ """ ++ return client.call('os_ready') ++ ++def create_scsi_controller(client, dbdf, name): ++ """Create ssam scsi controller. ++ Args: ++ dbdf: the pci dbdf of virtio scsi controller ++ name: controller name to be create ++ """ ++ params = { ++ 'dbdf': dbdf, ++ 'name': name, ++ } ++ ++ return client.call('create_scsi_controller', params) ++ ++def scsi_controller_add_target(client, name, scsi_tgt_num, bdev_name): ++ """Add LUN to ssam scsi controller target. 
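The log_info decorator defined in this new ssam.py wraps callables whose first argument exposes a .client attribute, logging the RPC name via log_command_info before delegating. A sketch of that contract; the BlkCmd class and the stub client are assumptions made purely for illustration:

    from spdk.rpc.ssam import log_info, create_blk_controller   # module added by this patch

    class _StubClient:
        # Stand-in for a JSON-RPC client: records what would be sent.
        def call(self, method, params=None):
            print('rpc:', method, params or {})
            return True

    class BlkCmd:
        def __init__(self, client):
            self.client = client        # log_info reads this off the first argument

        @log_info                       # emits the log_command_info call first
        def create(self, dev_name, index):
            return create_blk_controller(self.client, dev_name, index)

    BlkCmd(_StubClient()).create('Malloc0', 1)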
++ Args: ++ name: controller name where add lun ++ scsi_tgt_num: target number to use ++ bdev_name: name of bdev to add to target ++ """ ++ params = { ++ 'name': name, ++ 'scsi_tgt_num': scsi_tgt_num, ++ 'bdev_name': bdev_name, ++ } ++ return client.call('scsi_controller_add_target', params) ++ ++def scsi_controller_remove_target(client, name, scsi_tgt_num): ++ """Remove LUN from ssam scsi controller target. ++ Args: ++ name: controller name to remove lun ++ scsi_tgt_num: target number to use ++ """ ++ params = { ++ 'name': name, ++ 'scsi_tgt_num': scsi_tgt_num, ++ } ++ return client.call('scsi_controller_remove_target', params) ++ ++def scsi_device_iostat(client, name, scsi_tgt_num): ++ """Get iostat about scsi device. ++ ++ Args: ++ name: controller name ++ scsi_tgt_num: target number ++ ++ Returns: ++ List of iostat of ssam controllers. ++ """ ++ params = { ++ 'name': name, ++ 'scsi_tgt_num': scsi_tgt_num, ++ } ++ return client.call('scsi_device_iostat', params) ++ ++def device_pcie_list(client): ++ """Show storage device pcie list. ++ ++ Returns: ++ List of storage device pcie. ++ """ ++ ++ return client.call('device_pcie_list') ++ ++def config_remove(client): ++ """Remove json config file. ++ ++ """ ++ ++ return client.call('config_remove') ++ ++def os_not_ready(client): ++ """Write not ready flag for booting OS. ++ ++ """ ++ return client.call('os_not_ready') +\ No newline at end of file +diff --git a/python/spdk/rpc/subsystem.py b/python/spdk/rpc/subsystem.py +index db4d20d..e9e6894 100644 +--- a/python/spdk/rpc/subsystem.py ++++ b/python/spdk/rpc/subsystem.py +@@ -1,16 +1,16 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +- +-def framework_get_subsystems(client): +- return client.call('framework_get_subsystems') +- +- +-def framework_get_config(client, name): +- params = {'name': name} +- return client.call('framework_get_config', params) +- +- +-def framework_get_pci_devices(client): +- return client.call('framework_get_pci_devices') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++ ++def framework_get_subsystems(client): ++ return client.call('framework_get_subsystems') ++ ++ ++def framework_get_config(client, name): ++ params = {'name': name} ++ return client.call('framework_get_config', params) ++ ++ ++def framework_get_pci_devices(client): ++ return client.call('framework_get_pci_devices') +diff --git a/python/spdk/rpc/trace.py b/python/spdk/rpc/trace.py +index b37676e..3470720 100644 +--- a/python/spdk/rpc/trace.py ++++ b/python/spdk/rpc/trace.py +@@ -1,65 +1,65 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +- +-def trace_enable_tpoint_group(client, name): +- """Enable trace on a specific tpoint group. +- +- Args: +- name: trace group name we want to enable in tpoint_group_mask. (for example "bdev"). +- """ +- params = {'name': name} +- return client.call('trace_enable_tpoint_group', params) +- +- +-def trace_disable_tpoint_group(client, name): +- """Disable trace on a specific tpoint group. +- +- Args: +- name: trace group name we want to disable in tpoint_group_mask. (for example "bdev"). +- """ +- params = {'name': name} +- return client.call('trace_disable_tpoint_group', params) +- +- +-def trace_set_tpoint_mask(client, name, tpoint_mask): +- """Enable tracepoint mask on a specific tpoint group. +- +- Args: +- name: trace group name we want to enable in tpoint_group_mask. 
(for example "bdev"). +- tpoint_mask: tracepoints to be enabled inside decleared group +- (for example "0x3" to enable first two tpoints). +- """ +- params = {'name': name, 'tpoint_mask': tpoint_mask} +- return client.call('trace_set_tpoint_mask', params) +- +- +-def trace_clear_tpoint_mask(client, name, tpoint_mask): +- """Disable tracepoint mask on a specific tpoint group. +- +- Args: +- name: trace group name we want to disable in tpoint_group_mask. (for example "bdev"). +- tpoint_mask: tracepoints to be disabled inside decleared group +- (for example "0x3" to disable first two tpoints). +- """ +- params = {'name': name, 'tpoint_mask': tpoint_mask} +- return client.call('trace_clear_tpoint_mask', params) +- +- +-def trace_get_tpoint_group_mask(client): +- """Get trace point group mask +- +- Returns: +- List of trace point group mask +- """ +- return client.call('trace_get_tpoint_group_mask') +- +- +-def trace_get_info(client): +- """Get name of shared memory file and list of the available trace point groups +- +- Returns: +- Name of shared memory file and list of the available trace point groups +- """ +- return client.call('trace_get_info') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++ ++def trace_enable_tpoint_group(client, name): ++ """Enable trace on a specific tpoint group. ++ ++ Args: ++ name: trace group name we want to enable in tpoint_group_mask. (for example "bdev"). ++ """ ++ params = {'name': name} ++ return client.call('trace_enable_tpoint_group', params) ++ ++ ++def trace_disable_tpoint_group(client, name): ++ """Disable trace on a specific tpoint group. ++ ++ Args: ++ name: trace group name we want to disable in tpoint_group_mask. (for example "bdev"). ++ """ ++ params = {'name': name} ++ return client.call('trace_disable_tpoint_group', params) ++ ++ ++def trace_set_tpoint_mask(client, name, tpoint_mask): ++ """Enable tracepoint mask on a specific tpoint group. ++ ++ Args: ++ name: trace group name we want to enable in tpoint_group_mask. (for example "bdev"). ++ tpoint_mask: tracepoints to be enabled inside decleared group ++ (for example "0x3" to enable first two tpoints). ++ """ ++ params = {'name': name, 'tpoint_mask': tpoint_mask} ++ return client.call('trace_set_tpoint_mask', params) ++ ++ ++def trace_clear_tpoint_mask(client, name, tpoint_mask): ++ """Disable tracepoint mask on a specific tpoint group. ++ ++ Args: ++ name: trace group name we want to disable in tpoint_group_mask. (for example "bdev"). ++ tpoint_mask: tracepoints to be disabled inside decleared group ++ (for example "0x3" to disable first two tpoints). ++ """ ++ params = {'name': name, 'tpoint_mask': tpoint_mask} ++ return client.call('trace_clear_tpoint_mask', params) ++ ++ ++def trace_get_tpoint_group_mask(client): ++ """Get trace point group mask ++ ++ Returns: ++ List of trace point group mask ++ """ ++ return client.call('trace_get_tpoint_group_mask') ++ ++ ++def trace_get_info(client): ++ """Get name of shared memory file and list of the available trace point groups ++ ++ Returns: ++ Name of shared memory file and list of the available trace point groups ++ """ ++ return client.call('trace_get_info') +diff --git a/python/spdk/rpc/ublk.py b/python/spdk/rpc/ublk.py +index 62fdf42..d8a2a3f 100644 +--- a/python/spdk/rpc/ublk.py ++++ b/python/spdk/rpc/ublk.py +@@ -1,37 +1,37 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
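A short sketch of how the tracepoint RPCs above are typically combined; the group name and mask reuse the docstrings' own "bdev"/0x3 examples, and the client construction is an assumption:

    from spdk.rpc.client import JSONRPCClient   # assumed import path
    from spdk.rpc import trace

    client = JSONRPCClient('/var/tmp/spdk.sock')         # RPC socket path assumed
    trace.trace_enable_tpoint_group(client, 'bdev')      # enable the whole group
    trace.trace_set_tpoint_mask(client, 'bdev', 0x3)     # then keep only the first two tpoints
    print(trace.trace_get_info(client))                  # shared-memory file and available groups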
+- +-def ublk_create_target(client, cpumask=None): +- params = {} +- if cpumask: +- params['cpumask'] = cpumask +- return client.call('ublk_create_target', params) +- +- +-def ublk_destroy_target(client): +- return client.call('ublk_destroy_target') +- +- +-def ublk_start_disk(client, bdev_name, ublk_id=1, num_queues=1, queue_depth=128): +- params = { +- 'bdev_name': bdev_name, +- 'ublk_id': ublk_id +- } +- if num_queues: +- params['num_queues'] = num_queues +- if queue_depth: +- params['queue_depth'] = queue_depth +- return client.call('ublk_start_disk', params) +- +- +-def ublk_stop_disk(client, ublk_id=1): +- params = {'ublk_id': ublk_id} +- return client.call('ublk_stop_disk', params) +- +- +-def ublk_get_disks(client, ublk_id=1): +- params = {} +- if ublk_id: +- params['ublk_id'] = ublk_id +- return client.call('ublk_get_disks', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++def ublk_create_target(client, cpumask=None): ++ params = {} ++ if cpumask: ++ params['cpumask'] = cpumask ++ return client.call('ublk_create_target', params) ++ ++ ++def ublk_destroy_target(client): ++ return client.call('ublk_destroy_target') ++ ++ ++def ublk_start_disk(client, bdev_name, ublk_id=1, num_queues=1, queue_depth=128): ++ params = { ++ 'bdev_name': bdev_name, ++ 'ublk_id': ublk_id ++ } ++ if num_queues: ++ params['num_queues'] = num_queues ++ if queue_depth: ++ params['queue_depth'] = queue_depth ++ return client.call('ublk_start_disk', params) ++ ++ ++def ublk_stop_disk(client, ublk_id=1): ++ params = {'ublk_id': ublk_id} ++ return client.call('ublk_stop_disk', params) ++ ++ ++def ublk_get_disks(client, ublk_id=1): ++ params = {} ++ if ublk_id: ++ params['ublk_id'] = ublk_id ++ return client.call('ublk_get_disks', params) +diff --git a/python/spdk/rpc/vfio_user.py b/python/spdk/rpc/vfio_user.py +index 161e7af..4878a99 100644 +--- a/python/spdk/rpc/vfio_user.py ++++ b/python/spdk/rpc/vfio_user.py +@@ -1,113 +1,113 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +- +-def vfu_tgt_set_base_path(client, path): +- """Set socket base path. +- +- Args: +- path: base path +- """ +- params = { +- 'path': path +- } +- +- return client.call('vfu_tgt_set_base_path', params) +- +- +-def vfu_virtio_delete_endpoint(client, name): +- """Delete specified endpoint name. +- +- Args: +- name: endpoint name +- """ +- params = { +- 'name': name +- } +- +- return client.call('vfu_virtio_delete_endpoint', params) +- +- +-def vfu_virtio_create_blk_endpoint(client, name, bdev_name, cpumask, num_queues, qsize, packed_ring): +- """Create virtio-blk endpoint. +- +- Args: +- name: endpoint name +- bdev_name: name of block device +- cpumask: CPU core mask +- num_queues: number of vrings +- qsize: number of element of each vring +- packed_ring: enable packed ring +- """ +- params = { +- 'name': name, +- 'bdev_name': bdev_name +- } +- if cpumask: +- params['cpumask'] = cpumask +- if num_queues: +- params['num_queues'] = num_queues +- if qsize: +- params['qsize'] = qsize +- if packed_ring: +- params['packed_ring'] = packed_ring +- +- return client.call('vfu_virtio_create_blk_endpoint', params) +- +- +-def vfu_virtio_scsi_add_target(client, name, scsi_target_num, bdev_name): +- """Attach a block device to the specified SCSI target. 
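The ublk wrappers above follow a create-target, start-disk lifecycle; a hedged sketch where the bdev name, queue sizing, cpumask and socket path are placeholders and a kernel with ublk support is assumed:

    from spdk.rpc.client import JSONRPCClient   # assumed import path
    from spdk.rpc import ublk

    client = JSONRPCClient('/var/tmp/spdk.sock')     # RPC socket path assumed
    ublk.ublk_create_target(client, cpumask='0x3')   # start the ublk target threads
    ublk.ublk_start_disk(client, bdev_name='Malloc0', ublk_id=1,
                         num_queues=2, queue_depth=128)   # exposes a /dev/ublkb* device
    print(ublk.ublk_get_disks(client, ublk_id=1))
    ublk.ublk_stop_disk(client, ublk_id=1)
    ublk.ublk_destroy_target(client)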
+- +- Args: +- name: endpoint name +- scsi_target_num: SCSI target number +- bdev_name: name of block device +- """ +- params = { +- 'name': name, +- 'scsi_target_num': scsi_target_num, +- 'bdev_name': bdev_name +- } +- +- return client.call('vfu_virtio_scsi_add_target', params) +- +- +-def vfu_virtio_scsi_remove_target(client, name, scsi_target_num): +- """Remove specified SCSI target of socket endpoint. +- +- Args: +- name: endpoint name +- scsi_target_num: SCSI target number +- """ +- params = { +- 'name': name, +- 'scsi_target_num': scsi_target_num +- } +- +- return client.call('vfu_virtio_scsi_remove_target', params) +- +- +-def vfu_virtio_create_scsi_endpoint(client, name, cpumask, num_io_queues, qsize, packed_ring): +- """Create virtio-scsi endpoint. +- +- Args: +- name: endpoint name +- cpumask: CPU core mask +- num_io_queues: number of IO vrings +- qsize: number of element of each vring +- packed_ring: enable packed ring +- """ +- params = { +- 'name': name, +- } +- if cpumask: +- params['cpumask'] = cpumask +- if num_io_queues: +- params['num_io_queues'] = num_io_queues +- if qsize: +- params['qsize'] = qsize +- if packed_ring: +- params['packed_ring'] = packed_ring +- +- return client.call('vfu_virtio_create_scsi_endpoint', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++ ++def vfu_tgt_set_base_path(client, path): ++ """Set socket base path. ++ ++ Args: ++ path: base path ++ """ ++ params = { ++ 'path': path ++ } ++ ++ return client.call('vfu_tgt_set_base_path', params) ++ ++ ++def vfu_virtio_delete_endpoint(client, name): ++ """Delete specified endpoint name. ++ ++ Args: ++ name: endpoint name ++ """ ++ params = { ++ 'name': name ++ } ++ ++ return client.call('vfu_virtio_delete_endpoint', params) ++ ++ ++def vfu_virtio_create_blk_endpoint(client, name, bdev_name, cpumask, num_queues, qsize, packed_ring): ++ """Create virtio-blk endpoint. ++ ++ Args: ++ name: endpoint name ++ bdev_name: name of block device ++ cpumask: CPU core mask ++ num_queues: number of vrings ++ qsize: number of element of each vring ++ packed_ring: enable packed ring ++ """ ++ params = { ++ 'name': name, ++ 'bdev_name': bdev_name ++ } ++ if cpumask: ++ params['cpumask'] = cpumask ++ if num_queues: ++ params['num_queues'] = num_queues ++ if qsize: ++ params['qsize'] = qsize ++ if packed_ring: ++ params['packed_ring'] = packed_ring ++ ++ return client.call('vfu_virtio_create_blk_endpoint', params) ++ ++ ++def vfu_virtio_scsi_add_target(client, name, scsi_target_num, bdev_name): ++ """Attach a block device to the specified SCSI target. ++ ++ Args: ++ name: endpoint name ++ scsi_target_num: SCSI target number ++ bdev_name: name of block device ++ """ ++ params = { ++ 'name': name, ++ 'scsi_target_num': scsi_target_num, ++ 'bdev_name': bdev_name ++ } ++ ++ return client.call('vfu_virtio_scsi_add_target', params) ++ ++ ++def vfu_virtio_scsi_remove_target(client, name, scsi_target_num): ++ """Remove specified SCSI target of socket endpoint. ++ ++ Args: ++ name: endpoint name ++ scsi_target_num: SCSI target number ++ """ ++ params = { ++ 'name': name, ++ 'scsi_target_num': scsi_target_num ++ } ++ ++ return client.call('vfu_virtio_scsi_remove_target', params) ++ ++ ++def vfu_virtio_create_scsi_endpoint(client, name, cpumask, num_io_queues, qsize, packed_ring): ++ """Create virtio-scsi endpoint. 
++ ++ Args: ++ name: endpoint name ++ cpumask: CPU core mask ++ num_io_queues: number of IO vrings ++ qsize: number of element of each vring ++ packed_ring: enable packed ring ++ """ ++ params = { ++ 'name': name, ++ } ++ if cpumask: ++ params['cpumask'] = cpumask ++ if num_io_queues: ++ params['num_io_queues'] = num_io_queues ++ if qsize: ++ params['qsize'] = qsize ++ if packed_ring: ++ params['packed_ring'] = packed_ring ++ ++ return client.call('vfu_virtio_create_scsi_endpoint', params) +diff --git a/python/spdk/rpc/vhost.py b/python/spdk/rpc/vhost.py +index 1b02027..846798a 100644 +--- a/python/spdk/rpc/vhost.py ++++ b/python/spdk/rpc/vhost.py +@@ -1,188 +1,188 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2017 Intel Corporation. +-# All rights reserved. +- +-from .cmd_parser import * +- +- +-def vhost_controller_set_coalescing(client, ctrlr, delay_base_us, iops_threshold): +- """Set coalescing for vhost controller. +- Args: +- ctrlr: controller name +- delay_base_us: base delay time +- iops_threshold: IOPS threshold when coalescing is enabled +- """ +- params = { +- 'ctrlr': ctrlr, +- 'delay_base_us': delay_base_us, +- 'iops_threshold': iops_threshold, +- } +- return client.call('vhost_controller_set_coalescing', params) +- +- +-def virtio_blk_get_transports(client, name=None): +- """Get list of virtio-blk transports. +- Args: +- name: name of the virtio-blk transport (optional). +- +- Returns: +- List of virtio-blk transport objects. +- """ +- +- params = {} +- +- if name: +- params['name'] = name +- +- return client.call('virtio_blk_get_transports', params) +- +- +-def virtio_blk_create_transport(client, **params): +- """Create virtio blk transport. +- Args: +- name: transport name +- """ +- strip_globals(params) +- remove_null(params) +- +- return client.call('virtio_blk_create_transport', params) +- +- +-def vhost_create_scsi_controller(client, ctrlr, cpumask=None): +- """Create a vhost scsi controller. +- Args: +- ctrlr: controller name +- cpumask: cpu mask for this controller +- """ +- params = {'ctrlr': ctrlr} +- +- if cpumask: +- params['cpumask'] = cpumask +- +- return client.call('vhost_create_scsi_controller', params) +- +- +-def vhost_scsi_controller_add_target(client, ctrlr, scsi_target_num, bdev_name): +- """Add LUN to vhost scsi controller target. +- Args: +- ctrlr: controller name +- scsi_target_num: target number to use +- bdev_name: name of bdev to add to target +- """ +- params = { +- 'ctrlr': ctrlr, +- 'scsi_target_num': scsi_target_num, +- 'bdev_name': bdev_name, +- } +- return client.call('vhost_scsi_controller_add_target', params) +- +- +-def vhost_scsi_controller_remove_target(client, ctrlr, scsi_target_num): +- """Remove target from vhost scsi controller. +- Args: +- ctrlr: controller name to remove target from +- scsi_target_num: number of target to remove from controller +- """ +- params = { +- 'ctrlr': ctrlr, +- 'scsi_target_num': scsi_target_num +- } +- return client.call('vhost_scsi_controller_remove_target', params) +- +- +-def vhost_create_blk_controller(client, **params): +- """Create vhost BLK controller. 
+- Args: +- ctrlr: controller name +- dev_name: device name to add to controller +- cpumask: cpu mask for this controller +- transport: virtio blk transport name (default: vhost_user_blk) +- readonly: set controller as read-only +- packed_ring: support controller packed_ring +- packed_ring_recovery: enable packed ring live recovery +- """ +- strip_globals(params) +- remove_null(params) +- +- return client.call('vhost_create_blk_controller', params) +- +- +-def vhost_get_controllers(client, name=None): +- """Get information about configured vhost controllers. +- +- Args: +- name: controller name to query (optional; if omitted, query all controllers) +- +- Returns: +- List of vhost controllers. +- """ +- params = {} +- if name: +- params['name'] = name +- return client.call('vhost_get_controllers', params) +- +- +-def vhost_delete_controller(client, ctrlr): +- """Delete vhost controller from configuration. +- Args: +- ctrlr: controller name to remove +- """ +- params = {'ctrlr': ctrlr} +- return client.call('vhost_delete_controller', params) +- +- +-def bdev_virtio_attach_controller(client, name, trtype, traddr, dev_type, vq_count=None, vq_size=None): +- """Attaches virtio controller using +- provided transport type and device type. +- This will also create bdevs for any block +- devices connected to that controller. +- Args: +- name: name base for new created bdevs +- trtype: virtio target transport type: pci or user +- traddr: transport type specific target address: e.g. UNIX +- domain socket path or BDF +- dev_type: device type: blk or scsi +- vq_count: number of virtual queues to be used +- vq_size: size of each queue +- """ +- params = { +- 'name': name, +- 'trtype': trtype, +- 'traddr': traddr, +- 'dev_type': dev_type +- } +- if vq_count: +- params['vq_count'] = vq_count +- if vq_size: +- params['vq_size'] = vq_size +- return client.call('bdev_virtio_attach_controller', params) +- +- +-def bdev_virtio_detach_controller(client, name): +- """Remove a Virtio device +- This will delete all bdevs exposed by this device. +- Args: +- name: virtio device name +- """ +- params = {'name': name} +- return client.call('bdev_virtio_detach_controller', params) +- +- +-def bdev_virtio_scsi_get_devices(client): +- """Get list of virtio scsi devices.""" +- return client.call('bdev_virtio_scsi_get_devices') +- +- +-def bdev_virtio_blk_set_hotplug(client, enable, period_us=None): +- """Set options for the bdev virtio blk. This is startup command. +- +- Args: +- enable: True to enable hotplug, False to disable. +- period_us: how often the hotplug is processed for insert and remove events. Set 0 to reset to default. (optional) +- """ +- params = {'enable': enable} +- +- if period_us: +- params['period_us'] = period_us +- +- return client.call('bdev_virtio_blk_set_hotplug', params) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2017 Intel Corporation. ++# All rights reserved. ++ ++from .cmd_parser import * ++ ++ ++def vhost_controller_set_coalescing(client, ctrlr, delay_base_us, iops_threshold): ++ """Set coalescing for vhost controller. ++ Args: ++ ctrlr: controller name ++ delay_base_us: base delay time ++ iops_threshold: IOPS threshold when coalescing is enabled ++ """ ++ params = { ++ 'ctrlr': ctrlr, ++ 'delay_base_us': delay_base_us, ++ 'iops_threshold': iops_threshold, ++ } ++ return client.call('vhost_controller_set_coalescing', params) ++ ++ ++def virtio_blk_get_transports(client, name=None): ++ """Get list of virtio-blk transports. 
++ Args: ++ name: name of the virtio-blk transport (optional). ++ ++ Returns: ++ List of virtio-blk transport objects. ++ """ ++ ++ params = {} ++ ++ if name: ++ params['name'] = name ++ ++ return client.call('virtio_blk_get_transports', params) ++ ++ ++def virtio_blk_create_transport(client, **params): ++ """Create virtio blk transport. ++ Args: ++ name: transport name ++ """ ++ strip_globals(params) ++ remove_null(params) ++ ++ return client.call('virtio_blk_create_transport', params) ++ ++ ++def vhost_create_scsi_controller(client, ctrlr, cpumask=None): ++ """Create a vhost scsi controller. ++ Args: ++ ctrlr: controller name ++ cpumask: cpu mask for this controller ++ """ ++ params = {'ctrlr': ctrlr} ++ ++ if cpumask: ++ params['cpumask'] = cpumask ++ ++ return client.call('vhost_create_scsi_controller', params) ++ ++ ++def vhost_scsi_controller_add_target(client, ctrlr, scsi_target_num, bdev_name): ++ """Add LUN to vhost scsi controller target. ++ Args: ++ ctrlr: controller name ++ scsi_target_num: target number to use ++ bdev_name: name of bdev to add to target ++ """ ++ params = { ++ 'ctrlr': ctrlr, ++ 'scsi_target_num': scsi_target_num, ++ 'bdev_name': bdev_name, ++ } ++ return client.call('vhost_scsi_controller_add_target', params) ++ ++ ++def vhost_scsi_controller_remove_target(client, ctrlr, scsi_target_num): ++ """Remove target from vhost scsi controller. ++ Args: ++ ctrlr: controller name to remove target from ++ scsi_target_num: number of target to remove from controller ++ """ ++ params = { ++ 'ctrlr': ctrlr, ++ 'scsi_target_num': scsi_target_num ++ } ++ return client.call('vhost_scsi_controller_remove_target', params) ++ ++ ++def vhost_create_blk_controller(client, **params): ++ """Create vhost BLK controller. ++ Args: ++ ctrlr: controller name ++ dev_name: device name to add to controller ++ cpumask: cpu mask for this controller ++ transport: virtio blk transport name (default: vhost_user_blk) ++ readonly: set controller as read-only ++ packed_ring: support controller packed_ring ++ packed_ring_recovery: enable packed ring live recovery ++ """ ++ strip_globals(params) ++ remove_null(params) ++ ++ return client.call('vhost_create_blk_controller', params) ++ ++ ++def vhost_get_controllers(client, name=None): ++ """Get information about configured vhost controllers. ++ ++ Args: ++ name: controller name to query (optional; if omitted, query all controllers) ++ ++ Returns: ++ List of vhost controllers. ++ """ ++ params = {} ++ if name: ++ params['name'] = name ++ return client.call('vhost_get_controllers', params) ++ ++ ++def vhost_delete_controller(client, ctrlr): ++ """Delete vhost controller from configuration. ++ Args: ++ ctrlr: controller name to remove ++ """ ++ params = {'ctrlr': ctrlr} ++ return client.call('vhost_delete_controller', params) ++ ++ ++def bdev_virtio_attach_controller(client, name, trtype, traddr, dev_type, vq_count=None, vq_size=None): ++ """Attaches virtio controller using ++ provided transport type and device type. ++ This will also create bdevs for any block ++ devices connected to that controller. ++ Args: ++ name: name base for new created bdevs ++ trtype: virtio target transport type: pci or user ++ traddr: transport type specific target address: e.g. 
UNIX ++ domain socket path or BDF ++ dev_type: device type: blk or scsi ++ vq_count: number of virtual queues to be used ++ vq_size: size of each queue ++ """ ++ params = { ++ 'name': name, ++ 'trtype': trtype, ++ 'traddr': traddr, ++ 'dev_type': dev_type ++ } ++ if vq_count: ++ params['vq_count'] = vq_count ++ if vq_size: ++ params['vq_size'] = vq_size ++ return client.call('bdev_virtio_attach_controller', params) ++ ++ ++def bdev_virtio_detach_controller(client, name): ++ """Remove a Virtio device ++ This will delete all bdevs exposed by this device. ++ Args: ++ name: virtio device name ++ """ ++ params = {'name': name} ++ return client.call('bdev_virtio_detach_controller', params) ++ ++ ++def bdev_virtio_scsi_get_devices(client): ++ """Get list of virtio scsi devices.""" ++ return client.call('bdev_virtio_scsi_get_devices') ++ ++ ++def bdev_virtio_blk_set_hotplug(client, enable, period_us=None): ++ """Set options for the bdev virtio blk. This is startup command. ++ ++ Args: ++ enable: True to enable hotplug, False to disable. ++ period_us: how often the hotplug is processed for insert and remove events. Set 0 to reset to default. (optional) ++ """ ++ params = {'enable': enable} ++ ++ if period_us: ++ params['period_us'] = period_us ++ ++ return client.call('bdev_virtio_blk_set_hotplug', params) +diff --git a/python/spdk/rpc/vmd.py b/python/spdk/rpc/vmd.py +index e6f128a..2037432 100644 +--- a/python/spdk/rpc/vmd.py ++++ b/python/spdk/rpc/vmd.py +@@ -1,21 +1,21 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2019 Intel Corporation. +-# All rights reserved. +- +-from .helpers import deprecated_alias +- +- +-@deprecated_alias('enable_vmd') +-def vmd_enable(client): +- """Enable VMD enumeration.""" +- return client.call('vmd_enable') +- +- +-def vmd_remove_device(client, addr): +- """Remove a device behind VMD""" +- return client.call('vmd_remove_device', {'addr': addr}) +- +- +-def vmd_rescan(client): +- """Force a rescan of the devices behind VMD""" +- return client.call('vmd_rescan') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2019 Intel Corporation. ++# All rights reserved. ++ ++from .helpers import deprecated_alias ++ ++ ++@deprecated_alias('enable_vmd') ++def vmd_enable(client): ++ """Enable VMD enumeration.""" ++ return client.call('vmd_enable') ++ ++ ++def vmd_remove_device(client, addr): ++ """Remove a device behind VMD""" ++ return client.call('vmd_remove_device', {'addr': addr}) ++ ++ ++def vmd_rescan(client): ++ """Force a rescan of the devices behind VMD""" ++ return client.call('vmd_rescan') +diff --git a/python/spdk/sma/__init__.py b/python/spdk/sma/__init__.py +index dbe563e..ed9395e 100644 +--- a/python/spdk/sma/__init__.py ++++ b/python/spdk/sma/__init__.py +@@ -1,21 +1,21 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
+- +-import os +-import sys +- +-# Fix up the import paths for the autogenerated files +-sys.path.append(os.path.dirname(__file__) + '/proto') +- +-from .sma import StorageManagementAgent # noqa +-from .device import DeviceException # noqa +-from .device import DeviceManager # noqa +-from .device import NvmfTcpDeviceManager # noqa +-from .device import VhostBlkDeviceManager # noqa +-from .device import NvmfVfioDeviceManager # noqa +-from .volume import CryptoEngine # noqa +-from .volume import CryptoException # noqa +-from .volume import set_crypto_engine # noqa +-from .volume import get_crypto_engine # noqa +-from .volume import register_crypto_engine # noqa ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import os ++import sys ++ ++# Fix up the import paths for the autogenerated files ++sys.path.append(os.path.dirname(__file__) + '/proto') ++ ++from .sma import StorageManagementAgent # noqa ++from .device import DeviceException # noqa ++from .device import DeviceManager # noqa ++from .device import NvmfTcpDeviceManager # noqa ++from .device import VhostBlkDeviceManager # noqa ++from .device import NvmfVfioDeviceManager # noqa ++from .volume import CryptoEngine # noqa ++from .volume import CryptoException # noqa ++from .volume import set_crypto_engine # noqa ++from .volume import get_crypto_engine # noqa ++from .volume import register_crypto_engine # noqa +diff --git a/python/spdk/sma/common.py b/python/spdk/sma/common.py +index 5fbc01a..61178c1 100644 +--- a/python/spdk/sma/common.py ++++ b/python/spdk/sma/common.py +@@ -1,25 +1,25 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-import uuid +- +- +-def format_volume_id(volume_id): +- """Verifies volume_id and returns it as a str +- +- Args: +- volume_id: either a str (in which case it's only validated) or bytes object +- """ +- try: +- if type(volume_id) is bytes: +- return str(uuid.UUID(bytes=volume_id)) +- elif type(volume_id) is str: +- return str(uuid.UUID(hex=volume_id)) +- except ValueError: +- pass +- return None +- +- +-def volume_id_to_nguid(uuid): +- return uuid.replace('-', '') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import uuid ++ ++ ++def format_volume_id(volume_id): ++ """Verifies volume_id and returns it as a str ++ ++ Args: ++ volume_id: either a str (in which case it's only validated) or bytes object ++ """ ++ try: ++ if type(volume_id) is bytes: ++ return str(uuid.UUID(bytes=volume_id)) ++ elif type(volume_id) is str: ++ return str(uuid.UUID(hex=volume_id)) ++ except ValueError: ++ pass ++ return None ++ ++ ++def volume_id_to_nguid(uuid): ++ return uuid.replace('-', '') +diff --git a/python/spdk/sma/device/__init__.py b/python/spdk/sma/device/__init__.py +index 096a82f..9919bda 100644 +--- a/python/spdk/sma/device/__init__.py ++++ b/python/spdk/sma/device/__init__.py +@@ -1,9 +1,9 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-from .device import DeviceException +-from .device import DeviceManager +-from .nvmf_tcp import NvmfTcpDeviceManager +-from .vhost_blk import VhostBlkDeviceManager +-from .nvmf_vfiouser import NvmfVfioDeviceManager ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. 
++ ++from .device import DeviceException ++from .device import DeviceManager ++from .nvmf_tcp import NvmfTcpDeviceManager ++from .vhost_blk import VhostBlkDeviceManager ++from .nvmf_vfiouser import NvmfVfioDeviceManager +diff --git a/python/spdk/sma/device/device.py b/python/spdk/sma/device/device.py +index 20d757f..c7668ac 100644 +--- a/python/spdk/sma/device/device.py ++++ b/python/spdk/sma/device/device.py +@@ -1,44 +1,44 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-from ..proto import sma_pb2 +- +- +-class DeviceException(Exception): +- def __init__(self, code, message): +- self.code = code +- self.message = message +- +- +-class DeviceManager: +- def __init__(self, name, protocol, client, allow_delete_volumes=False): +- self._client = client +- self.protocol = protocol +- self.name = name +- # Configures whether the device allows deleting a device with attached volumes +- self.allow_delete_volumes = allow_delete_volumes +- +- def init(self, config): +- pass +- +- def create_device(self, request): +- raise NotImplementedError() +- +- def delete_device(self, request): +- raise NotImplementedError() +- +- def attach_volume(self, request): +- raise NotImplementedError() +- +- def detach_volume(self, request): +- raise NotImplementedError() +- +- def owns_device(self, id): +- raise NotImplementedError() +- +- def set_qos(self, request): +- raise NotImplementedError() +- +- def get_qos_capabilities(self, request): +- raise NotImplementedError() ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++from ..proto import sma_pb2 ++ ++ ++class DeviceException(Exception): ++ def __init__(self, code, message): ++ self.code = code ++ self.message = message ++ ++ ++class DeviceManager: ++ def __init__(self, name, protocol, client, allow_delete_volumes=False): ++ self._client = client ++ self.protocol = protocol ++ self.name = name ++ # Configures whether the device allows deleting a device with attached volumes ++ self.allow_delete_volumes = allow_delete_volumes ++ ++ def init(self, config): ++ pass ++ ++ def create_device(self, request): ++ raise NotImplementedError() ++ ++ def delete_device(self, request): ++ raise NotImplementedError() ++ ++ def attach_volume(self, request): ++ raise NotImplementedError() ++ ++ def detach_volume(self, request): ++ raise NotImplementedError() ++ ++ def owns_device(self, id): ++ raise NotImplementedError() ++ ++ def set_qos(self, request): ++ raise NotImplementedError() ++ ++ def get_qos_capabilities(self, request): ++ raise NotImplementedError() +diff --git a/python/spdk/sma/device/nvmf_tcp.py b/python/spdk/sma/device/nvmf_tcp.py +index c69d9d3..fdfedcf 100644 +--- a/python/spdk/sma/device/nvmf_tcp.py ++++ b/python/spdk/sma/device/nvmf_tcp.py +@@ -1,250 +1,250 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
+- +-import grpc +-import logging +-import uuid +-from spdk.rpc.client import JSONRPCException +-from spdk.sma import qos +-from .device import DeviceManager, DeviceException +-from ..common import format_volume_id, volume_id_to_nguid +-from ..volume import get_crypto_engine, CryptoException +-from ..proto import sma_pb2 +-from ..proto import nvmf_tcp_pb2 +- +- +-class NvmfTcpDeviceManager(DeviceManager): +- def __init__(self, client): +- super().__init__('nvmf_tcp', 'nvmf_tcp', client) +- +- def init(self, config): +- self._has_transport = self._create_transport() +- +- def _create_transport(self): +- try: +- with self._client() as client: +- transports = client.call('nvmf_get_transports') +- for transport in transports: +- if transport['trtype'].lower() == 'tcp': +- return True +- # TODO: take the transport params from config +- return client.call('nvmf_create_transport', +- {'trtype': 'tcp'}) +- except JSONRPCException: +- logging.error('Failed to query for NVMe/TCP transport') +- return False +- +- def _check_transport(f): +- def wrapper(self, *args): +- if not self._has_transport: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'NVMe/TCP transport is unavailable') +- return f(self, *args) +- return wrapper +- +- def _get_params(self, request, params): +- result = {} +- for grpc_param, *rpc_param in params: +- rpc_param = rpc_param[0] if rpc_param else grpc_param +- result[rpc_param] = getattr(request, grpc_param) +- return result +- +- def _check_addr(self, addr, addrlist): +- return next(filter(lambda a: ( +- a['trtype'].lower() == 'tcp' and +- a['adrfam'].lower() == addr['adrfam'].lower() and +- a['traddr'].lower() == addr['traddr'].lower() and +- a['trsvcid'].lower() == addr['trsvcid'].lower()), addrlist), None) is not None +- +- def _get_nqn_from_handle(self, handle): +- return handle[len('nvmf-tcp:'):] +- +- @_check_transport +- def create_device(self, request): +- params = request.nvmf_tcp +- with self._client() as client: +- try: +- subsystems = client.call('nvmf_get_subsystems') +- for subsystem in subsystems: +- if subsystem['nqn'] == params.subnqn: +- break +- else: +- subsystem = None +- result = client.call('nvmf_create_subsystem', +- {**self._get_params(params, [ +- ('subnqn', 'nqn'), +- ('allow_any_host',)])}) +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create NVMe/TCP device') +- try: +- for host in params.hosts: +- client.call('nvmf_subsystem_add_host', +- {'nqn': params.subnqn, +- 'host': host}) +- if subsystem is not None: +- for host in [h['nqn'] for h in subsystem['hosts']]: +- if host not in params.hosts: +- client.call('nvmf_subsystem_remove_host', +- {'nqn': params.subnqn, +- 'host': host}) +- +- addr = self._get_params(params, [ +- ('adrfam',), +- ('traddr',), +- ('trsvcid',)]) +- if subsystem is None or not self._check_addr(addr, +- subsystem['listen_addresses']): +- client.call('nvmf_subsystem_add_listener', +- {'nqn': params.subnqn, +- 'listen_address': {'trtype': 'tcp', **addr}}) +- volume_id = format_volume_id(request.volume.volume_id) +- if volume_id is not None: +- bdev_name = get_crypto_engine().get_crypto_bdev(volume_id) or volume_id +- result = client.call('nvmf_subsystem_add_ns', +- {'nqn': params.subnqn, +- 'namespace': { +- 'bdev_name': bdev_name, +- 'uuid': volume_id, +- 'nguid': volume_id_to_nguid(volume_id)}}) +- except (JSONRPCException, CryptoException): +- try: +- client.call('nvmf_delete_subsystem', {'nqn': params.subnqn}) +- except JSONRPCException: +- logging.warning(f'Failed to delete 
subsystem: {params.subnqn}') +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create NVMe/TCP device') +- +- return sma_pb2.CreateDeviceResponse(handle=f'nvmf-tcp:{params.subnqn}') +- +- @_check_transport +- def delete_device(self, request): +- with self._client() as client: +- nqn = self._get_nqn_from_handle(request.handle) +- subsystems = client.call('nvmf_get_subsystems') +- for subsystem in subsystems: +- if subsystem['nqn'] == nqn: +- result = client.call('nvmf_delete_subsystem', +- {'nqn': nqn}) +- if not result: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to delete device') +- break +- else: +- logging.info(f'Tried to delete a non-existing device: {nqn}') +- +- def _find_bdev(self, client, guid): +- try: +- bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid +- return client.call('bdev_get_bdevs', {'name': bdev_name})[0] +- except (JSONRPCException, CryptoException): +- return None +- +- @_check_transport +- def attach_volume(self, request): +- nqn = self._get_nqn_from_handle(request.device_handle) +- volume_id = format_volume_id(request.volume.volume_id) +- if volume_id is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- try: +- with self._client() as client: +- bdev = self._find_bdev(client, volume_id) +- if bdev is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid volume GUID') +- subsystems = client.call('nvmf_get_subsystems') +- for subsys in subsystems: +- if subsys['nqn'] == nqn: +- break +- else: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid device handle') +- if bdev['name'] not in [ns['name'] for ns in subsys['namespaces']]: +- result = client.call('nvmf_subsystem_add_ns', +- {'nqn': nqn, +- 'namespace': { +- 'bdev_name': bdev['name'], +- 'uuid': volume_id, +- 'nguid': volume_id_to_nguid(volume_id)}}) +- if not result: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to attach volume') +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to attach volume') +- +- @_check_transport +- def detach_volume(self, request): +- nqn = self._get_nqn_from_handle(request.device_handle) +- volume = format_volume_id(request.volume_id) +- if volume is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- try: +- with self._client() as client: +- bdev = self._find_bdev(client, volume) +- if bdev is None: +- logging.info(f'Tried to detach non-existing volume: {volume}') +- return +- +- subsystems = client.call('nvmf_get_subsystems') +- for subsys in subsystems: +- if subsys['nqn'] == nqn: +- break +- else: +- logging.info(f'Tried to detach volume: {volume} from non-existing ' + +- f'device: {nqn}') +- return +- +- for ns in subsys['namespaces']: +- if ns['name'] != bdev['name']: +- continue +- result = client.call('nvmf_subsystem_remove_ns', +- {'nqn': nqn, +- 'nsid': ns['nsid']}) +- if not result: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to detach volume') +- break +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to detach volume') +- +- def owns_device(self, handle): +- return handle.startswith('nvmf-tcp') +- +- def set_qos(self, request): +- nqn = self._get_nqn_from_handle(request.device_handle) +- volume = format_volume_id(request.volume_id) +- if volume is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- try: +- with self._client() as client: +- # Make sure that a volume 
exists and is attached to the device +- bdev = self._find_bdev(client, volume) +- if bdev is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'No volume associated with volume_id could be found') +- try: +- subsys = client.call('nvmf_get_subsystems', {'nqn': nqn})[0] +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'No device associated with device_handle could be found') +- for ns in subsys['namespaces']: +- if ns['name'] == bdev['name']: +- break +- else: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Specified volume is not attached to the device') +- qos.set_volume_bdev_qos(client, request) +- except qos.QosException as ex: +- raise DeviceException(ex.code, ex.message) +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to set QoS') +- +- def get_qos_capabilities(self, request): +- return qos.get_bdev_qos_capabilities() ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import grpc ++import logging ++import uuid ++from spdk.rpc.client import JSONRPCException ++from spdk.sma import qos ++from .device import DeviceManager, DeviceException ++from ..common import format_volume_id, volume_id_to_nguid ++from ..volume import get_crypto_engine, CryptoException ++from ..proto import sma_pb2 ++from ..proto import nvmf_tcp_pb2 ++ ++ ++class NvmfTcpDeviceManager(DeviceManager): ++ def __init__(self, client): ++ super().__init__('nvmf_tcp', 'nvmf_tcp', client) ++ ++ def init(self, config): ++ self._has_transport = self._create_transport() ++ ++ def _create_transport(self): ++ try: ++ with self._client() as client: ++ transports = client.call('nvmf_get_transports') ++ for transport in transports: ++ if transport['trtype'].lower() == 'tcp': ++ return True ++ # TODO: take the transport params from config ++ return client.call('nvmf_create_transport', ++ {'trtype': 'tcp'}) ++ except JSONRPCException: ++ logging.error('Failed to query for NVMe/TCP transport') ++ return False ++ ++ def _check_transport(f): ++ def wrapper(self, *args): ++ if not self._has_transport: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'NVMe/TCP transport is unavailable') ++ return f(self, *args) ++ return wrapper ++ ++ def _get_params(self, request, params): ++ result = {} ++ for grpc_param, *rpc_param in params: ++ rpc_param = rpc_param[0] if rpc_param else grpc_param ++ result[rpc_param] = getattr(request, grpc_param) ++ return result ++ ++ def _check_addr(self, addr, addrlist): ++ return next(filter(lambda a: ( ++ a['trtype'].lower() == 'tcp' and ++ a['adrfam'].lower() == addr['adrfam'].lower() and ++ a['traddr'].lower() == addr['traddr'].lower() and ++ a['trsvcid'].lower() == addr['trsvcid'].lower()), addrlist), None) is not None ++ ++ def _get_nqn_from_handle(self, handle): ++ return handle[len('nvmf-tcp:'):] ++ ++ @_check_transport ++ def create_device(self, request): ++ params = request.nvmf_tcp ++ with self._client() as client: ++ try: ++ subsystems = client.call('nvmf_get_subsystems') ++ for subsystem in subsystems: ++ if subsystem['nqn'] == params.subnqn: ++ break ++ else: ++ subsystem = None ++ result = client.call('nvmf_create_subsystem', ++ {**self._get_params(params, [ ++ ('subnqn', 'nqn'), ++ ('allow_any_host',)])}) ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create NVMe/TCP device') ++ try: ++ for host in params.hosts: ++ client.call('nvmf_subsystem_add_host', ++ {'nqn': params.subnqn, ++ 'host': 
host}) ++ if subsystem is not None: ++ for host in [h['nqn'] for h in subsystem['hosts']]: ++ if host not in params.hosts: ++ client.call('nvmf_subsystem_remove_host', ++ {'nqn': params.subnqn, ++ 'host': host}) ++ ++ addr = self._get_params(params, [ ++ ('adrfam',), ++ ('traddr',), ++ ('trsvcid',)]) ++ if subsystem is None or not self._check_addr(addr, ++ subsystem['listen_addresses']): ++ client.call('nvmf_subsystem_add_listener', ++ {'nqn': params.subnqn, ++ 'listen_address': {'trtype': 'tcp', **addr}}) ++ volume_id = format_volume_id(request.volume.volume_id) ++ if volume_id is not None: ++ bdev_name = get_crypto_engine().get_crypto_bdev(volume_id) or volume_id ++ result = client.call('nvmf_subsystem_add_ns', ++ {'nqn': params.subnqn, ++ 'namespace': { ++ 'bdev_name': bdev_name, ++ 'uuid': volume_id, ++ 'nguid': volume_id_to_nguid(volume_id)}}) ++ except (JSONRPCException, CryptoException): ++ try: ++ client.call('nvmf_delete_subsystem', {'nqn': params.subnqn}) ++ except JSONRPCException: ++ logging.warning(f'Failed to delete subsystem: {params.subnqn}') ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create NVMe/TCP device') ++ ++ return sma_pb2.CreateDeviceResponse(handle=f'nvmf-tcp:{params.subnqn}') ++ ++ @_check_transport ++ def delete_device(self, request): ++ with self._client() as client: ++ nqn = self._get_nqn_from_handle(request.handle) ++ subsystems = client.call('nvmf_get_subsystems') ++ for subsystem in subsystems: ++ if subsystem['nqn'] == nqn: ++ result = client.call('nvmf_delete_subsystem', ++ {'nqn': nqn}) ++ if not result: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete device') ++ break ++ else: ++ logging.info(f'Tried to delete a non-existing device: {nqn}') ++ ++ def _find_bdev(self, client, guid): ++ try: ++ bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid ++ return client.call('bdev_get_bdevs', {'name': bdev_name})[0] ++ except (JSONRPCException, CryptoException): ++ return None ++ ++ @_check_transport ++ def attach_volume(self, request): ++ nqn = self._get_nqn_from_handle(request.device_handle) ++ volume_id = format_volume_id(request.volume.volume_id) ++ if volume_id is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ try: ++ with self._client() as client: ++ bdev = self._find_bdev(client, volume_id) ++ if bdev is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid volume GUID') ++ subsystems = client.call('nvmf_get_subsystems') ++ for subsys in subsystems: ++ if subsys['nqn'] == nqn: ++ break ++ else: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid device handle') ++ if bdev['name'] not in [ns['name'] for ns in subsys['namespaces']]: ++ result = client.call('nvmf_subsystem_add_ns', ++ {'nqn': nqn, ++ 'namespace': { ++ 'bdev_name': bdev['name'], ++ 'uuid': volume_id, ++ 'nguid': volume_id_to_nguid(volume_id)}}) ++ if not result: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to attach volume') ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to attach volume') ++ ++ @_check_transport ++ def detach_volume(self, request): ++ nqn = self._get_nqn_from_handle(request.device_handle) ++ volume = format_volume_id(request.volume_id) ++ if volume is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ try: ++ with self._client() as client: ++ bdev = self._find_bdev(client, volume) ++ if bdev is None: ++ logging.info(f'Tried to detach non-existing 
volume: {volume}') ++ return ++ ++ subsystems = client.call('nvmf_get_subsystems') ++ for subsys in subsystems: ++ if subsys['nqn'] == nqn: ++ break ++ else: ++ logging.info(f'Tried to detach volume: {volume} from non-existing ' + ++ f'device: {nqn}') ++ return ++ ++ for ns in subsys['namespaces']: ++ if ns['name'] != bdev['name']: ++ continue ++ result = client.call('nvmf_subsystem_remove_ns', ++ {'nqn': nqn, ++ 'nsid': ns['nsid']}) ++ if not result: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to detach volume') ++ break ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to detach volume') ++ ++ def owns_device(self, handle): ++ return handle.startswith('nvmf-tcp') ++ ++ def set_qos(self, request): ++ nqn = self._get_nqn_from_handle(request.device_handle) ++ volume = format_volume_id(request.volume_id) ++ if volume is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ try: ++ with self._client() as client: ++ # Make sure that a volume exists and is attached to the device ++ bdev = self._find_bdev(client, volume) ++ if bdev is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'No volume associated with volume_id could be found') ++ try: ++ subsys = client.call('nvmf_get_subsystems', {'nqn': nqn})[0] ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'No device associated with device_handle could be found') ++ for ns in subsys['namespaces']: ++ if ns['name'] == bdev['name']: ++ break ++ else: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Specified volume is not attached to the device') ++ qos.set_volume_bdev_qos(client, request) ++ except qos.QosException as ex: ++ raise DeviceException(ex.code, ex.message) ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to set QoS') ++ ++ def get_qos_capabilities(self, request): ++ return qos.get_bdev_qos_capabilities() +diff --git a/python/spdk/sma/device/nvmf_vfiouser.py b/python/spdk/sma/device/nvmf_vfiouser.py +index 7758146..464496b 100644 +--- a/python/spdk/sma/device/nvmf_vfiouser.py ++++ b/python/spdk/sma/device/nvmf_vfiouser.py +@@ -1,321 +1,321 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
+- +-import logging +-import os +-import shutil +-from contextlib import contextmanager +-from socket import AddressFamily +- +-import grpc +-from google.protobuf import wrappers_pb2 as wrap +-from spdk.rpc.client import JSONRPCException +-from spdk.sma import qos +- +-from ..common import format_volume_id, volume_id_to_nguid +-from ..proto import sma_pb2 +-from ..qmp import QMPClient, QMPError +-from ..volume import CryptoException, get_crypto_engine +-from .device import DeviceException, DeviceManager +- +-log = logging.getLogger(__name__) +- +- +-class NvmfVfioDeviceManager(DeviceManager): +- def __init__(self, client): +- super().__init__('vfiouser', 'nvme', client) +- +- def init(self, config): +- self._buses = config.get('buses', []) +- try: +- if len(self._buses) != len(list({v['name']: v for v in self._buses}.values())): +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Duplicate PCI bridge names') +- except KeyError: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'PCI bridge name is missing') +- for bus in self._buses: +- bus['count'] = bus.get('count', 32) +- if bus['count'] < 0: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Incorrect PCI bridge count') +- self._qmp_addr = (config.get('qmp_addr', '127.0.0.1'), config.get('qmp_port')) +- self._sock_path = config.get('sock_path', '/var/tmp/') +- self._prefix = f'{self.protocol}' +- if not self._create_transport(): +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'NVMe/vfiouser transport is unavailable') +- +- def _create_transport(self): +- try: +- with self._client() as client: +- transports = client.call('nvmf_get_transports') +- for transport in transports: +- if transport['trtype'].lower() == 'vfiouser': +- return True +- return client.call('nvmf_create_transport', {'trtype': 'vfiouser'}) +- except JSONRPCException: +- logging.error(f'Transport query NVMe/vfiouser failed') +- return False +- +- @contextmanager +- def _client_wrap(self): +- try: +- with self._client() as client: +- yield client +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to connect to SPDK application') +- +- def _get_subsys(self, client, nqn): +- try: +- return client.call('nvmf_get_subsystems', {'nqn': nqn})[0] +- except JSONRPCException: +- return False +- +- def _create_subsystem(self, client, subnqn): +- try: +- if self._get_subsys(client, subnqn): +- return True +- return client.call('nvmf_create_subsystem', {'nqn': subnqn, 'allow_any_host': True}) +- except JSONRPCException: +- logging.error('Failed to create subsystem') +- return False +- +- def _delete_subsystem(self, client, subnqn): +- try: +- if not self._get_subsys(client, subnqn): +- return True +- return client.call('nvmf_delete_subsystem', {'nqn': subnqn}) +- except JSONRPCException: +- logging.error('Failed to delete subsystem') +- return False +- +- def _subsystem_add_listener(self, client, subnqn, addr): +- try: +- return client.call('nvmf_subsystem_add_listener', +- {'nqn': subnqn, +- 'listen_address': { +- 'trtype': 'vfiouser', +- 'traddr': addr}}) +- except JSONRPCException: +- logging.error('Failed to add listener') +- return False +- +- def _create_socket_dir(self, dev_id): +- try: +- path = os.path.join(self._sock_path, dev_id) +- if os.path.exists(path): +- shutil.rmtree(path) +- os.makedirs(path) +- if os.path.isdir(path): +- return path +- except OSError: +- raise DeviceException(grpc.StatusCode.INTERNAL, f'Socket path error {path}') +- +- def _find_pcidev(self, qclient, name): +- def 
rsearch(devices, name): +- found_dev = None +- for dev in devices: +- if dev['qdev_id'] == name: +- found_dev = dev +- elif 'pci_bridge' in dev: +- found_dev = rsearch(dev['pci_bridge']['devices'], name) +- +- if found_dev: +- break +- return found_dev +- +- try: +- buses = qclient.query_pci()['return'] +- for bus in buses: +- rc = rsearch(bus['devices'], name) +- if rc is not None: +- return rc +- except QMPError: +- return None +- +- def _qmp_add_device(self, phid, dev_id): +- # Find a bus that the physical_id maps to +- for bus in self._buses: +- if phid >= bus['count']: +- phid = phid - bus['count'] +- else: +- break +- else: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, 'Invalid physical_id') +- try: +- with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: +- if self._find_pcidev(qclient, dev_id) is None: +- qclient.device_add({'driver': 'vfio-user-pci', +- 'x-enable-migration': 'on', +- 'socket': os.path.join(self._sock_path, dev_id, 'cntrl'), +- 'bus': bus.get('name'), +- 'addr': hex(phid), +- 'id': dev_id}) +- return True +- except QMPError: +- logging.error('QMP: Failed to add device') +- return False +- +- def _create_device(self, physical_id): +- with self._client_wrap() as client: +- dev_id = f'{self.name}-{physical_id}' +- subnqn = f'nqn.2016-06.io.spdk:{dev_id}' +- rc = self._create_subsystem(client, subnqn) +- if not rc: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create the NVMe/vfiouser subsystem') +- rc = self._subsystem_add_listener(client, subnqn, +- self._create_socket_dir(dev_id)) +- if not rc: +- self._delete_subsystem(client, subnqn) +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to add the NVMe/vfiouser listener') +- rc = self._qmp_add_device(physical_id, dev_id) +- if not rc: +- self._delete_subsystem(client, subnqn) +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create NVMe/vfiouser device') +- return subnqn +- +- def create_device(self, request): +- if request.nvme.virtual_id != 0: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Unsupported virtual_id value') +- subnqn = self._create_device(request.nvme.physical_id) +- return sma_pb2.CreateDeviceResponse(handle=f'{self._prefix}:{subnqn}') +- +- def _qmp_delete_device(self, dev_id): +- try: +- with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: +- if self._find_pcidev(qclient, dev_id) is not None: +- qclient.device_del({'id': dev_id}) +- return True +- except QMPError: +- logging.error('QMP: Failed to delete device') +- return False +- +- def delete_device(self, request): +- with self._client_wrap() as client: +- nqn = request.handle[len(f'{self._prefix}:'):] +- dev_id = nqn[len('nqn.2016-06.io.spdk:'):] +- if not self._delete_subsystem(client, nqn): +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to delete NVMe/vfiouser device') +- if not self._qmp_delete_device(dev_id): +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to delete NVMe/vfiouser device') +- try: +- path = os.path.join(self._sock_path, dev_id) +- if os.path.exists(path): +- shutil.rmtree(path) +- except OSError: +- raise DeviceException(grpc.StatusCode.INTERNAL, f'Socket path error {path}') +- +- def _get_bdev(self, client, guid): +- try: +- bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid +- return client.call('bdev_get_bdevs', {'name': bdev_name})[0] +- except (JSONRPCException, CryptoException): +- logging.error('Failed to find bdev') +- return None +- +- def _get_ns(self, bdev, subsystem): +- for ns 
in subsystem['namespaces']: +- if ns['name'] == bdev['name']: +- return ns +- +- def _subsystem_add_ns(self, client, bdev, subsystem, subnqn, volume_id): +- try: +- if self._get_ns(bdev, subsystem) is not None: +- return True +- return client.call('nvmf_subsystem_add_ns', +- {'nqn': subnqn, +- 'namespace': { +- 'bdev_name': bdev['name'], +- 'uuid': volume_id, +- 'nguid': volume_id_to_nguid(volume_id)}}) +- except JSONRPCException: +- logging.error('Failed to add ns') +- return False +- +- def attach_volume(self, request): +- nqn = request.device_handle[len(f'{self._prefix}:'):] +- volume_id = format_volume_id(request.volume.volume_id) +- with self._client_wrap() as client: +- bdev = self._get_bdev(client, volume_id) +- if bdev is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid volume GUID') +- subsys = self._get_subsys(client, nqn) +- if subsys is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid device handle') +- result = self._subsystem_add_ns(client, bdev, subsys, nqn, volume_id) +- if not result: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to attach volume') +- +- def _subsystem_remove_ns(self, client, bdev, subsystem, subnqn): +- try: +- ns = self._get_ns(bdev, subsystem) +- if ns is None: +- return True +- return client.call('nvmf_subsystem_remove_ns', +- {'nqn': subnqn, 'nsid': ns['nsid']}) +- except JSONRPCException: +- logging.error('Failed to remove ns') +- return False +- +- def detach_volume(self, request): +- nqn = request.device_handle[len(f'{self._prefix}:'):] +- volume_id = format_volume_id(request.volume_id) +- with self._client_wrap() as client: +- bdev = self._get_bdev(client, volume_id) +- if bdev is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid volume GUID') +- subsys = self._get_subsys(client, nqn) +- if subsys is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid device handle') +- result = self._subsystem_remove_ns(client, bdev, subsys, nqn) +- if not result: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to detach volume') +- +- def owns_device(self, id): +- return id.startswith(self._prefix) +- +- def set_qos(self, request): +- nqn = request.device_handle[len(f'{self._prefix}:'):] +- volume = format_volume_id(request.volume_id) +- if volume is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- try: +- with self._client() as client: +- # Make sure that a volume exists and is attached to the device +- bdev = self._get_bdev(client, volume) +- if bdev is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'No volume associated with volume_id could be found') +- subsys = self._get_subsys(client, nqn) +- if subsys is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'No device associated with device_handle could be found') +- ns = self._get_ns(bdev, subsys) +- if ns is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Specified volume is not attached to the device') +- qos.set_volume_bdev_qos(client, request) +- except qos.QosException as ex: +- raise DeviceException(ex.code, ex.message) +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to set QoS') +- +- def get_qos_capabilities(self, request): +- return qos.get_bdev_qos_capabilities() ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. 
++ ++import logging ++import os ++import shutil ++from contextlib import contextmanager ++from socket import AddressFamily ++ ++import grpc ++from google.protobuf import wrappers_pb2 as wrap ++from spdk.rpc.client import JSONRPCException ++from spdk.sma import qos ++ ++from ..common import format_volume_id, volume_id_to_nguid ++from ..proto import sma_pb2 ++from ..qmp import QMPClient, QMPError ++from ..volume import CryptoException, get_crypto_engine ++from .device import DeviceException, DeviceManager ++ ++log = logging.getLogger(__name__) ++ ++ ++class NvmfVfioDeviceManager(DeviceManager): ++ def __init__(self, client): ++ super().__init__('vfiouser', 'nvme', client) ++ ++ def init(self, config): ++ self._buses = config.get('buses', []) ++ try: ++ if len(self._buses) != len(list({v['name']: v for v in self._buses}.values())): ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Duplicate PCI bridge names') ++ except KeyError: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'PCI bridge name is missing') ++ for bus in self._buses: ++ bus['count'] = bus.get('count', 32) ++ if bus['count'] < 0: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Incorrect PCI bridge count') ++ self._qmp_addr = (config.get('qmp_addr', '127.0.0.1'), config.get('qmp_port')) ++ self._sock_path = config.get('sock_path', '/var/tmp/') ++ self._prefix = f'{self.protocol}' ++ if not self._create_transport(): ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'NVMe/vfiouser transport is unavailable') ++ ++ def _create_transport(self): ++ try: ++ with self._client() as client: ++ transports = client.call('nvmf_get_transports') ++ for transport in transports: ++ if transport['trtype'].lower() == 'vfiouser': ++ return True ++ return client.call('nvmf_create_transport', {'trtype': 'vfiouser'}) ++ except JSONRPCException: ++ logging.error(f'Transport query NVMe/vfiouser failed') ++ return False ++ ++ @contextmanager ++ def _client_wrap(self): ++ try: ++ with self._client() as client: ++ yield client ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to connect to SPDK application') ++ ++ def _get_subsys(self, client, nqn): ++ try: ++ return client.call('nvmf_get_subsystems', {'nqn': nqn})[0] ++ except JSONRPCException: ++ return False ++ ++ def _create_subsystem(self, client, subnqn): ++ try: ++ if self._get_subsys(client, subnqn): ++ return True ++ return client.call('nvmf_create_subsystem', {'nqn': subnqn, 'allow_any_host': True}) ++ except JSONRPCException: ++ logging.error('Failed to create subsystem') ++ return False ++ ++ def _delete_subsystem(self, client, subnqn): ++ try: ++ if not self._get_subsys(client, subnqn): ++ return True ++ return client.call('nvmf_delete_subsystem', {'nqn': subnqn}) ++ except JSONRPCException: ++ logging.error('Failed to delete subsystem') ++ return False ++ ++ def _subsystem_add_listener(self, client, subnqn, addr): ++ try: ++ return client.call('nvmf_subsystem_add_listener', ++ {'nqn': subnqn, ++ 'listen_address': { ++ 'trtype': 'vfiouser', ++ 'traddr': addr}}) ++ except JSONRPCException: ++ logging.error('Failed to add listener') ++ return False ++ ++ def _create_socket_dir(self, dev_id): ++ try: ++ path = os.path.join(self._sock_path, dev_id) ++ if os.path.exists(path): ++ shutil.rmtree(path) ++ os.makedirs(path) ++ if os.path.isdir(path): ++ return path ++ except OSError: ++ raise DeviceException(grpc.StatusCode.INTERNAL, f'Socket path error {path}') ++ ++ def _find_pcidev(self, qclient, name): ++ def 
rsearch(devices, name): ++ found_dev = None ++ for dev in devices: ++ if dev['qdev_id'] == name: ++ found_dev = dev ++ elif 'pci_bridge' in dev: ++ found_dev = rsearch(dev['pci_bridge']['devices'], name) ++ ++ if found_dev: ++ break ++ return found_dev ++ ++ try: ++ buses = qclient.query_pci()['return'] ++ for bus in buses: ++ rc = rsearch(bus['devices'], name) ++ if rc is not None: ++ return rc ++ except QMPError: ++ return None ++ ++ def _qmp_add_device(self, phid, dev_id): ++ # Find a bus that the physical_id maps to ++ for bus in self._buses: ++ if phid >= bus['count']: ++ phid = phid - bus['count'] ++ else: ++ break ++ else: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, 'Invalid physical_id') ++ try: ++ with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: ++ if self._find_pcidev(qclient, dev_id) is None: ++ qclient.device_add({'driver': 'vfio-user-pci', ++ 'x-enable-migration': 'on', ++ 'socket': os.path.join(self._sock_path, dev_id, 'cntrl'), ++ 'bus': bus.get('name'), ++ 'addr': hex(phid), ++ 'id': dev_id}) ++ return True ++ except QMPError: ++ logging.error('QMP: Failed to add device') ++ return False ++ ++ def _create_device(self, physical_id): ++ with self._client_wrap() as client: ++ dev_id = f'{self.name}-{physical_id}' ++ subnqn = f'nqn.2016-06.io.spdk:{dev_id}' ++ rc = self._create_subsystem(client, subnqn) ++ if not rc: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create the NVMe/vfiouser subsystem') ++ rc = self._subsystem_add_listener(client, subnqn, ++ self._create_socket_dir(dev_id)) ++ if not rc: ++ self._delete_subsystem(client, subnqn) ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to add the NVMe/vfiouser listener') ++ rc = self._qmp_add_device(physical_id, dev_id) ++ if not rc: ++ self._delete_subsystem(client, subnqn) ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create NVMe/vfiouser device') ++ return subnqn ++ ++ def create_device(self, request): ++ if request.nvme.virtual_id != 0: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Unsupported virtual_id value') ++ subnqn = self._create_device(request.nvme.physical_id) ++ return sma_pb2.CreateDeviceResponse(handle=f'{self._prefix}:{subnqn}') ++ ++ def _qmp_delete_device(self, dev_id): ++ try: ++ with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: ++ if self._find_pcidev(qclient, dev_id) is not None: ++ qclient.device_del({'id': dev_id}) ++ return True ++ except QMPError: ++ logging.error('QMP: Failed to delete device') ++ return False ++ ++ def delete_device(self, request): ++ with self._client_wrap() as client: ++ nqn = request.handle[len(f'{self._prefix}:'):] ++ dev_id = nqn[len('nqn.2016-06.io.spdk:'):] ++ if not self._delete_subsystem(client, nqn): ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete NVMe/vfiouser device') ++ if not self._qmp_delete_device(dev_id): ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete NVMe/vfiouser device') ++ try: ++ path = os.path.join(self._sock_path, dev_id) ++ if os.path.exists(path): ++ shutil.rmtree(path) ++ except OSError: ++ raise DeviceException(grpc.StatusCode.INTERNAL, f'Socket path error {path}') ++ ++ def _get_bdev(self, client, guid): ++ try: ++ bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid ++ return client.call('bdev_get_bdevs', {'name': bdev_name})[0] ++ except (JSONRPCException, CryptoException): ++ logging.error('Failed to find bdev') ++ return None ++ ++ def _get_ns(self, bdev, subsystem): ++ for ns 
in subsystem['namespaces']: ++ if ns['name'] == bdev['name']: ++ return ns ++ ++ def _subsystem_add_ns(self, client, bdev, subsystem, subnqn, volume_id): ++ try: ++ if self._get_ns(bdev, subsystem) is not None: ++ return True ++ return client.call('nvmf_subsystem_add_ns', ++ {'nqn': subnqn, ++ 'namespace': { ++ 'bdev_name': bdev['name'], ++ 'uuid': volume_id, ++ 'nguid': volume_id_to_nguid(volume_id)}}) ++ except JSONRPCException: ++ logging.error('Failed to add ns') ++ return False ++ ++ def attach_volume(self, request): ++ nqn = request.device_handle[len(f'{self._prefix}:'):] ++ volume_id = format_volume_id(request.volume.volume_id) ++ with self._client_wrap() as client: ++ bdev = self._get_bdev(client, volume_id) ++ if bdev is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid volume GUID') ++ subsys = self._get_subsys(client, nqn) ++ if subsys is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid device handle') ++ result = self._subsystem_add_ns(client, bdev, subsys, nqn, volume_id) ++ if not result: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to attach volume') ++ ++ def _subsystem_remove_ns(self, client, bdev, subsystem, subnqn): ++ try: ++ ns = self._get_ns(bdev, subsystem) ++ if ns is None: ++ return True ++ return client.call('nvmf_subsystem_remove_ns', ++ {'nqn': subnqn, 'nsid': ns['nsid']}) ++ except JSONRPCException: ++ logging.error('Failed to remove ns') ++ return False ++ ++ def detach_volume(self, request): ++ nqn = request.device_handle[len(f'{self._prefix}:'):] ++ volume_id = format_volume_id(request.volume_id) ++ with self._client_wrap() as client: ++ bdev = self._get_bdev(client, volume_id) ++ if bdev is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid volume GUID') ++ subsys = self._get_subsys(client, nqn) ++ if subsys is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid device handle') ++ result = self._subsystem_remove_ns(client, bdev, subsys, nqn) ++ if not result: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to detach volume') ++ ++ def owns_device(self, id): ++ return id.startswith(self._prefix) ++ ++ def set_qos(self, request): ++ nqn = request.device_handle[len(f'{self._prefix}:'):] ++ volume = format_volume_id(request.volume_id) ++ if volume is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ try: ++ with self._client() as client: ++ # Make sure that a volume exists and is attached to the device ++ bdev = self._get_bdev(client, volume) ++ if bdev is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'No volume associated with volume_id could be found') ++ subsys = self._get_subsys(client, nqn) ++ if subsys is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'No device associated with device_handle could be found') ++ ns = self._get_ns(bdev, subsys) ++ if ns is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Specified volume is not attached to the device') ++ qos.set_volume_bdev_qos(client, request) ++ except qos.QosException as ex: ++ raise DeviceException(ex.code, ex.message) ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to set QoS') ++ ++ def get_qos_capabilities(self, request): ++ return qos.get_bdev_qos_capabilities() +diff --git a/python/spdk/sma/device/vhost_blk.py b/python/spdk/sma/device/vhost_blk.py +index 7b655a0..170e4b4 100644 +--- a/python/spdk/sma/device/vhost_blk.py ++++ 
b/python/spdk/sma/device/vhost_blk.py +@@ -1,228 +1,228 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-import logging +-import os +-import uuid +-from socket import AddressFamily +- +-import grpc +-from spdk.rpc.client import JSONRPCException +-from spdk.sma import qos +- +-from ..common import format_volume_id, volume_id_to_nguid +-from ..proto import sma_pb2, virtio_blk_pb2 +-from ..qmp import QMPClient, QMPError +-from ..volume import CryptoException, get_crypto_engine +-from .device import DeviceException, DeviceManager +- +- +-class VhostBlkDeviceManager(DeviceManager): +- def __init__(self, client): +- super().__init__('vhost_blk', 'virtio_blk', client, allow_delete_volumes=True) +- +- def init(self, config): +- self._buses = config.get('buses', []) +- try: +- if len(self._buses) != len(list({v['name']: v for v in self._buses}.values())): +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Duplicate PCI bridge names') +- except KeyError: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'PCI bridge name is missing') +- for bus in self._buses: +- bus['count'] = bus.get('count', 32) +- if bus['count'] < 0: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Incorrect PCI bridge count') +- self._qmp_addr = (config.get('qmp_addr', '127.0.0.1'), config.get('qmp_port')) +- self._vhost_path = config.get('sock_path', '/var/tmp/') +- self._prefix = f'{self.protocol}' +- +- def owns_device(self, id): +- return id.startswith(self._prefix) +- +- def _find_controller(self, client, controller): +- try: +- ctrlrs = client.call('vhost_get_controllers') +- for ctrlr in ctrlrs: +- if ctrlr['ctrlr'] == controller: +- return ctrlr +- except JSONRPCException: +- logging.error('Failed to find vhost controller') +- return None +- +- def _qmp_delete_device(self, ctrlr): +- try: +- with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: +- if self._find_pcidev(qclient, ctrlr) is not None: +- qclient.device_del({'id': ctrlr}, {'event': 'DEVICE_DELETED', +- 'data': {'device': ctrlr}}) +- except QMPError: +- logging.error('QMP: Failed to delete device') +- try: +- with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: +- if (self._find_pcidev(qclient, ctrlr) is None and +- self._find_chardev(qclient, ctrlr) is not None): +- qclient.chardev_remove({'id': ctrlr}) +- return True +- except QMPError: +- logging.error('QMP: Failed to delete chardev') +- return False +- +- def _delete_controller(self, client, ctrlr): +- if self._find_controller(client, ctrlr) is None: +- return True +- try: +- return client.call('vhost_delete_controller', {'ctrlr': ctrlr}) +- except JSONRPCException: +- logging.error('Failed to delete controller') +- return False +- +- def _find_bdev(self, client, guid): +- try: +- bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid +- return client.call('bdev_get_bdevs', {'name': bdev_name})[0] +- except (JSONRPCException, CryptoException): +- return None +- +- def _bdev_cmp(self, client, bdev1, bdev2): +- try: +- return self._find_bdev(client, bdev1)['name'] == self._find_bdev(client, bdev2)['name'] +- except (KeyError, TypeError): +- return False +- +- def _create_controller(self, client, ctrlr, volume_guid): +- nctrlr = self._find_controller(client, ctrlr) +- if nctrlr is not None: +- return self._bdev_cmp(client, nctrlr['backend_specific']['block']['bdev'], volume_guid) +- try: +- bdev_name = get_crypto_engine().get_crypto_bdev(volume_guid) or volume_guid +- 
return client.call('vhost_create_blk_controller', +- {'ctrlr': ctrlr, 'dev_name': bdev_name}) +- except JSONRPCException: +- logging.error('Failed to create subsystem') +- return False +- +- def _find_pcidev(self, qclient, name): +- try: +- buses = qclient.query_pci()['return'] +- for bus in buses: +- for dev in bus['devices']: +- if 'pci_bridge' in dev: +- for pcidev in dev['pci_bridge']['devices']: +- if pcidev['qdev_id'] == name: +- return pcidev +- return None +- except QMPError: +- return None +- +- def _find_chardev(self, qclient, name): +- try: +- devs = qclient.query_chardev()['return'] +- for dev in devs: +- if dev['label'] == name: +- return dev +- return None +- except QMPError: +- return None +- +- def _qmp_add_device(self, ctrlr, phid, sock_path): +- # Find a bus that the physical_id maps to +- for bus in self._buses: +- if phid >= bus.get('count'): +- phid = phid - bus.get('count') +- else: +- break +- else: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, 'Invalid physical_id') +- try: +- with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: +- if self._find_chardev(qclient, ctrlr) is None: +- qclient.chardev_add({ +- 'id': ctrlr, +- 'backend': { +- 'type': 'socket', +- 'data': { +- 'addr': { +- 'type': 'unix', +- 'data': { +- 'path': os.path.join(sock_path, ctrlr), +- } +- }, +- 'server': False, +- } +- }}) +- if self._find_pcidev(qclient, ctrlr) is None: +- qclient.device_add({'driver': 'vhost-user-blk-pci', +- 'chardev': ctrlr, +- 'bus': bus.get('name'), +- 'addr': hex(phid), +- 'id': ctrlr}) +- return True +- except QMPError: +- self._qmp_delete_device(ctrlr) +- logging.error('QMP: Failed to add device') +- return False +- +- def create_device(self, request): +- params = request.virtio_blk +- ctrlr = f'sma-{params.physical_id}' +- volume_guid = format_volume_id(request.volume.volume_id) +- if params.virtual_id != 0: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Unsupported virtual_id value') +- with self._client() as client: +- rc = self._create_controller(client, ctrlr, volume_guid) +- if not rc: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create vhost device') +- rc = self._qmp_add_device(ctrlr, params.physical_id, self._vhost_path) +- if not rc: +- self._delete_controller(client, ctrlr) +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to create vhost device') +- return sma_pb2.CreateDeviceResponse(handle=f'{self.protocol}:{ctrlr}') +- +- def delete_device(self, request): +- with self._client() as client: +- ctrlr = request.handle[len(f'{self._prefix}:'):] +- if not self._qmp_delete_device(ctrlr): +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to delete vhost device') +- if not self._delete_controller(client, ctrlr): +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to delete vhost device') +- +- def set_qos(self, request): +- ctrlr = request.device_handle[len(f'{self._prefix}:'):] +- volume = format_volume_id(request.volume_id) +- try: +- with self._client() as client: +- nctrlr = self._find_controller(client, ctrlr) +- if nctrlr is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'No device associated with device_handle could be found') +- nbdev = nctrlr['backend_specific']['block']['bdev'] +- if len(request.volume_id) == 0: +- id = self._find_bdev(client, nbdev)['uuid'] +- request.volume_id = uuid.UUID(id).bytes +- elif volume is not None: +- if not self._bdev_cmp(client, nbdev, volume): +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 
'Specified volume is not attached to the device') +- else: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume uuid') +- qos.set_volume_bdev_qos(client, request) +- except qos.QosException as ex: +- raise DeviceException(ex.code, ex.message) +- except JSONRPCException: +- raise DeviceException(grpc.StatusCode.INTERNAL, +- 'Failed to set QoS') +- +- def get_qos_capabilities(self, request): +- bdev_caps = qos.get_bdev_qos_capabilities().max_volume_caps +- return sma_pb2.GetQosCapabilitiesResponse(max_volume_caps=bdev_caps, +- max_device_caps=bdev_caps) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import logging ++import os ++import uuid ++from socket import AddressFamily ++ ++import grpc ++from spdk.rpc.client import JSONRPCException ++from spdk.sma import qos ++ ++from ..common import format_volume_id, volume_id_to_nguid ++from ..proto import sma_pb2, virtio_blk_pb2 ++from ..qmp import QMPClient, QMPError ++from ..volume import CryptoException, get_crypto_engine ++from .device import DeviceException, DeviceManager ++ ++ ++class VhostBlkDeviceManager(DeviceManager): ++ def __init__(self, client): ++ super().__init__('vhost_blk', 'virtio_blk', client, allow_delete_volumes=True) ++ ++ def init(self, config): ++ self._buses = config.get('buses', []) ++ try: ++ if len(self._buses) != len(list({v['name']: v for v in self._buses}.values())): ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Duplicate PCI bridge names') ++ except KeyError: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'PCI bridge name is missing') ++ for bus in self._buses: ++ bus['count'] = bus.get('count', 32) ++ if bus['count'] < 0: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Incorrect PCI bridge count') ++ self._qmp_addr = (config.get('qmp_addr', '127.0.0.1'), config.get('qmp_port')) ++ self._vhost_path = config.get('sock_path', '/var/tmp/') ++ self._prefix = f'{self.protocol}' ++ ++ def owns_device(self, id): ++ return id.startswith(self._prefix) ++ ++ def _find_controller(self, client, controller): ++ try: ++ ctrlrs = client.call('vhost_get_controllers') ++ for ctrlr in ctrlrs: ++ if ctrlr['ctrlr'] == controller: ++ return ctrlr ++ except JSONRPCException: ++ logging.error('Failed to find vhost controller') ++ return None ++ ++ def _qmp_delete_device(self, ctrlr): ++ try: ++ with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: ++ if self._find_pcidev(qclient, ctrlr) is not None: ++ qclient.device_del({'id': ctrlr}, {'event': 'DEVICE_DELETED', ++ 'data': {'device': ctrlr}}) ++ except QMPError: ++ logging.error('QMP: Failed to delete device') ++ try: ++ with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: ++ if (self._find_pcidev(qclient, ctrlr) is None and ++ self._find_chardev(qclient, ctrlr) is not None): ++ qclient.chardev_remove({'id': ctrlr}) ++ return True ++ except QMPError: ++ logging.error('QMP: Failed to delete chardev') ++ return False ++ ++ def _delete_controller(self, client, ctrlr): ++ if self._find_controller(client, ctrlr) is None: ++ return True ++ try: ++ return client.call('vhost_delete_controller', {'ctrlr': ctrlr}) ++ except JSONRPCException: ++ logging.error('Failed to delete controller') ++ return False ++ ++ def _find_bdev(self, client, guid): ++ try: ++ bdev_name = get_crypto_engine().get_crypto_bdev(guid) or guid ++ return client.call('bdev_get_bdevs', {'name': bdev_name})[0] ++ except (JSONRPCException, CryptoException): ++ return 
None ++ ++ def _bdev_cmp(self, client, bdev1, bdev2): ++ try: ++ return self._find_bdev(client, bdev1)['name'] == self._find_bdev(client, bdev2)['name'] ++ except (KeyError, TypeError): ++ return False ++ ++ def _create_controller(self, client, ctrlr, volume_guid): ++ nctrlr = self._find_controller(client, ctrlr) ++ if nctrlr is not None: ++ return self._bdev_cmp(client, nctrlr['backend_specific']['block']['bdev'], volume_guid) ++ try: ++ bdev_name = get_crypto_engine().get_crypto_bdev(volume_guid) or volume_guid ++ return client.call('vhost_create_blk_controller', ++ {'ctrlr': ctrlr, 'dev_name': bdev_name}) ++ except JSONRPCException: ++ logging.error('Failed to create subsystem') ++ return False ++ ++ def _find_pcidev(self, qclient, name): ++ try: ++ buses = qclient.query_pci()['return'] ++ for bus in buses: ++ for dev in bus['devices']: ++ if 'pci_bridge' in dev: ++ for pcidev in dev['pci_bridge']['devices']: ++ if pcidev['qdev_id'] == name: ++ return pcidev ++ return None ++ except QMPError: ++ return None ++ ++ def _find_chardev(self, qclient, name): ++ try: ++ devs = qclient.query_chardev()['return'] ++ for dev in devs: ++ if dev['label'] == name: ++ return dev ++ return None ++ except QMPError: ++ return None ++ ++ def _qmp_add_device(self, ctrlr, phid, sock_path): ++ # Find a bus that the physical_id maps to ++ for bus in self._buses: ++ if phid >= bus.get('count'): ++ phid = phid - bus.get('count') ++ else: ++ break ++ else: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, 'Invalid physical_id') ++ try: ++ with QMPClient(self._qmp_addr, AddressFamily.AF_INET) as qclient: ++ if self._find_chardev(qclient, ctrlr) is None: ++ qclient.chardev_add({ ++ 'id': ctrlr, ++ 'backend': { ++ 'type': 'socket', ++ 'data': { ++ 'addr': { ++ 'type': 'unix', ++ 'data': { ++ 'path': os.path.join(sock_path, ctrlr), ++ } ++ }, ++ 'server': False, ++ } ++ }}) ++ if self._find_pcidev(qclient, ctrlr) is None: ++ qclient.device_add({'driver': 'vhost-user-blk-pci', ++ 'chardev': ctrlr, ++ 'bus': bus.get('name'), ++ 'addr': hex(phid), ++ 'id': ctrlr}) ++ return True ++ except QMPError: ++ self._qmp_delete_device(ctrlr) ++ logging.error('QMP: Failed to add device') ++ return False ++ ++ def create_device(self, request): ++ params = request.virtio_blk ++ ctrlr = f'sma-{params.physical_id}' ++ volume_guid = format_volume_id(request.volume.volume_id) ++ if params.virtual_id != 0: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Unsupported virtual_id value') ++ with self._client() as client: ++ rc = self._create_controller(client, ctrlr, volume_guid) ++ if not rc: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create vhost device') ++ rc = self._qmp_add_device(ctrlr, params.physical_id, self._vhost_path) ++ if not rc: ++ self._delete_controller(client, ctrlr) ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to create vhost device') ++ return sma_pb2.CreateDeviceResponse(handle=f'{self.protocol}:{ctrlr}') ++ ++ def delete_device(self, request): ++ with self._client() as client: ++ ctrlr = request.handle[len(f'{self._prefix}:'):] ++ if not self._qmp_delete_device(ctrlr): ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete vhost device') ++ if not self._delete_controller(client, ctrlr): ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete vhost device') ++ ++ def set_qos(self, request): ++ ctrlr = request.device_handle[len(f'{self._prefix}:'):] ++ volume = format_volume_id(request.volume_id) ++ try: ++ with self._client() 
as client: ++ nctrlr = self._find_controller(client, ctrlr) ++ if nctrlr is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'No device associated with device_handle could be found') ++ nbdev = nctrlr['backend_specific']['block']['bdev'] ++ if len(request.volume_id) == 0: ++ id = self._find_bdev(client, nbdev)['uuid'] ++ request.volume_id = uuid.UUID(id).bytes ++ elif volume is not None: ++ if not self._bdev_cmp(client, nbdev, volume): ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Specified volume is not attached to the device') ++ else: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume uuid') ++ qos.set_volume_bdev_qos(client, request) ++ except qos.QosException as ex: ++ raise DeviceException(ex.code, ex.message) ++ except JSONRPCException: ++ raise DeviceException(grpc.StatusCode.INTERNAL, ++ 'Failed to set QoS') ++ ++ def get_qos_capabilities(self, request): ++ bdev_caps = qos.get_bdev_qos_capabilities().max_volume_caps ++ return sma_pb2.GetQosCapabilitiesResponse(max_volume_caps=bdev_caps, ++ max_device_caps=bdev_caps) +diff --git a/python/spdk/sma/proto/.gitignore b/python/spdk/sma/proto/.gitignore +index 9f5a3dd..cdc9f4b 100644 +--- a/python/spdk/sma/proto/.gitignore ++++ b/python/spdk/sma/proto/.gitignore +@@ -1,2 +1,2 @@ +-!__init__.py +-*.py ++!__init__.py ++*.py +diff --git a/python/spdk/sma/qmp.py b/python/spdk/sma/qmp.py +index a5b475f..d4297a2 100644 +--- a/python/spdk/sma/qmp.py ++++ b/python/spdk/sma/qmp.py +@@ -1,290 +1,290 @@ +-#!/usr/bin/env python3 +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation +-# All rights reserved. +-# +- +-import socket +-from socket import error as SocketError +-import time +-import json +-import logging +-import sys +-from typing import (Any, Dict, Tuple) +-from argparse import ArgumentParser +- +-log = logging.getLogger(__name__) +- +- +-QMPMessage = Dict[str, Any] +-''' +-Base class for all QMPBaseClass messages +-''' +- +- +-QMPEvent = QMPMessage +-''' +-Base class alias all asynchronous event messages +-''' +- +- +-class QMPError(Exception): +- ''' +- Base Exception Class for QMPClient implementation +- ''' +- def __init__(self, message, code='internal'): +- self.code = repr(code) +- self.message = repr(message) +- self.description = f'QMP Error ({self.code}): {self.message}' +- +- def __str__(self): +- return repr(self.description) +- +- +-class QMPSocketError(QMPError): +- ''' +- Exception Class for socket exceptions in QMPClient implementation +- ''' +- def __init__(self, message, code='socket'): +- super().__init__(message, code) +- +- +-class QMPRequestError(QMPError): +- ''' +- Exception Class for handling request response errors +- ''' +- def __init__(self, reply: QMPMessage): +- self.error_class = reply.get('error', {}).get('class', 'Undefined') +- self.error_msg = reply.get('error', {}).get('desc', 'Unknown') +- super().__init__(self.error_msg, self.error_class) +- +- +-class QMPClient(): +- ''' +- QMPBaseClass implements a low level connection to QMP socket +- +- :param family is one of [socket.AF_INET, socket.AF_UNIX] +- :param address is tuple(address, port) for socket.AF_INET +- or a path string for socket.AF_UNIX +- :param timeout: timeout in seconds to use for the connection +- :raise QMPError: for most error cases +- ''' +- def __init__(self, +- address=('127.0.0.1', 10500), +- family: socket.AddressFamily = socket.AF_INET, +- timeout: float = 8.0): +- self._exec_id = 0 +- self._capabilities = None +- self._timeout = timeout +- 
self._socketf = None +- self._address = address +- try: +- self._socket = socket.socket(family, socket.SOCK_STREAM) +- self._socket.settimeout(timeout) +- except OSError as e: +- raise QMPSocketError('Create: exception while creating') from e +- +- def __enter__(self): +- self._start() +- return self +- +- def __exit__(self, exception_type, exception_value, traceback): +- self._disconnect_socket() +- +- def _start(self): +- ''' +- Exit negotiation mode and enter command mode +- +- Based on: https://wiki.qemu.org/Documentation/QMP +- Part of communication done after connect. +- As stated in Capabilities Negotiation paragraph, for new connection +- QMP sends greetings msg and enters capabilities negotiation mode. +- To enter command mode, the qmp_capabilities command must be issued. +- Can be issued only once per session or the QMP will report an error. +- ''' +- self._connect() +- self._capabilities = self._receive()[0] +- if 'QMP' not in self._capabilities: +- raise QMPError('NegotiateCap: protocol error, wrong message') +- self.exec('qmp_capabilities') +- +- def _get_next_exec_id(self): +- self._exec_id += 1 +- return str(self._exec_id) +- +- def _connect(self): +- try: +- if not self._is_connected(): +- self._socket.connect(self._address) +- self._socketf = self._socket.makefile(mode='rw', encoding='utf-8') +- except SocketError as e: +- raise QMPSocketError('Connect: could not connect') from e +- +- def _disconnect_socket(self): +- if self._socket is not None: +- self._socket.close() +- if self._socketf is not None: +- self._socketf.close() +- self._socket = None +- self._socketf = None +- +- def _is_connected(self) -> bool: +- return self._socketf is not None +- +- def _check_event(self, event, received): +- ''' +- Method for cheking if "received" is the "event" we are waiting for. +- :param event: dictionary description of event, mandatory fields are +- 'event' = QMP name of the event +- 'data' = event specific params in form of a dict. +- :param received: received QMP event to check. +- ''' +- if event['event'].lower() != received['event'].lower(): +- return False +- for it in event.get('data', {}).items(): +- if it not in received.get('data', {}).items(): +- return False +- return True +- +- def _receive(self, event=None) -> Tuple[QMPMessage, QMPEvent]: +- response = None +- timeout_begin = time.time() +- while self._timeout > (time.time() - timeout_begin): +- try: +- data = self._socketf.readline() +- if data is None: +- raise QMPSocketError('Receive: socket got disconnected') +- log.debug(f'Received: {data}') +- msg = json.loads(data) +- except SocketError as e: +- raise QMPSocketError('Receive: socket read failed') from e +- except EOFError as e: +- raise QMPSocketError('Receive: socket read got unexpected EOF') from e +- except json.JSONDecodeError as e: +- raise QMPError('Receive: QMP message decode failed, JSONDecodeError') from e +- if response is None: +- if 'error' in msg: +- return msg, None +- elif 'return' in msg: +- if event is None: +- return msg, None +- response = msg +- # Sent only once per connection. 
Valid for capabilities negotiation mode only +- elif 'QMP' in msg: +- if self._capabilities is not None: +- raise QMPError('Receive: QMP unexpected message type') +- return msg, None +- elif self._check_event(event, msg): +- return response, msg +- raise QMPSocketError('Receive: Timed out while processing QMP receive loop') +- +- def _send(self, msg: Dict): +- log.debug(f'Sending: {msg}') +- try: +- self._socket.sendall(bytes(json.dumps(msg) + '\r\n', 'utf-8')) +- except TimeoutError as e: +- raise QMPSocketError('Send: got socket timeout error') from e +- except SocketError as e: +- raise QMPSocketError('Send: got system socket error') from e +- +- def exec(self, cmd: str, args: Dict = None, event: Dict = None) -> QMPMessage: +- ''' +- Execute QMP cmd and read result. Returns resulting message, error or optionally +- an event that the QMP client should wait for to be send by the server. +- +- :param cmd: string name of the command to execute +- :param args: optional arguments dictionary to pass +- :param event: optional dictionary describing an event to wait for +- :return command exec response or optionally execute result event +- :raise QMPRequestError: on response from QMP server being of error type +- :raise QMPSocketError: on timeout or socket errors +- :raise QMPError: on id mismatch and JSONdecoder errors +- ''' +- cmd_id = self._get_next_exec_id() +- msg = {'execute': cmd, 'id': cmd_id} +- if args is not None and len(args): +- msg['arguments'] = args +- +- self._send(msg) +- response, result = self._receive(event) +- +- if response.get('id') != cmd_id: +- raise QMPError('QMP Protocol Error, invalid result id') +- elif 'error' in response: +- raise QMPRequestError(response) +- if result is not None: +- return result +- return response +- +- def device_add(self, params: Dict, event: Dict = None): +- return self.exec('device_add', params, event) +- +- def device_del(self, params: Dict, event: Dict = None): +- return self.exec('device_del', params, event) +- +- def chardev_add(self, params: Dict, event: Dict = None): +- return self.exec('chardev-add', params, event) +- +- def chardev_remove(self, params: Dict, event: Dict = None): +- return self.exec('chardev-remove', params, event) +- +- def query_pci(self): +- return self.exec('query-pci') +- +- def query_chardev(self): +- return self.exec('query-chardev') +- +- def device_list_properties(self, typename: str): +- return self.exec('device-list-properties', {'typename': typename}) +- +- +-def parse_argv(): +- parser = ArgumentParser(description='QEMU Machine Protocol (QMP) client') +- parser.add_argument('--address', '-a', default='127.0.0.1', +- help='IP address of QMP server instance to connect to') +- parser.add_argument('--port', '-p', default=10500, type=int, +- help='Port number of QMP server instance to connect to') +- return parser.parse_args() +- +- +-def main(args): +- argv = parse_argv() +- data = json.loads(sys.stdin.read()) +- request = data.get('request') +- event = data.get('event') +- with QMPClient((argv.address, argv.port)) as cli: +- result = cli.exec(request['execute'], request.get('arguments'), event) +- print(json.dumps(result, indent=2)) +- +- +-# Example usage with command line calls: +-# 1) Without event parameter: +-# { +-# "request": { +-# "execute": "device-list-properties", +-# "arguments": { +-# "typename": "vfiouser-1-1" +-# } +-# } +-# } +-# 2) With event parameter specified. 
Specifying 'event' +-# parameter will set script to block wait for occurrence +-# of such one after a valid execution of specified request: +-# { +-# "event": { +-# "event": "DEVICE_DELETED", +-# "data": { +-# "device": "vfiouser-1-1" +-# } +-# }, +-# "request": { +-# "execute": "device_del", +-# "arguments": { +-# "id": "vfiouser-1-1" +-# } +-# } +-# } +- +- +-if __name__ == '__main__': +- main(sys.argv[1:]) ++#!/usr/bin/env python3 ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation ++# All rights reserved. ++# ++ ++import socket ++from socket import error as SocketError ++import time ++import json ++import logging ++import sys ++from typing import (Any, Dict, Tuple) ++from argparse import ArgumentParser ++ ++log = logging.getLogger(__name__) ++ ++ ++QMPMessage = Dict[str, Any] ++''' ++Base class for all QMPBaseClass messages ++''' ++ ++ ++QMPEvent = QMPMessage ++''' ++Base class alias all asynchronous event messages ++''' ++ ++ ++class QMPError(Exception): ++ ''' ++ Base Exception Class for QMPClient implementation ++ ''' ++ def __init__(self, message, code='internal'): ++ self.code = repr(code) ++ self.message = repr(message) ++ self.description = f'QMP Error ({self.code}): {self.message}' ++ ++ def __str__(self): ++ return repr(self.description) ++ ++ ++class QMPSocketError(QMPError): ++ ''' ++ Exception Class for socket exceptions in QMPClient implementation ++ ''' ++ def __init__(self, message, code='socket'): ++ super().__init__(message, code) ++ ++ ++class QMPRequestError(QMPError): ++ ''' ++ Exception Class for handling request response errors ++ ''' ++ def __init__(self, reply: QMPMessage): ++ self.error_class = reply.get('error', {}).get('class', 'Undefined') ++ self.error_msg = reply.get('error', {}).get('desc', 'Unknown') ++ super().__init__(self.error_msg, self.error_class) ++ ++ ++class QMPClient(): ++ ''' ++ QMPBaseClass implements a low level connection to QMP socket ++ ++ :param family is one of [socket.AF_INET, socket.AF_UNIX] ++ :param address is tuple(address, port) for socket.AF_INET ++ or a path string for socket.AF_UNIX ++ :param timeout: timeout in seconds to use for the connection ++ :raise QMPError: for most error cases ++ ''' ++ def __init__(self, ++ address=('127.0.0.1', 10500), ++ family: socket.AddressFamily = socket.AF_INET, ++ timeout: float = 8.0): ++ self._exec_id = 0 ++ self._capabilities = None ++ self._timeout = timeout ++ self._socketf = None ++ self._address = address ++ try: ++ self._socket = socket.socket(family, socket.SOCK_STREAM) ++ self._socket.settimeout(timeout) ++ except OSError as e: ++ raise QMPSocketError('Create: exception while creating') from e ++ ++ def __enter__(self): ++ self._start() ++ return self ++ ++ def __exit__(self, exception_type, exception_value, traceback): ++ self._disconnect_socket() ++ ++ def _start(self): ++ ''' ++ Exit negotiation mode and enter command mode ++ ++ Based on: https://wiki.qemu.org/Documentation/QMP ++ Part of communication done after connect. ++ As stated in Capabilities Negotiation paragraph, for new connection ++ QMP sends greetings msg and enters capabilities negotiation mode. ++ To enter command mode, the qmp_capabilities command must be issued. ++ Can be issued only once per session or the QMP will report an error. 
++ ''' ++ self._connect() ++ self._capabilities = self._receive()[0] ++ if 'QMP' not in self._capabilities: ++ raise QMPError('NegotiateCap: protocol error, wrong message') ++ self.exec('qmp_capabilities') ++ ++ def _get_next_exec_id(self): ++ self._exec_id += 1 ++ return str(self._exec_id) ++ ++ def _connect(self): ++ try: ++ if not self._is_connected(): ++ self._socket.connect(self._address) ++ self._socketf = self._socket.makefile(mode='rw', encoding='utf-8') ++ except SocketError as e: ++ raise QMPSocketError('Connect: could not connect') from e ++ ++ def _disconnect_socket(self): ++ if self._socket is not None: ++ self._socket.close() ++ if self._socketf is not None: ++ self._socketf.close() ++ self._socket = None ++ self._socketf = None ++ ++ def _is_connected(self) -> bool: ++ return self._socketf is not None ++ ++ def _check_event(self, event, received): ++ ''' ++ Method for cheking if "received" is the "event" we are waiting for. ++ :param event: dictionary description of event, mandatory fields are ++ 'event' = QMP name of the event ++ 'data' = event specific params in form of a dict. ++ :param received: received QMP event to check. ++ ''' ++ if event['event'].lower() != received['event'].lower(): ++ return False ++ for it in event.get('data', {}).items(): ++ if it not in received.get('data', {}).items(): ++ return False ++ return True ++ ++ def _receive(self, event=None) -> Tuple[QMPMessage, QMPEvent]: ++ response = None ++ timeout_begin = time.time() ++ while self._timeout > (time.time() - timeout_begin): ++ try: ++ data = self._socketf.readline() ++ if data is None: ++ raise QMPSocketError('Receive: socket got disconnected') ++ log.debug(f'Received: {data}') ++ msg = json.loads(data) ++ except SocketError as e: ++ raise QMPSocketError('Receive: socket read failed') from e ++ except EOFError as e: ++ raise QMPSocketError('Receive: socket read got unexpected EOF') from e ++ except json.JSONDecodeError as e: ++ raise QMPError('Receive: QMP message decode failed, JSONDecodeError') from e ++ if response is None: ++ if 'error' in msg: ++ return msg, None ++ elif 'return' in msg: ++ if event is None: ++ return msg, None ++ response = msg ++ # Sent only once per connection. Valid for capabilities negotiation mode only ++ elif 'QMP' in msg: ++ if self._capabilities is not None: ++ raise QMPError('Receive: QMP unexpected message type') ++ return msg, None ++ elif self._check_event(event, msg): ++ return response, msg ++ raise QMPSocketError('Receive: Timed out while processing QMP receive loop') ++ ++ def _send(self, msg: Dict): ++ log.debug(f'Sending: {msg}') ++ try: ++ self._socket.sendall(bytes(json.dumps(msg) + '\r\n', 'utf-8')) ++ except TimeoutError as e: ++ raise QMPSocketError('Send: got socket timeout error') from e ++ except SocketError as e: ++ raise QMPSocketError('Send: got system socket error') from e ++ ++ def exec(self, cmd: str, args: Dict = None, event: Dict = None) -> QMPMessage: ++ ''' ++ Execute QMP cmd and read result. Returns resulting message, error or optionally ++ an event that the QMP client should wait for to be send by the server. 
++ ++ :param cmd: string name of the command to execute ++ :param args: optional arguments dictionary to pass ++ :param event: optional dictionary describing an event to wait for ++ :return command exec response or optionally execute result event ++ :raise QMPRequestError: on response from QMP server being of error type ++ :raise QMPSocketError: on timeout or socket errors ++ :raise QMPError: on id mismatch and JSONdecoder errors ++ ''' ++ cmd_id = self._get_next_exec_id() ++ msg = {'execute': cmd, 'id': cmd_id} ++ if args is not None and len(args): ++ msg['arguments'] = args ++ ++ self._send(msg) ++ response, result = self._receive(event) ++ ++ if response.get('id') != cmd_id: ++ raise QMPError('QMP Protocol Error, invalid result id') ++ elif 'error' in response: ++ raise QMPRequestError(response) ++ if result is not None: ++ return result ++ return response ++ ++ def device_add(self, params: Dict, event: Dict = None): ++ return self.exec('device_add', params, event) ++ ++ def device_del(self, params: Dict, event: Dict = None): ++ return self.exec('device_del', params, event) ++ ++ def chardev_add(self, params: Dict, event: Dict = None): ++ return self.exec('chardev-add', params, event) ++ ++ def chardev_remove(self, params: Dict, event: Dict = None): ++ return self.exec('chardev-remove', params, event) ++ ++ def query_pci(self): ++ return self.exec('query-pci') ++ ++ def query_chardev(self): ++ return self.exec('query-chardev') ++ ++ def device_list_properties(self, typename: str): ++ return self.exec('device-list-properties', {'typename': typename}) ++ ++ ++def parse_argv(): ++ parser = ArgumentParser(description='QEMU Machine Protocol (QMP) client') ++ parser.add_argument('--address', '-a', default='127.0.0.1', ++ help='IP address of QMP server instance to connect to') ++ parser.add_argument('--port', '-p', default=10500, type=int, ++ help='Port number of QMP server instance to connect to') ++ return parser.parse_args() ++ ++ ++def main(args): ++ argv = parse_argv() ++ data = json.loads(sys.stdin.read()) ++ request = data.get('request') ++ event = data.get('event') ++ with QMPClient((argv.address, argv.port)) as cli: ++ result = cli.exec(request['execute'], request.get('arguments'), event) ++ print(json.dumps(result, indent=2)) ++ ++ ++# Example usage with command line calls: ++# 1) Without event parameter: ++# { ++# "request": { ++# "execute": "device-list-properties", ++# "arguments": { ++# "typename": "vfiouser-1-1" ++# } ++# } ++# } ++# 2) With event parameter specified. Specifying 'event' ++# parameter will set script to block wait for occurrence ++# of such one after a valid execution of specified request: ++# { ++# "event": { ++# "event": "DEVICE_DELETED", ++# "data": { ++# "device": "vfiouser-1-1" ++# } ++# }, ++# "request": { ++# "execute": "device_del", ++# "arguments": { ++# "id": "vfiouser-1-1" ++# } ++# } ++# } ++ ++ ++if __name__ == '__main__': ++ main(sys.argv[1:]) +diff --git a/python/spdk/sma/qos.py b/python/spdk/sma/qos.py +index 7b52e1d..daad1db 100644 +--- a/python/spdk/sma/qos.py ++++ b/python/spdk/sma/qos.py +@@ -1,63 +1,63 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
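Illustrative usage sketch (not part of the patch): driving the QMPClient defined in python/spdk/sma/qmp.py above. It assumes QEMU exposes a TCP QMP socket (e.g. started with -qmp tcp:127.0.0.1:10500,server,nowait) and that the spdk Python package is importable.

from spdk.sma.qmp import QMPClient, QMPError, QMPRequestError

def list_pci_qdev_ids(address='127.0.0.1', port=10500):
    # __enter__ connects and performs the capabilities negotiation described
    # in _start(); query_pci() returns the full QMP response message.
    try:
        with QMPClient((address, port)) as client:
            ids = []
            for bus in client.query_pci()['return']:
                for dev in bus['devices']:
                    for pcidev in dev.get('pci_bridge', {}).get('devices', []):
                        ids.append(pcidev['qdev_id'])
            return ids
    except QMPRequestError as ex:
        print(f'QMP server returned an error: {ex}')
    except QMPError as ex:
        print(f'QMP connection/protocol failure: {ex}')
    return []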
+- +-import grpc +- +-from spdk.rpc.client import JSONRPCException +-from .common import format_volume_id +-from .proto import sma_pb2 +- +- +-LIMIT_UNDEFINED = (1 << 64) - 1 +- +- +-class QosException(Exception): +- def __init__(self, code, message): +- self.code = code +- self.message = message +- +- +-def set_volume_bdev_qos(client, params): +- class BdevLimit: +- def __init__(self, name, transform=lambda v: v): +- self.name = name +- self._transform = transform +- +- def get_value(self, value): +- return self._transform(value) +- +- supported_limits = { +- 'rw_iops': BdevLimit('rw_ios_per_sec', lambda v: v * 1000), +- 'rd_bandwidth': BdevLimit('r_mbytes_per_sec'), +- 'wr_bandwidth': BdevLimit('w_mbytes_per_sec'), +- 'rw_bandwidth': BdevLimit('rw_mbytes_per_sec') +- } +- # Check that none of the unsupported fields aren't set either +- if params.HasField('maximum'): +- for field, value in params.maximum.ListFields(): +- if field.name in supported_limits.keys(): +- continue +- if value != 0 and value != LIMIT_UNDEFINED: +- raise QosException(grpc.StatusCode.INVALID_ARGUMENT, +- f'Unsupported QoS limit: maximum.{field.name}') +- try: +- rpc_params = {'name': format_volume_id(params.volume_id)} +- for name, limit in supported_limits.items(): +- value = getattr(params.maximum, name) +- if value != LIMIT_UNDEFINED: +- rpc_params[limit.name] = limit.get_value(value) +- client.call('bdev_set_qos_limit', rpc_params) +- except JSONRPCException: +- raise QosException(grpc.StatusCode.INTERNAL, 'Failed to set QoS') +- +- +-def get_bdev_qos_capabilities(): +- return sma_pb2.GetQosCapabilitiesResponse( +- max_volume_caps=sma_pb2.GetQosCapabilitiesResponse.QosCapabilities( +- rw_iops=True, +- rw_bandwidth=True, +- rd_bandwidth=True, +- wr_bandwidth=True +- ), +- ) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. 
++ ++import grpc ++ ++from spdk.rpc.client import JSONRPCException ++from .common import format_volume_id ++from .proto import sma_pb2 ++ ++ ++LIMIT_UNDEFINED = (1 << 64) - 1 ++ ++ ++class QosException(Exception): ++ def __init__(self, code, message): ++ self.code = code ++ self.message = message ++ ++ ++def set_volume_bdev_qos(client, params): ++ class BdevLimit: ++ def __init__(self, name, transform=lambda v: v): ++ self.name = name ++ self._transform = transform ++ ++ def get_value(self, value): ++ return self._transform(value) ++ ++ supported_limits = { ++ 'rw_iops': BdevLimit('rw_ios_per_sec', lambda v: v * 1000), ++ 'rd_bandwidth': BdevLimit('r_mbytes_per_sec'), ++ 'wr_bandwidth': BdevLimit('w_mbytes_per_sec'), ++ 'rw_bandwidth': BdevLimit('rw_mbytes_per_sec') ++ } ++ # Check that none of the unsupported fields aren't set either ++ if params.HasField('maximum'): ++ for field, value in params.maximum.ListFields(): ++ if field.name in supported_limits.keys(): ++ continue ++ if value != 0 and value != LIMIT_UNDEFINED: ++ raise QosException(grpc.StatusCode.INVALID_ARGUMENT, ++ f'Unsupported QoS limit: maximum.{field.name}') ++ try: ++ rpc_params = {'name': format_volume_id(params.volume_id)} ++ for name, limit in supported_limits.items(): ++ value = getattr(params.maximum, name) ++ if value != LIMIT_UNDEFINED: ++ rpc_params[limit.name] = limit.get_value(value) ++ client.call('bdev_set_qos_limit', rpc_params) ++ except JSONRPCException: ++ raise QosException(grpc.StatusCode.INTERNAL, 'Failed to set QoS') ++ ++ ++def get_bdev_qos_capabilities(): ++ return sma_pb2.GetQosCapabilitiesResponse( ++ max_volume_caps=sma_pb2.GetQosCapabilitiesResponse.QosCapabilities( ++ rw_iops=True, ++ rw_bandwidth=True, ++ rd_bandwidth=True, ++ wr_bandwidth=True ++ ), ++ ) +diff --git a/python/spdk/sma/sma.py b/python/spdk/sma/sma.py +index e0460e6..2a4fd6e 100644 +--- a/python/spdk/sma/sma.py ++++ b/python/spdk/sma/sma.py +@@ -1,198 +1,198 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. 
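For orientation, a minimal sketch (not part of the patch) of the parameter mapping performed by set_volume_bdev_qos() above: each supported SMA limit is renamed to its bdev_set_qos_limit counterpart, rw_iops is additionally scaled by 1000, and fields left at LIMIT_UNDEFINED are skipped. The values below are hypothetical.

# Hypothetical request limits; only the key names and transforms mirror the
# supported_limits table in set_volume_bdev_qos().
maximum = {'rw_iops': 5, 'rd_bandwidth': 100, 'wr_bandwidth': 100, 'rw_bandwidth': 200}

rpc_params = {
    # the real code uses format_volume_id(params.volume_id) here
    'name': 'f1a2b3c4-0000-0000-0000-000000000000',
    'rw_ios_per_sec': maximum['rw_iops'] * 1000,    # rw_iops      -> rw_ios_per_sec (x1000)
    'r_mbytes_per_sec': maximum['rd_bandwidth'],    # rd_bandwidth -> r_mbytes_per_sec
    'w_mbytes_per_sec': maximum['wr_bandwidth'],    # wr_bandwidth -> w_mbytes_per_sec
    'rw_mbytes_per_sec': maximum['rw_bandwidth'],   # rw_bandwidth -> rw_mbytes_per_sec
}
# client.call('bdev_set_qos_limit', rpc_params) is then issued over JSON-RPC.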
+- +-from concurrent import futures +-from contextlib import contextmanager +-from multiprocessing import Lock +-import grpc +-import logging +-from .device import DeviceException +-from .volume import VolumeException, VolumeManager +-from .volume import crypto +-from .volume import crypto_bdev +-from .proto import sma_pb2 as pb2 +-from .proto import sma_pb2_grpc as pb2_grpc +- +- +-class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer): +- def __init__(self, config, client): +- addr, port = config['address'], config['port'] +- self._devices = {} +- self._server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) +- self._server.add_insecure_port(f'{addr}:{port}') +- self._volume_mgr = VolumeManager(client, config['discovery_timeout'], +- config['volume_cleanup_period']) +- pb2_grpc.add_StorageManagementAgentServicer_to_server(self, self._server) +- +- def _grpc_method(f): +- def wrapper(self, request, context): +- logging.debug(f'{f.__name__}\n{request}') +- return f(self, request, context) +- return wrapper +- +- def register_device(self, device_manager): +- self._devices[device_manager.protocol] = device_manager +- +- def start(self): +- self._volume_mgr.start() +- self._server.start() +- +- def stop(self): +- self._server.stop(None) +- self._volume_mgr.stop() +- +- def _find_device_by_name(self, name): +- return self._devices.get(name) +- +- def _find_device_by_handle(self, handle): +- for device in self._devices.values(): +- try: +- if device.owns_device(handle): +- return device +- except NotImplementedError: +- pass +- return None +- +- def _cleanup_volume(self, volume_id, existing): +- if volume_id is None or existing: +- return +- try: +- self._volume_mgr.disconnect_volume(volume_id) +- except VolumeException: +- logging.warning('Failed to cleanup volume {volume_id}') +- +- @_grpc_method +- def CreateDevice(self, request, context): +- response = pb2.CreateDeviceResponse() +- volume_id, existing = None, False +- try: +- if request.HasField('volume'): +- volume_id, existing = self._volume_mgr.connect_volume(request.volume) +- +- manager = self._find_device_by_name(request.WhichOneof('params')) +- if manager is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Unsupported device type') +- response = manager.create_device(request) +- # Now that we know the device handle, mark the volume as attached to +- # that device +- if volume_id is not None: +- self._volume_mgr.set_device(volume_id, response.handle) +- except (DeviceException, VolumeException) as ex: +- self._cleanup_volume(volume_id, existing) +- context.set_details(ex.message) +- context.set_code(ex.code) +- except NotImplementedError: +- self._cleanup_volume(volume_id, existing) +- context.set_details('Method is not implemented by selected device type') +- context.set_code(grpc.StatusCode.UNIMPLEMENTED) +- return response +- +- @_grpc_method +- def DeleteDevice(self, request, context): +- response = pb2.DeleteDeviceResponse() +- try: +- device = self._find_device_by_handle(request.handle) +- if device is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, +- 'Invalid device handle') +- if not device.allow_delete_volumes and self._volume_mgr.has_volumes(request.handle): +- raise DeviceException(grpc.StatusCode.FAILED_PRECONDITION, +- 'Device has attached volumes') +- device.delete_device(request) +- # Either there are no volumes attached to this device or we're allowed to delete it +- # with volumes still attached +- self._volume_mgr.disconnect_device_volumes(request.handle) +- 
except DeviceException as ex: +- context.set_details(ex.message) +- context.set_code(ex.code) +- except NotImplementedError: +- context.set_details('Method is not implemented by selected device type') +- context.set_code(grpc.StatusCode.UNIMPLEMENTED) +- return response +- +- @_grpc_method +- def AttachVolume(self, request, context): +- response = pb2.AttachVolumeResponse() +- volume_id, existing = None, False +- try: +- if not request.HasField('volume'): +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Missing required field: volume') +- volume_id, existing = self._volume_mgr.connect_volume(request.volume, +- request.device_handle) +- device = self._find_device_by_handle(request.device_handle) +- if device is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, 'Invalid device handle') +- device.attach_volume(request) +- except (DeviceException, VolumeException) as ex: +- self._cleanup_volume(volume_id, existing) +- context.set_details(ex.message) +- context.set_code(ex.code) +- except NotImplementedError: +- self._cleanup_volume(volume_id, existing) +- context.set_details('Method is not implemented by selected device type') +- context.set_code(grpc.StatusCode.UNIMPLEMENTED) +- return response +- +- @_grpc_method +- def DetachVolume(self, request, context): +- response = pb2.DetachVolumeResponse() +- try: +- device = self._find_device_by_handle(request.device_handle) +- if device is not None: +- device.detach_volume(request) +- self._volume_mgr.disconnect_volume(request.volume_id) +- except DeviceException as ex: +- context.set_details(ex.message) +- context.set_code(ex.code) +- return response +- +- @_grpc_method +- def SetQos(self, request, context): +- response = pb2.SetQosResponse() +- try: +- device = self._find_device_by_handle(request.device_handle) +- if device is None: +- raise DeviceException(grpc.StatusCode.NOT_FOUND, 'Invalid device handle') +- device.set_qos(request) +- except DeviceException as ex: +- context.set_details(ex.message) +- context.set_code(ex.code) +- except NotImplementedError: +- context.set_details('Method is not implemented by selected device type') +- context.set_code(grpc.StatusCode.UNIMPLEMENTED) +- return response +- +- @_grpc_method +- def GetQosCapabilities(self, request, context): +- device_type_map = { +- pb2.DeviceType.DEVICE_TYPE_NVME: 'nvme', +- pb2.DeviceType.DEVICE_TYPE_VIRTIO_BLK: 'virtio_blk', +- pb2.DeviceType.DEVICE_TYPE_NVMF_TCP: 'nvmf_tcp', +- } +- response = pb2.GetQosCapabilitiesResponse() +- try: +- name = device_type_map.get(request.device_type) +- if name is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid device type') +- manager = self._find_device_by_name(name) +- if manager is None: +- raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Unsupported device type') +- response = manager.get_qos_capabilities(request) +- except DeviceException as ex: +- context.set_details(ex.message) +- context.set_code(ex.code) +- except NotImplementedError: +- # If a device manager doesn't implement this method, return empty capabilities to +- # indicate that no QoS capabilities are supported +- pass +- return response +- +- +-crypto.register_crypto_engine(crypto.CryptoEngineNop()) +-crypto.register_crypto_engine(crypto_bdev.CryptoEngineBdev()) +-crypto.set_crypto_engine('nop') ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. 
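A small stand-alone sketch (not part of the patch) of the handle-routing convention the servicer relies on: device managers return handles prefixed with their protocol name and owns_device() tests that prefix, which is how _find_device_by_handle() above selects the right manager. The handle strings and the _FakeManager class are hypothetical.

class _FakeManager:
    # Minimal stand-in mirroring the protocol/owns_device() contract used by
    # StorageManagementAgent; real managers derive from DeviceManager.
    def __init__(self, protocol):
        self.protocol = protocol

    def owns_device(self, handle):
        return handle.startswith(self.protocol)


managers = {m.protocol: m for m in (_FakeManager('virtio_blk'), _FakeManager('nvmf_tcp'))}

def find_device_by_handle(handle):
    return next((m for m in managers.values() if m.owns_device(handle)), None)

assert find_device_by_handle('virtio_blk:sma-0') is managers['virtio_blk']
assert find_device_by_handle('nvmf_tcp:nqn.2016-06.io.spdk:vfiouser-0') is managers['nvmf_tcp']
assert find_device_by_handle('unknown:handle') is None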
++ ++from concurrent import futures ++from contextlib import contextmanager ++from multiprocessing import Lock ++import grpc ++import logging ++from .device import DeviceException ++from .volume import VolumeException, VolumeManager ++from .volume import crypto ++from .volume import crypto_bdev ++from .proto import sma_pb2 as pb2 ++from .proto import sma_pb2_grpc as pb2_grpc ++ ++ ++class StorageManagementAgent(pb2_grpc.StorageManagementAgentServicer): ++ def __init__(self, config, client): ++ addr, port = config['address'], config['port'] ++ self._devices = {} ++ self._server = grpc.server(futures.ThreadPoolExecutor(max_workers=1)) ++ self._server.add_insecure_port(f'{addr}:{port}') ++ self._volume_mgr = VolumeManager(client, config['discovery_timeout'], ++ config['volume_cleanup_period']) ++ pb2_grpc.add_StorageManagementAgentServicer_to_server(self, self._server) ++ ++ def _grpc_method(f): ++ def wrapper(self, request, context): ++ logging.debug(f'{f.__name__}\n{request}') ++ return f(self, request, context) ++ return wrapper ++ ++ def register_device(self, device_manager): ++ self._devices[device_manager.protocol] = device_manager ++ ++ def start(self): ++ self._volume_mgr.start() ++ self._server.start() ++ ++ def stop(self): ++ self._server.stop(None) ++ self._volume_mgr.stop() ++ ++ def _find_device_by_name(self, name): ++ return self._devices.get(name) ++ ++ def _find_device_by_handle(self, handle): ++ for device in self._devices.values(): ++ try: ++ if device.owns_device(handle): ++ return device ++ except NotImplementedError: ++ pass ++ return None ++ ++ def _cleanup_volume(self, volume_id, existing): ++ if volume_id is None or existing: ++ return ++ try: ++ self._volume_mgr.disconnect_volume(volume_id) ++ except VolumeException: ++ logging.warning('Failed to cleanup volume {volume_id}') ++ ++ @_grpc_method ++ def CreateDevice(self, request, context): ++ response = pb2.CreateDeviceResponse() ++ volume_id, existing = None, False ++ try: ++ if request.HasField('volume'): ++ volume_id, existing = self._volume_mgr.connect_volume(request.volume) ++ ++ manager = self._find_device_by_name(request.WhichOneof('params')) ++ if manager is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Unsupported device type') ++ response = manager.create_device(request) ++ # Now that we know the device handle, mark the volume as attached to ++ # that device ++ if volume_id is not None: ++ self._volume_mgr.set_device(volume_id, response.handle) ++ except (DeviceException, VolumeException) as ex: ++ self._cleanup_volume(volume_id, existing) ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ except NotImplementedError: ++ self._cleanup_volume(volume_id, existing) ++ context.set_details('Method is not implemented by selected device type') ++ context.set_code(grpc.StatusCode.UNIMPLEMENTED) ++ return response ++ ++ @_grpc_method ++ def DeleteDevice(self, request, context): ++ response = pb2.DeleteDeviceResponse() ++ try: ++ device = self._find_device_by_handle(request.handle) ++ if device is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, ++ 'Invalid device handle') ++ if not device.allow_delete_volumes and self._volume_mgr.has_volumes(request.handle): ++ raise DeviceException(grpc.StatusCode.FAILED_PRECONDITION, ++ 'Device has attached volumes') ++ device.delete_device(request) ++ # Either there are no volumes attached to this device or we're allowed to delete it ++ # with volumes still attached ++ self._volume_mgr.disconnect_device_volumes(request.handle) ++ 
except DeviceException as ex: ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ except NotImplementedError: ++ context.set_details('Method is not implemented by selected device type') ++ context.set_code(grpc.StatusCode.UNIMPLEMENTED) ++ return response ++ ++ @_grpc_method ++ def AttachVolume(self, request, context): ++ response = pb2.AttachVolumeResponse() ++ volume_id, existing = None, False ++ try: ++ if not request.HasField('volume'): ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Missing required field: volume') ++ volume_id, existing = self._volume_mgr.connect_volume(request.volume, ++ request.device_handle) ++ device = self._find_device_by_handle(request.device_handle) ++ if device is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, 'Invalid device handle') ++ device.attach_volume(request) ++ except (DeviceException, VolumeException) as ex: ++ self._cleanup_volume(volume_id, existing) ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ except NotImplementedError: ++ self._cleanup_volume(volume_id, existing) ++ context.set_details('Method is not implemented by selected device type') ++ context.set_code(grpc.StatusCode.UNIMPLEMENTED) ++ return response ++ ++ @_grpc_method ++ def DetachVolume(self, request, context): ++ response = pb2.DetachVolumeResponse() ++ try: ++ device = self._find_device_by_handle(request.device_handle) ++ if device is not None: ++ device.detach_volume(request) ++ self._volume_mgr.disconnect_volume(request.volume_id) ++ except DeviceException as ex: ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ return response ++ ++ @_grpc_method ++ def SetQos(self, request, context): ++ response = pb2.SetQosResponse() ++ try: ++ device = self._find_device_by_handle(request.device_handle) ++ if device is None: ++ raise DeviceException(grpc.StatusCode.NOT_FOUND, 'Invalid device handle') ++ device.set_qos(request) ++ except DeviceException as ex: ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ except NotImplementedError: ++ context.set_details('Method is not implemented by selected device type') ++ context.set_code(grpc.StatusCode.UNIMPLEMENTED) ++ return response ++ ++ @_grpc_method ++ def GetQosCapabilities(self, request, context): ++ device_type_map = { ++ pb2.DeviceType.DEVICE_TYPE_NVME: 'nvme', ++ pb2.DeviceType.DEVICE_TYPE_VIRTIO_BLK: 'virtio_blk', ++ pb2.DeviceType.DEVICE_TYPE_NVMF_TCP: 'nvmf_tcp', ++ } ++ response = pb2.GetQosCapabilitiesResponse() ++ try: ++ name = device_type_map.get(request.device_type) ++ if name is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid device type') ++ manager = self._find_device_by_name(name) ++ if manager is None: ++ raise DeviceException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Unsupported device type') ++ response = manager.get_qos_capabilities(request) ++ except DeviceException as ex: ++ context.set_details(ex.message) ++ context.set_code(ex.code) ++ except NotImplementedError: ++ # If a device manager doesn't implement this method, return empty capabilities to ++ # indicate that no QoS capabilities are supported ++ pass ++ return response ++ ++ ++crypto.register_crypto_engine(crypto.CryptoEngineNop()) ++crypto.register_crypto_engine(crypto_bdev.CryptoEngineBdev()) ++crypto.set_crypto_engine('nop') +diff --git a/python/spdk/sma/volume/__init__.py b/python/spdk/sma/volume/__init__.py +index 582fd2f..503456d 100644 +--- a/python/spdk/sma/volume/__init__.py ++++ b/python/spdk/sma/volume/__init__.py +@@ -1,11 +1,11 @@ 
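A hypothetical wiring sketch (not part of the patch) showing how the pieces above are typically assembled: the config keys are the ones read in StorageManagementAgent.__init__, register_device() keys managers by their protocol name, and the JSON-RPC factory hands out clients connected to the SPDK application socket. Import paths, the socket path, and the config values are assumptions for illustration.

from contextlib import contextmanager

from spdk.rpc.client import JSONRPCClient
from spdk.sma.sma import StorageManagementAgent
from spdk.sma.device.vhost_blk import VhostBlkDeviceManager

@contextmanager
def rpc_client():
    # Assumed factory: device managers use it as `with self._client() as client:`
    yield JSONRPCClient('/var/tmp/spdk.sock')

config = {'address': '127.0.0.1', 'port': 8080,
          'discovery_timeout': 10.0, 'volume_cleanup_period': 60.0}

agent = StorageManagementAgent(config, rpc_client)
manager = VhostBlkDeviceManager(rpc_client)
# The real entry point presumably also calls manager.init() with its own
# section of the config (buses, qmp_port, sock_path, ...).
agent.register_device(manager)
agent.start()    # gRPC server and volume manager run in background threads
# ... serve until shutdown is requested, then:
agent.stop()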
+-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-from .volume import VolumeException +-from .volume import VolumeManager +-from .crypto import CryptoEngine +-from .crypto import CryptoException +-from .crypto import set_crypto_engine +-from .crypto import get_crypto_engine +-from .crypto import register_crypto_engine ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++from .volume import VolumeException ++from .volume import VolumeManager ++from .crypto import CryptoEngine ++from .crypto import CryptoException ++from .crypto import set_crypto_engine ++from .crypto import get_crypto_engine ++from .crypto import register_crypto_engine +diff --git a/python/spdk/sma/volume/crypto.py b/python/spdk/sma/volume/crypto.py +index 3edfdf5..4283a55 100644 +--- a/python/spdk/sma/volume/crypto.py ++++ b/python/spdk/sma/volume/crypto.py +@@ -1,91 +1,91 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-import grpc +-import logging +- +- +-log = logging.getLogger(__name__) +- +- +-class CryptoException(Exception): +- def __init__(self, code, message): +- self.code = code +- self.message = message +- +- +-class CryptoEngine: +- def __init__(self, name): +- self.name = name +- +- def init(self, client, params): +- """Initialize crypto engine""" +- self._client = client +- +- def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- """Set up crypto on a given volume""" +- raise NotImplementedError() +- +- def cleanup(self, volume_id): +- """ +- Disable crypto on a given volume. If crypto was not configured on that volume, this method +- is a no-op and shouldn't raise any exceptions. +- """ +- raise NotImplementedError() +- +- def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- """ +- Verify that specified crypto parameters match those that are currently deployed on a given +- volume. If key is None, this mehtod ensures that the volume doesn't use crypto. If +- something is wrong (e.g. keys don't match, different cipher is used, etc.), this method +- raises CryptoException. +- """ +- raise NotImplementedError() +- +- def get_crypto_bdev(self, volume_id): +- """ +- Return the name of a crypto bdev on a given volume. This method might return volume_id if +- crypto engine doesn't create a separate crypto bdev to set up crypto. If crypto is +- disabled on a given volue, this method returns None. 
+- """ +- raise NotImplementedError() +- +- +-class CryptoEngineNop(CryptoEngine): +- def __init__(self): +- super().__init__('nop') +- +- def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- raise CryptoException(grpc.StatusCode.INVALID_ARGUMENT, 'Crypto is disabled') +- +- def cleanup(self, volume_id): +- pass +- +- def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- pass +- +- def get_crypto_bdev(self, volume_id): +- return None +- +- +-_crypto_engine = None +-_crypto_engines = {} +- +- +-def get_crypto_engine(): +- return _crypto_engine +- +- +-def set_crypto_engine(name): +- global _crypto_engine +- engine = _crypto_engines.get(name) +- if engine is None: +- raise ValueError(f'Unknown crypto engine: {name}') +- log.info(f'Setting crypto engine: {name}') +- _crypto_engine = engine +- +- +-def register_crypto_engine(engine): +- global _crypto_engines +- _crypto_engines[engine.name] = engine ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import grpc ++import logging ++ ++ ++log = logging.getLogger(__name__) ++ ++ ++class CryptoException(Exception): ++ def __init__(self, code, message): ++ self.code = code ++ self.message = message ++ ++ ++class CryptoEngine: ++ def __init__(self, name): ++ self.name = name ++ ++ def init(self, client, params): ++ """Initialize crypto engine""" ++ self._client = client ++ ++ def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ """Set up crypto on a given volume""" ++ raise NotImplementedError() ++ ++ def cleanup(self, volume_id): ++ """ ++ Disable crypto on a given volume. If crypto was not configured on that volume, this method ++ is a no-op and shouldn't raise any exceptions. ++ """ ++ raise NotImplementedError() ++ ++ def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ """ ++ Verify that specified crypto parameters match those that are currently deployed on a given ++ volume. If key is None, this mehtod ensures that the volume doesn't use crypto. If ++ something is wrong (e.g. keys don't match, different cipher is used, etc.), this method ++ raises CryptoException. ++ """ ++ raise NotImplementedError() ++ ++ def get_crypto_bdev(self, volume_id): ++ """ ++ Return the name of a crypto bdev on a given volume. This method might return volume_id if ++ crypto engine doesn't create a separate crypto bdev to set up crypto. If crypto is ++ disabled on a given volue, this method returns None. 
++ """ ++ raise NotImplementedError() ++ ++ ++class CryptoEngineNop(CryptoEngine): ++ def __init__(self): ++ super().__init__('nop') ++ ++ def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ raise CryptoException(grpc.StatusCode.INVALID_ARGUMENT, 'Crypto is disabled') ++ ++ def cleanup(self, volume_id): ++ pass ++ ++ def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ pass ++ ++ def get_crypto_bdev(self, volume_id): ++ return None ++ ++ ++_crypto_engine = None ++_crypto_engines = {} ++ ++ ++def get_crypto_engine(): ++ return _crypto_engine ++ ++ ++def set_crypto_engine(name): ++ global _crypto_engine ++ engine = _crypto_engines.get(name) ++ if engine is None: ++ raise ValueError(f'Unknown crypto engine: {name}') ++ log.info(f'Setting crypto engine: {name}') ++ _crypto_engine = engine ++ ++ ++def register_crypto_engine(engine): ++ global _crypto_engines ++ _crypto_engines[engine.name] = engine +diff --git a/python/spdk/sma/volume/crypto_bdev.py b/python/spdk/sma/volume/crypto_bdev.py +index 8d465ad..d9f2f50 100644 +--- a/python/spdk/sma/volume/crypto_bdev.py ++++ b/python/spdk/sma/volume/crypto_bdev.py +@@ -1,136 +1,136 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-import grpc +-import logging +-import uuid +-from spdk.rpc.client import JSONRPCException +-from . import crypto +-from ..common import format_volume_id +-from ..proto import sma_pb2 +- +- +-log = logging.getLogger(__name__) +- +- +-class CryptoEngineBdev(crypto.CryptoEngine): +- _ciphers = {sma_pb2.VolumeCryptoParameters.AES_CBC: 'AES_CBC', +- sma_pb2.VolumeCryptoParameters.AES_XTS: 'AES_XTS'} +- +- def __init__(self): +- super().__init__('bdev_crypto') +- +- def init(self, client, params): +- super().init(client, params) +- # _driver can be None +- self._driver = params.get('driver') +- +- def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- try: +- with self._client() as client: +- cipher = self._ciphers.get(cipher) +- if cipher is None: +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad cipher') +- params = {'base_bdev_name': volume_id, +- 'name': str(uuid.uuid4()), +- 'key': key, +- 'cipher': cipher} +- if self._driver is not None: +- params['crypto_pmd'] = self._driver +- if key2 is not None: +- params['key2'] = key2 +- if tweak_mode is not None and tweak_mode != sma_pb2.VolumeCryptoParameters.TWEAK_MODE_SIMPLE_LBA: +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad tweak_mode') +- +- log.info('Creating crypto bdev: {} on volume: {}'.format( +- params['name'], volume_id)) +- client.call('bdev_crypto_create', params) +- except JSONRPCException: +- raise crypto.CryptoException(grpc.StatusCode.INTERNAL, +- f'Failed to setup crypto for volume: {volume_id}') +- +- def cleanup(self, volume_id): +- crypto_bdev = self.get_crypto_bdev(volume_id) +- # If there's no crypto bdev set up on top of this volume, we're done +- if crypto_bdev is None: +- return +- try: +- with self._client() as client: +- log.info('Deleting crypto bdev: {} from volume: {}'.format( +- crypto_bdev, volume_id)) +- client.call('bdev_crypto_delete', {'name': crypto_bdev}) +- except JSONRPCException: +- raise crypto.CryptoException(grpc.StatusCode.INTERNAL, +- 'Failed to delete crypto bdev') +- +- def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): +- crypto_bdev = self._get_crypto_bdev(volume_id) +- 
# Key being None/non-None defines whether we expect a bdev_crypto on top of a given volume +- if ((key is None and crypto_bdev is not None) or (key is not None and crypto_bdev is None)): +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration') +- if key is None: +- return +- params = crypto_bdev['driver_specific']['crypto'] +- crypto_key = self._get_crypto_key(params['key_name']) +- if crypto_key is None: +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'No key object found') +- cipher = self._ciphers.get(cipher) +- if cipher is None: +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad cipher') +- if crypto_key['cipher'].lower() != cipher.lower(): +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad cipher') +- if crypto_key['key'].lower() != key.lower(): +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad key') +- if key2 is not None and crypto_key.get('key2', '').lower() != key2.lower(): +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad key2') +- if crypto_key['name'].lower() != params['key_name'].lower(): +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: key name does not match') +- if tweak_mode is not None and tweak_mode != sma_pb2.VolumeCryptoParameters.TWEAK_MODE_SIMPLE_LBA: +- raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume crypto configuration: bad tweak_mode') +- +- def _get_crypto_bdev(self, volume_id): +- try: +- with self._client() as client: +- bdevs = client.call('bdev_get_bdevs') +- for bdev in [b for b in bdevs if b['product_name'] == 'crypto']: +- base_name = bdev['driver_specific']['crypto']['base_bdev_name'] +- base_bdev = next(filter(lambda b: b['name'] == base_name, bdevs), None) +- # Should never really happen, but check it just in case +- if base_bdev is None: +- raise crypto.CryptoException( +- grpc.StatusCode.INTERNAL, +- 'Unexpected crypto configuration: cannot find base bdev') +- if format_volume_id(base_bdev['uuid']) == volume_id: +- return bdev +- # There's no crypto bdev set up on top of this volume +- return None +- except JSONRPCException: +- raise crypto.CryptoException(grpc.StatusCode.INTERNAL, +- f'Failed to get bdev_crypto for volume: {volume_id}') +- +- def _get_crypto_key(self, key_name): +- try: +- with self._client() as client: +- _keys = client.call('accel_crypto_keys_get', {'key_name': key_name}) +- if _keys is not None: +- return _keys[0] +- return None +- except JSONRPCException: +- pass +- +- def get_crypto_bdev(self, volume_id): +- bdev = self._get_crypto_bdev(volume_id) +- if bdev is not None: +- return bdev['name'] +- return None ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import grpc ++import logging ++import uuid ++from spdk.rpc.client import JSONRPCException ++from . 
import crypto ++from ..common import format_volume_id ++from ..proto import sma_pb2 ++ ++ ++log = logging.getLogger(__name__) ++ ++ ++class CryptoEngineBdev(crypto.CryptoEngine): ++ _ciphers = {sma_pb2.VolumeCryptoParameters.AES_CBC: 'AES_CBC', ++ sma_pb2.VolumeCryptoParameters.AES_XTS: 'AES_XTS'} ++ ++ def __init__(self): ++ super().__init__('bdev_crypto') ++ ++ def init(self, client, params): ++ super().init(client, params) ++ # _driver can be None ++ self._driver = params.get('driver') ++ ++ def setup(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ try: ++ with self._client() as client: ++ cipher = self._ciphers.get(cipher) ++ if cipher is None: ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad cipher') ++ params = {'base_bdev_name': volume_id, ++ 'name': str(uuid.uuid4()), ++ 'key': key, ++ 'cipher': cipher} ++ if self._driver is not None: ++ params['crypto_pmd'] = self._driver ++ if key2 is not None: ++ params['key2'] = key2 ++ if tweak_mode is not None and tweak_mode != sma_pb2.VolumeCryptoParameters.TWEAK_MODE_SIMPLE_LBA: ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad tweak_mode') ++ ++ log.info('Creating crypto bdev: {} on volume: {}'.format( ++ params['name'], volume_id)) ++ client.call('bdev_crypto_create', params) ++ except JSONRPCException: ++ raise crypto.CryptoException(grpc.StatusCode.INTERNAL, ++ f'Failed to setup crypto for volume: {volume_id}') ++ ++ def cleanup(self, volume_id): ++ crypto_bdev = self.get_crypto_bdev(volume_id) ++ # If there's no crypto bdev set up on top of this volume, we're done ++ if crypto_bdev is None: ++ return ++ try: ++ with self._client() as client: ++ log.info('Deleting crypto bdev: {} from volume: {}'.format( ++ crypto_bdev, volume_id)) ++ client.call('bdev_crypto_delete', {'name': crypto_bdev}) ++ except JSONRPCException: ++ raise crypto.CryptoException(grpc.StatusCode.INTERNAL, ++ 'Failed to delete crypto bdev') ++ ++ def verify(self, volume_id, key, cipher, key2=None, tweak_mode=None): ++ crypto_bdev = self._get_crypto_bdev(volume_id) ++ # Key being None/non-None defines whether we expect a bdev_crypto on top of a given volume ++ if ((key is None and crypto_bdev is not None) or (key is not None and crypto_bdev is None)): ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration') ++ if key is None: ++ return ++ params = crypto_bdev['driver_specific']['crypto'] ++ crypto_key = self._get_crypto_key(params['key_name']) ++ if crypto_key is None: ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'No key object found') ++ cipher = self._ciphers.get(cipher) ++ if cipher is None: ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad cipher') ++ if crypto_key['cipher'].lower() != cipher.lower(): ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad cipher') ++ if crypto_key['key'].lower() != key.lower(): ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad key') ++ if key2 is not None and crypto_key.get('key2', '').lower() != key2.lower(): ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad key2') ++ if crypto_key['name'].lower() != params['key_name'].lower(): ++ raise 
crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: key name does not match') ++ if tweak_mode is not None and tweak_mode != sma_pb2.VolumeCryptoParameters.TWEAK_MODE_SIMPLE_LBA: ++ raise crypto.CryptoException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume crypto configuration: bad tweak_mode') ++ ++ def _get_crypto_bdev(self, volume_id): ++ try: ++ with self._client() as client: ++ bdevs = client.call('bdev_get_bdevs') ++ for bdev in [b for b in bdevs if b['product_name'] == 'crypto']: ++ base_name = bdev['driver_specific']['crypto']['base_bdev_name'] ++ base_bdev = next(filter(lambda b: b['name'] == base_name, bdevs), None) ++ # Should never really happen, but check it just in case ++ if base_bdev is None: ++ raise crypto.CryptoException( ++ grpc.StatusCode.INTERNAL, ++ 'Unexpected crypto configuration: cannot find base bdev') ++ if format_volume_id(base_bdev['uuid']) == volume_id: ++ return bdev ++ # There's no crypto bdev set up on top of this volume ++ return None ++ except JSONRPCException: ++ raise crypto.CryptoException(grpc.StatusCode.INTERNAL, ++ f'Failed to get bdev_crypto for volume: {volume_id}') ++ ++ def _get_crypto_key(self, key_name): ++ try: ++ with self._client() as client: ++ _keys = client.call('accel_crypto_keys_get', {'key_name': key_name}) ++ if _keys is not None: ++ return _keys[0] ++ return None ++ except JSONRPCException: ++ pass ++ ++ def get_crypto_bdev(self, volume_id): ++ bdev = self._get_crypto_bdev(volume_id) ++ if bdev is not None: ++ return bdev['name'] ++ return None +diff --git a/python/spdk/sma/volume/volume.py b/python/spdk/sma/volume/volume.py +index 7d49503..c66231b 100644 +--- a/python/spdk/sma/volume/volume.py ++++ b/python/spdk/sma/volume/volume.py +@@ -1,330 +1,330 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation. +-# All rights reserved. +- +-import grpc +-import ipaddress +-import logging +-import threading +-import uuid +-from dataclasses import dataclass +-from spdk.rpc.client import JSONRPCException +-from . 
import crypto +-from ..common import format_volume_id +-from ..proto import sma_pb2 +- +- +-log = logging.getLogger(__name__) +- +- +-class VolumeException(Exception): +- def __init__(self, code, message): +- self.code = code +- self.message = message +- +- +-class Volume: +- def __init__(self, volume_id, device_handle, discovery_services): +- self.volume_id = volume_id +- self.discovery_services = discovery_services +- self.device_handle = device_handle +- +- +-class VolumeManager: +- def __init__(self, client, discovery_timeout, cleanup_period): +- self._client = client +- # Discovery service map (name -> refcnt) +- self._discovery = {} +- # Volume map (volume_id -> Volume) +- self._volumes = {} +- self._discovery_timeout = int(discovery_timeout * 1000) +- self._cleanup_period = cleanup_period +- self._lock = threading.Lock() +- self._cv = threading.Condition(self._lock) +- self._running = False +- self._thread = None +- +- def _locked(f): +- def wrapper(self, *args, **kwargs): +- self._lock.acquire() +- try: +- return f(self, *args, **kwargs) +- finally: +- self._lock.release() +- return wrapper +- +- def start(self): +- if self._thread is not None: +- raise ValueError('Volume manager was already started') +- self._running = True +- self._thread = threading.Thread(target=self._cleanup_thread, args=(self,)) +- self._thread.start() +- +- def stop(self): +- if self._thread is None: +- return +- with self._lock: +- self._running = False +- self._cv.notify_all() +- self._thread.join() +- self._thread = None +- +- @staticmethod +- def _cleanup_thread(*args): +- self, = args +- with self._lock: +- while self._running: +- self._cleanup_volumes() +- self._cv.wait(self._cleanup_period) +- +- def _cleanup_volumes(self): +- try: +- disconnected = [] +- with self._client() as client: +- bdevs = client.call('bdev_get_bdevs') +- for volume_id in self._volumes: +- if volume_id not in [b['uuid'] for b in bdevs]: +- log.warning(f'Found disconnected volume: {volume_id}') +- disconnected.append(volume_id) +- for volume_id in disconnected: +- self._disconnect_volume(volume_id) +- except VolumeException as ex: +- log.error(f'Failure when trying to disconnect volumes: {ex.message}') +- except JSONRPCException as ex: +- log.error(f'Failed to retrieve bdevs: {ex.message}') +- +- def _get_discovery_info(self): +- try: +- with self._client() as client: +- return client.call('bdev_nvme_get_discovery_info') +- except JSONRPCException: +- raise VolumeException(grpc.StatusCode.INTERNAL, +- 'Failed to retrieve discovery service status') +- +- def _compare_trid(self, trid1, trid2): +- return (trid1['trtype'].lower() == trid2['trtype'].lower() and +- trid1['traddr'].lower() == trid2['traddr'].lower() and +- trid1['trsvcid'].lower() == trid2['trsvcid'].lower() and +- trid1['adrfam'].lower() == trid2['adrfam'].lower()) +- +- def _get_adrfam(self, traddr): +- try: +- return 'ipv{}'.format(ipaddress.ip_address(traddr).version) +- except ValueError: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid traddr') +- +- def _get_volume_bdev(self, volume_id, timeout): +- try: +- with self._client() as client: +- return client.call('bdev_get_bdevs', +- {'name': volume_id, +- 'timeout': timeout})[0] +- except JSONRPCException: +- return None +- +- def _start_discovery(self, trid, hostnqn): +- try: +- # Use random UUID as name +- name = str(uuid.uuid4()) +- log.debug(f'Starting discovery service {name}') +- with self._client() as client: +- client.call('bdev_nvme_start_discovery', +- {'name': name, +- 
'wait_for_attach': True, +- 'attach_timeout_ms': self._discovery_timeout, +- 'hostnqn': hostnqn, +- **trid}) +- self._discovery[name] = 1 +- return name +- except JSONRPCException: +- raise VolumeException(grpc.StatusCode.INTERNAL, +- 'Failed to start discovery') +- +- def _stop_discovery(self, name): +- refcnt = self._discovery.get(name) +- log.debug(f'Stopping discovery service {name}, refcnt={refcnt}') +- if refcnt is None: +- # Should never happen +- log.warning('Tried to stop discovery using non-existing name') +- return +- # Check the refcount to leave the service running if there are more volumes using it +- if refcnt > 1: +- self._discovery[name] = refcnt - 1 +- return +- del self._discovery[name] +- try: +- with self._client() as client: +- client.call('bdev_nvme_stop_discovery', +- {'name': name}) +- log.debug(f'Stopped discovery service {name}') +- except JSONRPCException: +- raise VolumeException(grpc.StatusCode.INTERNAL, +- 'Failed to stop discovery') +- +- def _get_crypto_params(self, params): +- key, cipher, key2, tweak_mode = None, None, None, None +- try: +- if params.HasField('crypto'): +- key, cipher = params.crypto.key.decode('ascii'), params.crypto.cipher +- if len(params.crypto.key2) > 0: +- key2 = params.crypto.key2.decode('ascii') +- if params.crypto.tweak_mode is not None: +- tweak_mode = params.crypto.tweak_mode +- except UnicodeDecodeError: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Corrupted crypto key') +- return key, cipher, key2, tweak_mode +- +- def _setup_crypto(self, volume_id, params): +- try: +- if not params.HasField('crypto'): +- return +- key, cipher, key2, tweak_mode = self._get_crypto_params(params) +- crypto.get_crypto_engine().setup(volume_id, key, cipher, key2, tweak_mode) +- except crypto.CryptoException as ex: +- raise VolumeException(ex.code, ex.message) +- +- def _cleanup_crypto(self, volume_id): +- try: +- crypto.get_crypto_engine().cleanup(volume_id) +- except crypto.CryptoException as ex: +- logging.warning(f'Failed to cleanup crypto: {ex.message}') +- +- def _verify_crypto(self, volume_id, params): +- try: +- key, cipher, key2, tweak_mode = self._get_crypto_params(params) +- crypto.get_crypto_engine().verify(volume_id, key, cipher, key2, tweak_mode) +- except crypto.CryptoException as ex: +- raise VolumeException(ex.code, ex.message) +- +- @_locked +- def connect_volume(self, params, device_handle=None): +- """ Connects a volume through a discovery service. Returns a tuple (volume_id, existing): +- the first item is a volume_id as str, while the second denotes whether the selected volume +- existed prior to calling this method. 
+- """ +- volume_id = format_volume_id(params.volume_id) +- if volume_id is None: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- if volume_id in self._volumes: +- volume = self._volumes[volume_id] +- if device_handle is not None and volume.device_handle != device_handle: +- raise VolumeException(grpc.StatusCode.ALREADY_EXISTS, +- 'Volume is already attached to a different device') +- # Make sure the crypto params are the same +- self._verify_crypto(volume_id, params) +- return volume_id, True +- discovery_services = set() +- try: +- # First start discovery connecting to specified endpoints +- for req_ep in params.nvmf.discovery.discovery_endpoints: +- info = self._get_discovery_info() +- trid = {'trtype': req_ep.trtype, +- 'traddr': req_ep.traddr, +- 'trsvcid': req_ep.trsvcid, +- 'adrfam': self._get_adrfam(req_ep.traddr)} +- name = None +- for discovery in info: +- if self._compare_trid(discovery['trid'], trid): +- name = discovery['name'] +- break +- if next(filter(lambda r: self._compare_trid(r['trid'], trid), +- discovery['referrals']), None): +- name = discovery['name'] +- break +- if name is not None: +- # If we've already attached a discovery service, it probably means that the user +- # specified a referred address +- if name not in discovery_services: +- refcnt = self._discovery.get(name) +- if refcnt is None: +- log.warning('Found a discovery service missing from internal map') +- refcnt = 0 +- self._discovery[name] = refcnt + 1 +- else: +- name = self._start_discovery(trid, params.nvmf.hostnqn) +- discovery_services.add(name) +- +- # Now check if a bdev with specified volume_id exists, give it 1s to appear +- bdev = self._get_volume_bdev(volume_id, timeout=1000) +- if bdev is None: +- raise VolumeException(grpc.StatusCode.NOT_FOUND, +- 'Volume could not be found') +- # Check subsystem's NQN if it's specified +- if params.nvmf.subnqn: +- nvme = bdev.get('driver_specific', {}).get('nvme', []) +- # The NVMe bdev can report multiple subnqns, but they all should be the same, so +- # don't bother checking more than the first one +- subnqn = next(iter(nvme), {}).get('trid', {}).get('subnqn') +- if subnqn != params.nvmf.subnqn: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Unexpected subsystem NQN') +- self._setup_crypto(volume_id, params) +- # Finally remember that volume +- self._volumes[volume_id] = Volume(volume_id, device_handle, discovery_services) +- except Exception as ex: +- for name in discovery_services: +- try: +- self._stop_discovery(name) +- except Exception: +- log.warning(f'Failed to cleanup discovery service: {name}') +- raise ex +- return volume_id, False +- +- def _disconnect_volume(self, volume_id): +- id = format_volume_id(volume_id) +- if id is None: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- # Return immediately if the volume is not on our map +- volume = self._volumes.get(id) +- if volume is None: +- return +- self._cleanup_crypto(id) +- # Delete the volume from the map and stop the services it uses +- for name in volume.discovery_services: +- try: +- self._stop_discovery(name) +- except Exception: +- # There's no good way to handle this, so just print an error message and +- # continue +- log.error(f'Failed to stop discovery service: {name}') +- del self._volumes[id] +- +- @_locked +- def disconnect_volume(self, volume_id): +- """Disconnects a volume connected through discovery service""" +- return self._disconnect_volume(volume_id) +- +- @_locked +- def 
set_device(self, volume_id, device_handle): +- """Marks a previously connected volume as being attached to specified device. This is only +- necessary if the device handle is not known at a time a volume is connected. +- """ +- id = format_volume_id(volume_id) +- if id is None: +- raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, +- 'Invalid volume ID') +- volume = self._volumes.get(id) +- if volume is None: +- raise VolumeException(grpc.StatusCode.NOT_FOUND, +- 'Volume could not be found') +- if volume.device_handle is not None and volume.device_handle != device_handle: +- raise VolumeException(grpc.StatusCode.ALREADY_EXISTS, +- 'Volume is already attached to a different device') +- volume.device_handle = device_handle +- +- @_locked +- def disconnect_device_volumes(self, device_handle): +- """Disconnects all volumes attached to a specific device""" +- volumes = [i for i, v in self._volumes.items() if v.device_handle == device_handle] +- for volume_id in volumes: +- self._disconnect_volume(volume_id) +- +- @_locked +- def has_volumes(self, device_handle): +- """Checks whether a given device has volumes attached to it""" +- return next(filter(lambda v: v.device_handle == device_handle, +- self._volumes.values()), None) is not None ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation. ++# All rights reserved. ++ ++import grpc ++import ipaddress ++import logging ++import threading ++import uuid ++from dataclasses import dataclass ++from spdk.rpc.client import JSONRPCException ++from . import crypto ++from ..common import format_volume_id ++from ..proto import sma_pb2 ++ ++ ++log = logging.getLogger(__name__) ++ ++ ++class VolumeException(Exception): ++ def __init__(self, code, message): ++ self.code = code ++ self.message = message ++ ++ ++class Volume: ++ def __init__(self, volume_id, device_handle, discovery_services): ++ self.volume_id = volume_id ++ self.discovery_services = discovery_services ++ self.device_handle = device_handle ++ ++ ++class VolumeManager: ++ def __init__(self, client, discovery_timeout, cleanup_period): ++ self._client = client ++ # Discovery service map (name -> refcnt) ++ self._discovery = {} ++ # Volume map (volume_id -> Volume) ++ self._volumes = {} ++ self._discovery_timeout = int(discovery_timeout * 1000) ++ self._cleanup_period = cleanup_period ++ self._lock = threading.Lock() ++ self._cv = threading.Condition(self._lock) ++ self._running = False ++ self._thread = None ++ ++ def _locked(f): ++ def wrapper(self, *args, **kwargs): ++ self._lock.acquire() ++ try: ++ return f(self, *args, **kwargs) ++ finally: ++ self._lock.release() ++ return wrapper ++ ++ def start(self): ++ if self._thread is not None: ++ raise ValueError('Volume manager was already started') ++ self._running = True ++ self._thread = threading.Thread(target=self._cleanup_thread, args=(self,)) ++ self._thread.start() ++ ++ def stop(self): ++ if self._thread is None: ++ return ++ with self._lock: ++ self._running = False ++ self._cv.notify_all() ++ self._thread.join() ++ self._thread = None ++ ++ @staticmethod ++ def _cleanup_thread(*args): ++ self, = args ++ with self._lock: ++ while self._running: ++ self._cleanup_volumes() ++ self._cv.wait(self._cleanup_period) ++ ++ def _cleanup_volumes(self): ++ try: ++ disconnected = [] ++ with self._client() as client: ++ bdevs = client.call('bdev_get_bdevs') ++ for volume_id in self._volumes: ++ if volume_id not in [b['uuid'] for b in bdevs]: ++ log.warning(f'Found disconnected volume: {volume_id}') ++ 
disconnected.append(volume_id) ++ for volume_id in disconnected: ++ self._disconnect_volume(volume_id) ++ except VolumeException as ex: ++ log.error(f'Failure when trying to disconnect volumes: {ex.message}') ++ except JSONRPCException as ex: ++ log.error(f'Failed to retrieve bdevs: {ex.message}') ++ ++ def _get_discovery_info(self): ++ try: ++ with self._client() as client: ++ return client.call('bdev_nvme_get_discovery_info') ++ except JSONRPCException: ++ raise VolumeException(grpc.StatusCode.INTERNAL, ++ 'Failed to retrieve discovery service status') ++ ++ def _compare_trid(self, trid1, trid2): ++ return (trid1['trtype'].lower() == trid2['trtype'].lower() and ++ trid1['traddr'].lower() == trid2['traddr'].lower() and ++ trid1['trsvcid'].lower() == trid2['trsvcid'].lower() and ++ trid1['adrfam'].lower() == trid2['adrfam'].lower()) ++ ++ def _get_adrfam(self, traddr): ++ try: ++ return 'ipv{}'.format(ipaddress.ip_address(traddr).version) ++ except ValueError: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid traddr') ++ ++ def _get_volume_bdev(self, volume_id, timeout): ++ try: ++ with self._client() as client: ++ return client.call('bdev_get_bdevs', ++ {'name': volume_id, ++ 'timeout': timeout})[0] ++ except JSONRPCException: ++ return None ++ ++ def _start_discovery(self, trid, hostnqn): ++ try: ++ # Use random UUID as name ++ name = str(uuid.uuid4()) ++ log.debug(f'Starting discovery service {name}') ++ with self._client() as client: ++ client.call('bdev_nvme_start_discovery', ++ {'name': name, ++ 'wait_for_attach': True, ++ 'attach_timeout_ms': self._discovery_timeout, ++ 'hostnqn': hostnqn, ++ **trid}) ++ self._discovery[name] = 1 ++ return name ++ except JSONRPCException: ++ raise VolumeException(grpc.StatusCode.INTERNAL, ++ 'Failed to start discovery') ++ ++ def _stop_discovery(self, name): ++ refcnt = self._discovery.get(name) ++ log.debug(f'Stopping discovery service {name}, refcnt={refcnt}') ++ if refcnt is None: ++ # Should never happen ++ log.warning('Tried to stop discovery using non-existing name') ++ return ++ # Check the refcount to leave the service running if there are more volumes using it ++ if refcnt > 1: ++ self._discovery[name] = refcnt - 1 ++ return ++ del self._discovery[name] ++ try: ++ with self._client() as client: ++ client.call('bdev_nvme_stop_discovery', ++ {'name': name}) ++ log.debug(f'Stopped discovery service {name}') ++ except JSONRPCException: ++ raise VolumeException(grpc.StatusCode.INTERNAL, ++ 'Failed to stop discovery') ++ ++ def _get_crypto_params(self, params): ++ key, cipher, key2, tweak_mode = None, None, None, None ++ try: ++ if params.HasField('crypto'): ++ key, cipher = params.crypto.key.decode('ascii'), params.crypto.cipher ++ if len(params.crypto.key2) > 0: ++ key2 = params.crypto.key2.decode('ascii') ++ if params.crypto.tweak_mode is not None: ++ tweak_mode = params.crypto.tweak_mode ++ except UnicodeDecodeError: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Corrupted crypto key') ++ return key, cipher, key2, tweak_mode ++ ++ def _setup_crypto(self, volume_id, params): ++ try: ++ if not params.HasField('crypto'): ++ return ++ key, cipher, key2, tweak_mode = self._get_crypto_params(params) ++ crypto.get_crypto_engine().setup(volume_id, key, cipher, key2, tweak_mode) ++ except crypto.CryptoException as ex: ++ raise VolumeException(ex.code, ex.message) ++ ++ def _cleanup_crypto(self, volume_id): ++ try: ++ crypto.get_crypto_engine().cleanup(volume_id) ++ except crypto.CryptoException as ex: ++ 
logging.warning(f'Failed to cleanup crypto: {ex.message}') ++ ++ def _verify_crypto(self, volume_id, params): ++ try: ++ key, cipher, key2, tweak_mode = self._get_crypto_params(params) ++ crypto.get_crypto_engine().verify(volume_id, key, cipher, key2, tweak_mode) ++ except crypto.CryptoException as ex: ++ raise VolumeException(ex.code, ex.message) ++ ++ @_locked ++ def connect_volume(self, params, device_handle=None): ++ """ Connects a volume through a discovery service. Returns a tuple (volume_id, existing): ++ the first item is a volume_id as str, while the second denotes whether the selected volume ++ existed prior to calling this method. ++ """ ++ volume_id = format_volume_id(params.volume_id) ++ if volume_id is None: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ if volume_id in self._volumes: ++ volume = self._volumes[volume_id] ++ if device_handle is not None and volume.device_handle != device_handle: ++ raise VolumeException(grpc.StatusCode.ALREADY_EXISTS, ++ 'Volume is already attached to a different device') ++ # Make sure the crypto params are the same ++ self._verify_crypto(volume_id, params) ++ return volume_id, True ++ discovery_services = set() ++ try: ++ # First start discovery connecting to specified endpoints ++ for req_ep in params.nvmf.discovery.discovery_endpoints: ++ info = self._get_discovery_info() ++ trid = {'trtype': req_ep.trtype, ++ 'traddr': req_ep.traddr, ++ 'trsvcid': req_ep.trsvcid, ++ 'adrfam': self._get_adrfam(req_ep.traddr)} ++ name = None ++ for discovery in info: ++ if self._compare_trid(discovery['trid'], trid): ++ name = discovery['name'] ++ break ++ if next(filter(lambda r: self._compare_trid(r['trid'], trid), ++ discovery['referrals']), None): ++ name = discovery['name'] ++ break ++ if name is not None: ++ # If we've already attached a discovery service, it probably means that the user ++ # specified a referred address ++ if name not in discovery_services: ++ refcnt = self._discovery.get(name) ++ if refcnt is None: ++ log.warning('Found a discovery service missing from internal map') ++ refcnt = 0 ++ self._discovery[name] = refcnt + 1 ++ else: ++ name = self._start_discovery(trid, params.nvmf.hostnqn) ++ discovery_services.add(name) ++ ++ # Now check if a bdev with specified volume_id exists, give it 1s to appear ++ bdev = self._get_volume_bdev(volume_id, timeout=1000) ++ if bdev is None: ++ raise VolumeException(grpc.StatusCode.NOT_FOUND, ++ 'Volume could not be found') ++ # Check subsystem's NQN if it's specified ++ if params.nvmf.subnqn: ++ nvme = bdev.get('driver_specific', {}).get('nvme', []) ++ # The NVMe bdev can report multiple subnqns, but they all should be the same, so ++ # don't bother checking more than the first one ++ subnqn = next(iter(nvme), {}).get('trid', {}).get('subnqn') ++ if subnqn != params.nvmf.subnqn: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Unexpected subsystem NQN') ++ self._setup_crypto(volume_id, params) ++ # Finally remember that volume ++ self._volumes[volume_id] = Volume(volume_id, device_handle, discovery_services) ++ except Exception as ex: ++ for name in discovery_services: ++ try: ++ self._stop_discovery(name) ++ except Exception: ++ log.warning(f'Failed to cleanup discovery service: {name}') ++ raise ex ++ return volume_id, False ++ ++ def _disconnect_volume(self, volume_id): ++ id = format_volume_id(volume_id) ++ if id is None: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ # Return immediately if the volume is 
not on our map ++ volume = self._volumes.get(id) ++ if volume is None: ++ return ++ self._cleanup_crypto(id) ++ # Delete the volume from the map and stop the services it uses ++ for name in volume.discovery_services: ++ try: ++ self._stop_discovery(name) ++ except Exception: ++ # There's no good way to handle this, so just print an error message and ++ # continue ++ log.error(f'Failed to stop discovery service: {name}') ++ del self._volumes[id] ++ ++ @_locked ++ def disconnect_volume(self, volume_id): ++ """Disconnects a volume connected through discovery service""" ++ return self._disconnect_volume(volume_id) ++ ++ @_locked ++ def set_device(self, volume_id, device_handle): ++ """Marks a previously connected volume as being attached to specified device. This is only ++ necessary if the device handle is not known at a time a volume is connected. ++ """ ++ id = format_volume_id(volume_id) ++ if id is None: ++ raise VolumeException(grpc.StatusCode.INVALID_ARGUMENT, ++ 'Invalid volume ID') ++ volume = self._volumes.get(id) ++ if volume is None: ++ raise VolumeException(grpc.StatusCode.NOT_FOUND, ++ 'Volume could not be found') ++ if volume.device_handle is not None and volume.device_handle != device_handle: ++ raise VolumeException(grpc.StatusCode.ALREADY_EXISTS, ++ 'Volume is already attached to a different device') ++ volume.device_handle = device_handle ++ ++ @_locked ++ def disconnect_device_volumes(self, device_handle): ++ """Disconnects all volumes attached to a specific device""" ++ volumes = [i for i, v in self._volumes.items() if v.device_handle == device_handle] ++ for volume_id in volumes: ++ self._disconnect_volume(volume_id) ++ ++ @_locked ++ def has_volumes(self, device_handle): ++ """Checks whether a given device has volumes attached to it""" ++ return next(filter(lambda v: v.device_handle == device_handle, ++ self._volumes.values()), None) is not None +diff --git a/python/spdk/spdkcli/__init__.py b/python/spdk/spdkcli/__init__.py +index 1b0bed7..6950e16 100644 +--- a/python/spdk/spdkcli/__init__.py ++++ b/python/spdk/spdkcli/__init__.py +@@ -1,5 +1,5 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +-from .ui_root import UIRoot ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++from .ui_root import UIRoot +diff --git a/python/spdk/spdkcli/ui_node.py b/python/spdk/spdkcli/ui_node.py +index e01ed2e..de0609c 100644 +--- a/python/spdk/spdkcli/ui_node.py ++++ b/python/spdk/spdkcli/ui_node.py +@@ -1,901 +1,901 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +-from configshell_fb import ConfigNode, ExecutionError +-from uuid import UUID +-from ..rpc.client import JSONRPCException +-import json +- +- +-def convert_bytes_to_human(size): +- if size == 0: +- return "%3.1f%s" % (size, "bytes") +- if not size: +- return "" +- for x in ["bytes", "K", "M", "G", "T"]: +- if size < 1024.0: +- return "%3.1f%s" % (size, x) +- size /= 1024.0 +- +- +-class UINode(ConfigNode): +- def __init__(self, name, parent=None, shell=None): +- ConfigNode.__init__(self, name, parent, shell) +- +- def refresh(self): +- for child in self.children: +- child.refresh() +- +- def refresh_node(self): +- self.refresh() +- +- def ui_command_refresh(self): +- self.refresh() +- +- def ui_command_ll(self, path=None, depth=None): +- """ +- Alias for ls. 
+- """ +- self.ui_command_ls(path, depth) +- +- def execute_command(self, command, pparams=[], kparams={}): +- try: +- result = ConfigNode.execute_command(self, command, +- pparams, kparams) +- except Exception as e: +- raise e +- else: +- self.shell.log.debug("Command %s succeeded." % command) +- return result +- finally: +- if self.shell.interactive and\ +- command in ["create", "delete", "delete_all", "add_initiator", +- "allow_any_host", "bdev_split_create", "add_lun", +- "iscsi_target_node_add_pg_ig_maps", "remove_target", "add_secret", +- "bdev_split_delete", "bdev_pmem_delete_pool", +- "bdev_pmem_create_pool", "delete_secret_all", +- "delete_initiator", "set_auth", "delete_secret", +- "iscsi_target_node_remove_pg_ig_maps", "load_config", +- "load_subsystem_config"]: +- self.get_root().refresh() +- self.refresh_node() +- +- +-class UIBdevs(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "bdevs", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- UIMallocBdev(self) +- UIAIOBdev(self) +- UILvolBdev(self) +- UINvmeBdev(self) +- UINullBdev(self) +- UIErrorBdev(self) +- UISplitBdev(self) +- UIPmemBdev(self) +- UIRbdBdev(self) +- UIiSCSIBdev(self) +- UIVirtioBlkBdev(self) +- UIVirtioScsiBdev(self) +- UIRaidBdev(self) +- UIUringBdev(self) +- +- +-class UILvolStores(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "lvol_stores", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for lvs in self.get_root().bdev_lvol_get_lvstores(): +- UILvsObj(lvs, self) +- +- def delete(self, name, uuid): +- if name is None and uuid is None: +- self.shell.log.error("Please specify one of the identifiers: " +- "lvol store name or UUID") +- self.get_root().bdev_lvol_delete_lvstore(lvs_name=name, uuid=uuid) +- +- def ui_command_create(self, name, bdev_name, cluster_size=None): +- """ +- Creates logical volume store on target bdev. +- +- Arguments: +- name - Friendly name to use alongside with UUID identifier. +- bdev_name - On which bdev to create the lvol store. +- cluster_size - Cluster size to use when creating lvol store, in bytes. Default: 4194304. +- """ +- +- cluster_size = self.ui_eval_param(cluster_size, "number", None) +- self.get_root().bdev_lvol_create_lvstore(lvs_name=name, bdev_name=bdev_name, cluster_sz=cluster_size) +- +- def ui_command_delete(self, name=None, uuid=None): +- """ +- Deletes logical volume store from configuration. +- This will also delete all logical volume bdevs created on this lvol store! +- +- Arguments: +- name - Friendly name of the logical volume store to be deleted. +- uuid - UUID number of the logical volume store to be deleted. 
+- """ +- self.delete(name, uuid) +- +- def ui_command_delete_all(self): +- rpc_messages = "" +- for lvs in self._children: +- try: +- self.delete(None, lvs.lvs.uuid) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Lvol stores: %s" % len(self.children), None +- +- +-class UIBdev(UINode): +- def __init__(self, name, parent): +- UINode.__init__(self, name, parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for bdev in self.get_root().bdev_get_bdevs(self.name): +- UIBdevObj(bdev, self) +- +- def ui_command_get_bdev_iostat(self, name=None): +- ret = self.get_root().bdev_get_iostat(name=name) +- self.shell.log.info(json.dumps(ret, indent=2)) +- +- def ui_command_delete_all(self): +- """Delete all bdevs from this tree node.""" +- rpc_messages = "" +- for bdev in self._children: +- try: +- self.delete(bdev.name) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Bdevs: %d" % len(self.children), None +- +- +-class UIMallocBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "malloc", parent) +- +- def delete(self, name): +- self.get_root().bdev_malloc_delete(name=name) +- +- def ui_command_create(self, size, block_size, name=None, uuid=None): +- """ +- Construct a Malloc bdev. +- +- Arguments: +- size - Size in megabytes. +- block_size - Integer, block size to use when constructing bdev. +- name - Optional argument. Custom name to use for bdev. If not provided +- then name will be "MallocX" where X is next available ID. +- uuid - Optional parameter. Custom UUID to use. If empty then random +- will be generated. +- """ +- +- size = self.ui_eval_param(size, "number", None) +- block_size = self.ui_eval_param(block_size, "number", None) +- ret_name = self.get_root().create_malloc_bdev(num_blocks=size * 1024 * 1024 // block_size, +- block_size=block_size, +- name=name, uuid=uuid) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes malloc bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the malloc bdev to be deleted - UUID number or name alias. +- """ +- self.delete(name) +- +- +-class UIAIOBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "aio", parent) +- +- def delete(self, name): +- self.get_root().bdev_aio_delete(name=name) +- +- def ui_command_create(self, name, filename, block_size): +- """ +- Construct an AIO bdev. +- Backend file must exist before trying to create an AIO bdev. +- +- Arguments: +- name - Optional argument. Custom name to use for bdev. If not provided +- then name will be "MallocX" where X is next available ID. +- filename - Path to AIO backend. +- block_size - Integer, block size to use when constructing bdev. +- """ +- +- block_size = self.ui_eval_param(block_size, "number", None) +- ret_name = self.get_root().bdev_aio_create(name=name, +- block_size=int(block_size), +- filename=filename) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes aio bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the aio bdev to be deleted - UUID number or name alias. 
+- """ +- self.delete(name) +- +- +-class UILvolBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "logical_volume", parent) +- +- def delete(self, name): +- self.get_root().bdev_lvol_delete(name=name) +- +- def ui_command_create(self, name, size, lvs, thin_provision=None): +- """ +- Construct a Logical Volume bdev. +- +- Arguments: +- name - Friendly name to use for creating logical volume bdev. +- size - Size in megabytes. +- lvs - Identifier of logical volume store on which the bdev should be +- created. Can be either a friendly name or UUID. +- thin_provision - Whether the bdev should be thick or thin provisioned. +- Default is False, and created bdevs are thick-provisioned. +- """ +- uuid = None +- lvs_name = None +- try: +- UUID(lvs) +- uuid = lvs +- except ValueError: +- lvs_name = lvs +- +- size = self.ui_eval_param(size, "number", None) +- size *= (1024 * 1024) +- thin_provision = self.ui_eval_param(thin_provision, "bool", False) +- +- ret_uuid = self.get_root().create_lvol_bdev(lvol_name=name, size=size, +- lvs_name=lvs_name, uuid=uuid, +- thin_provision=thin_provision) +- self.shell.log.info(ret_uuid) +- +- def ui_command_delete(self, name): +- """ +- Deletes lvol bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the lvol bdev to be deleted - UUID number or name alias. +- """ +- self.delete(name) +- +- +-class UINvmeBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "nvme", parent) +- +- def delete(self, name): +- self.get_root().bdev_nvme_detach_controller(name=name) +- +- def ui_command_create(self, name, trtype, traddr, +- adrfam=None, trsvcid=None, subnqn=None): +- if "rdma" in trtype and None in [adrfam, trsvcid, subnqn]: +- self.shell.log.error("Using RDMA transport type." +- "Please provide arguments for adrfam, trsvcid and subnqn.") +- ret_name = self.get_root().create_nvme_bdev(name=name, trtype=trtype, +- traddr=traddr, adrfam=adrfam, +- trsvcid=trsvcid, subnqn=subnqn) +- self.shell.log.info(ret_name) +- +- def ui_command_delete_all(self): +- rpc_messages = "" +- ctrlrs = [x.name for x in self._children] +- ctrlrs = [x.rsplit("n", 1)[0] for x in ctrlrs] +- ctrlrs = set(ctrlrs) +- for ctrlr in ctrlrs: +- try: +- self.delete(ctrlr) +- except JSONRPCException as e: +- rpc_messages += e.messages +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def ui_command_delete(self, name): +- """ +- Deletes NVMe controller from configuration. +- +- Arguments: +- name - Is a unique identifier of the NVMe controller to be deleted. +- """ +- self.delete(name) +- +- +-class UINullBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "null", parent) +- +- def delete(self, name): +- self.get_root().bdev_null_delete(name=name) +- +- def ui_command_create(self, name, size, block_size, uuid=None): +- """ +- Construct a Null bdev. +- +- Arguments: +- name - Name to use for bdev. +- size - Size in megabytes. +- block_size - Integer, block size to use when constructing bdev. +- uuid - Optional parameter. Custom UUID to use. If empty then random +- will be generated. +- """ +- +- size = self.ui_eval_param(size, "number", None) +- block_size = self.ui_eval_param(block_size, "number", None) +- num_blocks = size * 1024 * 1024 // block_size +- ret_name = self.get_root().bdev_null_create(num_blocks=num_blocks, +- block_size=block_size, +- name=name, uuid=uuid) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes null bdev from configuration. 
+- +- Arguments: +- name - Is a unique identifier of the null bdev to be deleted - UUID number or name alias. +- """ +- self.delete(name) +- +- +-class UIErrorBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "error", parent) +- +- def delete(self, name): +- self.get_root().bdev_error_delete(name=name) +- +- def ui_command_create(self, base_name): +- """ +- Construct a error injection bdev. +- +- Arguments: +- base_name - base bdev name on top of which error bdev will be created. +- """ +- +- self.get_root().create_error_bdev(base_name=base_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes error bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the error bdev to be deleted - UUID number or name alias. +- """ +- self.delete(name) +- +- +-class UISplitBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "split_disk", parent) +- +- def delete(self, name): +- pass +- +- def ui_command_bdev_split_create(self, base_bdev, split_count, split_size_mb=None): +- """ +- Create split block devices from a base bdev. +- +- Arguments: +- base_bdev - Name of bdev to split +- split_count - Number of split bdevs to create +- split_size_mb- Size of each split volume in MiB (optional) +- """ +- +- split_count = self.ui_eval_param(split_count, "number", None) +- split_size_mb = self.ui_eval_param(split_size_mb, "number", None) +- +- ret_name = self.get_root().bdev_split_create(base_bdev=base_bdev, +- split_count=split_count, +- split_size_mb=split_size_mb) +- self.shell.log.info(ret_name) +- +- def ui_command_bdev_split_delete(self, base_bdev): +- """Delete split block devices associated with base bdev. +- +- Args: +- base_bdev: name of previously split bdev +- """ +- +- self.get_root().bdev_split_delete(base_bdev=base_bdev) +- +- +-class UIPmemBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "pmemblk", parent) +- +- def delete(self, name): +- self.get_root().bdev_pmem_delete(name=name) +- +- def ui_command_bdev_pmem_create_pool(self, pmem_file, total_size, block_size): +- total_size = self.ui_eval_param(total_size, "number", None) +- block_size = self.ui_eval_param(block_size, "number", None) +- num_blocks = int((total_size * 1024 * 1024) / block_size) +- +- self.get_root().bdev_pmem_create_pool(pmem_file=pmem_file, +- num_blocks=num_blocks, +- block_size=block_size) +- +- def ui_command_bdev_pmem_delete_pool(self, pmem_file): +- self.get_root().bdev_pmem_delete_pool(pmem_file=pmem_file) +- +- def ui_command_bdev_pmem_get_pool_info(self, pmem_file): +- ret = self.get_root().bdev_pmem_get_pool_info(pmem_file=pmem_file) +- self.shell.log.info(json.dumps(ret, indent=2)) +- +- def ui_command_create(self, pmem_file, name): +- ret_name = self.get_root().bdev_pmem_create(pmem_file=pmem_file, +- name=name) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes pmem bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the pmem bdev to be deleted - UUID number or name alias. 
+- """ +- self.delete(name) +- +- +-class UIRbdBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "rbd", parent) +- +- def delete(self, name): +- self.get_root().bdev_rbd_delete(name=name) +- +- def ui_command_create(self, pool_name, rbd_name, block_size, name=None): +- block_size = self.ui_eval_param(block_size, "number", None) +- +- ret_name = self.get_root().create_rbd_bdev(pool_name=pool_name, +- rbd_name=rbd_name, +- block_size=block_size, +- name=name) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes rbd bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the rbd bdev to be deleted - UUID number or name alias. +- """ +- self.delete(name) +- +- +-class UIiSCSIBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "iscsi", parent) +- +- def delete(self, name): +- self.get_root().bdev_iscsi_delete(name=name) +- +- def ui_command_create(self, name, url, initiator_iqn): +- """ +- Create iSCSI bdev in configuration by connecting to remote +- iSCSI target. +- +- Arguments: +- name - name to be used as an ID for created iSCSI bdev. +- url - iscsi url pointing to LUN on remote iSCSI target. +- Example: iscsi://127.0.0.1:3260/iqn.2018-06.org.spdk/0. +- initiator_iqn - IQN to use for initiating connection with the target. +- """ +- ret_name = self.get_root().create_iscsi_bdev(name=name, +- url=url, +- initiator_iqn=initiator_iqn) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes iSCSI bdev from configuration. +- +- Arguments: +- name - name of the iscsi bdev to be deleted. +- """ +- self.delete(name) +- +- +-class UIVirtioBlkBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "virtioblk_disk", parent) +- +- def ui_command_create(self, name, trtype, traddr, +- vq_count=None, vq_size=None): +- +- vq_count = self.ui_eval_param(vq_count, "number", None) +- vq_size = self.ui_eval_param(vq_size, "number", None) +- +- ret = self.get_root().create_virtio_dev(name=name, +- trtype=trtype, +- traddr=traddr, +- dev_type="blk", +- vq_count=vq_count, +- vq_size=vq_size) +- +- self.shell.log.info(ret) +- +- def ui_command_delete(self, name): +- """ +- Deletes virtio scsi bdev from configuration. +- +- Arguments: +- name - Is a unique identifier of the virtio scsi bdev to be deleted - UUID number or name alias. +- """ +- self.get_root().bdev_virtio_detach_controller(name=name) +- +- +-class UIVirtioScsiBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "virtioscsi_disk", parent) +- +- def refresh(self): +- self._children = set([]) +- for bdev in self.get_root().bdev_virtio_scsi_get_devices(): +- UIVirtioScsiBdevObj(bdev, self) +- +- def ui_command_create(self, name, trtype, traddr, +- vq_count=None, vq_size=None): +- +- vq_count = self.ui_eval_param(vq_count, "number", None) +- vq_size = self.ui_eval_param(vq_size, "number", None) +- +- ret = self.get_root().create_virtio_dev(name=name, +- trtype=trtype, +- traddr=traddr, +- dev_type="scsi", +- vq_count=vq_count, +- vq_size=vq_size) +- +- self.shell.log.info(ret) +- +- def ui_command_delete(self, name): +- self.get_root().bdev_virtio_detach_controller(name=name) +- +- +-class UIBdevObj(UINode): +- def __init__(self, bdev, parent): +- self.bdev = bdev +- # Using bdev name also for lvol bdevs, which results in displaying +- # UUID instead of alias. This is because alias naming convention +- # (lvol_store_name/lvol_bdev_name) conflicts with configshell paths +- # ("/" as separator). 
+- # Solution: show lvol alias in "summary field" for now. +- # TODO: Possible next steps: +- # - Either change default separator in tree for smth else +- # - or add a UI command which would be able to autocomplete +- # "cd" command based on objects alias and match is to the +- # "main" bdev name. +- UINode.__init__(self, self.bdev.name, parent) +- +- def ui_command_show_details(self): +- self.shell.log.info(json.dumps(vars(self.bdev), indent=2)) +- +- def summary(self): +- size = convert_bytes_to_human(self.bdev.block_size * self.bdev.num_blocks) +- size = "=".join(["Size", size]) +- +- in_use = "Not claimed" +- if bool(self.bdev.claimed): +- in_use = "Claimed" +- +- alias = None +- if self.bdev.aliases: +- alias = self.bdev.aliases[0] +- +- info = ", ".join([_f for _f in [alias, size, in_use] if _f]) +- return info, True +- +- +-class UIVirtioScsiBdevObj(UIBdevObj): +- def __init__(self, bdev, parent): +- UIBdevObj.__init__(self, bdev, parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for bdev in self.get_root().bdev_get_bdevs("virtio_scsi_disk"): +- if self.bdev.name in bdev.name: +- UIBdevObj(bdev, self) +- +- def summary(self): +- if "socket" in list(self.bdev.virtio.keys()): +- info = self.bdev.virtio["socket"] +- if "pci_address" in list(self.bdev.virtio.keys()): +- info = self.bdev.virtio["pci_address"] +- return info, True +- +- +-class UILvsObj(UINode): +- def __init__(self, lvs, parent): +- UINode.__init__(self, lvs.name, parent) +- self.lvs = lvs +- +- def ui_command_show_details(self): +- self.shell.log.info(json.dumps(vars(self.lvs), indent=2)) +- +- def summary(self): +- size = convert_bytes_to_human(self.lvs.total_data_clusters * self.lvs.cluster_size) +- free = convert_bytes_to_human(self.lvs.free_clusters * self.lvs.cluster_size) +- if not free: +- free = "0" +- size = "=".join(["Size", size]) +- free = "=".join(["Free", free]) +- info = ", ".join([str(size), str(free)]) +- return info, True +- +- +-class UIVhosts(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "vhost", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- self.get_root().list_vhost_ctrls() +- UIVhostBlk(self) +- UIVhostScsi(self) +- +- +-class UIVhost(UINode): +- def __init__(self, name, parent): +- UINode.__init__(self, name, parent) +- self.refresh() +- +- def ui_command_delete(self, name): +- """ +- Delete a Vhost controller from configuration. +- +- Arguments: +- name - Controller name. +- """ +- self.get_root().vhost_delete_controller(ctrlr=name) +- +- +-class UIVhostBlk(UIVhost): +- def __init__(self, parent): +- UIVhost.__init__(self, "block", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type=self.name): +- UIVhostBlkCtrlObj(ctrlr, self) +- +- def ui_command_create(self, name, bdev, cpumask=None, readonly=False): +- """ +- Create a Vhost BLK controller. +- +- Arguments: +- name - Controller name. +- bdev - Which bdev to attach to the controller. +- cpumask - Optional. Integer to specify mask of CPUs to use. +- Default: 1. +- readonly - Whether controller should be read only or not. +- Default: False. 
+- """ +- self.get_root().vhost_create_blk_controller(ctrlr=name, +- dev_name=bdev, +- cpumask=cpumask, +- readonly=bool(readonly)) +- +- +-class UIVhostScsi(UIVhost): +- def __init__(self, parent): +- UIVhost.__init__(self, "scsi", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type=self.name): +- UIVhostScsiCtrlObj(ctrlr, self) +- +- def ui_command_create(self, name, cpumask=None): +- """ +- Create a Vhost SCSI controller. +- +- Arguments: +- name - Controller name. +- cpumask - Optional. Integer to specify mask of CPUs to use. +- Default: 1. +- """ +- self.get_root().vhost_create_scsi_controller(ctrlr=name, +- cpumask=cpumask) +- +- +-class UIVhostCtrl(UINode): +- # Base class for SCSI and BLK controllers, do not instantiate +- def __init__(self, ctrlr, parent): +- self.ctrlr = ctrlr +- UINode.__init__(self, self.ctrlr.ctrlr, parent) +- self.refresh() +- +- def ui_command_show_details(self): +- self.shell.log.info(json.dumps(vars(self.ctrlr), indent=2)) +- +- def ui_command_set_coalescing(self, delay_base_us, iops_threshold): +- delay_base_us = self.ui_eval_param(delay_base_us, "number", None) +- iops_threshold = self.ui_eval_param(iops_threshold, "number", None) +- +- self.get_root().vhost_controller_set_coalescing(ctrlr=self.ctrlr.ctrlr, +- delay_base_us=delay_base_us, +- iops_threshold=iops_threshold) +- +- +-class UIVhostScsiCtrlObj(UIVhostCtrl): +- def refresh(self): +- self._children = set([]) +- for lun in self.ctrlr.backend_specific["scsi"]: +- UIVhostTargetObj(lun, self) +- +- def ui_command_remove_target(self, target_num): +- """ +- Remove target node from SCSI controller. +- +- Arguments: +- target_num - Integer identifier of target node to delete. +- """ +- self.get_root().vhost_scsi_controller_remove_target(ctrlr=self.ctrlr.ctrlr, +- scsi_target_num=int(target_num)) +- for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type="scsi"): +- if ctrlr.ctrlr == self.ctrlr.ctrlr: +- self.ctrlr = ctrlr +- +- def ui_command_add_lun(self, target_num, bdev_name): +- """ +- Add LUN to SCSI target node. +- Currently only one LUN (which is LUN ID 0) per target is supported. +- Adding LUN to not existing target node will create that node. +- +- Arguments: +- target_num - Integer identifier of target node to modify. +- bdev - Which bdev to add as LUN. +- """ +- self.get_root().vhost_scsi_controller_add_target(ctrlr=self.ctrlr.ctrlr, +- scsi_target_num=int(target_num), +- bdev_name=bdev_name) +- for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type="scsi"): +- if ctrlr.ctrlr == self.ctrlr.ctrlr: +- self.ctrlr = ctrlr +- +- def summary(self): +- info = self.ctrlr.socket +- return info, True +- +- +-class UIVhostBlkCtrlObj(UIVhostCtrl): +- def refresh(self): +- self._children = set([]) +- UIVhostLunDevObj(self.ctrlr.backend_specific["block"]["bdev"], self) +- +- def summary(self): +- ro = None +- if self.ctrlr.backend_specific["block"]["readonly"]: +- ro = "Readonly" +- info = ", ".join([_f for _f in [self.ctrlr.socket, ro] if _f]) +- return info, True +- +- +-class UIVhostTargetObj(UINode): +- def __init__(self, target, parent): +- self.target = target +- # Next line: configshell does not allow paths with spaces. 
+- UINode.__init__(self, target["target_name"].replace(" ", "_"), parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for target in self.target["luns"]: +- UIVhostLunDevObj(target["bdev_name"], self) +- +- def ui_command_show_details(self): +- self.shell.log.info(json.dumps(self.target, indent=2)) +- +- def summary(self): +- luns = "LUNs: %s" % len(self.target["luns"]) +- id = "TargetID: %s" % self.target["scsi_dev_num"] +- info = ",".join([luns, id]) +- return info, True +- +- +-class UIVhostLunDevObj(UINode): +- def __init__(self, name, parent): +- UINode.__init__(self, name, parent) +- +- +-class UIRaidBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "raid_volume", parent) +- +- def delete(self, name): +- self.get_root().bdev_raid_delete(name=name) +- +- def ui_command_create(self, name, raid_level, base_bdevs, strip_size_kb): +- """ +- Creates a raid bdev of the provided base_bdevs +- +- Arguments: +- name - raid bdev name +- raid_level - raid level, supported values 0 +- base_bdevs - base bdevs name, whitespace separated list in quotes +- strip_size_kb - strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc +- """ +- base_bdevs_array = [] +- for u in base_bdevs.strip().split(" "): +- base_bdevs_array.append(u) +- +- strip_size_kb = self.ui_eval_param(strip_size_kb, "number", None) +- +- ret_name = self.get_root().bdev_raid_create(name=name, +- raid_level=raid_level, +- base_bdevs=base_bdevs_array, +- strip_size_kb=strip_size_kb) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes this raid bdev object +- +- Arguments: +- name - raid bdev name +- """ +- self.delete(name) +- +- +-class UIUringBdev(UIBdev): +- def __init__(self, parent): +- UIBdev.__init__(self, "uring", parent) +- +- def delete(self, name): +- self.get_root().bdev_uring_delete(name=name) +- +- def ui_command_create(self, filename, name, block_size): +- """ +- Construct a uring bdev. +- +- Arguments: +- filename - Path to device or file. +- name - Name to use for bdev. +- block_size - Integer, block size to use when constructing bdev. +- """ +- +- block_size = self.ui_eval_param(block_size, "number", None) +- ret_name = self.get_root().bdev_uring_create(filename=filename, +- name=name, +- block_size=int(block_size)) +- self.shell.log.info(ret_name) +- +- def ui_command_delete(self, name): +- """ +- Deletes a uring bdev. +- +- Arguments: +- name - uring bdev name +- """ +- self.delete(name) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++from configshell_fb import ConfigNode, ExecutionError ++from uuid import UUID ++from ..rpc.client import JSONRPCException ++import json ++ ++ ++def convert_bytes_to_human(size): ++ if size == 0: ++ return "%3.1f%s" % (size, "bytes") ++ if not size: ++ return "" ++ for x in ["bytes", "K", "M", "G", "T"]: ++ if size < 1024.0: ++ return "%3.1f%s" % (size, x) ++ size /= 1024.0 ++ ++ ++class UINode(ConfigNode): ++ def __init__(self, name, parent=None, shell=None): ++ ConfigNode.__init__(self, name, parent, shell) ++ ++ def refresh(self): ++ for child in self.children: ++ child.refresh() ++ ++ def refresh_node(self): ++ self.refresh() ++ ++ def ui_command_refresh(self): ++ self.refresh() ++ ++ def ui_command_ll(self, path=None, depth=None): ++ """ ++ Alias for ls. 
++ """ ++ self.ui_command_ls(path, depth) ++ ++ def execute_command(self, command, pparams=[], kparams={}): ++ try: ++ result = ConfigNode.execute_command(self, command, ++ pparams, kparams) ++ except Exception as e: ++ raise e ++ else: ++ self.shell.log.debug("Command %s succeeded." % command) ++ return result ++ finally: ++ if self.shell.interactive and\ ++ command in ["create", "delete", "delete_all", "add_initiator", ++ "allow_any_host", "bdev_split_create", "add_lun", ++ "iscsi_target_node_add_pg_ig_maps", "remove_target", "add_secret", ++ "bdev_split_delete", "bdev_pmem_delete_pool", ++ "bdev_pmem_create_pool", "delete_secret_all", ++ "delete_initiator", "set_auth", "delete_secret", ++ "iscsi_target_node_remove_pg_ig_maps", "load_config", ++ "load_subsystem_config"]: ++ self.get_root().refresh() ++ self.refresh_node() ++ ++ ++class UIBdevs(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "bdevs", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ UIMallocBdev(self) ++ UIAIOBdev(self) ++ UILvolBdev(self) ++ UINvmeBdev(self) ++ UINullBdev(self) ++ UIErrorBdev(self) ++ UISplitBdev(self) ++ UIPmemBdev(self) ++ UIRbdBdev(self) ++ UIiSCSIBdev(self) ++ UIVirtioBlkBdev(self) ++ UIVirtioScsiBdev(self) ++ UIRaidBdev(self) ++ UIUringBdev(self) ++ ++ ++class UILvolStores(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "lvol_stores", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for lvs in self.get_root().bdev_lvol_get_lvstores(): ++ UILvsObj(lvs, self) ++ ++ def delete(self, name, uuid): ++ if name is None and uuid is None: ++ self.shell.log.error("Please specify one of the identifiers: " ++ "lvol store name or UUID") ++ self.get_root().bdev_lvol_delete_lvstore(lvs_name=name, uuid=uuid) ++ ++ def ui_command_create(self, name, bdev_name, cluster_size=None): ++ """ ++ Creates logical volume store on target bdev. ++ ++ Arguments: ++ name - Friendly name to use alongside with UUID identifier. ++ bdev_name - On which bdev to create the lvol store. ++ cluster_size - Cluster size to use when creating lvol store, in bytes. Default: 4194304. ++ """ ++ ++ cluster_size = self.ui_eval_param(cluster_size, "number", None) ++ self.get_root().bdev_lvol_create_lvstore(lvs_name=name, bdev_name=bdev_name, cluster_sz=cluster_size) ++ ++ def ui_command_delete(self, name=None, uuid=None): ++ """ ++ Deletes logical volume store from configuration. ++ This will also delete all logical volume bdevs created on this lvol store! ++ ++ Arguments: ++ name - Friendly name of the logical volume store to be deleted. ++ uuid - UUID number of the logical volume store to be deleted. 
++ """ ++ self.delete(name, uuid) ++ ++ def ui_command_delete_all(self): ++ rpc_messages = "" ++ for lvs in self._children: ++ try: ++ self.delete(None, lvs.lvs.uuid) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Lvol stores: %s" % len(self.children), None ++ ++ ++class UIBdev(UINode): ++ def __init__(self, name, parent): ++ UINode.__init__(self, name, parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for bdev in self.get_root().bdev_get_bdevs(self.name): ++ UIBdevObj(bdev, self) ++ ++ def ui_command_get_bdev_iostat(self, name=None): ++ ret = self.get_root().bdev_get_iostat(name=name) ++ self.shell.log.info(json.dumps(ret, indent=2)) ++ ++ def ui_command_delete_all(self): ++ """Delete all bdevs from this tree node.""" ++ rpc_messages = "" ++ for bdev in self._children: ++ try: ++ self.delete(bdev.name) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Bdevs: %d" % len(self.children), None ++ ++ ++class UIMallocBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "malloc", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_malloc_delete(name=name) ++ ++ def ui_command_create(self, size, block_size, name=None, uuid=None): ++ """ ++ Construct a Malloc bdev. ++ ++ Arguments: ++ size - Size in megabytes. ++ block_size - Integer, block size to use when constructing bdev. ++ name - Optional argument. Custom name to use for bdev. If not provided ++ then name will be "MallocX" where X is next available ID. ++ uuid - Optional parameter. Custom UUID to use. If empty then random ++ will be generated. ++ """ ++ ++ size = self.ui_eval_param(size, "number", None) ++ block_size = self.ui_eval_param(block_size, "number", None) ++ ret_name = self.get_root().create_malloc_bdev(num_blocks=size * 1024 * 1024 // block_size, ++ block_size=block_size, ++ name=name, uuid=uuid) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes malloc bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the malloc bdev to be deleted - UUID number or name alias. ++ """ ++ self.delete(name) ++ ++ ++class UIAIOBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "aio", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_aio_delete(name=name) ++ ++ def ui_command_create(self, name, filename, block_size): ++ """ ++ Construct an AIO bdev. ++ Backend file must exist before trying to create an AIO bdev. ++ ++ Arguments: ++ name - Optional argument. Custom name to use for bdev. If not provided ++ then name will be "MallocX" where X is next available ID. ++ filename - Path to AIO backend. ++ block_size - Integer, block size to use when constructing bdev. ++ """ ++ ++ block_size = self.ui_eval_param(block_size, "number", None) ++ ret_name = self.get_root().bdev_aio_create(name=name, ++ block_size=int(block_size), ++ filename=filename) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes aio bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the aio bdev to be deleted - UUID number or name alias. 
++ """ ++ self.delete(name) ++ ++ ++class UILvolBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "logical_volume", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_lvol_delete(name=name) ++ ++ def ui_command_create(self, name, size, lvs, thin_provision=None): ++ """ ++ Construct a Logical Volume bdev. ++ ++ Arguments: ++ name - Friendly name to use for creating logical volume bdev. ++ size - Size in megabytes. ++ lvs - Identifier of logical volume store on which the bdev should be ++ created. Can be either a friendly name or UUID. ++ thin_provision - Whether the bdev should be thick or thin provisioned. ++ Default is False, and created bdevs are thick-provisioned. ++ """ ++ uuid = None ++ lvs_name = None ++ try: ++ UUID(lvs) ++ uuid = lvs ++ except ValueError: ++ lvs_name = lvs ++ ++ size = self.ui_eval_param(size, "number", None) ++ size *= (1024 * 1024) ++ thin_provision = self.ui_eval_param(thin_provision, "bool", False) ++ ++ ret_uuid = self.get_root().create_lvol_bdev(lvol_name=name, size=size, ++ lvs_name=lvs_name, uuid=uuid, ++ thin_provision=thin_provision) ++ self.shell.log.info(ret_uuid) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes lvol bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the lvol bdev to be deleted - UUID number or name alias. ++ """ ++ self.delete(name) ++ ++ ++class UINvmeBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "nvme", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_nvme_detach_controller(name=name) ++ ++ def ui_command_create(self, name, trtype, traddr, ++ adrfam=None, trsvcid=None, subnqn=None): ++ if "rdma" in trtype and None in [adrfam, trsvcid, subnqn]: ++ self.shell.log.error("Using RDMA transport type." ++ "Please provide arguments for adrfam, trsvcid and subnqn.") ++ ret_name = self.get_root().create_nvme_bdev(name=name, trtype=trtype, ++ traddr=traddr, adrfam=adrfam, ++ trsvcid=trsvcid, subnqn=subnqn) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete_all(self): ++ rpc_messages = "" ++ ctrlrs = [x.name for x in self._children] ++ ctrlrs = [x.rsplit("n", 1)[0] for x in ctrlrs] ++ ctrlrs = set(ctrlrs) ++ for ctrlr in ctrlrs: ++ try: ++ self.delete(ctrlr) ++ except JSONRPCException as e: ++ rpc_messages += e.messages ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes NVMe controller from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the NVMe controller to be deleted. ++ """ ++ self.delete(name) ++ ++ ++class UINullBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "null", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_null_delete(name=name) ++ ++ def ui_command_create(self, name, size, block_size, uuid=None): ++ """ ++ Construct a Null bdev. ++ ++ Arguments: ++ name - Name to use for bdev. ++ size - Size in megabytes. ++ block_size - Integer, block size to use when constructing bdev. ++ uuid - Optional parameter. Custom UUID to use. If empty then random ++ will be generated. ++ """ ++ ++ size = self.ui_eval_param(size, "number", None) ++ block_size = self.ui_eval_param(block_size, "number", None) ++ num_blocks = size * 1024 * 1024 // block_size ++ ret_name = self.get_root().bdev_null_create(num_blocks=num_blocks, ++ block_size=block_size, ++ name=name, uuid=uuid) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes null bdev from configuration. 
++ ++ Arguments: ++ name - Is a unique identifier of the null bdev to be deleted - UUID number or name alias. ++ """ ++ self.delete(name) ++ ++ ++class UIErrorBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "error", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_error_delete(name=name) ++ ++ def ui_command_create(self, base_name): ++ """ ++ Construct a error injection bdev. ++ ++ Arguments: ++ base_name - base bdev name on top of which error bdev will be created. ++ """ ++ ++ self.get_root().create_error_bdev(base_name=base_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes error bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the error bdev to be deleted - UUID number or name alias. ++ """ ++ self.delete(name) ++ ++ ++class UISplitBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "split_disk", parent) ++ ++ def delete(self, name): ++ pass ++ ++ def ui_command_bdev_split_create(self, base_bdev, split_count, split_size_mb=None): ++ """ ++ Create split block devices from a base bdev. ++ ++ Arguments: ++ base_bdev - Name of bdev to split ++ split_count - Number of split bdevs to create ++ split_size_mb- Size of each split volume in MiB (optional) ++ """ ++ ++ split_count = self.ui_eval_param(split_count, "number", None) ++ split_size_mb = self.ui_eval_param(split_size_mb, "number", None) ++ ++ ret_name = self.get_root().bdev_split_create(base_bdev=base_bdev, ++ split_count=split_count, ++ split_size_mb=split_size_mb) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_bdev_split_delete(self, base_bdev): ++ """Delete split block devices associated with base bdev. ++ ++ Args: ++ base_bdev: name of previously split bdev ++ """ ++ ++ self.get_root().bdev_split_delete(base_bdev=base_bdev) ++ ++ ++class UIPmemBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "pmemblk", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_pmem_delete(name=name) ++ ++ def ui_command_bdev_pmem_create_pool(self, pmem_file, total_size, block_size): ++ total_size = self.ui_eval_param(total_size, "number", None) ++ block_size = self.ui_eval_param(block_size, "number", None) ++ num_blocks = int((total_size * 1024 * 1024) / block_size) ++ ++ self.get_root().bdev_pmem_create_pool(pmem_file=pmem_file, ++ num_blocks=num_blocks, ++ block_size=block_size) ++ ++ def ui_command_bdev_pmem_delete_pool(self, pmem_file): ++ self.get_root().bdev_pmem_delete_pool(pmem_file=pmem_file) ++ ++ def ui_command_bdev_pmem_get_pool_info(self, pmem_file): ++ ret = self.get_root().bdev_pmem_get_pool_info(pmem_file=pmem_file) ++ self.shell.log.info(json.dumps(ret, indent=2)) ++ ++ def ui_command_create(self, pmem_file, name): ++ ret_name = self.get_root().bdev_pmem_create(pmem_file=pmem_file, ++ name=name) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes pmem bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the pmem bdev to be deleted - UUID number or name alias. 
++ """ ++ self.delete(name) ++ ++ ++class UIRbdBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "rbd", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_rbd_delete(name=name) ++ ++ def ui_command_create(self, pool_name, rbd_name, block_size, name=None): ++ block_size = self.ui_eval_param(block_size, "number", None) ++ ++ ret_name = self.get_root().create_rbd_bdev(pool_name=pool_name, ++ rbd_name=rbd_name, ++ block_size=block_size, ++ name=name) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes rbd bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the rbd bdev to be deleted - UUID number or name alias. ++ """ ++ self.delete(name) ++ ++ ++class UIiSCSIBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "iscsi", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_iscsi_delete(name=name) ++ ++ def ui_command_create(self, name, url, initiator_iqn): ++ """ ++ Create iSCSI bdev in configuration by connecting to remote ++ iSCSI target. ++ ++ Arguments: ++ name - name to be used as an ID for created iSCSI bdev. ++ url - iscsi url pointing to LUN on remote iSCSI target. ++ Example: iscsi://127.0.0.1:3260/iqn.2018-06.org.spdk/0. ++ initiator_iqn - IQN to use for initiating connection with the target. ++ """ ++ ret_name = self.get_root().create_iscsi_bdev(name=name, ++ url=url, ++ initiator_iqn=initiator_iqn) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes iSCSI bdev from configuration. ++ ++ Arguments: ++ name - name of the iscsi bdev to be deleted. ++ """ ++ self.delete(name) ++ ++ ++class UIVirtioBlkBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "virtioblk_disk", parent) ++ ++ def ui_command_create(self, name, trtype, traddr, ++ vq_count=None, vq_size=None): ++ ++ vq_count = self.ui_eval_param(vq_count, "number", None) ++ vq_size = self.ui_eval_param(vq_size, "number", None) ++ ++ ret = self.get_root().create_virtio_dev(name=name, ++ trtype=trtype, ++ traddr=traddr, ++ dev_type="blk", ++ vq_count=vq_count, ++ vq_size=vq_size) ++ ++ self.shell.log.info(ret) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes virtio scsi bdev from configuration. ++ ++ Arguments: ++ name - Is a unique identifier of the virtio scsi bdev to be deleted - UUID number or name alias. ++ """ ++ self.get_root().bdev_virtio_detach_controller(name=name) ++ ++ ++class UIVirtioScsiBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "virtioscsi_disk", parent) ++ ++ def refresh(self): ++ self._children = set([]) ++ for bdev in self.get_root().bdev_virtio_scsi_get_devices(): ++ UIVirtioScsiBdevObj(bdev, self) ++ ++ def ui_command_create(self, name, trtype, traddr, ++ vq_count=None, vq_size=None): ++ ++ vq_count = self.ui_eval_param(vq_count, "number", None) ++ vq_size = self.ui_eval_param(vq_size, "number", None) ++ ++ ret = self.get_root().create_virtio_dev(name=name, ++ trtype=trtype, ++ traddr=traddr, ++ dev_type="scsi", ++ vq_count=vq_count, ++ vq_size=vq_size) ++ ++ self.shell.log.info(ret) ++ ++ def ui_command_delete(self, name): ++ self.get_root().bdev_virtio_detach_controller(name=name) ++ ++ ++class UIBdevObj(UINode): ++ def __init__(self, bdev, parent): ++ self.bdev = bdev ++ # Using bdev name also for lvol bdevs, which results in displaying ++ # UUID instead of alias. This is because alias naming convention ++ # (lvol_store_name/lvol_bdev_name) conflicts with configshell paths ++ # ("/" as separator). 
++ # Solution: show lvol alias in "summary field" for now. ++ # TODO: Possible next steps: ++ # - Either change default separator in tree for smth else ++ # - or add a UI command which would be able to autocomplete ++ # "cd" command based on objects alias and match is to the ++ # "main" bdev name. ++ UINode.__init__(self, self.bdev.name, parent) ++ ++ def ui_command_show_details(self): ++ self.shell.log.info(json.dumps(vars(self.bdev), indent=2)) ++ ++ def summary(self): ++ size = convert_bytes_to_human(self.bdev.block_size * self.bdev.num_blocks) ++ size = "=".join(["Size", size]) ++ ++ in_use = "Not claimed" ++ if bool(self.bdev.claimed): ++ in_use = "Claimed" ++ ++ alias = None ++ if self.bdev.aliases: ++ alias = self.bdev.aliases[0] ++ ++ info = ", ".join([_f for _f in [alias, size, in_use] if _f]) ++ return info, True ++ ++ ++class UIVirtioScsiBdevObj(UIBdevObj): ++ def __init__(self, bdev, parent): ++ UIBdevObj.__init__(self, bdev, parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for bdev in self.get_root().bdev_get_bdevs("virtio_scsi_disk"): ++ if self.bdev.name in bdev.name: ++ UIBdevObj(bdev, self) ++ ++ def summary(self): ++ if "socket" in list(self.bdev.virtio.keys()): ++ info = self.bdev.virtio["socket"] ++ if "pci_address" in list(self.bdev.virtio.keys()): ++ info = self.bdev.virtio["pci_address"] ++ return info, True ++ ++ ++class UILvsObj(UINode): ++ def __init__(self, lvs, parent): ++ UINode.__init__(self, lvs.name, parent) ++ self.lvs = lvs ++ ++ def ui_command_show_details(self): ++ self.shell.log.info(json.dumps(vars(self.lvs), indent=2)) ++ ++ def summary(self): ++ size = convert_bytes_to_human(self.lvs.total_data_clusters * self.lvs.cluster_size) ++ free = convert_bytes_to_human(self.lvs.free_clusters * self.lvs.cluster_size) ++ if not free: ++ free = "0" ++ size = "=".join(["Size", size]) ++ free = "=".join(["Free", free]) ++ info = ", ".join([str(size), str(free)]) ++ return info, True ++ ++ ++class UIVhosts(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "vhost", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ self.get_root().list_vhost_ctrls() ++ UIVhostBlk(self) ++ UIVhostScsi(self) ++ ++ ++class UIVhost(UINode): ++ def __init__(self, name, parent): ++ UINode.__init__(self, name, parent) ++ self.refresh() ++ ++ def ui_command_delete(self, name): ++ """ ++ Delete a Vhost controller from configuration. ++ ++ Arguments: ++ name - Controller name. ++ """ ++ self.get_root().vhost_delete_controller(ctrlr=name) ++ ++ ++class UIVhostBlk(UIVhost): ++ def __init__(self, parent): ++ UIVhost.__init__(self, "block", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type=self.name): ++ UIVhostBlkCtrlObj(ctrlr, self) ++ ++ def ui_command_create(self, name, bdev, cpumask=None, readonly=False): ++ """ ++ Create a Vhost BLK controller. ++ ++ Arguments: ++ name - Controller name. ++ bdev - Which bdev to attach to the controller. ++ cpumask - Optional. Integer to specify mask of CPUs to use. ++ Default: 1. ++ readonly - Whether controller should be read only or not. ++ Default: False. 
++ """ ++ self.get_root().vhost_create_blk_controller(ctrlr=name, ++ dev_name=bdev, ++ cpumask=cpumask, ++ readonly=bool(readonly)) ++ ++ ++class UIVhostScsi(UIVhost): ++ def __init__(self, parent): ++ UIVhost.__init__(self, "scsi", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type=self.name): ++ UIVhostScsiCtrlObj(ctrlr, self) ++ ++ def ui_command_create(self, name, cpumask=None): ++ """ ++ Create a Vhost SCSI controller. ++ ++ Arguments: ++ name - Controller name. ++ cpumask - Optional. Integer to specify mask of CPUs to use. ++ Default: 1. ++ """ ++ self.get_root().vhost_create_scsi_controller(ctrlr=name, ++ cpumask=cpumask) ++ ++ ++class UIVhostCtrl(UINode): ++ # Base class for SCSI and BLK controllers, do not instantiate ++ def __init__(self, ctrlr, parent): ++ self.ctrlr = ctrlr ++ UINode.__init__(self, self.ctrlr.ctrlr, parent) ++ self.refresh() ++ ++ def ui_command_show_details(self): ++ self.shell.log.info(json.dumps(vars(self.ctrlr), indent=2)) ++ ++ def ui_command_set_coalescing(self, delay_base_us, iops_threshold): ++ delay_base_us = self.ui_eval_param(delay_base_us, "number", None) ++ iops_threshold = self.ui_eval_param(iops_threshold, "number", None) ++ ++ self.get_root().vhost_controller_set_coalescing(ctrlr=self.ctrlr.ctrlr, ++ delay_base_us=delay_base_us, ++ iops_threshold=iops_threshold) ++ ++ ++class UIVhostScsiCtrlObj(UIVhostCtrl): ++ def refresh(self): ++ self._children = set([]) ++ for lun in self.ctrlr.backend_specific["scsi"]: ++ UIVhostTargetObj(lun, self) ++ ++ def ui_command_remove_target(self, target_num): ++ """ ++ Remove target node from SCSI controller. ++ ++ Arguments: ++ target_num - Integer identifier of target node to delete. ++ """ ++ self.get_root().vhost_scsi_controller_remove_target(ctrlr=self.ctrlr.ctrlr, ++ scsi_target_num=int(target_num)) ++ for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type="scsi"): ++ if ctrlr.ctrlr == self.ctrlr.ctrlr: ++ self.ctrlr = ctrlr ++ ++ def ui_command_add_lun(self, target_num, bdev_name): ++ """ ++ Add LUN to SCSI target node. ++ Currently only one LUN (which is LUN ID 0) per target is supported. ++ Adding LUN to not existing target node will create that node. ++ ++ Arguments: ++ target_num - Integer identifier of target node to modify. ++ bdev - Which bdev to add as LUN. ++ """ ++ self.get_root().vhost_scsi_controller_add_target(ctrlr=self.ctrlr.ctrlr, ++ scsi_target_num=int(target_num), ++ bdev_name=bdev_name) ++ for ctrlr in self.get_root().vhost_get_controllers(ctrlr_type="scsi"): ++ if ctrlr.ctrlr == self.ctrlr.ctrlr: ++ self.ctrlr = ctrlr ++ ++ def summary(self): ++ info = self.ctrlr.socket ++ return info, True ++ ++ ++class UIVhostBlkCtrlObj(UIVhostCtrl): ++ def refresh(self): ++ self._children = set([]) ++ UIVhostLunDevObj(self.ctrlr.backend_specific["block"]["bdev"], self) ++ ++ def summary(self): ++ ro = None ++ if self.ctrlr.backend_specific["block"]["readonly"]: ++ ro = "Readonly" ++ info = ", ".join([_f for _f in [self.ctrlr.socket, ro] if _f]) ++ return info, True ++ ++ ++class UIVhostTargetObj(UINode): ++ def __init__(self, target, parent): ++ self.target = target ++ # Next line: configshell does not allow paths with spaces. 
++ UINode.__init__(self, target["target_name"].replace(" ", "_"), parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for target in self.target["luns"]: ++ UIVhostLunDevObj(target["bdev_name"], self) ++ ++ def ui_command_show_details(self): ++ self.shell.log.info(json.dumps(self.target, indent=2)) ++ ++ def summary(self): ++ luns = "LUNs: %s" % len(self.target["luns"]) ++ id = "TargetID: %s" % self.target["scsi_dev_num"] ++ info = ",".join([luns, id]) ++ return info, True ++ ++ ++class UIVhostLunDevObj(UINode): ++ def __init__(self, name, parent): ++ UINode.__init__(self, name, parent) ++ ++ ++class UIRaidBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "raid_volume", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_raid_delete(name=name) ++ ++ def ui_command_create(self, name, raid_level, base_bdevs, strip_size_kb): ++ """ ++ Creates a raid bdev of the provided base_bdevs ++ ++ Arguments: ++ name - raid bdev name ++ raid_level - raid level, supported values 0 ++ base_bdevs - base bdevs name, whitespace separated list in quotes ++ strip_size_kb - strip size of raid bdev in KB, supported values like 8, 16, 32, 64, 128, 256, etc ++ """ ++ base_bdevs_array = [] ++ for u in base_bdevs.strip().split(" "): ++ base_bdevs_array.append(u) ++ ++ strip_size_kb = self.ui_eval_param(strip_size_kb, "number", None) ++ ++ ret_name = self.get_root().bdev_raid_create(name=name, ++ raid_level=raid_level, ++ base_bdevs=base_bdevs_array, ++ strip_size_kb=strip_size_kb) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes this raid bdev object ++ ++ Arguments: ++ name - raid bdev name ++ """ ++ self.delete(name) ++ ++ ++class UIUringBdev(UIBdev): ++ def __init__(self, parent): ++ UIBdev.__init__(self, "uring", parent) ++ ++ def delete(self, name): ++ self.get_root().bdev_uring_delete(name=name) ++ ++ def ui_command_create(self, filename, name, block_size): ++ """ ++ Construct a uring bdev. ++ ++ Arguments: ++ filename - Path to device or file. ++ name - Name to use for bdev. ++ block_size - Integer, block size to use when constructing bdev. ++ """ ++ ++ block_size = self.ui_eval_param(block_size, "number", None) ++ ret_name = self.get_root().bdev_uring_create(filename=filename, ++ name=name, ++ block_size=int(block_size)) ++ self.shell.log.info(ret_name) ++ ++ def ui_command_delete(self, name): ++ """ ++ Deletes a uring bdev. ++ ++ Arguments: ++ name - uring bdev name ++ """ ++ self.delete(name) +diff --git a/python/spdk/spdkcli/ui_node_iscsi.py b/python/spdk/spdkcli/ui_node_iscsi.py +index 9bedadd..2011887 100644 +--- a/python/spdk/spdkcli/ui_node_iscsi.py ++++ b/python/spdk/spdkcli/ui_node_iscsi.py +@@ -1,643 +1,643 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. 
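# A minimal standalone sketch (not part of the patch) of how
# UIRaidBdev.ui_command_create above tokenizes its base_bdevs argument:
# the command takes the bdev names as one whitespace-separated string in quotes.
# The bdev names used below are illustrative only.
def parse_base_bdevs(base_bdevs):
    # Same strip()/split(" ") sequence as the patched ui_command_create.
    base_bdevs_array = []
    for u in base_bdevs.strip().split(" "):
        base_bdevs_array.append(u)
    return base_bdevs_array

if __name__ == "__main__":
    print(parse_base_bdevs("Malloc0 Malloc1 Malloc2"))
    # ['Malloc0', 'Malloc1', 'Malloc2']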
+- +-from configshell_fb import ExecutionError +-from ..rpc.client import JSONRPCException +-from .ui_node import UINode +- +- +-class UIISCSI(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "iscsi", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- UIISCSIDevices(self) +- UIPortalGroups(self) +- UIInitiatorGroups(self) +- UIISCSIConnections(self) +- UIISCSIAuthGroups(self) +- UIISCSIGlobalParams(self) +- +- +-class UIISCSIGlobalParams(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "global_params", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- iscsi_global_params = self.get_root().iscsi_get_options() +- if not iscsi_global_params: +- return +- for param, val in iscsi_global_params.items(): +- UIISCSIGlobalParam("%s: %s" % (param, val), self) +- +- def ui_command_set_auth(self, g=None, d=None, r=None, m=None): +- """Set CHAP authentication for discovery service. +- +- Optional arguments: +- g = chap_group: Authentication group ID for discovery session +- d = disable_chap: CHAP for discovery session should be disabled +- r = require_chap: CHAP for discovery session should be required +- m = mutual_chap: CHAP for discovery session should be mutual +- """ +- chap_group = self.ui_eval_param(g, "number", None) +- disable_chap = self.ui_eval_param(d, "bool", None) +- require_chap = self.ui_eval_param(r, "bool", None) +- mutual_chap = self.ui_eval_param(m, "bool", None) +- self.get_root().iscsi_set_discovery_auth( +- chap_group=chap_group, disable_chap=disable_chap, +- require_chap=require_chap, mutual_chap=mutual_chap) +- +- +-class UIISCSIGlobalParam(UINode): +- def __init__(self, param, parent): +- UINode.__init__(self, param, parent) +- +- +-class UIISCSIDevices(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "target_nodes", parent) +- self.scsi_devices = list() +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- self.target_nodes = list(self.get_root().iscsi_get_target_nodes()) +- self.scsi_devices = list(self.get_root().scsi_get_devices()) +- for device in self.scsi_devices: +- for node in self.target_nodes: +- if hasattr(device, "device_name") and node['name'] \ +- == device.device_name: +- UIISCSIDevice(device, node, self) +- +- def delete(self, name): +- self.get_root().iscsi_delete_target_node(target_node_name=name) +- +- def ui_command_create(self, name, alias_name, bdev_name_id_pairs, +- pg_ig_mappings, queue_depth, g=None, d=None, r=None, +- m=None, h=None, t=None): +- """Create target node +- +- Positional args: +- name: Target node name (ASCII) +- alias_name: Target node alias name (ASCII) +- bdev_name_id_pairs: List of bdev_name_id_pairs +- pg_ig_mappings: List of pg_ig_mappings +- queue_depth: Desired target queue depth +- Optional args: +- g = chap_group: Authentication group ID for this target node +- d = disable_chap: CHAP authentication should be disabled for this target node +- r = require_chap: CHAP authentication should be required for this target node +- m = mutual_chap: CHAP authentication should be mutual/bidirectional +- h = header_digest: Header Digest should be required for this target node +- t = data_digest: Data Digest should be required for this target node +- """ +- luns = [] +- print("bdev_name_id_pairs: %s" % bdev_name_id_pairs) +- print("pg_ig_mappings: %s" % pg_ig_mappings) +- for u in bdev_name_id_pairs.strip().split(" "): +- bdev_name, lun_id = u.split(":") +- luns.append({"bdev_name": bdev_name, "lun_id": 
int(lun_id)}) +- pg_ig_maps = [] +- for u in pg_ig_mappings.strip().split(" "): +- pg, ig = u.split(":") +- pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) +- queue_depth = self.ui_eval_param(queue_depth, "number", None) +- chap_group = self.ui_eval_param(g, "number", None) +- disable_chap = self.ui_eval_param(d, "bool", None) +- require_chap = self.ui_eval_param(r, "bool", None) +- mutual_chap = self.ui_eval_param(m, "bool", None) +- header_digest = self.ui_eval_param(h, "bool", None) +- data_digest = self.ui_eval_param(t, "bool", None) +- self.get_root().iscsi_create_target_node( +- name=name, alias_name=alias_name, luns=luns, +- pg_ig_maps=pg_ig_maps, queue_depth=queue_depth, +- chap_group=chap_group, disable_chap=disable_chap, +- require_chap=require_chap, mutual_chap=mutual_chap, +- header_digest=header_digest, data_digest=data_digest) +- +- def ui_command_delete(self, name=None): +- """Delete a target node. If name is not specified delete all target nodes. +- +- Arguments: +- name - Target node name. +- """ +- self.delete(name) +- +- def ui_command_delete_all(self): +- """Delete all target nodes""" +- rpc_messages = "" +- for device in self.scsi_devices: +- try: +- self.delete(device.device_name) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def ui_command_add_lun(self, name, bdev_name, lun_id=None): +- """Add lun to the target node. +- +- Required args: +- name: Target node name (ASCII) +- bdev_name: bdev name +- Positional args: +- lun_id: LUN ID (integer >= 0) +- """ +- if lun_id: +- lun_id = self.ui_eval_param(lun_id, "number", None) +- self.get_root().iscsi_target_node_add_lun( +- name=name, bdev_name=bdev_name, lun_id=lun_id) +- +- def summary(self): +- count = 0 +- for device in self.scsi_devices: +- for node in self.target_nodes: +- if hasattr(device, "device_name") and node['name'] \ +- == device.device_name: +- count = count + 1 +- return "Target nodes: %d" % count, None +- +- +-class UIISCSIDevice(UINode): +- def __init__(self, device, target, parent): +- UINode.__init__(self, device.device_name, parent) +- self.device = device +- self.target = target +- self.refresh() +- +- def ui_command_set_auth(self, g=None, d=None, r=None, m=None): +- """Set CHAP authentication for the target node. +- +- Optionals args: +- g = chap_group: Authentication group ID for this target node +- d = disable_chap: CHAP authentication should be disabled for this target node +- r = require_chap: CHAP authentication should be required for this target node +- m = mutual_chap: CHAP authentication should be mutual/bidirectional +- """ +- chap_group = self.ui_eval_param(g, "number", None) +- disable_chap = self.ui_eval_param(d, "bool", None) +- require_chap = self.ui_eval_param(r, "bool", None) +- mutual_chap = self.ui_eval_param(m, "bool", None) +- self.get_root().iscsi_target_node_set_auth( +- name=self.device.device_name, chap_group=chap_group, +- disable_chap=disable_chap, +- require_chap=require_chap, mutual_chap=mutual_chap) +- +- def ui_command_iscsi_target_node_add_pg_ig_maps(self, pg_ig_mappings): +- """Add PG-IG maps to the target node. +- +- Args: +- pg_ig_maps: List of pg_ig_mappings, e.g. 
pg_tag:ig_tag pg_tag2:ig_tag2 +- """ +- pg_ig_maps = [] +- for u in pg_ig_mappings.strip().split(" "): +- pg, ig = u.split(":") +- pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) +- self.get_root().iscsi_target_node_add_pg_ig_maps( +- pg_ig_maps=pg_ig_maps, name=self.device.device_name) +- +- def ui_command_iscsi_target_node_remove_pg_ig_maps(self, pg_ig_mappings): +- """Remove PG-IG maps from the target node. +- +- Args: +- pg_ig_maps: List of pg_ig_mappings, e.g. pg_tag:ig_tag pg_tag2:ig_tag2 +- """ +- pg_ig_maps = [] +- for u in pg_ig_mappings.strip().split(" "): +- pg, ig = u.split(":") +- pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) +- self.get_root().iscsi_target_node_remove_pg_ig_maps( +- pg_ig_maps=pg_ig_maps, name=self.device.device_name) +- +- def refresh(self): +- self._children = set([]) +- UIISCSILuns(self.target['luns'], self) +- UIISCSIPgIgMaps(self.target['pg_ig_maps'], self) +- auths = {"disable_chap": self.target["disable_chap"], +- "require_chap": self.target["require_chap"], +- "mutual_chap": self.target["mutual_chap"], +- "chap_group": self.target["chap_group"], +- "data_digest": self.target["data_digest"]} +- UIISCSIAuth(auths, self) +- +- def summary(self): +- return "Id: %s, QueueDepth: %s" % (self.device.id, +- self.target['queue_depth']), None +- +- +-class UIISCSIAuth(UINode): +- def __init__(self, auths, parent): +- UINode.__init__(self, "auths", parent) +- self.auths = auths +- self.refresh() +- +- def summary(self): +- return "disable_chap: %s, require_chap: %s, mutual_chap: %s, chap_group: %s" % ( +- self.auths['disable_chap'], self.auths['require_chap'], +- self.auths['mutual_chap'], self.auths['chap_group']), None +- +- +-class UIISCSILuns(UINode): +- def __init__(self, luns, parent): +- UINode.__init__(self, "luns", parent) +- self.luns = luns +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for lun in self.luns: +- UIISCSILun(lun, self) +- +- def summary(self): +- return "Luns: %d" % len(self.luns), None +- +- +-class UIISCSILun(UINode): +- def __init__(self, lun, parent): +- UINode.__init__(self, "lun %s" % lun['lun_id'], parent) +- self.lun = lun +- self.refresh() +- +- def summary(self): +- return "%s" % self.lun['bdev_name'], None +- +- +-class UIISCSIPgIgMaps(UINode): +- def __init__(self, pg_ig_maps, parent): +- UINode.__init__(self, "pg_ig_maps", parent) +- self.pg_ig_maps = pg_ig_maps +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for pg_ig in self.pg_ig_maps: +- UIISCSIPgIg(pg_ig, self) +- +- def summary(self): +- return "Pg_ig_maps: %d" % len(self.pg_ig_maps), None +- +- +-class UIISCSIPgIg(UINode): +- def __init__(self, pg_ig, parent): +- UINode.__init__(self, "portal_group%s - initiator_group%s" % +- (pg_ig['pg_tag'], pg_ig['ig_tag']), parent) +- self.pg_ig = pg_ig +- self.refresh() +- +- +-class UIPortalGroups(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "portal_groups", parent) +- self.refresh() +- +- def delete(self, tag): +- self.get_root().iscsi_delete_portal_group(tag=tag) +- +- def ui_command_create(self, tag, portal_list): +- """Add a portal group. +- +- Args: +- portals: List of portals e.g. ip:port ip2:port2 +- tag: Portal group tag (unique, integer > 0) +- """ +- portals = [] +- for portal in portal_list.strip().split(" "): +- host = portal +- cpumask = None +- if "@" in portal: +- host, cpumask = portal.split("@") +- if ":" not in host: +- raise ExecutionError("Incorrect format of portal group. Port is missing." 
+- "Use 'help create' to see the command syntax.") +- host, port = host.rsplit(":", -1) +- portals.append({'host': host, 'port': port}) +- if cpumask: +- print("WARNING: Specifying a CPU mask for portal groups is no longer supported. Ignoring.") +- tag = self.ui_eval_param(tag, "number", None) +- self.get_root().construct_portal_group(tag=tag, portals=portals, private=None, wait=None) +- +- def ui_command_delete(self, tag): +- """Delete a portal group with given tag (unique, integer > 0))""" +- tag = self.ui_eval_param(tag, "number", None) +- self.delete(tag) +- +- def ui_command_delete_all(self): +- """Delete all portal groups""" +- rpc_messages = "" +- for pg in self.pgs: +- try: +- self.delete(pg.tag) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def refresh(self): +- self._children = set([]) +- self.pgs = list(self.get_root().iscsi_get_portal_groups()) +- for pg in self.pgs: +- try: +- UIPortalGroup(pg, self) +- except JSONRPCException as e: +- self.shell.log.error(e.message) +- +- def summary(self): +- return "Portal groups: %d" % len(self.pgs), None +- +- +-class UIPortalGroup(UINode): +- def __init__(self, pg, parent): +- UINode.__init__(self, "portal_group%s" % pg.tag, parent) +- self.pg = pg +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for portal in self.pg.portals: +- UIPortal(portal['host'], portal['port'], self) +- +- def summary(self): +- return "Portals: %d" % len(self.pg.portals), None +- +- +-class UIPortal(UINode): +- def __init__(self, host, port, parent): +- UINode.__init__(self, "host=%s, port=%s" % ( +- host, port), parent) +- self.refresh() +- +- +-class UIInitiatorGroups(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "initiator_groups", parent) +- self.refresh() +- +- def delete(self, tag): +- self.get_root().iscsi_delete_initiator_group(tag=tag) +- +- def ui_command_create(self, tag, initiator_list, netmask_list): +- """Add an initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses +- separated with whitespaces, e.g. 127.0.0.1 192.168.200.100 +- netmasks: List of initiator netmasks separated with whitespaces, +- e.g. 255.255.0.0 255.248.0.0 +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.get_root().construct_initiator_group( +- tag=tag, initiators=initiator_list.split(" "), +- netmasks=netmask_list.split(" ")) +- +- def ui_command_delete(self, tag): +- """Delete an initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.delete(tag) +- +- def ui_command_delete_all(self): +- """Delete all initiator groups""" +- rpc_messages = "" +- for ig in self.igs: +- try: +- self.delete(ig.tag) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def ui_command_add_initiator(self, tag, initiators, netmasks): +- """Add initiators to an existing initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses, +- e.g. 127.0.0.1 192.168.200.100 +- netmasks: List of initiator netmasks, +- e.g. 
255.255.0.0 255.248.0.0 +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.get_root().iscsi_initiator_group_add_initiators( +- tag=tag, initiators=initiators.split(" "), +- netmasks=netmasks.split(" ")) +- +- def ui_command_delete_initiator(self, tag, initiators=None, netmasks=None): +- """Delete initiators from an existing initiator group. +- +- Args: +- tag: Initiator group tag (unique, integer > 0) +- initiators: List of initiator hostnames or IP addresses, e.g. 127.0.0.1 192.168.200.100 +- netmasks: List of initiator netmasks, e.g. 255.255.0.0 255.248.0.0 +- """ +- tag = self.ui_eval_param(tag, "number", None) +- if initiators: +- initiators = initiators.split(" ") +- if netmasks: +- netmasks = netmasks.split(" ") +- self.get_root().iscsi_initiator_group_remove_initiators( +- tag=tag, initiators=initiators, +- netmasks=netmasks) +- +- def refresh(self): +- self._children = set([]) +- self.igs = list(self.get_root().iscsi_get_initiator_groups()) +- for ig in self.igs: +- UIInitiatorGroup(ig, self) +- +- def summary(self): +- return "Initiator groups: %d" % len(self.igs), None +- +- +-class UIInitiatorGroup(UINode): +- def __init__(self, ig, parent): +- UINode.__init__(self, "initiator_group%s" % ig.tag, parent) +- self.ig = ig +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for initiator, netmask in zip(self.ig.initiators, self.ig.netmasks): +- UIInitiator(initiator, netmask, self) +- +- def summary(self): +- return "Initiators: %d" % len(self.ig.initiators), None +- +- +-class UIInitiator(UINode): +- def __init__(self, initiator, netmask, parent): +- UINode.__init__(self, "hostname=%s, netmask=%s" % (initiator, netmask), parent) +- self.refresh() +- +- +-class UIISCSIConnections(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "iscsi_connections", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- self.iscsicons = list(self.get_root().iscsi_get_connections()) +- for ic in self.iscsicons: +- UIISCSIConnection(ic, self) +- +- def summary(self): +- return "Connections: %d" % len(self.iscsicons), None +- +- +-class UIISCSIConnection(UINode): +- def __init__(self, ic, parent): +- UINode.__init__(self, "%s" % ic['id'], parent) +- self.ic = ic +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for key, val in self.ic.items(): +- if key == "id": +- continue +- UIISCSIConnectionDetails("%s: %s" % (key, val), self) +- +- +-class UIISCSIConnectionDetails(UINode): +- def __init__(self, info, parent): +- UINode.__init__(self, "%s" % info, parent) +- self.refresh() +- +- +-class UIISCSIAuthGroups(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "auth_groups", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- self.iscsi_auth_groups = list(self.get_root().iscsi_get_auth_groups()) +- if self.iscsi_auth_groups is None: +- self.iscsi_auth_groups = [] +- for ag in self.iscsi_auth_groups: +- UIISCSIAuthGroup(ag, self) +- +- def delete(self, tag): +- self.get_root().iscsi_delete_auth_group(tag=tag) +- +- def delete_secret(self, tag, user): +- self.get_root().iscsi_auth_group_remove_secret( +- tag=tag, user=user) +- +- def ui_command_create(self, tag, secrets=None): +- """Add authentication group for CHAP authentication. +- +- Args: +- tag: Authentication group tag (unique, integer > 0). +- Optional args: +- secrets: Array of secrets objects separated by comma sign, +- e.g. 
user:test secret:test muser:mutual_test msecret:mutual_test +- """ +- tag = self.ui_eval_param(tag, "number", None) +- if secrets: +- secrets = [dict(u.split(":") for u in a.split(" ")) +- for a in secrets.split(",")] +- self.get_root().iscsi_create_auth_group(tag=tag, secrets=secrets) +- +- def ui_command_delete(self, tag): +- """Delete an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.delete(tag) +- +- def ui_command_delete_all(self): +- """Delete all authentication groups.""" +- rpc_messages = "" +- for iscsi_auth_group in self.iscsi_auth_groups: +- try: +- self.delete(iscsi_auth_group['tag']) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def ui_command_add_secret(self, tag, user, secret, +- muser=None, msecret=None): +- """Add a secret to an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- user: User name for one-way CHAP authentication +- secret: Secret for one-way CHAP authentication +- Optional args: +- muser: User name for mutual CHAP authentication +- msecret: Secret for mutual CHAP authentication +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.get_root().iscsi_auth_group_add_secret( +- tag=tag, user=user, secret=secret, +- muser=muser, msecret=msecret) +- +- def ui_command_delete_secret(self, tag, user): +- """Delete a secret from an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- user: User name for one-way CHAP authentication +- """ +- tag = self.ui_eval_param(tag, "number", None) +- self.delete_secret(tag, user) +- +- def ui_command_delete_secret_all(self, tag): +- """Delete all secrets from an authentication group. +- +- Args: +- tag: Authentication group tag (unique, integer > 0) +- """ +- rpc_messages = "" +- tag = self.ui_eval_param(tag, "number", None) +- for ag in self.iscsi_auth_groups: +- if ag['tag'] == tag: +- for secret in ag['secrets']: +- try: +- self.delete_secret(tag, secret['user']) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Groups: %s" % len(self.iscsi_auth_groups), None +- +- +-class UIISCSIAuthGroup(UINode): +- def __init__(self, ag, parent): +- UINode.__init__(self, "group" + str(ag['tag']), parent) +- self.ag = ag +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for secret in self.ag['secrets']: +- UISCSIAuthSecret(secret, self) +- +- def summary(self): +- return "Secrets: %s" % len(self.ag['secrets']), None +- +- +-class UISCSIAuthSecret(UINode): +- def __init__(self, secret, parent): +- info_list = ["%s=%s" % (key, val) +- for key, val in secret.items()] +- info_list.sort(reverse=True) +- info = ", ".join(info_list) +- UINode.__init__(self, info, parent) +- self.secret = secret +- self.refresh() ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. 
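# A minimal standalone sketch (not part of the patch) of the secrets parsing done
# by UIISCSIAuthGroups.ui_command_create in this file: one auth entry per
# comma-separated group, each group being a space-separated list of key:value pairs.
# The example string is the one given in that command's docstring.
def parse_secrets(secrets):
    # Same nested split(",") / split(" ") / split(":") as the patched ui_command_create.
    return [dict(u.split(":") for u in a.split(" "))
            for a in secrets.split(",")]

if __name__ == "__main__":
    example = "user:test secret:test muser:mutual_test msecret:mutual_test"
    print(parse_secrets(example))
    # [{'user': 'test', 'secret': 'test', 'muser': 'mutual_test', 'msecret': 'mutual_test'}]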
++ ++from configshell_fb import ExecutionError ++from ..rpc.client import JSONRPCException ++from .ui_node import UINode ++ ++ ++class UIISCSI(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "iscsi", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ UIISCSIDevices(self) ++ UIPortalGroups(self) ++ UIInitiatorGroups(self) ++ UIISCSIConnections(self) ++ UIISCSIAuthGroups(self) ++ UIISCSIGlobalParams(self) ++ ++ ++class UIISCSIGlobalParams(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "global_params", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ iscsi_global_params = self.get_root().iscsi_get_options() ++ if not iscsi_global_params: ++ return ++ for param, val in iscsi_global_params.items(): ++ UIISCSIGlobalParam("%s: %s" % (param, val), self) ++ ++ def ui_command_set_auth(self, g=None, d=None, r=None, m=None): ++ """Set CHAP authentication for discovery service. ++ ++ Optional arguments: ++ g = chap_group: Authentication group ID for discovery session ++ d = disable_chap: CHAP for discovery session should be disabled ++ r = require_chap: CHAP for discovery session should be required ++ m = mutual_chap: CHAP for discovery session should be mutual ++ """ ++ chap_group = self.ui_eval_param(g, "number", None) ++ disable_chap = self.ui_eval_param(d, "bool", None) ++ require_chap = self.ui_eval_param(r, "bool", None) ++ mutual_chap = self.ui_eval_param(m, "bool", None) ++ self.get_root().iscsi_set_discovery_auth( ++ chap_group=chap_group, disable_chap=disable_chap, ++ require_chap=require_chap, mutual_chap=mutual_chap) ++ ++ ++class UIISCSIGlobalParam(UINode): ++ def __init__(self, param, parent): ++ UINode.__init__(self, param, parent) ++ ++ ++class UIISCSIDevices(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "target_nodes", parent) ++ self.scsi_devices = list() ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ self.target_nodes = list(self.get_root().iscsi_get_target_nodes()) ++ self.scsi_devices = list(self.get_root().scsi_get_devices()) ++ for device in self.scsi_devices: ++ for node in self.target_nodes: ++ if hasattr(device, "device_name") and node['name'] \ ++ == device.device_name: ++ UIISCSIDevice(device, node, self) ++ ++ def delete(self, name): ++ self.get_root().iscsi_delete_target_node(target_node_name=name) ++ ++ def ui_command_create(self, name, alias_name, bdev_name_id_pairs, ++ pg_ig_mappings, queue_depth, g=None, d=None, r=None, ++ m=None, h=None, t=None): ++ """Create target node ++ ++ Positional args: ++ name: Target node name (ASCII) ++ alias_name: Target node alias name (ASCII) ++ bdev_name_id_pairs: List of bdev_name_id_pairs ++ pg_ig_mappings: List of pg_ig_mappings ++ queue_depth: Desired target queue depth ++ Optional args: ++ g = chap_group: Authentication group ID for this target node ++ d = disable_chap: CHAP authentication should be disabled for this target node ++ r = require_chap: CHAP authentication should be required for this target node ++ m = mutual_chap: CHAP authentication should be mutual/bidirectional ++ h = header_digest: Header Digest should be required for this target node ++ t = data_digest: Data Digest should be required for this target node ++ """ ++ luns = [] ++ print("bdev_name_id_pairs: %s" % bdev_name_id_pairs) ++ print("pg_ig_mappings: %s" % pg_ig_mappings) ++ for u in bdev_name_id_pairs.strip().split(" "): ++ bdev_name, lun_id = u.split(":") ++ luns.append({"bdev_name": bdev_name, "lun_id": 
int(lun_id)}) ++ pg_ig_maps = [] ++ for u in pg_ig_mappings.strip().split(" "): ++ pg, ig = u.split(":") ++ pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) ++ queue_depth = self.ui_eval_param(queue_depth, "number", None) ++ chap_group = self.ui_eval_param(g, "number", None) ++ disable_chap = self.ui_eval_param(d, "bool", None) ++ require_chap = self.ui_eval_param(r, "bool", None) ++ mutual_chap = self.ui_eval_param(m, "bool", None) ++ header_digest = self.ui_eval_param(h, "bool", None) ++ data_digest = self.ui_eval_param(t, "bool", None) ++ self.get_root().iscsi_create_target_node( ++ name=name, alias_name=alias_name, luns=luns, ++ pg_ig_maps=pg_ig_maps, queue_depth=queue_depth, ++ chap_group=chap_group, disable_chap=disable_chap, ++ require_chap=require_chap, mutual_chap=mutual_chap, ++ header_digest=header_digest, data_digest=data_digest) ++ ++ def ui_command_delete(self, name=None): ++ """Delete a target node. If name is not specified delete all target nodes. ++ ++ Arguments: ++ name - Target node name. ++ """ ++ self.delete(name) ++ ++ def ui_command_delete_all(self): ++ """Delete all target nodes""" ++ rpc_messages = "" ++ for device in self.scsi_devices: ++ try: ++ self.delete(device.device_name) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def ui_command_add_lun(self, name, bdev_name, lun_id=None): ++ """Add lun to the target node. ++ ++ Required args: ++ name: Target node name (ASCII) ++ bdev_name: bdev name ++ Positional args: ++ lun_id: LUN ID (integer >= 0) ++ """ ++ if lun_id: ++ lun_id = self.ui_eval_param(lun_id, "number", None) ++ self.get_root().iscsi_target_node_add_lun( ++ name=name, bdev_name=bdev_name, lun_id=lun_id) ++ ++ def summary(self): ++ count = 0 ++ for device in self.scsi_devices: ++ for node in self.target_nodes: ++ if hasattr(device, "device_name") and node['name'] \ ++ == device.device_name: ++ count = count + 1 ++ return "Target nodes: %d" % count, None ++ ++ ++class UIISCSIDevice(UINode): ++ def __init__(self, device, target, parent): ++ UINode.__init__(self, device.device_name, parent) ++ self.device = device ++ self.target = target ++ self.refresh() ++ ++ def ui_command_set_auth(self, g=None, d=None, r=None, m=None): ++ """Set CHAP authentication for the target node. ++ ++ Optionals args: ++ g = chap_group: Authentication group ID for this target node ++ d = disable_chap: CHAP authentication should be disabled for this target node ++ r = require_chap: CHAP authentication should be required for this target node ++ m = mutual_chap: CHAP authentication should be mutual/bidirectional ++ """ ++ chap_group = self.ui_eval_param(g, "number", None) ++ disable_chap = self.ui_eval_param(d, "bool", None) ++ require_chap = self.ui_eval_param(r, "bool", None) ++ mutual_chap = self.ui_eval_param(m, "bool", None) ++ self.get_root().iscsi_target_node_set_auth( ++ name=self.device.device_name, chap_group=chap_group, ++ disable_chap=disable_chap, ++ require_chap=require_chap, mutual_chap=mutual_chap) ++ ++ def ui_command_iscsi_target_node_add_pg_ig_maps(self, pg_ig_mappings): ++ """Add PG-IG maps to the target node. ++ ++ Args: ++ pg_ig_maps: List of pg_ig_mappings, e.g. 
pg_tag:ig_tag pg_tag2:ig_tag2 ++ """ ++ pg_ig_maps = [] ++ for u in pg_ig_mappings.strip().split(" "): ++ pg, ig = u.split(":") ++ pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) ++ self.get_root().iscsi_target_node_add_pg_ig_maps( ++ pg_ig_maps=pg_ig_maps, name=self.device.device_name) ++ ++ def ui_command_iscsi_target_node_remove_pg_ig_maps(self, pg_ig_mappings): ++ """Remove PG-IG maps from the target node. ++ ++ Args: ++ pg_ig_maps: List of pg_ig_mappings, e.g. pg_tag:ig_tag pg_tag2:ig_tag2 ++ """ ++ pg_ig_maps = [] ++ for u in pg_ig_mappings.strip().split(" "): ++ pg, ig = u.split(":") ++ pg_ig_maps.append({"pg_tag": int(pg), "ig_tag": int(ig)}) ++ self.get_root().iscsi_target_node_remove_pg_ig_maps( ++ pg_ig_maps=pg_ig_maps, name=self.device.device_name) ++ ++ def refresh(self): ++ self._children = set([]) ++ UIISCSILuns(self.target['luns'], self) ++ UIISCSIPgIgMaps(self.target['pg_ig_maps'], self) ++ auths = {"disable_chap": self.target["disable_chap"], ++ "require_chap": self.target["require_chap"], ++ "mutual_chap": self.target["mutual_chap"], ++ "chap_group": self.target["chap_group"], ++ "data_digest": self.target["data_digest"]} ++ UIISCSIAuth(auths, self) ++ ++ def summary(self): ++ return "Id: %s, QueueDepth: %s" % (self.device.id, ++ self.target['queue_depth']), None ++ ++ ++class UIISCSIAuth(UINode): ++ def __init__(self, auths, parent): ++ UINode.__init__(self, "auths", parent) ++ self.auths = auths ++ self.refresh() ++ ++ def summary(self): ++ return "disable_chap: %s, require_chap: %s, mutual_chap: %s, chap_group: %s" % ( ++ self.auths['disable_chap'], self.auths['require_chap'], ++ self.auths['mutual_chap'], self.auths['chap_group']), None ++ ++ ++class UIISCSILuns(UINode): ++ def __init__(self, luns, parent): ++ UINode.__init__(self, "luns", parent) ++ self.luns = luns ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for lun in self.luns: ++ UIISCSILun(lun, self) ++ ++ def summary(self): ++ return "Luns: %d" % len(self.luns), None ++ ++ ++class UIISCSILun(UINode): ++ def __init__(self, lun, parent): ++ UINode.__init__(self, "lun %s" % lun['lun_id'], parent) ++ self.lun = lun ++ self.refresh() ++ ++ def summary(self): ++ return "%s" % self.lun['bdev_name'], None ++ ++ ++class UIISCSIPgIgMaps(UINode): ++ def __init__(self, pg_ig_maps, parent): ++ UINode.__init__(self, "pg_ig_maps", parent) ++ self.pg_ig_maps = pg_ig_maps ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for pg_ig in self.pg_ig_maps: ++ UIISCSIPgIg(pg_ig, self) ++ ++ def summary(self): ++ return "Pg_ig_maps: %d" % len(self.pg_ig_maps), None ++ ++ ++class UIISCSIPgIg(UINode): ++ def __init__(self, pg_ig, parent): ++ UINode.__init__(self, "portal_group%s - initiator_group%s" % ++ (pg_ig['pg_tag'], pg_ig['ig_tag']), parent) ++ self.pg_ig = pg_ig ++ self.refresh() ++ ++ ++class UIPortalGroups(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "portal_groups", parent) ++ self.refresh() ++ ++ def delete(self, tag): ++ self.get_root().iscsi_delete_portal_group(tag=tag) ++ ++ def ui_command_create(self, tag, portal_list): ++ """Add a portal group. ++ ++ Args: ++ portals: List of portals e.g. ip:port ip2:port2 ++ tag: Portal group tag (unique, integer > 0) ++ """ ++ portals = [] ++ for portal in portal_list.strip().split(" "): ++ host = portal ++ cpumask = None ++ if "@" in portal: ++ host, cpumask = portal.split("@") ++ if ":" not in host: ++ raise ExecutionError("Incorrect format of portal group. Port is missing." 
++ "Use 'help create' to see the command syntax.") ++ host, port = host.rsplit(":", -1) ++ portals.append({'host': host, 'port': port}) ++ if cpumask: ++ print("WARNING: Specifying a CPU mask for portal groups is no longer supported. Ignoring.") ++ tag = self.ui_eval_param(tag, "number", None) ++ self.get_root().construct_portal_group(tag=tag, portals=portals, private=None, wait=None) ++ ++ def ui_command_delete(self, tag): ++ """Delete a portal group with given tag (unique, integer > 0))""" ++ tag = self.ui_eval_param(tag, "number", None) ++ self.delete(tag) ++ ++ def ui_command_delete_all(self): ++ """Delete all portal groups""" ++ rpc_messages = "" ++ for pg in self.pgs: ++ try: ++ self.delete(pg.tag) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def refresh(self): ++ self._children = set([]) ++ self.pgs = list(self.get_root().iscsi_get_portal_groups()) ++ for pg in self.pgs: ++ try: ++ UIPortalGroup(pg, self) ++ except JSONRPCException as e: ++ self.shell.log.error(e.message) ++ ++ def summary(self): ++ return "Portal groups: %d" % len(self.pgs), None ++ ++ ++class UIPortalGroup(UINode): ++ def __init__(self, pg, parent): ++ UINode.__init__(self, "portal_group%s" % pg.tag, parent) ++ self.pg = pg ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for portal in self.pg.portals: ++ UIPortal(portal['host'], portal['port'], self) ++ ++ def summary(self): ++ return "Portals: %d" % len(self.pg.portals), None ++ ++ ++class UIPortal(UINode): ++ def __init__(self, host, port, parent): ++ UINode.__init__(self, "host=%s, port=%s" % ( ++ host, port), parent) ++ self.refresh() ++ ++ ++class UIInitiatorGroups(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "initiator_groups", parent) ++ self.refresh() ++ ++ def delete(self, tag): ++ self.get_root().iscsi_delete_initiator_group(tag=tag) ++ ++ def ui_command_create(self, tag, initiator_list, netmask_list): ++ """Add an initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses ++ separated with whitespaces, e.g. 127.0.0.1 192.168.200.100 ++ netmasks: List of initiator netmasks separated with whitespaces, ++ e.g. 255.255.0.0 255.248.0.0 ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.get_root().construct_initiator_group( ++ tag=tag, initiators=initiator_list.split(" "), ++ netmasks=netmask_list.split(" ")) ++ ++ def ui_command_delete(self, tag): ++ """Delete an initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.delete(tag) ++ ++ def ui_command_delete_all(self): ++ """Delete all initiator groups""" ++ rpc_messages = "" ++ for ig in self.igs: ++ try: ++ self.delete(ig.tag) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def ui_command_add_initiator(self, tag, initiators, netmasks): ++ """Add initiators to an existing initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses, ++ e.g. 127.0.0.1 192.168.200.100 ++ netmasks: List of initiator netmasks, ++ e.g. 
255.255.0.0 255.248.0.0 ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.get_root().iscsi_initiator_group_add_initiators( ++ tag=tag, initiators=initiators.split(" "), ++ netmasks=netmasks.split(" ")) ++ ++ def ui_command_delete_initiator(self, tag, initiators=None, netmasks=None): ++ """Delete initiators from an existing initiator group. ++ ++ Args: ++ tag: Initiator group tag (unique, integer > 0) ++ initiators: List of initiator hostnames or IP addresses, e.g. 127.0.0.1 192.168.200.100 ++ netmasks: List of initiator netmasks, e.g. 255.255.0.0 255.248.0.0 ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ if initiators: ++ initiators = initiators.split(" ") ++ if netmasks: ++ netmasks = netmasks.split(" ") ++ self.get_root().iscsi_initiator_group_remove_initiators( ++ tag=tag, initiators=initiators, ++ netmasks=netmasks) ++ ++ def refresh(self): ++ self._children = set([]) ++ self.igs = list(self.get_root().iscsi_get_initiator_groups()) ++ for ig in self.igs: ++ UIInitiatorGroup(ig, self) ++ ++ def summary(self): ++ return "Initiator groups: %d" % len(self.igs), None ++ ++ ++class UIInitiatorGroup(UINode): ++ def __init__(self, ig, parent): ++ UINode.__init__(self, "initiator_group%s" % ig.tag, parent) ++ self.ig = ig ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for initiator, netmask in zip(self.ig.initiators, self.ig.netmasks): ++ UIInitiator(initiator, netmask, self) ++ ++ def summary(self): ++ return "Initiators: %d" % len(self.ig.initiators), None ++ ++ ++class UIInitiator(UINode): ++ def __init__(self, initiator, netmask, parent): ++ UINode.__init__(self, "hostname=%s, netmask=%s" % (initiator, netmask), parent) ++ self.refresh() ++ ++ ++class UIISCSIConnections(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "iscsi_connections", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ self.iscsicons = list(self.get_root().iscsi_get_connections()) ++ for ic in self.iscsicons: ++ UIISCSIConnection(ic, self) ++ ++ def summary(self): ++ return "Connections: %d" % len(self.iscsicons), None ++ ++ ++class UIISCSIConnection(UINode): ++ def __init__(self, ic, parent): ++ UINode.__init__(self, "%s" % ic['id'], parent) ++ self.ic = ic ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for key, val in self.ic.items(): ++ if key == "id": ++ continue ++ UIISCSIConnectionDetails("%s: %s" % (key, val), self) ++ ++ ++class UIISCSIConnectionDetails(UINode): ++ def __init__(self, info, parent): ++ UINode.__init__(self, "%s" % info, parent) ++ self.refresh() ++ ++ ++class UIISCSIAuthGroups(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "auth_groups", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ self.iscsi_auth_groups = list(self.get_root().iscsi_get_auth_groups()) ++ if self.iscsi_auth_groups is None: ++ self.iscsi_auth_groups = [] ++ for ag in self.iscsi_auth_groups: ++ UIISCSIAuthGroup(ag, self) ++ ++ def delete(self, tag): ++ self.get_root().iscsi_delete_auth_group(tag=tag) ++ ++ def delete_secret(self, tag, user): ++ self.get_root().iscsi_auth_group_remove_secret( ++ tag=tag, user=user) ++ ++ def ui_command_create(self, tag, secrets=None): ++ """Add authentication group for CHAP authentication. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0). ++ Optional args: ++ secrets: Array of secrets objects separated by comma sign, ++ e.g. 
user:test secret:test muser:mutual_test msecret:mutual_test ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ if secrets: ++ secrets = [dict(u.split(":") for u in a.split(" ")) ++ for a in secrets.split(",")] ++ self.get_root().iscsi_create_auth_group(tag=tag, secrets=secrets) ++ ++ def ui_command_delete(self, tag): ++ """Delete an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.delete(tag) ++ ++ def ui_command_delete_all(self): ++ """Delete all authentication groups.""" ++ rpc_messages = "" ++ for iscsi_auth_group in self.iscsi_auth_groups: ++ try: ++ self.delete(iscsi_auth_group['tag']) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def ui_command_add_secret(self, tag, user, secret, ++ muser=None, msecret=None): ++ """Add a secret to an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ user: User name for one-way CHAP authentication ++ secret: Secret for one-way CHAP authentication ++ Optional args: ++ muser: User name for mutual CHAP authentication ++ msecret: Secret for mutual CHAP authentication ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.get_root().iscsi_auth_group_add_secret( ++ tag=tag, user=user, secret=secret, ++ muser=muser, msecret=msecret) ++ ++ def ui_command_delete_secret(self, tag, user): ++ """Delete a secret from an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ user: User name for one-way CHAP authentication ++ """ ++ tag = self.ui_eval_param(tag, "number", None) ++ self.delete_secret(tag, user) ++ ++ def ui_command_delete_secret_all(self, tag): ++ """Delete all secrets from an authentication group. ++ ++ Args: ++ tag: Authentication group tag (unique, integer > 0) ++ """ ++ rpc_messages = "" ++ tag = self.ui_eval_param(tag, "number", None) ++ for ag in self.iscsi_auth_groups: ++ if ag['tag'] == tag: ++ for secret in ag['secrets']: ++ try: ++ self.delete_secret(tag, secret['user']) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Groups: %s" % len(self.iscsi_auth_groups), None ++ ++ ++class UIISCSIAuthGroup(UINode): ++ def __init__(self, ag, parent): ++ UINode.__init__(self, "group" + str(ag['tag']), parent) ++ self.ag = ag ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for secret in self.ag['secrets']: ++ UISCSIAuthSecret(secret, self) ++ ++ def summary(self): ++ return "Secrets: %s" % len(self.ag['secrets']), None ++ ++ ++class UISCSIAuthSecret(UINode): ++ def __init__(self, secret, parent): ++ info_list = ["%s=%s" % (key, val) ++ for key, val in secret.items()] ++ info_list.sort(reverse=True) ++ info = ", ".join(info_list) ++ UINode.__init__(self, info, parent) ++ self.secret = secret ++ self.refresh() +diff --git a/python/spdk/spdkcli/ui_node_nvmf.py b/python/spdk/spdkcli/ui_node_nvmf.py +index f15f3bc..be7f4b2 100644 +--- a/python/spdk/spdkcli/ui_node_nvmf.py ++++ b/python/spdk/spdkcli/ui_node_nvmf.py +@@ -1,367 +1,367 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. 
+- +-from ..rpc.client import JSONRPCException +-from .ui_node import UINode +- +- +-class UINVMf(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "nvmf", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- UINVMfSubsystems(self) +- UINVMfTransports(self) +- +- +-class UINVMfTransports(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "transport", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for transport in self.get_root().nvmf_get_transports(): +- UINVMfTransport(transport, self) +- +- def ui_command_create(self, trtype, max_queue_depth=None, max_io_qpairs_per_ctrlr=None, +- in_capsule_data_size=None, max_io_size=None, io_unit_size=None, max_aq_depth=None): +- """Create a transport with given parameters +- +- Arguments: +- trtype - Example: 'RDMA'. +- max_queue_depth - Optional parameter. Integer, max value 65535. +- max_io_qpairs_per_ctrlr - Optional parameter. 16 bit Integer, max value 65535. +- in_capsule_data_size - Optional parameter. 32 bit Integer, max value 4294967295 +- max_io_size - Optional parameter. 32 bit integer, max value 4294967295 +- io_unit_size - Optional parameter. 32 bit integer, max value 4294967295 +- max_aq_depth - Optional parameter. 32 bit integer, max value 4294967295 +- """ +- max_queue_depth = self.ui_eval_param(max_queue_depth, "number", None) +- max_io_qpairs_per_ctrlr = self.ui_eval_param(max_io_qpairs_per_ctrlr, "number", None) +- in_capsule_data_size = self.ui_eval_param(in_capsule_data_size, "number", None) +- max_io_size = self.ui_eval_param(max_io_size, "number", None) +- io_unit_size = self.ui_eval_param(io_unit_size, "number", None) +- max_aq_depth = self.ui_eval_param(max_aq_depth, "number", None) +- +- self.get_root().create_nvmf_transport(trtype=trtype, +- max_queue_depth=max_queue_depth, +- max_io_qpairs_per_ctrlr=max_io_qpairs_per_ctrlr, +- in_capsule_data_size=in_capsule_data_size, +- max_io_size=max_io_size, +- io_unit_size=io_unit_size, +- max_aq_depth=max_aq_depth) +- +- def summary(self): +- return "Transports: %s" % len(self.children), None +- +- +-class UINVMfTransport(UINode): +- def __init__(self, transport, parent): +- UINode.__init__(self, transport.trtype, parent) +- self.transport = transport +- +- +-class UINVMfSubsystems(UINode): +- def __init__(self, parent): +- UINode.__init__(self, "subsystem", parent) +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for subsystem in self.get_root().nvmf_get_subsystems(): +- UINVMfSubsystem(subsystem, self) +- +- def delete(self, subsystem_nqn): +- self.get_root().nvmf_delete_subsystem(nqn=subsystem_nqn) +- +- def ui_command_create(self, nqn, serial_number=None, +- max_namespaces=None, allow_any_host="false"): +- """Create subsystem with given parameters. +- +- Arguments: +- nqn - Target nqn(ASCII). +- serial_number - Example: 'SPDK00000000000001'. +- max_namespaces - Optional parameter. Maximum number of namespaces allowed to added during +- active connection +- allow_any_host - Optional parameter. Allow any host to connect (don't enforce allowed host NQN +- list) +- """ +- allow_any_host = self.ui_eval_param(allow_any_host, "bool", False) +- max_namespaces = self.ui_eval_param(max_namespaces, "number", 0) +- self.get_root().create_nvmf_subsystem(nqn=nqn, serial_number=serial_number, +- allow_any_host=allow_any_host, +- max_namespaces=max_namespaces) +- +- def ui_command_delete(self, subsystem_nqn): +- """Delete subsystem with given nqn. 
+- +- Arguments: +- nqn_subsystem - Name of subsystem to delete +- """ +- self.delete(subsystem_nqn) +- +- def ui_command_delete_all(self): +- """Delete all subsystems""" +- rpc_messages = "" +- for child in self._children: +- try: +- self.delete(child.subsystem.nqn) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Subsystems: %s" % len(self.children), None +- +- +-class UINVMfSubsystem(UINode): +- def __init__(self, subsystem, parent): +- UINode.__init__(self, subsystem.nqn, parent) +- self.subsystem = subsystem +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- UINVMfSubsystemListeners(self.subsystem.listen_addresses, self) +- UINVMfSubsystemHosts(self.subsystem.hosts, self) +- if hasattr(self.subsystem, 'namespaces'): +- UINVMfSubsystemNamespaces(self.subsystem.namespaces, self) +- +- def refresh_node(self): +- for subsystem in self.get_root().nvmf_get_subsystems(): +- if subsystem.nqn == self.subsystem.nqn: +- self.subsystem = subsystem +- self.refresh() +- +- def ui_command_show_details(self): +- self.shell.log.info(json.dumps(vars(self.lvs), indent=2)) +- +- def ui_command_allow_any_host(self, disable="false"): +- """Disable or or enable allow_any_host flag. +- +- Arguments: +- disable - Optional parameter. If false then enable, if true disable +- """ +- disable = self.ui_eval_param(disable, "bool", None) +- self.get_root().nvmf_subsystem_allow_any_host( +- nqn=self.subsystem.nqn, disable=disable) +- +- def summary(self): +- sn = None +- if hasattr(self.subsystem, 'serial_number'): +- sn = "sn=%s" % self.subsystem.serial_number +- st = None +- if hasattr(self.subsystem, 'subtype'): +- st = "st=%s" % self.subsystem.subtype +- allow_any_host = None +- if self.subsystem.allow_any_host: +- allow_any_host = "Allow any host" +- info = ", ".join(filter(None, [sn, st, allow_any_host])) +- return info, None +- +- +-class UINVMfSubsystemListeners(UINode): +- def __init__(self, listen_addresses, parent): +- UINode.__init__(self, "listen_addresses", parent) +- self.listen_addresses = listen_addresses +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for address in self.listen_addresses: +- UINVMfSubsystemListener(address, self) +- +- def refresh_node(self): +- for subsystem in self.get_root().nvmf_get_subsystems(): +- if subsystem.nqn == self.parent.subsystem.nqn: +- self.listen_addresses = subsystem.listen_addresses +- self.refresh() +- +- def delete(self, trtype, traddr, trsvcid, adrfam=None): +- self.get_root().nvmf_subsystem_remove_listener( +- nqn=self.parent.subsystem.nqn, trtype=trtype, +- traddr=traddr, trsvcid=trsvcid, adrfam=adrfam) +- +- def ui_command_create(self, trtype, traddr, trsvcid, adrfam): +- """Create address listener for subsystem. +- +- Arguments: +- trtype - NVMe-oF transport type: e.g., rdma. +- traddr - NVMe-oF transport address: e.g., an ip address. +- trsvcid - NVMe-oF transport service id: e.g., a port number. +- adrfam - NVMe-oF transport adrfam: e.g., ipv4, ipv6, ib, fc. +- """ +- self.get_root().nvmf_subsystem_add_listener( +- nqn=self.parent.subsystem.nqn, trtype=trtype, traddr=traddr, +- trsvcid=trsvcid, adrfam=adrfam) +- +- def ui_command_delete(self, trtype, traddr, trsvcid, adrfam=None): +- """Remove address listener for subsystem. +- +- Arguments: +- trtype - Transport type (RDMA) +- traddr - NVMe-oF transport address: e.g., an ip address. +- trsvcid - NVMe-oF transport service id: e.g., a port number. 
+- adrfam - Optional argument. Address family ("IPv4", "IPv6", "IB" or "FC"). +- """ +- self.delete(trtype, traddr, trsvcid, adrfam) +- +- def ui_command_delete_all(self): +- """Remove all address listeners from subsystem.""" +- rpc_messages = "" +- for la in self.listen_addresses: +- try: +- self.delete(la['trtype'], la['traddr'], la['trsvcid'], la['adrfam']) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Addresses: %s" % len(self.listen_addresses), None +- +- +-class UINVMfSubsystemListener(UINode): +- def __init__(self, address, parent): +- UINode.__init__(self, "%s:%s" % (address['traddr'], address['trsvcid']), +- parent) +- self.address = address +- +- def summary(self): +- return "%s" % self.address['trtype'], True +- +- +-class UINVMfSubsystemHosts(UINode): +- def __init__(self, hosts, parent): +- UINode.__init__(self, "hosts", parent) +- self.hosts = hosts +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for host in self.hosts: +- UINVMfSubsystemHost(host, self) +- +- def refresh_node(self): +- for subsystem in self.get_root().nvmf_get_subsystems(): +- if subsystem.nqn == self.parent.subsystem.nqn: +- self.hosts = subsystem.hosts +- self.refresh() +- +- def delete(self, host): +- self.get_root().nvmf_subsystem_remove_host( +- nqn=self.parent.subsystem.nqn, host=host) +- +- def ui_command_create(self, host): +- """Add a host NQN to the list of allowed hosts. +- +- Args: +- host: Host NQN to add to the list of allowed host NQNs +- """ +- self.get_root().nvmf_subsystem_add_host( +- nqn=self.parent.subsystem.nqn, host=host) +- +- def ui_command_delete(self, host): +- """Delete host from subsystem. +- +- Arguments: +- host - NQN of host to remove. +- """ +- self.delete(host) +- +- def ui_command_delete_all(self): +- """Delete host from subsystem""" +- rpc_messages = "" +- for host in self.hosts: +- try: +- self.delete(host['nqn']) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Hosts: %s" % len(self.hosts), None +- +- +-class UINVMfSubsystemHost(UINode): +- def __init__(self, host, parent): +- UINode.__init__(self, "%s" % host['nqn'], parent) +- self.host = host +- +- +-class UINVMfSubsystemNamespaces(UINode): +- def __init__(self, namespaces, parent): +- UINode.__init__(self, "namespaces", parent) +- self.namespaces = namespaces +- self.refresh() +- +- def refresh(self): +- self._children = set([]) +- for namespace in self.namespaces: +- UINVMfSubsystemNamespace(namespace, self) +- +- def refresh_node(self): +- for subsystem in self.get_root().nvmf_get_subsystems(): +- if subsystem.nqn == self.parent.subsystem.nqn: +- self.namespaces = subsystem.namespaces +- self.refresh() +- +- def delete(self, nsid): +- self.get_root().nvmf_subsystem_remove_ns( +- nqn=self.parent.subsystem.nqn, nsid=nsid) +- +- def ui_command_create(self, bdev_name, nsid=None, +- nguid=None, eui64=None, uuid=None): +- """Add a namespace to a subsystem. +- +- Args: +- bdev_name: Name of bdev to expose as a namespace. +- Optional args: +- nsid: Namespace ID. +- nguid: 16-byte namespace globally unique identifier in hexadecimal. +- eui64: 8-byte namespace EUI-64 in hexadecimal (e.g. "ABCDEF0123456789"). +- uuid: Namespace UUID. 
+- """ +- nsid = self.ui_eval_param(nsid, "number", None) +- self.get_root().nvmf_subsystem_add_ns( +- nqn=self.parent.subsystem.nqn, bdev_name=bdev_name, +- nsid=nsid, nguid=nguid, eui64=eui64, uuid=uuid) +- +- def ui_command_delete(self, nsid): +- """Delete namespace from subsystem. +- +- Arguments: +- nsid - Id of namespace to remove. +- """ +- nsid = self.ui_eval_param(nsid, "number", None) +- self.delete(nsid) +- +- def ui_command_delete_all(self): +- """Delete all namespaces from subsystem.""" +- rpc_messages = "" +- for namespace in self.namespaces: +- try: +- self.delete(namespace['nsid']) +- except JSONRPCException as e: +- rpc_messages += e.message +- if rpc_messages: +- raise JSONRPCException(rpc_messages) +- +- def summary(self): +- return "Namespaces: %s" % len(self.namespaces), None +- +- +-class UINVMfSubsystemNamespace(UINode): +- def __init__(self, namespace, parent): +- UINode.__init__(self, namespace['bdev_name'], parent) +- self.namespace = namespace +- +- def summary(self): +- info = ", ".join([self.namespace['name'], str(self.namespace['nsid'])]) +- return info, None ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++from ..rpc.client import JSONRPCException ++from .ui_node import UINode ++ ++ ++class UINVMf(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "nvmf", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ UINVMfSubsystems(self) ++ UINVMfTransports(self) ++ ++ ++class UINVMfTransports(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "transport", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for transport in self.get_root().nvmf_get_transports(): ++ UINVMfTransport(transport, self) ++ ++ def ui_command_create(self, trtype, max_queue_depth=None, max_io_qpairs_per_ctrlr=None, ++ in_capsule_data_size=None, max_io_size=None, io_unit_size=None, max_aq_depth=None): ++ """Create a transport with given parameters ++ ++ Arguments: ++ trtype - Example: 'RDMA'. ++ max_queue_depth - Optional parameter. Integer, max value 65535. ++ max_io_qpairs_per_ctrlr - Optional parameter. 16 bit Integer, max value 65535. ++ in_capsule_data_size - Optional parameter. 32 bit Integer, max value 4294967295 ++ max_io_size - Optional parameter. 32 bit integer, max value 4294967295 ++ io_unit_size - Optional parameter. 32 bit integer, max value 4294967295 ++ max_aq_depth - Optional parameter. 
32 bit integer, max value 4294967295 ++ """ ++ max_queue_depth = self.ui_eval_param(max_queue_depth, "number", None) ++ max_io_qpairs_per_ctrlr = self.ui_eval_param(max_io_qpairs_per_ctrlr, "number", None) ++ in_capsule_data_size = self.ui_eval_param(in_capsule_data_size, "number", None) ++ max_io_size = self.ui_eval_param(max_io_size, "number", None) ++ io_unit_size = self.ui_eval_param(io_unit_size, "number", None) ++ max_aq_depth = self.ui_eval_param(max_aq_depth, "number", None) ++ ++ self.get_root().create_nvmf_transport(trtype=trtype, ++ max_queue_depth=max_queue_depth, ++ max_io_qpairs_per_ctrlr=max_io_qpairs_per_ctrlr, ++ in_capsule_data_size=in_capsule_data_size, ++ max_io_size=max_io_size, ++ io_unit_size=io_unit_size, ++ max_aq_depth=max_aq_depth) ++ ++ def summary(self): ++ return "Transports: %s" % len(self.children), None ++ ++ ++class UINVMfTransport(UINode): ++ def __init__(self, transport, parent): ++ UINode.__init__(self, transport.trtype, parent) ++ self.transport = transport ++ ++ ++class UINVMfSubsystems(UINode): ++ def __init__(self, parent): ++ UINode.__init__(self, "subsystem", parent) ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for subsystem in self.get_root().nvmf_get_subsystems(): ++ UINVMfSubsystem(subsystem, self) ++ ++ def delete(self, subsystem_nqn): ++ self.get_root().nvmf_delete_subsystem(nqn=subsystem_nqn) ++ ++ def ui_command_create(self, nqn, serial_number=None, ++ max_namespaces=None, allow_any_host="false"): ++ """Create subsystem with given parameters. ++ ++ Arguments: ++ nqn - Target nqn(ASCII). ++ serial_number - Example: 'SPDK00000000000001'. ++ max_namespaces - Optional parameter. Maximum number of namespaces allowed to added during ++ active connection ++ allow_any_host - Optional parameter. Allow any host to connect (don't enforce allowed host NQN ++ list) ++ """ ++ allow_any_host = self.ui_eval_param(allow_any_host, "bool", False) ++ max_namespaces = self.ui_eval_param(max_namespaces, "number", 0) ++ self.get_root().create_nvmf_subsystem(nqn=nqn, serial_number=serial_number, ++ allow_any_host=allow_any_host, ++ max_namespaces=max_namespaces) ++ ++ def ui_command_delete(self, subsystem_nqn): ++ """Delete subsystem with given nqn. 
++ ++ Arguments: ++ nqn_subsystem - Name of subsystem to delete ++ """ ++ self.delete(subsystem_nqn) ++ ++ def ui_command_delete_all(self): ++ """Delete all subsystems""" ++ rpc_messages = "" ++ for child in self._children: ++ try: ++ self.delete(child.subsystem.nqn) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Subsystems: %s" % len(self.children), None ++ ++ ++class UINVMfSubsystem(UINode): ++ def __init__(self, subsystem, parent): ++ UINode.__init__(self, subsystem.nqn, parent) ++ self.subsystem = subsystem ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ UINVMfSubsystemListeners(self.subsystem.listen_addresses, self) ++ UINVMfSubsystemHosts(self.subsystem.hosts, self) ++ if hasattr(self.subsystem, 'namespaces'): ++ UINVMfSubsystemNamespaces(self.subsystem.namespaces, self) ++ ++ def refresh_node(self): ++ for subsystem in self.get_root().nvmf_get_subsystems(): ++ if subsystem.nqn == self.subsystem.nqn: ++ self.subsystem = subsystem ++ self.refresh() ++ ++ def ui_command_show_details(self): ++ self.shell.log.info(json.dumps(vars(self.lvs), indent=2)) ++ ++ def ui_command_allow_any_host(self, disable="false"): ++ """Disable or or enable allow_any_host flag. ++ ++ Arguments: ++ disable - Optional parameter. If false then enable, if true disable ++ """ ++ disable = self.ui_eval_param(disable, "bool", None) ++ self.get_root().nvmf_subsystem_allow_any_host( ++ nqn=self.subsystem.nqn, disable=disable) ++ ++ def summary(self): ++ sn = None ++ if hasattr(self.subsystem, 'serial_number'): ++ sn = "sn=%s" % self.subsystem.serial_number ++ st = None ++ if hasattr(self.subsystem, 'subtype'): ++ st = "st=%s" % self.subsystem.subtype ++ allow_any_host = None ++ if self.subsystem.allow_any_host: ++ allow_any_host = "Allow any host" ++ info = ", ".join(filter(None, [sn, st, allow_any_host])) ++ return info, None ++ ++ ++class UINVMfSubsystemListeners(UINode): ++ def __init__(self, listen_addresses, parent): ++ UINode.__init__(self, "listen_addresses", parent) ++ self.listen_addresses = listen_addresses ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for address in self.listen_addresses: ++ UINVMfSubsystemListener(address, self) ++ ++ def refresh_node(self): ++ for subsystem in self.get_root().nvmf_get_subsystems(): ++ if subsystem.nqn == self.parent.subsystem.nqn: ++ self.listen_addresses = subsystem.listen_addresses ++ self.refresh() ++ ++ def delete(self, trtype, traddr, trsvcid, adrfam=None): ++ self.get_root().nvmf_subsystem_remove_listener( ++ nqn=self.parent.subsystem.nqn, trtype=trtype, ++ traddr=traddr, trsvcid=trsvcid, adrfam=adrfam) ++ ++ def ui_command_create(self, trtype, traddr, trsvcid, adrfam): ++ """Create address listener for subsystem. ++ ++ Arguments: ++ trtype - NVMe-oF transport type: e.g., rdma. ++ traddr - NVMe-oF transport address: e.g., an ip address. ++ trsvcid - NVMe-oF transport service id: e.g., a port number. ++ adrfam - NVMe-oF transport adrfam: e.g., ipv4, ipv6, ib, fc. ++ """ ++ self.get_root().nvmf_subsystem_add_listener( ++ nqn=self.parent.subsystem.nqn, trtype=trtype, traddr=traddr, ++ trsvcid=trsvcid, adrfam=adrfam) ++ ++ def ui_command_delete(self, trtype, traddr, trsvcid, adrfam=None): ++ """Remove address listener for subsystem. ++ ++ Arguments: ++ trtype - Transport type (RDMA) ++ traddr - NVMe-oF transport address: e.g., an ip address. ++ trsvcid - NVMe-oF transport service id: e.g., a port number. 
++ adrfam - Optional argument. Address family ("IPv4", "IPv6", "IB" or "FC"). ++ """ ++ self.delete(trtype, traddr, trsvcid, adrfam) ++ ++ def ui_command_delete_all(self): ++ """Remove all address listeners from subsystem.""" ++ rpc_messages = "" ++ for la in self.listen_addresses: ++ try: ++ self.delete(la['trtype'], la['traddr'], la['trsvcid'], la['adrfam']) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Addresses: %s" % len(self.listen_addresses), None ++ ++ ++class UINVMfSubsystemListener(UINode): ++ def __init__(self, address, parent): ++ UINode.__init__(self, "%s:%s" % (address['traddr'], address['trsvcid']), ++ parent) ++ self.address = address ++ ++ def summary(self): ++ return "%s" % self.address['trtype'], True ++ ++ ++class UINVMfSubsystemHosts(UINode): ++ def __init__(self, hosts, parent): ++ UINode.__init__(self, "hosts", parent) ++ self.hosts = hosts ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for host in self.hosts: ++ UINVMfSubsystemHost(host, self) ++ ++ def refresh_node(self): ++ for subsystem in self.get_root().nvmf_get_subsystems(): ++ if subsystem.nqn == self.parent.subsystem.nqn: ++ self.hosts = subsystem.hosts ++ self.refresh() ++ ++ def delete(self, host): ++ self.get_root().nvmf_subsystem_remove_host( ++ nqn=self.parent.subsystem.nqn, host=host) ++ ++ def ui_command_create(self, host): ++ """Add a host NQN to the list of allowed hosts. ++ ++ Args: ++ host: Host NQN to add to the list of allowed host NQNs ++ """ ++ self.get_root().nvmf_subsystem_add_host( ++ nqn=self.parent.subsystem.nqn, host=host) ++ ++ def ui_command_delete(self, host): ++ """Delete host from subsystem. ++ ++ Arguments: ++ host - NQN of host to remove. ++ """ ++ self.delete(host) ++ ++ def ui_command_delete_all(self): ++ """Delete host from subsystem""" ++ rpc_messages = "" ++ for host in self.hosts: ++ try: ++ self.delete(host['nqn']) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Hosts: %s" % len(self.hosts), None ++ ++ ++class UINVMfSubsystemHost(UINode): ++ def __init__(self, host, parent): ++ UINode.__init__(self, "%s" % host['nqn'], parent) ++ self.host = host ++ ++ ++class UINVMfSubsystemNamespaces(UINode): ++ def __init__(self, namespaces, parent): ++ UINode.__init__(self, "namespaces", parent) ++ self.namespaces = namespaces ++ self.refresh() ++ ++ def refresh(self): ++ self._children = set([]) ++ for namespace in self.namespaces: ++ UINVMfSubsystemNamespace(namespace, self) ++ ++ def refresh_node(self): ++ for subsystem in self.get_root().nvmf_get_subsystems(): ++ if subsystem.nqn == self.parent.subsystem.nqn: ++ self.namespaces = subsystem.namespaces ++ self.refresh() ++ ++ def delete(self, nsid): ++ self.get_root().nvmf_subsystem_remove_ns( ++ nqn=self.parent.subsystem.nqn, nsid=nsid) ++ ++ def ui_command_create(self, bdev_name, nsid=None, ++ nguid=None, eui64=None, uuid=None): ++ """Add a namespace to a subsystem. ++ ++ Args: ++ bdev_name: Name of bdev to expose as a namespace. ++ Optional args: ++ nsid: Namespace ID. ++ nguid: 16-byte namespace globally unique identifier in hexadecimal. ++ eui64: 8-byte namespace EUI-64 in hexadecimal (e.g. "ABCDEF0123456789"). ++ uuid: Namespace UUID. 
++ """ ++ nsid = self.ui_eval_param(nsid, "number", None) ++ self.get_root().nvmf_subsystem_add_ns( ++ nqn=self.parent.subsystem.nqn, bdev_name=bdev_name, ++ nsid=nsid, nguid=nguid, eui64=eui64, uuid=uuid) ++ ++ def ui_command_delete(self, nsid): ++ """Delete namespace from subsystem. ++ ++ Arguments: ++ nsid - Id of namespace to remove. ++ """ ++ nsid = self.ui_eval_param(nsid, "number", None) ++ self.delete(nsid) ++ ++ def ui_command_delete_all(self): ++ """Delete all namespaces from subsystem.""" ++ rpc_messages = "" ++ for namespace in self.namespaces: ++ try: ++ self.delete(namespace['nsid']) ++ except JSONRPCException as e: ++ rpc_messages += e.message ++ if rpc_messages: ++ raise JSONRPCException(rpc_messages) ++ ++ def summary(self): ++ return "Namespaces: %s" % len(self.namespaces), None ++ ++ ++class UINVMfSubsystemNamespace(UINode): ++ def __init__(self, namespace, parent): ++ UINode.__init__(self, namespace['bdev_name'], parent) ++ self.namespace = namespace ++ ++ def summary(self): ++ info = ", ".join([self.namespace['name'], str(self.namespace['nsid'])]) ++ return info, None +diff --git a/python/spdk/spdkcli/ui_root.py b/python/spdk/spdkcli/ui_root.py +index 5eb4e12..9f2c4d7 100644 +--- a/python/spdk/spdkcli/ui_root.py ++++ b/python/spdk/spdkcli/ui_root.py +@@ -1,572 +1,572 @@ +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2018 Intel Corporation. +-# All rights reserved. +- +-from .ui_node import UINode, UIBdevs, UILvolStores, UIVhosts +-from .ui_node_nvmf import UINVMf +-from .ui_node_iscsi import UIISCSI +-from .. import rpc +-from functools import wraps +- +- +-class UIRoot(UINode): +- """ +- Root node for CLI menu tree structure. Refreshes running config on startup. +- """ +- def __init__(self, client, shell): +- UINode.__init__(self, "/", shell=shell) +- self.current_bdevs = [] +- self.current_lvol_stores = [] +- self.current_vhost_ctrls = [] +- self.current_nvmf_transports = [] +- self.current_nvmf_subsystems = [] +- self.set_rpc_target(client) +- self.verbose = False +- self.is_init = self.check_init() +- self.methods = [] +- +- def refresh(self): +- self.methods = self.rpc_get_methods(current=True) +- if self.is_init is False: +- methods = "\n".join(self.methods) +- self.shell.log.warning("SPDK Application is not yet initialized.\n" +- "Please initialize subsystems with framework_start_init command.\n" +- "List of available commands in current state:\n" +- "%s" % methods) +- else: +- # Pass because we'd like to build main tree structure for "ls" +- # even if state is uninitialized +- pass +- +- self._children = set([]) +- UIBdevs(self) +- UILvolStores(self) +- if self.has_subsystem("vhost_scsi") or self.has_subsystem("vhost_blk"): +- UIVhosts(self) +- if self.has_subsystem("nvmf"): +- UINVMf(self) +- if self.has_subsystem("iscsi"): +- UIISCSI(self) +- +- def set_rpc_target(self, client): +- self.client = client +- +- def print_array(self, a): +- return " ".join(a) +- +- def verbose(f): +- # For any configuration calls (create, delete, construct, etc.) +- # Check if verbose option is to be used and set appropriately. +- # Do not use for "get_*" methods so that output is not +- # flooded. 
+- def w(self, **kwargs): +- self.client.log_set_level("INFO" if self.verbose else "ERROR") +- r = f(self, **kwargs) +- self.client.log_set_level("ERROR") +- return r +- return w +- +- def is_method_available(f): +- # Check if method f is available for given spdk target +- def w(self, **kwargs): +- if f.__name__ in self.methods: +- r = f(self, **kwargs) +- return r +- # If given method is not available return empty list +- # similar to real get_* like rpc +- return [] +- return w +- +- def ui_command_framework_start_init(self): +- if rpc.framework_start_init(self.client): +- self.is_init = True +- self.refresh() +- +- def ui_command_load_config(self, filename): +- with open(filename, "r") as fd: +- rpc.load_config(self.client, fd) +- +- def ui_command_load_subsystem_config(self, filename): +- with open(filename, "r") as fd: +- rpc.load_subsystem_config(self.client, fd) +- +- def ui_command_save_config(self, filename, indent=2): +- with open(filename, "w") as fd: +- rpc.save_config(self.client, fd, indent) +- +- def ui_command_save_subsystem_config(self, filename, subsystem, indent=2): +- with open(filename, "w") as fd: +- rpc.save_subsystem_config(self.client, fd, indent, subsystem) +- +- def rpc_get_methods(self, current=False): +- return rpc.rpc_get_methods(self.client, current=current) +- +- def check_init(self): +- return "framework_start_init" not in self.rpc_get_methods(current=True) +- +- def bdev_get_bdevs(self, bdev_type): +- if self.is_init: +- self.current_bdevs = rpc.bdev.bdev_get_bdevs(self.client) +- # Following replace needs to be done in order for some of the bdev +- # listings to work: logical volumes, split disk. +- # For example logical volumes: listing in menu is "Logical_Volume" +- # (cannot have space), but the product name in SPDK is "Logical Volume" +- bdev_type = bdev_type.replace("_", " ") +- for bdev in [x for x in self.current_bdevs if bdev_type in x["product_name"].lower()]: +- test = Bdev(bdev) +- yield test +- +- def bdev_get_iostat(self, **kwargs): +- return rpc.bdev.bdev_get_iostat(self.client, **kwargs) +- +- @verbose +- def bdev_split_create(self, **kwargs): +- response = rpc.bdev.bdev_split_create(self.client, **kwargs) +- return self.print_array(response) +- +- @verbose +- def bdev_split_delete(self, **kwargs): +- rpc.bdev.bdev_split_delete(self.client, **kwargs) +- +- @verbose +- def create_malloc_bdev(self, **kwargs): +- response = rpc.bdev.bdev_malloc_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_malloc_delete(self, **kwargs): +- rpc.bdev.bdev_malloc_delete(self.client, **kwargs) +- +- @verbose +- def create_iscsi_bdev(self, **kwargs): +- response = rpc.bdev.bdev_iscsi_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_iscsi_delete(self, **kwargs): +- rpc.bdev.bdev_iscsi_delete(self.client, **kwargs) +- +- @verbose +- def bdev_aio_create(self, **kwargs): +- response = rpc.bdev.bdev_aio_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_aio_delete(self, **kwargs): +- rpc.bdev.bdev_aio_delete(self.client, **kwargs) +- +- @verbose +- def create_lvol_bdev(self, **kwargs): +- response = rpc.lvol.bdev_lvol_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_lvol_delete(self, **kwargs): +- response = rpc.lvol.bdev_lvol_delete(self.client, **kwargs) +- return response +- +- @verbose +- def create_nvme_bdev(self, **kwargs): +- response = rpc.bdev.bdev_nvme_attach_controller(self.client, **kwargs) +- return response +- +- @verbose +- def 
bdev_nvme_detach_controller(self, **kwargs): +- rpc.bdev.bdev_nvme_detach_controller(self.client, **kwargs) +- +- @verbose +- def bdev_null_create(self, **kwargs): +- response = rpc.bdev.bdev_null_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_null_delete(self, **kwargs): +- rpc.bdev.bdev_null_delete(self.client, **kwargs) +- +- @verbose +- def create_error_bdev(self, **kwargs): +- response = rpc.bdev.bdev_error_create(self.client, **kwargs) +- +- @verbose +- def bdev_error_delete(self, **kwargs): +- rpc.bdev.bdev_error_delete(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def bdev_lvol_get_lvstores(self): +- if self.is_init: +- self.current_lvol_stores = rpc.lvol.bdev_lvol_get_lvstores(self.client) +- for lvs in self.current_lvol_stores: +- yield LvolStore(lvs) +- +- @verbose +- def bdev_lvol_create_lvstore(self, **kwargs): +- response = rpc.lvol.bdev_lvol_create_lvstore(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_lvol_delete_lvstore(self, **kwargs): +- rpc.lvol.bdev_lvol_delete_lvstore(self.client, **kwargs) +- +- @verbose +- def bdev_pmem_create_pool(self, **kwargs): +- response = rpc.pmem.bdev_pmem_create_pool(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_pmem_delete_pool(self, **kwargs): +- rpc.pmem.bdev_pmem_delete_pool(self.client, **kwargs) +- +- @verbose +- def bdev_pmem_get_pool_info(self, **kwargs): +- response = rpc.pmem.bdev_pmem_get_pool_info(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_pmem_create(self, **kwargs): +- response = rpc.bdev.bdev_pmem_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_pmem_delete(self, **kwargs): +- response = rpc.bdev.bdev_pmem_delete(self.client, **kwargs) +- return response +- +- @verbose +- def create_rbd_bdev(self, **kwargs): +- response = rpc.bdev.bdev_rbd_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_rbd_delete(self, **kwargs): +- response = rpc.bdev.bdev_rbd_delete(self.client, **kwargs) +- return response +- +- @verbose +- def create_virtio_dev(self, **kwargs): +- response = rpc.vhost.bdev_virtio_attach_controller(self.client, **kwargs) +- return self.print_array(response) +- +- @verbose +- def bdev_virtio_detach_controller(self, **kwargs): +- response = rpc.vhost.bdev_virtio_detach_controller(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_raid_create(self, **kwargs): +- rpc.bdev.bdev_raid_create(self.client, **kwargs) +- +- @verbose +- def bdev_raid_delete(self, **kwargs): +- rpc.bdev.bdev_raid_delete(self.client, **kwargs) +- +- @verbose +- def bdev_uring_create(self, **kwargs): +- response = rpc.bdev.bdev_uring_create(self.client, **kwargs) +- return response +- +- @verbose +- def bdev_uring_delete(self, **kwargs): +- rpc.bdev.bdev_uring_delete(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def bdev_virtio_scsi_get_devices(self): +- if self.is_init: +- for bdev in rpc.vhost.bdev_virtio_scsi_get_devices(self.client): +- test = Bdev(bdev) +- yield test +- +- def list_vhost_ctrls(self): +- if self.is_init: +- self.current_vhost_ctrls = rpc.vhost.vhost_get_controllers(self.client) +- +- @verbose +- @is_method_available +- def vhost_get_controllers(self, ctrlr_type): +- if self.is_init: +- self.list_vhost_ctrls() +- for ctrlr in [x for x in self.current_vhost_ctrls if ctrlr_type in list(x["backend_specific"].keys())]: +- yield VhostCtrlr(ctrlr) +- +- @verbose +- def vhost_delete_controller(self, **kwargs): +- 
rpc.vhost.vhost_delete_controller(self.client, **kwargs) +- +- @verbose +- def vhost_create_scsi_controller(self, **kwargs): +- rpc.vhost.vhost_create_scsi_controller(self.client, **kwargs) +- +- @verbose +- def vhost_create_blk_controller(self, **kwargs): +- rpc.vhost.vhost_create_blk_controller(self.client, **kwargs) +- +- @verbose +- def vhost_scsi_controller_remove_target(self, **kwargs): +- rpc.vhost.vhost_scsi_controller_remove_target(self.client, **kwargs) +- +- @verbose +- def vhost_scsi_controller_add_target(self, **kwargs): +- rpc.vhost.vhost_scsi_controller_add_target(self.client, **kwargs) +- +- def vhost_controller_set_coalescing(self, **kwargs): +- rpc.vhost.vhost_controller_set_coalescing(self.client, **kwargs) +- +- @verbose +- def create_nvmf_transport(self, **kwargs): +- rpc.nvmf.nvmf_create_transport(self.client, **kwargs) +- +- def list_nvmf_transports(self): +- if self.is_init: +- self.current_nvmf_transports = rpc.nvmf.nvmf_get_transports(self.client) +- +- @verbose +- @is_method_available +- def nvmf_get_transports(self): +- if self.is_init: +- self.list_nvmf_transports() +- for transport in self.current_nvmf_transports: +- yield NvmfTransport(transport) +- +- def list_nvmf_subsystems(self): +- if self.is_init: +- self.current_nvmf_subsystems = rpc.nvmf.nvmf_get_subsystems(self.client) +- +- @verbose +- @is_method_available +- def nvmf_get_subsystems(self): +- if self.is_init: +- self.list_nvmf_subsystems() +- for subsystem in self.current_nvmf_subsystems: +- yield NvmfSubsystem(subsystem) +- +- @verbose +- def create_nvmf_subsystem(self, **kwargs): +- rpc.nvmf.nvmf_create_subsystem(self.client, **kwargs) +- +- @verbose +- def nvmf_delete_subsystem(self, **kwargs): +- rpc.nvmf.nvmf_delete_subsystem(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_add_listener(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_add_listener(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_remove_listener(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_remove_listener(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_add_host(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_add_host(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_remove_host(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_remove_host(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_allow_any_host(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_allow_any_host(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_add_ns(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_add_ns(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_remove_ns(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_remove_ns(self.client, **kwargs) +- +- @verbose +- def nvmf_subsystem_allow_any_host(self, **kwargs): +- rpc.nvmf.nvmf_subsystem_allow_any_host(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def scsi_get_devices(self): +- if self.is_init: +- for device in rpc.iscsi.scsi_get_devices(self.client): +- yield ScsiObj(device) +- +- @verbose +- @is_method_available +- def iscsi_get_target_nodes(self): +- if self.is_init: +- for tg in rpc.iscsi.iscsi_get_target_nodes(self.client): +- yield tg +- +- @verbose +- def iscsi_create_target_node(self, **kwargs): +- rpc.iscsi.iscsi_create_target_node(self.client, **kwargs) +- +- @verbose +- def iscsi_delete_target_node(self, **kwargs): +- rpc.iscsi.iscsi_delete_target_node(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def iscsi_get_portal_groups(self): +- if self.is_init: +- for pg in rpc.iscsi.iscsi_get_portal_groups(self.client): +- 
yield ScsiObj(pg) +- +- @verbose +- @is_method_available +- def iscsi_get_initiator_groups(self): +- if self.is_init: +- for ig in rpc.iscsi.iscsi_get_initiator_groups(self.client): +- yield ScsiObj(ig) +- +- @verbose +- def construct_portal_group(self, **kwargs): +- rpc.iscsi.iscsi_create_portal_group(self.client, **kwargs) +- +- @verbose +- def iscsi_delete_portal_group(self, **kwargs): +- rpc.iscsi.iscsi_delete_portal_group(self.client, **kwargs) +- +- @verbose +- def construct_initiator_group(self, **kwargs): +- rpc.iscsi.iscsi_create_initiator_group(self.client, **kwargs) +- +- @verbose +- def iscsi_delete_initiator_group(self, **kwargs): +- rpc.iscsi.iscsi_delete_initiator_group(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def iscsi_get_connections(self, **kwargs): +- if self.is_init: +- for ic in rpc.iscsi.iscsi_get_connections(self.client, **kwargs): +- yield ic +- +- @verbose +- def iscsi_initiator_group_add_initiators(self, **kwargs): +- rpc.iscsi.iscsi_initiator_group_add_initiators(self.client, **kwargs) +- +- @verbose +- def iscsi_initiator_group_remove_initiators(self, **kwargs): +- rpc.iscsi.iscsi_initiator_group_remove_initiators(self.client, **kwargs) +- +- @verbose +- def iscsi_target_node_add_pg_ig_maps(self, **kwargs): +- rpc.iscsi.iscsi_target_node_add_pg_ig_maps(self.client, **kwargs) +- +- @verbose +- def iscsi_target_node_remove_pg_ig_maps(self, **kwargs): +- rpc.iscsi.iscsi_target_node_remove_pg_ig_maps(self.client, **kwargs) +- +- @verbose +- def iscsi_auth_group_add_secret(self, **kwargs): +- rpc.iscsi.iscsi_auth_group_add_secret(self.client, **kwargs) +- +- @verbose +- def iscsi_auth_group_remove_secret(self, **kwargs): +- rpc.iscsi.iscsi_auth_group_remove_secret(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def iscsi_get_auth_groups(self, **kwargs): +- return rpc.iscsi.iscsi_get_auth_groups(self.client, **kwargs) +- +- @verbose +- def iscsi_create_auth_group(self, **kwargs): +- rpc.iscsi.iscsi_create_auth_group(self.client, **kwargs) +- +- @verbose +- def iscsi_delete_auth_group(self, **kwargs): +- rpc.iscsi.iscsi_delete_auth_group(self.client, **kwargs) +- +- @verbose +- def iscsi_target_node_set_auth(self, **kwargs): +- rpc.iscsi.iscsi_target_node_set_auth(self.client, **kwargs) +- +- @verbose +- def iscsi_target_node_add_lun(self, **kwargs): +- rpc.iscsi.iscsi_target_node_add_lun(self.client, **kwargs) +- +- @verbose +- def iscsi_set_discovery_auth(self, **kwargs): +- rpc.iscsi.iscsi_set_discovery_auth(self.client, **kwargs) +- +- @verbose +- @is_method_available +- def iscsi_get_options(self, **kwargs): +- return rpc.iscsi.iscsi_get_options(self.client, **kwargs) +- +- def has_subsystem(self, subsystem): +- for system in rpc.subsystem.framework_get_subsystems(self.client): +- if subsystem.lower() == system["subsystem"].lower(): +- return True +- return False +- +- +-class Bdev(object): +- def __init__(self, bdev_info): +- """ +- All class attributes are set based on what information is received +- from bdev_get_bdevs RPC call. +- # TODO: Document in docstring parameters which describe bdevs. +- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in list(bdev_info.keys()): +- setattr(self, i, bdev_info[i]) +- +- +-class LvolStore(object): +- def __init__(self, lvs_info): +- """ +- All class attributes are set based on what information is received +- from bdev_get_bdevs RPC call. +- # TODO: Document in docstring parameters which describe bdevs. 
+- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in list(lvs_info.keys()): +- setattr(self, i, lvs_info[i]) +- +- +-class VhostCtrlr(object): +- def __init__(self, ctrlr_info): +- """ +- All class attributes are set based on what information is received +- from vhost_get_controllers RPC call. +- # TODO: Document in docstring parameters which describe bdevs. +- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in list(ctrlr_info.keys()): +- setattr(self, i, ctrlr_info[i]) +- +- +-class NvmfTransport(object): +- def __init__(self, transport_info): +- """ +- All class attributes are set based on what information is received +- from get_nvmf_transport RPC call. +- # TODO: Document in docstring parameters which describe bdevs. +- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in transport_info.keys(): +- setattr(self, i, transport_info[i]) +- +- +-class NvmfSubsystem(object): +- def __init__(self, subsystem_info): +- """ +- All class attributes are set based on what information is received +- from get_nvmf_subsystem RPC call. +- # TODO: Document in docstring parameters which describe bdevs. +- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in subsystem_info.keys(): +- setattr(self, i, subsystem_info[i]) +- +- +-class ScsiObj(object): +- def __init__(self, device_info): +- """ +- All class attributes are set based on what information is received +- from iscsi related RPC calls. +- # TODO: Document in docstring parameters which describe bdevs. +- # TODO: Possible improvement: JSON schema might be used here in future +- """ +- for i in device_info.keys(): +- setattr(self, i, device_info[i]) ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2018 Intel Corporation. ++# All rights reserved. ++ ++from .ui_node import UINode, UIBdevs, UILvolStores, UIVhosts ++from .ui_node_nvmf import UINVMf ++from .ui_node_iscsi import UIISCSI ++from .. import rpc ++from functools import wraps ++ ++ ++class UIRoot(UINode): ++ """ ++ Root node for CLI menu tree structure. Refreshes running config on startup. ++ """ ++ def __init__(self, client, shell): ++ UINode.__init__(self, "/", shell=shell) ++ self.current_bdevs = [] ++ self.current_lvol_stores = [] ++ self.current_vhost_ctrls = [] ++ self.current_nvmf_transports = [] ++ self.current_nvmf_subsystems = [] ++ self.set_rpc_target(client) ++ self.verbose = False ++ self.is_init = self.check_init() ++ self.methods = [] ++ ++ def refresh(self): ++ self.methods = self.rpc_get_methods(current=True) ++ if self.is_init is False: ++ methods = "\n".join(self.methods) ++ self.shell.log.warning("SPDK Application is not yet initialized.\n" ++ "Please initialize subsystems with framework_start_init command.\n" ++ "List of available commands in current state:\n" ++ "%s" % methods) ++ else: ++ # Pass because we'd like to build main tree structure for "ls" ++ # even if state is uninitialized ++ pass ++ ++ self._children = set([]) ++ UIBdevs(self) ++ UILvolStores(self) ++ if self.has_subsystem("vhost_scsi") or self.has_subsystem("vhost_blk"): ++ UIVhosts(self) ++ if self.has_subsystem("nvmf"): ++ UINVMf(self) ++ if self.has_subsystem("iscsi"): ++ UIISCSI(self) ++ ++ def set_rpc_target(self, client): ++ self.client = client ++ ++ def print_array(self, a): ++ return " ".join(a) ++ ++ def verbose(f): ++ # For any configuration calls (create, delete, construct, etc.) 
++ # Check if verbose option is to be used and set appropriately. ++ # Do not use for "get_*" methods so that output is not ++ # flooded. ++ def w(self, **kwargs): ++ self.client.log_set_level("INFO" if self.verbose else "ERROR") ++ r = f(self, **kwargs) ++ self.client.log_set_level("ERROR") ++ return r ++ return w ++ ++ def is_method_available(f): ++ # Check if method f is available for given spdk target ++ def w(self, **kwargs): ++ if f.__name__ in self.methods: ++ r = f(self, **kwargs) ++ return r ++ # If given method is not available return empty list ++ # similar to real get_* like rpc ++ return [] ++ return w ++ ++ def ui_command_framework_start_init(self): ++ if rpc.framework_start_init(self.client): ++ self.is_init = True ++ self.refresh() ++ ++ def ui_command_load_config(self, filename): ++ with open(filename, "r") as fd: ++ rpc.load_config(self.client, fd) ++ ++ def ui_command_load_subsystem_config(self, filename): ++ with open(filename, "r") as fd: ++ rpc.load_subsystem_config(self.client, fd) ++ ++ def ui_command_save_config(self, filename, indent=2): ++ with open(filename, "w") as fd: ++ rpc.save_config(self.client, fd, indent) ++ ++ def ui_command_save_subsystem_config(self, filename, subsystem, indent=2): ++ with open(filename, "w") as fd: ++ rpc.save_subsystem_config(self.client, fd, indent, subsystem) ++ ++ def rpc_get_methods(self, current=False): ++ return rpc.rpc_get_methods(self.client, current=current) ++ ++ def check_init(self): ++ return "framework_start_init" not in self.rpc_get_methods(current=True) ++ ++ def bdev_get_bdevs(self, bdev_type): ++ if self.is_init: ++ self.current_bdevs = rpc.bdev.bdev_get_bdevs(self.client) ++ # Following replace needs to be done in order for some of the bdev ++ # listings to work: logical volumes, split disk. 
++ # For example logical volumes: listing in menu is "Logical_Volume" ++ # (cannot have space), but the product name in SPDK is "Logical Volume" ++ bdev_type = bdev_type.replace("_", " ") ++ for bdev in [x for x in self.current_bdevs if bdev_type in x["product_name"].lower()]: ++ test = Bdev(bdev) ++ yield test ++ ++ def bdev_get_iostat(self, **kwargs): ++ return rpc.bdev.bdev_get_iostat(self.client, **kwargs) ++ ++ @verbose ++ def bdev_split_create(self, **kwargs): ++ response = rpc.bdev.bdev_split_create(self.client, **kwargs) ++ return self.print_array(response) ++ ++ @verbose ++ def bdev_split_delete(self, **kwargs): ++ rpc.bdev.bdev_split_delete(self.client, **kwargs) ++ ++ @verbose ++ def create_malloc_bdev(self, **kwargs): ++ response = rpc.bdev.bdev_malloc_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_malloc_delete(self, **kwargs): ++ rpc.bdev.bdev_malloc_delete(self.client, **kwargs) ++ ++ @verbose ++ def create_iscsi_bdev(self, **kwargs): ++ response = rpc.bdev.bdev_iscsi_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_iscsi_delete(self, **kwargs): ++ rpc.bdev.bdev_iscsi_delete(self.client, **kwargs) ++ ++ @verbose ++ def bdev_aio_create(self, **kwargs): ++ response = rpc.bdev.bdev_aio_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_aio_delete(self, **kwargs): ++ rpc.bdev.bdev_aio_delete(self.client, **kwargs) ++ ++ @verbose ++ def create_lvol_bdev(self, **kwargs): ++ response = rpc.lvol.bdev_lvol_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_lvol_delete(self, **kwargs): ++ response = rpc.lvol.bdev_lvol_delete(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def create_nvme_bdev(self, **kwargs): ++ response = rpc.bdev.bdev_nvme_attach_controller(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_nvme_detach_controller(self, **kwargs): ++ rpc.bdev.bdev_nvme_detach_controller(self.client, **kwargs) ++ ++ @verbose ++ def bdev_null_create(self, **kwargs): ++ response = rpc.bdev.bdev_null_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_null_delete(self, **kwargs): ++ rpc.bdev.bdev_null_delete(self.client, **kwargs) ++ ++ @verbose ++ def create_error_bdev(self, **kwargs): ++ response = rpc.bdev.bdev_error_create(self.client, **kwargs) ++ ++ @verbose ++ def bdev_error_delete(self, **kwargs): ++ rpc.bdev.bdev_error_delete(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def bdev_lvol_get_lvstores(self): ++ if self.is_init: ++ self.current_lvol_stores = rpc.lvol.bdev_lvol_get_lvstores(self.client) ++ for lvs in self.current_lvol_stores: ++ yield LvolStore(lvs) ++ ++ @verbose ++ def bdev_lvol_create_lvstore(self, **kwargs): ++ response = rpc.lvol.bdev_lvol_create_lvstore(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_lvol_delete_lvstore(self, **kwargs): ++ rpc.lvol.bdev_lvol_delete_lvstore(self.client, **kwargs) ++ ++ @verbose ++ def bdev_pmem_create_pool(self, **kwargs): ++ response = rpc.pmem.bdev_pmem_create_pool(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_pmem_delete_pool(self, **kwargs): ++ rpc.pmem.bdev_pmem_delete_pool(self.client, **kwargs) ++ ++ @verbose ++ def bdev_pmem_get_pool_info(self, **kwargs): ++ response = rpc.pmem.bdev_pmem_get_pool_info(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_pmem_create(self, **kwargs): ++ response = rpc.bdev.bdev_pmem_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def 
bdev_pmem_delete(self, **kwargs): ++ response = rpc.bdev.bdev_pmem_delete(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def create_rbd_bdev(self, **kwargs): ++ response = rpc.bdev.bdev_rbd_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_rbd_delete(self, **kwargs): ++ response = rpc.bdev.bdev_rbd_delete(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def create_virtio_dev(self, **kwargs): ++ response = rpc.vhost.bdev_virtio_attach_controller(self.client, **kwargs) ++ return self.print_array(response) ++ ++ @verbose ++ def bdev_virtio_detach_controller(self, **kwargs): ++ response = rpc.vhost.bdev_virtio_detach_controller(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_raid_create(self, **kwargs): ++ rpc.bdev.bdev_raid_create(self.client, **kwargs) ++ ++ @verbose ++ def bdev_raid_delete(self, **kwargs): ++ rpc.bdev.bdev_raid_delete(self.client, **kwargs) ++ ++ @verbose ++ def bdev_uring_create(self, **kwargs): ++ response = rpc.bdev.bdev_uring_create(self.client, **kwargs) ++ return response ++ ++ @verbose ++ def bdev_uring_delete(self, **kwargs): ++ rpc.bdev.bdev_uring_delete(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def bdev_virtio_scsi_get_devices(self): ++ if self.is_init: ++ for bdev in rpc.vhost.bdev_virtio_scsi_get_devices(self.client): ++ test = Bdev(bdev) ++ yield test ++ ++ def list_vhost_ctrls(self): ++ if self.is_init: ++ self.current_vhost_ctrls = rpc.vhost.vhost_get_controllers(self.client) ++ ++ @verbose ++ @is_method_available ++ def vhost_get_controllers(self, ctrlr_type): ++ if self.is_init: ++ self.list_vhost_ctrls() ++ for ctrlr in [x for x in self.current_vhost_ctrls if ctrlr_type in list(x["backend_specific"].keys())]: ++ yield VhostCtrlr(ctrlr) ++ ++ @verbose ++ def vhost_delete_controller(self, **kwargs): ++ rpc.vhost.vhost_delete_controller(self.client, **kwargs) ++ ++ @verbose ++ def vhost_create_scsi_controller(self, **kwargs): ++ rpc.vhost.vhost_create_scsi_controller(self.client, **kwargs) ++ ++ @verbose ++ def vhost_create_blk_controller(self, **kwargs): ++ rpc.vhost.vhost_create_blk_controller(self.client, **kwargs) ++ ++ @verbose ++ def vhost_scsi_controller_remove_target(self, **kwargs): ++ rpc.vhost.vhost_scsi_controller_remove_target(self.client, **kwargs) ++ ++ @verbose ++ def vhost_scsi_controller_add_target(self, **kwargs): ++ rpc.vhost.vhost_scsi_controller_add_target(self.client, **kwargs) ++ ++ def vhost_controller_set_coalescing(self, **kwargs): ++ rpc.vhost.vhost_controller_set_coalescing(self.client, **kwargs) ++ ++ @verbose ++ def create_nvmf_transport(self, **kwargs): ++ rpc.nvmf.nvmf_create_transport(self.client, **kwargs) ++ ++ def list_nvmf_transports(self): ++ if self.is_init: ++ self.current_nvmf_transports = rpc.nvmf.nvmf_get_transports(self.client) ++ ++ @verbose ++ @is_method_available ++ def nvmf_get_transports(self): ++ if self.is_init: ++ self.list_nvmf_transports() ++ for transport in self.current_nvmf_transports: ++ yield NvmfTransport(transport) ++ ++ def list_nvmf_subsystems(self): ++ if self.is_init: ++ self.current_nvmf_subsystems = rpc.nvmf.nvmf_get_subsystems(self.client) ++ ++ @verbose ++ @is_method_available ++ def nvmf_get_subsystems(self): ++ if self.is_init: ++ self.list_nvmf_subsystems() ++ for subsystem in self.current_nvmf_subsystems: ++ yield NvmfSubsystem(subsystem) ++ ++ @verbose ++ def create_nvmf_subsystem(self, **kwargs): ++ rpc.nvmf.nvmf_create_subsystem(self.client, **kwargs) ++ ++ @verbose ++ def 
nvmf_delete_subsystem(self, **kwargs): ++ rpc.nvmf.nvmf_delete_subsystem(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_add_listener(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_add_listener(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_remove_listener(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_remove_listener(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_add_host(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_add_host(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_remove_host(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_remove_host(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_allow_any_host(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_allow_any_host(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_add_ns(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_add_ns(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_remove_ns(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_remove_ns(self.client, **kwargs) ++ ++ @verbose ++ def nvmf_subsystem_allow_any_host(self, **kwargs): ++ rpc.nvmf.nvmf_subsystem_allow_any_host(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def scsi_get_devices(self): ++ if self.is_init: ++ for device in rpc.iscsi.scsi_get_devices(self.client): ++ yield ScsiObj(device) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_target_nodes(self): ++ if self.is_init: ++ for tg in rpc.iscsi.iscsi_get_target_nodes(self.client): ++ yield tg ++ ++ @verbose ++ def iscsi_create_target_node(self, **kwargs): ++ rpc.iscsi.iscsi_create_target_node(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_delete_target_node(self, **kwargs): ++ rpc.iscsi.iscsi_delete_target_node(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_portal_groups(self): ++ if self.is_init: ++ for pg in rpc.iscsi.iscsi_get_portal_groups(self.client): ++ yield ScsiObj(pg) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_initiator_groups(self): ++ if self.is_init: ++ for ig in rpc.iscsi.iscsi_get_initiator_groups(self.client): ++ yield ScsiObj(ig) ++ ++ @verbose ++ def construct_portal_group(self, **kwargs): ++ rpc.iscsi.iscsi_create_portal_group(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_delete_portal_group(self, **kwargs): ++ rpc.iscsi.iscsi_delete_portal_group(self.client, **kwargs) ++ ++ @verbose ++ def construct_initiator_group(self, **kwargs): ++ rpc.iscsi.iscsi_create_initiator_group(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_delete_initiator_group(self, **kwargs): ++ rpc.iscsi.iscsi_delete_initiator_group(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_connections(self, **kwargs): ++ if self.is_init: ++ for ic in rpc.iscsi.iscsi_get_connections(self.client, **kwargs): ++ yield ic ++ ++ @verbose ++ def iscsi_initiator_group_add_initiators(self, **kwargs): ++ rpc.iscsi.iscsi_initiator_group_add_initiators(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_initiator_group_remove_initiators(self, **kwargs): ++ rpc.iscsi.iscsi_initiator_group_remove_initiators(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_target_node_add_pg_ig_maps(self, **kwargs): ++ rpc.iscsi.iscsi_target_node_add_pg_ig_maps(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_target_node_remove_pg_ig_maps(self, **kwargs): ++ rpc.iscsi.iscsi_target_node_remove_pg_ig_maps(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_auth_group_add_secret(self, **kwargs): ++ rpc.iscsi.iscsi_auth_group_add_secret(self.client, **kwargs) ++ ++ @verbose ++ def 
iscsi_auth_group_remove_secret(self, **kwargs): ++ rpc.iscsi.iscsi_auth_group_remove_secret(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_auth_groups(self, **kwargs): ++ return rpc.iscsi.iscsi_get_auth_groups(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_create_auth_group(self, **kwargs): ++ rpc.iscsi.iscsi_create_auth_group(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_delete_auth_group(self, **kwargs): ++ rpc.iscsi.iscsi_delete_auth_group(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_target_node_set_auth(self, **kwargs): ++ rpc.iscsi.iscsi_target_node_set_auth(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_target_node_add_lun(self, **kwargs): ++ rpc.iscsi.iscsi_target_node_add_lun(self.client, **kwargs) ++ ++ @verbose ++ def iscsi_set_discovery_auth(self, **kwargs): ++ rpc.iscsi.iscsi_set_discovery_auth(self.client, **kwargs) ++ ++ @verbose ++ @is_method_available ++ def iscsi_get_options(self, **kwargs): ++ return rpc.iscsi.iscsi_get_options(self.client, **kwargs) ++ ++ def has_subsystem(self, subsystem): ++ for system in rpc.subsystem.framework_get_subsystems(self.client): ++ if subsystem.lower() == system["subsystem"].lower(): ++ return True ++ return False ++ ++ ++class Bdev(object): ++ def __init__(self, bdev_info): ++ """ ++ All class attributes are set based on what information is received ++ from bdev_get_bdevs RPC call. ++ # TODO: Document in docstring parameters which describe bdevs. ++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in list(bdev_info.keys()): ++ setattr(self, i, bdev_info[i]) ++ ++ ++class LvolStore(object): ++ def __init__(self, lvs_info): ++ """ ++ All class attributes are set based on what information is received ++ from bdev_get_bdevs RPC call. ++ # TODO: Document in docstring parameters which describe bdevs. ++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in list(lvs_info.keys()): ++ setattr(self, i, lvs_info[i]) ++ ++ ++class VhostCtrlr(object): ++ def __init__(self, ctrlr_info): ++ """ ++ All class attributes are set based on what information is received ++ from vhost_get_controllers RPC call. ++ # TODO: Document in docstring parameters which describe bdevs. ++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in list(ctrlr_info.keys()): ++ setattr(self, i, ctrlr_info[i]) ++ ++ ++class NvmfTransport(object): ++ def __init__(self, transport_info): ++ """ ++ All class attributes are set based on what information is received ++ from get_nvmf_transport RPC call. ++ # TODO: Document in docstring parameters which describe bdevs. ++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in transport_info.keys(): ++ setattr(self, i, transport_info[i]) ++ ++ ++class NvmfSubsystem(object): ++ def __init__(self, subsystem_info): ++ """ ++ All class attributes are set based on what information is received ++ from get_nvmf_subsystem RPC call. ++ # TODO: Document in docstring parameters which describe bdevs. ++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in subsystem_info.keys(): ++ setattr(self, i, subsystem_info[i]) ++ ++ ++class ScsiObj(object): ++ def __init__(self, device_info): ++ """ ++ All class attributes are set based on what information is received ++ from iscsi related RPC calls. ++ # TODO: Document in docstring parameters which describe bdevs. 
++ # TODO: Possible improvement: JSON schema might be used here in future ++ """ ++ for i in device_info.keys(): ++ setattr(self, i, device_info[i]) +diff --git a/rpmbuild/rpm-deps.sh b/rpmbuild/rpm-deps.sh +index de7db95..bfa949e 100644 +--- a/rpmbuild/rpm-deps.sh ++++ b/rpmbuild/rpm-deps.sh +@@ -1,62 +1,62 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. +-# +-# This script simply iterates over all libs SPDK binaries link +-# to and returns a list of .rpm packages SPDK may depend on. At +-# the end, the list strictly relates to how the SPDK build was +-# ./configure'd. +- +-shopt -s nullglob +- +-rpmdir=$(readlink -f "$(dirname "$0")") +-rootdir=$(readlink -f "$rpmdir/../") +-rc=0 +- +-bins=("$rootdir/"build/{bin,examples}/*) +-(($#)) && bins=("$@") +- +-((${#bins[@]} > 0)) || exit 0 +- +-source /etc/os-release +- +-id_ok=no +- +-for id in $ID $ID_LIKE; do +- [[ "$id" =~ ^(fedora|centos|rhel) ]] && id_ok=yes +-done +- +-if [[ "$id_ok" != "yes" ]]; then +- exit 0 +-fi +- +-declare -A deps=() +-for bin in "${bins[@]}"; do +- if ! type -P "$bin"; then +- printf '%s is missing\n' "$bin" >&2 +- rc=1 +- continue +- fi +- while read -r name _ lib _; do +- [[ -n $lib ]] || continue +- [[ -z ${deps["$lib"]} ]] || continue +- if [[ ! -e $lib ]]; then +- lib=$name pkg="missing" +- rc=1 +- elif ! pkg=$(rpm -qf "$lib"); then +- pkg=${lib##*/} +- fi +- deps["$lib"]=$pkg +- done < <(LD_TRACE_LOADED_OBJECTS=1 "$bin") +-done +- +-if [[ -n $LIST_LIBS ]]; then +- for lib in "${!deps[@]}"; do +- echo "$lib:${deps["$lib"]}" +- done +-else +- printf '%s\n' "${deps[@]}" +-fi | sort -u +- +-((rc == 0)) ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation ++# All rights reserved. ++# ++# This script simply iterates over all libs SPDK binaries link ++# to and returns a list of .rpm packages SPDK may depend on. At ++# the end, the list strictly relates to how the SPDK build was ++# ./configure'd. ++ ++shopt -s nullglob ++ ++rpmdir=$(readlink -f "$(dirname "$0")") ++rootdir=$(readlink -f "$rpmdir/../") ++rc=0 ++ ++bins=("$rootdir/"build/{bin,examples}/*) ++(($#)) && bins=("$@") ++ ++((${#bins[@]} > 0)) || exit 0 ++ ++source /etc/os-release ++ ++id_ok=no ++ ++for id in $ID $ID_LIKE; do ++ [[ "$id" =~ ^(fedora|centos|rhel) ]] && id_ok=yes ++done ++ ++if [[ "$id_ok" != "yes" ]]; then ++ exit 0 ++fi ++ ++declare -A deps=() ++for bin in "${bins[@]}"; do ++ if ! type -P "$bin"; then ++ printf '%s is missing\n' "$bin" >&2 ++ rc=1 ++ continue ++ fi ++ while read -r name _ lib _; do ++ [[ -n $lib ]] || continue ++ [[ -z ${deps["$lib"]} ]] || continue ++ if [[ ! -e $lib ]]; then ++ lib=$name pkg="missing" ++ rc=1 ++ elif ! pkg=$(rpm -qf "$lib"); then ++ pkg=${lib##*/} ++ fi ++ deps["$lib"]=$pkg ++ done < <(LD_TRACE_LOADED_OBJECTS=1 "$bin") ++done ++ ++if [[ -n $LIST_LIBS ]]; then ++ for lib in "${!deps[@]}"; do ++ echo "$lib:${deps["$lib"]}" ++ done ++else ++ printf '%s\n' "${deps[@]}" ++fi | sort -u ++ ++((rc == 0)) +diff --git a/rpmbuild/rpm.sh b/rpmbuild/rpm.sh +index 667422c..dbf935f 100644 +--- a/rpmbuild/rpm.sh ++++ b/rpmbuild/rpm.sh +@@ -1,208 +1,208 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. 
+-# +- +-set -e +- +-specdir=$(readlink -f "$(dirname "$0")") +-rootdir=$(readlink -f "$specdir/../") +- +-[[ -e /etc/os-release ]] +-source /etc/os-release +- +-id_ok=no +- +-for id in $ID $ID_LIKE; do +- [[ "$id" =~ ^(fedora|centos|rhel) ]] && id_ok=yes +-done +- +-if [[ "$id_ok" != "yes" ]]; then +- printf '%s not supported\n' "$ID" >&2 +- exit 1 +-fi +- +-get_config() { +- # Intercept part of the ./configure's cmdline we are interested in +- configure_opts=($(getopt -l "$1::" -o "" -- $configure 2> /dev/null)) || true +- # If "--" is the first argument then either the cmdline is empty or doesn't +- # match on what we are looking for. In either case simply return as there +- # is nothing to check. +- [[ ${configure_opts[0]} == "--" ]] && return 1 +- +- if [[ $2 == has-arg ]]; then +- [[ -n ${configure_opts[1]} && ${configure_opts[1]} != "''" ]] +- elif [[ $2 == print ]]; then +- echo "${configure_opts[1]//\'/}" +- fi +-} +- +-fedora_python_sys_path_workaround() { +- [[ -z $NO_WORKAROUND ]] || return 0 +- +- # Fedora builds its python version with a patch which attempts to remove all +- # "/usr/local" paths from sys.path in case it's run under RPM environment, +- # i.e., when RPM_BUILD_ROOT variable is detected. This particular variable +- # is set by the rpmbuild when it executes its sh wrappers built out of the +- # .spec file. +- +- # This is problematic in case meson and ninja were installed via rooted pip +- # which had its working directory set to /usr/local. As a result, when the +- # SPDK executes meson to build DPDK, from rpmbuild env, it fails as +- # it's not able to find its mesonbuild module. +- +- # To workaround this little hiccup we fetch the entire sys.path list and +- # then export it via PYTHONPATH so when rpmbuild kicks in, python will be +- # able to find all the modules regardless if the RPM_BUILD_ROOT is set or +- # not. +- # FIXME: The alternative is to unset RPM_BUILD_ROOT directly in the spec? +- # It does work but it feels wrong. +- +- PYTHONPATH="$(python3 -c "import sys; print('%s' % ':'.join(sys.path)[1:])")" +- export PYTHONPATH +-} +- +-get_version() { +- local version +- version=$(git -C "$rootdir" describe --tags --abbrev=0) +- +- echo "${version%%-*}" +-} +- +-build_macros() { +- local -g macros=() +- local dir _dir +- +- macros+=(-D "configure ${configure:-"%{nil}"}") +- macros+=(-D "make $make") +- macros+=(-D "release $release") +- macros+=(-D "version $version") +- +- # Adjust dir macros to update the final location of the RPMS +- for dir in build buildroot rpm source spec srcrpm; do +- _dir=$(rpm --eval "%{_${dir}dir}") +- if [[ -z $USE_DEFAULT_DIRS ]]; then +- macros+=(-D "_${dir}dir $rpmbuild_dir/$dir") +- _dir=$rpmbuild_dir/$dir +- fi +- local -g "_${dir}dir=$_dir" +- done +- +- if get_config with-shared; then +- macros+=(-D "shared 1") +- macros+=(-D "dpdk 1") +- fi +- +- if get_config with-dpdk; then +- if ! get_config with-dpdk has-arg; then +- # spdk is requested to build against installed dpdk (i.e. provided by the dist). +- # Don't build dpdk rpm rather define proper requirements for the spdk. +- macros+=(-D "dpdk 0") +- macros+=(-D "shared 1") +- # This maps how Epoch is used inside dpdk packages across different distros. It's +- # mainly relevant when comparing version of required packages. Default maps to 0. 
+- local -A dpdk_rpm_epoch["fedora"]=2 +- local dpdk_version_min=${dpdk_rpm_epoch["$ID"]:-0}:20.11 +- local dpdk_req="dpdk-devel >= $dpdk_version_min" +- +- requirements=${requirements:+$requirements, }"$dpdk_req" +- build_requirements=${build_requirements:+$build_requirements, }"$dpdk_req" +- else +- dpdk_build_path=$(get_config with-dpdk print) +- dpdk_path=$(dirname "$dpdk_build_path") +- macros+=(-D "dpdk_build_path $dpdk_build_path") +- macros+=(-D "dpdk_path $dpdk_path") +- fi +- fi +- +- if get_config with-rbd; then +- macros+=(-D "rbd 1") +- requirements=${requirements:+$requirements, }"librados2, librbd1" +- build_requirements=${build_requirements:+$build_requirements, }"librados-devel, librbd-devel" +- fi +- +- if get_config libdir has-arg; then +- macros+=(-D "libdir $(get_config libdir print)") +- fi +- +- if get_config with-vfio-user; then +- macros+=(-D "vfio_user 1") +- fi +- +- if [[ $deps == no ]]; then +- macros+=(-D "deps 0") +- fi +- +- if [[ -n $requirements ]]; then +- macros+=(-D "requirements 1") +- macros+=(-D "requirements_list $requirements") +- fi +- +- if [[ -n $build_requirements ]]; then +- macros+=(-D "build_requirements 1") +- macros+=(-D "build_requirements_list $build_requirements") +- fi +- +- build_macros_flags +-} +- +-build_macros_flags() { +- local flags flag +- +- flags=(CFLAGS CXXFLAGS LDFLAGS) +- +- for flag in "${flags[@]}"; do +- # If we are running in the environment where the flag is set, don't touch it - +- # rpmbuild will use it as is during the build. If it's not set, make sure the +- # rpmbuild won't set its defaults which may affect the build in an unpredictable +- # manner. +- [[ -n ${!flag} ]] && continue +- macros+=(-D "build_${flag,,} %{nil}") +- done +-} +- +-gen_spec() { +- rpmspec "${macros[@]}" -P "$spec" +-} +- +-build_rpm() ( +- fedora_python_sys_path_workaround +- +- mkdir -p \ +- "$_builddir" \ +- "$_buildrootdir" \ +- "$_rpmdir" \ +- "$_sourcedir" \ +- "$_specdir" \ +- "$_srcrpmdir" +- +- # Despite building in-place, rpmbuild still looks under %{_sourcedir} as defined +- # in Source:. Create a dummy file to fulfil its needs and to keep Source in +- # the .spec. +- : > "$_sourcedir/spdk-$version.tar.gz" +- +- cd "$rootdir" +- +- printf '* Starting rpmbuild...\n' +- rpmbuild --clean --nodebuginfo "${macros[@]}" --build-in-place -ba "$spec" +-) +- +-# .spec defaults +-configure=$* +-deps=${DEPS:-yes} +-make="${MAKEFLAGS:--j $(nproc)}" +-release=${RPM_RELEASE:-1} +-requirements=${REQUIREMENTS:-} +-build_requirements=${BUILD_REQUIREMENTS:-} +-version=${SPDK_VERSION:-$(get_version)} +- +-rpmbuild_dir=${BUILDDIR:-"$HOME/rpmbuild"} +-spec=$specdir/spdk.spec +- +-build_macros +-if [[ -n $GEN_SPEC ]]; then +- gen_spec +- exit 0 +-fi +-build_rpm ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation ++# All rights reserved. ++# ++ ++set -e ++ ++specdir=$(readlink -f "$(dirname "$0")") ++rootdir=$(readlink -f "$specdir/../") ++ ++[[ -e /etc/os-release ]] ++source /etc/os-release ++ ++id_ok=no ++ ++for id in $ID $ID_LIKE; do ++ [[ "$id" =~ ^(fedora|centos|rhel) ]] && id_ok=yes ++done ++ ++if [[ "$id_ok" != "yes" ]]; then ++ printf '%s not supported\n' "$ID" >&2 ++ exit 1 ++fi ++ ++get_config() { ++ # Intercept part of the ./configure's cmdline we are interested in ++ configure_opts=($(getopt -l "$1::" -o "" -- $configure 2> /dev/null)) || true ++ # If "--" is the first argument then either the cmdline is empty or doesn't ++ # match on what we are looking for. 
In either case simply return as there ++ # is nothing to check. ++ [[ ${configure_opts[0]} == "--" ]] && return 1 ++ ++ if [[ $2 == has-arg ]]; then ++ [[ -n ${configure_opts[1]} && ${configure_opts[1]} != "''" ]] ++ elif [[ $2 == print ]]; then ++ echo "${configure_opts[1]//\'/}" ++ fi ++} ++ ++fedora_python_sys_path_workaround() { ++ [[ -z $NO_WORKAROUND ]] || return 0 ++ ++ # Fedora builds its python version with a patch which attempts to remove all ++ # "/usr/local" paths from sys.path in case it's run under RPM environment, ++ # i.e., when RPM_BUILD_ROOT variable is detected. This particular variable ++ # is set by the rpmbuild when it executes its sh wrappers built out of the ++ # .spec file. ++ ++ # This is problematic in case meson and ninja were installed via rooted pip ++ # which had its working directory set to /usr/local. As a result, when the ++ # SPDK executes meson to build DPDK, from rpmbuild env, it fails as ++ # it's not able to find its mesonbuild module. ++ ++ # To workaround this little hiccup we fetch the entire sys.path list and ++ # then export it via PYTHONPATH so when rpmbuild kicks in, python will be ++ # able to find all the modules regardless if the RPM_BUILD_ROOT is set or ++ # not. ++ # FIXME: The alternative is to unset RPM_BUILD_ROOT directly in the spec? ++ # It does work but it feels wrong. ++ ++ PYTHONPATH="$(python3 -c "import sys; print('%s' % ':'.join(sys.path)[1:])")" ++ export PYTHONPATH ++} ++ ++get_version() { ++ local version ++ version=$(git -C "$rootdir" describe --tags --abbrev=0) ++ ++ echo "${version%%-*}" ++} ++ ++build_macros() { ++ local -g macros=() ++ local dir _dir ++ ++ macros+=(-D "configure ${configure:-"%{nil}"}") ++ macros+=(-D "make $make") ++ macros+=(-D "release $release") ++ macros+=(-D "version $version") ++ ++ # Adjust dir macros to update the final location of the RPMS ++ for dir in build buildroot rpm source spec srcrpm; do ++ _dir=$(rpm --eval "%{_${dir}dir}") ++ if [[ -z $USE_DEFAULT_DIRS ]]; then ++ macros+=(-D "_${dir}dir $rpmbuild_dir/$dir") ++ _dir=$rpmbuild_dir/$dir ++ fi ++ local -g "_${dir}dir=$_dir" ++ done ++ ++ if get_config with-shared; then ++ macros+=(-D "shared 1") ++ macros+=(-D "dpdk 1") ++ fi ++ ++ if get_config with-dpdk; then ++ if ! get_config with-dpdk has-arg; then ++ # spdk is requested to build against installed dpdk (i.e. provided by the dist). ++ # Don't build dpdk rpm rather define proper requirements for the spdk. ++ macros+=(-D "dpdk 0") ++ macros+=(-D "shared 1") ++ # This maps how Epoch is used inside dpdk packages across different distros. It's ++ # mainly relevant when comparing version of required packages. Default maps to 0. 
++ local -A dpdk_rpm_epoch["fedora"]=2 ++ local dpdk_version_min=${dpdk_rpm_epoch["$ID"]:-0}:20.11 ++ local dpdk_req="dpdk-devel >= $dpdk_version_min" ++ ++ requirements=${requirements:+$requirements, }"$dpdk_req" ++ build_requirements=${build_requirements:+$build_requirements, }"$dpdk_req" ++ else ++ dpdk_build_path=$(get_config with-dpdk print) ++ dpdk_path=$(dirname "$dpdk_build_path") ++ macros+=(-D "dpdk_build_path $dpdk_build_path") ++ macros+=(-D "dpdk_path $dpdk_path") ++ fi ++ fi ++ ++ if get_config with-rbd; then ++ macros+=(-D "rbd 1") ++ requirements=${requirements:+$requirements, }"librados2, librbd1" ++ build_requirements=${build_requirements:+$build_requirements, }"librados-devel, librbd-devel" ++ fi ++ ++ if get_config libdir has-arg; then ++ macros+=(-D "libdir $(get_config libdir print)") ++ fi ++ ++ if get_config with-vfio-user; then ++ macros+=(-D "vfio_user 1") ++ fi ++ ++ if [[ $deps == no ]]; then ++ macros+=(-D "deps 0") ++ fi ++ ++ if [[ -n $requirements ]]; then ++ macros+=(-D "requirements 1") ++ macros+=(-D "requirements_list $requirements") ++ fi ++ ++ if [[ -n $build_requirements ]]; then ++ macros+=(-D "build_requirements 1") ++ macros+=(-D "build_requirements_list $build_requirements") ++ fi ++ ++ build_macros_flags ++} ++ ++build_macros_flags() { ++ local flags flag ++ ++ flags=(CFLAGS CXXFLAGS LDFLAGS) ++ ++ for flag in "${flags[@]}"; do ++ # If we are running in the environment where the flag is set, don't touch it - ++ # rpmbuild will use it as is during the build. If it's not set, make sure the ++ # rpmbuild won't set its defaults which may affect the build in an unpredictable ++ # manner. ++ [[ -n ${!flag} ]] && continue ++ macros+=(-D "build_${flag,,} %{nil}") ++ done ++} ++ ++gen_spec() { ++ rpmspec "${macros[@]}" -P "$spec" ++} ++ ++build_rpm() ( ++ fedora_python_sys_path_workaround ++ ++ mkdir -p \ ++ "$_builddir" \ ++ "$_buildrootdir" \ ++ "$_rpmdir" \ ++ "$_sourcedir" \ ++ "$_specdir" \ ++ "$_srcrpmdir" ++ ++ # Despite building in-place, rpmbuild still looks under %{_sourcedir} as defined ++ # in Source:. Create a dummy file to fulfil its needs and to keep Source in ++ # the .spec. 
++ : > "$_sourcedir/spdk-$version.tar.gz" ++ ++ cd "$rootdir" ++ ++ printf '* Starting rpmbuild...\n' ++ rpmbuild --clean --nodebuginfo "${macros[@]}" --build-in-place -ba "$spec" ++) ++ ++# .spec defaults ++configure=$* ++deps=${DEPS:-yes} ++make="${MAKEFLAGS:--j $(nproc)}" ++release=${RPM_RELEASE:-1} ++requirements=${REQUIREMENTS:-} ++build_requirements=${BUILD_REQUIREMENTS:-} ++version=${SPDK_VERSION:-$(get_version)} ++ ++rpmbuild_dir=${BUILDDIR:-"$HOME/rpmbuild"} ++spec=$specdir/spdk.spec ++ ++build_macros ++if [[ -n $GEN_SPEC ]]; then ++ gen_spec ++ exit 0 ++fi ++build_rpm +diff --git a/rpmbuild/spdk.spec b/rpmbuild/spdk.spec +index 107de82..2d4f942 100644 +--- a/rpmbuild/spdk.spec ++++ b/rpmbuild/spdk.spec +@@ -1,206 +1,206 @@ +-# Global macros +-%define debug_package %{nil} +- +-%{!?deps:%define deps 1} +-%{!?dpdk:%define dpdk 0} +-%{!?dpdk_build_path:%define dpdk_build_path "dpdk/build"} +-%{!?dpdk_path:%define dpdk_path "dpdk"} +-%{!?requirements:%define requirements 0} +-%{!?build_requirements:%define build_requirements 0} +-%{!?shared:%define shared 0} +-%{!?rbd:%define rbd 0} +-%{!?libdir:%define libdir /usr/local/lib} +-%{!?vfio_user:%define vfio_user 0} +- +-# Spec metadata +-Name: spdk +-Version: %{version} +-Release: %{release} +-Summary: Storage Performance Development Kit +- +-# This is a minimal set of requirements needed for SPDK apps to run when built with +-# default configuration. These are also predetermined by rpmbuild. Extra requirements +-# can be defined through a comma-separated list passed via $requirements when building +-# the spec. +-Requires: glibc +-Requires: libaio +-Requires: libgcc +-Requires: libstdc++ +-Requires: libuuid +-Requires: ncurses-libs +-Requires: numactl-libs +-Requires: openssl-libs +-Requires: zlib +- +-%if %{requirements} +-Requires: %(echo "%{requirements_list}") +-%endif +- +-BuildRequires: python3-devel +- +-%if %{build_requirements} +-BuildRequires: %(echo "%{build_requirements_list}") +-%endif +- +-License: BSD +-URL: https://spdk.io +-Source: spdk-%{version}.tar.gz +- +-%description +- +-The Storage Performance Development Kit (SPDK) provides a set of tools and libraries for +-writing high performance, scalable, user-mode storage applications. It achieves high +-performance by moving all of the necessary drivers into userspace and operating in a +-polled mode instead of relying on interrupts, which avoids kernel context switches and +-eliminates interrupt handling overhead. +- +-%prep +-make clean %{make} &>/dev/null || : +-%setup +- +-%build +-set +x +- +-cfs() { +- (($# > 1)) || return 0 +- +- local dst=$1 f +- +- mkdir -p "$dst" +- shift; for f; do [[ -e $f ]] && cp -a "$f" "$dst"; done +-} +- +-cl() { +- [[ -e $2 ]] || return 0 +- +- cfs "$1" $(find "$2" -name '*.so*' -type f -o -type l | grep -v .symbols) +-} +- +-%if %{deps} +-_PKGDEP_OPTS="--docs --pmem --rdma --uring" +-%if %{rbd} +-_PKGDEP_OPTS="$_PKGDEP_OPTS --rbd" +-%endif +-./scripts/pkgdep.sh $_PKGDEP_OPTS +-%endif +- +-# Rely mainly on CONFIG +-./configure --disable-unit-tests --disable-tests %{configure} +-make %{make} +-make DESTDIR=%{buildroot} install %{make} +- +-# Include DPDK libs in case --with-shared is in use. 
+-%if %{dpdk} +-cfs %{buildroot}/usr/local/lib/dpdk %{dpdk_build_path}/lib/* +-# Special case for SPDK_RUN_EXTERNAL_DPDK setup +-cl %{buildroot}/usr/local/lib/dpdk %{dpdk_path}/intel-ipsec-mb/ +-cl %{buildroot}/usr/local/lib/dpdk %{dpdk_path}/isa-l/ +-%endif +- +-# Include libvfio-user libs in case --with-vfio-user is in use together with --with-shared +-%if %{vfio_user} && %{shared} +-cl %{buildroot}/usr/local/lib/libvfio-user build/libvfio-user/ +-%endif +-# Try to include extra binaries that were potentially built +-cfs %{buildroot}/usr/local/bin build/fio +- +-# And some useful setup scripts SPDK uses +-mkdir -p %{buildroot}/usr/libexec/spdk +-mkdir -p %{buildroot}/etc/bash_completion.d +-mkdir -p %{buildroot}/etc/profile.d +-mkdir -p %{buildroot}/etc/ld.so.conf.d +-mkdir -p %{buildroot}%{python3_sitelib} +- +-cat <<-EOF > %{buildroot}/etc/ld.so.conf.d/spdk.conf +-%{libdir} +-/usr/local/lib/dpdk +-/usr/local/lib/libvfio-user +-EOF +- +-cat <<-'EOF' > %{buildroot}/etc/profile.d/spdk_path.sh +-PATH=$PATH:/usr/libexec/spdk/scripts +-PATH=$PATH:/usr/libexec/spdk/scripts/vagrant +-PATH=$PATH:/usr/libexec/spdk/test/common/config +-export PATH +-EOF +- +-cfs %{buildroot}/usr/libexec/spdk scripts +-cfs %{buildroot}%{python3_sitelib} python/spdk +-ln -s /usr/libexec/spdk/scripts/bash-completion/spdk %{buildroot}/etc/bash_completion.d/ +- +-# We need to take into the account the fact that most of the scripts depend on being +-# run directly from the repo. To workaround it, create common root space under dir +-# like /usr/libexec/spdk and link all potential relative paths the script may try +-# to reference. +- +-# setup.sh uses pci_ids.h +-ln -s /usr/local/include %{buildroot}/usr/libexec/spdk +- +-%files +-/etc/profile.d/* +-/etc/bash_completion.d/* +-/usr/libexec/spdk/* +-/usr/local/bin/* +-%{python3_sitelib}/spdk/* +- +- +-%package devel +-Summary: SPDK development libraries and headers +- +-%description devel +-SPDK development libraries and headers +- +-%files devel +-/usr/local/include/* +-%if %{shared} +-%{libdir}/lib*.so +-%endif +- +-%package libs +-Summary: SPDK libraries +- +-%description libs +-SPDK libraries +- +-%files libs +-/etc/ld.so.conf.d/* +-%{libdir}/lib*.a +-%{libdir}/pkgconfig/*.pc +-%if %{shared} +-%{libdir}/lib*.so.* +-%endif +- +-%post libs +-ldconfig +- +-%if %{dpdk} +-%package dpdk-libs +-Summary: DPDK libraries +- +-%description dpdk-libs +-DPDK libraries +- +-%files dpdk-libs +-/usr/local/lib/dpdk +- +-%post dpdk-libs +-ldconfig +-%endif +- +-%if %{vfio_user} && %{shared} +-%package libvfio-user +-Summary: libvfio-user libraries +- +-%description libvfio-user +-libvfio-user libraries +- +-%files libvfio-user +-/usr/local/lib/libvfio-user +- +-%post libvfio-user +-ldconfig +-%endif +- +-%changelog +-* Tue Feb 16 2021 Michal Berger +-- Initial RPM .spec for the SPDK ++# Global macros ++%define debug_package %{nil} ++ ++%{!?deps:%define deps 1} ++%{!?dpdk:%define dpdk 0} ++%{!?dpdk_build_path:%define dpdk_build_path "dpdk/build"} ++%{!?dpdk_path:%define dpdk_path "dpdk"} ++%{!?requirements:%define requirements 0} ++%{!?build_requirements:%define build_requirements 0} ++%{!?shared:%define shared 0} ++%{!?rbd:%define rbd 0} ++%{!?libdir:%define libdir /usr/local/lib} ++%{!?vfio_user:%define vfio_user 0} ++ ++# Spec metadata ++Name: spdk ++Version: %{version} ++Release: %{release} ++Summary: Storage Performance Development Kit ++ ++# This is a minimal set of requirements needed for SPDK apps to run when built with ++# default configuration. 
These are also predetermined by rpmbuild. Extra requirements ++# can be defined through a comma-separated list passed via $requirements when building ++# the spec. ++Requires: glibc ++Requires: libaio ++Requires: libgcc ++Requires: libstdc++ ++Requires: libuuid ++Requires: ncurses-libs ++Requires: numactl-libs ++Requires: openssl-libs ++Requires: zlib ++ ++%if %{requirements} ++Requires: %(echo "%{requirements_list}") ++%endif ++ ++BuildRequires: python3-devel ++ ++%if %{build_requirements} ++BuildRequires: %(echo "%{build_requirements_list}") ++%endif ++ ++License: BSD ++URL: https://spdk.io ++Source: spdk-%{version}.tar.gz ++ ++%description ++ ++The Storage Performance Development Kit (SPDK) provides a set of tools and libraries for ++writing high performance, scalable, user-mode storage applications. It achieves high ++performance by moving all of the necessary drivers into userspace and operating in a ++polled mode instead of relying on interrupts, which avoids kernel context switches and ++eliminates interrupt handling overhead. ++ ++%prep ++make clean %{make} &>/dev/null || : ++%setup ++ ++%build ++set +x ++ ++cfs() { ++ (($# > 1)) || return 0 ++ ++ local dst=$1 f ++ ++ mkdir -p "$dst" ++ shift; for f; do [[ -e $f ]] && cp -a "$f" "$dst"; done ++} ++ ++cl() { ++ [[ -e $2 ]] || return 0 ++ ++ cfs "$1" $(find "$2" -name '*.so*' -type f -o -type l | grep -v .symbols) ++} ++ ++%if %{deps} ++_PKGDEP_OPTS="--docs --pmem --rdma --uring" ++%if %{rbd} ++_PKGDEP_OPTS="$_PKGDEP_OPTS --rbd" ++%endif ++./scripts/pkgdep.sh $_PKGDEP_OPTS ++%endif ++ ++# Rely mainly on CONFIG ++./configure --disable-unit-tests --disable-tests %{configure} ++make %{make} ++make DESTDIR=%{buildroot} install %{make} ++ ++# Include DPDK libs in case --with-shared is in use. ++%if %{dpdk} ++cfs %{buildroot}/usr/local/lib/dpdk %{dpdk_build_path}/lib/* ++# Special case for SPDK_RUN_EXTERNAL_DPDK setup ++cl %{buildroot}/usr/local/lib/dpdk %{dpdk_path}/intel-ipsec-mb/ ++cl %{buildroot}/usr/local/lib/dpdk %{dpdk_path}/isa-l/ ++%endif ++ ++# Include libvfio-user libs in case --with-vfio-user is in use together with --with-shared ++%if %{vfio_user} && %{shared} ++cl %{buildroot}/usr/local/lib/libvfio-user build/libvfio-user/ ++%endif ++# Try to include extra binaries that were potentially built ++cfs %{buildroot}/usr/local/bin build/fio ++ ++# And some useful setup scripts SPDK uses ++mkdir -p %{buildroot}/usr/libexec/spdk ++mkdir -p %{buildroot}/etc/bash_completion.d ++mkdir -p %{buildroot}/etc/profile.d ++mkdir -p %{buildroot}/etc/ld.so.conf.d ++mkdir -p %{buildroot}%{python3_sitelib} ++ ++cat <<-EOF > %{buildroot}/etc/ld.so.conf.d/spdk.conf ++%{libdir} ++/usr/local/lib/dpdk ++/usr/local/lib/libvfio-user ++EOF ++ ++cat <<-'EOF' > %{buildroot}/etc/profile.d/spdk_path.sh ++PATH=$PATH:/usr/libexec/spdk/scripts ++PATH=$PATH:/usr/libexec/spdk/scripts/vagrant ++PATH=$PATH:/usr/libexec/spdk/test/common/config ++export PATH ++EOF ++ ++cfs %{buildroot}/usr/libexec/spdk scripts ++cfs %{buildroot}%{python3_sitelib} python/spdk ++ln -s /usr/libexec/spdk/scripts/bash-completion/spdk %{buildroot}/etc/bash_completion.d/ ++ ++# We need to take into the account the fact that most of the scripts depend on being ++# run directly from the repo. To workaround it, create common root space under dir ++# like /usr/libexec/spdk and link all potential relative paths the script may try ++# to reference. 
++ ++# setup.sh uses pci_ids.h ++ln -s /usr/local/include %{buildroot}/usr/libexec/spdk ++ ++%files ++/etc/profile.d/* ++/etc/bash_completion.d/* ++/usr/libexec/spdk/* ++/usr/local/bin/* ++%{python3_sitelib}/spdk/* ++ ++ ++%package devel ++Summary: SPDK development libraries and headers ++ ++%description devel ++SPDK development libraries and headers ++ ++%files devel ++/usr/local/include/* ++%if %{shared} ++%{libdir}/lib*.so ++%endif ++ ++%package libs ++Summary: SPDK libraries ++ ++%description libs ++SPDK libraries ++ ++%files libs ++/etc/ld.so.conf.d/* ++%{libdir}/lib*.a ++%{libdir}/pkgconfig/*.pc ++%if %{shared} ++%{libdir}/lib*.so.* ++%endif ++ ++%post libs ++ldconfig ++ ++%if %{dpdk} ++%package dpdk-libs ++Summary: DPDK libraries ++ ++%description dpdk-libs ++DPDK libraries ++ ++%files dpdk-libs ++/usr/local/lib/dpdk ++ ++%post dpdk-libs ++ldconfig ++%endif ++ ++%if %{vfio_user} && %{shared} ++%package libvfio-user ++Summary: libvfio-user libraries ++ ++%description libvfio-user ++libvfio-user libraries ++ ++%files libvfio-user ++/usr/local/lib/libvfio-user ++ ++%post libvfio-user ++ldconfig ++%endif ++ ++%changelog ++* Tue Feb 16 2021 Michal Berger ++- Initial RPM .spec for the SPDK +diff --git a/scripts/ar-xnvme-fixer b/scripts/ar-xnvme-fixer +index f2ad2a5..86ee2c0 100644 +--- a/scripts/ar-xnvme-fixer ++++ b/scripts/ar-xnvme-fixer +@@ -1,34 +1,34 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2022 Intel Corporation +-# All rights reserved. +- +-# The xnvme build executes library_bundler.py which wraps itself around ar +-# to create libxnvme.a. It builds a set of MRI commands which then is +-# passed to ar via stdin. The set of members is declared via ADDLIB +-# followed by an absolute path to the file. On the physical nodes this +-# path may look as the following: +-# +-# /workspace/foo-job@tmp/... +-# +-# The '@' has a special meaning for ar when spotted on the cmdline. +-# It ends up splitting the path into /workspace/foo-job treating it +-# as a member path which doesn't exist. This causes the entire build +-# to fail. To workaround this, we inject ourselves via AR_TOOL and +-# modify the MRI commands such that the absolute paths to members are +-# replaced with relative ones (relative to xnvme/builddir from where +-# the library_bundler.py is executed). +- +-curdir=$(readlink -f "$(dirname "$0")") +-rootdir=$(readlink -f "$curdir/../") +- +-[[ ! -t 0 ]] || exit 1 +- +-while read -r cmd arg; do +- if [[ $cmd == ADDLIB && $arg == /* ]]; then +- arg=${arg/"$rootdir/xnvme/"/"../"} +- fi +- mri+=("$cmd${arg:+ $arg}") +-done +- +-ar "$@" < <(printf '%s\n' "${mri[@]}") ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2022 Intel Corporation ++# All rights reserved. ++ ++# The xnvme build executes library_bundler.py which wraps itself around ar ++# to create libxnvme.a. It builds a set of MRI commands which then is ++# passed to ar via stdin. The set of members is declared via ADDLIB ++# followed by an absolute path to the file. On the physical nodes this ++# path may look as the following: ++# ++# /workspace/foo-job@tmp/... ++# ++# The '@' has a special meaning for ar when spotted on the cmdline. ++# It ends up splitting the path into /workspace/foo-job treating it ++# as a member path which doesn't exist. This causes the entire build ++# to fail. 
To workaround this, we inject ourselves via AR_TOOL and ++# modify the MRI commands such that the absolute paths to members are ++# replaced with relative ones (relative to xnvme/builddir from where ++# the library_bundler.py is executed). ++ ++curdir=$(readlink -f "$(dirname "$0")") ++rootdir=$(readlink -f "$curdir/../") ++ ++[[ ! -t 0 ]] || exit 1 ++ ++while read -r cmd arg; do ++ if [[ $cmd == ADDLIB && $arg == /* ]]; then ++ arg=${arg/"$rootdir/xnvme/"/"../"} ++ fi ++ mri+=("$cmd${arg:+ $arg}") ++done ++ ++ar "$@" < <(printf '%s\n' "${mri[@]}") +diff --git a/scripts/arm_cross_compile.sh b/scripts/arm_cross_compile.sh +index 3760e2c..e7ebd8b 100644 +--- a/scripts/arm_cross_compile.sh ++++ b/scripts/arm_cross_compile.sh +@@ -1,326 +1,326 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# All rights reserved. +-# +- +-# exit on errors +-set -e +- +-ROOT_DIR=$(readlink -f $(dirname $0))/../.. +-export CROSS_COMPILE_DIR=$ROOT_DIR/cross_compiling +-export SPDK_DIR=$ROOT_DIR/spdk +-export DPDK_DIR=$SPDK_DIR/dpdk +- +-# Get Toolchain +-function get_cc_toolchain() { +- cd $CROSS_COMPILE_DIR +- +- if [ ! -d "$CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu" ]; then +- echo -e "Getting ARM Cross Compiler Toolchain..." +- wget https://developer.arm.com/-/media/Files/downloads/gnu-a/10.2-2020.11/binrel/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu.tar.xz --no-check-certificate +- tar xvf gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu.tar.xz +- else +- echo -e "ARM Cross Compiler Toolchain already downloaded" +- fi +- +- export PATH=$PATH:$CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/bin +-} +- +-# NUMA +-function cross_compile_numa() { +- cd $CROSS_COMPILE_DIR +- +- # Download NUMA library +- if [ ! -d "$CROSS_COMPILE_DIR/numactl" ]; then +- echo -e "Downloading NUMA library..." +- git clone https://github.com/numactl/numactl.git +- cd numactl/ +- git checkout v2.0.13 -b v2.0.13 +- else +- echo -e "NUMA library already downloaded" +- cd numactl/ +- fi +- +- # Build NUMA library +- if [ ! -d "$CROSS_COMPILE_DIR/numactl/build" ]; then +- echo -e "Building NUMA library..." +- ./autogen.sh +- autoconf -i +- mkdir build +- ./configure --host=aarch64-none-linux-gnu CC=aarch64-none-linux-gnu-gcc --prefix=$CROSS_COMPILE_DIR/numactl/build +- make -j install +- +- # Copy NUMA related dependencies +- echo -e "Copying NUMA library dependencies..." +- +- cp build/include/numa*.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/lib/libnuma.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- cp build/lib/libnuma.so $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- else +- echo -e "NUMA library already built" +- fi +-} +- +-# util-linux UUID +-function cross_compile_uuid() { +- cd $CROSS_COMPILE_DIR +- +- # Download util-linux UUID library +- if [ ! -d "$CROSS_COMPILE_DIR/util-linux" ]; then +- echo -e "Downloading util-linux UUID library..." +- git clone https://github.com/karelzak/util-linux.git +- else +- echo -e "util-linux UUID library already downloaded" +- fi +- +- if [ ! -d "$CROSS_COMPILE_DIR/util-linux/.libs" ]; then +- cd util-linux/ +- +- # Build util-linux UUID library +- echo -e "Building util-linux UUID library..." 
+- +- ./autogen.sh +- CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld CFLAGS+=-Wl,-rpath=$CROSS_COMPILE_DIR/util-linux/.libs ./configure --host=aarch64-none-linux-gnu --without-tinfo --without-ncurses --without-ncursesw --disable-mount --disable-libmount --disable-pylibmount --disable-libblkid --disable-fdisks --disable-libfdisk +- make clean +- make -j +- +- # Copy util-linux UUID related dependencies +- echo -e "Copying util-linux UUID library dependencies..." +- +- cp .libs/libuuid.so.1.3.0 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libuuid.so +- mkdir -p $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/uuid/ +- cp libuuid/src/uuid.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/uuid/ +- else +- echo -e "util-linux UUID library already built" +- fi +-} +- +-# Openssl Crypto and SSL +-function cross_compile_crypto_ssl() { +- cd $CROSS_COMPILE_DIR +- +- # Download Openssl Crypto and SSL libraries +- if [ ! -d "$CROSS_COMPILE_DIR/openssl" ]; then +- echo -e "Downloading Openssl Crypto and SSL libraries..." +- git clone https://github.com/openssl/openssl.git +- else +- echo -e "Openssl Crypto and SSL libraries already downloaded" +- fi +- +- if [ ! -d "$CROSS_COMPILE_DIR/openssl/build" ]; then +- cd openssl +- +- # Build Openssl Crypto and SSL libraries +- echo -e "Building Openssl Crypto and SSL libraries..." +- +- mkdir build +- ./Configure linux-aarch64 --prefix=$CROSS_COMPILE_DIR/openssl/build --cross-compile-prefix=aarch64-none-linux-gnu- +- make -j +- make -j install +- +- # Copy Openssl Crypto and SSL related dependencies +- echo -e "Copying Openssl Crypto and SSL libraries dependencies..." +- +- cp -fr build/include/openssl $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/lib/libcrypto.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so +- cp build/lib/libcrypto.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so.3 +- cp build/lib/libssl.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libssl.so +- else +- echo -e "Openssl Crypto and SSL libraries already built" +- fi +-} +- +-# Libaio +-function cross_compile_libaio() { +- cd $CROSS_COMPILE_DIR +- +- # Download libaio library +- if [ ! -d "$CROSS_COMPILE_DIR/libaio" ]; then +- echo -e "Downloading libaio library..." +- +- wget https://ftp.debian.org/debian/pool/main/liba/libaio/libaio_0.3.112.orig.tar.xz --no-check-certificate +- tar xvf libaio_0.3.112.orig.tar.xz +- mv libaio-0.3.112 libaio +- else +- echo -e "libaio library already downloaded" +- fi +- +- if [ ! -d "$CROSS_COMPILE_DIR/libaio/build" ]; then +- cd libaio +- +- # Build libaio library +- echo -e "Building libaio library..." +- +- mkdir build +- CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld make -j +- make -j install DESTDIR=$CROSS_COMPILE_DIR/libaio/build +- +- # Copy libaio related dependencies +- echo -e "Copying libaio library dependencies..." 
+- +- cp build/usr/include/libaio.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/usr/lib/libaio.so.1.0.1 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libaio.so +- else +- echo -e "libaio library already built" +- fi +-} +- +-# Ncurses +-function cross_compile_ncurses() { +- cd $CROSS_COMPILE_DIR +- +- # Download ncurses library +- if [ ! -d "$CROSS_COMPILE_DIR/ncurses" ]; then +- echo -e "Downloading ncurses library..." +- +- wget https://ftp.gnu.org/pub/gnu/ncurses/ncurses-6.2.tar.gz --no-check-certificate +- tar xvf ncurses-6.2.tar.gz +- mv ncurses-6.2 ncurses +- else +- echo -e "ncurses library already downloaded" +- fi +- +- if [ ! -d "$CROSS_COMPILE_DIR/ncurses_build" ]; then +- mkdir ncurses_build +- +- # Build ncurses library +- echo -e "Building ncurses library..." +- +- (cd ncurses && ./configure --host=aarch64-none-linux-gnu --prefix=$CROSS_COMPILE_DIR/ncurses_build --disable-stripping && make -j install) +- +- # Copy ncurses related dependencies +- echo -e "Copying ncurses library dependencies..." +- +- cp ncurses_build/include/ncurses/ncurses.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp ncurses_build/include/ncurses/curses.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp -fr ncurses_build/include/ncurses $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp ncurses_build/include/ncurses/menu.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp ncurses_build/include/ncurses/eti.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp ncurses_build/include/ncurses/panel.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp ncurses_build/lib/libncurses* $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- cp ncurses_build/lib/libmenu.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- cp ncurses_build/lib/libpanel.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- else +- echo -e "ncurses library already built" +- fi +- +-} +- +-# CUnit +-function cross_compile_cunit() { +- cd $CROSS_COMPILE_DIR +- +- # Download cunit library +- if [ ! -d "$CROSS_COMPILE_DIR/CUnit" ]; then +- echo -e "Downloading cunit library..." +- +- git clone https://github.com/jacklicn/CUnit.git +- else +- echo -e "cunit library already downloaded" +- fi +- +- if [ ! -d "$CROSS_COMPILE_DIR/CUnit/build" ]; then +- cd CUnit +- +- # Build cunit library +- echo -e "Building cunit library..." +- +- mkdir build +- libtoolize --force +- aclocal +- autoheader +- automake --force-missing --add-missing +- autoconf +- ./configure --host=aarch64-none-linux-gnu --prefix=$CROSS_COMPILE_DIR/CUnit/build +- make -j +- make -j install +- +- # Copy cunit related dependencies +- echo -e "Copying cunit library dependencies..." 
+- +- cp -fr build/include/CUnit $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/lib/libcunit.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- cp build/lib/libcunit.so.1.0.1 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcunit.so +- else +- echo -e "cunit library already built" +- fi +-} +- +-# ISA-L +-function cross_compile_isal() { +- cd $SPDK_DIR +- +- if [ ! -d "$SPDK_DIR/isa-l/build" ]; then +- # Build ISA-L library +- echo -e "Building ISA-L library..." +- +- cd isa-l +- ./autogen.sh +- mkdir -p build/lib +- ac_cv_func_malloc_0_nonnull=yes ac_cv_func_realloc_0_nonnull=yes ./configure --prefix=$SPDK_DIR/isa-l/build --libdir=$SPDK_DIR/isa-l/build/lib --host=aarch64-none-linux-gnu +- make -j +- make -j install +- +- # Copy ISAL related dependencies +- echo -e "Copying ISA-L library dependencies..." +- +- cp -fr build/include/isa-l $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/include/isa-l.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ +- cp build/lib/libisal.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- cp build/lib/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ +- ln -sf $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so +- ln -sf $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2 +- else +- echo -e "ISA-L library already built" +- fi +-} +- +-# DPDK +-function cross_compile_dpdk() { +- cd $DPDK_DIR +- +- if [ ! -d "$DPDK_DIR/build" ]; then +- # Build DPDK libraries +- echo -e "Building DPDK libraries..." +- +- apt install pkg-config-aarch64-linux-gnu +- meson aarch64-build-gcc --cross-file config/arm/arm64_armv8_linux_gcc -Dprefix=$DPDK_DIR/build +- ninja -C aarch64-build-gcc +- ninja -C aarch64-build-gcc install +- cd .. +- +- # Copy DPDK related dependencies +- echo -e "Copying DPDK libraries dependencies..." +- +- cp -fr dpdk/build/bin dpdk/aarch64-build-gcc/ +- cp -fr dpdk/build/include dpdk/aarch64-build-gcc/ +- cp -fr dpdk/build/share dpdk/aarch64-build-gcc/ +- cp -fr dpdk/build/lib/* dpdk/aarch64-build-gcc/lib/ +- cp $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so.3 dpdk/aarch64-build-gcc/lib/ +- cp $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so dpdk/aarch64-build-gcc/lib/ +- else +- echo -e "DPDK libraries already built" +- fi +-} +- +-# SPDK +-function cross_compile_spdk() { +- cd $SPDK_DIR +- +- # Build SPDK libraries and binaries +- echo -e "Building SPDK libraries and binaries..." 
+- +- CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld CFLAGS+=-I$DPDK_DIR/aarch64-build-gcc/include ./configure --cross-prefix=aarch64-none-linux-gnu --without-vhost --with-dpdk=$DPDK_DIR/aarch64-build-gcc --target-arch=armv8-a +- +- make -j +-} +- +-mkdir -p $CROSS_COMPILE_DIR +- +-get_cc_toolchain +- +-cross_compile_packages=(numa uuid crypto_ssl libaio ncurses cunit isal dpdk spdk) +- +-for index in "${cross_compile_packages[@]}"; do +- cross_compile_$index +-done ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# All rights reserved. ++# ++ ++# exit on errors ++set -e ++ ++ROOT_DIR=$(readlink -f $(dirname $0))/../.. ++export CROSS_COMPILE_DIR=$ROOT_DIR/cross_compiling ++export SPDK_DIR=$ROOT_DIR/spdk ++export DPDK_DIR=$SPDK_DIR/dpdk ++ ++# Get Toolchain ++function get_cc_toolchain() { ++ cd $CROSS_COMPILE_DIR ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu" ]; then ++ echo -e "Getting ARM Cross Compiler Toolchain..." ++ wget https://developer.arm.com/-/media/Files/downloads/gnu-a/10.2-2020.11/binrel/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu.tar.xz --no-check-certificate ++ tar xvf gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu.tar.xz ++ else ++ echo -e "ARM Cross Compiler Toolchain already downloaded" ++ fi ++ ++ export PATH=$PATH:$CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/bin ++} ++ ++# NUMA ++function cross_compile_numa() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download NUMA library ++ if [ ! -d "$CROSS_COMPILE_DIR/numactl" ]; then ++ echo -e "Downloading NUMA library..." ++ git clone https://github.com/numactl/numactl.git ++ cd numactl/ ++ git checkout v2.0.13 -b v2.0.13 ++ else ++ echo -e "NUMA library already downloaded" ++ cd numactl/ ++ fi ++ ++ # Build NUMA library ++ if [ ! -d "$CROSS_COMPILE_DIR/numactl/build" ]; then ++ echo -e "Building NUMA library..." ++ ./autogen.sh ++ autoconf -i ++ mkdir build ++ ./configure --host=aarch64-none-linux-gnu CC=aarch64-none-linux-gnu-gcc --prefix=$CROSS_COMPILE_DIR/numactl/build ++ make -j install ++ ++ # Copy NUMA related dependencies ++ echo -e "Copying NUMA library dependencies..." ++ ++ cp build/include/numa*.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/lib/libnuma.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ cp build/lib/libnuma.so $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ else ++ echo -e "NUMA library already built" ++ fi ++} ++ ++# util-linux UUID ++function cross_compile_uuid() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download util-linux UUID library ++ if [ ! -d "$CROSS_COMPILE_DIR/util-linux" ]; then ++ echo -e "Downloading util-linux UUID library..." ++ git clone https://github.com/karelzak/util-linux.git ++ else ++ echo -e "util-linux UUID library already downloaded" ++ fi ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/util-linux/.libs" ]; then ++ cd util-linux/ ++ ++ # Build util-linux UUID library ++ echo -e "Building util-linux UUID library..." 
++ ++ ./autogen.sh ++ CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld CFLAGS+=-Wl,-rpath=$CROSS_COMPILE_DIR/util-linux/.libs ./configure --host=aarch64-none-linux-gnu --without-tinfo --without-ncurses --without-ncursesw --disable-mount --disable-libmount --disable-pylibmount --disable-libblkid --disable-fdisks --disable-libfdisk ++ make clean ++ make -j ++ ++ # Copy util-linux UUID related dependencies ++ echo -e "Copying util-linux UUID library dependencies..." ++ ++ cp .libs/libuuid.so.1.3.0 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libuuid.so ++ mkdir -p $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/uuid/ ++ cp libuuid/src/uuid.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/uuid/ ++ else ++ echo -e "util-linux UUID library already built" ++ fi ++} ++ ++# Openssl Crypto and SSL ++function cross_compile_crypto_ssl() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download Openssl Crypto and SSL libraries ++ if [ ! -d "$CROSS_COMPILE_DIR/openssl" ]; then ++ echo -e "Downloading Openssl Crypto and SSL libraries..." ++ git clone https://github.com/openssl/openssl.git ++ else ++ echo -e "Openssl Crypto and SSL libraries already downloaded" ++ fi ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/openssl/build" ]; then ++ cd openssl ++ ++ # Build Openssl Crypto and SSL libraries ++ echo -e "Building Openssl Crypto and SSL libraries..." ++ ++ mkdir build ++ ./Configure linux-aarch64 --prefix=$CROSS_COMPILE_DIR/openssl/build --cross-compile-prefix=aarch64-none-linux-gnu- ++ make -j ++ make -j install ++ ++ # Copy Openssl Crypto and SSL related dependencies ++ echo -e "Copying Openssl Crypto and SSL libraries dependencies..." ++ ++ cp -fr build/include/openssl $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/lib/libcrypto.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so ++ cp build/lib/libcrypto.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so.3 ++ cp build/lib/libssl.so.3 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libssl.so ++ else ++ echo -e "Openssl Crypto and SSL libraries already built" ++ fi ++} ++ ++# Libaio ++function cross_compile_libaio() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download libaio library ++ if [ ! -d "$CROSS_COMPILE_DIR/libaio" ]; then ++ echo -e "Downloading libaio library..." ++ ++ wget https://ftp.debian.org/debian/pool/main/liba/libaio/libaio_0.3.112.orig.tar.xz --no-check-certificate ++ tar xvf libaio_0.3.112.orig.tar.xz ++ mv libaio-0.3.112 libaio ++ else ++ echo -e "libaio library already downloaded" ++ fi ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/libaio/build" ]; then ++ cd libaio ++ ++ # Build libaio library ++ echo -e "Building libaio library..." ++ ++ mkdir build ++ CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld make -j ++ make -j install DESTDIR=$CROSS_COMPILE_DIR/libaio/build ++ ++ # Copy libaio related dependencies ++ echo -e "Copying libaio library dependencies..." 
++ ++ cp build/usr/include/libaio.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/usr/lib/libaio.so.1.0.1 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libaio.so ++ else ++ echo -e "libaio library already built" ++ fi ++} ++ ++# Ncurses ++function cross_compile_ncurses() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download ncurses library ++ if [ ! -d "$CROSS_COMPILE_DIR/ncurses" ]; then ++ echo -e "Downloading ncurses library..." ++ ++ wget https://ftp.gnu.org/pub/gnu/ncurses/ncurses-6.2.tar.gz --no-check-certificate ++ tar xvf ncurses-6.2.tar.gz ++ mv ncurses-6.2 ncurses ++ else ++ echo -e "ncurses library already downloaded" ++ fi ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/ncurses_build" ]; then ++ mkdir ncurses_build ++ ++ # Build ncurses library ++ echo -e "Building ncurses library..." ++ ++ (cd ncurses && ./configure --host=aarch64-none-linux-gnu --prefix=$CROSS_COMPILE_DIR/ncurses_build --disable-stripping && make -j install) ++ ++ # Copy ncurses related dependencies ++ echo -e "Copying ncurses library dependencies..." ++ ++ cp ncurses_build/include/ncurses/ncurses.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp ncurses_build/include/ncurses/curses.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp -fr ncurses_build/include/ncurses $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp ncurses_build/include/ncurses/menu.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp ncurses_build/include/ncurses/eti.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp ncurses_build/include/ncurses/panel.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp ncurses_build/lib/libncurses* $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ cp ncurses_build/lib/libmenu.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ cp ncurses_build/lib/libpanel.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ else ++ echo -e "ncurses library already built" ++ fi ++ ++} ++ ++# CUnit ++function cross_compile_cunit() { ++ cd $CROSS_COMPILE_DIR ++ ++ # Download cunit library ++ if [ ! -d "$CROSS_COMPILE_DIR/CUnit" ]; then ++ echo -e "Downloading cunit library..." ++ ++ git clone https://github.com/jacklicn/CUnit.git ++ else ++ echo -e "cunit library already downloaded" ++ fi ++ ++ if [ ! -d "$CROSS_COMPILE_DIR/CUnit/build" ]; then ++ cd CUnit ++ ++ # Build cunit library ++ echo -e "Building cunit library..." ++ ++ mkdir build ++ libtoolize --force ++ aclocal ++ autoheader ++ automake --force-missing --add-missing ++ autoconf ++ ./configure --host=aarch64-none-linux-gnu --prefix=$CROSS_COMPILE_DIR/CUnit/build ++ make -j ++ make -j install ++ ++ # Copy cunit related dependencies ++ echo -e "Copying cunit library dependencies..." 
++ ++ cp -fr build/include/CUnit $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/lib/libcunit.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ cp build/lib/libcunit.so.1.0.1 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcunit.so ++ else ++ echo -e "cunit library already built" ++ fi ++} ++ ++# ISA-L ++function cross_compile_isal() { ++ cd $SPDK_DIR ++ ++ if [ ! -d "$SPDK_DIR/isa-l/build" ]; then ++ # Build ISA-L library ++ echo -e "Building ISA-L library..." ++ ++ cd isa-l ++ ./autogen.sh ++ mkdir -p build/lib ++ ac_cv_func_malloc_0_nonnull=yes ac_cv_func_realloc_0_nonnull=yes ./configure --prefix=$SPDK_DIR/isa-l/build --libdir=$SPDK_DIR/isa-l/build/lib --host=aarch64-none-linux-gnu ++ make -j ++ make -j install ++ ++ # Copy ISAL related dependencies ++ echo -e "Copying ISA-L library dependencies..." ++ ++ cp -fr build/include/isa-l $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/include/isa-l.h $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/aarch64-none-linux-gnu/libc/usr/include/ ++ cp build/lib/libisal.a $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ cp build/lib/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/ ++ ln -sf $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so ++ ln -sf $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2.0.30 $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libisal.so.2 ++ else ++ echo -e "ISA-L library already built" ++ fi ++} ++ ++# DPDK ++function cross_compile_dpdk() { ++ cd $DPDK_DIR ++ ++ if [ ! -d "$DPDK_DIR/build" ]; then ++ # Build DPDK libraries ++ echo -e "Building DPDK libraries..." ++ ++ apt install pkg-config-aarch64-linux-gnu ++ meson aarch64-build-gcc --cross-file config/arm/arm64_armv8_linux_gcc -Dprefix=$DPDK_DIR/build ++ ninja -C aarch64-build-gcc ++ ninja -C aarch64-build-gcc install ++ cd .. ++ ++ # Copy DPDK related dependencies ++ echo -e "Copying DPDK libraries dependencies..." ++ ++ cp -fr dpdk/build/bin dpdk/aarch64-build-gcc/ ++ cp -fr dpdk/build/include dpdk/aarch64-build-gcc/ ++ cp -fr dpdk/build/share dpdk/aarch64-build-gcc/ ++ cp -fr dpdk/build/lib/* dpdk/aarch64-build-gcc/lib/ ++ cp $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so.3 dpdk/aarch64-build-gcc/lib/ ++ cp $CROSS_COMPILE_DIR/gcc-arm-10.2-2020.11-x86_64-aarch64-none-linux-gnu/lib/gcc/aarch64-none-linux-gnu/10.2.1/libcrypto.so dpdk/aarch64-build-gcc/lib/ ++ else ++ echo -e "DPDK libraries already built" ++ fi ++} ++ ++# SPDK ++function cross_compile_spdk() { ++ cd $SPDK_DIR ++ ++ # Build SPDK libraries and binaries ++ echo -e "Building SPDK libraries and binaries..." 
++ ++ CC=aarch64-none-linux-gnu-gcc CXX=aarch64-none-linux-gnu-g++ LD=aarch64-none-linux-gnu-ld CFLAGS+=-I$DPDK_DIR/aarch64-build-gcc/include ./configure --cross-prefix=aarch64-none-linux-gnu --without-vhost --with-dpdk=$DPDK_DIR/aarch64-build-gcc --target-arch=armv8-a ++ ++ make -j ++} ++ ++mkdir -p $CROSS_COMPILE_DIR ++ ++get_cc_toolchain ++ ++cross_compile_packages=(numa uuid crypto_ssl libaio ncurses cunit isal dpdk spdk) ++ ++for index in "${cross_compile_packages[@]}"; do ++ cross_compile_$index ++done +diff --git a/scripts/bash-completion/spdk b/scripts/bash-completion/spdk +index da1f283..d37a993 100644 +--- a/scripts/bash-completion/spdk ++++ b/scripts/bash-completion/spdk +@@ -1,274 +1,274 @@ +-# shellcheck disable=SC2016,SC2207 +- +-_get_help() { +- "$@" -h 2>&1 +-} +- +-_get_help_opt() { +- # Fetch all the optional parameters with help from _parse_help() +- _parse_help - < <(printf '%s\n' "$@") +-} +- +-_get_help_pos() { +- local pos +- +- # Fetch all the positional parameters, i.e. get first word prefixed +- # with 20h x 2. This may not be 100% accurate. Also, it won't return +- # any usable strings, it's just meant to point out what type of +- # mandatory argument given method depends on, like bdev_name, etc. +- # TODO: separate completion for such arguments, e.g., get all bdevs +- # for parameter like bdev_name? +- while read -r; do +- [[ $REPLY =~ ^\ {2}[^\ -] ]] || continue +- read -r pos _ <<< "$REPLY" && echo "$pos" +- done < <(printf '%s\n' "$@") +-} +- +-_get_default_rpc_methods() { +- local aliases method names +- # Don't squash whitespaces, slurp the entire line +- while read -r; do +- # Each method name seems to be prefixed with 20h x 4. Then it can +- # be followed with list of aliases enclosed inside (). Example: +- # ioat_scan_accel_module +- [[ $REPLY =~ ^\ {4}([a-z]+(_[a-z]+)*)(\ *\((.+)\))? ]] || continue +- +- names=("${BASH_REMATCH[1]}") +- if [[ $SPDK_RPC_ALIASES == yes ]] && [[ -n ${BASH_REMATCH[4]} ]]; then +- IFS=", " read -ra aliases <<< "${BASH_REMATCH[4]}" +- names+=("${aliases[@]}") +- fi +- +- for method in "${names[@]}"; do +- rpc_methods["$method"]=1 +- done +- done < <(_get_help "$1" 2> /dev/null) +-} +- +-_get_supported_methods() { +- local method methods +- +- if ! methods=($("$1" -s "$rpc_sock" rpc_get_methods 2> /dev/null)); then +- _get_default_rpc_methods "$1" +- return 0 +- fi +- ((${#methods[@]} > 0)) || return 0 +- +- # Kill the json flavor +- methods=("${methods[@]//+(\"|,| )/}") +- unset -v "methods[0]" "methods[-1]" # [] +- +- for method in "${methods[@]}"; do +- rpc_methods["$method"]=1 +- done +-} +- +-_get_help_rpc_method() { +- local rpc=$1 +- local method=$2 +- local rpc_help opt +- +- mapfile -t rpc_help < <(_get_help "$rpc" "$method") +- +- _get_help_pos "${rpc_help[@]}" +- _get_help_opt "${rpc_help[@]}" +-} +- +-_is_rpc_method() { +- local word=$1 +- +- [[ -v rpc_methods["$word"] ]] +-} +- +-_method_in_words() { +- for word in "${words[@]}"; do +- if _is_rpc_method "$word"; then +- echo "$word" +- return 0 +- fi +- done +- return 1 +-} +- +-_set_rpc_sock() { +- # Look for unix sock each app creates upon its execution. In +- # first instance, check the cmdline for an -s arg, if it's +- # followed by the path to the sock, use it. 
+- +- local word +- for ((word = 0; word < ${#words[@]}; word++)); do +- if [[ ${words[word]} == -s && -S ${words[word + 1]} ]]; then +- rpc_sock=${words[word + 1]} +- return 0 +- fi +- done +- +- # default .sock +- [[ -S /var/tmp/spdk.sock ]] && rpc_sock=/var/tmp/spdk.sock +- +- return 0 +-} +- +-_spdk_opt_to_complete() { +- local opt=$1 +- +- case "$opt" in +- --pci-blocked | -B | --pci-allowed | -A) +- local pcis +- if [[ -e /sys/bus/pci/devices ]]; then +- pcis=(/sys/bus/pci/devices/*) +- pcis=("${pcis[@]##*/}") +- fi +- COMPREPLY=($(compgen -W '${pcis[*]}' -- "$cur")) +- compopt -o filenames +- ;; +- --main-core | -p) # FIXME: Is this meant to be an actual core id or thread id? Assume the latter +- local cpus +- if [[ -e /sys/devices/system/cpu ]]; then +- cpus=(/sys/devices/system/cpu/cpu+([0-9])) +- cpus=("${cpus[@]##*cpu}") +- fi +- COMPREPLY=($(compgen -W '${cpus[*]}' -- "$cur")) +- ;; +- --iova-mode) +- COMPREPLY=($(compgen -W 'pa va' -- "$cur")) +- ;; +- --tpoint-group | -e) +- COMPREPLY=($(compgen -W '$(_get_tpoint_g_masks)' -- "$cur")) +- compopt -o nosort +- ;; +- --logflag) +- COMPREPLY=($(compgen -W '$(_get_log_flags)' -- "$cur")) +- ;; +- --huge-dir) +- COMPREPLY=($(compgen -W '$(_get_fs_mounts "hugetlbfs")' -- "$cur")) +- compopt -o filenames +- ;; +- --iflag | --oflag) # spdk_dd specific +- if [[ ${app##*/} == spdk_dd ]]; then +- COMPREPLY=($(compgen -W '$(_get_help_pos "${app_help[@]}")' -- "$cur")) +- fi +- ;; +- *) return 1 ;; +- esac +- return 0 +-} +- +-_get_fs_mounts() { +- [[ $(< /proc/filesystems) == *"$1"* ]] || return 0 +- +- local mount fs mounts +- while read -r _ mount fs _; do +- [[ $fs == "$1" ]] && mounts+=("$mount") +- done < /proc/mounts +- +- if ((${#mounts[@]} > 0)); then +- printf '%s\n' "${mounts[@]}" +- fi +-} +- +-_get_from_spdk_help() { +- _get_help "$app" |& grep "$1" +-} +- +-_get_tpoint_g_masks() { +- local g_masks +- +- g_masks=$(_get_from_spdk_help "tracepoint group mask for spdk trace buffers") || return 0 +- [[ $g_masks =~ \((.+)\) ]] || return 0 +- +- IFS=", " read -ra g_masks <<< "${BASH_REMATCH[1]}" +- printf '%s\n' "${g_masks[@]}" +-} +- +-_get_log_flags() { +- local logflags +- +- logflags=$(_get_from_spdk_help "enable debug log flag") || return 0 +- [[ $logflags =~ \((.+)\) ]] || return 0 +- +- if [[ -n ${BASH_REMATCH[1]} && ${BASH_REMATCH[1]} != "not supported"* ]]; then +- IFS=", " read -ra logflags <<< "${BASH_REMATCH[1]}" +- printf '%s\n' "${logflags[@]}" +- fi +-} +- +-_is_app() { +- type -P "$1" > /dev/null +-} +- +-_rpc() { +- local cur prev words +- +- _init_completion || return +- _is_app "$1" || return +- +- local rpc=$1 rpc_sock="" method="" +- local -A rpc_methods=() +- +- _set_rpc_sock +- if [[ -S $rpc_sock ]]; then +- _get_supported_methods "$rpc" +- else +- _get_default_rpc_methods "$rpc" +- fi +- +- if method=$(_method_in_words); then +- COMPREPLY=($(compgen -W '$(_get_help_rpc_method "$rpc" "$method")' -- "$cur")) +- compopt -o nosort +- elif [[ $cur == -* ]]; then +- COMPREPLY=($(compgen -W '$(_parse_help "$rpc")' -- "$cur")) +- elif [[ $prev == --verbose ]]; then +- COMPREPLY=($(compgen -W 'DEBUG INFO ERROR' -- "$cur")) +- elif [[ $prev == -s ]]; then +- _filedir +- else +- COMPREPLY=($(compgen -W '${!rpc_methods[*]}' -- "$cur")) +- fi +-} +- +-_spdk_app() { +- local cur prev +- +- _init_completion || return +- _is_app "$1" || return +- +- local app=$1 app_help +- +- mapfile -t app_help < <(_get_help "$app") +- +- if [[ $cur == -* ]]; then +- COMPREPLY=($(compgen -W '$(_get_help_opt "${app_help[@]}")' -- "$cur")) 
+- else +- _spdk_opt_to_complete "$prev" || _filedir +- fi +-} +- +-# Build simple completion for some common spdk apps|tools +-_spdk_apps() { +- local apps +- +- apps=( +- iscsi_tgt +- nvmf_tgt +- spdk_dd +- spdk_tgt +- spdk_top +- spdk_trace_record +- vhost +- create_vbox.sh +- pkgdep.sh +- run-autorun.sh +- vm_setup.sh +- ) # TODO: Add more? +- +- complete -o default -F _spdk_app "${apps[@]}" +- complete -o default -F _rpc rpc.py +-} +- +-_spdk_apps +- +-# Look for _configure(). If it exists, include default completions for path lookups +-if [[ $(type -t _configure) == function ]]; then +- complete -o default -F _configure configure +-fi ++# shellcheck disable=SC2016,SC2207 ++ ++_get_help() { ++ "$@" -h 2>&1 ++} ++ ++_get_help_opt() { ++ # Fetch all the optional parameters with help from _parse_help() ++ _parse_help - < <(printf '%s\n' "$@") ++} ++ ++_get_help_pos() { ++ local pos ++ ++ # Fetch all the positional parameters, i.e. get first word prefixed ++ # with 20h x 2. This may not be 100% accurate. Also, it won't return ++ # any usable strings, it's just meant to point out what type of ++ # mandatory argument given method depends on, like bdev_name, etc. ++ # TODO: separate completion for such arguments, e.g., get all bdevs ++ # for parameter like bdev_name? ++ while read -r; do ++ [[ $REPLY =~ ^\ {2}[^\ -] ]] || continue ++ read -r pos _ <<< "$REPLY" && echo "$pos" ++ done < <(printf '%s\n' "$@") ++} ++ ++_get_default_rpc_methods() { ++ local aliases method names ++ # Don't squash whitespaces, slurp the entire line ++ while read -r; do ++ # Each method name seems to be prefixed with 20h x 4. Then it can ++ # be followed with list of aliases enclosed inside (). Example: ++ # ioat_scan_accel_module ++ [[ $REPLY =~ ^\ {4}([a-z]+(_[a-z]+)*)(\ *\((.+)\))? ]] || continue ++ ++ names=("${BASH_REMATCH[1]}") ++ if [[ $SPDK_RPC_ALIASES == yes ]] && [[ -n ${BASH_REMATCH[4]} ]]; then ++ IFS=", " read -ra aliases <<< "${BASH_REMATCH[4]}" ++ names+=("${aliases[@]}") ++ fi ++ ++ for method in "${names[@]}"; do ++ rpc_methods["$method"]=1 ++ done ++ done < <(_get_help "$1" 2> /dev/null) ++} ++ ++_get_supported_methods() { ++ local method methods ++ ++ if ! methods=($("$1" -s "$rpc_sock" rpc_get_methods 2> /dev/null)); then ++ _get_default_rpc_methods "$1" ++ return 0 ++ fi ++ ((${#methods[@]} > 0)) || return 0 ++ ++ # Kill the json flavor ++ methods=("${methods[@]//+(\"|,| )/}") ++ unset -v "methods[0]" "methods[-1]" # [] ++ ++ for method in "${methods[@]}"; do ++ rpc_methods["$method"]=1 ++ done ++} ++ ++_get_help_rpc_method() { ++ local rpc=$1 ++ local method=$2 ++ local rpc_help opt ++ ++ mapfile -t rpc_help < <(_get_help "$rpc" "$method") ++ ++ _get_help_pos "${rpc_help[@]}" ++ _get_help_opt "${rpc_help[@]}" ++} ++ ++_is_rpc_method() { ++ local word=$1 ++ ++ [[ -v rpc_methods["$word"] ]] ++} ++ ++_method_in_words() { ++ for word in "${words[@]}"; do ++ if _is_rpc_method "$word"; then ++ echo "$word" ++ return 0 ++ fi ++ done ++ return 1 ++} ++ ++_set_rpc_sock() { ++ # Look for unix sock each app creates upon its execution. In ++ # first instance, check the cmdline for an -s arg, if it's ++ # followed by the path to the sock, use it. 
++ ++ local word ++ for ((word = 0; word < ${#words[@]}; word++)); do ++ if [[ ${words[word]} == -s && -S ${words[word + 1]} ]]; then ++ rpc_sock=${words[word + 1]} ++ return 0 ++ fi ++ done ++ ++ # default .sock ++ [[ -S /var/tmp/spdk.sock ]] && rpc_sock=/var/tmp/spdk.sock ++ ++ return 0 ++} ++ ++_spdk_opt_to_complete() { ++ local opt=$1 ++ ++ case "$opt" in ++ --pci-blocked | -B | --pci-allowed | -A) ++ local pcis ++ if [[ -e /sys/bus/pci/devices ]]; then ++ pcis=(/sys/bus/pci/devices/*) ++ pcis=("${pcis[@]##*/}") ++ fi ++ COMPREPLY=($(compgen -W '${pcis[*]}' -- "$cur")) ++ compopt -o filenames ++ ;; ++ --main-core | -p) # FIXME: Is this meant to be an actual core id or thread id? Assume the latter ++ local cpus ++ if [[ -e /sys/devices/system/cpu ]]; then ++ cpus=(/sys/devices/system/cpu/cpu+([0-9])) ++ cpus=("${cpus[@]##*cpu}") ++ fi ++ COMPREPLY=($(compgen -W '${cpus[*]}' -- "$cur")) ++ ;; ++ --iova-mode) ++ COMPREPLY=($(compgen -W 'pa va' -- "$cur")) ++ ;; ++ --tpoint-group | -e) ++ COMPREPLY=($(compgen -W '$(_get_tpoint_g_masks)' -- "$cur")) ++ compopt -o nosort ++ ;; ++ --logflag) ++ COMPREPLY=($(compgen -W '$(_get_log_flags)' -- "$cur")) ++ ;; ++ --huge-dir) ++ COMPREPLY=($(compgen -W '$(_get_fs_mounts "hugetlbfs")' -- "$cur")) ++ compopt -o filenames ++ ;; ++ --iflag | --oflag) # spdk_dd specific ++ if [[ ${app##*/} == spdk_dd ]]; then ++ COMPREPLY=($(compgen -W '$(_get_help_pos "${app_help[@]}")' -- "$cur")) ++ fi ++ ;; ++ *) return 1 ;; ++ esac ++ return 0 ++} ++ ++_get_fs_mounts() { ++ [[ $(< /proc/filesystems) == *"$1"* ]] || return 0 ++ ++ local mount fs mounts ++ while read -r _ mount fs _; do ++ [[ $fs == "$1" ]] && mounts+=("$mount") ++ done < /proc/mounts ++ ++ if ((${#mounts[@]} > 0)); then ++ printf '%s\n' "${mounts[@]}" ++ fi ++} ++ ++_get_from_spdk_help() { ++ _get_help "$app" |& grep "$1" ++} ++ ++_get_tpoint_g_masks() { ++ local g_masks ++ ++ g_masks=$(_get_from_spdk_help "tracepoint group mask for spdk trace buffers") || return 0 ++ [[ $g_masks =~ \((.+)\) ]] || return 0 ++ ++ IFS=", " read -ra g_masks <<< "${BASH_REMATCH[1]}" ++ printf '%s\n' "${g_masks[@]}" ++} ++ ++_get_log_flags() { ++ local logflags ++ ++ logflags=$(_get_from_spdk_help "enable debug log flag") || return 0 ++ [[ $logflags =~ \((.+)\) ]] || return 0 ++ ++ if [[ -n ${BASH_REMATCH[1]} && ${BASH_REMATCH[1]} != "not supported"* ]]; then ++ IFS=", " read -ra logflags <<< "${BASH_REMATCH[1]}" ++ printf '%s\n' "${logflags[@]}" ++ fi ++} ++ ++_is_app() { ++ type -P "$1" > /dev/null ++} ++ ++_rpc() { ++ local cur prev words ++ ++ _init_completion || return ++ _is_app "$1" || return ++ ++ local rpc=$1 rpc_sock="" method="" ++ local -A rpc_methods=() ++ ++ _set_rpc_sock ++ if [[ -S $rpc_sock ]]; then ++ _get_supported_methods "$rpc" ++ else ++ _get_default_rpc_methods "$rpc" ++ fi ++ ++ if method=$(_method_in_words); then ++ COMPREPLY=($(compgen -W '$(_get_help_rpc_method "$rpc" "$method")' -- "$cur")) ++ compopt -o nosort ++ elif [[ $cur == -* ]]; then ++ COMPREPLY=($(compgen -W '$(_parse_help "$rpc")' -- "$cur")) ++ elif [[ $prev == --verbose ]]; then ++ COMPREPLY=($(compgen -W 'DEBUG INFO ERROR' -- "$cur")) ++ elif [[ $prev == -s ]]; then ++ _filedir ++ else ++ COMPREPLY=($(compgen -W '${!rpc_methods[*]}' -- "$cur")) ++ fi ++} ++ ++_spdk_app() { ++ local cur prev ++ ++ _init_completion || return ++ _is_app "$1" || return ++ ++ local app=$1 app_help ++ ++ mapfile -t app_help < <(_get_help "$app") ++ ++ if [[ $cur == -* ]]; then ++ COMPREPLY=($(compgen -W '$(_get_help_opt "${app_help[@]}")' -- "$cur")) 
++ else ++ _spdk_opt_to_complete "$prev" || _filedir ++ fi ++} ++ ++# Build simple completion for some common spdk apps|tools ++_spdk_apps() { ++ local apps ++ ++ apps=( ++ iscsi_tgt ++ nvmf_tgt ++ spdk_dd ++ spdk_tgt ++ spdk_top ++ spdk_trace_record ++ vhost ++ create_vbox.sh ++ pkgdep.sh ++ run-autorun.sh ++ vm_setup.sh ++ ) # TODO: Add more? ++ ++ complete -o default -F _spdk_app "${apps[@]}" ++ complete -o default -F _rpc rpc.py ++} ++ ++_spdk_apps ++ ++# Look for _configure(). If it exists, include default completions for path lookups ++if [[ $(type -t _configure) == function ]]; then ++ complete -o default -F _configure configure ++fi +diff --git a/scripts/bpf/gen.py b/scripts/bpf/gen.py +index 6f9af85..0672af2 100644 +--- a/scripts/bpf/gen.py ++++ b/scripts/bpf/gen.py +@@ -1,57 +1,57 @@ +-#!/usr/bin/env python3 +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. +-# +- +-from argparse import ArgumentParser +-import os +-import re +-import subprocess +-import sys +- +- +-class TraceProcess: +- def __init__(self, pid): +- self._path = os.readlink(f'/proc/{pid}/exe') +- self._pid = pid +- self._probes = self._init_probes() +- +- def _init_probes(self): +- lines = subprocess.check_output(['bpftrace', '-l', '-p', str(self._pid)], text=True) +- probes = {} +- for line in lines.split('\n'): +- parts = line.split(':') +- if len(parts) < 3: +- continue +- ptype, path, function = parts[0], parts[1], parts[-1] +- probes[(ptype, function)] = path +- return probes +- +- def fixup(self, script): +- pregs = [re.compile(r'({}):__EXE__:(\w+)'.format(ptype)) for ptype in ['usdt', 'uprobe']] +- with open(script, 'r') as file: +- lines = file.readlines() +- result = '' +- for line in lines: +- for regex in pregs: +- match = regex.match(line) +- if match is not None: +- ptype, function = match.groups() +- path = self._probes.get((ptype, function), self._path) +- line = line.replace('__EXE__', path) +- break +- result += line.replace('__EXE__', self._path).replace('__PID__', str(self._pid)) +- return result +- +- +-if __name__ == '__main__': +- parser = ArgumentParser(description='bpftrace script generator replacing special ' + +- 'variables in the scripts with appropriate values') +- parser.add_argument('-p', '--pid', type=int, required=True, help='PID of a traced process') +- parser.add_argument('scripts', metavar='SCRIPTS', type=str, nargs='+', +- help='bpftrace scripts to process') +- args = parser.parse_args(sys.argv[1:]) +- proc = TraceProcess(args.pid) +- for script in args.scripts: +- print(proc.fixup(script)) ++#!/usr/bin/env python3 ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation ++# All rights reserved. 
++# ++ ++from argparse import ArgumentParser ++import os ++import re ++import subprocess ++import sys ++ ++ ++class TraceProcess: ++ def __init__(self, pid): ++ self._path = os.readlink(f'/proc/{pid}/exe') ++ self._pid = pid ++ self._probes = self._init_probes() ++ ++ def _init_probes(self): ++ lines = subprocess.check_output(['bpftrace', '-l', '-p', str(self._pid)], text=True) ++ probes = {} ++ for line in lines.split('\n'): ++ parts = line.split(':') ++ if len(parts) < 3: ++ continue ++ ptype, path, function = parts[0], parts[1], parts[-1] ++ probes[(ptype, function)] = path ++ return probes ++ ++ def fixup(self, script): ++ pregs = [re.compile(r'({}):__EXE__:(\w+)'.format(ptype)) for ptype in ['usdt', 'uprobe']] ++ with open(script, 'r') as file: ++ lines = file.readlines() ++ result = '' ++ for line in lines: ++ for regex in pregs: ++ match = regex.match(line) ++ if match is not None: ++ ptype, function = match.groups() ++ path = self._probes.get((ptype, function), self._path) ++ line = line.replace('__EXE__', path) ++ break ++ result += line.replace('__EXE__', self._path).replace('__PID__', str(self._pid)) ++ return result ++ ++ ++if __name__ == '__main__': ++ parser = ArgumentParser(description='bpftrace script generator replacing special ' + ++ 'variables in the scripts with appropriate values') ++ parser.add_argument('-p', '--pid', type=int, required=True, help='PID of a traced process') ++ parser.add_argument('scripts', metavar='SCRIPTS', type=str, nargs='+', ++ help='bpftrace scripts to process') ++ args = parser.parse_args(sys.argv[1:]) ++ proc = TraceProcess(args.pid) ++ for script in args.scripts: ++ print(proc.fixup(script)) +diff --git a/scripts/bpf/gen_enums.sh b/scripts/bpf/gen_enums.sh +index a4826d8..0294149 100644 +--- a/scripts/bpf/gen_enums.sh ++++ b/scripts/bpf/gen_enums.sh +@@ -1,60 +1,60 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. 
+-# +- +-set -e +- +-rootdir=$(git rev-parse --show-toplevel) +- +-_print_enums() { +- local enum_type=$1 enum_string=$2 enum_prefix=$3 enum output +- +- output=$(< "$rootdir/$(git -C "$rootdir" grep -G -l "$enum_string" -- lib module)") +- +- # Isolate the enum block +- output=${output#*"$enum_string"$'\n'} output=${output%%$'\n'\};*} +- # Fold it onto an array +- IFS="," read -ra output <<< "${output//[[:space:]]/}" +- # Drop the assignments +- output=("${output[@]/=*/}") +- +- for enum in "${!output[@]}"; do +- if [[ ${output[enum]} != "$enum_prefix"* ]]; then +- printf 'enum name %s does not start with expected prefix %s\n' "${output[enum]}" "$enum_prefix" +- return 1 +- fi >&2 +- printf ' @%s[%d] = "%s";\n' "$enum_type" "$enum" "${output[enum]#$enum_prefix}" +- done +-} +- +-print_enums() { +- for state in "${!state_enums[@]}"; do +- _print_enums "$state" "${state_enums["$state"]}" "${state_prefix["$state"]}" +- done +-} +- +-print_clear() { printf ' clear(@%s);\n' "${!state_enums[@]}"; } +- +-declare -A state_enums=() state_prefix=() +- +-state_enums["target"]="enum nvmf_tgt_state {" +-state_enums["subsystem"]="enum spdk_nvmf_subsystem_state {" +-state_prefix["target"]=NVMF_TGT_ +-state_prefix["subsystem"]=SPDK_NVMF_SUBSYSTEM_ +- +-enums=$(print_enums) +-clear=$(print_clear) +- +-# Add an empty line before "BEGIN {" to avoid it being commented out +-# when there is annotation at the end of bpftrace script +-cat <<- ENUM +- +- BEGIN { +- $enums +- } +- END { +- $clear +- } +-ENUM ++#!/usr/bin/env bash ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation ++# All rights reserved. ++# ++ ++set -e ++ ++rootdir=$(git rev-parse --show-toplevel) ++ ++_print_enums() { ++ local enum_type=$1 enum_string=$2 enum_prefix=$3 enum output ++ ++ output=$(< "$rootdir/$(git -C "$rootdir" grep -G -l "$enum_string" -- lib module)") ++ ++ # Isolate the enum block ++ output=${output#*"$enum_string"$'\n'} output=${output%%$'\n'\};*} ++ # Fold it onto an array ++ IFS="," read -ra output <<< "${output//[[:space:]]/}" ++ # Drop the assignments ++ output=("${output[@]/=*/}") ++ ++ for enum in "${!output[@]}"; do ++ if [[ ${output[enum]} != "$enum_prefix"* ]]; then ++ printf 'enum name %s does not start with expected prefix %s\n' "${output[enum]}" "$enum_prefix" ++ return 1 ++ fi >&2 ++ printf ' @%s[%d] = "%s";\n' "$enum_type" "$enum" "${output[enum]#$enum_prefix}" ++ done ++} ++ ++print_enums() { ++ for state in "${!state_enums[@]}"; do ++ _print_enums "$state" "${state_enums["$state"]}" "${state_prefix["$state"]}" ++ done ++} ++ ++print_clear() { printf ' clear(@%s);\n' "${!state_enums[@]}"; } ++ ++declare -A state_enums=() state_prefix=() ++ ++state_enums["target"]="enum nvmf_tgt_state {" ++state_enums["subsystem"]="enum spdk_nvmf_subsystem_state {" ++state_prefix["target"]=NVMF_TGT_ ++state_prefix["subsystem"]=SPDK_NVMF_SUBSYSTEM_ ++ ++enums=$(print_enums) ++clear=$(print_clear) ++ ++# Add an empty line before "BEGIN {" to avoid it being commented out ++# when there is annotation at the end of bpftrace script ++cat <<- ENUM ++ ++ BEGIN { ++ $enums ++ } ++ END { ++ $clear ++ } ++ENUM +diff --git a/scripts/bpf/intr-wakeups.bt b/scripts/bpf/intr-wakeups.bt +index 0451712..861bf09 100644 +--- a/scripts/bpf/intr-wakeups.bt ++++ b/scripts/bpf/intr-wakeups.bt +@@ -1,60 +1,60 @@ +-/* +- * Trace reasons for SPDK to wake up in interrupt mode. 
+- * +- * You'll probably need bpftrace from https://github.com/fbs/el7-bpf-specs +- * +- * Usage: +- * scripts/bpftrace.sh `pidof spdk_tgt` [all] +- * all: show every event, not just the first after waking up +- */ +- +-tracepoint:sched:sched_switch /comm == "reactor_0"/ +-{ +- if (str($1) == "all") { +- printf("%llums: %s is off-cpu\n", elapsed / 1000000, comm); +- } +- @off = 1; +-} +- +-/* +- * We explicitly filter out the framework-level handlers here in favour of the +- * more specific tracepoints below. +- */ +-usdt:__EXE__:spdk:interrupt_fd_process / +- @off == 1 && +- strncmp(str(arg1), "event_queue_run_batch", 40) != 0 && +- strncmp(str(arg1), "interrupt_timerfd_process", 40) != 0 && +- strncmp(str(arg1), "thread_interrupt_msg_process", 40) != 0 && +- strncmp(str(arg1), "thread_process_interrupts", 40) != 0 +-/ +-{ +- printf("%llums:%s: fd:%d %s(%p)\n", elapsed / 1000000, probe, arg2, usym(arg3), arg4); +- if (str($1) != "all") { +- @off = 0; +- } +-} +- +-usdt:__EXE__:spdk:timerfd_exec /@off == 1/ +-{ +- printf("%llums:%s: %s(%p)\n", elapsed / 1000000, probe, usym(arg1), arg2); +- if (str($1) != "all") { +- @off = 0; +- } +-} +- +-usdt:__EXE__:spdk:msg_exec /@off == 1/ +-{ +- printf("%llums:%s: %s(%p)\n", elapsed / 1000000, probe, usym(arg1), arg2); +- if (str($1) != "all") { +- @off = 0; +- } +-} +- +-usdt:__EXE__:spdk:event_exec /@off == 1/ +-{ +- printf("%llums:%s: %s(%p, %p)\n", elapsed / 1000000, probe, usym(arg1), +- arg2, arg3); +- if (str($1) != "all") { +- @off = 0; +- } +-} ++/* ++ * Trace reasons for SPDK to wake up in interrupt mode. ++ * ++ * You'll probably need bpftrace from https://github.com/fbs/el7-bpf-specs ++ * ++ * Usage: ++ * scripts/bpftrace.sh `pidof spdk_tgt` [all] ++ * all: show every event, not just the first after waking up ++ */ ++ ++tracepoint:sched:sched_switch /comm == "reactor_0"/ ++{ ++ if (str($1) == "all") { ++ printf("%llums: %s is off-cpu\n", elapsed / 1000000, comm); ++ } ++ @off = 1; ++} ++ ++/* ++ * We explicitly filter out the framework-level handlers here in favour of the ++ * more specific tracepoints below. 
++ */ ++usdt:__EXE__:spdk:interrupt_fd_process / ++ @off == 1 && ++ strncmp(str(arg1), "event_queue_run_batch", 40) != 0 && ++ strncmp(str(arg1), "interrupt_timerfd_process", 40) != 0 && ++ strncmp(str(arg1), "thread_interrupt_msg_process", 40) != 0 && ++ strncmp(str(arg1), "thread_process_interrupts", 40) != 0 ++/ ++{ ++ printf("%llums:%s: fd:%d %s(%p)\n", elapsed / 1000000, probe, arg2, usym(arg3), arg4); ++ if (str($1) != "all") { ++ @off = 0; ++ } ++} ++ ++usdt:__EXE__:spdk:timerfd_exec /@off == 1/ ++{ ++ printf("%llums:%s: %s(%p)\n", elapsed / 1000000, probe, usym(arg1), arg2); ++ if (str($1) != "all") { ++ @off = 0; ++ } ++} ++ ++usdt:__EXE__:spdk:msg_exec /@off == 1/ ++{ ++ printf("%llums:%s: %s(%p)\n", elapsed / 1000000, probe, usym(arg1), arg2); ++ if (str($1) != "all") { ++ @off = 0; ++ } ++} ++ ++usdt:__EXE__:spdk:event_exec /@off == 1/ ++{ ++ printf("%llums:%s: %s(%p, %p)\n", elapsed / 1000000, probe, usym(arg1), ++ arg2, arg3); ++ if (str($1) != "all") { ++ @off = 0; ++ } ++} +diff --git a/scripts/bpf/nvme.bt b/scripts/bpf/nvme.bt +index 107a0d0..bd13164 100644 +--- a/scripts/bpf/nvme.bt ++++ b/scripts/bpf/nvme.bt +@@ -1,23 +1,23 @@ +-usdt:__EXE__:bdev_nvme_ctrlr_delete { +- printf("%d.%06d: delete bdev controller %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} +- +-usdt:__EXE__:bdev_nvme_ctrlr_release { +- printf("%d.%06d: release controller %s with ref %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2); +-} +- +-usdt:__EXE__:bdev_nvme_destruct { +- printf("%d.%06d: delete nvme disk: %s, with nsid: %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2); +-} +- +-usdt:__EXE__:bdev_nvme_create_qpair { +- printf("%d.%06d: controller: %s create qpair with ID: %d on thread: %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2, arg3); +-} ++usdt:__EXE__:bdev_nvme_ctrlr_delete { ++ printf("%d.%06d: delete bdev controller %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} ++ ++usdt:__EXE__:bdev_nvme_ctrlr_release { ++ printf("%d.%06d: release controller %s with ref %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2); ++} ++ ++usdt:__EXE__:bdev_nvme_destruct { ++ printf("%d.%06d: delete nvme disk: %s, with nsid: %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2); ++} ++ ++usdt:__EXE__:bdev_nvme_create_qpair { ++ printf("%d.%06d: controller: %s create qpair with ID: %d on thread: %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2, arg3); ++} +diff --git a/scripts/bpf/nvmf.bt b/scripts/bpf/nvmf.bt +index 05dc6a7..0810fa3 100644 +--- a/scripts/bpf/nvmf.bt ++++ b/scripts/bpf/nvmf.bt +@@ -1,191 +1,191 @@ +-usdt:__EXE__:nvmf_tgt_state { +- printf("%d.%06d: nvmf_tgt reached state %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- @target[arg1]); +-} +- +-usdt:__EXE__:nvmf_subsystem_change_state { +- printf("%d.%06d: %s change state from %s to %s start\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), @subsystem[arg3], @subsystem[arg2]); +-} +- +-usdt:__EXE__:nvmf_subsystem_change_state_done { +- printf("%d.%06d: %s change state from %s to %s %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), @subsystem[arg3], @subsystem[arg2], arg4 ? 
"failed" : "done"); +-} +- +-usdt:__EXE__:nvmf_pg_change_state { +- printf("%d.%06d: %s on thread %d state to %s start\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg3, @subsystem[arg2]); +-} +- +-usdt:__EXE__:nvmf_pg_change_state_done { +- printf("%d.%06d: %s on thread %d state to %s done\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg3, @subsystem[arg2]); +-} +- +-usdt:__EXE__:nvmf_create_poll_group { +- printf("%d.%06d: create poll group on thread: %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1); +-} +- +-usdt:__EXE__:nvmf_destroy_poll_group { +- printf("%d.%06d: destroy poll group on thread: %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1); +-} +- +-usdt:__EXE__:nvmf_poll_group_add_qpair { +- printf("%d.%06d: add qpair: %p to poll group on thread %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2); +-} +- +-usdt:__EXE__:nvmf_destroy_poll_group_qpairs { +- printf("%d.%06d: destroy qpairs on poll group on thread %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1); +-} +- +-usdt:__EXE__:nvmf_poll_group_remove_qpair { +- printf("%d.%06d: remove qpair: %p from poll group on thread %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2); +-} +- +-usdt:__EXE__:nvmf_qpair_disconnect { +- printf("%d.%06d: disconnect qpair: %p from poll group on thread %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2); +-} +- +-usdt:__EXE__:nvmf_transport_qpair_fini { +- printf("%d.%06d: destroy qpair: %p on transport layer\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1); +-} +- +-usdt:__EXE__:nvmf_poll_group_drain_qpair { +- printf("%d.%06d: drain qpair: %p from poll group on thread %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2); +-} +- +-usdt:__EXE__:nvmf_ctrlr_add_qpair { +- printf("%d.%06d: %s add qpair: %p, qid: %d for host: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg3), arg1, arg2, str(arg4)); +-} +- +-usdt:__EXE__:nvmf_subsystem_add_host { +- printf("%d.%06d: subsystem: %s add host: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), str(arg2)); +-} +- +-usdt:__EXE__:nvmf_subsystem_remove_host { +- printf("%d.%06d: subsystem: %s remove host: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), str(arg2)); +-} +- +-usdt:__EXE__:nvmf_subsystem_add_listener { +- printf("%d.%06d: subsystem: %d add listener with trtype: %d, traddr: %s, trsvcid: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, str(arg3), str(arg4)); +-} +- +-usdt:__EXE__:nvmf_subsystem_remove_listener { +- printf("%d.%06d: subsystem: %d remove listener with trtype: %d, traddr: %s, trsvcid: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, str(arg3), str(arg4)); +-} +- +-usdt:__EXE__:nvmf_subsystem_create { +- printf("%d.%06d: create subsystem: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} +- +-usdt:__EXE__:nvmf_subsystem_destroy { +- printf("%d.%06d: destroy subsystem: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} +- +-usdt:__EXE__:nvmf_ns_change { +- printf("%d.%06d: 
namespace change: %d for subsystem: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, str(arg2)); +-} +- +-usdt:__EXE__:nvmf_subsystem_add_ns { +- printf("%d.%06d: subsystem: %s add namespace: %d\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2); +-} +- +-usdt:__EXE__:nvmf_subsystem_add_ctrlr { +- printf("%d.%06d: subsystem: %s add controller: %p on host: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2, str(arg3)); +-} +- +-usdt:__EXE__:nvmf_subsystem_remove_ctrlr { +- printf("%d.%06d: subsystem: %s remove controller: %p on host: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1), arg2, str(arg3)); +-} +- +-usdt:__EXE__:nvmf_transport_poll_group_add { +- printf("%d.%06d: add qpair: %p with id: %d to poll group on thread: %lu\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, arg3); +-} +- +-usdt:__EXE__:nvmf_transport_poll_group_remove { +- printf("%d.%06d: remove qpair: %p with id: %d from poll group on thread: %lu\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, arg3); +-} +- +-usdt:__EXE__:nvmf_tgt_add_transport { +- printf("%d.%06d: add transport: %p to target: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, str(arg2)); +-} +- +-usdt:__EXE__:nvmf_poll_group_add_subsystem { +- printf("%d.%06d: poll group: %p add subsystem: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, str(arg2)); +-} +- +-usdt:__EXE__:nvmf_poll_group_remove_subsystem { +- printf("%d.%06d: poll group: %p on thread: %lu remove subsystem: %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, str(arg3)); +-} +- +-usdt:__EXE__:nvmf_ctrlr_create { +- printf("%d.%06d: create ctrlr: %p on subsystem: %s on thread: %lu\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, str(arg2), arg3); +-} +- +-usdt:__EXE__:nvmf_ctrlr_destruct { +- printf("%d.%06d: destroy ctrlr: %p on subsystem: %s on thread: %lu\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, str(arg2), arg3); +-} +- +-usdt:__EXE__:nvmf_ctrlr_add_io_qpair { +- printf("%d.%06d: ctrlr: %p add io qpair: %p with id: %d on thread: %lu\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- arg1, arg2, arg3, arg4); +-} ++usdt:__EXE__:nvmf_tgt_state { ++ printf("%d.%06d: nvmf_tgt reached state %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ @target[arg1]); ++} ++ ++usdt:__EXE__:nvmf_subsystem_change_state { ++ printf("%d.%06d: %s change state from %s to %s start\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), @subsystem[arg3], @subsystem[arg2]); ++} ++ ++usdt:__EXE__:nvmf_subsystem_change_state_done { ++ printf("%d.%06d: %s change state from %s to %s %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), @subsystem[arg3], @subsystem[arg2], arg4 ? 
"failed" : "done"); ++} ++ ++usdt:__EXE__:nvmf_pg_change_state { ++ printf("%d.%06d: %s on thread %d state to %s start\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg3, @subsystem[arg2]); ++} ++ ++usdt:__EXE__:nvmf_pg_change_state_done { ++ printf("%d.%06d: %s on thread %d state to %s done\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg3, @subsystem[arg2]); ++} ++ ++usdt:__EXE__:nvmf_create_poll_group { ++ printf("%d.%06d: create poll group on thread: %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1); ++} ++ ++usdt:__EXE__:nvmf_destroy_poll_group { ++ printf("%d.%06d: destroy poll group on thread: %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1); ++} ++ ++usdt:__EXE__:nvmf_poll_group_add_qpair { ++ printf("%d.%06d: add qpair: %p to poll group on thread %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2); ++} ++ ++usdt:__EXE__:nvmf_destroy_poll_group_qpairs { ++ printf("%d.%06d: destroy qpairs on poll group on thread %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1); ++} ++ ++usdt:__EXE__:nvmf_poll_group_remove_qpair { ++ printf("%d.%06d: remove qpair: %p from poll group on thread %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2); ++} ++ ++usdt:__EXE__:nvmf_qpair_disconnect { ++ printf("%d.%06d: disconnect qpair: %p from poll group on thread %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2); ++} ++ ++usdt:__EXE__:nvmf_transport_qpair_fini { ++ printf("%d.%06d: destroy qpair: %p on transport layer\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1); ++} ++ ++usdt:__EXE__:nvmf_poll_group_drain_qpair { ++ printf("%d.%06d: drain qpair: %p from poll group on thread %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2); ++} ++ ++usdt:__EXE__:nvmf_ctrlr_add_qpair { ++ printf("%d.%06d: %s add qpair: %p, qid: %d for host: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg3), arg1, arg2, str(arg4)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_add_host { ++ printf("%d.%06d: subsystem: %s add host: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), str(arg2)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_remove_host { ++ printf("%d.%06d: subsystem: %s remove host: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), str(arg2)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_add_listener { ++ printf("%d.%06d: subsystem: %d add listener with trtype: %d, traddr: %s, trsvcid: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, str(arg3), str(arg4)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_remove_listener { ++ printf("%d.%06d: subsystem: %d remove listener with trtype: %d, traddr: %s, trsvcid: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, str(arg3), str(arg4)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_create { ++ printf("%d.%06d: create subsystem: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_destroy { ++ printf("%d.%06d: destroy subsystem: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} ++ ++usdt:__EXE__:nvmf_ns_change { ++ printf("%d.%06d: 
namespace change: %d for subsystem: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, str(arg2)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_add_ns { ++ printf("%d.%06d: subsystem: %s add namespace: %d\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2); ++} ++ ++usdt:__EXE__:nvmf_subsystem_add_ctrlr { ++ printf("%d.%06d: subsystem: %s add controller: %p on host: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2, str(arg3)); ++} ++ ++usdt:__EXE__:nvmf_subsystem_remove_ctrlr { ++ printf("%d.%06d: subsystem: %s remove controller: %p on host: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1), arg2, str(arg3)); ++} ++ ++usdt:__EXE__:nvmf_transport_poll_group_add { ++ printf("%d.%06d: add qpair: %p with id: %d to poll group on thread: %lu\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, arg3); ++} ++ ++usdt:__EXE__:nvmf_transport_poll_group_remove { ++ printf("%d.%06d: remove qpair: %p with id: %d from poll group on thread: %lu\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, arg3); ++} ++ ++usdt:__EXE__:nvmf_tgt_add_transport { ++ printf("%d.%06d: add transport: %p to target: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, str(arg2)); ++} ++ ++usdt:__EXE__:nvmf_poll_group_add_subsystem { ++ printf("%d.%06d: poll group: %p add subsystem: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, str(arg2)); ++} ++ ++usdt:__EXE__:nvmf_poll_group_remove_subsystem { ++ printf("%d.%06d: poll group: %p on thread: %lu remove subsystem: %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, str(arg3)); ++} ++ ++usdt:__EXE__:nvmf_ctrlr_create { ++ printf("%d.%06d: create ctrlr: %p on subsystem: %s on thread: %lu\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, str(arg2), arg3); ++} ++ ++usdt:__EXE__:nvmf_ctrlr_destruct { ++ printf("%d.%06d: destroy ctrlr: %p on subsystem: %s on thread: %lu\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, str(arg2), arg3); ++} ++ ++usdt:__EXE__:nvmf_ctrlr_add_io_qpair { ++ printf("%d.%06d: ctrlr: %p add io qpair: %p with id: %d on thread: %lu\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ arg1, arg2, arg3, arg4); ++} +diff --git a/scripts/bpf/nvmf_path.bt b/scripts/bpf/nvmf_path.bt +index e67e9c6..004eb33 100644 +--- a/scripts/bpf/nvmf_path.bt ++++ b/scripts/bpf/nvmf_path.bt +@@ -1,8 +1,8 @@ +-usdt:__EXE__:nvmf_request_io_exec_path { +- @path[str(arg2), str(arg3)] = count(); +-} +- +-interval:s:1 { +- print(@path); +- clear(@path); +-} ++usdt:__EXE__:nvmf_request_io_exec_path { ++ @path[str(arg2), str(arg3)] = count(); ++} ++ ++interval:s:1 { ++ print(@path); ++ clear(@path); ++} +diff --git a/scripts/bpf/nvmf_timeout.bt b/scripts/bpf/nvmf_timeout.bt +index 6a34435..f6b8d50 100644 +--- a/scripts/bpf/nvmf_timeout.bt ++++ b/scripts/bpf/nvmf_timeout.bt +@@ -1,17 +1,17 @@ +-usdt:__EXE__:bdev_nvme_ctrlr_reconnect { +- printf("%d.%06d: reconnect bdev controller %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} +- +-usdt:__EXE__:bdev_nvme_ctrlr_reconnect_delay { +- printf("%d.%06d: reconnect delay bdev controller %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} +- 
+-usdt:__EXE__:bdev_nvme_ctrlr_reset { +- printf("%d.%06d: reset bdev controller %s\n", +- elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), +- str(arg1)); +-} ++usdt:__EXE__:bdev_nvme_ctrlr_reconnect { ++ printf("%d.%06d: reconnect bdev controller %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} ++ ++usdt:__EXE__:bdev_nvme_ctrlr_reconnect_delay { ++ printf("%d.%06d: reconnect delay bdev controller %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} ++ ++usdt:__EXE__:bdev_nvme_ctrlr_reset { ++ printf("%d.%06d: reset bdev controller %s\n", ++ elapsed / (uint64)(1000 * 1000), elapsed % (uint64)(1000 * 1000), ++ str(arg1)); ++} +diff --git a/scripts/bpf/readv.bt b/scripts/bpf/readv.bt +index 36abebf..0dd8391 100644 +--- a/scripts/bpf/readv.bt ++++ b/scripts/bpf/readv.bt +@@ -1,3 +1,3 @@ +-tracepoint:syscalls:sys_exit_readv /pid == __PID__/ { +- @bytes = hist(args->ret); +-} ++tracepoint:syscalls:sys_exit_readv /pid == __PID__/ { ++ @bytes = hist(args->ret); ++} +diff --git a/scripts/bpf/sched.bt b/scripts/bpf/sched.bt +index e941d2c..9d362f9 100644 +--- a/scripts/bpf/sched.bt ++++ b/scripts/bpf/sched.bt +@@ -1,62 +1,62 @@ +- +-struct spdk_thread_stats { +- uint64_t busy_tsc; +- uint64_t idle_tsc; +-}; +- +-struct spdk_scheduler_thread_info { +- uint32_t lcore; +- uint64_t thread_id; +- /* Defining these as a 1-element array here allows us to +- * create local variables for these members when accessing +- * them which improves readability. +- */ +- struct spdk_thread_stats total_stats[1]; +- struct spdk_thread_stats current_stats[1]; +-}; +- +-struct spdk_scheduler_core_info { +- uint64_t total_idle_tsc; +- uint64_t total_busy_tsc; +- uint64_t current_idle_tsc; +- uint64_t current_busy_tsc; +- uint32_t lcore; +- uint32_t threads_count; +- bool interrupt_mode; +- struct spdk_scheduler_thread_info *thread_infos; +-}; +- +-usdt:__EXE__:dynsched_move { +- $info = (struct spdk_scheduler_thread_info *)arg1; +- $stats = (struct spdk_thread_stats *)$info->current_stats; +- if ($stats->busy_tsc > 0) { +- $thread_pct = $stats->busy_tsc * 100 / ($stats->busy_tsc + $stats->idle_tsc); +- $core_pct = $stats->busy_tsc * 100 / (@cores_busy_tsc[$info->lcore] + @cores_idle_tsc[$info->lcore]); +- } else { +- $thread_pct = 0; +- $core_pct = 0; +- } +- printf("td:%2d old:%2d new:%2d td_busy:%2d% core_busy:%2d%\n", +- $info->thread_id, $info->lcore, arg2, $thread_pct, $core_pct); +-} +- +-usdt:__EXE__:dynsched_balance { +- printf("\n"); +- clear(@cores_busy_tsc); +- clear(@cores_idle_tsc); +- printf("Starting balance across %d cores\n", arg1); +-} +- +-usdt:__EXE__:dynsched_core_info { +- $info = (struct spdk_scheduler_core_info *)arg2; +- $busy = $info->current_busy_tsc; +- $idle = $info->current_idle_tsc; +- if ($busy > 0) { +- $pct = $busy * 100 / ($busy + $idle); +- } else { +- $pct = 0; +- } +- printf("core:%2d busy:%10d idle:%10d pct:%2d% td_count:%2d\n", arg1, $busy, $idle, $pct, $info->threads_count); +- @cores_busy_tsc[arg1] = $busy; +- @cores_idle_tsc[arg1] = $idle; +-} ++ ++struct spdk_thread_stats { ++ uint64_t busy_tsc; ++ uint64_t idle_tsc; ++}; ++ ++struct spdk_scheduler_thread_info { ++ uint32_t lcore; ++ uint64_t thread_id; ++ /* Defining these as a 1-element array here allows us to ++ * create local variables for these members when accessing ++ * them which improves readability. 
++ */ ++ struct spdk_thread_stats total_stats[1]; ++ struct spdk_thread_stats current_stats[1]; ++}; ++ ++struct spdk_scheduler_core_info { ++ uint64_t total_idle_tsc; ++ uint64_t total_busy_tsc; ++ uint64_t current_idle_tsc; ++ uint64_t current_busy_tsc; ++ uint32_t lcore; ++ uint32_t threads_count; ++ bool interrupt_mode; ++ struct spdk_scheduler_thread_info *thread_infos; ++}; ++ ++usdt:__EXE__:dynsched_move { ++ $info = (struct spdk_scheduler_thread_info *)arg1; ++ $stats = (struct spdk_thread_stats *)$info->current_stats; ++ if ($stats->busy_tsc > 0) { ++ $thread_pct = $stats->busy_tsc * 100 / ($stats->busy_tsc + $stats->idle_tsc); ++ $core_pct = $stats->busy_tsc * 100 / (@cores_busy_tsc[$info->lcore] + @cores_idle_tsc[$info->lcore]); ++ } else { ++ $thread_pct = 0; ++ $core_pct = 0; ++ } ++ printf("td:%2d old:%2d new:%2d td_busy:%2d% core_busy:%2d%\n", ++ $info->thread_id, $info->lcore, arg2, $thread_pct, $core_pct); ++} ++ ++usdt:__EXE__:dynsched_balance { ++ printf("\n"); ++ clear(@cores_busy_tsc); ++ clear(@cores_idle_tsc); ++ printf("Starting balance across %d cores\n", arg1); ++} ++ ++usdt:__EXE__:dynsched_core_info { ++ $info = (struct spdk_scheduler_core_info *)arg2; ++ $busy = $info->current_busy_tsc; ++ $idle = $info->current_idle_tsc; ++ if ($busy > 0) { ++ $pct = $busy * 100 / ($busy + $idle); ++ } else { ++ $pct = 0; ++ } ++ printf("core:%2d busy:%10d idle:%10d pct:%2d% td_count:%2d\n", arg1, $busy, $idle, $pct, $info->threads_count); ++ @cores_busy_tsc[arg1] = $busy; ++ @cores_idle_tsc[arg1] = $idle; ++} +diff --git a/scripts/bpf/send_msg.bt b/scripts/bpf/send_msg.bt +index 2e1bedf..f25bc83 100644 +--- a/scripts/bpf/send_msg.bt ++++ b/scripts/bpf/send_msg.bt +@@ -1,7 +1,7 @@ +-uprobe:__EXE__:spdk_thread_send_msg { +- @send_msg[usym(arg1)] = count(); +-} +- +-uprobe:__EXE__:spdk_for_each_channel { +- @for_each_channel[usym(arg1)] = count(); +-} ++uprobe:__EXE__:spdk_thread_send_msg { ++ @send_msg[usym(arg1)] = count(); ++} ++ ++uprobe:__EXE__:spdk_for_each_channel { ++ @for_each_channel[usym(arg1)] = count(); ++} +diff --git a/scripts/bpf/syscalls.bt b/scripts/bpf/syscalls.bt +index 858ad75..c99142a 100644 +--- a/scripts/bpf/syscalls.bt ++++ b/scripts/bpf/syscalls.bt +@@ -1,9 +1,9 @@ +-tracepoint:syscalls:sys_enter_* /pid == __PID__/ { +- @syscall[probe] = count(); +-} +- +-interval:s:1 { +- printf("\n"); +- print(@syscall); +- clear(@syscall); +-} ++tracepoint:syscalls:sys_enter_* /pid == __PID__/ { ++ @syscall[probe] = count(); ++} ++ ++interval:s:1 { ++ printf("\n"); ++ print(@syscall); ++ clear(@syscall); ++} +diff --git a/scripts/bpf/trace.py b/scripts/bpf/trace.py +index 81dbd32..b38e46c 100644 +--- a/scripts/bpf/trace.py ++++ b/scripts/bpf/trace.py +@@ -1,636 +1,636 @@ +-#!/usr/bin/env python3 +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. 
+-# +- +- +-from argparse import ArgumentParser +-from dataclasses import dataclass, field +-from itertools import islice +-from typing import Dict, List, TypeVar +-import ctypes as ct +-import ijson +-import magic +-import os +-import re +-import subprocess +-import sys +-import tempfile +- +-TSC_MAX = (1 << 64) - 1 +-UCHAR_MAX = (1 << 8) - 1 +-TRACE_MAX_LCORE = 128 +-TRACE_MAX_GROUP_ID = 16 +-TRACE_MAX_TPOINT_ID = TRACE_MAX_GROUP_ID * 64 +-TRACE_MAX_ARGS_COUNT = 8 +-TRACE_MAX_RELATIONS = 16 +-TRACE_INVALID_OBJECT = (1 << 64) - 1 +-OBJECT_NONE = 0 +-OWNER_NONE = 0 +- +- +-@dataclass +-class DTraceArgument: +- """Describes a DTrace probe (usdt) argument""" +- name: str +- pos: int +- type: type +- +- +-@dataclass +-class DTraceProbe: +- """Describes a DTrace probe (usdt) point""" +- name: str +- args: Dict[str, DTraceArgument] +- +- def __init__(self, name, args): +- self.name = name +- self.args = {a.name: a for a in args} +- +- +-@dataclass +-class DTraceEntry: +- """Describes a single DTrace probe invocation""" +- name: str +- args: Dict[str, TypeVar('ArgumentType', str, int)] +- +- def __init__(self, probe, args): +- valmap = {int: lambda x: int(x, 16), +- str: lambda x: x.strip().strip("'")} +- self.name = probe.name +- self.args = {} +- for name, value in args.items(): +- arg = probe.args.get(name) +- if arg is None: +- raise ValueError(f'Unexpected argument: {name}') +- self.args[name] = valmap[arg.type](value) +- +- +-class DTrace: +- """Generates bpftrace script based on the supplied probe points, parses its +- output and stores is as a list of DTraceEntry sorted by their tsc. +- """ +- def __init__(self, probes, file=None): +- self._avail_probes = self._list_probes() +- self._probes = {p.name: p for p in probes} +- self.entries = self._parse(file) if file is not None else [] +- # Sanitize the probe definitions +- for probe in probes: +- if probe.name not in self._avail_probes: +- raise ValueError(f'Couldn\'t find probe: "{probe.name}"') +- for arg in probe.args.values(): +- if arg.pos >= self._avail_probes[probe.name]: +- raise ValueError('Invalid probe argument position') +- if arg.type not in (int, str): +- raise ValueError('Invalid argument type') +- +- def _parse(self, file): +- regex = re.compile(r'(\w+): (.*)') +- entries = [] +- +- for line in file.readlines(): +- match = regex.match(line) +- if match is None: +- continue +- name, args = match.groups() +- probe = self._probes.get(name) +- # Skip the line if we don't recognize the probe name +- if probe is None: +- continue +- entries.append(DTraceEntry(probe, args=dict(a.strip().split('=') +- for a in args.split(',')))) +- entries.sort(key=lambda e: e.args['tsc']) +- return entries +- +- def _list_probes(self): +- files = subprocess.check_output(['git', 'ls-files', '*.[ch]', +- ':!:include/spdk_internal/usdt.h']) +- files = filter(lambda f: len(f) > 0, str(files, 'ascii').split('\n')) +- regex = re.compile(r'SPDK_DTRACE_PROBE([0-9]*)\((\w+)') +- probes = {} +- +- for fname in files: +- with open(fname, 'r') as file: +- for match in regex.finditer(file.read()): +- nargs, name = match.group(1), match.group(2) +- nargs = int(nargs) if len(nargs) > 0 else 0 +- # Add one to accommodate for the tsc being the first arg +- probes[name] = nargs + 1 +- return probes +- +- def _gen_usdt(self, probe): +- usdt = (f'usdt:__EXE__:{probe.name} {{' + +- f'printf("{probe.name}: ') +- args = probe.args +- if len(args) > 0: +- argtype = {int: '0x%lx', str: '\'%s\''} +- argcast = {int: lambda x: x, str: lambda x: f'str({x})'} +- argstr = 
[f'{a.name}={argtype[a.type]}' for a in args.values()] +- argval = [f'{argcast[a.type](f"arg{a.pos}")}' for a in args.values()] +- usdt += ', '.join(argstr) + '\\n", ' + ', '.join(argval) +- else: +- usdt += '\\n"' +- usdt += ');}' +- return usdt +- +- def generate(self): +- return '\n'.join([self._gen_usdt(p) for p in self._probes.values()]) +- +- def record(self, pid): +- with tempfile.NamedTemporaryFile(mode='w+') as script: +- script.write(self.generate()) +- script.flush() +- try: +- subprocess.run([f'{os.path.dirname(__file__)}/../bpftrace.sh', +- f'{pid}', f'{script.name}']) +- except KeyboardInterrupt: +- pass +- +- +-@dataclass +-class TracepointArgument: +- """Describes an SPDK tracepoint argument""" +- TYPE_INT = 0 +- TYPE_PTR = 1 +- TYPE_STR = 2 +- name: str +- argtype: int +- +- +-@dataclass +-class Tracepoint: +- """Describes an SPDK tracepoint, equivalent to struct spdk_trace_tpoint""" +- name: str +- id: int +- new_object: bool +- object_type: int +- owner_type: int +- args: List[TracepointArgument] +- +- +-@dataclass +-class TraceEntry: +- """Describes an SPDK tracepoint entry, equivalent to struct spdk_trace_entry""" +- lcore: int +- tpoint: Tracepoint +- tsc: int +- poller: str +- size: int +- object_id: str +- object_ptr: int +- time: int +- args: Dict[str, TypeVar('ArgumentType', str, int)] +- related: str +- +- +-class TraceProvider: +- """Defines interface for objects providing traces and tracepoint definitions""" +- +- def tpoints(self): +- """Returns tracepoint definitions as a dict of (tracepoint_name, tracepoint)""" +- raise NotImplementedError() +- +- def entries(self): +- """Generator returning subsequent trace entries""" +- raise NotImplementedError() +- +- def tsc_rate(self): +- """Returns the TSC rate that was in place when traces were collected""" +- raise NotImplementedError() +- +- +-class JsonProvider(TraceProvider): +- """Trace provider based on JSON-formatted output produced by spdk_trace app""" +- def __init__(self, file): +- self._parser = ijson.parse(file) +- self._tpoints = {} +- self._parse_defs() +- +- def _parse_tpoints(self, tpoints): +- for tpoint in tpoints: +- tpoint_id = tpoint['id'] +- self._tpoints[tpoint_id] = Tracepoint( +- name=tpoint['name'], id=tpoint_id, +- new_object=tpoint['new_object'], object_type=OBJECT_NONE, +- owner_type=OWNER_NONE, +- args=[TracepointArgument(name=a['name'], +- argtype=a['type']) +- for a in tpoint.get('args', [])]) +- +- def _parse_defs(self): +- builder = None +- for prefix, event, value in self._parser: +- # If we reach entries array, there are no more tracepoint definitions +- if prefix == 'entries': +- break +- elif prefix == 'tsc_rate': +- self._tsc_rate = value +- continue +- +- if (prefix, event) == ('tpoints', 'start_array'): +- builder = ijson.ObjectBuilder() +- if builder is not None: +- builder.event(event, value) +- if (prefix, event) == ('tpoints', 'end_array'): +- self._parse_tpoints(builder.value) +- builder = None +- +- def _parse_entry(self, entry): +- tpoint = self._tpoints[entry['tpoint']] +- obj = entry.get('object', {}) +- return TraceEntry(tpoint=tpoint, lcore=entry['lcore'], tsc=entry['tsc'], +- size=entry.get('size'), object_id=obj.get('id'), +- object_ptr=obj.get('value'), related=entry.get('related'), +- time=obj.get('time'), poller=entry.get('poller'), +- args={n.name: v for n, v in zip(tpoint.args, entry.get('args', []))}) +- +- def tsc_rate(self): +- return self._tsc_rate +- +- def tpoints(self): +- return self._tpoints +- +- def entries(self): +- builder = None +- for prefix, 
event, value in self._parser: +- if (prefix, event) == ('entries.item', 'start_map'): +- builder = ijson.ObjectBuilder() +- if builder is not None: +- builder.event(event, value) +- if (prefix, event) == ('entries.item', 'end_map'): +- yield self._parse_entry(builder.value) +- builder = None +- +- +-class CParserOpts(ct.Structure): +- _fields_ = [('filename', ct.c_char_p), +- ('mode', ct.c_int), +- ('lcore', ct.c_uint16)] +- +- +-class CTraceOwner(ct.Structure): +- _fields_ = [('type', ct.c_uint8), +- ('id_prefix', ct.c_char)] +- +- +-class CTraceObject(ct.Structure): +- _fields_ = [('type', ct.c_uint8), +- ('id_prefix', ct.c_char)] +- +- +-class CTpointArgument(ct.Structure): +- _fields_ = [('name', ct.c_char * 14), +- ('type', ct.c_uint8), +- ('size', ct.c_uint8)] +- +- +-class CTpointRelatedObject(ct.Structure): +- _fields_ = [('object_type', ct.c_uint8), +- ('arg_index', ct.c_uint8)] +- +- +-class CTracepoint(ct.Structure): +- _fields_ = [('name', ct.c_char * 24), +- ('tpoint_id', ct.c_uint16), +- ('owner_type', ct.c_uint8), +- ('object_type', ct.c_uint8), +- ('new_object', ct.c_uint8), +- ('num_args', ct.c_uint8), +- ('args', CTpointArgument * TRACE_MAX_ARGS_COUNT), +- ('related_objects', CTpointRelatedObject * TRACE_MAX_RELATIONS)] +- +- +-class CTraceFlags(ct.Structure): +- _fields_ = [('tsc_rate', ct.c_uint64), +- ('tpoint_mask', ct.c_uint64 * TRACE_MAX_GROUP_ID), +- ('owner', CTraceOwner * (UCHAR_MAX + 1)), +- ('object', CTraceObject * (UCHAR_MAX + 1)), +- ('tpoint', CTracepoint * TRACE_MAX_TPOINT_ID)] +- +- +-class CTraceEntry(ct.Structure): +- _fields_ = [('tsc', ct.c_uint64), +- ('tpoint_id', ct.c_uint16), +- ('poller_id', ct.c_uint16), +- ('size', ct.c_uint32), +- ('object_id', ct.c_uint64)] +- +- +-class CTraceParserArgument(ct.Union): +- _fields_ = [('integer', ct.c_uint64), +- ('pointer', ct.c_void_p), +- ('string', ct.c_char * (UCHAR_MAX + 1))] +- +- +-class CTraceParserEntry(ct.Structure): +- _fields_ = [('entry', ct.POINTER(CTraceEntry)), +- ('object_index', ct.c_uint64), +- ('object_start', ct.c_uint64), +- ('lcore', ct.c_uint16), +- ('related_index', ct.c_uint64), +- ('related_type', ct.c_uint8), +- ('args', CTraceParserArgument * TRACE_MAX_ARGS_COUNT)] +- +- +-class NativeProvider(TraceProvider): +- """Trace provider based on SPDK's trace library""" +- def __init__(self, file): +- self._setup_binding(file.name) +- self._parse_defs() +- +- def __del__(self): +- if hasattr(self, '_parser'): +- self._lib.spdk_trace_parser_cleanup(self._parser) +- +- def _setup_binding(self, filename): +- self._lib = ct.CDLL('build/lib/libspdk_trace_parser.so') +- self._lib.spdk_trace_parser_init.restype = ct.c_void_p +- self._lib.spdk_trace_parser_init.errcheck = lambda r, *_: ct.c_void_p(r) +- self._lib.spdk_trace_parser_get_flags.restype = ct.POINTER(CTraceFlags) +- opts = CParserOpts(filename=bytes(filename, 'ascii'), mode=0, +- lcore=TRACE_MAX_LCORE) +- self._parser = self._lib.spdk_trace_parser_init(ct.byref(opts)) +- if not self._parser: +- raise ValueError('Failed to construct SPDK trace parser') +- +- def _parse_tpoints(self, tpoints): +- self._tpoints = {} +- for tpoint in tpoints: +- if len(tpoint.name) == 0: +- continue +- self._tpoints[tpoint.tpoint_id] = Tracepoint( +- name=str(tpoint.name, 'ascii'), object_type=tpoint.object_type, +- owner_type=tpoint.owner_type, id=tpoint.tpoint_id, +- new_object=bool(tpoint.new_object), +- args=[TracepointArgument(name=str(a.name, 'ascii'), argtype=a.type) +- for a in tpoint.args[:tpoint.num_args]]) +- +- def _parse_defs(self): +- flags = 
self._lib.spdk_trace_parser_get_flags(self._parser) +- self._tsc_rate = flags.contents.tsc_rate +- self._parse_tpoints(flags.contents.tpoint) +- +- def conv_objs(arr): +- return {int(o.type): str(o.id_prefix, 'ascii') for o in arr if o.id_prefix != b'\x00'} +- self._owners = conv_objs(flags.contents.owner) +- self._objects = conv_objs(flags.contents.object) +- +- def tsc_rate(self): +- return self._tsc_rate +- +- def tpoints(self): +- return self._tpoints +- +- def entries(self): +- pe = CTraceParserEntry() +- argconv = {TracepointArgument.TYPE_INT: lambda a: a.integer, +- TracepointArgument.TYPE_PTR: lambda a: int(a.pointer or 0), +- TracepointArgument.TYPE_STR: lambda a: str(a.string, 'ascii')} +- +- while self._lib.spdk_trace_parser_next_entry(self._parser, ct.byref(pe)): +- entry = pe.entry.contents +- lcore = pe.lcore +- tpoint = self._tpoints[entry.tpoint_id] +- args = {a.name: argconv[a.argtype](pe.args[i]) for i, a in enumerate(tpoint.args)} +- +- if tpoint.object_type != OBJECT_NONE: +- if pe.object_index != TRACE_INVALID_OBJECT: +- object_id = '{}{}'.format(self._objects[tpoint.object_type], pe.object_index) +- ts = entry.tsc - pe.object_start +- else: +- object_id, ts = 'n/a', None +- elif entry.object_id != 0: +- object_id, ts = '{:x}'.format(entry.object_id), None +- else: +- object_id, ts = None, None +- +- if tpoint.owner_type != OWNER_NONE: +- poller_id = '{}{:02}'.format(self._owners[tpoint.owner_type], entry.poller_id) +- else: +- poller_id = None +- +- if pe.related_type != OBJECT_NONE: +- related = '{}{}'.format(self._objects[pe.related_type], pe.related_index) +- else: +- related = None +- +- yield TraceEntry(tpoint=tpoint, lcore=lcore, tsc=entry.tsc, +- size=entry.size, object_id=object_id, +- object_ptr=entry.object_id, poller=poller_id, time=ts, +- args=args, related=related) +- +- +-class Trace: +- """Stores, parses, and prints out SPDK traces""" +- def __init__(self, file): +- if file == sys.stdin or magic.from_file(file.name, mime=True) == 'application/json': +- self._provider = JsonProvider(file) +- else: +- self._provider = NativeProvider(file) +- self._objects = [] +- self._argfmt = {TracepointArgument.TYPE_PTR: lambda a: f'0x{a:x}'} +- self.tpoints = self._provider.tpoints() +- +- def _annotate_args(self, entry): +- annotations = {} +- for obj in self._objects: +- current = obj.annotate(entry) +- if current is None: +- continue +- annotations.update(current) +- return annotations +- +- def _format_args(self, entry): +- annotations = self._annotate_args(entry) +- args = [] +- for arg, (name, value) in zip(entry.tpoint.args, entry.args.items()): +- annot = annotations.get(name) +- if annot is not None: +- args.append('{}({})'.format(name, ', '.join(f'{n}={v}' for n, v in annot.items()))) +- else: +- args.append('{}: {}'.format(name, self._argfmt.get(arg.argtype, +- lambda a: a)(value))) +- return args +- +- def register_object(self, obj): +- self._objects.append(obj) +- +- def print(self): +- def get_us(tsc, off): +- return ((tsc - off) * 10 ** 6) / self._provider.tsc_rate() +- +- offset = None +- for e in self._provider.entries(): +- offset = e.tsc if offset is None else offset +- timestamp = get_us(e.tsc, offset) +- diff = get_us(e.time, 0) if e.time is not None else None +- args = ', '.join(self._format_args(e)) +- related = ' (' + e.related + ')' if e.related is not None else '' +- +- print(('{:3} {:16.3f} {:3} {:24} {:12}'.format( +- e.lcore, timestamp, e.poller if e.poller is not None else '', +- e.tpoint.name, f'size: {e.size}' if e.size else '') + +- 
(f'id: {e.object_id + related:12} ' if e.object_id is not None else '') + +- (f'time: {diff:<8.3f} ' if diff is not None else '') + +- args).rstrip()) +- +- +-class SPDKObject: +- """Describes a specific type of an SPDK objects (e.g. qpair, thread, etc.)""" +- @dataclass +- class Lifetime: +- """Describes a lifetime and properties of a particular SPDK object.""" +- begin: int +- end: int +- ptr: int +- properties: dict = field(default_factory=dict) +- +- def __init__(self, trace: Trace, tpoints: List[str]): +- self.tpoints = {} +- for name in tpoints: +- tpoint = next((t for t in trace.tpoints.values() if t.name == name), None) +- if tpoint is None: +- # Some tpoints might be undefined if configured without specific subsystems +- continue +- self.tpoints[tpoint.id] = tpoint +- +- def _annotate(self, entry: TraceEntry): +- """Abstract annotation method to be implemented by subclasses.""" +- raise NotImplementedError() +- +- def annotate(self, entry: TraceEntry): +- """Annotates a tpoint entry and returns a dict indexed by argname with values representing +- various object properties. For instance, {"qpair": {"qid": 1, "subnqn": "nqn"}} could be +- returned to annotate an argument called "qpair" with two items: "qid" and "subnqn". +- """ +- if entry.tpoint.id not in self.tpoints: +- return None +- return self._annotate(entry) +- +- +-class QPair(SPDKObject): +- def __init__(self, trace: Trace, dtrace: DTrace): +- super().__init__(trace, tpoints=[ +- 'RDMA_REQ_NEW', +- 'RDMA_REQ_NEED_BUFFER', +- 'RDMA_REQ_TX_PENDING_C2H', +- 'RDMA_REQ_TX_PENDING_H2C', +- 'RDMA_REQ_TX_H2C', +- 'RDMA_REQ_RDY_TO_EXECUTE', +- 'RDMA_REQ_EXECUTING', +- 'RDMA_REQ_EXECUTED', +- 'RDMA_REQ_RDY_TO_COMPL', +- 'RDMA_REQ_COMPLETING_C2H', +- 'RDMA_REQ_COMPLETING', +- 'RDMA_REQ_COMPLETED', +- 'TCP_REQ_NEW', +- 'TCP_REQ_NEED_BUFFER', +- 'TCP_REQ_TX_H_TO_C', +- 'TCP_REQ_RDY_TO_EXECUTE', +- 'TCP_REQ_EXECUTING', +- 'TCP_REQ_EXECUTED', +- 'TCP_REQ_RDY_TO_COMPLETE', +- 'TCP_REQ_TRANSFER_C2H', +- 'TCP_REQ_COMPLETED', +- 'TCP_WRITE_START', +- 'TCP_WRITE_DONE', +- 'TCP_READ_DONE', +- 'TCP_REQ_AWAIT_R2T_ACK']) +- self._objects = [] +- self._find_objects(dtrace.entries) +- +- def _find_objects(self, dprobes): +- def probe_match(probe, other): +- return probe.args['qpair'] == other.args['qpair'] +- +- for i, dprobe in enumerate(dprobes): +- if dprobe.name != 'nvmf_poll_group_add_qpair': +- continue +- # We've found a new qpair, now find the probe indicating its destruction +- last_idx, last = next((((i + j + 1), d) for j, d in enumerate(islice(dprobes, i, None)) +- if d.name == 'nvmf_poll_group_remove_qpair' and +- probe_match(d, dprobe)), (None, None)) +- obj = SPDKObject.Lifetime(begin=dprobe.args['tsc'], +- end=last.args['tsc'] if last is not None else TSC_MAX, +- ptr=dprobe.args['qpair'], +- properties={'ptr': hex(dprobe.args['qpair']), +- 'thread': dprobe.args['thread']}) +- for other in filter(lambda p: probe_match(p, dprobe), dprobes[i:last_idx]): +- if other.name == 'nvmf_ctrlr_add_qpair': +- for prop in ['qid', 'subnqn', 'hostnqn']: +- obj.properties[prop] = other.args[prop] +- self._objects.append(obj) +- +- def _annotate(self, entry): +- qpair = entry.args.get('qpair') +- if qpair is None: +- return None +- for obj in self._objects: +- if obj.ptr == qpair and obj.begin <= entry.tsc <= obj.end: +- return {'qpair': obj.properties} +- return None +- +- +-def build_dtrace(file=None): +- return DTrace([ +- DTraceProbe( +- name='nvmf_poll_group_add_qpair', +- args=[DTraceArgument(name='tsc', pos=0, type=int), +- 
DTraceArgument(name='qpair', pos=1, type=int), +- DTraceArgument(name='thread', pos=2, type=int)]), +- DTraceProbe( +- name='nvmf_poll_group_remove_qpair', +- args=[DTraceArgument(name='tsc', pos=0, type=int), +- DTraceArgument(name='qpair', pos=1, type=int), +- DTraceArgument(name='thread', pos=2, type=int)]), +- DTraceProbe( +- name='nvmf_ctrlr_add_qpair', +- args=[DTraceArgument(name='tsc', pos=0, type=int), +- DTraceArgument(name='qpair', pos=1, type=int), +- DTraceArgument(name='qid', pos=2, type=int), +- DTraceArgument(name='subnqn', pos=3, type=str), +- DTraceArgument(name='hostnqn', pos=4, type=str)])], file) +- +- +-def print_trace(trace_file, dtrace_file): +- dtrace = build_dtrace(dtrace_file) +- trace = Trace(trace_file) +- trace.register_object(QPair(trace, dtrace)) +- trace.print() +- +- +-def main(argv): +- parser = ArgumentParser(description='SPDK trace annotation script') +- parser.add_argument('-i', '--input', +- help='Trace file to annotate (either JSON generated by spdk_trace or ' + +- 'raw binary produced by the SPDK application itself)') +- parser.add_argument('-g', '--generate', help='Generate bpftrace script', action='store_true') +- parser.add_argument('-r', '--record', help='Record BPF traces on PID', metavar='PID', type=int) +- parser.add_argument('-b', '--bpftrace', help='BPF trace script to use for annotations') +- args = parser.parse_args(argv) +- +- if args.generate: +- print(build_dtrace().generate()) +- elif args.record: +- build_dtrace().record(args.record) +- else: +- print_trace(open(args.input, 'r') if args.input is not None else sys.stdin, +- open(args.bpftrace) if args.bpftrace is not None else None) +- +- +-if __name__ == '__main__': +- # In order for the changes to LD_LIBRARY_PATH to be visible to the loader, +- # they need to be applied before starting a process, so we need to +- # re-execute the script after updating it. +- if os.environ.get('SPDK_BPF_TRACE_PY') is None: +- rootdir = f'{os.path.dirname(__file__)}/../..' +- os.environ['LD_LIBRARY_PATH'] = ':'.join([os.environ.get('LD_LIBRARY_PATH', ''), +- f'{rootdir}/build/lib']) +- os.environ['SPDK_BPF_TRACE_PY'] = '1' +- os.execv(sys.argv[0], sys.argv) +- else: +- try: +- main(sys.argv[1:]) +- except (KeyboardInterrupt, BrokenPipeError): +- pass ++#!/usr/bin/env python3 ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright (C) 2021 Intel Corporation ++# All rights reserved. 
++# ++ ++ ++from argparse import ArgumentParser ++from dataclasses import dataclass, field ++from itertools import islice ++from typing import Dict, List, TypeVar ++import ctypes as ct ++import ijson ++import magic ++import os ++import re ++import subprocess ++import sys ++import tempfile ++ ++TSC_MAX = (1 << 64) - 1 ++UCHAR_MAX = (1 << 8) - 1 ++TRACE_MAX_LCORE = 128 ++TRACE_MAX_GROUP_ID = 16 ++TRACE_MAX_TPOINT_ID = TRACE_MAX_GROUP_ID * 64 ++TRACE_MAX_ARGS_COUNT = 8 ++TRACE_MAX_RELATIONS = 16 ++TRACE_INVALID_OBJECT = (1 << 64) - 1 ++OBJECT_NONE = 0 ++OWNER_NONE = 0 ++ ++ ++@dataclass ++class DTraceArgument: ++ """Describes a DTrace probe (usdt) argument""" ++ name: str ++ pos: int ++ type: type ++ ++ ++@dataclass ++class DTraceProbe: ++ """Describes a DTrace probe (usdt) point""" ++ name: str ++ args: Dict[str, DTraceArgument] ++ ++ def __init__(self, name, args): ++ self.name = name ++ self.args = {a.name: a for a in args} ++ ++ ++@dataclass ++class DTraceEntry: ++ """Describes a single DTrace probe invocation""" ++ name: str ++ args: Dict[str, TypeVar('ArgumentType', str, int)] ++ ++ def __init__(self, probe, args): ++ valmap = {int: lambda x: int(x, 16), ++ str: lambda x: x.strip().strip("'")} ++ self.name = probe.name ++ self.args = {} ++ for name, value in args.items(): ++ arg = probe.args.get(name) ++ if arg is None: ++ raise ValueError(f'Unexpected argument: {name}') ++ self.args[name] = valmap[arg.type](value) ++ ++ ++class DTrace: ++ """Generates bpftrace script based on the supplied probe points, parses its ++ output and stores is as a list of DTraceEntry sorted by their tsc. ++ """ ++ def __init__(self, probes, file=None): ++ self._avail_probes = self._list_probes() ++ self._probes = {p.name: p for p in probes} ++ self.entries = self._parse(file) if file is not None else [] ++ # Sanitize the probe definitions ++ for probe in probes: ++ if probe.name not in self._avail_probes: ++ raise ValueError(f'Couldn\'t find probe: "{probe.name}"') ++ for arg in probe.args.values(): ++ if arg.pos >= self._avail_probes[probe.name]: ++ raise ValueError('Invalid probe argument position') ++ if arg.type not in (int, str): ++ raise ValueError('Invalid argument type') ++ ++ def _parse(self, file): ++ regex = re.compile(r'(\w+): (.*)') ++ entries = [] ++ ++ for line in file.readlines(): ++ match = regex.match(line) ++ if match is None: ++ continue ++ name, args = match.groups() ++ probe = self._probes.get(name) ++ # Skip the line if we don't recognize the probe name ++ if probe is None: ++ continue ++ entries.append(DTraceEntry(probe, args=dict(a.strip().split('=') ++ for a in args.split(',')))) ++ entries.sort(key=lambda e: e.args['tsc']) ++ return entries ++ ++ def _list_probes(self): ++ files = subprocess.check_output(['git', 'ls-files', '*.[ch]', ++ ':!:include/spdk_internal/usdt.h']) ++ files = filter(lambda f: len(f) > 0, str(files, 'ascii').split('\n')) ++ regex = re.compile(r'SPDK_DTRACE_PROBE([0-9]*)\((\w+)') ++ probes = {} ++ ++ for fname in files: ++ with open(fname, 'r') as file: ++ for match in regex.finditer(file.read()): ++ nargs, name = match.group(1), match.group(2) ++ nargs = int(nargs) if len(nargs) > 0 else 0 ++ # Add one to accommodate for the tsc being the first arg ++ probes[name] = nargs + 1 ++ return probes ++ ++ def _gen_usdt(self, probe): ++ usdt = (f'usdt:__EXE__:{probe.name} {{' + ++ f'printf("{probe.name}: ') ++ args = probe.args ++ if len(args) > 0: ++ argtype = {int: '0x%lx', str: '\'%s\''} ++ argcast = {int: lambda x: x, str: lambda x: f'str({x})'} ++ argstr = 
[f'{a.name}={argtype[a.type]}' for a in args.values()] ++ argval = [f'{argcast[a.type](f"arg{a.pos}")}' for a in args.values()] ++ usdt += ', '.join(argstr) + '\\n", ' + ', '.join(argval) ++ else: ++ usdt += '\\n"' ++ usdt += ');}' ++ return usdt ++ ++ def generate(self): ++ return '\n'.join([self._gen_usdt(p) for p in self._probes.values()]) ++ ++ def record(self, pid): ++ with tempfile.NamedTemporaryFile(mode='w+') as script: ++ script.write(self.generate()) ++ script.flush() ++ try: ++ subprocess.run([f'{os.path.dirname(__file__)}/../bpftrace.sh', ++ f'{pid}', f'{script.name}']) ++ except KeyboardInterrupt: ++ pass ++ ++ ++@dataclass ++class TracepointArgument: ++ """Describes an SPDK tracepoint argument""" ++ TYPE_INT = 0 ++ TYPE_PTR = 1 ++ TYPE_STR = 2 ++ name: str ++ argtype: int ++ ++ ++@dataclass ++class Tracepoint: ++ """Describes an SPDK tracepoint, equivalent to struct spdk_trace_tpoint""" ++ name: str ++ id: int ++ new_object: bool ++ object_type: int ++ owner_type: int ++ args: List[TracepointArgument] ++ ++ ++@dataclass ++class TraceEntry: ++ """Describes an SPDK tracepoint entry, equivalent to struct spdk_trace_entry""" ++ lcore: int ++ tpoint: Tracepoint ++ tsc: int ++ poller: str ++ size: int ++ object_id: str ++ object_ptr: int ++ time: int ++ args: Dict[str, TypeVar('ArgumentType', str, int)] ++ related: str ++ ++ ++class TraceProvider: ++ """Defines interface for objects providing traces and tracepoint definitions""" ++ ++ def tpoints(self): ++ """Returns tracepoint definitions as a dict of (tracepoint_name, tracepoint)""" ++ raise NotImplementedError() ++ ++ def entries(self): ++ """Generator returning subsequent trace entries""" ++ raise NotImplementedError() ++ ++ def tsc_rate(self): ++ """Returns the TSC rate that was in place when traces were collected""" ++ raise NotImplementedError() ++ ++ ++class JsonProvider(TraceProvider): ++ """Trace provider based on JSON-formatted output produced by spdk_trace app""" ++ def __init__(self, file): ++ self._parser = ijson.parse(file) ++ self._tpoints = {} ++ self._parse_defs() ++ ++ def _parse_tpoints(self, tpoints): ++ for tpoint in tpoints: ++ tpoint_id = tpoint['id'] ++ self._tpoints[tpoint_id] = Tracepoint( ++ name=tpoint['name'], id=tpoint_id, ++ new_object=tpoint['new_object'], object_type=OBJECT_NONE, ++ owner_type=OWNER_NONE, ++ args=[TracepointArgument(name=a['name'], ++ argtype=a['type']) ++ for a in tpoint.get('args', [])]) ++ ++ def _parse_defs(self): ++ builder = None ++ for prefix, event, value in self._parser: ++ # If we reach entries array, there are no more tracepoint definitions ++ if prefix == 'entries': ++ break ++ elif prefix == 'tsc_rate': ++ self._tsc_rate = value ++ continue ++ ++ if (prefix, event) == ('tpoints', 'start_array'): ++ builder = ijson.ObjectBuilder() ++ if builder is not None: ++ builder.event(event, value) ++ if (prefix, event) == ('tpoints', 'end_array'): ++ self._parse_tpoints(builder.value) ++ builder = None ++ ++ def _parse_entry(self, entry): ++ tpoint = self._tpoints[entry['tpoint']] ++ obj = entry.get('object', {}) ++ return TraceEntry(tpoint=tpoint, lcore=entry['lcore'], tsc=entry['tsc'], ++ size=entry.get('size'), object_id=obj.get('id'), ++ object_ptr=obj.get('value'), related=entry.get('related'), ++ time=obj.get('time'), poller=entry.get('poller'), ++ args={n.name: v for n, v in zip(tpoint.args, entry.get('args', []))}) ++ ++ def tsc_rate(self): ++ return self._tsc_rate ++ ++ def tpoints(self): ++ return self._tpoints ++ ++ def entries(self): ++ builder = None ++ for prefix, 
event, value in self._parser: ++ if (prefix, event) == ('entries.item', 'start_map'): ++ builder = ijson.ObjectBuilder() ++ if builder is not None: ++ builder.event(event, value) ++ if (prefix, event) == ('entries.item', 'end_map'): ++ yield self._parse_entry(builder.value) ++ builder = None ++ ++ ++class CParserOpts(ct.Structure): ++ _fields_ = [('filename', ct.c_char_p), ++ ('mode', ct.c_int), ++ ('lcore', ct.c_uint16)] ++ ++ ++class CTraceOwner(ct.Structure): ++ _fields_ = [('type', ct.c_uint8), ++ ('id_prefix', ct.c_char)] ++ ++ ++class CTraceObject(ct.Structure): ++ _fields_ = [('type', ct.c_uint8), ++ ('id_prefix', ct.c_char)] ++ ++ ++class CTpointArgument(ct.Structure): ++ _fields_ = [('name', ct.c_char * 14), ++ ('type', ct.c_uint8), ++ ('size', ct.c_uint8)] ++ ++ ++class CTpointRelatedObject(ct.Structure): ++ _fields_ = [('object_type', ct.c_uint8), ++ ('arg_index', ct.c_uint8)] ++ ++ ++class CTracepoint(ct.Structure): ++ _fields_ = [('name', ct.c_char * 24), ++ ('tpoint_id', ct.c_uint16), ++ ('owner_type', ct.c_uint8), ++ ('object_type', ct.c_uint8), ++ ('new_object', ct.c_uint8), ++ ('num_args', ct.c_uint8), ++ ('args', CTpointArgument * TRACE_MAX_ARGS_COUNT), ++ ('related_objects', CTpointRelatedObject * TRACE_MAX_RELATIONS)] ++ ++ ++class CTraceFlags(ct.Structure): ++ _fields_ = [('tsc_rate', ct.c_uint64), ++ ('tpoint_mask', ct.c_uint64 * TRACE_MAX_GROUP_ID), ++ ('owner', CTraceOwner * (UCHAR_MAX + 1)), ++ ('object', CTraceObject * (UCHAR_MAX + 1)), ++ ('tpoint', CTracepoint * TRACE_MAX_TPOINT_ID)] ++ ++ ++class CTraceEntry(ct.Structure): ++ _fields_ = [('tsc', ct.c_uint64), ++ ('tpoint_id', ct.c_uint16), ++ ('poller_id', ct.c_uint16), ++ ('size', ct.c_uint32), ++ ('object_id', ct.c_uint64)] ++ ++ ++class CTraceParserArgument(ct.Union): ++ _fields_ = [('integer', ct.c_uint64), ++ ('pointer', ct.c_void_p), ++ ('string', ct.c_char * (UCHAR_MAX + 1))] ++ ++ ++class CTraceParserEntry(ct.Structure): ++ _fields_ = [('entry', ct.POINTER(CTraceEntry)), ++ ('object_index', ct.c_uint64), ++ ('object_start', ct.c_uint64), ++ ('lcore', ct.c_uint16), ++ ('related_index', ct.c_uint64), ++ ('related_type', ct.c_uint8), ++ ('args', CTraceParserArgument * TRACE_MAX_ARGS_COUNT)] ++ ++ ++class NativeProvider(TraceProvider): ++ """Trace provider based on SPDK's trace library""" ++ def __init__(self, file): ++ self._setup_binding(file.name) ++ self._parse_defs() ++ ++ def __del__(self): ++ if hasattr(self, '_parser'): ++ self._lib.spdk_trace_parser_cleanup(self._parser) ++ ++ def _setup_binding(self, filename): ++ self._lib = ct.CDLL('build/lib/libspdk_trace_parser.so') ++ self._lib.spdk_trace_parser_init.restype = ct.c_void_p ++ self._lib.spdk_trace_parser_init.errcheck = lambda r, *_: ct.c_void_p(r) ++ self._lib.spdk_trace_parser_get_flags.restype = ct.POINTER(CTraceFlags) ++ opts = CParserOpts(filename=bytes(filename, 'ascii'), mode=0, ++ lcore=TRACE_MAX_LCORE) ++ self._parser = self._lib.spdk_trace_parser_init(ct.byref(opts)) ++ if not self._parser: ++ raise ValueError('Failed to construct SPDK trace parser') ++ ++ def _parse_tpoints(self, tpoints): ++ self._tpoints = {} ++ for tpoint in tpoints: ++ if len(tpoint.name) == 0: ++ continue ++ self._tpoints[tpoint.tpoint_id] = Tracepoint( ++ name=str(tpoint.name, 'ascii'), object_type=tpoint.object_type, ++ owner_type=tpoint.owner_type, id=tpoint.tpoint_id, ++ new_object=bool(tpoint.new_object), ++ args=[TracepointArgument(name=str(a.name, 'ascii'), argtype=a.type) ++ for a in tpoint.args[:tpoint.num_args]]) ++ ++ def _parse_defs(self): ++ flags = 
self._lib.spdk_trace_parser_get_flags(self._parser) ++ self._tsc_rate = flags.contents.tsc_rate ++ self._parse_tpoints(flags.contents.tpoint) ++ ++ def conv_objs(arr): ++ return {int(o.type): str(o.id_prefix, 'ascii') for o in arr if o.id_prefix != b'\x00'} ++ self._owners = conv_objs(flags.contents.owner) ++ self._objects = conv_objs(flags.contents.object) ++ ++ def tsc_rate(self): ++ return self._tsc_rate ++ ++ def tpoints(self): ++ return self._tpoints ++ ++ def entries(self): ++ pe = CTraceParserEntry() ++ argconv = {TracepointArgument.TYPE_INT: lambda a: a.integer, ++ TracepointArgument.TYPE_PTR: lambda a: int(a.pointer or 0), ++ TracepointArgument.TYPE_STR: lambda a: str(a.string, 'ascii')} ++ ++ while self._lib.spdk_trace_parser_next_entry(self._parser, ct.byref(pe)): ++ entry = pe.entry.contents ++ lcore = pe.lcore ++ tpoint = self._tpoints[entry.tpoint_id] ++ args = {a.name: argconv[a.argtype](pe.args[i]) for i, a in enumerate(tpoint.args)} ++ ++ if tpoint.object_type != OBJECT_NONE: ++ if pe.object_index != TRACE_INVALID_OBJECT: ++ object_id = '{}{}'.format(self._objects[tpoint.object_type], pe.object_index) ++ ts = entry.tsc - pe.object_start ++ else: ++ object_id, ts = 'n/a', None ++ elif entry.object_id != 0: ++ object_id, ts = '{:x}'.format(entry.object_id), None ++ else: ++ object_id, ts = None, None ++ ++ if tpoint.owner_type != OWNER_NONE: ++ poller_id = '{}{:02}'.format(self._owners[tpoint.owner_type], entry.poller_id) ++ else: ++ poller_id = None ++ ++ if pe.related_type != OBJECT_NONE: ++ related = '{}{}'.format(self._objects[pe.related_type], pe.related_index) ++ else: ++ related = None ++ ++ yield TraceEntry(tpoint=tpoint, lcore=lcore, tsc=entry.tsc, ++ size=entry.size, object_id=object_id, ++ object_ptr=entry.object_id, poller=poller_id, time=ts, ++ args=args, related=related) ++ ++ ++class Trace: ++ """Stores, parses, and prints out SPDK traces""" ++ def __init__(self, file): ++ if file == sys.stdin or magic.from_file(file.name, mime=True) == 'application/json': ++ self._provider = JsonProvider(file) ++ else: ++ self._provider = NativeProvider(file) ++ self._objects = [] ++ self._argfmt = {TracepointArgument.TYPE_PTR: lambda a: f'0x{a:x}'} ++ self.tpoints = self._provider.tpoints() ++ ++ def _annotate_args(self, entry): ++ annotations = {} ++ for obj in self._objects: ++ current = obj.annotate(entry) ++ if current is None: ++ continue ++ annotations.update(current) ++ return annotations ++ ++ def _format_args(self, entry): ++ annotations = self._annotate_args(entry) ++ args = [] ++ for arg, (name, value) in zip(entry.tpoint.args, entry.args.items()): ++ annot = annotations.get(name) ++ if annot is not None: ++ args.append('{}({})'.format(name, ', '.join(f'{n}={v}' for n, v in annot.items()))) ++ else: ++ args.append('{}: {}'.format(name, self._argfmt.get(arg.argtype, ++ lambda a: a)(value))) ++ return args ++ ++ def register_object(self, obj): ++ self._objects.append(obj) ++ ++ def print(self): ++ def get_us(tsc, off): ++ return ((tsc - off) * 10 ** 6) / self._provider.tsc_rate() ++ ++ offset = None ++ for e in self._provider.entries(): ++ offset = e.tsc if offset is None else offset ++ timestamp = get_us(e.tsc, offset) ++ diff = get_us(e.time, 0) if e.time is not None else None ++ args = ', '.join(self._format_args(e)) ++ related = ' (' + e.related + ')' if e.related is not None else '' ++ ++ print(('{:3} {:16.3f} {:3} {:24} {:12}'.format( ++ e.lcore, timestamp, e.poller if e.poller is not None else '', ++ e.tpoint.name, f'size: {e.size}' if e.size else '') + ++ 
(f'id: {e.object_id + related:12} ' if e.object_id is not None else '') + ++ (f'time: {diff:<8.3f} ' if diff is not None else '') + ++ args).rstrip()) ++ ++ ++class SPDKObject: ++ """Describes a specific type of an SPDK objects (e.g. qpair, thread, etc.)""" ++ @dataclass ++ class Lifetime: ++ """Describes a lifetime and properties of a particular SPDK object.""" ++ begin: int ++ end: int ++ ptr: int ++ properties: dict = field(default_factory=dict) ++ ++ def __init__(self, trace: Trace, tpoints: List[str]): ++ self.tpoints = {} ++ for name in tpoints: ++ tpoint = next((t for t in trace.tpoints.values() if t.name == name), None) ++ if tpoint is None: ++ # Some tpoints might be undefined if configured without specific subsystems ++ continue ++ self.tpoints[tpoint.id] = tpoint ++ ++ def _annotate(self, entry: TraceEntry): ++ """Abstract annotation method to be implemented by subclasses.""" ++ raise NotImplementedError() ++ ++ def annotate(self, entry: TraceEntry): ++ """Annotates a tpoint entry and returns a dict indexed by argname with values representing ++ various object properties. For instance, {"qpair": {"qid": 1, "subnqn": "nqn"}} could be ++ returned to annotate an argument called "qpair" with two items: "qid" and "subnqn". ++ """ ++ if entry.tpoint.id not in self.tpoints: ++ return None ++ return self._annotate(entry) ++ ++ ++class QPair(SPDKObject): ++ def __init__(self, trace: Trace, dtrace: DTrace): ++ super().__init__(trace, tpoints=[ ++ 'RDMA_REQ_NEW', ++ 'RDMA_REQ_NEED_BUFFER', ++ 'RDMA_REQ_TX_PENDING_C2H', ++ 'RDMA_REQ_TX_PENDING_H2C', ++ 'RDMA_REQ_TX_H2C', ++ 'RDMA_REQ_RDY_TO_EXECUTE', ++ 'RDMA_REQ_EXECUTING', ++ 'RDMA_REQ_EXECUTED', ++ 'RDMA_REQ_RDY_TO_COMPL', ++ 'RDMA_REQ_COMPLETING_C2H', ++ 'RDMA_REQ_COMPLETING', ++ 'RDMA_REQ_COMPLETED', ++ 'TCP_REQ_NEW', ++ 'TCP_REQ_NEED_BUFFER', ++ 'TCP_REQ_TX_H_TO_C', ++ 'TCP_REQ_RDY_TO_EXECUTE', ++ 'TCP_REQ_EXECUTING', ++ 'TCP_REQ_EXECUTED', ++ 'TCP_REQ_RDY_TO_COMPLETE', ++ 'TCP_REQ_TRANSFER_C2H', ++ 'TCP_REQ_COMPLETED', ++ 'TCP_WRITE_START', ++ 'TCP_WRITE_DONE', ++ 'TCP_READ_DONE', ++ 'TCP_REQ_AWAIT_R2T_ACK']) ++ self._objects = [] ++ self._find_objects(dtrace.entries) ++ ++ def _find_objects(self, dprobes): ++ def probe_match(probe, other): ++ return probe.args['qpair'] == other.args['qpair'] ++ ++ for i, dprobe in enumerate(dprobes): ++ if dprobe.name != 'nvmf_poll_group_add_qpair': ++ continue ++ # We've found a new qpair, now find the probe indicating its destruction ++ last_idx, last = next((((i + j + 1), d) for j, d in enumerate(islice(dprobes, i, None)) ++ if d.name == 'nvmf_poll_group_remove_qpair' and ++ probe_match(d, dprobe)), (None, None)) ++ obj = SPDKObject.Lifetime(begin=dprobe.args['tsc'], ++ end=last.args['tsc'] if last is not None else TSC_MAX, ++ ptr=dprobe.args['qpair'], ++ properties={'ptr': hex(dprobe.args['qpair']), ++ 'thread': dprobe.args['thread']}) ++ for other in filter(lambda p: probe_match(p, dprobe), dprobes[i:last_idx]): ++ if other.name == 'nvmf_ctrlr_add_qpair': ++ for prop in ['qid', 'subnqn', 'hostnqn']: ++ obj.properties[prop] = other.args[prop] ++ self._objects.append(obj) ++ ++ def _annotate(self, entry): ++ qpair = entry.args.get('qpair') ++ if qpair is None: ++ return None ++ for obj in self._objects: ++ if obj.ptr == qpair and obj.begin <= entry.tsc <= obj.end: ++ return {'qpair': obj.properties} ++ return None ++ ++ ++def build_dtrace(file=None): ++ return DTrace([ ++ DTraceProbe( ++ name='nvmf_poll_group_add_qpair', ++ args=[DTraceArgument(name='tsc', pos=0, type=int), ++ 
DTraceArgument(name='qpair', pos=1, type=int), ++ DTraceArgument(name='thread', pos=2, type=int)]), ++ DTraceProbe( ++ name='nvmf_poll_group_remove_qpair', ++ args=[DTraceArgument(name='tsc', pos=0, type=int), ++ DTraceArgument(name='qpair', pos=1, type=int), ++ DTraceArgument(name='thread', pos=2, type=int)]), ++ DTraceProbe( ++ name='nvmf_ctrlr_add_qpair', ++ args=[DTraceArgument(name='tsc', pos=0, type=int), ++ DTraceArgument(name='qpair', pos=1, type=int), ++ DTraceArgument(name='qid', pos=2, type=int), ++ DTraceArgument(name='subnqn', pos=3, type=str), ++ DTraceArgument(name='hostnqn', pos=4, type=str)])], file) ++ ++ ++def print_trace(trace_file, dtrace_file): ++ dtrace = build_dtrace(dtrace_file) ++ trace = Trace(trace_file) ++ trace.register_object(QPair(trace, dtrace)) ++ trace.print() ++ ++ ++def main(argv): ++ parser = ArgumentParser(description='SPDK trace annotation script') ++ parser.add_argument('-i', '--input', ++ help='Trace file to annotate (either JSON generated by spdk_trace or ' + ++ 'raw binary produced by the SPDK application itself)') ++ parser.add_argument('-g', '--generate', help='Generate bpftrace script', action='store_true') ++ parser.add_argument('-r', '--record', help='Record BPF traces on PID', metavar='PID', type=int) ++ parser.add_argument('-b', '--bpftrace', help='BPF trace script to use for annotations') ++ args = parser.parse_args(argv) ++ ++ if args.generate: ++ print(build_dtrace().generate()) ++ elif args.record: ++ build_dtrace().record(args.record) ++ else: ++ print_trace(open(args.input, 'r') if args.input is not None else sys.stdin, ++ open(args.bpftrace) if args.bpftrace is not None else None) ++ ++ ++if __name__ == '__main__': ++ # In order for the changes to LD_LIBRARY_PATH to be visible to the loader, ++ # they need to be applied before starting a process, so we need to ++ # re-execute the script after updating it. ++ if os.environ.get('SPDK_BPF_TRACE_PY') is None: ++ rootdir = f'{os.path.dirname(__file__)}/../..' ++ os.environ['LD_LIBRARY_PATH'] = ':'.join([os.environ.get('LD_LIBRARY_PATH', ''), ++ f'{rootdir}/build/lib']) ++ os.environ['SPDK_BPF_TRACE_PY'] = '1' ++ os.execv(sys.argv[0], sys.argv) ++ else: ++ try: ++ main(sys.argv[1:]) ++ except (KeyboardInterrupt, BrokenPipeError): ++ pass +diff --git a/scripts/bpftrace.sh b/scripts/bpftrace.sh +index 123bc0e..e5aba5e 100644 +--- a/scripts/bpftrace.sh ++++ b/scripts/bpftrace.sh +@@ -1,24 +1,24 @@ +-#!/usr/bin/env bash +-# SPDX-License-Identifier: BSD-3-Clause +-# Copyright (C) 2021 Intel Corporation +-# All rights reserved. +-# +- +-set -e +- +-if [ $# -lt 2 ]; then +- echo "usage: $0