diff --git a/docs/en/server/_toc.yaml b/docs/en/server/_toc.yaml index e8172227bbeea80653196cac9895e0b7bf2bdb03..93b1c4ba9264248985e93b209432a15b56bea1ce 100644 --- a/docs/en/server/_toc.yaml +++ b/docs/en/server/_toc.yaml @@ -40,10 +40,18 @@ sections: - href: ./security/shangmi/_toc.yaml - label: Memory and Storage sections: - - href: ./memory_storage/lvm/_toc.yaml - - href: ./memory_storage/etmem/_toc.yaml - - href: ./memory_storage/gmem/_toc.yaml - - href: ./memory_storage/hsak/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/en/lvm/_toc.yaml + path: ./lvm + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/en/etmem/_toc.yaml + path: ./etmem + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/en/gmem/_toc.yaml + path: ./gmem + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/en/hsak/_toc.yaml + path: ./hsak - label: Network sections: - href: ./network/network_config/_toc.yaml @@ -58,7 +66,9 @@ sections: - href: ./performance/tuning_framework/oeaware/_toc.yaml - label: CPU Optimization sections: - - href: ./performance/cpu_optimization/sysboost/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/Computing-docs/blob/master/docs/en/sysboost/_toc.yaml + path: ./sysboost - href: ./performance/cpu_optimization/kae/_toc.yaml - label: System Optimization sections: @@ -72,5 +82,9 @@ sections: - href: ./high_availability/ha/_toc.yaml - label: Diversified Computing sections: - - href: ./diversified_computing/dpu_offload/_toc.yaml - - href: ./diversified_computing/dpu_os/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/dpu-utilities/blob/master/docs/en/dpu_offload/_toc.yaml + path: ./dpu_offload + - href: + upstream: https://gitee.com/openeuler/dpu-utilities/blob/master/docs/en/dpu_os/_toc.yaml + path: ./dpu_os \ No newline at end of file diff --git a/docs/en/server/diversified_computing/dpu_offload/_toc.yaml b/docs/en/server/diversified_computing/dpu_offload/_toc.yaml deleted file mode 100644 index f4b81c30d8f99346b7d7928f4a0b78853ade6b52..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/_toc.yaml +++ /dev/null @@ -1,14 +0,0 @@ -label: libvirt Direct Connection Aggregation Environment Establishment -isManual: true -href: ./libvirt_direct_connection_aggregation_environment_establishment.md -description: DPU offloading feature for container management and its installation and deployment method on openEuler -sections: - - label: qtfs Shared File System - href: ./qtfs_architecture_and_usage.md - - label: Imperceptible DPU Offload User Guide - href: ./overview.md - sections: - - label: Imperceptible Container Management Plane Offload - href: ./imperceptible_container_management_plane_offload.md - - label: Imperceptible Container Management Plane Offload Deployment Guide - href: ./offload_deployment_guide.md diff --git a/docs/en/server/diversified_computing/dpu_offload/config/client.json b/docs/en/server/diversified_computing/dpu_offload/config/client.json deleted file mode 100644 index 4aedf4c846914a6bc34dff1988c7794ddb1fa521..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/client.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "Protocol": "tcp", - "Ipaddr" : "192.168.10.11", - "Port" : "7777" -} diff --git a/docs/en/server/diversified_computing/dpu_offload/config/prepare.sh b/docs/en/server/diversified_computing/dpu_offload/config/prepare.sh deleted file mode 100644 
index ccfe9402051a02451644345b39ef6aa2657bfe89..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/prepare.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash - -mkdir -p /another_rootfs/var/run/docker/containerd -iptables -t nat -N DOCKER - -echo "---------insmod qtfs ko----------" -# TEST_MODE: IP -insmod ${YOUR_PATH}/qtfs.ko qtfs_server_ip=${YOUR_SERVER_IP} qtfs_log_level=INFO # Enter the .ko file path and IP address. -nohup ${YOUR_PATH}/udsproxyd 1 ${YOUR_CLIENT_IP} 12121 ${YOUR_SERVER_IP} 12121 2>&1 & - -# TEST_MODE: vsock -# insmod ${YOUR_PATH}/qtfs.ko qtfs_server_vsock_cid=${YOUR_SERVER_VSOCK_CID} qtfs_log_level=INFO # Enter the .ko file path and IP address. -# nohup ${YOUR_PATH}/udsproxyd 1 ${YOUR_CLIENT_VSOCK_CID} 12121 ${YOUR_SERVER_VSOCK_CID} 12121 2>&1 & - -qtcfg -w udsconnect -x /var/run/rexec -qtcfg -w udsconnect -x /run/rexec - -mkdir /another_rootfs/local_proc/ -mount -t proc proc /another_rootfs/local_proc/ -mount --bind /var/run/ /another_rootfs/var/run/ -mount --bind /var/lib/ /another_rootfs/var/lib/ -mount --bind /etc /another_rootfs/etc -mount -t devtmpfs devtmpfs /another_rootfs/dev/ -mount -t sysfs sysfs /another_rootfs/sys -mkdir -p /another_rootfs/sys/fs/cgroup -mount -t tmpfs tmpfs /another_rootfs/sys/fs/cgroup -list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset" -for i in $list -do - echo $i - mkdir -p /another_rootfs/sys/fs/cgroup/$i - mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /another_rootfs/sys/fs/cgroup/$i -done - -mount -t qtfs -o proc /proc /another_rootfs/proc -echo "proc" -mount -t qtfs /sys /another_rootfs/sys -echo "cgroup" - -mkdir -p /another_rootfs/var/lib/docker/containers -mkdir -p /another_rootfs/var/lib/docker/containerd -mkdir -p /another_rootfs/var/lib/docker/overlay2 -mkdir -p /another_rootfs/var/lib/docker/image -mkdir -p /another_rootfs/var/lib/docker/tmp -mount -t qtfs /var/lib/docker/containers /another_rootfs/var/lib/docker/containers -mount -t qtfs /var/lib/docker/containerd /another_rootfs/var/lib/docker/containerd -mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2 -mount -t qtfs /var/lib/docker/image /another_rootfs/var/lib/docker/image -mount -t qtfs /var/lib/docker/tmp /another_rootfs/var/lib/docker/tmp -mkdir -p /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ -mount -t qtfs /run/containerd/io.containerd.runtime.v1.linux/ /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ -mkdir -p /another_rootfs/var/run/docker/containerd -mount -t qtfs /run/docker/containerd /another_rootfs/run/docker/containerd -mkdir -p /another_rootfs/var/lib/containerd/io.containerd.runtime.v1.linux -mount -t qtfs /var/lib/containerd/io.containerd.runtime.v1.linux /another_rootfs/var/lib/containerd/io.containerd.runtime.v1.linux - -qtcfg -w udsconnect -x /another_rootfs/var/run/rexec -qtcfg -w udsconnect -x /another_rootfs/run/rexec diff --git a/docs/en/server/diversified_computing/dpu_offload/config/rexec.service b/docs/en/server/diversified_computing/dpu_offload/config/rexec.service deleted file mode 100644 index ee9e5e4895adb5c010e3f8d4db6652cfaed3d355..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/rexec.service +++ /dev/null @@ -1,13 +0,0 @@ -[Unit] -Description=Rexec_server Service -After=network.target - -[Service] -Type=simple -Environment=CMD_NET_ADDR=tcp://0.0.0.0:7777 -ExecStart=/usr/bin/rexec_server 
-ExecReload=/bin/kill -s HUP $MAINPID -KillMode=process - -[Install] -WantedBy=multi-user.target diff --git a/docs/en/server/diversified_computing/dpu_offload/config/server.json b/docs/en/server/diversified_computing/dpu_offload/config/server.json deleted file mode 100644 index 1d4a7bbbc1cbf086e18b147f3f27e6a15c2e322e..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/server.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "Protocol": "tcp", - "Ipaddr" : "0.0.0.0", - "Port" : "7777" -} diff --git a/docs/en/server/diversified_computing/dpu_offload/config/server_start.sh b/docs/en/server/diversified_computing/dpu_offload/config/server_start.sh deleted file mode 100644 index fd3655159ddb0fc6069dfa3ab802f4c9f8520c13..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/server_start.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -modprobe overlay -mkdir /var/lib/docker/containers -mkdir -p /var/lib/docker/containers -mkdir -p /var/lib/docker/containerd -mkdir -p /var/lib/docker/overlay2 -mkdir -p /var/lib/docker/tmp -mkdir -p /var/lib/docker/image -mkdir -p /var/run/docker/containerd -mkdir -p /run/containerd/io.containerd.runtime.v1.linux/ -mkdir -p /var/run/docker/netns -mkdir -p /var/lib/containerd/io.containerd.runtime.v1.linux/ -mkdir -p /run/user/0 -touch /var/run/docker/netns/default -# this should be done once -mount --bind /proc/1/ns/net /var/run/docker/netns/default - -function TaskClean() -{ - echo "Now do task clean..." - pkill engine - rmmod qtfs_server - echo "TaskClean done" -} - -trap "TaskClean exit" SIGINT - -mkdir -p /var/run/docker/containerd -mkdir -p /run/containerd/io.containerd.runtime.v1.linux/ - -# TEST_MODE: IP -insmod ${YOUR_PATH}/qtfs_server.ko qtfs_server_ip=${YOUR_SERVER_IP} qtfs_log_level=ERROR -nohup ${YOUR_PATH}/engine 16 1 ${YOUR_SERVER_IP} 12121 ${YOUR_CLIENT_IP} 12121 2>&1 & - -# TEST_MODE: vsock -# insmod ${YOUR_PATH}/qtfs_server.ko qtfs_server_vsock_cid=${YOUR_SERVER_VSOCK_CID} qtfs_log_level=ERROR -# nohup ${YOUR_PATH}/engine 16 1 ${YOUR_SERVER_VSOCK_CID} 12121 ${YOUR_CLIENT_VSOCK_CID} 12121 2>&1 & - -sleep 2 - -qtcfg -w udsconnect -x /var/run/rexec -qtcfg -w udsconnect -x /run/rexec -qtcfg -w udsconnect -x /var/run/containerd diff --git a/docs/en/server/diversified_computing/dpu_offload/config/whitelist b/docs/en/server/diversified_computing/dpu_offload/config/whitelist deleted file mode 100644 index b0be45f86276e89fa9fd0827ba06bbb27d158f62..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/config/whitelist +++ /dev/null @@ -1,8 +0,0 @@ -kill -taskset -qemu-kvm -rexec_shim -/usr/bin/taskset -/usr/bin/kill -/usr/bin/qemu-kvm -/usr/bin/rexec_shim diff --git a/docs/en/server/diversified_computing/dpu_offload/figures/arch.png b/docs/en/server/diversified_computing/dpu_offload/figures/arch.png deleted file mode 100644 index b6a7836fd6fab75009e781ac1ed96c73c352f75b..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_offload/figures/arch.png and /dev/null differ diff --git a/docs/en/server/diversified_computing/dpu_offload/figures/offload-arch.png b/docs/en/server/diversified_computing/dpu_offload/figures/offload-arch.png deleted file mode 100644 index b0f7b8587c47838880bcca5d6694f66a16ec0aaf..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_offload/figures/offload-arch.png and /dev/null differ diff --git 
a/docs/en/server/diversified_computing/dpu_offload/figures/qtfs-arch.png b/docs/en/server/diversified_computing/dpu_offload/figures/qtfs-arch.png deleted file mode 100644 index 749b007287d8503badcea52036b7a71b06092bc2..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_offload/figures/qtfs-arch.png and /dev/null differ diff --git a/docs/en/server/diversified_computing/dpu_offload/imperceptible_container_management_plane_offload.md b/docs/en/server/diversified_computing/dpu_offload/imperceptible_container_management_plane_offload.md deleted file mode 100644 index b49dd3e781f859574dc65edf1f31a9673c9142ed..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/imperceptible_container_management_plane_offload.md +++ /dev/null @@ -1,35 +0,0 @@ -# Imperceptible Container Management Plane Offload - -## Overview - -Moore's law ceases to apply in data center and cloud scenarios. The CPU computing power growth rate of general processing units is slowing down, while the network I/O speed and performance keep increasing. As a result, the processing capability of current general-purpose processors cannot meet the I/O processing requirements of the network and drives. In traditional data centers, more and more general-purpose CPU computing power is occupied by I/O and management planes. This part of resource loss is called data center tax. According to AWS statistics, the data center tax may account for more than 30% of the computing power of the data center. - -The data processing unit (DPU) is developed to release the computing resources from the host CPU. The management plane, network, storage, and security capabilities are offloaded to DPUs for acceleration, reducing costs and improving efficiency. Mainstream cloud vendors, such as AWS, Alibaba Cloud, and Huawei Cloud, use self-developed processors to offload the management plane and related data plane, achieving 100% utilization of data center computing resources. - -The management plane processes can be offloaded to the DPU by splitting the component source code. The source code is split into two parts that run independently on the host and DPU based on the function logic. In this way, the component is offloaded. However, this method has the following problems: - -1. The software compatibility of the component is affected. You need to maintain the component and related patches in subsequent version upgrades, which increases the maintenance workload. -2. The offload cannot be inherited by other components. You need to split each component based on code logic analysis. - -To solve these problems, openEuler introduces imperceptible DPU offload. The abstraction layer provided by the OS shields the cross-host access differences between the host and DPU, and enables service processes to be offloaded to the DPU with virtually zero modification. This part of work at the common layer of the OS and is irrelevant to upper-layer services. Other services can also inherit the offload to DPU. - -## Architecture - -### Imperceptible Container Management Plane DPU Offload Architecture - -**Figure 1** Imperceptible Container Management Plane DPU Offload Architecture - -![offload-arch](./figures/offload-arch.png) - -As shown in Figure 1, after the container management plane is offloaded, management processes such as dockerd and kubelet run on the DPU side, and container processes run on the host. The interaction between processes is ensured by the system layer. 
- -* Communication layer: DPUs and hosts can communicate with each other through PCIe interfaces or networks. A communication interface layer is provided based on underlying physical connections to provide communication interfaces for upper-layer services. - -* qtfs kernel shared file system: The container management plane components kubelet and dockerd interact with container processes through file systems. Management plane tools need to prepare data plane paths to rootfs and volume for container processes. In addition, the proc and cgroup file systems need to be used to control and monitor the resources and status of container processes. For details about qtfs, see [qtfs Shared File System Introduction and Usage](./qtfs-architecture-and-usage.md). - -* User-mode offload environment: You need to use qtfs to prepare the runtime environment for the offloaded management plane, and remotely mount the container management and runtime directories of the host to the DPU. System management file systems such as proc, sys, and cgroup need to be mounted. To prevent damage to the native system functions of the DPU, the preceding mounting operations are performed in the chroot environment. In addition, the management plane (running on the DPU) and container processes (running on the host) have invoking relationships. The rexec remote binary execution tool needs to be used to provide corresponding functions. - -For details about how to offload container management plane, see the [Deployment Guide](./offload-deployment-guide.md). - -> ![](public_sys-resources/icon-note.gif) **NOTE**: -> In this user guide, modifications are performed to the container management plane components and the rexec tool of a specific version. You can modify other versions based on the actual execution environment. The patch provided in this document is for verification only and is not for commercial use. diff --git a/docs/en/server/diversified_computing/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md b/docs/en/server/diversified_computing/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md deleted file mode 100644 index 90280c4bda996c18b21401f8717b14b865ccfeaa..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/libvirt_direct_connection_aggregation_environment_establishment.md +++ /dev/null @@ -1,360 +0,0 @@ -# Libvirt Direct Connection Aggregation Environment Establishment - -## 1 Hardware Preparation - -### Test Mode - -Prepare two physical machines (VMs have not been tested) that can communicate with each other. - -One physical machine functions as the DPU, and the other functions as the host. In this document, DPU and HOST refer to the two physical machines. - -> [!NOTE]NOTE -> In the test mode, network ports are exposed without connection authentication, which is risky and should be used only for internal tests and verification. Do not use this mode in the production environment. - -### vsock mode - -The DPU and HOST are required. The DPU must be able to provide vsock communication through virtio. - -This document describes only the test mode usage. - -## 2 libvirt offload architecture - -![arch](./figures/arch.png) - -## 3 Environment Setup - -### 3.1 qtfs File System Deployment - -For details, visit . - -To establish a qtfs connection, you need to disable the firewall. 
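
For a test setup, a minimal sketch of this step is shown below (it assumes firewalld is the active firewall service on both machines; stop it only in isolated test environments, in line with the test mode note above):

```bash
# Run on both the host and the DPU before loading the qtfs modules.
systemctl stop firewalld      # stop the firewall for the current boot
systemctl disable firewalld   # optional: keep it disabled across reboots in the test environment
```
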
- -### 3.2 Deploying the udsproxyd Service - -#### 3.2.1 Introduction - -udsproxyd is a cross-host Unix domain socket (UDS) proxy service, which needs to be deployed on both the host and DPU. The udsproxyd components on the host and dpu are peers. They implement seamless UDS communication between the host and DPU, which means that if two processes can communicate with each other through UDSs on the same host, they can do the same between the host and DPU. The code of the processes does not need to be modified, only that the client process needs to run with the **LD_PRELOAD=libudsproxy.so** environment variable. As a cross-host Unix socket service, udsproxyd can be used by running with `LD_PRELOAD=libudsproxy.so`. With the support of qtfs, udsproxyd can also be used transparently. You need to configure the allowlist in advance. The specific operations are described later. - -#### 3.2.2 Deploying udsproxyd - -Build udsproxyd in the dpu-utilities project: - -```bash -cd qtfs/ipc -make -j UDS_TEST_MODE=1 && make install -``` - -The engine service on the qtfs server has incorporated the udsproxyd feature. You do not need to manually start udsproxyd if the qtfs server is deployed. However, you need to start udsproxyd on the client by running the following command: - -```bash -nohup /usr/bin/udsproxyd 2>&1 & -``` - -Parameters: - -```bash -thread num: number of threads. Currently, only one thread is supported. -addr: IP address of the host. -port: Port used on the host. -peer addr: IP address of the udsproxyd peer. -peer port: port used on the udsproxyd peer. -``` - -Example: - -```bash -nohup /usr/bin/udsproxyd 1 192.168.10.10 12121 192.168.10.11 12121 2>&1 & -``` - -If the qtfs engine service is not started, you can start udsproxyd on the server to test udsproxyd separately. Run the following command: - -```bash -nohup /usr/bin/udsproxyd 1 192.168.10.11 12121 192.168.10.10 12121 2>&1 & -``` - -#### 3.2.3 Using udsproxyd - -##### 3.2.3.1 Using udsproxyd Independently - -When starting the client process of the Unix socket application that uses the UDS service, add the **LD_PRELOAD=libudsproxy.so** environment variable to intercept the **connect** API of glibc for UDS interconnection. In the libvirt offload scenario, you can copy **libudsproxy.so**, which will be used by the libvirtd service, to the **/usr/lib64** directory in the chroot directory of libvirt. - -##### 3.2.3.2 Using the udsproxyd Service Transparently - -Configure the UDS service allowlist for qtfs. The allowlist is the sock file address bound to the Unix socket server. For example, the files of the Unix socket server created by the libvirt VM are in the **/var/lib/libvirt** directory. In this case, add the directory path to the allowlist in either of the following ways: - -* Load the allowlist by using the `qtcfg` utility. First compile the utility in **qtfs/qtinfo**. - -Run the following command on the qtfs client: - -```bash -make role=client -make install -``` - -Run the following command on the qtfs server: - -```bash -make role=server -make install -``` - -After `qtcfg` is installed automatically, run `qtcfg` to configure the allowlist. Assume that **/var/lib/libvirt** needs to be added to the allowlist: - -```bash -qtcfg -x /var/lib/libvirt/ -``` - -Query the allowlist: - -```bash -qtcfg -z -``` - -Delete an allowlist entry: - -```bash -qtcfg -y 0 -``` - -The parameter is the index number listed when you query the allowlist. - -* Add an allowlist entry through the configuration file. 
The configuration file needs to be set before the qtfs or qtfs_server kernel module is loaded. The allowlist is loaded when the kernel modules are initialized. - -> [!NOTE]NOTE -> The allowlist prevents irrelevant Unix sockets from establishing remote connections, causing errors or wasting resources. You are advised to set the allowlist as precisely as possible. For example, in this document, **/var/lib/libvirt** is set in the libvirt scenario. It would be risky to directly add **/var/lib**, **/var**, or the root directory. - -### 3.3 rexec Service Deployment - -#### 3.3.1 Introduction - -rexec is a remote execution component developed using the C language. It consists of the rexec client and rexec server. The server is a daemon process, and the client is a binary file. After being started, the client establishes a UDS connection with the server using the udsproxyd service, and the server daemon process starts a specified program on the server machine. During libvirt virtualization offload, libvirtd is offloaded to the DPU. When libvirtd needs to start the QEMU process on the HOST, the rexec client is invoked to remotely start the process. - -#### 3.3.2 Deploying rexec - -##### 3.3.2.1 Configuring the Environment Variables and Allowlist - -Configure the rexec server allowlist on the host. Put the **whitelist** file in the **/etc/rexec** directory, and change the file permission to read-only. - -```bash -chmod 400 /etc/rexec/whitelist -``` - -In the test environment, the allowlist is not mandatory. You can disable the allowlist by deleting the **whitelist** file and restarting the rexec_server process. - -After downloading the dpu-utilities code, go to the **qtfs/rexec** directory and run `make && make install` to install all binary files required by rexec (**rexec** and **rexec_server**) to the **/usr/bin** directory. - -Before starting the rexec_server service on the server, check whether the **/var/run/rexec** directory exists. If not, create it. - -```bash -mkdir /var/run/rexec -``` - -##### 3.3.2.2 Starting the Service - -You can start the rexec_server service on the server in either of the following ways. - -* Method 1: - - Configure rexec as a systemd service. - - Add the **[rexec.service](./config/rexec.service)** file to **/usr/lib/systemd/system**. - - Then, use `systemctl` to manage the rexec service. - - Start the service for the first time: - - ```bash - systemctl daemon-reload - - systemctl enable --now rexec - ``` - - Restart the service: - - ```bash - systemctl stop rexec - - systemctl start rexec - ``` - -* Method 2: - - Manually start the service in the background. - - ```bash - nohup /usr/bin/rexec_server 2>&1 & - ``` - -### 3.4 libvirt Service Deployment - -#### 3.4.1 Deploying on the HOST - -Install the VM runtime and libvirt. (libvirt is installed to create related directories.) - -```bash -yum install -y qemu libvirt edk2-aarch64 # (required for starting VMs in the Arm environment) -``` - -Put the VM image on the HOST. The VM image will be mounted to the client through qtfs and shared with libvirt. - -#### 3.4.2 Deploying on the DPU - -##### 3.4.2.1 Creating the Chroot Environment - -(a) Download the QCOW image from the openEuler official website, for example, openEuler 23.09: . - -(b) Mount the QCOW2 image. 
- -```bash -cd /root/ - -mkdir p2 new_root_origin new_root - -modprobe nbd maxport=8 - -qemu-nbd -c /dev/nbd0 xxx.qcow2 - -mount /dev/nbd0p2 /root/p2 - -cp -rf /root/p2/* /root/new_root_origin/ - -umount /root/p2 - -qemu-nbd -d /dev/nbd0 -``` - -(c) Now, the root directory of the image is decompressed in **new_root_origin**. Bind mount **new_root** to **new_root_origin** as the mount point for chroot. - -```bash -mount --bind /root/new_root_origin /root/new_root -``` - -##### 3.4.2.2 Installing libvirt - -Compile the source code with a patch. - -(a) Go to the chroot environment and install the compilation environment and common tools. - -```bash -yum groupinstall "Development tools" -y -yum install -y vim meson qemu qemu-img strace edk2-aarch64 tar -``` - -**edk2-aarch64** is required for starting VMs in the Arm environment. - -(b) Install the dependency packages required for libvirt compilation. - -```bash - yum install -y rpcgen python3-docutils glib2-devel gnutls-devel libxml2-devel libpciaccess-devel libtirpc-devel yajl-devel systemd-devel dmidecode glusterfs-api numactl -``` - -(c) Download the libvirt-x.x.x source code package . - -(d) Obtain the libvirt patch: - -. - -(e) Decompress the source code package to a directory in the chroot environment, for example, **/home**, and apply the patch. - -(f) Go to the **libvirt-x.x.x** directory and run the following command: - -```bash -meson build --prefix=/usr -Ddriver_remote=enabled -Ddriver_network=enabled -Ddriver_qemu=enabled -Dtests=disabled -Ddocs=enabled -Ddriver_libxl=disabled -Ddriver_esx=disabled -Dsecdriver_selinux=disabled -Dselinux=disabled -``` - -(g) Complete the installation. - -```bash -ninja -C build install -``` - -##### 3.4.2.3 Starting the libvirtd Service - -To use libvirt direct connection aggregation, you need to start the libvirtd service in the chroot environment, which requires the libvirtd service outside the chroot environment to be stopped. - -(a) Put the [VM jumper script](./scripts/qemu-kvm) in **/usr/bin** and **/usr/libexec** in the chroot environment to replace the **qemu-kvm** binary file. The jumper script will call rexec to start a remote VM. -> [!NOTE]NOTE -> In the XML file of virsh, set **\** under **\** to **qemu-kvm**. If you set **\** to another value, change it to **qemu-kvm** or replace the binary file specified by **\** with the jumper script. The content of the jumper script also needs to be modified accordingly. - -(b) Copy the **libudsproxy.so** file generated during udsproxyd compilation to the **/usr/lib64** directory in the chroot directory. If the udsproxyd service is used by configuring the UDS allowlist of qtfs, you do not need to copy the **libudsproxy.so** file. - -(c) Save the **rexec** binary file generated during rexec compilation to the **/usr/bin** directory of the chroot environment. - -(d) To configure the chroot mounting environment, you need to mount some directories. Use the following scripts: - -* [virt_start.sh](./scripts/virt_start.sh) is the configuration script. In the script, you need to manually change the **qtfs.ko** path to the path of the compiled **.ko** file and set the correct HOST IP address. -* [virt_umount.sh](./scripts/virt_umount.sh) is the configuration revert script. - -(e) The mount directories in the script are based on the examples in this document. You can modify the paths in the script as required. - -(f) After the chroot environment is configured, enter the chroot environment and manually start libvirtd. 
- -If qtfs is not configured to use the udsproxyd allowlist, run the following commands: - -```bash -LD_PRELOAD=/usr/lib64/libudsproxy.so virtlogd -d -LD_PRELOAD=/usr/lib64/libudsproxy.so libvirtd -d -``` - -If qtfs is configured to use the udsproxyd allowlist, the LD_PRELOAD prefix is not required: - -```bash -virtlogd -d -libvirtd -d -``` - -To check whether the allowlist is configured, run the following command in another terminal that is not in the chroot environment: - -```bash -qtcfg -z -``` - -Check whether the allowlist contains **/var/lib/libvirt**. - -### 3.5 VM Startup - -After the service is deployed, you can manage the VM life cycle from the DPU. - -#### 3.5.1 Defining the VM - -(a) Place the VM boot image in a directory on the HOST, for example: - -```bash -/home/VMs/Domain_name -``` - -(b) Use qtfs to mount the directory to the DPU. - -```bash -mount -t qtfs /home/VMs /home/VMs -``` - -(c) In the XML file, **/home/VMs/Domain_name** is used as the boot image. In this way, the same image file is presented to the DPU and HOST (**Domain_name** is the VM **domain**). - -(d) Check whether **\** in the XML file points to the jumper script. - -(e) Define the VM. - -```bash -virsh define xxx.xml -``` - -#### 3.5.2 Starting the VM - -```bash -virsh start domain -``` - -# 4 Environment Reset - -Some libvirt directories are shared between the DPU and the HOST. Therefore, you need to unmount these directories before uninstalling the environment. Generally, stop the libvirtd and virtlogd processes and run the **virt_umount.sh** script. If a VM is running on the HOST, stop the VM before unmounting the directories. - -## 5 Common Errors - -1. libvirt compilation failure: Check whether the dependency packages are installed. If an external directory or HOST directory is mounted to the chroot environment, the compilation may fail. In this case, unmount the directory first. - -2. qtfs mounting failure: The engine process on the server is not started or the firewall is not disabled. As a result, the qtfs connection fails. - -3. VM definition failure: Check whether the emulator in the XML file points to the jumper script, whether the VM image has been mounted to the DPU through qtfs, and whether the path is the same as that on the HOST. - -4. VM startup failure: Check whether the libvirtd and virtlogd services are started, whether the rexec service is started, whether the jumper process is started, and whether an error is reported when qemu-kvm is started. diff --git a/docs/en/server/diversified_computing/dpu_offload/offload_deployment_guide.md b/docs/en/server/diversified_computing/dpu_offload/offload_deployment_guide.md deleted file mode 100644 index 5bc0c03ab107dc7d145e99803602be2a054d0d34..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/offload_deployment_guide.md +++ /dev/null @@ -1,167 +0,0 @@ -# Imperceptible Container Management Plane Offload Deployment Guide - -> ![](./public_sys-resources/icon-note.gif)**NOTE**: -> -> In this user guide, modifications are performed to the container management plane components and the rexec tool of a specific version. You can modify other versions based on the actual execution environment. The patch provided in this document is for verification only and is not for commercial use. -> ![](./public_sys-resources/icon-note.gif)**NOTE**: -> -> The communication between shared file systems is implemented through the network. 
You can perform a simulated offload using two physical machines or VMs connected through the network. -> -> Before the verification, you are advised to set up a Kubernetes cluster and container running environment that can be used properly and offload the management plane process of a single node. You can use a physical machine or VM that is connected to the network as an emulated DPU. - -## Introduction - -Container management plane, that is, management tools of containers such as Kubernetes, dockerd, containerd, and isulad. Container management plane offload is to offload the container management plane from the host where the container is located to another host, that is, the DPU, a set of hardware that has an independent running environment. - -By mounting directories related to container running on the host to the DPU through qtfs, the container management plane tool running on the DPU can access these directories and prepare the running environment for the containers running on the host. To remotely mount the special file systems such as proc and sys, a dedicated rootfs is created as the running environment of Kubernetes and dockerd (referred to as **/another_rootfs**). - -In addition, rexec is used to start and delete containers so that the container management plane and containers can run on two different hosts for remote container management. - -## Related Component Patches - -### rexec - -rexec is a remote execution tool written in the Go language based on the [rexec](https://github.com/docker/libchan/tree/master/examples/rexec) example tool of Docker/libchan. rexec is used to remotely invoke binary files. For ease of use, capabilities such as transferring environment variables and monitoring the exit of original processes are added to rexec. - -To use the rexec tool, run the `CMD_NET_ADDR=tcp://0.0.0.0: rexec_server` command on the server to start the rexec service process, and then run the `CMD_NET_ADDR=tcp://: rexec [command]` on the client`. This instructs rexec_server to execute the command. - -### dockerd - -The changes to dockerd are based on version 18.09. - -In containerd, the part that invokes libnetwork-setkey through hook is commented out. This does not affect container startup. In addition, to ensure the normal use of `docker load`, an error in the `mount` function in **mounter_linux.go** is commented out. - -In the running environment of the container management plane, **/proc** is mounted to the proc file system on the server, and the local proc file system is mounted to **/local_proc**. In dockerd and containerd, **/proc** is changed to **/local_proc** for accessing **/proc/self/xxx**, **/proc/getpid()/xxx**, or related file systems. - -### containerd - -The changes to containerd are based on containerd-1.2-rc.1. - -When obtaining mounting information, **/proc/self/mountinfo** can obtain only the local mounting information of dockerd but cannot obtain that on the server. Therefore, **/proc/self/mountinfo** is changed to **/proc/1/mountinfo** to obtain the mounting information on the server by obtaining the mounting information of process 1 on the server. - -In containerd-shim, the Unix socket that communicates with containerd is changed to TCP. containerd obtains the IP address of the running environment of containerd-shim through the **SHIM_HOST** environment variable, that is, the IP address of the server. The has value of shim is used to generate a port number, which is used as the communication port to start containerd-shim. 
- -In addition, the original method of sending signals to containerd-shim is changed to the method of remotely invoking the `kill` command to send signals to shim, ensuring that Docker can correctly kill containers. - -### Kubernetes - -kubelet is not modified. The container QoS manager may fail to be configured for the first time. This error does not affect the subsequent pod startup process. - -## Container Management Plane Offload Operation Guide - -Start rexec_server on both the server and client. rexec_server on the server is used to invoke rexec to stat containerd-shim. rexec_server on the client is used to execute invoking of dockerd and containerd by containerd-shim. - -### Server - -Create a folder required by the container management plane, insert **qtfs_server.ko**, and start the engine process. - -In addition, you need to create the rexec script **/usr/bin/dockerd** on the server. - -``` shell -#!/bin/bash -CMD_NET_ADDR=tcp://: rexec /usr/bin/dockerd $* -``` - -### Client - -Prepare a rootfs as the running environment of dockerd and containerd. Use the following script to mount the server directories required by dockerd and containerd to the client. Ensure that the remote directories mounted in the script exist on both the server and client. - -``` shell -#!/bin/bash -mkdir -p /another_rootfs/var/run/docker/containerd -iptables -t nat -N DOCKER -echo "---------insmod qtfs ko----------" -insmod /YOUR/QTFS/PATH/qtfs.ko qtfs_server_ip= qtfs_log_level=INFO - -# The proc file system in the chroot environment is replaced by the proc shared file system of the DPU. The actual proc file system of the local host needs to be mounted to **/local_proc**. -mount -t proc proc /another_rootfs/local_proc/ - -# Bind the chroot internal environment to the external environment to facilitate configuration and running. -mount --bind /var/run/ /another_rootfs/var/run/ -mount --bind /var/lib/ /another_rootfs/var/lib/ -mount --bind /etc /another_rootfs/etc - -mkdir -p /another_rootfs/var/lib/isulad - -# Create and mount the dev, sys, and cgroup file systems in the chroot environment. -mount -t devtmpfs devtmpfs /another_rootfs/dev/ -mount -t sysfs sysfs /another_rootfs/sys -mkdir -p /another_rootfs/sys/fs/cgroup -mount -t tmpfs tmpfs /another_rootfs/sys/fs/cgroup -list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset" -for i in $list -do - echo $i - mkdir -p /another_rootfs/sys/fs/cgroup/$i - mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /another_rootfs/sys/fs/cgroup/$i -done - -## common system dir -mount -t qtfs -o proc /proc /another_rootfs/proc -echo "proc" -mount -t qtfs /sys /another_rootfs/sys -echo "cgroup" - -# Mount the shared directory required by the container management plane. 
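# Note: each qtfs mount below assumes the corresponding directory already exists on both the
# server and the client rootfs (see the note above this script); adjust the paths if your
# Docker or containerd data directories differ from the defaults shown here.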
-mount -t qtfs /var/lib/docker/containers /another_rootfs/var/lib/docker/containers -mount -t qtfs /var/lib/docker/containerd /another_rootfs/var/lib/docker/containerd -mount -t qtfs /var/lib/docker/overlay2 /another_rootfs/var/lib/docker/overlay2 -mount -t qtfs /var/lib/docker/image /another_rootfs/var/lib/docker/image -mount -t qtfs /var/lib/docker/tmp /another_rootfs/var/lib/docker/tmp -mkdir -p /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ -mount -t qtfs /run/containerd/io.containerd.runtime.v1.linux/ /another_rootfs/run/containerd/io.containerd.runtime.v1.linux/ -mkdir -p /another_rootfs/var/run/docker/containerd -mount -t qtfs /var/run/docker/containerd /another_rootfs/var/run/docker/containerd -mount -t qtfs /var/lib/kubelet/pods /another_rootfs/var/lib/kubelet/pods -``` - -In**/another_rootfs**, create the following script to support cross-host operations: - -* /another_rootfs/usr/local/bin/containerd-shim - -``` shell -#!/bin/bash -CMD_NET_ADDR=tcp://: /usr/bin/rexec /usr/bin/containerd-shim $* -``` - -* /another_rootfs/usr/local/bin/remote_kill - -``` shell -#!/bin/bash -CMD_NET_ADDR=tcp://: /usr/bin/rexec /usr/bin/kill $* -``` - -* /another_rootfs/usr/sbin/modprobe - -``` shell -#!/bin/bash -CMD_NET_ADDR=tcp://: /usr/bin/rexec /usr/sbin/modprobe $* -``` - -After changing the root directories of dockerd and containerd to the required rootfs, run the following command to start dockerd and containerd: - -* containerd - -``` shell -#!/bin/bash -SHIM_HOST= containerd --config /var/run/docker/containerd/containerd.toml --address /var/run/containerd/containerd.sock -``` - -* dockerd - -``` shell -#!/bin/bash -SHIM_HOST=CMD_NET_ADDR=tcp://: /usr/bin/dockerd --containerd /var/run/containerd/containerd.sock -``` - -* kubelet - -Use the original parameters to start kubelet in the chroot environment. - -Because **/var/run/** is bound to **/another_rootfs/var/run/**, you can use Docker to access the **docker.sock** interface for container management in the regular rootfs. - -The container management plane is offloaded to the DPU. You can run `docker` commands to create and delete containers, or use `kubectl` on the current node to schedule and destroy pods. The actual container service process runs on the host. - -> ![](./public_sys-resources/icon-note.gif)**NOTE**: -> -> This guide describes only the container management plane offload. The offload of container network and data volumes requires additional offload capabilities, which are not included. You can perform cross-node startup of containers that are not configured with network and storage by referring to this guide. diff --git a/docs/en/server/diversified_computing/dpu_offload/overview.md b/docs/en/server/diversified_computing/dpu_offload/overview.md deleted file mode 100644 index 53ef828f387423867ffc8194bf5cf879c768276c..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/overview.md +++ /dev/null @@ -1,11 +0,0 @@ -# Imperceptible DPU Offload User Guide - -This document describes the container management plane DPU offload function of openEuler, as well as how to install and deploy it. Through the unified abstraction layer provided by the OS, this function masks the differences in how the container management plane accesses resources across hosts. This makes it possible to offload services from the container management plane to the DPU. 
- -This document is intended for community developers, open source enthusiasts, and partners who use the openEuler OS and want to learn and use the OS kernel and containers. Users must: - -- Know basic Linux operations. - -- Be familiar with the fundamental mechanisms of the Linux kernel file system. - -- Understand Kubernetes and Docker, as well as how to deploy and use them. diff --git a/docs/en/server/diversified_computing/dpu_offload/public_sys-resources/icon-note.gif b/docs/en/server/diversified_computing/dpu_offload/public_sys-resources/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_offload/public_sys-resources/icon-note.gif and /dev/null differ diff --git a/docs/en/server/diversified_computing/dpu_offload/qtfs_architecture_and_usage.md b/docs/en/server/diversified_computing/dpu_offload/qtfs_architecture_and_usage.md deleted file mode 100644 index 7fe0544418ae42a58daab9eb2e091f945d944416..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/qtfs_architecture_and_usage.md +++ /dev/null @@ -1,77 +0,0 @@ -# qtfs Shared File System Architecture and Usage - -## Introduction - -qtfs is a shared file system project. It can be deployed on either a host-DPU hardware architecture or on two hosts. qtfs works in client-server mode, allowing the client to access specified file systems on the server in the same way that local files are accessed. - -qtfs provides the following features: - -+ Mount point propagation - -+ Sharing of special file systems such as proc, sys, and cgroup - -+ Shared read and write of remote files - -+ Remote mounting of server file systems on the client - -+ Customized processing of special files - -+ Remote FIFO, Unix sockets, and epoll that allow the client and server to access the files as if they were like local - -+ Bottom-layer host-DPU communication over the PCIe protocol, outperforming the network - -+ Kernel module development, preventing intrusive modification to the kernel - -## Software Architecture - -![qtfs-arch](./figures/qtfs-arch.png) - -## Installation - -Perform operations in the following qtfs-related directories: - -+ **qtfs**: code of the client kernel module. Compile the client **.ko** file in this directory. - -+ **qtfs_server**: code of the server kernel module. Compile the server **.ko** files and related programs in this directory. - -+ **qtinfo**: diagnosis tool that is used to check the status of file systems and change the log level. - -+ **demo**, **test**, and **doc**: demo programs, test programs, and project documents. - -+ Root directory: code of common modules used by the client and server. - -Configure the kernel compilation environment on two servers (or VMs). - -1. The kernel version must be 5.10 or later. -2. Install the kernel development package by running `yum install kernel-devel`. -3. Assume that the host IP address is 192.168.10.10 and the DPU IP address is 192.168.10.11. - -Install the qtfs server. - -```bash - 1. cd qtfs_server - 2. make clean && make - 3. insmod qtfs_server.ko qtfs_server_ip=192.168.10.10 qtfs_server_port=12345 qtfs_log_level=WARN - 4. nohup ./engine 16 1 192.168.10.10 12121 192.168.10.11 12121 2>&1 & -``` - -Install the qtfs client. - -```bash - 1. cd qtfs - 2. make clean && make - 3. insmod qtfs.ko qtfs_server_ip=192.168.10.10 qtfs_server_port=12345 qtfs_log_level=WARN - 4. cd ../ipc/ - 5. make clean && make && make install - 6. 
nohup udsproxyd 1 192.168.10.11 12121 192.168.10.10 12121 2>&1 & -``` - -## Usage - -After the installation is complete, mount the server file system to the client. For example: - -```bash - mount -t qtfs / /root/mnt/ -``` - -The file system is visible to the client. Access **/root/mnt** on the client to view and operate files on the server. diff --git a/docs/en/server/diversified_computing/dpu_offload/scripts/qemu-kvm b/docs/en/server/diversified_computing/dpu_offload/scripts/qemu-kvm deleted file mode 100644 index e869371be109b57f59709fc23bc5b1cb2002cfbf..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/scripts/qemu-kvm +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -exec /usr/bin/rexec /usr/bin/qemu-kvm $* diff --git a/docs/en/server/diversified_computing/dpu_offload/scripts/virt_start.sh b/docs/en/server/diversified_computing/dpu_offload/scripts/virt_start.sh deleted file mode 100644 index 06ca194b7a639a947b6e395f116beeba7c897459..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/scripts/virt_start.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -insmod ./qtfs.ko qtfs_server_ip=192.168.10.11 qtfs_log_level=NONE - -systemctl stop libvirtd - -if [ ! -d "/root/new_root/local_proc" ]; then - mkdir -p /root/new_root/local_proc -fi -if [ ! -d "/root/new_root/local" ]; then - mkdir -p /root/new_root/local -fi -mount -t proc proc /root/new_root/local_proc/ -mount -t proc proc /root/new_root/local/proc -mount -t sysfs sysfs /root/new_root/local/sys -mount --bind /var/run/ /root/new_root/var/run/ -mount --bind /var/lib/ /root/new_root/var/lib/ -mount --bind /var/cache/ /root/new_root/var/cache -mount --bind /etc /root/new_root/etc - -mkdir -p /root/new_root/home/VMs/ -mount -t qtfs /home/VMs/ /root/new_root/home/VMs/ - -mount -t qtfs /var/lib/libvirt /root/new_root/var/lib/libvirt - -mount -t devtmpfs devtmpfs /root/new_root/dev/ -mount -t hugetlbfs hugetlbfs /root/new_root/dev/hugepages/ -mount -t mqueue mqueue /root/new_root/dev/mqueue/ -mount -t tmpfs tmpfs /root/new_root/dev/shm - -mount -t sysfs sysfs /root/new_root/sys -mkdir -p /root/new_root/sys/fs/cgroup -mount -t tmpfs tmpfs /root/new_root/sys/fs/cgroup -list="perf_event freezer files net_cls,net_prio hugetlb pids rdma cpu,cpuacct memory devices blkio cpuset" -for i in $list -do - echo $i - mkdir -p /root/new_root/sys/fs/cgroup/$i - mount -t cgroup cgroup -o rw,nosuid,nodev,noexec,relatime,$i /root/new_root/sys/fs/cgroup/$i -done - -## common system dir -mount -t qtfs -o proc /proc /root/new_root/proc -echo "proc" - -mount -t qtfs /sys /root/new_root/sys -echo "cgroup" -mount -t qtfs /dev/pts /root/new_root/dev/pts -mount -t qtfs /dev/vfio /root/new_root/dev/vfio diff --git a/docs/en/server/diversified_computing/dpu_offload/scripts/virt_umount.sh b/docs/en/server/diversified_computing/dpu_offload/scripts/virt_umount.sh deleted file mode 100644 index 4adddec913c23069c6bffddec0bf1770f8c5ce71..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_offload/scripts/virt_umount.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -umount /root/new_root/dev/hugepages -umount /root/new_root/etc -umount /root/new_root/home/VMs -umount /root/new_root/local_proc -umount /root/new_root/local/proc -umount /root/new_root/var/lib/libvirt -umount /root/new_root/var/lib -umount /root/new_root/* -umount /root/new_root/dev/pts -umount /root/new_root/dev/mqueue -umount /root/new_root/dev/shm -umount /root/new_root/dev/vfio -umount 
/root/new_root/dev -rmmod qtfs - -umount /root/new_root/sys/fs/cgroup/* -umount /root/new_root/sys/fs/cgroup -umount /root/new_root/sys diff --git a/docs/en/server/diversified_computing/dpu_os/_toc.yaml b/docs/en/server/diversified_computing/dpu_os/_toc.yaml deleted file mode 100644 index 040400fa3b6582c8702736f8eb89a2664352edc5..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_os/_toc.yaml +++ /dev/null @@ -1,11 +0,0 @@ -label: DPU_OS -isManual: true -href: ./overview.md -description: This guide outlines the process of creating a DPU_OS image through openEuler OS customization, including deployment and validation procedures. -sections: - - label: DPU_OS Background and Requirements - href: ./dpu_os_background_and_requirements.md - - label: DPU_OS Tailoring Guide - href: ./dpu_os_tailoring_guide.md - - label: Verification and Deployment - href: ./verification_and_deployment.md diff --git a/docs/en/server/diversified_computing/dpu_os/dpu_os_background_and_requirements.md b/docs/en/server/diversified_computing/dpu_os/dpu_os_background_and_requirements.md deleted file mode 100644 index 849d841464bf3af5100d9aa7317794ebc6e8722b..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_os/dpu_os_background_and_requirements.md +++ /dev/null @@ -1,67 +0,0 @@ -# DPU-OS Background and Requirements - -## Overview - -In data center and cloud environments, Moore's Law has reached its limits, leading to a slowdown in the growth of general-purpose CPU computing power. At the same time, network I/O speeds and performance continue to rise, creating a growing disparity between the two. This gap highlights the inability of current general-purpose processors to meet the demands of network, disk, and other I/O processing. In traditional data centers, a significant portion of general-purpose CPU resources is consumed by I/O and management tasks, a phenomenon known as the "Datacenter Tax." AWS estimates that this tax can consume over 30% of a data center's computing power, and in some cases, even more. - -The DPU was introduced to address this issue by offloading management, network, storage, and security tasks from the host CPU to dedicated processor chips. This offloading accelerates processing, reduces costs, and improves efficiency. Leading cloud providers like AWS, Alibaba Cloud, and Huawei Cloud have developed custom chips to handle these offloaded tasks, ensuring that 100% of data center computing resources are available for customer use. - -The DPU market is experiencing rapid growth, driven by strong demand from cloud providers and big data applications. Numerous Chinese DPU startups have also entered the market with innovative products. This growth presents challenges for cloud and big data providers, who must integrate diverse DPU products, and for DPU manufacturers, who must adapt device drivers to customer-specified operating systems. openEuler, a leading open-source operating system in China, addresses these challenges by offering DPU-OS, a solution built on openEuler that bridges the gap between DPU manufacturers and customers. Furthermore, since DPUs rely on their OS to support service acceleration, DPU-OS requires performance optimization. By leveraging openEuler, DPU-related acceleration capabilities can be embedded into DPU-OS, fostering a robust DPU software ecosystem. 
- -## DPU-OS Requirements Analysis and Design - -### Current State of DPUs and OS Requirements - -DPUs exhibit several key characteristics and challenges: - -- Limited general-purpose processing resources - - DPUs are in the early stages of development, with hardware continuously evolving. Power constraints result in modest hardware specifications. Mainstream DPUs typically feature 8 to 24 CPU cores with limited single-core performance. Memory capacity ranges from 16 to 32GB, and local storage varies from tens to hundreds of gigabytes. The operating system running on DPUs must accommodate these constraints. - -- Varied DPU-OS installation methods - - The diversity of DPU manufacturers and products has led to multiple installation and deployment methods. These include PXE network installation, USB installation, and custom methods such as host-delivered installation images. - -- High performance requirements - - DPU application scenarios demand high performance. Compared to general-purpose server operating systems, DPU-OS may require specific kernel features or functional components. Examples include vDPA for device passthrough and live migration, vendor-specific driver support, seamless DPU process offloading, customized user-space data plane acceleration tools like DPDK/SPDK/OVS, and DPU management and monitoring tools. - -Based on these characteristics, the following requirements for DPU-OS are proposed: - -- Ultra-lightweight DPU-OS installation package - - Trim the openEuler system image to eliminate unnecessary packages and optimize system services to reduce resource overhead. - -- Customization support and tools - - Provide customization configurations and tools to enable customers or DPU manufacturers to tailor the system. openEuler offers an ISO reference implementation. - -- Customized kernel and system for peak performance - - Customize the kernel and drivers to deliver competitive features for DPUs. Enable hardware acceleration through tailored components and optimize system configurations for superior performance. Include DPU-related management and control tools for unified administration. - -### DPU-OS Design - -**Figure 1** Overall Design of DPU-OS - -![dpuos-arch](./figures/dpuos-arch.png) - -As illustrated in Figure 1, DPU-OS is structured into five layers: - -- **Kernel layer**: Customize the kernel configuration to remove non-essential features and modules, creating a lightweight kernel. Enable specific kernel features to deliver high-performance DPU capabilities. - -- **Driver layer**: Trim and customize openEuler native drivers, selecting the minimal required set. Integrate DPU vendor-specific drivers to natively support certain DPU hardware products. - -- **System configuration layer**: Optimize system settings through sysctl and proc configurations to ensure peak performance for DPU-related services. - -- **Peripheral package layer**: Customize and trim openEuler peripheral packages, selecting the minimal set. Provide a suite of DPU-related custom tools. - -- **System service layer**: Streamline native system service startup items to eliminate unnecessary services, minimizing runtime overhead. - -This five-layer design achieves the goal of a lightweight, high-performance DPU-OS. While this is a long-term design heavily reliant on the DPU software and hardware ecosystem, the current phase focuses on trimming using openEuler's imageTailor tool. - -For detailed steps on DPU-OS trimming, refer to the [DPU-OS Tailoring Guide](./dpu-os-tailoring-guide.md). 
For verification and deployment, consult the [DPU-OS Deployment and Verification Guide](./verification-and-deployment.md). - -> ![](./public_sys-resources/icon-note.gif)**Note**: -> -> Currently, DPU-OS leverages openEuler's existing kernel and peripheral packages, trimmed using the imageTailor tool to produce a lightweight OS installation image. Future development will integrate additional kernel and peripheral package features based on specific needs. diff --git a/docs/en/server/diversified_computing/dpu_os/dpu_os_tailoring_guide.md b/docs/en/server/diversified_computing/dpu_os/dpu_os_tailoring_guide.md deleted file mode 100644 index 489ec5299b6dd504cb23733d10bb0ad1677ad82e..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_os/dpu_os_tailoring_guide.md +++ /dev/null @@ -1,65 +0,0 @@ -# DPU-OS Tailoring Guide - -This document explains how to use imageTailor to trim the DPU-OS installation image using configuration files from the [dpu-utilities repository](https://gitee.com/openeuler/dpu-utilities/tree/master/dpuos). Follow these steps: - -## Prepare imageTailor and Required RPM Packages - -Install the imageTailor tool by referring to the [imageTailor User Guide](https://docs.openeuler.org/zh/docs/22.03_LTS/docs/TailorCustom/imageTailor%E4%BD%BF%E7%94%A8%E6%8C%87%E5%8D%97.html) and prepare the necessary RPM packages for tailoring. - -You can use the openEuler installation image as the RPM source. While **openEuler-22.03-LTS-everything-debug-aarch64-dvd.iso** contains a complete set of RPMs, it is large. Alternatively, use the RPMs from **openEuler-22.03-LTS-aarch64-dvd.iso** along with the install-scripts.noarch package. - -Obtain the `install-scripts.noarch` package from the everything repository or download it using yum: - -```bash -yum install -y --downloadonly --downloaddir=./ install-scripts -``` - -## Copy DPUOS Configuration Files - -The imageTailor tool is installed in **/opt/imageTailor** by default. Copy the DPU-OS configuration files to the appropriate paths, selecting the correct architecture directory. The DPU-OS tailoring configuration repository supports x86_64 and aarch64 architectures. - -```bash -cp -rf custom/cfg_dpuos /opt/imageTailor/custom -cp -rf kiwi/minios/cfg_dpuos /opt/imageTailor/kiwi/minios/cfg_dpuos -``` - -## Modify Other Configuration Files - -- Add a line for `dpuos` configuration in **kiwi/eulerkiwi/product.conf**: - -```bash -dpuos PANGEA EMBEDDED DISK GRUB2 install_mode=install install_media=CD install_repo=CD selinux=0 -``` - -- Add a line for `dpuos` configuration in **kiwi/eulerkiwi/minios.conf**: - -```bash -dpuos kiwi/minios/cfg_dpuos yes -``` - -- Add a line for `dpuos` configuration in **repos/RepositoryRule.conf**: - -```bash -dpuos 1 rpm-dir euler_base -``` - -## Set Passwords - -Navigate to **/opt/imageTailor** and update the passwords in the following files: - -- **custom/cfg_dpuos/usr_file/etc/default/grub** - -- **custom/cfg_dpuos/rpm.conf** - -- **kiwi/minios/cfg_dpuos/rpm.conf** - -For password generation and modification, refer to the openEuler imageTailor manual section on [Configuring Initial Passwords](../../../tools/community_tools/image_custom/image_tailor/imagetailor_user_guide.md#configuring-initial-passwords). - -## Execute the Tailoring Command - -Run the following command to perform the tailoring. 
The resulting ISO will be saved in **/opt/imageTailor/result**: - -```bash -cd /opt/imageTailor -./mkdliso -p dpuos -c custom/cfg_dpuos --sec --minios force -``` diff --git a/docs/en/server/diversified_computing/dpu_os/figures/dpuos-arch.png b/docs/en/server/diversified_computing/dpu_os/figures/dpuos-arch.png deleted file mode 100644 index 453370ab07858a13a6c40f8d22e3f608e9ec6b4c..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_os/figures/dpuos-arch.png and /dev/null differ diff --git a/docs/en/server/diversified_computing/dpu_os/overview.md b/docs/en/server/diversified_computing/dpu_os/overview.md deleted file mode 100644 index 89d83786b9a29940803a05f959d209dc6d9f1c4c..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_os/overview.md +++ /dev/null @@ -1,11 +0,0 @@ -# Overview - -This document outlines the background requirements and design principles of DPU-OS. It also details the process of creating a DPU-OS image by customizing the openEuler operating system, along with deployment and verification methods. The feature leverages the openEuler ecosystem to deliver a lightweight, high-performance DPU-OS, offering a reference implementation for data processing unit (DPU) scenarios and users. - -This document targets community developers, DPU vendors, and customers using openEuler who want to explore and adopt DPUs. Users should possess the following skills and knowledge: - -- Proficiency in basic Linux operations. - -- Understanding of Linux system construction and deployment fundamentals. - -- Familiarity with the openEuler imageTailor tool for image customization. diff --git a/docs/en/server/diversified_computing/dpu_os/public_sys-resources/icon-note.gif b/docs/en/server/diversified_computing/dpu_os/public_sys-resources/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/en/server/diversified_computing/dpu_os/public_sys-resources/icon-note.gif and /dev/null differ diff --git a/docs/en/server/diversified_computing/dpu_os/verification_and_deployment.md b/docs/en/server/diversified_computing/dpu_os/verification_and_deployment.md deleted file mode 100644 index 0a468aac1b889ad465589b008fc2acac7fc12c05..0000000000000000000000000000000000000000 --- a/docs/en/server/diversified_computing/dpu_os/verification_and_deployment.md +++ /dev/null @@ -1,38 +0,0 @@ -# Verification and Deployment - -Once DPU-OS is built, it can be installed and deployed for verification. Since DPU hardware is still in its early stages, you can also use VirtualBox to set up a virtual machine for deployment and testing. - -## Deploying DPU-OS on VirtualBox - -This section outlines the steps to install and deploy DPU-OS using the VirtualBox hypervisor. - -### Preparation for Verification - -Before deploying DPU-OS, ensure the following prerequisites are met: - -- Obtain the DPU-OS ISO file. -- Ensure the host machine has VirtualBox installed. - -### Initial Installation and Startup - -#### Creating a Virtual Machine - -Create a new virtual machine in VirtualBox: - -- Configure the virtual machine with at least 2 CPUs and 4GB of RAM. - -- Allocate a virtual disk with a recommended size of 60GB or larger. - -- Enable EFI boot in the system extension properties. - -- In the storage settings, select the local DPU-OS ISO file as the optical drive. - -- Customize other settings such as network or display as needed. 
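-
-If you prefer to script the virtual machine setup, the same configuration can be created with VirtualBox's `VBoxManage` command-line tool. The sketch below only mirrors the settings listed above; the VM name, disk file name, and ISO path are placeholders to adapt to your environment.
-
-```bash
-# Create and register the VM (name and OS type are illustrative).
-VBoxManage createvm --name dpu-os --ostype Linux_64 --register
-# At least 2 CPUs, 4 GB of RAM, and EFI boot, as recommended above.
-VBoxManage modifyvm dpu-os --cpus 2 --memory 4096 --firmware efi
-# Allocate a 60 GB virtual disk and attach it through a SATA controller.
-VBoxManage createmedium disk --filename dpu-os.vdi --size 61440
-VBoxManage storagectl dpu-os --name SATA --add sata --controller IntelAhci
-VBoxManage storageattach dpu-os --storagectl SATA --port 0 --device 0 --type hdd --medium dpu-os.vdi
-# Attach the local DPU-OS ISO as the optical drive (path is a placeholder).
-VBoxManage storageattach dpu-os --storagectl SATA --port 1 --device 0 --type dvddrive --medium ./DPU-OS.iso
-```
-
-Running `VBoxManage startvm dpu-os` afterwards is equivalent to clicking **Start** in the GUI.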
- -#### Starting the Virtual Machine - -Start the newly created virtual machine and choose **Install from ISO** to begin the DPU-OS installation. The installation process is automated and requires no manual input. After installation, the system will reboot automatically. - -Select **Boot From Local Disk** to start DPU-OS. Use the password specified during the DPU-OS creation process. - -By following these steps, you can successfully deploy and verify DPU-OS locally. diff --git a/docs/en/server/memory_storage/etmem/_toc.yaml b/docs/en/server/memory_storage/etmem/_toc.yaml deleted file mode 100644 index 22d465a29fa6166bd9bc16c0646cb8885c9b7285..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/etmem/_toc.yaml +++ /dev/null @@ -1,6 +0,0 @@ -label: etmem User Guide -isManual: true -description: Expand memory capacity with etmem. -sections: - - label: Using etmem - href: ./etmem_user_guide.md diff --git a/docs/en/server/memory_storage/etmem/etmem_user_guide.md b/docs/en/server/memory_storage/etmem/etmem_user_guide.md deleted file mode 100644 index 5490e7171f1b0620127b3d97e6c639dbce8fa0d5..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/etmem/etmem_user_guide.md +++ /dev/null @@ -1,773 +0,0 @@ -# etmem User Guide - -## Introduction - -The development of CPU computing power, particularly lower costs of ARM cores, makes memory cost and capacity become the core frustration that restricts business costs and performance. Therefore, the most pressing issue is how to save memory cost and how to expand memory capacity. - -etmem is a tiered memory expansion technology that uses DRAM+memory compression/high-performance storage media to form tiered memory storage. Memory data is tiered, and cold data is migrated from memory media to high-performance storage media to release memory space and reduce memory costs. - -The tools provided by the etmem software package include the etmem client and the etmemd server. etmemd runs continuously after being launched and implements functions such as recognition and elimination of cold and hot memory of target processes. etmem runs once when called and controls etmemd to respond with different operations based on different command parameters. - -## Compilation - -1. Download the etmem source code. - - ```shell - git clone https://gitee.com/openeuler/etmem.git - ``` - -2. Install the compilation and running dependency. The compilation and running of etmem depend on the libboundscheck component. - - Install the dependency: - - ```bash - yum install libboundscheck - ``` - - Use the `rpm` command to check if the package is installed: - - ```bash - rpm -qi libboundscheck - ``` - -3. Build source code. - - ```bash - cd etmem - mkdir build - cd build - cmake .. - make - ``` - -## Precautions - -### Dependencies - -As a memory expansion tool, etmem needs to rely on kernel features. To identify memory access conditions and support the active writing of memory into the swap partition to achieve the requirement of vertical memory expansion, etmem needs to insert the **etmem_scan** and **etmem_swap** modules at runtime: - -```bash -modprobe etmem_scan -modprobe etmem_swap -``` - -### Restrictions - -The etmem process requires root privileges. The root user has the highest system privileges. When using the root user to perform operations, strictly follow the operation instructions to avoid system management and security risks. - -### Constraints - -- The client and server of etmem must be deployed on the same server. 
Cross-server communication is not supported.
-- etmem can scan target processes whose process name is less than or equal to 15 characters. Supported characters in process names are letters, numbers, periods (.), slashes (/), hyphens (-), and underscores (\_).
-- When AEP media is used for memory expansion, it relies on the system being able to correctly recognize the AEP device and initialize the device as a NUMA node. Additionally, the **vm_flags** field in the configuration file can only be configured as **ht**.
-- The private commands of the engine are only valid for the corresponding engine and tasks under the engine, such as `showhostpages` and `showtaskpages` supported by cslide.
-- In third-party policy implementations, **fd** in the `eng_mgt_func` interface cannot be written with the **0xff** and **0xfe** characters.
-- Multiple different third-party policy dynamic libraries, distinguished by **eng_name** in the configuration file, can be added within a project.
-- Concurrent scanning of the same process is prohibited.
-- Using the **/proc/xxx/idle_pages** and **/proc/xxx/swap_pages** files is prohibited when the **etmem_scan** and **etmem_swap** modules are not loaded.
-- The etmem configuration file requires the owner to be the root user, with permissions of 600 or 400. The size of the configuration file cannot exceed 10 MB.
-- When etmem injects a third-party policy, the **.so** file of the third-party policy requires the owner to be the root user, with permissions of 500 or 700.
-
-## Instructions
-
-### etmem Configuration Files
-
-Before running the etmem process, the administrator needs to decide which processes require memory expansion, configure the process information in the etmem configuration files, and set parameters such as the memory scan interval, number of scans, and hot and cold memory thresholds.
-
-The configuration file examples are included in the source package and stored in the **/etc/etmem** directory. 
There are three example files: - -```text -/etc/etmem/cslide_conf.yaml -/etc/etmem/slide_conf.yaml -/etc/etmem/thirdparty_conf.yaml -``` - -Contents of the files are as follows: - -```sh -#slide engine example -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -sysmem_threshold=50 -swapcache_high_vmark=10 -swapcache_low_vmark=6 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -swap_threshold=10g -swap_flag=yes - -#cslide engine example -#cslide_conf.yaml -[engine] -name=cslide -project=test -node_pair=2,0;3,1 -hot_threshold=1 -node_mig_quota=1024 -node_hot_reserve=1024 - -[task] -project=test -engine=cslide -name=background_cslide -type=pid -name=23456 -vm_flags=ht -anon_only=no -ign_host=no - -#Third-party engine example -#thirdparty_conf.yaml -[engine] -name=thirdparty -project=test -eng_name=my_engine -libname=/usr/lib/etmem_fetch/my_engine.so -ops_name=my_engine_ops -engine_private_key=engine_private_value - -[task] -project=test -engine=my_engine -name=background_third -type=pid -value=12345 -task_private_key=task_private_value -``` - -Fields in the configuration files are described as follows: - -| Item | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| \[project\] | Beginning identifier of the project public configuration section | No | No | N/A | Beginning identifier of the project parameters, indicating that the parameters below are within the range of the project section until another \[xxx\] or the end of the file | -| name | Name of the project | Yes | Yes | String of up to 64 characters | Specifies that the project, engine and task need to be mounted to the specified project during configuration. | -| loop | Number of loops for memory scan | Yes | Yes | 1~120 | loop=3 // Memory is scanned 3 times. | -| interval | Time interval for each memory scan | Yes | Yes | 1~1200 | interval=5 // The interval is 5s. | -| sleep | Time interval for each memory scan+operation | Yes | Yes | 1~1200 | sleep=10 //The interval is 10s | -| sysmem_threshold | Memory swapping threshold. This is a slide engine configuration item. | No | Yes | 0~100 | sysmem_threshold=50 // When available memory is less than 50%, etmem swaps out memory. | -| swapcache_high_wmark | High watermark of swapcache. This is a slide engine configuration item. | No | Yes | 1~100 | swapcache_high_wmark=5 // swapcache can be up to 5% of the system memory. If this ratio is reached, etmem triggers swapcache recycling.
Note: swapcache_high_wmark must be greater than swapcache_low_wmark. | -| swapcache_low_wmark | Low watermark of swapcache. This is a slide engine configuration item. | No | Yes | \[1~swapcache_high_wmark\) | swapcache_low_wmark=3 //When swapcache recycling is triggered, the system recycles the swapcache memory occupancy to less than 3%. | -| \[engine\] | Beginning identifier of the engine public configuration section | No | No | N/A | Beginning identifier of the engine parameters, indicating that the parameters below are within the range of the engine section until another \[xxx\] or the end of the file | -| project | project to which the engine belongs | Yes | Yes | String of up to 64 characters | If a project named test exists, the item can be **project=test**. | -| engine | engine to which the engine belongs | Yes | Yes | slide/cslide/thirdparty | Specifies the policy to use (**slide**, **cslide**, or **thirdparty**) | -| node_pair | Node pair of AEP and DRAM. This is a cslide engine configuration item. | Yes when **engine** is **cslide** | Yes | Pair the node numbers of AEP and DRAM. Separate AEP and DRAM using a comma, and separate each pair using semicolons. | node_pair=2,0;3,1 | -| hot_threshold | Threshold of hot memory watermark. This is a cslide engine configuration item. | Yes when **engine** is **cslide** | Yes | An integer greater than or equal to 0 and less than or equal to INT_MAX | hot_threshold=3 // Memory with less than 3 accesses will be recognized as cold memory. | -| node_mig_quota | Maximum one-way flow when DRAM and AEP migrate mutually. This is a cslide engine configuration item. | Yes when **engine** is **cslide** | Yes | An integer greater than or equal to 0 and less than or equal to INT_MAX | node_mig_quota=1024 // The unit is MB. A maximum of 1024 MB can be migrated from AEP to DRAM or from DRAM to AEP each time. | -| node_hot_reserve | Size of the reserved space for hot memory in DRAM. This is a cslide engine configuration item. | Yes when **engine** is **cslide** | Yes | An integer greater than or equal to 0 and less than or equal to INT_MAX | node_hot_reserve=1024 //The unit is MB. When the hot memory of all VMs is greater than this configuration value, the hot memory will also be migrated to AEP. | -| eng_name | Name of the engine for mounting by task. This is a third-party engine configuration item. | Yes when **engine** is **thirdparty** | Yes | String of up to 64 characters | eng_name=my_engine // When mounting a task to the third-party policy engine, specify **engine=my_engine** in the task. | -| libname | Absolute path to the dynamic library of the third-party policy. This is a third-party engine configuration item. | Yes when **engine** is **thirdparty** | Yes | String of up to 256 characters | libname=/user/lib/etmem_fetch/code_test/my_engine.so | -| ops_name | Name of the operator in the dynamic library of the third-party policy. This is a third-party engine configuration item. | Yes when **engine** is **thirdparty** | Yes | String of up to 256 characters | ops_name=my_engine_ops // Name of the struct for the third-party policy implementation interface | -| engine_private_key | Reserved item for third-party policies to parse private parameters by themselves. This is a third-party engine configuration item. | No | No | Restrict according to the third-party policy's private parameters. | Configure the private engine parameters according to the third-party policy. 
| -| \[task\] | Beginning identifier of the task public configuration section | No | No | N/A | Beginning identifier of the task parameters, indicating that the parameters below are within the range of the project section until another \[xxx\] or the end of the file | -| project | project to which the task belongs | Yes | Yes | String of up to 64 characters | If a project named test exists, the item can be **project=test**. | -| engine | engine to which the task belongs | Yes | Yes | String of up to 64 characters | Name of the engine to which the task belongs | -| name | Name of the task | Yes | Yes | String of up to 64 characters | name=background1 // The name of the task is background1. | -| type | How the target process is identified | Yes | Yes | pid/name | **pid** specifies to identify by PID. **name** specifies to identify by name. | -| value | Value to be identified for the target process | Yes | Yes | Actual PID/name | Used with **type** to specify the PID or name of the target process. Ensure the configuration is correct and unique. | -| T | Threshold of hot memory watermark. This is a slide engine configuration item. | Yes when **engine** is **slide** | Yes | 0~loop * 3 | T=3 // Memory with less than 3 accesses will be recognized as cold memory. | -| max_threads | Maximum number of threads in the etmem internal thread pool, with each thread handling a process/subprocess memory scan+operation task. This is a slide engine configuration item. | No | Yes | 1~2 * number of cores + 1, the default value is 1. | Controls the number of internal processing threads for the etmemd server without external representation. When the target process has multiple child processes, the larger the item value, the more concurrent executions, but the more resources consumed. | -| vm_flags | Flag of the VMA to be scanned. This is a cslide engine configuration item. | No | Yes | String of up to 256 characters, with different flags separated by spaces. | vm_flags=ht // Scans memory of the VMA whose flag is ht. | -| anon_only | Scans anonymous pages only. This is a cslide engine configuration item. | No | Yes | yes/no | anon_only=no | -| ign_host | Ignores page table scan information on the host. This is a cslide engine configuration item. | No | Yes | yes/no | ign_host=no | -| task_private_key | Reserved for a task of a third-party policy to parse private parameters. This is a third-party engine configuration item. | No | No | Restrict according to the third-party policy's private parameters. | Configure the private task parameters according to the third-party policy. | -| swap_threshold | Process memory swapping threshold. This is a slide engine configuration item. | No | Yes | Absolute value of memory available to the process | swap_threshold=10g // Memory swapping will not be triggered when the process memory is less than 10 GB.
Currently, the unit can only be **g** or **G**. This item is used with **sysmem_threshold**. When system memory is lower than **sysmem_threshold**, memory of processes in the allowlist is checked. | -| swap_flag | Enables process memory swapping. This is a slide engine configuration item. | No | Yes | yes/no | swap_flag=yes | - -### Starting etmemd - -Modify related configuration files before using etmem services. After being started, etmemd stays in the system to operate the memory of the target processes.To start etmemd, you can either run the `etmemd` command or configure a service file for `systemctl` to start etmemd. The latter requires the `mode-systemctl` option. - -#### How to Use - -Run the following command to start etmemd: - -```bash -etmemd -l 0 -s etmemd_socket -``` - -or - -```bash -etmemd --log-level 0 --socket etmemd_socket -``` - -The `0` parameter of option `-l` and the `etmemd_socket` parameter of option `-s` are user-defined parameters and are described as follows. - -#### Command Parameters - -| Option | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| --------------- | ---------------------------------- | -------- | ---------- | --------------------- | ------------------------------------------------------------ | -| -l or \-\-log-level | etmemd log level | No | Yes | 0~3 | 0: debug level
1: info level
2: warning level
3: error level
Logs whose levels are higher than the specified value are printed to **/var/log/message**. | -| -s or \-\-socket | Socket listened by etmemd to interact with the client | Yes | Yes | String of up to 107 characters | Socket listened by etmemd | -| -m or \-\-mode-systemctl| Starts the etmemd service through systemctl | No| No| N/A| The `-m` option needs to be specified in the service file.| -| -h or \-\-help | Prints help information | No | No | N/A | This option prints help information and exit. | - -### Adding and Deleting Projects, Engines, and Tasks Using the etmem Client - -#### Scenario - -1. The administrator adds a project, engine, or task to etmem (a project can contain multiple etmem engines, an engine can contain multiple tasks). - -2. The administrator deletes an existing etmem project, engine, or task (all tasks in a project is stopped before the project is deleted). - -#### Usage - -When etmemd is running normally, run `etmem` with the `obj` option to perform addition and deletion. etmem automatically identifies projects, engines, or tasks according to the content of the configuration file. - -- Add an object. - - ```bash - etmem obj add -f /etc/etmem/slide_conf.yaml -s etmemd_socket - ``` - - or - - ```bash - etmem obj add --file /etc/etmem/slide_conf.yaml --socket etmemd_socket - ``` - -- Delete an object. - - ```bash - etmem obj del -f /etc/etmem/slide_conf.yaml -s etmemd_socket - ``` - - or - - ```bash - etmem obj del --file /etc/etmem/slide_conf.yaml --socket etmemd_socket - ``` - -#### Command Parameters - -| Option | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| ---------------- | -------------------------------------------------------------------------------------------------------------- | --------- | ------------------- | ----------------------------------------------------------------------------------------------------- | ------- | -| -f or \-\-file | Specifies the configuration file of the object. | Yes | Yes | Specify the path. | | -| -s or \-\-socket | Socket used for communication with etmemd, which must be the same as the one specified when etmemd is started. | Yes | Yes | The administrator can use this option to specify an etmemd server when multiple etmemd servers exist. | | - -### Querying, Starting, and Stopping Projects Using the etmem Client - -#### Scenario - -A project is added by using `etmem obj add` and is not deleted by using `etmem obj del`. In this case, the project can be started and stopped. - -1. The administrator starts an added project. - -2. The administrator stops a started project. - -A started project will be stopped if the administrator run `obj del` to delete the project. - -#### Usage - -Added projects can be started and stopped by using `etmem project` commands. - -- Query a project. - - ```bash - etmem project show -n test -s etmemd_socket - ``` - - or - - ```bash - etmem project show --name test --socket etmemd_socket - ``` - -- Start a project. - - ```bash - etmem project start -n test -s etmemd_socket - ``` - - or - - ```bash - etmem project start --name test --socket etmemd_socket - ``` - -- Stop a project. - - ```bash - etmem project stop -n test -s etmemd_socket - ``` - - or - - ```bash - etmem project stop --name test --socket etmemd_socket - ``` - -- Print help information. 
- - ```bash - etmem project help - ``` - -#### Command Parameters - -| Option | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| ---------------- | -------------------------------------------------------------------------------------------------------------- | --------- | ------------------- | ----------------------------------------------------------------------------------------------------- | ------- | -| -n or \-\-name | Name of the project | Yes | Yes | The project name corresponds to the configuration file. | | -| -s or \-\-socket | Socket used for communication with etmemd, which must be the same as the one specified when etmemd is started. | Yes | Yes | The administrator can use this option to specify an etmemd server when multiple etmemd servers exist. | | - -### Specifying System Memory Swapping Threshold and Process Memory Swapping Using the etmem Client - -Only slide policies support private features. - -- Process or system memory swapping threshold - -It is necessary to consider the timing of etmem memory swapping for optimal performance. Memory swapping is not performed when the system has enough available memory or a process occupies a low amount of memory. Memory swapping threshold can be specified for the system and processes. - -- Process memory swapping - -The memory of I/O latency-sensitive service processes should not be swapped in the storage scenario. In this case, you can disable memory swapping for certain services. - -Process and system memory swapping thresholds and process memory swapping are controlled by the **sysmem_threshold**, **swap_threshold**, and **swap_flag** parameters in the configuration file. For details, see [etmem Configuration Files](#etmem-configuration-files). - -```sh -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -sysmem_threshold=50 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -swap_threshold=10g -swap_flag=yes -``` - -#### System Memory Swapping Threshold - -The **sysmem_threshold** parameter is used to set system memory swapping threshold. The value range for **sysmem_threshold** is 0 to 100. If **sysmem_threshold** is set in the configuration file, etmem will swap memory when system memory is lower than **sysmem_threshold**. - -For example: - -1. Compose the configuration according to the example. Set **sysmem_threshold** to **20**. -2. Start the server, add a project to the server, and start the project. - - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -3. Observe the memory swapping results. etmem swaps memory only when the system available memory is less than 20%. - -#### Process Memory Swapping Threshold - -The **swap_threshold** parameter is used to set process memory swapping threshold. **swap_threshold** is the absolute memory usage of a process in the format of \**g/G**. If **swap_threshold** is set in the configuration file, etmem will not swap memory of the process when the process memory usage is lower then **swap_threshold**. - -For example: - -1. Compose the configuration according to the example. Set **swap_threshold** to **5g**. -2. Start the server, add a project to the server, and start the project. 
- - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -3. Observe the memory swapping results. etmem swaps memory only when the process memory usage reaches 5 GB. - -#### Process Memory Swapping - -The **swap_flag** parameter is used to enable the process memory swapping feature. The value of **swap_flag** can be **yes** or **no**. If **swap_flag** is **no** or not configured, etmem swaps memory normally. If **swap_flag** is **yes**, etmem swaps memory of the specified processes only. - -For example: - -1. Compose the configuration according to the example. Set **swap_flag** to **yes**. -2. Flag the memory to be swapped for the service process. - - ```bash - madvise(addr_start, addr_len, MADV_SWAPFLAG) - ``` - -3. Start the server, add a project to the server, and start the project. - - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -4. Observe the memory swapping results. Only the flagged memory is swapped. Other memory is retained in the DRAM. - -In the process memory page swapping scenario, `ioctl` is added to the original scan interface file **idle_pages** to ensure that VMAs that are not flagged do not participate in memory scanning and swapping. - -Scan management interface: - -- Function prototype - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- Input parameters - 1. fd: file descriptor, which is obtained by opening a file under /proc/pid/idle_pages using the open system call - 2. cmd: controls the scan actions. The following values are supported: - VMA_SCAN_ADD_FLAGS: adds VMA memory swapping flags to scan only flagged VMAs - VMA_SCAN_REMOVE_FLAGS: removes added VMA memory swapping flags - 3. args: integer pointer parameter used to pass a specific mask. The following value is supported: - VMA_SCAN_FLAG: Before the etmem_scan.ko module starts scanning, the walk_page_test interface is called to determine whether the VMA address meets the scanning requirements. If this flag is set, only the VMA addresses that contain specific swap flags are scanned. - -- Return values - 1. 0 if the command succeeds - 2. Other values if the command fails - -- Precautions - Unsupported flags are ignored and do not return errors. - -### Specifying swapcache Memory Recycling Instructions Using the etmem Client - -The user-mode etmem initiates a memory elimination and recycling operation and interacts with the kernel-mode memory recycling module through the **write procfs** interface. The memory recycling module parses the virtual address sent from the user space, obtains the page corresponding to the address, and calls the native kernel interface to swap and recycle the memory corresponding to the page. During memory swapping, swapcache will use some system memory. To further save memory, the swapcache memory recycling feature is added. - -Add **swapcache_high_wmark** and **swapcache_low_wmark** parameters to use the swapcache memory recycling feature. - -- **swapcache_high_wmark**: High system memory water of swapcache -- **swapcache_low_wmark**: Low system memory water of swapcache - -After etmem swaps memory, it checks the swapcache memory occupancy. 
When the occupancy exceeds the high watermark, an `ioctl` instruction will be issued through **swap_pages** to trigger the swapcache memory recycling and stop when swapcache memory occupancy reaches the low watermark. - -An example configuration file is as follows. For details, see [etmem Configuration Files](#etmem-configuration-files). - -```sh -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -swapcache_high_vmark=5 -swapcache_low_vmark=3 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -``` - -During memory swapping, swapcache memory needs to be recycled to further save memory. An `ioctl` interface is added to the original memory swap interface to configure swapcache watermarks and swapcache memory recycling. - -- Function prototype - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- Input parameters - 1. fd: file descriptor, which is obtained by opening a file under /proc/pid/idle_pages using the open system call - 2. cmd: controls the scan actions. The following values are supported: - RECLAIM_SWAPCACHE_ON: enables swapcache memory swapping - RECLAIM_SWAPCACHE_OFF: disables swapcache memory swapping - SET_SWAPCACHE_WMARK: configures swapcache memory watermarks - 3. args: integer pointer parameter used to pass a specific mask. The following value is supported: - Parameters that pass the values of swapcache watermarks - -- Return values - 1. 0 if the command succeeds - 2. Other values if the command fails - -- Precautions - Unsupported flags are ignored and do not return errors. - -### Executing Private Commands and Functions Using the etmem Client - -Only the cslide policy support private commands. - -- `showtaskpages` -- `showhostpages` - -For engines and tasks of engines that use the cslide policy, you can run the commands above to query the page access of tasks and the usage of system huge pages on the host of VMs. - -For example: - -```bash -etmem engine showtaskpages <-t task_name> -n proj_name -e cslide -s etmemd_socket - -etmem engine showhostpages -n proj_name -e cslide -s etmemd_socket -``` - -**Note**: `showtaskpages` and `showhostpages` are supported by the cslide policy only. - -#### Command Parameters - -| Option | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| ------------------- | -------------------------------------------------------------------------------------------------------------- | --------- | ------------------- | ----------------------------------------------------------------------------------------------------- | ------- | -| -n or \-\-proj_name | Name of the project | Yes | Yes | Name of an existing project to run | | -| -s or \-\-socket | Socket used for communication with etmemd, which must be the same as the one specified when etmemd is started. | Yes | Yes | The administrator can use this option to specify an etmemd server when multiple etmemd servers exist. | | -| -e or \-\-engine | Name of the engine to run | Yes | Yes | Name of an existing engine to run | | -| -t or \-\-task_name | Name of the task to run | No | Yes | Name of an existing task to run | | - -### Enabling and Disabling Kernel Swap - -When etmem swaps memory to the drive to expand memory, you can choose to enable the kernel swap feature. You can disable the native kernel swap mechanism to void swapping memory undesirably, resulting in problems with user-mode processes. - -A sys interface is provided to implement such control. 
A **kobj** named **kernel_swap_enable** is created in **/sys/kernel/mm/swap** to enable and disable kerne swap. The default value of **kernel_swap_enable** is **true**. - -For example: - -```sh -# Enable kernel swap -echo true > /sys/kernel/mm/swap/kernel_swap_enable -or -echo 1 > /sys/kernel/mm/swap/kernel_swap_enable - -# Disable kernel swap -echo false > /sys/kernel/mm/swap/kernel_swap_enable -or -echo 0 > /sys/kernel/mm/swap/kernel_swap_enable - -``` - -### Starting etmem Upon System Startup - -#### Scenario - -You can configure the systemd configuration file to run etmemd as a forking service of systemd. - -#### Usage - -Compose a service configuration file to start etmemd with the `-m` option. For example: - -```bash -etmemd -l 0 -s etmemd_socket -m -``` - -#### Command Parameters - -| Option | Description | Mandatory | Contains Parameters | Parameter Range | Example | -| --------------- | ---------------------------------- | -------- | ---------- | --------------------- | ------------------------------------------------------------ | -| -l or \-\-log-level | etmemd log level | No | Yes | 0~3 | 0: debug level
1: info level
2: warning level
3: error level
Logs whose levels are higher than the specified value are printed to **/var/log/message**. | -| -s or \-\-socket | Socket listened by etmemd to interact with the client | Yes | Yes | String of up to 107 characters | Socket listened by etmemd | -| -m or \-\-mode-systemctl| Starts the etmemd service through systemctl | No| No| N/A| The `-m` option needs to be specified in the service file.| -| -h or \-\-help | Prints help information | No | No | N/A | This option prints help information and exit. | - -### Supporting Third-party Memory Expansion Policies With etmem - -#### Scenario - -etmem provides third-party memory expansion policy registration and module scanning dynamic library and can eliminate memory according to third-party policies. - -You can use the module scanning dynamic library to implement the interface of the struct required for connecting to etmem. - -#### Usage - -To use a third-party memory expansion elimination policy, perform the following steps: - -1. Invoke the scanning interface of the module as required. - -2. Implement the interfaces using the function template provided by the etmem header file and encapsulate them into a struct. - -3. Build a dynamic library of the third-party memory expansion elimination policy. - -4. Specify the **thirdparty** engine in the configuration file. - -5. Enter the names of the library and the interface struct to the corresponding **task** fields in the configuration file. - -Other steps are similar to those of using other engines. - -Interface struct template: - -```c -struct engine_ops { - -/* Parsing private parameters of the engine. Implement the interface if required, otherwise, set it to NULL. */ - -int (*fill_eng_params)(GKeyFile *config, struct engine *eng); - -/* Clearing private parameters of the engine. Implement the interface if required, otherwise, set it to NULL. */ - -void (*clear_eng_params)(struct engine *eng); - -/* Parsing private parameters of the task. Implement the interface if required, otherwise, set it to NULL. */ - -int (*fill_task_params)(GKeyFile *config, struct task *task); - -/* Clearing private parameters of the task. Implement the interface if required, otherwise, set it to NULL. */ - -void (*clear_task_params)(struct task *tk); - -/* Task starting interface */ - -int (*start_task)(struct engine *eng, struct task *tk); - -/* Task stopping interface */ - -void (*stop_task)(struct engine *eng, struct task *tk); - -/* Allocate PID-related private parameters */ - -int (*alloc_pid_params)(struct engine *eng, struct task_pid **tk_pid); - -/* Destroy PID-related private parameters */ - -void (*free_pid_params)(struct engine *eng, struct task_pid **tk_pid); - -/* Support for private commands required by the third-party policy. 
If this interface is not required, set it to NULL */ - -int (*eng_mgt_func)(struct engine *eng, struct task *tk, char *cmd, int fd); - -}; -``` - -External interfaces of the scanning module: - -| Interface |Description| -| ------------ | --------------------- | -| etmemd_scan_init | Initializes the scanning module| -| etmemd_scan_exit | Exits the scanning module| -| etmemd_get_vmas | Gets the VMAs to be scanned| -| etmemd_free_vmas | Releases VMAs scanned by `etmemd_get_vmas`| -| etmemd_get_page_refs | Scans pages in VMAs| -| etmemd_free_page_refs | Release the page access information list obtained by `etmemd_get_page_refs` | - -In the VM scanning scenario, `ioctl` is added to the original scan interface file **idle_pages** to distinguish the EPT scanning granularity and specify whether to ignore page access flags on the hosts. - -In the process memory page swapping scenario, `ioctl` is added to the original scan interface file **idle_pages** to ensure that VMAs that are not flagged do not participate in memory scanning and swapping. - -Scan management interface: - -- Function prototype - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- Input parameters - 1. fd: file descriptor, which is obtained by opening a file under /proc/pid/idle_pages using the open system call - 2. cmd: controls the scan actions. The following values are supported: - IDLE_SCAN_ADD_FLAG: adds a scanning flag - IDLE_SCAM_REMOVE_FLAGS: removes a scanning flag - VMA_SCAN_ADD_FLAGS: adds VMA memory swapping flags to scan only flagged VMAs - VMA_SCAN_REMOVE_FLAGS: removes added VMA memory swapping flags - 3. args: integer pointer parameter used to pass a specific mask. The following value is supported: - SCAN_AS_HUGE: scans the pages according to the 2 MB granularity to see if the pages have been accessed when scanning the EPT page table. If this parameter is not set, the granularity will be the granularity of the EPT page table itself. - SCAN_IGN_HUGE: ignores page access flags on the hosts when scanning VMs. - VMA_SCAN_FLAG: Before the etmem_scan.ko module starts scanning, the walk_page_test interface is called to determine whether the VMA address meets the scanning requirements. If this flag is set, only the VMA addresses that contain specific swap flags are scanned. - -- Return values - 1. 0 if the command succeeds - 2. Other values if the command fails - -- Precautions - Unsupported flags are ignored and do not return errors. - -An example configuration file is as follows. For details, see [etmem Configuration Files](#etmem-configuration-files). - -```text -#thirdparty -[engine] - -name=thirdparty - -project=test - -eng_name=my_engine - -libname=/user/lib/etmem_fetch/code_test/my_engine.so - -ops_name=my_engine_ops - -engine_private_key=engine_private_value - -[task] - -project=test - -engine=my_engine - -name=background1 - -type=pid - -value=1798245 - -task_private_key=task_private_value -``` - - **Note**: - -You need to use the module scanning dynamic library to implement the interface of the struct required for connecting to etmem. - -**fd** in the `eng_mgt_func` interface cannot be written with the **0xff** and **0xfe** characters. - -Multiple different third-party policy dynamic libraries, distinguished by **eng_name** in the configuration file, can be added within a project. 
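-
-As a concrete illustration, the skeleton below shows one possible shape of such a dynamic library. It is a minimal, hypothetical sketch: the header name `etmemd_engine.h` and the build command are assumptions for illustration, and the optional interfaces are simply set to NULL as permitted by the template above.
-
-```c
-/* my_engine.c - hypothetical skeleton of a third-party policy library.
- * The include below is an assumption; use the etmem development header
- * that declares struct engine, struct task, struct task_pid, and
- * struct engine_ops in your environment.
- */
-#include <stddef.h>
-#include "etmemd_engine.h"
-
-static int my_start_task(struct engine *eng, struct task *tk)
-{
-    (void)eng;
-    (void)tk;
-    /* Start the scan and eliminate work for this task here. */
-    return 0;
-}
-
-static void my_stop_task(struct engine *eng, struct task *tk)
-{
-    (void)eng;
-    (void)tk;
-    /* Stop whatever my_start_task started. */
-}
-
-static int my_alloc_pid_params(struct engine *eng, struct task_pid **tk_pid)
-{
-    (void)eng;
-    (void)tk_pid;
-    /* Allocate per-PID private data if the policy needs any. */
-    return 0;
-}
-
-static void my_free_pid_params(struct engine *eng, struct task_pid **tk_pid)
-{
-    (void)eng;
-    (void)tk_pid;
-    /* Release whatever my_alloc_pid_params allocated. */
-}
-
-/* The symbol name must match ops_name in the configuration file. */
-struct engine_ops my_engine_ops = {
-    .fill_eng_params = NULL,    /* no private engine parameters */
-    .clear_eng_params = NULL,
-    .fill_task_params = NULL,   /* no private task parameters */
-    .clear_task_params = NULL,
-    .start_task = my_start_task,
-    .stop_task = my_stop_task,
-    .alloc_pid_params = my_alloc_pid_params,
-    .free_pid_params = my_free_pid_params,
-    .eng_mgt_func = NULL,       /* no private commands */
-};
-```
-
-Built as a shared object, for example with `gcc -fPIC -shared my_engine.c -o my_engine.so`, the library is then referenced through **libname** and **ops_name** in the \[engine\] section shown above.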
- -### Help Information of the etmem Client and Server - -Run the following command to print help information of the etmem server: - -```bash -etmemd -h -``` - -or: - -```bash -etmemd --help -``` - -Run the following command to print help information of the etmem client: - -```bash -etmem help -``` - -Run the following command to print help information of project, engine, and task operations: - -```bash -etmem obj help -``` - -Run the following command to print help information of projects: - -```bash -etmem project help -``` - -## How to Contribute - -1. Fork this repository. -2. Create a branch. -3. Commit your code. -4. Create a pull request (PR). diff --git a/docs/en/server/memory_storage/gmem/_toc.yaml b/docs/en/server/memory_storage/gmem/_toc.yaml deleted file mode 100644 index 49b2def515716e5bdfe27c780d5785ae5b762a8c..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/gmem/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: GMEM User Guide -isManual: true -description: Centralized management for heterogeneous memory interconnections -sections: - - label: Overview - href: ./introduction_to_gmem.md - - label: Installation and Deployment - href: ./installation_and_deployment.md - - label: Usage Instructions - href: ./usage_instructions.md diff --git a/docs/en/server/memory_storage/gmem/images/GMEM_architecture.png b/docs/en/server/memory_storage/gmem/images/GMEM_architecture.png deleted file mode 100644 index 59b82d647166525b296529d3e3dfede3eca48f4a..0000000000000000000000000000000000000000 Binary files a/docs/en/server/memory_storage/gmem/images/GMEM_architecture.png and /dev/null differ diff --git a/docs/en/server/memory_storage/gmem/installation_and_deployment.md b/docs/en/server/memory_storage/gmem/installation_and_deployment.md deleted file mode 100644 index 8ce7ad0ccedb69cdd2f9514c4786f8b511b0356c..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/gmem/installation_and_deployment.md +++ /dev/null @@ -1,116 +0,0 @@ -# Installation and Deployment - -This section describes how to install and deploy GMEM. - -## Software and Hardware Requirements - -* Kunpeng 920 CPU -* Ascend 910 processor -* openEuler 23.09 - -## Environment Requirements - -* The root permission is required for using and configuring GMEM. -* GMEM can be enabled or disabled only at the system level. -* The administrator must ensure that the GMEM configuration is safe and available. - -## Installing GMEM - -* Prepare files. - - [CANN Community Version History - Ascend Community (hiascend.com)](https://www.hiascend.com/en/software/cann/community-history) - - [Firmware and Driver - Ascend Community (hiascend.com)](https://www.hiascend.com/en/hardware/firmware-drivers/community?product=2&model=19&cann=6.0.1.alpha001&driver=1.0.18.alpha) - - | Source | Software Package | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | openEuler 23.09 | kernel-6.4.0-xxx.aarch64.rpm
kernel-devel-6.4.0-xxx.aarch64.rpm
libgmem-xxx.aarch64.rpm
libgmem-devel-xxx.aarch64.rpm | - | Ascend community | CANN package:
Ascend-cann-toolkit-xxx-linux.aarch64.rpm
NPU firmware and driver:
Ascend-hdk-910-npu-driver-xxx.aarch64.rpm
Ascend-hdk-910-npu-firmware-xxx.noarch.rpm | - | Contact the maintainers of the GMEM community.
[@yang_yanchao](https://gitee.com/yang_yanchao) email:
[@LemmyHuang](https://gitee.com/LemmyHuang) email: | gmem-example-xxx.aarch64.rpm
mindspore-xxx-linux_aarch64.whl | - -* Install the kernel. - - Ensure that GMEM compilation options are enabled (enabled by default) for the openEuler kernel. - - ```sh - [root@localhost ~]# cat /boot/config-`uname -r` | grep CONFIG_GMEM - CONFIG_GMEM=y - CONFIG_GMEM_DEV=m - - [root@localhost ~]# cat /boot/config-`uname -r` | grep CONFIG_REMOTE_PAGER - CONFIG_REMOTE_PAGER=m - CONFIG_REMOTE_PAGER_MASTER=m - ``` - - Add **gmem=on** to the boot options. - - ```sh - [root@localhost gmem]# cat /proc/cmdline - BOOT_IMAGE=/vmlinuz-xxx root=/dev/mapper/openeuler-root ... gmem=on - ``` - - Configure **transparent_hugepage**. - - ```sh - echo always > /sys/kernel/mm/transparent_hugepage/enabled - ``` - -* Install the user-mode dynamic library libgmem. - - ```sh - yum install libgmem libgmem-devel - ``` - -* Install the CANN framework. - - Install the matching CANN, including the toolkit, driver, and firmware. After the installation is complete, restart the system. - - ```sh - rpm -ivh Ascend-cann-toolkit-xxx-linux.aarch64.rpm - # Use the tool provided by libgmem to install the NPU driver. - sh /usr/local/gmem/install_npu_driver.sh Ascend-hdk-910-npu-driver-xxx.aarch64.rpm - rpm -ivh Ascend-hdk-910-npu-firmware-xxx.noarch.rpm - ``` - - Run the environment configuration script in the **Ascend** directory to configure environment variables. - - ```sh - source /usr/local/Ascend/ascend-toolkit/set_env.sh - ``` - - Check whether the NPU is working properly. - - ```sh - [root@localhost ~]# npu-smi info - +-------------------------------------------------------------------------------------------+ - | npu-smi 22.0.4.1 Version: 22.0.4.1 | - +----------------------+---------------+----------------------------------------------------+ - | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)| - | Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) | - +======================+===============+====================================================+ - | 0 910B | OK | 79.4 82 0 / 0 | - | 0 | 0000:81:00.0 | 0 1979 / 15039 0 / 32768 | - +======================+===============+====================================================+ - ``` - -* Install the gmem-example software package. - - gmem-example updates the host driver, NPU driver, and NPU kernel. After the installation is complete, restart the system for the driver to take effect. - - ```sh - rpm -ivh gmem-example-xxx.aarch64.rpm - ``` - -* Install MindSpore. - - Obtain the correct MindSpore version and install it. After the installation, run the following command to check whether MindSpore functions are normal: - - ```sh - python -c "import mindspore;mindspore.run_check()" - MindSpore version: x.x.x - The result of multiplication calculation is correct, MindSpore has been installed on platform [Ascend] successfully! - ``` - -## Performing Training or Inference - -After installation is complete, you can execute MindSpore-based training or inference directly without any adaptation. diff --git a/docs/en/server/memory_storage/gmem/introduction_to_gmem.md b/docs/en/server/memory_storage/gmem/introduction_to_gmem.md deleted file mode 100644 index 1841bc0ee7362b1c6a6020a23391215b389f4529..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/gmem/introduction_to_gmem.md +++ /dev/null @@ -1,37 +0,0 @@ -# Introduction to GMEM - -## Introduction - -Memory management on the CPU side is separated from that on the heterogeneous accelerator side. Explicit data migration makes it difficult to balance usability and performance. 
The high bandwidth memory (HBM) of heterogeneous accelerators is generally insufficient for large language models, and manual swap causes a large performance loss and applies only to dedicated scenarios. A large number of invalid data migrations occur in search & recommendation and big data scenarios, and no efficient memory pooling solution is available. The Heterogeneous Memory Management (HMM) feature of Linux is confronted with complicated programming, unsatisfying performance, and poor portability, and depends greatly on manual tuning. As a result, OS communities are not willing to use HMM. An efficient memory management solution is needed to address the preceding issues on heterogeneous accelerators. - -Generalized Memory Management (GMEM) provides centralized management for heterogeneous memory interconnection. The GMEM APIs allow devices to connect to a unified address space and obtains programming optimization for heterogeneous memory, separating CPU architecture-related implementations from the memory management system of Linux. - -After the memory of the CPU and accelerator is encapsulated into a unified virtual address space, developers do not need to manually migrate the memory between two parallel address spaces. Instead, they only need to use a unified set of application and release functions. The dynamic random access memory (DRAM) of the CPU can even serve as the cache of the accelerator without much overhead. - -## Architecture - -![GMEM-architecture.png](images/GMEM_architecture.png) - -## Application Scenarios - -Foundation model training and inference - -* GMEM implements transparent heterogeneous memory capacity expansion and automatic HBM overcommitment, enabling high-performance and low-threshold training and inference. -* GMEM provides OS-native simplified heterogeneous memory management. With memory overcommitment, the performance of foundation model training is 60% higher than that of NVIDIA. - -Large memory sharing - -* GMEM provides flexible policies for remote access and on-demand memory migration to eliminate memory migration bottlenecks and improve end-to-end performance of search & recommendation and big data applications. - -## Functions - -For driver developers, GMEM provides unified function registration interfaces to reduce repeated work and the size of memory management code and avoid additional vulnerabilities. - -* Interfaces provided by GMEM can simplify the code for the driver to access the physical memory. -* The unified interfaces help avoid vulnerabilities when the driver developer implements the same function repeatedly. - -For users, GMEM provides stronger programmability for AI model and machine learning framework development using accelerators. You do not need to manually manage the data storage location. - -* Memory of the CPU and the accelerator can be accessed through unified memory application and release functions. -* Addresses of both CPU memory and accelerator memory can be mapped to the same virtual address space. -* GMEM encapsulates memory management code, improving the management performance. 
diff --git a/docs/en/server/memory_storage/gmem/usage_instructions.md b/docs/en/server/memory_storage/gmem/usage_instructions.md deleted file mode 100644 index 14ae5dfaaf8381ae4092440127fa06d7519eba0e..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/gmem/usage_instructions.md +++ /dev/null @@ -1,66 +0,0 @@ -# Usage Instructions - -## Introduction - -GMEM applies for virtual memory that is peer-to-peer accessible through a specific flag and provides some memory optimization semantics externally. Performance can be optimized through the memory semantics. -libgmem is the abstraction layer of the GMEM user API. It encapsulates the preceding memory semantics to simplify user operations. - -## Available APIs - -* Memory application - - GMEM extends `mmap` semantics and adds a MAP_PEER_SHARED flag to apply for heterogeneous memory. When the flag is used, 2 MB-aligned virtual addresses are returned by default. - - ```c - addr = mmap(NULL , size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0); - ``` - -* Memory release - - `munmap` is used to release memory of hosts and devices. - - ```c - munmap(addr, size); - ``` - -* Memory semantics - - `FreeEager`: For an address segment within the specified range \[**addr**, **addr** + **size**], `FreeEager` releases a complete page that aligns the page size inwards (the default page size is 2 MB). If no complete page exists in the range, a success message is returned. - - If the API is invoked successfully, 0 is returned. Otherwise, an error code is returned. - - ```c - Prototype: `int gmemFreeEager(unsigned long addr, size_t size, void *stream);` - Usage: `ret = gmemFreeEager(addr, size, stream);` - ``` - - `Prefetch`: For an address segment with the specified range \[**addr**, **addr** + **size**], `Prefetch` prefetches a complete page (covering the entire address segment) whose range is outward aligned with the page size. This ensures that the subsequent access to the virtual memory area initiated by the specified computing unit device **hnid** does not trigger a page fault. - - If the API is invoked successfully, 0 is returned. Otherwise, an error code is returned. - - ```c - Prototype: `int gmemPrefetch(unsigned long addr, size_t size, int hnid, void *stream);` - Usage: `ret = gmemPrefetch(addr, size, hnid, stream);` - ``` - - If the value of **stream** is empty, the invocation is synchronous. Otherwise, the invocation is asynchronous. - -* Other APIs - - Obtain the NUMA ID of the current device. If the API is invoked successfully, the NUMA ID is returned. Otherwise, an error code is returned. - - ```c - Prototype: `int gmemGetNumaId (void);` - Usage: `numaid = gmemGetNumaId ();` - ``` - - Obtain the GMEM statistics of the kernel. - - ```sh - cat /proc/gmemstat - ``` - -## Constraints - -1. GMEM supports only 2 MB huge pages. Therefore, transparent huge pages of the host OS and NPU OS must be enabled to use GMEM. -2. The heterogeneous memory obtained using `MAP_PEER_SHARED` cannot be inherited during forking. diff --git a/docs/en/server/memory_storage/hsak/_toc.yaml b/docs/en/server/memory_storage/hsak/_toc.yaml deleted file mode 100644 index d591c73f06220c6336b07112824e000f4e745cc8..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/hsak/_toc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -label: HSAK Developer Guide -isManual: true -description: HSAK delivers a high-performance IO software stack optimized for new storage media, offering both high bandwidth and low latency. 
-sections: - - label: Overview - href: ./hsak_developer_guide.md - - label: Development with HSAK - href: ./development_with_hsak.md - - label: HSAK Tool Usage - href: ./hsak_tool_usage.md - - label: HSAK C APIs - href: ./hsak_c_apis.md diff --git a/docs/en/server/memory_storage/hsak/development_with_hsak.md b/docs/en/server/memory_storage/hsak/development_with_hsak.md deleted file mode 100644 index 4a47c498968984c6fede9b08ff465af48b0ec1d0..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/hsak/development_with_hsak.md +++ /dev/null @@ -1,231 +0,0 @@ -# Development with HSAK - -## Instructions - -### **nvme.conf.in** Configuration File - -By default, the HSAK configuration file is located in **/etc/spdk/nvme.conf.in**. You can modify the configuration file based on service requirements. The content of the configuration file is as follows: - -- \[Global\] - -1. **ReactorMask**: cores used for I/O polling. The value is a hexadecimal number and cannot be set to core 0. The bits from the least significant one to the most significant one indicate different CPU cores. For example, 0x1 indicates core 0, and 0x6 indicates cores 1 and 2. This parameter supports a maximum of 34 characters, including the hexadecimal flag **0x**. Each hexadecimal character can be F at most, indicating four cores. Therefore, a maximum of 128 (32 x 4) cores are supported. -2. **LogLevel**: HSAK log print level (**0**: error; **1**: warning; **2**: notice; **3**: info; **4**: debug). -3. **MemSize**: memory occupied by HSAK (The minimum value is 500 MB.) -4. **MultiQ**: whether to enable multi-queue on the same block device. -5. **E2eDif**: DIF type (**1**: half-way protection; **2**: full protection). Drives from different vendors may have different DIF support capabilities. For details, see the documents provided by hardware vendors. -6. **IoStat**: whether to enable the I/O statistics function. The options are **Yes** and **No**. -7. **RpcServer**: whether to start the RPC listening thread. The options are **Yes** and **No**. -8. **NvmeCUSE**: whether to enable the CUSE function. The options are **Yes** and **No**. After the function is enabled, the NVMe character device is generated in the **/dev/spdk** directory. - -- \[Nvme\] - -1. **TransportID**: PCI address and name of the NVMe controller. The format is **TransportID "trtype:PCIe traddr:0000:09:00.0" nvme0**. -2. **RetryCount**: number of retries upon an I/O failure. The value **0** indicates no retry. The maximum value is **255**. -3. **TimeoutUsec**: I/O timeout interval. If this parameter is set to **0** or left blank, no timeout interval is set. The unit is μs. -4. **ActionOnTimeout**: I/O timeout behavior (**None**: prints information only; **Reset**: resets the controller; **abort**: aborts the command). The default value is **None**. - -- \[Reactor\] - -1. **BatchSize**: number of I/Os that can be submitted in batches. The default value is **8**, and the maximum value is **32**. - -### Header File Reference - -HSAK provides two external header files. Include the two files when using HSAK for development. - -1. **bdev_rw.h**: defines the macros, enumerations, data structures, and APIs of the user-mode I/O operations on the data plane. -2. **ublock.h**: defines macros, enumerations, data structures, and APIs for functions such as device management and information obtaining on the management plane. 
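-
-For reference, the hypothetical fragment below pulls the \[Global\], \[Nvme\], and \[Reactor\] parameters described above into one example **nvme.conf.in**. Every value is illustrative only (note in particular the PCI address in **TransportID**); the configuration file delivered with your HSAK installation is authoritative for the exact syntax.
-
-```text
-[Global]
-ReactorMask 0x6
-LogLevel 1
-MemSize 1024
-MultiQ No
-E2eDif 1
-IoStat Yes
-RpcServer Yes
-NvmeCUSE No
-
-[Nvme]
-TransportID "trtype:PCIe traddr:0000:09:00.0" nvme0
-RetryCount 4
-TimeoutUsec 0
-ActionOnTimeout None
-
-[Reactor]
-BatchSize 8
-```
-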
### Service Running

After software development and compilation, you must run the **setup.sh** script to rebind the NVMe drive driver to the user mode before running the software. The script is located in **/opt/spdk** by default.
Run the following commands to change the drive driver's binding mode from kernel to user and reserve 1024 x 2 MB huge pages:

```shell
[root@localhost ~]# cd /opt/spdk
[root@localhost spdk]# ./setup.sh
0000:3f:00.0 (8086 2701): nvme -> uio_pci_generic
0000:40:00.0 (8086 2701): nvme -> uio_pci_generic
```

Run the following commands to restore the drive driver's mode from user to kernel and free the reserved huge pages:

```shell
[root@localhost ~]# cd /opt/spdk
[root@localhost spdk]# ./setup.sh reset
0000:3f:00.0 (8086 2701): uio_pci_generic -> nvme
0000:40:00.0 (8086 2701): uio_pci_generic -> nvme
```

### User-Mode I/O Read and Write Scenarios

Call HSAK APIs in the following sequence to read and write service data through the user-mode I/O channel:

1. Initialize the HSAK UIO module.
    Call **libstorage_init_module** to initialize the HSAK user-mode I/O channel.

2. Open a drive block device.
    Call **libstorage_open** to open a specified block device. If multiple block devices need to be opened, call this API repeatedly.

3. Allocate I/O memory.
    Call **libstorage_alloc_io_buf** or **libstorage_mem_reserve** to allocate memory. **libstorage_alloc_io_buf** can allocate a maximum of 65 KB I/Os, and **libstorage_mem_reserve** can allocate unlimited memory unless there is no available space.

4. Perform read and write operations on a drive.
    You can call the following APIs to perform read and write operations based on service requirements:

    - libstorage_async_read
    - libstorage_async_readv
    - libstorage_async_write
    - libstorage_async_writev
    - libstorage_sync_read
    - libstorage_sync_write

5. Free I/O memory.
    Call **libstorage_free_io_buf** or **libstorage_mem_free** to free memory, which must correspond to the API used to allocate memory.

6. Close a drive block device.
    Call **libstorage_close** to close a specified block device. If multiple block devices are opened, call this API repeatedly to close them.

| API | Description |
| ----------------------- | ----------------------------------------------------------------------------------- |
| libstorage_init_module | Initializes the HSAK module. |
| libstorage_open | Opens a block device. |
| libstorage_alloc_io_buf | Allocates memory from buf_small_pool or buf_large_pool of SPDK. |
| libstorage_mem_reserve | Allocates memory space from the huge page memory reserved by DPDK. |
| libstorage_async_read | Delivers asynchronous I/O read requests (the read buffer is a contiguous buffer). |
| libstorage_async_readv | Delivers asynchronous I/O read requests (the read buffer is a discrete buffer). |
| libstorage_async_write | Delivers asynchronous I/O write requests (the write buffer is a contiguous buffer). |
| libstorage_async_writev | Delivers asynchronous I/O write requests (the write buffer is a discrete buffer). |
| libstorage_sync_read | Delivers synchronous I/O read requests (the read buffer is a contiguous buffer). |
| libstorage_sync_write | Delivers synchronous I/O write requests (the write buffer is a contiguous buffer). |
| libstorage_free_io_buf | Frees the allocated memory to buf_small_pool or buf_large_pool of SPDK. |
| libstorage_mem_free | Frees the memory space that libstorage_mem_reserve allocates.
| - | libstorage_close | Closes a block device. | - | libstorage_exit_module | Exits the HSAK module. | - -### Drive Management Scenarios - -HSAK contains a group of C APIs, which can be used to format drives and create and delete namespaces. - -1. Call the C API to initialize the HSAK UIO component. If the HSAK UIO component has been initialized, skip this operation. - - libstorage_init_module - -2. Call corresponding APIs to perform drive operations based on service requirements. The following APIs can be called separately: - - - libstorage_create_namespace - - - libstorage_delete_namespace - - - libstorage_delete_all_namespace - - - libstorage_nvme_create_ctrlr - - - libstorage_nvme_delete_ctrlr - - - libstorage_nvme_reload_ctrlr - - - libstorage_low_level_format_nvm - - - libstorage_deallocate_block - -3. If you exit the program, destroy the HSAK UIO. If other services are using the HSAK UIO, you do not need to exit the program and destroy the HSAK UIO. - - libstorage_exit_module - - | API | Description | - | ------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | - | libstorage_create_namespace | Creates a namespace on a specified controller (the prerequisite is that the controller supports namespace management). | - | libstorage_delete_namespace | Deletes a namespace from a specified controller. | - | libstorage_delete_all_namespace | Deletes all namespaces from a specified controller. | - | libstorage_nvme_create_ctrlr | Creates an NVMe controller based on the PCI address. | - | libstorage_nvme_delete_ctrlr | Destroys an NVMe controller based on the controller name. | - | libstorage_nvme_reload_ctrlr | Automatically creates or destroys the NVMe controller based on the input configuration file. | - | libstorage_low_level_format_nvm | Low-level formats an NVMe drive. | - | libstorage_deallocate_block | Notifies NVMe drives of blocks that can be freed for garbage collection. | - -### Data-Plane Drive Information Query - -The I/O data plane of HSAK provides a group of C APIs for querying drive information. Upper-layer services can process service logic based on the queried information. - -1. Call the C API to initialize the HSAK UIO component. If the HSAK UIO component has been initialized, skip this operation. - - libstorage_init_module - -2. Call corresponding APIs to query information based on service requirements. The following APIs can be called separately: - - - libstorage_get_nvme_ctrlr_info - - - libstorage_get_mgr_info_by_esn - - - libstorage_get_mgr_smart_by_esn - - - libstorage_get_bdev_ns_info - - - libstorage_get_ctrl_ns_info - -3. If you exit the program, destroy the HSAK UIO. If other services are using the HSAK UIO, you do not need to exit the program and destroy the HSAK UIO. - - libstorage_exit_module - - | API | Description | - | ------------------------------- | ------------------------------------------------------------------------ | - | libstorage_get_nvme_ctrlr_info | Obtains information about all controllers. | - | libstorage_get_mgr_info_by_esn | Obtains the management information of the drive corresponding to an ESN. | - | libstorage_get_mgr_smart_by_esn | Obtains the S.M.A.R.T. information of the drive corresponding to an ESN. | - | libstorage_get_bdev_ns_info | Obtains namespace information based on the device name. | - | libstorage_get_ctrl_ns_info | Obtains information about all namespaces based on the controller name. 
|

### Management-Plane Drive Information Query

The management plane component Ublock of HSAK provides a group of C APIs for querying drive information on the management plane.

1. Call the C API to initialize the HSAK Ublock server.

2. Call the HSAK UIO component initialization API in another process based on service requirements.

3. If multiple processes are required to query drive information, initialize the Ublock client.

4. Call the APIs listed in the following table on the Ublock server process or client process to query information.

5. After obtaining the block device list, call the APIs listed in the following table to free resources.

6. If you exit the program, destroy the HSAK Ublock module (the destruction method on the server is the same as that on the client).

| API | Description |
| ---------------------------- | ------------------------------------------------------------ |
| init_ublock | Initializes the Ublock function module. This API must be called before the other Ublock APIs. A process can be initialized only once because the init_ublock API initializes DPDK. The initial memory allocated by DPDK is bound to the process PID. One PID can be bound to only one memory. In addition, DPDK does not provide an API for freeing the memory. The memory can be freed only by exiting the process. |
| ublock_init | It is the macro definition of the init_ublock API. It can be considered as initializing Ublock to an RPC service. |
| ublock_init_norpc | It is the macro definition of the init_ublock API. It can be considered as initializing Ublock to a non-RPC service. |
| ublock_get_bdevs | Obtains the device list. The obtained device list contains only PCI addresses and does not contain specific device information. To obtain specific device information, call the ublock_get_bdev API. |
| ublock_get_bdev | Obtains information about a specific device, including the device serial number, model, and firmware version. The information is stored in character arrays instead of character strings. |
| ublock_get_bdev_by_esn | Obtains the device information based on the specified ESN, including the serial number, model, and firmware version. |
| ublock_get_SMART_info | Obtains the S.M.A.R.T. information of a specified device. |
| ublock_get_SMART_info_by_esn | Obtains the S.M.A.R.T. information of the device corresponding to an ESN. |
| ublock_get_error_log_info | Obtains the error log information of a device. |
| ublock_get_log_page | Obtains information about a specified log page of a specified device. |
| ublock_free_bdevs | Frees the device list. |
| ublock_free_bdev | Frees device resources. |
| ublock_fini | Destroys the Ublock module. This API destroys the Ublock module and internally created resources. This API must be used together with the Ublock initialization API. |

### Log Management

HSAK logs are exported to **/var/log/messages** through syslog by default and managed by the rsyslog service of the OS. If a custom log directory is required, use rsyslog to configure the log directory.

1. Modify the **/etc/rsyslog.conf** configuration file by adding a rule that redirects HSAK logs to the target directory:

    ```shell
    if ($programname == 'LibStorage') then {
        action(type="omfile" fileCreateMode="0600" file="/var/log/HSAK/run.log")
        stop
    }
    ```

2. Restart the rsyslog service:

    ```shell
    systemctl restart rsyslog
    ```

3. Start the HSAK process. The log information is redirected to the target directory.

4.
If redirected logs need to be dumped, manually configure log dump in the **/etc/logrotate.d/syslog** file. diff --git a/docs/en/server/memory_storage/hsak/hsak_c_apis.md b/docs/en/server/memory_storage/hsak/hsak_c_apis.md deleted file mode 100644 index ccb1e774803c91c94e4aa95a760d157b6ba73aa7..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/hsak/hsak_c_apis.md +++ /dev/null @@ -1,2521 +0,0 @@ -# C APIs - -## Macro Definition and Enumeration - -### bdev_rw.h - -#### enum libstorage_ns_lba_size - -1. Prototype - - ```c - enum libstorage_ns_lba_size - { - LIBSTORAGE_NVME_NS_LBA_SIZE_512 = 0x9, - LIBSTORAGE_NVME_NS_LBA_SIZE_4K = 0xc - }; - ``` - -2. Description - - Sector (data) size of a drive. - -#### enum libstorage_ns_md_size - -1. Prototype - - ```c - enum libstorage_ns_md_size - { - LIBSTORAGE_METADATA_SIZE_0 = 0, - LIBSTORAGE_METADATA_SIZE_8 = 8, - LIBSTORAGE_METADATA_SIZE_64 = 64 - }; - ``` - -2. Description - - Metadata size of a drive. - -3. Remarks - - - ES3000 V3 (single-port) supports formatting of five sector types (512+0, 512+8, 4K+64, 4K, and 4K+8). - - - ES3000 V3 (dual-port) supports formatting of four sector types (512+0, 512+8, 4K+64, and 4K). - - - ES3000 V5 supports formatting of five sector types (512+0, 512+8, 4K+64, 4K, and 4K+8). - - - Optane drives support formatting of seven sector types (512+0, 512+8, 512+16,4K, 4K+8, 4K+64, and 4K+128). - -#### enum libstorage_ns_pi_type - -1. Prototype - - ```c - enum libstorage_ns_pi_type - { - LIBSTORAGE_FMT_NVM_PROTECTION_DISABLE = 0x0, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE1 = 0x1, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE2 = 0x2, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE3 = 0x3, - }; - ``` - -2. Description - - Protection type supported by drives. - -3. Remarks - - ES3000 supports only protection types 0 and 3. Optane drives support only protection types 0 and 1. - -#### enum libstorage_crc_and_prchk - -1. Prototype - - ```c - enum libstorage_crc_and_prchk - { - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK = 0x0, - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK = 0x1, - LIBSTORAGE_LIB_CRC_AND_DISABLE_PRCHK = 0x2, - LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK = 0x3, - #define NVME_NO_REF 0x4 - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK_NO_REF = LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK | NVME_NO_REF, - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK_NO_REF = LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK | NVME_NO_REF, - }; - ``` - -2. Description - - - **LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK**: Cyclic redundancy check (CRC) is performed for the application layer, but not for HSAK. CRC is disabled for drives. - - - **LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK**: CRC is performed for the application layer, but not for HSAK. CRC is enabled for drives. - - - **LIBSTORAGE_LIB_CRC_AND_DISABLE_PRCHK**: CRC is performed for HSAK, but not for the application layer. CRC is disabled for drives. - - - **LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK**: CRC is performed for HSAK, but not for the application layer. CRC is enabled for drives. - - - **LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK_NO_REF**: CRC is performed for the application layer, but not for HSAK. CRC is disabled for drives. REF tag verification is disabled for drives whose PI TYPE is 1 (Intel Optane P4800). - - - **LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK_NO_REF**: CRC is performed for the application layer, but not for HSAK. CRC is enabled for drives. REF tag verification is disabled for drives whose PI TYPE is 1 (Intel Optane P4800). 
- If PI TYPE of an Intel Optane P4800 drive is 1, the CRC and REF tag of the metadata area are verified by default.

- Intel Optane P4800 drives support DIF in 512+8 format but do not support DIF in 4096+64 format.

- For ES3000 V3 and ES3000 V5, PI TYPE of the drives is 3. By default, only the CRC of the metadata area is performed.

- ES3000 V3 supports DIF in 512+8 format but does not support DIF in 4096+64 format. ES3000 V5 supports DIF in both 512+8 and 4096+64 formats.

The summary is as follows:

| E2E Verification Mode | Ctrl Flag | CRC Generator | Write: Application Verification | Write: CRC for HSAK | Write: CRC for Drives | Read: Application Verification | Read: CRC for HSAK | Read: CRC for Drives |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Halfway protection | 0 | Controller | X | X | X | X | X | X |
| Halfway protection | 1 | Controller | X | X | X | X | X | √ |
| Halfway protection | 2 | Controller | X | X | X | X | X | X |
| Halfway protection | 3 | Controller | X | X | X | X | X | √ |
| Full protection | 0 | App | √ | X | X | √ | X | X |
| Full protection | 1 | App | √ | X | √ | √ | X | √ |
| Full protection | 2 | HSAK | X | √ | X | X | √ | X |
| Full protection | 3 | HSAK | X | √ | √ | X | √ | √ |
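As a small illustration of how the table above maps onto the enumeration, the sketch below selects a `libstorage_crc_and_prchk` value from two policy decisions: whether the application layer or HSAK computes the CRC, and whether drive-side verification (PRCHK) is enabled. The helper `choose_dif_flag` and its two parameters are invented for this example and are not part of HSAK; the sketch only assumes that **bdev_rw.h** is on the include path.

```c
#include <stdbool.h>

#include "bdev_rw.h" /* declares enum libstorage_crc_and_prchk (see the prototype above) */

/* Hypothetical helper: map two policy decisions to a DIF flag.
 * app_crc     - true if the application layer computes the CRC, false if HSAK does.
 * drive_prchk - true if the drive should also verify the CRC (PRCHK enabled). */
static enum libstorage_crc_and_prchk choose_dif_flag(bool app_crc, bool drive_prchk)
{
    if (app_crc) {
        return drive_prchk ? LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK
                           : LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK;
    }
    return drive_prchk ? LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK
                       : LIBSTORAGE_LIB_CRC_AND_DISABLE_PRCHK;
}
```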
- -#### enum libstorage_print_log_level - -1. Prototype - - ```c - enum libstorage_print_log_level - { - LIBSTORAGE_PRINT_LOG_ERROR, - LIBSTORAGE_PRINT_LOG_WARN, - LIBSTORAGE_PRINT_LOG_NOTICE, - LIBSTORAGE_PRINT_LOG_INFO, - LIBSTORAGE_PRINT_LOG_DEBUG, - }; - ``` - -2. Description - - Storage Performance Development Kit (SPDK) log print levels: ERROR, WARN, NOTICE, INFO, and DEBUG, corresponding to 0 to 4 in the configuration file. - -#### MAX_BDEV_NAME_LEN - -1. Prototype - - ```c - #define MAX_BDEV_NAME_LEN 24 - ``` - -2. Description - - Maximum length of a block device name. - -#### MAX_CTRL_NAME_LEN - -1. Prototype - - ```c - #define MAX_CTRL_NAME_LEN 16 - ``` - -2. Description - - Maximum length of a controller. - -#### LBA_FORMAT_NUM - -1. Prototype - - ```c - #define LBA_FORMAT_NUM 16 - ``` - -2. Description - - Number of LBA formats supported by a controller. - -#### LIBSTORAGE_MAX_DSM_RANGE_DESC_COUNT - -1. Prototype - - ```c - #define LIBSTORAGE_MAX_DSM_RANGE_DESC_COUNT 256 - ``` - -2. Description - - Maximum number of 16-byte sets in the dataset management command. - -### ublock.h - -#### UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO - -1. Prototype - - ```c - #define UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO 1 - ``` - -2. Description - - This macro is used to define that the subsystem corresponding to the uevent event is the userspace I/O subsystem (UIO) provided by the kernel. When the service receives the uevent event, this macro is used to determine whether the event is a UIO event that needs to be processed. - - The value of the int subsystem member in struct ublock_uevent is **UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO**. Currently, only this value is available. - -#### UBLOCK_TRADDR_MAX_LEN - -1. Prototype - - ```c - #define UBLOCK_TRADDR_MAX_LEN 256 - ``` - -2. Description - - The *Domain:Bus:Device.Function* (**%04x:%02x:%02x.%x**) format indicates the maximum length of the PCI address character string. The actual length is far less than 256 bytes. - -#### UBLOCK_PCI_ADDR_MAX_LEN - -1. Prototype - - ```c - #define UBLOCK_PCI_ADDR_MAX_LEN 256 - ``` - -2. Description - - Maximum length of the PCI address character string. The actual length is far less than 256 bytes. The possible formats of the PCI address are as follows: - - - Full address: **%x:%x:%x.%x** or **%x.%x.%x.%x** - - - When the **Function** value is **0**: **%x:%x:%x** - - - When the **Domain** value is **0**: **%x:%x.%x** or **%x.%x.%x** - - - When the **Domain** and **Function** values are **0**: **%x:%x** or **%x.%x** - -#### UBLOCK_SMART_INFO_LEN - -1. Prototype - - ```c - #define UBLOCK_SMART_INFO_LEN 512 - ``` - -2. Description - - Size of the structure for the S.M.A.R.T. information of an NVMe drive, which is 512 bytes. - -#### enum ublock_rpc_server_status - -1. Prototype - - ```c - enum ublock_rpc_server_status { - // start rpc server or not - UBLOCK_RPC_SERVER_DISABLE = 0, - UBLOCK_RPC_SERVER_ENABLE = 1, - }; - ``` - -2. Description - - Status of the RPC service in HSAK. The status can be enabled or disabled. - -#### enum ublock_nvme_uevent_action - -1. Prototype - - ```c - enum ublock_nvme_uevent_action { - UBLOCK_NVME_UEVENT_ADD = 0, - UBLOCK_NVME_UEVENT_REMOVE = 1, - UBLOCK_NVME_UEVENT_INVALID, - }; - ``` - -2. Description - - Indicates whether the uevent hot swap event is to insert or remove a drive. - -#### enum ublock_subsystem_type - -1. Prototype - - ```c - enum ublock_subsystem_type { - SUBSYSTEM_UIO = 0, - SUBSYSTEM_NVME = 1, - SUBSYSTEM_TOP - }; - ``` - -2. 
Description - - Type of the callback function, which is used to determine whether the callback function is registered for the UIO driver or kernel NVMe driver. - -## Data Structure - -### bdev_rw.h - -#### struct libstorage_namespace_info - -1. Prototype - - ```c - struct libstorage_namespace_info - { - char name[MAX_BDEV_NAME_LEN]; - uint64_t size; /** namespace size in bytes */ - uint64_t sectors; /** number of sectors */ - uint32_t sector_size; /** sector size in bytes */ - uint32_t md_size; /** metadata size in bytes */ - uint32_t max_io_xfer_size; /** maximum i/o size in bytes */ - uint16_t id; /** namespace id */ - uint8_t pi_type; /** end-to-end data protection information type */ - uint8_t is_active :1; /** namespace is active or not */ - uint8_t ext_lba :1; /** namespace support extending LBA size or not */ - uint8_t dsm :1; /** namespace supports Dataset Management or not */ - uint8_t pad :3; - uint64_t reserved; - }; - ``` - -2. Description - - This data structure contains the namespace information of a drive. - -3. Struct members - - | Member | Description | - | ---------------------------- | ------------------------------------------------------------ | - | char name\[MAX_BDEV_NAME_LEN] | Name of the namespace. | - | uint64_t size | Size of the drive space allocated to the namespace, in bytes. | - | uint64_t sectors | Number of sectors. | - | uint32_t sector_size | Size of each sector, in bytes. | - | uint32_t md_size | Metadata size, in bytes. | - | uint32_t max_io_xfer_size | Maximum size of data in a single I/O operation, in bytes. | - | uint16_t id | Namespace ID. | - | uint8_t pi_type | Data protection type. The value is obtained from enum libstorage_ns_pi_type. | - | uint8_t is_active :1 | Namespace active or not. | - | uint8_t ext_lba :1 | Whether the namespace supports logical block addressing (LBA) in extended mode. | - | uint8_t dsm :1 | Whether the namespace supports dataset management. | - | uint8_t pad :3 | Reserved parameter. | - | uint64_t reserved | Reserved parameter. | - -#### struct libstorage_nvme_ctrlr_info - -1. 
Prototype - - ```c - struct libstorage_nvme_ctrlr_info { - char name[MAX_CTRL_NAME_LEN]; - char address[24]; - struct { - uint32_t domain; - uint8_t bus; - uint8_t dev; - uint8_t func; - } pci_addr; - uint64_t totalcap; /* Total NVM Capacity in bytes */ - uint64_t unusecap; /* Unallocated NVM Capacity in bytes */ - int8_t sn[20]; /* Serial number */ - uint8_t fr[8]; /* Firmware revision */ - uint32_t max_num_ns; /* Number of namespaces */ - uint32_t version; - uint16_t num_io_queues; /* num of io queues */ - uint16_t io_queue_size; /* io queue size */ - uint16_t ctrlid; /* Controller id */ - uint16_t pad1; - struct { - struct { - uint32_t ms : 16; /* metadata size */ - uint32_t lbads : 8; /* lba data size */ - uint32_t reserved : 8; - } lbaf[LBA_FORMAT_NUM]; - uint8_t nlbaf; - uint8_t pad2[3]; - uint32_t cur_format : 4; - uint32_t cur_extended : 1; - uint32_t cur_pi : 3; - uint32_t cur_pil : 1; - uint32_t cur_can_share : 1; - uint32_t mc_extented : 1; - uint32_t mc_pointer : 1; - uint32_t pi_type1 : 1; - uint32_t pi_type2 : 1; - uint32_t pi_type3 : 1; - uint32_t md_start : 1; - uint32_t md_end : 1; - uint32_t ns_manage : 1; /* Supports the Namespace Management and Namespace Attachment commands */ - uint32_t directives : 1; /* Controller support Directives or not */ - uint32_t streams : 1; /* Controller support Streams Directives or not */ - uint32_t dsm : 1; /* Controller support Dataset Management or not */ - uint32_t reserved : 11; - } cap_info; - }; - ``` - -2. Description - - This data structure contains the controller information of a drive. - -3. Struct members - - | Member | Description | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | char name\[MAX_CTRL_NAME_LEN] | Controller name. | - | char address\[24] | PCI address, which is a character string. | - | struct
{
uint32_t domain;
uint8_t bus;
uint8_t dev;
uint8_t func;
} pci_addr | PCI address, in segments. | - | uint64_t totalcap | Total capacity of the controller, in bytes. Optane drives are based on the NVMe 1.0 protocol and do not support this parameter. | - | uint64_t unusecap | Free capacity of the controller, in bytes. Optane drives are based on the NVMe 1.0 protocol and do not support this parameter. | - | int8_t sn\[20]; | Serial number of a drive, which is an ASCII character string without **0**. | - | uint8_t fr\[8]; | Drive firmware version, which is an ASCII character string without **0**. | - | uint32_t max_num_ns | Maximum number of namespaces. | - | uint32_t version | NVMe protocol version supported by the controller. | - | uint16_t num_io_queues | Number of I/O queues supported by a drive. | - | uint16_t io_queue_size | Maximum length of an I/O queue. | - | uint16_t ctrlid | Controller ID. | - | uint16_t pad1 | Reserved parameter. | - - Members of the struct cap_info substructure: - - | Member | Description | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | struct
{
uint32_t ms : 16;
uint32_t lbads : 8;
uint32_t reserved : 8;
}lbaf\[LBA_FORMAT_NUM] | **ms**: metadata size. The minimum value is 8 bytes.
**lbads**: The LBA size is 2^lbads, and the value of **lbads** is greater than or equal to 9. | - | uint8_t nlbaf | Number of LBA formats supported by the controller. | - | uint8_t pad2\[3] | Reserved parameter. | - | uint32_t cur_format : 4 | Current LBA format of the controller. | - | uint32_t cur_extended : 1 | Whether the controller supports LBA in extended mode. | - | uint32_t cur_pi : 3 | Current protection type of the controller. | - | uint32_t cur_pil : 1 | The current protection information (PI) of the controller is located in the first or last eight bytes of the metadata. | - | uint32_t cur_can_share : 1 | Whether the namespace supports multi-path transmission. | - | uint32_t mc_extented : 1 | Whether metadata is transmitted as part of the data buffer. | - | uint32_t mc_pointer : 1 | Whether metadata is separated from the data buffer. | - | uint32_t pi_type1 : 1 | Whether the controller supports protection type 1. | - | uint32_t pi_type2 : 1 | Whether the controller supports protection type 2. | - | uint32_t pi_type3 : 1 | Whether the controller supports protection type 3. | - | uint32_t md_start : 1 | Whether the controller supports protection information in the first eight bytes of metadata. | - | uint32_t md_end : 1 | Whether the controller supports protection information in the last eight bytes of metadata. | - | uint32_t ns_manage : 1 | Whether the controller supports namespace management. | - | uint32_t directives : 1 | Whether the Directives command set is supported. | - | uint32_t streams : 1 | Whether Streams Directives is supported. | - | uint32_t dsm : 1 | Whether Dataset Management commands are supported. | - | uint32_t reserved : 11 | Reserved parameter. | - -#### struct libstorage_dsm_range_desc - -1. Prototype - - ```c - struct libstorage_dsm_range_desc - { - /* RESERVED */ - uint32_t reserved; - - /* NUMBER OF LOGICAL BLOCKS */ - uint32_t block_count; - - /* UNMAP LOGICAL BLOCK ADDRESS */uint64_t lba;}; - ``` - -2. Description - - Definition of a single 16-byte set in the data management command set. - -3. Struct members - - | Member | Description | - | -------------------- | ------------------------ | - | uint32_t reserved | Reserved parameter. | - | uint32_t block_count | Number of LBAs per unit. | - | uint64_t lba | Start LBA. | - -#### struct libstorage_ctrl_streams_param - -1. Prototype - - ```c - struct libstorage_ctrl_streams_param - { - /* MAX Streams Limit */ - uint16_t msl; - - /* NVM Subsystem Streams Available */ - uint16_t nssa; - - /* NVM Subsystem Streams Open */uint16_t nsso; - - uint16_t pad; - }; - ``` - -2. Description - - Streams attribute value supported by NVMe drives. - -3. Struct members - - | Member | Description | - | ------------- | ------------------------------------------------------------ | - | uint16_t msl | Maximum number of Streams resources supported by a drive. | - | uint16_t nssa | Number of Streams resources that can be used by each NVM subsystem. | - | uint16_t nsso | Number of Streams resources used by each NVM subsystem. | - | uint16_t pad | Reserved parameter. | - -#### struct libstorage_bdev_streams_param - -1. Prototype - - ```c - struct libstorage_bdev_streams_param - { - /* Stream Write Size */ - uint32_t sws; - - /* Stream Granularity Size */ - uint16_t sgs; - - /* Namespace Streams Allocated */ - uint16_t nsa; - - /* Namespace Streams Open */ - uint16_t nso; - - uint16_t reserved[3]; - }; - ``` - -2. Description - - Streams attribute value of the namespace. - -3. 
Struct members - - | Member | Description | - | -------------------- | ------------------------------------------------------------ | - | uint32_t sws | Write granularity with the optimal performance, in sectors. | - | uint16_t sgs | Write granularity allocated to Streams, in sws. | - | uint16_t nsa | Number of private Streams resources that can be used by a namespace. | - | uint16_t nso | Number of private Streams resources used by a namespace. | - | uint16_t reserved\[3] | Reserved parameter. | - -#### struct libstorage_mgr_info - -1. Prototype - - ```c - struct libstorage_mgr_info - { - char pci[24]; - char ctrlName[MAX_CTRL_NAME_LEN]; - uint64_t sector_size; - uint64_t cap_size; - uint16_t device_id; - uint16_t subsystem_device_id; - uint16_t vendor_id; - uint16_t subsystem_vendor_id; - uint16_t controller_id; - int8_t serial_number[20]; - int8_t model_number[40]; - uint8_t firmware_revision[8]; - }; - ``` - -2. Description - - Drive management information (consistent with the drive information used by the management plane). - -3. Struct members - - | Member | Description | - | -------------------------------- | ---------------------------------------------- | - | char pci\[24] | Character string of the drive PCI address. | - | char ctrlName\[MAX_CTRL_NAME_LEN] | Character string of the drive controller name. | - | uint64_t sector_size | Drive sector size. | - | uint64_t cap_size | Drive capacity, in bytes. | - | uint16_t device_id | Drive device ID. | - | uint16_t subsystem_device_id | Drive subsystem device ID. | - | uint16\*t vendor\*id | Drive vendor ID. | - | uint16_t subsystem_vendor_id | Drive subsystem vendor ID. | - | uint16_t controller_id | Drive controller ID. | - | int8_t serial_number\[20] | Drive serial number. | - | int8_t model_number\[40] | Device model. | - | uint8_t firmware_revision\[8] | Firmware version. | - -#### struct **attribute**((packed)) libstorage_smart_info - -1. Prototype - - ```c - /* same with struct spdk_nvme_health_information_page in nvme_spec.h */ - struct __attribute__((packed)) libstorage_smart_info { - /* details of uint8_t critical_warning - * - * union spdk_nvme_critical_warning_state { - * uint8_t raw; - * struct { - * uint8_t available_spare : 1; - * uint8_t temperature : 1; - * uint8_t device_reliability : 1; - * uint8_t read_only : 1; - * uint8_t volatile_memory_backup : 1; - * uint8_t reserved : 3; - * } bits; - * }; - */ - uint8_t critical_warning; - uint16_t temperature; - uint8_t available_spare; - uint8_t available_spare_threshold; - uint8_t percentage_used; - uint8_t reserved[26]; - - /* - * Note that the following are 128-bit values, but are - * defined as an array of 2 64-bit values. - */ - /* Data Units Read is always in 512-byte units. */ - uint64_t data_units_read[2]; - /* Data Units Written is always in 512-byte units. */ - uint64_t data_units_written[2]; - /* For NVM command set, this includes Compare commands. */ - uint64_t host_read_commands[2]; - uint64_t host_write_commands[2]; - /* Controller Busy Time is reported in minutes. */ - uint64_t controller_busy_time[2]; - uint64_t power_cycles[2]; - uint64_t power_on_hours[2]; - uint64_t unsafe_shutdowns[2]; - uint64_t media_errors[2]; - uint64_t num_error_info_log_entries[2]; - - /* Controller temperature related. */ - uint32_t warning_temp_time; - uint32_t critical_temp_time; - uint16_t temp_sensor[8]; - uint8_t reserved2[296]; - }; - - ``` - -2. Description - - This data structure defines the S.M.A.R.T. information of a drive. - -3. 
Struct members - - | Member | **Description (For details, see the NVMe protocol.)** | - | -------------------------------------- | ------------------------------------------------------------ | - | uint8_t critical_warning | Critical alarm of the controller status. If a bit is set to 1, the bit is valid. You can set multiple bits to be valid. Critical alarms are returned to the host through asynchronous events.
Bit 0: When this bit is set to 1, the redundant space is less than the specified threshold.
Bit 1: When this bit is set to 1, the temperature is above an over-temperature threshold or below an under-temperature threshold.
Bit 2: When this bit is set to 1, component reliability is reduced due to major media errors or internal errors.
Bit 3: When this bit is set to 1, the medium has been set to the read-only mode.
Bit 4: When this bit is set to 1, the volatile component of the controller fails. This parameter is valid only when the volatile component exists in the controller.
Bits 5-7: reserved. | - | uint16_t temperature | Temperature of a component. The unit is Kelvin. | - | uint8_t available_spare | Percentage of the available redundant space (0 to 100%). | - | uint8_t available_spare_threshold | Threshold of the available redundant space. An asynchronous event is reported when the available redundant space is lower than the threshold. | - | uint8_t percentage_used | Percentage of the actual service life of a component to the service life of the component expected by the manufacturer. The value **100** indicates that the actual service life of the component has reached to the expected service life, but the component can still be used. The value can be greater than 100, but any value greater than 254 will be set to 255. | - | uint8_t reserved\[26] | Reserved. | - | uint64_t data_units_read\[2] | Number of 512 bytes read by the host from the controller. The value **1** indicates that 1000 x 512 bytes are read, which exclude metadata. If the LBA size is not 512 bytes, the controller converts it into 512 bytes for calculation. The value is expressed in hexadecimal notation. | - | uint64_t data_units_written\[2] | Number of 512 bytes written by the host to the controller. The value **1** indicates that 1000 x 512 bytes are written, which exclude metadata. If the LBA size is not 512 bytes, the controller converts it into 512 bytes for calculation. The value is expressed in hexadecimal notation. | - | uint64_t host_read_commands\[2] | Number of read commands delivered to the controller. | - | uint64_t host_write_commands\[2]; | Number of write commands delivered to the controller. | - | uint64_t controller_busy_time\[2] | Busy time for the controller to process I/O commands. The process from the time the commands are delivered to the time the results are returned to the CQ is busy. The time is expressed in minutes. | - | uint64_t power_cycles\[2] | Number of machine on/off cycles. | - | uint64_t power_on_hours\[2] | Power-on duration, in hours. | - | uint64_t unsafe_shutdowns\[2] | Number of abnormal power-off times. The value is incremented by 1 when CC.SHN is not received during power-off. | - | uint64_t media_errors\[2] | Number of times that the controller detects unrecoverable data integrity errors, including uncorrectable ECC errors, CRC errors, and LBA tag mismatch. | - | uint64_t num_error_info_log_entries\[2] | Number of entries in the error information log within the controller lifecycle. | - | uint32_t warning_temp_time | Accumulated time when the temperature exceeds the warning alarm threshold, in minutes. | - | uint32_t critical_temp_time | Accumulated time when the temperature exceeds the critical alarm threshold, in minutes. | - | uint16_t temp_sensor\[8] | Temperature of temperature sensors 1-8. The unit is Kelvin. | - | uint8_t reserved2\[296] | Reserved. | - -#### libstorage_dpdk_contig_mem - -1. Prototype - - ```c - struct libstorage_dpdk_contig_mem { - uint64_t virtAddr; - uint64_t memLen; - uint64_t allocLen; - }; - ``` - -2. Description - - Description about a contiguous virtual memory segment in the parameters of the callback function that notifies the service layer of initialization completion after the DPDK memory is initialized. - - Currently, 800 MB memory is reserved for HSAK. Other memory is returned to the service layer through **allocLen** in this struct for the service layer to allocate memory for self-management. - - The total memory to be reserved for HSAK is about 800 MB. 
The memory reserved for each memory segment is calculated based on the number of NUMA nodes in the environment. When there are too many NUMA nodes, the memory reserved on each memory segment is too small. As a result, HSAK initialization fails. Therefore, HSAK supports only the environment with a maximum of four NUMA nodes. - -3. Struct members - - | Member | Description | - | ----------------- | -------------------------------------------------------- | - | uint64_t virtAddr | Start address of the virtual memory. | - | uint64_t memLen | Length of the virtual memory, in bytes. | - | uint64_t allocLen | Available memory length in the memory segment, in bytes. | - -#### struct libstorage_dpdk_init_notify_arg - -1. Prototype - - ```c - struct libstorage_dpdk_init_notify_arg { - uint64_t baseAddr; - uint16_t memsegCount; - struct libstorage_dpdk_contig_mem *memseg; - }; - ``` - -2. Description - - Callback function parameter used to notify the service layer of initialization completion after DPDK memory initialization, indicating information about all virtual memory segments. - -3. Struct members - - | Member | Description | - | ----------------------------------------- | ------------------------------------------------------------ | - | uint64_t baseAddr | Start address of the virtual memory. | - | uint16_t memsegCount | Number of valid **memseg** array members, that is, the number of contiguous virtual memory segments. | - | struct libstorage_dpdk_contig_mem *memseg | Pointer to the memory segment array. Each array element is a contiguous virtual memory segment, and every two elements are discontiguous. | - -#### struct libstorage_dpdk_init_notify - -1. Prototype - - ```c - struct libstorage_dpdk_init_notify { - const char *name; - void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg *arg); - TAILQ_ENTRY(libstorage_dpdk_init_notify) tailq; - }; - ``` - -2. Description - - Struct used to notify the service layer of the callback function registration after the DPDK memory is initialized. - -3. Struct members - - | Member | Description | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | const char *name | Name of the service-layer module of the registered callback function. | - | void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg*arg) | Callback function parameter used to notify the service layer of initialization completion after the DPDK memory is initialized. | - | TAILQ_ENTRY(libstorage_dpdk_init_notify) tailq | Linked list that stores registered callback functions. | - -### ublock.h - -#### struct ublock_bdev_info - -1. Prototype - - ```c - struct ublock_bdev_info { - uint64_t sector_size; - uint64_t cap_size; // cap_size - uint16_t device_id; - uint16_t subsystem_device_id; // subsystem device id of nvme control - uint16_t vendor_id; - uint16_t subsystem_vendor_id; - uint16_t controller_id; - int8_t serial_number[20]; - int8_t model_number[40]; - int8_t firmware_revision[8]; - }; - ``` - -2. Description - - This data structure contains the device information of a drive. - -3. Struct members - - | Member | Description | - | ---------------------------- | ----------------------------------------------- | - | uint64_t sector_size | Sector size of a drive, for example, 512 bytes. | - | uint64_t cap_size | Total drive capacity, in bytes. | - | uint16_t device_id | Device ID. | - | uint16_t subsystem_device_id | Device ID of a subsystem. 
| - | uint16_t vendor_id | Main ID of the device vendor. | - | uint16_t subsystem_vendor_id | Sub-ID of the device vendor. | - | uint16_t controller_id | ID of the device controller. | - | int8_t serial_number\[20] | Device serial number. | - | int8_t model_number\[40] | Device model. | - | int8_t firmware_revision\[8] | Firmware version. | - -#### struct ublock_bdev - -1. Prototype - - ```c - struct ublock_bdev { - char pci[UBLOCK_PCI_ADDR_MAX_LEN]; - struct ublock_bdev_info info; - struct spdk_nvme_ctrlr *ctrlr; - TAILQ_ENTRY(ublock_bdev) link; - }; - ``` - -2. Description - - The data structure contains the drive information of the specified PCI address, and the structure itself is a node of the queue. - -3. Struct members - - | Member | Description | - | --------------------------------- | ------------------------------------------------------------ | - | char pci\[UBLOCK_PCI_ADDR_MAX_LEN] | PCI address. | - | struct ublock_bdev_info info | Drive information. | - | struct spdk_nvme_ctrlr *ctrlr | Data structure of the device controller. The members in this structure are not open to external systems. External services can obtain the corresponding member data through the SPDK open source interface. | - | TAILQ_ENTRY(ublock_bdev) link | Structure of the pointers before and after a queue. | - -#### struct ublock_bdev_mgr - -1. Prototype - - ```c - struct ublock_bdev_mgr { - TAILQ_HEAD(, ublock_bdev) bdevs; - }; - ``` - -2. Description - - This data structure defines the header structure of a ublock_bdev queue. - -3. Struct members - - | Member | Description | - | -------------------------------- | ----------------------- | - | TAILQ_HEAD(, ublock_bdev) bdevs; | Queue header structure. | - -#### struct **attribute**((packed)) ublock_SMART_info - -1. Prototype - - ```c - struct __attribute__((packed)) ublock_SMART_info { - uint8_t critical_warning; - uint16_t temperature; - uint8_t available_spare; - uint8_t available_spare_threshold; - uint8_t percentage_used; - uint8_t reserved[26]; - /* - - Note that the following are 128-bit values, but are - - defined as an array of 2 64-bit values. - */ - /* Data Units Read is always in 512-byte units. */ - uint64_t data_units_read[2]; - /* Data Units Written is always in 512-byte units. */ - uint64_t data_units_written[2]; - /* For NVM command set, this includes Compare commands. */ - uint64_t host_read_commands[2]; - uint64_t host_write_commands[2]; - /* Controller Busy Time is reported in minutes. */ - uint64_t controller_busy_time[2]; - uint64_t power_cycles[2]; - uint64_t power_on_hours[2]; - uint64_t unsafe_shutdowns[2]; - uint64_t media_errors[2]; - uint64_t num_error_info_log_entries[2]; - /* Controller temperature related. */ - uint32_t warning_temp_time; - uint32_t critical_temp_time; - uint16_t temp_sensor[8]; - uint8_t reserved2[296]; - }; - ``` - -2. Description - - This data structure defines the S.M.A.R.T. information of a drive. - -3. Struct members - - | Member | Description (For details, see the NVMe protocol.) | - | -------------------------------------- | ------------------------------------------------------------ | - | uint8_t critical_warning | Critical alarm of the controller status. If a bit is set to 1, the bit is valid. You can set multiple bits to be valid. Critical alarms are returned to the host through asynchronous events.
Bit 0: When this bit is set to 1, the redundant space is less than the specified threshold.
Bit 1: When this bit is set to 1, the temperature is above an over-temperature threshold or below an under-temperature threshold.
Bit 2: When this bit is set to 1, component reliability is reduced due to major media errors or internal errors.
Bit 3: When this bit is set to 1, the medium has been set to the read-only mode.
Bit 4: When this bit is set to 1, the volatile component of the controller fails. This parameter is valid only when the volatile component exists in the controller.
Bits 5-7: reserved. | - | uint16_t temperature | Temperature of a component. The unit is Kelvin. | - | uint8_t available_spare | Percentage of the available redundant space (0 to 100%). | - | uint8_t available_spare_threshold | Threshold of the available redundant space. An asynchronous event is reported when the available redundant space is lower than the threshold. | - | uint8_t percentage_used | Percentage of the actual service life of a component to the service life of the component expected by the manufacturer. The value **100** indicates that the actual service life of the component has reached to the expected service life, but the component can still be used. The value can be greater than 100, but any value greater than 254 will be set to 255. | - | uint8_t reserved\[26] | Reserved. | - | uint64_t data_units_read\[2] | Number of 512 bytes read by the host from the controller. The value **1** indicates that 1000 x 512 bytes are read, which exclude metadata. If the LBA size is not 512 bytes, the controller converts it into 512 bytes for calculation. The value is expressed in hexadecimal notation. | - | uint64_t data_units_written\[2] | Number of 512 bytes written by the host to the controller. The value **1** indicates that 1000 x 512 bytes are written, which exclude metadata. If the LBA size is not 512 bytes, the controller converts it into 512 bytes for calculation. The value is expressed in hexadecimal notation. | - | uint64_t host_read_commands\[2] | Number of read commands delivered to the controller. | - | uint64_t host_write_commands\[2]; | Number of write commands delivered to the controller. | - | uint64_t controller_busy_time\[2] | Busy time for the controller to process I/O commands. The process from the time the commands are delivered to the time the results are returned to the CQ is busy. The value is expressed in minutes. | - | uint64_t power_cycles\[2] | Number of machine on/off cycles. | - | uint64_t power_on_hours\[2] | Power-on duration, in hours. | - | uint64_t unsafe_shutdowns\[2] | Number of abnormal power-off times. The value is incremented by 1 when CC.SHN is not received during power-off. | - | uint64_t media_errors\[2] | Number of unrecoverable data integrity errors detected by the controller, including uncorrectable ECC errors, CRC errors, and LBA tag mismatch. | - | uint64_t num_error_info_log_entries\[2] | Number of entries in the error information log within the controller lifecycle. | - | uint32_t warning_temp_time | Accumulated time when the temperature exceeds the warning alarm threshold, in minutes. | - | uint32_t critical_temp_time | Accumulated time when the temperature exceeds the critical alarm threshold, in minutes. | - | uint16_t temp_sensor\[8] | Temperature of temperature sensors 1-8. The unit is Kelvin. | - | uint8_t reserved2\[296] | Reserved. | - -#### struct ublock_nvme_error_info - -1. Prototype - - ```c - struct ublock_nvme_error_info { - uint64_t error_count; - uint16_t sqid; - uint16_t cid; - uint16_t status; - uint16_t error_location; - uint64_t lba; - uint32_t nsid; - uint8_t vendor_specific; - uint8_t reserved[35]; - }; - ``` - -2. Description - - This data structure contains the content of a single error message in the device controller. The number of errors supported by different controllers may vary. - -3. Struct members - - | Member | Description (For details, see the NVMe protocol.) 
| - | ----------------------- | ------------------------------------------------------------ | - | uint64_t error_count | Error sequence number, which increases in ascending order. | - | uint16_t sqid | Submission queue identifier for the command associated with an error message. If an error cannot be associated with a specific command, this parameter should be set to **FFFFh**. | - | uint16_t cid | Command identifier associated with an error message. If an error cannot be associated with a specific command, this parameter should be set to **FFFFh**. | - | uint16_t status | Status of a completed command. | - | uint16_t error_location | Command parameter associated with an error message. | - | uint64_t lba | First LBA when an error occurs. | - | uint32_t nsid | Namespace where an error occurs. | - | uint8_t vendor_specific | Log page identifier associated with the page if other vendor-specific error messages are available. The value **00h** indicates that no additional information is available. The valid value ranges from 80h to FFh. | - | uint8_t reserved\[35] | Reserved. | - -#### struct ublock_uevent - -1. Prototype - - ```c - struct ublock_uevent { - enum ublock_nvme_uevent_action action; - int subsystem; - char traddr[UBLOCK_TRADDR_MAX_LEN + 1]; - }; - ``` - -2. Description - - This data structure contains parameters related to the uevent event. - -3. Struct members - - | Member | Description | - | -------------------------------------- | ------------------------------------------------------------ | - | enum ublock_nvme_uevent_action action | Whether the uevent event type is drive insertion or removal through enumeration. | - | int subsystem | Subsystem type of the uevent event. Currently, only **UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO** is supported. If the application receives other values, no processing is required. | - | char traddr\[UBLOCK_TRADDR_MAX_LEN + 1] | PCI address character string in the *Domain:Bus:Device.Function* (**%04x:%02x:%02x.%x**) format. | - -#### struct ublock_hook - -1. Prototype - - ```c - struct ublock_hook - { - ublock_callback_func ublock_callback; - void *user_data; - }; - ``` - -2. Description - - This data structure is used to register callback functions. - -3. Struct members - - | Member | Description | - | ------------------------------------ | ------------------------------------------------------------ | - | ublock_callback_func ublock_callback | Function executed during callback. The type is bool func(void *info, void*user_data). | - | void *user_data | User parameter transferred to the callback function. | - -#### struct ublock_ctrl_iostat_info - -1. Prototype - - ```c - struct ublock_ctrl_iostat_info - { - uint64_t num_read_ops; - uint64_t num_write_ops; - uint64_t read_latency_ms; - uint64_t write_latency_ms; - uint64_t io_outstanding; - uint64_t num_poll_timeout; - uint64_t io_ticks_ms; - }; - ``` - -2. Description - - This data structure is used to obtain the I/O statistics of a controller. - -3. Struct members - - | Member | Description | - | ------------------------- | ------------------------------------------------------------ | - | uint64_t num_read_ops | Accumulated number of read I/Os of the controller. | - | uint64_t num_write_ops | Accumulated number of write I/Os of the controller. | - | uint64_t read_latency_ms | Accumulated read latency of the controller, in ms. | - | uint64_t write_latency_ms | Accumulated write latency of the controller, in ms. | - | uint64_t io_outstanding | Queue depth of the controller. 
|
| uint64_t num_poll_timeout | Accumulated number of polling timeouts of the controller. |
| uint64_t io_ticks_ms | Accumulated I/O processing latency of the controller, in ms. |

## API

### bdev_rw.h

#### libstorage_get_nvme_ctrlr_info

1. Prototype

    ```c
    uint32_t libstorage_get_nvme_ctrlr_info(struct libstorage_nvme_ctrlr_info** ppCtrlrInfo);
    ```

2. Description

    Obtains information about all controllers.

3. Parameters

    | Parameter | Description |
    | ----------------------------------------------- | ------------------------------------------------------------ |
    | struct libstorage_nvme_ctrlr_info** ppCtrlrInfo | Output parameter, which returns all obtained controller information.
Note:
Free the memory using the free API in a timely manner. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Failed to obtain controller information or no controller information is obtained. | - | > 0 | Number of obtained controllers. | - -#### libstorage_get_mgr_info_by_esn - -1. Prototype - - ```c - int32_t libstorage_get_mgr_info_by_esn(const char *esn, struct libstorage_mgr_info *mgr_info); - ``` - -2. Description - - Obtains the management information about the NVMe drive corresponding to the ESN. - -3. Parameters - - | Parameter | Description | - | ------------------------------------ | ------------------------------------------------------------ | - | const char *esn | ESN of the target device.
Note:
An ESN is a string of a maximum of 20 characters (excluding the string terminator), but the actual length may vary by hardware vendor. If an ESN is shorter than 20 characters, it is padded with trailing spaces.
| - | struct libstorage_mgr_info *mgr_info | Output parameter, which returns all obtained NVMe drive management information. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Succeeded in querying the NVMe drive management information corresponding to an ESN. | - | -1 | Failed to query the NVMe drive management information corresponding to an ESN. | - | -2 | No NVMe drive matching an ESN is obtained. | - -#### libstorage_get_mgr_smart_by_esn - -1. Prototype - - ```c - int32_t libstorage_get_mgr_smart_by_esn(const char *esn, uint32_t nsid, struct libstorage_smart_info *mgr_smart_info); - ``` - -2. Description - - Obtains the S.M.A.R.T. information of the NVMe drive corresponding to an ESN. - -3. Parameters - - | Parameter | Description | - | ------------------------------------ | ------------------------------------------------------------ | - | const char *esn | ESN of the target device.
Note:
An ESN is a string of a maximum of 20 characters (excluding the string terminator), but the actual length may vary by hardware vendor. If an ESN is shorter than 20 characters, it is padded with trailing spaces.
| - | uint32_t nsid | Specified namespace. | - | struct libstorage_mgr_info *mgr_info | Output parameter, which returns all obtained S.M.A.R.T. information of NVMe drives. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Succeeded in querying the S.M.A.R.T. information of the NVMe drive corresponding to an ESN. | - | -1 | Failed to query the S.M.A.R.T. information of the NVMe drive corresponding to an ESN. | - | -2 | No NVMe drive matching an ESN is obtained. | - -#### libstorage_get_bdev_ns_info - -1. Prototype - - ```c - uint32_t libstorage_get_bdev_ns_info(const char* bdevName, struct libstorage_namespace_info** ppNsInfo); - ``` - -2. Description - - Obtains namespace information based on the device name. - -3. Parameters - - | Parameter | Description | - | ------------------------------------------- | ------------------------------------------------------------ | - | const char* bdevName | Device name. | - | struct libstorage_namespace_info** ppNsInfo | Output parameter, which returns namespace information.
Note:
Free the memory using the free API in a timely manner. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------- | - | 0 | The operation failed. | - | 1 | The operation is successful. | - -#### libstorage_get_ctrl_ns_info - -1. Prototype - - ```c - uint32_t libstorage_get_ctrl_ns_info(const char* ctrlName, struct libstorage_namespace_info** ppNsInfo); - ``` - -2. Description - - Obtains information about all namespaces based on the controller name. - -3. Parameters - - | Parameter | Description | - | ------------------------------------------- | ------------------------------------------------------------ | - | const char* ctrlName | Controller name. | - | struct libstorage_namespace_info** ppNsInfo | Output parameter, which returns information about all namespaces.
Note:
Free the memory using the free API in a timely manner. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Failed to obtain the namespace information or no namespace information is obtained. | - | > 0 | Number of namespaces obtained. | - -#### libstorage_create_namespace - -1. Prototype - - ```c - int32_t libstorage_create_namespace(const char* ctrlName, uint64_t ns_size, char** outputName); - ``` - -2. Description - - Creates a namespace on a specified controller (the prerequisite is that the controller supports namespace management). - - Optane drives are based on the NVMe 1.0 protocol and do not support namespace management. Therefore, this API is not supported. - - ES3000 V3 and V5 support only one namespace by default. By default, a namespace exists on the controller. To create a namespace, delete the original namespace. - -3. Parameters - - | Parameter | Description | - | -------------------- | ------------------------------------------------------------ | - | const char* ctrlName | Controller name. | - | uint64_t ns_size | Size of the namespace to be created (unit: sector_size). | - | char** outputName | Output parameter, which indicates the name of the created namespace.
Note:
Free the memory using the free API in a timely manner. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------------------------- | - | ≤ 0 | Failed to create the namespace. | - | > 0 | ID of the created namespace (starting from 1). | - -#### libstorage_delete_namespace - -1. Prototype - - ```c - int32_t libstorage_delete_namespace(const char* ctrlName, uint32_t ns_id); - ``` - -2. Description - - Deletes a namespace from a specified controller. Optane drives are based on the NVMe 1.0 protocol and do not support namespace management. Therefore, this API is not supported. - -3. Parameters - - | Parameter | Description | - | -------------------- | ---------------- | - | const char* ctrlName | Controller name. | - | uint32_t ns_id | Namespace ID | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Deletion succeeded. | - | Other values | Deletion failed.
Note:
Before deleting a namespace, stop I/O operations. Otherwise, the namespace fails to be deleted. | - -#### libstorage_delete_all_namespace - -1. Prototype - - ```c - int32_t libstorage_delete_all_namespace(const char* ctrlName); - ``` - -2. Description - - Deletes all namespaces from a specified controller. Optane drives are based on the NVMe 1.0 protocol and do not support namespace management. Therefore, this API is not supported. - -3. Parameters - - | Parameter | Description | - | -------------------- | ---------------- | - | const char* ctrlName | Controller name. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Deletion succeeded. | - | Other values | Deletion failed.
Note:
Before deleting a namespace, stop I/O operations. Otherwise, the namespace fails to be deleted. | - -#### libstorage_nvme_create_ctrlr - -1. Prototype - - ```c - int32_t libstorage_nvme_create_ctrlr(const char *pci_addr, const char *ctrlr_name); - ``` - -2. Description - - Creates an NVMe controller based on the PCI address. - -3. Parameters - - | Parameter | Description | - | ---------------- | ---------------- | - | char *pci_addr | PCI address. | - | char *ctrlr_name | Controller name. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------- | - | < 0 | Creation failed. | - | 0 | Creation succeeded. | - -#### libstorage_nvme_delete_ctrlr - -1. Prototype - - ```c - int32_t libstorage_nvme_delete_ctrlr(const char *ctrlr_name); - ``` - -2. Description - - Destroys an NVMe controller based on the controller name. - -3. Parameters - - | Parameter | Description | - | ---------------------- | ---------------- | - | const char *ctrlr_name | Controller name. | - - This API can be called only after all delivered I/Os are returned. - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------- | - | < 0 | Destruction failed. | - | 0 | Destruction succeeded. | - -#### libstorage_nvme_reload_ctrlr - -1. Prototype - - ```c - int32_t libstorage_nvme_reload_ctrlr(const char *cfgfile); - ``` - -2. Description - - Adds or deletes an NVMe controller based on the configuration file. - -3. Parameters - - | Parameter | Description | - | ------------------- | ------------------------------- | - | const char *cfgfile | Path of the configuration file. | - - Before using this API to delete a drive, ensure that all delivered I/Os have been returned. - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | < 0 | Failed to add or delete drives based on the configuration file. (Drives may be successfully added or deleted for some controllers.) | - | 0 | Drives are successfully added or deleted based on the configuration file. | - - > Constraints - - - Currently, a maximum of 36 controllers can be configured in the configuration file. - - - The reload API creates as many controllers as possible. If a controller fails to be created, the creation of other controllers is not affected. - - - In concurrency scenarios, the final drive initialization status may be inconsistent with the input configuration file. - - - If you delete a drive that is delivering I/Os by reloading the drive, I/Os fail. - - - After the controller name (for example, **nvme0**) corresponding to the PCI address in the configuration file is modified, the modification does not take effect after this interface is called. - - - The reload function is valid only when drives are added or deleted. Other configuration items in the configuration file cannot be reloaded. - -#### libstorage_low_level_format_nvm - -1. Prototype - - ```c - int8_t libstorage_low_level_format_nvm(const char* ctrlName, uint8_t lbaf, - enum libstorage_ns_pi_type piType, - bool pil_start, bool ms_extented, uint8_t ses); - ``` - -2. Description - - Low-level formats NVMe drives. - -3. Parameters - - | Parameter | Description | - | --------------------------------- | ------------------------------------------------------------ | - | const char* ctrlName | Controller name. | - | uint8_t lbaf | LBA format to be used. | - | enum libstorage_ns_pi_type piType | Protection type to be used. 
| - | bool pil_start | The protection information is stored in the first eight bytes (1) or the last eight bytes (0) of the metadata. | - | bool ms_extented | Whether to format to the extended type. | - | uint8_t ses | Whether to perform secure erase during formatting. Currently, only the value **0** (no-secure erase) is supported. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------- | - | < 0 | Formatting failed. | - | ≥ 0 | LBA format generated after successful formatting. | - - > Constraints - - - This low-level formatting API will clear the data and metadata of the drive namespace. Exercise caution when using this API. - - - It takes several seconds to format an ES3000 drive and several minutes to format an Intel Optane drive. Before using this API, wait until the formatting is complete. If the formatting process is forcibly stopped, the formatting fails. - - - Before formatting, stop the I/O operations on the data plane. If the drive is processing I/O requests, the formatting may fail occasionally. If the formatting is successful, the drive may discard the I/O requests that are being processed. Therefore, before formatting the drive, ensure that the I/O operations on the data plane are stopped. - - - During the formatting, the controller is reset. As a result, the initialized drive resources are unavailable. Therefore, after the formatting is complete, restart the I/O process on the data plane. - - - ES3000 V3 supports protection types 0 and 3, PI start and PI end, and mc extended. ES3000 V3 supports DIF in 512+8 format but does not support DIF in 4096+64 format. - - - ES3000 V5 supports protection types 0 and 3, PI start and PI end, mc extended, and mc pointer. ES3000 V5 supports DIF in both 512+8 and 4096+64 formats. - - - Optane drives support protection types 0 and 1, PI end, and mc extended. Optane drives support DIF in 512+8 format but do not support DIF in 4096+64 format. - - | **Drive Type** | **LBA Format** | **Drive Type** | **LBA Format** | - | ------------------ | ------------------------------------------------------------ | -------------- | ------------------------------------------------------------ | - | Intel Optane P4800 | lbaf0:512+0
lbaf1:512+8
lbaf2:512+16
lbaf3:4096+0
lbaf4:4096+8
lbaf5:4096+64
lbaf6:4096+128 | ES3000 V3, V5 | lbaf0:512+0
lbaf1:512+8
lbaf2:4096+64
lbaf3:4096+0
lbaf4:4096+8 | - -#### LIBSTORAGE_CALLBACK_FUNC - -1. Prototype - - ```c - typedef void (*LIBSTORAGE_CALLBACK_FUNC)(int32_t cb_status, int32_t sct_code, void* cb_arg); - ``` - -2. Description - - Registered HSAK I/O completion callback function. - -3. Parameters - - | Parameter | Description | - | ----------------- | ------------------------------------------------------------ | - | int32_t cb_status | I/O status code. The value **0** indicates success, a negative value indicates system error code, and a positive value indicates drive error code (for different error codes,
see [Appendixes](#appendixes)). | - | int32_t sct_code | I/O status code type:
0: [GENERIC](#generic)
1: [COMMAND_SPECIFIC](#command_specific)
2: [MEDIA_DATA_INTERGRITY_ERROR](#media_data_intergrity_error)
7: VENDOR_SPECIFIC | - | void* cb_arg | Input parameter of the callback function. | - -4. Return value - - None. - -#### libstorage_deallocate_block - -1. Prototype - - ```c - int32_t libstorage_deallocate_block(int32_t fd, struct libstorage_dsm_range_desc *range, uint16_t range_count, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. Description - - Notifies NVMe drives of the blocks that can be released. - -3. Parameters - - | Parameter | Description | - | --------------------------------------- | ------------------------------------------------------------ | - | int32_t fd | Open drive file descriptor. | - | struct libstorage_dsm_range_desc *range | Description of blocks that can be released on NVMe drives.
Note:
This parameter requires **libstorage_mem_reserve** to allocate huge page memory. 4 KB alignment is required during memory allocation, that is, align is set to 4096.
The TRIM range of drives is restricted based on different drives. Exceeding the maximum TRIM range on the drives may cause data exceptions. | - | uint16_t range_count | Number of members in the array range. | - | LIBSTORAGE_CALLBACK_FUNC cb | Callback function. | - | void* cb_arg | Callback function parameter. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------- | - | < 0 | Failed to deliver the request. | - | 0 | Request submitted successfully. | - -#### libstorage_async_write - -1. Prototype - - ```c - int32_t libstorage_async_write(int32_t fd, void *buf, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. Description - - Delivers asynchronous I/O write requests (the write buffer is a contiguous buffer). - -3. Parameters - - | Parameter | Description | - | -------------------------------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | void *buf | Buffer for I/O write data (four-byte aligned and cannot cross the 4 KB page boundary).
Note:
LBAs in extended mode must contain the metadata memory size. | - | size_t nbytes | Size of a single write I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Write offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | void *md_buf | Metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **NULL** for LBAs in extended mode.) | - | size_t md_len | Buffer length of metadata. (Applicable only to LBAs in separated mode. Set this parameter to **0** for LBAs in extended mode.) | - | enum libstorage_crc_and_prchk dif_flag | Whether to calculate DIF and whether to enable drive verification. | - | LIBSTORAGE_CALLBACK_FUNC cb | Registered callback function. | - | void* cb_arg | Parameters of the callback function. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------------------------- | - | 0 | I/O write requests are submitted successfully. | - | Other values | Failed to submit I/O write requests. | - -#### libstorage_async_read - -1. Prototype - - ```c - int32_t libstorage_async_read(int32_t fd, void *buf, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. Description - - Delivers asynchronous I/O read requests (the read buffer is a contiguous buffer). - -3. Parameters - - | Parameter | Description | - | -------------------------------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | void *buf | Buffer for I/O read data (four-byte aligned and cannot cross the 4 KB page boundary).
Note:
LBAs in extended mode must contain the metadata memory size. | - | size_t nbytes | Size of a single read I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Read offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | void *md_buf | Metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **NULL** for LBAs in extended mode.) | - | size_t md_len | Buffer length of metadata. (Applicable only to LBAs in separated mode. Set this parameter to **0** for LBAs in extended mode.) | - | enum libstorage_crc_and_prchk dif_flag | Whether to calculate DIF and whether to enable drive verification. | - | LIBSTORAGE_CALLBACK_FUNC cb | Registered callback function. | - | void* cb_arg | Parameters of the callback function. | - -4. Return value - - | Return Value | Description | - | ------------ | --------------------------------------------- | - | 0 | I/O read requests are submitted successfully. | - | Other values | Failed to submit I/O read requests. | - -#### libstorage_async_writev - -1. Prototype - - ```c - int32_t libstorage_async_writev(int32_t fd, struct iovec *iov, int iovcnt, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. Description - - Delivers asynchronous I/O write requests (the write buffer is a discrete buffer). - -3. Parameters - - | Parameter | Description | - | -------------------------------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | struct iovec *iov | Buffer for I/O write data.
Note:
LBAs in extended mode must contain the metadata size.
The address must be 4-byte-aligned and the length cannot exceed 4 GB. | - | int iovcnt | Number of buffers for I/O write data. | - | size_t nbytes | Size of a single write I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Write offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | void *md_buf | Metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **NULL** for LBAs in extended mode.) | - | size_t md_len | Length of the metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **0** for LBAs in extended mode.) | - | enum libstorage_crc_and_prchk dif_flag | Whether to calculate DIF and whether to enable drive verification. | - | LIBSTORAGE_CALLBACK_FUNC cb | Registered callback function. | - | void* cb_arg | Parameters of the callback function. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------------------------- | - | 0 | I/O write requests are submitted successfully. | - | Other values | Failed to submit I/O write requests. | - -#### libstorage_async_readv - -1. Prototype - - ```c - int32_t libstorage_async_readv(int32_t fd, struct iovec *iov, int iovcnt, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. Description - - Delivers asynchronous I/O read requests (the read buffer is a discrete buffer). - -3. Parameters - - | Parameter | Description | - | -------------------------------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | struct iovec *iov | Buffer for I/O read data.
Note:
LBAs in extended mode must contain the metadata size.
The address must be 4-byte-aligned and the length cannot exceed 4 GB. | - | int iovcnt | Number of buffers for I/O read data. | - | size_t nbytes | Size of a single read I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Read offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | void *md_buf | Metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **NULL** for LBAs in extended mode.) | - | size_t md_len | Length of the metadata buffer. (Applicable only to LBAs in separated mode. Set this parameter to **0** for LBAs in extended mode.) | - | enum libstorage_crc_and_prchk dif_flag | Whether to calculate DIF and whether to enable drive verification. | - | LIBSTORAGE_CALLBACK_FUNC cb | Registered callback function. | - | void* cb_arg | Parameters of the callback function. | - -4. Return value - - | Return Value | Description | - | ------------ | --------------------------------------------- | - | 0 | I/O read requests are submitted successfully. | - | Other values | Failed to submit I/O read requests. | - -#### libstorage_sync_write - -1. Prototype - - ```c - int32_t libstorage_sync_write(int fd, const void *buf, size_t nbytes, off_t offset); - ``` - -2. Description - - Delivers synchronous I/O write requests (the write buffer is a contiguous buffer). - -3. Parameters - - | Parameter | Description | - | -------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | void *buf | Buffer for I/O write data (four-byte aligned and cannot cross the 4 KB page boundary).
Note:
LBAs in extended mode must contain the metadata memory size. | - | size_t nbytes | Size of a single write I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Write offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------------------------- | - | 0 | I/O write requests are submitted successfully. | - | Other values | Failed to submit I/O write requests. | - -#### libstorage_sync_read - -1. Prototype - - ```c - int32_t libstorage_sync_read(int fd, const void *buf, size_t nbytes, off_t offset); - ``` - -2. Description - - Delivers synchronous I/O read requests (the read buffer is a contiguous buffer). - -3. Parameters - - | Parameter | Description | - | -------------- | ------------------------------------------------------------ | - | int32_t fd | File descriptor of the block device. | - | void *buf | Buffer for I/O read data (four-byte aligned and cannot cross the 4 KB page boundary).
Note:
LBAs in extended mode must contain the metadata memory size. | - | size_t nbytes | Size of a single read I/O, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - | off64_t offset | Read offset of the LBA, in bytes (an integer multiple of **sector_size**).
Note:
Only the data size is included. LBAs in extended mode do not include the metadata size. | - -4. Return value - - | Return Value | Description | - | ------------ | --------------------------------------------- | - | 0 | I/O read requests are submitted successfully. | - | Other values | Failed to submit I/O read requests. | - -#### libstorage_open - -1. Prototype - - ```c - int32_t libstorage_open(const char* devfullname); - ``` - -2. Description - - Opens a block device. - -3. Parameters - - | Parameter | Description | - | ----------------------- | ---------------------------------------- | - | const char* devfullname | Block device name (format: **nvme0n1**). | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | -1 | Opening failed. For example, the device name is incorrect, or the number of opened FDs is greater than the number of available channels of the NVMe drive. | - | > 0 | File descriptor of the block device. | - - After the MultiQ function in **nvme.conf.in** is enabled, different FDs are returned if a thread opens the same device for multiple times. Otherwise, the same FD is returned. This attribute applies only to the NVMe device. - -#### libstorage_close - -1. Prototype - - ```c - int32_t libstorage_close(int32_t fd); - ``` - -2. Description - - Closes a block device. - -3. Parameters - - | Parameter | Description | - | ---------- | ------------------------------------------ | - | int32_t fd | File descriptor of an opened block device. | - -4. Return value - - | Return Value | Description | - | ------------ | ----------------------------------------------- | - | -1 | Invalid file descriptor. | - | -16 | The file descriptor is busy. Retry is required. | - | 0 | Close succeeded. | - -#### libstorage_mem_reserve - -1. Prototype - - ```c - void* libstorage_mem_reserve(size_t size, size_t align); - ``` - -2. Description - - Allocates memory space from the huge page memory reserved by the DPDK. - -3. Parameters - - | Parameter | Description | - | ------------ | ----------------------------------- | - | size_t size | Size of the memory to be allocated. | - | size_t align | Aligns allocated memory space. | - -4. Return value - - | Return Value | Description | - | ------------ | -------------------------------------- | - | NULL | Allocation failed. | - | Other values | Address of the allocated memory space. | - -#### libstorage_mem_free - -1. Prototype - - ```c - void libstorage_mem_free(void* ptr); - ``` - -2. Description - - Frees the memory space pointed to by **ptr**. - -3. Parameters - - | Parameter | Description | - | --------- | ---------------------------------------- | - | void* ptr | Address of the memory space to be freed. | - -4. Return value - - None. - -#### libstorage_alloc_io_buf - -1. Prototype - - ```c - void* libstorage_alloc_io_buf(size_t nbytes); - ``` - -2. Description - - Allocates memory from buf_small_pool or buf_large_pool of the SPDK. - -3. Parameters - - | Parameter | Description | - | ------------- | ----------------------------------- | - | size_t nbytes | Size of the buffer to be allocated. | - -4. Return value - - | Return Value | Description | - | ------------ | -------------------------------------- | - | Other values | Start address of the allocated buffer. | - -#### libstorage_free_io_buf - -1. Prototype - - ```c - int32_t libstorage_free_io_buf(void *buf, size_t nbytes); - ``` - -2. 
Description - - Frees the allocated memory to buf_small_pool or buf_large_pool of the SPDK. - -3. Parameters - - | Parameter | Description | - | ------------- | ---------------------------------------- | - | void *buf | Start address of the buffer to be freed. | - | size_t nbytes | Size of the buffer to be freed. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------ | - | -1 | Freeing failed. | - | 0 | Freeing succeeded. | - -#### libstorage_init_module - -1. Prototype - - ```c - int32_t libstorage_init_module(const char* cfgfile); - ``` - -2. Description - - Initializes the HSAK module. - -3. Parameters - - | Parameter | Description | - | ------------------- | ------------------------------------ | - | const char* cfgfile | Name of the HSAK configuration file. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------- | - | Other values | Initialization failed. | - | 0 | Initialization succeeded. | - -#### libstorage_exit_module - -1. Prototype - - ```c - int32_t libstorage_exit_module(void); - ``` - -2. Description - - Exits the HSAK module. - -3. Parameters - - None. - -4. Return value - - | Return Value | Description | - | ------------ | --------------------------------- | - | Other values | Failed to exit the cleanup. | - | 0 | Succeeded in exiting the cleanup. | - -#### LIBSTORAGE_REGISTER_DPDK_INIT_NOTIFY - -1. Prototype - - ```c - LIBSTORAGE_REGISTER_DPDK_INIT_NOTIFY(_name, _notify) - ``` - -2. Description - - Service layer registration function, which is used to register the callback function when the DPDK initialization is complete. - -3. Parameters - - | Parameter | Description | - | --------- | ------------------------------------------------------------ | - | _name | Name of a module at the service layer. | - | _notify | Prototype of the callback function registered at the service layer: **void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg *arg);** | - -4. Return value - - None - -### ublock.h - -#### init_ublock - -1. Prototype - - ```c - int init_ublock(const char *name, enum ublock_rpc_server_status flg); - ``` - -2. Description - - Initializes the Ublock module. This API must be called before other Ublock APIs. If the flag is set to **UBLOCK_RPC_SERVER_ENABLE**, that is, Ublock functions as the RPC server, the same process can be initialized only once. - - When Ublock is started as the RPC server, the monitor thread of a server is started at the same time. When the monitor thread detects that the RPC server thread is abnormal (for example, thread suspended), the monitor thread calls the exit function to trigger the process to exit. - - In this case, the product script is used to start the process again. - -3. Parameters - - | Parameter | Description | - | ------------------------------------ | ------------------------------------------------------------ | - | const char *name | Module name. The default value is **ublock**. You are advised to set this parameter to **NULL**. | - | enum ublock_rpc_server_status
flg | Whether to enable RPC. The value can be **UBLOCK_RPC_SERVER_DISABLE** or **UBLOCK_RPC_SERVER_ENABLE**.
If RPC is disabled and the drive is occupied by service processes, the Ublock module cannot obtain the drive information. | - -4. Return value - - | Return Value | Description | - | ------------- | ------------------------------------------------------------ | - | 0 | Initialization succeeded. | - | -1 | Initialization failed. Possible cause: The Ublock module has been initialized. | - | Process exits | Ublock considers that the following exceptions cannot be rectified and directly calls the exit API to exit the process:
- The RPC service needs to be created, but the creation fails.
- Failed to create a hot swap monitoring thread. | - -#### ublock_init - -1. Prototype - - ```c - #define ublock_init(name) init_ublock(name, UBLOCK_RPC_SERVER_ENABLE) - ``` - -2. Description - - It is the macro definition of the init_ublock API. It can be regarded as initializing Ublock into the required RPC service. - -3. Parameters - - | Parameter | Description | - | --------- | ------------------------------------------------------------ | - | name | Module name. The default value is **ublock**. You are advised to set this parameter to **NULL**. | - -4. Return value - - | Return Value | Description | - | ------------- | ------------------------------------------------------------ | - | 0 | Initialization succeeded. | - | -1 | Initialization failed. Possible cause: The Ublock RPC server module has been initialized. | - | Process exits | Ublock considers that the following exceptions cannot be rectified and directly calls the exit API to exit the process:
- The RPC service needs to be created, but the creation fails.
- Failed to create a hot swap monitoring thread. | - -#### ublock_init_norpc - -1. Prototype - - ```c - #define ublock_init_norpc(name) init_ublock(name, UBLOCK_RPC_SERVER_DISABLE) - ``` - -2. Description - - It is the macro definition of the init_ublock API and can be considered as initializing Ublock into a non-RPC service. - -3. Parameters - - | Parameter | Description | - | --------- | ------------------------------------------------------------ | - | name | Module name. The default value is **ublock**. You are advised to set this parameter to **NULL**. | - -4. Return value - - | Return Value | Description | - | ------------- | ------------------------------------------------------------ | - | 0 | Initialization succeeded. | - | -1 | Initialization failed. Possible cause: The Ublock client module has been initialized. | - | Process exits | Ublock considers that the following exceptions cannot be rectified and directly calls the exit API to exit the process:
- The RPC service needs to be created, but the creation fails.
- Failed to create a hot swap monitoring thread. | - -#### ublock_fini - -1. Prototype - - ```c - void ublock_fini(void); - ``` - -2. Description - - Destroys the Ublock module and internally created resources. This API must be used together with the Ublock initialization API. - -3. Parameters - - None. - -4. Return value - - None. - -#### ublock_get_bdevs - -1. Prototype - - ```c - int ublock_get_bdevs(struct ublock_bdev_mgr* bdev_list); - ``` - -2. Description - - Obtains the device list (all NVMe devices in the environment, including kernel-mode and user-mode drivers). The obtained NVMe device list contains only PCI addresses and does not contain specific device information. To obtain specific device information, call ublock_get_bdev. - -3. Parameters - - | Parameter | Description | - | --------------------------------- | ------------------------------------------------------------ | - | struct ublock_bdev_mgr* bdev_list | Output parameter, which returns the device queue. The **bdev_list** pointer must be allocated externally. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------ | - | 0 | The device queue is obtained successfully. | - | -2 | No NVMe device exists in the environment. | - | Other values | Failed to obtain the device list. | - -#### ublock_free_bdevs - -1. Prototype - - ```c - void ublock_free_bdevs(struct ublock_bdev_mgr* bdev_list); - ``` - -2. Description - - Releases a device list. - -3. Parameters - - | Parameter | Description | - | --------------------------------- | ------------------------------------------------------------ | - | struct ublock_bdev_mgr* bdev_list | Head pointer of the device queue. After the device queue is cleared, the **bdev_list** pointer is not released. | - -4. Return value - - None. - -#### ublock_get_bdev - -1. Prototype - - ```c - int ublock_get_bdev(const char *pci, struct ublock_bdev *bdev); - ``` - -2. Description - - Obtains information about a specific device. In the device information, the serial number, model, and firmware version of the NVMe device are saved as character arrays instead of character strings. (The return format varies depending on the drive controller, and the arrays may not end with 0.) - - After this API is called, the corresponding device is occupied by Ublock. Therefore, call ublock_free_bdev to free resources immediately after the required service operation is complete. - -3. Parameters - - | Parameter | Description | - | ------------------------ | ------------------------------------------------------------ | - | const char *pci | PCI address of the device whose information needs to be obtained. | - | struct ublock_bdev *bdev | Output parameter, which returns the device information. The **bdev** pointer must be allocated externally. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | The device information is obtained successfully. | - | -1 | Failed to obtain device information due to incorrect parameters. | - | -11(EAGAIN) | Failed to obtain device information due to the RPC query failure. A retry is required (3s sleep is recommended). | - -#### ublock_get_bdev_by_esn - -1. Prototype - - ```c - int ublock_get_bdev_by_esn(const char *esn, struct ublock_bdev *bdev); - ``` - -2. Description - - Obtains information about the device corresponding to an ESN. 
In the device information, the serial number, model, and firmware version of the NVMe device are saved as character arrays instead of character strings. (The return format varies depending on the drive controller, and the arrays may not end with 0.) - - After this API is called, the corresponding device is occupied by Ublock. Therefore, call ublock_free_bdev to free resources immediately after the required service operation is complete. - -3. Parameters - - | Parameter | Description | - | ------------------------ | ------------------------------------------------------------ | - | const char *esn | ESN of the device whose information is to be obtained.
Note:
An ESN is a string of a maximum of 20 characters (excluding the end character of the string), but the length may vary according to hardware vendors. For example, if the length is less than 20 characters, spaces are padded at the end of the character string. | - | struct ublock_bdev *bdev | Output parameter, which returns the device information. The **bdev** pointer must be allocated externally. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | The device information is obtained successfully. | - | -1 | Failed to obtain device information due to incorrect parameters. | - | -11(EAGAIN) | Failed to obtain device information due to the RPC query failure. A retry is required (3s sleep is recommended). | - -#### ublock_free_bdev - -1. Prototype - - ```c - void ublock_free_bdev(struct ublock_bdev *bdev); - ``` - -2. Description - - Frees device resources. - -3. Parameters - - | Parameter | Description | - | ------------------------ | ------------------------------------------------------------ | - | struct ublock_bdev *bdev | Pointer to the device information. After the data in the pointer is cleared, the **bdev** pointer is not freed. | - -4. Return value - - None. - -#### TAILQ_FOREACH_SAFE - -1. Prototype - - ```c - #define TAILQ_FOREACH_SAFE(var, head, field, tvar) - for ((var) = TAILQ_FIRST((head)); - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); - (var) = (tvar)) - ``` - -2. Description - - Provides a macro definition for each member of the secure access queue. - -3. Parameters - - | Parameter | Description | - | --------- | ------------------------------------------------------------ | - | var | Queue node member on which you are performing operations. | - | head | Queue head pointer. Generally, it refers to the object address defined by **TAILQ_HEAD(xx, xx) obj**. | - | field | Name of the struct used to store the pointers before and after the queue in the queue node. Generally, it is the name defined by **TAILQ_ENTRY (xx) name**. | - | tvar | Next queue node member. | - -4. Return value - - None. - -#### ublock_get_SMART_info - -1. Prototype - - ```c - int ublock_get_SMART_info(const char *pci, uint32_t nsid, struct ublock_SMART_info *smart_info); - ``` - -2. Description - - Obtains the S.M.A.R.T. information of a specified device. - -3. Parameters - - | Parameter | Description | - | ------------------------------------ | ------------------------------------------------------------ | - | const char *pci | Device PCI address. | - | uint32_t nsid | Specified namespace. | - | struct ublock_SMART_info *smart_info | Output parameter, which returns the S.M.A.R.T. information of the device. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | The S.M.A.R.T. information is obtained successfully. | - | -1 | Failed to obtain S.M.A.R.T. information due to incorrect parameters. | - | -11(EAGAIN) | Failed to obtain S.M.A.R.T. information due to the RPC query failure. A retry is required (3s sleep is recommended). | - -#### ublock_get_SMART_info_by_esn - -1. Prototype - - ```c - int ublock_get_SMART_info_by_esn(const char *esn, uint32_t nsid, struct ublock_SMART_info *smart_info); - ``` - -2. Description - - Obtains the S.M.A.R.T. information of the device corresponding to an ESN. - -3. 
Parameters - - | Parameter | Description | - | --------------------------------------- | ------------------------------------------------------------ | - | const char *esn | Device ESN.
Note:
An ESN is a string of a maximum of 20 characters (excluding the end character of the string), but the length may vary according to hardware vendors. For example, if the length is less than 20 characters, spaces are padded at the end of the character string. | - | uint32_t nsid | Specified namespace. | - | struct ublock_SMART_info
*smart_info | Output parameter, which returns the S.M.A.R.T. information of the device. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | The S.M.A.R.T. information is obtained successfully. | - | -1 | Failed to obtain SMART information due to incorrect parameters. | - | -11(EAGAIN) | Failed to obtain S.M.A.R.T. information due to the RPC query failure. A retry is required (3s sleep is recommended). | - -#### ublock_get_error_log_info - -1. Prototype - - ```c - int ublock_get_error_log_info(const char *pci, uint32_t err_entries, struct ublock_nvme_error_info *errlog_info); - ``` - -2. Description - - Obtains the error log information of a specified device. - -3. Parameters - - | Parameter | Description | - | ------------------------------------------ | ------------------------------------------------------------ | - | const char *pci | Device PCI address. | - | uint32_t err_entries | Number of error logs to be obtained. A maximum of 256 error logs can be obtained. | - | struct ublock_nvme_error_info *errlog_info | Output parameter, which returns the error log information of the device. For the **errlog_info** pointer, the caller needs to apply for space and ensure that the obtained space is greater than or equal to err_entries x size of (struct ublock_nvme_error_info). | - -4. Return value - - | Return Value | Description | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | Number of obtained error logs. The value is greater than or equal to 0. | Error logs are obtained successfully. | - | -1 | Failed to obtain error logs due to incorrect parameters. | - | -11(EAGAIN) | Failed to obtain error logs due to the RPC query failure. A retry is required (3s sleep is recommended). | - -#### ublock_get_log_page - -1. Prototype - - ```c - int ublock_get_log_page(const char *pci, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size); - ``` - -2. Description - - Obtains information about a specified device and log page. - -3. Parameters - - | Parameter | Description | - | --------------------- | ------------------------------------------------------------ | - | const char *pci | Device PCI address. | - | uint8_t log_page | ID of the log page to be obtained. For example, **0xC0** and **0xCA** indicate the customized S.M.A.R.T. information of ES3000 V5 drives. | - | uint32_t nsid | Namespace ID. Some log pages support obtaining by namespace while some do not. If obtaining by namespace is not supported, the caller must transfer **0XFFFFFFFF**. | - | void *payload | Output parameter, which stores log page information. The caller is responsible for allocating memory. | - | uint32_t payload_size | Size of the applied payload, which cannot be greater than 4096 bytes. | - -4. Return value - - | Return Value | Description | - | ------------ | ---------------------------------------------------- | - | 0 | The log page is obtained successfully. | - | -1 | Failed to obtain error logs due to parameter errors. | - -#### ublock_info_get_pci_addr - -1. Prototype - - ```c - char *ublock_info_get_pci_addr(const void *info); - ``` - -2. Description - - Obtains the PCI address of the hot swap device. - - The memory occupied by info and the memory occupied by the returned PCI address do not need to be freed by the service process. - -3. 
Parameters - - | Parameter | Description | - | ---------------- | ------------------------------------------------------------ | - | const void *info | Hot swap event information transferred by the hot swap monitoring thread to the callback function. | - -4. Return value - - | Return Value | Description | - | ------------ | --------------------------------- | - | NULL | Failed to obtain the information. | - | Other values | Obtained PCI address. | - -#### ublock_info_get_action - -1. Prototype - - ```c - enum ublock_nvme_uevent_action ublock_info_get_action(const void *info); - ``` - -2. Description - - Obtains the type of the hot swap event. - - The memory occupied by info does not need to be freed by service process. - -3. Parameters - - | Parameter | Description | - | ---------------- | ------------------------------------------------------------ | - | const void *info | Hot swap event information transferred by the hot swap monitoring thread to the callback function. | - -4. Return value - - | Return Value | Description | - | -------------------------- | ------------------------------------------------------------ | - | Type of the hot swap event | Type of the event that triggers the callback function. For details, see the definition in **5.1.2.6 enum ublock_nvme_uevent_action**. | - -#### ublock_get_ctrl_iostat - -1. Prototype - - ```c - int ublock_get_ctrl_iostat(const char* pci, struct ublock_ctrl_iostat_info *ctrl_iostat); - ``` - -2. Description - - Obtains the I/O statistics of a controller. - -3. Parameters - - | Parameter | Description | - | ------------------------------------------- | ------------------------------------------------------------ | - | const char* pci | PCI address of the controller whose I/O statistics are to be obtained. | - | struct ublock_ctrl_iostat_info *ctrl_iostat | Output parameter, which returns I/O statistics. The **ctrl_iostat** pointer must be allocated externally. | - -4. Return value - - | Return Value | Description | - | ------------ | ------------------------------------------------------------ | - | 0 | Succeeded in obtaining I/O statistics. | - | -1 | Failed to obtain I/O statistics due to invalid parameters or RPC errors. | - | -2 | Failed to obtain I/O statistics because the NVMe drive is not taken over by the I/O process. | - | -3 | Failed to obtain I/O statistics because the I/O statistics function is disabled. | - -#### ublock_nvme_admin_passthru - -1. Prototype - - ```c - int32_t ublock_nvme_admin_passthru(const char *pci, void *cmd, void *buf, size_t nbytes); - ``` - -2. Description - - Transparently transmits the **nvme admin** command to the NVMe device. Currently, only the **nvme admin** command for obtaining the identify parameter is supported. - -3. Parameters - - | Parameter | Description | - | --------------- | ------------------------------------------------------------ | - | const char *pci | PCI address of the destination controller of the **nvme admin** command. | - | void *cmd | Pointer to the **nvme admin** command struct. The struct size is 64 bytes. For details, see the NVMe specifications. Currently, only the command for obtaining the identify parameter is supported. | - | void *buf | Saves the output of the **nvme admin** command. The space is allocated by users and the size is expressed in nbytes. | - | size_t nbytes | Size of the user buffer. The buffer for the identify parameter is 4096 bytes, and that for the command to obtain the identify parameter is 4096 nbytes. | - -4. 
Return value - - | Return Value | Description | - | ------------ | ------------------------------------------ | - | 0 | The user command is executed successfully. | - | -1 | Failed to execute the user command. | - -# Appendixes - -## GENERIC - -Generic Error Code Reference - -| sc | value | -| ------------------------------------------ | ----- | -| NVME_SC_SUCCESS | 0x00 | -| NVME_SC_INVALID_OPCODE | 0x01 | -| NVME_SC_INVALID_FIELD | 0x02 | -| NVME_SC_COMMAND_ID_CONFLICT | 0x03 | -| NVME_SC_DATA_TRANSFER_ERROR | 0x04 | -| NVME_SC_ABORTED_POWER_LOSS | 0x05 | -| NVME_SC_INTERNAL_DEVICE_ERROR | 0x06 | -| NVME_SC_ABORTED_BY_REQUEST | 0x07 | -| NVME_SC_ABORTED_SQ_DELETION | 0x08 | -| NVME_SC_ABORTED_FAILED_FUSED | 0x09 | -| NVME_SC_ABORTED_MISSING_FUSED | 0x0a | -| NVME_SC_INVALID_NAMESPACE_OR_FORMAT | 0x0b | -| NVME_SC_COMMAND_SEQUENCE_ERROR | 0x0c | -| NVME_SC_INVALID_SGL_SEG_DESCRIPTOR | 0x0d | -| NVME_SC_INVALID_NUM_SGL_DESCIRPTORS | 0x0e | -| NVME_SC_DATA_SGL_LENGTH_INVALID | 0x0f | -| NVME_SC_METADATA_SGL_LENGTH_INVALID | 0x10 | -| NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID | 0x11 | -| NVME_SC_INVALID_CONTROLLER_MEM_BUF | 0x12 | -| NVME_SC_INVALID_PRP_OFFSET | 0x13 | -| NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED | 0x14 | -| NVME_SC_OPERATION_DENIED | 0x15 | -| NVME_SC_INVALID_SGL_OFFSET | 0x16 | -| NVME_SC_INVALID_SGL_SUBTYPE | 0x17 | -| NVME_SC_HOSTID_INCONSISTENT_FORMAT | 0x18 | -| NVME_SC_KEEP_ALIVE_EXPIRED | 0x19 | -| NVME_SC_KEEP_ALIVE_INVALID | 0x1a | -| NVME_SC_ABORTED_PREEMPT | 0x1b | -| NVME_SC_SANITIZE_FAILED | 0x1c | -| NVME_SC_SANITIZE_IN_PROGRESS | 0x1d | -| NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID | 0x1e | -| NVME_SC_COMMAND_INVALID_IN_CMB | 0x1f | -| NVME_SC_LBA_OUT_OF_RANGE | 0x80 | -| NVME_SC_CAPACITY_EXCEEDED | 0x81 | -| NVME_SC_NAMESPACE_NOT_READY | 0x82 | -| NVME_SC_RESERVATION_CONFLICT | 0x83 | -| NVME_SC_FORMAT_IN_PROGRESS | 0x84 | - -## COMMAND_SPECIFIC - -Error Code Reference for Specific Commands - -| sc | value | -| ------------------------------------------ | ----- | -| NVME_SC_COMPLETION_QUEUE_INVALID | 0x00 | -| NVME_SC_INVALID_QUEUE_IDENTIFIER | 0x01 | -| NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED | 0x02 | -| NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED | 0x03 | -| NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED | 0x05 | -| NVME_SC_INVALID_FIRMWARE_SLOT | 0x06 | -| NVME_SC_INVALID_FIRMWARE_IMAGE | 0x07 | -| NVME_SC_INVALID_INTERRUPT_VECTOR | 0x08 | -| NVME_SC_INVALID_LOG_PAGE | 0x09 | -| NVME_SC_INVALID_FORMAT | 0x0a | -| NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET | 0x0b | -| NVME_SC_INVALID_QUEUE_DELETION | 0x0c | -| NVME_SC_FEATURE_ID_NOT_SAVEABLE | 0x0d | -| NVME_SC_FEATURE_NOT_CHANGEABLE | 0x0e | -| NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC | 0x0f | -| NVME_SC_FIRMWARE_REQ_NVM_RESET | 0x10 | -| NVME_SC_FIRMWARE_REQ_RESET | 0x11 | -| NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION | 0x12 | -| NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED | 0x13 | -| NVME_SC_OVERLAPPING_RANGE | 0x14 | -| NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY | 0x15 | -| NVME_SC_NAMESPACE_ID_UNAVAILABLE | 0x16 | -| NVME_SC_NAMESPACE_ALREADY_ATTACHED | 0x18 | -| NVME_SC_NAMESPACE_IS_PRIVATE | 0x19 | -| NVME_SC_NAMESPACE_NOT_ATTACHED | 0x1a | -| NVME_SC_THINPROVISIONING_NOT_SUPPORTED | 0x1b | -| NVME_SC_CONTROLLER_LIST_INVALID | 0x1c | -| NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS | 0x1d | -| NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED | 0x1e | -| NVME_SC_INVALID_CTRLR_ID | 0x1f | -| NVME_SC_INVALID_SECONDARY_CTRLR_STATE | 0x20 | -| NVME_SC_INVALID_NUM_CTRLR_RESOURCES | 0x21 | -| NVME_SC_INVALID_RESOURCE_ID | 0x22 | -| NVME_SC_CONFLICTING_ATTRIBUTES | 
0x80 | -| NVME_SC_INVALID_PROTECTION_INFO | 0x81 | -| NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE | 0x82 | - -## MEDIA_DATA_INTERGRITY_ERROR - -Error Code Reference for Medium Exceptions - -| sc | value | -| -------------------------------------- | ----- | -| NVME_SC_WRITE_FAULTS | 0x80 | -| NVME_SC_UNRECOVERED_READ_ERROR | 0x81 | -| NVME_SC_GUARD_CHECK_ERROR | 0x82 | -| NVME_SC_APPLICATION_TAG_CHECK_ERROR | 0x83 | -| NVME_SC_REFERENCE_TAG_CHECK_ERROR | 0x84 | -| NVME_SC_COMPARE_FAILURE | 0x85 | -| NVME_SC_ACCESS_DENIED | 0x86 | -| NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK | 0x87 | diff --git a/docs/en/server/memory_storage/hsak/hsak_developer_guide.md b/docs/en/server/memory_storage/hsak/hsak_developer_guide.md deleted file mode 100644 index 3aa9395bc75c6e693f79546e17b4afb67da59159..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/hsak/hsak_developer_guide.md +++ /dev/null @@ -1,47 +0,0 @@ -# HSAK Developer Guide - -## Overview - -As the performance of storage media such as NVMe SSDs and SCMs is continuously improved, the latency overhead of the media layer in the I/O stack continuously reduces, and the overhead of the software stack becomes a bottleneck. Therefore, the kernel I/O data plane needs to be reconstructed to reduce the overhead of the software stack. HSAK provides a high-bandwidth and low-latency I/O software stack for new storage media, which reduces the overhead by more than 50% compared with the traditional I/O software stack. -The HSAK user-mode I/O engine is developed based on the open-source SPDK. - -1. A unified interface is provided for external systems to shield the differences between open-source interfaces. -2. Enhanced I/O data plane features are added, such as DIF, drive formatting, batch I/O delivery, trim, and dynamic drive addition and deletion. -3. Features such as drive device management, drive I/O monitoring, and maintenance and test tools are provided. - -## Compilation Tutorial - -1. Download the HSAK source code. - - ```shell - git clone - ``` - -2. Install the compilation and running dependencies. - - The compilation and running of HSAK depend on components such as Storage Performance Development Kit (SPDK), Data Plane Development Kit (DPDK), and libboundscheck. - -3. Start the compilation. - - ```shell - cd hsak - mkdir build - cd build - cmake .. - make - ``` - -## Precautions - -### Constraints - -- A maximum of 512 NVMe devices can be used and managed on the same machine. -- When HSAK is enabled to execute I/O-related services, ensure that the system has at least 500 MB continuous idle huge page memory. -- Before enabling the user-mode I/O component to execute services, ensure that the drive management component (Ublock) has been enabled. -- When the drive management component (Ublock) is enabled to execute services, ensure that the system has sufficient continuous idle memory. Each time the Ublock component is initialized, 20 MB huge page memory is allocated. -- Before HSAK is run, **setup.sh** is called to configure huge page memory and unbind the kernel-mode driver of the NVMe device. -- Other interfaces provided by the HSAK module can be used only after libstorage_init_module is successfully executed. Each process can call libstorage_init_module only once. -- After the libstorage_exit_module function is executed, other interfaces provided by HSAK cannot be used. In multi-thread scenarios, exit HSAK after all threads end. 
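The initialization and exit constraints above can be summarized in a minimal per-process skeleton. The sketch below is illustrative only, not HSAK reference code: the prototypes are restated from the API descriptions in this document, while the configuration file path and the device name are assumptions that must match the local **nvme.conf.in**.

```c
/*
 * Minimal per-process lifecycle sketch for the constraints above (not HSAK
 * reference code). The prototypes are restated from this document; in a real
 * build, include the HSAK development header instead of re-declaring them.
 * The configuration file path and device name are assumptions.
 */
#include <stdint.h>
#include <stdio.h>

int32_t libstorage_init_module(const char *cfgfile);
int32_t libstorage_exit_module(void);
int32_t libstorage_open(const char *devfullname);
int32_t libstorage_close(int32_t fd);

int main(void)
{
    /* Call once per process; every other HSAK API requires a successful init. */
    if (libstorage_init_module("/etc/spdk/nvme.conf.in") != 0) {
        fprintf(stderr, "HSAK initialization failed\n");
        return 1;
    }

    int32_t fd = libstorage_open("nvme0n1");  /* returns > 0 on success */
    if (fd > 0) {
        /* ... deliver I/O through the libstorage read/write APIs here ... */
        libstorage_close(fd);
    }

    /* Exit only after all threads have stopped using HSAK interfaces;
     * no HSAK API may be called after this point. */
    libstorage_exit_module();
    return 0;
}
```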
-- Only one service can be started for the HSAK Ublock component on a server and supports concurrent access of a maximum of 64 Ublock clients. The Ublock server can process a maximum of 20 client requests per second. -- The HSAK Ublock component must be started earlier than the data plane I/O component and Ublock clients. The command line tool provided by HSAK can be executed only after the Ublock server is started. -- Do not register the function for processing the SIGBUS signal. SPDK has an independent processing function for the signal. If the processing function is overwritten, the registered signal processing function becomes invalid and a core dump occurs. diff --git a/docs/en/server/memory_storage/hsak/hsak_tool_usage.md b/docs/en/server/memory_storage/hsak/hsak_tool_usage.md deleted file mode 100644 index 342c01a55218a6290565e29f403e1249bb3120f6..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/hsak/hsak_tool_usage.md +++ /dev/null @@ -1,123 +0,0 @@ -# Command-Line Interface - -## Command for Querying Drive Information - -### Format - -```shell -libstorage-list [] [] -``` - -### Parameters - -- *commands*: Only **help** is available. **libstorage-list help** is used to display the help information. - -- *device*: specifies the PCI address. The format is **0000:09:00.0**. Multiple PCI addresses are allowed and separated by spaces. If no specific PCI address is set, the command line lists all enumerated device information. - -### Precautions - -- The fault injection function applies only to development, debugging, and test scenarios. Do not use this function on live networks. Otherwise, service and security risks may occur. - -- Before running this command, ensure that the management component (Ublock) server has been started, and the user-mode I/O component (UIO) has not been started or has been correctly started. - -- Drives that are not occupied by the Ublock or UIO component will be occupied during the command execution. If the Ublock or UIO component attempts to obtain the drive control permission, a storage device access conflict may occur. As a result, the command execution fails. - -## Command for Switching Drivers for Drives - -### Format - -```shell -libstorage-shutdown reset [ ...] -``` - -### Parameters - -- **reset**: switches the UIO driver to the kernel-mode driver for a specific drive. - -- *device*: specifies the PCI address, for example, **0000:09:00.0**. Multiple PCI addresses are allowed and separated by spaces. - -### Precautions - -- The **libstorage-shutdown reset** command is used to switch a drive from the user-mode UIO driver to the kernel-mode NVMe driver. - -- Before running this command, ensure that the Ublock server has been started, and the UIO component has not been started or has been correctly started. - -- The **libstoage-shutdown reset** command is risky. Before switching to the NVMe driver, ensure that the user-mode instance has stopped delivering I/Os to the NVMe device, all FDs on the NVMe device have been disabled, and the instance that accesses the NVMe device has exited. - -## Command for Obtaining I/O Statistics - -### Format - -```shell -libstorage-iostat [-t ] [-i ] [-d ] -``` - -### Parameters - -- -**t**: interval, in seconds. The value ranges from 1 to 3600. This parameter is of the int type. If the input parameter value exceeds the upper limit of the int type, the value is truncated to a negative or positive number. - -- -**i**: number of collection times. 
The minimum value is **1** and the maximum value is *MAX_INT*. If this parameter is not set, information is collected at an interval by default. This parameter is of the int type. If the input parameter value exceeds the upper limit of the int type, the value is truncated to a negative or positive number. - -- -**d**: name of a block device (for example, **nvme0n1**, which depends on the controller name configured in **/etc/spdk/nvme.conf.in**). You can use this parameter to collect performance data of one or more specified devices. If this parameter is not set, performance data of all detected devices is collected. - -### Precautions - -- The I/O statistics configuration is enabled. - -- The process has delivered I/O operations to the drive whose performance information needs to be queried through the UIO component. - -- If no device in the current environment is occupied by service processes to deliver I/Os, the command exits after the message "You cannot get iostat info for nvme device no deliver io" is displayed. - -- When multiple queues are enabled on a drive, the I/O statistics tool summarizes the performance data of multiple queues on the drive and outputs the data in a unified manner. - -- The I/O statistics tool supports data records of a maximum of 8192 drive queues. - -- The I/O statistics are as follows: - - | Device | r/s | w/s | rKB/s | wKB/s | avgrq-sz | avgqu-sz | r_await | w_await | await | svctm | util% | poll-n | - | ----------- | ------------------------------ | ------------------------------- | ----------------------------------- | ------------------------------------ | -------------------------------------- | -------------------------- | --------------------- | ---------------------- | ------------------------------- | --------------------------------------- | ------------------ | -------------------------- | - | Device name | Number of read I/Os per second | Number of write I/Os per second | Number of read I/O bytes per second | Number of write I/O bytes per second | Average size of delivered I/Os (bytes) | I/O depth of a drive queue | I/O read latency (μs) | I/O write latency (μs) | Average read/write latency (μs) | Processing latency of a single I/O (μs) | Device utilization | Number of polling timeouts | - -## Commands for Drive Read/Write Operations - -### Format - -```shell -libstorage-rw [OPTIONS...] -``` - -### Parameters - -1. **COMMAND** parameters - - - **read**: reads a specified logical block from the device to the data buffer (standard output by default). - - - **write**: writes data in a data buffer (standard input by default) to a specified logical block of the NVMe device. - - - **help**: displays the help information about the command line. - -2. **device**: specifies the PCI address, for example, **0000:09:00.0**. - -3. **OPTIONS** parameters - - - **--start-block, -s**: indicates the 64-bit start address of the logical block to be read or written. The default value is **0**. - - - **--block-count, -c**: indicates the number of the logical blocks to be read or written (counted from 0). - - - **--data-size, -z**: indicates the number of bytes of the data to be read or written. - - - **--namespace-id, -n**: indicates the namespace ID of the device. The default value is **1**. - - - **--data, -d**: indicates the data file used for read and write operations (The read data is saved during read operations and the written data is provided during write operations.) 
- - - **--limited-retry, -l**: indicates that the device controller restarts for a limited number of times to complete device read and write operations. - - - **--force-unit-access, -f**: ensures that read and write operations are completed from the nonvolatile media before the instruction is completed. - - - **--show-command, -v**: displays instruction information before sending a read/write command. - - - **--dry-run, -w**: displays only information about read and write instructions but does not perform actual read and write operations. - - - **--latency. -t**: collects statistics on the end-to-end read and write latency of the CLI. - - - **--help, -h**: displays the help information about related commands. diff --git a/docs/en/server/memory_storage/lvm/_toc.yaml b/docs/en/server/memory_storage/lvm/_toc.yaml deleted file mode 100644 index 485edd8b46df64bd312ff44f85b817b35b676552..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/lvm/_toc.yaml +++ /dev/null @@ -1,6 +0,0 @@ -label: Logical Volume Configuration and Management -isManual: true -description: Use LVM to manage drives. -sections: - - label: Managing Drives Through LVM - href: ./managing_drives_through_lvm.md diff --git a/docs/en/server/memory_storage/lvm/managing_drives_through_lvm.md b/docs/en/server/memory_storage/lvm/managing_drives_through_lvm.md deleted file mode 100644 index 1281426851d143c5c122a56890ba3a448d83afe0..0000000000000000000000000000000000000000 --- a/docs/en/server/memory_storage/lvm/managing_drives_through_lvm.md +++ /dev/null @@ -1,575 +0,0 @@ -# Managing Drives Through LVM - - - -- [Managing Drives Through LVM](#managing-drives-through-lvm) - - [LVM Overview](#lvm-overview) - - [Basic Terms](#basic-terms) - - [Installing the LVM](#installing-the-lvm) - - [Managing PVs](#managing-pvs) - - [Creating a PV](#creating-a-pv) - - [Viewing a PV](#viewing-a-pv) - - [Modifying PV Attributes](#modifying-pv-attributes) - - [Deleting a PV](#deleting-a-pv) - - [Managing VGs](#managing-vgs) - - [Creating a VG](#creating-a-vg) - - [Viewing a VG](#viewing-a-vg) - - [Modifying VG Attributes](#modifying-vg-attributes) - - [Extending a VG](#extending-a-vg) - - [Shrinking a VG](#shrinking-a-vg) - - [Deleting a VG](#deleting-a-vg) - - [Managing LVs](#managing-lvs) - - [Creating an LV](#creating-an-lv) - - [Viewing an LV](#viewing-an-lv) - - [Adjusting the LV Size](#adjusting-the-lv-size) - - [Extending an LV](#extending-an-lv) - - [Shrinking an LV](#shrinking-an-lv) - - [Deleting an LV](#deleting-an-lv) - - [Creating and Mounting a File System](#creating-and-mounting-a-file-system) - - [Creating a File System](#creating-a-file-system) - - [Manually Mounting a File System](#manually-mounting-a-file-system) - - [Automatically Mounting a File System](#automatically-mounting-a-file-system) - - - -## LVM Overview - -Logical Volume Manager \(LVM\) is a mechanism used for managing drive partitions in Linux. By adding a logical layer between drives and file systems, LVM shields the drive partition layout for file systems, thereby improving flexibility in managing drive partitions. - -The procedure of managing a drive through LVM is as follows: - -1. Create physical volumes for a drive. -2. Combine several physical volumes into a volume group. -3. Create logical volumes in the volume group. -4. Create file systems on logical volumes. - -When drives are managed using LVM, file systems are distributed on multiple drives and can be easily resized as needed. 
Therefore, file system space will no longer be limited by drive capacities. - -### Basic Terms - -- Physical media: refers to physical storage devices in the system, such as drives \(**/dev/hda** and **/dev/sda**\). It is the storage unit at the lowest layer of the storage system. - -- Physical volume \(PV\): refers to a drive partition or device \(such as a RAID\) that has the same logical functions as a drive partition. PVs are basic logical storage blocks of LVM. A PV contains a special label that is stored in the second 512-byte sector by default. It can also be stored in one of the first four sectors. A label contains the universal unique identifier \(UUID\) of the PV, size of the block device, and the storage location of LVM metadata in the device. - -- Volume group \(VG\): consists of PVs and shields the details of underlying PVs. You can create one or more logical volumes within a VG without considering detailed PV information. - -- Logical volume \(LV\): A VG cannot be used directly. It can be used only after being partitioned into LVs. LVs can be formatted into different file systems and can be directly used after being mounted. - -- Physical extent \(PE\): A PE is a small storage unit in a PV. The PE size is the same as the size of the logical extent in the VG. - -- Logical extent \(LE\): An LE is a small storage unit in an LV. In one VG, the LEs of all the LVs have the same size. - -## Installing the LVM - -> [!NOTE]NOTE -> The LVM has been installed on the openEuler OS by default. You can run the **rpm -qa | grep lvm2** command to check whether it is installed. If the command output contains "lvm2", the LVM has been installed. In this case, skip this section. If no information is output, the LVM is not installed. Install it by referring to this section. - -1. Configure the local yum source. For details, see [Configuring the Repo Server](../../administration/administrator/configuring-the-repo-server.md). -2. Clear the cache. - - ```bash - dnf clean all - ``` - -3. Create a cache. - - ```bash - dnf makecache - ``` - -4. Install the LVM as the **root** user. - - ```bash - dnf install lvm2 - ``` - -5. Check the installed RPM package. - - ```bash - rpm -qa | grep lvm2 - ``` - -## Managing PVs - -### Creating a PV - -Run the **pvcreate** command as the **root** user to create a PV. - -```bash -pvcreate [option] devname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-f**: forcibly creates a PV without user confirmation. - - **-u**: specifies the UUID of the device. - - **-y**: answers yes to all questions. - -- _devname_: specifies the name of the device corresponding to the PV to be created. If multiple PVs need to be created in batches, set this option to multiple device names and separate the names with spaces. - -Example 1: Create PVs based on **/dev/sdb** and **/dev/sdc**. - -```bash -pvcreate /dev/sdb /dev/sdc -``` - -Example 2: Create PVs based on **/dev/sdb1** and **/dev/sdb2**. - -```bash -pvcreate /dev/sdb1 /dev/sdb2 -``` - -### Viewing a PV - -Run the **pvdisplay** command as the **root** user to view PV information, including PV name, VG to which the PV belongs, PV size, PE size, total number of PEs, number of available PEs, number of allocated PEs, and UUID. - -```bash -pvdisplay [option] devname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-s**: outputs information in short format. 
- - **-m**: displays the mapping from PEs to LEs. - -- _devname_: indicates the device corresponding to the PV to be viewed. If no PVs are specified, information about all PVs is displayed. - -Example: Run the following command to display the basic information about the PV **/dev/sdb**: - -```bash -pvdisplay /dev/sdb -``` - -### Modifying PV Attributes - -Run the **pvchange** command as the **root** user to modify the attributes of a PV. - -```bash -pvchange [option] pvname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-u**: generates a new UUID. - - **-x**: indicates whether PE allocation is allowed. - -- _pvname_: specifies the name of the device corresponding to the PV to be modified. If multiple PVs need to be modified in batches, set this option to multiple device names and separate the names with spaces. - -Example: Run the following command to prohibit PEs on the PV **/dev/sdb** from being allocated. Running `pvdisplay` for a PV that is not added to a VG will return the **Allocatable** attribute with the value **NO**. You need to add the PV to a VG before you can change the attribute. - -```bash -pvchange -x n /dev/sdb -``` - -### Deleting a PV - -Run the **pvremove** command as the **root** user to delete a PV. - -```bash -pvremove [option] pvname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-f**: forcibly deletes a PV without user confirmation. - - **-y**: answers yes to all questions. - -- _pvname_: specifies the name of the device corresponding to the PV to be deleted. If multiple PVs need to be deleted in batches, set this option to multiple device names and separate the names with spaces. - -Example: Run the following command to delete the PV **/dev/sdb**. If the PV has been added to a VG, you need to delete the VG or remove the PV from the VG in advance. - -```bash -pvremove /dev/sdb -``` - -## Managing VGs - -### Creating a VG - -Run the **vgcreate** command as the **root** user to create a VG. - -```bash -vgcreate [option] vgname pvname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-l**: specifies the maximum number of LVs that can be created on the VG. - - **-p**: specifies the maximum number of PVs that can be added to the VG. - - **-s**: specifies the PE size of a PV in the VG. - -- _vgname_: name of the VG to be created. -- _pvname_: name of the PV to be added to the VG. - -Example: Run the following command to create VG **vg1** and add the PVs **/dev/sdb** and **/dev/sdc** to the VG. - -```bash -vgcreate vg1 /dev/sdb /dev/sdc -``` - -### Viewing a VG - -Run the **vgdisplay** command as the **root** user to view VG information. - -```bash -vgdisplay [option] [vgname] -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-s**: outputs information in short format. - - **-A**: displays only attributes of active VGs. - -- _vgname_: name of the VG to be viewed. If no VGs are specified, information about all VGs is displayed. - -Example: Run the following command to display the basic information about VG **vg1**: - -```bash -vgdisplay vg1 -``` - -### Modifying VG Attributes - -Run the **vgchange** command as the **root** user to modify the attributes of a VG. 
- -```bash -vgchange [option] vgname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-a**: sets the active status of the VG. - -- _vgname_: name of the VG whose attributes are to be modified. - -Example: Run the following command to change the status of **vg1** to active. - -```bash -vgchange -ay vg1 -``` - -### Extending a VG - -Run the **vgextend** command as the **root** user to dynamically extend a VG. In this way, the VG size is extended by adding PVs to the VG. - -```bash -vgextend [option] vgname pvname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **dev**: debugging mode. - - **-t**: test only. - -- _vgname_: name of the VG whose size is to be extended. -- _pvname_: name of the PV to be added to the VG. - -Example: Run the following command to add PV **/dev/sdb** to VG **vg1**: - -```bash -vgextend vg1 /dev/sdb -``` - -### Shrinking a VG - -Run the **vgreduce** command as the **root** user to delete PVs from a VG to reduce the VG size. A VG must contain at least one PV. - -```bash -vgreduce [option] vgname pvname ... -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-a**: If no PVs are specified in the command, all empty PVs are deleted. - - **\-\-removemissing**: deletes lost PVs in the VG to restore the VG to the normal state. - -- _vgname_: name of the VG to be shrunk. -- _pvname_: name of the PV to be deleted from the VG. - -Example: Run the following command to remove PV **/dev/sdb2** from VG **vg1**: - -```bash -vgreduce vg1 /dev/sdb2 -``` - -### Deleting a VG - -Run the **vgremove** command as the **root** user to delete a VG. - -```bash -vgremove [option] vgname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-f**: forcibly deletes a VG without user confirmation. - -- _vgname_: name of the VG to be deleted. - -Example: Run the following command to delete VG **vg1**. - -```bash -vgremove vg1 -``` - -## Managing LVs - -### Creating an LV - -Run the **lvcreate** command as the **root** user to create an LV. - -```bash -lvcreate [option] vgname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-L**: specifies the size of the LV in kKmMgGtT. - - **-l**: specifies the size of the LV \(number of LEs\). - - **-n**: specifies the name of the LV to be created. - - **-s**: creates a snapshot. - -- _vgname_: name of the VG to be created. - -Example 1: Run the following command to create a 10 GB LV in VG **vg1**. The default LV name is **lvo10**. - -```bash -lvcreate -L 10G vg1 -``` - -Example 2: Run the following command to create a 200 MB LV in VG **vg1** and name the LV **lv1**. - -```bash -lvcreate -L 200M -n lv1 vg1 -``` - -### Viewing an LV - -Run the **lvdisplay** command as the **root** user to view the LV information, including the size of the LV, its read and write status, and snapshot information. - -```bash -lvdisplay [option] [lvname] -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-v**: displays the mapping from LEs to PEs. - -- _lvname_: device file corresponding to the LV whose attributes are to be displayed. If this option is not set, attributes of all LVs are displayed. 
- - > [!NOTE]NOTE - > Device files corresponding to LVs are stored in the VG directory. For example, if LV **lv1** is created in VG **vg1**, the device file corresponding to **lv1** is **/dev/vg1/lv1**. - -Example: Run the following command to display the basic information about LV **lv1**: - -```bash -lvdisplay /dev/vg1/lv1 -``` - -### Adjusting the LV Size - -Run the **lvresize** command as the **root** user to increase or reduce the size of an LVM LV. This may cause data loss. Therefore, exercise caution when running this command. - -```bash -lvresize [option] vgname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-L**: specifies the size of the LV in kKmMgGtT. - - **-l**: specifies the size of the LV \(number of LEs\). - - **-f**: forcibly adjusts the size of the LV without user confirmation. - -- _lvname_: name of the LV to be adjusted. - -Example 1: Run the following command to increase the size of LV **/dev/vg1/lv1** by 200 MB. - -```bash -lvresize -L +200 /dev/vg1/lv1 -``` - -Example 2: Run the following command to reduce the size of LV **/dev/vg1/lv1** by 200 MB. - -```bash -lvresize -L -200 /dev/vg1/lv1 -``` - -### Extending an LV - -Run the **lvextend** command as the **root** user to dynamically extend the size of an LV online without interrupting the access of applications to the LV. - -```bash -lvextend [option] lvname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-L**: specifies the size of the LV in kKmMgGtT. - - **-l**: specifies the size of the LV \(number of LEs\). - - **-f**: forcibly adjusts the size of the LV without user confirmation. - -- _lvname_: device file of the LV whose size is to be extended. - -Example: Run the following command to increase the size of LV **/dev/vg1/lv1** by 100 MB. - -```bash -lvextend -L +100M /dev/vg1/lv1 -``` - -### Shrinking an LV - -Run the **lvreduce** command as the **root** user to reduce the size of an LV. This may delete existing data on the LV. Therefore, confirm whether the data can be deleted before running the command. - -```bash -lvreduce [option] lvname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-L**: specifies the size of the LV in kKmMgGtT. - - **-l**: specifies the size of the LV \(number of LEs\). - - **-f**: forcibly adjusts the size of the LV without user confirmation. - -- _lvname_: device file of the LV whose size is to be extended. - -Example: Run the following command to reduce the space of LV **/dev/vg1/lvl** by 100 MB: - -```bash -lvreduce -L -100M /dev/vg1/lv1 -``` - -### Deleting an LV - -Run the **lvremove** command as the **root** user to delete an LV. If the LV has been mounted by running the **mount** command, you need to run the **umount** command to unmount the LV before running the **lvremove** command. - -```bash -lvremove [option] lvname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-f**: forcibly deletes an LV without user confirmation. - -- _lvname_: device name of the LV to be deleted. - -Example: Run the following command to delete LV **/dev/vg1/lv1**. - -```bash -lvremove /dev/vg1/lv1 -``` - -## Creating and Mounting a File System - -After creating an LV, you need to create a file system on the LV and mount the file system to the corresponding directory. 
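-Putting the preceding PV, VG, and LV commands together with the file system steps described in the following subsections, a minimal end-to-end sketch is shown below. It reuses the example names from this document (**/dev/sdb**, **/dev/sdc**, VG **vg1**, LV **lv1**, and mount point **/mnt/data**); the **mkdir** step is added here only to ensure the mount point exists, and all commands are run as the **root** user.
-
-```bash
-# Create PVs on two spare drives and combine them into VG vg1.
-pvcreate /dev/sdb /dev/sdc
-vgcreate vg1 /dev/sdb /dev/sdc
-# Create a 200 MB LV named lv1 in vg1.
-lvcreate -L 200M -n lv1 vg1
-# Create an ext4 file system on the LV and mount it (see the following subsections).
-mkfs -t ext4 /dev/vg1/lv1
-mkdir -p /mnt/data
-mount /dev/vg1/lv1 /mnt/data
-```
-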
- -### Creating a File System - -Run the **mkfs** command as the **root** user to create a file system. - -```bash -mkfs [option] lvname -``` - -In the preceding information: - -- _option_: command parameter options. Common parameter options are as follows: - - **-t**: specifies the type of the Linux file system to be created, such as **ext2**, **ext3**, and **ext4**. The default type is **ext2**. - -- _lvname_: name of the LV device file corresponding to the file system to be created. - -Example: Run the following command to create the **ext4** file system on LV **/dev/vg1/lv1**: - -```bash -mkfs -t ext4 /dev/vg1/lv1 -``` - -### Manually Mounting a File System - -The file system that is manually mounted is not valid permanently. It does not exist after the OS is restarted. - -Run the **mount** command as the **root** user to mount a file system. - -```bash -mount lvname mntpath -``` - -In the preceding information: - -- _lvname_: name of the LV device file corresponding to the file system to be mounted. -- _mntpath_: mount path. - -Example: Run the following command to mount LV **/dev/vg1/lv1** to the directory **/mnt/data**. - -```bash -mount /dev/vg1/lv1 /mnt/data -``` - -### Automatically Mounting a File System - -A file system that is automatically mounted does not exist after the OS is restarted. You need to manually mount the file system again. If you perform the following steps as the **root** user after manually mounting the file system, the file system can be automatically mounted after the OS is restarted. - -1. Run the **blkid** command to query the UUID of an LV. The following uses LV **/dev/vg1/lv1** as an example: - - ```bash - blkid /dev/vg1/lv1 - ``` - - Check the command output. It contains the following information in which _uuidnumber_ is a string of digits, indicating the UUID, and _fstype_ indicates the file system type. - - /dev/vg1/lv1: UUID=" _uuidnumber_ " TYPE=" _fstype_ " - -2. Run the **vi /etc/fstab** command to edit the **fstab** file and add the following content to the end of the file: - - ```vim - UUID=uuidnumber mntpath fstype defaults 0 0 - ``` - - In the preceding information: - - - Column 1: indicates the UUID. Enter _uuidnumber_ obtained in [1](#li65701520154311). - - Column 2: indicates the mount directory of the file system. Replace _mntpath_ with the actual value. - - Column 3: indicates the file system format. Enter _fstype_ obtained in [1](#li65701520154311). - - Column 4: indicates the mount option. In this example, **defaults** is used. - - Column 5: indicates the backup option. Enter either **1** \(the system automatically backs up the file system\) or **0** \(the system does not back up the file system\). In this example, **0** is used. - - Column 6: indicates the scanning option. Enter either **1** \(the system automatically scans the file system during startup\) or **0** \(the system does not scan the file system\). In this example, **0** is used. - -3. Verify the automatic mounting function. - 1. Run the **umount** command to unmount the file system. The following uses LV **/dev/vg1/lv1** as an example: - - ```bash - umount /dev/vg1/lv1 - ``` - - 2. Run the following command to reload all content in the **/etc/fstab** file: - - ```bash - mount -a - ``` - - 3. Run the following command to query the file system mounting information \(**/mnt/data** is used as an example\): - - ```bash - mount | grep /mnt/data - ``` - - Check the command output. 
If the command output contains the following information, the automatic mounting function takes effect: - - ```text - /dev/vg1/lv1 on /mnt/data - ``` diff --git a/docs/en/server/memory_storage/lvm/public_sys-resources/icon-note.gif b/docs/en/server/memory_storage/lvm/public_sys-resources/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/en/server/memory_storage/lvm/public_sys-resources/icon-note.gif and /dev/null differ diff --git a/docs/en/server/performance/cpu_optimization/sysboost/_toc.yaml b/docs/en/server/performance/cpu_optimization/sysboost/_toc.yaml deleted file mode 100644 index e04318f2ed58460bca3651551e02ddfb8ec73bd0..0000000000000000000000000000000000000000 --- a/docs/en/server/performance/cpu_optimization/sysboost/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: sysBoost User Guide -isManual: true -description: Enhance code compatibility with the CPU microarchitecture of the execution environment to boost program performance. -sections: - - label: Getting to Know sysBoost - href: ./getting_to_know_sysboost.md - - label: Installation and Deployment - href: ./installation_and_deployment.md - - label: Usage Instructions - href: ./usage_instructions.md diff --git a/docs/en/server/performance/cpu_optimization/sysboost/figures/architecture.png b/docs/en/server/performance/cpu_optimization/sysboost/figures/architecture.png deleted file mode 100644 index c4d805fe2caf7fe1f2ae38a37b22c39e1e002c6b..0000000000000000000000000000000000000000 Binary files a/docs/en/server/performance/cpu_optimization/sysboost/figures/architecture.png and /dev/null differ diff --git a/docs/en/server/performance/cpu_optimization/sysboost/figures/icon-note.gif b/docs/en/server/performance/cpu_optimization/sysboost/figures/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/en/server/performance/cpu_optimization/sysboost/figures/icon-note.gif and /dev/null differ diff --git a/docs/en/server/performance/cpu_optimization/sysboost/getting_to_know_sysboost.md b/docs/en/server/performance/cpu_optimization/sysboost/getting_to_know_sysboost.md deleted file mode 100644 index 6666c961d15ce0a5dfac833b6e6a61324db22799..0000000000000000000000000000000000000000 --- a/docs/en/server/performance/cpu_optimization/sysboost/getting_to_know_sysboost.md +++ /dev/null @@ -1,61 +0,0 @@ -# Getting to Know sysBoost - -## Introduction - -sysBoost reorders the code of executable files and dynamic libraries online to adapt the code for the CPU microarchitecture of the operating environment, boosting program performance. - -## Background - -- Large-scale applications use a large number of third-party or self-developed dynamic libraries. A large number of PLT jumps occur during function invoking. As a result, the instructions per cycle (IPC) decreases. - -- Assembly code is large in size and occupies a large amount of memory, resulting in a high iTLB miss rate. Hotspot code segments are scattered. As a result, the I-cache miss rate is high, affecting the CPU pipeline efficiency. - -- Application developers are unfamiliar with the OS and CPU microarchitecture, resulting in high IPC performance optimization costs. - -## sysBoost Design - -### Key technologies - -- Dynamic library merging: Scattered code segments and data segments are merged when the dynamic loader loads dynamic libraries. Huge page memory is used to improve the iTLB hit ratio. 
- -- PLT jump elimination: When the application code calls a dynamic library function, the execution is redirected to the PLT and then to the actual function. Eliminating PLT jump can improve the IPC. - -- Online reordering of hotspot code segments: By default, code is arranged by dynamic library. The online reordering technology can reorder hotspot code by segment. - -- exec native huge page mechanism: The user-mode huge page mechanism requires specific application configuration and recompilation. The exec native huge page mechanism directly uses huge page memory when the kernel loads the ELF file,without the need for modifying applications. - -### Architecture - -**Figure 1** sysBoost architecture - -![](./figures/architecture.png) - -## sysBoost Features - -- Full static merging: Applications and their dependent dynamic libraries are merged into one binary file, and segment-level reordering is performed. Multiple discrete code segments or data segments are merged into one to improve application performance. - -- Automatic binary file optimization: The sysBoost daemon reads the configuration file to obtain the binary files to be optimized and the corresponding optimization methods, optimizes the binary files based on user requirements, and stores the optimized binary files in RTO files. - -- Huge page preloading of binary code segments/data segments: When the user-mode page table is mapped to the physical memory, huge page (2 MB) mapping can improve performance. However, openEuler does not support huge page mapping of file pages. sysBoost provides the huge page pre-loading function. After binary optimization is complete, sysBoost immediately loads the content to the kernel as a huge page. When an application is started, sysBoost maps the pre-loaded content to the user-mode page table in batches to reduce page faults and memory access delay of the application, thereby improving the application startup speed and running efficiency. - -- Binary exception monitoring: If a bug occurs in the RTO binary file generated by sysBoost, the application may crash. To avoid repeated application starts and crashes and prevent the fault from spreading, sysBoost monitors the processes that load the RTO binary files. If such a process crashes, sysBoost rolls back the optimization by deleting the RTO file and the flag on the original application file. In addition, sysBoost renames the configuration file to prevent optimization from being applied again after the sysBoost service is restarted. - -## Benefits - -### Scenario 1 - -In the Bash test of UnixBench, some common commands and scripts are executed, such as `ls`, `grep`, and `awk`. These commands and scripts usually invoke some system libraries, such as **libc** and **libpthread**. These library files usually need to be dynamically linked, which increases the program startup time and delay. By using the binary file merging technology, these library files can be merged into an executable file, significantly improving the Bash performance and increasing the UnixBench score. - -### Scenario 2 - -The dynamic assembly design of some applications uses a large number of dynamic libraries, which brings the following problems: - -- Indirect function jump and scattered code segments affect CPU execution efficiency. -- The parsing of excessive dynamic library symbols slows down program startup. -- Profile-guided optimization based on a specific service model cannot adapt to different service models. 
- -Using sysBoost to start large processes during service deployment can effectively solve the preceding problems. - -- The exec huge page mechanism allows the loaded large processes to store code segments and data segments in memory huge pages, reducing the TLB miss rate. -- A large process contains all dynamic library code and application code, eliminating indirect function jumps. -- Service changes are intelligently identified online to regenerate large processes based on appropriate hotspot models. diff --git a/docs/en/server/performance/cpu_optimization/sysboost/installation_and_deployment.md b/docs/en/server/performance/cpu_optimization/sysboost/installation_and_deployment.md deleted file mode 100644 index 43979e5020e9f7ce2e36f4cecdcf87225c1fb6b5..0000000000000000000000000000000000000000 --- a/docs/en/server/performance/cpu_optimization/sysboost/installation_and_deployment.md +++ /dev/null @@ -1,68 +0,0 @@ -# Installation and Deployment - -## Software and Hardware Requirements - -- Hardware: Kunpeng 920 server - -- Software: openEuler 23.09 - -## Environment Preparation - -- Install the openEuler OS. - -- Obtain root permissions. - -## sysBoost Installation - -To install the sysBoost, perform the following steps (**xxx** in the commands indicate version numbers): - -1. Mount an openEuler ISO image. - - ```sh - # Use the ISO file of the corresponding openEuler version. - mount openEuler-xxx-aarch64-dvd.iso /mnt - ``` - -2. Configure a local Yum source. - - ```sh - vim /etc/yum.repos.d/local.repo - ``` - - The configured contents are as follows: - - ```text - [localosrepo] - name=localosrepo - baseurl=file:///mnt - enabled=1 - gpgcheck=1 - gpgkey=file:///mnt/RPM-GPG-KEY-openEuler - ``` - -3. Install sysBoost. - - ```sh - yum install sysboost -y - ``` - -4. Check whether the installation is successful. - - ```text - # rpm -qa | grep sysboost - sysboost-xxx - # rpm -qa | grep native-turbo - native-turbo-xxx - ``` - - If the preceding information is displayed, the installation is successful. - -5. Install the relocation packages required for combining the ELF files. - - ```sh - yum install bash-relocation-xxx -y - yum install ncurses-relocation-xxx -y - ``` - - > ![](./figures/icon-note.gif) **Note:** - > If the ELF files and their dependency libraries contain the relocation segment, skip this step. diff --git a/docs/en/server/performance/cpu_optimization/sysboost/usage_instructions.md b/docs/en/server/performance/cpu_optimization/sysboost/usage_instructions.md deleted file mode 100644 index 1f6ef3887f4167f7d6e5bd7adb5de29204523442..0000000000000000000000000000000000000000 --- a/docs/en/server/performance/cpu_optimization/sysboost/usage_instructions.md +++ /dev/null @@ -1,94 +0,0 @@ -# Usage Instructions - -## Overview - -- Root permissions are required for using and configuring sysBoost. -- Only one sysBoost instance can exist. -- The system administrator must ensure the configuration file is correct. - -## Configuration - -### Configuration File Description - -Configuration file directory: **/etc/sysboost.d/** - -**Table 1** Client YAML configuration file - - - - - - - - - - - - - - - - - - - - - - - - - -

-| Item | Description | Type | Value Range |
-| -------- | ----------- | ------ | ----------- |
-| elf_path | ELF file to be combined | String | ELF path supported by sysBoost |
-| mode | sysBoost running mode | String | "static" |
-| libs | Dependency library of the ELF file specified by elf_path. This is optional because sysBoost can automatically detect dependency libraries. | String | Path of the dependent library of the ELF file supported by sysBoost |
- -### Configuration Example - -sysBoost TOML configuration file example: - -```text -# /etc/sysboost.d/bash.toml -elf_path = "/usr/bin/bash" -mode = "static-nolibc" -libs = ["/usr/lib64/libtinfo.so.6"] -``` - -## Usage - -- Start sysBoost. - - ```text - systemctl start sysboost.service - ``` - -- Stop sysBoost. - - ```text - systemctl stop sysboost.service - ``` - -- Query sysBoost status. If there is no text in red, sysBoost is running normally. - - ```text - systemctl status sysboost.service - ``` - -- View logs. If sysBoost fails, see the system logs for details. - - ```text - cat /var/log/messages - ``` diff --git a/docs/zh/server/_toc.yaml b/docs/zh/server/_toc.yaml index 4d77136eb9f807579b72725a7c56cdbbe5e73324..c0d1da4e6bdcceda701d910eb0e0e188f7c789c1 100644 --- a/docs/zh/server/_toc.yaml +++ b/docs/zh/server/_toc.yaml @@ -38,10 +38,18 @@ sections: - href: ./security/shangmi/_toc.yaml - label: 内存与存储 sections: - - href: ./memory_storage/lvm/_toc.yaml - - href: ./memory_storage/etmem/_toc.yaml - - href: ./memory_storage/gmem/_toc.yaml - - href: ./memory_storage/hsak/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/zh/lvm/_toc.yaml + path: ./lvm + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/zh/etmem/_toc.yaml + path: ./etmem + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/zh/gmem/_toc.yaml + path: ./gmem + - href: + upstream: https://gitee.com/openeuler/Storage-docs/blob/master/docs/zh/hsak/_toc.yaml + path: ./hsak - label: 网络 sections: - href: ./network/network_config/_toc.yaml @@ -56,7 +64,9 @@ sections: - href: ./performance/tuning_framework/oeaware/_toc.yaml - label: CPU调优 sections: - - href: ./performance/cpu_optimization/sysboost/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/Computing-docs/blob/master/docs/en/sysboost/_toc.yaml + path: ./sysboost - href: ./performance/cpu_optimization/kae/_toc.yaml - label: 系统调优 sections: @@ -75,5 +85,9 @@ sections: - href: ./high_availability/ha/_toc.yaml - label: 多样性算力 sections: - - href: ./diversified_computing/dpu_offload/_toc.yaml - - href: ./diversified_computing/dpu_os/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/dpu-utilities/blob/master/docs/zh/dpu_offload/_toc.yaml + path: ./dpu_offload + - href: + upstream: https://gitee.com/openeuler/dpu-utilities/blob/master/docs/zh/dpu_os/_toc.yaml + path: ./dpu_os diff --git a/docs/zh/server/memory_storage/etmem/_toc.yaml b/docs/zh/server/memory_storage/etmem/_toc.yaml deleted file mode 100644 index 71bfe6d9d44fc2fad21e6125a229463803d6f0df..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/etmem/_toc.yaml +++ /dev/null @@ -1,6 +0,0 @@ -label: etmem用户指南 -isManual: true -description: 使用内存分级扩展技术 etmem 扩展内存容量 -sections: - - label: 使用etmem - href: ./etmem_user_guide.md diff --git a/docs/zh/server/memory_storage/etmem/etmem_user_guide.md b/docs/zh/server/memory_storage/etmem/etmem_user_guide.md deleted file mode 100644 index fd2daa7ff2b469140310ea25fe48be634facf686..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/etmem/etmem_user_guide.md +++ /dev/null @@ -1,798 +0,0 @@ -# etmem用户指南 - -## 介绍 - -随着CPU算力的发展,尤其是ARM核成本的降低,内存成本和内存容量成为约束业务成本和性能的核心痛点,因此如何节省内存成本,如何扩大内存容量成为存储迫切要解决的问题。 - -etmem内存分级扩展技术,通过DRAM+内存压缩/高性能存储新介质形成多级内存存储,对内存数据进行分级,将分级后的内存冷数据从内存介质迁移到高性能存储介质中,达到内存容量扩展的目的,从而实现内存成本下降。 - -etmem软件包运行的工具主要分为etmem客户端和etmemd服务端。etmemd服务端工具,运行后常驻,其中实现了目的进程的内存冷热识别及淘汰等功能。etmem客户端工具,调用时运行一次,根据命令参数的不同,控制etmemd服务端响应不同的操作。 - -## 
编译教程 - -1. 下载etmem源码 - - ```bash - git clone https://gitee.com/openeuler/etmem.git - ``` - -2. 编译和运行依赖 - - etmem的编译和运行依赖于libboundscheck组件 - - 安装命令: - - ```bash - yum install libboundscheck - ``` - - 通过rpm包进行确认是否安装: - - ```bash - rpm -qi libboundscheck - ``` - -3. 编译 - - ```bash - cd etmem - - mkdir build - - cd build - - cmake .. - - make - ``` - -## 注意事项 - -### 运行依赖 - -etmem作为内存扩展工具,需要依赖于内核态的特性支持,为了可以识别内存访问情况和支持主动将内存写入swap分区来达到内存垂直扩展的需求,etmem在运行时需要插入`etmem_scan`和`etmem_swap`模块: - -```bash -modprobe etmem_scan -modprobe etmem_swap -``` - -### 权限限制 - -运行etmem进程需要root权限,root用户具有系统最高权限,在使用root用户进行操作时,请严格按照操作指导进行操作,避免其他操作造成系统管理及安全风险。 - -### 使用约束 - -- etmem的客户端和服务端需要在同一个服务器上部署,不支持跨服务器通信的场景。 -- etmem仅支持扫描进程名小于或等于15个字符长度的目标进程。在使用进程名时,支持的进程名有效字符为:“字母”, “数字”,特殊字符“./%-_”以及上述三种的组合,其余组合认为是非法字符。 -- 在使用AEP介质进行内存扩展的时候,依赖于系统可以正确识别AEP设备并将AEP设备初始化为`numa node`。并且配置文件中的`vm_flags`字段只能配置为`ht`。 -- 引擎私有命令仅针对对应引擎和引擎下的任务有效,比如cslide所支持的`showhostpages`和`showtaskpages`。 -- 第三方策略实现代码中,`eng_mgt_func`接口中的`fd`不能写入`0xff`和`0xfe`字。 -- 支持在一个工程内添加多个不同的第三方策略动态库,以配置文件中的`eng_name`来区分。 -- 禁止并发扫描同一个进程。 -- 未加载`etmem_scan`和`etmem_swap` ko时,禁止使用`/proc/xxx/idle_pages`和`/proc/xxx/swap_pages`文件。 -- etmem对应配置文件,其权限要求为属主为root用户,且权限为600或400,配置文件大小不超过10M。 -- etmem在进行第三方策略注入时,第三方策略的`so`权限要求为属主为root用户,且权限为500或700。 - -## 使用说明 - -### etmem配置文件 - -在运行etmem进程之前,需要管理员预先规划哪些进程需要做内存扩展,将进程信息配置到etmem配置文件中,并配置内存扫描的周期、扫描次数、内存冷热阈值等信息。 - -配置文件的示例文件在源码包中,放置在`/etc/etmem`文件路径下,按照功能划分为3个示例文件, - -```text -/etc/etmem/cslide_conf.yaml -/etc/etmem/slide_conf.yaml -/etc/etmem/thirdparty_conf.yaml -``` - -示例内容分别为: - -```sh -#slide引擎示例 -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -sysmem_threshold=50 -swapcache_high_vmark=10 -swapcache_low_vmark=6 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -swap_threshold=10g -swap_flag=yes - -#cslide引擎示例 -#cslide_conf.yaml -[engine] -name=cslide -project=test -node_pair=2,0;3,1 -hot_threshold=1 -node_mig_quota=1024 -node_hot_reserve=1024 - -[task] -project=test -engine=cslide -name=background_cslide -type=pid -name=23456 -vm_flags=ht -anon_only=no -ign_host=no - -#thirdparty引擎示例 -#thirdparty_conf.yaml -[engine] -name=thirdparty -project=test -eng_name=my_engine -libname=/usr/lib/etmem_fetch/my_engine.so -ops_name=my_engine_ops -engine_private_key=engine_private_value - -[task] -project=test -engine=my_engine -name=background_third -type=pid -value=12345 -task_private_key=task_private_value -``` - -配置文件各字段说明: - -| 配置项 | 配置项含义 | 是否必须 | 是否有参数 | 参数范围 | 示例说明 | -|-----------|---------------------|------|-------|------------|-----------------------------------------------------------------| -| [project] | project公用配置段起始标识 | 否 | 否 | NA | project参数的开头标识,表示下面的参数直到另外的[xxx]或文件结尾为止的范围内均为project section的参数 | -| name | project的名字 | 是 | 是 | 64个字以内的字符串 | 用来标识project,engine和task在配置时需要指定要挂载到的project | -| loop | 内存扫描的循环次数 | 是 | 是 | 1~120 | loop=3 //扫描3次 | -| interval | 每次内存扫描的时间间隔 | 是 | 是 | 1~1200 | interval=5 //每次扫描之间间隔5s | -| sleep | 每个内存扫描+操作的大周期之间时间间隔 | 是 | 是 | 1~1200 | sleep=10 //每次大周期之间间隔10s | -| sysmem_threshold| slide engine的配置项,系统内存换出阈值 | 否 | 是 | 0~100 | sysmem_threshold=50 //系统内存剩余量小于50%时,etmem才会触发内存换出| -| swapcache_high_wmark| slide engine的配置项,swacache可以占用系统内存的比例,高水线 | 否 | 是 | 1~100 | swapcache_high_wmark=5 //swapcache内存占用量可以为系统内存的5%,超过该比例,etmem会触发swapcache回收
注: swapcache_high_wmark需要大于swapcache_low_wmark| -| swapcache_low_wmark| slide engine的配置项,swacache可以占用系统内存的比例,低水线 | 否 | 是 | [1~swapcache_high_wmark) | swapcache_low_wmark=3 //触发swapcache回收后,系统会将swapcache内存占用量回收到低于3%| -| [engine] | engine公用配置段起始标识 | 否 | 否 | NA | engine参数的开头标识,表示下面的参数直到另外的[xxx]或文件结尾为止的范围内均为engine section的参数 | -| project | 声明所在的project | 是 | 是 | 64个字以内的字符串 | 已经存在名字为test的project,则可以写为project=test | -| engine | 声明所在的engine | 是 | 是 | slide/cslide/thirdparty | 声明使用的是slide或cslide或third party策略 | -| node_pair | cslide engine的配置项,声明系统中AEP和DRAM的node pair | engine为cslide时必须配置 | 是 | 成对配置AEP和DRAM的node号,AEP和DRAM之间用逗号隔开,没对pair之间用分号隔开 | node_pair=2,0;3,1 | -| hot_threshold | cslide engine的配置项,声明内存冷热水线的阈值 | engine为cslide时必须配置 | 是 | 大于等于0,小于等于INT_MAX的整数 | hot_threshold=3 //访问次数小于3的内存会被识别为冷内存 | -|node_mig_quota|cslide engine的配置项,流控,声明每次DRAM和AEP互相迁移时单向最大流量|engine为cslide时必须配置|是|大于等于0,小于等于INT_MAX的整数|node_mig_quota=1024 //单位为MB,AEP到DRAM或DRAM到AEP搬迁一次最大1024M| -|node_hot_reserve|cslide engine的配置项,声明DRAM中热内存的预留空间大小|engine为cslide时必须配置|是|大于等于0,小于等于INT_MAX的整数|node_hot_reserve=1024 //单位为MB,当所有虚拟机热内存大于此配置值时,热内存也会迁移到AEP中| -|eng_name|thirdparty engine的配置项,声明engine自己的名字,供task挂载|engine为thirdparty时必须配置|是|64个字以内的字符串|eng_name=my_engine //对此第三方策略engine挂载task时,task中写明engine=my_engine| -|libname|thirdparty engine的配置项,声明第三方策略的动态库的地址,绝对地址|engine为thirdparty时必须配置|是|256个字以内的字符串|libname=/user/lib/etmem_fetch/code_test/my_engine.so| -|ops_name|thirdparty engine的配置项,声明第三方策略的动态库中操作符号的名字|engine为thirdparty时必须配置|是|256个字以内的字符串|ops_name=my_engine_ops //第三方策略实现接口的结构体的名字| -|engine_private_key|thirdparty engine的配置项,预留给第三方策略自己解析私有参数的配置项,选配|否|否|根据第三方策略私有参数自行限制|根据第三方策略私有engine参数自行配置| -| [task] | task公用配置段起始标识 | 否 | 否 | NA | task参数的开头标识,表示下面的参数直到另外的[xxx]或文件结尾为止的范围内均为task section的参数 | -| project | 声明所挂的project | 是 | 是 | 64个字以内的字符串 | 已经存在名字为test的project,则可以写为project=test | -| engine | 声明所挂的engine | 是 | 是 | 64个字以内的字符串 | 所要挂载的engine的名字 | -| name | task的名字 | 是 | 是 | 64个字以内的字符串 | name=background1 //声明task的名字是backgound1 | -| type | 目标进程识别的方式 | 是 | 是 | pid/name | pid代表通过进程号识别,name代表通过进程名称识别 | -| value | 目标进程识别的具体字段 | 是 | 是 | 实际的进程号/进程名称 | 与type字段配合使用,指定目标进程的进程号或进程名称,由使用者保证配置的正确及唯一性 | -| T | engine为slide的task配置项,声明内存冷热水线的阈值 | engine为slide时必须配置 | 是 | 0~loop * 3 | T=3 //访问次数小于3的内存会被识别为冷内存 | -| max_threads | engine为slide的task配置项,etmemd内部线程池最大线程数,每个线程处理一个进程/子进程的内存扫描+操作任务 | 否 | 是 | 1~2 * core数 + 1,缺省值为1 | 对外部无表象,控制etmemd服务端内部处理线程个数,当目标进程有多个子进程时,配置越大,并发执行的个数也多,但占用资源也越多 | -| vm_flags | engine为cslide的task配置项,通过指定flag扫描的vma,不配置此项时扫描则不会区分 | 否 | 是 | 256长度以内的字符串,不同flag以空格隔开 | vm_flags=ht //扫描flags为ht(大页)的vma内存 | -| anon_only | engine为cslide的task配置项,标识是否只扫描匿名页 | 否 | 是 | yes/no | anon_only=no //配置为yes时只扫描匿名页,配置为no时非匿名页也会扫描 | -| ign_host | engine为cslide的task配置项,标识是否忽略host上的页表扫描信息 | 否 | 是 | yes/no | ign_host=no //yes为忽略,no为不忽略 | -| task_private_key | engine为thirdparty的task配置项,预留给第三方策略的task解析私有参数的配置项,选配 | 否 | 否 | 根据第三方策略私有参数自行限制 | 根据第三方策略私有task参数自行配置 | -| swap_threshold |slide engine的配置项,进程内存换出阈值 | 否 | 是 | 进程可用内存绝对值 | swap_threshold=10g //进程占用内存在低于10g时不会触发换出。
当前版本下,仅支持g/G作为内存绝对值单位。与sysmem_threshold配合使用,仅系统内存低于阈值时,进行白名单中进程阈值判断 | -| swap_flag|slide engine的配置项,进程指定内存换出 | 否 | 是 | yes/no | swap_flag=yes//使能进程指定内存换出 | - -### etmemd服务端启动 - -在使用etmem提供的服务时,首先根据需要修改相应的配置文件,然后运行etmemd服务端,常驻在系统中来操作目标进程的内存。除了支持在命令行中通过二进制来启动etmemd的进程外,还可以通过配置`service`文件来使etmemd服务端通过`systemctl`方式拉起,此场景需要通过`mode-systemctl`参数来指定支持 - -#### 使用方法 - -可以通过下列示例命令启动etmemd的服务端: - -```bash -etmemd -l 0 -s etmemd_socket -``` - -或者 - -```bash -etmemd --log-level 0 --socket etmemd_socket -``` - -其中`-l`的`0`和`-s`的`etmemd_socket`是用户自己输入的参数,参数具体含义参考以下列表。 - -#### 命令行参数说明 - -| 参数 | 参数含义 | 是否必须 | 是否有参数 | 参数范围 | 示例说明 | -| --------------- | ---------------------------------- | -------- | ---------- | --------------------- | ------------------------------------------------------------ | -| -l或\-\-log-level | etmemd日志级别 | 否 | 是 | 0~3 | 0:debug级别
1:info级别
2:warning级别
3:error级别
只有大于等于配置的级别才会打印到/var/log/message文件中 | -| -s或\-\-socket | etmemd监听的名称,用于与客户端交互 | 是 | 是 | 107个字符之内的字符串 | 指定服务端监听的名称 | -| -m或\-\-mode-systemctl| 指定通过systemctl方式来拉起stmemd服务| 否| 否| NA| service文件中需要指定-m参数| -| -h或\-\-help | 帮助信息 | 否 | 否 | NA | 执行时带有此参数会打印后退出 | - -### 通过etmem客户端添加或者删除工程/引擎/任务 - -#### 场景描述 - -1)管理员创建etmem的project/engine/task(一个工程可包含多个etmem engine,一个engine可以包含多个任务) - -2)管理员删除已有的etmem project/engine/task(删除工程前,会自动先停止该工程中的所有任务) - -#### 使用方法 - -在etmemd服务端正常运行后,通过etmem客户端,通过第二个参数指定为obj,来进行创建或删除动作,对project/engine/task则是通过配置文件中配置的内容来进行识别和区分。 - -- 添加对象: - - ```bash - etmem obj add -f /etc/etmem/slide_conf.yaml -s etmemd_socket - ``` - - 或 - - ```bash - etmem obj add --file /etc/etmem/slide_conf.yaml --socket etmemd_socket - ``` - -- 删除对象: - - ```bash - etmem obj del -f /etc/etmem/slide_conf.yaml -s etmemd_socket - ``` - - 或 - - ```bash - etmem obj del --file /etc/etmem/slide_conf.yaml --socket etmemd_socket - ``` - -#### 命令行参数说明 - -| 参数 | 参数含义 | 是否必须 | 是否有参数 | 示例说明 | -| ------------ | ------------------------------------------------------------ | -------- | ---------- | -------------------------------------------------------- | -| -f或\-\-file | 指定对象的配置文件 | 是 | 是 | 需要指定路径名称 | -| -s或\-\-socket | 与etmemd服务端通信的socket名称,需要与etmemd启动时指定的保持一致 | 是 | 是 | 必须配置,在有多个etmemd时,由管理员选择与哪个etmemd通信 | - -### 通过etmem客户端查询/启动/停止工程 - -#### 场景描述 - -在已经通过`etmem obj add`添加工程之后,在还未调用`etmem obj del`删除工程之前,可以对etmem的工程进行启动和停止。 - -1)管理员启动已添加的工程 - -2)管理员停止已启动的工程 - -在管理员调用`obj del`删除工程时,如果工程已经启动,则会自动停止。 - -#### 使用方法 - -对于已经添加成功的工程,可以通过`etmem project`的命令来控制工程的启动和停止,命令示例如下: - -- 查询工程 - - ```bash - etmem project show -n test -s etmemd_socket - ``` - - 或 - - ```bash - etmem project show --name test --socket etmemd_socket - ``` - -- 启动工程 - - ```bash - etmem project start -n test -s etmemd_socket - ``` - - 或 - - ```bash - etmem project start --name test --socket etmemd_socket - ``` - -- 停止工程 - - ```bash - etmem project stop -n test -s etmemd_socket - ``` - - 或 - - ```bash - etmem project stop --name test --socket etmemd_socket - ``` - -- 打印帮助 - - ```bash - etmem project help - ``` - -#### 命令行参数说明 - -| 参数 | 参数含义 | 是否必须 | 是否有参数 | 示例说明 | -| ------------ | ------------------------------------------------------------ | -------- | ---------- | -------------------------------------------------------- | -| -n或\-\-name | 指定project名称 | 是 | 是 | project名称,与配置文件一一对应 | -| -s或\-\-socket | 与etmemd服务端通信的socket名称,需要与etmemd启动时指定的保持一致 | 是 | 是 | 必须配置,在有多个etmemd时,由管理员选择与哪个etmemd通信 | - -### 通过etmem客户端,支持内存阈值换出以及指定内存换出 - -当前支持的策略中,只有slide策略支持私有的功能特性 - -- 进程或系统内存阈值换出 - -为了获得业务的极致性能,需要考虑etmem内存扩展进行内存换出的时机;当系统可用内存足够,系统内存压力不大时,不进行内存交换;当进程占用内存不高时,不进行内存交换;提供系统内存换出阈值控制以及进程内存换出阈值控制。 - -- 进程指定内存换出 - -在存储环境下,具有IO时延敏感型业务进程,上述进程内存不希望进行换出,因此提供一种机制,由业务指定可换出内存 - -针对进程或系统内存阈值换出,进程指定内存换出功能,可以在配置文件中添加`sysmem_threshold`,`swap_threshold`,`swap_flag`参数,示例如下,具体含义请参考etmem配置文件说明章节。 - -```sh -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -sysmem_threshold=50 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -swap_threshold=10g -swap_flag=yes -``` - -#### 系统内存阈值换出 - -配置文件中`sysmem_threshold`用于指示系统内存阈值换出功能,`sysmem_threshold`取值范围为0-100,如果配置文件中设定了`sysmem_threshold`,那么只有系统内存剩余量低于该比例时,etmem才会触发内存换出流程 - -示例使用方法如下: - -1. 参考示例编写配置文件,配置文件中填写`sysmem_threshold`参数,例如`sysmem_threshold=20` -2. 
启动服务端,并通过服务端添加,启动工程。 - - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -3. 观察内存换出结果,只有系统可用内存低于20%时,etmem才会触发内存换出 - -#### 进程内存阈值换出 - -配置文件中`swap_threshold`用于指示进程内存阈值换出功能,`swap_threshold`为进程内存占用量绝对值(格式为"数字+单位g/G"),如果配置文件中设定了`swap_threshold`,那么该进程内存占用量在小于该设定的可用内存量时,etmem不会针对该进程触发换出流程 - -示例使用方法如下: - -1. 参考示例编写配置文件,配置文件中填写`swap_threshold`参数,例如`swap_threshold=5g` -2. 启动服务端,并通过服务端添加,启动工程。 - - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -3. 观察内存换出结果,只有进程占用内存绝对值高于5G时,etmem才会触发内存换出 - -#### 进程指定内存换出 - -配置文件中`swap_flag`用于指示进程指定内存换出功能,`swap_flag`取值仅有两个:`yes/no`,如果配置文件中设定了`swap_flag`为no或者未配置,那么etmem换出功能无变化,如果`swap_flag`设定为yes,那么etmem仅仅换出进程指定的内存。 - -示例使用方法如下: - -1. 参考示例编写配置文件,配置文件中填写`swap_flag`参数,例如`swap_flag=yes` -2. 业务进程对需要进行换出的内存打标记 - - ```bash - madvise(addr_start, addr_len, MADV_SWAPFLAG) - ``` - -3. 启动服务端,并通过服务端添加,启动工程。 - - ```bash - etmemd -l 0 -s monitor_app & - etmem obj add -f etmem_config -s monitor_app - etmem project start -n test -s monitor_app - etmem project show -s monitor_app - ``` - -4. 观察内存换出结果,只有进程打标记的部分内存会被换出,其余内存保留在DRAM中,不会被换出 - -针对进程指定页面换出的场景中,在原扫描接口`idle_pages`中添加`ioctl`命令字的形式,来确认不带有特定标记的vma不进行扫描与换出操作 - -扫描管理接口 - -- 函数原型 - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- 输入参数 - - ```text - 1. fd:文件描述符,通过open调用在/proc/pid/idle_pages下打开文件获得 - - 2.cmd:控制扫描行为,当前支持如下cmd: - VMA_SCAN_ADD_FLAGS:新增vma指定内存换出标记,仅扫描带有特定标记的VMA - VMA_SCAN_REMOVE_FLAGS:删除新增的VMA指定内存换出标记 - - 3.args:int指针参数,传递具体标记掩码,当前仅支持如下参数: - VMA_SCAN_FLAG:在etmem_scan.ko扫描模块开始扫描前,会调用接口walk_page_test接口判断vma地址是否符合扫描要求,此标记置位时,会仅扫描带有特定换出标记的vma地址段,而忽略其他vma地址 - ``` - -- 返回值 - - ```text - 1.成功,返回0 - 2.失败返回非0 - ``` - -- 注意事项 - - ```text - 所有不支持的标记都会被忽略,但是不会返回错误 - ``` - -### 通过etmem客户端,支持swapcache内存回收指令 - -用户态etmem发起内存淘汰回收操作,通过`write procfs`接口与内核态的内存回收模块交互,内存回收模块解析用户态下发的虚拟地址,获取地址对应的page页面,并调用内核原生接口将该page对应内存进行换出回收,在内存换出的过程中,swapcache会占用部分系统内存,为进一步节约内存,添加swapcache内存回收功能. - -针对swapcache内存回收功能,可以在配置文件中添加`swapcache_high_wmark`,`swapcache_low_wmark`参数。 - -- `swapcache_high_wmark`: swapcache可以占用系统内存的高水位线 -- `swapcache_low_wmark`:swapcache可以占用系统内存的低水位线 - -在etmem进行一轮内存换出后,会进行swapcache占用系统内存比例的检查,当占用比例超过高水位线后,会通过`swap_pages`下发`ioctl`命令,触发swapcache内存回收,并回收到低水位线停止 - -配置参数示例如下,具体请参考etmem配置文件相关章节: - -```sh -#slide_conf.yaml -[project] -name=test -loop=1 -interval=1 -sleep=1 -swapcache_high_vmark=5 -swapcache_low_vmark=3 - -[engine] -name=slide -project=test - -[task] -project=test -engine=slide -name=background_slide -type=name -value=mysql -T=1 -max_threads=1 -``` - -针对swap换出场景中,需要通过swapcache内存回收进一步节约内存,在原内存换出接口`swap_pages`中通过添加`ioctl`接口的方式,来提供swapcache水线的设定以及swapcache内存占用量回收的启动与关闭 - -- 函数原型 - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- 输入参数 - - ```text - 1. 
fd:文件描述符,通过open调用在/proc/pid/idle_pages下打开文件获得 - - 2.cmd:控制扫描行为,当前支持如下cmd: - RECLAIM_SWAPCACHE_ON:启动swapcache内存换出 - RECLAIM_SWAPCACHE_OFF:关闭swapcache内存换出 - SET_SWAPCACHE_WMARK:设定swapcache内存水线 - - 3.args:int指针参数,传递具体标记掩码,当前仅支持如下参数: - 参数用来传递swapcache水线具体值 - ``` - -- 返回值 - - ```text - 1.成功,返回0 - 2.失败返回非0 - ``` - -- 注意事项 - - ```text - 所有不支持的标记都会被忽略,但是不会返回错误 - ``` - -### 通过etmem客户端,执行引擎私有命令或功能 - -当前支持的策略中,只有cslide策略支持私有的命令 - -- `showtaskpages` -- `showhostpages` - -针对使用此策略引擎的engine和engine所有的task,可以通过这两个命令分别查看task相关的页面访问情况和虚拟机的host上系统大页的使用情况。 - -示例命令如下: - -```bash -etmem engine showtaskpages <-t task_name> -n proj_name -e cslide -s etmemd_socket -etmem engine showhostpages -n proj_name -e cslide -s etmemd_socket -``` - -**注意** :`showtaskpages`和`showhostpages`仅支持引擎使用cslide的场景 - -#### 命令行参数说明 - -| 参数 | 参数含义 | 是否必须 | 是否有参数 | 实例说明 | -|----|------|------|-------|------| -|-n或\-\-proj_name| 指定project的名字| 是| 是| 指定已经存在,所需要执行的project的名字| -|-s或\-\-socket| 与etmemd服务端通信的socket名称,需要与etmemd启动时指定的保持一致| 是| 是| 必须配置,在有多个etmemd时,由管理员选择与哪个etmemd通信| -|-e或\-\-engine| 指定执行的引擎的名字| 是| 是| 指定已经存在的,所需要执行的引擎的名字| -|-t或\-\-task_name| 指定执行的任务的名字| 否| 是| 指定已经存在的,所需要执行的任务的名字| - -### 支持kernel swap功能开启与关闭 - -针对swap换出到磁盘场景,当etmem用于内存扩展时,用户可以选择是否同时开启内核swap功能。用户可以关闭内核原生swap机制,以免原生swap机制换出不应被换出的内存,导致用户态进程出现问题。 - -通过提供sys接口实现上述控制,在`/sys/kernel/mm/swap`目录下创建`kobj`对象,对象名为`kernel_swap_enable`,缺省值为`true`,用于控制kernel swap的启动与关闭 - -具体示例如下: - -```sh -#开启kernel swap -echo true > /sys/kernel/mm/swap/kernel_swap_enbale -或者 -echo 1 > /sys/kernel/mm/swap/kernel_swap_enbale - -#关闭kernel swap -echo false > /sys/kernel/mm/swap/kernel_swap_enbale -或者 -echo 0 > /sys/kernel/mm/swap/kernel_swap_enbale - -``` - -### etmem支持随系统自启动 - -#### 场景描述 - -etmemd支持由用户配置`systemd`配置文件后,以`fork`模式作为`systemd`服务被拉起运行 - -#### 使用方法 - -编写`service`配置文件,来启动etmemd,必须使用-m参数来指定此模式,例如 - -```bash -etmemd -l 0 -s etmemd_socket -m -``` - -#### 命令行参数说明 - -| 参数 | 参数含义 | 是否必须 | 是否有参数 | 参数范围 | 实例说明 | -|----------------|------------|------|-------|------|-----------| -| -l或\-\-log-level | etmemd日志级别 | 否 | 是 | 0~3 | 0:debug级别;1:info级别;2:warning级别;3:error级别;只有大于等于配置的级别才会打印到/var/log/message文件中| -| -s或\-\-socket |etmemd监听的名称,用于与客户端交互 | 是 | 是| 107个字符之内的字符串| 指定服务端监听的名称| -|-m或\-\-mode-systemctl | etmemd作为service被拉起时,命令中需要指定此参数来支持 | 否 | 否 | NA | NA | -| -h或\-\-help | 帮助信息 | 否 |否 |NA |执行时带有此参数会打印后退出| - -### etmem支持第三方内存扩展策略 - -#### 场景描述 - -etmem支持用户注册第三方内存扩展策略,同时提供扫描模块动态库,运行时通过第三方策略淘汰算法淘汰内存。 - -用户使用etmem所提供的扫描模块动态库并实现对接etmem所需要的结构体中的接口 - -#### 使用方法 - -用户使用自己实现的第三方扩展淘汰策略,主要需要按下面步骤进行实现和操作: - -1. 按需调用扫描模块提供的扫描接口, - -2. 按照etmem头文件中提供的函数模板来实现各个接口,最终封装成结构体 - -3. 编译出第三方扩展淘汰策略的动态库 - -4. 在配置文件中按要求声明类型为thirdparty的engine - -5. 
将动态库的名称和接口结构体的名称按要求填入配置文件中task对应的字段 - -其他操作步骤与使用etmem的其他engine类似 - -接口结构体模板 - -```c -struct engine_ops { - -/* 针对引擎私有参数的解析,如果有,需要实现,否则置NULL */ - -int (*fill_eng_params)(GKeyFile *config, struct engine *eng); - -/* 针对引擎私有参数的清理,如果有,需要实现,否则置NULL */ - -void (*clear_eng_params)(struct engine *eng); - -/* 针对任务私有参数的解析,如果有,需要实现,否则置NULL */ - -int (*fill_task_params)(GKeyFile *config, struct task *task); - -/* 针对任务私有参数的清理,如果有,需要实现,否则置NULL */ - -void (*clear_task_params)(struct task *tk); - -/* 启动任务的接口 */ - -int (*start_task)(struct engine *eng, struct task *tk); - -/* 停止任务的接口 */ - -void (*stop_task)(struct engine *eng, struct task *tk); - -/* 填充pid相关私有参数 */ - -int (*alloc_pid_params)(struct engine *eng, struct task_pid **tk_pid); - -/* 销毁pid相关私有参数 */ - -void (*free_pid_params)(struct engine *eng, struct task_pid **tk_pid); - -/* 第三方策略自身所需要的私有命令支持,如果没有,置为NULL */ - -int (*eng_mgt_func)(struct engine *eng, struct task *tk, char *cmd, int fd); - -}; -``` - -扫描模块对外接口说明 - -| 接口名称 |接口描述| -| ------------ | --------------------- | -| etmemd_scan_init | scan模块初始化| -| etmemd_scan_exit | scan模块析构| -| etmemd_get_vmas | 获取需要扫描的vma| -| etmemd_free_vmas | 释放etmemd_get_vmas扫描到的vma| -| etmemd_get_page_refs | 扫描vmas中的页面| -| etmemd_free_page_refs | 释放etmemd_get_page_refs获取到的页访问信息链表| - -针对扫描虚拟机的场景中,在原扫描接口`idle_pages`中添加`ioctl`接口的方式,来提供区分扫描`ept`的粒度和是否忽略host上页访问标记的机制 - -针对进程指定页面换出的场景中,在原扫描接口`idle_pages`中添加`ioctl`命令字的形式,来确认不带有特定标记的vma不进行扫描和换出操作 - -扫描管理接口: - -- 函数原型 - - ```c - ioctl(fd, cmd, void *arg); - ``` - -- 输入参数 - - ```text - 1. fd:文件描述符,通过open调用在/proc/pid/idle_pages下打开文件获得 - - 2.cmd:控制扫描行为,当前支持如下cmd: - IDLE_SCAN_ADD_FLAG:新增一个扫描标记 - IDLE_SCAM_REMOVE_FLAGS:删除一个扫描标记 - VMA_SCAN_ADD_FLAGS:新增vma指定内存换出标记,仅扫描带有特定标记的VMA - VMA_SCAN_REMOVE_FLAGS:删除新增的VMA指定内存换出标记 - - 3.args:int指针参数,传递具体标记掩码,当前仅支持如下参数: - SCAN_AS_HUGE:扫描ept页表时,按照2M大页粒度扫描页是否被访问过。此标记未置位时,按照ept页表自身粒度扫描 - SCAN_IGN_HUGE:扫描虚拟机时,忽略host侧页表上的访问标记。此标记未置位时,不会忽略host侧页表上的访问标记。 - VMA_SCAN_FLAG:在etmem_scan.ko扫描模块开始扫描前,会调用接口walk_page_test接口判断vma地址是否符合扫描要求,此标记置位时,会仅扫描带有特定换出标记的vma地址段,而忽略其他vma地址 - ``` - -- 返回值 - - ```text - 1. 成功,返回0 - 2. 失败返回非0 - ``` - -- 注意事项 - - ```text - 所有不支持的标记都会被忽略,但是不会返回错误 - ``` - -配置文件示例如下所示,具体含义请参考配置文件说明章节: - -```text -#thirdparty -[engine] -name=thirdparty -project=test -eng_name=my_engine -libname=/user/lib/etmem_fetch/code_test/my_engine.so -ops_name=my_engine_ops -engine_private_key=engine_private_value -[task] -project=test -engine=my_engine -name=background1 -type=pid -value=1798245 -task_private_key=task_private_value -``` - - **注意** : - -用户需使用etmem所提供的扫描模块动态库并实现对接etmem所需要的结构体中的接口 - -`eng_mgt_func`接口中的`fd`不能写入`0xff`和`0xfe`字 - -支持在一个工程内添加多个不同的第三方策略动态库,以配置文件中的`eng_name`来区分 - -### etmem客户端和服务端帮助说明 - -通过下列命令可以打印etmem服务端帮助说明 - -```bash -etmemd -h -``` - -或 - -```bash -etmemd --help -``` - -通过下列命令可以打印etmem客户端帮助说明 - -```bash -etmem help -``` - -通过下列命令可以打印etmem客户端操作工程/引擎/任务相关帮助说明 - -```bash -etmem obj help -``` - -通过下列命令可以打印etmem客户端对项目相关帮助说明 - -```bash -etmem project help -``` - -## 参与贡献 - -1. Fork本仓库 -2. 新建个人分支 -3. 提交代码 -4. 
新建Pull Request diff --git a/docs/zh/server/memory_storage/gmem/_toc.yaml b/docs/zh/server/memory_storage/gmem/_toc.yaml deleted file mode 100644 index 36121ff4a872e60286a38b9cf5fc2437fc6e226a..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/gmem/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: GMEM用户指南 -isManual: true -description: 提供异构互联内存的中心化管理 -sections: - - label: 概述 - href: ./introduction_to_gmem.md - - label: 安装与部署 - href: ./installation_and_deployment.md - - label: 使用方法 - href: ./usage_instructions.md diff --git "a/docs/zh/server/memory_storage/gmem/images/GMEM-\346\236\266\346\236\204.png" "b/docs/zh/server/memory_storage/gmem/images/GMEM-\346\236\266\346\236\204.png" deleted file mode 100644 index 0ca62843c8c5148330ead20167f4b3e2576ca463..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/memory_storage/gmem/images/GMEM-\346\236\266\346\236\204.png" and /dev/null differ diff --git a/docs/zh/server/memory_storage/gmem/installation_and_deployment.md b/docs/zh/server/memory_storage/gmem/installation_and_deployment.md deleted file mode 100644 index 50c7519e9f4083abd5c5139a805cffe092054f0c..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/gmem/installation_and_deployment.md +++ /dev/null @@ -1,116 +0,0 @@ -# 安装与部署 - -本章介绍如何安装和部署GMEM。 - -## 软硬件要求 - -* 鲲鹏920处理器 -* 昇腾910芯片 -* 操作系统:openEuler 23.09 - -## 环境准备 - -* 使用和配置GMEM需要使用root权限。 -* GMEM的开关只能在系统层面开启或关闭。 -* 请管理员确保GMEM的配置安全、可用。 - -## 安装GMEM - -* 文件准备 - - [CANN社区版历史版本-昇腾社区 (hiascend.com)](https://www.hiascend.com/software/cann/community-history) - - [固件与驱动-昇腾社区 (hiascend.com)](https://www.hiascend.com/zh/hardware/firmware-drivers/community?product=2&model=19&cann=6.0.1.alpha001&driver=1.0.18.alpha) - - | 来源 | 软件包 | - | ------------------------------------------------------------ | ------------------------------------------------------------ | - | openEuler 23.09 | kernel-6.4.0-xxx.aarch64.rpm
kernel-devel-6.4.0-xxx.aarch64.rpm
libgmem-xxx.aarch64.rpm
libgmem-devel-xxx.aarch64.rpm | - | 昇腾社区 | # CANN软件包
Ascend-cann-toolkit-xxx-linux.aarch64.rpm
# NPU固件与驱动
Ascend-hdk-910-npu-driver-xxx.aarch64.rpm
Ascend-hdk-910-npu-firmware-xxx.noarch.rpm | - | 联系GMEM社区维护人员
[@yang_yanchao](https://gitee.com/yang_yanchao) email:
[@LemmyHuang](https://gitee.com/LemmyHuang) email: | gmem-example-xxx.aarch64.rpm
mindspore-xxx-linux_aarch64.whl | - -* 安装内核 - - 使用的openEuler内核版本,确认GMEM相关编译选项已打开(当前默认已经打开)。 - - ```sh - [root@localhost ~]# cat /boot/config-`uname -r` | grep CONFIG_GMEM - CONFIG_GMEM=y - CONFIG_GMEM_DEV=m - - [root@localhost ~]# cat /boot/config-`uname -r` | grep CONFIG_REMOTE_PAGER - CONFIG_REMOTE_PAGER=m - CONFIG_REMOTE_PAGER_MASTER=m - ``` - - 在启动项中添加`gmem=on` 。 - - ```sh - [root@localhost gmem]# cat /proc/cmdline - BOOT_IMAGE=/vmlinuz-xxx root=/dev/mapper/openeuler-root ... gmem=on - ``` - - 修改`transparent_hugepage` 。 - - ```sh - echo always > /sys/kernel/mm/transparent_hugepage/enabled - ``` - -* 安装用户态动态库 libgmem。 - - ```sh - yum install libgmem libgmem-devel - ``` - -* 安装CANN框架。 - - 安装版本配套的CANN,包括toolkit,driver以及firmware,根据指引完成安装后重启系统。 - - ```sh - rpm -ivh Ascend-cann-toolkit-xxx-linux.aarch64.rpm - # 使用libgmem提供的工具安装npu-driver - sh /usr/local/gmem/install_npu_driver.sh Ascend-hdk-910-npu-driver-xxx.aarch64.rpm - rpm -ivh Ascend-hdk-910-npu-firmware-xxx.noarch.rpm - ``` - - 通过Ascend目录下的环境配置脚本配置好环境变量。 - - ```sh - source /usr/local/Ascend/ascend-toolkit/set_env.sh - ``` - - 查看NPU设备是否正常。 - - ```sh - [root@localhost ~]# npu-smi info - +-------------------------------------------------------------------------------------------+ - | npu-smi 22.0.4.1 Version: 22.0.4.1 | - +----------------------+---------------+----------------------------------------------------+ - | NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)| - | Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) | - +======================+===============+====================================================+ - | 0 910B | OK | 79.4 82 0 / 0 | - | 0 | 0000:81:00.0 | 0 1979 / 15039 0 / 32768 | - +======================+===============+====================================================+ - ``` - -* 安装gmem-example软件包。 - - gmem-example会更新host驱动、NPU侧驱动及NPU侧内核。安装完成后重启系统使驱动生效。 - - ```sh - rpm -ivh gmem-example-xxx.aarch64.rpm - ``` - -* 安装mindspore。 - - 获取正确的mindspore版本并安装,安装后可通过执行以下命令验证mindspore功能是否正常。 - - ```sh - python -c "import mindspore;mindspore.run_check()" - MindSpore version: x.x.x - The result of multiplication calculation is correct, MindSpore has been installed on platform [Ascend] successfully! 
- ``` - -## 执行训练或推理任务 - -基于mindspore的训练或推理任务,在完成以上安装流程后,可直接执行,不需要做任何适配。 diff --git a/docs/zh/server/memory_storage/gmem/introduction_to_gmem.md b/docs/zh/server/memory_storage/gmem/introduction_to_gmem.md deleted file mode 100644 index 8befaf35369d916ab0a68c44721a2734592e100a..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/gmem/introduction_to_gmem.md +++ /dev/null @@ -1,37 +0,0 @@ -# 认识GMEM - -## 简介 - -当前异构侧数据管理CPU与异构侧分离,数据显式搬移,易用性和性能难以平衡:异构设备HBM内存严重不足,应用手动SWAP方案性能损耗大且通用性差;搜推、大数据场景存在大量无效数据搬移,缺少高效内存池化方案,急需统一的有效对等内存管理机制,Linux现有HMM框架搁浅,编程复杂度高且依赖人工调优,NV、AMD虽尝试接入,但由于架构问题导致代码缺乏通用性,性能可移植性差,引起上游OS社区反弹。 - -GMEM (Generalized Memory Management) 提供了异构互联内存的中心化管理,GMEM API支持设备接入统一地址空间,获得针对异构内存编程的优化,将CPU架构相关的实现从Linux的内存管理系统中独立出来。 - -当CPU和加速卡的内存被封装到一个统一的虚拟地址空间中后,开发者们就不再需要在两个并行的地址空间中手动转移内存,只需要使用一套统一的申请释放函数。在这种模式下甚至可以选择将CPU的DRAM内存作为加速卡的cache,代码开销也不会过大。 - -## 架构 - -![GMEM-架构.png](images/GMEM-架构.png) - -## 应用场景 - -大模型训推场景 - -* GMEM实现异构内存透明扩容技术,实现HBM内存自动超分,实现高性能、低门槛训推。 -* 提供OS原生的极简异构内存管理,超分大模型性能相比NVIDIA性能提升60%。 - -大内存共享场景 - -* 提供远程访问与按需搬移内存的灵活策略,解决内存搬移瓶颈,提升搜推、大数据应用端到端性能。 - -## 功能描述 - -驱动开发侧,GMEM提供了统一的函数注册接口,让驱动开发者避免反复造相同的轮子,确保内存管理代码不再爆炸式地增长,也规避了额外的漏洞。 - -* 调用GMEM提供的接口,简化驱动使用物理内存的代码。 -* 驱动使用GMEM统一提供的接口,避免自行造轮子时出现漏洞。 - -加速卡用户侧,GMEM给使用加速卡进行AI模型及机器学习框架开发提供了更强的可编程性:不再需要手动管理数据是存放在加速卡上还是CPU上。 - -* 通过统一的内存申请释放函数与CPU及卡上的内存交互。 -* 同一虚拟地址空间中既可以映射到CPU上,也可以映射到加速卡上。 -* GMEM封装内存管理代码,相比手动管理获得性能提升。 diff --git a/docs/zh/server/memory_storage/gmem/usage_instructions.md b/docs/zh/server/memory_storage/gmem/usage_instructions.md deleted file mode 100644 index 57bd8154c9000df11922393834edd8090cefa16a..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/gmem/usage_instructions.md +++ /dev/null @@ -1,66 +0,0 @@ -# 使用说明 - -## 简介 - -GMEM通过特定的flag申请对等互访的虚拟内存,并对外提供了一些内存优化语义,通过这部分内存语义可以达到性能优化的效果。 -libgmem是GMEM用户接口的抽象层,主要功能是是对上述内存语义进行封装,简化用户的使用。 - -## 接口说明 - -* 内存申请 - - GMEM扩展了mmap的含义,增加了一个flag MAP_PEER_SHARED申请异构内存,使用时默认返回2MB对齐的虚拟地址。 - - ```cpp - addr = mmap(NULL , size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_PEER_SHARED, -1, 0); - ``` - -* 内存释放 - - 通过munmap接口释放host和device的内存。 - - ```cpp - munmap(addr, size); - ``` - -* 内存语义 - - FreeEager:对于给定范围[addr, addr + size]的地址段,FreeEager会对范围向内对齐页面大小的完整页面进行释放(默认页面大小2M)。如果范围内不存在完整页面,将直接返回成功。 - - 接口成功返回0,失败返回错误码。 - - ```cpp - 接口原型: int gmemFreeEager(unsigned long addr, size_t size, void *stream); - 接口用法: ret = gmemFreeEager(addr, size, stream); - ``` - - Prefetch:对于给定范围[addr, addr + size]的地址段,Prefetch会对范围向外对齐页面大小的完整页面(覆盖整个地址段)进行预取。确保指定的计算单元设备hnid在接下来对vma发起的访问不会触发page fault。 - - 接口成功返回0,失败返回错误码。 - - ```cpp - 接口原型: int gmemPrefetch(unsigned long addr, size_t size, int hnid, void *stream); - 接口用法: ret = gmemPrefetch(addr, size, hnid, stream); - ``` - - 在上述内存语义使用的时候stream为空表示同步调用,非空表示异步调用。 - -* 其他接口 - - 获取当前设备的numaid。接口成功返回设备号,失败返回错误码。 - - ```cpp - 接口原型: int gmemGetNumaId(void); - 接口用法: numaid = gmemGetNumaId(); - ``` - - 获取内核的gmem统计信息。 - - ```sh - cat /proc/gmemstat - ``` - -## 约束限制 - -1. 目前仅支持2M大页,所以host OS以及NPU卡内OS的透明大页需要默认开启。 -2. 
通过MAP_PEER_SHARED申请的异构内存目前不支持fork时继承。 diff --git a/docs/zh/server/memory_storage/hsak/_toc.yaml b/docs/zh/server/memory_storage/hsak/_toc.yaml deleted file mode 100644 index 7aea9965fc4c865eb2f0d0e922ac61ef81d7f0bd..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/hsak/_toc.yaml +++ /dev/null @@ -1,12 +0,0 @@ -label: HSAK开发指南 -isManual: true -description: HSAK 针对新型存储介质提供高带宽低时延的IO软件栈 -sections: - - label: 概述 - href: ./hsak_developer_guide.md - - label: 使用HSAK开发应用程序 - href: ./development_with_hsak.md - - label: HSAK工具使用说明 - href: ./hsak_tool_usage.md - - label: HSAK接口说明 - href: ./hsak_c_apis.md diff --git a/docs/zh/server/memory_storage/hsak/development_with_hsak.md b/docs/zh/server/memory_storage/hsak/development_with_hsak.md deleted file mode 100644 index 3257b9fa1a8101618311e116f4d203431f8e1aa5..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/hsak/development_with_hsak.md +++ /dev/null @@ -1,219 +0,0 @@ -# HSAK 使用指南 - -## nvme.conf.in配置文件 - -HSAK配置文件默认安装在/etc/spdk/nvme.conf.in,开发人员可以根据实际业务需要对配置文件进行修改,配置文件内容如下: - -- [Global] - -1. ReactorMask:指定用于轮询IO的核(16进制,不能指定0核,按bit位从低位到高位,分别表示不同CPU核,如:0x1表示0核,0x6表示1、2两个核,以此类推,本字段最大支持34个字符,去掉表示16进制的0x标记,剩余32个计数字符,每个16进制字符最大是F,可表示4个核,所以最多可以支持32*4=128个核)。 -2. LogLevel:HSAK日志打印级别(0:error;1:warning;2:notice;3:info;4:debug)。 -3. MemSize:HSAK占用的内存(最小值为500MB)。 -4. MultiQ:是否在同一个块设备上开启多队列。 -5. E2eDif:DIF类型(1:半程保护;2:全程保护),不同厂商的硬盘对DIF支持能力可能不同,具体请参考硬件厂家资料。 -6. IoStat:是否使能IO统计开关(Yes\No)。 -7. RpcServer:是否启动rpc侦听线程(Yes\No)。 -8. NvmeCUSE:是否启动CUSE功能(Yes\No),开启后在/dev/spdk目录下生成nvme字符设备。 - -- [Nvme] - -1. TransportID:指定NVMe控制器的PCI地址和名称,使用格式为:TransportID "trtype:PCIe traddr:0000:09:00.0" nvme0。 -2. RetryCount:IO失败时的重试次数,0表示不重试,最大255。 -3. TimeoutUsec:IO超时时间,0或者不配置该配置项表示不设置超时时间,单位是μs。 -4. ActionOnTimeout:IO超时行为(None:仅打印信息;Reset:reset控制器;abort:丢弃超时指令),默认None。 - -- [Reactor] - -1. BatchSize:支持批量提交提交IO的个数,默认是8,最大是32。 - -## 头文件引用 - -HSAK提供两个对外头文件,开发者在使用HSAK进行开发时需要包含这两个文件: - -1. bdev_rw.h:定义了数据面用户态IO操作的宏、枚举、数据结构和接口API。 -2. ublock.h:定义了管理面设备管理、信息获取等功能的宏、枚举、数据结构和接口API。 - -## 业务运行 - -开发者在进行软件开发编译后,运行前,需要先运行setup.sh脚本程序,用于重新绑定NVMe盘驱动到用户态,该脚本默认安装在:/opt/spdk。 -执行如下命令将盘驱动从内核态绑定到用户态,同时预留1024个2M大页: - -```shell -[root@localhost ~]# cd /opt/spdk -[root@localhost spdk]# ./setup.sh -0000:3f:00.0 (8086 2701): nvme -> uio_pci_generic -0000:40:00.0 (8086 2701): nvme -> uio_pci_generic -``` - -执行如下命令将盘驱动从用户态恢复到内核态,同时释放预留的大页: - -```shell -[root@localhost ~]# cd /opt/spdk -[root@localhost spdk]# ./setup.sh reset -0000:3f:00.0 (8086 2701): uio_pci_generic -> nvme -0000:40:00.0 (8086 2701): uio_pci_generic -> nvme -``` - -## 用户态IO读写场景 - -开发者通过以下顺序调用HSAK接口,实现经由用户态IO通道的业务数据读写: - -1. 初始化HSAK UIO模块。可调用接口libstorage_init_module,完成HSAK用户态IO通道的初始化。 - -2. 打开磁盘块设备。可调用libstorage_open,打开指定块设备,如需打开多个块设备,需要多次重复调用。 - -3. 申请IO内存。可调用接口libstorage_alloc_io_buf或libstorage_mem_reserve,前者最大可申请单个65K的IO,后者没有限制(除非无可用空间)。 - -4. 对磁盘进行读写操作。根据实际业务需要,可调用如下接口进行读写操作: - - - libstorage_async_read - - libstorage_async_readv - - libstorage_async_write - - libstorage_async_writev - - libstorage_sync_read - - libstorage_sync_write - -5. 释放IO内存。可调用接口libstorage_free_io_buf或libstorage_mem_free,需要与申请时调用的接口对应。 - -6. 
关闭磁盘块设备。可调用接口libstorage_close,关闭指定块设备,如果打开了多个块设备,则需要多次重复调用接口进行关闭。 - - | 接口名称 | 功能描述 | - | ----------------------- | --------------------------------------------- | - | libstorage_init_module | HSAK模块初始化接口。 | - | libstorage_open | 打开块设备。 | - | libstorage_alloc_io_buf | 从SPDK的buf_small_pool或者buf_large_pool中分配内存。 | - | libstorage_mem_reserve | 从DPDK预留的大页内存中分配内存空间。 | - | libstorage_async_read | HSAK下发异步IO读请求的接口(读缓冲区为连续buffer)。 | - | libstorage_async_readv | HSAK下发异步IO读请求的接口(读缓冲区为离散buffer)。 | - | libstorage_async_write | HSAK下发异步IO写请求的接口(写缓冲区为连续buffer)。 | - | libstorage_async_wrtiev | HSAK下发异步IO写请求的接口(写缓冲区为离散buff)。 | - | libstorage_sync_read | HSAK下发同步IO读请求的接口(读缓冲区为连续buffer)。 | - | libstorage_sync_write | HSAK下发同步IO写请求的接口(写缓冲区为连续buffer)。 | - | libstorage_free_io_buf | 释放所分配的内存到SPDK的buf_small_pool或者buf_large_pool中。 | - | libstorage_mem_free | 释放libstorage_mem_reserve所申请的内存空间。 | - | libstorage_close | 关闭块设备。 | - | libstorage_exit_module | HSAK模块退出接口。 | - -## 盘管理场景 - -HSAK包含一组C接口,可以对盘进行格式化、创建、删除namespace操作。 - -1. 首先需要调用C接口对HSAK UIO组件进行初始化,如果已经初始化过了,就不需要再调用了。 - - libstorage_init_module - -2. 根据业务需要,调用相应的接口进行盘操作,以下接口可单独调用: - - - libstorage_create_namespace - - libstorage_delete_namespace - - libstorage_delete_all_namespace - - libstorage_nvme_create_ctrlr - - libstorage_nvme_delete_ctrlr - - libstorage_nvme_reload_ctrlr - - libstorage_low_level_format_nvm - - libstorage_deallocate_block - -3. 最后如果退出程序,则需要销毁HSAK UIO,如果还有其他业务在使用,不需要退出,则不用销毁。 - - libstorage_exit_module - - | 接口名称 | 功能描述 | - | ------------------------------- | ----------------------------------------- | - | libstorage_create_namespace | 在指定控制器上创建namespace(前提是控制器具有namespace管理能力)。 | - | libstorage_delete_namespace | 在指定控制器上删除namespace。 | - | libstorage_delete_all_namespace | 删除指定控制器上所有namespace。 | - | libstorage_nvme_create_ctrlr | 根据PCI地址创建NVMe控制器。 | - | libstorage_nvme_delete_ctrlr | 根据控制器名称销毁NVMe控制器。 | - | libstorage_nvme_reload_ctrlr | 根据传入的配置文件自动创建或销毁NVMe控制器。 | - | libstorage_low_level_format_nvm | 低级格式化NVMe盘。 | - | libstorage_deallocate_block | 告知NVMe盘可释放的块,用于垃圾回收。 | - -## 数据面盘信息查询 - -在HSAK的IO数据面提供一组C接口,用于查询盘信息,上层业务可根据查询到的信息进行相关的业务逻辑处理。 - -1. 首先需要调用C接口对HSAK UIO进行初始化,如果已经初始化过了,就不需要再调用了。 - - libstorage_init_module - -2. 根据业务需要,调用相应接口进行信息查询,以下接口可单独调用: - - - libstorage_get_nvme_ctrlr_info - - libstorage_get_mgr_info_by_esn - - libstorage_get_mgr_smart_by_esn - - libstorage_get_bdev_ns_info - - libstorage_get_ctrl_ns_info - -3. 最后如果退出程序,则需要销毁HSAK UIO,如果还有其他业务在使用,不需要退出,则不用销毁。 - - libstorage_exit_module - - | 接口名称 | 功能描述 | - | ------------------------------- | ----------------------------- | - | libstorage_get_nvme_ctrlr_info | 获取所有控制器信息。 | - | libstorage_get_mgr_info_by_esn | 数据面获取设备序列号(ESN)对应的磁盘的管理信息。 | - | libstorage_get_mgr_smart_by_esn | 数据面获取设备序列号(ESN)对应的磁盘的SMART信息。 | - | libstorage_get_bdev_ns_info | 根据设备名称,获取namespace信息。 | - | libstorage_get_ctrl_ns_info | 根据控制器名称,获取所有namespace信息。 | - -## 管理面盘信息查询场景 - -在HSAK的管理面组件ublock提供一组C接口,用于支持在管理面对盘信息进行查询。 - -1. 首先调用C接口对HSAK ublock服务端进行初始化。 - - | 接口名称 | 功能描述 | - | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | - | init_ublock | 初始化ublock功能模块,本接口必须在其他所有ublock接口之前被调用,同一个进程只能初始化一次,原因是init_ublock接口中会初始化DPDK,而DPDK初始化所分配的内存同进程PID绑定,一个PID只能绑定一块内存,且DPDK没有提供释放这块内存的接口,只能通过进程退出来释放。 | - | ublock_init | 本身是对init_ublock接口的宏定义,可理解为将ublock初始化为需要RPC服务。 | - | ublock_init_norpc | 本身是对init_ublock接口的宏定义,可理解为ublock初始化为无RPC服务。 | - -2. 根据业务需要,在另一个进程中调用HSAK UIO组件初始化接口。 - -3. 
在ublock服务端进程或客户端进程调用如下接口进行相应的信息查询业务。 - - | 接口名称 | 功能描述 | - | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | - | ublock_get_bdevs | 业务进程通过调用本接口获取设备列表,获取的设备列表中只有PCI地址,不包含具体设备信息,需要获取具体设备信息,请调用接口ublock_get_bdev。 | - | ublock_get_bdev | 进程通过调用本接口获取具体某个设备的信息,设备信息中包括:设备的序列号、型号、fw版本号信息以字符数组形式保持,不是字符串形式。 | - | ublock_get_bdev_by_esn | 进程通过调用该接口,根据给定的ESN号获取对应设备的信息,设备信息中:序列号、型号、fw版本号。 | - | ublock_get_SMART_info | 进程通过调用本接口获取指定设备的SMART信息。 | - | ublock_get_SMART_info_by_esn | 进程通过调用本接口获取ESN号对应设备的SMART信息。 | - | ublock_get_error_log_info | 进程通过调用本接口获取设备的Error log信息。 | - | ublock_get_log_page | 进程通过调用本接口获取指定设备,指定log page的信息。 | - -4. 对于块设备列表,在获取相应信息后需要调用以下接口进行资源释放。 - - | 接口名称 | 功能描述 | - | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | - | ublock_free_bdevs | 进程通过调用本接口释放设备列表。 | - | ublock_free_bdev | 进程通过调用本接口释放设备资源。 | - -5. 最后如果退出程序,则需要销毁HSAK ublock模块(服务端和客户端销毁方法相同)。 - - | 接口名称 | 功能描述 | - | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | - | ublock_fini | 销毁ublock功能模块,本接口将销毁ublock模块以及内部创建的资源,本接口同ublock初始化接口需要配对使用。 | - -## 日志管理 - -HSAK的日志当前是通过syslog默认输出到/var/log/messages中,由操作系统的rsyslog服务管理。如果您需要自定义日志目录,可以通过rsyslog配置。 - -1. 首先需要在配置文件/etc/rsyslog.conf中增加如下修改: - - ```shell - if ($programname == 'LibStorage') then { - action(type="omfile" fileCreateMode="0600" file="/var/log/HSAK/run.log") - stop - } - ``` - -2. 重启rsyslog服务: - - ```shell - sysemctl restart rsyslog - ``` - -3. 启动HSAK进程,日志信息即重定向到对应目录。 - -4. 重定向日志如果需要转储,需要用户在/etc/logrotate.d/syslog文件中手动配置。 diff --git a/docs/zh/server/memory_storage/hsak/hsak_c_apis.md b/docs/zh/server/memory_storage/hsak/hsak_c_apis.md deleted file mode 100644 index ffaf50e8f60a0e2f6199806cb02f079c45248fb2..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/hsak/hsak_c_apis.md +++ /dev/null @@ -1,2540 +0,0 @@ -# HSAK 接口说明 - -## C接口 - -### 宏定义和枚举 - -#### bdev_rw.h - -##### enum libstorage_ns_lba_size - -1. 原型 - - ```sh - enum libstorage_ns_lba_size - { - LIBSTORAGE_NVME_NS_LBA_SIZE_512 = 0x9, - LIBSTORAGE_NVME_NS_LBA_SIZE_4K = 0xc - }; - ``` - -2. 描述 - -磁盘sector_size(数据)大小。 - -##### enum libstorage_ns_md_size - -1. 原型 - - ```shell - enum libstorage_ns_md_size - { - LIBSTORAGE_METADATA_SIZE_0 = 0, - LIBSTORAGE_METADATA_SIZE_8 = 8, - LIBSTORAGE_METADATA_SIZE_64 = 64 - }; - ``` - -2. 描述 - - 磁盘meta data(元数据) size大小。 - -3. 备注 - - - ES3000 V3(单端口)支持5种扇区类型的格式化(512+0,512+8,4K+64,4K,4K+8)。 - - - ES3000 V3(双端口)支持4种扇区类型的格式化(512+0,512+8,4K+64,4K)。 - - - ES3000 V5 支持5种扇区类型的格式化(512+0,512+8,4K+64,4K,4K+8)。 - - - Optane盘支持7种扇区类型的格式化(512+0,512+8,512+16,4K,4K+8,4K+64,4K+128)。 - -##### enum libstorage_ns_pi_type - -1. 原型 - - ```shell - enum libstorage_ns_pi_type - { - LIBSTORAGE_FMT_NVM_PROTECTION_DISABLE = 0x0, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE1 = 0x1, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE2 = 0x2, - LIBSTORAGE_FMT_NVM_PROTECTION_TYPE3 = 0x3, - }; - ``` - -2. 描述 - - 磁盘支持的保护类型。 - -3. 备注 - - ES3000仅支持保护类型0和保护类型3,Optane盘仅支持保护类型0和保护类型1。 - -##### enum libstorage_crc_and_prchk - -1. 
原型 - - ```shell - enum libstorage_crc_and_prchk - { - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK = 0x0, - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK = 0x1, - LIBSTORAGE_LIB_CRC_AND_DISABLE_PRCHK = 0x2, - LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK = 0x3, - # define NVME_NO_REF 0x4 - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK_NO_REF = LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK | NVME_NO_REF, - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK_NO_REF = LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK | NVME_NO_REF, - }; - ``` - -2. 描述 - - - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK:应用层做CRC校验,HSAK不做CRC校验,关闭盘的CRC校验。 - - - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK:应用层做CRC校验,HSAK不做CRC校验,开启盘的CRC校验。 - - - LIBSTORAGE_LIB_CRC_AND_DISABLE_PRCHK:应用层不做CRC校验,HSAK做CRC校验,关闭盘的CRC校验。 - - - LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK:应用层不做CRC校验,HSAK做CRC校验,开启盘的CRC校验。 - - - LIBSTORAGE_APP_CRC_AND_DISABLE_PRCHK_NO_REF:应用层做CRC校验,HSAK不做CRC校验,关闭盘的CRC校验。对于PI TYPE为1的磁盘(Intel optane P4800),关闭盘的REF TAG校验。 - - - LIBSTORAGE_APP_CRC_AND_ENABLE_PRCHK_NO_REF:应用层做CRC校验,HSAK不做CRC校验,开启盘的CRC校验。对于PI TYPE为1的磁盘(Intel optane P4800),关闭盘的REF TAG校验。 - - - Intel optane P4800盘PI TYPE为1,默认会校验元数据区的CRC和REF TAG。 - - - Intel optane P4800盘的512+8格式支持DIF,4096+64格式不支持。 - - - ES3000 V3和ES3000 V5盘PI TYPE为3,默认只校验元数据区的CRC。 - - - ES3000 V3的512+8格式支持DIF,4096+64格式不支持。ES3000 V5的512+8和4096+64格式均支持DIF。 - - 总结为如下: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
端到端校验方式ctrlflagCRC生成者写流程读流程
应用校验HSAK校验CRC盘校验CRC应用校验HSAK校验CRC盘校验CRC
半程保护0控制器XXXXXX
1控制器XXXXX
2控制器XXXXXX
3控制器XXXXX
全程保护0APPXXXX
1APPXX
2HSAKXXXX
3HSAKXX
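
结合上述枚举值,下面给出一个示意性的最小写IO片段,演示如何通过dif_flag选择CRC校验组合(各接口原型与详细约束以后文"API"章节为准)。其中配置文件路径、块设备名nvme0n1、扇区大小512字节、namespace为512+0格式(无独立元数据)等均为示例假设,实际使用时请按环境调整:

```c
/*
 * 示意代码(非完整工程示例):假设盘已通过setup.sh绑定到用户态,
 * 配置文件为默认的/etc/spdk/nvme.conf.in,块设备名为nvme0n1,
 * 且对应namespace为512+0格式(无独立元数据,md_buf传NULL)。
 * 注意:该片段会向盘上偏移0处写入数据,仅建议在测试盘上运行。
 */
#include <stdio.h>
#include <string.h>
#include "bdev_rw.h"

static volatile int g_io_done = 0;

/* 与LIBSTORAGE_CALLBACK_FUNC原型一致的IO完成回调 */
static void write_done(int32_t cb_status, int32_t sct_code, void *cb_arg)
{
    /* cb_status为0表示成功,负值为系统错误码,正值为硬盘错误码 */
    printf("write done, cb_status=%d, sct_code=%d\n", cb_status, sct_code);
    g_io_done = 1;
}

int main(void)
{
    if (libstorage_init_module("/etc/spdk/nvme.conf.in") != 0) {
        return -1;
    }

    int32_t fd = libstorage_open("nvme0n1");
    if (fd < 0) {
        libstorage_exit_module();
        return -1;
    }

    /* 单次IO为sector_size的整数倍,这里假设扇区大小为512字节 */
    size_t nbytes = 4096;
    void *buf = libstorage_alloc_io_buf(nbytes);
    memset(buf, 0x5a, nbytes);

    /* dif_flag选用LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK:
       应用层不做CRC校验,由HSAK做CRC校验,并开启盘侧校验 */
    int32_t ret = libstorage_async_write(fd, buf, nbytes, 0, NULL, 0,
                                         LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK,
                                         write_done, NULL);
    if (ret == 0) {
        while (!g_io_done) {
            /* 简单轮询等待回调完成,实际业务可结合自身事件机制 */
        }
    }

    libstorage_free_io_buf(buf, nbytes);
    libstorage_close(fd);
    libstorage_exit_module();
    return 0;
}
```

此处选用LIBSTORAGE_LIB_CRC_AND_ENABLE_PRCHK仅作演示,实际应根据盘的PI类型以及应用层是否自行计算CRC,在上述枚举值中选择合适的组合。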
- -##### enum libstorage_print_log_level - -1. 原型 - - ```shell - enum libstorage_print_log_level - { - LIBSTORAGE_PRINT_LOG_ERROR, - LIBSTORAGE_PRINT_LOG_WARN, - LIBSTORAGE_PRINT_LOG_NOTICE, - LIBSTORAGE_PRINT_LOG_INFO, - LIBSTORAGE_PRINT_LOG_DEBUG, - }; - ``` - -2. 描述 - - SPDK日志打印级别:ERROR、WARN、NOTICE、INFO、DEBUG,分别对应配置文件中的0~4。 - -##### MAX_BDEV_NAME_LEN - -1. 原型 - - ```bash - # define MAX_BDEV_NAME_LEN 24 - ``` - -2. 描述 - - 块设备名最大长度限制。 - -##### MAX_CTRL_NAME_LEN - -1. 原型 - - ```bash - # define MAX_CTRL_NAME_LEN 16 - ``` - -2. 描述 - - 控制器名最大长度限制。 - -##### LBA_FORMAT_NUM - -1. 原型 - - ```bash - # define LBA_FORMAT_NUM 16 - ``` - -2. 描述 - - 控制器所支持的LBA格式数目。 - -##### LIBSTORAGE_MAX_DSM_RANGE_DESC_COUNT - -1. 原型 - - ```bash - # define LIBSTORAGE_MAX_DSM_RANGE_DESC_COUNT 256 - ``` - -2. 描述 - - 数据集管理命令中16字节集的最大数目。 - -#### ublock.h - -##### UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO - -1. 原型 - - ```bash - # define UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO 1 - ``` - -2. 描述 - - 用于定义uevent事件所对应的子系统是内核uio,在业务收到uevent事件时,通过该宏定义判断是否为需要处理的内核uio事件。 - - 数据结构struct ublock_uevent中成员int subsystem的值取值为UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO,当前仅此一个可选值。 - -##### UBLOCK_TRADDR_MAX_LEN - -1. 原型 - - ```bash - # define UBLOCK_TRADDR_MAX_LEN 256 - ``` - -2. 描述 - - 以"域:总线:设备.功能"(%04x:%02x:%02x.%x)格式表示的PCI地址字符串的最大长度,其实实际长度远小于256字节。 - -##### UBLOCK_PCI_ADDR_MAX_LEN - -1. 原型 - - ```bash - # define UBLOCK_PCI_ADDR_MAX_LEN 256 - ``` - -2. 描述 - - PCI地址字符串最大长度,实际长度远小于256字节;此处PCI地址格式可能的形式为: - - - 全地址:%x:%x:%x.%x 或 %x.%x.%x.%x。 - - - 功能值为0:%x:%x:%x。 - - - 域值为0:%x:%x.%x 或 %x.%x.%x。 - - - 域和功能值为0:%x:%x 或 %x.%x。 - -##### UBLOCK_SMART_INFO_LEN - -1. 原型 - - ```bash - # define UBLOCK_SMART_INFO_LEN 512 - ``` - -2. 描述 - - 获取NVMe盘SMART信息结构体的大小,为512字节。 - -##### enum ublock_rpc_server_status - -1. 原型 - - ```bash - enum ublock_rpc_server_status { - // start rpc server or not - UBLOCK_RPC_SERVER_DISABLE = 0, - UBLOCK_RPC_SERVER_ENABLE = 1, - }; - ``` - -2. 描述 - - 用于表示HSAK内部RPC服务状态,启用或关闭。 - -##### enum ublock_nvme_uevent_action - -1. 原型 - - ```bash - enum ublock_nvme_uevent_action { - UBLOCK_NVME_UEVENT_ADD = 0, - UBLOCK_NVME_UEVENT_REMOVE = 1, - UBLOCK_NVME_UEVENT_INVALID, - }; - ``` - -2. 描述 - - 用于表示uevent热插拔事件是插入硬盘还是移除硬盘。 - -##### enum ublock_subsystem_type - -1. 原型 - - ```bash - enum ublock_subsystem_type { - SUBSYSTEM_UIO = 0, - SUBSYSTEM_NVME = 1, - SUBSYSTEM_TOP - }; - ``` - -2. 描述 - - 指定回调函数类型,用于区分产品注册回调函数时是针对于uio驱动还是针对于内核nvme驱动。 - -### 数据结构 - -#### bdev_rw.h - -##### struct libstorage_namespace_info - -1. 原型 - - ```conf - struct libstorage_namespace_info - { - char name[MAX_BDEV_NAME_LEN]; - uint64_t size; /** namespace size in bytes */ - uint64_t sectors; /** number of sectors */ - uint32_t sector_size; /** sector size in bytes */ - uint32_t md_size; /** metadata size in bytes */ - uint32_t max_io_xfer_size; /** maximum i/o size in bytes */ - uint16_t id; /** namespace id */ - uint8_t pi_type; /** end-to-end data protection information type */ - uint8_t is_active :1; /** namespace is active or not */ - uint8_t ext_lba :1; /** namespace support extending LBA size or not */ - uint8_t dsm :1; /** namespace supports Dataset Management or not */ - uint8_t pad :3; - uint64_t reserved; - }; - ``` - -2. 描述 - - 该数据结构中包含硬盘namespace相关信息。 - -3. 
结构体成员 - - | **成员** | 描述 | - |------------------------------|------------------------------------------------| - | char name[MAX_BDEV_NAME_LEN] | Namespace名字 | - | uint64_t size | 该namespace所分配的硬盘大小,字节为单位 | - | uint64_t sectors | 扇区数 | - | uint32_t sector_size | 每扇区大小,字节为单位 | - | uint32_t md_size | Metadata大小,字节为单位 | - | uint32_t max_io_xfer_size | 最大允许的单次IO操作数据大小,字节为单位 | - | uint16_t id | Namespace ID | - | uint8_t pi_type | 数据保护类型,取值自enum libstorage_ns_pi_type | - | uint8_t is_active :1 | Namespace是否激活 | - | uint8_t ext_lba :1 | Namespace是否支持扩展LBA | - | uint8_t dsm :1 | Namespace是否支持数据集管理 | - | uint8_t pad :3 | 保留字段 | - | uint64_t reserved | 保留字段 | - -##### struct libstorage_nvme_ctrlr_info - -1. 原型 - - ```conf - struct libstorage_nvme_ctrlr_info - { - char name[MAX_CTRL_NAME_LEN]; - char address[24]; - struct - { - uint32_t domain; - uint8_t bus; - uint8_t dev; - uint8_t func; - } pci_addr; - uint64_t totalcap; /* Total NVM Capacity in bytes */ - uint64_t unusecap; /* Unallocated NVM Capacity in bytes */ - int8_t sn[20]; /* Serial number */ - uint8_t fr[8]; /* Firmware revision */ - uint32_t max_num_ns; /* Number of namespaces */ - uint32_t version; - uint16_t num_io_queues; /* num of io queues */ - uint16_t io_queue_size; /* io queue size */ - uint16_t ctrlid; /* Controller id */ - uint16_t pad1; - struct - { - struct - { - /** metadata size */ - uint32_t ms : 16; - /** lba data size */ - uint32_t lbads : 8; - uint32_t reserved : 8; - } lbaf[LBA_FORMAT_NUM]; - uint8_t nlbaf; - uint8_t pad2[3]; - uint32_t cur_format : 4; - uint32_t cur_extended : 1; - uint32_t cur_pi : 3; - uint32_t cur_pil : 1; - uint32_t cur_can_share : 1; - uint32_t mc_extented : 1; - uint32_t mc_pointer : 1; - uint32_t pi_type1 : 1; - uint32_t pi_type2 : 1; - uint32_t pi_type3 : 1; - uint32_t md_start : 1; - uint32_t md_end : 1; - uint32_t ns_manage : 1; /* Supports the Namespace Management and Namespace Attachment commands */ - uint32_t directives : 1; /* Controller support Directives or not */ - uint32_t streams : 1; /* Controller support Streams Directives or not */ - uint32_t dsm : 1; /* Controller support Dataset Management or not */ - uint32_t reserved : 11; - } cap_info; - }; - ``` - -2. 描述 - - 该数据结构中包含硬盘控制器相关信息。 - -3. 结构体成员 - - | **成员** | **描述** | - |----------|----------| - | char name[MAX_CTRL_NAME_LEN] | 控制器名字 | - | char address[24] | PCI地址,字符串形式 | - | struct
{
uint32_t domain;
uint8_t bus;
uint8_t dev;
uint8_t func;
} pci_addr | PCI地址,分段形式 | - | uint64_t totalcap | 控制器的总容量大小(字节为单位)Optane盘基于NVMe 1.0协议,不支持该字段 | - | uint64_t unusecap | 控制器未使用的容量大小(字节为单位)Optane盘基于NVMe 1.0协议,不支持该字段 | - | int8_t sn[20]; | 硬盘序列号。不带'0'的ASCII字符串 | - | uint8_t fr[8]; | 硬盘firmware版本号。不带'0'的ASCII字符串 | - | uint32_t max_num_ns | 最大允许的namespace数 | - | uint32_t version | 控制器支持的NVMe标准协议版本号 | - | uint16_t num_io_queues | 硬盘支持的IO队列数量 | - | uint16_t io_queue_size | IO队列最大深度 | - | uint16_t ctrlid | 控制器ID | - | uint16_t pad1 | 保留字段 | - - struct cap_info子结构体成员: - - | **成员** | **描述** | - |-----------------------------------|------------------------------------| - | struct
{
uint32_t ms : 16;
uint32_t lbads : 8;
uint32_t reserved : 8;
}lbaf[LBA_FORMAT_NUM] | ms:元数据大小,最小为8字节
lbads:指示LBA大小为2^lbads,lbads不小于9 | - | uint8_t nlbaf | 控制器所支持的LBA格式数 | - | uint8_t pad2[3] | 保留字段 | - | uint32_t cur_format : 4 | 控制器当前的LBA格式 | - | uint32_t cur_extended : 1 | 控制器当前是否支持扩展型LBA | - | uint32_t cur_pi : 3 | 控制器当前的保护类型 | - | uint32_t cur_pil : 1 | 控制器当前的PI(保护信息)位于元数据的first eight bytes或者last eight bytes | - | uint32_t cur_can_share : 1 | namespace是否支持多路径传输 | - | uint32_t mc_extented : 1 | 元数据是否作为数据缓冲区的一部分进行传输 | - | uint32_t mc_pointer : 1 | 元数据是否与数据缓冲区分离 | - | uint32_t pi_type1 : 1 | 控制器是否支持保护类型一 | - | uint32_t pi_type2 : 1 | 控制器是否支持保护类型二 | - | uint32_t pi_type3 : 1 | 控制器是否支持保护类型三 | - | uint32_t md_start : 1 | 控制器是否支持PI(保护信息)位于元数据的first eight bytes | - | uint32_t md_end : 1 | 控制器是否支持PI(保护信息)位于元数据的last eight bytes | - | uint32_t ns_manage : 1 | 控制器是否支持namespace管理 | - | uint32_t directives : 1 | 是否支持Directives命令集 | - | uint32_t streams : 1 | 是否支持Streams Directives | - | uint32_t dsm : 1 | 是否支持Dataset Management命令 | - | uint32_t reserved : 11 | 保留字段 | - -##### struct libstorage_dsm_range_desc - -1. 原型 - - ```bash - struct libstorage_dsm_range_desc - { - /* RESERVED */ - uint32_t reserved; - - /* NUMBER OF LOGICAL BLOCKS */ - uint32_t block_count; - - /* UNMAP LOGICAL BLOCK ADDRESS */uint64_t lba;}; - ``` - -2. 描述 - - 数据管理命令集中单个16字节集的定义。 - -3. 结构体成员 - - | **成员** | **描述** | - |----------------------|--------------| - | uint32_t reserved | 保留字段 | - | uint32_t block_count | 单位LBA的数量 | - | uint64_t lba | 起始LBA | - -##### struct libstorage_ctrl_streams_param - -1. 原型 - - ```bash - struct libstorage_ctrl_streams_param - { - /* MAX Streams Limit */ - uint16_t msl; - - /* NVM Subsystem Streams Available */ - uint16_t nssa; - - /* NVM Subsystem Streams Open */uint16_t nsso; - - uint16_t pad; - }; - ``` - -2. 描述 - - NVMe盘支持的Streams属性值。 - -3. 结构体成员 - - | **成员** | **描述** | - |---------------|--------------------------------------| - | uint16_t msl | 硬盘支持的最大Streams资源数 | - | uint16_t nssa | 每个NVM子系统可使用的Streams资源数 | - | uint16_t nsso | 每个NVM子系统已经使用的Streams资源数 | - | uint16_t pad | 保留字段 | - -##### struct libstorage_bdev_streams_param - -1. 原型 - - ```bash - struct libstorage_bdev_streams_param - { - /* Stream Write Size */ - uint32_t sws; - - /* Stream Granularity Size */ - uint16_t sgs; - - /* Namespace Streams Allocated */ - uint16_t nsa; - - /* Namespace Streams Open */ - uint16_t nso; - - uint16_t reserved[3]; - }; - ``` - -2. 描述 - - Namespace的Streams属性值。 - -3. 结构体成员 - - |**成员** | **描述** | - |-------------------------|---------------------------------| - |uint32_t sws |性能最优的写粒度,单位:sectors| - |uint16_t sgs |Streams分配的写粒度,单位:sws| - |uint16_t nsa |Namespace可使用的私有Streams资源数| - |uint16_t nso |Namespace已使用的私有Streams资源数| - |uint16_t reserved[3] |保留字段| - -##### struct libstorage_mgr_info - -1. 原型 - - ```bash - struct libstorage_mgr_info - { - char pci[24]; - char ctrlName[MAX_CTRL_NAME_LEN]; - uint64_t sector_size; - uint64_t cap_size; - uint16_t device_id; - uint16_t subsystem_device_id; - uint16_t vendor_id; - uint16_t subsystem_vendor_id; - uint16_t controller_id; - int8_t serial_number[20]; - int8_t model_number[40]; - uint8_t firmware_revision[8]; - }; - ``` - -2. 描述 - - 磁盘管理信息(与管理面使用的磁盘信息一致)。 - -3. 
结构体成员 - - |**成员** | **描述**| - |-------------------------|------------------------------------| - |char pci[24] |磁盘PCI地址字符串| - |char ctrlName[MAX_CTRL_NAME_LEN] |磁盘控制器名字符串| - |uint64_t sector_size |磁盘扇区大小| - |uint64_t cap_size |磁盘容量,单位:字节| - |uint16_t device_id |磁盘设备ID| - |uint16_t subsystem_device_id |磁盘子系统设备ID| - |uint16­_t vendor_id |磁盘厂商ID| - |uint16_t subsystem_vendor_id |磁盘子系统厂商ID| - |uint16_t controller_id |磁盘控制器ID| - |int8_t serial_number[20] |磁盘序列号| - |int8_t model_number[40] |设备型号| - |uint8_t firmware_revision[8] |固件版本号| - -##### struct __attribute__((packed)) libstorage_smart_info - -1. 原型 - - ```bash - /* same with struct spdk_nvme_health_information_page in nvme_spec.h */ - struct __attribute__((packed)) libstorage_smart_info { - /* details of uint8_t critical_warning - - union spdk_nvme_critical_warning_state { - - uint8_t raw; - * - - struct { - - uint8_t available_spare : 1; - - uint8_t temperature : 1; - - uint8_t device_reliability : 1; - - uint8_t read_only : 1; - - uint8_t volatile_memory_backup : 1; - - uint8_t reserved : 3; - - } bits; - - }; - */ - uint8_t critical_warning; - uint16_t temperature; - uint8_t available_spare; - uint8_t available_spare_threshold; - uint8_t percentage_used; - uint8_t reserved[26]; - - /* - - Note that the following are 128-bit values, but are - - defined as an array of 2 64-bit values. - */ - /* Data Units Read is always in 512-byte units. */ - uint64_t data_units_read[2]; - /* Data Units Written is always in 512-byte units. */ - uint64_t data_units_written[2]; - /* For NVM command set, this includes Compare commands. */ - uint64_t host_read_commands[2]; - uint64_t host_write_commands[2]; - /* Controller Busy Time is reported in minutes. */ - uint64_t controller_busy_time[2]; - uint64_t power_cycles[2]; - uint64_t power_on_hours[2]; - uint64_t unsafe_shutdowns[2]; - uint64_t media_errors[2]; - uint64_t num_error_info_log_entries[2]; - /* Controller temperature related. */ - uint32_t warning_temp_time; - uint32_t critical_temp_time; - uint16_t temp_sensor[8]; - uint8_t reserved2[296]; - }; - ``` - -2. 描述 - - 该数据结构定义了硬盘SMART INFO信息内容。 - -3. 结构体成员 - - | **成员** | **描述(具体可以参考NVMe协议)** | - |-----------------------------------|------------------------------------| - | uint8_t critical_warning | 该域表示控制器状态的重要的告警,bit位设置为1表示有效,可以设置
多个bit位有效。重要的告警信息通过异步事件返回给主机端。
Bit0:设置为1时表示冗余空间小于设定的阈值
Bit1:设置为1时表示温度超过或低于一个重要的阈值
Bit2:设置为1时表示由于重要的media错误或者internal error,器件的可靠性已经降低。
Bit3:设置为1时,该介质已经被置为只读模式。
Bit4:设置为1时,表示控制器的易失性器件fail,该域仅在控制器内部存在易失性器件时有效。
Bit 5~7:保留 | - | uint16_t temperature | 表示整个器件的温度,单位为Kelvin。 | - | uint8_t available_spare | 表示可用冗余空间的百分比(0到100%)。 | - | uint8_t available_spare_threshold | 可用冗余空间的阈值,低于该阈值时上报异步事件。 | - | uint8_t percentage_used | 该值表示用户实际使用和厂家设定的器件寿命的百分比,100表示已经达
到厂家预期的寿命,但可能不会失效,可以继续使用。该值允许大于100
,高于254的值都会被置为255。 | - | uint8_t reserved[26] | 保留 | - | uint64_t data_units_read[2] | 该值表示主机端从控制器中读走的512字节数目,其中1表示读走100
0个512字节,该值不包括metadata。当LBA大小不为512
B时,控制器将其转换成512B进行计算。16进制表示。 | - | uint64_t data_units_written[2] | 该值表示主机端写入控制器中的512字节数目,其中1表示写入1000
个512字节,该值不包括metadata。当LBA大小不为512B
时,控制器将其转换成512B进行计算。16进制表示。 | - | uint64_t host_read_commands[2] | 表示下发到控制器的读命令的个数。 | - | uint64_t host_write_commands[2] | 表示下发到控制器的写命令的个数 | - | uint64_t controller_busy_time[2] | 表示控制器处理I/O命令的busy时间,从命令下发SQ到完成命令返回到CQ的整个过程都为busy。该值以分钟为单位。 | - | uint64_t power_cycles[2] | 上下电次数。 | - | uint64_t power_on_hours[2] | power-on时间小时数。 | - | uint64_t unsafe_shutdowns[2] | 异常关机次数,掉电时仍未接收到CC.SHN时该值加1。 | - | uint64_t media_errors[2] | 表示控制器检测到不可恢复的数据完整性错误的次数,
其中包括不可纠的ECC错误,CRC错误,LBA tag不匹配。 | - | uint64_t num_error_info_log_entries[2] | 该域表示控制器生命周期内的错误信息日志的entry数目。 | - | uint32_t warning_temp_time | 温度超过warning告警值的累积时间,单位分钟。 | - | uint32_t critical_temp_time | 温度超过critical告警值的累积时间,单位分钟。 | - | uint16_t temp_sensor[8] | 温度传感器1~8的温度值,单位Kelvin。 | - | uint8_t reserved2[296] | 保留 | - -##### libstorage_dpdk_contig_mem - -1. 原型 - - ```bash - struct libstorage_dpdk_contig_mem { - uint64_t virtAddr; - uint64_t memLen; - uint64_t allocLen; - }; - ``` - -2. 描述 - - DPDK内存初始化之后,通知业务层初始化完成的回调函数参数中描述一段连续虚拟内存的信息。 - - 当前HSAK预留了800M内存,其他内存通过该结构体中的allocLen返回给业务层,用于业务层申请内存自行管理。 - - HSAK需要预留的总内存是800M左右,每一个内存段上预留的内存是根据环境的NUMA节点数来计算的。在NUMA节点过多时,每个内存段上预留的内存过小,会导致HSAK初始化失败。因此HSAK只支持最多4个NUMA节点的环境。 - -3. 结构体成员 - - | **成员** | **描述** | - |--------------------|----------------------| - |uint64_t virtAddr |虚拟内存起始地址。| - |uint64_t memLen |虚拟内存长度,单位:字节。| - |uint64_t allocLen |该内存段中可用的内存长度,单位:字节。| - -##### struct libstorage_dpdk_init_notify_arg - -1. 原型 - - ```bash - struct libstorage_dpdk_init_notify_arg { - uint64_t baseAddr; - uint16_t memsegCount; - struct libstorage_dpdk_contig_mem *memseg; - }; - ``` - -2. 描述 - - 用于DPDK内存初始化之后,通知业务层初始化完成的回调函数参数,表示所有虚拟内存段信息。 - -3. 结构体成员 - - | **成员** | **描述**| - |------------------------|-----------------------| - |uint64_t baseAddr |虚拟内存起始地址。| - |uint16_t memsegCount |有效的'memseg'数组成员个数,即连续的虚拟内存段的段数。| - |struct libstorage_dpdk_contig_mem *memseg |指向内存段数组的指针,每个数组元素都是一段连续的虚拟内存,两两元素之间是不连续的。| - -##### struct libstorage_dpdk_init_notify - -1. 原型 - - ```bash - struct libstorage_dpdk_init_notify { - const char *name; - void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg *arg); - TAILQ_ENTRY(libstorage_dpdk_init_notify) tailq; - }; - ``` - -2. 描述 - - 用于DPDK内存初始化之后,通知业务层回调函数注册的结构体。 - -3. 结构体成员 - - | **成员** | **描述**| - |-------------------------------|--------------------------| - |const char *name |注册的回调函数的业务层模块名字。| - |void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg*arg) |DPDK内存初始化之后,通知业务层初始化完成的回调函数参数。| - |TAILQ_ENTRY(libstorage_dpdk_init_notify) tailq |存放回调函数注册的链表。| - -#### ublock.h - -##### struct ublock_bdev_info - -1. 原型 - - ```bash - struct ublock_bdev_info { - uint64_t sector_size; - uint64_t cap_size; // cap_size - uint16_t device_id; - uint16_t subsystem_device_id; // subsystem device id of nvme control - uint16_t vendor_id; - uint16_t subsystem_vendor_id; - uint16_t controller_id; - int8_t serial_number[20]; - int8_t model_number[40]; - int8_t firmware_revision[8]; - }; - ``` - -2. 描述 - - 该数据结构中包含硬盘设备信息。 - -3. 结构体成员 - - | **成员** | **描述**| - |------------------|------------| - |uint64_t sector_size |硬盘扇区大小,比如512字节 | - |uint64_t cap_size |硬盘总容量,字节为单位 | - |uint16_t device_id |设备id号 | - |uint16_t subsystem_device_id |子系统的设备id号 | - |uint16_t vendor_id |设备厂商主id号 | - |uint16_t subsystem_vendor_id |设备厂商子id号 | - |uint16_t controller_id |设备控制器id号 | - |int8_t serial_number[20] |设备序列号 | - |int8_t model_number[40] |设备型号 | - |int8_t firmware_revision[8] |固件版本号 | - -##### struct ublock_bdev - -1. 原型 - - ```bash - struct ublock_bdev { - char pci[UBLOCK_PCI_ADDR_MAX_LEN]; - struct ublock_bdev_info info; - struct spdk_nvme_ctrlr *ctrlr; - TAILQ_ENTRY(ublock_bdev) link; - }; - ``` - -2. 描述 - - 该数据结构中包含指定PCI地址的硬盘信息,而结构本身为队列的一个节点。 - -3. 
结构体成员 - - |**成员** | **描述** | - |-----------------------------------|----------------------------------------------------------------------------------------------------| - |char pci[UBLOCK_PCI_ADDR_MAX_LEN] | PCI地址 | - |struct ublock_bdev_info info | 硬盘设备信息 | - |struct spdk_nvme_ctrlr *ctrlr | 设备控制器数据结构,该结构体内成员不对外开放,外部业务可通过SPDK开源接口获取相应成员数据。 | - |TAILQ_ENTRY(ublock_bdev) link | 队列前后指针结构体 | - -##### struct ublock_bdev_mgr - -1. 原型 - - ```bash - struct ublock_bdev_mgr { - TAILQ_HEAD(, ublock_bdev) bdevs; - }; - ``` - -2. 描述 - - 该数据结构内定义了一个ublock_bdev队列的头结构。 - -3. 结构体成员 - - |**成员** | **描述** | - |---------------------------------|------------------| - |TAILQ_HEAD(, ublock_bdev) bdevs; | 队列头结构体 | - -##### struct __attribute__((packed)) ublock_SMART_info - -1. 原型 - - ```bash - struct __attribute__((packed)) ublock_SMART_info { - uint8_t critical_warning; - uint16_t temperature; - uint8_t available_spare; - uint8_t available_spare_threshold; - uint8_t percentage_used; - uint8_t reserved[26]; - /* - - Note that the following are 128-bit values, but are - - defined as an array of 2 64-bit values. - */ - /* Data Units Read is always in 512-byte units. */ - uint64_t data_units_read[2]; - /* Data Units Written is always in 512-byte units. */ - uint64_t data_units_written[2]; - /* For NVM command set, this includes Compare commands. */ - uint64_t host_read_commands[2]; - uint64_t host_write_commands[2]; - /* Controller Busy Time is reported in minutes. */ - uint64_t controller_busy_time[2]; - uint64_t power_cycles[2]; - uint64_t power_on_hours[2]; - uint64_t unsafe_shutdowns[2]; - uint64_t media_errors[2]; - uint64_t num_error_info_log_entries[2]; - /* Controller temperature related. */ - uint32_t warning_temp_time; - uint32_t critical_temp_time; - uint16_t temp_sensor[8]; - uint8_t reserved2[296]; - }; - ``` - -2. 描述 - - 该数据结构定义了硬盘SMART INFO信息内容。 - -3. 结构体成员 - - | **成员** | **描述(具体可以参考NVMe协议)** | - |-----------------------------------|-----------------------------------| - | uint8_t critical_warning | 该域表示控制器状态的重要的告警,bit位设置为1表示有效,可以设置
多个bit位有效。重要的告警信息通过异步事件返回给主机端。
Bit0:设置为1时表示冗余空间小于设定的阈值
Bit1:设置为1时表示温度超过或低于一个重要的阈值
Bit2:设置为1时表示由于重要的media错误或者internal error,器件的可靠性已经降低。
Bit3:设置为1时,该介质已经被置为只读模式。
Bit4:设置为1时,表示控制器的易失性器件fail,该域仅在控制器内部存在易失性器件时有效。
Bit 5~7:保留 | - | uint16_t temperature | 表示整个器件的温度,单位为Kelvin。 | - | uint8_t available_spare | 表示可用冗余空间的百分比(0到100%)。 | - | uint8_t available_spare_threshold | 可用冗余空间的阈值,低于该阈值时上报异步事件。 | - | uint8_t percentage_used | 该值表示用户实际使用和厂家设定的器件寿命的百分比,100表示已经达
到厂家预期的寿命,但可能不会失效,可以继续使用。该值允许大于100
,高于254的值都会被置为255。 | - | uint8_t reserved[26] | 保留 | - | uint64_t data_units_read[2] | 该值表示主机端从控制器中读走的512字节数目,其中1表示读走100
0个512字节,该值不包括metadata。当LBA大小不为512B
时,控制器将其转换成512B进行计算。16进制表示。 | - | uint64_t data_units_written[2] | 该值表示主机端写入控制器中的512字节数目,其中1表示写入1000
个512字节,该值不包括metadata。当LBA大小不为512B
时,控制器将其转换成512B进行计算。16进制表示。 | - | uint64_t host_read_commands[2] | 表示下发到控制器的读命令的个数。 | - | uint64_t host_write_commands[2] | 表示下发到控制器的写命令的个数 | - | uint64_t controller_busy_time[2] | 表示控制器处理I/O命令的busy时间,从命令下发SQ到完成命令返回到CQ的整个过程都为busy。该值以分钟为单位。| - | uint64_t power_cycles[2] | 上下电次数。 | - | uint64_t power_on_hours[2] | power-on时间小时数。 | - | uint64_t unsafe_shutdowns[2] | 异常关机次数,掉电时仍未接收到CC.SHN时该值加1。 | - | uint64_t media_errors[2] | 表示控制器检测到不可恢复的数据完整性错误的次数,其中包括不可纠的E
CC错误,CRC错误,LBA
tag不匹配。 | - | uint64_t num_error_info_log_entries[2] | 该域表示控制器生命周期内的错误信息日志的entry数目。 | - | uint32_t warning_temp_time | 温度超过warning告警值的累积时间,单位分钟。 | - | uint32_t critical_temp_time | 温度超过critical告警值的累积时间,单位分钟。 | - | uint16_t temp_sensor[8] | 温度传感器1~8的温度值,单位Kelvin。 | - | uint8_t reserved2[296] | 保留 | - -##### struct ublock_nvme_error_info - -1. 原型 - - ```bash - struct ublock_nvme_error_info { - uint64_t error_count; - uint16_t sqid; - uint16_t cid; - uint16_t status; - uint16_t error_location; - uint64_t lba; - uint32_t nsid; - uint8_t vendor_specific; - uint8_t reserved[35]; - }; - ``` - -2. 描述 - - 该数据结构中包含设备控制器中单条错误信息具体内容,不同控制器可支持的错误条数可能不同。 - -3. 结构体成员 - - |**成员** | **描述(具体可以参考NVMe协议)** | - |------------------------|----------------------------------------------------------------------------------------------------------------------------------------| - |uint64_t error_count | Error序号,累增。 | - |uint16_t sqid | 此字段指示与错误信息关联的命令的提交队列标识符。如果错误无法关联特定命令,则该字段应设置为FFFFh。 | - |uint16_t cid | 此字段指示与错误信息关联的命令标识符。如果错误无法关联特定命令,则该字段应设置为FFFFh。 | - |uint16_t status | 此字段指示已完成命令的"状态字段"。 | - |uint16_t error_location | 此字段指示与错误信息关联的命令参数。 | - |uint64_t lba | 该字段表示遇到错误情况的第一个LBA。 | - |uint32_t nsid | 该字段表示遇到错误情况的namespace。 | - |uint8_t vendor_specific | 如果有其他供应商特定的错误信息可用,则此字段提供与该页面关联的日志页面标识符。 值00h表示没有可用的附加信息。有效值的范围为80h至FFh。 | - |uint8_t reserved[35] | 保留 | - -##### struct ublock_uevent - -1. 原型 - - ```bash - struct ublock_uevent { - enum ublock_nvme_uevent_action action; - int subsystem; - char traddr[UBLOCK_TRADDR_MAX_LEN + 1]; - }; - ``` - -2. 描述 - - 该数据结构中包含用于表示uevent事件的相关参数。 - -3. 结构体成员 - - | **成员** | **描述** | - |----------------------------------------|-------------------------------------------------------------------------------------------------------------------------| - | enum ublock_nvme_uevent_action action | 通过枚举,表示uevent事件类型为插入硬盘,还是移除硬盘。 | - | int subsystem | 表示uevent事件的子系统类型,当前仅支持UBLOCK_NVME_UEVENT_SUBSYSTEM_UIO,如果应用程序收到其他值,则可不处理。 | - | char traddr[UBLOCK_TRADDR_MAX_LEN + 1] | 以"域:总线:设备.功能"(%04x:%02x:%02x.%x)格式表示的PCI地址字符串。 | - -##### struct ublock_hook - -1. 原型 - - ```bash - struct ublock_hook - { - ublock_callback_func ublock_callback; - void *user_data; - }; - ``` - -2. 描述 - - 该数据结构用于注册回调函数。 - -3. 结构体成员 - - | **成员** | **描述** | - |---------------------------------------|---------------------------------------------------------------------------| - | ublock_callback_func ublock_callback | 表示回调时执行的函数,类型为bool func(void *info, void*user_data). | - | void *user_data | 传给回调函数的用户参数 | - -##### struct ublock_ctrl_iostat_info - -1. 原型 - - ```bash - struct ublock_ctrl_iostat_info - { - uint64_t num_read_ops; - uint64_t num_write_ops; - uint64_t read_latency_ms; - uint64_t write_latency_ms; - uint64_t io_outstanding; - uint64_t num_poll_timeout; - uint64_t io_ticks_ms; - }; - ``` - -2. 描述 - - 该数据结构用于获取控制器的IO统计信息。 - -3. 结构体成员 - - | **成员** | **描述** | - |-----------------------------|---------------------------------------------| - | uint64_t num_read_ops | 获取的该控制器的读IO个数(累加值) | - | uint64_t num_write_ops | 获取的该控制器的写IO个数(累加值) | - | uint64_t read_latency_ms | 获取的该控制器的读时延(累加值,ms) | - | uint64_t write_latency_ms | 获取的该控制器的写时延(累加值,ms) | - | uint64_t io_outstanding | 获取的该控制器的队列深度 | - | uint64_t num_poll_timeout | 获取的该控制器的轮询超时次数(累加值) | - | uint64_t io_ticks_ms | 获取的该控制器的IO处理时延(累加值,ms) | - -### API - -#### bdev_rw.h - -##### libstorage_get_nvme_ctrlr_info - -1. 接口原型 - - uint32_t libstorage_get_nvme_ctrlr_info(struct libstorage_nvme_ctrlr_info** ppCtrlrInfo); - -2. 接口描述 - - 获取所有控制器信息。 - -3. 
参数 - - | **参数成员** | **描述** | - |-----------------------------------|-----------------------------------| - | struct libstorage_nvme_ctrlr_info** ppCtrlrInfo| 出参,返回所有获取到的控制器信息。
说明:
使用后务必通过free接口释放内存。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|----------------------------------------------| - | 0 | 控制器信息获取失败,或未获取到任何控制器信息 | - | 大于0 | 获取到的控制器个数 | - -##### libstorage_get_mgr_info_by_esn - -1. 接口原型 - - ```bash - int32_t libstorage_get_mgr_info_by_esn(const char *esn, struct libstorage_mgr_info *mgr_info); - ``` - -2. 接口描述 - - 数据面获取设备序列号(ESN)对应的NVMe磁盘的管理信息。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------------------|----------------------------------------------| - | const char *esn | 被查询设备的ESN号
说明:
ESN号是最大有效长度为20的字符串(不包括字符串结束符),但该长
度根据不同硬件厂商可能存在差异,如不足20字符,需要在字符串末尾加
空格补齐。 | - | struct libstorage_mgr_info *mgr_info | 出参,返回所有获取到的NVMe磁盘管理信息。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|------------------------------------------| - | 0 | 查询ESN对应的NVMe磁盘管理信息成功。 | - | -1 | 查询ESN对应的NVMe磁盘管理信息失败。 | - | -2 | 未获取到任何匹配ESN的NVMe磁盘。 | - -##### libstorage_get_mgr_smart_by_esn - -1. 接口原型 - - ```bash - int32_t libstorage_get_mgr_smart_by_esn(const char *esn, uint32_t nsid, struct libstorage_smart_info *mgr_smart_info); - ``` - -2. 接口描述 - - 数据面获取设备序列号(ESN)对应的NVMe磁盘的SMART信息。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------|------------------------------------------| - | const char *esn | 被查询设备的ESN号
说明:
ESN号是最大有效长度为20的字符串(不包括字符串结束符),但该长
度根据不同硬件厂商可能存在差异,如不足20字符,需要在字符串末尾加
空格补齐。 | - | uint32_t nsid | 指定的namespace | - | struct libstorage_mgr_info *mgr_info | 出参,返回所有获取到的NVMe磁盘SMART信息。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|---------------------------------------| - | 0 | 查询ESN对应的NVMe磁盘SMART信息成功。 | - | -1 | 查询ESN对应的NVMe磁盘SMART信息失败。 | - | -2 | 未获取到任何匹配ESN的NVMe磁盘。 | - -##### libstorage_get_bdev_ns_info - -1. 接口原型 - - ```bash - uint32_t libstorage_get_bdev_ns_info(const char* bdevName, struct libstorage_namespace_info** ppNsInfo); - ``` - -2. 接口描述 - - 根据设备名称,获取namespace信息。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------|---------------------------------------| - | const char* bdevName | 设备名称 | - | struct libstorage_namespace_info** ppNsInfo | 出参,返回namespace信息。
说明
使用后务必通过free接口释放内存。 | - -4. 返回值 - - | **返回值**| **描述** | - |------------|---------------| - | 0 | 获取失败 | - | 1 | 获取成功 | - -##### libstorage_get_ctrl_ns_info - -1. 接口原型 - - ```bash - uint32_t libstorage_get_ctrl_ns_info(const char* ctrlName, struct libstorage_namespace_info** ppNsInfo); - ``` - -2. 接口描述 - - 根据控制器名称,获取所有namespace信息。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------|---------------------------------------| - | const char* ctrlName | 控制器名称 | - | struct libstorage_namespace_info** ppNsInfo| 出参,返回所有namespace信息。
说明
使用后务必通过free接口释放内存。 | - -4. 返回值 - - | **返回值**| **描述** | - |------------|-------------------------------------------| - | 0 | 获取失败,或未获取到任何namespace信息 | - | 大于0 | 获取到的namespace个数 | - -##### libstorage_create_namespace - -1. 接口原型 - - ```bash - int32_t libstorage_create_namespace(const char* ctrlName, uint64_t ns_size, char** outputName); - ``` - -2. 接口描述 - - 在指定控制器上创建namespace(前提是控制器具有namespace管理能力)。 - - Optane盘基于NVMe 1.0协议,不支持namespace管理,因此不支持该接口的使用。 - - ES3000 V3和V5默认只支持一个namespace。在控制器上默认会存在一个namespace,如果要创建新的namespace,需要将原有namespace删除。 - -3. 参数 - - | **参数成员** | **描述** | - |--------------------------|-------------------------------------------| - | const char* ctrlName | 控制器名称 | - | uint64_t ns_size | 要创建的namespace大小(以sertor_size为单位) | - | char** outputName | 出参:创建的namespace名称
说明
使用后务必通过free接口释放内存。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------------------| - | 小于等于0 | 创建namespace失败 | - | 大于0 | 所创建的namespace编号(从1开始) | - -##### libstorage_delete_namespace - -1. 接口原型 - - ```bash - int32_t libstorage_delete_namespace(const char* ctrlName, uint32_t ns_id); - ``` - -2. 接口描述 - - 在指定控制器上删除namespace。Optane盘基于NVMe 1.0协议,不支持namespace管理,因此不支持该接口的使用。 - -3. 参数 - - | **参数成员** | **描述** | - |-----------------------|-------------------| - | const char* ctrlName | 控制器名字 | - | uint32_t ns_id | Namespace ID | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-------------------------------------------| - | 0 | 删除成功 | - | 非0 | 删除失败
说明
删除namespace前要求先停止IO相关动作,否则删除失败。 | - -##### libstorage_delete_all_namespace - -1. 接口原型 - - ```bash - int32_t libstorage_delete_all_namespace(const char* ctrlName); - ``` - -2. 接口描述 - - 删除指定控制器上所有namespace。Optane盘基于NVMe 1.0协议,不支持namespace管理,因此不支持该接口的使用。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------|----------------| - | const char* ctrlName |控制器名称 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-------------------------------------------| - | 0 | 删除成功 | - | 非0 | 删除失败
说明
删除namespace前要求先停止IO相关动作,否则删除失败。 | - -##### libstorage_nvme_create_ctrlr - -1. 接口原型 - - ```bash - int32_t libstorage_nvme_create_ctrlr(const char *pci_addr, const char *ctrlr_name); - ``` - -2. 接口描述 - - 根据PCI地址创建NVMe控制器。 - -3. 参数 - - | **参数成员** | **描述** | - |--------------------|-------------------| - | char *pci_addr |PCI地址 | - | char *ctrlr_name |控制器名称 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------| - | 小于0 | 创建失败 | - | 0 | 创建成功 | - -##### libstorage_nvme_delete_ctrlr - -1. 接口原型 - - ```bash - int32_t libstorage_nvme_delete_ctrlr(const char *ctrlr_name); - ``` - -2. 接口描述 - - 根据控制器名称销毁NVMe控制器。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------|-----------------| - | const char *ctrlr_name | 控制器名称 | - - 确保已下发的io已经全部返回后方可调用本接口。 - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------| - | 小于0 | 销毁失败 | - | 0 | 销毁成功 | - -##### libstorage_nvme_reload_ctrlr - -1. 接口原型 - - ```bash - int32_t libstorage_nvme_reload_ctrlr(const char *cfgfile); - ``` - -2. 接口描述 - - 根据配置文件增删NVMe控制器。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------------|-------------------| - | const char *cfgfile | 配置文件路径 | - - 使用本接口删盘时,需要确保已下发的io已经全部返回。 - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------------------------------------| - | 小于0 | 根据配置文件增删盘失败(可能部分控制器增删成功) | - | 0 | 根据配置文件增删盘成功 | - -> 使用限制 - -- 目前最多支持在配置文件中配置36个控制器。 - -- 重加载接口会尽可能创建多的控制器,某个控制器创建失败,不会影响其他控制器的创建。 - -- 无法保证并发场景下最终的盘初始化情况与最后调用传入的配置文件相符。 - -- 对正在下发io的盘通过reload删除时,会导致io失败。 - -- 修改配置文件中pci地址对应的控制器名称(e.g.nvme0),调用此接口后无法生效。 - -- reload仅针对于增删盘的场景有效,配置文件中的其他配置项修改无法重载。 - -##### libstorage_low_level_format_nvm - -1. 接口原型 - - ```bash - int8_t libstorage_low_level_format_nvm(const char* ctrlName, uint8_t lbaf, - enum libstorage_ns_pi_type piType, - bool pil_start, bool ms_extented, uint8_t ses); - ``` - -2. 接口描述 - - 低级格式化NVMe盘。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------------|----------------------------------------------------------------------------| - | const char* ctrlName | 控制器名称 | - | uint8_t lbaf | 所要使用的LBA格式 | - | enum libstorage_ns_pi_type piType |所要使用的保护类型 | - | bool pil_start | pi信息位于元数据的first eight bytes(1) or last eight bytes (0) | - | bool ms_extented | 是否要格式化成扩展型 | - | uint8_t ses | 格式化时是否进行安全擦除(当前仅支持设置为0:no-secure earse) | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------------| - | 小于0 | 格式化失败 | - | 大于等于0 | 当前格式化成功的LBA格式 | - -> 使用限制 - -- 该低级格式化接口会清除磁盘namespace的数据和元数据,请谨慎使用。 - -- ES3000盘在格式化时耗时数秒,Intel Optane盘在格式化时需耗时数分钟,在使用该接口时需要等待其执行完成。若强行杀掉格式化进程,会导致格式化失败。 - -- 在格式化执行之前,需要停止数据面的IO操作。如果当前磁盘正在处理IO请求,格式化操作会概率性出现失败,并且在格式化成功的情况下会存在硬盘丢弃正在处理的IO的可能,所以在格式化前,请保证数据面的IO操作已停止。 - -- 格式化过程中会reset控制器,导致之前已经初始化的磁盘资源不可用。因此格式化完成之后,需要重启数据面IO进程。 - -- ES3000 V3支持保护类型0和3,支持PI start和PI end,仅支持mc extended。ES3000 V3的512+8格式支持DIF,4096+64格式不支持。 - -- ES3000 V5支持保护类型0和3,支持PI start和PI end,支持mc extended和mc pointer。ES3000 V5的512+8和4096+64格式均支持DIF。 - -- Optane盘支持保护类型0和1,仅支持PI end,仅支持mc extended。Optane的512+8格式支持DIF,4096+64格式不支持。 - -| **磁盘类型** | **LBA格式** | **磁盘类型** | **LBA格式** | -|----------------------|-----------------|---------------|-------------------| -| Intel Optane P4800 | lbaf0:512+0
lbaf1:512+8
lbaf2:512+16
lbaf3:4096+0
lbaf4:4096+8
lbaf5:4096+64
lbaf6:4096+128 | ES3000 V3、V5 | lbaf0:512+0
lbaf1:512+8
lbaf2:4096+64
lbaf3:4096+0
lbaf4:4096+8 | - -##### LIBSTORAGE_CALLBACK_FUNC - -1. 接口原型 - - ```bash - typedef void (*LIBSTORAGE_CALLBACK_FUNC)(int32_t cb_status, int32_t sct_code, void* cb_arg); - ``` - -2. 接口描述 - - 注册的HSAK io完成回调函数。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|-----------------------------| - | int32_t cb_status | io 状态码,0为成功,负值为系统错误码,正值为硬盘错误码(不同错误码的
含义见[附录](#附录)) | - | int32_t sct_code | io 状态码类型(0:[GENERIC](#generic);
1:[COMMAND_SPECIFIC](#command_specific);
2:[MEDIA_DATA_INTERGRITY_ERROR](#media_data_intergrity_error);<br>
7:VENDOR_SPECIFIC) | - | void* cb_arg | 回调函数的入参 | - -4. 返回值 - - 无。 - -##### libstorage_deallocate_block - -1. 接口原型 - - ```bash - int32_t libstorage_deallocate_block(int32_t fd, struct libstorage_dsm_range_desc *range, uint16_t range_count, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. 接口描述 - - 告知NVMe盘可释放的块。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|-----------------------------| - | int32_t fd | 已打开的硬盘文件描述符 | - | struct libstorage_dsm_range_desc *range | NVMe盘可释放的块描述列表
说明
该参数需要使用libstorage_mem_reserve分配
大页内存,分配内存时需要4K对齐,即align设置为4096。
盘的TRIM的范围根据不同的盘进行约束,超过盘侧的最大TRIM范围
可能触发数据异常。 | - | uint16_t range_count | 数组range的成员数 | - | LIBSTORAGE_CALLBACK_FUNC cb | 回调函数 | - | void* cb_arg | 回调函数参数 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|----------------| - | 小于0 | 请求下发失败 | - | 0 | 请求下发成功 | - -##### libstorage_async_write - -1. 接口原型 - - ```bash - int32_t libstorage_async_write(int32_t fd, void *buf, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. 接口描述 - - HSAK下发异步IO写请求的接口(写缓冲区为连续buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|-----------------------------| - | int32_t fd | 块设备的文件描述符 | - | void *buf | IO写数据的缓冲区(四字节对齐,不能跨4K页面边界)
说明
注:扩展型LBA要包含元数据内存大小。 | - | size_t nbytes | 单次写IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的写偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | void *md_buf | 元数据的缓冲区(仅适用于分离型LBA,扩展型LBA设置为NULL即可) | - | size_t md_len | 元数据的缓冲区长度(仅适用于分离型LBA,扩展型LBA设置为0即可) | - | enum libstorage_crc_and_prchk dif_flag | 是否计算DIF、是否开启盘的校验 | - | LIBSTORAGE_CALLBACK_FUNC cb | 注册的回调函数 | - | void* cb_arg | 回调函数的参数 | - -4. 返回值 - - | **返回值**| **描述** | - |------------|--------------------| - | 0 | IO写请求提交成功 | - | 非0 | IO写请求提交失败 | - -##### libstorage_async_read - -1. 接口原型 - - ```bash - int32_t libstorage_async_read(int32_t fd, void *buf, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. 接口描述 - - HSAK下发异步IO读请求的接口(读缓冲区为连续buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|----------------------------------| - | int32_t fd | 块设备的文件描述符 | - | void *buf | IO读数据的缓冲区(四字节对齐,不能跨4K页面边界)
说明
扩展型LBA要包含元数据内存大小。 | - | size_t nbytes | 单次读IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的读偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | void *md_buf | 元数据的缓冲区(仅适用于分离型LBA,扩展型LBA设置为NULL即可) | - | size_t md_len | 元数据的缓冲区长度(仅适用于分离型LBA,扩展型LBA设置为0即可) | - | enum libstorage_crc_and_prchk dif_flag | 是否计算DIF、是否开启盘的校验 | - | LIBSTORAGE_CALLBACK_FUNC cb | 注册的回调函数 | - | void* cb_arg | 回调函数的参数 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|------------------| - | 0 | IO读请求提交成功 | - | 非0 | IO读请求提交失败 | - -##### libstorage_async_writev - -1. 接口原型 - - ```bash - int32_t libstorage_async_writev(int32_t fd, struct iovec *iov, int iovcnt, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. 接口描述 - - HSAK下发异步IO写请求的接口(写缓冲区为离散buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|----------------------------------| - | int32_t fd | 块设备的文件描述符 | - | struct iovec *iov | IO写数据的缓冲区
说明
扩展型LBA要包含元数据大小。
地址要求四字节对齐,长度不超过4GB。 | - | int iovcnt | IO写数据的缓冲区个数 | - | size_t nbytes | 单次写IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的写偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | void *md_buf | 元数据的缓冲区(仅适用于分离型LBA,扩展型LBA设置为NULL即可) | - | size_t md_len | 元数据的缓冲区长度(仅适用于分离型LBA,扩展型LBA设置为0即可) | - | enum libstorage_crc_and_prchk dif_flag | 是否计算DIF、是否开启盘的校验 | - | LIBSTORAGE_CALLBACK_FUNC cb | 注册的回调函数 | - | void* cb_arg | 回调函数的参数 | - -4. 返回值 - - | **返回值** | **描述** | - |--------------|-------------------| - | 0 | IO写请求提交成功 | - | 非0 | IO写请求提交失败 | - -##### libstorage_async_readv - -1. 接口原型 - - ```bash - int32_t libstorage_async_readv(int32_t fd, struct iovec *iov, int iovcnt, size_t nbytes, off64_t offset, void *md_buf, size_t md_len, enum libstorage_crc_and_prchk dif_flag, LIBSTORAGE_CALLBACK_FUNC cb, void* cb_arg); - ``` - -2. 接口描述 - - HSAK下发异步IO读请求的接口(读缓冲区为离散buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|----------------------------------| - | int32_t fd | 块设备的文件描述符 | - | struct iovec *iov | IO读数据的缓冲区
说明
扩展型LBA要包含元数据大小。
地址要求四字节对齐,长度不超过4GB。 | - | int iovcnt | IO读数据的缓冲区个数 | - | size_t nbytes | 单次读IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的读偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | void *md_buf | 元数据的缓冲区(仅适用于分离型LBA,扩展型LBA设置为NULL即可) | - | size_t md_len | 元数据的缓冲区长度(仅适用于分离型LBA,扩展型LBA设置为0即可) | - | enum libstorage_crc_and_prchk dif_flag | 是否计算DIF、是否开启盘的校验 | - | LIBSTORAGE_CALLBACK_FUNC cb | 注册的回调函数 | - | void* cb_arg | 回调函数的参数 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|----------------------| - | 0 | IO读请求提交成功 | - | 非0 | IO读请求提交失败 | - -##### libstorage_sync_write - -1. 接口原型 - - ```bash - int32_t libstorage_sync_write(int fd, const void *buf, size_t nbytes, off_t offset); - ``` - -2. 接口描述 - - HSAK下发同步IO写请求的接口(写缓冲区为连续buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|----------------------------------| - | int32_t fd | 块设备的文件描述符 | - | void *buf | IO写数据的缓冲区(四字节对齐,不能跨4K页面边界)
说明
扩展型LBA要包含元数据内存大小。 | - | size_t nbytes | 单次写IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的写偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------| - | 0 | IO写请求提交成功 | - | 非0 | IO写请求提交失败 | - -##### libstorage_sync_read - -1. 接口原型 - - ```bash - int32_t libstorage_sync_read(int fd, const void *buf, size_t nbytes, off_t offset); - ``` - -2. 接口描述 - - HSAK下发同步IO读请求的接口(读缓冲区为连续buffer)。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------------------|----------------------------------| - | int32_t fd | 块设备的文件描述符 | - | void *buf | IO读数据的缓冲区(四字节对齐,不能跨4K页面边界)
说明
扩展型LBA要包含元数据内存大小。 | - | size_t nbytes | 单次读IO大小(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - | off64_t offset | LBA的读偏移(单位:字节。sector_size的整数倍)
说明
仅包含数据大小,扩展型LBA也不含元数据大小。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------| - | 0 | IO读请求提交成功 | - | 非0 | IO读请求提交失败 | - -##### libstorage_open - -1. 接口原型 - - ```bash - int32_t libstorage_open(const char* devfullname); - ``` - -2. 接口描述 - - 打开块设备。 - -3. 参数 - - | **参数成员** | **描述** | - |--------------------------|---------------------------------| - | const char* devfullname | 块设备名称(格式为nvme0n1) | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-------------------------------------------------------------------| - | -1 | 打开失败(如设备名不对,或打开的fd数目>NVME盘的可使用通道数目) | - | 大于0 | 块设备的文件描述符 | - - 开启nvme.conf.in中的MultiQ开关以后,同一个线程多次打开同一个设备,会返回不同的fd;否则仍返回同一个fd。该特性只针对NVME设备。 - -##### libstorage_close - -1. 接口原型 - - ```bash - int32_t libstorage_close(int32_t fd); - ``` - -2. 接口描述 - - 关闭块设备。 - -3. 参数 - - | **参数成员** | **描述** | - |--------------|---------------------------| - | int32_t fd |已打开的块设备的文件描述符 | - -4. 返回值 - - | **返回值**| **描述** | - |------------|--------------------------------| - | -1 | 无效文件描述符 | - | -16 | 文件描述符正忙,需要重试 | - | 0 | 关闭成功 | - -##### libstorage_mem_reserve - -1. 接口原型 - - ```bash - void* libstorage_mem_reserve(size_t size, size_t align); - ``` - -2. 接口描述 - - 从DPDK预留的大页内存中分配内存空间。 - -3. 参数 - - | **参数成员**| **描述** | - |---------------|-------------------------------| - | size_t size | 需要分配的内存的大小 | - | size_t align | 所分配的内存空间按照align对齐 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|---------------------------| - | NULL | 分配失败 | - | 非NULL | 所分配内存空间的地址 | - -##### libstorage_mem_free - -1. 接口原型 - - ```bash - void libstorage_mem_free(void* ptr); - ``` - -2. 接口描述 - - 释放ptr指向的内存空间。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------|--------------------------| - | void* ptr |所要释放的内存空间的地址 | - -4. 返回值 - - 无。 - -##### libstorage_alloc_io_buf - -1. 接口原型 - - ```bash - void* libstorage_alloc_io_buf(size_t nbytes); - ``` - -2. 接口描述 - - 从SPDK的buf_small_pool或者buf_large_pool中分配内存。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------|-----------------------------| - | size_t nbytes | 所需要分配的缓冲区大小 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------------------| - | 非NULL | 所分配的缓冲区的首地址 | - -##### libstorage_free_io_buf - -1. 接口原型 - - ```bash - int32_t libstorage_free_io_buf(void *buf, size_t nbytes); - ``` - -2. 接口描述 - - 释放所分配的内存到SPDK的buf_small_pool或者buf_large_pool中。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------|------------------------------| - | void *buf | 所要释放的缓冲区的首地址 | - | size_t nbytes | 所要释放的缓冲区的大小 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------| - | -1 | 释放失败 | - | 0 | 释放成功 | - -##### libstorage_init_module - -1. 接口原型 - - ```bash - int32_t libstorage_init_module(const char* cfgfile); - ``` - -2. 接口描述 - - HSAK模块初始化接口。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------------|---------------------| - | const char* cfgfile | HSAK 配置文件名称 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|---------------| - | 非0 | 初始化失败 | - | 0 | 初始化成功 | - -##### libstorage_exit_module - -1. 接口原型 - - ```bash - int32_t libstorage_exit_module(void); - ``` - -2. 接口描述 - - HSAK模块退出接口。 - -3. 参数 - - 无。 - -4. 返回值 - - | **返回值** | **描述** | - |-------------|---------------| - | 非0 | 退出清理失败 | - | 0 | 退出清理成功 | - -##### LIBSTORAGE_REGISTER_DPDK_INIT_NOTIFY - -1. 接口原型 - - ```bash - LIBSTORAGE_REGISTER_DPDK_INIT_NOTIFY(_name, _notify) - ``` - -2. 接口描述 - - 业务层注册函数,用于注册DPDK初始化完成时的回调函数。 - -3. 
参数 - - | **参数成员** | **描述** | - |----------------|---------------------------------------------------------------------------------------------------| - | _name |业务层模块名称。 | - | _notify |业务层注册的回调函数原型:void (*notifyFunc)(const struct libstorage_dpdk_init_notify_arg*arg); | - -4. 返回值 - - 无。 - -#### ublock.h - -##### init_ublock - -1. 接口原型 - - ```bash - int init_ublock(const char *name, enum ublock_rpc_server_status flg); - ``` - -2. 接口描述 - - 初始化Ublock功能模块,本接口必须在其他所有Ublock接口之前被调用。如果flag被置为UBLOCK_RPC_SERVER_ENABLE,即ublock作为rpc server,则同一个进程只能初始化一次。 - - 在ublock作为rpc server启动时,会同时启动一个server的monitor线程。monitor线程监控到rpc server线程出现异常(如卡死时),会主动调用exit触发进程退出。 - - 此时依赖于产品的脚本再次拉起相关进程。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------------------------|---------------------------------| - | const char *name | 模块名字,缺省值为"ublock",建议该参数可以传NULL。 | - | enum ublock_rpc_server_status
flg | 是否启用RPC的标记值:UBLOCK_RPC_SERVER_DISABLE或UBLOCK_RPC_SERVER_ENABLE;在不启用RPC情况下,如果硬盘被业务进程占用,那么Ublock模块将无法获取该硬盘信息。 | - -4. 返回值 - - | **返回值** | **描述** | - |----------------------------------|---------------------------------| - | 0 | 初始化成功。 | - | -1 | 初始化失败,可能原因:Ublock模块已经被初始化。 | - | 进程exit | Ublock认为在两种情况下属于无法修复异常,直接调用exit接口退出进程:- 需要创建RPC服务,但RPC服务现场创建失败。
- 创建热插拔监控线程,但失败。 | - -##### ublock_init - -1. 接口原型 - - ```bash - # define ublock_init(name) init_ublock(name, UBLOCK_RPC_SERVER_ENABLE) - ``` - -2. 接口描述 - - 本身是对init_ublock接口的宏定义,可理解为将Ublock初始化为需要RPC服务。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------|----------------------------------------------------| - | name | 模块名字,缺省值为"ublock",建议该参数可以传NULL。 | - -4. 返回值 - - | **返回值** | **描述** | - |---------------|----------------------------------------------------| - | 0 | 初始化成功。 | - | -1 | 初始化失败,可能原因:Ublock rpc
server模块已经被初始化。 | - | 进程exit | Ublock认为在两种情况下属于无法修复异常,直接调用exit接口退出进程:- 需要创建RPC服务,但RPC服务现场创建失败。
- 创建热插拔监控线程,但失败。 | - -##### ublock_init_norpc - -1. 接口原型 - - ```bash - # define ublock_init_norpc(name) init_ublock(name, UBLOCK_RPC_SERVER_DISABLE) - ``` - -2. 接口描述 - - 本身是对init_ublock接口的宏定义,可理解为将Ublock初始化为无RPC服务。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------|------------------------------------------------------| - | name | 模块名字,缺省值为"ublock",建议该参数可以传NULL。 | - -4. 返回值 - - | **返回值** | **描述** | - |---------------------------------|-----------------------------| - | 0 | 初始化成功。 | - | -1 | 初始化失败,可能原因:Ublock
client模块已经被初始化。 | - | 进程exit | Ublock认为在两种情况下属于无法修复异常,直接调用exit接口退出进程:- 需要创建RPC服务,但RPC服务现场创建失败。
- 创建热插拔监控线程,但失败。 | - -##### ublock_fini - -1. 接口原型 - - ```cpp - void ublock_fini(void); - ``` - -2. 接口描述 - - 销毁Ublock功能模块,本接口将销毁Ublock模块以及内部创建的资源,本接口同Ublock初始化接口需要配对使用。 - -3. 参数 - - 无。 - -4. 返回值 - - 无。 - -##### ublock_get_bdevs - -1. 接口原型 - - ```bash - int ublock_get_bdevs(struct ublock_bdev_mgr* bdev_list); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取设备列表(环境上所有的NVME设备,包括内核态驱动和用户态驱动),获取的NVMe设备列表中只有PCI地址,不包含具体设备信息,需要获取具体设备信息,请调用接口ublock_get_bdev。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------------|------------------------------------------------------| - | struct ublock_bdev_mgr* bdev_list |出参,返回设备队列,bdev_list指针需要在外部分配。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-----------------------| - | 0 | 获取设备队列成功。 | - | -2 | 环境中没有NVMe设备。 | - | 其余值 | 获取设备队列失败。 | - -##### ublock_free_bdevs - -1. 接口原型 - - ```bash - void ublock_free_bdevs(struct ublock_bdev_mgr* bdev_list); - ``` - -2. 接口描述 - - 业务进程通过调用本接口释放设备列表。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------------------------|--------------------------------------------------------------| - | struct ublock_bdev_mgr* bdev_list |设备队列头指针,设备队列清空后,bdev_list指针本身不会被释放。 | - -4. 返回值 - - 无。 - -##### ublock_get_bdev - -1. 接口原型 - - ```bash - int ublock_get_bdev(const char *pci, struct ublock_bdev *bdev); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取具体某个设备的信息,设备信息中:NVMe设备的序列号、型号、fw版本号信息以字符数组形式保存,不是字符串形式(不同硬盘控制器返回形式不同,不保证数组结尾必定含有"0")。 - - 本接口调用后,对应设备会被Ublock占用,请务必在完成相应业务操作后立即调用ublock_free_bdev释放资源。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------------------|--------------------------------------------------| - | const char *pci | 需要获取信息的设备PCI地址 | - | struct ublock_bdev *bdev | 出参,返回设备信息,bdev指针需要在外部分配。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|------------------------------------------------------------| - | 0 | 获取设备信息成功。 | - | -1 | 获取设备信息失败,如参数错误等。 | - | -11(EAGAIN) | 获取设备信息失败,如rpc查询失败,需要重试(建议sleep 3s)。 | - -##### ublock_get_bdev_by_esn - -1. 接口原型 - - ```bash - int ublock_get_bdev_by_esn(const char *esn, struct ublock_bdev *bdev); - ``` - -2. 接口描述 - - 业务进程通过调用本接口,根据给定的ESN号获取对应设备的信息,设备信息中:NVMe设备的序列号、型号、fw版本号信息以字符数组形式保存,不是字符串形式(不同硬盘控制器返回形式不同,不保证数组结尾必定含有"0")。 - - 本接口调用后,对应设备会被Ublock占用,请务必在完成相应业务操作后立即调用ublock_free_bdev释放资源。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------------------|--------------------------------------------------| - | const char *esn | 需要获取信息的设备ESN号。
说明:ESN号是最大有效长度为20的字符串(不包括字符串结束符),但该长度根据不同硬件厂商可能存在差异,如不足20字符,需要在字符串末尾加
空格补齐。 | - | struct ublock_bdev *bdev | 出参,返回设备信息,bdev指针需要在外部分配。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------------------------------------------------------| - | 0 | 获取设备信息成功。 | - | -1 | 获取设备信息失败,如参数错误等 | - | -11(EAGAIN)| 获取设备信息失败,如rpc查询失败,需要重试(建议sleep 3s)。 | - -##### ublock_free_bdev - -1. 接口原型 - - ```cpp - void ublock_free_bdev(struct ublock_bdev *bdev); - ``` - -2. 接口描述 - - 业务进程通过调用本接口释放设备资源。 - -3. 参数 - - | **参数成员** | **描述** | - |----------------------------|-------------------------------------------------------------| - | struct ublock_bdev *bdev | 设备信息指针,该指针内数据清空后,bdev指针本身不会被释放。 | - -4. 返回值 - - 无。 - -##### TAILQ_FOREACH_SAFE - -1. 接口原型 - - ```bash - # define TAILQ_FOREACH_SAFE(var, head, field, tvar) - for ((var) = TAILQ_FIRST((head)); - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); - (var) = (tvar)) - ``` - -2. 接口描述 - - 提供安全访问队列每个成员的宏定义。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------|----------------------------------------------------------------------------------------------------| - | var | 当前操作的队列节点成员 | - | head | 队列头指针,一般情况下是指通过TAILQ_HEAD(xx, xx) obj定义的obj的地址 | - | field | 队列节点中用于保存队列前后指针的结构体名字,一般情况下是指通过TAILQ_ENTRY(xx)name定义的名字name | - | tvar | 下一个队列节点成员 | - -4. 返回值 - - 无。 - -##### ublock_get_SMART_info - -1. 接口原型 - - ```bash - int ublock_get_SMART_info(const char *pci, uint32_t nsid, struct ublock_SMART_info *smart_info); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取指定设备的SMART信息。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------------------------------|----------------------------| - | const char *pci | 设备PCI地址 | - | uint32_t nsid | 指定的namespace | - | struct ublock_SMART_info *smart_info | 出参,返回设备SMART信息 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|---------------------------------------------------------------| - | 0 | 获取SMART信息成功。 | - | -1 | 获取SMART信息失败,如参数错误等。 | - | -11(EAGAIN)| 获取SMART信息失败,如rpc查询失败,需要重试(建议sleep 3s)。 | - -##### ublock_get_SMART_info_by_esn - -1. 接口原型 - - ```bash - int ublock_get_SMART_info_by_esn(const char *esn, uint32_t nsid, struct ublock_SMART_info *smart_info); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取ESN号对应设备的SMART信息。 - -3. 参数 - - | **参数成员** | **描述** | - |--------------------------|-----------------------------------------------| - | const char *esn | 设备ESN号
说明:ESN号是最大有效长度为20的字符串(不包括字符串结束符),但该长度根据不同硬件厂商可能存在差异,如不足20字符,需要在字符串末尾加空格补齐。 | - | uint32_t nsid | 指定的namespace | - | struct ublock_SMART_info
*smart_info | 出参,返回设备SMART信息 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------------------------------------------------------| - | 0 | 获取SMART信息成功。 | - | -1 | 获取SMART信息失败,如参数错误等。 | - | -11(EAGAIN) | 获取SMART信息失败,如rpc查询失败,需要重试(建议sleep 3s)。 | - -##### ublock_get_error_log_info - -1. 接口原型 - - ```bash - int ublock_get_error_log_info(const char *pci, uint32_t err_entries, struct ublock_nvme_error_info *errlog_info); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取指定设备的Error log信息。 - -3. 参数 - - | **参数成员** | **描述** | - |---------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| - | const char *pci | 设备PCI地址 | - | uint32_t err_entries | 指定希望获取的Error Log条数,最多256条 | - | struct ublock_nvme_error_info *errlog_info | 出参,返回设备Error log信息,errlog_info指针需要调用者申请空间,且确保申请的空间大于或等于err_entries * sizeof (struct ublock_nvme_error_info) | - -4. 返回值 - - | **返回值** | **描述** | - |-------------------------------------|--------------------------------------------------------------| - | 获取到的Error log条数,大于或等于0 | 获取Error log成功。 | - | -1 | 获取Error log失败,如参数错误等。 | - | -11(EAGAIN) | 获取Error log失败,如rpc查询失败,需要重试(建议sleep 3s)。 | - -##### ublock_get_log_page - -1. 接口原型 - - ```bash - int ublock_get_log_page(const char *pci, uint8_t log_page, uint32_t nsid, void *payload, uint32_t payload_size); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取指定设备,指定log page的信息。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------------|-------------------------------------------------------------------------------------------------------------------------| - | const char *pci | 设备PCI地址 | - | uint8_t log_page | 指定希望获取的log page ID,比如0xC0, 0xCA代表ES3000 V5盘自定义的SMART信息 | - | uint32_t nsid | 指定namespace ID,各个log page对按namespace获取支持情况不一致,如果不支持按namespace获取,调用者需要显示传0xFFFFFFFF | - | void *payload | 出参,存储log page信息,由调用者负责申请内存 | - | uint32_t payload_size | 申请的payload大小,不大于4096 Bytes | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|------------------------------------| - | 0 | 获取log page成功 | - | -1 | 获取Error log失败,如参数错误等 | - -##### ublock_info_get_pci_addr - -1. 接口原型 - - ```bash - char *ublock_info_get_pci_addr(const void *info); - ``` - -2. 接口描述 - - 业务进程的回调函数中,通过调用本接口获取热插拔设备的PCI地址。 - - info占用的内存以及返回的PCI地址占用得内存不需要业务进程进行释放。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------|---------------------------------------------| - | const void *info | 热插拔监控线程传递给回调函数的热插拔事件信息 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|--------------------| - | NULL | 获取失败 | - | 非NULL | 获取的PCI地址 | - -##### ublock_info_get_action - -1. 接口原型 - - ```bash - enum ublock_nvme_uevent_action ublock_info_get_action(const void *info); - ``` - -2. 接口描述 - - 业务进程的回调函数中,通过调用本接口获取热插拔事件的类型。 - - info占用的内存不需要业务进程进行释放。 - -3. 参数 - - | **参数成员** | **描述** | - |-------------------|------------------------------------------------| - | const void *info | 热插拔监控线程传递给回调函数的热插拔事件信息 | - -4. 返回值 - - | **返回值** | **描述** | - |----------------|------------------------------------------------------------------------------| - | 热插拔事件类型| 触发回调函数的事件类型,详见结构体enum ublock_nvme_uevent_action的定义。 | - -##### ublock_get_ctrl_iostat - -1. 接口原型 - - ```bash - int ublock_get_ctrl_iostat(const char* pci, struct ublock_ctrl_iostat_info *ctrl_iostat); - ``` - -2. 接口描述 - - 业务进程通过调用本接口获取控制器的IO统计信息。 - -3. 
参数 - - | **参数成员** | **描述** | - |-----------------------------------------------|----------------------------------------------------------| - | const char* pci | 需要获取IO统计信息的控制器的PCI地址。 | - | struct ublock_ctrl_iostat_info *ctrl_iostat |出参,返回IO统计信息,ctrl_iostat指针需要在外部分配。 | - -4. 返回值 - - | **返回值** | **描述** | - |-------------|-------------------------------------------------| - | 0 | 获取IO统计信息成功。 | - | -1 | 获取IO统计信息失败(无效参数、RPC error)。 | - | -2 | 获取IO统计信息失败(NVMe盘没有被IO进程接管)。 | - | -3 | 获取IO统计信息失败(IO统计开关未打开)。 | - -##### ublock_nvme_admin_passthru - -1. 接口原型 - - ```bash - int32_t ublock_nvme_admin_passthru(const char *pci, void *cmd, void *buf, size_t nbytes); - ``` - -2. 接口描述 - - 业务进程通过调用该接口透传nvme admin命令给nvme设备。当前仅支持获取identify字段的nvme admin命令。 - -3. 参数 - - | **参数成员** | **描述** | - |------------------|----------------------------------------------------------------------------------------------------| - | const char *pci | nvme admin命令目的控制器的PCI地址。 | - | void *cmd | nvme admin命令结构体指针,结构体大小为64字节,内容参考nvme spec。当前仅支持获取identify字段命令。 | - | void *buf | 保存nvme admin命令返回内容,其空间由用户分配,大小为nbytes。 | - | size_t nbytes | 用户buf的大小。identify字段为4096字节,获取identify命令的nbytes为4096。 | - -4. 返回值 - - | **返回值**| **描述** | - |------------|--------------------| - | 0 | 用户命令执行成功。 | - | -1 | 用户命令执行失败。 | - -## 附录 - -### GENERIC - -通用类型错误码参考 - -|sc |value| -|---------------------------------------------|---------------| -| NVME_SC_SUCCESS | 0x00 | -| NVME_SC_INVALID_OPCODE | 0x01 | -| NVME_SC_INVALID_FIELD | 0x02 | -| NVME_SC_COMMAND_ID_CONFLICT | 0x03 | -| NVME_SC_DATA_TRANSFER_ERROR | 0x04 | -| NVME_SC_ABORTED_POWER_LOSS | 0x05 | -| NVME_SC_INTERNAL_DEVICE_ERROR | 0x06 | -| NVME_SC_ABORTED_BY_REQUEST | 0x07 | -| NVME_SC_ABORTED_SQ_DELETION | 0x08 | -| NVME_SC_ABORTED_FAILED_FUSED | 0x09 | -| NVME_SC_ABORTED_MISSING_FUSED | 0x0a | -| NVME_SC_INVALID_NAMESPACE_OR_FORMAT | 0x0b | -| NVME_SC_COMMAND_SEQUENCE_ERROR | 0x0c | -| NVME_SC_INVALID_SGL_SEG_DESCRIPTOR | 0x0d | -| NVME_SC_INVALID_NUM_SGL_DESCIRPTORS | 0x0e | -| NVME_SC_DATA_SGL_LENGTH_INVALID | 0x0f | -| NVME_SC_METADATA_SGL_LENGTH_INVALID | 0x10 | -| NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID | 0x11 | -| NVME_SC_INVALID_CONTROLLER_MEM_BUF | 0x12 | -| NVME_SC_INVALID_PRP_OFFSET | 0x13 | -| NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED | 0x14 | -| NVME_SC_OPERATION_DENIED | 0x15 | -| NVME_SC_INVALID_SGL_OFFSET | 0x16 | -| NVME_SC_INVALID_SGL_SUBTYPE | 0x17 | -| NVME_SC_HOSTID_INCONSISTENT_FORMAT | 0x18 | -| NVME_SC_KEEP_ALIVE_EXPIRED | 0x19 | -| NVME_SC_KEEP_ALIVE_INVALID | 0x1a | -| NVME_SC_ABORTED_PREEMPT | 0x1b | -| NVME_SC_SANITIZE_FAILED | 0x1c | -| NVME_SC_SANITIZE_IN_PROGRESS | 0x1d | -| NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID | 0x1e | -| NVME_SC_COMMAND_INVALID_IN_CMB | 0x1f | -| NVME_SC_LBA_OUT_OF_RANGE | 0x80 | -| NVME_SC_CAPACITY_EXCEEDED | 0x81 | -| NVME_SC_NAMESPACE_NOT_READY | 0x82 | -| NVME_SC_RESERVATION_CONFLICT | 0x83 | -| NVME_SC_FORMAT_IN_PROGRESS | 0x84 | - -### COMMAND_SPECIFIC - -特定命令错误码参考 - -|sc |value| -|---------------------------------------------|---------------| -| NVME_SC_COMPLETION_QUEUE_INVALID | 0x00 | -| NVME_SC_INVALID_QUEUE_IDENTIFIER | 0x01 | -| NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED | 0x02 | -| NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED | 0x03 | -| NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED | 0x05 | -| NVME_SC_INVALID_FIRMWARE_SLOT | 0x06 | -| NVME_SC_INVALID_FIRMWARE_IMAGE | 0x07 | -| NVME_SC_INVALID_INTERRUPT_VECTOR | 0x08 | -| NVME_SC_INVALID_LOG_PAGE | 0x09 | -| NVME_SC_INVALID_FORMAT | 0x0a | -| NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET | 0x0b | -| 
NVME_SC_INVALID_QUEUE_DELETION | 0x0c | -| NVME_SC_FEATURE_ID_NOT_SAVEABLE | 0x0d | -| NVME_SC_FEATURE_NOT_CHANGEABLE | 0x0e | -| NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC | 0x0f | -| NVME_SC_FIRMWARE_REQ_NVM_RESET | 0x10 | -| NVME_SC_FIRMWARE_REQ_RESET | 0x11 | -| NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION | 0x12 | -| NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED | 0x13 | -| NVME_SC_OVERLAPPING_RANGE | 0x14 | -| NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY | 0x15 | -| NVME_SC_NAMESPACE_ID_UNAVAILABLE | 0x16 | -| NVME_SC_NAMESPACE_ALREADY_ATTACHED | 0x18 | -| NVME_SC_NAMESPACE_IS_PRIVATE | 0x19 | -| NVME_SC_NAMESPACE_NOT_ATTACHED | 0x1a | -| NVME_SC_THINPROVISIONING_NOT_SUPPORTED | 0x1b | -| NVME_SC_CONTROLLER_LIST_INVALID | 0x1c | -| NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS | 0x1d | -| NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED | 0x1e | -| NVME_SC_INVALID_CTRLR_ID | 0x1f | -| NVME_SC_INVALID_SECONDARY_CTRLR_STATE | 0x20 | -| NVME_SC_INVALID_NUM_CTRLR_RESOURCES | 0x21 | -| NVME_SC_INVALID_RESOURCE_ID | 0x22 | -| NVME_SC_CONFLICTING_ATTRIBUTES | 0x80 | -| NVME_SC_INVALID_PROTECTION_INFO | 0x81 | -| NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE | 0x82 | - -### MEDIA_DATA_INTERGRITY_ERROR - -介质异常错误码参考 - -|sc |value| -|-----------------------------------------|---------------| -| NVME_SC_WRITE_FAULTS | 0x80 | -| NVME_SC_UNRECOVERED_READ_ERROR | 0x81 | -| NVME_SC_GUARD_CHECK_ERROR | 0x82 | -| NVME_SC_APPLICATION_TAG_CHECK_ERROR | 0x83 | -| NVME_SC_REFERENCE_TAG_CHECK_ERROR | 0x84 | -| NVME_SC_COMPARE_FAILURE | 0x85 | -| NVME_SC_ACCESS_DENIED | 0x86 | -| NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK | 0x87 | diff --git a/docs/zh/server/memory_storage/hsak/hsak_developer_guide.md b/docs/zh/server/memory_storage/hsak/hsak_developer_guide.md deleted file mode 100644 index 63d102908ddff3901c5ea3031b88d4db1a027481..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/hsak/hsak_developer_guide.md +++ /dev/null @@ -1,47 +0,0 @@ -# HSAK开发者指南 - -## 介绍 - -随着NVMe SSD、SCM等存储介质性能不断提升,介质层在IO栈中的时延开销不断缩减,软件栈的开销成为瓶颈,急需重构内核IO数据面,减少软件栈的开销,HSAK针对新型存储介质提供高带宽低时延的IO软件栈,相对传统IO软件栈,软件栈开销降低50%以上。 -HSAK用户态IO引擎基于开源的SPDK基础上进行开发: - -1. 对外提供统一的接口,屏蔽开源接口的差异。 -2. 在开源基础上新增IO数据面增强特性,如DIF功能,磁盘格式化,IO批量下发,trim特性,动态增删盘等特性。 -3. 提供磁盘设备管理,磁盘IO监测,维测工具等特性。 - -## 编译教程 - -1. 下载hsak源码 - - $ git clone - -2. 编译和运行依赖 - - hsak的编译和运行依赖于spdk、dpdk、libboundscheck等组件 - -3. 编译 - - $ cd hsak - - $ mkdir build - - $ cd build - - $ cmake .. 
- - $ make - -## 注意事项 - -### 使用约束 - -- 同一台机器最多使用和管理512个NVMe设备。 -- 启用HSAK执行IO相关业务时,需要确保系统有至少500M以上连续的空闲大页内存。 -- 启用用户态IO组件执行相关业务时,需要确保硬盘管理组件(ublock)已经启用。 -- 启用磁盘管理组件(ublock)执行相关业务时,需确保系统有足够的连续空闲内存,每次初始化ublock组件会申请20MB大页内存。 -- 每次运行HSAK之前,产品需要调用setup.sh来配置大页,解绑NVMe设备内核态驱动。 -- 执行libstorage_init_module成功后方可使用HSAK模块提供的其他接口;每个进程仅能执行一次libstorage_init_module调用。 -- 执行libstorage_exit_module函数之后不能再使用HSAK提供的其他接口,再多线程场景特别需要注意,在所有线程结束之后再退出HSAK。 -- HSAK ublock组件在一个服务器上只能启动一个服务,且最大支持64个ublock客户端并发访问,ublock服务端处理客户端请求的处理上限是20次/秒。 -- HSAK ublock组件必须早于数据面IO组件和ublock客户端启动,HSAK提供的命令行工具也必须在ublock服务端启动后才能执行。 -- 不要注册SIGBUS信号处理函数;spdk针对该信号有单独的处理函数;若该函数被覆盖,会导致spdk注册的SIGBUS处理函数失效,产生coredump。 diff --git a/docs/zh/server/memory_storage/hsak/hsak_tool_usage.md b/docs/zh/server/memory_storage/hsak/hsak_tool_usage.md deleted file mode 100644 index 822eb64c4faf4226fecc9c64c7284e52a24f3f4d..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/hsak/hsak_tool_usage.md +++ /dev/null @@ -1,125 +0,0 @@ -# HSAK 工具使用说明 - -## 命令行接口 - -### 盘信息查询命令 - -#### 命令格式 - -```shell -libstorage-list [] [] -``` - -#### 参数说明 - -- commands: 只有“help”可选,“libstorage-list help”用于显示帮助内容。 - -- device: 指定PCI地址,格式如:0000:09:00.0,允许同时多个,中间用空格隔离,如果不设置具体的PCI地址,则命令行列出所有枚举到的设备信息。 - -#### 注意事项 - -- 故障注入功能仅限于开发、调试以及测试场景使用,禁止在用户现网使用,否则会引起业务及安全风险。 - -- 在执行本命令时,管理组件(ublock)服务端必须已经启动,用户态IO组件(uio)未启动或已正确启动均可。 - -- 对于未被ublock组件和用户态IO组件占用的盘,在本命令执行过程中会被占用,此时如果ublock组件或用户态IO组件尝试获取盘控制权,可能存储设备访问冲突,导致失败。 - -### 盘切换驱动命令 - -#### 命令格式 - -```shell -libstorage-shutdown reset [ ...] -``` - -#### 参数说明 - -- reset: 用于对指定盘从uio驱动切换到内核态驱动; - -- device: 指定PCI地址,格式如:0000:09:00.0,允许同时多个,中间用空格隔离。 - -#### 注意事项 - -- libstorage-shutdown reset命令用于将盘从用户态uio驱动切换到内核态nvme驱动。 - -- 在执行本命令时,管理组件(ublock)服务端必须已经启动,用户态IO组件未启动或已正确启动均可。 - -- libstorage-shutdown reset命令为危险动作,请确认在切换nvme设备驱动之前,用户态实例已经停止对nvme设备下发IO,nvme设备上的fd已全部关闭,且访问nvme设备的实例已退出。 - -### 获取IO统计数据命令 - -#### 命令格式 - -```shell -libstorage-iostat [-t ] [-i ] [-d ] -``` - -#### 参数说明 - -- -t: 时间间隔,以秒为单位,最小1秒,最大为3600秒。该参数为int型,如果入参值超过int型上限,将被截断成负数或者正数。 - -- -i: 收集次数,最小为1,最大为MAX_INT次,如果不设置,默认以时间间隔持续收集。该参数为int型,如果入参超过int型上限,将被截断成负数或者正数。 - -- -d:指定块设备名称(eg:nvme0n1,其依赖于/etc/spdk/nvme.conf.in中配置的控制器名称),可以通过本参数收集指定一个或多个设备性能数据,如果不设置本参数,则收集所有识别到的设备性能数据。 - -#### 注意事项 - -- IO统计配置项已使能。 - -- 进程已经通过用户态IO组件对所需要查询性能信息的盘下发IO操作。 - -- 如果当前环境上没有任何设备被业务进程占用下发IO,则该命令将在提示:You cannot get iostat info for nvme device no deliver io后退出。 - -- 在磁盘打开多队列情况下,IO统计工具将该磁盘上多队列的性能数据汇总后统一输出。 - -- IO统计工具最多支持8192个磁盘队列的数据记录。 - -- IO统计数据输出结果如下: - - | Device | r/s | w/s | rKB/s | wKB/s | avgrq-sz | avgqu-sz | r_await | w_await | await | svctm | util% | poll-n | - | ------ | ------- | ------- | ------- | ------- | ------------ | --------- | --------- | --------- | ---------- | ------------ | ----- | ------ | - | 设备名称 | 每秒读IO个数 | 每秒写IO个数 | 每秒读IO字节 | 每秒写IO字节 | 平均下发IO大小(字节) | 磁盘排队的IO深度 | IO读时延(us) | IO写时延(us) | 读写平均时延(us) | 单个IO处理时延(us) | 设备利用率 | 轮询超时次数 | - -## 盘读写命令 - -### 命令格式 - -```shell -libstorage-rw [OPTIONS...] -``` - -### 参数说明 - -1. COMMAND参数 - - - read,从设备读取指定的逻辑块到数据缓存区(默认是标准输出)。 - - - write,将数据缓存区(默认是标准输入)的数据写入到NVMe设备的指定逻辑块。 - - - help,显示该命令行的帮助信息。 - -2. device: 指定PCI地址,格式如:0000:09:00.0。 - -3. 
OPTIONS参数 - - - --start-block,-s:读写逻辑块的64位首地址(缺省值为0)。 - - - --block-count,-c:读写逻辑块的数量(从0开始计数)。 - - - --data-size,-z:读写数据的字节数。 - - - --namespace-id,-n:设备的namespace id(默认id值是1)。 - - - --data,-d:读写用的数据文件(读时保存读出的数据,写时提供写入数据)。 - - - --limited-retry,-l:设备控制器进行有限次数的重启来完成设备读写。 - - - --force-unit-access,-f:确保指令完成之前从非易失介质中完成读写。 - - - --show-command,-v:发送读写命令之前显示指令相关信息。 - - - --dry-run,-w:仅显示读写指令相关信息,不进行实际读写操作。 - - - --latency,-t:统计命令行端到端读写的时延。 - - - --help,-h:显示相关命令的帮助信息。 diff --git a/docs/zh/server/memory_storage/lvm/_toc.yaml b/docs/zh/server/memory_storage/lvm/_toc.yaml deleted file mode 100644 index f2bc64b8e377d43a654a6dd108e6a2b65f43fd62..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/lvm/_toc.yaml +++ /dev/null @@ -1,6 +0,0 @@ -label: 配置和管理逻辑卷 -isManual: true -description: 使用 LVM 管理硬盘 -sections: - - label: 使用 LVM 管理硬盘 - href: ./managing_drives_through_lvm.md diff --git a/docs/zh/server/memory_storage/lvm/managing_drives_through_lvm.md b/docs/zh/server/memory_storage/lvm/managing_drives_through_lvm.md deleted file mode 100644 index 3ee0d2c59a05f74da6e63b32b8b361c3cc1a8589..0000000000000000000000000000000000000000 --- a/docs/zh/server/memory_storage/lvm/managing_drives_through_lvm.md +++ /dev/null @@ -1,575 +0,0 @@ -# 使用LVM管理硬盘 - - - -- [使用LVM管理硬盘](#使用lvm管理硬盘) - - [LVM简介](#lvm简介) - - [基本概念](#基本概念) - - [安装](#安装) - - [管理物理卷](#管理物理卷) - - [创建物理卷](#创建物理卷) - - [查看物理卷](#查看物理卷) - - [修改物理卷属性](#修改物理卷属性) - - [删除物理卷](#删除物理卷) - - [管理卷组](#管理卷组) - - [创建卷组](#创建卷组) - - [查看卷组](#查看卷组) - - [修改卷组属性](#修改卷组属性) - - [扩展卷组](#扩展卷组) - - [收缩卷组](#收缩卷组) - - [删除卷组](#删除卷组) - - [管理逻辑卷](#管理逻辑卷) - - [创建逻辑卷](#创建逻辑卷) - - [查看逻辑卷](#查看逻辑卷) - - [调整逻辑卷大小](#调整逻辑卷大小) - - [扩展逻辑卷](#扩展逻辑卷) - - [收缩逻辑卷](#收缩逻辑卷) - - [删除逻辑卷](#删除逻辑卷) - - [创建并挂载文件系统](#创建并挂载文件系统) - - [创建文件系统](#创建文件系统) - - [手动挂载文件系统](#手动挂载文件系统) - - [自动挂载文件系统](#自动挂载文件系统) - - - -## LVM简介 - -LVM是逻辑卷管理(Logical Volume Manager)的简称,它是Linux环境下对磁盘分区进行管理的一种机制。LVM通过在硬盘和文件系统之间添加一个逻辑层,来为文件系统屏蔽下层硬盘分区布局,提高硬盘分区管理的灵活性。 - -使用LVM管理硬盘的基本过程如下: - -1. 将硬盘创建为物理卷 -2. 将多个物理卷组合成卷组 -3. 在卷组中创建逻辑卷 -4. 在逻辑卷之上创建文件系统 - -通过LVM管理硬盘之后,文件系统不再受限于硬盘的大小,可以分布在多个硬盘上,也可以动态扩容。 - -### 基本概念 - -- 物理存储介质(The physical media):指系统的物理存储设备,如硬盘,系统中为/dev/hda、/dev/sda等等,是存储系统最低层的存储单元。 - -- 物理卷(Physical Volume,PV):指硬盘分区或从逻辑上与磁盘分区具有同样功能的设备\(如RAID\),是LVM的基本存储逻辑块。物理卷包括一个特殊的标签,该标签默认存放在第二个 512 字节扇区,但也可以将标签放在最开始的四个扇区之一。该标签包含物理卷的随机唯一识别符(UUID),记录块设备的大小和LVM元数据在设备中的存储位置。 - -- 卷组(Volume Group,VG):由物理卷组成,屏蔽了底层物理卷细节。可在卷组上创建一个或多个逻辑卷且不用考虑具体的物理卷信息。 - -- 逻辑卷(Logical Volume,LV):卷组不能直接用,需要划分成逻辑卷才能使用。逻辑卷可以格式化成不同的文件系统,挂载后直接使用。 - -- 物理块(Physical Extent,PE):物理卷以大小相等的“块”为单位存储,块的大小与卷组中逻辑卷块的大小相同。 - -- 逻辑块(Logical Extent,LE):逻辑卷以“块”为单位存储,在一卷组中的所有逻辑卷的块大小是相同的。 - -## 安装 - -> [!NOTE]说明 -> openEuler操作系统默认已安装LVM。可通过**rpm -qa | grep lvm2**命令查询,若打印信息中包含“lvm2”信息,则表示已安装LVM,可跳过本章节内容;若无任何打印信息,则表示未安装,可参考本章节内容进行安装。 - -1. 配置本地yum源,详细信息请参考[搭建repo服务器](../../administration/administrator/configuring-the-repo-server.md)。 -2. 清除缓存。 - - ```shell - dnf clean all - ``` - -3. 创建缓存。 - - ```shell - dnf makecache - ``` - -4. 在root权限下安装LVM。 - - ```shell - # dnf install lvm2 - ``` - -5. 查看安装后的rpm包。 - - ```shell - rpm -qa | grep lvm2 - ``` - -## 管理物理卷 - -### 创建物理卷 - -可在root权限下通过pvcreate命令创建物理卷。 - -```shell -pvcreate [option] devname ... 
-``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -f:强制创建物理卷,不需要用户确认。 - - - -u:指定设备的UUID。 - - -y:所有的问题都回答“yes”。 - -- devname:指定要创建的物理卷对应的设备名称,如果需要批量创建,可以填写多个设备名称,中间以空格间隔。 - -示例1:将/dev/sdb、/dev/sdc创建为物理卷。 - -```shell -# pvcreate /dev/sdb /dev/sdc -``` - -示例2:将/dev/sdb1、/dev/sdb2创建为物理卷。 - -```shell -# pvcreate /dev/sdb1 /dev/sdb2 -``` - -### 查看物理卷 - -可在root权限通过pvdisplay命令查看物理卷的信息,包括:物理卷名称、所属的卷组、物理卷大小、PE大小、总PE数、可用PE数、已分配的PE数和UUID。 - -```shell -pvdisplay [option] devname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -s:以短格式输出。 - - -m:显示PE到LE的映射。 - -- devname:指定要查看的物理卷对应的设备名称。如果不指定物理卷名称,则显示所有物理卷的信息。 - -示例:显示物理卷/dev/sdb的基本信息。 - -```shell -# pvdisplay /dev/sdb -``` - -### 修改物理卷属性 - -可在root权限下通过pvchange命令修改物理卷的属性。 - -```shell -pvchange [option] pvname ... -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -u:生成新的UUID。 - - -x:是否允许分配PE。 - -- pvname:指定要修改属性的物理卷对应的设备名称,如果需要批量修改,可以填写多个设备名称,中间以空格间隔。 - -示例:禁止分配/dev/sdb物理卷上的PE。没有加入卷组的物理卷执行pvdisplay命令显示Allocatable属性为NO,需要加入卷组才能成功修改该属性。 - -```shell -# pvchange -x n /dev/sdb -``` - -### 删除物理卷 - -可在root权限下通过pvremove命令删除物理卷。 - -```shell -pvremove [option] pvname ... -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -f:强制删除物理卷,不需要用户确认。 - - -y:所有的问题都回答“yes”。 - -- pvname:指定要删除的物理卷对应的设备名称,如果需要批量删除,可以填写多个设备名称,中间以空格间隔。 - -示例:删除物理卷/dev/sdb。如果物理卷已经加入卷组,需要先删除卷组或者从卷组中移除,再删除物理卷。 - -```shell -# pvremove /dev/sdb -``` - -## 管理卷组 - -### 创建卷组 - -可在root权限下通过vgcreate命令创建卷组。 - -```shell -vgcreate [option] vgname pvname ... -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -l:卷组上允许创建的最大逻辑卷数。 - - -p:卷组中允许添加的最大物理卷数。 - - -s:卷组上的物理卷的PE大小。 - -- vgname:要创建的卷组名称。 -- pvname:要加入到卷组中的物理卷名称。 - -示例:创建卷组 vg1,并且将物理卷/dev/sdb和/dev/sdc添加到卷组中。 - -```shell -# vgcreate vg1 /dev/sdb /dev/sdc -``` - -### 查看卷组 - -可在root权限下通过vgdisplay命令查看卷组的信息。 - -```shell -vgdisplay [option] [vgname] -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -s:以短格式输出。 - - -A:仅显示活动卷组的属性。 - -- vgname:指定要查看的卷组名称。如果不指定卷组名称,则显示所有卷组的信息。 - -示例:显示卷组vg1的基本信息。 - -```shell -# vgdisplay vg1 -``` - -### 修改卷组属性 - -可在root权限下通过vgchange命令修改卷组的属性。 - -```shell -vgchange [option] vgname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -a:设置卷组的活动状态。 - -- vgname:指定要修改属性的卷组名称。 - -示例:将卷组vg1状态修改为活动。 - -```shell -# vgchange -ay vg1 -``` - -### 扩展卷组 - -可在root权限下通过vgextend命令动态扩展卷组。它通过向卷组中添加物理卷来增加卷组的容量。 - -```shell -vgextend [option] vgname pvname ... -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -d:调试模式。 - - -t:仅测试。 - -- vgname:要扩展容量的卷组名称。 -- pvname:要加入到卷组中的物理卷名称。 - -示例:向卷组vg1中添加物理卷/dev/sdb。 - -```shell -# vgextend vg1 /dev/sdb -``` - -### 收缩卷组 - -可在root权限下通过vgreduce命令删除卷组中的物理卷来减少卷组容量。不能删除卷组中剩余的最后一个物理卷。 - -```shell -vgreduce [option] vgname pvname ... 
-``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -a:如果命令行中没有指定要删除的物理卷,则删除所有的空物理卷。 - - \-\-removemissing:删除卷组中丢失的物理卷,使卷组恢复正常状态。 - -- vgname:要收缩容量的卷组名称。 -- pvname:要从卷组中删除的物理卷名称。 - -示例:从卷组vg1中移除物理卷/dev/sdb2。 - -```shell -# vgreduce vg1 /dev/sdb2 -``` - -### 删除卷组 - -可在root权限下通过vgremove命令删除卷组。 - -```shell -vgremove [option] vgname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -f:强制删除卷组,不需要用户确认。 - -- vgname:指定要删除的卷组名称。 - -示例:删除卷组vg1。 - -```shell -# vgremove vg1 -``` - -## 管理逻辑卷 - -### 创建逻辑卷 - -可在root权限下通过lvcreate命令创建逻辑卷。 - -```shell -lvcreate [option] vgname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -L:指定逻辑卷的大小,单位为“kKmMgGtT”字节。 - - -l:指定逻辑卷的大小(LE数)。 - - -n:指定要创建的逻辑卷名称。 - - -s:创建快照。 - -- vgname:要创建逻辑卷的卷组名称。 - -示例1:在卷组vg1中创建10G大小的逻辑卷。 - -```shell -# lvcreate -L 10G vg1 -``` - -示例2:在卷组vg1中创建200M的逻辑卷,并命名为lv1。 - -```shell -# lvcreate -L 200M -n lv1 vg1 -``` - -### 查看逻辑卷 - -可在root权限下通过lvdisplay命令查看逻辑卷的信息,包括逻辑卷空间大小、读写状态和快照信息等属性。 - -```shell -lvdisplay [option] [lvname] -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - - -v:显示LE到PE的映射。 - -- lvname:指定要显示属性的逻辑卷对应的设备文件。如果省略,则显示所有的逻辑卷属性。 - - > [!NOTE]说明 - > 逻辑卷对应的设备文件保存在卷组目录下,例如:在卷组vg1上创建一个逻辑卷lv1,则此逻辑卷对应的设备文件为/dev/vg1/lv1。 - -示例:显示逻辑卷lv1的基本信息。 - -```shell -# lvdisplay /dev/vg1/lv1 -``` - -### 调整逻辑卷大小 - -可在root权限下通过lvresize命令调整LVM逻辑卷的空间大小,可以增大空间和缩小空间。使用lvresize命令调整逻辑卷空间大小和缩小空间时需要谨慎,因为有可能导致数据丢失。 - -```shell -lvresize [option] vgname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -L:指定逻辑卷的大小,单位为“kKmMgGtT”字节。 - - -l:指定逻辑卷的大小(LE数)。 - - -f:强制调整逻辑卷大小,不需要用户确认。 - -- lvname:指定要调整的逻辑卷名称。 - -示例1:为逻辑卷/dev/vg1/lv1增加200M空间。 - -```shell -# lvresize -L +200 /dev/vg1/lv1 -``` - -示例2:为逻辑卷/dev/vg1/lv1减少200M空间。 - -```shell -# lvresize -L -200 /dev/vg1/lv1 -``` - -### 扩展逻辑卷 - -可在root权限下通过lvextend命令动态在线扩展逻辑卷的空间大小,而不中断应用程序对逻辑卷的访问。 - -```shell -lvextend [option] lvname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -L:指定逻辑卷的大小,单位为“kKmMgGtT”字节。 - - -l:指定逻辑卷的大小(LE数)。 - - -f:强制调整逻辑卷大小,不需要用户确认。 - -- lvname:指定要扩展空间的逻辑卷的设备文件。 - -示例:为逻辑卷/dev/vg1/lv1增加100M空间。 - -```shell -# lvextend -L +100M /dev/vg1/lv1 -``` - -### 收缩逻辑卷 - -可在root权限下通过lvreduce命令减少逻辑卷占用的空间大小。使用lvreduce命令收缩逻辑卷的空间大小有可能会删除逻辑卷上已有的数据,所以在操作前必须进行确认。 - -```shell -lvreduce [option] lvname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -L:指定逻辑卷的大小,单位为“kKmMgGtT”字节。 - - -l:指定逻辑卷的大小(LE数)。 - - -f:强制调整逻辑卷大小,不需要用户确认。 - -- lvname:指定要扩展空间的逻辑卷的设备文件。 - -示例:将逻辑卷/dev/vg1/lv1的空间减少100M。 - -```shell -# lvreduce -L -100M /dev/vg1/lv1 -``` - -### 删除逻辑卷 - -可在root权限下通过lvremove命令删除逻辑卷。如果逻辑卷已经使用mount命令加载,则不能使用lvremove命令删除。必须使用umount命令卸载后,逻辑卷方可被删除。 - -```shell -lvremove [option] vgname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -f:强制删除逻辑卷,不需要用户确认。 - -- vgname:指定要删除的逻辑卷。 - -示例:删除逻辑卷/dev/vg1/lv1。 - -```shell -# lvremove /dev/vg1/lv1 -``` - -## 创建并挂载文件系统 - -在创建完逻辑卷之后,需要在逻辑卷之上创建文件系统并挂载文件系统到相应目录下。 - -### 创建文件系统 - -可在root权限下通过mkfs命令创建文件系统。 - -```shell -mkfs [option] lvname -``` - -其中: - -- option:命令参数选项。常用的参数选项有: - - -t:指定创建的linux文件系统类型,如ext2,ext3,ext4等等,默认类型为ext2。 - -- lvname:指定要创建的文件系统对应的逻辑卷设备文件名。 - -示例:在逻辑卷/dev/vg1/lv1上创建ext4文件系统。 - -```shell -# mkfs -t ext4 /dev/vg1/lv1 -``` - -### 手动挂载文件系统 - -手动挂载的文件系统仅在当时有效,一旦操作系统重启则会不存在。 - -可在root权限下通过mount命令挂载文件系统。 - -```shell -mount lvname mntpath -``` - -其中: - -- lvname:指定要挂载文件系统的逻辑卷设备文件名。 -- mntpath:挂载路径。 - -示例:将逻辑卷/dev/vg1/lv1挂载到/mnt/data目录。 - -```shell -# mount /dev/vg1/lv1 /mnt/data -``` - -### 自动挂载文件系统 - -手动挂载的文件系统在操作系统重启之后会不存在,需要重新手动挂载文件系统。但若在手动挂载文件系统后在root权限下进行如下设置,可以实现操作系统重启后自动挂载文件系统。 - -1. 
执行blkid命令查询逻辑卷的UUID,逻辑卷以/dev/vg1/lv1为例。 - - ```shell - # blkid /dev/vg1/lv1 - ``` - - 查看打印信息,打印信息中包含如下内容,其中 _uuidnumber_ 是一串数字,为UUID, _fstype_ 为文件系统。 - - /dev/vg1/lv1: UUID=" _uuidnumber_ " TYPE=" _fstype_ " - -2. 执行**vi /etc/fstab**命令编辑fstab文件,并在最后加上如下内容。 - - ```shell - UUID=uuidnumber mntpath fstype defaults 0 0 - ``` - - 内容说明如下: - - - 第一列:UUID,此处填写[1](#li65701520154311)查询的 _uuidnumber_ 。 - - 第二列:文件系统的挂载目录 _mntpath_ 。 - - 第三列:文件系统的文件格式,此处填写[1](#li65701520154311)查询的 _fstype_ 。 - - 第四列:挂载选项,此处以“defaults”为例; - - 第五列:备份选项,设置为“1”时,系统自动对该文件系统进行备份;设置为“0”时,不进行备份。此处以“0”为例; - - 第六列:扫描选项,设置为“1”时,系统在启动时自动对该文件系统进行扫描;设置为“0”时,不进行扫描。此处以“0”为例。 - -3. 验证自动挂载功能。 - 1. 执行umount命令卸载文件系统,逻辑卷以/dev/vg1/lv1为例。 - - ```shell - # umount /dev/vg1/lv1 - ``` - - 2. 执行如下命令,将/etc/fstab文件所有内容重新加载。 - - ```shell - # mount -a - ``` - - 3. 执行如下命令,查询文件系统挂载信息,挂载目录以/mnt/data为例。 - - ```shell - # mount | grep /mnt/data - ``` - - 查看打印信息,若信息中包含如下信息表示自动挂载功能生效。 - - /dev/vg1/lv1 on /mnt/data diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-caution.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-caution.gif deleted file mode 100644 index 6e90d7cfc2193e39e10bb58c38d01a23f045d571..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-caution.gif and /dev/null differ diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-danger.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-danger.gif deleted file mode 100644 index 6e90d7cfc2193e39e10bb58c38d01a23f045d571..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-danger.gif and /dev/null differ diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-note.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-note.gif and /dev/null differ diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-notice.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-notice.gif deleted file mode 100644 index 86024f61b691400bea99e5b1f506d9d9aef36e27..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-notice.gif and /dev/null differ diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-tip.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-tip.gif deleted file mode 100644 index 93aa72053b510e456b149f36a0972703ea9999b7..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-tip.gif and /dev/null differ diff --git a/docs/zh/server/memory_storage/public_sys-resources/icon-warning.gif b/docs/zh/server/memory_storage/public_sys-resources/icon-warning.gif deleted file mode 100644 index 6e90d7cfc2193e39e10bb58c38d01a23f045d571..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/memory_storage/public_sys-resources/icon-warning.gif and /dev/null differ diff --git a/docs/zh/server/performance/cpu_optimization/sysboost/_toc.yaml b/docs/zh/server/performance/cpu_optimization/sysboost/_toc.yaml deleted file mode 100644 index c870a250939f14ecd768747b75e0b55eb43d3356..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/cpu_optimization/sysboost/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: sysBoost用户指南 -isManual: true -description: 
优化代码与运行环境的 CPU 微架构的适应性,提升程序性能 -sections: - - label: 认识sysBoost - href: ./getting_to_know_sysBoost.md - - label: 安装与部署 - href: ./installation_and_deployment.md - - label: 使用方法 - href: ./usage_instructions.md diff --git a/docs/zh/server/performance/cpu_optimization/sysboost/figures/icon-note.gif b/docs/zh/server/performance/cpu_optimization/sysboost/figures/icon-note.gif deleted file mode 100644 index 6314297e45c1de184204098efd4814d6dc8b1cda..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/performance/cpu_optimization/sysboost/figures/icon-note.gif and /dev/null differ diff --git "a/docs/zh/server/performance/cpu_optimization/sysboost/figures/\346\236\266\346\236\204.png" "b/docs/zh/server/performance/cpu_optimization/sysboost/figures/\346\236\266\346\236\204.png" deleted file mode 100644 index 92611802616844553a7c6ad79c12c0ed29d369ee..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/performance/cpu_optimization/sysboost/figures/\346\236\266\346\236\204.png" and /dev/null differ diff --git a/docs/zh/server/performance/cpu_optimization/sysboost/getting_to_know_sysBoost.md b/docs/zh/server/performance/cpu_optimization/sysboost/getting_to_know_sysBoost.md deleted file mode 100644 index 81fff10694a7c12c96533d95d4b4ab98094acede..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/cpu_optimization/sysboost/getting_to_know_sysBoost.md +++ /dev/null @@ -1,61 +0,0 @@ -# 认识sysBoost - -## 概述 - -通过代码重排技术对可执行文件和动态库文件在线重排操作,优化代码与运行环境的CPU微架构的适应性,提升程序性能。 - -## 问题背景 - -- 大型APP应用,使用大量的第3方或自研动态库,函数调用产生大量PLT跳转导致IPC指令执行效率下降。 - -- 汇编代码体积大内存占用大,导致iTLB miss概率高。热点代码段布局离散,导致iCache miss高,影响CPU流水线执行效率。 - -- 应用开发者对操作系统与CPU微架构不熟悉,IPC性能调优成本过大。 - -## 设计方案 - -### 关键技术 - -- 动态库拼接:通过ld加载阶段将分散的动态库的代码段数据段拼接聚合,然后使用大页内存提升iTLB命中率。 - -- 消除PLT跳转:应用代码调用动态库函数的流程,需要先跳转PLT表,然后跳转真实函数,消除PLT跳转能提升IPC。 - -- 热点Section在线重排:默认情况下代码段是按动态库粒度排布的,通过在线重排技术可以实现热点代码按Section粒度重排。 - -- exec原生大页:用户态大页机制需要应用修改配置和重编译,exec原生大页机制直接在内核加载ELF文件阶段使用大页内存,对APP透明。 - -### 架构 - -**图 1** sysBoost设计总体方案 - -![](./figures/架构.png) - -## sysBoost支持的功能特性 - -- 支持全静态合并场景:将应用与其依赖的动态库合并为一个二进制,并进行段级别的重排,将多个离散的代码段/数据段合并为一个,提升应用性能。 - -- 自动对系统中的二进制进行优化:sysBoost守护进程读取配置文件获取需要优化的二进制以及对应的优化方式,按照用户的要求进行优化,并将优化好的二进制存储在.rto后缀的文件中。 - -- 二进制代码段/数据段大页预加载:用户态页表映射物理内存时,使用大页(2M)映射可以提升性能,而当前openeuler不支持文件页的大页映射。sysBoost提供大页预加载的功能,在二进制优化完成后立即将其内容以大页形式加载到内核中,在应用启动时将预加载的内容批量映射到用户态页表,减少应用的缺页中断和访存延迟,提升启动速度和运行效率。 - -- 二进制异常监控:如果sysBoost生成的.rto二进制出现BUG,应用可能会crash。为了避免应用被反复拉起,反复crash等严重后果,防止故障扩散,sysBoost会对加载.rto二进制的进程进行监控。如果发现这样的进程发生了crash,sysBoost会回退优化,将该.rto文件和原应用文件的标记删除;同时也会将配置文件重命名,防止下次sysBoost服务重启后再次进行优化。 - -## 价值概述 - -### 场景一 - -在UnixBench的Bash测试中,通常会执行一些常见的命令和脚本,例如 ls、grep、awk 等。这些命令和脚本通常会调用一些系统库,例如 libc、libpthread 等,这些库文件通常需要动态链接。由于动态链接会增加程序的启动时间和延迟,因此采用二进制合并技术将这些库文件合并到可执行文件中,可以显著提高Bash的性能,从而提高UnixBench的得分。 - -### 场景二 - -云核等产品组件动态可装配设计, 使用大量动态库,带来了以下问题: - -- 动态库机制引入函数间接跳转和代码布局离散问题, 导致CPU执行效率降低。 -- 动态库大量的符号解析过程, 影响程序启动速度。 -- 基于特定业务模型的预先离线编译优化(Profile-Guided Optimization), 无法适应不同业务模型变化。 - -在业务进程现网部署阶段, 通过sysBoost生成大进程可有效解决上述问题: - -- 通过自研exec大页机制加载大进程, 使代码段和数据段利用大页内存, 降低TLB miss。 -- 大进程包含所有动态库代码和应用代码,消除函数间接跳转问题。 -- 智能识别业务, 选择合适的热点模型, 重新生成大进程,在线适应业务变化。 diff --git a/docs/zh/server/performance/cpu_optimization/sysboost/installation_and_deployment.md b/docs/zh/server/performance/cpu_optimization/sysboost/installation_and_deployment.md deleted file mode 100644 index 6ffccd4c2794821ae6a2d36f5d2d10b5cda52745..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/cpu_optimization/sysboost/installation_and_deployment.md 
+++ /dev/null @@ -1,66 +0,0 @@ -# 安装与部署 - -## 软硬件要求 - -- 硬件:鲲鹏920处理器 - -- 软件:操作系统openEuler 23.09 - -## 环境准备 - -- 安装openEuler系统。 - -- 安装sysBoost需要使用root权限。 - -## 安装sysBoost - -安装sysBoost的操作步骤如下(xxx在以下描述中代表版本号): - -1. 挂载openEuler的iso文件 - - ```sh - # 使用对应的openEuler版本 - mount openEuler-xxx-aarch64-dvd.iso /mnt - ``` - -2. 配置本地yum源 - - ```sh - vim /etc/yum.repos.d/local.repo - ``` - - 配置内容如下所示: - - ```sh - [localosrepo] - name=localosrepo - baseurl=file:///mnt - enabled=1 - gpgcheck=1 - gpgkey=file:///mnt/RPM-GPG-KEY-openEuler - ``` - -3. 安装sysBoost - - ```sh - yum install sysboost -y - ``` - -4. 验证是否安装成功,命令和回显如下表示安装成功 - - ```sh - rpm -qa | grep sysboost - # sysboost-xxx - rpm -qa | grep native-turbo - # native-turbo-xxx - ``` - -5. 安装需要合并的ELF文件所对应的relocation包 - - ```sh - yum install bash-relocation-xxx -y - yum install ncurses-relocation-xxx -y - ``` - -> [!NOTE]说明 -> 若当前所需要的可执行ELF文件及其依赖库中已经包含relocation段,则可以跳过步骤5。 diff --git a/docs/zh/server/performance/cpu_optimization/sysboost/usage_instructions.md b/docs/zh/server/performance/cpu_optimization/sysboost/usage_instructions.md deleted file mode 100644 index fbef7a804eebc3caa45263c663c2ae0bb593be32..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/cpu_optimization/sysboost/usage_instructions.md +++ /dev/null @@ -1,94 +0,0 @@ -# 使用方法 - -## 总体说明 - -- 使用和配置sysBoost需要使用root权限。 -- sysBoost不支持多实例运行。 -- 请管理员确保配置文件的正确性。 - -## 配置方法 - -### 配置文件说明 - -配置文件目录:/etc/sysboost.d/ - -**表 1** 客户端yaml文件配置说明 - - - - - - - - - - - - - - - - - - - - - - - - - -
| 配置名称 | 配置说明 | 参数类型 | 取值范围 |
| -------- | -------- | -------- | -------- |
| elf_path | 需要合并的可执行ELF文件的名称 | 字符串 | sysBoost支持的可执行ELF文件路径名称 |
| mode | sysBoost的运行模式 | 字符串 | "static-nolibc" |
| libs | elf_path所指定的可执行ELF文件的依赖库,sysBoost可以自动探测依赖库,故为可选项 | 字符串 | sysBoost支持的可执行ELF文件的依赖库的路径名称 |

- -### 配置示例 - -sysBoost的toml配置文件示例: - -```sh -# /etc/sysboost.d/bash.toml -elf_path = "/usr/bin/bash" -mode = "static-nolibc" -libs = ["/usr/lib64/libtinfo.so.6"] -``` - -## 操作方法 - -- 启动sysBoost服务 - - ```sh - systemctl start sysboost.service - ``` - -- 关闭sysBoost服务 - - ```sh - systemctl stop sysboost.service - ``` - -- 状态查询(若没有标红字体,说明sysBoost运行正常) - - ```sh - systemctl status sysboost.service - ``` - -- 日志(若sysBoost出现错误,可通过系统日志查询相关信息) - - ```sh - cat /var/log/messages - ```
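
以下给出一个端到端的操作示意,串联上文的配置与命令(仅为示例草稿:假设待优化二进制为 /usr/bin/bash、配置文件为上文的 /etc/sysboost.d/bash.toml,优化结果以“原文件名.rto”形式保存,日志过滤关键字亦为假设,实际请以环境和版本为准):

```sh
# 确认配置内容(与上文“配置示例”一致)
cat /etc/sysboost.d/bash.toml

# 启动 sysBoost 服务并查看运行状态
systemctl start sysboost.service
systemctl status sysboost.service

# 在系统日志中过滤 sysBoost 相关信息(关键字过滤方式为假设)
grep -i sysboost /var/log/messages

# 检查是否生成 .rto 优化文件(文件命名形式为假设,以实际为准)
ls -l /usr/bin/bash.rto
```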