From 80e6e6ca53d1a992995baa8c4e2bdbba30c06722 Mon Sep 17 00:00:00 2001 From: yutt Date: Tue, 4 Jun 2024 14:55:07 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=BA=A7=E5=93=81?= =?UTF-8?q?=E7=B1=BB=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../CVE\346\274\217\346\264\236.md" | 15 +++++++++++++++ ...05\263\351\224\256\347\211\271\346\200\247.md" | 13 +++++++++++++ ...17\202\344\270\216\350\264\241\347\214\256.md" | 10 ++++++++++ ...77\253\351\200\237\345\205\245\351\227\250.md" | 7 +++++++ .../\346\272\220\344\273\243\347\240\201.md" | 8 ++++++++ ...63\273\347\273\237\345\256\211\350\243\205.md" | 7 +++++++ .../\350\207\264\350\260\242.md" | 8 ++++++++ 7 files changed, 68 insertions(+) create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/CVE\346\274\217\346\264\236.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\205\263\351\224\256\347\211\271\346\200\247.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\217\202\344\270\216\350\264\241\347\214\256.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\277\253\351\200\237\345\205\245\351\227\250.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\346\272\220\344\273\243\347\240\201.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\347\263\273\347\273\237\345\256\211\350\243\205.md" create mode 100644 "PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\350\207\264\350\260\242.md" diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/CVE\346\274\217\346\264\236.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/CVE\346\274\217\346\264\236.md" new file 
mode 100644 index 0000000..a4a1b26 --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/CVE\346\274\217\346\264\236.md" @@ -0,0 +1,15 @@ +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 +漏洞漏洞漏洞 \ No newline at end of file diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\205\263\351\224\256\347\211\271\346\200\247.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\205\263\351\224\256\347\211\271\346\200\247.md" new file mode 100644 index 0000000..cb284b4 --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\205\263\351\224\256\347\211\271\346\200\247.md" @@ -0,0 +1,13 @@ +关键特性 +关键特性 +关键特性 +关键特性 +关键特性 +关键特性 +关键特性 +关键特性关键特性 +关键特性 +关键特性 +关键特性 +关键特性关键特性 + diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\217\202\344\270\216\350\264\241\347\214\256.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\217\202\344\270\216\350\264\241\347\214\256.md" new file mode 100644 index 0000000..79495c7 --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\217\202\344\270\216\350\264\241\347\214\256.md" @@ -0,0 +1,10 @@ +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 +参与贡献 \ No newline at end of file diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\277\253\351\200\237\345\205\245\351\227\250.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\277\253\351\200\237\345\205\245\351\227\250.md" new file mode 100644 index 0000000..c7e13ae --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\345\277\253\351\200\237\345\205\245\351\227\250.md" @@ -0,0 +1,7 @@ +快速入门 +快速入门 +快速入门 +快速入门 +快速入门 
+快速入门快速入门快速入门快速入门快速入门快速入门 +快速入门。 \ No newline at end of file diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\346\272\220\344\273\243\347\240\201.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\346\272\220\344\273\243\347\240\201.md" new file mode 100644 index 0000000..a472078 --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\346\272\220\344\273\243\347\240\201.md" @@ -0,0 +1,8 @@ +源代码源代码源代码源代码源代码源代码源代码源代码源代码 +源代码源代码源代码 +源代码源代码源代码 +源代码源代码 +源代码源代码 +源代码源代码 +源代码 +源代码 \ No newline at end of file diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\347\263\273\347\273\237\345\256\211\350\243\205.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\347\263\273\347\273\237\345\256\211\350\243\205.md" new file mode 100644 index 0000000..2c15c6e --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\347\263\273\347\273\237\345\256\211\350\243\205.md" @@ -0,0 +1,7 @@ +系统安装系统安装系统安装系统安装系统安装系统安装 +系统安装系统安装系统安装系统安装系统安装 +系统安装系统安装系统安装 +系统安装系统安装 +系统安装系统安装 +系统安装系统安装 +系统安装系统安装系统安装; \ No newline at end of file diff --git "a/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\350\207\264\350\260\242.md" "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\350\207\264\350\260\242.md" new file mode 100644 index 0000000..16080fe --- /dev/null +++ "b/PRODUCT_DOCS/anolisos/8.8/\345\217\221\350\241\214\350\257\264\346\230\216/\350\207\264\350\260\242.md" @@ -0,0 +1,8 @@ +致谢致谢致谢致谢致谢致谢 +致谢致谢致谢致谢致谢致谢 +致谢 +致谢 +致谢 +致谢 +致谢 +致谢 -- Gitee From fcd8ead3e7ec9ed6427f3ffe0b8158b456882fd3 Mon Sep 17 00:00:00 2001 From: yutt Date: Tue, 11 Jun 2024 18:24:56 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E9=BE=99=E8=9C=A5=E5=AE=9E=E9=AA=8C?= =?UTF-8?q?=E5=AE=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- ...50\346\204\217\344\272\213\351\241\271.md" | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 "INFRA_DOCS/\351\276\231\350\234\245\345\256\236\351\252\214\345\256\244/\346\263\250\346\204\217\344\272\213\351\241\271.md" diff --git "a/INFRA_DOCS/\351\276\231\350\234\245\345\256\236\351\252\214\345\256\244/\346\263\250\346\204\217\344\272\213\351\241\271.md" "b/INFRA_DOCS/\351\276\231\350\234\245\345\256\236\351\252\214\345\256\244/\346\263\250\346\204\217\344\272\213\351\241\271.md" new file mode 100644 index 0000000..2f715d4 --- /dev/null +++ "b/INFRA_DOCS/\351\276\231\350\234\245\345\256\236\351\252\214\345\256\244/\346\263\250\346\204\217\344\272\213\351\241\271.md" @@ -0,0 +1,62 @@ +您可以使用toolkit-maven-plugin插件升级已经部署在EDAS的Spring Cloud、Dubbo和HSF应用。 + +前提条件 +已在EDAS创建Spring Cloud、Dubbo和HSF应用。 + +背景信息 +toolkit-maven-plugin是一个开源工具,帮助您构建一个应用完整的生命周期框架,完成项目的基础工具建设。更多信息,请参见toolkit-maven-plugin概述。 + +toolkit-maven-plugin也支持升级EDAS K8s集群中的应用。具体操作,请参见使用Cloud Toolkit插件单批发布应用(K8s)。 + +操作步骤 +在本地应用工程的pom.xml文件中,添加toolkit-maven-plugin插件的依赖。 + + + + + + com.alibaba.cloud + toolkit-maven-plugin + 1.1.5 + + + +说明 +version的值建议设置为1.1.5,最新版本可能存在不适用情况。 + +在工程的根目录下,创建配置文件.edas_config.yaml,在文件中配置部署参数。 + +以.开头的文件为IDE的默认配置文件。在未指定其它配置文件时,插件会默认使用根目录下的.edas_config.yaml文件。 + +如果创建了非.开头的配置文件,也可以在打包工程时,通过设置参数-Dedas_config=xxx来指定该配置文件。 + +如果存在默认配置文件,也通过参数指定了其它配置文件,那么插件会使用参数指定的配置文件。 + +说明 +如果当前工程为一个Maven工程的子模块,那么默认配置文件应该存放在当前工程目录下,而不是整个Maven工程的根目录下。关于多模块工程部署更多方式,请参见部署多模块工程。 + +典型升级场景的配置示例如下: + +升级应用所有分组。 + +例如,在华北2(北京)地域有一个ID为eb20****-e6ee-4f6d-a36f-5f6a5455****的应用,需要升级该应用的所有分组。配置示例如下: + + +env: + region_id: cn-beijing +app: + app_id: eb20****-e6ee-4f6d-a36f-5f6a5455**** +配置参数的值请以应用相关的实际信息为准。 + +升级应用指定分组并标识部署包版本。 + +例如,在华北2(北京)地域有一个ID为eb20dc8a-e6ee-4f6d-a36f-5f6a545****的应用,需要升级的应用分组ID为06923bb9-8c5f-4508-94d8-517b692f****,部署包版本为1.2。配置示例如下: + + +env: + region_id: cn-beijing +app: + app_id: eb20dc8a-e6ee-4f6d-a36f-5f6a5455**** + 
package_version: 1.2 + group_id: 06923bb9-8c5f-4508-94d8-517b692f**** +实际使用时,可能会有更复杂的场景,需要配置更多参数,请参见部署应用配置参数说明。 \ No newline at end of file -- Gitee From 6885ce16cfc978fd05da26b46bfde23d38791289 Mon Sep 17 00:00:00 2001 From: yutt Date: Mon, 17 Jun 2024 16:15:35 +0800 Subject: [PATCH 3/3] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Anolis OS Cloud Kernel RAS White Paper.md | 982 ++++++++++++++++ ...50\346\210\267\346\226\207\346\241\243.md" | 242 ++++ ...55\345\273\272\346\225\231\347\250\213.md" | 268 +++++ ...13\350\257\225\346\212\245\345\221\212.md" | 1001 +++++++++++++++++ 4 files changed, 2493 insertions(+) create mode 100644 INFRA_DOCS/RAS/Anolis OS Cloud Kernel RAS White Paper.md create mode 100644 "INFRA_DOCS/RAS/Bugzilla\347\224\250\346\210\267\346\226\207\346\241\243.md" create mode 100644 "OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/Anolis OS\347\216\257\345\242\203\346\220\255\345\273\272\346\225\231\347\250\213.md" create mode 100644 "OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/\351\276\231\350\234\245 ANCK 5.10 \345\200\232\345\244\251\345\271\263\345\217\260 MPAM \346\265\213\350\257\225\346\212\245\345\221\212.md" diff --git a/INFRA_DOCS/RAS/Anolis OS Cloud Kernel RAS White Paper.md b/INFRA_DOCS/RAS/Anolis OS Cloud Kernel RAS White Paper.md new file mode 100644 index 0000000..aff77f7 --- /dev/null +++ b/INFRA_DOCS/RAS/Anolis OS Cloud Kernel RAS White Paper.md @@ -0,0 +1,982 @@ +# Anolis OS Cloud Kernel: RAS White Paper + +## REVISION HISTORY + +| DATE | VERSION | DESCRIPTION | AUTHOR | APPROVER | +| ---------- | ------- 
| --------------- | ----------------------- | ----------- | +| 2022/12/31 | 1.0 | Initial version | Shuai Xue, Ruidong Tian | Baolin Wang | + +## Terms and Abbreviations + +| Abbreviation | Definition | +| ------------ | ----------------------------------------------------------------------------- | +| RAS | Reliability, Availability and Serviceability | +| SLA | Service Level Agreement | +| CE | Correctable Error | +| UCE | Uncorrected Error | +| MCA | Machine-Check Architecture | +| CMCI | Corrected Machine Check Interrupt | +| MCE | Machine-Check Exception | +| SEA | Synchronous External Abort | +| ELx | Exception levels are referred to as EL, with x as a number between 0 and 3 | +| ECC | Error Correction Code | +| SECDED | Single-bit Error Correction and Double-bit Error Detection | +| TF-A | Trusted Firmware-A | +| HEST | Hardware Error Source Table | +| GHES | Generic Hardware Error Source | + +## Abstract + +Reliability, availability and serviceability (RAS) is a computer hardware engineering term referring to the elimination of hardware failures to ensure maximum system uptime. + +This document describes the memory RAS features in detail, explaining how server availability is enhanced with the memory RAS features on Yitian 710 servers running Anolis OS Cloud Kernel. + +## Introduction + +The server is one of the key components of any modern data center infrastructure. It consists of a variety of hardware parts, including processors, storage devices, PCIe devices, power supplies, and fans. + +In today’s hyperscale cloud data centers, correct server operation and data integrity are critical to ensure service continuity. In other words, we must avoid data corruption regardless of whether the data is stored in a server component (memory, cache, or processor registers) or transmitted through a platform link (Intel® UPI, PCI Express, or DMI). 
+ +Server reliability, availability, and serviceability (RAS) are crucial issues for modern enterprise IT shops that deliver mission-critical applications and services, as application delivery failures can be extremely costly per hour of system downtime. Although hardware failures are rare, they are inevitable but random events, especially for large scale data centers. If such incidents are not efficiently diagnosed, the consequences may be very serious and sometimes even catastrophic, such as data corruption or server crashes, which are top concerns in meeting SLAs (Service Level Agreements) for cloud end users. The likelihood of such failures increases statistically with the size of the servers, data, and memory required for these deployments. Furthermore, considering today’s server systems, with more and more CPU cores shared among hundreds of Virtual Machines (VMs) and more DDR DIMMs installed, the impact of a server crash caused by hardware failures is much bigger than before. + +Modern CPUs offer an extensive and robust set of RAS features in silicon to provide error detection, correction, containment, and recovery in all processors, memory, and I/O data paths based on Intel Machine Check Architecture (MCA) Recovery mechanism or ARM v8.2 RAS Extension. When a server component fails, an OS with such RAS features is capable of recovering from hardware errors, maximizing service availability and maintaining data integrity. + +## RAS Mechanism Overview + +### Error categories + +One of the most popular RAS schemes used in the memory subsystem is Error Correction Code (ECC) SECDED (Single-bit Error Correction and Double-bit Error Detection), in which, as its name indicates, the DDR controller can correct single-bit errors and detect double-bit errors in the data received from the DRAMs. + +Talking about detected hardware errors, we can classify memory errors as either corrected errors (CE) or uncorrected errors (UCE). 
+ +- **Correctable Error (CE)** - the hardware error detection mechanism detected and automatically corrected the error. +- **Uncorrected Error (UCE)** - the error is severe enough that hardware detects it but cannot correct it. + +![](../../assets/MCA_categories_2.png) + +Typically, uncorrectable errors further fall into three categories: + +- **Uncorrected Recoverable Errors (UCR)** - are uncorrected errors that have been detected and signaled but have not corrupted the processor context. For certain UCR errors, this means that once system software has performed a certain recovery action, it is possible to continue execution on this processor. UCR error reporting provides an error containment mechanism for data poisoning. It can be further divided into: + - **Action Required (AR)**: The error occurs in execution context. If such an error is detected, and the memory access has been architecturally executed, that error is considered “consumed”. The CPU will signal a synchronous exception when an error is detected and the processor has already consumed the memory. The OS is required to take action (for example, offline the failing page or kill the failing thread) to recover from this uncorrectable error. + - **Action Optional (AO)**: The error is detected out of processor execution context, e.g. when detected by a background scrubber or accessed by a prefetch instruction. In this scenario, the data in the memory are corrupted, but it is optional for the OS to take action to recover from this uncorrectable error. +- **Uncorrected Error (UC)** - a 2-bit (uncorrectable) error occurs and cannot be corrected by hardware. The processor context is corrupted and the processor cannot continue to operate the system. The OS is required to panic immediately. + +The OS will take specific actions based on the above failures. Handling CEs is done in silicon, e.g. using ECCs, and can be made transparent to the system. 
Handling UCEs, however, can require collaboration from higher layers in the hardware-software stack, from silicon to virtual memory manager, to the operating system (OS), and sometimes even the application layer. + +### X86 MCA Recovery + +The new Intel Xeon Scalable Family processors support recovery from some memory errors based on the Machine Check Architecture (MCA) Recovery mechanism. The figure shows a basic workflow with legacy MCA or EMCA. + +Prior to enhanced machine check architecture (EMCA), the IA32-legacy version of Machine Check Architecture (MCA) implemented error handling where all the errors were logged in architected registers (MC banks) and signaled to the OS/hypervisor. CMCI is signaled only when the CE count is over the threshold, and the OS CMCI handler, a.k.a. `threshold_interrupt`, reads the MC banks and other HW registers for further error handling. MCE is signaled when uncorrected or fatal errors are detected, and its handler `do_machine_check` will poison the page and then kill the current thread in memory failure. + +EMCA enables BIOS-based recovery from errors, which redirects MCE and CMCI to firmware first (via SMI) before sending them to the OS error handler. It allows firmware to first handle, collect, and build enhanced error logs and then report to system software. + +![ras_x86.png](../../assets/ras_x86.png) + +### ARM v8.2 RAS Extension + +The RAS Extension is a mandatory extension to the Armv8.2 architecture, and it is an optional extension to the Armv8.0 and Armv8.1 architectures. The figure shows a basic workflow with Firmware First mode. +![m1_ras_flow.png](../../assets/m1_ras_flow.png) + +- Prerequisite: System boot and init + + - Platform RAS driver init: BL31 initializes SPM (includes MM dispatcher) and SDEI dispatcher, UEFI queries and updates error source info in HEST + - OS RAS driver init: HEST driver scans HEST table and registers error handlers by APEI notification, e.g. SDEI, SEA, GPIO, etc. + +1. When a RAS event (UE or CE) occurs, the event is routed to EL3 (SPM). 
+2. SPM routes the event to RAS error handler in S-EL0 (MM Foundation). +3. MM Foundation creates the CPER blobs by the info from RAS Extension. +4. SPM notifies RAS event through APEI notification, e.g. SDEI, SEA, etc. to call the corresponding OS registered handler. +5. OS gets the CPER blobs by Error Status Address block, processes the error, and tries to recover. +6. OS reports the error event by RAS tracepoints. +7. rasdaemon log error info from RAS event to recorder. + +For example, the platform specifies SDEI as an APEI notification to handle RAS events. As part of initialization, the kernel registers a handler for a platform event, enables the event, and unmasks the current PE. At a later point in time, a critical event, e.g. DDR UE interrupt is trapped into EL3. EL3 performs a first-level triage of the event, and a RAS component assumes further handling. The dispatch completes, but intends to involve Non-secure world UEFI in further handling, and therefore decides to explicitly dispatch an event (which the kernel had already registered for). + +## RAS Solution on ANCK + +Modern CPU offers an extensive and robust set of RAS features in silicon to provide error detection, correction, containment, and recovery in all processors, memory, and I/O data paths based on Intel Machine Check Architecture (MCA) Recovery mechanism or ARM v8.2 RAS Extension. The RAS mechanism is intended to assist CPU designers and CPU debuggers in diagnosing, isolating, and understanding processor failures. It is also intended to help system administrators detect transient and age-related failures, suffered during long-term operation of the server. 
+ +To reduce systems downtime, the OS recovery process for ensuring reliable hardware performance is to detect and correct errors where possible, recover from uncorrectable errors through either physical or logical replacement of a failing component or data path, and prevent future errors by replacing in timely fashion components most likely to fail. + +The figure shows the system error handling flow with Anolis OS. + +![RAS_OS_Error_Flow.png](../../assets/RAS_OS_Error_Flow.png) + +### Memory Failure Recovery + +The RAS mechanism is used to detect, signal, and record machine fault information. Some of these faults are correctable, whereas others are uncorrectable. The Memory Failure Recovery capabilities of RAS mechanism allow systems to continue to operate when an uncorrected error is detected in the system. If not for these capabilities, the system would crash and might require hardware replacement or a system reboot. + +When an uncorrectable error is detected on a requested memory address, data poisoning is used to inform the CPU that the data requested has an uncorrectable error. When the hardware detects an uncorrectable memory error, it routes a poison bit along with the data to the CPU. For the Intel architecture, when the CPU detects this poison bit, it sends a processor interrupt signal to the operating system to notify it of this error. The operating system can then examine the uncorrectable memory error, determine if the software can recover, and perform recovery actions via an interrupt handler. + +Memory Failure Recovery handles UCR errors including: + +- AR are synchronous Errors. There are two types of such errors signaled as data abort or instruction abort. For example, data abort is detected by Data Cache Unit (DCU) and instruction abort is detected by Instruction Fetch Unit (IFU) which are both signaled as Machine Check Exception. The analogy exception is Synchronous External Abort in Arm64 platform. + +- AO are asynchronous Errors. 
Such errors are detected by memory patrol scrub, prefetch, Last Level Cache (LLC) explicit writeback transaction for X86 platform or store less than ECC protection granularity, e.g. per 64 bit on Neoverse N1 and N2. + +The kernel will attempt to hard-offline the page, by trying to unmap the page or killing any owner, or triggering IO errors if needed. This may kill any processes accessing the page. The kernel will avoid accessing this page, assuming it's poisoned by the hardware. +Let's dive into more details about Anolis OS Cloud Kernel running on servers capable of Intel MCA Recovery or ARM v8.2 RAS Extension. + +#### User Space Action Required Recovery + +In Linux, user memory and kernel memory are independent and implemented in separate address spaces. The address spaces are virtualized, meaning that the addresses are abstracted from physical memory (through a process detailed shortly). In fact, the kernel itself resides in one address space, and each process resides in its own address space, so each process can be isolated completely and protected by the paging mechanism. These address spaces consist of virtual memory addresses, permitting many processes with independent address spaces to refer to a considerably smaller physical address space (the physical memory in the machine). Not only is this convenient, but it's also secure, because each address space is independent and isolated. One isolated address space per process is the basis of preventing the fault from being propagated to the enclosing scope or process. + +Without OS memory failure recovery and hardware data poisoning support, once a process is consuming poison, it will be regarded as a fatal event and the kernel will crash immediately. When the OS kernel receives the UCE events, the `memory_failure` function (HWPoison handler) analyzes the log to verify if recovery is feasible. 
It then takes actions to offline the affected memory page and logs the event in the +mcelog or RAS tracepoint, and the possible results of the actions appear to be ignoring, recovery, delay, and failure. + +The HWPoison handler starts the recovery action by isolating the affected page and declaring it with a “poisoned” tag to disallow any reuse of the page. In the case of an AR-instruction abort event, the HWPoison handler then reloads the 4KB page containing the instruction to a new physical page and resumes normal operation. In the case of an AR-data abort event, the HWPoison handler triggers a “SIGBUS” event to take further recovery action by notifying only the accessing process or any owner process which is configured by hwpoison-aware technique like prctl or early kill. The application has a choice to either reload the data and resume normal execution, or terminate the application to avoid crashing the entire system. + +![EL0_Recovery.png](../../assets/EL0_Recovery.png) + +#### Kernel Space Action Required Recovery + +The kernel itself resides in one address space, and contains a process scheduler, networking stack, virtual file system, and device drivers for hardware support, to name just a few, shared by all user space processes. When a user space application requires the services provided by the kernel, it will signal the kernel to execute a syscall, and switch to kernel mode for the duration of the syscall execution. In principle, if any UCE error was triggered while executing OS kernel code, then the UCE error will be fatal. +Kernel also provides user space memory access APIs for cross-space data movement from or to user memory. In Linux, cross-space data movement is restricted to special functions, defined in `<linux/uaccess.h>`. 
Such a movement is either performed by a generic (memcpy-like) function or by functions optimized for a specific data size (char, short, int, long). The role of the data-movement functions is shown in the following figure as it relates to the types involved for copy (simple vs. aggregate). Note that not all user access APIs are shown. + +![uaccess.png](../../assets/uaccess.png) + +For example, when a user process tries to write a buffer to a file, kernel will copy the data from userspace and then write them to disk. If a UCE error occurs in the userspace buffer, kernel will consume the poison data while copying data from userspace. In such case, a system wide reboot is unnecessary. The point behind Kernel Space Action Required Recovery is that the poison data manipulated by kernel is owned by the user process. If the application that initiated the copy and owned corrupt data can be easily identified by the kernel, it is possible to isolate the corrupt data by marking the affected page with the ‘poison’ tag and terminating the initiator/impacted applications to stop the corrupt data from spreading. + +The mechanism is to track uaccess in extable in advance and change pc to the fixup handler while handling synchronous errors. Then the uaccess will jump to the fixup handler, which then ends the uaccess process. If the exception is fixed up correctly, the kernel can avoid panic. In the copy from user case, e.g. initiated by write(2), it is not even necessary to send a SIGBUS. System calls should return -EFAULT or a short count for write(2). The figure shows the basic workflow for Arm64 platform and the implementation of the X86 platform is similar. + +![EL2_Recovery_x86.png](../../assets/EL2_Recovery_x86.png) + +#### Action Optional Recovery: Patrol Scrub + +ECC Patrol Scrubber is a common block in DDR Controller (DDRC) capable of generating initialization write commands, periodic read commands, periodic RMW commands, and correction RMW commands. 
It proactively searches the system memory, repairing correctable errors. Periodic scrubbing is performed by the ECC Patrol Scrubber to prevent the accumulation of single-bit errors and increase the reliability of the system by correcting single-bit ECC errors in time, before they turn into uncorrectable 2-bit errors. +When an uncorrectable 2-bit error is detected by Patrol Scrubber, an interrupt will be signaled. In such case, kernel will just unmap the poisoned page because no process is accessing the poison data by default. + +On old generation X86 platforms, after the patrol scrub detects uncorrected memory data errors, it will report them to the OS via MCE. New generation platforms, such as Intel® Xeon® Processor-based platforms, have a `UCE_TO_CE_DOWNGRADE` mode where the users can request the memory controller to report UCE found by the patrol scrubber as a corrected type. It is also called ‘downgrading patrol scrub CE/SRAO to CE’. Those errors are signaled by using CMCI, a process less disruptive than a machine check, which helps avoid double MCE interrupts crashing the system. We recommend setting it on. + +![scrub_recovery](../../assets/Scrub_Recovery.png) + +#### Action Optional Recovery: Prefetch + +Many modern processors implement implicit hardware prefetching and support software prefetching. With software prefetching the programmer or compiler inserts prefetch instructions into the program. For example, Prefetch from Memory (`PRFM`) enables code to provide a hint to the memory system that data from a particular address will be used by the program soon. With implicit hardware prefetching, in contrast, the processor monitors the memory access pattern of the running program, tries to predict what data the program will access next, and prefetches that data. + +If a prefetch request accesses poisoned data, an asynchronous error will be detected and an interrupt will be signaled, e.g. CMCI on Intel Icelake and SPI on Yitian 710. 
In such case, kernel will just unmap the poison page like Patrol Scrub error. + +Another prefetch scenario we observed is that the poisoned page may still be accessed even though all its owned user processes are killed. After a page is poisoned, it will never be reused, e.g. reallocated to other processes. The problem is that the poisoned page is only unmapped from the page table of user-space process, the kernel page table of the linear mapping range is not considered. It requires dynamically splitting the target linear mapping into PTE granularity and then clearing the PTE valid attribute of the related virtual address while processing memory failure. As a result, the poisoned page will be marked as not-present, which avoids speculative and prefetch access. + +#### Action Optional Recovery: Store + +Write is another type of request which may read the poison data from DDR controller. On Yitian 710, L2 cache is protected by a per 64-bit ECC scheme, a write of less than 64 bits will trigger asynchronous External Aborts, signaled as SErrors. Similarly, an asynchronous interrupt CMCI is signaled on X86 platform. In such case, it requires firmware to take extra care that does not notify kernel as a fatal error to avoid a system wide reboot. + +Unlike read access, write access does not cause error propagation. When such an error is detected, kernel will regard it as AO asynchronous error and only unmap the poisoned page. However, the write did not take effect, resulting in data loss. A subsequent 64-bit write access has the opportunity to correct this error. When the process tries to consume the poisoned page, the HWPoison handler triggers a “SIGBUS” event to take further recovery action by notifying only the accessing process or any owner process which is configured by hwpoison-aware technique like prctl or early kill. 
+ +#### HWPoison-aware Strategy + +There are in principle two hwpoison-aware strategies to kill processes on poison: + +- just unmap the data and wait for an actual reference before killing +- kill all processes that have the corrupted and not reloadable page mapped as soon as the corruption is detected. + +Both have advantages and disadvantages and should be used in different situations. **Right now both are implemented and can be switched** with a new sysctl vm.memory_failure_early_kill. The default is late kill. Applications can override this setting individually with the PR_MCE_KILL prctl. For example, if early kill is set by `sysctl -w vm.memory_failure_early_kill=1`, kernel will kill any process which mapped the poison page when an uncorrectable 2-bit error is detected by Patrol Scrubber. + +Note, the kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can handle this if they want to. For AR synchronous errors, the kill is done using a catchable SIGBUS with BUS_MCEERR_AR. + +### Memory Predictive Failure Analysis with Rasdaemon + +When a 1-bit error is detected, it is transparently corrected by the hardware ECC mechanism, and internal counters are updated. If a correctable fault occurs in the memory, we don't need to perform any recovery action on the OS. However, if we continue to see correctable errors, then perhaps the memory is failing. To avoid the possibility of future uncorrectable faults on the same page, we can copy the data to a different page and mark the page as offline. This is the mechanism used by Memory Predictive Failure Analysis (PFA). + +The PFA is powered by the userspace rasdaemon package. Rasdaemon, written by Mauro Carvalho Chehab, is one of the tools to gather MCE information. Previously, the task was performed by the mcelog package. However, the driver it depends on has been deprecated after kernel 4.12, so we recommend switching to the new generation rasdaemon solution. 
+ +If a memory error is detected and signaled, the OS related handler reports it to userspace through RAS tracepoints with EDAC decoded DIMM statistics for accounting and predictive failure analysis. Rasdaemon runs as a daemon that monitors the platform RAS reports from the Linux kernel trace events. It optionally records RAS events via Sqlite3, which has the benefit of keeping a persistent record of the RAS events. Based on statistical results, some actions can be configured and taken to prevent corrected errors from evolving into uncorrected errors. For example, specify soft offline action or hard offline action when exceeding a page error threshold within refresh cycles, e.g. 50 CEs per 24 hours. When a soft action is specified, the kernel will then attempt to soft-offline the page, by moving the contents elsewhere or dropping it if possible. The page will then be placed on the bad page list and never be reused. The page is still accessible, not poisoned. The kernel will never kill anything for this, but rather fail the offline. + +Note that although this paper focuses on memory, the RAS feature is not limited to memory: processor, PCIe, and platform (e.g. CMN, GIC, SMMU, etc.) RAS are also supported on Anolis OS Cloud Kernel. + +## RAS Validation Guide + +EINJ provides a hardware error injection mechanism. It is very useful for debugging and testing APEI and RAS features in general. In this white paper, we take Yitian 710 running Anolis OS as an example. Note that this guide is also suitable for other platforms with advanced RAS features. + +### Prerequisite + +#### BIOS Requirement + +You need to check whether your BIOS supports EINJ first. For Panjiu M Series equipped with Yitian 710, make sure to set the following configuration properly.
+ +```bash +[Platform Configuration][Processor Configuration][CPU Poison] +[Platform Configuration][Memory RAS Configuration][Poison] +[Platform Configuration][Memory RAS Configuration][CE threshold ]<0> +[Platform Configuration][Memory RAS Configuration][Ecc] +[Platform Configuration][PCI-E Configuration][PCIe RAS Support] +[Platform Configuration][PCI-E Configuration][AER CE] +[Platform Configuration][Advance Configuration][Global RAS Enable] +[Platform Configuration][Advance Configuration][EINJ Enable] +[Platform Configuration][Advance Configuration][Route EA to El3] +``` + +#### OS Requirement + +Then, from the running OS, confirm that the BIOS exposes the EINJ table. For that, look for early boot messages similar to this one, e.g. on Yitian 710: + +```bash +#dmesg | grep EINJ +[ 0.000000] ACPI: EINJ 0x00000000F8FAFE18 000150 (v01 PTG PTG01 00000000 PTG 20200717) +``` + +which shows that the BIOS is exposing an EINJ table - it is the mechanism through which the injection is done. + +By default, the EINJ driver is built-in on Anolis OS. If you build kernel from scratch, make sure the following options are enabled in your kernel configuration: + +```shell +CONFIG_DEBUG_FS +CONFIG_ACPI_APEI +CONFIG_ACPI_APEI_EINJ +``` + +Check if the einj module is loaded: + +```shell +$ lsmod | grep einj +einj 16384 0 +``` + +If not, load the einj module manually: + +```shell +modprobe einj +``` + +### EINJ Interface + +The EINJ user interface is in `<debugfs mount point>/apei/einj`, by default, `/sys/kernel/debug/apei/einj`. + +```bash +#ls /sys/kernel/debug/apei/einj/ +available_error_type error_inject error_type flags notrigger param1 param2 param3 param4 vendor vendor_flags +``` + +The standard error types for the EINJ interface include Processor, Memory, PCIe, and Platform. The file `available_error_type` displays the supported standard error types and their severities, e.g.
+ +```bash +#cat /sys/kernel/debug/apei/einj/available_error_type +0x00000001 Processor Correctable +0x00000002 Processor Uncorrectable non-fatal +0x00000004 Processor Uncorrectable fatal +0x00000008 Memory Correctable +0x00000010 Memory Uncorrectable non-fatal +0x00000020 Memory Uncorrectable fatal +0x00000040 PCI Express Correctable +0x00000080 PCI Express Uncorrectable non-fatal +0x00000100 PCI Express Uncorrectable fatal +0x00000200 Platform Correctable +0x00000400 Platform Uncorrectable non-fatal +0x00000800 Platform Uncorrectable fatal +``` + +The error injection mechanism is a two-step process. + +- First, select an error and specify all necessary error parameters, including `error_type`, `flags`, `param{1-4}` and `notrigger`, then write any integer to `error_inject` to inject the error. +- The second step performs some actions to trigger it. Setting `notrigger` to 1 skips the trigger phase, which may allow the user to cause the error in some other context by a simple access to the CPU, memory location, or device that is the target of the error injection. With `notrigger` set to 0, the BIOS should trigger the error internally, e.g. by kicking the patrol scrubber. Whether this actually works depends on what operations the BIOS actually includes in the trigger phase. + +Please refer to the kernel documentation for more details about the EINJ user interface format. + +#### Error Injection Examples with APEI Debugfs + +In this section, we show examples of injecting errors with APEI Debugfs on Yitian 710.
+ +##### Processor Uncorrectable non-fatal + +```bash +APEI_IF=/sys/kernel/debug/apei/einj +echo 33 > $APEI_IF/param3 # APIC ID +echo 0x1 > $APEI_IF/flags +echo 0x00000002 > $APEI_IF/error_type +echo 1 > $APEI_IF/error_inject +``` + +The dmesg log: + +```bash +[ 1820.578688] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4 +[ 1820.589434] {3}[Hardware Error]: event severity: recoverable +[ 1820.595078] {3}[Hardware Error]: precise tstamp: 2023-01-02 17:23:02 +[ 1820.601503] {3}[Hardware Error]: Error 0, type: recoverable +[ 1820.607147] {3}[Hardware Error]: section_type: ARM processor error +[ 1820.613485] {3}[Hardware Error]: MIDR: 0x00000000410fd490 +[ 1820.619041] {3}[Hardware Error]: Multiprocessor Affinity Register (MPIDR): 0x0000000081210000 +[ 1820.627723] {3}[Hardware Error]: running state: 0x1 +[ 1820.632759] {3}[Hardware Error]: Power State Coordination Interface state: 0 +[ 1820.639965] {3}[Hardware Error]: Error info structure 0: +[ 1820.645435] {3}[Hardware Error]: num errors: 1 +[ 1820.650037] {3}[Hardware Error]: error_type: 0, cache error +[ 1820.655854] {3}[Hardware Error]: error_info: 0x0000000000800015 +[ 1820.662019] {3}[Hardware Error]: transaction type: Instruction +[ 1820.668183] {3}[Hardware Error]: cache level: 2 +[ 1820.673045] {3}[Hardware Error]: the error has not been corrected +[ 1820.679470] {3}[Hardware Error]: type: CORE (0x41), ras_count:1 +[ 1820.685461] {3}[Hardware Error]: sub_type: 0x0 +[ 1820.689977] {3}[Hardware Error]: fr: 0x10a9a2, ctrl: 0x0, status: 0x44800007, addr: 0x800e9f716acea53d +[ 1820.699352] {3}[Hardware Error]: misc0: 0x4, misc1: 0x0, misc2: 0x0, misc3: 0x0 +``` + +##### Processor Uncorrectable fatal + +Script to inject and trigger processor uncorrectable fatal error. Note, a fatal error will cause the kernel to panic. 
+ +```bash +APEI_IF=/sys/kernel/debug/apei/einj +echo 33 > $APEI_IF/param3 # APIC ID +echo 0x1 > $APEI_IF/flags +echo 0x00000004 > $APEI_IF/error_type +echo 1 > $APEI_IF/error_inject +``` + +The dmesg log: + +```bash +[10862.838686] {10}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 3 +[10862.838687] {10}[Hardware Error]: event severity: fatal +[10862.838688] {10}[Hardware Error]: precise tstamp: 2023-01-02 19:53:43 +[10862.838688] {10}[Hardware Error]: Error 0, type: fatal +[10862.838688] {10}[Hardware Error]: section_type: ARM processor error +[10862.838689] {10}[Hardware Error]: MIDR: 0x00000000410fd490 +[10862.838689] {10}[Hardware Error]: Multiprocessor Affinity Register (MPIDR): 0x0000000081210000 +[10862.838689] {10}[Hardware Error]: running state: 0x1 +[10862.838690] {10}[Hardware Error]: Power State Coordination Interface state: 0 +[10862.838690] {10}[Hardware Error]: Error info structure 0: +[10862.838691] {10}[Hardware Error]: num errors: 1 +[10862.838691] {10}[Hardware Error]: error_type: 0, cache error +[10862.838691] {10}[Hardware Error]: error_info: 0x0000000000800015 +[10862.838692] {10}[Hardware Error]: transaction type: Instruction +[10862.838692] {10}[Hardware Error]: cache level: 2 +[10862.838693] {10}[Hardware Error]: the error has not been corrected +[10862.838693] {10}[Hardware Error]: type: CORE (0x41), ras_count:1 +[10862.838693] {10}[Hardware Error]: sub_type: 0x0 +[10862.838694] {10}[Hardware Error]: fr: 0x10a9a2, ctrl: 0x0, status: 0x74000007, addr: 0x800e9f716acea53d +[10862.838694] {10}[Hardware Error]: misc0: 0x4, misc1: 0x0, misc2: 0x0, misc3: 0x0 +[10862.838695] Kernel panic - not syncing: Fatal hardware error! +``` + +#### Memory + +##### Correctable + +Firstly, run a `victim` program in the background. The `victim` is one of the ras-tools which allocates a page in userspace and dumps the virtual and physical address of the page, and then holds on to trigger. 
+ +```bash +#victim -d & +[1] 12472 +physical address of (0xffff87fb2000) = 0x89a0f8000 +Hit any key to trigger error: +[1]+ Stopped victim -d +``` + +Then run the script below to inject and trigger a memory correctable error. Note, the CE recovery is usually implemented as a threshold-based error reporting mechanism. The default threshold for CE is 5000; in other words, the hardware only signals an interrupt once per 5000 CE errors. To test the feature, we configure the CE threshold as 0. + +```bash +echo 0x89a0f8000 > $APEI_IF/param1 +echo 0xfffffffffffff000 > $APEI_IF/param2 +echo 0x1 > $APEI_IF/flags +echo 0x00000008 > $APEI_IF/error_type +echo 1 > $APEI_IF/error_inject +``` + +The dmesg log: + +```bash +[ 1555.991595] EDAC MC0: 1 CE single-symbol chipkill ECC on unknown memory (node:0 card:0 module:0 rank:0 bank_group:4 bank_address:2 device:0 row:616 column:1024 chip_id:0 page:0x89a0f8 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0 card:0 module:0 rank:0 bank_group:4 bank_address:2 device:0 row:616 column:1024 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 1555.991600] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 1555.991602] {1}[Hardware Error]: It has been corrected by h/w and requires no further action +[ 1555.991602] {1}[Hardware Error]: event severity: corrected +[ 1555.991604] {1}[Hardware Error]: precise tstamp: 2023-01-02 17:18:38 +[ 1555.991604] {1}[Hardware Error]: Error 0, type: corrected +[ 1555.991606] {1}[Hardware Error]: section_type: memory error +[ 1555.991606] {1}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 1555.991607] {1}[Hardware Error]: physical_address: 0x000000089a0f8000 +[ 1555.991608] {1}[Hardware Error]: node:0 card:0 module:0 rank:0 bank_group:4 bank_address:2 device:0 row:616 column:1024 chip_id:0 +[ 1555.991609] {1}[Hardware Error]: error_type: 4, single-symbol chipkill ECC +[ 1555.991610] {1}[Hardware Error]: type: DDR (0x50), ras_count:1
+[ 1555.991611] {1}[Hardware Error]: sub_type: 0x0 +[ 1555.991612] {1}[Hardware Error]: fr: 0x1000200000022, ctrl: 0x0, status: 0x0, addr: 0x0 +[ 1555.991612] {1}[Hardware Error]: misc0: 0x0, misc1: 0x0, misc2: 0x200000000000000, misc3: 0x900000000000000 +``` + +##### Memory UnCorrectable Non-fatal + +Firstly, run a `victim` program in the background as described in the previous section. + +```bash +#victim -d & +physical address of (0xffff962d0000) = 0x9f8acb000 +Hit any key to trigger error: +[1]+ Stopped victim -d +``` + +Then run the script below to inject and trigger a memory uncorrectable error. Here, we set `notrigger` to 0 to let the firmware kick the DDRC scrubber to trigger the error. + +```bash +APEI_IF=/sys/kernel/debug/apei/einj +echo 0x400a4919000 > $APEI_IF/param1 +echo 0xfffffffffffff000 > $APEI_IF/param2 +echo 0x1 > $APEI_IF/flags +echo 0x00000010 > $APEI_IF/error_type +echo 0x0 > $APEI_IF/notrigger +echo 1 > $APEI_IF/error_inject +``` + +The dmesg log: + +```bash +[ 211.121855] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 211.132646] {1}[Hardware Error]: event severity: recoverable +[ 211.138292] {1}[Hardware Error]: precise tstamp: 2022-12-30 15:26:40 +[ 211.144717] {1}[Hardware Error]: Error 0, type: recoverable +[ 211.150362] {1}[Hardware Error]: section_type: memory error +[ 211.156096] {1}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 211.165125] {1}[Hardware Error]: physical_address: 0x00000400a4919000 +[ 211.171725] {1}[Hardware Error]: node:0 card:7 module:0 rank:0 bank_group:7 bank_address:0 device:0 row:146 column:1152 chip_id:0 +[ 211.183619] {1}[Hardware Error]: error_type: 14, scrub uncorrected error +[ 211.190479] {1}[Hardware Error]: type: DDR (0x50), ras_count:1 +[ 211.196383] {1}[Hardware Error]: sub_type: 0x0 +[ 211.200899] {1}[Hardware Error]: fr: 0x1000200000353, ctrl: 0x0, status: 0x0, addr: 0x0 +[ 211.208974] {1}[Hardware Error]: misc0: 0x0, misc1: 0x0, misc2:
0x0, misc3: 0x200000000000500 +[ 211.218375] Memory failure: 0x400a4919: recovery action for dirty LRU page: Recovered +``` + +At this point, the allocated physical page is unmapped and poisoned, any read access will trigger a page fault. +If we move the background process `victim`on current Linux shell to the foreground and hit any key, the victim will trigger a page fault and receive a SIGBUS signal due to the poisoned PTE entry. Because the `victim`process does not register the SIGBUS handler, it will be killed. + +```bash +#fg +victim -d + +Access time at Fri Dec 30 15:38:14 2022 + +Bus error +``` + +We can also specify `notrigger` to 1 to let the firmware skip the trigger phase and allow the `victim` process to access the target of the error injection so that the error will be detected in execution context. + +Firstly, select a page and inject an error to it, while explicitly skipping the firmware trigger phase. + +```bash +#victim -d & +[1] 9522 +physical address of (0xffffaed6d000) = 0x400aa6dd000 +Hit any key to trigger error: +[1]+ Stopped victim -d + +APEI_IF=/sys/kernel/debug/apei/einj + +echo 0x400aa6dd000 > $APEI_IF/param1 +echo 0xfffffffffffff000 > $APEI_IF/param2 +echo 0x1 > $APEI_IF/flags +echo 0x00000010 > $APEI_IF/error_type +echo 0x1 > $APEI_IF/notrigger +echo 1 > $APEI_IF/error_inject +``` + +Then move the background process `victim` on current Linux shell to the foreground and hit any key, so that the error will be triggered in execution context. The kernel will poison the page and unmap it, then send SIGBUS to the process which accesses the page. 
+ +```bash +#fg +victim -d + +Access time at Fri Dec 30 15:39:26 2022 + +Bus error +``` + +The dmesg log: + +```bash +[ 799.958832] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 799.969533] {3}[Hardware Error]: event severity: recoverable +[ 799.975179] {3}[Hardware Error]: precise tstamp: 2022-12-30 15:36:29 +[ 799.981603] {3}[Hardware Error]: Error 0, type: recoverable +[ 799.987248] {3}[Hardware Error]: section_type: memory error +[ 799.992978] {3}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 800.002007] {3}[Hardware Error]: physical_address: 0x00000400aa6dd000 +[ 800.008607] {3}[Hardware Error]: node:0 card:5 module:0 rank:1 bank_group:1 bank_address:0 device:0 row:169 column:1664 chip_id:0 +[ 800.020500] {3}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 800.027446] {3}[Hardware Error]: type: DDR (0x50), ras_count:1 +[ 800.033351] {3}[Hardware Error]: sub_type: 0x0 +[ 800.037866] {3}[Hardware Error]: fr: 0x1001000100000000, ctrl: 0xf000000000920004, status: 0xd800000Cor0040](0xadd040000d0receiveaecntr=526(d1.subch3), cnt=0x1 +[ 800.060436] {3}[Hardware Error]: misc0: 0x3f00000000040307, misc1: 0xd00000000030cd18, misc2: 0x4015, misc3: 0x200000000000100 +[ 800.072366] Memory failure: 0x400aa6dd: recovery action for dirty LRU page: Recovered +``` + +### RAS-tools + +We can also test and validate RAS features of whole system stack across hardware, firmware and OS via ras-tools. Ras-tools are an excellent set of tools to inject and test RAS ability on X86 and Arm64 platforms based on the APEI EINJ interface. 
+ +| tools | fatal | arch | Description | Usage | +| ----------- | -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------ | +| einj_mem_uc | See help | x86, Arm | inject an error and then trigger it in one of a variety of ways. | ./einj_mem_uc # See help for testname | +| cmcistorm | No | x86 | use EINJ to inject a bunch of soft errors, then consume them all as fast as possible. | ./cmcistorm # e.g. ./cmcistorm 20 1 | +| hornet | No | x86, Arm | Start a process (or point to an existing one) and inject an uncorrectable memory error to a targeted or randomly chosen memory address | ./hornet -p PID | +| lmce | No | x86 | local mce | ./lmce | +| mca-recover | No | x86, Arm | Set up to get zapped by a machine check (injected elsewhere) recovery function reports physical address of new page - so we can inject to that and repeat over and over. | ./mca-recover | +| rep_ce_page | No | x86, Arm | loop using EINJ to inject a soft error, consuming after each until the page is taken offline. | ./rep_ce_page | +| vtop | No | x86, Arm | Given a process id and virtual address, dig around in /proc/id/pagemap to find the physical address (if present) behind the virtual one. | ./vtop | +| memattr | No | Arm | Example of the Linux kernel driver that allows a user-space program to mmap a buffer of contiguous physical memory with specific memory attribute. | cd pgprot-drv
make
insmod pgprot_drv.ko pgprot=4
../memattr| +| ras-tolerance | No | Arm | This driver allows to overwrite error severity to a lower level at runtime, recoverable by default. It is useful for test. | cd ras-tolerance
make
insmod ras_tolerance.ko| + +#### Install + +On servers running Anolis OS, you can install ras-tools through `yum`. On other OSes, you could build it from scratch. + +``` bash +yum install ras-tools +``` + +#### Memory Failure Recovery Validation + +The `einj_mem_uc` tool allocates pages, injects an error and then triggers it in one of a variety of ways. It is intended as a coverage test for the Linux RAS related features, including CPU/Memory error containment and recovery. + +##### AR Validation + +###### User Space AR-data Recovery + +In the case of an AR-data abort event, e.g. `single`, `double`, `split`, `hugetlb`, etc., the kernel will attempt to hard-offline the page, by poisoning the page and killing the accessing process. For example, the `single` case injects an uncorrected error and triggers the error by reading a byte. + +```bash +# einj_mem_uc single +0: single vaddr = 0xffff857a3400 paddr = 8e6157400 +injecting ... +triggering ... +signal 7 code 4 addr 0xffff857a3400 +page not present +Test passed +``` + +`einj_mem_uc` will print the received signal and its code, in the above case, + +- signal 7: SIGBUS +- code 4: BUS_MCEERR_AR 4 + +The dmesg log: + +```bash +[ 1785.908893] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:0 module:0 rank:0 bank_group:1 bank_address:2 device:0 row:920 column:896 chip_id:0 page:0x8e6157 offset:0x400 grain:1 - APEI location: node:0 card:0 module:0 rank:0 bank_group:1 bank_address:2 device:0 row:920 column:896 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 1785.908900] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 1785.919531] {1}[Hardware Error]: event severity: recoverable +[ 1785.925176] {1}[Hardware Error]: precise tstamp: 2023-01-17 18:05:09 +[ 1785.931600] {1}[Hardware Error]: Error 0, type: recoverable +[ 1785.937244] {1}[Hardware Error]: section_type: memory error +[ 1785.942975] {1}[Hardware Error]: error_status: Storage error in DRAM
memory (0x0000000000000400) +[ 1785.952004] {1}[Hardware Error]: physical_address: 0x00000008e6157400 +[ 1785.958603] {1}[Hardware Error]: node:0 card:0 module:0 rank:0 bank_group:1 bank_address:2 device:0 row:920 column:896 chip_id:0 +[ 1785.970409] {1}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 1785.977355] {1}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 1785.983606] {1}[Hardware Error]: Synchronous Exception taken in EL0 +[ 1785.989944] {1}[Hardware Error]: ESR: 0x92000410, ELR: 0x403abc, FAR: 0xfa00a88, SCR: 0x403073d, SCTLR: 0x30cd183f, LR: 0x403abc +[ 1786.001578] {1}[Hardware Error]: ECCERRCNT: 0x10000, ECCSTAT: 0x0, ADVECCSTAT: 0x8000002, ECCSYMBOL: 0x170000, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 0x0, ECCCDATA1: 0x0, ECCUADDR0: 0x398, ECCUADDR1: 0x1020380, ECCUDATA0: 0x1ff, ECCUDATA1: 0x0 +[ 1786.036640] Memory failure: 0x8e6157: recovery action for dirty LRU page: Recovered + +``` + +###### User Space AR-instruction Recovery + +In the case of an AR-instruction abort event, e.g. `instr`, it injects an uncorrected error and triggers the error by reading a byte. The kernel will attempt to hard-offline the page. It unmaps the corrupted page, reloads the 4KB page containing the instruction to a new physical page and resumes normal operation. + +```bash +# einj_mem_uc instr +0: instr vaddr = 0x403000 paddr = 8bba93000 +injecting ... +triggering ... 
+Test passed +``` + +The dmesg log: + +```bash +[ 1945.804589] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:7 module:0 rank:1 bank_group:1 bank_address:3 device:0 row:527 column:640 chip_id:0 page:0x40883e65 offset:0x0 grain:1 - APEI location: node:0 card:7 module:0 rank:1 bank_group:1 bank_address:3 device:0 row:527 column:640 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 1945.804596] {3}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 1945.815209] {3}[Hardware Error]: event severity: recoverable +[ 1945.820854] {3}[Hardware Error]: precise tstamp: 2023-01-17 18:07:49 +[ 1945.827280] {3}[Hardware Error]: Error 0, type: recoverable +[ 1945.832924] {3}[Hardware Error]: section_type: memory error +[ 1945.838654] {3}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 1945.847683] {3}[Hardware Error]: physical_address: 0x0000040883e65000 +[ 1945.854283] {3}[Hardware Error]: node:0 card:7 module:0 rank:1 bank_group:1 bank_address:3 device:0 row:527 column:640 chip_id:0 +[ 1945.866089] {3}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 1945.873035] {3}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 1945.879286] {3}[Hardware Error]: Synchronous Exception taken in EL0 +[ 1945.885625] {3}[Hardware Error]: ESR: 0x82000010, ELR: 0x403000, FAR: 0x403000, SCR: 0x403073d, SCTLR: 0x30cd183f, LR: 0x403f94 +[ 1945.906459] {3}[Hardware Error]: ECCERRCNT: 0x10000, ECCSTAT: 0x0, ADVECCSTAT: 0x8000002, ECCSYMBOL: 0x140000, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 0x0, ECCCDATA1: 0x0, ECCUADDR0: 0x100020f, ECCUADDR1: 0x1030280, ECCUDATA0: 0x1ff, ECCUDATA1: 0x0 +[ 1945.934071] Memory failure: 0x40883e65: corrupted page was clean: dropped without side effects +[ 1945.934084] Memory failure: 0x40883e65: recovery action for clean LRU page: Recovered +``` + +###### Kernel Space AR Recovery + +Kernel 
Space AR Recovery is only supported on X86 platform and we are still working on it on Arm64 platform. The recovery is evaluated on X86 icelake processor. + +First, inject an uncorrected error and trigger it by writing a buffer to a file. Kernel will copy data from user space and then write to disk. + +```bash +# einj_mem_uc copyin -f +0: copyin vaddr = 0x7f8f873e2400 paddr = 2869c1400 +injecting ... +triggering ... +einj_mem_uc: couldn't write temp file (errno=14) +Big surprise ... still running. Thought that would be fatal +Saw local machine check +Test passed +``` + +As we can see, the process is still running and the return errno for the write(2) is EFAULT(14). + +The dmesg log: + +```bash +SetMemoryDeviceStatus UCE error. Data = 00 4C A5 01 02 00 06 01 05 00 00 00 00 00 Status = Success +[15322.535921] mce: Kernel accessed poison in user space at 2869c1400 +[15322.536023] {2}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 +[15322.542117] Memory failure: 0x2869c1: recovery action for dirty LRU page: Recovered +[15322.550382] {2}[Hardware Error]: event severity: recoverable +[15322.550385] {2}[Hardware Error]: Error 0, type: recoverable +[15322.558042] Memory failure: 0x2869c1: already hardware poisoned +[15322.563710] {2}[Hardware Error]: fru_text: Card02, ChnF, DIMM0 +[15322.563712] {2}[Hardware Error]: section_type: memory error +[15322.586981] {2}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[15322.596027] {2}[Hardware Error]: physical_address: 0x00000002869c1400 +[15322.602650] {2}[Hardware Error]: node:1 card:5 module:0 rank:0 bank:13 device:0 row:2075 column:8 +[15322.611783] {2}[Hardware Error]: error_type: 3, multi-bit ECC +[15322.617710] {2}[Hardware Error]: DIMM location: not present. 
DMI handle: 0x0000 +[15322.625304] Memory failure: 0x2869c1: already hardware poisoned +[15322.631827] EDAC MC6: 1 UE memory read error on CPU_SrcID#1_MC#2_Chan#1_DIMM#0 (channel:1 slot:0 page:0x2869c1 offset:0x400 grain:32 - err_code:0x00a0:0x0091 SystemAddress:0x2869c1400 ProcessorSocketId:0x1 MemoryControllerId:0x2 ChannelAddress:0x2069c000 ChannelId:0x1 RankAddress:0x1034e000 PhysicalRankId:0x0 DimmSlotId:0x0 Row:0x81b Column:0x8 Bank:0x1 BankGroup:0x3 ChipSelect:0x0 ChipId:0x0) +[15322.667403] EDAC MC6: 1 UE memory read error on CPU_SrcID#1_MC#2_Chan#1_DIMM#0 (channel:1 slot:0 page:0x2869c1 offset:0x400 grain:32 - err_code:0x0000:0x009f SystemAddress:0x2869c1400 ProcessorSocketId:0x1 MemoryControllerId:0x2 ChannelAddress:0x2069c000 ChannelId:0x1 RankAddress:0x1034e000 PhysicalRankId:0x0 DimmSlotId:0x0 Row:0x81b Column:0x8 Bank:0x1 BankGroup:0x3 ChipSelect:0x0 ChipId:0x0) +``` + +futex(2) is another system call in which kernel copies data from user space. Inject an uncorrected error and trigger it by issuing `FUTEX_WAIT` operation. + +```bash +# einj_mem_uc futex -f +0: futex vaddr = 0x7f8a1da83400 paddr = 25751d400 +injecting ... +triggering ... +futex returned with errno=14 +Big surprise ... still running. Thought that would be fatal +Unusual number of MCEs seen: 2 +Test passed +``` + +There are many retries in futex(2) mechanism, so it is possible to see many MCEs. + +The dmesg log: + +```bash +SetMemoryDeviceStatus UCE error. 
Data = 00 4C A5 01 02 00 06 01 05 00 00 00 00 00 Status = Success +[15521.242381] mce: Kernel accessed poison in user space at 25751d400 +[15521.242437] {4}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0 +[15521.248581] Memory failure: 0x25751d: recovery action for dirty LRU page: Recovered +[15521.256842] {4}[Hardware Error]: event severity: recoverable +[15521.256845] {4}[Hardware Error]: Error 0, type: recoverable +[15521.256847] {4}[Hardware Error]: fru_text: Card02, ChnF, DIMM0 +[15521.264506] Memory failure: 0x25751d: already hardware poisoned +[15521.270172] {4}[Hardware Error]: section_type: memory error +[15521.270173] {4}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[15521.270174] {4}[Hardware Error]: physical_address: 0x000000025751d400 +[15521.309103] {4}[Hardware Error]: node:1 card:5 module:0 rank:0 bank:4 device:0 row:1882 column:896 +[15521.318322] {4}[Hardware Error]: error_type: 3, multi-bit ECC +[15521.324252] {4}[Hardware Error]: DIMM location: not present. 
DMI handle: 0x0000 +[15521.331824] {4}[Hardware Error]: Error 1, type: recoverable +[15521.337484] {4}[Hardware Error]: section_type: memory error +[15521.343240] {4}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[15521.352286] {4}[Hardware Error]: physical_address: 0x000000025751d400 +[15521.358910] {4}[Hardware Error]: node:1 +[15521.363017] {4}[Hardware Error]: error_type: 3, multi-bit ECC +[15521.369040] Memory failure: 0x25751d: already hardware poisoned +[15521.374974] Memory failure: 0x25751d: already hardware poisoned +[15521.381515] EDAC MC6: 1 UE memory read error on CPU_SrcID#1_MC#2_Chan#1_DIMM#0 (channel:1 slot:0 page:0x25751d offset:0x400 grain:32 - err_code:0x00a0:0x0091 SystemAddress:0x25751d400 ProcessorSocketId:0x1 MemoryControllerId:0x2 ChannelAddress:0x1d751c00 ChannelId:0x1 RankAddress:0xeba9c00 PhysicalRankId:0x0 DimmSlotId:0x0 Row:0x75a Column:0x380 Bank:0x0 BankGroup:0x1 ChipSelect:0x0 ChipId:0x0) +[15521.417060] EDAC MC6: 1 UE memory read error on CPU_SrcID#1_MC#2_Chan#1_DIMM#0 (channel:1 slot:0 page:0x25751d offset:0x400 grain:32 - err_code:0x0000:0x009f SystemAddress:0x25751d400 ProcessorSocketId:0x1 MemoryControllerId:0x2 ChannelAddress:0x1d751c00 ChannelId:0x1 RankAddress:0xeba9c00 PhysicalRankId:0x0 DimmSlotId:0x0 Row:0x75a Column:0x380 Bank:0x0 BankGroup:0x1 ChipSelect:0x0 ChipId:0x0) +[15521.452740] EDAC MC6: 1 UE memory read error on CPU_SrcID#1_MC#2_Chan#1_DIMM#0 (channel:1 slot:0 page:0x25751d offset:0x400 grain:32 - err_code:0x0000:0x009f SystemAddress:0x25751d400 ProcessorSocketId:0x1 MemoryControllerId:0x2 ChannelAddress:0x1d751c00 ChannelId:0x1 RankAddress:0xeba9c00 PhysicalRankId:0x0 DimmSlotId:0x0 Row:0x75a Column:0x380 Bank:0x0 BankGroup:0x1 ChipSelect:0x0 ChipId:0x0) +``` + +##### AO Validation + +###### AO Patrol Recovery + +In the case of an AO event e.g. `patrol`, the kernel will attempt to hard-offline the page, by just poisoning and unmapping the page. 
Inject and trigger patrol error. Note, in this section, the HWPoison-aware strategy is default late kill. + +```bash +# einj_mem_uc patrol +0: patrol vaddr = 0xffff9d523400 paddr = 400a2575400 +injecting ... +triggering ... +page not present +Test passed +``` + +The dmesg log: + +```bash +[ 2026.290450] EDAC MC0: 1 UE scrub uncorrected error on unknown memory (node:0 card:6 module:0 rank:0 bank_group:2 bank_address:3 device:0 row:137 column:640 chip_id:0 page:0x400a2575 offset:0x400 grain:1 - APEI location: node:0 card:6 module:0 rank:0 bank_group:2 bank_address:3 device:0 row:137 column:640 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 2026.290460] {4}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 2026.301258] {4}[Hardware Error]: event severity: recoverable +[ 2026.306903] {4}[Hardware Error]: precise tstamp: 2023-01-17 18:09:10 +[ 2026.313328] {4}[Hardware Error]: Error 0, type: recoverable +[ 2026.318972] {4}[Hardware Error]: section_type: memory error +[ 2026.324703] {4}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 2026.333732] {4}[Hardware Error]: physical_address: 0x00000400a2575400 +[ 2026.340331] {4}[Hardware Error]: node:0 card:6 module:0 rank:0 bank_group:2 bank_address:3 device:0 row:137 column:640 chip_id:0 +[ 2026.352138] {4}[Hardware Error]: error_type: 14, scrub uncorrected error +[ 2026.358998] {4}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 2026.365249] {4}[Hardware Error]: Interrupt: 843 +[ 2026.369852] {4}[Hardware Error]: ECCERRCNT: 0x40000, ECCSTAT: 0x0, ADVECCSTAT: 0x88000002, ECCSYMBOL: 0xec0000, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 0x0, ECCCDATA1: 0x0, ECCUADDR0: 0x89, ECCUADDR1: 0x2030280, ECCUDATA0: 0x1ff, ECCUDATA1: 0x0 +[ 2026.397264] Memory failure: 0x400a2575: recovery action for dirty LRU page: Recovered +``` + +###### AO Prefetch Recovery + +First, inject an uncorrected 
error and trigger it by explicitly performing a `prfm`. The platform will signal an interrupt. + +```bash +#einj_mem_uc prefetch +0: prefetch vaddr = 0xffffbe03f400 paddr = 8c17eb400 +injecting ... +triggering ... +page not present +Test passed +``` + +The dmesg log: + +```bash +[ 7616.802823] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 7616.813922] {1}[Hardware Error]: event severity: recoverable +[ 7616.819566] {1}[Hardware Error]: Error 0, type: recoverable +[ 7616.825210] {1}[Hardware Error]: section_type: memory error +[ 7616.830940] {1}[Hardware Error]: error_status: 0x0000000000000400 +[ 7616.837191] {1}[Hardware Error]: physical_address: 0x00000008c17eb400 +[ 7616.843791] {1}[Hardware Error]: node: 0 card: 0 module: 0 rank: 1 bank_group: 3 bank_address: 0 device: 0 row: 773 column: 1408 +[ 7616.855597] {1}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 7616.862543] {1}[Hardware Error]: type: DDR (0x50), ras_count:1 +[ 7616.868447] {1}[Hardware Error]: sub_type: 0x0 +[ 7616.872962] {1}[Hardware Error]: fr: 0x1000200000026, ctrl: 0x0, status: 0x0, addr: 0x0 +[ 7616.881036] {1}[Hardware Error]: misc0: 0x0, misc1: 0x0, misc2: 0x0, misc3: 0x200000000000100 +[ 7616.889888] Memory failure: 0x8c17eb: recovery action for dirty LRU page: Recovered +``` + +###### AO Store Recovery + +First, inject an uncorrected error and trigger it by writing a byte. The write size is less than 64 bits and the platform will signal a SError. + +```bash +# einj_mem_uc strbyte +0: strbyte vaddr = 0xffffa3651400 paddr = 400afd01400 +injecting ... +triggering ... 
+page not present +Test passed +``` + +The dmesg log: + +```bash +[ 2378.241939] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:5 module:0 rank:0 bank_group:2 bank_address:1 device:0 row:191 column:128 chip_id:0 page:0x400afd01 offset:0x400 grain:1 - APEI location: node:0 card:5 module:0 rank:0 bank_group:2 bank_address:1 device:0 row:191 column:128 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 2378.241945] {5}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 2378.252573] {5}[Hardware Error]: event severity: recoverable +[ 2378.258217] {5}[Hardware Error]: precise tstamp: 2023-01-17 18:15:02 +[ 2378.264642] {5}[Hardware Error]: Error 0, type: recoverable +[ 2378.270286] {5}[Hardware Error]: section_type: memory error +[ 2378.276017] {5}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 2378.285045] {5}[Hardware Error]: physical_address: 0x00000400afd01400 +[ 2378.291644] {5}[Hardware Error]: node:0 card:5 module:0 rank:0 bank_group:2 bank_address:1 device:0 row:191 column:128 chip_id:0 +[ 2378.303451] {5}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 2378.310398] {5}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 2378.316649] {5}[Hardware Error]: SError +[ 2378.320558] {5}[Hardware Error]: ECCERRCNT: 0x10000, ECCSTAT: 0x0, ADVECCSTAT: 0x8000002, ECCSYMBOL: 0x6f0000, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 0x0, ECCCDATA1: 0x0, ECCUADDR0: 0xbf, ECCUADDR1: 0x2010080, ECCUDATA0: 0x1ff, ECCUDATA1: 0x0 +[ 2378.360399] Memory failure: 0x400afd01: recovery action for dirty LRU page: Recovered +``` + +In contrast, inject an uncorrected error and trigger it by writing a quad word. The write size is 64 bits and the platform will not signal SErrors. + +```bash +# einj_mem_uc strqword +0: strqword vaddr = 0xffff991b5400 paddr = 92b73c400 +injecting ... +triggering ... 
+Manually take page offline +Test passed +``` + +The dmesg log: + +```bash +[270286.564242] Memory failure: 0x92b73c: recovery action for dirty LRU page: Recovered +``` + +##### QEMU Validation + +First, start a VM with a stdio monitor which allows giving complex commands to the QEMU emulator. + +```bash +qemu-system-aarch64 -enable-kvm \ + -cpu host \ + -M virt,gic-version=3 \ + -m 8G \ + -d guest_errors \ + -rtc base=localtime,clock=host \ + -smp cores=2,threads=2,sockets=2 \ + -object memory-backend-ram,id=mem0,size=4G \ + -object memory-backend-ram,id=mem1,size=4G \ + -numa node,memdev=mem0,cpus=0-3,nodeid=0 \ + -numa node,memdev=mem1,cpus=4-7,nodeid=1 \ + -bios /usr/share/AAVMF/AAVMF_CODE.fd \ + -drive driver=qcow2,media=disk,cache=writeback,if=virtio,id=alinu1_rootfs,file=/media/nvme/shawn.xs/qemu/aliyun_3_arm64_20G_alpha_alibase_20210425.qcow2 \ + -netdev user,id=n1,hostfwd=tcp::5555-:22 \ + -serial telnet:localhost:4321,server,nowait \ + -device virtio-net-pci,netdev=n1 \ + -monitor stdio +QEMU 7.2.0 monitor - type 'help' for more information +(qemu) VNC server running on 127.0.0.1:5900 +``` + +Log in to the guest and install ras-tools, then run `einj_mem_uc`, which allocates a page in userspace and dumps the virtual and physical addresses of the page. The `-j` option skips error injection and the `-k` option waits for a kick. + +``` bash +$ einj_mem_uc single -j -k +0: single vaddr = 0xffffb2f27000 paddr = 154aba000 +``` + +Run the `gpa2hpa` command in the QEMU monitor with the guest physical address printed above; it prints the host physical address at which that guest physical address is mapped. + +``` bash +(qemu) gpa2hpa 0x154aba000 +Host physical address for 0x154aba000 (mem1) is 0x92b3c5000 +``` + +On the host, inject an uncorrected error at the translated host physical address via the APEI interface. 
+ +``` bash +echo 0x92b3c5000 > /sys/kernel/debug/apei/einj/param1 +echo 0xfffffffffffff000 > /sys/kernel/debug/apei/einj/param2 +echo 0x0 > /sys/kernel/debug/apei/einj/flags +echo 0x10 > /sys/kernel/debug/apei/einj/error_type +echo 1 > /sys/kernel/debug/apei/einj/notrigger +echo 1 > /sys/kernel/debug/apei/einj/error_inject +``` + +Then kick `einj_mem_uc` to trigger the error by writing "trigger_start". In this example, the kick is done on host. + +``` bash +#ssh -p 5555 root@localhost "echo trigger > ~/trigger_start" +``` + +We will observe that the QEMU process exit. + +``` bash +(qemu) qemu-system-aarch64: Hardware memory error! +``` + +The dmesg log: + +``` bash +[ 2705.654424] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 page:0x92b3c5 offset:0x0 grain:1 - APEI location: node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 2705.654432] {6}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 2705.665047] {6}[Hardware Error]: event severity: recoverable +[ 2705.670692] {6}[Hardware Error]: precise tstamp: 2023-01-17 18:20:29 +[ 2705.677118] {6}[Hardware Error]: Error 0, type: recoverable +[ 2705.682762] {6}[Hardware Error]: section_type: memory error +[ 2705.688492] {6}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 2705.697521] {6}[Hardware Error]: physical_address: 0x000000092b3c5000 +[ 2705.704121] {6}[Hardware Error]: node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 +[ 2705.716014] {6}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 2705.722960] {6}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 2705.729212] {6}[Hardware Error]: Synchronous Exception taken in EL0 +[ 2705.735551] {6}[Hardware Error]: ESR: 0x92000410, 
ELR: 0x401880, FAR: 0xffffb2e8c1d8, SCR: 0x403073d, SCTLR: 0x30cd183f, LR: 0x401840 +[ 2705.747619] {6}[Hardware Error]: ECCERRCNT: 0x10000, ECCSTAT: 0x0, ADVECCSTAT: 0x8000002, ECCSYMBOL: 0x60000, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 0x0, ECCCDATA1: 0x0, ECCUADDR0: 0x10004ac, ECCUADDR1: 0x4020280, ECCUDATA0: 0x1ff, ECCUDATA1: 0x0 +[ 2705.887179] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 page:0x92b3c5 offset:0x0 grain:1 - APEI location: node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +[ 2705.887181] {7}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 2 +[ 2705.897824] {7}[Hardware Error]: event severity: recoverable +[ 2705.903468] {7}[Hardware Error]: precise tstamp: 2023-01-17 18:20:29 +[ 2705.909893] {7}[Hardware Error]: Error 0, type: recoverable +[ 2705.915537] {7}[Hardware Error]: section_type: memory error +[ 2705.921267] {7}[Hardware Error]: error_status: Storage error in DRAM memory (0x0000000000000400) +[ 2705.930296] {7}[Hardware Error]: physical_address: 0x000000092b3c5000 +[ 2705.936895] {7}[Hardware Error]: node:0 card:0 module:0 rank:1 bank_group:4 bank_address:2 device:0 row:1196 column:640 chip_id:0 +[ 2705.948790] {7}[Hardware Error]: error_type: 5, multi-symbol chipkill ECC +[ 2705.955736] {7}[Hardware Error]: type: DDR (0x50), common_reg_nr:1 +[ 2705.961988] {7}[Hardware Error]: Synchronous Exception taken in EL0 +[ 2705.968326] {7}[Hardware Error]: ESR: 0x92000410, ELR: 0x401880, FAR: 0xffffb2e8c1d8, SCR: 0x403073d, SCTLR: 0x30cd183f, LR: 0x401840 +[ 2705.980394] {7}[Hardware Error]: ECCERRCNT: 0x0, ECCSTAT: 0x0, ADVECCSTAT: 0x0, ECCSYMBOL: 0x0, ECCERRCNTSTAT: 0x0, ECCERRCNT0: 0x0, ECCERRCNT1: 0x0, ECCCADDR0: 0x0, ECCCADDR1: 0x0, ECCCDATA0: 
0x0, ECCCDATA1: 0x0, ECCUADDR0: 0x10004ac, ECCUADDR1: 0x4020280, ECCUDATA0: 0x0, ECCUDATA1: 0x0 +[ 2706.006235] Memory failure: 0x92b3c5: Sending SIGBUS to qemu-system-aar:32293 due to hardware memory corruption +[ 2706.078549] Memory failure: 0x92b3c5: recovery action for dirty LRU page: Recovered +[ 2706.092539] Memory failure: 0x92b3c5: already hardware poisoned +[ 2706.118501] EDAC MC0: 1 UE multi-symbol chipkill ECC on unknown memory (node:0 card:0 module:0 rank:0 bank_group:1 bank_address:2 device:0 row:920 column:896 chip_id:0 page:0x0 offset:0x0 grain:1 - APEI location: node:0 card:0 module:0 rank:0 bank_group:1 bank_address:2 device:0 row:920 column:896 chip_id:0 status(0x0000000000000400): Storage error in DRAM memory) +``` + +Note, QEMU registers a SIGBUS handler and sets `PR_MCE_KILL_EARLY` via `prctl`. When an AO error occurs, e.g. one detected by the scrubber, the kernel will also send SIGBUS, but with `si_code` set to `BUS_MCEERR_AO` (5). + +##### HWPoison-aware Strategy + +First, check the strategy on your system. + +```bash +#sysctl vm.memory_failure_early_kill +vm.memory_failure_early_kill = 0 +``` + +Change to early kill mode: + +```bash +#sysctl -w vm.memory_failure_early_kill=1 +vm.memory_failure_early_kill = 1 +``` + +Then inject a `patrol` error to see the kernel behavior. + +```bash +#./einj_mem_uc patrol +0: patrol vaddr = 0xffffbe4b8400 paddr = 901656400 +injecting ... +triggering ... +signal 7 code 5 addr 0xffffbe4b8000 +Unexpected SIGBUS +page not present +Test passed +``` + +As expected, the kernel sends SIGBUS to kill the process even though it does not access the poisoned data. The `code 5` here means `BUS_MCEERR_AO` (5). + +#### Memory Predictive Failure Analysis Validation + +First of all, you'll need to install **rasdaemon**, which is packaged for most Linux distributions: + +```bash +yum install rasdaemon +``` + +Then we'll set up **rasdaemon** to launch at startup and to record events to an on-disk sqlite database. 
+ +```bash +# systemctl enable rasdaemon +# systemctl start rasdaemon +``` + +Here, we manually set `PAGE_CE_THRESHOLD="5"` in the config file `/etc/sysconfig/rasdaemon` so that we can inject errors and exceed the page error threshold more easily. Note that run-time configuration is unsupported, so a service restart is needed. + +```bash +# systemctl restart rasdaemon +``` + +Run `victim` with the `-p` option to help test the PFA function. `victim` allocates a page in userspace, dumps the virtual and physical addresses of the page, and checks the physical address in a loop. Then inject errors at the physical address 5 times; this triggers the soft action, in which the kernel soft-offlines the old page by moving its contents to a new page. + +```bash +#victim -d -p +physical address of (0xffffa5a66000) = 0x967cf1000 +Page was replaced. New physical address = 0x8bce3e000 +``` + +## Acknowledgment + +Thanks to the developers who contributed to the Linux and Anolis communities. + +## Reference + +1. [https://www.intel.com/content/www/us/en/developer/articles/technical/new-reliability-availability-and-serviceability-ras-features-in-the-intel-xeon-processor.html](https://www.intel.com/content/www/us/en/developer/articles/technical/new-reliability-availability-and-serviceability-ras-features-in-the-intel-xeon-processor.html) +2. Reliability, Availability and Serviceability (RAS) Integration and Validation Guide for the Intel® Xeon® Processor E7- v3 Family: [https://www.intel.com/content/dam/develop/external/us/en/documents/emca2-integration-validation-guide-556978.pdf](https://www.intel.com/content/dam/develop/external/us/en/documents/emca2-integration-validation-guide-556978.pdf) +3. [https://docs.kernel.org/admin-guide/ras.html](https://docs.kernel.org/admin-guide/ras.html) +4. 
[https://static.linaro.org/connect/sfo17/Presentations/SFO17-203%20-%20Reliability%2C%20Availability%2C%20and%20Serviceability%20%28RAS%29%20on%20ARM64%20status.pdf](https://static.linaro.org/connect/sfo17/Presentations/SFO17-203%20-%20Reliability%2C%20Availability%2C%20and%20Serviceability%20%28RAS%29%20on%20ARM64%20status.pdf) +5. Intel® 64 and IA-32 Architectures Software Developer’s Manual +6. [https://developer.ibm.com/articles/l-kernel-memory-access/](https://developer.ibm.com/articles/l-kernel-memory-access/) +7. [https://docs.kernel.org/admin-guide/sysctl/vm.html#memory-failure-early-kill](https://docs.kernel.org/admin-guide/sysctl/vm.html#memory-failure-early-kill) +8. Programming persistent memory: A comprehensive guide for developers +9. [https://trustedfirmware-a.readthedocs.io/en/latest/components/sdei.html](https://trustedfirmware-a.readthedocs.io/en/latest/components/sdei.html#id2) \ No newline at end of file diff --git "a/INFRA_DOCS/RAS/Bugzilla\347\224\250\346\210\267\346\226\207\346\241\243.md" "b/INFRA_DOCS/RAS/Bugzilla\347\224\250\346\210\267\346\226\207\346\241\243.md" new file mode 100644 index 0000000..1113c09 --- /dev/null +++ "b/INFRA_DOCS/RAS/Bugzilla\347\224\250\346\210\267\346\226\207\346\241\243.md" @@ -0,0 +1,242 @@ +## 一. Bugzilla初识 +Bugzilla是一个缺陷管理系统,可以用来管理Bug,也可以用来管理需求。Bugzilla可以通过浏览器页面创建Bug,也可以通过api接口创建Bug。 + +- **1.1 首页** + +![image.png](../../assets/bugzilla/mainpage1.png) +## 二. 
Bug生命周期 + +- **2.1 新建Bug** + - **2.1.1 路径** + + 首页->New/File a Bug->Select a classification->Select a product->Bug创建页面。具体如下:首先,在首页点击New或File a Bug按钮,开始创建Bug。 + ![image.png](../../assets/bugzilla/new.png) + 然后会提示选择一个classification,classification是Bug的一级分类。 + ![image.png](../../assets/bugzilla/classification.png) + 选择完classification分类后会提示选择一个product,product是Bug的二级分类。 + ![image.png](../../assets/bugzilla/product.png) + + - **2.1.2 创建页面** + + 选择完product产品后会进入Bug创建页面,创建页面有一些参数需要填写,具体参数含义如下图所示: + ![image.png](../../assets/bugzilla/param-info.png) + +- **2.2 编辑Bug** + - **2.2.1 路径** + - 首页->Search Bug/My Bugs->Bugs List->Click the Bug->Bug编辑页面。具体如下: + + 首先在首页点击My Bugs或Open Bugs assign to me or reported by me 进入Bug List 列表页。 + ![image.png](../../assets/bugzilla/bug-list.png) + 然后在Bug List列表页点击Bug ID或Summary进入Bug编辑页面。 + ![image.png](../../assets/bugzilla/bug-list-2.png) + + - **2.2.2 编辑页面** + + 在Bug编辑页面可以对Bug的部分字段进行修改,编辑页面的字段与Bug创建页面相同。 + ![image.png](../../assets/bugzilla/edit-param.png) + +- **2.3 关闭Bug** + - **2.3.1 路径** + - Bug编辑页面->Status->Resolved、Fixed->Save Changes + + 在Bug页面编辑Status状态,当状态被修改为[RESOLVED,FIXED]时表示Bug关闭,至此Bug的生命周期结束。 + ![image.png](../../assets/bugzilla/bugstatus.png) + +## 三. 
Bugzilla Restful Api + +Bugzilla 提供了对外开放的restful api接口,通过接口可以方便的对Bug进行一些操作。官方接口描述如下:[https://bugzilla.readthedocs.io/en/5.0/api/index.html#apis](https://bugzilla.readthedocs.io/en/5.0/api/index.html#apis)。 + +- **3.1 官方Api** + + 官方api包括对Bug、评论、组件、产品等的操作,具体如下: + ![image.png](../../assets/bugzilla/api.png) + +- **3.2 额外的Api** + +除了官方Api之外,还单独定制了一部分Api,包括User、Bug、子组件等。 + + - **3.2.1 Sub Component Api** + + - **Get Sub Components** + + 查询现有的子组件详情 + + - **Request**: + ```json + https://bugzilla.openanolis.cn/rest/subcomponent?ids=(bug_id) + ``` + - **Params Description**: + + | **name** | **type** | **description** | + | --- | --- | --- | + | bug_id | int | Bug ID | + + - **Response**: + ```json + { + "subcomponents": [ + { + "default_cc": [ + "xxx@163.com" + ], + "is_active": true, + "default_docs_contact": "", + "id": 10, + "component_name": "user", + "level": 0, + "description": "rest api create subcomponent", + "default_qa_contact": "xxx@163.com", + "default_to_private": false, + "product_id": 2, + "sort_key": 0, + "parent_id": null, + "product_name": "testfarm", + "name_tree": [ + "sub_user3_9" + ], + "default_assignee": "xxx@163.com", + "name": "sub_user3_9", + "agile_team": "", + "component_id": 2 + } + ] + } + ``` + + - **Post Sub Component** + + 添加子组件 + + - **Request**: + ```json + https://bugzilla.openanolis.cn/rest/subcomponent + ``` + + - **Body Description**: + + | **name** | **type** | **description** | + | --- | --- | --- | + | api_key | string | api auth key | + | product | string | 产品名 | + | component | string | 组件名 | + | subcomponent | string | 子组件名 | + | description | string | 描述 | + | default_assignee | string | 默认指派人邮箱 | + | default_docs_contact | string | 默认Docs邮箱 | + | default_qa_contact | string | 默认QA邮箱 | + | default_cc | list | 抄送邮箱列表 | + + - **Response**: + ```json + { + id: 23 + } + ``` + + - **Delete Sub Component** + + 删除子组件 + + - **Request**: + ```json + 
https://bugzilla.openanolis.cn/rest/subcomponent?api_key=(api_key)&ids=(sub_component_id)&component=(component)&product=(product) + ``` + + - **Params Description**: + + | **name** | **type** | **description** | + | --- | --- | --- | + | api_key | string | api auth key | + | product | string | 产品名 | + | component | string | 组件名 | + | sub_component_id | int | 子组件id | + + - **Response**: + ```json + { + Deleted: [23] + } + ``` + + - **3.2.2 Component Api** + - **Get Components** + + 查询现有的所有组件 + + - **Request**: + ```json + https://bugzilla.openanolis.cn/rest/component + ``` + + - **Params Description**: + + | **name** | **type** | **description** | + | --- | --- | --- | + | | | | + + - **Response**: + + ```json + { + "components" : [ + { + "id" : 3, // component id + "name" : "com_123" // component name + }, + { + "id" : 4, + "name" : "product-length" + }, + { + "id" : 1, + "name" : "TestComponent" + }, + { + "id" : 2, + "name" : "user" + }, + { + "id" : 5, + "name" : "user2" + } + ] + } + ``` + +## 四. 
Bugzilla python 命令行插件 + +python-bugzilla-cli 命令行插件支持通过python命令行去操作Bugzilla,比如创建、编辑Bug、子组件等。 + +- 4.1 插件安装: + - 在命令行执行命令:pip install python-bugzilla-anolis 安装命令行插件,插件地址:[https://pypi.org/project/python-bugzilla-anolis/](https://pypi.org/project/python-bugzilla-anolis/) + + ![image.png](../../assets/bugzilla/plugin.png) + +- 4.2 插件配置: + - 生成api_key: [https://bugzilla.openanolis.cn/userprefs.cgi?tab=apikey](https://bugzilla.openanolis.cn/userprefs.cgi?tab=apikey),生成的api_key需要有bug edit权限,api_key权限和当前帐号权限一致。 + + ![image.png](../../assets/bugzilla/api-key.png) + + - 配置bugzillarc文件,配置url和api_key,python-bugzilla-cli插件可自动读取配置的参数,在使用命令时无需额外携带参数,bugzillarc文件内容如下: + + ![image.png](../../assets/bugzilla/bugzillarc.png) + +- 4.3 插件使用: + - 配置好bugzillarc文件之后,就可以使用bugzilla-anolis命令操作Bugzilla了,使用bugzilla-anolis命令查看当前支持的所有参数: + + ![image.png](../../assets/bugzilla/bugzilla-anolis.png) + + - 执行sub component相关命令: + - 查询: + ```json + bugzilla-anolis getsubcomponent --ids=10 --ids=11 + ``` + ![image.png](../../assets/bugzilla/sub%20component.png) + + - 添加:bugzilla-anolis addsubcomponent --product=testfarm --component=user --subcomponent=sub_user77_5 --description="rest api create subcomponent" --default_assignee=shankailun@163.com --default_docs_contact=shankailun@163.com --default_qa_contact=shankailun@163.com --default_cc="shankailun@163.com,1174224378@qq.com" + + ![image.png](../../assets/bugzilla/addsubcomponent.png) + + - 删除:bugzilla-anolis deletesubcomponent --ids=39 --product=testfarm --component=user + + ![image.png](../../assets/bugzilla/deletesubcomponent.png) diff --git "a/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/Anolis OS\347\216\257\345\242\203\346\220\255\345\273\272\346\225\231\347\250\213.md" 
"b/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/Anolis OS\347\216\257\345\242\203\346\220\255\345\273\272\346\225\231\347\250\213.md" new file mode 100644 index 0000000..a9f7ecb --- /dev/null +++ "b/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/Anolis OS\347\216\257\345\242\203\346\220\255\345\273\272\346\225\231\347\250\213.md" @@ -0,0 +1,268 @@ +# 写在前面 + +Anolis OS 是 OpenAnolis 社区推出的完全开源、中立、开放的发行版,它支持多计算架构,也面向云端场景优化。 + +在您使用Anolis OS之前,我们提供了一个预装Anolis OS的在线机器资源服务。我们**强烈建议**您访问[**龙蜥实验室**](https://lab.openanolis.cn/#/apply/home),使用Web页面及机器人等形式自动创建和管理机器资源,以此来对Anolis OS进行体验。 + +您可以访问[龙蜥实验室使用指南](https://www.yuque.com/anolis-docs/community/peng85),来进行**一键申请**和 **免费试用** 。 + +![](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/63156315/1656644119956-01a1cabe-eb42-4c64-82d8-902d01afb26d.png) + +我们提供两种方式安装Anolis OS: + +* ISO镜像安装 +* qcow虚拟机镜像安装 + +## 一、通过ISO进行安装 + +### 1.1 ISO镜像下载 + +登陆下载界面获取Anolis OS最新iso镜像文件 + +[https://openanolis.cn/download](https://openanolis.cn/download) + +![](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/63156315/1656504756808-2cdce132-2ff8-4d66-a96d-18cd6525601a.png) + +### 1.2 镜像安装 + +参考该文档,通过图形化安装接口部署Anolis8/7至目标平台: + +[https://www.yuque.com/anolis-docs/manual/installation](https://www.yuque.com/anolis-docs/manual/installation) + +## 二、 通过qcow虚拟机镜像安装 + +首先,验证CPU是否支持KVM; + +`egrep '(vmx|svm)' /proc/cpuinfo` + +如果结果中有vmx(Intel)或svm(AMD)字样,就说明CPU是支持的。 + +如果您是买的ECS,或者已经开了虚拟机,那大概率没办法再通过KVM的方式进行安装。 + +### 2.1 虚拟机镜像下载 + +登陆下载界面获取Anolis OS最新qcow2镜像文件 + +[https://openanolis.cn/download](https://openanolis.cn/download) + 
+这里以7.9为例:点击网址中的下载按钮后,选择相应架构的文件夹进入,即可看到对应的下载列表,请选择**AnolisOS-7.9-GA-x86_64-ANCK.qcow2**文件进行下载。 + +![](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/63156315/1656574325523-958feb42-fbbf-4974-9bd9-9d6827dd99db.png) + +### 2.2 安装依赖包 + +`sudo yum install -y qemu-kvm libvirt virt-install bridge-utils` + +### 2.3 启动前配置 + +#### 2.3.1 libvirt服务 + +开启libvirt服务 + +`systemctl start libvirtd` + +设置开机启动 + +`systemctl enable libvirtd` + +查看操作结果 + +`systemctl status libvirtd` + +![](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/63156315/1656557798218-65ab7a31-63e2-4200-bca5-2ed9f65a169e.png) + +`systemctl is-enabled libvirtd` + +![](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/63156315/1656557863822-cec7389f-3748-43c9-8878-e03fe5906f4f.png) + +#### 2.3.2 打开虚拟化的网络支持 + +`sudo virsh net-autostart default` + +`sudo virsh net-start default` + +`sudo sysctl -w net.ipv4.ip_forward=1` # 也可以写到配置文件里持久化 + +**TIPS:** + +`sudo virsh net-autostart default` 执行过程中可能会卡住,此时将 `/etc/modprobe.d/blacklist.conf` 文件中的 "blacklist nf_conntrack_ipv4" 语句注释掉,例如: + +``` +... 
+#blacklist nf_conntrack_ipv4 +``` + +之后再执行 `sudo virsh net-autostart default` + +#### 2.3.3 修改kvm权限 + +直接设置成root启动 + +``` +cat >> /etc/libvirt/qemu.conf << EOF +user = "root" +group = "root" +EOF +systemctl restart libvirtd.service +``` + +#### 2.3.4 建立链接 + +查看qemu-kvm路径 + +`whereis qemu-kvm` + +``` +qemu-kvm: /etc/qemu-kvm /usr/libexec/qemu-kvm /usr/share/qemu-kvm /usr/share/man/man1/qemu-kvm.1.gz +``` + +建立软连接 + +`ln -s /usr/libexec/qemu-kvm /usr/bin/qemu-kvm` + +#### 2.3.5 创建xml配置文件 + +示例文件的名称为anolis.xml,请根据提示修改您的镜像路径 + +您可以按照注释自己酌情修改。 + +``` + + anolis + 16777216 + 8 + + + + + hvm + + + + + + + + + + + + destroy + restart + restart + + + /usr/bin/qemu-kvm + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### 2.4 虚拟机的启动与管理 + +#### 2.4.1 使用virsh命令启动虚拟机 + +新机器执行virsh命令可能会有setlocale: No such file or directory的警告,可安装语言包 + +`yum install -y glibc-langpack-zh` + +`sudo virsh define 虚拟机名.xml` + +`sudo virsh start 虚拟机名` # 请修改为KVM虚拟机的真实名称。 + +vm 默认的账户和密码为: + +* 用户名:`anuser` +* 密码:`anolisos` + +#### 2.4.2 切换root用户并允许ssh root登录 + +1. `sudo su` +2. 输入密码`anolisos` +3. 修改root密码:`passwd root` +4. 修改`/etc/ssh/sshd_config`: + +``` +PasswordAuthentication yes + +PermitRootLogin yes +``` + +#### 2.4.3 虚拟机的访问 + +可以通过下列方式访问VM: + +* 通过 vnc 访问宿主机的 IP,登录VM,查看 IP +* 通过 `sudo virsh console 虚拟机名` 登录 VM(请注意可能会有一段时间的黑屏,VM启动过程没有输出到屏幕),查看 IP +* 获取到 Guest IP 之后,通过 `ssh root@` 登录 VM. 
+ +#### 2.4.4 查询虚拟机在宿主机对应串口设备 + +`virsh ttyconsole 虚拟机名` + +#### 2.4.5 其余virsh命令 + +`virsh list` #显示本地活动虚拟机 + +`virsh list --all` #显示本地所有的虚拟机(活动的+不活动的) + +`virsh define 虚拟机名.xml` #通过配置文件定义一个虚拟机(这个虚拟机还不是活动的) + +`virsh undefine 虚拟机名` #删除虚拟机配置 + +`virsh start 虚拟机名` #启动指定名称的非活动虚拟机 + +`virsh create 虚拟机名.xml ` # 创建虚拟机(创建后,虚拟机立即执行,成为活动主机) + +`virsh suspend 虚拟机名` # 暂停虚拟机 + +`virsh resume 虚拟机名 ` # 启动暂停的虚拟机 + +`virsh shutdown 虚拟机名` # 正常关闭虚拟机 + +`virsh destroy 虚拟机名` # 强制关闭虚拟机 + +`virsh dominfo 虚拟机名` #显示虚拟机的基本信息 + +`virsh domname 2` # 显示id号为2的虚拟机名 + +`virsh domid 虚拟机名` # 显示虚拟机id号 + +`virsh domuuid 虚拟机名` # 显示虚拟机的uuid + +`virsh domstate 虚拟机名` # 显示虚拟机的当前状态 + +`virsh dumpxml 虚拟机名` # 显示虚拟机的当前配置文件(可能和定义虚拟机时的配置不同,因为当虚拟机启动时,需要给虚拟机分配id号、uuid、vnc端口号等等) + +`virsh setmem 虚拟机名 512000` #给不活动虚拟机设置内存大小 + +`virsh setvcpus 虚拟机名 4` # 给不活动虚拟机设置cpu个数 + +`virsh edit 虚拟机名` # 编辑配置文件(一般是在刚定义完虚拟机之后) diff --git "a/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/\351\276\231\350\234\245 ANCK 5.10 \345\200\232\345\244\251\345\271\263\345\217\260 MPAM \346\265\213\350\257\225\346\212\245\345\221\212.md" "b/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/\351\276\231\350\234\245 ANCK 5.10 \345\200\232\345\244\251\345\271\263\345\217\260 MPAM \346\265\213\350\257\225\346\212\245\345\221\212.md" new file mode 100644 index 0000000..b3230c2 --- /dev/null +++ "b/OPERATIONS_DOCS/\344\272\272\344\272\272\351\203\275\345\217\257\344\273\245\345\217\202\344\270\216\345\274\200\346\272\220/\351\276\231\350\234\245\344\270\200\345\210\273/\344\272\214\345\210\206\346\263\225/\351\276\231\350\234\245 ANCK 5.10 \345\200\232\345\244\251\345\271\263\345\217\260 MPAM 
\346\265\213\350\257\225\346\212\245\345\221\212.md" @@ -0,0 +1,1001 @@ +from arm-sig: https://openanolis.cn/sig/ARM_ARCH_SIG/doc/657742613244594693 + +一、测试总结 + +针对龙蜥 OS MPAM 特性的整体测试情况如下: + +经过对 MPAM 的功能性验证,目前 L3 cache 资源隔离和监控功能均正常,内存带宽隔离效果甚微,监控功能可用。 +测试用例覆盖 MPAM 接口读写测试、并发压力测试等多种类型,测试结果未发现问题。 +针对 MPAM ESR 中断,验证了 PARTID、PMG、monitor 相关异常能否触发中断、告知错误类型,中断监测功能正常可用。 +二、MPAM 功能验证 +2.1 cache 隔离功能验证 +2.1.1 不同配置对实际 cache 占有量的影响 + +L3 cache 资源隔离以 ways 的方式进行配置。倚天机器共有 16 ways,测试对不同 ways 的隔离效果进行了验证。 + +numactl -m 0 -C 16 memhog -r10000000 100000m > /mnt/log + +程序和资源隔离 group 绑定分别采用了 pid 绑定(tasks)和 cpu 绑定(cpus)两种方式。通过 schemata 接口设置程序所能够使用的 ways 数目,通过 mon_data/mon_L3_0*/llc_occupancy 接口读取程序的 L3 cache 占用。多次读取取平均值,并与理想的 cache ways 大小进行对比。 + +测试结果显示,L3 cache 隔离功能效果显著,无论是通过 tasks 绑定还是 cpus 绑定,均可以得到与理想值相接近的隔离效果。 + +2.1.2 不同配置对 mem latency 的影响 + +latency 作为一个重要的性能指标,在一些对时延敏感的场景来说,有很重要的参考作用,此处使用 lat_mem_rd 测试 cache 在不同的 ways 下,内存 latency 的分布情况,也从侧面验证 MPAM 对 cache 的隔离功能。 + +#设置步长为512字节 +numactl -C 10 -m 0 ./lat_mem_rd -N 1 -P 1 145M 512 + +测试结果显示,随着 cache way 数目的增加,加载相同内存的 latency 逐渐减小。 + +2.1.3 L3 cache 抗干扰测试 +# workload +numactl -m 1 -C 64-127 memhog -r10000000 100000m > /mnt/log +# distractor +numactl -m 1 -C 64-127 memhog -r10000000 100000m > /mnt/log + +workload:保持 L3:1=fff0 配置无变化 + +distractor: 测试 L3:1=,mask 值分别为 0-f(无干扰)、0010-fff0(有干扰) + +测试结果显示: + +在无干扰情况下,workload 的 L3 cache 占用量基本无变化; + +随着干扰 way 数逐渐变多,workload 和 distractor 两者的 L3 cache占比逐渐趋同,总量不变。 + +2.1.4 模拟混部 L3 cache 隔离测试 + +分别以 SPECjbb 2015 和 stress-ng 程序模拟在线环境和离线环境,对L3 cache隔离功能进行测试。两个环境均运行在 NUMA node 1 上。 + +在前 40s 的时间内,两个程序共享 L3 cache 资源。在约 40s 后,开始隔离在线和离线L3 cache资源的使用,在离线任务 L3 cache 的配比分别为 0xffff 和 0xf。 + +通过实验结果可以看到,在 L3 cache 资源共享的情况下,离线资源对在线资源干扰和压制明显,L3 cache 竞争激烈,波动幅度很大;在对 L3 cache 资源进行隔离后,一方面离线得到了持续有效的压制,L3 cache 占有率大幅下降,另一方面在线性能提升明显,而且波动幅度变小。 + +2.2 MB 隔离功能验证 +2.2.1 不同配置对内存带宽的影响 +gcc -O3 -fopenmp -DSTREAM_ARRAY_SIZE=100000000 -DNTIMES=1000 stream.c -o stream +# 单node测试 +numactl -m 1 -C 64-127 ./stream +# 单CPU测试 
+numactl -m 1 -C 72 ./stream + +MB 资源隔离以百分比的方式进行配置。测试以 5% 为粒度,通过设置 schemata 接口让内存带宽从 5% 逐次递增到 100%,通过读取 mon_data/mon_MB_0*/mbm_local_bytes 接口读取带宽值,最终取多次测量的平均值。 + +通过测试结果可以发现,不同百分比下的测试MB带宽值和100%带宽下的MB带宽值基本相等,倚天机器的 MB 带宽隔离效果甚微。 + +单 node(64 CPU) MB 配置结果 + +percent + + + +stream测试值[Copy] (MB/s) + + + +mbm_local_bytes接口值 (MB/s) + + + + +5 + + + +104808.5 + + + +104800.0 + + + + +10 + + + +105028.3 + + + +105730.5 + + + + +20 + + + +104459.3 + + + +104915.1 + + + + +40 + + + +105077.6 + + + +105852.0 + + + + +60 + + + +104980.6 + + + +105178.7 + + + + +80 + + + +104924.8 + + + +105182.8 + + + + +100 + + + +104828.1 + + + +105855.8 + +单 CPU MB 配置结果 + +percent + + + +stream测试值[Copy] (MB/s) + + + +mbm_local_bytes接口值 (MB/s) + + + + +5 + + + +25948.7 + + + +24147.7 + + + + +10 + + + +25934.0 + + + +24433.1 + + + + +20 + + + +25913.5 + + + +22771.2 + + + + +40 + + + +25897.9 + + + +24559.4 + + + + +60 + + + +25952.9 + + + +24079.7 + + + + +80 + + + +25866.5 + + + +24246.4 + + + + +100 + + + +25952.1 + + + +24171.9 + +三、MPAM稳定性测试 +3.1 resctrl mount/umount + +测试方法 + +挂载 resctrl 文件系统,设置 schemata 资源隔离接口,随机写 cpus/cpus_list、tasks 接口,读取mon_data 资源监控接口,最后卸载 resctrl 文件系统。重复 1000000 次。 + +测试结果 + +resctrl 文件系统相关接口仍可正常使用。 + +3.2 接口写入测试 +3.2.1 schemata 写入 + +测试方法 + +创建两个 group,生成随机 L3 cache mask 和 MB 内存带宽值,并分别写入两个 group 的 schemata 接口,之后读取 schemata 接口,验证当前值是否与写入值相同。重复测试 1000000 次。 + +测试结果 + +schemata 均可正常写入。 + +3.2.2 schemata 错误写入 + +对 schemata 接口写入多种错误参数,验证 schemata 是否可以正确识别处理。 + +验证的错误类型及验证结果如下: + +错误写入示例 + + + +last_cmd_status输出 + + + +测试结果 + + + + +L3:0=10000 + + + +Mask out of range + + + +PASS + + + + +L3:2=ff;3=ff + + + +Unknown domain + + + +PASS + + + + +L3 + + + +Missing ':' + + + +PASS + + + + +L3: + + + +Missing 'L3' value + + + +PASS + + + + +L3:0 + + + +Missing '=' or non-numeric domain + + + +PASS + + + + +L30:0=fff + + + +Unknown or unsupported resource name 'L30' + + + +PASS + + + + +L3:0=fghi + + + +Non-hex character in the mask fghi + + + +PASS + + + + +L3:1=ff;1=f4 + + 
+ +Duplicate domain 1 + + + +PASS + + + + +MB:0=150 + + + +MB value 150 out of range 5-100 + + + +PASS + + + + +MB:0=4 + + + +MB value 4 out of range 5-100 + + + +PASS + + + + +MB:0=FOO + + + +Non-decimal digit in MB + + + +PASS + + + + +MB + + + +Missing ':' + + + +PASS + + + + +MB:0 + + + +Missing 'MB' value + + + +PASS + + + + +MB:2=55 + + + +Unknown domain + + + +PASS + + + + +MB:1=23;1=56 + + + +Duplicate domain 1 + + + +PASS + + + + +L3:0=ff (with cdp) + + + +Unknown or unsupported resource name 'L3' + + + +PASS + +3.2.3 cpus/cpus_list 写入 + +测试方法 + +随机写入 cpus/cpus_list 接口 1000000 次,验证是否写入成功,并且 cpus 接口和 cpus_list 接口的值是否相对应。 + +测试结果 + +cpus/cpus_list 均可正常写入,并保持值的相等。 + +3.2.4 cpus/cpus_list 错误写入 + +错误写入示例 + + + +last_cmd_status输出 + + + +测试结果 + + + + +echo 156 > cpus_list + + + +Can only assign online CPUs + + + +PASS + + + + +echo 4096 > cpus_list + + + +Bad CPU list/mask + + + +PASS + + + + +echo ffff > cpus_list + + + +Bad CPU list/mask + + + +PASS + + + + +echo 3-12 > cpus + + + +Bad CPU list/mask + + + +PASS + +3.2.5 tasks 写入 + +测试方法 + +创建 500 个进程,并将其 pid 写入 tasks 接口,验证进程对应 pid 是否存在。之后 kill 掉所有进程,验证其 pid 是否已从 tasks 文件中移除。重复 1000000 次。 + +测试结果 + +tasks 接口均可正常写入和移除。 + +3.2.6 tasks 错误写入 + +错误示例 + + + +stderr + + + +测试结果 + + + + +将不存在pid写入tasks + + + +echo: write error: No such process + + + +PASS + + + + +echo hello > tasks + + + +echo: write error: Invalid argument + + + +PASS + +3.2.7 mode 写入 + +mode 接口默认值为 shareable,当前MPAM接口暂不支持 mode 接口值的修改。 + +Mode + + + +支持情况 + + + + +shareable + + + +支持 + + + + +exclusive + + + +不支持 + + + + +pseudo-locksetup + + + +不支持 + + + + +pseudo-locked + + + +不支持 + +3.3 group mkdir/rmdir 测试 +3.3.1 max group 创建 + +测试方法 + +以 info/*/num_closids 为基准,创建所能达到的最多 group。重复 1000000 次。 + +测试结果 + +倚天 PARTID 数目为 64 个,因此除了 default group 外,最多能够创建 63 个 group。一般情况下均可达到最大值。但在部分 group 被使用过的情况下,由于其对应的 PARTID 在 L3 cache中占用量可能超过 /sys/fs/resctrl/info/L3_MON/max_threshold_occupancy,从而导致该 PARTID 在一定时间内不可用。 + +3.3.2 group 随机创建/删除 + +测试方法 + +随机创建/删除 group 共计 
2000*(num_closids-1),验证 group 分配和回收功能是否正常。 + +测试结果 + +group 随机创建和删除,group 分配/回收接口仍可正常运作。 + +3.3.3 mon_group 创建/删除 + +当前社区版本MPAM代码下 num_rmids 均为 1,暂不支持 mon_groups 目录下 mon group 的创建和删除。 + +3.4 并发读写测试 +3.4.1 L3 cache 监控接口并发读取 + +测试方法 + +创建 5 个 group,每个 group 中写入 10 个进程:memhog -r1000000000 1m > /mnt/log + +同时 10 个进程并发读 mon_data/L3_MON/llc_occupancy,持续时间 60 min。 + +测试结果 + +测试过程中未出现resctrl接口崩溃或不可用问题。 + +3.4.2 MB 监控接口并发读取 + +测试方法 + +创建 5 个 group,每个 group 中写入 10 个进程:memhog -r1000000000 1m > /mnt/log + +同时 10 个进程并发读 mon_data/mon_MB_*/mbm_local_bytes,持续时间 60 min。 + +测试结果 + +测试过程中未出现 resctrl 接口崩溃或不可用问题。 + +3.4.3 schemata 接口并发写入 + +测试方法 + +创建 5 个 group,每个 group 中写入 10 个进程:memhog -r1000000000 1m > /mnt/log + +同时 10 个进程并发随机写入 schemata,持续时间 60 min。 + +测试结果 + +测试过程中未出现 resctrl 接口崩溃或不可用问题。 + +3.4.4 cpus/cpus_list 接口并发写入 + +测试方法 + +创建 1 个 group,10 个进程并发写入随机 cpus/cpus_list,持续时间 60 min。 + +测试结果 + +测试过程未出现接口崩溃或不可用问题。 + +3.4.5 tasks 接口并发写入 + +测试方法 + +创建 1 个 group,10 个进程并发创建 300 个 task 并写入 tasks 接口。 + +测试结果 + +测试过程未出现接口崩溃或不可用问题。 + +四、MPAM 错误中断验证 +4.1 L3 cache 资源错误中断验证 + +错误码 + + + +描述 + + + +结果 + + + +备注 + + + + +0 + + + +No error captured in MPAMF_ESR + + + +无 + + + +非异常情况 + + + + +1 + + + +MPAMCFG_PART_SEL out of range + + + +可触发 + + + + + + + + + +2 + + + +Request PARTID out of range + + + +可触发 + + + + + + + + + +3 + + + +MSMON out of range PARTID/PMG + + + +可触发 + + + + + + + + + +4 + + + +Request PMG out of range + + + +不可触发 + + + +PMG>1时无法写入 + + + + +5 + + + +MSMON_CFG_MON_SEL out of range + + + +可触发 + + + + + + + + + +6 + + + +MPAMCFG_INTPARTID out of range + + + +未测试 + + + +暂不支持PARTID narrowing + + + + +7 + + + +INTERNAL unexpected + + + +未测试 + + + +暂不支持PARTID narrowing + + + + +8 + + + +MPAMCFG_PART_SEL.RIS unimplemented + + + +不可触发 + + + +RIS>1时无法写入 + + + + +9 + + + +MPAMCFG_PART_SEL.RIS no control + + + +不可触发 + + + +RIS>1时无法写入 + + + + +10 + + + +MSMON_CFG_MON_SEL.RIS unimplemented + + + +不可触发 + + + +RIS>1时无法写入 + + + + +11 + + + +MSMON_CFG_MON_SEL.RIS no monitor + + + +不可触发 + + 
+ +RIS>1时无法写入 + + + + +12:18 + + + +Reserved \ No newline at end of file -- Gitee