From 98c89d1d45b3f88388297cace522d651d5103f32 Mon Sep 17 00:00:00 2001 From: lxq <191991518@qq.com> Date: Mon, 22 Sep 2025 17:33:32 +0800 Subject: [PATCH] feat: Add rpm packaging support and fix some bug about ebpf compiling and warning with updating the README.md ,and fix test bug about path --- Cargo.lock | 97 ++++++ LICENSE | 122 +++++++ README.md | 190 ++++++++--- healer-common/Cargo.toml | 7 +- healer-ebpf/Cargo.toml | 10 +- healer/Cargo.toml | 14 +- healer/src/config.rs | 2 + healer/src/config_manager.rs | 5 - .../src/coordinator/dependency_coordinator.rs | 19 -- healer/src/daemon_handler.rs | 1 + healer/src/main.rs | 103 ++++-- healer/src/monitor_manager.rs | 3 +- healer/src/subscriber/process_healer.rs | 9 +- healer/src/utils.rs | 4 + healer/tests/ebpf_e2e.rs | 151 ++++++--- healer/tests/process_e2e.rs | 316 ++++++++++++------ packaging/rpm/healer.spec | 85 +++++ packaging/systemd/healer.service | 21 ++ scripts/build-rpm.sh | 130 +++++++ .../tests => tests}/fixtures/dummy_service.py | 0 .../tests => tests}/fixtures/test_process.rs | 4 +- tests/fixtures/test_process_ebpf.rs | 7 + 22 files changed, 1049 insertions(+), 251 deletions(-) create mode 100644 LICENSE create mode 100644 packaging/rpm/healer.spec create mode 100644 packaging/systemd/healer.service create mode 100644 scripts/build-rpm.sh rename {healer/tests => tests}/fixtures/dummy_service.py (100%) rename {healer/tests => tests}/fixtures/test_process.rs (53%) create mode 100644 tests/fixtures/test_process_ebpf.rs diff --git a/Cargo.lock b/Cargo.lock index 6c4a3c4..869f218 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,6 +378,46 @@ dependencies = [ "windows-link", ] +[[package]] +name = "clap" +version = "4.5.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.48" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + [[package]] name = "colorchoice" version = "1.0.4" @@ -442,6 +482,27 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -718,7 +779,9 @@ dependencies = [ "aya-log", "bytes", "chrono", + "clap", "daemonize", + "dirs-next", "env_logger", "futures", "healer-common", @@ -728,6 +791,7 @@ dependencies = [ "serde", "serde_yaml", "sysinfo", + "tempfile", "tokio", "tracing", "tracing-appender", @@ -752,6 +816,12 @@ dependencies = [ "which", ] +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "http" version = "1.3.1" @@ -1096,6 +1166,16 @@ version = "0.2.174" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +[[package]] +name = "libredox" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -1482,6 +1562,17 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom 0.2.16", + "libredox", + "thiserror 1.0.69", +] + [[package]] name = "regex" version = "1.11.1" @@ -1830,6 +1921,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..19354fc --- /dev/null +++ b/LICENSE @@ -0,0 +1,122 @@ +木兰宽松许可证, 第2版 + +木兰宽松许可证, 第2版 +2020年1月 http://license.coscl.org.cn/MulanPSL2 + +您对"软件"的复制、使用、修改及分发受木兰宽松许可证,第2版("本许可证")的如下条款的约束: + +0. 定义 + +"软件"是指由"贡献"构成的许可在"本许可证"下的程序和相关文档的集合。 + +"贡献"是指由任一"贡献者"许可在"本许可证"下的受版权法保护的作品。 + +"贡献者"是指将受版权法保护的作品许可在"本许可证"下的自然人或"法人实体"。 + +"法人实体"是指提交贡献的机构及其"关联实体"。 + +"关联实体"是指,对"法人实体"而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 + +1. 授予版权许可 + +每个"贡献者"根据"本许可证"授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其"贡献",不论修改与否。 + +2. 
授予专利许可 + +每个"贡献者"根据"本许可证"授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其"贡献"或以其他方式转移其"贡献"。前述专利许可仅限于"贡献者"现在或将来拥有或控制的其"贡献"本身或其"贡献"与许可"贡献"时的"软件"结合而将必然会侵犯的专利权利要求,不包括对"贡献"的修改或包含"贡献"的其他结合。如果您或您的"关联实体"直接或间接地,就"软件"或其中的"贡献"对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则"贡献者"根据"本许可证"授予您专利许可自您提起诉讼或发起维权行动之日终止。 + +3. 无商标许可 + +"本许可证"不提供对"贡献者"的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 + +4. 分发限制 + +您可以在任何媒介中将"软件"以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供"本许可证"的副本,并保留"软件"中的版权、商标、专利及免责声明。 + +5. 免责声明与责任限制 + +"软件"及其中的"贡献"在提供时不带任何明示或默示的担保。在任何情况下,"贡献者"或版权所有者不对任何人因使用"软件"或其中的"贡献"而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 + +6. 语言 +"本许可证"以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 + +条款结束 + +如何将木兰宽松许可证,第2版,应用到您的软件 + +如果您希望将木兰宽松许可证,第2版,应用到您的软件,您可以在每个源文件的头部加入如下声明,请先将括号内的字段按照实际情况进行替换: + +Copyright (c) [Year] [name of copyright holder] +[Software Name] is licensed under Mulan PSL v2. +You can use this software according to the terms and conditions of the Mulan PSL v2. +You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +See the Mulan PSL v2 for more details. + + Mulan Permissive Software License,Version 2 + +Mulan Permissive Software License,Version 2 (Mulan PSL v2) +January 2020 http://license.coscl.org.cn/MulanPSL2 + +Your reproduction, use, modification and distribution of the Software are subject to the following conditions: + +0. Definition + +Software means the program and related documents which are licensed under this License and comprise all Contribution(s). + +Contribution means the copyrightable work licensed by a particular Contributor under this License. + +Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. 
+ +Legal Entity means the entity making a Contribution and all its Affiliates. + +Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, 'control' means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. + +1. Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. + +2. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, promise to sell, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a lawsuit) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. + +3. 
No Trademark License + +No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. + +4. Distribution Restriction + +You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. + +5. Disclaimer of Warranty and Limitation of Liability + +THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT'S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +6. Language + +THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. + +END OF THE TERMS AND CONDITIONS + +How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software + +To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: + +i Fill in the blanks in following statement, including fill in the name of software, the year of the first publication of software, and the name of copyright holder; + +ii Create a file named "LICENSE" which contains the whole context of this License in the first directory of your software package; + +iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. 
+ +Copyright (c) [Year] [name of copyright holder] +[Software Name] is licensed under Mulan PSL v2. +You can use this software according to the terms and conditions of the Mulan PSL v2. +You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +See the Mulan PSL v2 for more details. \ No newline at end of file diff --git a/README.md b/README.md index f730ba6..fe05d3f 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,156 @@ ## 介绍 A high-performance daemon leveraging eBPF for reliable, low-overhead monitoring and automatic recovery of critical processes to ensure service continuity. +## 安装与编译 +### 从源码编译 +``` +RUST_LOG=info cargo run --config 'target."cfg(all())".runner="sudo -E"' +``` +用非root权限进行cargo build,并以root权限执行二进制 + +日志的位置可以由用户自己在 `config.yaml`中定义: +```YAML +# 全局配置 +log_level: "info" #日志输出等级,可以调整为debug/tracing发现更多信息,不过会被RUST_LOG环境变量覆盖 +log_directory: "/var/log/healer" #日志文件地址,本地址需要root权限,用户可以放在自己定义的位置下。 +pid_file_directory: "/var/run/healer" # healer 守护进程自己的 PID 文件目录,用户可以放在自己定义的位置下。 +working_directory: "/" #工作目录,默认是根目录 +``` + +### RPM 打包与安装 +本仓库提供了 RPM 打包脚本与规范文件,帮助你在基于 RPM 的发行版上安装为系统服务: + +- 规范文件:`packaging/rpm/healer.spec` +- systemd 单元:`packaging/systemd/healer.service` +- 构建脚本:`scripts/build-rpm.sh` + +步骤: +1. 安装依赖(以 Fedora 为例): + ```bash + sudo dnf install -y rpm-build rpmdevtools gcc clang llvm rust cargo make systemd rsync + ``` + 需要 `bpf-linker` 用于 eBPF 对象链接;脚本会尝试用 `cargo install bpf-linker` 自动安装。 + +2. 安装 Rust nightly toolchain(eBPF 构建所需): + ```bash + rustup toolchain install nightly + rustup component add rust-src --toolchain nightly + ``` + +3. 构建 RPM: + ```bash + bash scripts/build-rpm.sh + ``` + 完成后,生成的二进制包位于 `~/rpmbuild/RPMS/<arch>/healer-<version>-1.<dist>.<arch>.rpm`。 + +4. 
安装与管理: + ```bash + sudo rpm -Uvh ~/rpmbuild/RPMS/*/healer-*.rpm + sudo systemctl enable --now healer + sudo systemctl status healer + ``` + +安装后的文件布局: +- 可执行文件:`/usr/bin/healer` +- 配置文件:`/etc/healer/config.yaml`(标记为 `%config(noreplace)`,升级不会覆盖本地修改) +- 日志目录:`/var/log/healer` +- 运行目录:`/var/run/healer` +- systemd unit:`/usr/lib/systemd/system/healer.service` + +卸载: +```bash +sudo systemctl disable --now healer +sudo rpm -e healer +``` + +## 使用说明 + +### 配置文件 +配置文件在根目录下的`config.yaml` +配置文件采用了`serde`库进行解析。支持 PID eBPF Network 三种工作模式,具体的使用实例可以参考yaml中已有的样例。 +```YAML + - name: "simple_counter" #名字 用于日志 + enabled: true #启用开关 + command: "/home/lxq/ospp/simple_test_process/target/debug/simple_test_process" #恢复命令,在无pid文件时作为进程的主键来识别 + args: [] #恢复命令的参数 + run_as_root: false #进程是否以root进行恢复重启 + run_as_user: "lxq" #如果非root,则以某个用户的身份重启 + monitor: + type: "pid" # 使用 PID 文件进行监控 + pid_file_path: "/var/run/healer/simple_counter.pid" # pid监控模式应该有对应的pid文件 + interval_secs: 3 # 轮询间隔,单位秒 + # 恢复/重启策略配置 + recovery: + type: "regular" # 恢复策略,目前只有regular,regular默认实现了熔断,后续可以考虑分为两种恢复模式 + retries: 3 # 60秒内最多重试3次 + retry_window_secs: 60 + cooldown_secs: 180 # 如果发生熔断,冷却3分钟(180秒) +``` +配置文件支持热加载,可以给守护进程发送信号 SIGHUP 来实现更新。 + +### 命令行参数 +healer 守护进程支持以下命令行参数: + +```bash +healer [OPTIONS] +``` + +#### 可用选项 +- `-c, --config <PATH>`:指定配置文件路径(YAML 格式) + - 如果未提供,程序会按照以下顺序搜索配置文件: + 1. 环境变量 `HEALER_CONFIG` 指定的路径 + 2. 当前目录下的 `config.yaml` + 3. 
`/etc/healer/config.yaml`(系统级配置) + +- `--foreground`:在前台运行(不进行守护进程化) + - 等同于设置环境变量 `HEALER_NO_DAEMON=1` + - 适用于调试、容器环境或由 systemd 等进程管理器管理时 + +- `--print-config-path`:打印当前使用的配置文件路径并退出 + - 用于调试配置文件解析问题 + - 不会启动守护进程,只显示配置路径 + +- `-h, --help`:显示帮助信息 + +- `-V, --version`:显示版本信息 + +#### 使用示例 +```bash +# 使用默认配置启动 +healer + +# 指定配置文件 +healer -c {/PATH} + +# 在前台运行(调试模式) +healer --foreground + +# 查看当前会使用的配置文件路径 +healer --print-config-path + +# 通过环境变量指定配置文件 +HEALER_CONFIG=/etc/healer/config.yaml healer + +# 不进行守护进程化 +HEALER_NO_DAEMON=1 healer +``` + +#### 环境变量 +- `HEALER_CONFIG`:指定配置文件路径 +- `HEALER_NO_DAEMON=1`:不进行守护进程化,在前台运行 +- `RUST_LOG`:设置日志级别(会覆盖配置文件中的 `log_level` 设置) + +## 测试 +要运行集成测试,请使用以下命令。请注意,某些测试(例如与 eBPF 相关的测试)可能需要以 root 权限运行。 +``` +# 检查所有的集成测试并将日志输出不重定向示例(ebpf测试因为需要sudo,默认忽略) +HEALER_TEST_INHERIT_STDIO=1 RUST_LOG=info cargo test -p healer --test process_e2e -- --nocapture --color=always +``` +``` +# ebpf测试检查的命令示例(需要HEALER_EBPF_E2E=1,同时可执行文件要以sudo权限执行) +HEALER_EBPF_E2E=1 HEALER_TEST_INHERIT_STDIO=1 RUST_LOG=info CARGO_TERM_COLOR=always cargo test -p healer --test ebpf_e2e --config 'target."cfg(all())".runner="sudo -E"' -- --ignored --nocapture --color=always +``` ## 软件架构 Healer 是一个面向关键进程自愈场景的轻量守护进程,当前已实现的核心要点: @@ -69,44 +218,3 @@ Monitors → broadcast → ProcessHealer(执行恢复 / 熔断)。 - 依赖关系自动发现与建模(系统d unit / 命令行启发式 / 配置融合)。 - 延迟 / 条件性恢复(等待关键依赖 ready 后再放行重启)。 - eBPF 深度扩展(在现有退出事件基础上继续增加退出原因、异常 syscall / 资源异常 统计等更细粒度信号)。 - - - -## 配置文件使用指南 -配置文件在根目录下的`config.yaml` -配置文件采用了`serde`库进行解析。支持 PID eBPF Network 三种工作模式,具体的使用实例可以参考yaml中已有的样例。 -```YAML - - name: "simple_counter" #名字 用于日志 - enabled: true #启用开关 - command: "/home/lxq/ospp/simple_test_process/target/debug/simple_test_process" #恢复命令,在无pid文件时作为进程的主键来识别 - args: [] #恢复命令的参数 - run_as_root: false #进程是否已root进行恢复重启 - run_as_user: "lxq" #如果非root,则以某个用户的身份重启 - monitor: - type: "pid" # 使用 PID 文件进行监控 - pid_file_path: "/var/run/healer/simple_counter.pid" # pid监控模式应该有对应的pid文件 - interval_secs: 3 # 轮询间隔,单位秒 - # 恢复/重启策略配置 - recovery: - 
type: "regular" # 恢复策略,目前只有regular,regular默认实现了熔断,后续可以考虑分为两种恢复模式 - retries: 3 # 60秒内最多重试3次 - retry_window_secs: 60 - cooldown_secs: 180 # 如果发生熔断,冷却3分钟(180秒) -``` -配置文件支持热加载,可以给守护进程发送信号sigup来实现更新。 - - -## 编译使用 -``` -RUST_LOG=info cargo run --config 'target."cfg(all())".runner="sudo -E"' -``` -用非root权限进行cargo build,并以root权限执行二进制 - -日志的位置可以由用户自己在 `config.yaml`中定义: -```YAML -# 全局配置 -log_level: "info" #日志输出等级,可以调整为debug/tracing发现更多信息,不过会被RUST_LOG环境变量覆盖 -log_directory: "/var/log/healer" #日志文件地址,本地址需要root权限,用户可以放在自己定义的位置下。 -pid_file_directory: "/var/run/healer" # healer 守护进程自己的 PID 文件目录,用户可以放在自己定义的位置下。 -working_directory: "/" #工作目录,默认是根目录 -``` diff --git a/healer-common/Cargo.toml b/healer-common/Cargo.toml index bb97b1c..2c2ef91 100644 --- a/healer-common/Cargo.toml +++ b/healer-common/Cargo.toml @@ -1,7 +1,12 @@ [package] name = "healer-common" version = "0.1.0" -edition = "2024" +edition = "2021" +license = "MulanPSL-2.0" +authors = ["XqiLiu"] +description = "Common types and utilities for healer daemon" +repository = "https://github.com/XqiLiu/healer" +homepage = "https://github.com/XqiLiu/healer" [features] default = [] diff --git a/healer-ebpf/Cargo.toml b/healer-ebpf/Cargo.toml index 0fd5266..a03aedd 100644 --- a/healer-ebpf/Cargo.toml +++ b/healer-ebpf/Cargo.toml @@ -1,7 +1,12 @@ [package] name = "healer-ebpf" version = "0.1.0" -edition = "2024" +edition = "2021" +license = "MulanPSL-2.0" +authors = ["XqiLiu"] +description = "eBPF programs for healer daemon monitoring" +repository = "https://github.com/XqiLiu/healer" +homepage = "https://github.com/XqiLiu/healer" [dependencies] aya-ebpf = { git = "https://github.com/aya-rs/aya" } @@ -18,5 +23,6 @@ path = "src/main.rs" required-features = ["build-ebpf"] [features] -# 默认不构建 eBPF 二进制,除非显式开启该 feature。 +# 默认构建 eBPF 二进制 +default = ["build-ebpf"] build-ebpf = [] diff --git a/healer/Cargo.toml b/healer/Cargo.toml index 3b39478..26c4cde 100644 --- a/healer/Cargo.toml +++ b/healer/Cargo.toml @@ -1,7 +1,14 @@ [package] name 
= "healer" version = "0.1.0" -edition = "2024" +edition = "2021" +license = "MulanPSL-2.0" +authors = ["XqiLiu"] +description = "Process self-healing daemon leveraging eBPF for monitoring and recovery" +repository = "https://github.com/XqiLiu/healer" +homepage = "https://github.com/XqiLiu/healer" +keywords = ["ebpf", "monitoring", "daemon", "self-healing", "process"] +categories = ["system-tools", "monitoring"] publish = false [dependencies] @@ -25,6 +32,11 @@ bytes = "1.10.1" futures = "0.3.31" sysinfo = "0.36.0" env_logger = "0.11.8" +clap = { version = "4", features = ["derive"] } +dirs-next = "2" + +[dev-dependencies] +tempfile = "3.0" [build-dependencies] diff --git a/healer/src/config.rs b/healer/src/config.rs index d074bc6..3470dc9 100644 --- a/healer/src/config.rs +++ b/healer/src/config.rs @@ -7,6 +7,7 @@ use std::path::{Path, PathBuf}; #[derive(Debug, Clone, Deserialize)] pub struct AppConfig { + #[allow(dead_code)] // Reserved for future use pub log_level: Option, pub log_directory: Option, pub pid_file_directory: Option, @@ -23,6 +24,7 @@ pub struct ProcessConfig { pub run_as_user: Option, pub run_as_root: bool, #[serde(default)] + #[allow(dead_code)] // Reserved for future use pub working_dir: Option, pub monitor: MonitorConfig, #[serde(default)] diff --git a/healer/src/config_manager.rs b/healer/src/config_manager.rs index 136cb3c..b67ee2f 100644 --- a/healer/src/config_manager.rs +++ b/healer/src/config_manager.rs @@ -38,9 +38,4 @@ impl ConfigManager { } } } - - // 获取当前配置的只读引用 - pub fn get_config(&self) -> Arc> { - Arc::clone(&self.config) - } } diff --git a/healer/src/coordinator/dependency_coordinator.rs b/healer/src/coordinator/dependency_coordinator.rs index fab1e6f..eb8127d 100644 --- a/healer/src/coordinator/dependency_coordinator.rs +++ b/healer/src/coordinator/dependency_coordinator.rs @@ -242,25 +242,6 @@ impl DependencyCoordinator { } } - /// 计算当前仍在阻塞的依赖集合: - /// 条件:依赖状态为 Waiting + Requires + hard;目标不是自身; - /// 目标是受管(或配置热更后忽略未受管);且该目标当前也在 
deferred 中。 - fn compute_currently_blocking(&self, name: &str, state: &DeferredState) -> HashSet { - let mut set = HashSet::new(); - for d in &state.deps { - if d.status == DepWaitStatus::Waiting - && d.cfg.kind == DependencyKind::Requires - && d.cfg.hard - && d.cfg.target != name - && self.managed_targets.contains(&d.cfg.target) - && self.deferred.contains_key(&d.cfg.target) - { - set.insert(d.cfg.target.clone()); - } - } - set - } - async fn handle_retry(&mut self, name: String) { let mut remove_and_forward = None; let mut drop_due_to_abort = false; diff --git a/healer/src/daemon_handler.rs b/healer/src/daemon_handler.rs index efa7d4a..4712f56 100644 --- a/healer/src/daemon_handler.rs +++ b/healer/src/daemon_handler.rs @@ -4,6 +4,7 @@ use daemonize::Daemonize; use std::path::PathBuf; #[derive(Debug)] +#[allow(dead_code)] // Error fields preserved for error context pub enum DaemonError { Io(std::io::Error), Daemonize(daemonize::Error), diff --git a/healer/src/main.rs b/healer/src/main.rs index 134eebb..59b49f9 100644 --- a/healer/src/main.rs +++ b/healer/src/main.rs @@ -15,62 +15,97 @@ mod utils; use config::AppConfig; use daemon_handler::run_as_daemon; use std::env; +use std::path::{PathBuf}; use tokio::sync::RwLock; +use clap::Parser; + +/// Command line options for healer +#[derive(Debug, Parser)] +#[command(author, version, about = "Process self-healing daemon", long_about = None)] +struct Cli { + /// Path to configuration file (YAML). If not provided, search order applies. + #[arg(short, long)] + config: Option, + + /// Run in foreground (do not daemonize). Equivalent to env HEALER_NO_DAEMON=1 + #[arg(long)] + foreground: bool, + + /// Print the path that was selected for configuration and exit + #[arg(long)] + print_config_path: bool, +} + +fn candidate_config_paths(explicit: Option) -> Vec { + if let Some(p) = explicit { return vec![p]; } + + let mut cands = Vec::new(); + // 1. 
Environment variable + if let Ok(p) = env::var("HEALER_CONFIG") { cands.push(PathBuf::from(p)); } + // 2. Current working directory + cands.push(PathBuf::from("./config.yaml")); + cands.push(PathBuf::from("./healer.yaml")); + // 3. /etc/healer/ + cands.push(PathBuf::from("/etc/healer/config.yaml")); + cands.push(PathBuf::from("/etc/healer/healer.yaml")); + // 4. XDG config home if set + if let Ok(home) = env::var("XDG_CONFIG_HOME") { cands.push(PathBuf::from(home).join("healer/config.yaml")); } + // 5. ~/.config/healer/config.yaml + if let Some(home_dir) = dirs_next::home_dir() { cands.push(home_dir.join(".config/healer/config.yaml")); } + cands +} + +fn resolve_config_path(cli: &Cli) -> PathBuf { + if let Some(explicit) = &cli.config { return explicit.clone(); } + if let Ok(env_path) = env::var("HEALER_CONFIG") { return PathBuf::from(env_path); } + for cand in candidate_config_paths(None) { if cand.exists() { return cand; } } + // Fallback default (will likely fail later if missing) + PathBuf::from("config.yaml") +} + fn main() { - // Support overriding config path and running in foreground for tests/dev. 
- let config_file_path_str = - env::var("HEALER_CONFIG").unwrap_or_else(|_| "config.yaml".to_string()); - let run_foreground = matches!( - env::var("HEALER_NO_DAEMON") - .unwrap_or_else(|_| "0".to_string()) - .to_ascii_lowercase() - .as_str(), - "1" | "true" | "yes" - ); + let cli = Cli::parse(); - println!( - "Attempting to load the config from {}", - config_file_path_str - ); + // Determine final config path + let raw_config_path = resolve_config_path(&cli); + println!("Config resolution: using {:?}", raw_config_path); + + if cli.print_config_path { println!("{:?}", raw_config_path); return; } - let absolue_config_path = match std::fs::canonicalize(&config_file_path_str) { + // Expand & canonicalize for safety + let absolute_config_path = match std::fs::canonicalize(&raw_config_path) { Ok(path) => path, Err(e) => { - eprintln!( - "Error: No such file or directory about configure '{}': {}", - config_file_path_str, e - ); + eprintln!("Error: cannot access config {:?}: {}", raw_config_path, e); std::process::exit(1); } }; - let initial_config = AppConfig::load_from_file(&absolue_config_path).expect("初始配置加载失败"); + let initial_config = AppConfig::load_from_file(&absolute_config_path).expect("初始配置加载失败"); let shared_config = std::sync::Arc::new(RwLock::new(initial_config)); + // Detect foreground from either flag or env + let env_foreground = matches!( + env::var("HEALER_NO_DAEMON").unwrap_or_else(|_| "0".into()).to_ascii_lowercase().as_str(), + "1" | "true" | "yes" + ); + let run_foreground = cli.foreground || env_foreground; + if run_foreground { - // Minimal stdout logger for foreground mode; respects RUST_LOG. let _ = tracing_subscriber::fmt() .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) .with_ansi(true) .try_init(); - - // Run core logic directly without daemonizing (useful for tests). 
- core_logic::async_runtime(std::sync::Arc::clone(&shared_config), absolue_config_path); + core_logic::async_runtime(std::sync::Arc::clone(&shared_config), absolute_config_path); return; } - // Default: run as daemon. let config_for_closure = std::sync::Arc::clone(&shared_config); - let path_for_closure = absolue_config_path.clone(); - let core_logic_closure = - move || core_logic::async_runtime(config_for_closure, path_for_closure); + let path_for_closure = absolute_config_path.clone(); + let core_logic_closure = move || core_logic::async_runtime(config_for_closure, path_for_closure); match run_as_daemon(shared_config, core_logic_closure) { - Ok(_) => { - println!("Main program: Core logic quit"); - } - Err(e) => { - println!("Main program: Core logic error with {:?}", e); - } + Ok(_) => println!("Main program: Core logic quit"), + Err(e) => println!("Main program: Core logic error with {:?}", e), } } diff --git a/healer/src/monitor_manager.rs b/healer/src/monitor_manager.rs index 795d4fd..926ce76 100644 --- a/healer/src/monitor_manager.rs +++ b/healer/src/monitor_manager.rs @@ -1,5 +1,5 @@ use crate::{ - config::{NetworkMonitorConfig, ProcessConfig}, + config::ProcessConfig, event_bus::ProcessEvent, monitor::{ Monitor, ebpf_monitor::EbpfMonitor, network_monitor::NetworkMonitor, @@ -9,7 +9,6 @@ use crate::{ use anyhow::Result; use std::collections::HashMap; use std::time::Duration; -use sysinfo::NetworkData; use tokio::sync::broadcast; use tokio::task::JoinHandle; use tracing::{error, info, warn}; diff --git a/healer/src/subscriber/process_healer.rs b/healer/src/subscriber/process_healer.rs index 066fd3d..0869c88 100644 --- a/healer/src/subscriber/process_healer.rs +++ b/healer/src/subscriber/process_healer.rs @@ -1,14 +1,11 @@ use super::Subscriber; -use crate::config::{self, AppConfig, ProcessConfig, RecoveryConfig, RegularHealerFields}; +use crate::config::{AppConfig, RecoveryConfig}; use crate::event_bus::ProcessEvent; use async_trait::async_trait; -use 
serde::de::value::StrDeserializer; use std::collections::{HashMap, VecDeque}; -use std::iter::StepBy; -use std::os::unix::process::{self, CommandExt}; +use std::os::unix::process::CommandExt; use std::process::{Command, Stdio}; -use std::{default, fs}; -use std::{sync::Arc, time::Instant}; +use std::{fs, sync::Arc, time::Instant}; use tokio::sync::RwLock; use tokio::sync::{Mutex, broadcast}; use tracing::{debug, info, warn}; diff --git a/healer/src/utils.rs b/healer/src/utils.rs index 72ccbcd..443226d 100644 --- a/healer/src/utils.rs +++ b/healer/src/utils.rs @@ -3,6 +3,7 @@ use std::default::Default; use sysinfo::{ProcessRefreshKind, RefreshKind, System, UpdateKind}; use tracing::debug; +#[allow(dead_code)] // Utility function for future use pub fn find_pid_by_exe_path(path: &str) -> Option { let process_kind = ProcessRefreshKind::default().with_exe(UpdateKind::Always); let rk = RefreshKind::nothing().with_processes(process_kind); @@ -31,6 +32,7 @@ pub fn truncate_process_name(name: &str) -> String { /// 根据截断的进程名查找完整的进程配置名 /// 返回可能匹配的进程配置名列表 +#[allow(dead_code)] // Utility function for future use pub fn find_process_configs_by_truncated_name( truncated_name: &str, process_names: &[String], @@ -49,6 +51,7 @@ pub fn find_process_configs_by_truncated_name( /// 构建进程名到配置名的映射表 /// 用于快速查找截断名对应的完整配置 +#[allow(dead_code)] // Utility function for future use pub fn build_process_name_mapping(process_names: &[String]) -> HashMap> { let mut mapping = HashMap::new(); @@ -75,6 +78,7 @@ pub fn extract_executable_name(command: &str) -> String { /// 匹配进程名:处理截断名可能对应多个配置的情况 /// 优先返回精确匹配,如果有多个匹配则返回第一个 +#[allow(dead_code)] // Utility function for future use pub fn smart_match_process_name( truncated_name: &str, name_mapping: &HashMap>, diff --git a/healer/tests/ebpf_e2e.rs b/healer/tests/ebpf_e2e.rs index f447e0d..550831b 100644 --- a/healer/tests/ebpf_e2e.rs +++ b/healer/tests/ebpf_e2e.rs @@ -4,9 +4,13 @@ use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use 
std::thread; use std::time::Duration; - +fn workspace_root() -> PathBuf { + // CARGO_MANIFEST_DIR 指向 healer 子 crate;集成测试期望使用工作区根目录(其父目录) + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + crate_dir.parent().map(|p| p.to_path_buf()).unwrap_or(crate_dir) +} fn cleanup_stray_processes() { - let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let base = workspace_root(); let healer_bin = base.join("target/debug/healer"); let test_helper = base.join("target/debug/test_process"); @@ -83,14 +87,14 @@ fn build_ebpf_config(base: &str) -> String { format!( r#" log_level: "info" -log_directory: "/tmp/healer-tests/logs" -pid_file_directory: "/tmp/healer-tests/pids" +log_directory: "{base}/target/debug/healer-tests/logs" +pid_file_directory: "{base}/target/debug/healer-tests/pids" working_directory: "/" processes: - name: "counter_ebpf" enabled: true - command: "{base}/target/debug/test_process" + command: "{base}/target/debug/test_process_ebpf" args: [] run_as_root: true run_as_user: null @@ -106,33 +110,92 @@ processes: } fn ensure_test_binaries() { - let helper_src = r#"fn main(){ - use std::{fs,thread,time,process,io::{self,Write}}; + // 创建共享的测试二进制,使用统一的逻辑 + let base = workspace_root(); + let pids_dir = base.join("target/debug/healer-tests/pids"); + let helper_src_dir = base.join("target/debug/healer-tests"); + if let Err(err) = fs::create_dir_all(&helper_src_dir) { + panic!( + "failed to create helper source directory {}: {}", + helper_src_dir.display(), + err + ); + } + + // 使用与 process_e2e.rs 相同的测试二进制代码 + let helper_src = format!( + r#"fn main(){{ + use std::{{fs,thread,time,process,io::{{self,Write}}}}; let pid = process::id(); - let _ = fs::create_dir_all("/tmp/healer-tests/pids"); - let _ = fs::write("/tmp/healer-tests/pids/counter.pid", pid.to_string()); - let mut n=0u64; loop{ print!("\r[PID {}] alive {}", pid,n); let _=io::stdout().flush(); thread::sleep(time::Duration::from_secs(1)); n+=1; } - }"#; - let base = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fs::create_dir_all("{}").expect("failed to create pid directory"); + fs::write("{}/counter.pid", pid.to_string()).expect("failed to write pid file"); + let mut n=0u64; loop{{ print!("\r[PID {{}}] alive {{}}", pid,n); io::stdout().flush().expect("flush failed"); thread::sleep(time::Duration::from_secs(1)); n+=1; }} + }}"#, + pids_dir.display(), + pids_dir.display() + ); + let bin_dir = base.join("target").join("debug"); - let test_bin_src = base.join("tests/fixtures/test_process.rs"); - let _ = fs::create_dir_all(test_bin_src.parent().unwrap()); - write_file(test_bin_src.to_str().unwrap(), helper_src); - let out_bin = bin_dir.join("test_process"); + let test_bin_src = helper_src_dir.join("test_process_ebpf.rs"); + write_file(test_bin_src.to_str().unwrap(), &helper_src); + + // 为 eBPF 测试使用独立的二进制名称 + let out_bin = bin_dir.join("test_process_ebpf"); + // 如果已存在可执行文件,跳过编译,避免在 sudo 环境下找不到 rustc if out_bin.exists() { return; } - let status = Command::new("rustc") - .args([ - "-O", - test_bin_src.to_str().unwrap(), - "-o", - out_bin.to_str().unwrap(), - ]) - .status() - .expect("failed to run rustc for test helper"); - assert!(status.success(), "failed to build test helper bin"); + + // 尝试编译,如果失败则提供有用的错误信息 + let result = try_compile_test_binary(&test_bin_src, &out_bin); + match result { + Ok(()) => println!("Successfully compiled eBPF test binary"), + Err(e) => { + eprintln!("Warning: Failed to compile eBPF test binary: {}", e); + eprintln!("Hint: If running with sudo, try pre-compiling the binary:"); + eprintln!(" rustc -O {} -o {}", + test_bin_src.display(), + out_bin.display()); + eprintln!("Or set RUSTC environment variable to point to rustc executable."); + panic!("Cannot proceed without test binary"); + } + } +} + +fn try_compile_test_binary(src: &std::path::Path, out: &std::path::Path) -> Result<(), String> { + // 尝试多种方式找到 rustc + let rustc_candidates = [ + // 首先尝试环境变量 + std::env::var("RUSTC").ok(), + // 尝试使用 which 命令 + 
Command::new("which").arg("rustc").output() + .ok() + .map(|out| String::from_utf8_lossy(&out.stdout).trim().to_string()) + .filter(|s| !s.is_empty()), + // 常见路径 + Some("/usr/bin/rustc".to_string()), + Some("/usr/local/bin/rustc".to_string()), + Some("/home/lxq/.cargo/bin/rustc".to_string()), + Some("/root/.cargo/bin/rustc".to_string()), + // 最后尝试直接调用 + Some("rustc".to_string()), + ]; + + for rustc_opt in rustc_candidates.into_iter().flatten() { + if let Ok(status) = Command::new(&rustc_opt) + .args(["-O", src.to_str().unwrap(), "-o", out.to_str().unwrap()]) + .status() + { + if status.success() { + return Ok(()); + } else { + return Err(format!("rustc compilation failed with status: {}", status)); + } + } + } + + Err("Could not find rustc executable".to_string()) } #[test] @@ -151,33 +214,43 @@ fn ebpf_detects_exit_and_recovers() { cleanup_stray_processes(); ensure_test_binaries(); - let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let base = workspace_root(); let cfg_text = build_ebpf_config(base.to_str().unwrap()); let cfg_path = base.join("target/debug/ebpf_config.yaml"); write_file(cfg_path.to_str().unwrap(), &cfg_text); // 先启动被监控进程,便于观察 eBPF 事件 - let helper_bin = base.join("target/debug/test_process"); - let mut child = Command::new(helper_bin) + let helper_bin = base.join("target/debug/test_process_ebpf"); + let mut child = Command::new(&helper_bin) .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() - .expect("failed to spawn test_process"); + .expect("failed to spawn test_process_ebpf"); - let pid_path = "/tmp/healer-tests/pids/counter.pid"; - for _ in 0..10 { - if fs::read_to_string(pid_path).is_ok() { - break; - } - wait_secs(1); + let pid_path = base.join("target/debug/healer-tests/pids/counter.pid"); + if let Some(parent) = pid_path.parent() { + fs::create_dir_all(parent).expect("failed to ensure pid directory exists"); } + let helper_pid = child.id() as i32; + fs::write(&pid_path, helper_pid.to_string()).expect("failed to prime helper pid 
file"); + let recorded_pid: i32 = fs::read_to_string(&pid_path) + .expect("failed to read primed pid file") + .trim() + .parse() + .expect("primed pid file contains invalid pid"); + assert_eq!( + recorded_pid, helper_pid, + "PID file content mismatch: expected {}, got {}", + helper_pid, recorded_pid + ); + let healer = spawn_healer_foreground(cfg_path.to_str().unwrap()); // 等待 eBPF 初始化与 watch 生效 wait_secs(3); // 基线 PID - let first_pid: i32 = fs::read_to_string(pid_path) + let first_pid: i32 = fs::read_to_string(&pid_path) .ok() .and_then(|s| s.trim().parse().ok()) .unwrap_or(0); @@ -191,7 +264,7 @@ fn ebpf_detects_exit_and_recovers() { let mut new_pid = first_pid; for _ in 0..20 { wait_secs(1); - if let Ok(s) = fs::read_to_string(pid_path) { + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { if p > 0 && p != first_pid { new_pid = p; @@ -207,7 +280,7 @@ fn ebpf_detects_exit_and_recovers() { // 清理 kill_child(healer); - if let Ok(s) = fs::read_to_string(pid_path) { + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { kill_by_pid(p); } diff --git a/healer/tests/process_e2e.rs b/healer/tests/process_e2e.rs index f6c1a52..36069d5 100644 --- a/healer/tests/process_e2e.rs +++ b/healer/tests/process_e2e.rs @@ -1,36 +1,88 @@ use std::fs; use std::io::{Read, Write}; +use std::net::{TcpListener, TcpStream}; use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::thread; use std::time::Duration; +use tempfile::TempDir; -fn cleanup_stray_processes() { - let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let healer_bin = base.join("target/debug/healer"); - let test_helper = base.join("target/debug/test_process"); - let dummy_py = base.join("tests/fixtures/dummy_service.py"); +struct TestContext { + temp_dir: TempDir, + port: u16, + children: Vec, +} - let patterns = vec![ - healer_bin.to_string_lossy().to_string(), - test_helper.to_string_lossy().to_string(), - 
dummy_py.to_string_lossy().to_string(), - ]; +impl TestContext { + fn new() -> Self { + let temp_dir = TempDir::new().expect("Failed to create temp directory"); + + // 动态分配端口 + let port = find_free_port(); + + Self { + temp_dir, + port, + children: Vec::new(), + } + } + + fn temp_path(&self) -> &std::path::Path { + self.temp_dir.path() + } + + fn logs_dir(&self) -> PathBuf { + let logs = self.temp_path().join("logs"); + fs::create_dir_all(&logs).expect("Failed to create logs directory"); + logs + } + + fn pids_dir(&self) -> PathBuf { + let pids = self.temp_path().join("pids"); + fs::create_dir_all(&pids).expect("Failed to create pids directory"); + pids + } + + fn add_child(&mut self, child: Child) { + self.children.push(child); + } + + fn cleanup(&mut self) { + // 清理所有子进程 + for mut child in self.children.drain(..) { + kill_child(&mut child); + } + } +} - for pat in patterns { - let _ = Command::new("pkill").args(["-9", "-f", &pat]).status(); +impl Drop for TestContext { + fn drop(&mut self) { + self.cleanup(); } } -fn write_file(path: &str, content: &str) { - let p = PathBuf::from(path); - if let Some(parent) = p.parent() { +fn find_free_port() -> u16 { + TcpListener::bind("127.0.0.1:0") + .expect("Failed to bind to a random port") + .local_addr() + .expect("Failed to get local address") + .port() +} + +fn workspace_root() -> PathBuf { + // CARGO_MANIFEST_DIR 指向 healer 子 crate;集成测试期望使用工作区根目录(其父目录) + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + crate_dir.parent().map(|p| p.to_path_buf()).unwrap_or(crate_dir) +} + +fn write_file(path: &PathBuf, content: &str) { + if let Some(parent) = path.parent() { let _ = fs::create_dir_all(parent); } - fs::write(p, content).expect("write file failed"); + fs::write(path, content).expect("write file failed"); } -fn spawn_healer_foreground(config_path: &str) -> Child { +fn spawn_healer_foreground(config_path: &PathBuf) -> Child { let mut cmd = Command::new(env!("CARGO_BIN_EXE_healer")); cmd.env("HEALER_CONFIG", 
config_path) .env("HEALER_NO_DAEMON", "1") @@ -40,6 +92,7 @@ fn spawn_healer_foreground(config_path: &str) -> Child { "info,healer::monitor::pid_monitor=debug,healer_action=debug,healer_event=info,dep_coord=debug".to_string() }), ); + // 测试时可继承 stdio(HEALER_TEST_INHERIT_STDIO=1) let inherit_stdio = std::env::var("HEALER_TEST_INHERIT_STDIO") .map(|v| matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes")) @@ -66,7 +119,7 @@ fn kill_by_pid(pid: i32) { } } -fn kill_child(mut child: Child) { +fn kill_child(child: &mut Child) { // 先优雅退出(SIGINT),必要时再强杀(SIGKILL) let _ = Command::new("/bin/kill") .args(["-INT", &child.id().to_string()]) @@ -84,64 +137,101 @@ fn kill_child(mut child: Child) { let _ = child.wait(); } -fn build_temp_config(base: &str) -> String { +fn build_pid_only_config(ctx: &TestContext) -> String { + let base = workspace_root(); + let logs_dir = ctx.logs_dir(); + let pids_dir = ctx.pids_dir(); + let test_id = ctx.temp_path().file_name().unwrap().to_string_lossy(); + format!( r#" log_level: "info" -log_directory: "/tmp/healer-tests/logs" -pid_file_directory: "/tmp/healer-tests/pids" +log_directory: "{}" +pid_file_directory: "{}" working_directory: "/" processes: - name: "counter" enabled: true - command: "{base}/target/debug/test_process" + command: "{}/target/debug/test_process_{}" args: [] run_as_root: true run_as_user: null monitor: type: "pid" - pid_file_path: "/tmp/healer-tests/pids/counter.pid" + pid_file_path: "{}/counter.pid" interval_secs: 1 recovery: type: "regular" retries: 3 retry_window_secs: 10 cooldown_secs: 5 +"#, + logs_dir.display(), + pids_dir.display(), + base.display(), + test_id, + pids_dir.display() + ) +} + +fn build_network_only_config(ctx: &TestContext) -> String { + let logs_dir = ctx.logs_dir(); + let pids_dir = ctx.pids_dir(); + + format!( + r#" +log_level: "info" +log_directory: "{}" +pid_file_directory: "{}" +working_directory: "/" +processes: - name: "dummy_net" enabled: true command: "/usr/bin/python3" - args: 
["{base}/tests/fixtures/dummy_service.py"] + args: ["{}/dummy_service.py"] run_as_root: true run_as_user: null monitor: type: "network" - target_url: "http://127.0.0.1:8080/health" + target_url: "http://127.0.0.1:{}/health" interval_secs: 1 recovery: type: "regular" retries: 2 retry_window_secs: 5 cooldown_secs: 4 -"# +"#, + logs_dir.display(), + pids_dir.display(), + ctx.temp_path().display(), + ctx.port ) } -fn ensure_test_binaries() { - let helper_src = r#"fn main(){ - use std::{fs,thread,time,process,io::{self,Write}}; +fn ensure_test_binaries(ctx: &TestContext) { + let pids_dir = ctx.pids_dir(); + let helper_src = format!( + r#"fn main(){{ + use std::{{fs,thread,time,process,io::{{self,Write}}}}; let pid = process::id(); - let _ = fs::create_dir_all("/tmp/healer-tests/pids"); - let _ = fs::write("/tmp/healer-tests/pids/counter.pid", pid.to_string()); - let mut n=0u64; loop{ print!("\\r[PID {}] alive {}", pid,n); let _=io::stdout().flush(); thread::sleep(time::Duration::from_secs(1)); n+=1; } - }"#; - let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + fs::create_dir_all("{}").expect("failed to create pid directory"); + fs::write("{}/counter.pid", pid.to_string()).expect("failed to write pid file"); + let mut n=0u64; loop{{ print!("\r[PID {{}}] alive {{}}", pid,n); io::stdout().flush().expect("flush failed"); thread::sleep(time::Duration::from_secs(1)); n+=1; }} + }}"#, + pids_dir.display(), + pids_dir.display() + ); + + let base = workspace_root(); let bin_dir = base.join("target").join("debug"); - let test_bin_src = base.join("tests/fixtures/test_process.rs"); - let _ = fs::create_dir_all(test_bin_src.parent().unwrap()); - write_file(test_bin_src.to_str().unwrap(), helper_src); - let out_bin = bin_dir.join("test_process"); + let test_bin_src = ctx.temp_path().join("test_process.rs"); + write_file(&test_bin_src, &helper_src); + + // 为每个测试创建唯一的二进制文件名 + let test_id = ctx.temp_path().file_name().unwrap().to_string_lossy(); + let out_bin = 
bin_dir.join(format!("test_process_{}", test_id)); let status = Command::new("rustc") .args([ "-O", @@ -156,52 +246,76 @@ fn ensure_test_binaries() { #[test] fn restart_on_pid_exit_and_circuit_breaker() { - cleanup_stray_processes(); - ensure_test_binaries(); - let base = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let cfg_text = build_temp_config(base.to_str().unwrap()); - let cfg_path = base.join("target/debug/it_config.yaml"); - write_file(cfg_path.to_str().unwrap(), &cfg_text); + let mut ctx = TestContext::new(); + ensure_test_binaries(&ctx); + + let cfg_text = build_pid_only_config(&ctx); + let cfg_path = ctx.temp_path().join("it_config.yaml"); + write_file(&cfg_path, &cfg_text); // 先启动 helper,保证 PID 文件存在 - let helper_bin = base.join("target/debug/test_process"); + let base = workspace_root(); + let test_id = ctx.temp_path().file_name().unwrap().to_string_lossy(); + let helper_bin = base.join("target").join("debug").join(format!("test_process_{}", test_id)); let mut initial = Command::new(helper_bin) .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .expect("failed to spawn initial test_process"); - // 等待 PID 文件就绪 - let pid_path = "/tmp/healer-tests/pids/counter.pid"; - for _ in 0..10 { - if fs::read_to_string(pid_path).is_ok() { - break; - } - wait_secs(1); - } + + // 直接写入 PID 文件,消除 helper 自行写入的竞态 + let pid_path = ctx.pids_dir().join("counter.pid"); + let initial_pid = initial.id() as i32; + fs::write(&pid_path, initial_pid.to_string()) + .expect("failed to prime PID file for initial helper"); - let mut healer = spawn_healer_foreground(cfg_path.to_str().unwrap()); - wait_secs(2); + let recorded_pid: i32 = fs::read_to_string(&pid_path) + .expect("failed to read primed PID file") + .trim() + .parse() + .expect("primed PID file contains invalid pid"); + assert_eq!( + recorded_pid, initial_pid, + "PID file content mismatch: expected {}, got {}", + initial_pid, recorded_pid + ); + + println!( + "Initial process started with PID: {} (primed at {})", + 
initial_pid, + pid_path.display() + ); + + let healer = spawn_healer_foreground(&cfg_path); + ctx.add_child(healer); + wait_secs(3); // 增加等待时间以确保 healer 完全启动 // 检查 healer 是否仍在运行 - match healer.try_wait() { - Ok(Some(status)) => { - panic!("Healer exited early with status: {:?}", status); - } - Ok(None) => { - println!("Healer is running normally"); - } - Err(e) => { - panic!("Error checking healer status: {}", e); + if let Some(healer) = ctx.children.last_mut() { + match healer.try_wait() { + Ok(Some(status)) => { + panic!("Healer exited early with status: {:?}", status); + } + Ok(None) => { + println!("Healer is running normally"); + } + Err(e) => { + panic!("Error checking healer status: {}", e); + } } } - let first_pid: i32 = fs::read_to_string(pid_path) + let first_pid: i32 = fs::read_to_string(&pid_path) .ok() .and_then(|s| s.trim().parse().ok()) .unwrap_or(0); + + // 验证我们有有效的初始 PID + assert!(first_pid > 0, "No valid initial PID found: {}", first_pid); + if first_pid > 0 { kill_by_pid(first_pid); - // 回收初始子进程,避免僵尸进程 + // 等待初始进程退出 let _ = initial.wait(); } @@ -210,7 +324,7 @@ fn restart_on_pid_exit_and_circuit_breaker() { println!("Waiting for restart. 
Original PID: {}", first_pid); for i in 0..15 { wait_secs(1); - if let Ok(s) = fs::read_to_string(pid_path) { + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { println!("Iteration {}: PID file contains: {}", i + 1, p); if p > 0 && p != first_pid { @@ -227,7 +341,7 @@ fn restart_on_pid_exit_and_circuit_breaker() { println!("Iteration {}: Could not read PID file", i + 1); } if i % 5 == 4 { - let current_pid = fs::read_to_string(pid_path) + let current_pid = fs::read_to_string(&pid_path) .ok() .and_then(|s| s.trim().parse().ok()) .unwrap_or(0); @@ -248,7 +362,7 @@ fn restart_on_pid_exit_and_circuit_breaker() { let last = new_pid; for _ in 0..10 { wait_secs(1); - if let Ok(s) = fs::read_to_string(pid_path) { + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { if p != last { new_pid = p; @@ -263,7 +377,7 @@ fn restart_on_pid_exit_and_circuit_breaker() { kill_by_pid(new_pid); let old = new_pid; wait_secs(3); - let after: i32 = fs::read_to_string(pid_path) + let after: i32 = fs::read_to_string(&pid_path) .ok() .and_then(|s| s.trim().parse().ok()) .unwrap_or(0); @@ -273,7 +387,7 @@ fn restart_on_pid_exit_and_circuit_breaker() { let mut restarted = false; for _ in 0..6 { wait_secs(1); - if let Ok(s) = fs::read_to_string(pid_path) { + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { if p != old { restarted = true; @@ -284,24 +398,24 @@ fn restart_on_pid_exit_and_circuit_breaker() { } assert!(restarted, "healer did not attempt restart after cooldown"); - // 清理:结束 healer 与最后的 helper - kill_child(healer); - if let Ok(s) = fs::read_to_string(pid_path) { + // 清理最后的 helper + if let Ok(s) = fs::read_to_string(&pid_path) { if let Ok(p) = s.trim().parse::() { kill_by_pid(p); } } - cleanup_stray_processes(); + // TestContext 的 drop 会自动清理所有子进程 } #[test] fn network_monitor_detects_crash_and_recovers() { - cleanup_stray_processes(); - ensure_test_binaries(); - let base = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")); - let dummy_py = r#"import http.server, socketserver, sys + let mut ctx = TestContext::new(); + ensure_test_binaries(&ctx); + + let dummy_py = format!( + r#"import http.server, socketserver, sys socketserver.TCPServer.allow_reuse_address = True -PORT=8080 +PORT={} class H(http.server.SimpleHTTPRequestHandler): def do_GET(self): if self.path == '/health': @@ -312,20 +426,23 @@ class H(http.server.SimpleHTTPRequestHandler): self.send_response(404); self.end_headers() with socketserver.TCPServer(("", PORT), H) as srv: srv.serve_forever() -"#; - let py_path = base.join("tests/fixtures/dummy_service.py"); - write_file(py_path.to_str().unwrap(), dummy_py); +"#, + ctx.port + ); + + let py_path = ctx.temp_path().join("dummy_service.py"); + write_file(&py_path, &dummy_py); - let cfg_text = build_temp_config(base.to_str().unwrap()); - let cfg_path = base.join("target/debug/net_config.yaml"); - write_file(cfg_path.to_str().unwrap(), &cfg_text); + let cfg_text = build_network_only_config(&ctx); + let cfg_path = ctx.temp_path().join("net_config.yaml"); + write_file(&cfg_path, &cfg_text); - let healer = spawn_healer_foreground(cfg_path.to_str().unwrap()); + let healer = spawn_healer_foreground(&cfg_path); + ctx.add_child(healer); wait_secs(2); - fn http_get(path: &str) -> Option { - use std::net::TcpStream; - let mut stream = TcpStream::connect("127.0.0.1:8080").ok()?; + fn http_get(port: u16, path: &str) -> Option { + let mut stream = TcpStream::connect(format!("127.0.0.1:{}", port)).ok()?; let req = format!( "GET {} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n", path @@ -335,22 +452,24 @@ with socketserver.TCPServer(("", PORT), H) as srv: stream.read_to_string(&mut buf).ok()?; Some(buf) } - fn is_healthy() -> bool { - http_get("/health") + + let is_healthy = |port: u16| -> bool { + http_get(port, "/health") .map(|r| r.starts_with("HTTP/1.1 200") || r.contains("OK")) .unwrap_or(false) - } + }; for _ in 0..10 { - if 
is_healthy() { + if is_healthy(ctx.port) { break; } wait_secs(1); } - let _ = http_get("/crash"); + + let _ = http_get(ctx.port, "/crash"); let mut healthy = false; for _ in 0..20 { - if is_healthy() { + if is_healthy(ctx.port) { healthy = true; break; } @@ -358,7 +477,6 @@ with socketserver.TCPServer(("", PORT), H) as srv: } assert!(healthy, "network monitor did not recover dummy service"); - let _ = http_get("/crash"); - kill_child(healer); - cleanup_stray_processes(); + let _ = http_get(ctx.port, "/crash"); + // TestContext 的 drop 会自动清理所有子进程 } diff --git a/packaging/rpm/healer.spec b/packaging/rpm/healer.spec new file mode 100644 index 0000000..f48fdd3 --- /dev/null +++ b/packaging/rpm/healer.spec @@ -0,0 +1,85 @@ +Name: healer +Version: 0.1.0 +Release: 1%{?dist} +Summary: Process self-healing daemon leveraging eBPF for monitoring and recovery + +License: MulanPSL-2.0 +URL: https://github.com/XqiLiu/healer +Source0: %{name}-%{version}.tar.gz + +BuildRequires: gcc +BuildRequires: make +BuildRequires: rust +BuildRequires: cargo +BuildRequires: clang +BuildRequires: llvm +# bpf-linker is required by aya_build to link eBPF objects; install via cargo if not available +# BuildRequires: rust-bpf-linker + +Requires(post): systemd +Requires(preun): systemd +Requires(postun): systemd + +%description +Healer is a high-performance daemon leveraging eBPF for reliable, low-overhead +monitoring and automatic recovery of critical processes to ensure service continuity. + +It provides pluggable monitors (PID / Network / eBPF), broadcast event bus, and +recovery with circuit breaker and backoff. + +%prep +%setup -q -n %{name}-%{version} + +%build +# Ensure cargo-installed binaries (like bpf-linker) are in PATH +export PATH="$HOME/.cargo/bin:$PATH" +# Install bpf-linker if missing (best-effort) +if ! 
command -v bpf-linker >/dev/null 2>&1; then + cargo install --locked bpf-linker || true +fi + +# Build only the main daemon binary in release mode +cargo build --release -p healer + +%install +mkdir -p %{buildroot}%{_bindir} +install -m 0755 target/release/healer %{buildroot}%{_bindir}/healer + +# Config +mkdir -p %{buildroot}%{_sysconfdir}/healer +install -m 0644 config.yaml %{buildroot}%{_sysconfdir}/healer/config.yaml + +# systemd unit +mkdir -p %{buildroot}%{_unitdir} +install -m 0644 packaging/systemd/healer.service %{buildroot}%{_unitdir}/healer.service + +# Log and runtime directories (created on install if not present) +mkdir -p %{buildroot}/var/log/healer +mkdir -p %{buildroot}/var/run/healer + +%post +%systemd_post healer.service +mkdir -p /var/log/healer || true +mkdir -p /var/run/healer || true +chown root:root /var/log/healer /var/run/healer || true +chmod 755 /var/log/healer /var/run/healer || true + +%preun +%systemd_preun healer.service + +%postun +%systemd_postun_with_restart healer.service + +%files +%doc README.md +%license LICENSE +%dir %{_sysconfdir}/healer +%config(noreplace) %{_sysconfdir}/healer/config.yaml +%{_bindir}/healer +%{_unitdir}/healer.service +%dir /var/log/healer +%dir /var/run/healer + +%changelog +* Tue Sep 24 2025 XqiLiu - 0.1.0-1 +- Initial RPM packaging for healer daemon diff --git a/packaging/systemd/healer.service b/packaging/systemd/healer.service new file mode 100644 index 0000000..d11b6fd --- /dev/null +++ b/packaging/systemd/healer.service @@ -0,0 +1,21 @@ +[Unit] +Description=Healer - Process self-healing daemon +After=network.target + +[Service] +Type=simple +ExecStart=/usr/bin/healer +Restart=on-failure +RestartSec=3 +User=root +Group=root +# Environment can override log level etc +Environment=RUST_LOG=info +Environment=HEALER_CONFIG=/etc/healer/config.yaml +# Working directory can be overridden by config.yaml +WorkingDirectory=/ +RuntimeDirectory=healer +RuntimeDirectoryMode=0755 + +[Install] 
+WantedBy=multi-user.target diff --git a/scripts/build-rpm.sh b/scripts/build-rpm.sh new file mode 100644 index 0000000..965485c --- /dev/null +++ b/scripts/build-rpm.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Build an RPM for healer using rpmbuild -tb on a source tarball. + +if [[ ${1:-} == "--help" ]]; then + cat <<'EOF' +Usage: bash scripts/build-rpm.sh [--skip-dep-check] + +Build the healer RPM from the current working copy. +Steps: + 1. Create a clean source tarball healer-.tar.gz + 2. Invoke rpmbuild -ba packaging/rpm/healer.spec + +Options: + --skip-dep-check Do not perform local RPM build dependency preflight. + +Notes: + Even if you have rust/cargo/clang via rustup or custom install in $HOME, rpmbuild + validates BuildRequires against INSTALLED RPM PACKAGES. So a binary existing in PATH + (e.g. ~/.cargo/bin/cargo) does not satisfy 'BuildRequires: cargo'. Install the distro + packages or adjust the spec if you intentionally want to bypass them. +EOF + exit 0 +fi + +ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd) +PACKAGE_NAME=healer +PACKAGE_VERSION=$(grep '^version' "$ROOT_DIR/healer/Cargo.toml" | head -n1 | awk -F '"' '{print $2}' || echo "0.1.0") +PKGDIR="$ROOT_DIR/packaging/rpm" + +# --------------------------------------------- +# Detect OS family; provide early warning on Debian/Ubuntu where BuildRequires +# cannot be satisfied because they are RPM package names. +# --------------------------------------------- +if [[ -f /etc/os-release ]]; then + # shellcheck disable=SC1091 + . /etc/os-release + os_id_like=${ID_LIKE:-} + os_id=${ID:-} + if ! command -v rpm >/dev/null 2>&1; then + echo "[Warn] This system (ID=$os_id) does not have 'rpm' installed. Install rpm-build or run inside a Fedora/CentOS/RHEL container." 
>&2 + fi + if [[ $os_id == "ubuntu" || $os_id == debian || $os_id_like == *"debian"* ]]; then + cat >&2 </dev/null; then + missing_pkgs+=("$p") + fi + done + + if ((${#missing_pkgs[@]})); then + echo "[Preflight] Missing required RPM packages (BuildRequires will fail): ${missing_pkgs[*]}" >&2 + echo "Detected PATH versions (if any):" >&2 + for bin in cargo rustc clang llvm-ar gcc make; do + if command -v "$bin" &>/dev/null; then + echo " - $bin -> $(command -v $bin)" >&2 + fi + done + cat >&2 < Preparing source tarball ${PACKAGE_NAME}-${PACKAGE_VERSION}.tar.gz" +TMP_SRC=$(mktemp -d) +trap 'rm -rf "$TMP_SRC"' EXIT + +# Create a clean source tree directory name as %{name}-%{version} +SRC_ROOT="$TMP_SRC/${PACKAGE_NAME}-${PACKAGE_VERSION}" +mkdir -p "$SRC_ROOT" + +# Copy all project files (excluding target/ by default) +rsync -a --exclude 'target/' --exclude '.git/' --exclude 'target-*' "$ROOT_DIR/" "$SRC_ROOT/" + +pushd "$TMP_SRC" >/dev/null +tar czf "${PACKAGE_NAME}-${PACKAGE_VERSION}.tar.gz" "${PACKAGE_NAME}-${PACKAGE_VERSION}" +popd >/dev/null + +echo "==> Running rpmbuild" +mkdir -p "$HOME/rpmbuild/SOURCES" +cp "$TMP_SRC/${PACKAGE_NAME}-${PACKAGE_VERSION}.tar.gz" "$HOME/rpmbuild/SOURCES/" + +rpmbuild -ba "$PKGDIR/${PACKAGE_NAME}.spec" \ + --define "_topdir $HOME/rpmbuild" \ + --define "_sourcedir $HOME/rpmbuild/SOURCES" \ + --define "_builddir $HOME/rpmbuild/BUILD" \ + --define "_rpmdir $HOME/rpmbuild/RPMS" \ + --define "_srcrpmdir $HOME/rpmbuild/SRPMS" \ + --define "_specdir $ROOT_DIR/packaging/rpm" + +echo "==> Done. 
Find RPMs under $HOME/rpmbuild/RPMS" diff --git a/healer/tests/fixtures/dummy_service.py b/tests/fixtures/dummy_service.py similarity index 100% rename from healer/tests/fixtures/dummy_service.py rename to tests/fixtures/dummy_service.py diff --git a/healer/tests/fixtures/test_process.rs b/tests/fixtures/test_process.rs similarity index 53% rename from healer/tests/fixtures/test_process.rs rename to tests/fixtures/test_process.rs index 7db433c..4da73f3 100644 --- a/healer/tests/fixtures/test_process.rs +++ b/tests/fixtures/test_process.rs @@ -1,7 +1,7 @@ fn main(){ use std::{fs,thread,time,process,io::{self,Write}}; let pid = process::id(); - let _ = fs::create_dir_all("/tmp/healer-tests/pids"); - let _ = fs::write("/tmp/healer-tests/pids/counter.pid", pid.to_string()); + let _ = fs::create_dir_all("/home/lxq/ospp/healer-process/target/debug/healer-tests/pids"); + let _ = fs::write("/home/lxq/ospp/healer-process/target/debug/healer-tests/pids/counter.pid", pid.to_string()); let mut n=0u64; loop{ print!("\r[PID {}] alive {}", pid,n); let _=io::stdout().flush(); thread::sleep(time::Duration::from_secs(1)); n+=1; } } \ No newline at end of file diff --git a/tests/fixtures/test_process_ebpf.rs b/tests/fixtures/test_process_ebpf.rs new file mode 100644 index 0000000..4da73f3 --- /dev/null +++ b/tests/fixtures/test_process_ebpf.rs @@ -0,0 +1,7 @@ +fn main(){ + use std::{fs,thread,time,process,io::{self,Write}}; + let pid = process::id(); + let _ = fs::create_dir_all("/home/lxq/ospp/healer-process/target/debug/healer-tests/pids"); + let _ = fs::write("/home/lxq/ospp/healer-process/target/debug/healer-tests/pids/counter.pid", pid.to_string()); + let mut n=0u64; loop{ print!("\r[PID {}] alive {}", pid,n); let _=io::stdout().flush(); thread::sleep(time::Duration::from_secs(1)); n+=1; } + } \ No newline at end of file -- Gitee