diff --git a/README.md b/README.md index 7ac90e79de08441c80d39f36961988a4c5143013..5b8d2b3c77f8308d447a7a6e6ef733e0cde1663d 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,510 @@ -# sysTrace +## 简介 -#### 介绍 -System Performance Trace Tool, used in AI scene. +sysTrace是一款应用于AI训练任务的软件,在AI训练中,常常出现训练任务故障导致训练成本浪费,业务痛点如下: -#### 软件架构 -软件架构说明 +- AI训练性能故障缺乏常态化监控、检测能力 +- Host bound引发的AI任务慢,卡故障缺乏全栈跟踪能力 +sysTrace工具支持如下功能: -#### 安装教程 +- 采集torch_npu层的python函数的调用栈 +- 采集cann层的内存持有情况,判断是否发生HBM OOM故障 +- 采集mspti的通信算子下发/执行,判断是否发生算子慢的情况,从而定位到慢卡 +- 采集oncpu/offcpu事件,判断AI训练中是否存在其他进程抢占cpu导致训练慢的问题 -1. xxxx -2. xxxx -3. xxxx +### 环境 -#### 使用说明 +- **OS**: openEuler 22.03 (LTS-SP4) --5.10.0-60.18.0.50.oe2203.aarch64 -1. xxxx -2. xxxx -3. xxxx +- **软件版本**:CANN 8.0RC3, torch 2.1.0, torch_npu 2.1.0.post10 -#### 参与贡献 +## 编译 -1. Fork 本仓库 -2. 新建 Feat_xxx 分支 -3. 提交代码 -4. 新建 Pull Request +- 下载源码: +- 安装依赖包 + ```shell + ## 软件包版本:libbpf >= 0.8.1, clang >= 10.0.0 gcc >= 8.3.0, bpftool >= 6.8.0,如果版本均满足则跳过下面的手动安装步骤 + [root@localhost sysTrace] yum install gcc g++ cmake make python3-devel protobuf-compiler protobuf-devel protobuf-c-devel libbpf clang libbpf-devel bpftool + ``` -#### 特技 + - 手动安装libbpf + ```shell + [root@localhost ~] git clone https://github.com/libbpf/libbpf.git + [root@localhost ~] git checkout v0.8.1 + [root@localhost ~] cd libbpf/src + [root@localhost ~] make && make install + ``` + +- 手动安装bpftool + + ```shell + [root@localhost ~] git clone --recurse-submodules https://github.com/libbpf/bpftool.git + [root@localhost ~] git submodule update --init + [root@localhost ~] cd src + [root@localhost ~] make + [root@localhost ~] make install + ``` + +- 编译 + ```shell + [root@localhost sysTrace] cd sysTrace + [root@localhost sysTrace] bash build.sh + ## 编译产物均在build目录下,会用到libsysTrace.so和sysTrace_cli + [root@localhost build]# ll + total 1776 + -rw-r--r--. 1 root root 17000 Jun 12 16:58 CMakeCache.txt + drwxr-xr-x. 7 root root 4096 Jun 12 17:10 CMakeFiles + -rw-r--r--.
1 root root 1798 Jun 12 16:58 cmake_install.cmake + -rw-r--r--. 1 root root 534270 Jun 12 17:10 libcommon.a + -rwxr-xr-x. 1 root root 1209296 Jun 12 17:10 libsysTrace.so + -rw-r--r--. 1 root root 20479 Jun 12 17:10 Makefile + drwxr-xr-x. 3 root root 4096 Jun 12 17:10 protos + -rwxr-xr-x. 1 root root 76736 Jun 12 17:10 sysTrace_cli + ``` + +## rpm包安装 +### +dnf install sysTrace-failslow +dnf install sysTrace-mcpserver +### 运行 +systrace-failslow + +systrace-mcpserver #开启mcp server服务 服务端口为 12145 + +systrace-openapi #开启openapi server服务 服务端口 12146 + + +配置远程获取数据,修改./config/ftp_config.json文件 +~~~json +{ + "servers": [ + { + "ip": "192.168.122.196", #远程目标服务器的ip + "port": 22, #远程目标服务器的ssh端口 + "user": "root", #用户名 + "password": "Huawei12#$", #密码 + "perception_remote_dir": "/home/hx/sysTrace_dataloader/timeline", #远程目标服务器systrace采集的timeline数据保存路径 + "detection_remote_dir": "/home/hx/sysTrace_dataloader/mspti",#远程目标服务器systrace采集的mspti数据保存路径 + } + ], + "enable": "False" #True 为开启远程获取数据,False为关闭只使用本地文件进行分析 +} +~~~ +## 使用 + +### 数据采集 + +修改AI训练任务脚本,使用LD_PRELOAD的方式将动态库加载到AI训练任务中 + +```shell +LD_PRELOAD=/usr/local/lib/libunwind.so.8.2.0:/usr/local/lib/libunwind-aarch64.so.8.2.0:/home/ascend-toolkit-bak/ascend-toolkit/8.0.RC3.10/tools/mspti/lib64/libmspti.so:/systrace/build/libsysTrace.so python ... +``` + +**注意:以LD_PRELOAD的方式加载了/usr/local/lib/libunwind.so.8.2.0:/usr/local/lib/libunwind-aarch64.so.8.2.0的原因是低于1.7版本的libunwind有bug,需要手动下载最新版本的libunwind,如果环境中的libunwind版本大于等于1.7,则使用以下命令** + +```shell +LD_PRELOAD=/home/ascend-toolkit-bak/ascend-toolkit/8.0.RC3.10/tools/mspti/lib64/libmspti.so:/systrace/build/libsysTrace.so python ...
+``` + + + +### 动态开关 + +sysTrace支持动态开启采集数据,采集数据类型支持动态开启,sysTrace提供二进制工具sysTrace_cli,当前L0数据是常态开启,L1/2/3类型数据可自行决定是否开启,使用命令如下: + +```shell +[root@localhost ~]# sysTrace_cli help +Usage: sysTrace_client [args] +Commands: + set = - Enable/disable dump level + (levels: L0, L1, L2, L3) + interval = - Set dump interval in minutes + (levels: L1, L2, L3) + print [level|all] - Print current settings + (levels: L0, L1, L2, L3, all) + +Examples: + sysTrace_cli set L1=true + sysTrace_cli interval L1=10 + sysTrace_cli print all +``` + +### 数据落盘 + +所有采集的数据当前存放在`/home/sysTrace`目录下,每张卡上的数据以独立一个文件保存,集群多节点环境,建议将保存目录`/home/sysTrace`映射到共享目录,否则需要手动将每台节点上的数据手动拷贝,如下: + +```shell +[root@localhost sysTrace]# ll +drwxr-xr-x. 2 root root 4096 Jun 12 17:01 cann ##内存数据 +drwxr-xr-x. 2 root root 4096 Jun 12 17:01 mspti ##通信算子数据 +drwxr-xr-x. 2 root root 4096 Jun 12 17:01 timeline ##torch_npu层数据 +drwxr-xr-x. 2 root root 4096 Jun 12 17:01 osprobe ##offcpu/cpu事件 +``` + +sysTrace支持动态开启采集数据,当前支持以下级别的数据: + +- L0:采集torch_npu层数据,采集数据类型包括如下数据(常态化采集) + + ```protobuf + message Pytorch { + repeated PytorchStage pytorch_stages = 1; + uint32 rank = 2; // rank号 + } + + message PytorchStage { + uint32 stage_id = 1; // AI训练迭代轮次 + string stage_type = 2; // AI训练迭代阶段 + uint64 start_us = 3; // 当前迭代阶段的开始时间 + uint64 end_us = 4; // 当前迭代阶段的结束时间 + repeated string stack_frames = 5; //当前迭代阶段的python调用栈 + oneof debug_data { + GcDebugData gc_debug = 6; //当前迭代阶段的GC数据 + } + } + ``` + +- L1:采集通信算子数据,采集数据包括如下数据 + + ```python + Flag,Id,Kind,Name,SourceKind,Timestamp,msptiObjectId_Ds_DeviceId,msptiObjectId_Ds_StreamId,msptiObjectId_Pt_ProcessId,msptiObjectId_Pt_ThreadId + ``` + +- L2:采集cann层的内存数据,采集数据包括如下数据 + + ```protobuf + message ProcMem { + uint32 pid = 1; //线程号 + repeated MemAllocEntry mem_alloc_stacks = 2; //内存申请调用栈 + repeated MemFreeEntry mem_free_stacks = 3; //内存释放调用栈 + } + + message MemAllocEntry { + uint64 alloc_ptr = 1; //内存申请起始地址 + uint32 stage_id = 2; //训练迭代轮次 + StageType stage_type = 3; //当前迭代阶段 + uint64 mem_size = 4; 
//内存申请大小 + repeated StackFrame stack_frames = 5; //内存申请调用栈 + } + + message MemFreeEntry { + uint64 alloc_ptr = 1; //内存释放起始地址 + uint32 stage_id = 2; //训练迭代轮次 + StageType stage_type = 3; //当前迭代阶段 + } + + ``` + +- L3: 采集offcpu/oncpu/mem事件 + + ```protobuf + message OSprobe { + repeated OSprobeEntry OSprobe_entries = 1; + } + + message OSprobeEntry { + uint32 key = 1; // pid/cpuid + uint64 start_us = 2; //事件的开始时间 + uint64 dur = 3; //事件的持续时间 + uint64 rundelay = 4; //cpu调度时延 + uint32 OS_event_type = 5; // 事件类型 + uint32 rank = 6; //卡号 + string comm = 7; //当前进程 + string nxt_comm = 8; //即将执行的进程名字 + uint32 nxt_pid = 9; //即将执行进程的pid + } + ``` + +### 数据转换 + +```shell +# 安装转化脚本依赖包(以下版本号非强要求,仅需要保证protobuf和protobuf-compiler保持一致即可,可通过protoc --version确认) +pip install protobuf==3.20.3 + +##转换内存oom数据 +#拷贝sysTrace_pb2.py到convert目录下,确保该文件和转化脚本在一个目录下 +cp /sysTrace/protos/sysTrace_pb2.py /sysTrace/convert +python /sysTrace/convert/convert_mem_to_flamegraph_for_cur.py + +##转换torch_npu数据 +cd /home/sysTrace/timeline +python /sysTrace/convert/convert_pytorch_to_timeline.py --output + +##转换通信算子数据 +python /sysTrace/convert/convert_mspti_timeline.py + +##转换offcpu/oncpu事件 +cd /home/sysTrace/osprobe +python /sysTrace/convert/convert_osprobe_to_timeline.py --output + +``` + +### 数据展示 + +将最终的json数据上传到并展示, 通过**Open trace file**加载数据。 + +### 数据分析 + +#### failslow算法 + +##### failslow参数配置 + +| 参数 | 类型 | 参数说明 | 举例 | +| --------------------------- | ------ | ---------------------------------------------------- | ------------------------------------------------------------ | +| training_log | string | 算法输入torch数据路径,以“*.timeline”结尾, 取0卡即可 | /home/workspace/hbdir/systrace/localhost.localdomain--00000.timeline | +| fail_slow_perception_path | string | 劣化感知算法输出结果,保存为json文件 | /etc/systrace/result/fail_slow/fail_slow_perception_result_failSlow_1753099791.json | +| max_data_queue_steps | int | 缓存step数据最大队列 | 500 | +| min_startup_detection_steps | int | 启动检测的最小step数量 | 10 | +| task_stable_step | int | 
任务初始训练时step稳定的数量 | 3 | +| fail_slow_span_mins | float | 劣化感知算法的检测周期,单位min | 30 | +| hang_times_mins_thr | float | 判断任务是否hang的阈值,单位min | 30 | +| steps_window_size | int | 滑动窗口大小 | 5 | +| k_sigma | int | bboxplot算法的ksigma取值 | 3 | +| anomaly_degree_thr | float | 检测值偏离均值的程度 | 0.2 | + +##### 感知算法执行 + +```shell +## L0感知 +python failslow/fail_slow_detection.py +``` + +##### failslow算法输出 + +**感知结果字段表** + +| 字段 | 类型 | 说明 | 举例 | +| ------------------- | ------ | ------------------------------------- | ------------------------------------------------------------ | +| is_anomaly | bool | 检测数据是否异常 | True | +| anomaly_count_times | int | 检测出的异常点数 | 1 | +| anomaly_info | list | 记录异常信息,每个元素对应一个异常点 | [{
'training_step': 16,
'anomaly_time': '2025-06-12 19:39:24',
'anomaly_degree': 26.053,
'anomaly_training_time': '69435ms',
'normal_training_time': '2566.6ms'
}] | +| anomaly_type | string | 检测结果类型: normal, failslow, hang | failslow | +| start_time | int | 检测开始时间 | 1749728380752 | +| end_time | int | 检测结束时间 | 1749728419305 | + +**输出样例** + +```json +{ + 'is_anomaly': True, + 'anomaly_count_times': 1, + 'anomaly_info': [{ + 'training_step': 16, + 'anomaly_time': '2025-06-12 19:39:24', + 'anomaly_degree': 26.053, + 'anomaly_training_time': '69435ms', + 'normal_training_time': '2566.6ms' + }], + 'anomaly_type': 'failSlow', + 'start_time': 1749728380752, + 'end_time': 1749728419305 +} +``` + +#### 慢卡定界算法 + +##### 慢卡定界算法配置 + +在文件model_config.json中,配置模型运行所需的参数。该配置项中,主要包含: + +- with_fail_slow: 配置启动慢节点检测性能劣化来源于性能劣化检测的时刻还是手动配置, 默认为false +- slow_node_detection_range_times:慢节点检测输入的时间范围,默认为空列表 +- slow_node_detection_time_span_hours:慢节点检测的时间长度,默认为0.5小时 +- slow_node_detection_path:慢节点检测结果保存路径,默认为"/etc/systrace/result/slow_node" +- data_type:算子数据的格式,默认为“json” +- root_path: 算子数据的输入路径,默认为“/home/hbdir/systrace_failslow/data/baseline” +- enable_detect_type:检测不同故障类型的开关,字典格式 + - enable_cal: 计算慢开关,默认为true + - enable_op_launch: 算子下发慢开关,默认为false + - enable_op_launch: Kafka对应的`server port`,如:"9092"; + - enable_comm: 通信慢开关,默认为false + - enable_dataloader: 输入模型数据加载慢开关,默认为false + - enable_ckpt: 模型保存慢开关,默认为false + +- fail_slow_ops: 检测不同故障类型对应的观测点,字典格式 + - cal_slow:计算慢对应的观测点,默认为"HcclAllGather" + - op_launch_slow:算子下发慢对应的观测点,默认为“HcclAllGather_launch” + - comm_slow:通信慢对应的观测点,默认为“HcclBatchSendRecv” + - dataloader_slow:输入模型数据加载慢对应的观测点,默认为“Dataloader” + - ckpt_slow: 模型保存慢对应的观测点,默认为“SaveCkpt” + +- save_image:时序数据保存的路径,用于debug算法效果,默认为“image” + +- record_kpi: 时序数据是否记录到检测结果中,默认为false +- use_plot: 时序数据保存开关,用于debug算法效果,默认为false +- max_num_normal_results:检测结果最大记录正常节点数据数量,默认为16 +- look_back:告警抑制,默认为20min +- hccl_domain: 通信域默认配置,格式为字典,默认为{},实际配置示例{"tp":[[0,1,2,3], [4,5,6,7]], "dp":[[0,4], [1,5],[2,6],[3,7]]} +- rank_table_json: rank_table配置文件路径,用于mindspore通信域配置,默认路径"./rank_table.json" +- debug_data:debug模式,会保存算子执行和算子下发的中间文件,默认为false +
+在文件metric_config.json中,配置所有指标的检测算法参数,每个指标独立配置。该配置项中以**HcclAllGather**指标配置举例,主要包含: + +- metric_type:指标类型,string类型,取值“device”和“host”, +- aggregation:指标聚合配置,字典 + - during_s:聚合窗口大小, int类型,默认5s + - funcs:聚合方法配置,list类型,包含元素为dict类型 + - func: 聚合方法,string类型,有“min”,"max","mean","percentile"等 + - func_params: 聚合方法配置参数,字典类型,根据不同的聚合方法配置,默认为空 + +- priority:检测优先级,int类型 +- alarm_filter_window_size:告警过滤窗口大小,表示检测出的异常点连续个数,int类型,默认值为5 +- space_detector: 节点间对比检测器配置,不配置为“null” + - dist_metric: 节点间距离函数类型,“euclidean”, string类型 + - eps:Dbscan聚类参数的阈值,点间距离大于该值则为另一类, float类型 + - cv_threshold:判断值偏离均值的程度,偏移过大则认为是异常点,float类型 + - min_samples:dbscan最小成新簇的点数, int类型 + - window_size:窗口大小,表示单次检测的窗口,不重叠,int类型 + - scaling:表示时间序列是否归一化, bool类型 + - type:空间检测器类型,string类型,取值“SlidingWindowDBSCAN”,“OuterDataDetector” + +- time_detector:单节点时序异常检测配置, 不配置为“null” + - preprocess_eps: Dbscan预处理的阈值, float类型 + - preprocess_min_samples:Dbscan预处理的最小点数,int类型 + - type:时间检测器类型,string类型,取值为“TSDBSCANDetector”,“SlidingWindowKSigmaDetector” + - n_sigma_method:当为“SlidingWindowKSigmaDetector”类型时,配置字段,dict类型 + - type:SlidingWindowKSigmaDetector采用的检测算法,可替换扩展,string类型,默认为“SlidingWindowNSigma” + - training_window_size:滑动窗口的最大值,超过该值,覆盖已有value,int类型 + - min_update_window_size:滑动窗口的最小更新值,int类型 + - min_std_val:最小标准差,当标准差为0时,设置为最小标准差,float类型 + - bias:边界基础上的偏置系数,float类型 + - abs_bias:边界基础上的偏置值,float类型 + - nsigma_coefficient:Ksigma的系数,int类型 + - detect_type:检测边界类型,string类型,取值为“lower_bound”,“upper_bound”,“bi_bound” + - min_expert_lower_bound:下边界最小专家阈值,null表示不设置专家阈值,int或者null类型 + - max_expert_lower_bound:下边界最大专家阈值,null表示不设置专家阈值,int或者null类型 + - min_expert_upper_bound:上边界最小专家阈值,null表示不设置专家阈值,int或者null类型 + - max_expert_upper_bound:上边界最大专家阈值,null表示不设置专家阈值,int或者null类型 + +##### 慢卡定界算法执行 + +```shell +## L1定界 +systrace-failslow +## 或者 +python failslow/main.py +``` + +**注意:算法执行前,需[参考文档](https://gitee.com/openeuler/sysTrace/blob/master/failslow/docs/conf_introduction.md)配置对应的数据路径** + +##### 慢卡定界算法输出 +
+**慢卡定界算法输出字段表** + +| 字段 | 类型 | 说明 | 举例 | +| -------------- | ------ | -------------------------------- | ------------ | +| resultCode | int | 结果码,200表示正常,201表示异常 | 200 | +| compute | bool | 计算导致的慢卡 | false | +| network | bool | 通信导致的慢卡 | false | +| storage | bool | 存储导致的慢卡 | false | +| abnormalDetail | list | 异常rank卡的信息 | 详见下方示例 | +| normalDetail | list | 正常rank卡的信息 | 详见下方示例 | +| errorMsg | string | 记录异常信息, | “” | +| timestamp | int | 故障发生时间 | 1749085300 | + +**输出样例** + +``` +{ + "resultCode": 201, + "compute": true, + "network": false, + "storage": false, + "abnormalDetail": [ + { + "objectId": "3", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_3", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [ + 0, + 1, + 2, + 4, + 5, + 6, + 7 + ], + "omittedDevices": [] + } + ], + "normalDetail": [ + { + "objectId": "0", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_0", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + "objectId": "1", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_1", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + "objectId": "2", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_2", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + "objectId": "4", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_4", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + "objectId": "5", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_5", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + "objectId": "6", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_6", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + }, + { + 
"objectId": "7", + "serverIp": "9.13.100.7", + "deviceInfo": "rank_7", + "kpiId": "HcclAllGather", + "methodType": "SPACE", + "kpiData": [], + "relaIds": [], + "omittedDevices": [] + } + ], + "errorMsg": "", + "timestamp": 1749084984 +} +``` -1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md -2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com) -3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目 -4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目 -5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) -6. Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) diff --git a/sysTrace-1.0.tar.gz b/sysTrace-1.0.tar.gz deleted file mode 100644 index 0a2142a72e0b32a51d760a6c4f073388fc8a3e8f..0000000000000000000000000000000000000000 Binary files a/sysTrace-1.0.tar.gz and /dev/null differ diff --git a/sysTrace-1.1.tar.gz b/sysTrace-1.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..5c1e2ee26e4321edb8cf0dfd501056c0ce1f7235 Binary files /dev/null and b/sysTrace-1.1.tar.gz differ diff --git a/sysTrace.spec b/sysTrace.spec index a4f217f549ed58c1a6f41237a8408d2507659e52..440d5a582701a5e9502db3cbe8228abfa813afef 100644 --- a/sysTrace.spec +++ b/sysTrace.spec @@ -1,18 +1,18 @@ %global debug_package %{nil} %global _enable_debug_packages 0 Name: sysTrace -Version: 1.0 -Release: 1%{?dist} +Version: 1.1 +Release: 5 Summary: System Tracing Library with Fail Slow Detection License: GPLv2 AND MulanPSL2 URL: https://gitee.com/src-openeuler/sysTrace Source0: %{name}-%{version}.tar.gz + BuildRequires: gcc-c++ cmake make -BuildRequires: python3-devel python3-setuptools -BuildRequires: boost-devel abseil-cpp-devel openssl-devel -BuildRequires: protobuf-devel protobuf-compiler jsoncpp-devel protobuf-c protobuf-c-devel -BuildRequires: libunwind-devel +BuildRequires: python3-devel +BuildRequires: libbpf libbpf-devel clang bpftool +BuildRequires: 
protobuf-devel protobuf-compiler protobuf-c protobuf-c-devel %description sysTrace is a system tracing library that provides low-level system @@ -21,37 +21,64 @@ with fail slow detection module. %package -n sysTrace-failslow Summary: Fail Slow Detection for AI Model Training and Inference -Requires: python3-pyyaml python3-joblib python3-numpy -Requires: python3-matplotlib python3-pandas python3-scikit-learn +Requires: python3-pyyaml python3-joblib python3-numpy python3-setuptools +Requires: python3-pandas python3-scikit-learn Requires: procps-ng %description -n sysTrace-failslow Fail slow detection module for A-Ops project, providing detection capabilities for AI model training and inference scenarios. +%package -n sysTrace-mcpserver +Summary: Fail Slow Detection for AI Model Training and Inference By MCP +Requires: %{name}-failslow = %{version}-%{release} +Requires: python3-pyyaml python3-joblib python3-numpy python3-mcp python3-fastapi +Requires: python3-matplotlib python3-pandas python3-scikit-learn +Requires: procps-ng + +%description -n sysTrace-mcpserver +The slow failure detection module's MCP function of the A-Ops project + %prep %setup -q -n %{name}-%{version} %build -cd sysTrace +cd systrace bash build.sh cd .. -cd sysTrace-failslow +cd failslow +%py3_build +cd .. + +cd systrace_mcp %py3_build cd .. 
%install install -d -m 755 %{buildroot}%{_libdir} -install -m 755 sysTrace/build/libsysTrace.so %{buildroot}%{_libdir}/ +install -d -m 755 %{buildroot}%{_bindir} install -d -m 755 %{buildroot}%{_sysconfdir}/systrace/config -install -m 644 sysTrace-failslow/config/*.json %{buildroot}%{_sysconfdir}/systrace/config/ -cd sysTrace-failslow +install -m 755 systrace/build/libsysTrace.so %{buildroot}%{_libdir}/ +install -m 755 systrace/build/sysTrace_cli %{buildroot}%{_bindir}/ +install -m 755 systrace/config/PyFuncList %{buildroot}%{_sysconfdir}/systrace/config + +install -m 644 failslow/config/*.json %{buildroot}%{_sysconfdir}/systrace/config/ +install -m 644 systrace_mcp/config/*.json %{buildroot}%{_sysconfdir}/systrace/config/ + +cd failslow %py3_install cd .. +cd systrace_mcp +%py3_install +cd ../.. + +install -d -m 750 %{buildroot}/var/lib/systrace +install -d -m 750 %{buildroot}/var/lib/systrace/mcp + %pre -n sysTrace-failslow getent group systrace >/dev/null || groupadd -r systrace getent passwd systrace >/dev/null || \ @@ -60,24 +87,69 @@ getent passwd systrace >/dev/null || \ %post -n sysTrace-failslow %systemd_post systrace-failslow.service +chown -R root:systrace %{_sysconfdir}/systrace/config +chmod 750 %{_sysconfdir}/systrace/config +chown -R root:systrace /var/lib/systrace +chmod 750 /var/lib/systrace + +%post -n sysTrace-mcpserver +%systemd_post systrace-mcpserver.service +chown -R root:systrace %{_sysconfdir}/systrace/config +chmod 750 %{_sysconfdir}/systrace/config +chown -R root:systrace /var/lib/systrace/mcp +chmod 750 /var/lib/systrace/mcp %preun -n sysTrace-failslow %systemd_preun systrace-failslow.service +%preun -n sysTrace-mcpserver +%systemd_preun systrace-mcpserver.service + %postun -n sysTrace-failslow -%systemd_postun_with_restart sysTrace-failslow.service +%systemd_postun_with_restart systrace-failslow.service + +%postun -n sysTrace-mcpserver +%systemd_postun_with_restart systrace-mcpserver.service %files %{_libdir}/libsysTrace.so 
+%{_bindir}/sysTrace_cli +%{_sysconfdir}/systrace/config/PyFuncList %files -n sysTrace-failslow -%doc sysTrace-failslow/README.md +%doc failslow/README.md %{_bindir}/systrace-failslow %config(noreplace) %{_sysconfdir}/systrace/config/*.json %{python3_sitelib}/failslow/ %{python3_sitelib}/systrace_failslow-*.egg-info %{_unitdir}/systrace-failslow.service +%dir %attr(750, root, systrace) /var/lib/systrace + +%files -n sysTrace-mcpserver +%doc systrace_mcp/README.md +%{_bindir}/systrace-mcpserver +%{_bindir}/systrace-openapi +%config(noreplace) %{_sysconfdir}/systrace/config/*.json +%{python3_sitelib}/systrace_mcp/ +%{python3_sitelib}/systrace_mcp-*.egg-info +%{_unitdir}/systrace-mcpserver.service +%dir %attr(750, root, systrace) /var/lib/systrace/mcp %changelog +* Mon Aug 25 2025 houxu - 1.1-5 +- update readme.md + +* Mon Aug 18 2025 houxu - 1.1-4 +- add sysTrace_cli + +* Fri Aug 15 2025 houxu - 1.1-3 +- update fail_slow_detection + +* Thu Aug 14 2025 houxu - 1.1-2 +- update mcp parameter + +* Mon Aug 11 2025 houxu - 1.1-1 +- Add sysTrace-mcpserver subpackage with MCP integration + * Wed May 14 2025 liwei <1289113577@qq.com> - 1.0-1 - Initial package for sysTrace