diff --git a/sysom_server/sysom_diagnosis/service_scripts/delay_post.py b/sysom_server/sysom_diagnosis/service_scripts/delay_post.py
new file mode 100644
index 0000000000000000000000000000000000000000..27f87ce6b14950e32df62e3de6590e9b86d644df
--- /dev/null
+++ b/sysom_server/sysom_diagnosis/service_scripts/delay_post.py
@@ -0,0 +1,263 @@
+from abc import abstractmethod
+from typing import List, Union
+from uuid import uuid4
+from .base import DiagnosisJobResult, DiagnosisPostProcessor, PostProcessResult
+import json
+from datetime import datetime
+import traceback
+
+
+class DelayDetail:
+    """One detected jitter event, rendered as one row of the detail table."""
+
+    def __init__(
+        self,
+        name: str,
+        ts: str,
+        delay_ms: str,
+        victim="N/A",
+        info="N/A",
+        other="N/A",
+        fix_suggestion="N/A",
+    ) -> None:
+        """Create one delay detail
+
+        Args:
+            name (str): The type name of jitter, eg. 调度
+            ts (str): The timestamp of jitter, eg. 2024-01-01 12:00:00
+            delay_ms (str): The jitter duration, eg. 100
+            victim (str, optional): The victim of the jitter, eg. task/container/pod info...
+            info (str, optional): The jitter scene, eg. for nosched it is the calltrace.
+            other (str, optional): The additional info, eg. for nosched it is the cpuid and the kernel boot timestamp.
+            fix_suggestion (str, optional): The fix suggestion, eg. for nosched it is the history case which matches the calltrace in `info`.
+        """
+        self.name = name
+        self.ts = ts
+        self.delay_ms = delay_ms
+        self.victim = victim
+        self.info = info
+        self.other = other
+        self.fix_suggestion = fix_suggestion
+
+    def to_dict(self):
+        """Return the detail as a table row; `key` is a fresh UUID on every call."""
+        return {
+            "key": str(uuid4()),
+            "类型": self.name,
+            "时间戳": self.ts,
+            "延迟(ms)": self.delay_ms,
+            "受害者": self.victim,
+            "现场信息": self.info,
+            "其他": self.other,
+            "修复建议": self.fix_suggestion,
+        }
+
+
+class DelayDiagnoseResult:
+    def __init__(self, name="N/A") -> None:
+        """The delay diagnose result template
+
+        Args:
+            name (str, optional): The type name of jitter, eg. 调度
+        """
+        self.name = name
+        self.summary = "N/A"
+        self.suggestion = "N/A"
+        self.details = []
+
+
+class DelayDiagnose:
+    """The delay diagnose template, all subsystem diagnose should inherit this class and implement `diagnose`"""
+
+    def __init__(self, name, metrics=None) -> None:
+        """
+
+        Args:
+            name (_type_): The type name of jitter eg. 调度
+            metrics (Any, optional): The data that needed by the diagnose, this can be any type.
+        """
+        self.name = name
+        self.metrics = metrics
+        self.result = DelayDiagnoseResult(name)
+
+    def set_summary(self, summary: str) -> None:
+        self.result.summary = summary
+
+    def set_suggestion(self, suggestion: str) -> None:
+        self.result.suggestion = suggestion
+
+    def add_detail(self, detail: DelayDetail) -> None:
+        self.result.details.append(detail)
+
+    # NOTE: this class does not use ABCMeta, so @abstractmethod is not enforced
+    # at instantiation time; the name must match the one subclasses override.
+    @abstractmethod
+    def diagnose(self) -> DelayDiagnoseResult:
+        """Must be implemented by subclass, do the diagnose and return the result
+
+        Returns:
+            DelayDiagnoseResult: contain the summary, suggestion and details
+        """
+        return self.result
+
+
+class SchedDelayDiagnose(DelayDiagnose):
+    """Diagnose scheduling jitter from the JSON text passed as `metrics`."""
+
+    def __init__(self, name="调度", metrics=None) -> None:
+        super().__init__(name, metrics)
+
+    def _build_stack(self, stack: str):
+        """Turn a comma separated call stack into one function per line."""
+        functions = stack.split(",")
+        return "\n".join(functions)
+
+    def _calltrace_match(self, calltrace: str) -> str:
+        """Return the fix suggestion of the first known symbol found in `calltrace`."""
+        if (
+            calltrace.find("memcg_stat_show") != -1
+            or calltrace.find("memcg_numa_stat_show") != -1
+        ):
+            return "疑似访问/sys/fs/cgroup/memory目录下的memory.stat或者memory.numa_stat耗时太长导致, 进一步修复/排查建议:\n1) 检查cgroup内存子系统的cgroup数量是否过多: cat /proc/cgroups | grep memory | awk '{print $3}'\n2) 减少或者禁止访问该文件: 推荐使用fanotifywait工具确认访问线程\n3) 更多信息参考链接: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c03914b7aa319fb2b6701a6427c13752c7418b9b, 如果是Alinux2(4.19.91-xxxx.al7.x86_64)可以尝试使用热补丁9329054"
+        elif calltrace.find("read_kcore") != -1:
+            return "疑似访问/proc/kcore文件耗时太久导致, 可以减少或者禁止访问该文件, 推荐使用fanotifywait工具确认访问线程。"
+        elif calltrace.find("estimation_timer") != -1:
+            return "疑似 IPVS的定时器函数estimation_timer在TIMER软中断中执行耗时太长, 进一步排查/修复建议:\n1) 如果使用的是Alinux3 (5.10.xxx-xx.al8.x86_64)内核尝试关闭该操作: sysctl net.ipv4.vs.run_estimation=0\n2) 更多信息参考链接: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2232642ec3fb4aad6ae4da1e109f55a0e7f2d204"
+        elif (
+            calltrace.find("__free_pages_ok") != -1
+            or calltrace.find("__alloc_pages_nodemask") != -1
+        ):
+            return "疑似zone->lock锁被其他线程持有时间过长, 进一步修复/排查建议: \n1) 可能是大量线程同时访问/proc/pagetypeinfo文件导致,推荐使用irqoff工具诊断进行确认。\n解决方式一: 减少或者禁止访问该文件, 推荐使用fanotifywait工具确认访问线程。\n解决方式二(!!警告!!该操作可能会影响性能): 创建一个周期任务在业务低谷期间执行`echo 1 > /proc/sys/vm/compact`手动触发内存规整减少内存碎片从而减少访问耗时。\n2) 参考链接: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=93b3a674485f6a4b8ffff85d1682d5e8b7c51560"
+        elif calltrace.find("count_partial") != -1:
+            return "疑似访问/proc/slab文件耗时太久, 可能是kmem_cache_node->partial链表过长导致。进一步修复/排查建议: \n1) 减少或者禁止访问该文件, 推荐使用fanotifywait工具确认访问线程。\n2) 通过回收slab缓存减少访问耗时"
+        elif calltrace.find("flush_tlb_mm_range") != -1:
+            return "疑似是内核执行flush_tlb_mm_range耗时太久, 有可能是等待其他的CPU响应导致"
+        elif calltrace.find("cpuinfo_open") != -1:
+            return "疑似是访问/proc/cpuinfo耗时太长, 进一步修复/排查建议: 减少或者禁止访问该文件, 推荐使用fanotifywait工具确认访问线程"
+        elif calltrace.find("multi_cpu_stop") != -1:
+            return "疑似是内核线程[migration/x]执行耗时太长,进一步修复/排查建议: \n1) 确认是否机器是NUMA架构\n2)请检查是否开启了NUMA负载均衡: cat /proc/sys/kernel/numa_balancing"
+        else:
+            return "暂无修复建议, 请寻求内核支持"
+
+    def _parse_logs(self, content: str, time: str):
+        """Parse one log record and add a detail for every well-formed line.
+
+        Each line is expected to hold `;` separated `name:value` pairs that
+        provide `task`, `cpu`, `delay`, `stamp` and `callstack`.
+
+        Args:
+            content (str): The raw log text, one jitter event per line; may be None or empty.
+            time (str): The formatted timestamp shared by all events of this record.
+        """
+        if not content:
+            # The record carries no "log" text, there is nothing to parse.
+            return
+        for log in content.split("\n"):
+            if not log.strip():
+                # Skip blank lines, eg. the one produced by a trailing newline.
+                continue
+            # Guard each line separately so that one malformed line does not
+            # discard the well-formed lines that follow it.
+            try:
+                attrs = log.split(";", maxsplit=4)
+                delay_info = {}
+                for attr in attrs:
+                    attr_name, attr_val = attr.split(":", maxsplit=1)
+                    delay_info[attr_name] = attr_val
+                # task cpu delay stamp callstack
+                self.add_detail(
+                    DelayDetail(
+                        name=self.name,
+                        ts=time,
+                        delay_ms=delay_info["delay"],
+                        victim=delay_info["task"],
+                        info=self._build_stack(delay_info["callstack"]),
+                        other=f"cpu={delay_info['cpu']}\nstamp={delay_info['stamp']}\n",
+                        fix_suggestion=self._calltrace_match(delay_info["callstack"]),
+                    )
+                )
+            except Exception as e:
+                print(f"parse_logs error, line={log}, time={time}, error={e}")
+
+    def _generate_summary(self):
+        """Return the summary text according to the number of details."""
+        if len(self.result.details) == 0:
+            return "未检测到抖动"
+        else:
+            return f"检测到{len(self.result.details)}个抖动"
+
+    def _generate_suggestion(self):
+        """Return the overall suggestion text."""
+        if len(self.result.details) == 0:
+            return "无修复建议"
+        return "请查看抖动详情"
+
+    def diagnose(self) -> DelayDiagnoseResult:
+        """Parse the JSON `metrics` and build the summary, suggestion and details.
+
+        Returns:
+            DelayDiagnoseResult: The result of the scheduling jitter diagnose.
+        """
+        self.metrics = json.loads(self.metrics)
+        for log in self.metrics:
+            # `time` is divided by 1000000 before use, ie. treated as microseconds.
+            time = datetime.fromtimestamp(int(log["time"]) / 1000000).strftime(
+                "%Y-%m-%d %H:%M:%S"
+            )
+            content = log["logs"].get("log", None)
+            self._parse_logs(content, time)
+        self.set_suggestion(self._generate_suggestion())
+        self.set_summary(self._generate_summary())
+        return self.result
+
+
+class PostProcessor(DiagnosisPostProcessor):
+    """Build the `overview` and `delay-detail` data from the job output."""
+
+    def build_summary(self, results: List[DelayDiagnoseResult]):
+        """Render summary and suggestion of every diagnose as markdown."""
+        overview = "## 诊断结果\n\n"
+        for res in results:
+            overview += f"### {res.name}抖动\n\n"
+            overview += f"诊断结果: {res.summary}\n\n"
+            overview += f"**修复建议**: {res.suggestion}\n\n"
+
+        return {"data": overview}
+
+    def build_detail_table(self, results: List[DelayDiagnoseResult]):
+        """Flatten the details of every diagnose into table rows."""
+        details = []
+        for res in results:
+            for detail in res.details:
+                details.append(detail.to_dict())
+        return {"data": details}
+
+    def parse_diagnosis_result(
+        self, results: List[DiagnosisJobResult]
+    ) -> PostProcessResult:
+        """Run every delay diagnose on the stdout of the first job.
+
+        Any exception is reported as `code=1` together with its traceback.
+        """
+        try:
+            nosched_log = results[0].stdout
+            diagnoses = [
+                SchedDelayDiagnose(metrics=nosched_log),
+                # Add your `class DelayDiagnose` here
+            ]
+            results = [diag.diagnose() for diag in diagnoses]
+            return PostProcessResult(
+                code=0,
+                err_msg="",
+                result={
+                    "summary": "N/A",
+                    "overview": self.build_summary(results),
+                    "delay-detail": self.build_detail_table(results),
+                },
+            )
+        except Exception as e:
+            return PostProcessResult(
+                code=1,
+                err_msg=f"error: {e} \ntraceback: {traceback.format_exc()}\n",
+                result={},
+            )
diff --git a/sysom_server/sysom_diagnosis/service_scripts/delay_pre.py b/sysom_server/sysom_diagnosis/service_scripts/delay_pre.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cfbab1ff95100e1e1cdb8628913bcb4ab894ee9
--- /dev/null
+++ b/sysom_server/sysom_diagnosis/service_scripts/delay_pre.py
@@ -0,0 +1,51 @@
+from .base import DiagnosisJob, DiagnosisPreProcessor, DiagnosisTask
+from datetime import datetime, timedelta
+
+
+class PreProcessor(DiagnosisPreProcessor):
+    """Command diagnosis
+
+    Just invoke command in target instance and get stdout result
+    """
+
+    def get_nosched_log_command(self, end: str):
+        """Build the curl command that queries the last 10 minutes of `nosched_log`.
+
+        Args:
+            end (str): The end of the query window, formatted as `%Y-%m-%d %H:%M:%S`.
+
+        Returns:
+            str: The command line, its query window is [end - 10 minutes, end].
+
+        Raises:
+            ValueError: If `end` does not match the expected format.
+        """
+        start = datetime.strptime(end, "%Y-%m-%d %H:%M:%S") - timedelta(minutes=10)
+        return 'curl --silent --header "Content-Type: application/json" --request POST --data \'{{"mode":"date","start":"{}","stop":"{}","tz":0,"table":["nosched_log"]}}\' http://localhost:8400/api/query'.format(
+            start, end
+        )
+
+    def get_diagnosis_cmds(self, params: dict) -> DiagnosisTask:
+        """Build the diagnosis task from the form parameters.
+
+        Args:
+            params (dict): `instance` is the target host, anything after the first `:`
+                is dropped; `moment` is the end of the query window, defaults to now.
+
+        Returns:
+            DiagnosisTask: A task with a single job running the query command.
+        """
+        instance = params.get("instance", "")
+        # str.find returns -1 (not 1) when ":" is absent.
+        if instance.find(":") != -1:
+            instance = instance.split(":")[0]
+        moment = params.get("moment", "")
+        if not moment:
+            moment = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        cmd = self.get_nosched_log_command(moment)
+        print(cmd)
+        return DiagnosisTask(
+            offline_mode=False,
+            jobs=[DiagnosisJob(instance=instance, cmd=cmd)],
+            in_order=False,
+        )
diff --git a/sysom_web/cypress/e2e/diagnosis/delay.cy.js b/sysom_web/cypress/e2e/diagnosis/delay.cy.js
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff7197d287a3e623edd69f9298600a203b714f
--- /dev/null
+++ b/sysom_web/cypress/e2e/diagnosis/delay.cy.js
@@ -0,0 +1,35 @@
+/// <reference types="cypress" />
+
+describe('SysOM Diagnosis Test -- Delay', () => {
+    beforeEach(() => {
+        // Log in automatically
+        cy.login()
+    })
+    it('Invoke delay diagnosis, and check result', () => {
+        cy.sysomDiagnosisCheck(
+            // Frontend URL of the diagnosis
+            '/diagnose/delay',
+            // Requires the unity service to be running on the node
+            // Diagnosis parameters
+            {
+                'instance': '127.0.0.1',
+                'moment': '2024-05-30 12:00:00',
+            },
+
+            // Diagnosis result handler (check here whether the result data is as expected)
+            (result) => {
+                cy.diagnosisTaskResultHandler(result, () => {
+                    cy.get('.ant-pro-card').eq(3).contains('诊断结果汇总')
+                    cy.get('.ant-pro-card')
+                        .eq(3)
+                        .find('.ant-pro-card-body')
+                        .contains('诊断结果')
+                    cy.get('.ant-pro-card')
+                        .eq(3)
+                        .find('.ant-pro-card-body')
+                        .contains('修复建议')
+                    cy.get('.ant-pro-card').eq(4).contains('抖动详情')
+                })
+            })
+    })
+})
diff --git a/sysom_web/public/resource/diagnose/v2/delay.json b/sysom_web/public/resource/diagnose/v2/delay.json
new file mode 100644
index 0000000000000000000000000000000000000000..8082fa35a20400a2c784ca9390ab35b6c9a46007
--- /dev/null
+++ b/sysom_web/public/resource/diagnose/v2/delay.json
@@ -0,0 +1,42 @@
+{
+    "servicename": "delay",
+    "version": 1.0,
+    "taskform": [
+        {
+            "type": "select_host",
+            "name": "instance",
+            "initialValue": "",
+            "label": "实例IP",
+            "tooltips": "请输入你要诊断的IP"
+        },
+        {
+            "type": "text",
+            "name": "moment",
+            "initialValue": "",
+            "label": "时刻",
+            "tooltips": "我们将对该时刻的前10分钟内的历史数据进行诊断,不填写则对当前时刻进行诊断。时间字符串格式参考 '2024-01-01 00:00:00'"
+        }
+    ],
+    "variables": [],
+    "pannels": [
+        {
+            "key": "overview",
+            "type": "markdown",
+            "title": "诊断结果汇总",
+            "datasource": "overview"
+        },
+        {
+            "key": "delay-detail",
+            "type": "table",
+            "title": "抖动详情",
+            "datasource": "delay-detail",
+            "tooltips": "抖动的现场详细信息",
+            "tableConfig": {
+                "enableSortColumn": [
+                    "类型",
+                    "延迟(ms)"
+                ]
+            }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/sysom_web/public/resource/diagnose/v2/locales.json b/sysom_web/public/resource/diagnose/v2/locales.json
index 9d01653b9029e1891eed4e1240eaee22373ee896..95e7e4cd207a75a5fe5375ae616d85740da175bc 100644
--- a/sysom_web/public/resource/diagnose/v2/locales.json
+++ b/sysom_web/public/resource/diagnose/v2/locales.json
@@ -2,6 +2,7 @@
     "version": 1.0,
     "menus": [
         "menu.diagnose.ossre",
+        "menu.diagnose.delay",
         "menu.diagnose.memory.memgraph",
         "menu.diagnose.memory.filecache",
         "menu.diagnose.memory.oomcheck",
@@ -26,6 +27,7 @@
     "locales": {
         "zh-CN": {
             "menu.diagnose.ossre":"系统健康检查",
+            "menu.diagnose.delay": "抖动延时诊断",
             "menu.diagnose.memory": "内存诊断中心",
             "menu.diagnose.storage": "存储诊断中心",
             "menu.diagnose.net": "网络诊断中心",
@@ -62,6 +64,7 @@
         },
         "en-US": {
             "menu.diagnose.ossre":"System Diagnosis",
+            "menu.diagnose.delay": "Jitter & Delay Diagnosis",
             "menu.diagnose.memory": "Memory Diagnosis Center",
             "menu.diagnose.storage": "Storage Diagnosis Center",
             "menu.diagnose.net": "Network Diagnosis Center",