diff --git a/Fix-the-problem-that-function-cpu_report_result-is-c.patch b/Fix-the-problem-that-function-cpu_report_result-is-c.patch deleted file mode 100644 index 7a465900634e4df964ec6e1ca1b0dacb39a286f9..0000000000000000000000000000000000000000 --- a/Fix-the-problem-that-function-cpu_report_result-is-c.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 3e2721852ad1f8047ad219a5ab6c68fd4c9d6f5c Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Wed, 24 Jul 2024 16:17:54 +0800 -Subject: [PATCH] Fix the problem that function cpu_report_result() is called - more than once - -when task is running, user to exec "sentryctl stop cpu_sentry", cpu_report_result() will be called twice. This will cause the log to be printed twice ---- - src/python/syssentry/cpu_sentry.py | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py -index 7e77654..3c4d58d 100644 ---- a/src/python/syssentry/cpu_sentry.py -+++ b/src/python/syssentry/cpu_sentry.py -@@ -133,6 +133,7 @@ class CpuSentry: - - result_level = self.send_result.get("result", ResultLevel.FAIL) - report_result(task_name, result_level, details) -+ self.init_send_result() - - def kill_process(signum, _f, cpu_sentry_obj): - """kill process by 'pkill -9'""" -@@ -179,6 +180,6 @@ def main(): - cpu_sentry_task.send_result["result"] = ResultLevel.FAIL - cpu_sentry_task.send_result["details"]["code"] = 1004 - cpu_sentry_task.send_result["details"]["msg"] = "run cmd [%s] raise Error" % cpu_sentry_task_cmd -- finally: - cpu_sentry_task.cpu_report_result() -- cpu_sentry_task.init_send_result() -+ else: -+ cpu_sentry_task.cpu_report_result() --- -2.27.0 - diff --git a/Remove-ANSI-escape-sequences.patch b/Remove-ANSI-escape-sequences.patch deleted file mode 100644 index 713bfc07418c8ebd9689ea57df5273bf6d7b123d..0000000000000000000000000000000000000000 --- a/Remove-ANSI-escape-sequences.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 91aa47999030503fda4935d4cc238b82d6842238 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Sun, 11 Aug 2024 18:36:23 +0800 -Subject: [PATCH] Remove ANSI escape sequences - ---- - src/python/syssentry/cpu_sentry.py | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py -index 9287e2f..99af127 100644 ---- a/src/python/syssentry/cpu_sentry.py -+++ b/src/python/syssentry/cpu_sentry.py -@@ -97,7 +97,14 @@ class CpuSentry: - if "ERROR" in stdout: - self.send_result["result"] = ResultLevel.FAIL - self.send_result["details"]["code"] = 1004 -- self.send_result["details"]["msg"] = stdout.split("\n")[0] -+ -+ # Remove ANSI escape sequences -+ error_info = stdout.split("\n")[0] -+ if error_info.startswith("\u001b"): -+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' -+ error_info = re.sub(ansi_escape, '', error_info) -+ -+ self.send_result["details"]["msg"] = error_info - return - - out_split = stdout.split("\n") --- -2.33.0 - diff --git a/add-ai-threshold-slow-io-detection-plugin.patch b/add-ai-threshold-slow-io-detection-plugin.patch deleted file mode 100644 index 6f707a4aa50ec37ee7d0fddc5e53a3aadef839db..0000000000000000000000000000000000000000 --- a/add-ai-threshold-slow-io-detection-plugin.patch +++ /dev/null @@ -1,1201 +0,0 @@ -From 3d72fa7f517e6e99af1205e965c3775dc23461f4 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Mon, 23 Sep 2024 11:03:26 +0800 -Subject: [PATCH] add ai threshold slow io detection to sysSentry - ---- - .../ai_threshold_slow_io_detection.ini | 16 ++ - .../tasks/ai_threshold_slow_io_detection.mod | 5 + - .../test_ai_threshold_slow_io_detection.py | 165 ++++++++++++++++++ - .../ai_threshold_slow_io_detection/README.md | 2 + - .../__init__.py | 0 - .../alarm_report.py | 49 ++++++ - .../config_parser.py | 141 +++++++++++++++ - .../data_access.py | 91 ++++++++++ - .../detector.py | 48 +++++ - .../ai_threshold_slow_io_detection/io_data.py | 74 ++++++++ - .../sliding_window.py | 113 ++++++++++++ - .../slow_io_detection.py | 133 ++++++++++++++ - .../threshold.py | 160 +++++++++++++++++ - .../ai_threshold_slow_io_detection/utils.py | 67 +++++++ - src/python/setup.py | 3 +- - 15 files changed, 1066 insertions(+), 1 deletion(-) - create mode 100644 config/plugins/ai_threshold_slow_io_detection.ini - create mode 100644 config/tasks/ai_threshold_slow_io_detection.mod - create mode 100644 selftest/test/test_ai_threshold_slow_io_detection.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py - create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py - -diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_threshold_slow_io_detection.ini -new file mode 100644 -index 0000000..44eb928 ---- /dev/null -+++ b/config/plugins/ai_threshold_slow_io_detection.ini -@@ -0,0 +1,16 @@ -+[common] -+absolute_threshold=40 -+slow_io_detect_frequency=1 -+log_level=info -+ -+[algorithm] -+train_data_duration=0.1 -+train_update_duration=0.02 -+algorithm_type=n_sigma -+boxplot_parameter=1.5 -+n_sigma_parameter=3 -+ -+[sliding_window] -+sliding_window_type=not_continuous -+window_size=30 -+window_minimum_threshold=6 -\ No newline at end of file -diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod -new file mode 100644 -index 0000000..2729f72 ---- /dev/null -+++ b/config/tasks/ai_threshold_slow_io_detection.mod -@@ -0,0 +1,5 @@ -+[common] -+enabled=yes -+task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection -+task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection -+type=oneshot -\ No newline at end of file -diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py -new file mode 100644 -index 0000000..c36fef5 ---- /dev/null -+++ b/selftest/test/test_ai_threshold_slow_io_detection.py -@@ -0,0 +1,165 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+import unittest -+import numpy as np -+ -+from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold -+from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow, -+ ContinuousSlidingWindow, MedianSlidingWindow) -+ -+ -+def _get_boxplot_threshold(data_list: list, parameter): -+ q1 = np.percentile(data_list, 25) -+ q3 = np.percentile(data_list, 75) -+ iqr = q3 - q1 -+ return q3 + parameter * iqr -+ -+ -+def _get_n_sigma_threshold(data_list: list, parameter): -+ mean = np.mean(data_list) -+ std = np.std(data_list) -+ return mean + parameter * std -+ -+ -+class Test(unittest.TestCase): -+ @classmethod -+ def setUpClass(cls): -+ print("UnitTest Begin...") -+ -+ @classmethod -+ def tearDownClass(cls): -+ print("UnitTest End...") -+ -+ def setUp(self): -+ print("Begin...") -+ -+ def tearDown(self): -+ print("End...") -+ -+ def test_absolute_threshold(self): -+ absolute = AbsoluteThreshold() -+ self.assertEqual(None, absolute.get_threshold()) -+ self.assertFalse(absolute.is_abnormal(5000)) -+ absolute.set_threshold(40) -+ self.assertEqual(40, absolute.get_threshold()) -+ self.assertTrue(absolute.is_abnormal(50)) -+ -+ def test_boxplot_threshold(self): -+ boxplot = BoxplotThreshold(1.5, 5, 1) -+ # 阶段1:尚未初始化 -+ self.assertEqual(None, boxplot.get_threshold()) -+ self.assertFalse(boxplot.is_abnormal(5000)) -+ # 往boxplot中插入5个元素后,会生成阈值 -+ data_list = [20, 20, 20, 30, 10] -+ for data in data_list: -+ boxplot.push_latest_data_to_queue(data) -+ # 阶段2:初始化 -+ boxplot_threshold = boxplot.get_threshold() -+ self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold) -+ self.assertTrue(boxplot.is_abnormal(5000)) -+ data_list.pop(0) -+ data_list.append(100) -+ boxplot.push_latest_data_to_queue(100) -+ # 阶段3:更新阈值 -+ boxplot_threshold = boxplot.get_threshold() -+ self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold) -+ -+ def test_n_sigma_threshold(self): -+ n_sigma = NSigmaThreshold(3, 5, 1) -+ self.assertEqual(None, n_sigma.get_threshold()) -+ self.assertFalse(n_sigma.is_abnormal(5000)) -+ data_list = [20, 20, 20, 30, 10] -+ for data in data_list: -+ n_sigma.push_latest_data_to_queue(data) -+ n_sigma_threshold = n_sigma.get_threshold() -+ self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold) -+ self.assertTrue(n_sigma.is_abnormal(5000)) -+ data_list.pop(0) -+ data_list.append(100) -+ n_sigma.push_latest_data_to_queue(100) -+ # 阶段3:更新阈值 -+ n_sigma_threshold = n_sigma.get_threshold() -+ self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold) -+ -+ def test_not_continuous_sliding_window(self): -+ not_continuous = NotContinuousSlidingWindow(5, 3) -+ boxplot_threshold = BoxplotThreshold(1.5, 10, 8) -+ boxplot_threshold.attach_observer(not_continuous) -+ data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] -+ for data in data_list1: -+ boxplot_threshold.push_latest_data_to_queue(data) -+ result = not_continuous.is_slow_io_event(data) -+ self.assertFalse(result[0]) -+ self.assertEqual(23.75, boxplot_threshold.get_threshold()) -+ boxplot_threshold.push_latest_data_to_queue(24) -+ result = not_continuous.is_slow_io_event(24) -+ self.assertFalse(result[0]) -+ boxplot_threshold.push_latest_data_to_queue(25) -+ result = not_continuous.is_slow_io_event(25) -+ self.assertTrue(result[0]) -+ data_list2 = [20, 20, 20, 20, 20, 20] -+ for data in data_list2: -+ boxplot_threshold.push_latest_data_to_queue(data) -+ result = not_continuous.is_slow_io_event(data) -+ self.assertFalse(result[0]) -+ self.assertEqual(25.625, boxplot_threshold.get_threshold()) -+ -+ def test_continuous_sliding_window(self): -+ continuous = ContinuousSlidingWindow(5, 3) -+ boxplot_threshold = BoxplotThreshold(1.5, 10, 8) -+ boxplot_threshold.attach_observer(continuous) -+ data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20] -+ for data in data_list: -+ boxplot_threshold.push_latest_data_to_queue(data) -+ result = continuous.is_slow_io_event(data) -+ self.assertFalse(result[0]) -+ self.assertEqual(23.75, boxplot_threshold.get_threshold()) -+ # 没有三个异常点 -+ self.assertFalse(continuous.is_slow_io_event(25)[0]) -+ # 不连续的三个异常点 -+ self.assertFalse(continuous.is_slow_io_event(25)[0]) -+ # 连续的三个异常点 -+ self.assertTrue(continuous.is_slow_io_event(25)[0]) -+ -+ def test_median_sliding_window(self): -+ median = MedianSlidingWindow(5, 3) -+ absolute_threshold = AbsoluteThreshold(10, 8) -+ absolute_threshold.attach_observer(median) -+ absolute_threshold.set_threshold(24.5) -+ data_list = [24, 24, 24, 25, 25] -+ for data in data_list: -+ self.assertFalse(median.is_slow_io_event(data)[0]) -+ self.assertTrue(median.is_slow_io_event(25)[0]) -+ -+ def test_parse_collect_data(self): -+ collect = { -+ "read": [1.0, 2.0, 3.0, 4.0], -+ "write": [5.0, 6.0, 7.0, 8.0], -+ "flush": [9.0, 10.0, 11.0, 12.0], -+ "discard": [13.0, 14.0, 15.0, 16.0], -+ } -+ from io_data import BaseData -+ from data_access import _get_io_stage_data -+ -+ io_data = _get_io_stage_data(collect) -+ self.assertEqual( -+ io_data.read, BaseData(latency=1.0, io_dump=2.0, io_length=3.0, iops=4.0) -+ ) -+ self.assertEqual( -+ io_data.write, BaseData(latency=5.0, io_dump=6.0, io_length=7.0, iops=8.0) -+ ) -+ self.assertEqual( -+ io_data.flush, BaseData(latency=9.0, io_dump=10.0, io_length=11.0, iops=12.0) -+ ) -+ self.assertEqual( -+ io_data.discard, BaseData(latency=13.0, io_dump=14.0, io_length=15.0, iops=16.0) -+ ) -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md -new file mode 100644 -index 0000000..f9b8388 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md -@@ -0,0 +1,2 @@ -+# slow_io_detection -+ -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py -new file mode 100644 -index 0000000..e69de29 -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py -new file mode 100644 -index 0000000..3f4f34e ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py -@@ -0,0 +1,49 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+from syssentry.result import ResultLevel, report_result -+import logging -+import json -+ -+ -+class AlarmReport: -+ TASK_NAME = "SLOW_IO_DETECTION" -+ -+ @staticmethod -+ def report_pass(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}') -+ -+ @staticmethod -+ def report_fail(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}') -+ -+ @staticmethod -+ def report_skip(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}') -+ -+ @staticmethod -+ def report_minor_alm(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}') -+ -+ @staticmethod -+ def report_major_alm(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}') -+ -+ @staticmethod -+ def report_critical_alm(info: str): -+ report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info})) -+ logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}') -+ -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py -new file mode 100644 -index 0000000..cd4e6f1 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py -@@ -0,0 +1,141 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+import configparser -+import logging -+ -+ -+class ConfigParser: -+ -+ DEFAULT_ABSOLUTE_THRESHOLD = 40 -+ DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 -+ DEFAULT_LOG_LEVEL = 'info' -+ DEFAULT_TRAIN_DATA_DURATION = 24 -+ DEFAULT_TRAIN_UPDATE_DURATION = 2 -+ DEFAULT_ALGORITHM_TYPE = 'boxplot' -+ DEFAULT_N_SIGMA_PARAMETER = 3 -+ DEFAULT_BOXPLOT_PARAMETER = 1.5 -+ DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' -+ DEFAULT_WINDOW_SIZE = 30 -+ DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 -+ -+ def __init__(self, config_file_name): -+ self.__boxplot_parameter = None -+ self.__window_minimum_threshold = None -+ self.__window_size = None -+ self.__sliding_window_type = None -+ self.__n_sigma_parameter = None -+ self.__algorithm_type = None -+ self.__train_update_duration = None -+ self.__log_level = None -+ self.__slow_io_detect_frequency = None -+ self.__absolute_threshold = None -+ self.__train_data_duration = None -+ self.__config_file_name = config_file_name -+ -+ def read_config_from_file(self): -+ -+ con = configparser.ConfigParser() -+ con.read(self.__config_file_name, encoding='utf-8') -+ -+ items_common = dict(con.items('common')) -+ items_algorithm = dict(con.items('algorithm')) -+ items_sliding_window = dict(con.items('sliding_window')) -+ -+ try: -+ self.__absolute_threshold = int(items_common.get('absolute_threshold', -+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) -+ except ValueError: -+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -+ logging.warning('absolute threshold type conversion has error, use default value.') -+ -+ try: -+ self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', -+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) -+ except ValueError: -+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ logging.warning('slow_io_detect_frequency type conversion has error, use default value.') -+ -+ self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) -+ -+ try: -+ self.__train_data_duration = float(items_algorithm.get('train_data_duration', -+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) -+ except ValueError: -+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -+ logging.warning('train_data_duration type conversion has error, use default value.') -+ -+ try: -+ self.__train_update_duration = float(items_algorithm.get('train_update_duration', -+ ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) -+ except ValueError: -+ self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -+ logging.warning('train_update_duration type conversion has error, use default value.') -+ -+ try: -+ self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) -+ except ValueError: -+ self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE -+ logging.warning('algorithmType type conversion has error, use default value.') -+ -+ if self.__algorithm_type == 'n_sigma': -+ try: -+ self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', -+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) -+ except ValueError: -+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -+ logging.warning('n_sigma_parameter type conversion has error, use default value.') -+ elif self.__algorithm_type == 'boxplot': -+ try: -+ self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', -+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) -+ except ValueError: -+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -+ logging.warning('boxplot_parameter type conversion has error, use default value.') -+ -+ self.__sliding_window_type = items_sliding_window.get('sliding_window_type', -+ ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) -+ -+ try: -+ self.__window_size = int(items_sliding_window.get('window_size', -+ ConfigParser.DEFAULT_WINDOW_SIZE)) -+ except ValueError: -+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -+ logging.warning('window_size type conversion has error, use default value.') -+ -+ try: -+ self.__window_minimum_threshold = ( -+ int(items_sliding_window.get('window_minimum_threshold', -+ ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) -+ except ValueError: -+ self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -+ logging.warning('window_minimum_threshold type conversion has error, use default value.') -+ -+ def get_slow_io_detect_frequency(self): -+ return self.__slow_io_detect_frequency -+ -+ def get_algorithm_type(self): -+ return self.__algorithm_type -+ -+ def get_sliding_window_type(self): -+ return self.__sliding_window_type -+ -+ def get_train_data_duration_and_train_update_duration(self): -+ return self.__train_data_duration, self.__train_update_duration -+ -+ def get_window_size_and_window_minimum_threshold(self): -+ return self.__window_size, self.__window_minimum_threshold -+ -+ def get_absolute_threshold(self): -+ return self.__absolute_threshold -+ -+ def get_log_level(self): -+ return self.__log_level -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py -new file mode 100644 -index 0000000..d9f3460 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py -@@ -0,0 +1,91 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+import json -+import logging -+ -+from sentryCollector.collect_plugin import ( -+ Result_Messages, -+ get_io_data, -+ is_iocollect_valid, -+) -+from .io_data import IOStageData, IOData -+ -+COLLECT_STAGES = [ -+ "throtl", -+ "wbt", -+ "gettag", -+ "plug", -+ "bfq", -+ "hctx", -+ "requeue", -+ "rq_driver", -+ "bio", -+ "iocost", -+] -+ -+def check_collect_valid(period): -+ data_raw = is_iocollect_valid(period) -+ if data_raw["ret"] == 0: -+ try: -+ data = json.loads(data_raw["message"]) -+ except Exception as e: -+ logging.warning(f"get io data failed, {e}") -+ return [] -+ return [k for k in data.keys()] -+ else: -+ return [] -+ -+ -+def _get_raw_data(period, disk_list): -+ return get_io_data( -+ period, -+ disk_list, -+ COLLECT_STAGES, -+ ["read", "write", "flush", "discard"], -+ ) -+ -+ -+def _get_io_stage_data(data): -+ io_stage_data = IOStageData() -+ for data_type in ('read', 'write', 'flush', 'discard'): -+ if data_type in data: -+ getattr(io_stage_data, data_type).latency = data[data_type][0] -+ getattr(io_stage_data, data_type).io_dump = data[data_type][1] -+ getattr(io_stage_data, data_type).io_length = data[data_type][2] -+ getattr(io_stage_data, data_type).iops = data[data_type][3] -+ return io_stage_data -+ -+ -+def get_io_data_from_collect_plug(period, disk_list): -+ data_raw = _get_raw_data(period, disk_list) -+ if data_raw["ret"] == 0: -+ ret = {} -+ try: -+ data = json.loads(data_raw["message"]) -+ except json.decoder.JSONDecodeError as e: -+ logging.warning(f"get io data failed, {e}") -+ return None -+ -+ for disk in data: -+ disk_data = data[disk] -+ disk_ret = IOData() -+ for k, v in disk_data.items(): -+ try: -+ getattr(disk_ret, k) -+ setattr(disk_ret, k, _get_io_stage_data(v)) -+ except AttributeError: -+ logging.debug(f'no attr {k}') -+ continue -+ ret[disk] = disk_ret -+ return ret -+ logging.warning(f'get io data failed with message: {data_raw["message"]}') -+ return None -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py -new file mode 100644 -index 0000000..eda9825 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py -@@ -0,0 +1,48 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+import logging -+ -+from .io_data import MetricName -+from .threshold import Threshold -+from .sliding_window import SlidingWindow -+from .utils import get_metric_value_from_io_data_dict_by_metric_name -+ -+ -+class Detector: -+ _metric_name: MetricName = None -+ _threshold: Threshold = None -+ _slidingWindow: SlidingWindow = None -+ -+ def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): -+ self._metric_name = metric_name -+ self._threshold = threshold -+ self._slidingWindow = sliding_window -+ self._threshold.attach_observer(self._slidingWindow) -+ -+ def get_metric_name(self): -+ return self._metric_name -+ -+ def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -+ logging.debug(f'Enter Detector: {self}') -+ metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) -+ if metric_value > 1e-6: -+ logging.debug(f'Input metric value: {str(metric_value)}') -+ self._threshold.push_latest_data_to_queue(metric_value) -+ detection_result = self._slidingWindow.is_slow_io_event(metric_value) -+ logging.debug(f'Detection result: {str(detection_result)}') -+ logging.debug(f'Exit Detector: {self}') -+ return detection_result -+ -+ def __repr__(self): -+ return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' -+ f' access_type_name: {self._metric_name.get_io_access_type_name()},' -+ f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' -+ f' sliding_window_type: {self._slidingWindow}') -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py -new file mode 100644 -index 0000000..0e17051 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py -@@ -0,0 +1,74 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+from dataclasses import dataclass, field -+from datetime import datetime -+from typing import Optional -+ -+ -+@dataclass -+class BaseData: -+ latency: Optional[float] = field(default_factory=lambda: None) -+ io_dump: Optional[int] = field(default_factory=lambda: None) -+ io_length: Optional[int] = field(default_factory=lambda: None) -+ iops: Optional[int] = field(default_factory=lambda: None) -+ -+ -+@dataclass -+class IOStageData: -+ read: BaseData = field(default_factory=lambda: BaseData()) -+ write: BaseData = field(default_factory=lambda: BaseData()) -+ flush: BaseData = field(default_factory=lambda: BaseData()) -+ discard: BaseData = field(default_factory=lambda: BaseData()) -+ -+ -+@dataclass -+class IOData: -+ throtl: IOStageData = field(default_factory=lambda: IOStageData()) -+ wbt: IOStageData = field(default_factory=lambda: IOStageData()) -+ gettag: IOStageData = field(default_factory=lambda: IOStageData()) -+ iocost: IOStageData = field(default_factory=lambda: IOStageData()) -+ plug: IOStageData = field(default_factory=lambda: IOStageData()) -+ bfq: IOStageData = field(default_factory=lambda: IOStageData()) -+ hctx: IOStageData = field(default_factory=lambda: IOStageData()) -+ requeue: IOStageData = field(default_factory=lambda: IOStageData()) -+ rq_driver: IOStageData = field(default_factory=lambda: IOStageData()) -+ bio: IOStageData = field(default_factory=lambda: IOStageData()) -+ time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) -+ -+ -+class MetricName: -+ _disk_name: str = None -+ _stage_name: str = None -+ _io_access_type_name: str = None -+ _metric_name: str = None -+ -+ def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str): -+ self._disk_name = disk_name -+ self._stage_name = stage_name -+ self._io_access_type_name = io_access_type_name -+ self._metric_name = metric_name -+ -+ def get_disk_name(self): -+ return self._disk_name -+ -+ def get_stage_name(self): -+ return self._stage_name -+ -+ def get_io_access_type_name(self): -+ return self._io_access_type_name -+ -+ def get_metric_name(self): -+ return self._metric_name -+ -+ def __repr__(self): -+ return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},' -+ f'metric: {self._metric_name}') -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py -new file mode 100644 -index 0000000..d395d48 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py -@@ -0,0 +1,113 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+from enum import Enum, unique -+import numpy as np -+ -+ -+@unique -+class SlidingWindowType(Enum): -+ NotContinuousSlidingWindow = 0 -+ ContinuousSlidingWindow = 1 -+ MedianSlidingWindow = 2 -+ -+ -+class SlidingWindow: -+ _ai_threshold = None -+ _queue_length = None -+ _queue_threshold = None -+ _io_data_queue: list = None -+ _io_data_queue_abnormal_tag: list = None -+ -+ def __init__(self, queue_length: int, threshold: int): -+ self._queue_length = queue_length -+ self._queue_threshold = threshold -+ self._io_data_queue = [] -+ self._io_data_queue_abnormal_tag = [] -+ -+ def push(self, data: float): -+ if len(self._io_data_queue) == self._queue_length: -+ self._io_data_queue.pop(0) -+ self._io_data_queue_abnormal_tag.pop(0) -+ self._io_data_queue.append(data) -+ self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold if self._ai_threshold is not None else False) -+ -+ def update(self, threshold): -+ if self._ai_threshold == threshold: -+ return -+ self._ai_threshold = threshold -+ self._io_data_queue_abnormal_tag.clear() -+ for data in self._io_data_queue: -+ self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold) -+ -+ def is_slow_io_event(self, data): -+ return False, None, None -+ -+ def __repr__(self): -+ return "SlidingWindow" -+ -+ -+class NotContinuousSlidingWindow(SlidingWindow): -+ def is_slow_io_event(self, data): -+ super().push(data) -+ if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -+ return False, self._io_data_queue, self._ai_threshold -+ if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: -+ return True, self._io_data_queue, self._ai_threshold -+ return False, self._io_data_queue, self._ai_threshold -+ -+ def __repr__(self): -+ return "NotContinuousSlidingWindow" -+ -+ -+class ContinuousSlidingWindow(SlidingWindow): -+ def is_slow_io_event(self, data): -+ super().push(data) -+ if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -+ return False, self._io_data_queue, self._ai_threshold -+ consecutive_count = 0 -+ for tag in self._io_data_queue_abnormal_tag: -+ if tag: -+ consecutive_count += 1 -+ if consecutive_count >= self._queue_threshold: -+ return True, self._io_data_queue, self._ai_threshold -+ else: -+ consecutive_count = 0 -+ return False, self._io_data_queue, self._ai_threshold -+ -+ def __repr__(self): -+ return "ContinuousSlidingWindow" -+ -+ -+class MedianSlidingWindow(SlidingWindow): -+ def is_slow_io_event(self, data): -+ super().push(data) -+ if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -+ return False, self._io_data_queue, self._ai_threshold -+ median = np.median(self._io_data_queue) -+ if median >= self._ai_threshold: -+ return True, self._io_data_queue, self._ai_threshold -+ return False, self._io_data_queue, self._ai_threshold -+ -+ def __repr__(self): -+ return "MedianSlidingWindow" -+ -+ -+class SlidingWindowFactory: -+ def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs): -+ if sliding_window_type == SlidingWindowType.NotContinuousSlidingWindow: -+ return NotContinuousSlidingWindow(*args, **kwargs) -+ elif sliding_window_type == SlidingWindowType.ContinuousSlidingWindow: -+ return ContinuousSlidingWindow(*args, **kwargs) -+ elif sliding_window_type == SlidingWindowType.MedianSlidingWindow: -+ return MedianSlidingWindow(*args, **kwargs) -+ else: -+ return NotContinuousSlidingWindow(*args, **kwargs) -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py -new file mode 100644 -index 0000000..43cf770 ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py -@@ -0,0 +1,133 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+import time -+import signal -+import logging -+ -+from .detector import Detector -+from .threshold import ThresholdFactory, AbsoluteThreshold -+from .sliding_window import SlidingWindowFactory -+from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size, -+ get_log_level) -+from .config_parser import ConfigParser -+from .data_access import get_io_data_from_collect_plug, check_collect_valid -+from .io_data import MetricName -+from .alarm_report import AlarmReport -+ -+CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini" -+ -+ -+def sig_handler(signum, frame): -+ logging.info("receive signal: %d", signum) -+ AlarmReport().report_fail(f"receive signal: {signum}") -+ exit(signum) -+ -+ -+class SlowIODetection: -+ _config_parser = None -+ _disk_list = None -+ _detector_name_list = [] -+ _detectors = {} -+ -+ def __init__(self, config_parser: ConfigParser): -+ self._config_parser = config_parser -+ self.__set_log_format() -+ self.__init_detector_name_list() -+ self.__init_detector() -+ -+ def __set_log_format(self): -+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -+ log_level = get_log_level(self._config_parser.get_log_level()) -+ logging.basicConfig(level=log_level, format=log_format) -+ -+ def __init_detector_name_list(self): -+ self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) -+ for disk in self._disk_list: -+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -+ -+ def __init_detector(self): -+ train_data_duration, train_update_duration = (self._config_parser. -+ get_train_data_duration_and_train_update_duration()) -+ slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() -+ threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type()) -+ data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, -+ train_update_duration, -+ slow_io_detection_frequency) -+ sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type()) -+ window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() -+ -+ for detector_name in self._detector_name_list: -+ threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size, -+ data_queue_update_size=update_size) -+ sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, -+ threshold=window_threshold) -+ detector = Detector(detector_name, threshold, sliding_window) -+ # 绝对阈值的阈值初始化 -+ if isinstance(threshold, AbsoluteThreshold): -+ threshold.set_threshold(self._config_parser.get_absolute_threshold()) -+ self._detectors[detector_name] = detector -+ logging.info(f"add detector: {detector}") -+ -+ def launch(self): -+ while True: -+ logging.debug('step0. AI threshold slow io event detection is looping.') -+ -+ # Step1:获取IO数据 -+ io_data_dict_with_disk_name = get_io_data_from_collect_plug( -+ self._config_parser.get_slow_io_detect_frequency(), self._disk_list -+ ) -+ logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') -+ if io_data_dict_with_disk_name is None: -+ continue -+ # Step2:慢IO检测 -+ logging.debug('step2. Start to detection slow io event.') -+ slow_io_event_list = [] -+ for metric_name, detector in self._detectors.items(): -+ result = detector.is_slow_io_event(io_data_dict_with_disk_name) -+ if result[0]: -+ slow_io_event_list.append((detector.get_metric_name(), result)) -+ logging.debug('step2. End to detection slow io event.') -+ -+ # Step3:慢IO事件上报 -+ logging.debug('step3. Report slow io event to sysSentry.') -+ for slow_io_event in slow_io_event_list: -+ metric_name: MetricName = slow_io_event[0] -+ result = slow_io_event[1] -+ AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event." -+ f"stage: {metric_name.get_metric_name()}," -+ f"type: {metric_name.get_io_access_type_name()}," -+ f"metric: {metric_name.get_metric_name()}," -+ f"current window: {result[1]}," -+ f"threshold: {result[2]}") -+ logging.error(f"slow io event happen: {str(slow_io_event)}") -+ -+ # Step4:等待检测时间 -+ logging.debug('step4. Wait to start next slow io event detection loop.') -+ time.sleep(self._config_parser.get_slow_io_detect_frequency()) -+ -+ -+def main(): -+ # Step1:注册消息处理函数 -+ signal.signal(signal.SIGINT, sig_handler) -+ signal.signal(signal.SIGTERM, sig_handler) -+ # Step2:断点恢复 -+ # todo: -+ -+ # Step3:读取配置 -+ config_file_name = CONFIG_FILE -+ config = ConfigParser(config_file_name) -+ config.read_config_from_file() -+ -+ # Step4:启动慢IO检测 -+ slow_io_detection = SlowIODetection(config) -+ slow_io_detection.launch() -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py -new file mode 100644 -index 0000000..9e1ca7b ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py -@@ -0,0 +1,160 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+import logging -+from enum import Enum -+import queue -+import numpy as np -+import math -+ -+from .sliding_window import SlidingWindow -+ -+ -+class ThresholdState(Enum): -+ INIT = 0 -+ START = 1 -+ -+ -+class Threshold: -+ threshold = None -+ data_queue: queue.Queue = None -+ data_queue_update_size: int = None -+ new_data_size: int = None -+ threshold_state: ThresholdState = None -+ -+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ self._observer = None -+ self.data_queue = queue.Queue(data_queue_size) -+ self.data_queue_update_size = data_queue_update_size -+ self.new_data_size = 0 -+ self.threshold_state = ThresholdState.INIT -+ self.threshold = math.inf -+ -+ def set_threshold(self, threshold): -+ self.threshold = threshold -+ self.threshold_state = ThresholdState.START -+ self.notify_observer() -+ -+ def get_threshold(self): -+ if self.threshold_state == ThresholdState.INIT: -+ return None -+ return self.threshold -+ -+ def is_abnormal(self, data): -+ if self.threshold_state == ThresholdState.INIT: -+ return False -+ return data >= self.threshold -+ -+ # 使用观察者模式,当阈值更新时,自动同步刷新滑窗中的阈值 -+ def attach_observer(self, observer: SlidingWindow): -+ self._observer = observer -+ -+ def notify_observer(self): -+ if self._observer is not None: -+ self._observer.update(self.threshold) -+ -+ def push_latest_data_to_queue(self, data): -+ pass -+ -+ def __repr__(self): -+ return "Threshold" -+ -+ -+class AbsoluteThreshold(Threshold): -+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ super().__init__(data_queue_size, data_queue_update_size) -+ -+ def push_latest_data_to_queue(self, data): -+ pass -+ -+ def __repr__(self): -+ return "AbsoluteThreshold" -+ -+ -+class BoxplotThreshold(Threshold): -+ def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ super().__init__(data_queue_size, data_queue_update_size) -+ self.parameter = parameter -+ -+ def _update_threshold(self): -+ data = list(self.data_queue.queue) -+ q1 = np.percentile(data, 25) -+ q3 = np.percentile(data, 75) -+ iqr = q3 - q1 -+ self.threshold = q3 + self.parameter * iqr -+ if self.threshold_state == ThresholdState.INIT: -+ self.threshold_state = ThresholdState.START -+ self.notify_observer() -+ -+ def push_latest_data_to_queue(self, data): -+ try: -+ self.data_queue.put(data, block=False) -+ except queue.Full: -+ self.data_queue.get() -+ self.data_queue.put(data) -+ self.new_data_size += 1 -+ if (self.data_queue.full() and (self.threshold_state == ThresholdState.INIT or -+ (self.threshold_state == ThresholdState.START and -+ self.new_data_size >= self.data_queue_update_size))): -+ self._update_threshold() -+ self.new_data_size = 0 -+ -+ def __repr__(self): -+ return "BoxplotThreshold" -+ -+ -+class NSigmaThreshold(Threshold): -+ def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ super().__init__(data_queue_size, data_queue_update_size) -+ self.parameter = parameter -+ -+ def _update_threshold(self): -+ data = list(self.data_queue.queue) -+ mean = np.mean(data) -+ std = np.std(data) -+ self.threshold = mean + self.parameter * std -+ if self.threshold_state == ThresholdState.INIT: -+ self.threshold_state = ThresholdState.START -+ self.notify_observer() -+ -+ def push_latest_data_to_queue(self, data): -+ try: -+ self.data_queue.put(data, block=False) -+ except queue.Full: -+ self.data_queue.get() -+ self.data_queue.put(data) -+ self.new_data_size += 1 -+ if (self.data_queue.full() and (self.threshold_state == ThresholdState.INIT or -+ (self.threshold_state == ThresholdState.START and -+ self.new_data_size >= self.data_queue_update_size))): -+ self._update_threshold() -+ self.new_data_size = 0 -+ -+ def __repr__(self): -+ return "NSigmaThreshold" -+ -+ -+class ThresholdType(Enum): -+ AbsoluteThreshold = 0 -+ BoxplotThreshold = 1 -+ NSigmaThreshold = 2 -+ -+ -+class ThresholdFactory: -+ def get_threshold(self, threshold_type: ThresholdType, *args, **kwargs): -+ if threshold_type == ThresholdType.AbsoluteThreshold: -+ return AbsoluteThreshold(*args, **kwargs) -+ elif threshold_type == ThresholdType.BoxplotThreshold: -+ return BoxplotThreshold(*args, **kwargs) -+ elif threshold_type == ThresholdType.NSigmaThreshold: -+ return NSigmaThreshold(*args, **kwargs) -+ else: -+ raise ValueError(f"Invalid threshold type: {threshold_type}") -+ -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py -new file mode 100644 -index 0000000..f66e5ed ---- /dev/null -+++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py -@@ -0,0 +1,67 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+import logging -+from dataclasses import asdict -+ -+from .threshold import ThresholdType -+from .sliding_window import SlidingWindowType -+from .io_data import MetricName, IOData -+ -+def get_threshold_type_enum(algorithm_type: str): -+ if algorithm_type.lower() == 'absolute': -+ return ThresholdType.AbsoluteThreshold -+ if algorithm_type.lower() == 'boxplot': -+ return ThresholdType.BoxplotThreshold -+ if algorithm_type.lower() == 'n_sigma': -+ return ThresholdType.NSigmaThreshold -+ logging.info('not found correct algorithm type, use default: boxplot.') -+ return ThresholdType.BoxplotThreshold -+ -+ -+def get_sliding_window_type_enum(sliding_window_type: str): -+ if sliding_window_type.lower() == 'not_continuous': -+ return SlidingWindowType.NotContinuousSlidingWindow -+ if sliding_window_type.lower() == 'continuous': -+ return SlidingWindowType.ContinuousSlidingWindow -+ if sliding_window_type.lower() == 'median': -+ return SlidingWindowType.MedianSlidingWindow -+ logging.info('not found correct sliding window type, use default: not_continuous.') -+ return SlidingWindowType.NotContinuousSlidingWindow -+ -+ -+def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: MetricName): -+ try: -+ io_data: IOData = io_data_dict[metric_name.get_disk_name()] -+ io_stage_data = asdict(io_data)[metric_name.get_stage_name()] -+ base_data = io_stage_data[metric_name.get_io_access_type_name()] -+ metric_value = base_data[metric_name.get_metric_name()] -+ return metric_value -+ except KeyError: -+ return None -+ -+ -+def get_data_queue_size_and_update_size(training_data_duration: float, train_update_duration: float, -+ slow_io_detect_frequency: int): -+ data_queue_size = int(training_data_duration * 60 * 60 / slow_io_detect_frequency) -+ update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency) -+ return data_queue_size, update_size -+ -+ -+def get_log_level(log_level: str): -+ if log_level.lower() == 'debug': -+ return logging.DEBUG -+ elif log_level.lower() == 'info': -+ return logging.INFO -+ elif log_level.lower() == 'warning': -+ return logging.WARNING -+ elif log_level.lower() == 'fatal': -+ return logging.FATAL -+ return None -diff --git a/src/python/setup.py b/src/python/setup.py -index c28c691..dac6481 100644 ---- a/src/python/setup.py -+++ b/src/python/setup.py -@@ -33,7 +33,8 @@ setup( - 'syssentry=syssentry.syssentry:main', - 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', - 'sentryCollector=sentryCollector.collectd:main', -- 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' -+ 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main', -+ 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main' - ] - }, - ) --- -2.23.0 - diff --git a/add-boundary-check-for-settings.patch b/add-boundary-check-for-settings.patch deleted file mode 100644 index 05184c04e5dc619ca0afb6d0f6ec3621933b579f..0000000000000000000000000000000000000000 --- a/add-boundary-check-for-settings.patch +++ /dev/null @@ -1,39 +0,0 @@ -From abf36bf0351efde388c089245aed9f6d8d2e6d3b Mon Sep 17 00:00:00 2001 -From: luckky -Date: Wed, 6 Nov 2024 11:42:53 +0800 -Subject: [PATCH] add boundary check for settings -1. add two boundary checks for page_isolation_threshold and hbm_online_repair_log_level -(0 <= page_isolation_threshold) -(0(LOG_DEBUG) <= hbm_online_repair_log_level <= 3(LOG_ERROR)) - ---- - src/c/hbm_online_repair/hbm_online_repair.c | 6 ++++++ - 1 file changed, 6 insertions(+) - -diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c -index 943f201..00c9c0b 100644 ---- a/src/c/hbm_online_repair/hbm_online_repair.c -+++ b/src/c/hbm_online_repair/hbm_online_repair.c -@@ -89,6 +89,9 @@ void hbm_param_init(void) - if (ret < 0) { - global_level_setting = DEFAULT_LOG_LEVEL; - log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); -+ } else if (global_level_setting < LOG_DEBUG || global_level_setting > LOG_ERROR) { -+ log(LOG_WARNING, "The log level value %d in config is out of range, set the default value %d\n", global_level_setting, DEFAULT_LOG_LEVEL); -+ global_level_setting = DEFAULT_LOG_LEVEL; - } else { - log(LOG_INFO, "log level: %d\n", global_level_setting); - } -@@ -98,6 +101,9 @@ void hbm_param_init(void) - if (ret < 0) { - page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; - log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); -+ } else if (page_isolation_threshold < 0) { -+ log(LOG_WARNING, "The page_isolation_threshold %d in config is out of range, set the default value %d\n", page_isolation_threshold, DEFAULT_PAGE_ISOLATION_THRESHOLD); -+ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; - } else { - log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); - } --- -2.43.0 - diff --git a/add-collect-module-to-sysSentry.patch b/add-collect-module-to-sysSentry.patch deleted file mode 100644 index f8fa667f33e78eb8b22bd3207d26bbbaf39e28c7..0000000000000000000000000000000000000000 --- a/add-collect-module-to-sysSentry.patch +++ /dev/null @@ -1,1165 +0,0 @@ -From bd32dc01000126d593c188d47404cfdbe1df343e Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Thu, 12 Sep 2024 11:29:01 +0800 -Subject: [PATCH 1/2] add collect module to sysSentry - ---- - config/collector.conf | 7 + - service/sentryCollector.service | 12 + - service/sysSentry.service | 2 +- - src/python/sentryCollector/__init__.py | 0 - src/python/sentryCollector/__main__.py | 17 ++ - src/python/sentryCollector/collect_config.py | 118 ++++++++ - src/python/sentryCollector/collect_io.py | 239 ++++++++++++++++ - src/python/sentryCollector/collect_plugin.py | 276 ++++++++++++++++++ - src/python/sentryCollector/collect_server.py | 285 +++++++++++++++++++ - src/python/sentryCollector/collectd.py | 99 +++++++ - src/python/setup.py | 4 +- - 11 files changed, 1057 insertions(+), 2 deletions(-) - create mode 100644 config/collector.conf - create mode 100644 service/sentryCollector.service - create mode 100644 src/python/sentryCollector/__init__.py - create mode 100644 src/python/sentryCollector/__main__.py - create mode 100644 src/python/sentryCollector/collect_config.py - create mode 100644 src/python/sentryCollector/collect_io.py - create mode 100644 src/python/sentryCollector/collect_plugin.py - create mode 100644 src/python/sentryCollector/collect_server.py - create mode 100644 src/python/sentryCollector/collectd.py - -diff --git a/config/collector.conf b/config/collector.conf -new file mode 100644 -index 0000000..9baa086 ---- /dev/null -+++ b/config/collector.conf -@@ -0,0 +1,7 @@ -+[common] -+modules=io -+ -+[io] -+period_time=1 -+max_save=10 -+disk=default -\ No newline at end of file -diff --git a/service/sentryCollector.service b/service/sentryCollector.service -new file mode 100644 -index 0000000..4ee07d5 ---- /dev/null -+++ b/service/sentryCollector.service -@@ -0,0 +1,12 @@ -+[Unit] -+Description = Collection module added for sysSentry and kernel lock-free collection -+ -+[Service] -+ExecStart=/usr/bin/python3 /usr/bin/sentryCollector -+ExecStop=/bin/kill $MAINPID -+KillMode=process -+Restart=on-failure -+RestartSec=10s -+ -+[Install] -+WantedBy = multi-user.target -diff --git a/service/sysSentry.service b/service/sysSentry.service -index 4d85a6c..1d8338f 100644 ---- a/service/sysSentry.service -+++ b/service/sysSentry.service -@@ -2,7 +2,7 @@ - Description=EulerOS System Inspection Frame - - [Service] --ExecStart=/usr/bin/syssentry -+ExecStart=/usr/bin/python3 /usr/bin/syssentry - ExecStop=/bin/kill $MAINPID - KillMode=process - Restart=on-failure -diff --git a/src/python/sentryCollector/__init__.py b/src/python/sentryCollector/__init__.py -new file mode 100644 -index 0000000..e69de29 -diff --git a/src/python/sentryCollector/__main__.py b/src/python/sentryCollector/__main__.py -new file mode 100644 -index 0000000..9c2ae50 ---- /dev/null -+++ b/src/python/sentryCollector/__main__.py -@@ -0,0 +1,17 @@ -+# coding: utf-8 -+# Copyright (c) 2023 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+main -+""" -+from collectd import collectd -+ -+collectd.main() -diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py -new file mode 100644 -index 0000000..b6cc75c ---- /dev/null -+++ b/src/python/sentryCollector/collect_config.py -@@ -0,0 +1,118 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+Read and save collector.conf value. -+""" -+import configparser -+import logging -+import os -+import re -+ -+ -+COLLECT_CONF_PATH = "/etc/sysSentry/collector.conf" -+ -+CONF_COMMON = 'common' -+CONF_MODULES = 'modules' -+ -+# io -+CONF_IO = 'io' -+CONF_IO_PERIOD_TIME = 'period_time' -+CONF_IO_MAX_SAVE = 'max_save' -+CONF_IO_DISK = 'disk' -+CONF_IO_PERIOD_TIME_DEFAULT = 1 -+CONF_IO_MAX_SAVE_DEFAULT = 10 -+CONF_IO_DISK_DEFAULT = "default" -+ -+class CollectConfig: -+ def __init__(self, filename=COLLECT_CONF_PATH): -+ -+ self.filename = filename -+ self.modules = [] -+ self.module_count = 0 -+ self.load_config() -+ -+ def load_config(self): -+ if not os.path.exists(self.filename): -+ logging.error("%s is not exists", self.filename) -+ return -+ -+ try: -+ self.config = configparser.ConfigParser() -+ self.config.read(self.filename) -+ except configparser.Error: -+ logging.error("collectd configure file read failed") -+ return -+ -+ try: -+ common_config = self.config[CONF_COMMON] -+ modules_str = common_config[CONF_MODULES] -+ # remove space -+ modules_list = modules_str.replace(" ", "").split(',') -+ except KeyError as e: -+ logging.error("read config data failed, %s", e) -+ return -+ -+ pattern = r'^[a-zA-Z0-9-_]+$' -+ for module_name in modules_list: -+ if not re.match(pattern, module_name): -+ logging.warning("module_name: %s is invalid", module_name) -+ continue -+ if not self.config.has_section(module_name): -+ logging.warning("module_name: %s config is incorrect", module_name) -+ continue -+ self.modules.append(module_name) -+ -+ def load_module_config(self, module_name): -+ module_name = module_name.strip().lower() -+ if module_name in self.modules and self.config.has_section(module_name): -+ return {key.lower(): value for key, value in self.config[module_name].items()} -+ else: -+ raise ValueError(f"Module '{module_name}' not found in configuration") -+ -+ def get_io_config(self): -+ result_io_config = {} -+ io_map_value = self.load_module_config(CONF_IO) -+ # period_time -+ period_time = io_map_value.get(CONF_IO_PERIOD_TIME) -+ if period_time and period_time.isdigit() and int(period_time) >= 1 and int(period_time) <= 300: -+ result_io_config[CONF_IO_PERIOD_TIME] = int(period_time) -+ else: -+ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", -+ CONF_IO, CONF_IO_PERIOD_TIME, CONF_IO_PERIOD_TIME_DEFAULT) -+ result_io_config[CONF_IO_PERIOD_TIME] = CONF_IO_PERIOD_TIME_DEFAULT -+ # max_save -+ max_save = io_map_value.get(CONF_IO_MAX_SAVE) -+ if max_save and max_save.isdigit() and int(max_save) >= 1 and int(max_save) <= 300: -+ result_io_config[CONF_IO_MAX_SAVE] = int(max_save) -+ else: -+ logging.warning("module_name = %s section, field = %s is incorrect, use default %d", -+ CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) -+ result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT -+ # disk -+ disk = io_map_value.get(CONF_IO_DISK) -+ if disk: -+ disk_str = disk.replace(" ", "") -+ pattern = r'^[a-zA-Z0-9-_,]+$' -+ if not re.match(pattern, disk_str): -+ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", -+ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) -+ disk_str = CONF_IO_DISK_DEFAULT -+ result_io_config[CONF_IO_DISK] = disk_str -+ else: -+ logging.warning("module_name = %s section, field = %s is incorrect, use default %s", -+ CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) -+ result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT -+ logging.info("config get_io_config: %s", result_io_config) -+ return result_io_config -+ -+ def get_common_config(self): -+ return {key.lower(): value for key, value in self.config['common'].items()} -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -new file mode 100644 -index 0000000..b826dc4 ---- /dev/null -+++ b/src/python/sentryCollector/collect_io.py -@@ -0,0 +1,239 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+collect module -+""" -+import os -+import time -+import logging -+import threading -+ -+from .collect_config import CollectConfig -+ -+Io_Category = ["read", "write", "flush", "discard"] -+IO_GLOBAL_DATA = {} -+IO_CONFIG_DATA = [] -+ -+class IoStatus(): -+ TOTAL = 0 -+ FINISH = 1 -+ LATENCY = 2 -+ -+class CollectIo(): -+ -+ def __init__(self, module_config): -+ -+ io_config = module_config.get_io_config() -+ -+ self.period_time = io_config['period_time'] -+ self.max_save = io_config['max_save'] -+ disk_str = io_config['disk'] -+ -+ self.disk_map_stage = {} -+ self.window_value = {} -+ -+ self.loop_all = False -+ -+ if disk_str == "default": -+ self.loop_all = True -+ else: -+ self.disk_list = disk_str.strip().split(',') -+ -+ self.stop_event = threading.Event() -+ -+ IO_CONFIG_DATA.append(self.period_time) -+ IO_CONFIG_DATA.append(self.max_save) -+ -+ def get_blk_io_hierarchy(self, disk_name, stage_list): -+ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) -+ try: -+ with open(stats_file, 'r') as file: -+ lines = file.read() -+ except FileNotFoundError: -+ logging.error("The file %s does not exist", stats_file) -+ return -1 -+ except Exception as e: -+ logging.error("An error occurred3: %s", e) -+ return -1 -+ -+ curr_value = lines.strip().split('\n') -+ -+ for stage_val in curr_value: -+ stage = stage_val.split(' ')[0] -+ if (len(self.window_value[disk_name][stage])) >= 2: -+ self.window_value[disk_name][stage].pop(0) -+ -+ curr_stage_value = stage_val.split(' ')[1:-1] -+ self.window_value[disk_name][stage].append(curr_stage_value) -+ return 0 -+ -+ def append_period_lat(self, disk_name, stage_list): -+ for stage in stage_list: -+ if len(self.window_value[disk_name][stage]) < 2: -+ return -+ curr_stage_value = self.window_value[disk_name][stage][-1] -+ last_stage_value = self.window_value[disk_name][stage][-2] -+ -+ for index in range(len(Io_Category)): -+ # read=0, write=1, flush=2, discard=3 -+ if (len(IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]])) >= self.max_save: -+ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].pop() -+ -+ curr_lat = self.get_latency_value(curr_stage_value, last_stage_value, index) -+ curr_iops = self.get_iops(curr_stage_value, last_stage_value, index) -+ curr_io_length = self.get_io_length(curr_stage_value, last_stage_value, index) -+ curr_io_dump = self.get_io_dump(disk_name, stage, index) -+ -+ IO_GLOBAL_DATA[disk_name][stage][Io_Category[index]].insert(0, [curr_lat, curr_io_dump, curr_io_length, curr_iops]) -+ -+ def get_iops(self, curr_stage_value, last_stage_value, category): -+ try: -+ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) -+ except ValueError as e: -+ logging.error("get_iops convert to int failed, %s", e) -+ return 0 -+ value = finish / self.period_time -+ if value.is_integer(): -+ return int(value) -+ else: -+ return round(value, 1) -+ -+ def get_latency_value(self, curr_stage_value, last_stage_value, category): -+ try: -+ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) -+ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) -+ except ValueError as e: -+ logging.error("get_latency_value convert to int failed, %s", e) -+ return 0 -+ if finish <= 0 or lat_time <= 0: -+ return 0 -+ value = lat_time / finish / 1000 / 1000 -+ if value.is_integer(): -+ return int(value) -+ else: -+ return round(value, 1) -+ -+ def get_io_length(self, curr_stage_value, last_stage_value, category): -+ try: -+ finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) -+ except ValueError as e: -+ logging.error("get_io_length convert to int failed, %s", e) -+ return 0 -+ value = finish / self.period_time / 1000 / 1000 -+ if value.is_integer(): -+ return int(value) -+ else: -+ return round(value, 1) -+ -+ def get_io_dump(self, disk_name, stage, category): -+ io_dump_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/{}/io_dump'.format(disk_name, stage) -+ count = 0 -+ try: -+ with open(io_dump_file, 'r') as file: -+ for line in file: -+ count += line.count('.op=' + Io_Category[category]) -+ except FileNotFoundError: -+ logging.error("The file %s does not exist.", io_dump_file) -+ return count -+ except Exception as e: -+ logging.error("An error occurred1: %s", e) -+ return count -+ return count -+ -+ def extract_first_column(self, file_path): -+ column_names = [] -+ try: -+ with open(file_path, 'r') as file: -+ for line in file: -+ parts = line.strip().split() -+ if parts: -+ column_names.append(parts[0]) -+ except FileNotFoundError: -+ logging.error("The file %s does not exist.", file_path) -+ except Exception as e: -+ logging.error("An error occurred2: %s", e) -+ return column_names -+ -+ def task_loop(self): -+ if self.stop_event.is_set(): -+ logging.info("collect io thread exit") -+ return -+ -+ for disk_name, stage_list in self.disk_map_stage.items(): -+ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: -+ continue -+ self.append_period_lat(disk_name, stage_list) -+ -+ threading.Timer(self.period_time, self.task_loop).start() -+ -+ def main_loop(self): -+ logging.info("collect io thread start") -+ base_path = '/sys/kernel/debug/block' -+ for disk_name in os.listdir(base_path): -+ if not self.loop_all and disk_name not in self.disk_list: -+ continue -+ -+ disk_path = os.path.join(base_path, disk_name) -+ blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') -+ -+ if not os.path.exists(blk_io_hierarchy_path): -+ logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) -+ continue -+ -+ for file_name in os.listdir(blk_io_hierarchy_path): -+ file_path = os.path.join(blk_io_hierarchy_path, file_name) -+ -+ if file_name == 'stats': -+ stage_list = self.extract_first_column(file_path) -+ self.disk_map_stage[disk_name] = stage_list -+ self.window_value[disk_name] = {} -+ IO_GLOBAL_DATA[disk_name] = {} -+ -+ if len(self.disk_map_stage) == 0: -+ logging.warning("no disks meet the requirements. the thread exits") -+ return -+ -+ for disk_name, stage_list in self.disk_map_stage.items(): -+ for stage in stage_list: -+ self.window_value[disk_name][stage] = [] -+ IO_GLOBAL_DATA[disk_name][stage] = {} -+ for category in Io_Category: -+ IO_GLOBAL_DATA[disk_name][stage][category] = [] -+ -+ while True: -+ start_time = time.time() -+ -+ if self.stop_event.is_set(): -+ logging.info("collect io thread exit") -+ return -+ -+ for disk_name, stage_list in self.disk_map_stage.items(): -+ if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: -+ continue -+ self.append_period_lat(disk_name, stage_list) -+ -+ elapsed_time = time.time() - start_time -+ sleep_time = self.period_time - elapsed_time -+ if sleep_time < 0: -+ continue -+ while sleep_time > 1: -+ if self.stop_event.is_set(): -+ logging.info("collect io thread exit") -+ return -+ time.sleep(1) -+ sleep_time -= 1 -+ time.sleep(sleep_time) -+ -+ # set stop event, notify thread exit -+ def stop_thread(self): -+ logging.info("collect io thread is preparing to exit") -+ self.stop_event.set() -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -new file mode 100644 -index 0000000..49ce0a8 ---- /dev/null -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -0,0 +1,276 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+collcet plugin -+""" -+import json -+import socket -+import logging -+import re -+ -+COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" -+ -+# data length param -+CLT_MSG_HEAD_LEN = 9 #3+2+4 -+CLT_MSG_PRO_LEN = 2 -+CLT_MSG_MAGIC_LEN = 3 -+CLT_MSG_LEN_LEN = 4 -+ -+CLT_MAGIC = "CLT" -+RES_MAGIC = "RES" -+ -+# disk limit -+LIMIT_DISK_CHAR_LEN = 32 -+LIMIT_DISK_LIST_LEN = 10 -+ -+# stage limit -+LIMIT_STAGE_CHAR_LEN = 20 -+LIMIT_STAGE_LIST_LEN = 15 -+ -+#iotype limit -+LIMIT_IOTYPE_CHAR_LEN = 7 -+LIMIT_IOTYPE_LIST_LEN = 4 -+ -+#period limit -+LIMIT_PERIOD_MIN_LEN = 1 -+LIMIT_PERIOD_MAX_LEN = 300 -+ -+# interface protocol -+class ClientProtocol(): -+ IS_IOCOLLECT_VALID = 0 -+ GET_IO_DATA = 1 -+ PRO_END = 3 -+ -+class ResultMessage(): -+ RESULT_SUCCEED = 0 -+ RESULT_UNKNOWN = 1 # unknown error -+ RESULT_NOT_PARAM = 2 # the parameter does not exist or the type does not match. -+ RESULT_INVALID_LENGTH = 3 # invalid parameter length. -+ RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. -+ RESULT_PARSE_FAILED = 5 # parse failed -+ RESULT_INVALID_CHAR = 6 # invalid char -+ -+Result_Messages = { -+ ResultMessage.RESULT_SUCCEED: "Succeed", -+ ResultMessage.RESULT_UNKNOWN: "Unknown error", -+ ResultMessage.RESULT_NOT_PARAM: "The parameter does not exist or the type does not match", -+ ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", -+ ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", -+ ResultMessage.RESULT_PARSE_FAILED: "Parse failed", -+ ResultMessage.RESULT_INVALID_CHAR: "Invalid char" -+} -+ -+ -+def client_send_and_recv(request_data, data_str_len, protocol): -+ """client socket send and recv message""" -+ try: -+ client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) -+ except socket.error: -+ print("collect_plugin: client creat socket error") -+ return None -+ -+ try: -+ client_socket.connect(COLLECT_SOCKET_PATH) -+ except OSError: -+ client_socket.close() -+ print("collect_plugin: client connect error") -+ return None -+ -+ req_data_len = len(request_data) -+ request_msg = CLT_MAGIC + str(protocol).zfill(CLT_MSG_PRO_LEN) + str(req_data_len).zfill(CLT_MSG_LEN_LEN) + request_data -+ -+ try: -+ client_socket.send(request_msg.encode()) -+ res_data = client_socket.recv(len(RES_MAGIC) + CLT_MSG_PRO_LEN + data_str_len) -+ res_data = res_data.decode() -+ except (OSError, UnicodeError): -+ client_socket.close() -+ print("collect_plugin: client communicate error") -+ return None -+ -+ res_magic = res_data[:CLT_MSG_MAGIC_LEN] -+ if res_magic != "RES": -+ print("res msg format error") -+ return None -+ -+ protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] -+ try: -+ protocol_id = int(protocol_str) -+ except ValueError: -+ print("recv msg protocol id is invalid %s", protocol_str) -+ return None -+ -+ if protocol_id >= ClientProtocol.PRO_END: -+ print("protocol id is invalid") -+ return None -+ -+ try: -+ res_data_len = int(res_data[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:]) -+ res_msg_data = client_socket.recv(res_data_len) -+ res_msg_data = res_msg_data.decode() -+ return res_msg_data -+ except (OSError, ValueError, UnicodeError): -+ print("collect_plugin: client recv res msg error") -+ finally: -+ client_socket.close() -+ -+ return None -+ -+def validate_parameters(param, len_limit, char_limit): -+ ret = ResultMessage.RESULT_SUCCEED -+ if not param: -+ print("parm is invalid") -+ ret = ResultMessage.RESULT_NOT_PARAM -+ return [False, ret] -+ -+ if not isinstance(param, list): -+ print(f"{param} is not list type.") -+ ret = ResultMessage.RESULT_NOT_PARAM -+ return [False, ret] -+ -+ if len(param) <= 0: -+ print(f"{param} length is 0.") -+ ret = ResultMessage.RESULT_INVALID_LENGTH -+ return [False, ret] -+ -+ if len(param) > len_limit: -+ print(f"{param} length more than {len_limit}") -+ ret = ResultMessage.RESULT_EXCEED_LIMIT -+ return [False, ret] -+ -+ pattern = r'^[a-zA-Z0-9_-]+$' -+ for info in param: -+ if len(info) > char_limit: -+ print(f"{info} length more than {char_limit}") -+ ret = ResultMessage.RESULT_EXCEED_LIMIT -+ return [False, ret] -+ if not re.match(pattern, info): -+ print(f"{info} is invalid char") -+ ret = ResultMessage.RESULT_INVALID_CHAR -+ return [False, ret] -+ -+ return [True, ret] -+ -+def is_iocollect_valid(period, disk_list=None, stage=None): -+ result = inter_is_iocollect_valid(period, disk_list, stage) -+ error_code = result['ret'] -+ if error_code != ResultMessage.RESULT_SUCCEED: -+ result['message'] = Result_Messages[error_code] -+ return result -+ -+def inter_is_iocollect_valid(period, disk_list=None, stage=None): -+ result = {} -+ result['ret'] = ResultMessage.RESULT_UNKNOWN -+ result['message'] = "" -+ -+ if not period or not isinstance(period, int): -+ result['ret'] = ResultMessage.RESULT_NOT_PARAM -+ return result -+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: -+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH -+ return result -+ -+ if not disk_list: -+ disk_list = [] -+ else: -+ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) -+ if not res[0]: -+ result['ret'] = res[1] -+ return result -+ -+ if not stage: -+ stage = [] -+ else: -+ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) -+ if not res[0]: -+ result['ret'] = res[1] -+ return result -+ -+ req_msg_struct = { -+ 'disk_list': json.dumps(disk_list), -+ 'period': period, -+ 'stage': json.dumps(stage) -+ } -+ request_message = json.dumps(req_msg_struct) -+ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) -+ if not result_message: -+ print("collect_plugin: client_send_and_recv failed") -+ return result -+ -+ try: -+ json.loads(result_message) -+ except json.JSONDecodeError: -+ print("is_iocollect_valid: json decode error") -+ result['ret'] = ResultMessage.RESULT_PARSE_FAILED -+ return result -+ -+ result['ret'] = ResultMessage.RESULT_SUCCEED -+ result['message'] = result_message -+ return result -+ -+def get_io_data(period, disk_list, stage, iotype): -+ result = inter_get_io_data(period, disk_list, stage, iotype) -+ error_code = result['ret'] -+ if error_code != ResultMessage.RESULT_SUCCEED: -+ result['message'] = Result_Messages[error_code] -+ return result -+ -+def inter_get_io_data(period, disk_list, stage, iotype): -+ result = {} -+ result['ret'] = ResultMessage.RESULT_UNKNOWN -+ result['message'] = "" -+ -+ if not isinstance(period, int): -+ result['ret'] = ResultMessage.RESULT_NOT_PARAM -+ return result -+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: -+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH -+ return result -+ -+ res = validate_parameters(disk_list, LIMIT_DISK_LIST_LEN, LIMIT_DISK_CHAR_LEN) -+ if not res[0]: -+ result['ret'] = res[1] -+ return result -+ -+ res = validate_parameters(stage, LIMIT_STAGE_LIST_LEN, LIMIT_STAGE_CHAR_LEN) -+ if not res[0]: -+ result['ret'] = res[1] -+ return result -+ -+ res = validate_parameters(iotype, LIMIT_IOTYPE_LIST_LEN, LIMIT_IOTYPE_CHAR_LEN) -+ if not res[0]: -+ result['ret'] = res[1] -+ return result -+ -+ req_msg_struct = { -+ 'disk_list': json.dumps(disk_list), -+ 'period': period, -+ 'stage': json.dumps(stage), -+ 'iotype' : json.dumps(iotype) -+ } -+ -+ request_message = json.dumps(req_msg_struct) -+ result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) -+ if not result_message: -+ print("collect_plugin: client_send_and_recv failed") -+ return result -+ try: -+ json.loads(result_message) -+ except json.JSONDecodeError: -+ print("get_io_data: json decode error") -+ result['ret'] = ResultMessage.RESULT_PARSE_FAILED -+ return result -+ -+ result['ret'] = ResultMessage.RESULT_SUCCEED -+ result['message'] = result_message -+ return result -+ -diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py -new file mode 100644 -index 0000000..fa49781 ---- /dev/null -+++ b/src/python/sentryCollector/collect_server.py -@@ -0,0 +1,285 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+listen module -+""" -+import sys -+import signal -+import traceback -+import socket -+import os -+import json -+import logging -+import fcntl -+import select -+import threading -+import time -+ -+from .collect_io import IO_GLOBAL_DATA, IO_CONFIG_DATA -+from .collect_config import CollectConfig -+ -+SENTRY_RUN_DIR = "/var/run/sysSentry" -+COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" -+ -+# socket param -+CLT_LISTEN_QUEUE_LEN = 5 -+SERVER_EPOLL_TIMEOUT = 0.3 -+ -+# data length param -+CLT_MSG_HEAD_LEN = 9 #3+2+4 -+CLT_MSG_PRO_LEN = 2 -+CLT_MSG_MAGIC_LEN = 3 -+CLT_MSG_LEN_LEN = 4 -+ -+# data flag param -+CLT_MAGIC = "CLT" -+RES_MAGIC = "RES" -+ -+# interface protocol -+class ServerProtocol(): -+ IS_IOCOLLECT_VALID = 0 -+ GET_IO_DATA = 1 -+ PRO_END = 3 -+ -+class CollectServer(): -+ -+ def __init__(self): -+ -+ self.io_global_data = {} -+ -+ self.stop_event = threading.Event() -+ -+ def is_iocollect_valid(self, data_struct): -+ -+ result_rev = {} -+ self.io_global_data = IO_GLOBAL_DATA -+ -+ if len(IO_CONFIG_DATA) == 0: -+ logging.error("the collect thread is not started, the data is invalid. ") -+ return json.dumps(result_rev) -+ -+ period_time = IO_CONFIG_DATA[0] -+ max_save = IO_CONFIG_DATA[1] -+ -+ disk_list = json.loads(data_struct['disk_list']) -+ period = int(data_struct['period']) -+ stage_list = json.loads(data_struct['stage']) -+ -+ if (period < period_time) or (period > period_time * max_save) or (period % period_time): -+ logging.error("is_iocollect_valid: period time: %d is invalid", period) -+ return json.dumps(result_rev) -+ -+ for disk_name, stage_info in self.io_global_data.items(): -+ if len(disk_list) > 0 and disk_name not in disk_list: -+ continue -+ result_rev[disk_name] = [] -+ if len(stage_list) == 0: -+ result_rev[disk_name] = list(stage_info.keys()) -+ continue -+ for stage_name, stage_data in stage_info.items(): -+ if stage_name in stage_list: -+ result_rev[disk_name].append(stage_name) -+ -+ return json.dumps(result_rev) -+ -+ def get_io_data(self, data_struct): -+ result_rev = {} -+ self.io_global_data = IO_GLOBAL_DATA -+ -+ if len(IO_CONFIG_DATA) == 0: -+ logging.error("the collect thread is not started, the data is invalid. ") -+ return json.dumps(result_rev) -+ period_time = IO_CONFIG_DATA[0] -+ max_save = IO_CONFIG_DATA[1] -+ -+ period = int(data_struct['period']) -+ disk_list = json.loads(data_struct['disk_list']) -+ stage_list = json.loads(data_struct['stage']) -+ iotype_list = json.loads(data_struct['iotype']) -+ -+ if (period < period_time) or (period > period_time * max_save) or (period % period_time): -+ logging.error("get_io_data: period time: %d is invalid", period) -+ return json.dumps(result_rev) -+ -+ collect_index = period // period_time - 1 -+ logging.debug("period: %d, collect_index: %d", period, collect_index) -+ -+ for disk_name, stage_info in self.io_global_data.items(): -+ if disk_name not in disk_list: -+ continue -+ result_rev[disk_name] = {} -+ for stage_name, iotype_info in stage_info.items(): -+ if len(stage_list) > 0 and stage_name not in stage_list: -+ continue -+ result_rev[disk_name][stage_name] = {} -+ for iotype_name, iotype_info in iotype_info.items(): -+ if iotype_name not in iotype_list: -+ continue -+ if len(iotype_info) < collect_index: -+ continue -+ result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] -+ -+ return json.dumps(result_rev) -+ -+ def msg_data_process(self, msg_data, protocal_id): -+ """message data process""" -+ logging.debug("msg_data %s", msg_data) -+ protocol_name = msg_data[0] -+ try: -+ data_struct = json.loads(msg_data) -+ except json.JSONDecodeError: -+ logging.error("msg data process: json decode error") -+ return "Request message decode failed" -+ -+ if protocal_id == ServerProtocol.IS_IOCOLLECT_VALID: -+ res_msg = self.is_iocollect_valid(data_struct) -+ elif protocal_id == ServerProtocol.GET_IO_DATA: -+ res_msg = self.get_io_data(data_struct) -+ -+ return res_msg -+ -+ def msg_head_process(self, msg_head): -+ """message head process""" -+ ctl_magic = msg_head[:CLT_MSG_MAGIC_LEN] -+ if ctl_magic != CLT_MAGIC: -+ logging.error("recv msg head magic invalid") -+ return None -+ -+ protocol_str = msg_head[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] -+ try: -+ protocol_id = int(protocol_str) -+ except ValueError: -+ logging.error("recv msg protocol id is invalid") -+ return None -+ -+ data_len_str = msg_head[CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN:CLT_MSG_HEAD_LEN] -+ try: -+ data_len = int(data_len_str) -+ except ValueError: -+ logging.error("recv msg data len is invalid %s", data_len_str) -+ return None -+ -+ return [protocol_id, data_len] -+ -+ def server_recv(self, server_socket: socket.socket): -+ """server receive""" -+ try: -+ client_socket, _ = server_socket.accept() -+ logging.debug("server_fd listen ok") -+ except socket.error: -+ logging.error("server accept failed, %s", socket.error) -+ return -+ -+ try: -+ msg_head = client_socket.recv(CLT_MSG_HEAD_LEN) -+ logging.debug("recv msg head: %s", msg_head.decode()) -+ head_info = self.msg_head_process(msg_head.decode()) -+ except (OSError, UnicodeError): -+ client_socket.close() -+ logging.error("server recv HEAD failed") -+ return -+ -+ protocol_id = head_info[0] -+ data_len = head_info[1] -+ logging.debug("msg protocol id: %d, data length: %d", protocol_id, data_len) -+ if protocol_id >= ServerProtocol.PRO_END: -+ client_socket.close() -+ logging.error("protocol id is invalid") -+ return -+ -+ if data_len < 0: -+ client_socket.close() -+ logging.error("msg head parse failed") -+ return -+ -+ try: -+ msg_data = client_socket.recv(data_len) -+ msg_data_decode = msg_data.decode() -+ logging.debug("msg data %s", msg_data_decode) -+ except (OSError, UnicodeError): -+ client_socket.close() -+ logging.error("server recv MSG failed") -+ return -+ -+ res_data = self.msg_data_process(msg_data_decode, protocol_id) -+ logging.debug("res data %s", res_data) -+ -+ # server send -+ res_head = RES_MAGIC -+ res_head += str(protocol_id).zfill(CLT_MSG_PRO_LEN) -+ res_data_len = str(len(res_data)).zfill(CLT_MSG_LEN_LEN) -+ res_head += res_data_len -+ logging.debug("res head %s", res_head) -+ -+ res_msg = res_head + res_data -+ logging.debug("res msg %s", res_msg) -+ -+ try: -+ client_socket.send(res_msg.encode()) -+ except OSError: -+ logging.error("server recv failed") -+ finally: -+ client_socket.close() -+ return -+ -+ def server_fd_create(self): -+ """create server fd""" -+ if not os.path.exists(SENTRY_RUN_DIR): -+ logging.error("%s not exist, failed", SENTRY_RUN_DIR) -+ return None -+ -+ try: -+ server_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) -+ server_fd.setblocking(False) -+ if os.path.exists(COLLECT_SOCKET_PATH): -+ os.remove(COLLECT_SOCKET_PATH) -+ -+ server_fd.bind(COLLECT_SOCKET_PATH) -+ os.chmod(COLLECT_SOCKET_PATH, 0o600) -+ server_fd.listen(CLT_LISTEN_QUEUE_LEN) -+ logging.debug("%s bind and listen", COLLECT_SOCKET_PATH) -+ except socket.error: -+ logging.error("server fd create failed") -+ server_fd = None -+ -+ return server_fd -+ -+ -+ def server_loop(self): -+ """main loop""" -+ logging.info("collect server thread start") -+ server_fd = self.server_fd_create() -+ if not server_fd: -+ return -+ -+ epoll_fd = select.epoll() -+ epoll_fd.register(server_fd.fileno(), select.EPOLLIN) -+ -+ logging.debug("start server_loop loop") -+ while True: -+ if self.stop_event.is_set(): -+ logging.info("collect server thread exit") -+ server_fd = None -+ return -+ try: -+ events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) -+ for event_fd, _ in events_list: -+ if event_fd == server_fd.fileno(): -+ self.server_recv(server_fd) -+ else: -+ continue -+ except socket.error: -+ pass -+ -+ def stop_thread(self): -+ logging.info("collect server thread is preparing to exit") -+ self.stop_event.set() -diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py -new file mode 100644 -index 0000000..b77c642 ---- /dev/null -+++ b/src/python/sentryCollector/collectd.py -@@ -0,0 +1,99 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+main loop for collect. -+""" -+import sys -+import signal -+import traceback -+import socket -+import os -+import json -+import logging -+import fcntl -+import select -+ -+import threading -+ -+from .collect_io import CollectIo -+from .collect_server import CollectServer -+from .collect_config import CollectConfig -+ -+SENTRY_RUN_DIR = "/var/run/sysSentry" -+COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" -+SENTRY_RUN_DIR_PERM = 0o750 -+ -+COLLECT_LOG_FILE = "/var/log/sysSentry/collector.log" -+Thread_List = [] -+Module_Map_Class = {"io" : CollectIo} -+ -+def remove_sock_file(): -+ try: -+ os.unlink(COLLECT_SOCKET_PATH) -+ except FileNotFoundError: -+ pass -+ -+def sig_handler(signum, _f): -+ if signum not in (signal.SIGINT, signal.SIGTERM): -+ return -+ for i in range(len(Thread_List)): -+ Thread_List[i][0].stop_thread() -+ -+ remove_sock_file() -+ sys.exit(0) -+ -+def main(): -+ """main -+ """ -+ if not os.path.exists(SENTRY_RUN_DIR): -+ os.mkdir(SENTRY_RUN_DIR) -+ os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) -+ -+ logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) -+ os.chmod(COLLECT_LOG_FILE, 0o600) -+ -+ try: -+ signal.signal(signal.SIGINT, sig_handler) -+ signal.signal(signal.SIGTERM, sig_handler) -+ signal.signal(signal.SIGHUP, sig_handler) -+ -+ logging.info("finish main parse_args") -+ -+ module_config = CollectConfig() -+ module_list = module_config.modules -+ -+ # listen thread -+ cs = CollectServer() -+ listen_thread = threading.Thread(target=cs.server_loop) -+ listen_thread.start() -+ Thread_List.append([cs, listen_thread]) -+ -+ # collect thread -+ for info in module_list: -+ class_name = Module_Map_Class.get(info) -+ if not class_name: -+ logging.info("%s correspond to class is not exists", info) -+ continue -+ cn = class_name(module_config) -+ collect_thread = threading.Thread(target=cn.main_loop) -+ collect_thread.start() -+ Thread_List.append([cn, collect_thread]) -+ -+ for i in range(len(Thread_List)): -+ Thread_List[i][1].join() -+ -+ except Exception: -+ logging.error('%s', traceback.format_exc()) -+ finally: -+ pass -+ -+ logging.info("All threads have finished. Main thread is exiting.") -\ No newline at end of file -diff --git a/src/python/setup.py b/src/python/setup.py -index f96a96e..c28c691 100644 ---- a/src/python/setup.py -+++ b/src/python/setup.py -@@ -31,7 +31,9 @@ setup( - 'console_scripts': [ - 'cpu_sentry=syssentry.cpu_sentry:main', - 'syssentry=syssentry.syssentry:main', -- 'xalarmd=xalarm.xalarm_daemon:alarm_process_create' -+ 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', -+ 'sentryCollector=sentryCollector.collectd:main', -+ 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main' - ] - }, - ) --- -2.33.0 - diff --git a/add-deleted-code-to-plugin-rasdaemon.patch b/add-deleted-code-to-plugin-rasdaemon.patch deleted file mode 100644 index 89d1cc4d82d95db5fb56df75ca299425337e3211..0000000000000000000000000000000000000000 --- a/add-deleted-code-to-plugin-rasdaemon.patch +++ /dev/null @@ -1,31 +0,0 @@ -From eca8c542875aef5cfbf947d697c4b644490d1c05 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Fri, 30 Aug 2024 19:58:41 +0800 -Subject: [PATCH] add deleted code to plugin rasdaemon - ---- - src/python/syssentry/syssentry.py | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index 32b81e3..3d5cb8d 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -462,6 +462,14 @@ def main_loop(): - epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) - - logging.debug("start main loop") -+ # onstart_tasks_handle() -+ for task_type in TasksMap.tasks_dict: -+ for task_name in TasksMap.tasks_dict.get(task_type): -+ task = TasksMap.tasks_dict.get(task_type).get(task_name) -+ if not task: -+ continue -+ task.onstart_handle() -+ - while True: - try: - events_list = epoll_fd.poll(SERVER_EPOLL_TIMEOUT) --- -2.33.0 - diff --git a/add-detail-time.patch b/add-detail-time.patch deleted file mode 100644 index 8e234529d4255b04293f07ecdbe2ec737a29bf10..0000000000000000000000000000000000000000 --- a/add-detail-time.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 9ecd4c2c9c9f9578f5ec4780360dc67b182b384a Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Wed, 9 Oct 2024 08:09:04 +0000 -Subject: [PATCH 2/2] add detail time - -Signed-off-by: jinsaihang ---- - src/python/syssentry/alarm.py | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index 74a2716..d5337d3 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -118,11 +118,13 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") - - def xalarm_to_dict(alarm_info: Xalarm) -> dict: -+ timestamp = alarm_info.timetamp.tv_sec + alarm_info.timetamp.tv_usec / 1000000 -+ dt_object = datetime.fromtimestamp(int(timestamp)) - return { - 'alarm_id': xalarm_getid(alarm_info), - 'alarm_type': xalarm_gettype(alarm_info), - 'alarm_level': xalarm_getlevel(alarm_info), -- 'timetamp': xalarm_gettime(alarm_info), -+ 'timestamp': dt_object.strftime("%Y-%m-%d %H:%M:%S"), - 'msg1': xalarm_getdesc(alarm_info) - } - --- -2.27.0 - diff --git a/add-get_disk_type-and-fix-some-bugs.patch b/add-get_disk_type-and-fix-some-bugs.patch deleted file mode 100644 index b5e59e8960bb7514dd66e78fb81365dc29518a7f..0000000000000000000000000000000000000000 --- a/add-get_disk_type-and-fix-some-bugs.patch +++ /dev/null @@ -1,176 +0,0 @@ -From c2ffc679eddda5d78362612d89a9319d268da7e3 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Thu, 10 Oct 2024 20:17:34 +0800 -Subject: [PATCH] add get_disk_type and fix some bugs - ---- - service/sentryCollector.service | 2 +- - src/python/sentryCollector/collect_io.py | 16 ++++- - src/python/sentryCollector/collect_plugin.py | 68 +++++++++++++++++++- - 3 files changed, 81 insertions(+), 5 deletions(-) - -diff --git a/service/sentryCollector.service b/service/sentryCollector.service -index 4ee07d5..e09ddb3 100644 ---- a/service/sentryCollector.service -+++ b/service/sentryCollector.service -@@ -1,5 +1,5 @@ - [Unit] --Description = Collection module added for sysSentry and kernel lock-free collection -+Description = Collection module added for sysSentry - - [Service] - ExecStart=/usr/bin/python3 /usr/bin/sentryCollector -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index 8780648..6699a90 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -116,7 +116,7 @@ class CollectIo(): - return 0 - if finish <= 0 or lat_time <= 0: - return 0 -- value = lat_time / finish / 1000 / 1000 -+ value = lat_time / finish / 1000 - if value.is_integer(): - return int(value) - else: -@@ -124,11 +124,17 @@ class CollectIo(): - - def get_io_length(self, curr_stage_value, last_stage_value, category): - try: -- finish = int(curr_stage_value[category * 3 + IoStatus.FINISH]) - int(last_stage_value[category * 3 + IoStatus.FINISH]) -+ lat_time = (int(curr_stage_value[category * 3 + IoStatus.LATENCY]) - int(last_stage_value[category * 3 + IoStatus.LATENCY])) - except ValueError as e: - logging.error("get_io_length convert to int failed, %s", e) - return 0 -- value = finish / self.period_time / 1000 / 1000 -+ if lat_time <= 0: -+ return 0 -+ # ns convert us -+ lat_time = lat_time / 1000 -+ # s convert us -+ period_time = self.period_time * 1000 * 1000 -+ value = lat_time / period_time - if value.is_integer(): - return int(value) - else: -@@ -141,6 +147,8 @@ class CollectIo(): - with open(io_dump_file, 'r') as file: - for line in file: - count += line.count('.op=' + Io_Category[category]) -+ if count > 0: -+ logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") - except FileNotFoundError: - logging.error("The file %s does not exist.", io_dump_file) - return count -@@ -223,6 +231,8 @@ class CollectIo(): - if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: - continue - self.append_period_lat(disk_name, stage_list) -+ -+ logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}") - - elapsed_time = time.time() - start_time - sleep_time = self.period_time - elapsed_time -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 3e2cf4c..31bf11b 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -16,6 +16,7 @@ import json - import socket - import logging - import re -+import os - - COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" - -@@ -58,6 +59,8 @@ class ResultMessage(): - RESULT_EXCEED_LIMIT = 4 # the parameter length exceeds the limit. - RESULT_PARSE_FAILED = 5 # parse failed - RESULT_INVALID_CHAR = 6 # invalid char -+ RESULT_DISK_NOEXIST = 7 # disk is not exist -+ RESULT_DISK_TYPE_MISMATCH= 8 # disk type mismatch - - Result_Messages = { - ResultMessage.RESULT_SUCCEED: "Succeed", -@@ -66,9 +69,15 @@ Result_Messages = { - ResultMessage.RESULT_INVALID_LENGTH: "Invalid parameter length", - ResultMessage.RESULT_EXCEED_LIMIT: "The parameter length exceeds the limit", - ResultMessage.RESULT_PARSE_FAILED: "Parse failed", -- ResultMessage.RESULT_INVALID_CHAR: "Invalid char" -+ ResultMessage.RESULT_INVALID_CHAR: "Invalid char", -+ ResultMessage.RESULT_DISK_NOEXIST: "Disk is not exist", -+ ResultMessage.RESULT_DISK_TYPE_MISMATCH: "Disk type mismatch" - } - -+class DiskType(): -+ TYPE_NVME_SSD = 0 -+ TYPE_SATA_SSD = 1 -+ TYPE_SATA_HDD = 2 - - def client_send_and_recv(request_data, data_str_len, protocol): - """client socket send and recv message""" -@@ -273,3 +282,60 @@ def inter_get_io_data(period, disk_list, stage, iotype): - result['message'] = result_message - return result - -+def get_disk_type(disk): -+ result = {} -+ result['ret'] = ResultMessage.RESULT_UNKNOWN -+ result['message'] = "" -+ if not disk: -+ logging.error("param is invalid") -+ result['ret'] = ResultMessage.RESULT_NOT_PARAM -+ return result -+ if len(disk) <= 0 or len(disk) > LIMIT_DISK_CHAR_LEN: -+ logging.error("invalid disk length") -+ result['ret'] = ResultMessage.RESULT_INVALID_LENGTH -+ return result -+ pattern = r'^[a-zA-Z0-9_-]+$' -+ if not re.match(pattern, disk): -+ logging.error("%s is invalid char", disk) -+ result['ret'] = ResultMessage.RESULT_INVALID_CHAR -+ return result -+ -+ base_path = '/sys/block' -+ all_disk = [] -+ for disk_name in os.listdir(base_path): -+ all_disk.append(disk_name) -+ -+ if disk not in all_disk: -+ logging.error("disk %s is not exist", disk) -+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST -+ return result -+ -+ if disk[0:4] == "nvme": -+ result['message'] = str(DiskType.TYPE_NVME_SSD) -+ elif disk[0:2] == "sd": -+ disk_file = '/sys/block/{}/queue/rotational'.format(disk) -+ try: -+ with open(disk_file, 'r') as file: -+ num = int(file.read()) -+ if num == 1: -+ result['message'] = str(DiskType.TYPE_SATA_SSD) -+ elif num == 0: -+ result['message'] = str(DiskType.TYPE_SATA_HDD) -+ else: -+ logging.error("disk %s is not support, num = %d", disk, num) -+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH -+ return result -+ except FileNotFoundError: -+ logging.error("The disk_file [%s] does not exist", disk_file) -+ result['ret'] = ResultMessage.RESULT_DISK_NOEXIST -+ return result -+ except Exception as e: -+ logging.error("open disk_file %s happen an error: %s", disk_file, e) -+ return result -+ else: -+ logging.error("disk %s is not support", disk) -+ result['ret'] = ResultMessage.RESULT_DISK_TYPE_MISMATCH -+ return result -+ -+ result['ret'] = ResultMessage.RESULT_SUCCEED -+ return result -\ No newline at end of file --- -2.33.0 - diff --git a/add-hbm-online-repair.patch b/add-hbm-online-repair.patch deleted file mode 100644 index c6906ffb1226a61bc92bd8e0d33a9c4accef9f12..0000000000000000000000000000000000000000 --- a/add-hbm-online-repair.patch +++ /dev/null @@ -1,2194 +0,0 @@ -From abdeacfa6ae54b503714cb98f3262a39d883972e Mon Sep 17 00:00:00 2001 -From: luckky -Date: Fri, 11 Oct 2024 09:49:40 +0000 -Subject: [PATCH] add hbm online repair - ---- - config/tasks/hbm_online_repair.mod | 9 + - src/c/hbm_online_repair/.gitignore | 6 + - src/c/hbm_online_repair/Makefile | 25 + - src/c/hbm_online_repair/hbm_online_repair.c | 144 ++++ - src/c/hbm_online_repair/hbm_online_repair.env | 2 + - src/c/hbm_online_repair/logger.h | 31 + - .../non-standard-hbm-repair.c | 799 ++++++++++++++++++ - .../non-standard-hbm-repair.h | 89 ++ - src/c/hbm_online_repair/ras-events.c | 534 ++++++++++++ - src/c/hbm_online_repair/ras-events.h | 28 + - .../ras-non-standard-handler.c | 81 ++ - .../ras-non-standard-handler.h | 25 + - src/python/.gitignore | 1 + - src/python/syssentry/bmc_alarm.py | 159 ++++ - src/python/syssentry/syssentry.py | 78 +- - 15 files changed, 2001 insertions(+), 10 deletions(-) - create mode 100644 config/tasks/hbm_online_repair.mod - create mode 100644 src/c/hbm_online_repair/.gitignore - create mode 100644 src/c/hbm_online_repair/Makefile - create mode 100644 src/c/hbm_online_repair/hbm_online_repair.c - create mode 100644 src/c/hbm_online_repair/hbm_online_repair.env - create mode 100644 src/c/hbm_online_repair/logger.h - create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.c - create mode 100644 src/c/hbm_online_repair/non-standard-hbm-repair.h - create mode 100644 src/c/hbm_online_repair/ras-events.c - create mode 100644 src/c/hbm_online_repair/ras-events.h - create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.c - create mode 100644 src/c/hbm_online_repair/ras-non-standard-handler.h - create mode 100644 src/python/.gitignore - create mode 100644 src/python/syssentry/bmc_alarm.py - -diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod -new file mode 100644 -index 0000000..77dd73e ---- /dev/null -+++ b/config/tasks/hbm_online_repair.mod -@@ -0,0 +1,9 @@ -+[common] -+enabled=yes -+task_start=/usr/bin/hbm_online_repair -+task_stop=kill $pid -+type=period -+interval=180 -+onstart=yes -+env_file=/etc/sysconfig/hbm_online_repair.env -+conflict=up -\ No newline at end of file -diff --git a/src/c/hbm_online_repair/.gitignore b/src/c/hbm_online_repair/.gitignore -new file mode 100644 -index 0000000..a577882 ---- /dev/null -+++ b/src/c/hbm_online_repair/.gitignore -@@ -0,0 +1,6 @@ -+*.o -+*.c~ -+*.h~ -+hbm_online_repair -+ -+.vscode/ -diff --git a/src/c/hbm_online_repair/Makefile b/src/c/hbm_online_repair/Makefile -new file mode 100644 -index 0000000..16ebcd8 ---- /dev/null -+++ b/src/c/hbm_online_repair/Makefile -@@ -0,0 +1,25 @@ -+CC = gcc -+ -+CFLAGS = -Wall -o3 -+ -+LDFLAGS = -ltraceevent -+ -+SRC = $(wildcard *.c) -+HDR = $(wildcard *.h) -+ -+OBJ = $(SRC:.c=.o) -+ -+TARGET = hbm_online_repair -+ -+all: $(TARGET) -+ -+$(TARGET): $(OBJ) -+ $(CC) $(OBJ) -o $@ $(LDFLAGS) -+ -+%.o: %.c $(HDR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+clean: -+ rm -f $(OBJ) $(TARGET) -+ -+.PHONY: all clean -diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c -new file mode 100644 -index 0000000..3ace206 ---- /dev/null -+++ b/src/c/hbm_online_repair/hbm_online_repair.c -@@ -0,0 +1,144 @@ -+#include -+#include -+#include -+#include -+#include -+ -+#include "logger.h" -+#include "ras-events.h" -+#include "non-standard-hbm-repair.h" -+ -+#define DEFAULT_LOG_LEVEL LOG_INFO -+#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128 -+ -+int global_level_setting; -+int page_isolation_threshold; -+ -+int string2int(const char* str, int* value) -+{ -+ if (!str) { -+ return -1; -+ } -+ char *endptr; -+ errno = 0; -+ long val = strtol(str, &endptr, 10); -+ if (errno != 0 || *endptr != '\0') { -+ return -1; -+ } -+ *value = (int)val; -+ if (val != (long)*value) { -+ return -1; -+ } -+ return 0; -+} -+ -+int execute_command(const char *command) -+{ -+ FILE *fp; -+ char buffer[128] = {0}; -+ int ret; -+ fp = popen(command, "r"); -+ if (!fp) { -+ log(LOG_ERROR, "popen failed\n"); -+ return -1; -+ } -+ -+ fgets(buffer, sizeof(buffer), fp); -+ log(LOG_DEBUG, "output of command is: %s\n", buffer); -+ -+ ret = pclose(fp); -+ if (ret < 0) { -+ log(LOG_ERROR, "pclose failed\n"); -+ return -1; -+ } -+ -+ if (!WIFEXITED(ret)) { -+ log(LOG_ERROR, "command did not terminate normally\n"); -+ return -1; -+ } -+ -+ ret = WEXITSTATUS(ret); -+ log(LOG_DEBUG, "command exited with status: %d\n", ret); -+ return ret; -+} -+ -+int load_required_driver(void) -+{ -+ int ret; -+ ret = execute_command("modprobe hisi_mem_ras 2>&1"); -+ if (ret < 0) { -+ log(LOG_ERROR, "load repair driver failed\n"); -+ return ret; -+ } -+ ret = execute_command("modprobe page_eject 2>&1"); -+ if (ret < 0) { -+ log(LOG_ERROR, "load page driver failed\n"); -+ return ret; -+ } -+ log(LOG_INFO, "load required driver success\n"); -+ return ret; -+} -+ -+void hbm_param_init(void) -+{ -+ int ret; -+ char *env; -+ -+ env = getenv("HBM_ONLINE_REPAIR_LOG_LEVEL"); -+ ret = string2int(env, &global_level_setting); -+ if (ret < 0) { -+ global_level_setting = DEFAULT_LOG_LEVEL; -+ log(LOG_WARNING, "Get log level from config failed, set the default value %d\n", DEFAULT_LOG_LEVEL); -+ } else { -+ log(LOG_INFO, "log level: %d\n", global_level_setting); -+ } -+ -+ env = getenv("PAGE_ISOLATION_THRESHOLD"); -+ ret = string2int(env, &page_isolation_threshold); -+ if (ret < 0) { -+ page_isolation_threshold = DEFAULT_PAGE_ISOLATION_THRESHOLD; -+ log(LOG_WARNING, "Get page_isolation_threshold from config failed, set the default value %d\n", DEFAULT_PAGE_ISOLATION_THRESHOLD); -+ } else { -+ log(LOG_INFO, "page_isolation_threshold: %d\n", page_isolation_threshold); -+ } -+} -+ -+ -+int main(int argc, char *argv[]) -+{ -+ int ret; -+ -+ hbm_param_init(); -+ -+ ret = load_required_driver(); -+ if (ret < 0) { -+ log(LOG_DEBUG, "load required driver failed\n"); -+ return ret; -+ } -+ -+ struct ras_events *ras = init_trace_instance(); -+ if (!ras) -+ return -1; -+ -+ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); -+ if (ret < 0) { -+ log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); -+ free(ras); -+ return -1; -+ } -+ -+ ret = init_all_flash(); -+ if (ret < 0) { -+ log(LOG_ERROR, "flash writer init failed\n"); -+ } -+ -+ handle_ras_events(ras); -+ -+ ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 0); -+ if (ret < 0) { -+ log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); -+ } -+ -+ free(ras); -+ return ret; -+} -diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env -new file mode 100644 -index 0000000..de56079 ---- /dev/null -+++ b/src/c/hbm_online_repair/hbm_online_repair.env -@@ -0,0 +1,2 @@ -+HBM_ONLINE_REPAIR_LOG_LEVEL=1 -+PAGE_ISOLATION_THRESHOLD=128 -diff --git a/src/c/hbm_online_repair/logger.h b/src/c/hbm_online_repair/logger.h -new file mode 100644 -index 0000000..ddfa932 ---- /dev/null -+++ b/src/c/hbm_online_repair/logger.h -@@ -0,0 +1,31 @@ -+#ifndef __LOGGER_H -+#define __LOGGER_H -+ -+#define TOOL_NAME "hbm_online_repair" -+ -+#define LOG_DEBUG 0 -+#define LOG_INFO 1 -+#define LOG_WARNING 2 -+#define LOG_ERROR 3 -+ -+extern int global_level_setting; -+ -+#define log_prefix(level) \ -+ (level == LOG_DEBUG ? "DEBUG" : \ -+ level == LOG_INFO ? "INFO" : \ -+ level == LOG_WARNING ? "WARNING" : \ -+ level == LOG_ERROR ? "ERROR" : \ -+ "UNKNOWN_LEVEL") -+ -+#define log_fd(level) \ -+ (level == LOG_ERROR ? stderr : stdout) -+ -+#define log(level, fmt, args...) do {\ -+ if (level >= global_level_setting) {\ -+ fprintf(log_fd(level), "[%s] %s: ", log_prefix(level), TOOL_NAME);\ -+ fprintf(log_fd(level), fmt, ##args);\ -+ fflush(log_fd(level));\ -+ }\ -+} while (0) -+ -+#endif -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c -new file mode 100644 -index 0000000..b175e14 ---- /dev/null -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c -@@ -0,0 +1,799 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "logger.h" -+#include "non-standard-hbm-repair.h" -+ -+extern int page_isolation_threshold; -+size_t total_size = 0; -+struct hisi_common_error_section { -+ uint32_t val_bits; -+ uint8_t version; -+ uint8_t soc_id; -+ uint8_t socket_id; -+ uint8_t totem_id; -+ uint8_t nimbus_id; -+ uint8_t subsystem_id; -+ uint8_t module_id; -+ uint8_t submodule_id; -+ uint8_t core_id; -+ uint8_t port_id; -+ uint16_t err_type; -+ struct { -+ uint8_t function; -+ uint8_t device; -+ uint16_t segment; -+ uint8_t bus; -+ uint8_t reserved[3]; -+ } pcie_info; -+ uint8_t err_severity; -+ uint8_t reserved[3]; -+ uint32_t reg_array_size; -+ uint32_t reg_array[]; -+}; -+ -+struct fault_addr_info { -+ uint32_t processer_id; -+ uint32_t die_id; -+ uint32_t stack_id; -+ uint32_t sid; -+ uint32_t channel_id; -+ uint32_t bankgroup_id; -+ uint32_t bank_id; -+ uint32_t row_id; -+ uint32_t column_id; -+ uint32_t error_type; -+ uint32_t repair_type; -+ uint32_t reserved; -+ uint32_t crc8; -+}; -+ -+typedef struct { -+ const char *VariableName; -+ const char *VendorGuid; -+ uint32_t DataSize; -+ uint8_t *Data; -+ uint32_t Attributes; -+} efi_variable_t; -+ -+char* flash_names[FLASH_ENTRY_NUM] = { -+ "repair0000", -+ "repair0001", -+ "repair0100", -+ "repair0101", -+ "repair0200", -+ "repair0201", -+ "repair0300", -+ "repair0301", -+}; -+char *flash_guids[FLASH_ENTRY_NUM] = { -+ "CD2FF4D9-D937-4e1d-B810-A1A568C37C01", -+ "DD92CC91-43E6-4c69-A42A-B08F72FCB157", -+ "4A8E0D1E-4CFA-47b2-9359-DA3A0006878B", -+ "733F9979-4ED4-478d-BD6A-E4D0F0390FDB", -+ "9BFBBA1F-5A93-4d36-AD47-D3C2D714D914", -+ "A0920D6F-78B8-4c09-9F61-7CEC845F116C", -+ "0049CE5E-8C18-414c-BDC1-A87E60CEEFD7", -+ "6AED17B4-50C7-4a40-A5A7-48AF55DD8EAC" -+}; -+ -+static int get_guid_index(uint32_t socket_id, uint32_t error_type) { -+ if (2 * socket_id + error_type >= FLASH_ENTRY_NUM) -+ return -1; -+ return 2 * socket_id + error_type; -+} -+ -+static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned long long fault_addr) -+{ -+ info_struct->processer_id = fault_addr & FAULT_ADDR_PROCESSOR_ID_MASK; -+ fault_addr >>= FAULT_ADDR_PROCESSOR_ID_LEN; -+ info_struct->die_id = fault_addr & FAULT_ADDR_DIE_ID_MASK; -+ fault_addr >>= FAULT_ADDR_DIE_ID_LEN; -+ info_struct->stack_id = fault_addr & FAULT_ADDR_STACK_ID_MASK; -+ fault_addr >>= FAULT_ADDR_STACK_ID_LEN; -+ info_struct->sid = fault_addr & FAULT_ADDR_SID_MASK; -+ fault_addr >>= FAULT_ADDR_SID_LEN; -+ info_struct->channel_id = fault_addr & FAULT_ADDR_CHANNEL_ID_MASK; -+ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; -+ info_struct->bankgroup_id = fault_addr & FAULT_ADDR_BANKGROUP_ID_MASK; -+ fault_addr >>= FAULT_ADDR_BANKGROUP_ID_LEN; -+ info_struct->bank_id = fault_addr & FAULT_ADDR_BANK_ID_MASK; -+ fault_addr >>= FAULT_ADDR_BANK_ID_LEN; -+ info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; -+ fault_addr >>= FAULT_ADDR_ROW_ID_LEN; -+ info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK; -+ fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; -+ info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; -+ fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; -+ info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; -+ fault_addr >>= FAULT_ADDR_REPAIR_TYPE_LEN; -+ info_struct->reserved = fault_addr & FAULT_ADDR_RESERVED_MASK; -+ fault_addr >>= FAULT_ADDR_RESERVED_LEN; -+ info_struct->crc8 = (uint32_t)fault_addr; -+} -+ -+static bool variable_existed(char *name, char *guid) -+{ -+ char filename[PATH_MAX]; -+ int fd; -+ -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ // open var file -+ fd = open(filename, O_RDONLY); -+ if (fd < 0) { -+ log(LOG_WARNING, "open file %s failed\n", filename); -+ return false; -+ } -+ close(fd); -+ return true; -+} -+ -+static uint32_t read_variable_attribute(char *name, char *guid) { -+ char filename[PATH_MAX]; -+ int fd; -+ size_t readsize; -+ uint32_t attribute = (uint32_t)-1; -+ -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ // open var file -+ fd = open(filename, O_RDONLY); -+ if (fd < 0) { -+ log(LOG_ERROR, "open %s failed\n", filename); -+ return attribute; -+ } -+ -+ // read attributes from first 4 bytes -+ readsize = read(fd, &attribute, sizeof(uint32_t)); -+ if (readsize != sizeof(uint32_t)) { -+ log(LOG_ERROR, "read attribute of %s failed\n", filename); -+ } -+ -+ close(fd); -+ return attribute; -+} -+ -+static int efivarfs_set_mutable(char *name, char *guid, bool mutable) -+{ -+ unsigned long orig_attrs, new_attrs; -+ char filename[PATH_MAX]; -+ int fd; -+ -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ fd = open(filename, O_RDONLY); -+ if (fd < 0) { -+ log(LOG_ERROR, "open %s failed\n", filename); -+ goto err; -+ } -+ -+ if (ioctl(fd, FS_IOC_GETFLAGS, &orig_attrs) == -1) { -+ log(LOG_ERROR, "ioctl FS_IOC_GETFLAGS failed\n"); -+ goto err; -+ } -+ -+ if (mutable) -+ new_attrs = orig_attrs & ~(unsigned long)FS_IMMUTABLE_FL; -+ else -+ new_attrs = orig_attrs | FS_IMMUTABLE_FL; -+ -+ if (new_attrs == orig_attrs) { -+ close(fd); -+ return 0; -+ } -+ -+ if (ioctl(fd, FS_IOC_SETFLAGS, &new_attrs) == -1) { -+ log(LOG_ERROR, "ioctl FS_IOC_SETFLAGS failed\n"); -+ goto err; -+ } -+ close(fd); -+ return 0; -+err: -+ if (fd >= 0) -+ close(fd); -+ return -1; -+} -+ -+static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { -+ int fd, mode; -+ size_t writesize; -+ void *buffer; -+ unsigned long total; -+ char filename[PATH_MAX]; -+ -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ // prepare attributes(size 4 bytes) and data -+ total = size + sizeof(uint32_t); -+ buffer = malloc(total); -+ if (buffer == NULL) { -+ log(LOG_ERROR, "malloc data for %s failed\n", filename); -+ goto err; -+ } -+ memcpy(buffer, &attribute, sizeof(uint32_t)); -+ memcpy(buffer + sizeof(uint32_t), value, size); -+ -+ // change attr -+ if (efivarfs_set_mutable(name, guid, 1) != 0) { -+ log(LOG_ERROR, "set mutable for %s failed\n", filename); -+ goto err; -+ } -+ -+ mode = O_WRONLY; -+ if (attribute & EFI_VARIABLE_APPEND_WRITE) -+ mode |= O_APPEND; -+ else -+ mode |= O_CREAT; -+ -+ // open var file -+ fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); -+ if (fd < 0) { -+ log(LOG_ERROR, "open %s failed\n", filename); -+ goto err; -+ } -+ -+ // write to var file -+ writesize = write(fd, buffer, total); -+ if (writesize != total) { -+ log(LOG_ERROR, "write %s failed\n", filename); -+ goto err; -+ } -+ -+ close(fd); -+ free(buffer); -+ if (efivarfs_set_mutable(name, guid, 0) != 0) { -+ log(LOG_ERROR, "set immutable for %s failed\n", filename); -+ } -+ return 0; -+err: -+ if (fd >= 0) -+ close(fd); -+ if (buffer) -+ free(buffer); -+ if (efivarfs_set_mutable(name, guid, 0) != 0) { -+ log(LOG_ERROR, "set immutable for %s failed\n", filename); -+ } -+ return -1; -+} -+ -+static int append_variable(char *name, char *guid, void *data, unsigned long size) { -+ // prepare append attribute -+ uint32_t attribute = read_variable_attribute(name, guid); -+ if (attribute == (uint32_t)-1) { -+ log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); -+ return -1; -+ } -+ attribute |= EFI_VARIABLE_APPEND_WRITE; -+ -+ return write_variable(name, guid, data, size, attribute); -+} -+ -+static size_t get_var_size(char *name, char *guid) { -+ char filename[PATH_MAX]; -+ int fd; -+ struct stat stat; -+ -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ // open var file -+ fd = open(filename, O_RDONLY); -+ if (fd < 0) { -+ log(LOG_WARNING, "open %s failed\n", filename); -+ goto err; -+ } -+ // read stat -+ if (fstat(fd, &stat) != 0) { -+ log(LOG_WARNING, "fstat %s failed\n", filename); -+ goto err; -+ } -+ close(fd); -+ return stat.st_size; -+err: -+ if (fd >= 0) -+ close(fd); -+ return (size_t)-1; -+} -+ -+int init_all_flash() { -+ for (int i = 0; i < FLASH_ENTRY_NUM; i++) { -+ // check existed entry -+ if (variable_existed(flash_names[i], flash_guids[i])) { -+ total_size += get_var_size(flash_names[i], flash_guids[i]); -+ continue; -+ } -+ // create new entry -+ uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | -+ EFI_VARIABLE_BOOTSERVICE_ACCESS | -+ EFI_VARIABLE_RUNTIME_ACCESS; -+ char *data = ""; -+ unsigned long size = 1; -+ int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); -+ if (ret) { -+ log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); -+ return -1; -+ } -+ total_size += sizeof(uint32_t) + 1; -+ } -+ // check total entry size -+ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", -+ total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); -+ if (total_size > MAX_VAR_SIZE) { -+ log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); -+ } -+ return 0; -+} -+ -+static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { -+ int ret, guid_index; -+ uint32_t reg_size; -+ uint64_t fault_addr; -+ -+ // check flash usage threshold -+ if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { -+ log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); -+ return -1; -+ } -+ -+ // parse physical addr -+ reg_size = err->reg_array_size / sizeof(uint32_t); -+ fault_addr = err->reg_array[reg_size - 1]; -+ fault_addr <<= TYPE_UINT32_WIDTH; -+ fault_addr += err->reg_array[reg_size - 2]; -+ -+ // get guid -+ struct fault_addr_info info_struct; -+ parse_fault_addr_info(&info_struct, fault_addr); -+ guid_index = get_guid_index(info_struct.processer_id, info_struct.error_type); -+ if (guid_index < 0) { -+ log(LOG_ERROR, "invalid fault info\n"); -+ return -1; -+ } -+ // record physical addr in flash -+ ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); -+ if (ret < 0) { -+ log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); -+ return -1; -+ } -+ total_size += sizeof(uint64_t); -+ log(LOG_INFO, "write hbm fault info to flash success\n"); -+ return 0; -+} -+ -+static int write_file(char *path, const char *name, unsigned long long value) -+{ -+ char fname[MAX_PATH]; -+ char buf[20]; -+ int ret; -+ int fd; -+ -+ snprintf(fname, MAX_PATH, "%s/%s", path, name); -+ -+ fd = open(fname, O_WRONLY); -+ if (fd < 0) { -+ log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", -+ fname, strerror(errno)); -+ return -errno; -+ } -+ -+ snprintf(buf, sizeof(buf), "0x%llx\n", value); -+ ret = write(fd, buf, strlen(buf)); -+ if (ret <= 0) -+ log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", -+ fname, value, strerror(errno)); -+ -+ close(fd); -+ return ret > 0 ? 0 : -errno; -+} -+ -+static int get_hardware_corrupted_size() -+{ -+ FILE *fp; -+ char line[256]; -+ int hardware_corrupted_size = -1; -+ char *key = "HardwareCorrupted:"; -+ -+ fp = fopen("/proc/meminfo", "r"); -+ if (fp == NULL) { -+ log(LOG_ERROR, "Failed to open /proc/meminfo\n"); -+ return -1; -+ } -+ -+ while (fgets(line, sizeof(line), fp) != NULL) { -+ char *pos; -+ if ((pos = strstr(line, key)) != NULL) { -+ sscanf(pos, "HardwareCorrupted: %5d kB\n", &hardware_corrupted_size); -+ break; -+ } -+ } -+ -+ fclose(fp); -+ return hardware_corrupted_size; -+} -+ -+static uint8_t get_repair_result_code(int ret) -+{ -+ if (ret == -ENOSPC) { -+ return REPAIR_FAILED_NO_RESOURCE; -+ } else if (ret == -EIO) { -+ return REPAIR_FAILED_OTHER_REASON; -+ } else if (ret == -ENXIO || ret == -EINVAL) { -+ return REPAIR_FAILED_INVALID_PARAM; -+ } -+ return REPAIR_FAILED_OTHER_REASON; -+} -+ -+static int notice_BMC(const struct hisi_common_error_section *err, uint8_t repair_result_code) -+{ -+ int sockfd; -+ struct sockaddr_un addr; -+ char bmc_msg[sizeof(BMC_REPORT_FORMAT)] = {0}; -+ uint8_t repair_type_code, isolation_type_code; -+ uint32_t repair_type; -+ unsigned long long fault_addr; -+ -+ sockfd = socket(AF_UNIX, SOCK_STREAM, 0); -+ if (sockfd < 0) { -+ log(LOG_ERROR, "Failed to create BMC notice socket\n"); -+ return -1; -+ } -+ -+ memset(&addr, 0, sizeof(struct sockaddr_un)); -+ addr.sun_family = AF_UNIX; -+ strncpy(addr.sun_path, BMC_SOCKET_PATH, sizeof(addr.sun_path) - 1); -+ if (connect(sockfd, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { -+ log(LOG_ERROR, "Failed to connect BMC notice socket\n"); -+ close(sockfd); -+ return -1; -+ } -+ -+ /* assemble bmc specific msg */ -+ repair_type_code = 0; -+ isolation_type_code = 0; -+ repair_type = err->reg_array[HBM_REPAIR_REQ_TYPE]; -+ if (repair_type & HBM_CE_ACLS) { -+ repair_type_code = 0; -+ isolation_type_code = SINGLE_ADDR_FAULT; -+ } else if (repair_type & HBM_PSUE_ACLS) { -+ repair_type_code = 1; -+ isolation_type_code = SINGLE_ADDR_FAULT; -+ } else if (repair_type & HBM_CE_SPPR) { -+ repair_type_code = 2; -+ isolation_type_code = ROW_FAULT; -+ } else if (repair_type & HBM_PSUE_SPPR) { -+ repair_type_code = 3; -+ isolation_type_code = ROW_FAULT; -+ } -+ -+ const uint32_t reg_size = err->reg_array_size / sizeof(uint32_t); -+ -+ fault_addr = err->reg_array[reg_size - 1]; -+ fault_addr <<= TYPE_UINT32_WIDTH; -+ fault_addr += err->reg_array[reg_size - 2]; -+ -+ log(LOG_DEBUG, "Get the fault addr is %llu\n", fault_addr); -+ -+ struct fault_addr_info info_struct; -+ parse_fault_addr_info(&info_struct, fault_addr); -+ -+ log(LOG_DEBUG, "info_struct.processer_id is %u\n", info_struct.processer_id); -+ log(LOG_DEBUG, "info_struct.die_id is %u\n", info_struct.die_id); -+ log(LOG_DEBUG, "info_struct.stack_id is %u\n", info_struct.stack_id); -+ log(LOG_DEBUG, "info_struct.sid is %u\n", info_struct.sid); -+ log(LOG_DEBUG, "info_struct.channel_id is %u\n", info_struct.channel_id); -+ log(LOG_DEBUG, "info_struct.bankgroup_id is %u\n", info_struct.bankgroup_id); -+ log(LOG_DEBUG, "info_struct.bank_id is %u\n", info_struct.bank_id); -+ log(LOG_DEBUG, "info_struct.row_id is %u\n", info_struct.row_id); -+ log(LOG_DEBUG, "info_struct.column_id is %u\n", info_struct.column_id); -+ log(LOG_DEBUG, "info_struct.error_type is %u\n", info_struct.error_type); -+ log(LOG_DEBUG, "info_struct.repair_type is %u\n", info_struct.repair_type); -+ log(LOG_DEBUG, "info_struct.reserved is %u\n", info_struct.reserved); -+ log(LOG_DEBUG, "info_struct.crc8 is %u\n", info_struct.crc8); -+ -+ snprintf(bmc_msg, sizeof(BMC_REPORT_FORMAT), BMC_REPORT_FORMAT, -+ repair_type_code, -+ repair_result_code, -+ isolation_type_code, -+ info_struct.processer_id, -+ info_struct.die_id, -+ info_struct.stack_id, -+ info_struct.sid, -+ info_struct.channel_id, -+ info_struct.bankgroup_id, -+ info_struct.bank_id, -+ info_struct.row_id, -+ info_struct.column_id -+ ); -+ -+ log(LOG_DEBUG, "Send msg to sysSentry, bmc msg is %s\n", bmc_msg); -+ -+ if (write(sockfd, bmc_msg, strlen(bmc_msg)) <= 0) { -+ log(LOG_ERROR, "Failed to send data to BMC notice socket\n"); -+ close(sockfd); -+ return -1; -+ } -+ -+ close(sockfd); -+ return 0; -+} -+ -+static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) -+{ -+ unsigned long long paddr; -+ int ret; -+ bool is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & (HBM_CE_ACLS | HBM_PSUE_ACLS); -+ int required_isolate_size = (is_acls ? HBM_ACLS_ADDR_NUM : HBM_SPPR_ADDR_NUM) * DEFAULT_PAGE_SIZE_KB; -+ int hardware_corrupted_size = get_hardware_corrupted_size(); -+ if (hardware_corrupted_size < 0) { -+ log(LOG_ERROR, "Page isolate failed: Get hardware_corrupted_size failed"); -+ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); -+ return -1; -+ } -+ if ((required_isolate_size + hardware_corrupted_size) > page_isolation_threshold) { -+ log(LOG_INFO, "Page isolate failed: the isolation resource is not enough\n"); -+ notice_BMC(err, ISOLATE_FAILED_OVER_THRESHOLD); -+ return -1; -+ } -+ if (is_acls) { -+ /* ACLS */ -+ paddr = err->reg_array[HBM_ADDH]; -+ paddr <<= TYPE_UINT32_WIDTH; -+ paddr += err->reg_array[HBM_ADDL]; -+ -+ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); -+ if (ret < 0) { -+ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); -+ log(LOG_WARNING, "HBM: ACLS offline failed, address is 0x%llx \n", paddr); -+ return ret; -+ } -+ } else { -+ /* SPPR */ -+ bool all_success = true; -+ uint32_t i; -+ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { -+ paddr = err->reg_array[2 * i + HBM_ADDH]; -+ paddr <<= TYPE_UINT32_WIDTH; -+ paddr += err->reg_array[2 * i + HBM_ADDL]; -+ ret = write_file("/sys/kernel/page_eject", "offline_page", paddr); -+ if (ret < 0) { -+ all_success = false; -+ log(LOG_WARNING, "HBM: SPPR offline failed, address is 0x%llx \n", paddr); -+ continue; -+ } -+ } -+ if (!all_success) { -+ notice_BMC(err, ISOLATE_FAILED_OTHER_REASON); -+ ret = -1; -+ } -+ } -+ return ret < 0 ? ret : 0; -+} -+ -+static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) -+{ -+ int ret; -+ if (repair_ret < 0) { -+ log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); -+ /* not much we can do about errors here */ -+ (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); -+ return get_repair_result_code(repair_ret); -+ } -+ -+ ret = write_file("/sys/kernel/page_eject", "online_page", paddr); -+ if (ret < 0) { -+ log(LOG_WARNING, "HBM %s: Page (0x%llx) online failed\n",is_acls ? "ACLS" : "SPPR", paddr); -+ return ONLINE_PAGE_FAILED; -+ } else { -+ log(LOG_INFO, "HBM %s: Page (0x%llx) repair and online success\n",is_acls ? "ACLS" : "SPPR", paddr); -+ return ISOLATE_REPAIR_ONLINE_SUCCESS; -+ } -+} -+ -+static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) -+{ -+ unsigned long long paddr; -+ int ret; -+ uint8_t repair_result_code; -+ bool is_acls; -+ -+ /* Both ACLS and SPPR only repair the first address */ -+ paddr = err->reg_array[HBM_ADDH]; -+ paddr <<= TYPE_UINT32_WIDTH; -+ paddr += err->reg_array[HBM_ADDL]; -+ -+ is_acls = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_CE_ACLS || -+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; -+ -+ ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); -+ if (ret < 0) { -+ notice_BMC(err, get_repair_result_code(ret)); -+ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); -+ return ret; -+ } -+ -+ ret = write_file(path, is_acls ? "acls_repair" : "sppr_repair", paddr); -+ -+ if (is_acls) { -+ /* ACLS */ -+ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); -+ notice_BMC(err, repair_result_code); -+ return ret; -+ } else { -+ /* SPPR */ -+ bool all_online_success = true; -+ uint32_t i; -+ for (i = 0; i < HBM_SPPR_ADDR_NUM; i++) { -+ paddr = err->reg_array[2 * i + HBM_ADDH]; -+ paddr <<= TYPE_UINT32_WIDTH; -+ paddr += err->reg_array[2 * i + HBM_ADDL]; -+ -+ repair_result_code = hbmc_hbm_after_repair(is_acls, ret, paddr); -+ if (repair_result_code != ISOLATE_REPAIR_ONLINE_SUCCESS) { -+ all_online_success = false; -+ } -+ } -+ if (ret < 0) { -+ notice_BMC(err, get_repair_result_code(ret)); -+ return ret; -+ } else if (all_online_success) { -+ notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); -+ return 0; -+ } else { -+ notice_BMC(err, ONLINE_PAGE_FAILED); -+ return ret; -+ } -+ } -+ /* The final return code is not necessary */ -+ return ret < 0 ? ret : 0; -+} -+ -+static int hbmc_get_memory_type(char *path) -+{ -+ int type = HBM_UNKNOWN; -+ char fname[MAX_PATH]; -+ char buf[128]; -+ FILE *file; -+ -+ snprintf(fname, MAX_PATH, "%s/%s", path, "memory_type"); -+ file = fopen(fname, "r"); -+ if (!file) { -+ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", -+ fname, strerror(errno)); -+ return -errno; -+ } -+ -+ if (!fgets(buf, sizeof(buf), file)) { -+ log(LOG_WARNING, "HBM: Failed to read %s\n", fname); -+ goto err; -+ } -+ -+ /* Remove the last '\n' */ -+ buf[strlen(buf) - 1] = 0; -+ -+ if (strcmp(buf, "HBM") == 0) -+ type = HBM_HBM_MEMORY; -+ else if (strcmp(buf, "DDR") == 0) -+ type = HBM_DDR_MEMORY; -+ -+err: -+ fclose(file); -+ return type; -+} -+ -+static void hbm_repair_handler(const struct hisi_common_error_section *err) -+{ -+ log(LOG_DEBUG, "Received ACLS/SPPR flat mode repair request, try to repair\n"); -+ char *sys_dev_path = "/sys/devices/platform"; -+ char path[MAX_PATH]; -+ struct dirent *dent; -+ DIR *dir; -+ int ret; -+ bool find_device = false, find_hbm_mem = false; -+ -+ ret = hbmc_hbm_page_isolate(err); -+ if (ret < 0) { -+ return; -+ } -+ -+ dir = opendir(sys_dev_path); -+ if (!dir) { -+ log(LOG_WARNING, "Can't read '%s': %s\n", -+ sys_dev_path, strerror(errno)); -+ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); -+ return; -+ } -+ -+ while ((dent = readdir(dir))) { -+ if (!strstr(dent->d_name, HBM_MEM_RAS_NAME)) -+ continue; -+ find_device = true; -+ -+ snprintf(path, MAX_PATH, "%s/%s", sys_dev_path, dent->d_name); -+ -+ if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { -+ find_hbm_mem = true; -+ ret = hbmc_hbm_repair(err, path); -+ if (ret != -ENXIO) -+ break; -+ } -+ } -+ if (!find_device) { -+ log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", -+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); -+ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); -+ } else if (!find_hbm_mem) { -+ log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", -+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); -+ notice_BMC(err, REPAIR_FAILED_OTHER_REASON); -+ } -+ -+ closedir(dir); -+} -+ -+static bool hbm_repair_validate(const struct hisi_common_error_section *err) -+{ -+ if (!((err->val_bits & BIT(COMMON_VALID_MODULE_ID)) && -+ (err->val_bits & BIT(COMMON_VALID_SUBMODULE_ID)) && -+ (err->val_bits & BIT(COMMON_VALID_REG_ARRAY_SIZE)) -+ )) { -+ log(LOG_DEBUG, "Err val_bits validate failed, val_bits is %u\n", err->val_bits); -+ return false; -+ } -+ log(LOG_DEBUG, "err->module_id: %u\n", err->module_id); -+ log(LOG_DEBUG, "err->submodule_id: %u\n", err->submodule_id); -+ log(LOG_DEBUG, "err->val_bits: 0x%x\n", err->val_bits); -+ log(LOG_DEBUG, "err->reg_array_size: %u\n", err->reg_array_size); -+ -+ if (err->module_id != HBMC_MODULE_ID || -+ err->submodule_id != HBMC_SUBMOD_HBM_REPAIR) { -+ log(LOG_DEBUG, "err module_id or sub_module id doesn't not match\n"); -+ return false; -+ } -+ -+ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; -+ bool is_acls_valid = (hbm_repair_reg_type & (HBM_CE_ACLS | HBM_PSUE_ACLS)) && -+ (err->reg_array_size == HBM_ACLS_ARRAY_SIZE); -+ bool is_sppr_valid = (hbm_repair_reg_type & (HBM_CE_SPPR | HBM_PSUE_SPPR)) && -+ (err->reg_array_size == HBM_SPPR_ARRAY_SIZE); -+ bool is_cache_mode = (hbm_repair_reg_type & HBM_CACHE_MODE) && -+ (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); -+ -+ if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { -+ log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", -+ hbm_repair_reg_type, err->reg_array_size); -+ return false; -+ } -+ -+ log(LOG_INFO, "Received ACLS/SPPR repair request\n"); -+ return true; -+} -+ -+static bool hbm_flat_mode_validate(const struct hisi_common_error_section *err) -+{ -+ uint32_t hbm_repair_reg_type = err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK; -+ return !(hbm_repair_reg_type & HBM_CACHE_MODE); -+} -+ -+int decode_hisi_common_section(struct ras_non_standard_event *event) -+{ -+ const struct hisi_common_error_section *err = (struct hisi_common_error_section *)event->error; -+ -+ if (hbm_repair_validate(err)) { -+ write_fault_info_to_flash(err); -+ if (hbm_flat_mode_validate(err)) { -+ hbm_repair_handler(err); -+ } -+ } -+ -+ return 0; -+} -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h -new file mode 100644 -index 0000000..7e8e448 ---- /dev/null -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h -@@ -0,0 +1,89 @@ -+#ifndef __NON_STANDARD_HBM_REPAIR -+#define __NON_STANDARD_HBM_REPAIR -+ -+#include "ras-non-standard-handler.h" -+ -+#define DEFAULT_PAGE_SIZE_KB 4 -+#define HBM_MEM_RAS_NAME "HISI0521" -+#define HBM_UNKNOWN 0 -+#define HBM_HBM_MEMORY 1 -+#define HBM_DDR_MEMORY 2 -+ -+#define TYPE_UINT32_WIDTH 32 -+#define HBM_REPAIR_REQ_TYPE 0 -+#define HBM_CE_ACLS BIT(0) -+#define HBM_PSUE_ACLS BIT(1) -+#define HBM_CE_SPPR BIT(2) -+#define HBM_PSUE_SPPR BIT(3) -+#define HBM_CACHE_MODE (BIT(4) | BIT(5) | BIT(6) | BIT(7)) -+#define HBM_ERROR_MASK 0b11111111 -+#define HBM_ADDL 1 -+#define HBM_ADDH 2 -+#define HBM_ERROR_TYPE_SIZE 4 -+#define HBM_ADDR_SIZE 8 -+#define HBM_ACLS_ADDR_NUM 1 -+#define HBM_SPPR_ADDR_NUM 16 -+#define HBM_ACLS_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_ACLS_ADDR_NUM + HBM_ADDR_SIZE) -+#define HBM_SPPR_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE * HBM_SPPR_ADDR_NUM + HBM_ADDR_SIZE) -+#define HBM_CACHE_ARRAY_SIZE (HBM_ERROR_TYPE_SIZE + HBM_ADDR_SIZE) -+#define HBMC_MODULE_ID 0x28 -+#define HBMC_SUBMOD_HBM_REPAIR 6 -+#define COMMON_VALID_MODULE_ID 5 -+#define COMMON_VALID_SUBMODULE_ID 6 -+#define COMMON_VALID_REG_ARRAY_SIZE 12 -+ -+#define BMC_SOCKET_PATH "/var/run/sysSentry/bmc.sock" -+#define BMC_REPORT_FORMAT "REP00%02x%02x%02x0000000000000000%02x%02x%02x00%02x00%02x%02x%02x%08x%08x0000000000" -+ -+#define ISOLATE_FAILED_OVER_THRESHOLD 0b10000001 -+#define ISOLATE_FAILED_OTHER_REASON 0b10000010 -+#define REPAIR_FAILED_NO_RESOURCE 0b10010100 -+#define REPAIR_FAILED_INVALID_PARAM 0b10011000 -+#define REPAIR_FAILED_OTHER_REASON 0b10011100 -+#define ONLINE_PAGE_FAILED 0b10100000 -+#define ISOLATE_REPAIR_ONLINE_SUCCESS 0b00000000 -+ -+#define ROW_FAULT 1 -+#define SINGLE_ADDR_FAULT 6 -+ -+#define FAULT_ADDR_PROCESSOR_ID_LEN 2 -+#define FAULT_ADDR_DIE_ID_LEN 1 -+#define FAULT_ADDR_STACK_ID_LEN 3 -+#define FAULT_ADDR_SID_LEN 3 -+#define FAULT_ADDR_CHANNEL_ID_LEN 8 -+#define FAULT_ADDR_BANKGROUP_ID_LEN 3 -+#define FAULT_ADDR_BANK_ID_LEN 3 -+#define FAULT_ADDR_ROW_ID_LEN 17 -+#define FAULT_ADDR_COLUMN_ID_LEN 10 -+#define FAULT_ADDR_ERROR_TYPE_LEN 2 -+#define FAULT_ADDR_REPAIR_TYPE_LEN 2 -+#define FAULT_ADDR_RESERVED_LEN 2 -+#define FAULT_ADDR_CRC8_LEN 8 -+ -+#define FAULT_ADDR_PROCESSOR_ID_MASK ((1 << FAULT_ADDR_PROCESSOR_ID_LEN ) - 1) -+#define FAULT_ADDR_DIE_ID_MASK ((1 << FAULT_ADDR_DIE_ID_LEN ) - 1) -+#define FAULT_ADDR_STACK_ID_MASK ((1 << FAULT_ADDR_STACK_ID_LEN ) - 1) -+#define FAULT_ADDR_SID_MASK ((1 << FAULT_ADDR_SID_LEN ) - 1) -+#define FAULT_ADDR_CHANNEL_ID_MASK ((1 << FAULT_ADDR_CHANNEL_ID_LEN ) - 1) -+#define FAULT_ADDR_BANKGROUP_ID_MASK ((1 << FAULT_ADDR_BANKGROUP_ID_LEN ) - 1) -+#define FAULT_ADDR_BANK_ID_MASK ((1 << FAULT_ADDR_BANK_ID_LEN ) - 1) -+#define FAULT_ADDR_ROW_ID_MASK ((1 << FAULT_ADDR_ROW_ID_LEN ) - 1) -+#define FAULT_ADDR_COLUMN_ID_MASK ((1 << FAULT_ADDR_COLUMN_ID_LEN ) - 1) -+#define FAULT_ADDR_ERROR_TYPE_MASK ((1 << FAULT_ADDR_ERROR_TYPE_LEN ) - 1) -+#define FAULT_ADDR_REPAIR_TYPE_MASK ((1 << FAULT_ADDR_REPAIR_TYPE_LEN ) - 1) -+#define FAULT_ADDR_RESERVED_MASK ((1 << FAULT_ADDR_RESERVED_LEN ) - 1) -+#define FAULT_ADDR_CRC8_MASK ((1 << FAULT_ADDR_CRC8_LEN ) - 1) -+ -+#define EFI_VARIABLE_NON_VOLATILE 0x1 -+#define EFI_VARIABLE_BOOTSERVICE_ACCESS 0x2 -+#define EFI_VARIABLE_RUNTIME_ACCESS 0x4 -+#define EFI_VARIABLE_APPEND_WRITE 0x40 -+ -+#define EFIVARFS_PATH "/sys/firmware/efi/efivars" -+#define MAX_VAR_SIZE (128 * 1024) -+#define FLASH_ENTRY_NUM 8 -+#define KB_SIZE 1024 -+ -+extern int init_all_flash(); -+ -+#endif -diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c -new file mode 100644 -index 0000000..0b12329 ---- /dev/null -+++ b/src/c/hbm_online_repair/ras-events.c -@@ -0,0 +1,534 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include "ras-non-standard-handler.h" -+#include "logger.h" -+ -+/* -+ * Polling time, if read() doesn't block. Currently, trace_pipe_raw never -+ * blocks on read(). So, we need to sleep for a while, to avoid spending -+ * too much CPU cycles. A fix for it is expected for 3.10. -+ */ -+#define POLLING_TIME 3 -+ -+/* Test for a little-endian machine */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ #define ENDIAN KBUFFER_ENDIAN_LITTLE -+#else -+ #define ENDIAN KBUFFER_ENDIAN_BIG -+#endif -+ -+static int get_debugfs_dir(char *debugfs_dir, size_t len) -+{ -+ FILE *fp; -+ char line[MAX_PATH + 1 + 256]; -+ -+ fp = fopen("/proc/mounts","r"); -+ if (!fp) { -+ log(LOG_INFO, "Can't open /proc/mounts"); -+ return errno; -+ } -+ -+ do { -+ char *p, *type, *dir; -+ if (!fgets(line, sizeof(line), fp)) -+ break; -+ -+ p = strtok(line, " \t"); -+ if (!p) -+ break; -+ -+ dir = strtok(NULL, " \t"); -+ if (!dir) -+ break; -+ -+ type = strtok(NULL, " \t"); -+ if (!type) -+ break; -+ -+ if (!strcmp(type, "debugfs")) { -+ fclose(fp); -+ strncpy(debugfs_dir, dir, len - 1); -+ debugfs_dir[len - 1] = '\0'; -+ return 0; -+ } -+ } while(1); -+ -+ fclose(fp); -+ log(LOG_INFO, "Can't find debugfs\n"); -+ return ENOENT; -+} -+ -+ -+static int open_trace(char *trace_dir, char *name, int flags) -+{ -+ int ret; -+ char fname[MAX_PATH + 1]; -+ -+ strcpy(fname, trace_dir); -+ strcat(fname, "/"); -+ strcat(fname, name); -+ -+ ret = open(fname, flags); -+ if (ret < 0) -+ log(LOG_WARNING, "open_trace() failed, fname=%s ret=%d errno=%d\n", fname, ret, errno); -+ -+ return ret; -+} -+ -+static int create_trace_instance(char *trace_instance_dir) -+{ -+ char fname[MAX_PATH + 1]; -+ int rc; -+ -+ get_debugfs_dir(fname, sizeof(fname)); -+ strcat(fname, "/tracing/instances/"TOOL_NAME); -+ rc = mkdir(fname, S_IRWXU); -+ if (rc < 0 && errno != EEXIST) { -+ log(LOG_INFO, "Unable to create " TOOL_NAME " instance at %s\n", fname); -+ return -1; -+ } -+ strcpy(trace_instance_dir, fname); -+ return 0; -+} -+ -+struct ras_events *init_trace_instance(void) -+{ -+ struct ras_events *ras = calloc(1, sizeof(*ras)); -+ if (!ras) { -+ log(LOG_ERROR, "Can't allocate memory for ras struct\n"); -+ return NULL; -+ } -+ int rc = create_trace_instance(ras->tracing); -+ if (rc < 0) { -+ free(ras); -+ return NULL; -+ } -+ return ras; -+} -+ -+/* -+ * Tracing enable/disable code -+ */ -+int toggle_ras_event(char *trace_dir, char *group, char *event, int enable) -+{ -+ int fd, rc; -+ char fname[MAX_PATH + 1]; -+ -+ snprintf(fname, sizeof(fname), "%s%s:%s\n", -+ enable ? "" : "!", -+ group, event); -+ -+ /* Enable RAS events */ -+ fd = open_trace(trace_dir, "set_event", O_RDWR | O_APPEND); -+ if (fd < 0) { -+ log(LOG_WARNING, "Can't open set_event\n"); -+ rc = -errno; -+ goto err; -+ } -+ -+ rc = write(fd, fname, strlen(fname)); -+ close(fd); -+ if (rc <= 0) { -+ log(LOG_WARNING, "Can't write to set_event\n"); -+ rc = -EIO; -+ goto err; -+ } -+ -+ log(LOG_INFO, "%s:%s event %s\n", -+ group, event, -+ enable ? "enabled" : "disabled"); -+ return 0; -+err: -+ log(LOG_ERROR, "Can't %s %s:%s tracing\n", -+ enable ? "enable" : "disable", group, event); -+ return rc; -+} -+ -+static int parse_header_page(struct ras_events *ras, struct tep_handle *pevent) -+{ -+ int fd, len, page_size = DEFAULT_PAGE_SIZE; -+ char buf[page_size]; -+ -+ fd = open_trace(ras->tracing, "events/header_page", O_RDONLY); -+ if (fd < 0) { -+ log(LOG_WARNING, "Open event header page failed\n"); -+ return -1; -+ } -+ -+ len = read(fd, buf, page_size); -+ close(fd); -+ if (len <= 0) { -+ log(LOG_WARNING, "Read event header page failed\n"); -+ return -1; -+ } -+ -+ if (tep_parse_header_page(pevent, buf, len, sizeof(long))) { -+ log(LOG_WARNING, "Parse event header page failed\n"); -+ return -1; -+ } -+ -+ return 0; -+} -+ -+static void parse_ras_data(struct pcpu_data *pdata, struct kbuffer *kbuf, -+ void *data, unsigned long long time_stamp) -+{ -+ struct tep_record record; -+ struct trace_seq s; -+ -+ record.ts = time_stamp; -+ record.size = kbuffer_event_size(kbuf); -+ record.data = data; -+ record.offset = kbuffer_curr_offset(kbuf); -+ record.cpu = pdata->cpu; -+ -+ /* note offset is just offset in subbuffer */ -+ record.missed_events = kbuffer_missed_events(kbuf); -+ record.record_size = kbuffer_curr_size(kbuf); -+ -+ trace_seq_init(&s); -+ tep_print_event(pdata->ras->pevent, &s, &record, "%s-%s-%d-%s", -+ TEP_PRINT_NAME, TEP_PRINT_COMM, TEP_PRINT_TIME, TEP_PRINT_INFO); -+ trace_seq_do_printf(&s); -+ fflush(stdout); -+ trace_seq_destroy(&s); -+} -+ -+static int get_num_cpus() -+{ -+ return sysconf(_SC_NPROCESSORS_ONLN); -+} -+ -+static int set_buffer_percent(struct ras_events *ras, int percent) -+{ -+ int res = 0; -+ int fd; -+ -+ fd = open_trace(ras->tracing, "buffer_percent", O_WRONLY); -+ if (fd >= 0) { -+ char buf[16]; -+ ssize_t size; -+ snprintf(buf, sizeof(buf), "%d", percent); -+ size = write(fd, buf, strlen(buf)); -+ if (size <= 0) { -+ log(LOG_WARNING, "can't write to buffer_percent\n"); -+ res = -1; -+ } -+ close(fd); -+ } else { -+ log(LOG_WARNING, "Can't open buffer_percent\n"); -+ res = -1; -+ } -+ -+ return res; -+} -+ -+static int read_ras_event_all_cpus(struct pcpu_data *pdata, -+ unsigned n_cpus) -+{ -+ ssize_t size; -+ unsigned long long time_stamp; -+ void *data; -+ int ready, i, count_nready; -+ struct kbuffer *kbuf; -+ void *page; -+ struct pollfd fds[n_cpus + 1]; -+ struct signalfd_siginfo fdsiginfo; -+ sigset_t mask; -+ int warnonce[n_cpus]; -+ char pipe_raw[PATH_MAX]; -+ -+ memset(&warnonce, 0, sizeof(warnonce)); -+ -+ page = malloc(pdata[0].ras->page_size); -+ if (!page) { -+ log(LOG_ERROR, "Can't allocate page\n"); -+ return -ENOMEM; -+ } -+ -+ kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, ENDIAN); -+ if (!kbuf) { -+ log(LOG_ERROR, "Can't allocate kbuf\n"); -+ free(page); -+ return -ENOMEM; -+ } -+ -+ /* Fix for poll() on the per_cpu trace_pipe and trace_pipe_raw blocks -+ * indefinitely with the default buffer_percent in the kernel trace system, -+ * which is introduced by the following change in the kernel. -+ * https://lore.kernel.org/all/20221020231427.41be3f26@gandalf.local.home/T/#u. -+ * Set buffer_percent to 0 so that poll() will return immediately -+ * when the trace data is available in the ras per_cpu trace pipe_raw -+ */ -+ if (set_buffer_percent(pdata[0].ras, 0)) -+ log(LOG_WARNING, "Set buffer_percent failed\n"); -+ -+ for (i = 0; i < (n_cpus + 1); i++) -+ fds[i].fd = -1; -+ -+ for (i = 0; i < n_cpus; i++) { -+ fds[i].events = POLLIN; -+ -+ snprintf(pipe_raw, sizeof(pipe_raw), -+ "per_cpu/cpu%d/trace_pipe_raw", i); -+ -+ fds[i].fd = open_trace(pdata[0].ras->tracing, pipe_raw, O_RDONLY); -+ if (fds[i].fd < 0) { -+ log(LOG_ERROR, "Can't open trace_pipe_raw\n"); -+ goto error; -+ } -+ } -+ -+ sigemptyset(&mask); -+ sigaddset(&mask, SIGINT); -+ sigaddset(&mask, SIGTERM); -+ sigaddset(&mask, SIGHUP); -+ sigaddset(&mask, SIGQUIT); -+ if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) -+ log(LOG_WARNING, "sigprocmask\n"); -+ fds[n_cpus].events = POLLIN; -+ fds[n_cpus].fd = signalfd(-1, &mask, 0); -+ if (fds[n_cpus].fd < 0) { -+ log(LOG_WARNING, "signalfd\n"); -+ goto error; -+ } -+ -+ log(LOG_INFO, "Listening to events for cpus 0 to %u\n", n_cpus - 1); -+ -+ do { -+ ready = poll(fds, (n_cpus + 1), -1); -+ if (ready < 0) { -+ log(LOG_WARNING, "poll\n"); -+ } -+ -+ /* check for the signal */ -+ if (fds[n_cpus].revents & POLLIN) { -+ size = read(fds[n_cpus].fd, &fdsiginfo, -+ sizeof(struct signalfd_siginfo)); -+ if (size != sizeof(struct signalfd_siginfo)) { -+ log(LOG_WARNING, "signalfd read\n"); -+ continue; -+ } -+ -+ if (fdsiginfo.ssi_signo == SIGINT || -+ fdsiginfo.ssi_signo == SIGTERM || -+ fdsiginfo.ssi_signo == SIGHUP || -+ fdsiginfo.ssi_signo == SIGQUIT) { -+ log(LOG_INFO, "Recevied signal=%d\n", -+ fdsiginfo.ssi_signo); -+ goto error; -+ } else { -+ log(LOG_INFO, -+ "Received unexpected signal=%d\n", -+ fdsiginfo.ssi_signo); -+ continue; -+ } -+ } -+ -+ count_nready = 0; -+ for (i = 0; i < n_cpus; i++) { -+ if (fds[i].revents & POLLERR) { -+ if (!warnonce[i]) { -+ log(LOG_INFO, -+ "Error on CPU %i\n", i); -+ warnonce[i]++; -+ } -+ continue; -+ } -+ if (!(fds[i].revents & POLLIN)) { -+ count_nready++; -+ continue; -+ } -+ size = read(fds[i].fd, page, pdata[i].ras->page_size); -+ if (size < 0) { -+ log(LOG_WARNING, "read\n"); -+ goto error; -+ } else if (size > 0) { -+ log(LOG_DEBUG, "cpu %d receive %ld bytes data\n", i, size); -+ kbuffer_load_subbuffer(kbuf, page); -+ -+ while ((data = kbuffer_read_event(kbuf, &time_stamp))) { -+ if (kbuffer_curr_size(kbuf) < 0) { -+ log(LOG_ERROR, "invalid kbuf data, discard\n"); -+ break; -+ } -+ -+ log(LOG_DEBUG, "parse_ras_data\n"); -+ parse_ras_data(&pdata[i], -+ kbuf, data, time_stamp); -+ -+ /* increment to read next event */ -+ log(LOG_DEBUG, "kbuffer_next_event\n"); -+ kbuffer_next_event(kbuf, NULL); -+ } -+ } else { -+ count_nready++; -+ } -+ } -+ -+ /* -+ * If count_nready == n_cpus, there is no cpu fd in POLLIN state, -+ * so we need to break the cycle -+ */ -+ if (count_nready == n_cpus) { -+ log(LOG_ERROR, "no cpu fd in POLLIN state, stop running\n"); -+ break; -+ } -+ } while (1); -+ -+error: -+ kbuffer_free(kbuf); -+ free(page); -+ sigprocmask(SIG_UNBLOCK, &mask, NULL); -+ -+ for (i = 0; i < (n_cpus + 1); i++) { -+ if (fds[i].fd > 0) -+ close(fds[i].fd); -+ } -+ -+ return -1; -+} -+ -+static int init_header_page(struct ras_events *ras, struct tep_handle *pevent) -+{ -+ int rc; -+ -+ rc = parse_header_page(ras, pevent); -+ if (rc) { -+ log(LOG_ERROR, "cannot read trace header_page: %d\n", rc); -+ return rc; -+ } -+ return 0; -+} -+ -+static int init_event_format(struct ras_events *ras, struct tep_handle *pevent, -+ char *group, char *event) -+{ -+ char *page, fname[MAX_PATH + 1]; -+ int fd, size, rc, page_size = DEFAULT_PAGE_SIZE; -+ -+ // read one page from format -+ snprintf(fname, sizeof(fname), "events/%s/%s/format", group, event); -+ fd = open_trace(ras->tracing, fname, O_RDONLY); -+ if (fd < 0) { -+ log(LOG_ERROR, -+ "Can't get %s:%s traces. Perhaps this feature is not supported on your system.\n", -+ group, event); -+ return errno; -+ } -+ -+ log(LOG_INFO, "page_size: %d\n", page_size); -+ ras->page_size = page_size; -+ page = malloc(page_size); -+ if (!page) { -+ log(LOG_ERROR, "Can't allocate page to read %s:%s format\n", -+ group, event); -+ rc = errno; -+ close(fd); -+ return rc; -+ } -+ -+ size = read(fd, page, page_size); -+ close(fd); -+ if (size < 0) { -+ log(LOG_ERROR, "Can't read format\n"); -+ free(page); -+ return size; -+ } -+ -+ // parse event format -+ rc = tep_parse_event(pevent, page, size, group); -+ if (rc) { -+ log(LOG_ERROR, "Can't parse event %s:%s\n", group, event); -+ free(page); -+ return EINVAL; -+ } -+ return 0; -+} -+ -+static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent, -+ char *group, char *event, -+ tep_event_handler_func func) -+{ -+ int rc; -+ -+ rc = init_event_format(ras, pevent, group, event); -+ if (rc) { -+ log(LOG_ERROR, "init_event_format for %s:%s failed\n", group, event); -+ return rc; -+ } -+ -+ /* Registers the special event handlers */ -+ rc = tep_register_event_handler(pevent, -1, group, event, func, ras); -+ if (rc < 0) { -+ log(LOG_ERROR, "Can't register event handler for %s:%s\n", -+ group, event); -+ return EINVAL; -+ } -+ -+ return 0; -+} -+ -+int handle_ras_events(struct ras_events *ras) -+{ -+ int rc, i; -+ unsigned cpus; -+ struct tep_handle *pevent = NULL; -+ struct pcpu_data *data = NULL; -+ -+ pevent = tep_alloc(); -+ if (!pevent) { -+ log(LOG_ERROR, "Can't allocate pevent\n"); -+ rc = errno; -+ goto err; -+ } -+ ras->pevent = pevent; -+ -+ rc = init_header_page(ras, pevent); -+ if (rc) { -+ log(LOG_ERROR, "init_header_page failed\n"); -+ goto err; -+ } -+ -+ rc = add_event_handler(ras, pevent, "ras", "non_standard_event", -+ ras_non_standard_event_handler); -+ if (rc) { -+ log(LOG_ERROR, "Can't get traces from %s:%s\n", -+ "ras", "non_standard_event"); -+ goto err; -+ } -+ log(LOG_INFO, "add_event_handler done\n"); -+ -+ cpus = get_num_cpus(); -+ data = calloc(sizeof(*data), cpus); -+ if (!data) -+ goto err; -+ -+ for (i = 0; i < cpus; i++) { -+ data[i].ras = ras; -+ data[i].cpu = i; -+ } -+ rc = read_ras_event_all_cpus(data, cpus); -+ -+err: -+ if (data) -+ free(data); -+ if (pevent) -+ tep_free(pevent); -+ return rc; -+} -diff --git a/src/c/hbm_online_repair/ras-events.h b/src/c/hbm_online_repair/ras-events.h -new file mode 100644 -index 0000000..4218d93 ---- /dev/null -+++ b/src/c/hbm_online_repair/ras-events.h -@@ -0,0 +1,28 @@ -+#ifndef __RAS_EVENTS_H -+#define __RAS_EVENTS_H -+ -+#include -+#include -+ -+#define MAX_PATH 1024 -+ -+#define DEFAULT_PAGE_SIZE 4096 -+ -+struct ras_events { -+ char tracing[MAX_PATH + 1]; -+ struct tep_handle *pevent; -+ int page_size; -+}; -+ -+struct pcpu_data { -+ struct tep_handle *pevent; -+ struct ras_events *ras; -+ int cpu; -+}; -+ -+/* Function prototypes */ -+int toggle_ras_event(char *trace_dir, char *group, char *event, int enable); -+int handle_ras_events(struct ras_events *ras); -+struct ras_events *init_trace_instance(void); -+ -+#endif -diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c -new file mode 100644 -index 0000000..1d1fd04 ---- /dev/null -+++ b/src/c/hbm_online_repair/ras-non-standard-handler.c -@@ -0,0 +1,81 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "ras-non-standard-handler.h" -+#include "logger.h" -+ -+static char *uuid_le(const char *uu) -+{ -+ static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; -+ if (!uu) { -+ log(LOG_ERROR, "uuid_le failed: uu is empty"); -+ return uuid; -+ } -+ size_t uu_len = strlen(uu); -+ if (uu_len < SECTION_TYPE_UUID_LEN) { -+ log(LOG_ERROR, "uuid_le failed: uu is too short"); -+ return uuid; -+ } -+ -+ char *p = uuid; -+ int i; -+ static const unsigned char le[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; -+ -+ for (i = 0; i < 16; i++) { -+ p += sprintf(p, "%.2x", (unsigned char) uu[le[i]]); -+ switch (i) { -+ case 3: -+ case 5: -+ case 7: -+ case 9: -+ *p++ = '-'; -+ break; -+ } -+ } -+ -+ *p = 0; -+ -+ return uuid; -+} -+ -+int ras_non_standard_event_handler(struct trace_seq *s, -+ struct tep_record *record, -+ struct tep_event *event, void *context) -+{ -+ int len; -+ unsigned long long val; -+ struct ras_non_standard_event ev; -+ -+ ev.sec_type = tep_get_field_raw(s, event, "sec_type", -+ record, &len, 1); -+ if(!ev.sec_type) { -+ log(LOG_WARNING, "get event section type failed"); -+ return -1; -+ } -+ -+ trace_seq_printf(s, "\n"); -+ trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); -+ -+ if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { -+ log(LOG_WARNING, "tep get field val failed"); -+ return -1; -+ } -+ -+ ev.length = val; -+ trace_seq_printf(s, "length: %d\n", ev.length); -+ -+ ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); -+ if(!ev.error || ev.length != len) { -+ log(LOG_WARNING, "get event error failed"); -+ return -1; -+ } -+ -+ if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { -+ decode_hisi_common_section(&ev); -+ } -+ -+ return 0; -+} -diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h -new file mode 100644 -index 0000000..0272dc1 ---- /dev/null -+++ b/src/c/hbm_online_repair/ras-non-standard-handler.h -@@ -0,0 +1,25 @@ -+#ifndef __RAS_NON_STANDARD_HANDLER_H -+#define __RAS_NON_STANDARD_HANDLER_H -+ -+#include -+#include "ras-events.h" -+ -+#define BIT(nr) (1UL << (nr)) -+ -+#define SECTION_TYPE_UUID_LEN 16 -+#define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" -+ -+struct ras_non_standard_event { -+ char timestamp[64]; -+ const char *sec_type; -+ const uint8_t *error; -+ uint32_t length; -+}; -+ -+int ras_non_standard_event_handler(struct trace_seq *s, -+ struct tep_record *record, -+ struct tep_event *event, void *context); -+ -+int decode_hisi_common_section(struct ras_non_standard_event *event); -+ -+#endif -diff --git a/src/python/.gitignore b/src/python/.gitignore -new file mode 100644 -index 0000000..58200d4 ---- /dev/null -+++ b/src/python/.gitignore -@@ -0,0 +1 @@ -+__pycache__/ -diff --git a/src/python/syssentry/bmc_alarm.py b/src/python/syssentry/bmc_alarm.py -new file mode 100644 -index 0000000..5956538 ---- /dev/null -+++ b/src/python/syssentry/bmc_alarm.py -@@ -0,0 +1,159 @@ -+import logging -+import socket -+from enum import Enum -+ -+from .utils import execute_command -+ -+HEX_CHAR_LEN = 2 -+SOCKET_RECEIVE_LEN = 128 -+BMC_DATA_HEAD = "REP" -+BMC_REPORT_TYPE_BIT = 0 -+HBMC_REPAIR_TYPE_BIT = 1 -+HBMC_REPAIR_RESULT_BIT = 2 -+HBMC_ISOLATION_TYPE_BIT = 3 -+HBMC_SEND_HEAD_LEN = 4 # "ipmtool", "raw", "0x30", "0x92" -+HBMC_SEND_ROW_BIT = 26 + HBMC_SEND_HEAD_LEN -+HBMC_SEND_COL_BIT = 30 + HBMC_SEND_HEAD_LEN -+HBMC_REPAIR_TYPE_OFFSET = 7 -+ -+HBMC_SEND_SUCCESS_CODE = "db 07 00" -+ -+ -+class ReportType(Enum): -+ HBMC_REPAIR_BMC = 0x00 -+ -+ -+class HBMCRepairType(Enum): -+ CE_ACLS = 7 -+ PS_UCE_ACLS = 8 -+ CE_SPPR = 9 -+ PS_UCE_SPPR = 10 -+ -+ -+class HBMCRepairResultType(Enum): -+ ISOLATE_FAILED_OVER_THRESHOLD = 0b10000001 -+ ISOLATE_FAILED_OTHER_REASON = 0b10000010 -+ REPAIR_FAILED_NO_RESOURCE = 0b10010100 -+ REPAIR_FAILED_INVALID_PARAM = 0b10011000 -+ REPAIR_FAILED_OTHER_REASON = 0b10011100 -+ ONLINE_PAGE_FAILED = 0b10100000 -+ ISOLATE_REPAIR_ONLINE_SUCCESS = 0b00000000 -+ -+ -+class HBMCIsolationType(Enum): -+ ROW_FAULT = 1 -+ SINGLE_ADDR_FAULT = 6 -+ -+ -+def find_value_is_in_enum(value: int, enum: Enum): -+ for item in enum: -+ if value == item.value: -+ return True -+ return False -+ -+ -+def convert_hex_char_to_int(data, bit): -+ if len(data) < (bit+1)*HEX_CHAR_LEN: -+ logging.error(f"Data {data} len is too short, current convert bit is {bit}") -+ char = data[bit*HEX_CHAR_LEN:(bit+1)*HEX_CHAR_LEN] -+ try: -+ value = int(char, 16) -+ except ValueError: -+ logging.error(f"Cannot convert char [{char}] to int") -+ raise ValueError -+ return value -+ -+ -+def reverse_byte(data): -+ return data[3], data[2], data[1], data[0] -+ -+ -+def parse_hbmc_report(data: str): -+ logging.debug(f"bmc receive raw data is {data}") -+ repair_type = convert_hex_char_to_int(data, HBMC_REPAIR_TYPE_BIT) -+ repair_type += HBMC_REPAIR_TYPE_OFFSET -+ if not find_value_is_in_enum(repair_type, HBMCRepairType): -+ logging.warning(f"HBMC msg repair type ({repair_type}) is unknown") -+ raise ValueError -+ -+ repair_result = convert_hex_char_to_int(data, HBMC_REPAIR_RESULT_BIT) -+ if not find_value_is_in_enum(repair_result, HBMCRepairResultType): -+ logging.warning(f"HBMC msg repair result ({repair_result}) is unknown") -+ raise ValueError -+ -+ isolation_type = convert_hex_char_to_int(data, HBMC_ISOLATION_TYPE_BIT) -+ if not find_value_is_in_enum(isolation_type, HBMCIsolationType): -+ logging.warning(f"HBMC msg isolation type ({isolation_type}) is unknown") -+ raise ValueError -+ -+ cmd_list = [ -+ "ipmitool", -+ "raw", -+ "0x30", # Netfn -+ "0x92", # cmd -+ "0xdb", -+ "0x07", -+ "0x00", -+ "0x65", # sub command -+ "0x01", # SystemId -+ "0x00", # LocalSystemId -+ "{:#04X}".format(repair_type), -+ "{:#04X}".format(repair_result), -+ "{:#04X}".format(isolation_type), -+ ] -+ # send the remain data directly -+ data = data[(HBMC_ISOLATION_TYPE_BIT + 1) * HEX_CHAR_LEN:] -+ other_info_str = [] -+ for i in range(len(data) // 2): -+ other_info_str.append("{:#04X}".format(convert_hex_char_to_int(data, i))) -+ cmd_list.extend(other_info_str) -+ -+ cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_ROW_BIT:HBMC_SEND_ROW_BIT + 4]) -+ cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4] = reverse_byte(cmd_list[HBMC_SEND_COL_BIT:HBMC_SEND_COL_BIT + 4]) -+ -+ logging.info(f"Send bmc alarm command is {cmd_list}") -+ -+ ret = execute_command(cmd_list) -+ if HBMC_SEND_SUCCESS_CODE not in ret: -+ logging.warning(f"Send bmc alarm failed, error code is {ret}") -+ raise ValueError -+ logging.debug("Send bmc alarm success") -+ -+ -+PARSE_REPORT_MSG_FUNC_DICT = { -+ ReportType.HBMC_REPAIR_BMC.value: parse_hbmc_report, -+} -+ -+ -+def bmc_recv(server_socket: socket.socket): -+ logging.debug("Get hbm socket connection request") -+ try: -+ client_socket, _ = server_socket.accept() -+ logging.debug("cpu alarm fd listen ok") -+ -+ data = client_socket.recv(SOCKET_RECEIVE_LEN) -+ data = data.decode() -+ -+ data_head = data[0:len(BMC_DATA_HEAD)] -+ if data_head != BMC_DATA_HEAD: -+ logging.warning(f"The head of the msg is incorrect, head is {data_head}") -+ raise ValueError -+ -+ # remove the data head -+ data = data[len(BMC_DATA_HEAD):] -+ logging.info(f"Remove head data is {data}") -+ -+ report_type = convert_hex_char_to_int(data, BMC_REPORT_TYPE_BIT) -+ if report_type not in PARSE_REPORT_MSG_FUNC_DICT.keys(): -+ logging.warning(f"The type of the msg ({report_type}) is unknown") -+ raise ValueError -+ -+ PARSE_REPORT_MSG_FUNC_DICT[report_type](data) -+ -+ except socket.error: -+ logging.error("socket error") -+ return -+ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): -+ logging.error("server recv bmc msg failed!") -+ client_socket.close() -+ return -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index ea09095..3829849 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -48,6 +48,12 @@ try: - except ImportError: - CPU_EXIST = False - -+BMC_EXIST = True -+try: -+ from .bmc_alarm import bmc_recv -+except ImportError: -+ BMC_EXIST = False -+ - - INSPECTOR = None - -@@ -89,6 +95,9 @@ RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" - - CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" - -+BMC_SOCKET_PATH = "/var/run/sysSentry/bmc.sock" -+ -+fd_list = [] - - def msg_data_process(msg_data): - """message data process""" -@@ -334,6 +343,41 @@ def cpu_alarm_fd_create(): - - return cpu_alarm_fd - -+def bmc_fd_create(): -+ """create bmc fd""" -+ if not os.path.exists(SENTRY_RUN_DIR): -+ logging.debug("%s not exist", SENTRY_RUN_DIR) -+ return None -+ -+ try: -+ bmc_fd = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) -+ except socket.error: -+ logging.error("bmc fd create failed") -+ return None -+ -+ bmc_fd.setblocking(False) -+ if os.path.exists(BMC_SOCKET_PATH): -+ os.remove(BMC_SOCKET_PATH) -+ -+ try: -+ bmc_fd.bind(BMC_SOCKET_PATH) -+ except OSError: -+ logging.error("bmc fd bind failed") -+ bmc_fd.close() -+ return None -+ -+ os.chmod(BMC_SOCKET_PATH, 0o600) -+ try: -+ bmc_fd.listen(5) -+ except OSError: -+ logging.error("bmc fd listen failed") -+ bmc_fd.close() -+ return None -+ -+ logging.debug("%s bind and listen", BMC_SOCKET_PATH) -+ -+ return bmc_fd -+ - - def server_result_recv(server_socket: socket.socket): - """server result receive""" -@@ -407,35 +451,47 @@ def server_result_fd_create(): - return server_result_fd - - -+def close_all_fd(): -+ for fd in fd_list: -+ fd.close() -+ -+ - def main_loop(): - """main loop""" -+ - server_fd = server_fd_create() - if not server_fd: -+ close_all_fd() - return -+ fd_list.append(server_fd) - - server_result_fd = server_result_fd_create() - if not server_result_fd: -- server_fd.close() -+ close_all_fd() - return -+ fd_list.append(server_result_fd) - - heartbeat_fd = heartbeat_fd_create() - if not heartbeat_fd: -- server_fd.close() -- server_result_fd.close() -+ close_all_fd() - return -+ fd_list.append(heartbeat_fd) - - cpu_alarm_fd = cpu_alarm_fd_create() - if not cpu_alarm_fd: -- server_fd.close() -- heartbeat_fd.close() -- server_result_fd.close() -+ close_all_fd() -+ return -+ fd_list.append(cpu_alarm_fd) -+ -+ bmc_fd = bmc_fd_create() -+ if not bmc_fd: -+ close_all_fd() - return -+ fd_list.append(bmc_fd) - - epoll_fd = select.epoll() -- epoll_fd.register(server_fd.fileno(), select.EPOLLIN) -- epoll_fd.register(server_result_fd.fileno(), select.EPOLLIN) -- epoll_fd.register(heartbeat_fd.fileno(), select.EPOLLIN) -- epoll_fd.register(cpu_alarm_fd.fileno(), select.EPOLLIN) -+ for fd in fd_list: -+ epoll_fd.register(fd.fileno(), select.EPOLLIN) - - logging.debug("start main loop") - # onstart_tasks_handle() -@@ -458,6 +514,8 @@ def main_loop(): - heartbeat_recv(heartbeat_fd) - elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): - cpu_alarm_recv(cpu_alarm_fd) -+ elif BMC_EXIST and event_fd == bmc_fd.fileno(): -+ bmc_recv(bmc_fd) - else: - continue - --- -2.27.0 - diff --git a/add-log-for-improving-maintainability.patch b/add-log-for-improving-maintainability.patch deleted file mode 100644 index a9a45273d0a95d8b5be940fbd070387f4a96f1fe..0000000000000000000000000000000000000000 --- a/add-log-for-improving-maintainability.patch +++ /dev/null @@ -1,251 +0,0 @@ -From a8418093bb37482da7ccaac0c950f2ed8d0ba2fa Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Thu, 10 Oct 2024 15:07:29 +0800 -Subject: [PATCH] add log for improving maintainability - ---- - .../avg_block_io/avg_block_io.py | 4 +- - .../sentryPlugins/avg_block_io/module_conn.py | 57 ++++++++++------- - .../avg_block_io/stage_window.py | 8 +++ - .../sentryPlugins/avg_block_io/utils.py | 63 +++++++++++++++++-- - 4 files changed, 103 insertions(+), 29 deletions(-) - -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index 26a60c5..cf2ded3 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -194,11 +194,11 @@ def init_io_win(io_dic, config, common_param): - - if avg_lim_value and avg_time_value and tot_lim_value: - io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) -- logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) -+ logging.debug("Successfully create {}-{}-{}-latency window".format(disk_name, stage_name, rw)) - - if iodump_lim_value is not None: - io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) -- logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) -+ logging.debug("Successfully create {}-{}-{}-iodump window".format(disk_name, stage_name, rw)) - return io_data, io_avg_value - - -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -index 2fc5a83..40b3fcc 100644 ---- a/src/python/sentryPlugins/avg_block_io/module_conn.py -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -13,7 +13,7 @@ import logging - import sys - import time - --from .utils import is_abnormal -+from .utils import is_abnormal, get_win_data, log_slow_win - from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages - from syssentry.result import ResultLevel, report_result - from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR -@@ -66,36 +66,51 @@ def report_alarm_fail(alarm_info): - - def process_report_data(disk_name, rw, io_data): - """check abnormal window and report to xalarm""" -- if not is_abnormal((disk_name, 'bio', rw), io_data): -+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) -+ if not abnormal: - return - -- msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw} -+ msg = { -+ "alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw, -+ "reason": "unknown", "block_stack": "bio", "alarm_type": abnormal_list, -+ "details": get_win_data(disk_name, rw, io_data) -+ } - -+ # io press - ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] - for stage_name in ctrl_stage: -- if is_abnormal((disk_name, stage_name, rw), io_data): -- msg["reason"] = "IO press slow" -- msg["block_stack"] = f"bio,{stage_name}" -- logging.warning("{} - {} report IO press slow".format(disk_name, rw)) -- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -- return -- -- if is_abnormal((disk_name, 'rq_driver', rw), io_data): -+ abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) -+ if not abnormal: -+ continue -+ msg["reason"] = "IO press" -+ msg["block_stack"] = f"bio,{stage_name}" -+ msg["alarm_type"] = abnormal_list -+ log_slow_win(msg, "IO press") -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -+ return -+ -+ # driver slow -+ abnormal, abnormal_list = is_abnormal((disk_name, 'rq_driver', rw), io_data) -+ if abnormal: - msg["reason"] = "driver slow" - msg["block_stack"] = "bio,rq_driver" -- logging.warning("{} - {} report driver slow".format(disk_name, rw)) -+ msg["alarm_type"] = abnormal_list -+ log_slow_win(msg, "driver slow") - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return - -+ # kernel slow - kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] - for stage_name in kernel_stage: -- if is_abnormal((disk_name, stage_name, rw), io_data): -- msg["reason"] = "kernel slow" -- msg["block_stack"] = f"bio,{stage_name}" -- logging.warning("{} - {} report kernel slow".format(disk_name, rw)) -- xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -- return -- msg["reason"] = "unknown" -- msg["block_stack"] = "bio" -- logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw)) -+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data) -+ if not abnormal: -+ continue -+ msg["reason"] = "kernel slow" -+ msg["block_stack"] = f"bio,{stage_name}" -+ msg["alarm_type"] = abnormal_list -+ log_slow_win(msg, "kernel slow") -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -+ return -+ -+ log_slow_win(msg, "unknown") - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py -index 9b0ce79..5113782 100644 ---- a/src/python/sentryPlugins/avg_block_io/stage_window.py -+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py -@@ -14,6 +14,11 @@ class AbnormalWindowBase: - self.window_size = window_size - self.window_threshold = window_threshold - self.abnormal_window = [False] * window_size -+ self.window_data = [-1] * window_size -+ -+ def append_new_data(self, ab_res): -+ self.window_data.pop(0) -+ self.window_data.append(ab_res) - - def append_new_period(self, ab_res, avg_val=0): - self.abnormal_window.pop(0) -@@ -25,6 +30,9 @@ class AbnormalWindowBase: - def is_abnormal_window(self): - return sum(self.abnormal_window) > self.window_threshold - -+ def window_data_to_string(self): -+ return ",".join(str(x) for x in self.window_data) -+ - - class IoWindow(AbnormalWindowBase): - def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): -diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py -index 2de9a46..3b7f027 100644 ---- a/src/python/sentryPlugins/avg_block_io/utils.py -+++ b/src/python/sentryPlugins/avg_block_io/utils.py -@@ -65,15 +65,32 @@ def set_nested_value(data, keys, value): - return True - - -+def get_win_data(disk_name, rw, io_data): -+ """get latency and iodump win data""" -+ latency = '' -+ iodump = '' -+ for stage_name in io_data[disk_name]: -+ if 'latency' in io_data[disk_name][stage_name][rw]: -+ latency_list = io_data[disk_name][stage_name][rw]['latency'].window_data_to_string() -+ latency += f'{stage_name}: [{latency_list}], ' -+ if 'iodump' in io_data[disk_name][stage_name][rw]: -+ iodump_list = io_data[disk_name][stage_name][rw]['iodump'].window_data_to_string() -+ iodump += f'{stage_name}: [{iodump_list}], ' -+ return {"latency": latency[:-2], "iodump": iodump[:-2]} -+ -+ - def is_abnormal(io_key, io_data): - """check if latency and iodump win abnormal""" -+ abnormal_list = '' - for key in ['latency', 'iodump']: - all_keys = get_nested_value(io_data, io_key) - if all_keys and key in all_keys: - win = get_nested_value(io_data, io_key + (key,)) - if win and win.is_abnormal_window(): -- return True -- return False -+ abnormal_list += key + ', ' -+ if not abnormal_list: -+ return False, abnormal_list -+ return True, abnormal_list[:-2] - - - def update_io_avg(old_avg, period_value, win_size): -@@ -87,8 +104,8 @@ def update_io_avg(old_avg, period_value, win_size): - return [new_avg_value, new_avg_count] - - --def update_io_data(old_avg, period_value, win_size, io_data, io_key): -- """update data of latency and iodump window""" -+def update_io_period(old_avg, period_value, io_data, io_key): -+ """update period of latency and iodump window""" - all_wins = get_nested_value(io_data, io_key) - if all_wins and "latency" in all_wins: - io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) -@@ -96,20 +113,54 @@ def update_io_data(old_avg, period_value, win_size, io_data, io_key): - io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) - - -+def update_io_data(period_value, io_data, io_key): -+ """update data of latency and iodump window""" -+ all_wins = get_nested_value(io_data, io_key) -+ if all_wins and "latency" in all_wins: -+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_data(period_value[0]) -+ if all_wins and "iodump" in all_wins: -+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_data(period_value[1]) -+ -+ -+def log_abnormal_period(old_avg, period_value, io_data, io_key): -+ """record log of abnormal period""" -+ all_wins = get_nested_value(io_data, io_key) -+ if all_wins and "latency" in all_wins: -+ if all_wins["latency"].is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): -+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " -+ f"type: latency, avg: {round(old_avg[AVG_VALUE], 3)}, curr_val: {period_value[0]}") -+ if all_wins and "iodump" in all_wins: -+ if all_wins["iodump"].is_abnormal_period(period_value[1]): -+ logging.info(f"[abnormal_period] disk: {io_key[0]}, stage: {io_key[1]}, iotype: {io_key[2]}, " -+ f"type: iodump, curr_val: {period_value[1]}") -+ -+ -+def log_slow_win(msg, reason): -+ """record log of slow win""" -+ logging.warning(f"[SLOW IO] disk: {msg['driver_name']}, stage: {msg['block_stack']}, " -+ f"iotype: {msg['io_type']}, type: {msg['alarm_type']}, reason: {reason}") -+ logging.info(f"latency: {msg['details']['latency']}") -+ logging.info(f"iodump: {msg['details']['iodump']}") -+ -+ - def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): - """update avg and check abonrmal, return true if win_size full""" - period_value = get_nested_value(data, io_key) - old_avg = get_nested_value(io_avg_value, io_key) - - # 更新avg数据 -+ update_io_data(period_value, io_data, io_key) - if old_avg[AVG_COUNT] < win_size: - set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) - return False - -+ # 打印异常周期数据 -+ log_abnormal_period(old_avg, period_value, io_data, io_key) -+ - # 更新win数据 -- 判断异常周期 -- update_io_data(old_avg, period_value, win_size, io_data, io_key) -+ update_io_period(old_avg, period_value, io_data, io_key) - all_wins = get_nested_value(io_data, io_key) -- if all_wins and 'latency' not in all_wins: -+ if not all_wins or 'latency' not in all_wins: - return True - period = get_nested_value(io_data, io_key + ("latency",)) - if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): --- -2.27.0 - diff --git a/add-log-for-xalarm-when-sending-msg-and-clean-invali.patch b/add-log-for-xalarm-when-sending-msg-and-clean-invali.patch deleted file mode 100644 index b8a762fdfb2ce435daafeed0dbc59c5b9ccb9dee..0000000000000000000000000000000000000000 --- a/add-log-for-xalarm-when-sending-msg-and-clean-invali.patch +++ /dev/null @@ -1,24 +0,0 @@ -From ef3aad0ca57d35b0a4fe29a0205596021bae0227 Mon Sep 17 00:00:00 2001 -From: caixiaomeng -Date: Fri, 11 Oct 2024 17:59:54 +0800 -Subject: [PATCH] add log for xalarm when sending msg and clean invalid client - socket - ---- - src/python/xalarm/xalarm_transfer.py | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py -index 42137d8..9e867cc 100644 ---- a/src/python/xalarm/xalarm_transfer.py -+++ b/src/python/xalarm/xalarm_transfer.py -@@ -117,4 +117,5 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data): - epoll.unregister(fileno) - fd_to_socket[fileno].close() - del fd_to_socket[fileno] -+ logging.info(f"cleaned up connection {fileno} for client lost connection.") - --- -2.27.0 - - diff --git a/add-log-level-and-change-log-format.patch b/add-log-level-and-change-log-format.patch deleted file mode 100644 index 219c86cd4ad680a8bc945149e376c7e1f0dd2bbe..0000000000000000000000000000000000000000 --- a/add-log-level-and-change-log-format.patch +++ /dev/null @@ -1,522 +0,0 @@ -From c1ab550a3f817826ac6f279de97e6d3820901275 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Fri, 27 Sep 2024 14:10:18 +0800 -Subject: [PATCH] add log level and change log format - ---- - config/collector.conf | 5 ++- - config/inspect.conf | 5 ++- - config/plugins/avg_block_io.ini | 5 ++- - config/xalarm.conf | 3 ++ - src/python/sentryCollector/collect_config.py | 29 ++++++++++++++++ - src/python/sentryCollector/collect_io.py | 15 ++------- - src/python/sentryCollector/collect_plugin.py | 32 +++++++++--------- - src/python/sentryCollector/collectd.py | 6 ++-- - .../avg_block_io/avg_block_io.py | 7 ++-- - .../sentryPlugins/avg_block_io/utils.py | 32 ++++++++++++++++++ - src/python/syssentry/sentry_config.py | 28 ++++++++++++++++ - src/python/syssentry/syssentry.py | 7 ++-- - src/python/xalarm/xalarm_config.py | 33 +++++++++++++++++-- - src/python/xalarm/xalarm_daemon.py | 7 ++-- - 14 files changed, 172 insertions(+), 42 deletions(-) - -diff --git a/config/collector.conf b/config/collector.conf -index 9baa086..56b0ed1 100644 ---- a/config/collector.conf -+++ b/config/collector.conf -@@ -4,4 +4,7 @@ modules=io - [io] - period_time=1 - max_save=10 --disk=default -\ No newline at end of file -+disk=default -+ -+[log] -+level=info -\ No newline at end of file -diff --git a/config/inspect.conf b/config/inspect.conf -index 071cca1..f451d9e 100644 ---- a/config/inspect.conf -+++ b/config/inspect.conf -@@ -1,2 +1,5 @@ - [inspect] --Interval=3 -\ No newline at end of file -+Interval=3 -+ -+[log] -+level=info -diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini -index bc33dde..858db18 100644 ---- a/config/plugins/avg_block_io.ini -+++ b/config/plugins/avg_block_io.ini -@@ -1,8 +1,11 @@ -+[log] -+level=info -+ - [common] - disk=default - stage=default - iotype=read,write --period_time=1 -+period_time=1 - - [algorithm] - win_size=30 -diff --git a/config/xalarm.conf b/config/xalarm.conf -index 14c6d39..323d2dd 100644 ---- a/config/xalarm.conf -+++ b/config/xalarm.conf -@@ -1,2 +1,5 @@ - [filter] - id_mask = 1001-1128 -+ -+[log] -+level=info -diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py -index 0fdd9f0..5aa38ec 100644 ---- a/src/python/sentryCollector/collect_config.py -+++ b/src/python/sentryCollector/collect_config.py -@@ -32,6 +32,35 @@ CONF_IO_PERIOD_TIME_DEFAULT = 1 - CONF_IO_MAX_SAVE_DEFAULT = 10 - CONF_IO_DISK_DEFAULT = "default" - -+# log -+CONF_LOG = 'log' -+CONF_LOG_LEVEL = 'level' -+LogLevel = { -+ "debug": logging.DEBUG, -+ "info": logging.INFO, -+ "warning": logging.WARNING, -+ "error": logging.ERROR, -+ "critical": logging.CRITICAL -+} -+ -+ -+def get_log_level(filename=COLLECT_CONF_PATH): -+ if not os.path.exists(filename): -+ return logging.INFO -+ -+ try: -+ config = configparser.ConfigParser() -+ config.read(filename) -+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): -+ return logging.INFO -+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) -+ if log_level.lower() in LogLevel: -+ return LogLevel.get(log_level.lower()) -+ return logging.INFO -+ except configparser.Error: -+ return logging.INFO -+ -+ - class CollectConfig: - def __init__(self, filename=COLLECT_CONF_PATH): - -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index 9c8dae7..019d174 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -163,18 +163,6 @@ class CollectIo(): - logging.error("An error occurred2: %s", e) - return column_names - -- def task_loop(self): -- if self.stop_event.is_set(): -- logging.info("collect io thread exit") -- return -- -- for disk_name, stage_list in self.disk_map_stage.items(): -- if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: -- continue -- self.append_period_lat(disk_name, stage_list) -- -- threading.Timer(self.period_time, self.task_loop).start() -- - def is_kernel_avaliable(self): - base_path = '/sys/kernel/debug/block' - all_disk = [] -@@ -191,6 +179,9 @@ class CollectIo(): - if file_name == 'stats': - all_disk.append(disk_name) - -+ if self.loop_all: -+ self.disk_list = all_disk -+ - for disk_name in self.disk_list: - if not self.loop_all and disk_name not in all_disk: - logging.warning("the %s disk not exist!", disk_name) -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 1faa5e3..3e2cf4c 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -75,14 +75,14 @@ def client_send_and_recv(request_data, data_str_len, protocol): - try: - client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - except socket.error: -- print("collect_plugin: client create socket error") -+ logging.error("collect_plugin: client create socket error") - return None - - try: - client_socket.connect(COLLECT_SOCKET_PATH) - except OSError: - client_socket.close() -- print("collect_plugin: client connect error") -+ logging.error("collect_plugin: client connect error") - return None - - req_data_len = len(request_data) -@@ -94,23 +94,23 @@ def client_send_and_recv(request_data, data_str_len, protocol): - res_data = res_data.decode() - except (OSError, UnicodeError): - client_socket.close() -- print("collect_plugin: client communicate error") -+ logging.error("collect_plugin: client communicate error") - return None - - res_magic = res_data[:CLT_MSG_MAGIC_LEN] - if res_magic != "RES": -- print("res msg format error") -+ logging.error("res msg format error") - return None - - protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] - try: - protocol_id = int(protocol_str) - except ValueError: -- print("recv msg protocol id is invalid %s", protocol_str) -+ logging.error("recv msg protocol id is invalid %s", protocol_str) - return None - - if protocol_id >= ClientProtocol.PRO_END: -- print("protocol id is invalid") -+ logging.error("protocol id is invalid") - return None - - try: -@@ -119,7 +119,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): - res_msg_data = res_msg_data.decode() - return res_msg_data - except (OSError, ValueError, UnicodeError): -- print("collect_plugin: client recv res msg error") -+ logging.error("collect_plugin: client recv res msg error") - finally: - client_socket.close() - -@@ -128,30 +128,30 @@ def client_send_and_recv(request_data, data_str_len, protocol): - def validate_parameters(param, len_limit, char_limit): - ret = ResultMessage.RESULT_SUCCEED - if not param: -- print("param is invalid") -+ logging.error("param is invalid, param = %s", param) - ret = ResultMessage.RESULT_NOT_PARAM - return [False, ret] - - if not isinstance(param, list): -- print(f"{param} is not list type.") -+ logging.error("%s is not list type.", param) - ret = ResultMessage.RESULT_NOT_PARAM - return [False, ret] - - if len(param) <= 0: -- print(f"{param} length is 0.") -+ logging.error("%s length is 0.", param) - ret = ResultMessage.RESULT_INVALID_LENGTH - return [False, ret] - - pattern = r'^[a-zA-Z0-9_-]+$' - for info in param: - if not re.match(pattern, info): -- print(f"{info} is invalid char") -+ logging.error("%s is invalid char", info) - ret = ResultMessage.RESULT_INVALID_CHAR - return [False, ret] - - # length of len_limit is exceeded, keep len_limit - if len(param) > len_limit: -- print(f"{param} length more than {len_limit}, keep the first {len_limit}") -+ logging.error("%s length more than %d, keep the first %d", param, len_limit, len_limit) - param[:] = param[0:len_limit] - - # only keep elements under the char_limit length -@@ -202,13 +202,13 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): - request_message = json.dumps(req_msg_struct) - result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.IS_IOCOLLECT_VALID) - if not result_message: -- print("collect_plugin: client_send_and_recv failed") -+ logging.error("collect_plugin: client_send_and_recv failed") - return result - - try: - json.loads(result_message) - except json.JSONDecodeError: -- print("is_iocollect_valid: json decode error") -+ logging.error("is_iocollect_valid: json decode error") - result['ret'] = ResultMessage.RESULT_PARSE_FAILED - return result - -@@ -260,12 +260,12 @@ def inter_get_io_data(period, disk_list, stage, iotype): - request_message = json.dumps(req_msg_struct) - result_message = client_send_and_recv(request_message, CLT_MSG_LEN_LEN, ClientProtocol.GET_IO_DATA) - if not result_message: -- print("collect_plugin: client_send_and_recv failed") -+ logging.error("collect_plugin: client_send_and_recv failed") - return result - try: - json.loads(result_message) - except json.JSONDecodeError: -- print("get_io_data: json decode error") -+ logging.error("get_io_data: json decode error") - result['ret'] = ResultMessage.RESULT_PARSE_FAILED - return result - -diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py -index d9d8862..33f4b04 100644 ---- a/src/python/sentryCollector/collectd.py -+++ b/src/python/sentryCollector/collectd.py -@@ -26,7 +26,7 @@ import threading - - from .collect_io import CollectIo - from .collect_server import CollectServer --from .collect_config import CollectConfig -+from .collect_config import CollectConfig, get_log_level - - SENTRY_RUN_DIR = "/var/run/sysSentry" - COLLECT_SOCKET_PATH = "/var/run/sysSentry/collector.sock" -@@ -57,7 +57,9 @@ def main(): - os.mkdir(SENTRY_RUN_DIR) - os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - -- logging.basicConfig(filename=COLLECT_LOG_FILE, level=logging.INFO) -+ log_level = get_log_level() -+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -+ logging.basicConfig(filename=COLLECT_LOG_FILE, level=log_level, format=log_format) - os.chmod(COLLECT_LOG_FILE, 0o600) - - try: -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index ac35be2..b6b3b28 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -15,7 +15,7 @@ import time - - from .stage_window import IoWindow, IoDumpWindow - from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler --from .utils import update_avg_and_check_abnormal -+from .utils import update_avg_and_check_abnormal, get_log_level - - CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" - -@@ -283,7 +283,10 @@ def main(): - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) - -- logging.basicConfig(level=logging.INFO) -+ log_level = get_log_level(CONFIG_FILE) -+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -+ -+ logging.basicConfig(level=log_level, format=log_format) - - # 初始化配置读取 - config = configparser.ConfigParser(comment_prefixes=('#', ';')) -diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py -index 54ed080..2de9a46 100644 ---- a/src/python/sentryPlugins/avg_block_io/utils.py -+++ b/src/python/sentryPlugins/avg_block_io/utils.py -@@ -8,9 +8,41 @@ - # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR - # PURPOSE. - # See the Mulan PSL v2 for more details. -+import configparser -+import logging -+import os -+ - AVG_VALUE = 0 - AVG_COUNT = 1 - -+CONF_LOG = 'log' -+CONF_LOG_LEVEL = 'level' -+LogLevel = { -+ "debug": logging.DEBUG, -+ "info": logging.INFO, -+ "warning": logging.WARNING, -+ "error": logging.ERROR, -+ "critical": logging.CRITICAL -+} -+ -+ -+def get_log_level(filename): -+ if not os.path.exists(filename): -+ return logging.INFO -+ -+ try: -+ config = configparser.ConfigParser() -+ config.read(filename) -+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): -+ return logging.INFO -+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) -+ -+ if log_level.lower() in LogLevel: -+ return LogLevel.get(log_level.lower()) -+ return logging.INFO -+ except configparser.Error: -+ return logging.INFO -+ - - def get_nested_value(data, keys): - """get data from nested dict""" -diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py -index a0e7b79..1169887 100644 ---- a/src/python/syssentry/sentry_config.py -+++ b/src/python/syssentry/sentry_config.py -@@ -21,6 +21,34 @@ import sys - DEFAULT_INSPECT_DELAY = 3 - INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf" - -+CONF_LOG = 'log' -+CONF_LOG_LEVEL = 'level' -+LogLevel = { -+ "debug": logging.DEBUG, -+ "info": logging.INFO, -+ "warning": logging.WARNING, -+ "error": logging.ERROR, -+ "critical": logging.CRITICAL -+} -+ -+ -+def get_log_level(filename=INSPECT_CONF_PATH): -+ if not os.path.exists(filename): -+ return logging.INFO -+ -+ try: -+ config = configparser.ConfigParser() -+ config.read(filename) -+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): -+ return logging.INFO -+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) -+ -+ if log_level.lower() in LogLevel: -+ return LogLevel.get(log_level.lower()) -+ return logging.INFO -+ except configparser.Error: -+ return logging.INFO -+ - - class SentryConfig: - """ -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index 776971f..9ef0203 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -23,7 +23,7 @@ import fcntl - - import select - --from .sentry_config import SentryConfig -+from .sentry_config import SentryConfig, get_log_level - - from .task_map import TasksMap - from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM -@@ -563,7 +563,10 @@ def main(): - os.mkdir(SENTRY_RUN_DIR) - os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) - -- logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) -+ log_level = get_log_level() -+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -+ -+ logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=log_level, format=log_format) - os.chmod(SYSSENTRY_LOG_FILE, 0o600) - - if not chk_and_set_pidfile(): -diff --git a/src/python/xalarm/xalarm_config.py b/src/python/xalarm/xalarm_config.py -index 8e56d10..754a816 100644 ---- a/src/python/xalarm/xalarm_config.py -+++ b/src/python/xalarm/xalarm_config.py -@@ -15,9 +15,10 @@ Create: 2023-11-02 - """ - - import re -+import os - import dataclasses - import logging --from configparser import ConfigParser -+import configparser - - - MAIN_CONFIG_PATH = '/etc/sysSentry/xalarm.conf' -@@ -27,6 +28,34 @@ MIN_ID_NUMBER = 1001 - MAX_ID_NUMBER = 1128 - MAX_ID_MASK_CAPACITY = 128 - -+# log -+CONF_LOG = 'log' -+CONF_LOG_LEVEL = 'level' -+LogLevel = { -+ "debug": logging.DEBUG, -+ "info": logging.INFO, -+ "warning": logging.WARNING, -+ "error": logging.ERROR, -+ "critical": logging.CRITICAL -+} -+ -+ -+def get_log_level(filename=MAIN_CONFIG_PATH): -+ if not os.path.exists(filename): -+ return logging.INFO -+ -+ try: -+ config = configparser.ConfigParser() -+ config.read(filename) -+ if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): -+ return logging.INFO -+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) -+ if log_level.lower() in LogLevel: -+ return LogLevel.get(log_level.lower()) -+ return logging.INFO -+ except configparser.Error: -+ return logging.INFO -+ - - @dataclasses.dataclass - class AlarmConfig: -@@ -106,7 +135,7 @@ def config_init(): - """ - alarm_config = AlarmConfig() - -- cfg = ConfigParser() -+ cfg = configparser.ConfigParser() - cfg.read(MAIN_CONFIG_PATH) - - id_mask = parse_id_mask(cfg) -diff --git a/src/python/xalarm/xalarm_daemon.py b/src/python/xalarm/xalarm_daemon.py -index 00e8886..3ab211c 100644 ---- a/src/python/xalarm/xalarm_daemon.py -+++ b/src/python/xalarm/xalarm_daemon.py -@@ -21,7 +21,7 @@ import signal - import fcntl - import socket - --from .xalarm_config import config_init -+from .xalarm_config import config_init, get_log_level - from .xalarm_server import server_loop, SOCK_FILE - - ALARM_DIR = "/var/run/xalarm" -@@ -120,9 +120,10 @@ def alarm_process_create(): - os.mkdir(ALARM_DIR) - os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION) - -+ log_level = get_log_level() -+ log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - -- logging.basicConfig(filename=ALARM_LOGFILE, level=logging.INFO, -- format='%(asctime)s|%(levelname)s| %(message)s') -+ logging.basicConfig(filename=ALARM_LOGFILE, level=log_level, format=log_format) - - signal.signal(signal.SIGTERM, signal_handler) - --- -2.23.0 - diff --git a/add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch b/add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch deleted file mode 100644 index ee9e2346c8dcec74350fe36d53392b458e818423..0000000000000000000000000000000000000000 --- a/add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 0a4bd4097690bee7250676a0c262a830c7a8fbcf Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Fri, 11 Oct 2024 15:35:43 +0800 -Subject: [PATCH] add parameter time_range ,alarm_id and alarm_clear_time - validation - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 19 +++++++++++++++++++ - .../src/python/syssentry/load_mods.py | 6 ++---- - .../src/python/syssentry/sentryctl | 4 +++- - 3 files changed, 24 insertions(+), 5 deletions(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index d5337d3..43c1065 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -18,6 +18,7 @@ from datetime import datetime - import time - import logging - import json -+import sys - - from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc - from xalarm.xalarm_api import Xalarm -@@ -41,9 +42,15 @@ id_base = 1001 - clientId = -1 - - MILLISECONDS_UNIT_SECONDS = 1000 -+MAX_NUM_OF_ALARM_ID = 128 -+MIN_ALARM_ID = 1001 -+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) - - def update_alarm_list(alarm_info: Xalarm): - alarm_id = xalarm_getid(alarm_info) -+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -+ logging.warnning(f"Invalid alarm_id {alarm_id}") -+ return - timestamp = xalarm_gettime(alarm_info) - if not timestamp: - logging.error("Retrieve timestamp failed") -@@ -77,7 +84,19 @@ def alarm_register(): - logging.info(f"alarm_register: {task_name} is registered") - task = TasksMap.tasks_dict[task_type][task_name] - alarm_id = task.alarm_id -+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ continue - alarm_clear_time = task.alarm_clear_time -+ try: -+ alarm_clear_time = int(alarm_clear_time) -+ if alarm_clear_time <= 0: -+ raise ValueError("Not a positive integer") -+ if alarm_clear_time > sys.maxsize: -+ raise ValueError("Exceeds maximum value for int") -+ except (ValueError, OverflowError, TypeError) as e: -+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -+ continue - alarm_list_dict[alarm_id] = [] - task_alarm_id_dict[task_name] = alarm_id - if alarm_id not in alarm_id_clear_time_dict: -diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py -index ae05e57..7daf17d 100644 ---- a/src/python/syssentry/load_mods.py -+++ b/src/python/syssentry/load_mods.py -@@ -203,11 +203,9 @@ def parse_mod_conf(mod_name, mod_conf): - if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): - raise ValueError("Invalid alarm_id") - except ValueError: -- task.alarm_id = -1 -- logging.warning("Invalid alarm_id, set to -1") -+ logging.warning("Invalid alarm_id") - except configparser.NoOptionError: -- task.alarm_id = -1 -- logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default") -+ logging.warning("Unset alarm_clear_time, use 15s as default") - - if CONF_ONSTART in mod_conf.options(CONF_TASK): - is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') -diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl -index 3de93d0..c2e3cef 100644 ---- a/src/python/syssentry/sentryctl -+++ b/src/python/syssentry/sentryctl -@@ -136,7 +136,7 @@ if __name__ == '__main__': - parser_get_result.add_argument('task_name') - parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm') - parser_get_alarm.add_argument('task_name') -- parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range') -+ parser_get_alarm.add_argument('-s', '--time_range', type=int, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range') - parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information') - parser_list = subparsers.add_parser('list', help='show all loaded task mod') - -@@ -153,6 +153,8 @@ if __name__ == '__main__': - elif client_args.cmd_type == 'get_result': - req_msg_struct = {"type": "get_result", "data": client_args.task_name} - elif client_args.cmd_type == 'get_alarm': -+ if not isinstance(client_args.time_range, int) or client_args.time_range <= 0: -+ print(f"time_range is not a positive integer: {client_args.time_range}") - req_msg_struct = { - "type": "get_alarm", - "data": { --- -2.27.0 - diff --git a/add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch b/add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch deleted file mode 100644 index 5b27e693c19cedfb3e31e5e138397f6d30ceb8c1..0000000000000000000000000000000000000000 --- a/add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch +++ /dev/null @@ -1,678 +0,0 @@ -From a18ea2e94fef78334a56dce1ea3f67ee649732f3 Mon Sep 17 00:00:00 2001 -From: PshySimon -Date: Thu, 26 Sep 2024 16:12:25 +0800 -Subject: [PATCH] add pyxalarm and pySentryNotify, add multi users support for - xalarmd and adapt libxalarm - ---- - src/libso/xalarm/register_xalarm.c | 41 ++---- - src/libso/xalarm/register_xalarm.h | 10 +- - src/python/xalarm/register_xalarm.py | 192 +++++++++++++++++++++++++++ - src/python/xalarm/sentry_notify.py | 71 ++++++++++ - src/python/xalarm/xalarm_api.py | 18 ++- - src/python/xalarm/xalarm_server.py | 40 +++++- - src/python/xalarm/xalarm_transfer.py | 96 ++++++++++++-- - 7 files changed, 408 insertions(+), 60 deletions(-) - create mode 100644 src/python/xalarm/register_xalarm.py - create mode 100644 src/python/xalarm/sentry_notify.py - -diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c -index 152c078..21a419f 100644 ---- a/src/libso/xalarm/register_xalarm.c -+++ b/src/libso/xalarm/register_xalarm.c -@@ -35,7 +35,7 @@ - #define ALARM_SOCKET_PERMISSION 0700 - #define TIME_UNIT_MILLISECONDS 1000 - --#define MAX_PARAS_LEN 511 -+#define MAX_PARAS_LEN 1023 - #define MIN_ALARM_ID 1001 - #define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) - -@@ -91,7 +91,7 @@ static int create_unix_socket(const char *path) - return -1; - } - -- fd = socket(AF_UNIX, SOCK_DGRAM, 0); -+ fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (fd < 0) { - printf("socket failed:%s\n", strerror(errno)); - return -1; -@@ -103,14 +103,6 @@ static int create_unix_socket(const char *path) - goto release_socket; - } - -- if (access(PATH_REG_ALARM, F_OK) == 0) { -- ret = unlink(PATH_REG_ALARM); -- if (ret != 0) { -- printf("unlink register socket file failed\n"); -- goto release_socket; -- } -- } -- - if (access(DIR_XALARM, F_OK) == -1) { - if (mkdir(DIR_XALARM, ALARM_DIR_PERMISSION) == -1) { - printf("mkdir %s failed\n", DIR_XALARM); -@@ -120,32 +112,22 @@ static int create_unix_socket(const char *path) - - if (memset(&alarm_addr, 0, sizeof(alarm_addr)) == NULL) { - printf("create_unix_socket: memset alarm_addr failed, ret: %d\n", ret); -- goto remove_dir; -+ goto release_socket; - } - alarm_addr.sun_family = AF_UNIX; - strncpy(alarm_addr.sun_path, path, sizeof(alarm_addr.sun_path) - 1); - -- if (bind(fd, (struct sockaddr *)&alarm_addr, sizeof(alarm_addr.sun_family) + strlen(alarm_addr.sun_path)) < 0) { -- printf("bind socket failed:%s\n", strerror(errno)); -- goto remove_dir; -+ if (connect(fd, (struct sockaddr*)&alarm_addr, sizeof(alarm_addr)) == -1) { -+ printf("create_unix_socket: connect alarm_addr failed, ret: %d\n", ret); -+ goto release_socket; - } - if (chmod(path, ALARM_SOCKET_PERMISSION) < 0) { - printf("chmod %s failed: %s\n", path, strerror(errno)); -- goto unlink_sockfile; -+ goto release_socket; - } - - return fd; - --unlink_sockfile: -- ret = unlink(PATH_REG_ALARM); -- if (ret != 0) { -- printf("unlink register socket file failed\n"); -- } --remove_dir: -- ret = rmdir(DIR_XALARM); -- if (ret != 0) { -- printf("rmdir %s failed: %s\n", path, strerror(errno)); -- } - release_socket: - (void)close(fd); - -@@ -271,8 +253,6 @@ int xalarm_Register(alarm_callback_func callback, struct alarm_subscription_info - - void xalarm_UnRegister(int client_id) - { -- int ret; -- - if (!g_register_info.is_registered) { - printf("%s: alarm has not registered\n", __func__); - return; -@@ -292,10 +272,6 @@ void xalarm_UnRegister(int client_id) - if (g_register_info.register_fd != -1) { - (void)close(g_register_info.register_fd); - g_register_info.register_fd = -1; -- ret = unlink(PATH_REG_ALARM); -- if (ret != 0) { -- printf("%s: unlink register socket file failed\n", __func__); -- } - } - - memset(g_register_info.alarm_enable_bitmap, 0, MAX_NUM_OF_ALARM_ID * sizeof(char)); -@@ -357,7 +333,7 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel, - struct sockaddr_un alarm_addr; - - if ((usAlarmId < MIN_ALARM_ID || usAlarmId > MAX_ALARM_ID) || -- (ucAlarmLevel < ALARM_LEVEL_FATAL || ucAlarmLevel > ALARM_LEVEL_DEBUG) || -+ (ucAlarmLevel < MINOR_ALM || ucAlarmLevel > CRITICAL_ALM) || - (ucAlarmType < ALARM_TYPE_OCCUR || ucAlarmType > ALARM_TYPE_RECOVER)) { - fprintf(stderr, "%s: alarm info invalid\n", __func__); - return -1; -@@ -666,3 +642,4 @@ int report_result(const char *task_name, enum RESULT_LEVEL result_level, const c - return RETURE_CODE_SUCCESS; - } - -+ -diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h -index 1f26c6a..fef9482 100644 ---- a/src/libso/xalarm/register_xalarm.h -+++ b/src/libso/xalarm/register_xalarm.h -@@ -11,7 +11,7 @@ - #include - #include - --#define ALARM_INFO_MAX_PARAS_LEN 512 -+#define ALARM_INFO_MAX_PARAS_LEN 1024 - #define MAX_STRERROR_SIZE 1024 - #define MAX_ALARM_TYEPS 1024 - #define MIN_ALARM_ID 1001 -@@ -19,11 +19,9 @@ - - #define MEMORY_ALARM_ID 1001 - --#define ALARM_LEVEL_FATAL 1 --#define ALARM_LEVEL_ERROR 2 --#define ALARM_LEVEL_WARNING 3 --#define ALARM_LEVEL_INFO 4 --#define ALARM_LEVEL_DEBUG 5 -+#define MINOR_ALM 1 -+#define MAJOR_ALM 2 -+#define CRITICAL_ALM 3 - - #define ALARM_TYPE_OCCUR 1 - #define ALARM_TYPE_RECOVER 2 -diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py -new file mode 100644 -index 0000000..e58343d ---- /dev/null -+++ b/src/python/xalarm/register_xalarm.py -@@ -0,0 +1,192 @@ -+import os -+import sys -+import socket -+import logging -+import threading -+import time -+import fcntl -+import inspect -+from struct import error as StructParseError -+ -+from .xalarm_api import Xalarm, alarm_bin2stu -+ -+ -+ALARM_REPORT_LEN = 1048 -+MAX_NUM_OF_ALARM_ID=128 -+MIN_ALARM_ID = 1001 -+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) -+DIR_XALARM = "/var/run/xalarm" -+PATH_REG_ALARM = "/var/run/xalarm/alarm" -+PATH_REPORT_ALARM = "/var/run/xalarm/report" -+ALARM_DIR_PERMISSION = 0o0750 -+ALARM_REG_SOCK_PERMISSION = 0o0700 -+ALARM_SOCKET_PERMISSION = 0o0700 -+TIME_UNIT_MILLISECONDS = 1000 -+ALARM_REGISTER_INFO = None -+ -+ -+class AlarmRegister: -+ def __init__(self, id_filter: list[bool], callback: callable): -+ self.id_filter = id_filter -+ self.callback = callback -+ self.socket = self.create_unix_socket() -+ self.is_registered = True -+ self.thread = threading.Thread(target=self.alarm_recv) -+ self.thread_should_stop = False -+ -+ def check_params(self) -> bool: -+ if (len(self.id_filter) != MAX_NUM_OF_ALARM_ID): -+ sys.stderr.write("check_params: invalid param id_filter\n") -+ return False -+ -+ sig = inspect.signature(self.callback) -+ if len(sig.parameters) != 1: -+ sys.stderr.write("check_params: invalid param callback\n") -+ return False -+ -+ if self.socket is None: -+ sys.stderr.write("check_params: scoket create failed\n") -+ return False -+ return True -+ -+ def set_id_filter(self, id_filter: list[bool]) -> bool: -+ if (len(id_filter) > MAX_NUM_OF_ALARM_ID): -+ sys.stderr.write("set_id_filter: invalid param id_filter\n") -+ return False -+ self.id_filter = id_filter -+ -+ def id_is_registered(self, alarm_id) -> bool: -+ if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -+ return False -+ return self.id_filter[alarm_id - MIN_ALARM_ID] -+ -+ def put_alarm_info(self, alarm_info: Xalarm) -> None: -+ if not self.callback or not alarm_info: -+ return -+ if not self.id_is_registered(alarm_info.alarm_id): -+ return -+ self.callback(alarm_info) -+ -+ def create_unix_socket(self) -> socket.socket: -+ try: -+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) -+ sock.setblocking(False) -+ -+ if not os.access(DIR_XALARM, os.F_OK): -+ os.makedirs(DIR_XALARM) -+ os.chmod(DIR_XALARM, ALARM_DIR_PERMISSION) -+ -+ sock.connect(PATH_REG_ALARM) -+ return sock -+ except (IOError, OSError, FileNotFoundError) as e: -+ sock.close() -+ sys.stderr.write(f"create_unix_socket: create socket error:{e}\n") -+ return None -+ -+ def alarm_recv(self): -+ while not self.thread_should_stop: -+ try: -+ data = self.socket.recv(ALARM_REPORT_LEN) -+ if not data: -+ sys.stderr.write("connection closed by xalarmd, maybe connections reach max num or service stopped.\n") -+ self.thread_should_stop = True -+ break -+ if len(data) != ALARM_REPORT_LEN: -+ sys.stderr.write(f"server receive report msg length wrong {len(data)}\n") -+ continue -+ -+ alarm_info = alarm_bin2stu(data) -+ self.put_alarm_info(alarm_info) -+ except (BlockingIOError) as e: -+ time.sleep(0.1) -+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError): -+ sys.stderr.write("Connection closed by the server.\n") -+ self.thread_should_stop = True -+ except (ValueError, StructParseError, InterruptedError) as e: -+ sys.stderr.write(f"{e}\n") -+ except Exception as e: -+ sys.stderr.write(f"{e}\n") -+ self.thread_should_stop = True -+ -+ def start_thread(self) -> None: -+ self.thread.daemon = True -+ self.thread.start() -+ -+ def stop_thread(self) -> None: -+ self.thread_should_stop = True -+ self.thread.join() -+ self.socket.close() -+ -+ -+def xalarm_register(callback: callable, id_filter: list[bool]) -> int: -+ global ALARM_REGISTER_INFO -+ -+ if ALARM_REGISTER_INFO is not None: -+ sys.stderr.write("xalarm_register: alarm has registered\n") -+ return -1 -+ -+ ALARM_REGISTER_INFO = AlarmRegister(id_filter, callback) -+ if not ALARM_REGISTER_INFO.check_params(): -+ return -1 -+ -+ ALARM_REGISTER_INFO.start_thread() -+ -+ return 0 -+ -+ -+def xalarm_unregister(clientId: int) -> None: -+ global ALARM_REGISTER_INFO -+ if clientId < 0: -+ sys.stderr.write("xalarm_unregister: invalid client\n") -+ return -+ -+ if ALARM_REGISTER_INFO is None: -+ sys.stderr.write("xalarm_unregister: alarm has not registered\n") -+ return -+ -+ ALARM_REGISTER_INFO.stop_thread() -+ ALARM_REGISTER_INFO = None -+ -+ -+def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None: -+ global ALARM_REGISTER_INFO -+ if clientId < 0: -+ sys.stderr.write("xalarm_unregister: invalid client\n") -+ return -+ if ALARM_REGISTER_INFO is None: -+ sys.stderr.write("xalarm_unregister: alarm has not registered\n") -+ return -+ ALARM_REGISTER_INFO.id_filter = id_filter -+ -+ -+def xalarm_getid(alarm_info: Xalarm) -> int: -+ if not alarm_info: -+ return 0 -+ return alarm_info.alarm_id -+ -+ -+def xalarm_getlevel(alarm_info: Xalarm) -> int: -+ if not alarm_info: -+ return 0 -+ return alarm_info.alarm_level -+ -+ -+def xalarm_gettype(alarm_info: Xalarm) -> int: -+ if not alarm_info: -+ return 0 -+ return alarm_info.alarm_type -+ -+ -+def xalarm_gettime(alarm_info: Xalarm) -> int: -+ if not alarm_info: -+ return 0 -+ return alarm_info.timetamp.tv_sec * TIME_UNIT_MILLISECONDS + alarm_info.timetamp.tv_usec / TIME_UNIT_MILLISECONDS -+ -+def xalarm_getdesc(alarm_info: Xalarm) -> str: -+ if not alarm_info: -+ return None -+ try: -+ desc_str = alarm_info.msg1.rstrip(b'\x00').decode('utf-8') -+ except UnicodeError: -+ desc_str = None -+ return desc_str -diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py -new file mode 100644 -index 0000000..a19e5b3 ---- /dev/null -+++ b/src/python/xalarm/sentry_notify.py -@@ -0,0 +1,71 @@ -+import os -+import sys -+import time -+import socket -+from struct import error as StructParseError -+ -+from .xalarm_api import alarm_stu2bin, Xalarm -+ -+MAX_NUM_OF_ALARM_ID = 128 -+MIN_ALARM_ID = 1001 -+MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) -+ -+MINOR_ALM = 1 -+MAJOR_ALM = 2 -+CRITICAL_ALM = 3 -+ -+ALARM_TYPE_OCCUR = 1 -+ALARM_TYPE_RECOVER = 2 -+ -+MAX_PUC_PARAS_LEN = 1024 -+ -+DIR_XALARM = "/var/run/xalarm" -+PATH_REPORT_ALARM = "/var/run/xalarm/report" -+ALARM_DIR_PERMISSION = 0o750 -+ALARM_SOCKET_PERMISSION = 0o700 -+ -+ -+def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: -+ if not os.path.exists(DIR_XALARM): -+ sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed") -+ return False -+ -+ if not os.path.exists(PATH_REPORT_ALARM): -+ sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed") -+ return False -+ -+ if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or -+ alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or -+ alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER): -+ sys.stderr.write("check_params: alarm info invalid\n") -+ return False -+ -+ if len(puc_paras) >= MAX_PUC_PARAS_LEN: -+ sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n") -+ return False -+ -+ return True -+ -+def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: -+ if not check_params(alarm_id, alarm_level, alarm_type, puc_paras): -+ return False -+ -+ try: -+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) -+ -+ current_time = time.time() -+ current_time_seconds = int(current_time) -+ current_microseconds = int((current_time - current_time_seconds) * 1_000_000) -+ alarm_info = Xalarm(alarm_id, alarm_type, alarm_level, -+ current_time_seconds, current_microseconds, puc_paras) -+ -+ sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM) -+ except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e: -+ sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n") -+ return False -+ finally: -+ sock.close() -+ -+ return True -+ -+ -diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py -index 94d7638..99eabf5 100644 ---- a/src/python/xalarm/xalarm_api.py -+++ b/src/python/xalarm/xalarm_api.py -@@ -23,6 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5) - ALARM_SOCK_PATH = "/var/run/xalarm/report" - MIN_ALARM_ID = 1001 - MAX_ALARM_ID = 1128 -+MAX_MSG_LEN = 1024 - - - @dataclasses.dataclass -@@ -97,15 +98,15 @@ class Xalarm: - def msg1(self, msg): - """msg1 setter - """ -- if len(msg) > 512: -- raise ValueError("msg1 length must below 255") -+ if len(msg) > MAX_MSG_LEN: -+ raise ValueError(f"msg1 length must below {MAX_MSG_LEN}") - self._msg1 = msg - - - def alarm_bin2stu(bin_data): - """alarm binary to struct - """ -- struct_data = struct.unpack("@HBBll512s", bin_data) -+ struct_data = struct.unpack(f"@HBBll{MAX_MSG_LEN}s", bin_data) - - alarm_info = Xalarm(1001, 2, 1, 0, 0, "") - alarm_info.alarm_id = struct_data[0] -@@ -116,3 +117,14 @@ def alarm_bin2stu(bin_data): - alarm_info.msg1 = struct_data[5] - - return alarm_info -+ -+ -+def alarm_stu2bin(alarm_info: Xalarm): -+ return struct.pack( -+ f'@HBBll{MAX_MSG_LEN}s', -+ alarm_info.alarm_id, -+ alarm_info.alarm_level, -+ alarm_info.alarm_type, -+ alarm_info.timetamp.tv_sec, -+ alarm_info.timetamp.tv_usec, -+ alarm_info.msg1.encode('utf-8')) -diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py -index 84db273..fcaf393 100644 ---- a/src/python/xalarm/xalarm_server.py -+++ b/src/python/xalarm/xalarm_server.py -@@ -17,16 +17,20 @@ Create: 2023-11-02 - import socket - import os - import logging -+import select -+import threading - from struct import error as StructParseError - - from .xalarm_api import alarm_bin2stu --from .xalarm_transfer import check_filter, transmit_alarm -+from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection - - - ALARM_DIR = "/var/run/xalarm" -+USER_RECV_SOCK = "/var/run/xalarm/alarm" - SOCK_FILE = "/var/run/xalarm/report" --ALARM_REPORT_LEN = 536 -+ALARM_REPORT_LEN = 1048 - ALARM_DIR_PERMISSION = 0o750 -+ALARM_LISTEN_QUEUE_LEN = 5 - - - def clear_sock_path(): -@@ -37,6 +41,8 @@ def clear_sock_path(): - os.chmod(ALARM_DIR, ALARM_DIR_PERMISSION) - if os.path.exists(SOCK_FILE): - os.unlink(SOCK_FILE) -+ if os.path.exists(USER_RECV_SOCK): -+ os.unlink(USER_RECV_SOCK) - - - def server_loop(alarm_config): -@@ -49,6 +55,21 @@ def server_loop(alarm_config): - sock.bind(SOCK_FILE) - os.chmod(SOCK_FILE, 0o600) - -+ alarm_sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) -+ alarm_sock.bind(USER_RECV_SOCK) -+ os.chmod(USER_RECV_SOCK, 0o600) -+ alarm_sock.listen(ALARM_LISTEN_QUEUE_LEN) -+ alarm_sock.setblocking(False) -+ -+ epoll = select.epoll() -+ epoll.register(alarm_sock.fileno(), select.EPOLLIN) -+ fd_to_socket = {alarm_sock.fileno(): alarm_sock,} -+ thread_should_stop = False -+ -+ thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop)) -+ thread.daemon = True -+ thread.start() -+ - while True: - try: - data, _ = sock.recvfrom(ALARM_REPORT_LEN) -@@ -58,14 +79,21 @@ def server_loop(alarm_config): - logging.debug("server receive report msg length wrong %d", - len(data)) - continue -- - alarm_info = alarm_bin2stu(data) - logging.debug("server bin2stu msg") - if not check_filter(alarm_info, alarm_config): - continue -+ transmit_alarm(alarm_sock, epoll, fd_to_socket, data) -+ except Exception as e: -+ logging.error(f"Error server:{e}") -+ -+ thread_should_stop = True -+ thread.join() - -- transmit_alarm(data) -- except (ValueError, StructParseError): -- pass -+ epoll.unregister(alarm_sock.fileno()) -+ epoll.close() -+ alarm_sock.close() -+ os.unlink(USER_RECV_SOCK) - - sock.close() -+ -diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py -index b590b43..42137d8 100644 ---- a/src/python/xalarm/xalarm_transfer.py -+++ b/src/python/xalarm/xalarm_transfer.py -@@ -16,10 +16,12 @@ Create: 2023-11-02 - - import socket - import logging -+import select - --USER_RECV_SOCK = "/var/run/xalarm/alarm" - MIN_ID_NUMBER = 1001 - MAX_ID_NUMBER = 1128 -+MAX_CONNECTION_NUM = 100 -+TEST_CONNECT_BUFFER_SIZE = 32 - - - def check_filter(alarm_info, alarm_filter): -@@ -35,16 +37,84 @@ def check_filter(alarm_info, alarm_filter): - return True - - --def transmit_alarm(bin_data): -- """forward alarm message -+def cleanup_closed_connections(server_sock, epoll, fd_to_socket): - """ -- sock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) -- try: -- sock.sendto(bin_data, USER_RECV_SOCK) -- logging.debug("transfer alarm success") -- except ConnectionRefusedError: -- logging.debug("transfer sendto failed") -- except FileNotFoundError: -- logging.debug("transfer sendto failed") -- finally: -- sock.close() -+ clean invalid client socket connections saved in 'fd_to_socket' -+ :param server_sock: server socket instance of alarm -+ :param epoll: epoll instance, used to unregister invalid client connections -+ :param fd_to_socket: dict instance, used to hold client connections and server connections -+ """ -+ to_remove = [] -+ for fileno, connection in fd_to_socket.items(): -+ if connection is server_sock: -+ continue -+ try: -+ # test whether connection still alive, use MSG_DONTWAIT to avoid blocking thread -+ # use MSG_PEEK to avoid consuming buffer data -+ data = connection.recv(TEST_CONNECT_BUFFER_SIZE, socket.MSG_DONTWAIT | socket.MSG_PEEK) -+ if not data: -+ to_remove.append(fileno) -+ except BlockingIOError: -+ pass -+ except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError): -+ to_remove.append(fileno) -+ -+ for fileno in to_remove: -+ epoll.unregister(fileno) -+ fd_to_socket[fileno].close() -+ del fd_to_socket[fileno] -+ logging.info(f"cleaned up connection {fileno} for client lost connection.") -+ -+ -+def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop): -+ """ -+ thread function for catch and save client connection -+ :param server_sock: server socket instance of alarm -+ :param epoll: epoll instance, used to unregister invalid client connections -+ :param fd_to_socket: dict instance, used to hold client connections and server connections -+ :param thread_should_stop: bool instance -+ """ -+ while not thread_should_stop: -+ try: -+ events = epoll.poll(1) -+ -+ for fileno, event in events: -+ if fileno == server_sock.fileno(): -+ connection, client_address = server_sock.accept() -+ # if reach max connection, cleanup closed connections -+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM: -+ cleanup_closed_connections(server_sock, epoll, fd_to_socket) -+ # if connections still reach max num, close this connection automatically -+ if len(fd_to_socket) - 1 >= MAX_CONNECTION_NUM: -+ logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!") -+ connection.close() -+ continue -+ epoll.register(connection.fileno(), select.EPOLLOUT) -+ fd_to_socket[connection.fileno()] = connection -+ except socket.error as e: -+ logging.debug(f"socket error, reason is {e}") -+ break -+ except (KeyError, OSError, ValueError) as e: -+ logging.debug(f"wait for connection failed {e}") -+ -+ -+def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data): -+ """ -+ this function is to broadcast alarm data to client, if fail to send data, remove connections held by fd_to_socket -+ :param server_sock: server socket instance of alarm -+ :param epoll: epoll instance, used to unregister invalid client connections -+ :param fd_to_socket: dict instance, used to hold client connections and server connections -+ :param bin_data: binary instance, alarm info data in C-style struct format defined in xalarm_api.py -+ """ -+ to_remove = [] -+ for fileno, connection in fd_to_socket.items(): -+ if connection is not server_sock: -+ try: -+ connection.sendall(bin_data) -+ except (BrokenPipeError, ConnectionResetError): -+ to_remove.append(fileno) -+ for fileno in to_remove: -+ epoll.unregister(fileno) -+ fd_to_socket[fileno].close() -+ del fd_to_socket[fileno] -+ --- -2.27.0 - diff --git a/add-root-cause-analysis.patch b/add-root-cause-analysis.patch deleted file mode 100644 index 94de7ff7f2dcb939eaff1c3465fef20a1b2eb395..0000000000000000000000000000000000000000 --- a/add-root-cause-analysis.patch +++ /dev/null @@ -1,1253 +0,0 @@ -From 24f8eddad364e83cfc5b6b1607462ffe524b59f1 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Sat, 12 Oct 2024 21:59:18 +0800 -Subject: [PATCH] add root cause analysis - ---- - config/plugins/ai_block_io.ini | 15 +- - .../sentryPlugins/ai_block_io/ai_block_io.py | 133 +++-- - .../ai_block_io/config_parser.py | 465 +++++++++++------- - .../sentryPlugins/ai_block_io/data_access.py | 1 + - .../sentryPlugins/ai_block_io/detector.py | 54 +- - .../sentryPlugins/ai_block_io/io_data.py | 32 +- - .../ai_block_io/sliding_window.py | 57 ++- - src/python/sentryPlugins/ai_block_io/utils.py | 44 +- - 8 files changed, 491 insertions(+), 310 deletions(-) - -diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini -index a814d52..422cfa3 100644 ---- a/config/plugins/ai_block_io.ini -+++ b/config/plugins/ai_block_io.ini -@@ -2,7 +2,6 @@ - level=info - - [common] --absolute_threshold=40 - slow_io_detect_frequency=1 - disk=default - stage=bio -@@ -18,4 +17,16 @@ n_sigma_parameter=3 - [sliding_window] - sliding_window_type=not_continuous - window_size=30 --window_minimum_threshold=6 -\ No newline at end of file -+window_minimum_threshold=6 -+ -+[latency_sata_ssd] -+read_tot_lim=50000 -+write_tot_lim=50000 -+ -+[latency_nvme_ssd] -+read_tot_lim=500 -+write_tot_lim=500 -+ -+[latency_sata_hdd] -+read_tot_lim=50000 -+write_tot_lim=50000 -\ No newline at end of file -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index e1052ec..dd661a1 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -12,13 +12,18 @@ - import time - import signal - import logging -+from collections import defaultdict - - from .detector import Detector, DiskDetector --from .threshold import ThresholdFactory, AbsoluteThreshold -+from .threshold import ThresholdFactory - from .sliding_window import SlidingWindowFactory - from .utils import get_data_queue_size_and_update_size - from .config_parser import ConfigParser --from .data_access import get_io_data_from_collect_plug, check_collect_valid -+from .data_access import ( -+ get_io_data_from_collect_plug, -+ check_collect_valid, -+ get_disk_type, -+) - from .io_data import MetricName - from .alarm_report import Xalarm, Report - -@@ -34,7 +39,7 @@ def sig_handler(signum, frame): - class SlowIODetection: - _config_parser = None - _disk_list = None -- _detector_name_list = {} -+ _detector_name_list = defaultdict(list) - _disk_detectors = {} - - def __init__(self, config_parser: ConfigParser): -@@ -43,9 +48,13 @@ class SlowIODetection: - self.__init_detector() - - def __init_detector_name_list(self): -- self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency) -+ self._disk_list = check_collect_valid( -+ self._config_parser.slow_io_detect_frequency -+ ) - if self._disk_list is None: -- Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...") -+ Report.report_pass( -+ "get available disk error, please check if the collector plug is enable. exiting..." -+ ) - exit(1) - - logging.info(f"ai_block_io plug has found disks: {self._disk_list}") -@@ -56,27 +65,45 @@ class SlowIODetection: - # 情况2:is not None and len = 0,则不启动任何磁盘检测 - # 情况3:len != 0,则取交集 - if disks is None: -- logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") -- for disk in self._disk_list: -- for stage in stages: -- for iotype in iotypes: -- if disk not in self._detector_name_list: -- self._detector_name_list[disk] = [] -- self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) -- else: -- for disk in disks: -- if disk in self._disk_list: -- for stage in stages: -- for iotype in iotypes: -- if disk not in self._detector_name_list: -- self._detector_name_list[disk] = [] -- self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) -- else: -- logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk) -- if len(self._detector_name_list) == 0: -- logging.critical("the disks to detection is empty, ai_block_io will exit.") -- Report.report_pass("the disks to detection is empty, ai_block_io will exit.") -- exit(1) -+ logging.warning( -+ "you not specify any disk or use default, so ai_block_io will enable all available disk." -+ ) -+ for disk in self._disk_list: -+ if disks is not None: -+ if disk not in disks: -+ continue -+ disks.remove(disk) -+ -+ disk_type_result = get_disk_type(disk) -+ if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( -+ '0', -+ '1', -+ '2', -+ ): -+ disk_type = int(disk_type_result["message"]) -+ else: -+ logging.warning( -+ "%s get disk type error, return %s, so it will be ignored.", -+ disk, -+ disk_type_result, -+ ) -+ continue -+ for stage in stages: -+ for iotype in iotypes: -+ self._detector_name_list[disk].append( -+ MetricName(disk, disk_type, stage, iotype, "latency") -+ ) -+ if disks: -+ logging.warning( -+ "disks: %s not in available disk list, so they will be ignored.", -+ disks, -+ ) -+ if not self._detector_name_list: -+ logging.critical("the disks to detection is empty, ai_block_io will exit.") -+ Report.report_pass( -+ "the disks to detection is empty, ai_block_io will exit." -+ ) -+ exit(1) - - def __init_detector(self): - train_data_duration, train_update_duration = ( -@@ -88,26 +115,39 @@ class SlowIODetection: - train_data_duration, train_update_duration, slow_io_detection_frequency - ) - sliding_window_type = self._config_parser.sliding_window_type -- window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold()) -+ window_size, window_threshold = ( -+ self._config_parser.get_window_size_and_window_minimum_threshold() -+ ) - - for disk, metric_name_list in self._detector_name_list.items(): -- threshold = ThresholdFactory().get_threshold( -- threshold_type, -- boxplot_parameter=self._config_parser.boxplot_parameter, -- n_sigma_paramter=self._config_parser.n_sigma_parameter, -- data_queue_size=data_queue_size, -- data_queue_update_size=update_size, -- ) -- sliding_window = SlidingWindowFactory().get_sliding_window( -- sliding_window_type, -- queue_length=window_size, -- threshold=window_threshold, -- ) - disk_detector = DiskDetector(disk) - for metric_name in metric_name_list: -+ threshold = ThresholdFactory().get_threshold( -+ threshold_type, -+ boxplot_parameter=self._config_parser.boxplot_parameter, -+ n_sigma_paramter=self._config_parser.n_sigma_parameter, -+ data_queue_size=data_queue_size, -+ data_queue_update_size=update_size, -+ ) -+ abs_threshold = self._config_parser.get_tot_lim( -+ metric_name.disk_type, metric_name.io_access_type_name -+ ) -+ if abs_threshold is None: -+ logging.warning( -+ "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", -+ disk, -+ metric_name.disk_type, -+ metric_name.io_access_type_name, -+ ) -+ sliding_window = SlidingWindowFactory().get_sliding_window( -+ sliding_window_type, -+ queue_length=window_size, -+ threshold=window_threshold, -+ abs_threshold=abs_threshold, -+ ) - detector = Detector(metric_name, threshold, sliding_window) - disk_detector.add_detector(detector) -- logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]') -+ logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") - self._disk_detectors[disk] = disk_detector - - def launch(self): -@@ -138,14 +178,17 @@ class SlowIODetection: - logging.debug("step3. Report slow io event to sysSentry.") - for slow_io_event in slow_io_event_list: - metric_name: MetricName = slow_io_event[1] -+ window_info = slow_io_event[2] -+ root_cause = slow_io_event[3] - alarm_content = { -- "driver_name": f"{metric_name.get_disk_name()}", -- "reason": "disk_slow", -- "block_stack": f"{metric_name.get_stage_name()}", -- "io_type": f"{metric_name.get_io_access_type_name()}", -+ "driver_name": f"{metric_name.disk_name}", -+ "reason": root_cause, -+ "block_stack": f"{metric_name.stage_name}", -+ "io_type": f"{metric_name.io_access_type_name}", - "alarm_source": "ai_block_io", - "alarm_type": "latency", -- "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.", -+ "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, " -+ f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.", - } - Xalarm.major(alarm_content) - logging.warning(alarm_content) -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index a357766..3388cd4 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -20,59 +20,62 @@ from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_lo - - LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - --ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio'] --ALL_IOTPYE_LIST = ['read', 'write'] -+ALL_STAGE_LIST = [ -+ "throtl", -+ "wbt", -+ "gettag", -+ "plug", -+ "deadline", -+ "hctx", -+ "requeue", -+ "rq_driver", -+ "bio", -+] -+ALL_IOTPYE_LIST = ["read", "write"] -+DISK_TYPE_MAP = { -+ 0: "nvme_ssd", -+ 1: "sata_ssd", -+ 2: "sata_hdd", -+} - - - def init_log_format(log_level: str): - logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) - if log_level.lower() not in ("info", "warning", "error", "debug"): - logging.warning( -- f"the log_level: {log_level} you set is invalid, use default value: info." -+ "the log_level: %s you set is invalid, use default value: info.", log_level - ) - - - class ConfigParser: -- DEFAULT_ABSOLUTE_THRESHOLD = 40 -- DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 -- DEFAULT_LOG_LEVEL = "info" -- -- DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio' -- DEFAULT_IOTYPE = 'read,write' -- -- DEFAULT_ALGORITHM_TYPE = "boxplot" -- DEFAULT_TRAIN_DATA_DURATION = 24 -- DEFAULT_TRAIN_UPDATE_DURATION = 2 -- DEFAULT_BOXPLOT_PARAMETER = 1.5 -- DEFAULT_N_SIGMA_PARAMETER = 3 -- -- DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous" -- DEFAULT_WINDOW_SIZE = 30 -- DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 -+ DEFAULT_CONF = { -+ "log": {"level": "info"}, -+ "common": { -+ "slow_io_detect_frequency": 1, -+ "disk": None, -+ "stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio", -+ "iotype": "read,write", -+ }, -+ "algorithm": { -+ "train_data_duration": 24.0, -+ "train_update_duration": 2.0, -+ "algorithm_type": get_threshold_type_enum("boxplot"), -+ "boxplot_parameter": 1.5, -+ "n_sigma_parameter": 3.0, -+ }, -+ "sliding_window": { -+ "sliding_window_type": get_sliding_window_type_enum("not_continuous"), -+ "window_size": 30, -+ "window_minimum_threshold": 6, -+ }, -+ "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, -+ "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, -+ "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, -+ } - - def __init__(self, config_file_name): -- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -- self.__slow_io_detect_frequency = ( -- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -- ) -- self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL -- self.__disks_to_detection = None -- self.__stage = ConfigParser.DEFAULT_STAGE -- self.__iotype = ConfigParser.DEFAULT_IOTYPE -- -- self.__algorithm_type = get_threshold_type_enum( -- ConfigParser.DEFAULT_ALGORITHM_TYPE -- ) -- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -- self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -- -- self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE -- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -- self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -- -- self.__config_file_name = config_file_name -+ self._conf = ConfigParser.DEFAULT_CONF -+ self._config_file_name = config_file_name - - def _get_config_value( - self, -@@ -156,30 +159,21 @@ class ConfigParser: - - return value - -- def __read_absolute_threshold(self, items_common: dict): -- self.__absolute_threshold = self._get_config_value( -- items_common, -- "absolute_threshold", -- float, -- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, -- gt=0, -- ) -- -- def __read__slow_io_detect_frequency(self, items_common: dict): -- self.__slow_io_detect_frequency = self._get_config_value( -+ def _read_slow_io_detect_frequency(self, items_common: dict): -+ self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value( - items_common, - "slow_io_detect_frequency", - int, -- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY, -+ self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], - gt=0, - le=300, - ) - -- def __read__disks_to_detect(self, items_common: dict): -+ def _read_disks_to_detect(self, items_common: dict): - disks_to_detection = items_common.get("disk") - if disks_to_detection is None: - logging.warning("config of disk not found, the default value will be used.") -- self.__disks_to_detection = None -+ self._conf["common"]["disk"] = None - return - disks_to_detection = disks_to_detection.strip() - if not disks_to_detection: -@@ -189,40 +183,46 @@ class ConfigParser: - ) - exit(1) - disk_list = disks_to_detection.split(",") -+ disk_list = [disk.strip() for disk in disk_list] - if len(disk_list) == 1 and disk_list[0] == "default": -- self.__disks_to_detection = None -+ self._conf["common"]["disk"] = None - return -- self.__disks_to_detection = disk_list -+ self._conf["common"]["disk"] = disk_list - -- def __read__train_data_duration(self, items_algorithm: dict): -- self.__train_data_duration = self._get_config_value( -+ def _read_train_data_duration(self, items_algorithm: dict): -+ self._conf["common"]["train_data_duration"] = self._get_config_value( - items_algorithm, - "train_data_duration", - float, -- ConfigParser.DEFAULT_TRAIN_DATA_DURATION, -+ self.DEFAULT_CONF["algorithm"]["train_data_duration"], - gt=0, - le=720, - ) - -- def __read__train_update_duration(self, items_algorithm: dict): -- default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -- if default_train_update_duration > self.__train_data_duration: -- default_train_update_duration = self.__train_data_duration / 2 -- self.__train_update_duration = self._get_config_value( -+ def _read_train_update_duration(self, items_algorithm: dict): -+ default_train_update_duration = self.DEFAULT_CONF["algorithm"][ -+ "train_update_duration" -+ ] -+ if default_train_update_duration > self._conf["common"]["train_data_duration"]: -+ default_train_update_duration = ( -+ self._conf["common"]["train_data_duration"] / 2 -+ ) -+ self._conf["common"]["train_update_duration"] = self._get_config_value( - items_algorithm, - "train_update_duration", - float, - default_train_update_duration, - gt=0, -- le=self.__train_data_duration, -+ le=self._conf["common"]["train_data_duration"], - ) - -- def __read__algorithm_type_and_parameter(self, items_algorithm: dict): -- algorithm_type = items_algorithm.get( -- "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE -- ) -- self.__algorithm_type = get_threshold_type_enum(algorithm_type) -- if self.__algorithm_type is None: -+ def _read_algorithm_type_and_parameter(self, items_algorithm: dict): -+ algorithm_type = items_algorithm.get("algorithm_type") -+ if algorithm_type is not None: -+ self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum( -+ algorithm_type -+ ) -+ if self._conf["algorithm"]["algorithm_type"] is None: - logging.critical( - "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", - algorithm_type, -@@ -231,129 +231,175 @@ class ConfigParser: - f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." - ) - exit(1) -- -- if self.__algorithm_type == ThresholdType.NSigmaThreshold: -- self.__n_sigma_parameter = self._get_config_value( -+ elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold: -+ self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value( - items_algorithm, - "n_sigma_parameter", - float, -- ConfigParser.DEFAULT_N_SIGMA_PARAMETER, -+ self.DEFAULT_CONF["algorithm"]["n_sigma_parameter"], - gt=0, - le=10, - ) -- elif self.__algorithm_type == ThresholdType.BoxplotThreshold: -- self.__boxplot_parameter = self._get_config_value( -+ elif ( -+ self._conf["algorithm"]["algorithm_type"] == ThresholdType.BoxplotThreshold -+ ): -+ self._conf["algorithm"]["boxplot_parameter"] = self._get_config_value( - items_algorithm, - "boxplot_parameter", - float, -- ConfigParser.DEFAULT_BOXPLOT_PARAMETER, -+ self.DEFAULT_CONF["algorithm"]["boxplot_parameter"], - gt=0, - le=10, - ) - -- def __read__stage(self, items_algorithm: dict): -- stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE) -- stage_list = stage_str.split(',') -- if len(stage_list) == 1 and stage_list[0] == '': -- logging.critical('stage value not allow is empty, exiting...') -+ def _read_stage(self, items_algorithm: dict): -+ stage_str = items_algorithm.get( -+ "stage", self.DEFAULT_CONF["common"]["stage"] -+ ).strip() -+ stage_list = stage_str.split(",") -+ stage_list = [stage.strip() for stage in stage_list] -+ if len(stage_list) == 1 and stage_list[0] == "": -+ logging.critical("stage value not allow is empty, exiting...") - exit(1) -- if len(stage_list) == 1 and stage_list[0] == 'default': -- logging.warning(f'stage will enable default value: {ConfigParser.DEFAULT_STAGE}') -- self.__stage = ALL_STAGE_LIST -+ if len(stage_list) == 1 and stage_list[0] == "default": -+ logging.warning( -+ "stage will enable default value: %s", -+ self.DEFAULT_CONF["common"]["stage"], -+ ) -+ self._conf["common"]["stage"] = ALL_STAGE_LIST - return - for stage in stage_list: - if stage not in ALL_STAGE_LIST: -- logging.critical(f'stage: {stage} is not valid stage, ai_block_io will exit...') -+ logging.critical( -+ "stage: %s is not valid stage, ai_block_io will exit...", stage -+ ) - exit(1) - dup_stage_list = set(stage_list) -- if 'bio' not in dup_stage_list: -- logging.critical('stage must contains bio stage, exiting...') -+ if "bio" not in dup_stage_list: -+ logging.critical("stage must contains bio stage, exiting...") - exit(1) -- self.__stage = dup_stage_list -- -- def __read__iotype(self, items_algorithm: dict): -- iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE) -- iotype_list = iotype_str.split(',') -- if len(iotype_list) == 1 and iotype_list[0] == '': -- logging.critical('iotype value not allow is empty, exiting...') -+ self._conf["common"]["stage"] = dup_stage_list -+ -+ def _read_iotype(self, items_algorithm: dict): -+ iotype_str = items_algorithm.get( -+ "iotype", self.DEFAULT_CONF["common"]["iotype"] -+ ).strip() -+ iotype_list = iotype_str.split(",") -+ iotype_list = [iotype.strip() for iotype in iotype_list] -+ if len(iotype_list) == 1 and iotype_list[0] == "": -+ logging.critical("iotype value not allow is empty, exiting...") - exit(1) -- if len(iotype_list) == 1 and iotype_list[0] == 'default': -- logging.warning(f'iotype will enable default value: {ConfigParser.DEFAULT_IOTYPE}') -- self.__iotype = ALL_IOTPYE_LIST -+ if len(iotype_list) == 1 and iotype_list[0] == "default": -+ logging.warning( -+ "iotype will enable default value: %s", -+ self.DEFAULT_CONF["common"]["iotype"], -+ ) -+ self._conf["common"]["iotype"] = ALL_IOTPYE_LIST - return - for iotype in iotype_list: - if iotype not in ALL_IOTPYE_LIST: -- logging.critical(f'iotype: {iotype} is not valid iotype, ai_block_io will exit...') -+ logging.critical( -+ "iotype: %s is not valid iotype, ai_block_io will exit...", iotype -+ ) - exit(1) - dup_iotype_list = set(iotype_list) -- self.__iotype = dup_iotype_list -+ self._conf["common"]["iotype"] = dup_iotype_list -+ -+ def _read_sliding_window_type(self, items_sliding_window: dict): -+ sliding_window_type = items_sliding_window.get("sliding_window_type") -+ if sliding_window_type is not None: -+ self._conf["sliding_window"]["sliding_window_type"] = ( -+ get_sliding_window_type_enum(sliding_window_type) -+ ) -+ if self._conf["sliding_window"]["sliding_window_type"] is None: -+ logging.critical( -+ "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.", -+ sliding_window_type, -+ ) -+ Report.report_pass( -+ f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." -+ ) -+ exit(1) - -- def __read__window_size(self, items_sliding_window: dict): -- self.__window_size = self._get_config_value( -+ def _read_window_size(self, items_sliding_window: dict): -+ self._conf["sliding_window"]["window_size"] = self._get_config_value( - items_sliding_window, - "window_size", - int, -- ConfigParser.DEFAULT_WINDOW_SIZE, -+ self.DEFAULT_CONF["sliding_window"]["window_size"], - gt=0, - le=3600, - ) - -- def __read__window_minimum_threshold(self, items_sliding_window: dict): -- default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -- if default_window_minimum_threshold > self.__window_size: -- default_window_minimum_threshold = self.__window_size / 2 -- self.__window_minimum_threshold = self._get_config_value( -- items_sliding_window, -- "window_minimum_threshold", -- int, -- default_window_minimum_threshold, -- gt=0, -- le=self.__window_size, -+ def _read_window_minimum_threshold(self, items_sliding_window: dict): -+ default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][ -+ "window_minimum_threshold" -+ ] -+ if ( -+ default_window_minimum_threshold -+ > self._conf["sliding_window"]["window_size"] -+ ): -+ default_window_minimum_threshold = ( -+ self._conf["sliding_window"]["window_size"] / 2 -+ ) -+ self._conf["sliding_window"]["window_minimum_threshold"] = ( -+ self._get_config_value( -+ items_sliding_window, -+ "window_minimum_threshold", -+ int, -+ default_window_minimum_threshold, -+ gt=0, -+ le=self._conf["sliding_window"]["window_size"], -+ ) - ) - - def read_config_from_file(self): -- if not os.path.exists(self.__config_file_name): -- init_log_format(self.__log_level) -+ if not os.path.exists(self._config_file_name): -+ init_log_format(self._conf["log"]["level"]) - logging.critical( - "config file %s not found, ai_block_io plug will exit.", -- self.__config_file_name, -+ self._config_file_name, - ) - Report.report_pass( -- f"config file {self.__config_file_name} not found, ai_block_io plug will exit." -+ f"config file {self._config_file_name} not found, ai_block_io plug will exit." - ) - exit(1) - - con = configparser.ConfigParser() - try: -- con.read(self.__config_file_name, encoding="utf-8") -+ con.read(self._config_file_name, encoding="utf-8") - except configparser.Error as e: -- init_log_format(self.__log_level) -+ init_log_format(self._conf["log"]["level"]) - logging.critical( -- f"config file read error: %s, ai_block_io plug will exit.", e -+ "config file read error: %s, ai_block_io plug will exit.", e - ) - Report.report_pass( - f"config file read error: {e}, ai_block_io plug will exit." - ) - exit(1) - -- if con.has_section('log'): -- items_log = dict(con.items('log')) -+ if con.has_section("log"): -+ items_log = dict(con.items("log")) - # 情况一:没有log,则使用默认值 - # 情况二:有log,值为空或异常,使用默认值 - # 情况三:有log,值正常,则使用该值 -- self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL) -- init_log_format(self.__log_level) -+ self._conf["log"]["level"] = items_log.get( -+ "level", self.DEFAULT_CONF["log"]["level"] -+ ) -+ init_log_format(self._conf["log"]["level"]) - else: -- init_log_format(self.__log_level) -- logging.warning(f"log section parameter not found, it will be set to default value.") -+ init_log_format(self._conf["log"]["level"]) -+ logging.warning( -+ "log section parameter not found, it will be set to default value." -+ ) - - if con.has_section("common"): - items_common = dict(con.items("common")) -- self.__read_absolute_threshold(items_common) -- self.__read__slow_io_detect_frequency(items_common) -- self.__read__disks_to_detect(items_common) -- self.__read__stage(items_common) -- self.__read__iotype(items_common) -+ -+ self._read_slow_io_detect_frequency(items_common) -+ self._read_disks_to_detect(items_common) -+ self._read_stage(items_common) -+ self._read_iotype(items_common) - else: - logging.warning( - "common section parameter not found, it will be set to default value." -@@ -361,9 +407,9 @@ class ConfigParser: - - if con.has_section("algorithm"): - items_algorithm = dict(con.items("algorithm")) -- self.__read__train_data_duration(items_algorithm) -- self.__read__train_update_duration(items_algorithm) -- self.__read__algorithm_type_and_parameter(items_algorithm) -+ self._read_train_data_duration(items_algorithm) -+ self._read_train_update_duration(items_algorithm) -+ self._read_algorithm_type_and_parameter(items_algorithm) - else: - logging.warning( - "algorithm section parameter not found, it will be set to default value." -@@ -371,101 +417,162 @@ class ConfigParser: - - if con.has_section("sliding_window"): - items_sliding_window = dict(con.items("sliding_window")) -- sliding_window_type = items_sliding_window.get( -- "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE -+ -+ self._read_window_size(items_sliding_window) -+ self._read_window_minimum_threshold(items_sliding_window) -+ else: -+ logging.warning( -+ "sliding_window section parameter not found, it will be set to default value." -+ ) -+ -+ if con.has_section("latency_sata_ssd"): -+ items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) -+ self._conf["latency_sata_ssd"]["read_tot_lim"] = self._get_config_value( -+ items_latency_sata_ssd, -+ "read_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"], -+ gt=0, - ) -- self.__sliding_window_type = get_sliding_window_type_enum( -- sliding_window_type -+ self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value( -+ items_latency_sata_ssd, -+ "write_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"], -+ gt=0, - ) -- self.__read__window_size(items_sliding_window) -- self.__read__window_minimum_threshold(items_sliding_window) - else: - logging.warning( -- "sliding_window section parameter not found, it will be set to default value." -+ "latency_sata_ssd section parameter not found, it will be set to default value." -+ ) -+ if con.has_section("latency_nvme_ssd"): -+ items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd")) -+ self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value( -+ items_latency_nvme_ssd, -+ "read_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"], -+ gt=0, -+ ) -+ self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value( -+ items_latency_nvme_ssd, -+ "write_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], -+ gt=0, -+ ) -+ else: -+ logging.warning( -+ "latency_nvme_ssd section parameter not found, it will be set to default value." -+ ) -+ if con.has_section("latency_sata_hdd"): -+ items_latency_sata_hdd = dict(con.items("latency_sata_hdd")) -+ self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value( -+ items_latency_sata_hdd, -+ "read_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"], -+ gt=0, -+ ) -+ self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value( -+ items_latency_sata_hdd, -+ "write_tot_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], -+ gt=0, -+ ) -+ else: -+ logging.warning( -+ "latency_sata_hdd section parameter not found, it will be set to default value." - ) - - self.__print_all_config_value() - -- def __repr__(self): -- config_str = { -- 'log.level': self.__log_level, -- 'common.absolute_threshold': self.__absolute_threshold, -- 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency, -- 'common.disk': self.__disks_to_detection, -- 'common.stage': self.__stage, -- 'common.iotype': self.__iotype, -- 'algorithm.train_data_duration': self.__train_data_duration, -- 'algorithm.train_update_duration': self.__train_update_duration, -- 'algorithm.algorithm_type': self.__algorithm_type, -- 'algorithm.boxplot_parameter': self.__boxplot_parameter, -- 'algorithm.n_sigma_parameter': self.__n_sigma_parameter, -- 'sliding_window.sliding_window_type': self.__sliding_window_type, -- 'sliding_window.window_size': self.__window_size, -- 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold -- } -- return str(config_str) -+ def __repr__(self) -> str: -+ return str(self._conf) -+ -+ def __str__(self) -> str: -+ return str(self._conf) - - def __print_all_config_value(self): -- logging.info(f"all config is follow:\n {self}") -+ logging.info("all config is follow:\n %s", self) -+ -+ def get_tot_lim(self, disk_type, io_type): -+ if io_type == "read": -+ return self._conf.get( -+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} -+ ).get("read_tot_lim", None) -+ elif io_type == "write": -+ return self._conf.get( -+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} -+ ).get("write_tot_lim", None) -+ else: -+ return None - - def get_train_data_duration_and_train_update_duration(self): -- return self.__train_data_duration, self.__train_update_duration -+ return ( -+ self._conf["common"]["train_data_duration"], -+ self._conf["common"]["train_update_duration"], -+ ) - - def get_window_size_and_window_minimum_threshold(self): -- return self.__window_size, self.__window_minimum_threshold -+ return ( -+ self._conf["sliding_window"]["window_size"], -+ self._conf["sliding_window"]["window_minimum_threshold"], -+ ) - - @property - def slow_io_detect_frequency(self): -- return self.__slow_io_detect_frequency -+ return self._conf["common"]["slow_io_detect_frequency"] - - @property - def algorithm_type(self): -- return self.__algorithm_type -+ return self._conf["algorithm"]["algorithm_type"] - - @property - def sliding_window_type(self): -- return self.__sliding_window_type -+ return self._conf["sliding_window"]["sliding_window_type"] - - @property - def train_data_duration(self): -- return self.__train_data_duration -+ return self._conf["common"]["train_data_duration"] - - @property - def train_update_duration(self): -- return self.__train_update_duration -+ return self._conf["common"]["train_update_duration"] - - @property - def window_size(self): -- return self.__window_size -+ return self._conf["sliding_window"]["window_size"] - - @property - def window_minimum_threshold(self): -- return self.__window_minimum_threshold -+ return self._conf["sliding_window"]["window_minimum_threshold"] - - @property - def absolute_threshold(self): -- return self.__absolute_threshold -+ return self._conf["common"]["absolute_threshold"] - - @property - def log_level(self): -- return self.__log_level -+ return self._conf["log"]["level"] - - @property - def disks_to_detection(self): -- return self.__disks_to_detection -+ return self._conf["common"]["disk"] - - @property - def stage(self): -- return self.__stage -+ return self._conf["common"]["stage"] - - @property - def iotype(self): -- return self.__iotype -+ return self._conf["common"]["iotype"] - - @property - def boxplot_parameter(self): -- return self.__boxplot_parameter -+ return self._conf["algorithm"]["boxplot_parameter"] - - @property - def n_sigma_parameter(self): -- return self.__n_sigma_parameter -+ return self._conf["algorithm"]["n_sigma_parameter"] -diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -index ed997e6..1bc5ed8 100644 ---- a/src/python/sentryPlugins/ai_block_io/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -16,6 +16,7 @@ from sentryCollector.collect_plugin import ( - Result_Messages, - get_io_data, - is_iocollect_valid, -+ get_disk_type - ) - - -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index e710ddd..87bd1dd 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -17,9 +17,6 @@ from .utils import get_metric_value_from_io_data_dict_by_metric_name - - - class Detector: -- _metric_name: MetricName = None -- _threshold: Threshold = None -- _slidingWindow: SlidingWindow = None - - def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): - self._metric_name = metric_name -@@ -40,18 +37,24 @@ class Detector: - metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) - if metric_value is None: - logging.debug('not found metric value, so return None.') -- return False, None, None -+ return (False, False), None, None, None - logging.debug(f'input metric value: {str(metric_value)}') - self._threshold.push_latest_data_to_queue(metric_value) - detection_result = self._slidingWindow.is_slow_io_event(metric_value) -- logging.debug(f'Detection result: {str(detection_result)}') -+ # 检测到慢周期,由Detector负责打印info级别日志 -+ if detection_result[0][1]: -+ logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, ' -+ f'current value: {metric_value}, ai threshold: {detection_result[2]}, ' -+ f'absolute threshold: {detection_result[3]}') -+ else: -+ logging.debug(f'Detection result: {str(detection_result)}') - logging.debug(f'exit Detector: {self}') - return detection_result - - def __repr__(self): -- return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' -- f' io_type_name: {self._metric_name.get_io_access_type_name()},' -- f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' -+ return (f'disk_name: {self._metric_name.disk_name}, stage_name: {self._metric_name.stage_name},' -+ f' io_type_name: {self._metric_name.io_access_type_name},' -+ f' metric_name: {self._metric_name.metric_name}, threshold_type: {self._threshold},' - f' sliding_window_type: {self._slidingWindow}') - - -@@ -65,13 +68,38 @@ class DiskDetector: - self._detector_list.append(detector) - - def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -- # 只有bio阶段发生异常,就认为发生了慢IO事件 -- # todo:根因诊断 -+ """ -+ 根因诊断逻辑:只有bio阶段发生异常,才认为发生了慢IO事件,即bio阶段异常是慢IO事件的必要条件 -+ 情况一:bio异常,rq_driver也异常,则慢盘 -+ 情况二:bio异常,rq_driver无异常,且有内核IO栈任意阶段异常,则IO栈异常 -+ 情况三:bio异常,rq_driver无异常,且无内核IO栈任意阶段异常,则IO压力大 -+ 情况四:bio异常,则UNKNOWN -+ """ -+ diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []} - for detector in self._detector_list: -+ # result返回内容:(是否检测到慢IO,是否检测到慢周期)、窗口、ai阈值、绝对阈值 -+ # 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold - result = detector.is_slow_io_event(io_data_dict_with_disk_name) -- if result[0] and detector.get_metric_name().get_stage_name() == 'bio': -- return result[0], detector.get_metric_name(), result[1], result[2] -- return False, None, None, None -+ if result[0][0]: -+ if detector.get_metric_name().stage_name == "bio": -+ diagnosis_info["bio"].append((detector.get_metric_name(), result)) -+ elif detector.get_metric_name().stage_name == "rq_driver": -+ diagnosis_info["rq_driver"].append((detector.get_metric_name(), result)) -+ else: -+ diagnosis_info["io_stage"].append((detector.get_metric_name(), result)) -+ -+ # 返回内容:(1)是否检测到慢IO事件、(2)MetricName、(3)滑动窗口及阈值、(4)慢IO事件根因 -+ root_cause = None -+ if len(diagnosis_info["bio"]) == 0: -+ return False, None, None, None -+ elif len(diagnosis_info["rq_driver"]) != 0: -+ root_cause = "[Root Cause:disk slow]" -+ elif len(diagnosis_info["io_stage"]) != 0: -+ stage = diagnosis_info["io_stage"][0][1].get_stage_name() -+ root_cause = f"[Root Cause:io stage slow, stage: {stage}]" -+ if root_cause is None: -+ root_cause = "[Root Cause:high io pressure]" -+ return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause - - def __repr__(self): - msg = f'disk: {self._disk_name}, ' -diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py -index 0e17051..d341b55 100644 ---- a/src/python/sentryPlugins/ai_block_io/io_data.py -+++ b/src/python/sentryPlugins/ai_block_io/io_data.py -@@ -45,30 +45,10 @@ class IOData: - time_stamp: float = field(default_factory=lambda: datetime.now().timestamp()) - - -+@dataclass(frozen=True) - class MetricName: -- _disk_name: str = None -- _stage_name: str = None -- _io_access_type_name: str = None -- _metric_name: str = None -- -- def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str): -- self._disk_name = disk_name -- self._stage_name = stage_name -- self._io_access_type_name = io_access_type_name -- self._metric_name = metric_name -- -- def get_disk_name(self): -- return self._disk_name -- -- def get_stage_name(self): -- return self._stage_name -- -- def get_io_access_type_name(self): -- return self._io_access_type_name -- -- def get_metric_name(self): -- return self._metric_name -- -- def __repr__(self): -- return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},' -- f'metric: {self._metric_name}') -+ disk_name: str -+ disk_type: str -+ stage_name: str -+ io_access_type_name: str -+ metric_name: str -diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -index 89191e5..d7c402a 100644 ---- a/src/python/sentryPlugins/ai_block_io/sliding_window.py -+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py -@@ -21,15 +21,11 @@ class SlidingWindowType(Enum): - - - class SlidingWindow: -- _ai_threshold = None -- _queue_length = None -- _queue_threshold = None -- _io_data_queue: list = None -- _io_data_queue_abnormal_tag: list = None -- -- def __init__(self, queue_length: int, threshold: int): -+ def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None): - self._queue_length = queue_length - self._queue_threshold = threshold -+ self._ai_threshold = None -+ self._abs_threshold = abs_threshold - self._io_data_queue = [] - self._io_data_queue_abnormal_tag = [] - -@@ -38,7 +34,12 @@ class SlidingWindow: - self._io_data_queue.pop(0) - self._io_data_queue_abnormal_tag.pop(0) - self._io_data_queue.append(data) -- self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold if self._ai_threshold is not None else False) -+ tag = False -+ if ((self._ai_threshold is not None and data >= self._ai_threshold) or -+ (self._abs_threshold is not None and data >= self._abs_threshold)): -+ tag = True -+ self._io_data_queue_abnormal_tag.append(tag) -+ return tag - - def update(self, threshold): - if self._ai_threshold == threshold: -@@ -49,7 +50,7 @@ class SlidingWindow: - self._io_data_queue_abnormal_tag.append(data >= self._ai_threshold) - - def is_slow_io_event(self, data): -- return False, None, None -+ return False, None, None, None - - def __repr__(self): - return "[SlidingWindow]" -@@ -57,12 +58,13 @@ class SlidingWindow: - - class NotContinuousSlidingWindow(SlidingWindow): - def is_slow_io_event(self, data): -- super().push(data) -- if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -- return False, self._io_data_queue, self._ai_threshold -+ is_abnormal_period = super().push(data) -+ is_slow_io_event = False -+ if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): -+ is_slow_io_event = False - if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: -- return True, self._io_data_queue, self._ai_threshold -- return False, self._io_data_queue, self._ai_threshold -+ is_slow_io_event = True -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold - - def __repr__(self): - return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" -@@ -70,18 +72,20 @@ class NotContinuousSlidingWindow(SlidingWindow): - - class ContinuousSlidingWindow(SlidingWindow): - def is_slow_io_event(self, data): -- super().push(data) -- if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -- return False, self._io_data_queue, self._ai_threshold -+ is_abnormal_period = super().push(data) -+ is_slow_io_event = False -+ if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): -+ is_slow_io_event = False - consecutive_count = 0 - for tag in self._io_data_queue_abnormal_tag: - if tag: - consecutive_count += 1 - if consecutive_count >= self._queue_threshold: -- return True, self._io_data_queue, self._ai_threshold -+ is_slow_io_event = True -+ break - else: - consecutive_count = 0 -- return False, self._io_data_queue, self._ai_threshold -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold - - def __repr__(self): - return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" -@@ -89,20 +93,23 @@ class ContinuousSlidingWindow(SlidingWindow): - - class MedianSlidingWindow(SlidingWindow): - def is_slow_io_event(self, data): -- super().push(data) -- if len(self._io_data_queue) < self._queue_length or self._ai_threshold is None: -- return False, self._io_data_queue, self._ai_threshold -+ is_abnormal_period = super().push(data) -+ is_slow_io_event = False -+ if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): -+ is_slow_io_event = False - median = np.median(self._io_data_queue) - if median >= self._ai_threshold: -- return True, self._io_data_queue, self._ai_threshold -- return False, self._io_data_queue, self._ai_threshold -+ is_slow_io_event = True -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold - - def __repr__(self): - return f"[MedianSlidingWindow, window size: {self._queue_length}]" - - - class SlidingWindowFactory: -- def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs): -+ def get_sliding_window( -+ self, sliding_window_type: SlidingWindowType, *args, **kwargs -+ ): - if sliding_window_type == SlidingWindowType.NotContinuousSlidingWindow: - return NotContinuousSlidingWindow(*args, **kwargs) - elif sliding_window_type == SlidingWindowType.ContinuousSlidingWindow: -diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py -index 0ed37b9..d6f4067 100644 ---- a/src/python/sentryPlugins/ai_block_io/utils.py -+++ b/src/python/sentryPlugins/ai_block_io/utils.py -@@ -19,53 +19,57 @@ from .io_data import MetricName, IOData - - - def get_threshold_type_enum(algorithm_type: str): -- if algorithm_type.lower() == 'absolute': -+ if algorithm_type.lower() == "absolute": - return ThresholdType.AbsoluteThreshold -- if algorithm_type.lower() == 'boxplot': -+ if algorithm_type.lower() == "boxplot": - return ThresholdType.BoxplotThreshold -- if algorithm_type.lower() == 'n_sigma': -+ if algorithm_type.lower() == "n_sigma": - return ThresholdType.NSigmaThreshold - return None - - - def get_sliding_window_type_enum(sliding_window_type: str): -- if sliding_window_type.lower() == 'not_continuous': -+ if sliding_window_type.lower() == "not_continuous": - return SlidingWindowType.NotContinuousSlidingWindow -- if sliding_window_type.lower() == 'continuous': -+ if sliding_window_type.lower() == "continuous": - return SlidingWindowType.ContinuousSlidingWindow -- if sliding_window_type.lower() == 'median': -+ if sliding_window_type.lower() == "median": - return SlidingWindowType.MedianSlidingWindow -- logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous") -- return SlidingWindowType.NotContinuousSlidingWindow -+ return None - - --def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: MetricName): -+def get_metric_value_from_io_data_dict_by_metric_name( -+ io_data_dict: dict, metric_name: MetricName -+): - try: -- io_data: IOData = io_data_dict[metric_name.get_disk_name()] -- io_stage_data = asdict(io_data)[metric_name.get_stage_name()] -- base_data = io_stage_data[metric_name.get_io_access_type_name()] -- metric_value = base_data[metric_name.get_metric_name()] -+ io_data: IOData = io_data_dict[metric_name.disk_name] -+ io_stage_data = asdict(io_data)[metric_name.stage_name] -+ base_data = io_stage_data[metric_name.io_access_type_name] -+ metric_value = base_data[metric_name.metric_name] - return metric_value - except KeyError: - return None - - --def get_data_queue_size_and_update_size(training_data_duration: float, train_update_duration: float, -- slow_io_detect_frequency: int): -+def get_data_queue_size_and_update_size( -+ training_data_duration: float, -+ train_update_duration: float, -+ slow_io_detect_frequency: int, -+): - data_queue_size = int(training_data_duration * 60 * 60 / slow_io_detect_frequency) - update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency) - return data_queue_size, update_size - - - def get_log_level(log_level: str): -- if log_level.lower() == 'debug': -+ if log_level.lower() == "debug": - return logging.DEBUG -- elif log_level.lower() == 'info': -+ elif log_level.lower() == "info": - return logging.INFO -- elif log_level.lower() == 'warning': -+ elif log_level.lower() == "warning": - return logging.WARNING -- elif log_level.lower() == 'error': -+ elif log_level.lower() == "error": - return logging.ERROR -- elif log_level.lower() == 'critical': -+ elif log_level.lower() == "critical": - return logging.CRITICAL - return logging.INFO --- -2.23.0 - diff --git a/add-sentryctl-get_alarm-module_name-s-time_range-d.patch b/add-sentryctl-get_alarm-module_name-s-time_range-d.patch deleted file mode 100644 index 0003219158d6a1666770f4661cd49264e18cd1a7..0000000000000000000000000000000000000000 --- a/add-sentryctl-get_alarm-module_name-s-time_range-d.patch +++ /dev/null @@ -1,438 +0,0 @@ -From 8fa9389a85763831ea85d94f179a305d7f95d585 Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Sun, 29 Sep 2024 02:04:52 +0000 -Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=91=8A=E8=AD=A6=E4=BA=8B?= - =?UTF-8?q?=E4=BB=B6=E6=9F=A5=E8=AF=A2=E5=8A=9F=E8=83=BD=EF=BC=9Asentryctl?= - =?UTF-8?q?=20get=5Falarm=20=20-s=20=20-d?= -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Signed-off-by: jinsaihang ---- - src/python/syssentry/alarm.py | 142 ++++++++++++++++++ - .../src/python/syssentry/callbacks.py | 17 +++ - .../src/python/syssentry/global_values.py | 4 + - .../src/python/syssentry/load_mods.py | 16 ++ - .../src/python/syssentry/sentryctl | 20 ++- - .../src/python/syssentry/syssentry.py | 13 +- - .../src/python/syssentry/task_map.py | 5 +- - 7 files changed, 212 insertions(+), 5 deletions(-) - create mode 100644 src/python/syssentry/alarm.py - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -new file mode 100644 -index 0000000..74a2716 ---- /dev/null -+++ b/src/python/syssentry/alarm.py -@@ -0,0 +1,142 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+""" -+use for report alarm -+""" -+import threading -+from typing import Dict, List -+from datetime import datetime -+import time -+import logging -+import json -+ -+from xalarm.register_xalarm import xalarm_register,xalarm_getid,xalarm_getlevel,xalarm_gettype,xalarm_gettime,xalarm_getdesc -+from xalarm.xalarm_api import Xalarm -+ -+from .global_values import InspectTask -+from .task_map import TasksMap -+ -+# 告警ID映射字典,key为插件名,value为告警ID(类型为数字) -+task_alarm_id_dict: Dict[str, int] = {} -+ -+# 告警老化时间字典,key为告警ID,value为老化时间(类型为数字,单位为秒) -+alarm_id_clear_time_dict: Dict[int, int] = {} -+ -+# 告警事件列表,key为告警ID,value为告警ID对应的告警事件列表(类型为list) -+alarm_list_dict: Dict[int, List[Xalarm]] = {} -+# 告警事件列表锁 -+alarm_list_lock = threading.Lock() -+ -+id_filter = [] -+id_base = 1001 -+clientId = -1 -+ -+MILLISECONDS_UNIT_SECONDS = 1000 -+ -+def update_alarm_list(alarm_info: Xalarm): -+ alarm_id = xalarm_getid(alarm_info) -+ timestamp = xalarm_gettime(alarm_info) -+ if not timestamp: -+ logging.error("Retrieve timestamp failed") -+ return -+ alarm_list_lock.acquire() -+ try: -+ # new alarm is inserted into list head -+ if alarm_id not in alarm_list_dict: -+ logging.warning(f"update_alarm_list: alarm_id {alarm_id} not found in alarm_list_dict") -+ return -+ alarm_list = alarm_list_dict[alarm_id] -+ -+ alarm_list.insert(0, alarm_info) -+ # clear alarm_info older than clear time threshold -+ clear_index = -1 -+ clear_time = alarm_id_clear_time_dict[alarm_id] -+ for i in range(len(alarm_list)): -+ if (timestamp - xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time: -+ clear_index = i -+ break -+ if clear_index >= 0: -+ alarm_list_dict[alarm_id] = alarm_list[:clear_index] -+ finally: -+ alarm_list_lock.release() -+ -+def alarm_register(): -+ logging.debug(f"alarm_register: enter") -+ # 初始化告警ID映射字典、告警老化时间字典 -+ for task_type in TasksMap.tasks_dict: -+ for task_name in TasksMap.tasks_dict[task_type]: -+ logging.info(f"alarm_register: {task_name} is registered") -+ task = TasksMap.tasks_dict[task_type][task_name] -+ alarm_id = task.alarm_id -+ alarm_clear_time = task.alarm_clear_time -+ alarm_list_dict[alarm_id] = [] -+ task_alarm_id_dict[task_name] = alarm_id -+ if alarm_id not in alarm_id_clear_time_dict: -+ alarm_id_clear_time_dict[alarm_id] = alarm_clear_time -+ else: -+ alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id]) -+ # 注册告警回调 -+ id_filter = [True] * 128 -+ clientId = xalarm_register(update_alarm_list, id_filter) -+ if clientId < 0: -+ logging.info(f'register xalarm: failed') -+ return clientId -+ logging.info('register xalarm: success') -+ return clientId -+ -+def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Dict]: -+ alarm_list_lock.acquire() -+ try: -+ if task_name not in task_alarm_id_dict: -+ logging.debug("task_name does not exist") -+ return [] -+ alarm_id = task_alarm_id_dict[task_name] -+ if alarm_id not in alarm_list_dict: -+ logging.debug("alarm_id does not exist") -+ return [] -+ alarm_list = alarm_list_dict[alarm_id] -+ logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements") -+ # clear alarm_info older than clear time threshold -+ stop_index = -1 -+ timestamp = int(datetime.now().timestamp()) -+ for i in range(len(alarm_list)): -+ logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}") -+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range): -+ stop_index = i -+ break -+ if stop_index >= 0: -+ alarm_list = alarm_list[:stop_index] -+ logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") -+ -+ def xalarm_to_dict(alarm_info: Xalarm) -> dict: -+ return { -+ 'alarm_id': xalarm_getid(alarm_info), -+ 'alarm_type': xalarm_gettype(alarm_info), -+ 'alarm_level': xalarm_getlevel(alarm_info), -+ 'timetamp': xalarm_gettime(alarm_info), -+ 'msg1': xalarm_getdesc(alarm_info) -+ } -+ -+ alarm_list = [xalarm_to_dict(alarm) for alarm in alarm_list] -+ -+ # keep detail -+ for alarm in alarm_list: -+ alarm_info = alarm['msg1'] -+ alarm_info = json.loads(alarm_info) -+ if not detailed: -+ if 'details' in alarm_info: -+ alarm_info.pop('details', None) -+ alarm.pop('msg1', None) -+ alarm['alarm_info'] = alarm_info -+ return alarm_list -+ finally: -+ alarm_list_lock.release() -diff --git a/src/python/syssentry/callbacks.py b/src/python/syssentry/callbacks.py -index b38b381..6ec2c29 100644 ---- a/src/python/syssentry/callbacks.py -+++ b/src/python/syssentry/callbacks.py -@@ -18,6 +18,7 @@ import logging - - from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE - from .mod_status import EXITED_STATUS, RUNNING_STATUS, WAITING_STATUS, set_runtime_status -+from .alarm import get_alarm_result - - - def task_get_status(mod_name): -@@ -41,6 +42,22 @@ def task_get_result(mod_name): - - return "success", task.get_result() - -+def task_get_alarm(data): -+ """get alarm by mod name""" -+ task_name = data['task_name'] -+ time_range = data['time_range'] -+ try: -+ detailed = data['detailed'] -+ except KeyError: -+ logging.debug("Key 'detailed' does not exist in the dictionary") -+ detailed = None -+ task = TasksMap.get_task_by_name(task_name) -+ if not task: -+ return "failed", f"cannot find task by name {task_name}" -+ if not task.load_enabled: -+ return "failed", f"mod {task_name} is not enabled" -+ -+ return "success", get_alarm_result(task_name, time_range, detailed) - - def task_stop(mod_name): - """stop by mod name""" -diff --git a/src/python/syssentry/global_values.py b/src/python/syssentry/global_values.py -index 483d544..b123b2d 100644 ---- a/src/python/syssentry/global_values.py -+++ b/src/python/syssentry/global_values.py -@@ -27,6 +27,7 @@ CTL_SOCKET_PATH = "/var/run/sysSentry/control.sock" - SYSSENTRY_CONF_PATH = "/etc/sysSentry" - INSPECT_CONF_PATH = "/etc/sysSentry/inspect.conf" - TASK_LOG_DIR = "/var/log/sysSentry" -+DEFAULT_ALARM_CLEAR_TIME = 15 - - SENTRY_RUN_DIR_PERM = 0o750 - -@@ -76,6 +77,9 @@ class InspectTask: - self.env_file = "" - # start mode - self.conflict = "up" -+ # alarm id -+ self.alarm_id = -1 -+ self.alarm_clear_time = DEFAULT_ALARM_CLEAR_TIME - - def start(self): - """ -diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py -index 48d7e66..ae05e57 100644 ---- a/src/python/syssentry/load_mods.py -+++ b/src/python/syssentry/load_mods.py -@@ -24,6 +24,7 @@ from .task_map import TasksMap, ONESHOT_TYPE, PERIOD_TYPE - from .cron_process import PeriodTask - from .mod_status import set_task_status - -+from xalarm.register_xalarm import MIN_ALARM_ID, MAX_ALARM_ID - ONESHOT_CONF = 'oneshot' - PERIOD_CONF = 'period' - -@@ -41,6 +42,8 @@ CONF_TASK_RESTART = 'task_restart' - CONF_ONSTART = 'onstart' - CONF_ENV_FILE = 'env_file' - CONF_CONFLICT = 'conflict' -+CONF_ALARM_ID = 'alarm_id' -+CONF_ALARM_CLEAR_TIME = 'alarm_clear_time' - - MOD_FILE_SUFFIX = '.mod' - MOD_SUFFIX_LEN = 4 -@@ -194,6 +197,18 @@ def parse_mod_conf(mod_name, mod_conf): - task.heartbeat_interval = heartbeat_interval - task.load_enabled = is_enabled - -+ try: -+ task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID)) -+ task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)) -+ if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): -+ raise ValueError("Invalid alarm_id") -+ except ValueError: -+ task.alarm_id = -1 -+ logging.warning("Invalid alarm_id, set to -1") -+ except configparser.NoOptionError: -+ task.alarm_id = -1 -+ logging.warning("Unset alarm_id and alarm_clear_time, use -1 and 15s as default") -+ - if CONF_ONSTART in mod_conf.options(CONF_TASK): - is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') - if task_type == PERIOD_CONF: -@@ -327,3 +342,4 @@ def reload_single_mod(mod_name): - res, ret = reload_mod_by_name(mod_name) - - return res, ret -+ -diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl -index e94491f..675c17a 100644 ---- a/src/python/syssentry/sentryctl -+++ b/src/python/syssentry/sentryctl -@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256 - - RESULT_MSG_DATA_LEN = 4 - CTL_MSG_LEN_LEN = 3 -+DEFAULT_ALARM_TIME_RANGE = 10 - - def status_output_format(res_data): - """format output""" -@@ -57,6 +58,8 @@ def res_output_handle(res_struct, req_type): - status_output_format(res_struct['data']) - elif req_type == 'get_result': - result_output_format(res_struct['data']) -+ elif req_type == 'get_alarm': -+ result_output_format(res_struct['data']) - elif res_struct['ret'] == "failed": - print(res_struct['data']) - -@@ -75,6 +78,7 @@ def client_send_and_recv(request_data, data_str_len): - print("sentryctl: client creat socket error") - return None - -+ # connect to syssentry - try: - client_socket.connect(CTL_SOCKET_PATH) - except OSError: -@@ -82,6 +86,7 @@ def client_send_and_recv(request_data, data_str_len): - print("sentryctl: client connect error") - return None - -+ # msg: CTL{len}{data} - req_data_len = len(request_data) - request_msg = "CTL" + str(req_data_len).zfill(3) + request_data - -@@ -94,8 +99,8 @@ def client_send_and_recv(request_data, data_str_len): - print("sentryctl: client communicate error") - return None - -+ # res: RES{len}{data} - res_magic = res_data[:3] -- - if res_magic != "RES": - print("res msg format error") - return None -@@ -128,6 +133,10 @@ if __name__ == '__main__': - parser_status.add_argument('task_name') - parser_get_result = subparsers.add_parser('get_result', help='get task result') - parser_get_result.add_argument('task_name') -+ parser_get_alarm = subparsers.add_parser('get_alarm', help='get task alarm') -+ parser_get_alarm.add_argument('task_name') -+ parser_get_alarm.add_argument('-s', '--time_range', type=str, default=DEFAULT_ALARM_TIME_RANGE, help='Specified time range') -+ parser_get_alarm.add_argument('-d', '--detailed', action='store_true', help='Print Detailed Information') - parser_list = subparsers.add_parser('list', help='show all loaded task mod') - - client_args = parser.parse_args() -@@ -142,6 +151,15 @@ if __name__ == '__main__': - req_msg_struct = {"type": "get_status", "data": client_args.task_name} - elif client_args.cmd_type == 'get_result': - req_msg_struct = {"type": "get_result", "data": client_args.task_name} -+ elif client_args.cmd_type == 'get_alarm': -+ req_msg_struct = { -+ "type": "get_alarm", -+ "data": { -+ 'task_name': client_args.task_name, -+ 'time_range': client_args.time_range, -+ 'detailed': client_args.detailed, -+ } -+ } - elif client_args.cmd_type == 'reload': - req_msg_struct = {"type": "reload", "data": client_args.task_name} - else: -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index 9ef0203..c2dee85 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -28,7 +28,7 @@ from .sentry_config import SentryConfig, get_log_level - from .task_map import TasksMap - from .global_values import SENTRY_RUN_DIR, CTL_SOCKET_PATH, SENTRY_RUN_DIR_PERM - from .cron_process import period_tasks_handle --from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result -+from .callbacks import mod_list_show, task_start, task_get_status, task_stop, task_get_result, task_get_alarm - from .mod_status import get_task_by_pid, set_runtime_status - from .load_mods import load_tasks, reload_single_mod - from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, -@@ -36,7 +36,11 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, - from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC - from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel - from .utils import get_current_time_string -+from .alarm import alarm_register - -+from xalarm.register_xalarm import xalarm_unregister -+ -+clientId = -1 - - CPU_EXIST = True - try: -@@ -62,6 +66,7 @@ type_func = { - 'stop': task_stop, - 'get_status': task_get_status, - 'get_result': task_get_result, -+ 'get_alarm': task_get_alarm, - 'reload': reload_single_mod - } - -@@ -107,11 +112,12 @@ def msg_data_process(msg_data): - return "Invaild cmd type" - - cmd_param = data_struct['data'] -- logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, cmd_param) -+ logging.debug("msg_data_process cmd_type:%s cmd_param:%s", cmd_type, str(cmd_param)) - if cmd_type in type_func: - ret, res_data = type_func[cmd_type](cmd_param) - else: - ret, res_data = type_func_void[cmd_type]() -+ logging.debug("msg_data_process res_data:%s",str(res_data)) - res_msg_struct = {"ret": ret, "data": res_data} - res_msg = json.dumps(res_msg_struct) - -@@ -584,10 +590,13 @@ def main(): - _ = SentryConfig.init_param() - TasksMap.init_task_map() - load_tasks() -+ clientId = alarm_register() - main_loop() - - except Exception: - logging.error('%s', traceback.format_exc()) - finally: -+ if clientId != -1: -+ xalarm_unregister(clientId) - release_pidfile() - -diff --git a/src/python/syssentry/task_map.py b/src/python/syssentry/task_map.py -index 70aa19d..27e97ff 100644 ---- a/src/python/syssentry/task_map.py -+++ b/src/python/syssentry/task_map.py -@@ -13,16 +13,16 @@ - tasks map class and initialize function. - """ - import logging -+from typing import Dict - - ONESHOT_TYPE = "ONESHOT" - PERIOD_TYPE = "PERIOD" - - TASKS_MAP = None - -- - class TasksMap: - """task map class""" -- tasks_dict = {} -+ tasks_dict: Dict[str, Dict] = {} - - @classmethod - def init_task_map(cls): -@@ -65,3 +65,4 @@ class TasksMap: - logging.debug("getting task by name: %s", res) - break - return res -+ --- -2.27.0 - diff --git a/add-xalarm-cleanup-invalid-server-socket-peroidly.patch b/add-xalarm-cleanup-invalid-server-socket-peroidly.patch deleted file mode 100644 index 5ee845adfcd5187ac7762d04f61d7e5919767c5d..0000000000000000000000000000000000000000 --- a/add-xalarm-cleanup-invalid-server-socket-peroidly.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 4fa9b250f56dc3f4f431fc091e25d8f2558a9bb2 Mon Sep 17 00:00:00 2001 -From: caixiaomeng -Date: Fri, 11 Oct 2024 18:12:21 +0800 -Subject: [PATCH] add xalarm cleanup invalid server socket peroidly - ---- - src/python/xalarm/xalarm_server.py | 20 +++++++++++++++----- - src/python/xalarm/xalarm_transfer.py | 8 ++++++++ - 2 files changed, 23 insertions(+), 5 deletions(-) - -diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py -index 2882609..f90a0e2 100644 ---- a/src/python/xalarm/xalarm_server.py -+++ b/src/python/xalarm/xalarm_server.py -@@ -22,7 +22,12 @@ import threading - from struct import error as StructParseError - - from .xalarm_api import alarm_bin2stu --from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection -+from .xalarm_transfer import ( -+ check_filter, -+ transmit_alarm, -+ wait_for_connection, -+ peroid_task_to_cleanup_connections -+) - - - ALARM_DIR = "/var/run/xalarm" -@@ -66,9 +71,13 @@ def server_loop(alarm_config): - fd_to_socket = {alarm_sock.fileno(): alarm_sock,} - thread_should_stop = False - -- thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop)) -- thread.daemon = True -- thread.start() -+ conn_thread = threading.Thread(target=wait_for_connection, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop)) -+ conn_thread.daemon = True -+ conn_thread.start() -+ -+ cleanup_thread = threading.Thread(target=peroid_task_to_cleanup_connections, args=(alarm_sock, epoll, fd_to_socket, thread_should_stop)) -+ cleanup_thread.daemon = True -+ cleanup_thread.start() - - while True: - try: -@@ -88,7 +97,8 @@ def server_loop(alarm_config): - logging.error(f"Error server:{e}") - - thread_should_stop = True -- thread.join() -+ conn_thread.join() -+ cleanup_thread.join() - - epoll.unregister(alarm_sock.fileno()) - epoll.close() -diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py -index 90dccbc..75807e0 100644 ---- a/src/python/xalarm/xalarm_transfer.py -+++ b/src/python/xalarm/xalarm_transfer.py -@@ -17,11 +17,13 @@ Create: 2023-11-02 - import socket - import logging - import select -+from time import sleep - - MIN_ID_NUMBER = 1001 - MAX_ID_NUMBER = 1128 - MAX_CONNECTION_NUM = 100 - TEST_CONNECT_BUFFER_SIZE = 32 -+PEROID_SCANN_TIME = 60 - - - def check_filter(alarm_info, alarm_filter): -@@ -66,6 +68,12 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket): - logging.info(f"cleaned up connection {fileno} for client lost connection.") - - -+def peroid_task_to_cleanup_connections(server_sock, epoll, fd_to_socket, thread_should_stop): -+ while not thread_should_stop: -+ sleep(PEROID_SCANN_TIME) -+ cleanup_closed_connections(server_sock, epoll, fd_to_socket) -+ -+ - def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop): - """ - thread function for catch and save client connection --- -2.27.0 - - diff --git a/ai_block_io-adapt-alarm-module.patch b/ai_block_io-adapt-alarm-module.patch deleted file mode 100644 index f24974b0997f616c98c5734b2cb17f118474379b..0000000000000000000000000000000000000000 --- a/ai_block_io-adapt-alarm-module.patch +++ /dev/null @@ -1,221 +0,0 @@ -From 367f8ab8a5ad26d80caf1bc4529c79d279ef0fb1 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Thu, 10 Oct 2024 17:21:48 +0800 -Subject: [PATCH] ai_block_io adapt alarm module - ---- - config/tasks/ai_block_io.mod | 4 +- - .../sentryPlugins/ai_block_io/ai_block_io.py | 28 +++++--- - .../sentryPlugins/ai_block_io/alarm_report.py | 65 ++++++++++++++----- - .../sentryPlugins/ai_block_io/data_access.py | 5 +- - .../sentryPlugins/ai_block_io/detector.py | 2 +- - 5 files changed, 73 insertions(+), 31 deletions(-) - -diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod -index 1971d7d..82f4f0b 100644 ---- a/config/tasks/ai_block_io.mod -+++ b/config/tasks/ai_block_io.mod -@@ -2,4 +2,6 @@ - enabled=yes - task_start=/usr/bin/python3 /usr/bin/ai_block_io - task_stop=pkill -f /usr/bin/ai_block_io --type=oneshot -\ No newline at end of file -+type=oneshot -+alarm_id=1002 -+alarm_clear_time=5 -\ No newline at end of file -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 3b00ef3..77104a9 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -20,14 +20,14 @@ from .utils import get_data_queue_size_and_update_size - from .config_parser import ConfigParser - from .data_access import get_io_data_from_collect_plug, check_collect_valid - from .io_data import MetricName --from .alarm_report import AlarmReport -+from .alarm_report import Xalarm, Report - - CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" - - - def sig_handler(signum, frame): - logging.info("receive signal: %d", signum) -- AlarmReport().report_fail(f"receive signal: {signum}") -+ Report.report_pass(f"receive signal: {signum}, exiting...") - exit(signum) - - -@@ -44,6 +44,10 @@ class SlowIODetection: - - def __init_detector_name_list(self): - self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) -+ if self._disk_list is None: -+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...") -+ exit(1) -+ - logging.info(f"ai_block_io plug has found disks: {self._disk_list}") - disks_to_detection: list = self._config_parser.get_disks_to_detection() - # 情况1:None,则启用所有磁盘检测 -@@ -101,7 +105,8 @@ class SlowIODetection: - ) - logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') - if io_data_dict_with_disk_name is None: -- continue -+ Report.report_pass("get io data error, please check if the collector plug is enable. exitting...") -+ exit(1) - - # Step2:慢IO检测 - logging.debug('step2. Start to detection slow io event.') -@@ -117,13 +122,16 @@ class SlowIODetection: - for slow_io_event in slow_io_event_list: - metric_name: MetricName = slow_io_event[0] - result = slow_io_event[1] -- alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. " -- f"stage is: {metric_name.get_stage_name()}, " -- f"io access type is: {metric_name.get_io_access_type_name()}, " -- f"metric is: {metric_name.get_metric_name()}, " -- f"current window is: {result[1]}, " -- f"threshold is: {result[2]}") -- AlarmReport.report_major_alm(alarm_content) -+ alarm_content = { -+ "driver_name": f"{metric_name.get_disk_name()}", -+ "reason": "disk_slow", -+ "block_stack": f"{metric_name.get_stage_name()}", -+ "io_type": f"{metric_name.get_io_access_type_name()}", -+ "alarm_source": "ai_block_io", -+ "alarm_type": "latency", -+ "details": f"current window is: {result[1]}, threshold is: {result[2]}.", -+ } -+ Xalarm.major(alarm_content) - logging.warning(alarm_content) - - # Step4:等待检测时间 -diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py -index 230c8cd..92bd6e3 100644 ---- a/src/python/sentryPlugins/ai_block_io/alarm_report.py -+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py -@@ -9,41 +9,72 @@ - # PURPOSE. - # See the Mulan PSL v2 for more details. - --from syssentry.result import ResultLevel, report_result - import logging - import json - -+from xalarm.sentry_notify import ( -+ xalarm_report, -+ MINOR_ALM, -+ MAJOR_ALM, -+ CRITICAL_ALM, -+ ALARM_TYPE_OCCUR, -+ ALARM_TYPE_RECOVER, -+) -+ -+from syssentry.result import ResultLevel, report_result -+ - --class AlarmReport: -+class Report: - TASK_NAME = "ai_block_io" - - @staticmethod - def report_pass(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} PASS: {info}') -+ report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) -+ logging.info(f'Report {Report.TASK_NAME} PASS: {info}') - - @staticmethod - def report_fail(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} FAIL: {info}') -+ report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) -+ logging.info(f'Report {Report.TASK_NAME} FAIL: {info}') - - @staticmethod - def report_skip(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} SKIP: {info}') -+ report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) -+ logging.info(f'Report {Report.TASK_NAME} SKIP: {info}') -+ -+ -+class Xalarm: -+ ALARM_ID = 1002 - - @staticmethod -- def report_minor_alm(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.MINOR_ALM, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} MINOR_ALM: {info}') -+ def minor(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") - - @staticmethod -- def report_major_alm(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.MAJOR_ALM, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} MAJOR_ALM: {info}') -+ def major(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") - - @staticmethod -- def report_critical_alm(info: str): -- report_result(AlarmReport.TASK_NAME, ResultLevel.CRITICAL_ALM, json.dumps({"msg": info})) -- logging.info(f'Report {AlarmReport.TASK_NAME} CRITICAL_ALM: {info}') -+ def critical(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") -+ -+ def minor_recover(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") -+ -+ def major_recover(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") - -+ def critical_recover(info: dict): -+ info_str = json.dumps(info) -+ xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str) -+ logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") -diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -index 01c5315..c7679cd 100644 ---- a/src/python/sentryPlugins/ai_block_io/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -42,10 +42,11 @@ def check_collect_valid(period): - data = json.loads(data_raw["message"]) - except Exception as e: - logging.warning(f"get io data failed, {e}") -- return [] -+ return None - return [k for k in data.keys()] - else: -- return [] -+ logging.warning(f"get io data failed, return {data_raw}") -+ return None - - - def _get_raw_data(period, disk_list): -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index a48144f..0ed282b 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -35,7 +35,7 @@ class Detector: - self._count += 1 - if self._count % 15 == 0: - self._count = 0 -- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") -+ logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") - logging.debug(f'enter Detector: {self}') - metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) - if metric_value is None: --- -2.23.0 - diff --git a/ai_block_io-fix-some-bugs.patch b/ai_block_io-fix-some-bugs.patch deleted file mode 100644 index b82b44d304a61be9fed7ddb313adccf5fec424e6..0000000000000000000000000000000000000000 --- a/ai_block_io-fix-some-bugs.patch +++ /dev/null @@ -1,235 +0,0 @@ -From 1e13bc31ae3aa94f36aa124eefdfc8773221eacd Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Mon, 14 Oct 2024 23:16:46 +0800 -Subject: [PATCH] ai_block_io fix some bugs - ---- - .../sentryPlugins/ai_block_io/ai_block_io.py | 1 + - .../ai_block_io/config_parser.py | 20 ++++++++++--------- - .../sentryPlugins/ai_block_io/detector.py | 18 ++++++++++++----- - .../sentryPlugins/ai_block_io/io_data.py | 2 +- - .../sentryPlugins/ai_block_io/threshold.py | 17 +++++++++------- - 5 files changed, 36 insertions(+), 22 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index dd661a1..4eecd43 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -55,6 +55,7 @@ class SlowIODetection: - Report.report_pass( - "get available disk error, please check if the collector plug is enable. exiting..." - ) -+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...") - exit(1) - - logging.info(f"ai_block_io plug has found disks: {self._disk_list}") -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 3388cd4..7b0cd29 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -190,7 +190,7 @@ class ConfigParser: - self._conf["common"]["disk"] = disk_list - - def _read_train_data_duration(self, items_algorithm: dict): -- self._conf["common"]["train_data_duration"] = self._get_config_value( -+ self._conf["algorithm"]["train_data_duration"] = self._get_config_value( - items_algorithm, - "train_data_duration", - float, -@@ -203,17 +203,17 @@ class ConfigParser: - default_train_update_duration = self.DEFAULT_CONF["algorithm"][ - "train_update_duration" - ] -- if default_train_update_duration > self._conf["common"]["train_data_duration"]: -+ if default_train_update_duration > self._conf["algorithm"]["train_data_duration"]: - default_train_update_duration = ( -- self._conf["common"]["train_data_duration"] / 2 -+ self._conf["algorithm"]["train_data_duration"] / 2 - ) -- self._conf["common"]["train_update_duration"] = self._get_config_value( -+ self._conf["algorithm"]["train_update_duration"] = self._get_config_value( - items_algorithm, - "train_update_duration", - float, - default_train_update_duration, - gt=0, -- le=self._conf["common"]["train_data_duration"], -+ le=self._conf["algorithm"]["train_data_duration"], - ) - - def _read_algorithm_type_and_parameter(self, items_algorithm: dict): -@@ -401,6 +401,8 @@ class ConfigParser: - self._read_stage(items_common) - self._read_iotype(items_common) - else: -+ self._conf["common"]["stage"] = ALL_STAGE_LIST -+ self._conf["common"]["iotype"] = ALL_IOTPYE_LIST - logging.warning( - "common section parameter not found, it will be set to default value." - ) -@@ -511,8 +513,8 @@ class ConfigParser: - - def get_train_data_duration_and_train_update_duration(self): - return ( -- self._conf["common"]["train_data_duration"], -- self._conf["common"]["train_update_duration"], -+ self._conf["algorithm"]["train_data_duration"], -+ self._conf["algorithm"]["train_update_duration"], - ) - - def get_window_size_and_window_minimum_threshold(self): -@@ -535,11 +537,11 @@ class ConfigParser: - - @property - def train_data_duration(self): -- return self._conf["common"]["train_data_duration"] -+ return self._conf["algorithm"]["train_data_duration"] - - @property - def train_update_duration(self): -- return self._conf["common"]["train_update_duration"] -+ return self._conf["algorithm"]["train_update_duration"] - - @property - def window_size(self): -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index 87bd1dd..5b21714 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -9,6 +9,7 @@ - # PURPOSE. - # See the Mulan PSL v2 for more details. - import logging -+from datetime import datetime - - from .io_data import MetricName - from .threshold import Threshold -@@ -21,18 +22,25 @@ class Detector: - def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow): - self._metric_name = metric_name - self._threshold = threshold -+ # for when threshold update, it can print latest threshold with metric name -+ self._threshold.set_metric_name(self._metric_name) - self._slidingWindow = sliding_window - self._threshold.attach_observer(self._slidingWindow) -- self._count = 0 -+ self._count = None - - def get_metric_name(self): - return self._metric_name - - def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -- self._count += 1 -- if self._count % 15 == 0: -- self._count = 0 -- logging.debug(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") -+ if self._count is None: -+ self._count = datetime.now() -+ else: -+ now_time = datetime.now() -+ time_diff = (now_time - self._count).total_seconds() -+ if time_diff >= 60: -+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") -+ self._count = None -+ - logging.debug(f'enter Detector: {self}') - metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) - if metric_value is None: -diff --git a/src/python/sentryPlugins/ai_block_io/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py -index d341b55..6042911 100644 ---- a/src/python/sentryPlugins/ai_block_io/io_data.py -+++ b/src/python/sentryPlugins/ai_block_io/io_data.py -@@ -48,7 +48,7 @@ class IOData: - @dataclass(frozen=True) - class MetricName: - disk_name: str -- disk_type: str -+ disk_type: int - stage_name: str - io_access_type_name: str - metric_name: str -diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py -index 3b7a5a8..600d041 100644 ---- a/src/python/sentryPlugins/ai_block_io/threshold.py -+++ b/src/python/sentryPlugins/ai_block_io/threshold.py -@@ -23,11 +23,6 @@ class ThresholdState(Enum): - - - class Threshold: -- threshold = None -- data_queue: queue.Queue = None -- data_queue_update_size: int = None -- new_data_size: int = None -- threshold_state: ThresholdState = None - - def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): - self._observer = None -@@ -36,12 +31,16 @@ class Threshold: - self.new_data_size = 0 - self.threshold_state = ThresholdState.INIT - self.threshold = math.inf -+ self.metric_name = None - - def set_threshold(self, threshold): - self.threshold = threshold - self.threshold_state = ThresholdState.START - self.notify_observer() - -+ def set_metric_name(self, metric_name): -+ self.metric_name = metric_name -+ - def get_threshold(self): - if self.threshold_state == ThresholdState.INIT: - return None -@@ -84,6 +83,7 @@ class BoxplotThreshold(Threshold): - self.parameter = boxplot_parameter - - def _update_threshold(self): -+ old_threshold = self.threshold - data = list(self.data_queue.queue) - q1 = np.percentile(data, 25) - q3 = np.percentile(data, 75) -@@ -91,6 +91,7 @@ class BoxplotThreshold(Threshold): - self.threshold = q3 + self.parameter * iqr - if self.threshold_state == ThresholdState.INIT: - self.threshold_state = ThresholdState.START -+ logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}") - self.notify_observer() - - def push_latest_data_to_queue(self, data): -@@ -109,7 +110,7 @@ class BoxplotThreshold(Threshold): - self.new_data_size = 0 - - def __repr__(self): -- return f"[BoxplotThreshold, param is: {self.parameter}]" -+ return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" - - - class NSigmaThreshold(Threshold): -@@ -118,12 +119,14 @@ class NSigmaThreshold(Threshold): - self.parameter = n_sigma_parameter - - def _update_threshold(self): -+ old_threshold = self.threshold - data = list(self.data_queue.queue) - mean = np.mean(data) - std = np.std(data) - self.threshold = mean + self.parameter * std - if self.threshold_state == ThresholdState.INIT: - self.threshold_state = ThresholdState.START -+ logging.info(f"MetricName: [{self.metric_name}]'s threshold update, old is: {old_threshold} -> new is: {self.threshold}") - self.notify_observer() - - def push_latest_data_to_queue(self, data): -@@ -142,7 +145,7 @@ class NSigmaThreshold(Threshold): - self.new_data_size = 0 - - def __repr__(self): -- return f"[NSigmaThreshold, param is: {self.parameter}]" -+ return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" - - - class ThresholdType(Enum): --- -2.23.0 - diff --git a/ai_block_io-fix-some-config-parameters-parse-bug.patch b/ai_block_io-fix-some-config-parameters-parse-bug.patch deleted file mode 100644 index bb84cad228d5c2c2f9d5466edca1f676e8dbbec0..0000000000000000000000000000000000000000 --- a/ai_block_io-fix-some-config-parameters-parse-bug.patch +++ /dev/null @@ -1,626 +0,0 @@ -From f3a0738061e852c8125513f6222b4a5d6ea73270 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Fri, 25 Oct 2024 15:34:25 +0800 -Subject: [PATCH] ai_block_io fix some config parameters parse bug - ---- - .../sentryPlugins/ai_block_io/ai_block_io.py | 70 +++++---- - .../ai_block_io/config_parser.py | 135 ++++++++++++++---- - .../sentryPlugins/ai_block_io/data_access.py | 14 ++ - .../sentryPlugins/ai_block_io/detector.py | 16 ++- - .../ai_block_io/sliding_window.py | 2 +- - .../sentryPlugins/ai_block_io/threshold.py | 14 +- - src/python/sentryPlugins/ai_block_io/utils.py | 2 - - 7 files changed, 180 insertions(+), 73 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 74f246a..14f740d 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -23,6 +23,7 @@ from .data_access import ( - get_io_data_from_collect_plug, - check_collect_valid, - get_disk_type, -+ check_disk_is_available - ) - from .io_data import MetricName - from .alarm_report import Xalarm, Report -@@ -31,14 +32,14 @@ CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" - - - def sig_handler(signum, frame): -- logging.info("receive signal: %d", signum) - Report.report_pass(f"receive signal: {signum}, exiting...") -+ logging.info("Finished ai_block_io plugin running.") - exit(signum) - - - class SlowIODetection: - _config_parser = None -- _disk_list = None -+ _disk_list = [] - _detector_name_list = defaultdict(list) - _disk_detectors = {} - -@@ -48,32 +49,30 @@ class SlowIODetection: - self.__init_detector() - - def __init_detector_name_list(self): -- self._disk_list = check_collect_valid( -- self._config_parser.period_time -- ) -- if self._disk_list is None: -- Report.report_pass( -- "get available disk error, please check if the collector plug is enable. exiting..." -- ) -- logging.critical("get available disk error, please check if the collector plug is enable. exiting...") -- exit(1) -- -- logging.info(f"ai_block_io plug has found disks: {self._disk_list}") - disks: list = self._config_parser.disks_to_detection - stages: list = self._config_parser.stage - iotypes: list = self._config_parser.iotype -- # 情况1:None,则启用所有磁盘检测 -- # 情况2:is not None and len = 0,则不启动任何磁盘检测 -- # 情况3:len != 0,则取交集 -+ - if disks is None: -- logging.warning( -- "you not specify any disk or use default, so ai_block_io will enable all available disk." -- ) -- for disk in self._disk_list: -- if disks is not None: -- if disk not in disks: -- continue -- disks.remove(disk) -+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") -+ all_available_disk_list = check_collect_valid(self._config_parser.period_time) -+ if all_available_disk_list is None: -+ Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...") -+ logging.critical("get available disk error, please check if the collector plug is enable. exiting...") -+ exit(1) -+ if len(all_available_disk_list) == 0: -+ Report.report_pass("not found available disk. exiting...") -+ logging.critical("not found available disk. exiting...") -+ exit(1) -+ disks = all_available_disk_list -+ logging.info(f"available disk list is follow: {disks}.") -+ -+ for disk in disks: -+ tmp_disk = [disk] -+ ret = check_disk_is_available(self._config_parser.period_time, tmp_disk) -+ if not ret: -+ logging.warning(f"disk: {disk} is not available, it will be ignored.") -+ continue - - disk_type_result = get_disk_type(disk) - if disk_type_result["ret"] == 0 and disk_type_result["message"] in ( -@@ -89,20 +88,15 @@ class SlowIODetection: - disk_type_result, - ) - continue -+ self._disk_list.append(disk) - for stage in stages: - for iotype in iotypes: - self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) - self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) -- if disks: -- logging.warning( -- "disks: %s not in available disk list, so they will be ignored.", -- disks, -- ) -+ - if not self._detector_name_list: -+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.") - logging.critical("the disks to detection is empty, ai_block_io will exit.") -- Report.report_pass( -- "the disks to detection is empty, ai_block_io will exit." -- ) - exit(1) - - def __init_detector(self): -@@ -202,16 +196,20 @@ class SlowIODetection: - logging.debug("step3. Report slow io event to sysSentry.") - for slow_io_event in slow_io_event_list: - alarm_content = { -+ "alarm_source": "ai_block_io", - "driver_name": slow_io_event[1], -+ "io_type": slow_io_event[4], - "reason": slow_io_event[2], - "block_stack": slow_io_event[3], -- "io_type": slow_io_event[4], -- "alarm_source": "ai_block_io", - "alarm_type": slow_io_event[5], -- "details": slow_io_event[6], -+ "details": slow_io_event[6] - } - Xalarm.major(alarm_content) -- logging.warning("[SLOW IO] " + str(alarm_content)) -+ tmp_alarm_content = alarm_content.copy() -+ del tmp_alarm_content["details"] -+ logging.warning("[SLOW IO] " + str(tmp_alarm_content)) -+ logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) -+ logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) - - # Step4:等待检测时间 - logging.debug("step4. Wait to start next slow io event detection loop.") -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 91ec5c6..3049db2 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -105,21 +105,26 @@ class ConfigParser: - ge=None, - lt=None, - le=None, -+ section=None - ): -+ if section is not None: -+ print_key = section + "." + key -+ else: -+ print_key = key - value = config_items.get(key) - if value is None: - logging.warning( - "config of %s not found, the default value %s will be used.", -- key, -+ print_key, - default_value, - ) - value = default_value - if not value: - logging.critical( -- "the value of %s is empty, ai_block_io plug will exit.", key -+ "the value of %s is empty, ai_block_io plug will exit.", print_key - ) - Report.report_pass( -- f"the value of {key} is empty, ai_block_io plug will exit." -+ f"the value of {print_key} is empty, ai_block_io plug will exit." - ) - exit(1) - try: -@@ -127,51 +132,51 @@ class ConfigParser: - except ValueError: - logging.critical( - "the value of %s is not a valid %s, ai_block_io plug will exit.", -- key, -+ print_key, - value_type, - ) - Report.report_pass( -- f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit." -+ f"the value of {print_key} is not a valid {value_type}, ai_block_io plug will exit." - ) - exit(1) - if gt is not None and value <= gt: - logging.critical( - "the value of %s is not greater than %s, ai_block_io plug will exit.", -- key, -+ print_key, - gt, - ) - Report.report_pass( -- f"the value of {key} is not greater than {gt}, ai_block_io plug will exit." -+ f"the value of {print_key} is not greater than {gt}, ai_block_io plug will exit." - ) - exit(1) - if ge is not None and value < ge: - logging.critical( - "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.", -- key, -+ print_key, - ge, - ) - Report.report_pass( -- f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit." -+ f"the value of {print_key} is not greater than or equal to {ge}, ai_block_io plug will exit." - ) - exit(1) - if lt is not None and value >= lt: - logging.critical( - "the value of %s is not less than %s, ai_block_io plug will exit.", -- key, -+ print_key, - lt, - ) - Report.report_pass( -- f"the value of {key} is not less than {lt}, ai_block_io plug will exit." -+ f"the value of {print_key} is not less than {lt}, ai_block_io plug will exit." - ) - exit(1) - if le is not None and value > le: - logging.critical( - "the value of %s is not less than or equal to %s, ai_block_io plug will exit.", -- key, -+ print_key, - le, - ) - Report.report_pass( -- f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit." -+ f"the value of {print_key} is not less than or equal to {le}, ai_block_io plug will exit." - ) - exit(1) - -@@ -188,7 +193,7 @@ class ConfigParser: - frequency = self._conf["common"]["period_time"] - ret = check_detect_frequency_is_valid(frequency) - if ret is None: -- log = f"period_time: {frequency} is valid, "\ -+ log = f"period_time: {frequency} is invalid, "\ - f"Check whether the value range is too large or is not an "\ - f"integer multiple of period_time.. exiting..." - Report.report_pass(log) -@@ -202,6 +207,7 @@ class ConfigParser: - self._conf["common"]["disk"] = None - return - disks_to_detection = disks_to_detection.strip() -+ disks_to_detection = disks_to_detection.lower() - if not disks_to_detection: - logging.critical("the value of disk is empty, ai_block_io plug will exit.") - Report.report_pass( -@@ -213,7 +219,18 @@ class ConfigParser: - if len(disk_list) == 1 and disk_list[0] == "default": - self._conf["common"]["disk"] = None - return -- self._conf["common"]["disk"] = disk_list -+ if len(disk_list) > 10: -+ ten_disk_list = disk_list[0:10] -+ other_disk_list = disk_list[10:] -+ logging.warning(f"disk only support maximum is 10, disks: {ten_disk_list} will be retained, other: {other_disk_list} will be ignored.") -+ else: -+ ten_disk_list = disk_list -+ set_ten_disk_list = set(ten_disk_list) -+ if len(ten_disk_list) > len(set_ten_disk_list): -+ tmp = ten_disk_list -+ ten_disk_list = list(set_ten_disk_list) -+ logging.warning(f"disk exist duplicate, it will be deduplicate, before: {tmp}, after: {ten_disk_list}") -+ self._conf["common"]["disk"] = ten_disk_list - - def _read_train_data_duration(self, items_algorithm: dict): - self._conf["algorithm"]["train_data_duration"] = self._get_config_value( -@@ -244,10 +261,12 @@ class ConfigParser: - - def _read_algorithm_type_and_parameter(self, items_algorithm: dict): - algorithm_type = items_algorithm.get("algorithm_type") -- if algorithm_type is not None: -- self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum( -- algorithm_type -- ) -+ if algorithm_type is None: -+ default_algorithm_type = self._conf["algorithm"]["algorithm_type"] -+ logging.warning(f"algorithm_type not found, it will be set default: {default_algorithm_type}") -+ else: -+ self._conf["algorithm"]["algorithm_type"] = get_threshold_type_enum(algorithm_type) -+ - if self._conf["algorithm"]["algorithm_type"] is None: - logging.critical( - "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", -@@ -257,6 +276,7 @@ class ConfigParser: - f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." - ) - exit(1) -+ - elif self._conf["algorithm"]["algorithm_type"] == ThresholdType.NSigmaThreshold: - self._conf["algorithm"]["n_sigma_parameter"] = self._get_config_value( - items_algorithm, -@@ -279,9 +299,14 @@ class ConfigParser: - ) - - def _read_stage(self, items_algorithm: dict): -- stage_str = items_algorithm.get( -- "stage", self.DEFAULT_CONF["common"]["stage"] -- ).strip() -+ stage_str = items_algorithm.get("stage") -+ if stage_str is None: -+ stage_str = self.DEFAULT_CONF["common"]["stage"] -+ logging.warning(f"stage not found, it will be set default: {stage_str}") -+ else: -+ stage_str = stage_str.strip() -+ -+ stage_str = stage_str.lower() - stage_list = stage_str.split(",") - stage_list = [stage.strip() for stage in stage_list] - if len(stage_list) == 1 and stage_list[0] == "": -@@ -307,9 +332,14 @@ class ConfigParser: - self._conf["common"]["stage"] = dup_stage_list - - def _read_iotype(self, items_algorithm: dict): -- iotype_str = items_algorithm.get( -- "iotype", self.DEFAULT_CONF["common"]["iotype"] -- ).strip() -+ iotype_str = items_algorithm.get("iotype") -+ if iotype_str is None: -+ iotype_str = self.DEFAULT_CONF["common"]["iotype"] -+ logging.warning(f"iotype not found, it will be set default: {iotype_str}") -+ else: -+ iotype_str = iotype_str.strip() -+ -+ iotype_str = iotype_str.lower() - iotype_list = iotype_str.split(",") - iotype_list = [iotype.strip() for iotype in iotype_list] - if len(iotype_list) == 1 and iotype_list[0] == "": -@@ -333,6 +363,13 @@ class ConfigParser: - - def _read_sliding_window_type(self, items_sliding_window: dict): - sliding_window_type = items_sliding_window.get("win_type") -+ -+ if sliding_window_type is None: -+ default_sliding_window_type = self._conf["algorithm"]["win_type"] -+ logging.warning(f"win_type not found, it will be set default: {default_sliding_window_type}") -+ return -+ -+ sliding_window_type = sliding_window_type.strip() - if sliding_window_type is not None: - self._conf["algorithm"]["win_type"] = ( - get_sliding_window_type_enum(sliding_window_type) -@@ -439,6 +476,7 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_sata_ssd"]["read_tot_lim"], - gt=0, -+ section="latency_sata_ssd" - ) - self._conf["latency_sata_ssd"]["write_tot_lim"] = self._get_config_value( - items_latency_sata_ssd, -@@ -446,21 +484,32 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"], - gt=0, -+ section="latency_sata_ssd" - ) - self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value( - items_latency_sata_ssd, - "read_avg_lim", - int, - self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_sata_ssd" - ) - self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value( - items_latency_sata_ssd, - "write_avg_lim", - int, - self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_sata_ssd" - ) -+ if self._conf["latency_sata_ssd"]["read_avg_lim"] >= self._conf["latency_sata_ssd"]["read_tot_lim"]: -+ Report.report_pass("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...") -+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...") -+ exit(1) -+ if self._conf["latency_sata_ssd"]["write_avg_lim"] >= self._conf["latency_sata_ssd"]["write_tot_lim"]: -+ Report.report_pass("latency_sata_ssd.write_avg_lim must < latency_sata_ssd.write_tot_lim . exiting...") -+ logging.critical("latency_sata_ssd.read_avg_lim must < latency_sata_ssd.read_tot_lim . exiting...") -+ exit(1) - else: - Report.report_pass("not found latency_sata_ssd section. exiting...") - logging.critical("not found latency_sata_ssd section. exiting...") -@@ -474,6 +523,7 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_nvme_ssd"]["read_tot_lim"], - gt=0, -+ section="latency_nvme_ssd" - ) - self._conf["latency_nvme_ssd"]["write_tot_lim"] = self._get_config_value( - items_latency_nvme_ssd, -@@ -481,21 +531,32 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], - gt=0, -+ section="latency_nvme_ssd" - ) - self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value( - items_latency_nvme_ssd, - "read_avg_lim", - int, - self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_nvme_ssd" - ) - self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value( - items_latency_nvme_ssd, - "write_avg_lim", - int, - self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_nvme_ssd" - ) -+ if self._conf["latency_nvme_ssd"]["read_avg_lim"] >= self._conf["latency_nvme_ssd"]["read_tot_lim"]: -+ Report.report_pass("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...") -+ logging.critical("latency_nvme_ssd.read_avg_lim must < latency_nvme_ssd.read_tot_lim . exiting...") -+ exit(1) -+ if self._conf["latency_nvme_ssd"]["write_avg_lim"] >= self._conf["latency_nvme_ssd"]["write_tot_lim"]: -+ Report.report_pass("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...") -+ logging.critical("latency_nvme_ssd.write_avg_lim must < latency_nvme_ssd.write_tot_lim . exiting...") -+ exit(1) - else: - Report.report_pass("not found latency_nvme_ssd section. exiting...") - logging.critical("not found latency_nvme_ssd section. exiting...") -@@ -509,6 +570,7 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_sata_hdd"]["read_tot_lim"], - gt=0, -+ section="latency_sata_hdd" - ) - self._conf["latency_sata_hdd"]["write_tot_lim"] = self._get_config_value( - items_latency_sata_hdd, -@@ -516,21 +578,32 @@ class ConfigParser: - int, - self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], - gt=0, -+ section="latency_sata_hdd" - ) - self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value( - items_latency_sata_hdd, - "read_avg_lim", - int, - self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_sata_hdd" - ) - self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value( - items_latency_sata_hdd, - "write_avg_lim", - int, - self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"], -- gt=0 -+ gt=0, -+ section="latency_sata_hdd" - ) -+ if self._conf["latency_sata_hdd"]["read_avg_lim"] >= self._conf["latency_sata_hdd"]["read_tot_lim"]: -+ Report.report_pass("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...") -+ logging.critical("latency_sata_hdd.read_avg_lim must < latency_sata_hdd.read_tot_lim . exiting...") -+ exit(1) -+ if self._conf["latency_sata_hdd"]["write_avg_lim"] >= self._conf["latency_sata_hdd"]["write_tot_lim"]: -+ Report.report_pass("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...") -+ logging.critical("latency_sata_hdd.write_avg_lim must < latency_sata_hdd.write_tot_lim . exiting...") -+ exit(1) - else: - Report.report_pass("not found latency_sata_hdd section. exiting...") - logging.critical("not found latency_sata_hdd section. exiting...") -diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -index e4869d5..2f2d607 100644 ---- a/src/python/sentryPlugins/ai_block_io/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -67,6 +67,20 @@ def check_detect_frequency_is_valid(period): - return None - - -+def check_disk_is_available(period_time, disk): -+ data_raw = is_iocollect_valid(period_time, disk) -+ if data_raw["ret"] == 0: -+ try: -+ data = json.loads(data_raw["message"]) -+ except Exception as e: -+ return False -+ if not data: -+ return False -+ return True -+ else: -+ return False -+ -+ - def _get_raw_data(period, disk_list): - return get_io_data( - period, -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index e3a0952..496e032 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -75,6 +75,18 @@ class Detector: - f' sliding_window_type: {self._slidingWindow}') - - -+def set_to_str(parameter: set): -+ ret = "" -+ parameter = list(parameter) -+ length = len(parameter) -+ for i in range(length): -+ if i == 0: -+ ret += parameter[i] -+ else: -+ ret += "," + parameter[i] -+ return ret -+ -+ - class DiskDetector: - - def __init__(self, disk_name: str): -@@ -124,7 +136,7 @@ class DiskDetector: - alarm_type.add(metric_name.metric_name) - - latency_wins, iodump_wins = self.get_detector_list_window() -- details = f"latency: {latency_wins}, iodump: {iodump_wins}" -+ details = {"latency": latency_wins, "iodump": iodump_wins} - - io_press = {"throtl", "wbt", "iocost", "bfq"} - driver_slow = {"rq_driver"} -@@ -137,7 +149,7 @@ class DiskDetector: - elif not kernel_slow.isdisjoint(block_stack): - reason = "kernel_slow" - -- return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details -+ return True, driver_name, reason, set_to_str(block_stack), set_to_str(io_type), set_to_str(alarm_type), details - - def __repr__(self): - msg = f'disk: {self._disk_name}, ' -diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -index 4083c43..ff3fa3b 100644 ---- a/src/python/sentryPlugins/ai_block_io/sliding_window.py -+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py -@@ -107,7 +107,7 @@ class MedianSlidingWindow(SlidingWindow): - if len(self._io_data_queue) < self._queue_length or (self._ai_threshold is None and self._abs_threshold is None): - is_slow_io_event = False - median = np.median(self._io_data_queue) -- if median >= self._ai_threshold: -+ if (self._ai_threshold is not None and median > self._ai_threshold) or (self._abs_threshold is not None and median > self._abs_threshold): - is_slow_io_event = True - return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim - -diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py -index 600d041..e202bb8 100644 ---- a/src/python/sentryPlugins/ai_block_io/threshold.py -+++ b/src/python/sentryPlugins/ai_block_io/threshold.py -@@ -65,9 +65,12 @@ class Threshold: - def __repr__(self): - return "Threshold" - -+ def __str__(self): -+ return "Threshold" -+ - - class AbsoluteThreshold(Threshold): -- def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): - super().__init__(data_queue_size, data_queue_update_size) - - def push_latest_data_to_queue(self, data): -@@ -76,6 +79,9 @@ class AbsoluteThreshold(Threshold): - def __repr__(self): - return "[AbsoluteThreshold]" - -+ def __str__(self): -+ return "absolute" -+ - - class BoxplotThreshold(Threshold): - def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): -@@ -112,6 +118,9 @@ class BoxplotThreshold(Threshold): - def __repr__(self): - return f"[BoxplotThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" - -+ def __str__(self): -+ return "boxplot" -+ - - class NSigmaThreshold(Threshold): - def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): -@@ -147,6 +156,9 @@ class NSigmaThreshold(Threshold): - def __repr__(self): - return f"[NSigmaThreshold, param is: {self.parameter}, train_size: {self.data_queue.maxsize}, update_size: {self.data_queue_update_size}]" - -+ def __str__(self): -+ return "n_sigma" -+ - - class ThresholdType(Enum): - AbsoluteThreshold = 0 -diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py -index d6f4067..7d2390b 100644 ---- a/src/python/sentryPlugins/ai_block_io/utils.py -+++ b/src/python/sentryPlugins/ai_block_io/utils.py -@@ -19,8 +19,6 @@ from .io_data import MetricName, IOData - - - def get_threshold_type_enum(algorithm_type: str): -- if algorithm_type.lower() == "absolute": -- return ThresholdType.AbsoluteThreshold - if algorithm_type.lower() == "boxplot": - return ThresholdType.BoxplotThreshold - if algorithm_type.lower() == "n_sigma": --- -2.23.0 - diff --git a/ai_block_io-lack-section-exit.patch b/ai_block_io-lack-section-exit.patch deleted file mode 100644 index c226ee1ec46f758116e46213d632790e315f7c1b..0000000000000000000000000000000000000000 --- a/ai_block_io-lack-section-exit.patch +++ /dev/null @@ -1,98 +0,0 @@ -From 8e4f39897dc8dc51cfa0bbf24667be1688876c15 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Mon, 21 Oct 2024 14:18:20 +0800 -Subject: [PATCH] ai_block_io lack section exit - ---- - .../ai_block_io/config_parser.py | 40 +++++++++---------- - 1 file changed, 20 insertions(+), 20 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 7b0cd29..447eccd 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -401,11 +401,9 @@ class ConfigParser: - self._read_stage(items_common) - self._read_iotype(items_common) - else: -- self._conf["common"]["stage"] = ALL_STAGE_LIST -- self._conf["common"]["iotype"] = ALL_IOTPYE_LIST -- logging.warning( -- "common section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found common section. exiting...") -+ logging.critical("not found common section. exiting...") -+ exit(1) - - if con.has_section("algorithm"): - items_algorithm = dict(con.items("algorithm")) -@@ -413,9 +411,9 @@ class ConfigParser: - self._read_train_update_duration(items_algorithm) - self._read_algorithm_type_and_parameter(items_algorithm) - else: -- logging.warning( -- "algorithm section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found algorithm section. exiting...") -+ logging.critical("not found algorithm section. exiting...") -+ exit(1) - - if con.has_section("sliding_window"): - items_sliding_window = dict(con.items("sliding_window")) -@@ -423,9 +421,9 @@ class ConfigParser: - self._read_window_size(items_sliding_window) - self._read_window_minimum_threshold(items_sliding_window) - else: -- logging.warning( -- "sliding_window section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found sliding_window section. exiting...") -+ logging.critical("not found sliding_window section. exiting...") -+ exit(1) - - if con.has_section("latency_sata_ssd"): - items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) -@@ -444,9 +442,10 @@ class ConfigParser: - gt=0, - ) - else: -- logging.warning( -- "latency_sata_ssd section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found latency_sata_ssd section. exiting...") -+ logging.critical("not found latency_sata_ssd section. exiting...") -+ exit(1) -+ - if con.has_section("latency_nvme_ssd"): - items_latency_nvme_ssd = dict(con.items("latency_nvme_ssd")) - self._conf["latency_nvme_ssd"]["read_tot_lim"] = self._get_config_value( -@@ -464,9 +463,10 @@ class ConfigParser: - gt=0, - ) - else: -- logging.warning( -- "latency_nvme_ssd section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found latency_nvme_ssd section. exiting...") -+ logging.critical("not found latency_nvme_ssd section. exiting...") -+ exit(1) -+ - if con.has_section("latency_sata_hdd"): - items_latency_sata_hdd = dict(con.items("latency_sata_hdd")) - self._conf["latency_sata_hdd"]["read_tot_lim"] = self._get_config_value( -@@ -484,9 +484,9 @@ class ConfigParser: - gt=0, - ) - else: -- logging.warning( -- "latency_sata_hdd section parameter not found, it will be set to default value." -- ) -+ Report.report_pass("not found latency_sata_hdd section. exiting...") -+ logging.critical("not found latency_sata_hdd section. exiting...") -+ exit(1) - - self.__print_all_config_value() - --- -2.23.0 - diff --git a/ai_block_io-support-absolute-threshold-lower-limit.patch b/ai_block_io-support-absolute-threshold-lower-limit.patch deleted file mode 100644 index ccd8f17c2e72fb910cf976f7ccc98de7be2665bd..0000000000000000000000000000000000000000 --- a/ai_block_io-support-absolute-threshold-lower-limit.patch +++ /dev/null @@ -1,728 +0,0 @@ -From cedd862d4e4a97a6c4fa13cbff2af452910ea5b4 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Thu, 24 Oct 2024 09:39:16 +0800 -Subject: [PATCH] ai_block_io support absolute threshold lower limit - ---- - config/plugins/ai_block_io.ini | 19 +- - .../sentryPlugins/ai_block_io/ai_block_io.py | 36 ++-- - .../sentryPlugins/ai_block_io/alarm_report.py | 18 +- - .../ai_block_io/config_parser.py | 168 ++++++++++++------ - .../sentryPlugins/ai_block_io/detector.py | 92 ++++++---- - .../ai_block_io/sliding_window.py | 21 ++- - 6 files changed, 222 insertions(+), 132 deletions(-) - -diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini -index 040237d..d0b1e74 100644 ---- a/config/plugins/ai_block_io.ini -+++ b/config/plugins/ai_block_io.ini -@@ -2,9 +2,9 @@ - level=info - - [common] --slow_io_detect_frequency=1 -+period_time=1 - disk=default --stage=bio -+stage=default - iotype=read,write - - [algorithm] -@@ -12,22 +12,25 @@ train_data_duration=24 - train_update_duration=2 - algorithm_type=boxplot - boxplot_parameter=1.5 --n_sigma_parameter=3 -- --[sliding_window] --sliding_window_type=not_continuous --window_size=30 --window_minimum_threshold=6 -+win_type=not_continuous -+win_size=30 -+win_threshold=6 - - [latency_sata_ssd] -+read_avg_lim=10000 -+write_avg_lim=10000 - read_tot_lim=50000 - write_tot_lim=50000 - - [latency_nvme_ssd] -+read_avg_lim=300 -+write_avg_lim=300 - read_tot_lim=500 - write_tot_lim=500 - - [latency_sata_hdd] -+read_avg_lim=15000 -+write_avg_lim=15000 - read_tot_lim=50000 - write_tot_lim=50000 - -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index f25e6d5..74f246a 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -49,7 +49,7 @@ class SlowIODetection: - - def __init_detector_name_list(self): - self._disk_list = check_collect_valid( -- self._config_parser.slow_io_detect_frequency -+ self._config_parser.period_time - ) - if self._disk_list is None: - Report.report_pass( -@@ -109,7 +109,7 @@ class SlowIODetection: - train_data_duration, train_update_duration = ( - self._config_parser.get_train_data_duration_and_train_update_duration() - ) -- slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency -+ slow_io_detection_frequency = self._config_parser.period_time - threshold_type = self._config_parser.algorithm_type - data_queue_size, update_size = get_data_queue_size_and_update_size( - train_data_duration, train_update_duration, slow_io_detection_frequency -@@ -131,10 +131,13 @@ class SlowIODetection: - data_queue_size=data_queue_size, - data_queue_update_size=update_size, - ) -- abs_threshold = self._config_parser.get_tot_lim( -+ tot_lim = self._config_parser.get_tot_lim( - metric_name.disk_type, metric_name.io_access_type_name - ) -- if abs_threshold is None: -+ avg_lim = self._config_parser.get_avg_lim( -+ metric_name.disk_type, metric_name.io_access_type_name -+ ) -+ if tot_lim is None: - logging.warning( - "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", - disk, -@@ -145,7 +148,8 @@ class SlowIODetection: - sliding_window_type, - queue_length=window_size, - threshold=window_threshold, -- abs_threshold=abs_threshold, -+ abs_threshold=tot_lim, -+ avg_lim=avg_lim - ) - detector = Detector(metric_name, threshold, sliding_window) - disk_detector.add_detector(detector) -@@ -176,7 +180,7 @@ class SlowIODetection: - - # Step1:获取IO数据 - io_data_dict_with_disk_name = get_io_data_from_collect_plug( -- self._config_parser.slow_io_detect_frequency, self._disk_list -+ self._config_parser.period_time, self._disk_list - ) - logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}") - if io_data_dict_with_disk_name is None: -@@ -197,25 +201,21 @@ class SlowIODetection: - # Step3:慢IO事件上报 - logging.debug("step3. Report slow io event to sysSentry.") - for slow_io_event in slow_io_event_list: -- metric_name: MetricName = slow_io_event[1] -- window_info = slow_io_event[2] -- root_cause = slow_io_event[3] - alarm_content = { -- "driver_name": f"{metric_name.disk_name}", -- "reason": root_cause, -- "block_stack": f"{metric_name.stage_name}", -- "io_type": f"{metric_name.io_access_type_name}", -+ "driver_name": slow_io_event[1], -+ "reason": slow_io_event[2], -+ "block_stack": slow_io_event[3], -+ "io_type": slow_io_event[4], - "alarm_source": "ai_block_io", -- "alarm_type": "latency", -- "details": f"disk type: {metric_name.disk_type}, current window: {window_info[1]}, " -- f"ai threshold: {window_info[2]}, abs threshold: {window_info[3]}.", -+ "alarm_type": slow_io_event[5], -+ "details": slow_io_event[6], - } - Xalarm.major(alarm_content) -- logging.warning(alarm_content) -+ logging.warning("[SLOW IO] " + str(alarm_content)) - - # Step4:等待检测时间 - logging.debug("step4. Wait to start next slow io event detection loop.") -- time.sleep(self._config_parser.slow_io_detect_frequency) -+ time.sleep(self._config_parser.period_time) - - - def main(): -diff --git a/src/python/sentryPlugins/ai_block_io/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py -index 92bd6e3..61bb145 100644 ---- a/src/python/sentryPlugins/ai_block_io/alarm_report.py -+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py -@@ -30,17 +30,17 @@ class Report: - @staticmethod - def report_pass(info: str): - report_result(Report.TASK_NAME, ResultLevel.PASS, json.dumps({"msg": info})) -- logging.info(f'Report {Report.TASK_NAME} PASS: {info}') -+ logging.debug(f'Report {Report.TASK_NAME} PASS: {info}') - - @staticmethod - def report_fail(info: str): - report_result(Report.TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": info})) -- logging.info(f'Report {Report.TASK_NAME} FAIL: {info}') -+ logging.debug(f'Report {Report.TASK_NAME} FAIL: {info}') - - @staticmethod - def report_skip(info: str): - report_result(Report.TASK_NAME, ResultLevel.SKIP, json.dumps({"msg": info})) -- logging.info(f'Report {Report.TASK_NAME} SKIP: {info}') -+ logging.debug(f'Report {Report.TASK_NAME} SKIP: {info}') - - - class Xalarm: -@@ -50,31 +50,31 @@ class Xalarm: - def minor(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_OCCUR, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM: {info_str}") - - @staticmethod - def major(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_OCCUR, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM: {info_str}") - - @staticmethod - def critical(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_OCCUR, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM: {info_str}") - - def minor_recover(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, MINOR_ALM, ALARM_TYPE_RECOVER, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} MINOR_ALM Recover: {info_str}") - - def major_recover(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, MAJOR_ALM, ALARM_TYPE_RECOVER, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} MAJOR_ALM Recover: {info_str}") - - def critical_recover(info: dict): - info_str = json.dumps(info) - xalarm_report(Xalarm.ALARM_ID, CRITICAL_ALM, ALARM_TYPE_RECOVER, info_str) -- logging.info(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") -+ logging.debug(f"Report {Xalarm.ALARM_ID} CRITICAL_ALM Recover: {info_str}") -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 1117939..91ec5c6 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -52,7 +52,7 @@ class ConfigParser: - DEFAULT_CONF = { - "log": {"level": "info"}, - "common": { -- "slow_io_detect_frequency": 1, -+ "period_time": 1, - "disk": None, - "stage": "throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio", - "iotype": "read,write", -@@ -63,16 +63,32 @@ class ConfigParser: - "algorithm_type": get_threshold_type_enum("boxplot"), - "boxplot_parameter": 1.5, - "n_sigma_parameter": 3.0, -+ "win_type": get_sliding_window_type_enum("not_continuous"), -+ "win_size": 30, -+ "win_threshold": 6, - }, -- "sliding_window": { -- "sliding_window_type": get_sliding_window_type_enum("not_continuous"), -- "window_size": 30, -- "window_minimum_threshold": 6, -+ "latency_sata_ssd": { -+ "read_avg_lim": 10000, -+ "write_avg_lim": 10000, -+ "read_tot_lim": 50000, -+ "write_tot_lim": 50000 - }, -- "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, -- "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, -- "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, -- "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0} -+ "latency_nvme_ssd": { -+ "read_avg_lim": 300, -+ "write_avg_lim": 300, -+ "read_tot_lim": 500, -+ "write_tot_lim": 500 -+ }, -+ "latency_sata_hdd": { -+ "read_avg_lim": 15000, -+ "write_avg_lim": 15000, -+ "read_tot_lim": 50000, -+ "write_tot_lim": 50000 -+ }, -+ "iodump": { -+ "read_iodump_lim": 0, -+ "write_iodump_lim": 0 -+ } - } - - def __init__(self, config_file_name): -@@ -161,18 +177,18 @@ class ConfigParser: - - return value - -- def _read_slow_io_detect_frequency(self, items_common: dict): -- self._conf["common"]["slow_io_detect_frequency"] = self._get_config_value( -+ def _read_period_time(self, items_common: dict): -+ self._conf["common"]["period_time"] = self._get_config_value( - items_common, -- "slow_io_detect_frequency", -+ "period_time", - int, -- self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], -+ self.DEFAULT_CONF["common"]["period_time"], - gt=0 - ) -- frequency = self._conf["common"]["slow_io_detect_frequency"] -+ frequency = self._conf["common"]["period_time"] - ret = check_detect_frequency_is_valid(frequency) - if ret is None: -- log = f"slow io detect frequency: {frequency} is valid, "\ -+ log = f"period_time: {frequency} is valid, "\ - f"Check whether the value range is too large or is not an "\ - f"integer multiple of period_time.. exiting..." - Report.report_pass(log) -@@ -316,50 +332,41 @@ class ConfigParser: - self._conf["common"]["iotype"] = dup_iotype_list - - def _read_sliding_window_type(self, items_sliding_window: dict): -- sliding_window_type = items_sliding_window.get("sliding_window_type") -+ sliding_window_type = items_sliding_window.get("win_type") - if sliding_window_type is not None: -- self._conf["sliding_window"]["sliding_window_type"] = ( -+ self._conf["algorithm"]["win_type"] = ( - get_sliding_window_type_enum(sliding_window_type) - ) -- if self._conf["sliding_window"]["sliding_window_type"] is None: -+ if self._conf["algorithm"]["win_type"] is None: - logging.critical( -- "the sliding_window_type: %s you set is invalid. ai_block_io plug will exit.", -+ "the win_type: %s you set is invalid. ai_block_io plug will exit.", - sliding_window_type, - ) - Report.report_pass( -- f"the sliding_window_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." -+ f"the win_type: {sliding_window_type} you set is invalid. ai_block_io plug will exit." - ) - exit(1) - - def _read_window_size(self, items_sliding_window: dict): -- self._conf["sliding_window"]["window_size"] = self._get_config_value( -+ self._conf["algorithm"]["win_size"] = self._get_config_value( - items_sliding_window, -- "window_size", -+ "win_size", - int, -- self.DEFAULT_CONF["sliding_window"]["window_size"], -+ self.DEFAULT_CONF["algorithm"]["win_size"], - gt=0, -- le=3600, -+ le=300, - ) - - def _read_window_minimum_threshold(self, items_sliding_window: dict): -- default_window_minimum_threshold = self.DEFAULT_CONF["sliding_window"][ -- "window_minimum_threshold" -- ] -- if ( -- default_window_minimum_threshold -- > self._conf["sliding_window"]["window_size"] -- ): -- default_window_minimum_threshold = ( -- self._conf["sliding_window"]["window_size"] / 2 -- ) -- self._conf["sliding_window"]["window_minimum_threshold"] = ( -+ default_window_minimum_threshold = self.DEFAULT_CONF["algorithm"]["win_threshold"] -+ self._conf["algorithm"]["win_threshold"] = ( - self._get_config_value( - items_sliding_window, -- "window_minimum_threshold", -+ "win_threshold", - int, - default_window_minimum_threshold, - gt=0, -- le=self._conf["sliding_window"]["window_size"], -+ le=self._conf["algorithm"]["win_size"], - ) - ) - -@@ -406,7 +413,7 @@ class ConfigParser: - if con.has_section("common"): - items_common = dict(con.items("common")) - -- self._read_slow_io_detect_frequency(items_common) -+ self._read_period_time(items_common) - self._read_disks_to_detect(items_common) - self._read_stage(items_common) - self._read_iotype(items_common) -@@ -420,20 +427,9 @@ class ConfigParser: - self._read_train_data_duration(items_algorithm) - self._read_train_update_duration(items_algorithm) - self._read_algorithm_type_and_parameter(items_algorithm) -- else: -- Report.report_pass("not found algorithm section. exiting...") -- logging.critical("not found algorithm section. exiting...") -- exit(1) -- -- if con.has_section("sliding_window"): -- items_sliding_window = dict(con.items("sliding_window")) -- -- self._read_window_size(items_sliding_window) -- self._read_window_minimum_threshold(items_sliding_window) -- else: -- Report.report_pass("not found sliding_window section. exiting...") -- logging.critical("not found sliding_window section. exiting...") -- exit(1) -+ self._read_sliding_window_type(items_algorithm) -+ self._read_window_size(items_algorithm) -+ self._read_window_minimum_threshold(items_algorithm) - - if con.has_section("latency_sata_ssd"): - items_latency_sata_ssd = dict(con.items("latency_sata_ssd")) -@@ -451,6 +447,20 @@ class ConfigParser: - self.DEFAULT_CONF["latency_sata_ssd"]["write_tot_lim"], - gt=0, - ) -+ self._conf["latency_sata_ssd"]["read_avg_lim"] = self._get_config_value( -+ items_latency_sata_ssd, -+ "read_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_ssd"]["read_avg_lim"], -+ gt=0 -+ ) -+ self._conf["latency_sata_ssd"]["write_avg_lim"] = self._get_config_value( -+ items_latency_sata_ssd, -+ "write_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_ssd"]["write_avg_lim"], -+ gt=0 -+ ) - else: - Report.report_pass("not found latency_sata_ssd section. exiting...") - logging.critical("not found latency_sata_ssd section. exiting...") -@@ -472,6 +482,20 @@ class ConfigParser: - self.DEFAULT_CONF["latency_nvme_ssd"]["write_tot_lim"], - gt=0, - ) -+ self._conf["latency_nvme_ssd"]["read_avg_lim"] = self._get_config_value( -+ items_latency_nvme_ssd, -+ "read_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_nvme_ssd"]["read_avg_lim"], -+ gt=0 -+ ) -+ self._conf["latency_nvme_ssd"]["write_avg_lim"] = self._get_config_value( -+ items_latency_nvme_ssd, -+ "write_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_nvme_ssd"]["write_avg_lim"], -+ gt=0 -+ ) - else: - Report.report_pass("not found latency_nvme_ssd section. exiting...") - logging.critical("not found latency_nvme_ssd section. exiting...") -@@ -493,6 +517,20 @@ class ConfigParser: - self.DEFAULT_CONF["latency_sata_hdd"]["write_tot_lim"], - gt=0, - ) -+ self._conf["latency_sata_hdd"]["read_avg_lim"] = self._get_config_value( -+ items_latency_sata_hdd, -+ "read_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_hdd"]["read_avg_lim"], -+ gt=0 -+ ) -+ self._conf["latency_sata_hdd"]["write_avg_lim"] = self._get_config_value( -+ items_latency_sata_hdd, -+ "write_avg_lim", -+ int, -+ self.DEFAULT_CONF["latency_sata_hdd"]["write_avg_lim"], -+ gt=0 -+ ) - else: - Report.report_pass("not found latency_sata_hdd section. exiting...") - logging.critical("not found latency_sata_hdd section. exiting...") -@@ -542,6 +580,18 @@ class ConfigParser: - else: - return None - -+ def get_avg_lim(self, disk_type, io_type): -+ if io_type == "read": -+ return self._conf.get( -+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} -+ ).get("read_avg_lim", None) -+ elif io_type == "write": -+ return self._conf.get( -+ f"latency_{DISK_TYPE_MAP.get(disk_type, '')}", {} -+ ).get("write_avg_lim", None) -+ else: -+ return None -+ - def get_train_data_duration_and_train_update_duration(self): - return ( - self._conf["algorithm"]["train_data_duration"], -@@ -550,13 +600,13 @@ class ConfigParser: - - def get_window_size_and_window_minimum_threshold(self): - return ( -- self._conf["sliding_window"]["window_size"], -- self._conf["sliding_window"]["window_minimum_threshold"], -+ self._conf["algorithm"]["win_size"], -+ self._conf["algorithm"]["win_threshold"], - ) - - @property -- def slow_io_detect_frequency(self): -- return self._conf["common"]["slow_io_detect_frequency"] -+ def period_time(self): -+ return self._conf["common"]["period_time"] - - @property - def algorithm_type(self): -@@ -564,7 +614,7 @@ class ConfigParser: - - @property - def sliding_window_type(self): -- return self._conf["sliding_window"]["sliding_window_type"] -+ return self._conf["algorithm"]["win_type"] - - @property - def train_data_duration(self): -@@ -576,11 +626,11 @@ class ConfigParser: - - @property - def window_size(self): -- return self._conf["sliding_window"]["window_size"] -+ return self._conf["algorithm"]["win_size"] - - @property - def window_minimum_threshold(self): -- return self._conf["sliding_window"]["window_minimum_threshold"] -+ return self._conf["algorithm"]["win_threshold"] - - @property - def absolute_threshold(self): -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index 8536f7a..e3a0952 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -28,9 +28,13 @@ class Detector: - self._threshold.attach_observer(self._slidingWindow) - self._count = None - -- def get_metric_name(self): -+ @property -+ def metric_name(self): - return self._metric_name - -+ def get_sliding_window_data(self): -+ return self._slidingWindow.get_data() -+ - def is_slow_io_event(self, io_data_dict_with_disk_name: dict): - if self._count is None: - self._count = datetime.now() -@@ -38,22 +42,27 @@ class Detector: - now_time = datetime.now() - time_diff = (now_time - self._count).total_seconds() - if time_diff >= 60: -- logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") -+ logging.info(f"({self._metric_name}) 's latest ai threshold is: {self._threshold.get_threshold()}.") - self._count = None - - logging.debug(f'enter Detector: {self}') - metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) - if metric_value is None: - logging.debug('not found metric value, so return None.') -- return (False, False), None, None, None -+ return (False, False), None, None, None, None - logging.debug(f'input metric value: {str(metric_value)}') - self._threshold.push_latest_data_to_queue(metric_value) - detection_result = self._slidingWindow.is_slow_io_event(metric_value) - # 检测到慢周期,由Detector负责打印info级别日志 - if detection_result[0][1]: -- logging.info(f'[abnormal period happen]: disk info: {self._metric_name}, window: {detection_result[1]}, ' -- f'current value: {metric_value}, ai threshold: {detection_result[2]}, ' -- f'absolute threshold: {detection_result[3]}') -+ logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, ' -+ f'stage: {self._metric_name.stage_name}, ' -+ f'iotype: {self._metric_name.io_access_type_name}, ' -+ f'metric: {self._metric_name.metric_name}, ' -+ f'current value: {metric_value}, ' -+ f'ai threshold: {detection_result[2]}, ' -+ f'absolute threshold upper limit: {detection_result[3]}, ' -+ f'lower limit: {detection_result[4]}') - else: - logging.debug(f'Detection result: {str(detection_result)}') - logging.debug(f'exit Detector: {self}') -@@ -75,41 +84,60 @@ class DiskDetector: - def add_detector(self, detector: Detector): - self._detector_list.append(detector) - -+ def get_detector_list_window(self): -+ latency_wins = {"read": {}, "write": {}} -+ iodump_wins = {"read": {}, "write": {}} -+ for detector in self._detector_list: -+ if detector.metric_name.metric_name == 'latency': -+ latency_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() -+ elif detector.metric_name.metric_name == 'io_dump': -+ iodump_wins[detector.metric_name.io_access_type_name][detector.metric_name.stage_name] = detector.get_sliding_window_data() -+ return latency_wins, iodump_wins -+ - def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -- """ -- 根因诊断逻辑:只有bio阶段发生异常,才认为发生了慢IO事件,即bio阶段异常是慢IO事件的必要条件 -- 情况一:bio异常,rq_driver也异常,则慢盘 -- 情况二:bio异常,rq_driver无异常,且有内核IO栈任意阶段异常,则IO栈异常 -- 情况三:bio异常,rq_driver无异常,且无内核IO栈任意阶段异常,则IO压力大 -- 情况四:bio异常,则UNKNOWN -- """ -- diagnosis_info = {"bio": [], "rq_driver": [], "io_stage": []} -+ diagnosis_info = {"bio": [], "rq_driver": [], "kernel_stack": []} - for detector in self._detector_list: - # result返回内容:(是否检测到慢IO,是否检测到慢周期)、窗口、ai阈值、绝对阈值 - # 示例: (False, False), self._io_data_queue, self._ai_threshold, self._abs_threshold - result = detector.is_slow_io_event(io_data_dict_with_disk_name) - if result[0][0]: -- if detector.get_metric_name().stage_name == "bio": -- diagnosis_info["bio"].append((detector.get_metric_name(), result)) -- elif detector.get_metric_name().stage_name == "rq_driver": -- diagnosis_info["rq_driver"].append((detector.get_metric_name(), result)) -+ if detector.metric_name.stage_name == "bio": -+ diagnosis_info["bio"].append(detector.metric_name) -+ elif detector.metric_name.stage_name == "rq_driver": -+ diagnosis_info["rq_driver"].append(detector.metric_name) - else: -- diagnosis_info["io_stage"].append((detector.get_metric_name(), result)) -+ diagnosis_info["kernel_stack"].append(detector.metric_name) - -- # 返回内容:(1)是否检测到慢IO事件、(2)MetricName、(3)滑动窗口及阈值、(4)慢IO事件根因 -- root_cause = None - if len(diagnosis_info["bio"]) == 0: -- return False, None, None, None -- elif len(diagnosis_info["rq_driver"]) != 0: -- root_cause = "[Root Cause: disk slow]" -- elif len(diagnosis_info["io_stage"]) != 0: -- stage_list = [] -- for io_stage in diagnosis_info["io_stage"]: -- stage_list.append(io_stage[0].stage_name) -- root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]" -- if root_cause is None: -- root_cause = "[Root Cause: high io pressure]" -- return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause -+ return False, None, None, None, None, None, None -+ -+ driver_name = self._disk_name -+ reason = "unknown" -+ block_stack = set() -+ io_type = set() -+ alarm_type = set() -+ -+ for key, value in diagnosis_info.items(): -+ for metric_name in value: -+ block_stack.add(metric_name.stage_name) -+ io_type.add(metric_name.io_access_type_name) -+ alarm_type.add(metric_name.metric_name) -+ -+ latency_wins, iodump_wins = self.get_detector_list_window() -+ details = f"latency: {latency_wins}, iodump: {iodump_wins}" -+ -+ io_press = {"throtl", "wbt", "iocost", "bfq"} -+ driver_slow = {"rq_driver"} -+ kernel_slow = {"gettag", "plug", "deadline", "hctx", "requeue"} -+ -+ if not io_press.isdisjoint(block_stack): -+ reason = "io_press" -+ elif not driver_slow.isdisjoint(block_stack): -+ reason = "driver_slow" -+ elif not kernel_slow.isdisjoint(block_stack): -+ reason = "kernel_slow" -+ -+ return True, driver_name, reason, str(block_stack), str(io_type), str(alarm_type), details - - def __repr__(self): - msg = f'disk: {self._disk_name}, ' -diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -index cebe41f..4083c43 100644 ---- a/src/python/sentryPlugins/ai_block_io/sliding_window.py -+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py -@@ -21,11 +21,12 @@ class SlidingWindowType(Enum): - - - class SlidingWindow: -- def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None): -+ def __init__(self, queue_length: int, threshold: int, abs_threshold: int = None, avg_lim: int = None): - self._queue_length = queue_length - self._queue_threshold = threshold - self._ai_threshold = None - self._abs_threshold = abs_threshold -+ self._avg_lim = avg_lim - self._io_data_queue = [] - self._io_data_queue_abnormal_tag = [] - -@@ -35,8 +36,13 @@ class SlidingWindow: - self._io_data_queue_abnormal_tag.pop(0) - self._io_data_queue.append(data) - tag = False -- if ((self._ai_threshold is not None and data > self._ai_threshold) or -- (self._abs_threshold is not None and data > self._abs_threshold)): -+ if self._avg_lim is not None and data < self._avg_lim: -+ tag = False -+ self._io_data_queue_abnormal_tag.append(tag) -+ return tag -+ if self._ai_threshold is not None and data > self._ai_threshold: -+ tag = True -+ if self._abs_threshold is not None and data > self._abs_threshold: - tag = True - self._io_data_queue_abnormal_tag.append(tag) - return tag -@@ -52,6 +58,9 @@ class SlidingWindow: - def is_slow_io_event(self, data): - return False, None, None, None - -+ def get_data(self): -+ return self._io_data_queue -+ - def __repr__(self): - return "[SlidingWindow]" - -@@ -64,7 +73,7 @@ class NotContinuousSlidingWindow(SlidingWindow): - is_slow_io_event = False - if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold: - is_slow_io_event = True -- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim - - def __repr__(self): - return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" -@@ -85,7 +94,7 @@ class ContinuousSlidingWindow(SlidingWindow): - break - else: - consecutive_count = 0 -- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim - - def __repr__(self): - return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" -@@ -100,7 +109,7 @@ class MedianSlidingWindow(SlidingWindow): - median = np.median(self._io_data_queue) - if median >= self._ai_threshold: - is_slow_io_event = True -- return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold -+ return (is_slow_io_event, is_abnormal_period), self._io_data_queue, self._ai_threshold, self._abs_threshold, self._avg_lim - - def __repr__(self): - return f"[MedianSlidingWindow, window size: {self._queue_length}]" --- -2.23.0 - diff --git a/ai_block_io-support-iodump.patch b/ai_block_io-support-iodump.patch deleted file mode 100644 index 990995e56734e2f3e780a41aef13a5f8a14a0844..0000000000000000000000000000000000000000 --- a/ai_block_io-support-iodump.patch +++ /dev/null @@ -1,200 +0,0 @@ -From db97139c411e86d6dc07fe0e91ae38c1bef17a8d Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Tue, 22 Oct 2024 16:37:52 +0800 -Subject: [PATCH] ai_block_io support iodump - ---- - config/plugins/ai_block_io.ini | 6 +- - .../sentryPlugins/ai_block_io/ai_block_io.py | 75 ++++++++++++------- - .../ai_block_io/config_parser.py | 30 ++++++++ - .../ai_block_io/sliding_window.py | 4 +- - 4 files changed, 84 insertions(+), 31 deletions(-) - -diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini -index 422cfa3..040237d 100644 ---- a/config/plugins/ai_block_io.ini -+++ b/config/plugins/ai_block_io.ini -@@ -29,4 +29,8 @@ write_tot_lim=500 - - [latency_sata_hdd] - read_tot_lim=50000 --write_tot_lim=50000 -\ No newline at end of file -+write_tot_lim=50000 -+ -+[iodump] -+read_iodump_lim=0 -+write_iodump_lim=0 -\ No newline at end of file -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 4eecd43..f25e6d5 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -15,7 +15,7 @@ import logging - from collections import defaultdict - - from .detector import Detector, DiskDetector --from .threshold import ThresholdFactory -+from .threshold import ThresholdFactory, ThresholdType - from .sliding_window import SlidingWindowFactory - from .utils import get_data_queue_size_and_update_size - from .config_parser import ConfigParser -@@ -91,9 +91,8 @@ class SlowIODetection: - continue - for stage in stages: - for iotype in iotypes: -- self._detector_name_list[disk].append( -- MetricName(disk, disk_type, stage, iotype, "latency") -- ) -+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "latency")) -+ self._detector_name_list[disk].append(MetricName(disk, disk_type, stage, iotype, "io_dump")) - if disks: - logging.warning( - "disks: %s not in available disk list, so they will be ignored.", -@@ -123,31 +122,51 @@ class SlowIODetection: - for disk, metric_name_list in self._detector_name_list.items(): - disk_detector = DiskDetector(disk) - for metric_name in metric_name_list: -- threshold = ThresholdFactory().get_threshold( -- threshold_type, -- boxplot_parameter=self._config_parser.boxplot_parameter, -- n_sigma_paramter=self._config_parser.n_sigma_parameter, -- data_queue_size=data_queue_size, -- data_queue_update_size=update_size, -- ) -- abs_threshold = self._config_parser.get_tot_lim( -- metric_name.disk_type, metric_name.io_access_type_name -- ) -- if abs_threshold is None: -- logging.warning( -- "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", -- disk, -- metric_name.disk_type, -- metric_name.io_access_type_name, -+ -+ if metric_name.metric_name == 'latency': -+ threshold = ThresholdFactory().get_threshold( -+ threshold_type, -+ boxplot_parameter=self._config_parser.boxplot_parameter, -+ n_sigma_paramter=self._config_parser.n_sigma_parameter, -+ data_queue_size=data_queue_size, -+ data_queue_update_size=update_size, - ) -- sliding_window = SlidingWindowFactory().get_sliding_window( -- sliding_window_type, -- queue_length=window_size, -- threshold=window_threshold, -- abs_threshold=abs_threshold, -- ) -- detector = Detector(metric_name, threshold, sliding_window) -- disk_detector.add_detector(detector) -+ abs_threshold = self._config_parser.get_tot_lim( -+ metric_name.disk_type, metric_name.io_access_type_name -+ ) -+ if abs_threshold is None: -+ logging.warning( -+ "disk %s, disk type %s, io type %s, get tot lim error, so it will be ignored.", -+ disk, -+ metric_name.disk_type, -+ metric_name.io_access_type_name, -+ ) -+ sliding_window = SlidingWindowFactory().get_sliding_window( -+ sliding_window_type, -+ queue_length=window_size, -+ threshold=window_threshold, -+ abs_threshold=abs_threshold, -+ ) -+ detector = Detector(metric_name, threshold, sliding_window) -+ disk_detector.add_detector(detector) -+ continue -+ -+ elif metric_name.metric_name == 'io_dump': -+ threshold = ThresholdFactory().get_threshold(ThresholdType.AbsoluteThreshold) -+ abs_threshold = None -+ if metric_name.io_access_type_name == 'read': -+ abs_threshold = self._config_parser.read_iodump_lim -+ elif metric_name.io_access_type_name == 'write': -+ abs_threshold = self._config_parser.write_iodump_lim -+ sliding_window = SlidingWindowFactory().get_sliding_window( -+ sliding_window_type, -+ queue_length=window_size, -+ threshold=window_threshold -+ ) -+ detector = Detector(metric_name, threshold, sliding_window) -+ threshold.set_threshold(abs_threshold) -+ disk_detector.add_detector(detector) -+ - logging.info(f"disk: [{disk}] add detector:\n [{disk_detector}]") - self._disk_detectors[disk] = disk_detector - -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 274a31e..1117939 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -72,6 +72,7 @@ class ConfigParser: - "latency_sata_ssd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, - "latency_nvme_ssd": {"read_tot_lim": 500, "write_tot_lim": 500}, - "latency_sata_hdd": {"read_tot_lim": 50000, "write_tot_lim": 50000}, -+ "iodump": {"read_iodump_lim": 0, "write_iodump_lim": 0} - } - - def __init__(self, config_file_name): -@@ -497,6 +498,27 @@ class ConfigParser: - logging.critical("not found latency_sata_hdd section. exiting...") - exit(1) - -+ if con.has_section("iodump"): -+ items_iodump = dict(con.items("iodump")) -+ self._conf["iodump"]["read_iodump_lim"] = self._get_config_value( -+ items_iodump, -+ "read_iodump_lim", -+ int, -+ self.DEFAULT_CONF["iodump"]["read_iodump_lim"], -+ ge=0 -+ ) -+ self._conf["iodump"]["write_iodump_lim"] = self._get_config_value( -+ items_iodump, -+ "write_iodump_lim", -+ int, -+ self.DEFAULT_CONF["iodump"]["write_iodump_lim"], -+ ge=0 -+ ) -+ else: -+ Report.report_pass("not found iodump section. exiting...") -+ logging.critical("not found iodump section. exiting...") -+ exit(1) -+ - self.__print_all_config_value() - - def __repr__(self) -> str: -@@ -587,3 +609,11 @@ class ConfigParser: - @property - def n_sigma_parameter(self): - return self._conf["algorithm"]["n_sigma_parameter"] -+ -+ @property -+ def read_iodump_lim(self): -+ return self._conf["iodump"]["read_iodump_lim"] -+ -+ @property -+ def write_iodump_lim(self): -+ return self._conf["iodump"]["write_iodump_lim"] -\ No newline at end of file -diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -index d7c402a..cebe41f 100644 ---- a/src/python/sentryPlugins/ai_block_io/sliding_window.py -+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py -@@ -35,8 +35,8 @@ class SlidingWindow: - self._io_data_queue_abnormal_tag.pop(0) - self._io_data_queue.append(data) - tag = False -- if ((self._ai_threshold is not None and data >= self._ai_threshold) or -- (self._abs_threshold is not None and data >= self._abs_threshold)): -+ if ((self._ai_threshold is not None and data > self._ai_threshold) or -+ (self._abs_threshold is not None and data > self._abs_threshold)): - tag = True - self._io_data_queue_abnormal_tag.append(tag) - return tag --- -2.23.0 - diff --git a/ai_block_io-support-stage-and-iotype.patch b/ai_block_io-support-stage-and-iotype.patch deleted file mode 100644 index 1fd7505a56c02b87a3b27a1775d64d9430324653..0000000000000000000000000000000000000000 --- a/ai_block_io-support-stage-and-iotype.patch +++ /dev/null @@ -1,906 +0,0 @@ -From 13dc3712b4530a312aa43610f7696a4a62f30e96 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Fri, 11 Oct 2024 21:50:32 +0800 -Subject: [PATCH] ai_block_io support stage and iotype - ---- - config/plugins/ai_block_io.ini | 7 +- - .../sentryPlugins/ai_block_io/ai_block_io.py | 126 +++-- - .../ai_block_io/config_parser.py | 471 +++++++++++++----- - .../sentryPlugins/ai_block_io/data_access.py | 11 +- - .../sentryPlugins/ai_block_io/detector.py | 25 + - src/python/sentryPlugins/ai_block_io/utils.py | 3 +- - 6 files changed, 453 insertions(+), 190 deletions(-) - -diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini -index 01ce266..a814d52 100644 ---- a/config/plugins/ai_block_io.ini -+++ b/config/plugins/ai_block_io.ini -@@ -1,7 +1,12 @@ -+[log] -+level=info -+ - [common] - absolute_threshold=40 - slow_io_detect_frequency=1 --log_level=info -+disk=default -+stage=bio -+iotype=read,write - - [algorithm] - train_data_duration=24 -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 77104a9..e1052ec 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -13,7 +13,7 @@ import time - import signal - import logging - --from .detector import Detector -+from .detector import Detector, DiskDetector - from .threshold import ThresholdFactory, AbsoluteThreshold - from .sliding_window import SlidingWindowFactory - from .utils import get_data_queue_size_and_update_size -@@ -34,8 +34,8 @@ def sig_handler(signum, frame): - class SlowIODetection: - _config_parser = None - _disk_list = None -- _detector_name_list = [] -- _detectors = {} -+ _detector_name_list = {} -+ _disk_detectors = {} - - def __init__(self, config_parser: ConfigParser): - self._config_parser = config_parser -@@ -43,85 +43,101 @@ class SlowIODetection: - self.__init_detector() - - def __init_detector_name_list(self): -- self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) -+ self._disk_list = check_collect_valid(self._config_parser.slow_io_detect_frequency) - if self._disk_list is None: - Report.report_pass("get available disk error, please check if the collector plug is enable. exiting...") - exit(1) - - logging.info(f"ai_block_io plug has found disks: {self._disk_list}") -- disks_to_detection: list = self._config_parser.get_disks_to_detection() -+ disks: list = self._config_parser.disks_to_detection -+ stages: list = self._config_parser.stage -+ iotypes: list = self._config_parser.iotype - # 情况1:None,则启用所有磁盘检测 - # 情况2:is not None and len = 0,则不启动任何磁盘检测 - # 情况3:len != 0,则取交集 -- if disks_to_detection is None: -+ if disks is None: - logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") - for disk in self._disk_list: -- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -- elif len(disks_to_detection) == 0: -- logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.') -+ for stage in stages: -+ for iotype in iotypes: -+ if disk not in self._detector_name_list: -+ self._detector_name_list[disk] = [] -+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) - else: -- for disk_to_detection in disks_to_detection: -- if disk_to_detection in self._disk_list: -- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency")) -- self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency")) -+ for disk in disks: -+ if disk in self._disk_list: -+ for stage in stages: -+ for iotype in iotypes: -+ if disk not in self._detector_name_list: -+ self._detector_name_list[disk] = [] -+ self._detector_name_list[disk].append(MetricName(disk, stage, iotype, "latency")) - else: -- logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.") -- logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') -+ logging.warning("disk: [%s] not in available disk list, so it will be ignored.", disk) -+ if len(self._detector_name_list) == 0: -+ logging.critical("the disks to detection is empty, ai_block_io will exit.") -+ Report.report_pass("the disks to detection is empty, ai_block_io will exit.") -+ exit(1) - - def __init_detector(self): -- train_data_duration, train_update_duration = (self._config_parser. -- get_train_data_duration_and_train_update_duration()) -- slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() -- threshold_type = self._config_parser.get_algorithm_type() -- data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, -- train_update_duration, -- slow_io_detection_frequency) -- sliding_window_type = self._config_parser.get_sliding_window_type() -- window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() -- -- for detector_name in self._detector_name_list: -- threshold = ThresholdFactory().get_threshold(threshold_type, -- boxplot_parameter=self._config_parser.get_boxplot_parameter(), -- n_sigma_paramter=self._config_parser.get_n_sigma_parameter(), -- data_queue_size=data_queue_size, -- data_queue_update_size=update_size) -- sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, -- threshold=window_threshold) -- detector = Detector(detector_name, threshold, sliding_window) -- # 绝对阈值的阈值初始化 -- if isinstance(threshold, AbsoluteThreshold): -- threshold.set_threshold(self._config_parser.get_absolute_threshold()) -- self._detectors[detector_name] = detector -- logging.info(f"add detector: {detector}") -+ train_data_duration, train_update_duration = ( -+ self._config_parser.get_train_data_duration_and_train_update_duration() -+ ) -+ slow_io_detection_frequency = self._config_parser.slow_io_detect_frequency -+ threshold_type = self._config_parser.algorithm_type -+ data_queue_size, update_size = get_data_queue_size_and_update_size( -+ train_data_duration, train_update_duration, slow_io_detection_frequency -+ ) -+ sliding_window_type = self._config_parser.sliding_window_type -+ window_size, window_threshold = (self._config_parser.get_window_size_and_window_minimum_threshold()) -+ -+ for disk, metric_name_list in self._detector_name_list.items(): -+ threshold = ThresholdFactory().get_threshold( -+ threshold_type, -+ boxplot_parameter=self._config_parser.boxplot_parameter, -+ n_sigma_paramter=self._config_parser.n_sigma_parameter, -+ data_queue_size=data_queue_size, -+ data_queue_update_size=update_size, -+ ) -+ sliding_window = SlidingWindowFactory().get_sliding_window( -+ sliding_window_type, -+ queue_length=window_size, -+ threshold=window_threshold, -+ ) -+ disk_detector = DiskDetector(disk) -+ for metric_name in metric_name_list: -+ detector = Detector(metric_name, threshold, sliding_window) -+ disk_detector.add_detector(detector) -+ logging.info(f'disk: [{disk}] add detector:\n [{disk_detector}]') -+ self._disk_detectors[disk] = disk_detector - - def launch(self): - while True: -- logging.debug('step0. AI threshold slow io event detection is looping.') -+ logging.debug("step0. AI threshold slow io event detection is looping.") - - # Step1:获取IO数据 - io_data_dict_with_disk_name = get_io_data_from_collect_plug( -- self._config_parser.get_slow_io_detect_frequency(), self._disk_list -+ self._config_parser.slow_io_detect_frequency, self._disk_list - ) -- logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') -+ logging.debug(f"step1. Get io data: {str(io_data_dict_with_disk_name)}") - if io_data_dict_with_disk_name is None: -- Report.report_pass("get io data error, please check if the collector plug is enable. exitting...") -+ Report.report_pass( -+ "get io data error, please check if the collector plug is enable. exitting..." -+ ) - exit(1) - - # Step2:慢IO检测 -- logging.debug('step2. Start to detection slow io event.') -+ logging.debug("step2. Start to detection slow io event.") - slow_io_event_list = [] -- for metric_name, detector in self._detectors.items(): -- result = detector.is_slow_io_event(io_data_dict_with_disk_name) -+ for disk, disk_detector in self._disk_detectors.items(): -+ result = disk_detector.is_slow_io_event(io_data_dict_with_disk_name) - if result[0]: -- slow_io_event_list.append((detector.get_metric_name(), result)) -- logging.debug('step2. End to detection slow io event.') -+ slow_io_event_list.append(result) -+ logging.debug("step2. End to detection slow io event.") - - # Step3:慢IO事件上报 -- logging.debug('step3. Report slow io event to sysSentry.') -+ logging.debug("step3. Report slow io event to sysSentry.") - for slow_io_event in slow_io_event_list: -- metric_name: MetricName = slow_io_event[0] -- result = slow_io_event[1] -+ metric_name: MetricName = slow_io_event[1] - alarm_content = { - "driver_name": f"{metric_name.get_disk_name()}", - "reason": "disk_slow", -@@ -129,14 +145,14 @@ class SlowIODetection: - "io_type": f"{metric_name.get_io_access_type_name()}", - "alarm_source": "ai_block_io", - "alarm_type": "latency", -- "details": f"current window is: {result[1]}, threshold is: {result[2]}.", -+ "details": f"current window is: {slow_io_event[2]}, threshold is: {slow_io_event[3]}.", - } - Xalarm.major(alarm_content) - logging.warning(alarm_content) - - # Step4:等待检测时间 -- logging.debug('step4. Wait to start next slow io event detection loop.') -- time.sleep(self._config_parser.get_slow_io_detect_frequency()) -+ logging.debug("step4. Wait to start next slow io event detection loop.") -+ time.sleep(self._config_parser.slow_io_detect_frequency) - - - def main(): -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 354c122..a357766 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -9,44 +9,60 @@ - # PURPOSE. - # See the Mulan PSL v2 for more details. - -+import os - import configparser - import logging - -+from .alarm_report import Report - from .threshold import ThresholdType - from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level - - - LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - -+ALL_STAGE_LIST = ['throtl', 'wbt', 'gettag', 'plug', 'deadline', 'hctx', 'requeue', 'rq_driver', 'bio'] -+ALL_IOTPYE_LIST = ['read', 'write'] -+ - - def init_log_format(log_level: str): - logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) -- if log_level.lower() not in ('info', 'warning', 'error', 'debug'): -- logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.') -+ if log_level.lower() not in ("info", "warning", "error", "debug"): -+ logging.warning( -+ f"the log_level: {log_level} you set is invalid, use default value: info." -+ ) - - - class ConfigParser: - DEFAULT_ABSOLUTE_THRESHOLD = 40 - DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 -- DEFAULT_LOG_LEVEL = 'info' -+ DEFAULT_LOG_LEVEL = "info" -+ -+ DEFAULT_STAGE = 'throtl,wbt,gettag,plug,deadline,hctx,requeue,rq_driver,bio' -+ DEFAULT_IOTYPE = 'read,write' - -- DEFAULT_ALGORITHM_TYPE = 'boxplot' -+ DEFAULT_ALGORITHM_TYPE = "boxplot" - DEFAULT_TRAIN_DATA_DURATION = 24 - DEFAULT_TRAIN_UPDATE_DURATION = 2 - DEFAULT_BOXPLOT_PARAMETER = 1.5 - DEFAULT_N_SIGMA_PARAMETER = 3 - -- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' -+ DEFAULT_SLIDING_WINDOW_TYPE = "not_continuous" - DEFAULT_WINDOW_SIZE = 30 - DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 - - def __init__(self, config_file_name): - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ self.__slow_io_detect_frequency = ( -+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ ) - self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL - self.__disks_to_detection = None -+ self.__stage = ConfigParser.DEFAULT_STAGE -+ self.__iotype = ConfigParser.DEFAULT_IOTYPE - -- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE -+ self.__algorithm_type = get_threshold_type_enum( -+ ConfigParser.DEFAULT_ALGORITHM_TYPE -+ ) - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -@@ -58,199 +74,398 @@ class ConfigParser: - - self.__config_file_name = config_file_name - -- def __read_absolute_threshold(self, items_common: dict): -+ def _get_config_value( -+ self, -+ config_items: dict, -+ key: str, -+ value_type, -+ default_value=None, -+ gt=None, -+ ge=None, -+ lt=None, -+ le=None, -+ ): -+ value = config_items.get(key) -+ if value is None: -+ logging.warning( -+ "config of %s not found, the default value %s will be used.", -+ key, -+ default_value, -+ ) -+ value = default_value -+ if not value: -+ logging.critical( -+ "the value of %s is empty, ai_block_io plug will exit.", key -+ ) -+ Report.report_pass( -+ f"the value of {key} is empty, ai_block_io plug will exit." -+ ) -+ exit(1) - try: -- self.__absolute_threshold = float(items_common.get('absolute_threshold', -- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) -- if self.__absolute_threshold <= 0: -- logging.warning( -- f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.') -- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -+ value = value_type(value) - except ValueError: -- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -- logging.warning( -- f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.') -+ logging.critical( -+ "the value of %s is not a valid %s, ai_block_io plug will exit.", -+ key, -+ value_type, -+ ) -+ Report.report_pass( -+ f"the value of {key} is not a valid {value_type}, ai_block_io plug will exit." -+ ) -+ exit(1) -+ if gt is not None and value <= gt: -+ logging.critical( -+ "the value of %s is not greater than %s, ai_block_io plug will exit.", -+ key, -+ gt, -+ ) -+ Report.report_pass( -+ f"the value of {key} is not greater than {gt}, ai_block_io plug will exit." -+ ) -+ exit(1) -+ if ge is not None and value < ge: -+ logging.critical( -+ "the value of %s is not greater than or equal to %s, ai_block_io plug will exit.", -+ key, -+ ge, -+ ) -+ Report.report_pass( -+ f"the value of {key} is not greater than or equal to {ge}, ai_block_io plug will exit." -+ ) -+ exit(1) -+ if lt is not None and value >= lt: -+ logging.critical( -+ "the value of %s is not less than %s, ai_block_io plug will exit.", -+ key, -+ lt, -+ ) -+ Report.report_pass( -+ f"the value of {key} is not less than {lt}, ai_block_io plug will exit." -+ ) -+ exit(1) -+ if le is not None and value > le: -+ logging.critical( -+ "the value of %s is not less than or equal to %s, ai_block_io plug will exit.", -+ key, -+ le, -+ ) -+ Report.report_pass( -+ f"the value of {key} is not less than or equal to {le}, ai_block_io plug will exit." -+ ) -+ exit(1) -+ -+ return value -+ -+ def __read_absolute_threshold(self, items_common: dict): -+ self.__absolute_threshold = self._get_config_value( -+ items_common, -+ "absolute_threshold", -+ float, -+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, -+ gt=0, -+ ) - - def __read__slow_io_detect_frequency(self, items_common: dict): -- try: -- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', -- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) -- if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10: -- logging.warning( -- f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.') -- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -- except ValueError: -- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -- logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') -+ self.__slow_io_detect_frequency = self._get_config_value( -+ items_common, -+ "slow_io_detect_frequency", -+ int, -+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY, -+ gt=0, -+ le=300, -+ ) - - def __read__disks_to_detect(self, items_common: dict): -- disks_to_detection = items_common.get('disk') -+ disks_to_detection = items_common.get("disk") - if disks_to_detection is None: -- logging.warning(f'config of disk not found, the default value will be used.') -+ logging.warning("config of disk not found, the default value will be used.") - self.__disks_to_detection = None - return -- disk_list = disks_to_detection.split(',') -- if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''): -- logging.warning("you don't specify any disk.") -- self.__disks_to_detection = [] -- return -- if len(disk_list) == 1 and disk_list[0] == 'default': -+ disks_to_detection = disks_to_detection.strip() -+ if not disks_to_detection: -+ logging.critical("the value of disk is empty, ai_block_io plug will exit.") -+ Report.report_pass( -+ "the value of disk is empty, ai_block_io plug will exit." -+ ) -+ exit(1) -+ disk_list = disks_to_detection.split(",") -+ if len(disk_list) == 1 and disk_list[0] == "default": - self.__disks_to_detection = None - return - self.__disks_to_detection = disk_list - - def __read__train_data_duration(self, items_algorithm: dict): -- try: -- self.__train_data_duration = float(items_algorithm.get('train_data_duration', -- ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) -- if self.__train_data_duration <= 0 or self.__train_data_duration > 720: -- logging.warning( -- f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.') -- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -- except ValueError: -- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -- logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.') -+ self.__train_data_duration = self._get_config_value( -+ items_algorithm, -+ "train_data_duration", -+ float, -+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION, -+ gt=0, -+ le=720, -+ ) - - def __read__train_update_duration(self, items_algorithm: dict): - default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION - if default_train_update_duration > self.__train_data_duration: - default_train_update_duration = self.__train_data_duration / 2 -- -- try: -- self.__train_update_duration = float(items_algorithm.get('train_update_duration', -- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) -- if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration: -- logging.warning( -- f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.') -- self.__train_update_duration = default_train_update_duration -- except ValueError: -- self.__train_update_duration = default_train_update_duration -- logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.') -+ self.__train_update_duration = self._get_config_value( -+ items_algorithm, -+ "train_update_duration", -+ float, -+ default_train_update_duration, -+ gt=0, -+ le=self.__train_data_duration, -+ ) - - def __read__algorithm_type_and_parameter(self, items_algorithm: dict): -- algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) -+ algorithm_type = items_algorithm.get( -+ "algorithm_type", ConfigParser.DEFAULT_ALGORITHM_TYPE -+ ) - self.__algorithm_type = get_threshold_type_enum(algorithm_type) -+ if self.__algorithm_type is None: -+ logging.critical( -+ "the algorithm_type: %s you set is invalid. ai_block_io plug will exit.", -+ algorithm_type, -+ ) -+ Report.report_pass( -+ f"the algorithm_type: {algorithm_type} you set is invalid. ai_block_io plug will exit." -+ ) -+ exit(1) - - if self.__algorithm_type == ThresholdType.NSigmaThreshold: -- try: -- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', -- ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) -- if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10: -- logging.warning( -- f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.') -- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -- except ValueError: -- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -- logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.') -+ self.__n_sigma_parameter = self._get_config_value( -+ items_algorithm, -+ "n_sigma_parameter", -+ float, -+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER, -+ gt=0, -+ le=10, -+ ) - elif self.__algorithm_type == ThresholdType.BoxplotThreshold: -- try: -- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', -- ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) -- if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10: -- logging.warning( -- f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.') -- self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -- except ValueError: -- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -- logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.') -+ self.__boxplot_parameter = self._get_config_value( -+ items_algorithm, -+ "boxplot_parameter", -+ float, -+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER, -+ gt=0, -+ le=10, -+ ) -+ -+ def __read__stage(self, items_algorithm: dict): -+ stage_str = items_algorithm.get('stage', ConfigParser.DEFAULT_STAGE) -+ stage_list = stage_str.split(',') -+ if len(stage_list) == 1 and stage_list[0] == '': -+ logging.critical('stage value not allow is empty, exiting...') -+ exit(1) -+ if len(stage_list) == 1 and stage_list[0] == 'default': -+ logging.warning(f'stage will enable default value: {ConfigParser.DEFAULT_STAGE}') -+ self.__stage = ALL_STAGE_LIST -+ return -+ for stage in stage_list: -+ if stage not in ALL_STAGE_LIST: -+ logging.critical(f'stage: {stage} is not valid stage, ai_block_io will exit...') -+ exit(1) -+ dup_stage_list = set(stage_list) -+ if 'bio' not in dup_stage_list: -+ logging.critical('stage must contains bio stage, exiting...') -+ exit(1) -+ self.__stage = dup_stage_list -+ -+ def __read__iotype(self, items_algorithm: dict): -+ iotype_str = items_algorithm.get('iotype', ConfigParser.DEFAULT_IOTYPE) -+ iotype_list = iotype_str.split(',') -+ if len(iotype_list) == 1 and iotype_list[0] == '': -+ logging.critical('iotype value not allow is empty, exiting...') -+ exit(1) -+ if len(iotype_list) == 1 and iotype_list[0] == 'default': -+ logging.warning(f'iotype will enable default value: {ConfigParser.DEFAULT_IOTYPE}') -+ self.__iotype = ALL_IOTPYE_LIST -+ return -+ for iotype in iotype_list: -+ if iotype not in ALL_IOTPYE_LIST: -+ logging.critical(f'iotype: {iotype} is not valid iotype, ai_block_io will exit...') -+ exit(1) -+ dup_iotype_list = set(iotype_list) -+ self.__iotype = dup_iotype_list - - def __read__window_size(self, items_sliding_window: dict): -- try: -- self.__window_size = int(items_sliding_window.get('window_size', -- ConfigParser.DEFAULT_WINDOW_SIZE)) -- if self.__window_size < 1 or self.__window_size > 3600: -- logging.warning( -- f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.') -- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -- except ValueError: -- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -- logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.') -+ self.__window_size = self._get_config_value( -+ items_sliding_window, -+ "window_size", -+ int, -+ ConfigParser.DEFAULT_WINDOW_SIZE, -+ gt=0, -+ le=3600, -+ ) - - def __read__window_minimum_threshold(self, items_sliding_window: dict): - default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD - if default_window_minimum_threshold > self.__window_size: - default_window_minimum_threshold = self.__window_size / 2 -- try: -- self.__window_minimum_threshold = ( -- int(items_sliding_window.get('window_minimum_threshold', -- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) -- if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size: -- logging.warning( -- f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.') -- self.__window_minimum_threshold = default_window_minimum_threshold -- except ValueError: -- self.__window_minimum_threshold = default_window_minimum_threshold -- logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.') -+ self.__window_minimum_threshold = self._get_config_value( -+ items_sliding_window, -+ "window_minimum_threshold", -+ int, -+ default_window_minimum_threshold, -+ gt=0, -+ le=self.__window_size, -+ ) - - def read_config_from_file(self): -+ if not os.path.exists(self.__config_file_name): -+ init_log_format(self.__log_level) -+ logging.critical( -+ "config file %s not found, ai_block_io plug will exit.", -+ self.__config_file_name, -+ ) -+ Report.report_pass( -+ f"config file {self.__config_file_name} not found, ai_block_io plug will exit." -+ ) -+ exit(1) -+ - con = configparser.ConfigParser() - try: -- con.read(self.__config_file_name, encoding='utf-8') -+ con.read(self.__config_file_name, encoding="utf-8") - except configparser.Error as e: - init_log_format(self.__log_level) -- logging.critical(f'config file read error: {e}, ai_block_io plug will exit.') -+ logging.critical( -+ f"config file read error: %s, ai_block_io plug will exit.", e -+ ) -+ Report.report_pass( -+ f"config file read error: {e}, ai_block_io plug will exit." -+ ) - exit(1) - -- if con.has_section('common'): -- items_common = dict(con.items('common')) -- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) -+ if con.has_section('log'): -+ items_log = dict(con.items('log')) -+ # 情况一:没有log,则使用默认值 -+ # 情况二:有log,值为空或异常,使用默认值 -+ # 情况三:有log,值正常,则使用该值 -+ self.__log_level = items_log.get('level', ConfigParser.DEFAULT_LOG_LEVEL) - init_log_format(self.__log_level) -+ else: -+ init_log_format(self.__log_level) -+ logging.warning(f"log section parameter not found, it will be set to default value.") -+ -+ if con.has_section("common"): -+ items_common = dict(con.items("common")) - self.__read_absolute_threshold(items_common) - self.__read__slow_io_detect_frequency(items_common) - self.__read__disks_to_detect(items_common) -+ self.__read__stage(items_common) -+ self.__read__iotype(items_common) - else: -- init_log_format(self.__log_level) -- logging.warning("common section parameter not found, it will be set to default value.") -+ logging.warning( -+ "common section parameter not found, it will be set to default value." -+ ) - -- if con.has_section('algorithm'): -- items_algorithm = dict(con.items('algorithm')) -+ if con.has_section("algorithm"): -+ items_algorithm = dict(con.items("algorithm")) - self.__read__train_data_duration(items_algorithm) - self.__read__train_update_duration(items_algorithm) - self.__read__algorithm_type_and_parameter(items_algorithm) - else: -- logging.warning("algorithm section parameter not found, it will be set to default value.") -- -- if con.has_section('sliding_window'): -- items_sliding_window = dict(con.items('sliding_window')) -- sliding_window_type = items_sliding_window.get('sliding_window_type', -- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) -- self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type) -+ logging.warning( -+ "algorithm section parameter not found, it will be set to default value." -+ ) -+ -+ if con.has_section("sliding_window"): -+ items_sliding_window = dict(con.items("sliding_window")) -+ sliding_window_type = items_sliding_window.get( -+ "sliding_window_type", ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE -+ ) -+ self.__sliding_window_type = get_sliding_window_type_enum( -+ sliding_window_type -+ ) - self.__read__window_size(items_sliding_window) - self.__read__window_minimum_threshold(items_sliding_window) - else: -- logging.warning("sliding_window section parameter not found, it will be set to default value.") -+ logging.warning( -+ "sliding_window section parameter not found, it will be set to default value." -+ ) - - self.__print_all_config_value() - -+ def __repr__(self): -+ config_str = { -+ 'log.level': self.__log_level, -+ 'common.absolute_threshold': self.__absolute_threshold, -+ 'common.slow_io_detect_frequency': self.__slow_io_detect_frequency, -+ 'common.disk': self.__disks_to_detection, -+ 'common.stage': self.__stage, -+ 'common.iotype': self.__iotype, -+ 'algorithm.train_data_duration': self.__train_data_duration, -+ 'algorithm.train_update_duration': self.__train_update_duration, -+ 'algorithm.algorithm_type': self.__algorithm_type, -+ 'algorithm.boxplot_parameter': self.__boxplot_parameter, -+ 'algorithm.n_sigma_parameter': self.__n_sigma_parameter, -+ 'sliding_window.sliding_window_type': self.__sliding_window_type, -+ 'sliding_window.window_size': self.__window_size, -+ 'sliding_window.window_minimum_threshold': self.__window_minimum_threshold -+ } -+ return str(config_str) -+ - def __print_all_config_value(self): -- pass -+ logging.info(f"all config is follow:\n {self}") -+ -+ def get_train_data_duration_and_train_update_duration(self): -+ return self.__train_data_duration, self.__train_update_duration - -- def get_slow_io_detect_frequency(self): -+ def get_window_size_and_window_minimum_threshold(self): -+ return self.__window_size, self.__window_minimum_threshold -+ -+ @property -+ def slow_io_detect_frequency(self): - return self.__slow_io_detect_frequency - -- def get_algorithm_type(self): -+ @property -+ def algorithm_type(self): - return self.__algorithm_type - -- def get_sliding_window_type(self): -+ @property -+ def sliding_window_type(self): - return self.__sliding_window_type - -- def get_train_data_duration_and_train_update_duration(self): -- return self.__train_data_duration, self.__train_update_duration -+ @property -+ def train_data_duration(self): -+ return self.__train_data_duration - -- def get_window_size_and_window_minimum_threshold(self): -- return self.__window_size, self.__window_minimum_threshold -+ @property -+ def train_update_duration(self): -+ return self.__train_update_duration -+ -+ @property -+ def window_size(self): -+ return self.__window_size - -- def get_absolute_threshold(self): -+ @property -+ def window_minimum_threshold(self): -+ return self.__window_minimum_threshold -+ -+ @property -+ def absolute_threshold(self): - return self.__absolute_threshold - -- def get_log_level(self): -+ @property -+ def log_level(self): - return self.__log_level - -- def get_disks_to_detection(self): -+ @property -+ def disks_to_detection(self): - return self.__disks_to_detection - -- def get_boxplot_parameter(self): -+ @property -+ def stage(self): -+ return self.__stage -+ -+ @property -+ def iotype(self): -+ return self.__iotype -+ -+ @property -+ def boxplot_parameter(self): - return self.__boxplot_parameter - -- def get_n_sigma_parameter(self): -+ @property -+ def n_sigma_parameter(self): - return self.__n_sigma_parameter -diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -index c7679cd..ed997e6 100644 ---- a/src/python/sentryPlugins/ai_block_io/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -41,11 +41,14 @@ def check_collect_valid(period): - try: - data = json.loads(data_raw["message"]) - except Exception as e: -- logging.warning(f"get io data failed, {e}") -+ logging.warning(f"get valid devices failed, occur exception: {e}") -+ return None -+ if not data: -+ logging.warning(f"get valid devices failed, return {data_raw}") - return None - return [k for k in data.keys()] - else: -- logging.warning(f"get io data failed, return {data_raw}") -+ logging.warning(f"get valid devices failed, return {data_raw}") - return None - - -@@ -60,7 +63,7 @@ def _get_raw_data(period, disk_list): - - def _get_io_stage_data(data): - io_stage_data = IOStageData() -- for data_type in ('read', 'write', 'flush', 'discard'): -+ for data_type in ("read", "write", "flush", "discard"): - if data_type in data: - getattr(io_stage_data, data_type).latency = data[data_type][0] - getattr(io_stage_data, data_type).io_dump = data[data_type][1] -@@ -87,7 +90,7 @@ def get_io_data_from_collect_plug(period, disk_list): - getattr(disk_ret, k) - setattr(disk_ret, k, _get_io_stage_data(v)) - except AttributeError: -- logging.debug(f'no attr {k}') -+ logging.debug(f"no attr {k}") - continue - ret[disk] = disk_ret - return ret -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index 0ed282b..e710ddd 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -53,3 +53,28 @@ class Detector: - f' io_type_name: {self._metric_name.get_io_access_type_name()},' - f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' - f' sliding_window_type: {self._slidingWindow}') -+ -+ -+class DiskDetector: -+ -+ def __init__(self, disk_name: str): -+ self._disk_name = disk_name -+ self._detector_list = [] -+ -+ def add_detector(self, detector: Detector): -+ self._detector_list.append(detector) -+ -+ def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -+ # 只有bio阶段发生异常,就认为发生了慢IO事件 -+ # todo:根因诊断 -+ for detector in self._detector_list: -+ result = detector.is_slow_io_event(io_data_dict_with_disk_name) -+ if result[0] and detector.get_metric_name().get_stage_name() == 'bio': -+ return result[0], detector.get_metric_name(), result[1], result[2] -+ return False, None, None, None -+ -+ def __repr__(self): -+ msg = f'disk: {self._disk_name}, ' -+ for detector in self._detector_list: -+ msg += f'\n detector: [{detector}]' -+ return msg -diff --git a/src/python/sentryPlugins/ai_block_io/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py -index 8dbba06..0ed37b9 100644 ---- a/src/python/sentryPlugins/ai_block_io/utils.py -+++ b/src/python/sentryPlugins/ai_block_io/utils.py -@@ -25,8 +25,7 @@ def get_threshold_type_enum(algorithm_type: str): - return ThresholdType.BoxplotThreshold - if algorithm_type.lower() == 'n_sigma': - return ThresholdType.NSigmaThreshold -- logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot") -- return ThresholdType.BoxplotThreshold -+ return None - - - def get_sliding_window_type_enum(sliding_window_type: str): --- -2.23.0 - diff --git a/avg_block_io-send-alarm-to-xalarmd.patch b/avg_block_io-send-alarm-to-xalarmd.patch deleted file mode 100644 index 3995d08906455766c1da78320269ac289ccc3721..0000000000000000000000000000000000000000 --- a/avg_block_io-send-alarm-to-xalarmd.patch +++ /dev/null @@ -1,73 +0,0 @@ -From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Wed, 9 Oct 2024 14:22:38 +0800 -Subject: [PATCH] avg_block_io send alarm to xalarmd - ---- - config/tasks/avg_block_io.mod | 2 ++ - .../sentryPlugins/avg_block_io/module_conn.py | 23 +++++++++++++++---- - 2 files changed, 21 insertions(+), 4 deletions(-) - -diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod -index b9b6f34..bcd063b 100644 ---- a/config/tasks/avg_block_io.mod -+++ b/config/tasks/avg_block_io.mod -@@ -3,3 +3,5 @@ enabled=yes - task_start=/usr/bin/python3 /usr/bin/avg_block_io - task_stop=pkill -f /usr/bin/avg_block_io - type=oneshot -+alarm_id=1002 -+alarm_clear_time=5 -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -index 0da4208..2fc5a83 100644 ---- a/src/python/sentryPlugins/avg_block_io/module_conn.py -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -16,6 +16,7 @@ import time - from .utils import is_abnormal - from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages - from syssentry.result import ResultLevel, report_result -+from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR - - - TASK_NAME = "avg_block_io" -@@ -68,19 +69,33 @@ def process_report_data(disk_name, rw, io_data): - if not is_abnormal((disk_name, 'bio', rw), io_data): - return - -+ msg = {"alarm_source": TASK_NAME, "driver_name": disk_name, "io_type": rw} -+ - ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] - for stage_name in ctrl_stage: - if is_abnormal((disk_name, stage_name, rw), io_data): -- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) -+ msg["reason"] = "IO press slow" -+ msg["block_stack"] = f"bio,{stage_name}" -+ logging.warning("{} - {} report IO press slow".format(disk_name, rw)) -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return - - if is_abnormal((disk_name, 'rq_driver', rw), io_data): -- logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) -+ msg["reason"] = "driver slow" -+ msg["block_stack"] = "bio,rq_driver" -+ logging.warning("{} - {} report driver slow".format(disk_name, rw)) -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return - - kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] - for stage_name in kernel_stage: - if is_abnormal((disk_name, stage_name, rw), io_data): -- logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) -+ msg["reason"] = "kernel slow" -+ msg["block_stack"] = f"bio,{stage_name}" -+ logging.warning("{} - {} report kernel slow".format(disk_name, rw)) -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - return -- logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) -+ msg["reason"] = "unknown" -+ msg["block_stack"] = "bio" -+ logging.warning("{} - {} report UNKNOWN slow".format(disk_name, rw)) -+ xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) --- -2.33.0 - diff --git a/bugfix-typo.patch b/bugfix-typo.patch deleted file mode 100644 index 946a0cbf6ab2323d271ceb31dfa690e58733b786..0000000000000000000000000000000000000000 --- a/bugfix-typo.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 7d5ad8f2dd87432b8f46ea5002400ee46cb6756a Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Wed, 9 Oct 2024 14:22:38 +0800 -Subject: [PATCH] bugfix typo - ---- - src/python/sentryPlugins/avg_block_io/avg_block_io.py | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index b6b3b28..26a60c5 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -114,7 +114,7 @@ def read_config_lat_iodump(io_dic, config): - common_param = {} - lat_sec = None - if not config.has_section("latency"): -- logging.warning("Cannot find algorithm section in config file") -+ logging.warning("Cannot find latency section in config file") - else: - lat_sec = config["latency"] - -@@ -122,7 +122,7 @@ def read_config_lat_iodump(io_dic, config): - if not config.has_section("iodump"): - logging.warning("Cannot find iodump section in config file") - else: -- lat_sec = config["iodump"] -+ iodump_sec = config["iodump"] - - if not lat_sec and not iodump_sec: - return common_param --- -2.27.0 - diff --git a/change-alarm-length.patch b/change-alarm-length.patch deleted file mode 100644 index 27c49ed0c1c8560c67ec8f83a3da5eb38bfe3627..0000000000000000000000000000000000000000 --- a/change-alarm-length.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 67439c0040b1fb0614ac009bf53062e9ec2880aa Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Wed, 9 Oct 2024 11:55:35 +0800 -Subject: [PATCH 1/2] change alarm length - -Signed-off-by: jinsaihang ---- - src/python/syssentry/sentryctl | 3 +++ - src/python/syssentry/syssentry.py | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/src/python/syssentry/sentryctl b/src/python/syssentry/sentryctl -index 675c17a..3de93d0 100644 ---- a/src/python/syssentry/sentryctl -+++ b/src/python/syssentry/sentryctl -@@ -25,6 +25,7 @@ MAX_PARAM_LENGTH = 256 - - RESULT_MSG_DATA_LEN = 4 - CTL_MSG_LEN_LEN = 3 -+ALARM_MSG_DATA_LEN = 6 - DEFAULT_ALARM_TIME_RANGE = 10 - - def status_output_format(res_data): -@@ -173,6 +174,8 @@ if __name__ == '__main__': - request_message = json.dumps(req_msg_struct) - if client_args.cmd_type == 'get_result': - result_message = client_send_and_recv(request_message, RESULT_MSG_DATA_LEN) -+ elif client_args.cmd_type == 'get_alarm': -+ result_message = client_send_and_recv(request_message, ALARM_MSG_DATA_LEN) - else: - result_message = client_send_and_recv(request_message, CTL_MSG_LEN_LEN) - if not result_message: -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index c2dee85..ea09095 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -56,6 +56,7 @@ CTL_MSG_MAGIC_LEN = 3 - CTL_MSG_LEN_LEN = 3 - CTL_MAGIC = "CTL" - RES_MAGIC = "RES" -+ALARM_MSG_DATA_LEN = 6 - - CTL_LISTEN_QUEUE_LEN = 5 - SERVER_EPOLL_TIMEOUT = 0.3 -@@ -256,6 +257,8 @@ def server_recv(server_socket: socket.socket): - res_head = RES_MAGIC - if cmd_type == "get_result": - res_data_len = str(len(res_data)).zfill(RESULT_MSG_HEAD_LEN - RESULT_MSG_MAGIC_LEN) -+ elif cmd_type == "get_alarm": -+ res_data_len = str(len(res_data)).zfill(ALARM_MSG_DATA_LEN) - else: - res_data_len = str(len(res_data)).zfill(CTL_MSG_MAGIC_LEN) - res_head += res_data_len --- -2.27.0 - diff --git a/change-avg_block_io-config.patch b/change-avg_block_io-config.patch deleted file mode 100644 index 09d8b47a4e8484cc2b38f17db7ee033ccfc44b8b..0000000000000000000000000000000000000000 --- a/change-avg_block_io-config.patch +++ /dev/null @@ -1,55 +0,0 @@ -From aaff413d6954003a3c21af21003c3bc134f940e2 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Tue, 5 Nov 2024 10:31:10 +0800 -Subject: [PATCH] change avg_block_io config - ---- - config/plugins/avg_block_io.ini | 8 ++++---- - .../src/python/sentryPlugins/avg_block_io/config.py | 8 ++++---- - 2 files changed, 8 insertions(+), 8 deletions(-) - -diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini -index 5c4b9b0..3b4ee33 100644 ---- a/config/plugins/avg_block_io.ini -+++ b/config/plugins/avg_block_io.ini -@@ -12,12 +12,12 @@ win_size=30 - win_threshold=6 - - [latency_nvme_ssd] --read_avg_lim=300 --write_avg_lim=300 -+read_avg_lim=10000 -+write_avg_lim=10000 - read_avg_time=3 - write_avg_time=3 --read_tot_lim=500 --write_tot_lim=500 -+read_tot_lim=50000 -+write_tot_lim=50000 - - [latency_sata_ssd] - read_avg_lim=10000 -diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py -index c8f45ce..c1e8ab1 100644 ---- a/src/python/sentryPlugins/avg_block_io/config.py -+++ b/src/python/sentryPlugins/avg_block_io/config.py -@@ -42,12 +42,12 @@ DEFAULT_PARAM = { - CONF_ALGO_SIZE: 30, - CONF_ALGO_THRE: 6 - }, 'latency_nvme_ssd': { -- 'read_avg_lim': 300, -- 'write_avg_lim': 300, -+ 'read_avg_lim': 10000, -+ 'write_avg_lim': 10000, - 'read_avg_time': 3, - 'write_avg_time': 3, -- 'read_tot_lim': 500, -- 'write_tot_lim': 500, -+ 'read_tot_lim': 50000, -+ 'write_tot_lim': 50000, - }, 'latency_sata_ssd' : { - 'read_avg_lim': 10000, - 'write_avg_lim': 10000, --- -2.39.5 (Apple Git-154) - diff --git a/change-status-of-period-task-and-sort-mod-file.patch b/change-status-of-period-task-and-sort-mod-file.patch deleted file mode 100644 index 12aab0e088c73907ac7716391e596580bcd72686..0000000000000000000000000000000000000000 --- a/change-status-of-period-task-and-sort-mod-file.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8cc13a422ed29e48b0c5b86b2da2a5dc8ad4aa59 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Fri, 13 Dec 2024 11:20:55 +0800 -Subject: [PATCH] change status of period task and sort mod file - ---- - src/python/syssentry/cron_process.py | 1 + - src/python/syssentry/load_mods.py | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/src/python/syssentry/cron_process.py b/src/python/syssentry/cron_process.py -index 50780b3..5543d67 100644 ---- a/src/python/syssentry/cron_process.py -+++ b/src/python/syssentry/cron_process.py -@@ -144,6 +144,7 @@ def period_tasks_handle(): - - if not task.onstart: - logging.debug("period onstart not enabled, task: %s", task.name) -+ task.runtime_status = EXITED_STATUS - continue - - if task.runtime_status == WAITING_STATUS and \ -diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py -index 48d7e66..5be5540 100644 ---- a/src/python/syssentry/load_mods.py -+++ b/src/python/syssentry/load_mods.py -@@ -224,6 +224,7 @@ def load_tasks(): - return "failed", "" - - mod_files = os.listdir(TASKS_STORAGE_PATH) -+ mod_files.sort() - for mod_file in mod_files: - logging.debug("find mod, path is %s", mod_file) - if not mod_file.endswith(MOD_FILE_SUFFIX): --- -2.33.0 diff --git a/cpu_utility-and-cpu_patrol-must-be-an-integer.patch b/cpu_utility-and-cpu_patrol-must-be-an-integer.patch deleted file mode 100644 index 3780958d69fcc3a168e77fae3434733590dfc4e4..0000000000000000000000000000000000000000 --- a/cpu_utility-and-cpu_patrol-must-be-an-integer.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 6e98b2e5008ffabfda8d1c10778717f972b54398 Mon Sep 17 00:00:00 2001 -From: jwolf <523083921@qq.com> -Date: Mon, 22 Jul 2024 14:58:27 +0800 -Subject: [PATCH] cpu_utility and cpu_patrol musht be an integer - ---- - src/c/catcli/catlib/cli_param_checker.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c -index a1aa636..e400428 100644 ---- a/src/c/catcli/catlib/cli_param_checker.c -+++ b/src/c/catcli/catlib/cli_param_checker.c -@@ -2,6 +2,7 @@ - #include - #include - #include -+#include - #include - #include - #include "cli_common.h" -@@ -13,7 +14,7 @@ - void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) - { - long cpu_utility = strtol(getopt_optarg, NULL, DECIMAL); -- if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX) { -+ if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { - strncpy(errs->patrol_module_err, - "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); - } -@@ -68,7 +69,7 @@ void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, - void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) - { - long second = strtol(getopt_optarg, NULL, DECIMAL); -- if (second <= 0 || second > INT_MAX) { -+ if (second <= 0 || second > INT_MAX || strchr(getopt_optarg, '.') != NULL) { - strncpy(errs->patrol_time_err, - "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", - MAX_ERR_LEN); --- -Gitee diff --git a/diff-disk-type-use-diff-config.patch b/diff-disk-type-use-diff-config.patch deleted file mode 100644 index 70976d380725f99a153b1f66ae5b4f52810e9297..0000000000000000000000000000000000000000 --- a/diff-disk-type-use-diff-config.patch +++ /dev/null @@ -1,430 +0,0 @@ -From e7c1b0095e16369fb09ae62ffa3158be5e8893a1 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Fri, 11 Oct 2024 10:48:35 +0800 -Subject: [PATCH] diff disk type use diff config - ---- - config/plugins/avg_block_io.ini | 26 +++- - src/python/sentryCollector/collect_plugin.py | 6 + - .../avg_block_io/avg_block_io.py | 144 ++++++++---------- - .../sentryPlugins/avg_block_io/module_conn.py | 19 ++- - .../sentryPlugins/avg_block_io/utils.py | 43 ++++++ - 5 files changed, 146 insertions(+), 92 deletions(-) - -diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini -index 858db18..5c4b9b0 100644 ---- a/config/plugins/avg_block_io.ini -+++ b/config/plugins/avg_block_io.ini -@@ -11,13 +11,29 @@ period_time=1 - win_size=30 - win_threshold=6 - --[latency] --read_avg_lim=10 --write_avg_lim=10 -+[latency_nvme_ssd] -+read_avg_lim=300 -+write_avg_lim=300 - read_avg_time=3 - write_avg_time=3 --read_tot_lim=50 --write_tot_lim=50 -+read_tot_lim=500 -+write_tot_lim=500 -+ -+[latency_sata_ssd] -+read_avg_lim=10000 -+write_avg_lim=10000 -+read_avg_time=3 -+write_avg_time=3 -+read_tot_lim=50000 -+write_tot_lim=50000 -+ -+[latency_sata_hdd] -+read_avg_lim=15000 -+write_avg_lim=15000 -+read_avg_time=3 -+write_avg_time=3 -+read_tot_lim=50000 -+write_tot_lim=50000 - - [iodump] - read_iodump_lim=0 -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 31bf11b..bec405a 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -79,6 +79,12 @@ class DiskType(): - TYPE_SATA_SSD = 1 - TYPE_SATA_HDD = 2 - -+Disk_Type = { -+ DiskType.TYPE_NVME_SSD: "nvme_ssd", -+ DiskType.TYPE_SATA_SSD: "sata_ssd", -+ DiskType.TYPE_SATA_HDD: "sata_hdd" -+} -+ - def client_send_and_recv(request_data, data_str_len, protocol): - """client socket send and recv message""" - try: -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index cf2ded3..fdad995 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -14,8 +14,9 @@ import configparser - import time - - from .stage_window import IoWindow, IoDumpWindow --from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler --from .utils import update_avg_and_check_abnormal, get_log_level -+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name -+from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value -+from sentryCollector.collect_plugin import Disk_Type - - CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" - -@@ -37,44 +38,40 @@ def read_config_common(config): - disk = [] if disk_name == "default" else disk_name.split(",") - except configparser.NoOptionError: - disk = [] -- logging.warning("Unset disk, set to default") -+ logging.warning("Unset common.disk, set to default") - - try: - stage_name = config.get("common", "stage") - stage = [] if stage_name == "default" else stage_name.split(",") - except configparser.NoOptionError: - stage = [] -- logging.warning("Unset stage, set to read,write") -+ logging.warning("Unset common.stage, set to default") - - if len(disk) > 10: -- logging.warning("Too many disks, record only max 10 disks") -+ logging.warning("Too many common.disks, record only max 10 disks") - disk = disk[:10] - - try: - iotype_name = config.get("common", "iotype").split(",") -- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']] -- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']] -+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] -+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] - -- if iotype_list in [None, []]: -- iotype_list = ["read", "write"] -- except configparser.NoOptionError: -- iotype = ["read", "write"] -- logging.warning("Unset iotype, set to default") -+ if err_iotype: -+ report_alarm_fail("Invalid common.iotype config") - -- if err_iotype: -- logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) -- -+ except configparser.NoOptionError: -+ iotype_list = ["read", "write"] -+ logging.warning("Unset common.iotype, set to read,write") - - try: - period_time = int(config.get("common", "period_time")) - if not (1 <= period_time <= 300): - raise ValueError("Invalid period_time") - except ValueError: -- period_time = 1 -- logging.warning("Invalid period_time, set to 1s") -+ report_alarm_fail("Invalid common.period_time") - except configparser.NoOptionError: - period_time = 1 -- logging.warning("Unset period_time, use 1s as default") -+ logging.warning("Unset common.period_time, use 1s as default") - - return period_time, disk, stage, iotype_list - -@@ -87,76 +84,56 @@ def read_config_algorithm(config): - try: - win_size = int(config.get("algorithm", "win_size")) - if not (1 <= win_size <= 300): -- raise ValueError("Invalid win_size") -+ raise ValueError("Invalid algorithm.win_size") - except ValueError: -- win_size = 30 -- logging.warning("Invalid win_size, set to 30") -+ report_alarm_fail("Invalid algorithm.win_size config") - except configparser.NoOptionError: - win_size = 30 -- logging.warning("Unset win_size, use 30 as default") -+ logging.warning("Unset algorithm.win_size, use 30 as default") - - try: - win_threshold = int(config.get("algorithm", "win_threshold")) - if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: -- raise ValueError("Invalid win_threshold") -+ raise ValueError("Invalid algorithm.win_threshold") - except ValueError: -- win_threshold = 6 -- logging.warning("Invalid win_threshold, set to 6") -+ report_alarm_fail("Invalid algorithm.win_threshold config") - except configparser.NoOptionError: - win_threshold = 6 -- logging.warning("Unset win_threshold, use 6 as default") -+ logging.warning("Unset algorithm.win_threshold, use 6 as default") - - return win_size, win_threshold - - --def read_config_lat_iodump(io_dic, config): -- """read config file, get [latency] [iodump] section value""" -+def read_config_latency(config): -+ """read config file, get [latency_xxx] section value""" - common_param = {} -- lat_sec = None -- if not config.has_section("latency"): -- logging.warning("Cannot find latency section in config file") -- else: -- lat_sec = config["latency"] -- -- iodump_sec = None -- if not config.has_section("iodump"): -- logging.warning("Cannot find iodump section in config file") -- else: -- iodump_sec = config["iodump"] -- -- if not lat_sec and not iodump_sec: -- return common_param -- -- for io_type in io_dic["iotype_list"]: -- common_param[io_type] = {} -- -- latency_keys = { -- "avg_lim": "{}_avg_lim".format(io_type), -- "avg_time": "{}_avg_time".format(io_type), -- "tot_lim": "{}_tot_lim".format(io_type), -- } -- iodump_key = "{}_iodump_lim".format(io_type) -+ for type_name in Disk_Type: -+ section_name = f"latency_{Disk_Type[type_name]}" -+ if not config.has_section(section_name): -+ report_alarm_fail(f"Cannot find {section_name} section in config file") - -- if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal(): -- common_param[io_type][iodump_key] = int(iodump_sec[iodump_key]) -+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config) -+ return common_param - -- if not lat_sec: -- continue - -- for key_suffix, key_template in latency_keys.items(): -- if key_template in lat_sec and lat_sec[key_template].isdecimal(): -- common_param[io_type][key_template] = int(lat_sec[key_template]) -+def read_config_iodump(config): -+ """read config file, get [iodump] section value""" -+ common_param = {} -+ section_name = "iodump" -+ if not config.has_section(section_name): -+ report_alarm_fail(f"Cannot find {section_name} section in config file") - -- return common_param -+ return get_section_value(section_name, config) - - --def read_config_stage(config, stage, iotype_list): -- """read config file, get [STAGE_NAME] section value""" -+def read_config_stage(config, stage, iotype_list, curr_disk_type): -+ """read config file, get [STAGE_NAME_diskType] section value""" - res = {} -- if not stage in config: -+ section_name = f"{stage}_{curr_disk_type}" -+ if not config.has_section(section_name): - return res - -- for key in config[stage]: -+ for key in config[section_name]: - if config[stage][key].isdecimal(): - res[key] = int(config[stage][key]) - -@@ -171,11 +148,12 @@ def init_io_win(io_dic, config, common_param): - for disk_name in io_dic["disk_list"]: - io_data[disk_name] = {} - io_avg_value[disk_name] = {} -+ curr_disk_type = get_disk_type_by_name(disk_name) - for stage_name in io_dic["stage_list"]: - io_data[disk_name][stage_name] = {} - io_avg_value[disk_name][stage_name] = {} -- # step3. 解析stage配置 -- curr_stage_param = read_config_stage(config, stage_name, iotype_list) -+ # 解析stage配置 -+ curr_stage_param = read_config_stage(config, stage_name, iotype_list, curr_disk_type) - for rw in iotype_list: - io_data[disk_name][stage_name][rw] = {} - io_avg_value[disk_name][stage_name][rw] = [0, 0] -@@ -187,10 +165,10 @@ def init_io_win(io_dic, config, common_param): - iodump_lim_key = "{}_iodump_lim".format(rw) - - # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 -- avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) -- avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) -- tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) -- iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) -+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(curr_disk_type, {}).get(avg_lim_key)) -+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(curr_disk_type, {}).get(avg_time_key)) -+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(curr_disk_type, {}).get(tot_lim_key)) -+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get("iodump", {}).get(iodump_lim_key)) - - if avg_lim_value and avg_time_value and tot_lim_value: - io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) -@@ -217,28 +195,21 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): - stage_list = [key for key in all_stage_set if key in config_stage] - not_in_stage_list = [key for key in config_stage if key not in all_stage_set] - -- if not config_disk: -+ if not_in_stage_list: -+ report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}") -+ -+ if not config_disk and not not_in_disk_list: - disk_list = [key for key in all_disk_set] - -- if not config_stage: -+ if not config_stage and not not_in_stage_list: - stage_list = [key for key in all_stage_set] - - disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list -- stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list -- -- if config_disk and not disk_list: -- logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) -- disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) -- -- if config_stage and not stage_list: -- logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) -- disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) - - if not stage_list or not disk_list: - report_alarm_fail("Cannot get valid disk name or stage name.") - - log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) -- log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) - - return disk_list, stage_list - -@@ -310,8 +281,13 @@ def main(): - # step1. 解析公共配置 --- algorithm - io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) - -- # step2. 循环创建窗口 -- common_param = read_config_lat_iodump(io_dic, config) -+ # step2. 解析公共配置 --- latency_xxx -+ common_param = read_config_latency(config) -+ -+ # step3. 解析公共配置 --- iodump -+ common_param['iodump'] = read_config_iodump(config) -+ -+ # step4. 循环创建窗口 - io_data, io_avg_value = init_io_win(io_dic, config, common_param) - - main_loop(io_dic, io_data, io_avg_value) -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -index 40b3fcc..8d6f429 100644 ---- a/src/python/sentryPlugins/avg_block_io/module_conn.py -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -14,7 +14,7 @@ import sys - import time - - from .utils import is_abnormal, get_win_data, log_slow_win --from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages -+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages, get_disk_type, Disk_Type - from syssentry.result import ResultLevel, report_result - from xalarm.sentry_notify import xalarm_report, MINOR_ALM, ALARM_TYPE_OCCUR - -@@ -51,7 +51,7 @@ def check_result_validation(res, reason): - try: - json_data = json.loads(res['message']) - except json.JSONDecodeError: -- err_msg = "Failed to {}: invalid return message".format(reason) -+ err_msg = f"Failed to {reason}: invalid return message" - report_alarm_fail(err_msg) - - return json_data -@@ -60,7 +60,7 @@ def check_result_validation(res, reason): - def report_alarm_fail(alarm_info): - """report result to xalarmd""" - report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) -- logging.error(alarm_info) -+ logging.critical(alarm_info) - sys.exit(1) - - -@@ -114,3 +114,16 @@ def process_report_data(disk_name, rw, io_data): - - log_slow_win(msg, "unknown") - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) -+ -+ -+def get_disk_type_by_name(disk_name): -+ res = get_disk_type(disk_name) -+ disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') -+ try: -+ curr_disk_type = int(disk_type_str) -+ if curr_disk_type not in Disk_Type: -+ raise ValueError -+ except ValueError: -+ report_alarm_fail(f"Failed to get disk type for {disk_name}") -+ -+ return Disk_Type[curr_disk_type] -\ No newline at end of file -diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py -index 3b7f027..cef1edd 100644 ---- a/src/python/sentryPlugins/avg_block_io/utils.py -+++ b/src/python/sentryPlugins/avg_block_io/utils.py -@@ -26,6 +26,49 @@ LogLevel = { - } - - -+DEFAULT_PARAM = { -+ 'latency_nvme_ssd': { -+ 'read_avg_lim': 300, -+ 'write_avg_lim': 300, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 500, -+ 'write_tot_lim': 500, -+ }, 'latency_sata_ssd' : { -+ 'read_avg_lim': 10000, -+ 'write_avg_lim': 10000, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 50000, -+ 'write_tot_lim': 50000, -+ }, 'latency_sata_hdd' : { -+ 'read_avg_lim': 15000, -+ 'write_avg_lim': 15000, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 50000, -+ 'write_tot_lim': 50000 -+ }, 'iodump': { -+ 'read_iodump_lim': 0, -+ 'write_iodump_lim': 0 -+ } -+} -+ -+ -+def get_section_value(section_name, config): -+ common_param = {} -+ config_sec = config[section_name] -+ for config_key in DEFAULT_PARAM[section_name]: -+ if config_key in config_sec: -+ if not config_sec[config_key].isdecimal(): -+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.") -+ common_param[config_key] = int(config_sec[config_key]) -+ else: -+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default") -+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key] -+ return common_param -+ -+ - def get_log_level(filename): - if not os.path.exists(filename): - return logging.INFO --- -2.27.0 diff --git a/enrich-alert-info-about-kernel-stack.patch b/enrich-alert-info-about-kernel-stack.patch deleted file mode 100644 index bf04a6e662b44e7b85cefa3b857a873ad235bdf9..0000000000000000000000000000000000000000 --- a/enrich-alert-info-about-kernel-stack.patch +++ /dev/null @@ -1,29 +0,0 @@ -From 41bf507ca6cbbdf5e646a405de6b8d5b9be4bd28 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Wed, 16 Oct 2024 17:20:01 +0800 -Subject: [PATCH] enrich alert info about kernel stack - ---- - src/python/sentryPlugins/ai_block_io/detector.py | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index ed8b64a..8536f7a 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -103,8 +103,10 @@ class DiskDetector: - elif len(diagnosis_info["rq_driver"]) != 0: - root_cause = "[Root Cause: disk slow]" - elif len(diagnosis_info["io_stage"]) != 0: -- stage = diagnosis_info["io_stage"][0][1].stage_name -- root_cause = f"[Root Cause: io stage slow, stage: {stage}]" -+ stage_list = [] -+ for io_stage in diagnosis_info["io_stage"]: -+ stage_list.append(io_stage[0].stage_name) -+ root_cause = f"[Root Cause: io stage slow, stage: {stage_list}]" - if root_cause is None: - root_cause = "[Root Cause: high io pressure]" - return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause --- -2.23.0 - diff --git a/feature-add-avg_block_io-plugin.patch b/feature-add-avg_block_io-plugin.patch deleted file mode 100644 index 5477f181aad296b65ab8551be7ddd290dd081742..0000000000000000000000000000000000000000 --- a/feature-add-avg_block_io-plugin.patch +++ /dev/null @@ -1,572 +0,0 @@ -From acb77d6a69aa9269b0f691613bef53efd0c01e53 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Thu, 12 Sep 2024 11:31:34 +0800 -Subject: [PATCH 2/2] add avg_block_io plugin - ---- - config/plugins/avg_block_io.ini | 21 ++ - config/tasks/avg_block_io.mod | 5 + - src/python/sentryPlugins/__init__.py | 0 - .../sentryPlugins/avg_block_io/__init__.py | 0 - .../avg_block_io/avg_block_io.py | 257 ++++++++++++++++++ - .../sentryPlugins/avg_block_io/module_conn.py | 86 ++++++ - .../avg_block_io/stage_window.py | 47 ++++ - .../sentryPlugins/avg_block_io/utils.py | 86 ++++++ - 8 files changed, 502 insertions(+) - create mode 100644 config/plugins/avg_block_io.ini - create mode 100644 config/tasks/avg_block_io.mod - create mode 100644 src/python/sentryPlugins/__init__.py - create mode 100644 src/python/sentryPlugins/avg_block_io/__init__.py - create mode 100644 src/python/sentryPlugins/avg_block_io/avg_block_io.py - create mode 100644 src/python/sentryPlugins/avg_block_io/module_conn.py - create mode 100644 src/python/sentryPlugins/avg_block_io/stage_window.py - create mode 100644 src/python/sentryPlugins/avg_block_io/utils.py - -diff --git a/config/plugins/avg_block_io.ini b/config/plugins/avg_block_io.ini -new file mode 100644 -index 0000000..bc33dde ---- /dev/null -+++ b/config/plugins/avg_block_io.ini -@@ -0,0 +1,21 @@ -+[common] -+disk=default -+stage=default -+iotype=read,write -+period_time=1 -+ -+[algorithm] -+win_size=30 -+win_threshold=6 -+ -+[latency] -+read_avg_lim=10 -+write_avg_lim=10 -+read_avg_time=3 -+write_avg_time=3 -+read_tot_lim=50 -+write_tot_lim=50 -+ -+[iodump] -+read_iodump_lim=0 -+write_iodump_lim=0 -diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod -new file mode 100644 -index 0000000..814c483 ---- /dev/null -+++ b/config/tasks/avg_block_io.mod -@@ -0,0 +1,5 @@ -+[common] -+enabled=yes -+task_start=/usr/bin/python3 /usr/bin/avg_block_io -+task_stop=pkill avg_block_io -+type=oneshot -\ No newline at end of file -diff --git a/src/python/sentryPlugins/__init__.py b/src/python/sentryPlugins/__init__.py -new file mode 100644 -index 0000000..e69de29 -diff --git a/src/python/sentryPlugins/avg_block_io/__init__.py b/src/python/sentryPlugins/avg_block_io/__init__.py -new file mode 100644 -index 0000000..e69de29 -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -new file mode 100644 -index 0000000..ff2071d ---- /dev/null -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -0,0 +1,257 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+import logging -+import signal -+import configparser -+import time -+ -+from .stage_window import IoWindow, IoDumpWindow -+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler -+from .utils import update_avg_and_check_abnormal -+ -+CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" -+ -+def log_invalid_keys(not_in_list, keys_name, config_list, default_list): -+ """print invalid log""" -+ if config_list and default_list: -+ logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) -+ elif config_list == ["default"]: -+ logging.warning("Default {} use {}".format(keys_name, default_list)) -+ -+ -+def read_config_common(config): -+ """read config file, get [common] section value""" -+ try: -+ common_sec = config['common'] -+ except configparser.NoSectionError: -+ report_alarm_fail("Cannot find common section in config file") -+ -+ try: -+ period_time = int(common_sec.get("period_time", 1)) -+ if not (1 <= period_time <= 300): -+ raise ValueError("Invalid period_time") -+ except ValueError: -+ period_time = 1 -+ logging.warning("Invalid period_time, set to 1s") -+ -+ disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else [] -+ stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else [] -+ -+ if len(disk) > 10: -+ logging.warning("Too many disks, record only max 10 disks") -+ disk = disk[:10] -+ -+ iotype = common_sec.get('iotype', 'read,write').split(",") -+ iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']] -+ err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']] -+ -+ if err_iotype: -+ logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) -+ -+ return period_time, disk, stage, iotype_list -+ -+ -+def read_config_algorithm(config): -+ """read config file, get [algorithm] section value""" -+ if not config.has_section("algorithm"): -+ report_alarm_fail("Cannot find algorithm section in config file") -+ -+ try: -+ win_size = int(config.get("algorithm", "win_size")) -+ if not (1 <= win_size <= 300): -+ raise ValueError("Invalid win_size") -+ win_threshold = int(config.get("algorithm", "win_threshold")) -+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: -+ raise ValueError("Invalid win_threshold") -+ except ValueError: -+ report_alarm_fail("Invalid win_threshold or win_size") -+ -+ return win_size, win_threshold -+ -+ -+def read_config_lat_iodump(io_dic, config): -+ """read config file, get [latency] [iodump] section value""" -+ common_param = {} -+ for io_type in io_dic["iotype_list"]: -+ common_param[io_type] = {} -+ -+ latency_keys = { -+ "avg_lim": "{}_avg_lim".format(io_type), -+ "avg_time": "{}_avg_time".format(io_type), -+ "tot_lim": "{}_tot_lim".format(io_type), -+ } -+ iodump_key = "{}_iodump_lim".format(io_type) -+ -+ for key_suffix, key_template in latency_keys.items(): -+ if key_template in config["latency"] and config["latency"][key_template].isdecimal(): -+ common_param[io_type][key_template] = int(config["latency"][key_template]) -+ -+ if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal(): -+ common_param[io_type][iodump_key] = int(config["iodump"][iodump_key]) -+ -+ return common_param -+ -+ -+def read_config_stage(config, stage, iotype_list): -+ """read config file, get [STAGE_NAME] section value""" -+ res = {} -+ if not stage in config: -+ return res -+ -+ for key in config[stage]: -+ if config[stage][key].isdecimal(): -+ res[key] = int(config[stage][key]) -+ -+ return res -+ -+ -+def init_io_win(io_dic, config, common_param): -+ """initialize windows of latency, iodump, and dict of avg_value""" -+ iotype_list = io_dic["iotype_list"] -+ io_data = {} -+ io_avg_value = {} -+ for disk_name in io_dic["disk_list"]: -+ io_data[disk_name] = {} -+ io_avg_value[disk_name] = {} -+ for stage_name in io_dic["stage_list"]: -+ io_data[disk_name][stage_name] = {} -+ io_avg_value[disk_name][stage_name] = {} -+ # step3. 解析stage配置 -+ curr_stage_param = read_config_stage(config, stage_name, iotype_list) -+ for rw in iotype_list: -+ io_data[disk_name][stage_name][rw] = {} -+ io_avg_value[disk_name][stage_name][rw] = [0, 0] -+ -+ # 对每个rw创建latency和iodump窗口 -+ avg_lim_key = "{}_avg_lim".format(rw) -+ avg_time_key = "{}_avg_time".format(rw) -+ tot_lim_key = "{}_tot_lim".format(rw) -+ iodump_lim_key = "{}_iodump_lim".format(rw) -+ -+ # 获取值,优先从 curr_stage_param 获取,如果不存在,则从 common_param 获取 -+ avg_lim_value = curr_stage_param.get(avg_lim_key, common_param.get(rw, {}).get(avg_lim_key)) -+ avg_time_value = curr_stage_param.get(avg_time_key, common_param.get(rw, {}).get(avg_time_key)) -+ tot_lim_value = curr_stage_param.get(tot_lim_key, common_param.get(rw, {}).get(tot_lim_key)) -+ iodump_lim_value = curr_stage_param.get(iodump_lim_key, common_param.get(rw, {}).get(iodump_lim_key)) -+ -+ if avg_lim_value and avg_time_value and tot_lim_value: -+ io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) -+ -+ if iodump_lim_value is not None: -+ io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) -+ return io_data, io_avg_value -+ -+ -+def get_valid_disk_stage_list(io_dic, config_disk, config_stage): -+ """get disk_list and stage_list by sentryCollector""" -+ json_data = avg_is_iocollect_valid(io_dic, config_disk, config_stage) -+ -+ all_disk_set = json_data.keys() -+ all_stage_set = set() -+ for disk_stage_list in json_data.values(): -+ all_stage_set.update(disk_stage_list) -+ -+ disk_list = [key for key in config_disk if key in all_disk_set] -+ not_in_disk_list = [key for key in config_disk if key not in all_disk_set] -+ -+ stage_list = [key for key in config_stage if key in all_stage_set] -+ not_in_stage_list = [key for key in config_stage if key not in all_stage_set] -+ -+ if not config_disk: -+ disk_list = [key for key in all_disk_set] -+ -+ if not config_stage: -+ stage_list = [key for key in all_stage_set] -+ -+ if config_disk and not disk_list: -+ logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) -+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) -+ -+ if config_stage and not stage_list: -+ logging.warning("Cannot get valid stage by stage={}, set to default".format(config_stage)) -+ disk_list, stage_list = get_valid_disk_stage_list(io_dic, config_disk, []) -+ -+ if not stage_list or not disk_list: -+ report_alarm_fail("Cannot get valid disk name or stage name.") -+ -+ log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) -+ log_invalid_keys(not_in_stage_list, 'stage', config_stage, stage_list) -+ -+ return disk_list, stage_list -+ -+ -+def main_loop(io_dic, io_data, io_avg_value): -+ """main loop of avg_block_io""" -+ period_time = io_dic["period_time"] -+ disk_list = io_dic["disk_list"] -+ stage_list = io_dic["stage_list"] -+ iotype_list = io_dic["iotype_list"] -+ win_size = io_dic["win_size"] -+ # 开始循环 -+ while True: -+ # 等待x秒 -+ time.sleep(period_time) -+ -+ # 采集模块对接,获取周期数据 -+ curr_period_data = avg_get_io_data(io_dic) -+ -+ # 处理周期数据 -+ reach_size = False -+ for disk_name in disk_list: -+ for stage_name in stage_list: -+ for rw in iotype_list: -+ if disk_name in curr_period_data and stage_name in curr_period_data[disk_name] and rw in curr_period_data[disk_name][stage_name]: -+ io_key = (disk_name, stage_name, rw) -+ reach_size = update_avg_and_check_abnormal(curr_period_data, io_key, win_size, io_avg_value, io_data) -+ -+ # win_size不满时不进行告警判断 -+ if not reach_size: -+ continue -+ -+ # 判断异常窗口、异常场景 -+ for disk_name in disk_list: -+ for rw in iotype_list: -+ process_report_data(disk_name, rw, io_data) -+ -+ -+def main(): -+ """main func""" -+ # 注册停止信号-2/-15 -+ signal.signal(signal.SIGINT, sig_handler) -+ signal.signal(signal.SIGTERM, sig_handler) -+ -+ # 初始化配置读取 -+ config = configparser.ConfigParser(comment_prefixes=('#', ';')) -+ try: -+ config.read(CONFIG_FILE) -+ except configparser.Error: -+ report_alarm_fail("Failed to read config file") -+ -+ io_dic = {} -+ -+ # 读取配置文件 -- common段 -+ io_dic["period_time"], disk, stage, io_dic["iotype_list"] = read_config_common(config) -+ -+ # 采集模块对接,is_iocollect_valid() -+ io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage) -+ -+ if "bio" not in io_dic["stage_list"]: -+ report_alarm_fail("Cannot run avg_block_io without bio stage") -+ -+ # 初始化窗口 -- config读取,对应is_iocollect_valid返回的结果 -+ # step1. 解析公共配置 --- algorithm -+ io_dic["win_size"], io_dic["win_threshold"] = read_config_algorithm(config) -+ -+ # step2. 循环创建窗口 -+ common_param = read_config_lat_iodump(io_dic, config) -+ io_data, io_avg_value = init_io_win(io_dic, config, common_param) -+ -+ main_loop(io_dic, io_data, io_avg_value) -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -new file mode 100644 -index 0000000..caa0191 ---- /dev/null -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -0,0 +1,86 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+import json -+import logging -+import sys -+import time -+ -+from .utils import is_abnormal -+from sentryCollector.collect_plugin import is_iocollect_valid, get_io_data, Result_Messages -+from syssentry.result import ResultLevel, report_result -+ -+ -+TASK_NAME = "avg_block_io" -+ -+def sig_handler(signum, _f): -+ """stop avg_block_io""" -+ report_result(TASK_NAME, ResultLevel.PASS, json.dumps({})) -+ logging.info("Finished avg_block_io plugin running.") -+ sys.exit(0) -+ -+def avg_get_io_data(io_dic): -+ """get_io_data from sentryCollector""" -+ res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) -+ return check_result_validation(res, 'get io data') -+ -+ -+def avg_is_iocollect_valid(io_dic, config_disk, config_stage): -+ """is_iocollect_valid from sentryCollector""" -+ res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) -+ return check_result_validation(res, 'check config validation') -+ -+ -+def check_result_validation(res, reason): -+ """check validation of result from sentryCollector""" -+ if not 'ret' in res or not 'message' in res: -+ err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) -+ report_alarm_fail(err_msg) -+ if res['ret'] != 0: -+ err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) -+ report_alarm_fail(err_msg) -+ -+ try: -+ json_data = json.loads(res['message']) -+ except json.JSONDecodeError: -+ err_msg = "Failed to {}: invalid return message".format(reason) -+ report_alarm_fail(err_msg) -+ -+ return json_data -+ -+ -+def report_alarm_fail(alarm_info): -+ """report result to xalarmd""" -+ report_result(TASK_NAME, ResultLevel.FAIL, json.dumps({"msg": alarm_info})) -+ logging.error(alarm_info) -+ sys.exit(1) -+ -+ -+def process_report_data(disk_name, rw, io_data): -+ """check abnormal window and report to xalarm""" -+ if not is_abnormal((disk_name, 'bio', rw), io_data): -+ return -+ -+ ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] -+ for stage_name in ctrl_stage: -+ if is_abnormal((disk_name, stage_name, rw), io_data): -+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) -+ return -+ -+ if is_abnormal((disk_name, 'rq_driver', rw), io_data): -+ logging.warning("{} - {} - {} report driver".format(time.ctime(), disk_name, rw)) -+ return -+ -+ kernel_stage = ['gettag', 'plug', 'deadline', 'hctx', 'requeue'] -+ for stage_name in kernel_stage: -+ if is_abnormal((disk_name, stage_name, rw), io_data): -+ logging.warning("{} - {} - {} report kernel".format(time.ctime(), disk_name, rw)) -+ return -+ logging.warning("{} - {} - {} report IO press".format(time.ctime(), disk_name, rw)) -diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py -new file mode 100644 -index 0000000..9b0ce79 ---- /dev/null -+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py -@@ -0,0 +1,47 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+class AbnormalWindowBase: -+ def __init__(self, window_size=10, window_threshold=7): -+ self.window_size = window_size -+ self.window_threshold = window_threshold -+ self.abnormal_window = [False] * window_size -+ -+ def append_new_period(self, ab_res, avg_val=0): -+ self.abnormal_window.pop(0) -+ if self.is_abnormal_period(ab_res, avg_val): -+ self.abnormal_window.append(True) -+ else: -+ self.abnormal_window.append(False) -+ -+ def is_abnormal_window(self): -+ return sum(self.abnormal_window) > self.window_threshold -+ -+ -+class IoWindow(AbnormalWindowBase): -+ def __init__(self, window_size=10, window_threshold=7, abnormal_multiple=5, abnormal_multiple_lim=30, abnormal_time=40): -+ super().__init__(window_size, window_threshold) -+ self.abnormal_multiple = abnormal_multiple -+ self.abnormal_multiple_lim = abnormal_multiple_lim -+ self.abnormal_time = abnormal_time -+ -+ def is_abnormal_period(self, value, avg_val): -+ return (value > avg_val * self.abnormal_multiple and value > self.abnormal_multiple_lim) or \ -+ (value > self.abnormal_time) -+ -+ -+class IoDumpWindow(AbnormalWindowBase): -+ def __init__(self, window_size=10, window_threshold=7, abnormal_time=40): -+ super().__init__(window_size, window_threshold) -+ self.abnormal_time = abnormal_time -+ -+ def is_abnormal_period(self, value, avg_val=0): -+ return value > self.abnormal_time -diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py -new file mode 100644 -index 0000000..54ed080 ---- /dev/null -+++ b/src/python/sentryPlugins/avg_block_io/utils.py -@@ -0,0 +1,86 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+AVG_VALUE = 0 -+AVG_COUNT = 1 -+ -+ -+def get_nested_value(data, keys): -+ """get data from nested dict""" -+ for key in keys: -+ if key in data: -+ data = data[key] -+ else: -+ return None -+ return data -+ -+ -+def set_nested_value(data, keys, value): -+ """set data to nested dict""" -+ for key in keys[:-1]: -+ if key in data: -+ data = data[key] -+ else: -+ return False -+ data[keys[-1]] = value -+ return True -+ -+ -+def is_abnormal(io_key, io_data): -+ """check if latency and iodump win abnormal""" -+ for key in ['latency', 'iodump']: -+ all_keys = get_nested_value(io_data, io_key) -+ if all_keys and key in all_keys: -+ win = get_nested_value(io_data, io_key + (key,)) -+ if win and win.is_abnormal_window(): -+ return True -+ return False -+ -+ -+def update_io_avg(old_avg, period_value, win_size): -+ """update average of latency window""" -+ if old_avg[AVG_COUNT] < win_size: -+ new_avg_count = old_avg[AVG_COUNT] + 1 -+ new_avg_value = (old_avg[AVG_VALUE] * old_avg[AVG_COUNT] + period_value[0]) / new_avg_count -+ else: -+ new_avg_count = old_avg[AVG_COUNT] -+ new_avg_value = (old_avg[AVG_VALUE] * (old_avg[AVG_COUNT] - 1) + period_value[0]) / new_avg_count -+ return [new_avg_value, new_avg_count] -+ -+ -+def update_io_data(old_avg, period_value, win_size, io_data, io_key): -+ """update data of latency and iodump window""" -+ all_wins = get_nested_value(io_data, io_key) -+ if all_wins and "latency" in all_wins: -+ io_data[io_key[0]][io_key[1]][io_key[2]]["latency"].append_new_period(period_value[0], old_avg[AVG_VALUE]) -+ if all_wins and "iodump" in all_wins: -+ io_data[io_key[0]][io_key[1]][io_key[2]]["iodump"].append_new_period(period_value[1]) -+ -+ -+def update_avg_and_check_abnormal(data, io_key, win_size, io_avg_value, io_data): -+ """update avg and check abonrmal, return true if win_size full""" -+ period_value = get_nested_value(data, io_key) -+ old_avg = get_nested_value(io_avg_value, io_key) -+ -+ # 更新avg数据 -+ if old_avg[AVG_COUNT] < win_size: -+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) -+ return False -+ -+ # 更新win数据 -- 判断异常周期 -+ update_io_data(old_avg, period_value, win_size, io_data, io_key) -+ all_wins = get_nested_value(io_data, io_key) -+ if all_wins and 'latency' not in all_wins: -+ return True -+ period = get_nested_value(io_data, io_key + ("latency",)) -+ if period and period.is_abnormal_period(period_value[0], old_avg[AVG_VALUE]): -+ return True -+ set_nested_value(io_avg_value, io_key, update_io_avg(old_avg, period_value, win_size)) -+ return True --- -2.33.0 - diff --git a/fix-ai_block_io-root-cause-bug.patch b/fix-ai_block_io-root-cause-bug.patch deleted file mode 100644 index f6de7875a75938a0c752c75624ab1c4a79c5d60a..0000000000000000000000000000000000000000 --- a/fix-ai_block_io-root-cause-bug.patch +++ /dev/null @@ -1,33 +0,0 @@ -From ac9ce326dee20edde2451946e34ea9a13bd8c338 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Wed, 16 Oct 2024 11:50:46 +0800 -Subject: [PATCH] fix ai_block_io root cause bug - ---- - src/python/sentryPlugins/ai_block_io/detector.py | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index 5b21714..ed8b64a 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -101,12 +101,12 @@ class DiskDetector: - if len(diagnosis_info["bio"]) == 0: - return False, None, None, None - elif len(diagnosis_info["rq_driver"]) != 0: -- root_cause = "[Root Cause:disk slow]" -+ root_cause = "[Root Cause: disk slow]" - elif len(diagnosis_info["io_stage"]) != 0: -- stage = diagnosis_info["io_stage"][0][1].get_stage_name() -- root_cause = f"[Root Cause:io stage slow, stage: {stage}]" -+ stage = diagnosis_info["io_stage"][0][1].stage_name -+ root_cause = f"[Root Cause: io stage slow, stage: {stage}]" - if root_cause is None: -- root_cause = "[Root Cause:high io pressure]" -+ root_cause = "[Root Cause: high io pressure]" - return True, diagnosis_info["bio"][0][0], diagnosis_info["bio"][0][1], root_cause - - def __repr__(self): --- -2.23.0 - diff --git a/fix-ai_block_io-some-issues.patch b/fix-ai_block_io-some-issues.patch deleted file mode 100644 index d80cbe8266f3a78d0038b8cc689a4098f7412d9e..0000000000000000000000000000000000000000 --- a/fix-ai_block_io-some-issues.patch +++ /dev/null @@ -1,832 +0,0 @@ -From 35ba8fe8e241c5e3508c5dadc82a777065a5cc4d Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Mon, 30 Sep 2024 00:15:29 +0800 -Subject: [PATCH] fix ai_block_io some issues - ---- - ..._slow_io_detection.ini => ai_block_io.ini} | 6 +- - config/tasks/ai_block_io.mod | 5 + - .../tasks/ai_threshold_slow_io_detection.mod | 5 - - ...ow_io_detection.py => test_ai_block_io.py} | 0 - .../README.md | 0 - .../__init__.py | 0 - .../ai_block_io.py} | 57 ++-- - .../alarm_report.py | 2 +- - .../ai_block_io/config_parser.py | 256 ++++++++++++++++++ - .../data_access.py | 3 + - .../detector.py | 17 +- - .../io_data.py | 0 - .../sliding_window.py | 0 - .../threshold.py | 13 +- - .../utils.py | 15 +- - .../config_parser.py | 141 ---------- - src/python/setup.py | 2 +- - 17 files changed, 336 insertions(+), 186 deletions(-) - rename config/plugins/{ai_threshold_slow_io_detection.ini => ai_block_io.ini} (66%) - create mode 100644 config/tasks/ai_block_io.mod - delete mode 100644 config/tasks/ai_threshold_slow_io_detection.mod - rename selftest/test/{test_ai_threshold_slow_io_detection.py => test_ai_block_io.py} (100%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/README.md (100%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/__init__.py (100%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection/slow_io_detection.py => ai_block_io/ai_block_io.py} (66%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/alarm_report.py (98%) - create mode 100644 src/python/sentryPlugins/ai_block_io/config_parser.py - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/data_access.py (99%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/detector.py (77%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/io_data.py (100%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/sliding_window.py (100%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/threshold.py (92%) - rename src/python/sentryPlugins/{ai_threshold_slow_io_detection => ai_block_io}/utils.py (86%) - delete mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py - -diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_block_io.ini -similarity index 66% -rename from config/plugins/ai_threshold_slow_io_detection.ini -rename to config/plugins/ai_block_io.ini -index 44eb928..01ce266 100644 ---- a/config/plugins/ai_threshold_slow_io_detection.ini -+++ b/config/plugins/ai_block_io.ini -@@ -4,9 +4,9 @@ slow_io_detect_frequency=1 - log_level=info - - [algorithm] --train_data_duration=0.1 --train_update_duration=0.02 --algorithm_type=n_sigma -+train_data_duration=24 -+train_update_duration=2 -+algorithm_type=boxplot - boxplot_parameter=1.5 - n_sigma_parameter=3 - -diff --git a/config/tasks/ai_block_io.mod b/config/tasks/ai_block_io.mod -new file mode 100644 -index 0000000..1971d7d ---- /dev/null -+++ b/config/tasks/ai_block_io.mod -@@ -0,0 +1,5 @@ -+[common] -+enabled=yes -+task_start=/usr/bin/python3 /usr/bin/ai_block_io -+task_stop=pkill -f /usr/bin/ai_block_io -+type=oneshot -\ No newline at end of file -diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod -deleted file mode 100644 -index 2729f72..0000000 ---- a/config/tasks/ai_threshold_slow_io_detection.mod -+++ /dev/null -@@ -1,5 +0,0 @@ --[common] --enabled=yes --task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection --task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection --type=oneshot -\ No newline at end of file -diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_block_io.py -similarity index 100% -rename from selftest/test/test_ai_threshold_slow_io_detection.py -rename to selftest/test/test_ai_block_io.py -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_block_io/README.md -similarity index 100% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md -rename to src/python/sentryPlugins/ai_block_io/README.md -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_block_io/__init__.py -similarity index 100% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py -rename to src/python/sentryPlugins/ai_block_io/__init__.py -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -similarity index 66% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py -rename to src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 43cf770..31b8a97 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -23,7 +23,7 @@ from .data_access import get_io_data_from_collect_plug, check_collect_valid - from .io_data import MetricName - from .alarm_report import AlarmReport - --CONFIG_FILE = "/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini" -+CONFIG_FILE = "/etc/sysSentry/plugins/ai_block_io.ini" - - - def sig_handler(signum, frame): -@@ -40,34 +40,48 @@ class SlowIODetection: - - def __init__(self, config_parser: ConfigParser): - self._config_parser = config_parser -- self.__set_log_format() - self.__init_detector_name_list() - self.__init_detector() - -- def __set_log_format(self): -- log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -- log_level = get_log_level(self._config_parser.get_log_level()) -- logging.basicConfig(level=log_level, format=log_format) -- - def __init_detector_name_list(self): - self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) -- for disk in self._disk_list: -- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -+ disks_to_detection: list = self._config_parser.get_disks_to_detection() -+ # 情况1:None,则启用所有磁盘检测 -+ # 情况2:is not None and len = 0,则不启动任何磁盘检测 -+ # 情况3:len != 0,则取交集 -+ if disks_to_detection is None: -+ for disk in self._disk_list: -+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -+ elif len(disks_to_detection) == 0: -+ logging.warning('please attention: conf file not specify any disk to detection, ' -+ 'so it will not start ai block io.') -+ else: -+ disks_name_to_detection = [] -+ for disk_name_to_detection in disks_to_detection: -+ disks_name_to_detection.append(disk_name_to_detection.get_disk_name()) -+ disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection] -+ for disk in disk_intersection: -+ self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -+ self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -+ logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') - - def __init_detector(self): - train_data_duration, train_update_duration = (self._config_parser. - get_train_data_duration_and_train_update_duration()) - slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency() -- threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type()) -+ threshold_type = self._config_parser.get_algorithm_type() - data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration, - train_update_duration, - slow_io_detection_frequency) -- sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type()) -+ sliding_window_type = self._config_parser.get_sliding_window_type() - window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold() - - for detector_name in self._detector_name_list: -- threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size, -+ threshold = ThresholdFactory().get_threshold(threshold_type, -+ boxplot_parameter=self._config_parser.get_boxplot_parameter(), -+ n_sigma_paramter=self._config_parser.get_n_sigma_parameter(), -+ data_queue_size=data_queue_size, - data_queue_update_size=update_size) - sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size, - threshold=window_threshold) -@@ -89,6 +103,7 @@ class SlowIODetection: - logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}') - if io_data_dict_with_disk_name is None: - continue -+ - # Step2:慢IO检测 - logging.debug('step2. Start to detection slow io event.') - slow_io_event_list = [] -@@ -103,13 +118,14 @@ class SlowIODetection: - for slow_io_event in slow_io_event_list: - metric_name: MetricName = slow_io_event[0] - result = slow_io_event[1] -- AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event." -- f"stage: {metric_name.get_metric_name()}," -- f"type: {metric_name.get_io_access_type_name()}," -- f"metric: {metric_name.get_metric_name()}," -- f"current window: {result[1]}," -- f"threshold: {result[2]}") -- logging.error(f"slow io event happen: {str(slow_io_event)}") -+ alarm_content = (f"disk {metric_name.get_disk_name()} has slow io event. " -+ f"stage is: {metric_name.get_stage_name()}, " -+ f"io access type is: {metric_name.get_io_access_type_name()}, " -+ f"metric is: {metric_name.get_metric_name()}, " -+ f"current window is: {result[1]}, " -+ f"threshold is: {result[2]}") -+ AlarmReport.report_major_alm(alarm_content) -+ logging.warning(alarm_content) - - # Step4:等待检测时间 - logging.debug('step4. Wait to start next slow io event detection loop.') -@@ -120,6 +136,7 @@ def main(): - # Step1:注册消息处理函数 - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) -+ - # Step2:断点恢复 - # todo: - -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_block_io/alarm_report.py -similarity index 98% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py -rename to src/python/sentryPlugins/ai_block_io/alarm_report.py -index 3f4f34e..230c8cd 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py -+++ b/src/python/sentryPlugins/ai_block_io/alarm_report.py -@@ -15,7 +15,7 @@ import json - - - class AlarmReport: -- TASK_NAME = "SLOW_IO_DETECTION" -+ TASK_NAME = "ai_block_io" - - @staticmethod - def report_pass(info: str): -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -new file mode 100644 -index 0000000..632391d ---- /dev/null -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -0,0 +1,256 @@ -+# coding: utf-8 -+# Copyright (c) 2024 Huawei Technologies Co., Ltd. -+# sysSentry is licensed under the Mulan PSL v2. -+# You can use this software according to the terms and conditions of the Mulan PSL v2. -+# You may obtain a copy of Mulan PSL v2 at: -+# http://license.coscl.org.cn/MulanPSL2 -+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR -+# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR -+# PURPOSE. -+# See the Mulan PSL v2 for more details. -+ -+import configparser -+import json -+import logging -+ -+from .io_data import MetricName -+from .threshold import ThresholdType -+from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level -+ -+LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -+ -+ -+def init_log_format(log_level: str): -+ logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT) -+ -+ -+class ConfigParser: -+ DEFAULT_ABSOLUTE_THRESHOLD = 40 -+ DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 -+ DEFAULT_LOG_LEVEL = 'info' -+ -+ DEFAULT_ALGORITHM_TYPE = 'boxplot' -+ DEFAULT_TRAIN_DATA_DURATION = 24 -+ DEFAULT_TRAIN_UPDATE_DURATION = 2 -+ DEFAULT_BOXPLOT_PARAMETER = 1.5 -+ DEFAULT_N_SIGMA_PARAMETER = 3 -+ -+ DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' -+ DEFAULT_WINDOW_SIZE = 30 -+ DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 -+ -+ def __init__(self, config_file_name): -+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL -+ self.__disks_to_detection: list = [] -+ -+ self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE -+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -+ self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -+ -+ self.__sliding_window_type = ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE -+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -+ self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -+ -+ self.__config_file_name = config_file_name -+ -+ def __read_absolute_threshold(self, items_common: dict): -+ try: -+ self.__absolute_threshold = float(items_common.get('absolute_threshold', -+ ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) -+ if self.__absolute_threshold <= 0: -+ logging.warning( -+ f'the_absolute_threshold: {self.__absolute_threshold} you set is invalid, use default value: {ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD}.') -+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -+ except ValueError: -+ self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -+ logging.warning( -+ f'the_absolute_threshold type conversion has error, use default value: {self.__absolute_threshold}.') -+ -+ def __read__slow_io_detect_frequency(self, items_common: dict): -+ try: -+ self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', -+ ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) -+ if self.__slow_io_detect_frequency < 1 or self.__slow_io_detect_frequency > 10: -+ logging.warning( -+ f'the slow_io_detect_frequency: {self.__slow_io_detect_frequency} you set is invalid, use default value: {ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY}.') -+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ except ValueError: -+ self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -+ logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') -+ -+ def __read__disks_to_detect(self, items_common: dict): -+ disks_to_detection = items_common.get('disks_to_detect') -+ if disks_to_detection is None: -+ logging.warning(f'config of disks_to_detect not found, the default value be used.') -+ self.__disks_to_detection = None -+ return -+ try: -+ disks_to_detection_list = json.loads(disks_to_detection) -+ for disk_to_detection in disks_to_detection_list: -+ disk_name = disk_to_detection.get('disk_name', None) -+ stage_name = disk_to_detection.get('stage_name', None) -+ io_access_type_name = disk_to_detection.get('io_access_type_name', None) -+ metric_name = disk_to_detection.get('metric_name', None) -+ if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None): -+ metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name) -+ self.__disks_to_detection.append(metric_name_object) -+ else: -+ logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.') -+ except json.decoder.JSONDecodeError as e: -+ logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.') -+ self.__disks_to_detection = None -+ -+ def __read__train_data_duration(self, items_algorithm: dict): -+ try: -+ self.__train_data_duration = float(items_algorithm.get('train_data_duration', -+ ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) -+ if self.__train_data_duration <= 0 or self.__train_data_duration > 720: -+ logging.warning( -+ f'the train_data_duration: {self.__train_data_duration} you set is invalid, use default value: {ConfigParser.DEFAULT_TRAIN_DATA_DURATION}.') -+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -+ except ValueError: -+ self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -+ logging.warning(f'the train_data_duration type conversion has error, use default value: {self.__train_data_duration}.') -+ -+ def __read__train_update_duration(self, items_algorithm: dict): -+ default_train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -+ if default_train_update_duration > self.__train_data_duration: -+ default_train_update_duration = self.__train_data_duration / 2 -+ -+ try: -+ self.__train_update_duration = float(items_algorithm.get('train_update_duration', -+ ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) -+ if self.__train_update_duration <= 0 or self.__train_update_duration > self.__train_data_duration: -+ logging.warning( -+ f'the train_update_duration: {self.__train_update_duration} you set is invalid, use default value: {default_train_update_duration}.') -+ self.__train_update_duration = default_train_update_duration -+ except ValueError: -+ self.__train_update_duration = default_train_update_duration -+ logging.warning(f'the train_update_duration type conversion has error, use default value: {self.__train_update_duration}.') -+ -+ def __read__algorithm_type_and_parameter(self, items_algorithm: dict): -+ algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) -+ self.__algorithm_type = get_threshold_type_enum(algorithm_type) -+ -+ if self.__algorithm_type == ThresholdType.NSigmaThreshold: -+ try: -+ self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', -+ ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) -+ if self.__n_sigma_parameter <= 0 or self.__n_sigma_parameter > 10: -+ logging.warning( -+ f'the n_sigma_parameter: {self.__n_sigma_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_N_SIGMA_PARAMETER}.') -+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -+ except ValueError: -+ self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -+ logging.warning(f'the n_sigma_parameter type conversion has error, use default value: {self.__n_sigma_parameter}.') -+ elif self.__algorithm_type == ThresholdType.BoxplotThreshold: -+ try: -+ self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', -+ ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) -+ if self.__boxplot_parameter <= 0 or self.__boxplot_parameter > 10: -+ logging.warning( -+ f'the boxplot_parameter: {self.__boxplot_parameter} you set is invalid, use default value: {ConfigParser.DEFAULT_BOXPLOT_PARAMETER}.') -+ self.__n_sigma_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -+ except ValueError: -+ self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -+ logging.warning(f'the boxplot_parameter type conversion has error, use default value: {self.__boxplot_parameter}.') -+ -+ def __read__window_size(self, items_sliding_window: dict): -+ try: -+ self.__window_size = int(items_sliding_window.get('window_size', -+ ConfigParser.DEFAULT_WINDOW_SIZE)) -+ if self.__window_size < 1 or self.__window_size > 3600: -+ logging.warning( -+ f'the window_size: {self.__window_size} you set is invalid, use default value: {ConfigParser.DEFAULT_WINDOW_SIZE}.') -+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -+ except ValueError: -+ self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -+ logging.warning(f'window_size type conversion has error, use default value: {self.__window_size}.') -+ -+ def __read__window_minimum_threshold(self, items_sliding_window: dict): -+ default_window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -+ if default_window_minimum_threshold > self.__window_size: -+ default_window_minimum_threshold = self.__window_size / 2 -+ try: -+ self.__window_minimum_threshold = ( -+ int(items_sliding_window.get('window_minimum_threshold', -+ ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) -+ if self.__window_minimum_threshold < 1 or self.__window_minimum_threshold > self.__window_size: -+ logging.warning( -+ f'the window_minimum_threshold: {self.__window_minimum_threshold} you set is invalid, use default value: {default_window_minimum_threshold}.') -+ self.__window_minimum_threshold = default_window_minimum_threshold -+ except ValueError: -+ self.__window_minimum_threshold = default_window_minimum_threshold -+ logging.warning(f'window_minimum_threshold type conversion has error, use default value: {self.__window_minimum_threshold}.') -+ -+ def read_config_from_file(self): -+ con = configparser.ConfigParser() -+ con.read(self.__config_file_name, encoding='utf-8') -+ -+ if con.has_section('common'): -+ items_common = dict(con.items('common')) -+ self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) -+ init_log_format(self.__log_level) -+ self.__read_absolute_threshold(items_common) -+ self.__read__slow_io_detect_frequency(items_common) -+ self.__read__disks_to_detect(items_common) -+ else: -+ init_log_format(self.__log_level) -+ logging.warning("common section parameter not found, it will be set to default value.") -+ -+ if con.has_section('algorithm'): -+ items_algorithm = dict(con.items('algorithm')) -+ self.__read__train_data_duration(items_algorithm) -+ self.__read__train_update_duration(items_algorithm) -+ self.__read__algorithm_type_and_parameter(items_algorithm) -+ else: -+ logging.warning("algorithm section parameter not found, it will be set to default value.") -+ -+ if con.has_section('sliding_window'): -+ items_sliding_window = dict(con.items('sliding_window')) -+ sliding_window_type = items_sliding_window.get('sliding_window_type', -+ ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) -+ self.__sliding_window_type = get_sliding_window_type_enum(sliding_window_type) -+ self.__read__window_size(items_sliding_window) -+ self.__read__window_minimum_threshold(items_sliding_window) -+ else: -+ logging.warning("sliding_window section parameter not found, it will be set to default value.") -+ -+ self.__print_all_config_value() -+ -+ def __print_all_config_value(self): -+ pass -+ -+ def get_slow_io_detect_frequency(self): -+ return self.__slow_io_detect_frequency -+ -+ def get_algorithm_type(self): -+ return self.__algorithm_type -+ -+ def get_sliding_window_type(self): -+ return self.__sliding_window_type -+ -+ def get_train_data_duration_and_train_update_duration(self): -+ return self.__train_data_duration, self.__train_update_duration -+ -+ def get_window_size_and_window_minimum_threshold(self): -+ return self.__window_size, self.__window_minimum_threshold -+ -+ def get_absolute_threshold(self): -+ return self.__absolute_threshold -+ -+ def get_log_level(self): -+ return self.__log_level -+ -+ def get_disks_to_detection(self): -+ return self.__disks_to_detection -+ -+ def get_boxplot_parameter(self): -+ return self.__boxplot_parameter -+ -+ def get_n_sigma_parameter(self): -+ return self.__n_sigma_parameter -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -similarity index 99% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py -rename to src/python/sentryPlugins/ai_block_io/data_access.py -index d9f3460..01c5315 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -17,6 +17,8 @@ from sentryCollector.collect_plugin import ( - get_io_data, - is_iocollect_valid, - ) -+ -+ - from .io_data import IOStageData, IOData - - COLLECT_STAGES = [ -@@ -32,6 +34,7 @@ COLLECT_STAGES = [ - "iocost", - ] - -+ - def check_collect_valid(period): - data_raw = is_iocollect_valid(period) - if data_raw["ret"] == 0: -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -similarity index 77% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py -rename to src/python/sentryPlugins/ai_block_io/detector.py -index eda9825..bcf62cb 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -26,19 +26,26 @@ class Detector: - self._threshold = threshold - self._slidingWindow = sliding_window - self._threshold.attach_observer(self._slidingWindow) -+ self._count = 0 - - def get_metric_name(self): - return self._metric_name - - def is_slow_io_event(self, io_data_dict_with_disk_name: dict): -- logging.debug(f'Enter Detector: {self}') -+ self._count += 1 -+ if self._count % 15 == 0: -+ self._count = 0 -+ logging.info(f"({self._metric_name}) 's latest threshold is: {self._threshold.get_threshold()}.") -+ logging.debug(f'enter Detector: {self}') - metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name) -- if metric_value > 1e-6: -- logging.debug(f'Input metric value: {str(metric_value)}') -- self._threshold.push_latest_data_to_queue(metric_value) -+ if metric_value is None: -+ logging.debug('not found metric value, so return None.') -+ return False, None, None -+ logging.debug(f'input metric value: {str(metric_value)}') -+ self._threshold.push_latest_data_to_queue(metric_value) - detection_result = self._slidingWindow.is_slow_io_event(metric_value) - logging.debug(f'Detection result: {str(detection_result)}') -- logging.debug(f'Exit Detector: {self}') -+ logging.debug(f'exit Detector: {self}') - return detection_result - - def __repr__(self): -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py b/src/python/sentryPlugins/ai_block_io/io_data.py -similarity index 100% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py -rename to src/python/sentryPlugins/ai_block_io/io_data.py -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -similarity index 100% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py -rename to src/python/sentryPlugins/ai_block_io/sliding_window.py -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py -similarity index 92% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py -rename to src/python/sentryPlugins/ai_block_io/threshold.py -index 9e1ca7b..ff85d85 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py -+++ b/src/python/sentryPlugins/ai_block_io/threshold.py -@@ -79,9 +79,9 @@ class AbsoluteThreshold(Threshold): - - - class BoxplotThreshold(Threshold): -- def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ def __init__(self, boxplot_parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): - super().__init__(data_queue_size, data_queue_update_size) -- self.parameter = parameter -+ self.parameter = boxplot_parameter - - def _update_threshold(self): - data = list(self.data_queue.queue) -@@ -94,6 +94,8 @@ class BoxplotThreshold(Threshold): - self.notify_observer() - - def push_latest_data_to_queue(self, data): -+ if data < 1e-6: -+ return - try: - self.data_queue.put(data, block=False) - except queue.Full: -@@ -111,9 +113,9 @@ class BoxplotThreshold(Threshold): - - - class NSigmaThreshold(Threshold): -- def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000): -+ def __init__(self, n_sigma_parameter: float = 3.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000, **kwargs): - super().__init__(data_queue_size, data_queue_update_size) -- self.parameter = parameter -+ self.parameter = n_sigma_parameter - - def _update_threshold(self): - data = list(self.data_queue.queue) -@@ -125,6 +127,8 @@ class NSigmaThreshold(Threshold): - self.notify_observer() - - def push_latest_data_to_queue(self, data): -+ if data < 1e-6: -+ return - try: - self.data_queue.put(data, block=False) - except queue.Full: -@@ -157,4 +161,3 @@ class ThresholdFactory: - return NSigmaThreshold(*args, **kwargs) - else: - raise ValueError(f"Invalid threshold type: {threshold_type}") -- -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py b/src/python/sentryPlugins/ai_block_io/utils.py -similarity index 86% -rename from src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py -rename to src/python/sentryPlugins/ai_block_io/utils.py -index f66e5ed..8dbba06 100644 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py -+++ b/src/python/sentryPlugins/ai_block_io/utils.py -@@ -8,13 +8,16 @@ - # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR - # PURPOSE. - # See the Mulan PSL v2 for more details. -+ - import logging - from dataclasses import asdict - -+ - from .threshold import ThresholdType - from .sliding_window import SlidingWindowType - from .io_data import MetricName, IOData - -+ - def get_threshold_type_enum(algorithm_type: str): - if algorithm_type.lower() == 'absolute': - return ThresholdType.AbsoluteThreshold -@@ -22,7 +25,7 @@ def get_threshold_type_enum(algorithm_type: str): - return ThresholdType.BoxplotThreshold - if algorithm_type.lower() == 'n_sigma': - return ThresholdType.NSigmaThreshold -- logging.info('not found correct algorithm type, use default: boxplot.') -+ logging.warning(f"the algorithm type: {algorithm_type} you set is invalid, use default value: boxplot") - return ThresholdType.BoxplotThreshold - - -@@ -33,7 +36,7 @@ def get_sliding_window_type_enum(sliding_window_type: str): - return SlidingWindowType.ContinuousSlidingWindow - if sliding_window_type.lower() == 'median': - return SlidingWindowType.MedianSlidingWindow -- logging.info('not found correct sliding window type, use default: not_continuous.') -+ logging.warning(f"the sliding window type: {sliding_window_type} you set is invalid, use default value: not_continuous") - return SlidingWindowType.NotContinuousSlidingWindow - - -@@ -62,6 +65,8 @@ def get_log_level(log_level: str): - return logging.INFO - elif log_level.lower() == 'warning': - return logging.WARNING -- elif log_level.lower() == 'fatal': -- return logging.FATAL -- return None -+ elif log_level.lower() == 'error': -+ return logging.ERROR -+ elif log_level.lower() == 'critical': -+ return logging.CRITICAL -+ return logging.INFO -diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py -deleted file mode 100644 -index cd4e6f1..0000000 ---- a/src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py -+++ /dev/null -@@ -1,141 +0,0 @@ --# coding: utf-8 --# Copyright (c) 2024 Huawei Technologies Co., Ltd. --# sysSentry is licensed under the Mulan PSL v2. --# You can use this software according to the terms and conditions of the Mulan PSL v2. --# You may obtain a copy of Mulan PSL v2 at: --# http://license.coscl.org.cn/MulanPSL2 --# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR --# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR --# PURPOSE. --# See the Mulan PSL v2 for more details. -- --import configparser --import logging -- -- --class ConfigParser: -- -- DEFAULT_ABSOLUTE_THRESHOLD = 40 -- DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1 -- DEFAULT_LOG_LEVEL = 'info' -- DEFAULT_TRAIN_DATA_DURATION = 24 -- DEFAULT_TRAIN_UPDATE_DURATION = 2 -- DEFAULT_ALGORITHM_TYPE = 'boxplot' -- DEFAULT_N_SIGMA_PARAMETER = 3 -- DEFAULT_BOXPLOT_PARAMETER = 1.5 -- DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous' -- DEFAULT_WINDOW_SIZE = 30 -- DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6 -- -- def __init__(self, config_file_name): -- self.__boxplot_parameter = None -- self.__window_minimum_threshold = None -- self.__window_size = None -- self.__sliding_window_type = None -- self.__n_sigma_parameter = None -- self.__algorithm_type = None -- self.__train_update_duration = None -- self.__log_level = None -- self.__slow_io_detect_frequency = None -- self.__absolute_threshold = None -- self.__train_data_duration = None -- self.__config_file_name = config_file_name -- -- def read_config_from_file(self): -- -- con = configparser.ConfigParser() -- con.read(self.__config_file_name, encoding='utf-8') -- -- items_common = dict(con.items('common')) -- items_algorithm = dict(con.items('algorithm')) -- items_sliding_window = dict(con.items('sliding_window')) -- -- try: -- self.__absolute_threshold = int(items_common.get('absolute_threshold', -- ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD)) -- except ValueError: -- self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD -- logging.warning('absolute threshold type conversion has error, use default value.') -- -- try: -- self.__slow_io_detect_frequency = int(items_common.get('slow_io_detect_frequency', -- ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)) -- except ValueError: -- self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY -- logging.warning('slow_io_detect_frequency type conversion has error, use default value.') -- -- self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL) -- -- try: -- self.__train_data_duration = float(items_algorithm.get('train_data_duration', -- ConfigParser.DEFAULT_TRAIN_DATA_DURATION)) -- except ValueError: -- self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_DATA_DURATION -- logging.warning('train_data_duration type conversion has error, use default value.') -- -- try: -- self.__train_update_duration = float(items_algorithm.get('train_update_duration', -- ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)) -- except ValueError: -- self.__train_update_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -- logging.warning('train_update_duration type conversion has error, use default value.') -- -- try: -- self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE) -- except ValueError: -- self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE -- logging.warning('algorithmType type conversion has error, use default value.') -- -- if self.__algorithm_type == 'n_sigma': -- try: -- self.__n_sigma_parameter = float(items_algorithm.get('n_sigma_parameter', -- ConfigParser.DEFAULT_N_SIGMA_PARAMETER)) -- except ValueError: -- self.__n_sigma_parameter = ConfigParser.DEFAULT_N_SIGMA_PARAMETER -- logging.warning('n_sigma_parameter type conversion has error, use default value.') -- elif self.__algorithm_type == 'boxplot': -- try: -- self.__boxplot_parameter = float(items_algorithm.get('boxplot_parameter', -- ConfigParser.DEFAULT_BOXPLOT_PARAMETER)) -- except ValueError: -- self.__boxplot_parameter = ConfigParser.DEFAULT_BOXPLOT_PARAMETER -- logging.warning('boxplot_parameter type conversion has error, use default value.') -- -- self.__sliding_window_type = items_sliding_window.get('sliding_window_type', -- ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE) -- -- try: -- self.__window_size = int(items_sliding_window.get('window_size', -- ConfigParser.DEFAULT_WINDOW_SIZE)) -- except ValueError: -- self.__window_size = ConfigParser.DEFAULT_WINDOW_SIZE -- logging.warning('window_size type conversion has error, use default value.') -- -- try: -- self.__window_minimum_threshold = ( -- int(items_sliding_window.get('window_minimum_threshold', -- ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD))) -- except ValueError: -- self.__window_minimum_threshold = ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD -- logging.warning('window_minimum_threshold type conversion has error, use default value.') -- -- def get_slow_io_detect_frequency(self): -- return self.__slow_io_detect_frequency -- -- def get_algorithm_type(self): -- return self.__algorithm_type -- -- def get_sliding_window_type(self): -- return self.__sliding_window_type -- -- def get_train_data_duration_and_train_update_duration(self): -- return self.__train_data_duration, self.__train_update_duration -- -- def get_window_size_and_window_minimum_threshold(self): -- return self.__window_size, self.__window_minimum_threshold -- -- def get_absolute_threshold(self): -- return self.__absolute_threshold -- -- def get_log_level(self): -- return self.__log_level -diff --git a/src/python/setup.py b/src/python/setup.py -index dac6481..9e26a10 100644 ---- a/src/python/setup.py -+++ b/src/python/setup.py -@@ -34,7 +34,7 @@ setup( - 'xalarmd=xalarm.xalarm_daemon:alarm_process_create', - 'sentryCollector=sentryCollector.collectd:main', - 'avg_block_io=sentryPlugins.avg_block_io.avg_block_io:main', -- 'ai_threshold_slow_io_detection=sentryPlugins.ai_threshold_slow_io_detection.slow_io_detection:main' -+ 'ai_block_io=sentryPlugins.ai_block_io.ai_block_io:main' - ] - }, - ) --- -2.23.0 - diff --git a/fix-alarm_info-newline-break-error.patch b/fix-alarm_info-newline-break-error.patch deleted file mode 100644 index ba3d3b193db87320d2ca11dfe3db3569a29b1d11..0000000000000000000000000000000000000000 --- a/fix-alarm_info-newline-break-error.patch +++ /dev/null @@ -1,48 +0,0 @@ -From fe1bb401c1f77860616e74c1dbf5fe6aa862b17d Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Sat, 26 Oct 2024 07:18:16 +0000 -Subject: [PATCH] fix alarm_info newline break error - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 23 +++++++++++++++++++ - 1 file changed, 23 insertions(+) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index 2575307..b35a126 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -180,7 +180,30 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - if 'details' in alarm_info: - alarm_info.pop('details', None) - alarm.pop('msg1', None) -+ -+ # dump each {key,value} of details in one line -+ if 'details' in alarm_info and isinstance(alarm_info['details'], dict): -+ for key in alarm_info['details']: -+ alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None) -+ - alarm['alarm_info'] = alarm_info -+ alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name] -+ -+ alarm_level_mapping = { -+ 1: 'MINOR_ALM', -+ 2: 'MAJOR_ALM', -+ 3: 'CRITICAL_ALM' -+ } -+ -+ alarm_type_mapping = { -+ 1: 'ALARM_TYPE_OCCUR', -+ 2: 'ALARM_TYPE_RECOVER' -+ } -+ -+ for alarm in alarm_list: -+ alarm['alarm_level'] = alarm_level_mapping.get(alarm['alarm_level'], 'UNKNOWN_LEVEL') -+ alarm['alarm_type'] = alarm_type_mapping.get(alarm['alarm_type'], 'UNKNOWN_TYPE') - return alarm_list -+ - finally: - alarm_list_lock.release() --- -2.27.0 - diff --git a/fix-bug-step-2-about-collect-module-and-avg-block-io.patch b/fix-bug-step-2-about-collect-module-and-avg-block-io.patch deleted file mode 100644 index 6b80cb96da09f135762346c5f4b101cf11445bfa..0000000000000000000000000000000000000000 --- a/fix-bug-step-2-about-collect-module-and-avg-block-io.patch +++ /dev/null @@ -1,323 +0,0 @@ -From e6eb39799b3ca15fb385c572863417ea26bdfa66 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Wed, 25 Sep 2024 11:03:29 +0800 -Subject: [PATCH] fix-bug-step-2-about-collect-module-and-avg-block-io - ---- - src/python/sentryCollector/collect_config.py | 11 ++- - src/python/sentryCollector/collect_io.py | 25 ++--- - src/python/sentryCollector/collect_plugin.py | 6 +- - src/python/sentryCollector/collect_server.py | 1 - - src/python/sentryCollector/collectd.py | 4 +- - .../avg_block_io/avg_block_io.py | 92 ++++++++++++++----- - 6 files changed, 96 insertions(+), 43 deletions(-) - -diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py -index b6cc75c..0fdd9f0 100644 ---- a/src/python/sentryCollector/collect_config.py -+++ b/src/python/sentryCollector/collect_config.py -@@ -49,14 +49,14 @@ class CollectConfig: - self.config = configparser.ConfigParser() - self.config.read(self.filename) - except configparser.Error: -- logging.error("collectd configure file read failed") -+ logging.error("collect configure file read failed") - return - - try: - common_config = self.config[CONF_COMMON] -- modules_str = common_config[CONF_MODULES] -+ modules_str = common_config[CONF_MODULES].lower() - # remove space -- modules_list = modules_str.replace(" ", "").split(',') -+ modules_list = set(modules_str.replace(" ", "").split(',')) - except KeyError as e: - logging.error("read config data failed, %s", e) - return -@@ -98,7 +98,7 @@ class CollectConfig: - CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) - result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT - # disk -- disk = io_map_value.get(CONF_IO_DISK) -+ disk = io_map_value.get(CONF_IO_DISK).lower() - if disk: - disk_str = disk.replace(" ", "") - pattern = r'^[a-zA-Z0-9-_,]+$' -@@ -106,12 +106,13 @@ class CollectConfig: - logging.warning("module_name = %s section, field = %s is incorrect, use default %s", - CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) - disk_str = CONF_IO_DISK_DEFAULT -+ disk_str = ",".join(set(disk_str.split(','))) - result_io_config[CONF_IO_DISK] = disk_str - else: - logging.warning("module_name = %s section, field = %s is incorrect, use default %s", - CONF_IO, CONF_IO_DISK, CONF_IO_DISK_DEFAULT) - result_io_config[CONF_IO_DISK] = CONF_IO_DISK_DEFAULT -- logging.info("config get_io_config: %s", result_io_config) -+ logging.debug("config get_io_config: %s", result_io_config) - return result_io_config - - def get_common_config(self): -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index 104b734..9c8dae7 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -177,10 +177,8 @@ class CollectIo(): - - def is_kernel_avaliable(self): - base_path = '/sys/kernel/debug/block' -+ all_disk = [] - for disk_name in os.listdir(base_path): -- if not self.loop_all and disk_name not in self.disk_list: -- continue -- - disk_path = os.path.join(base_path, disk_name) - blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') - -@@ -190,12 +188,18 @@ class CollectIo(): - - for file_name in os.listdir(blk_io_hierarchy_path): - file_path = os.path.join(blk_io_hierarchy_path, file_name) -- - if file_name == 'stats': -- stage_list = self.extract_first_column(file_path) -- self.disk_map_stage[disk_name] = stage_list -- self.window_value[disk_name] = {} -- IO_GLOBAL_DATA[disk_name] = {} -+ all_disk.append(disk_name) -+ -+ for disk_name in self.disk_list: -+ if not self.loop_all and disk_name not in all_disk: -+ logging.warning("the %s disk not exist!", disk_name) -+ continue -+ stats_file = '/sys/kernel/debug/block/{}/blk_io_hierarchy/stats'.format(disk_name) -+ stage_list = self.extract_first_column(stats_file) -+ self.disk_map_stage[disk_name] = stage_list -+ self.window_value[disk_name] = {} -+ IO_GLOBAL_DATA[disk_name] = {} - - return len(IO_GLOBAL_DATA) != 0 - -@@ -203,7 +207,7 @@ class CollectIo(): - logging.info("collect io thread start") - - if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: -- logging.warning("no disks meet the requirements. collect io thread exits") -+ logging.warning("no disks meet the requirements. collect io thread exit") - return - - for disk_name, stage_list in self.disk_map_stage.items(): -@@ -239,5 +243,4 @@ class CollectIo(): - - # set stop event, notify thread exit - def stop_thread(self): -- logging.debug("collect io thread is preparing to exit") -- self.stop_event.set() -+ self.stop_event.set() -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 9132473..1faa5e3 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -10,7 +10,7 @@ - # See the Mulan PSL v2 for more details. - - """ --collcet plugin -+collect plugin - """ - import json - import socket -@@ -75,7 +75,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): - try: - client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - except socket.error: -- print("collect_plugin: client creat socket error") -+ print("collect_plugin: client create socket error") - return None - - try: -@@ -128,7 +128,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): - def validate_parameters(param, len_limit, char_limit): - ret = ResultMessage.RESULT_SUCCEED - if not param: -- print("parm is invalid") -+ print("param is invalid") - ret = ResultMessage.RESULT_NOT_PARAM - return [False, ret] - -diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py -index bab4e56..11d1af0 100644 ---- a/src/python/sentryCollector/collect_server.py -+++ b/src/python/sentryCollector/collect_server.py -@@ -281,5 +281,4 @@ class CollectServer(): - pass - - def stop_thread(self): -- logging.debug("collect listen thread is preparing to exit") - self.stop_event.set() -diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py -index 3a836df..d9d8862 100644 ---- a/src/python/sentryCollector/collectd.py -+++ b/src/python/sentryCollector/collectd.py -@@ -79,7 +79,7 @@ def main(): - for info in module_list: - class_name = Module_Map_Class.get(info) - if not class_name: -- logging.info("%s correspond to class is not exists", info) -+ logging.info("%s correspond to class is not exist", info) - continue - cn = class_name(module_config) - collect_thread = threading.Thread(target=cn.main_loop) -@@ -94,4 +94,4 @@ def main(): - finally: - pass - -- logging.info("All threads have finished. Main thread is exiting.") -\ No newline at end of file -+ logging.info("all threads have finished. main thread exit.") -\ No newline at end of file -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index 73f0b22..ac35be2 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -28,33 +28,53 @@ def log_invalid_keys(not_in_list, keys_name, config_list, default_list): - - - def read_config_common(config): -- """read config file, get [common] section value""" -- try: -- common_sec = config['common'] -- except configparser.NoSectionError: -+ """read config file, get [common] section value""" -+ if not config.has_section("common"): - report_alarm_fail("Cannot find common section in config file") - - try: -- period_time = int(common_sec.get("period_time", 1)) -- if not (1 <= period_time <= 300): -- raise ValueError("Invalid period_time") -- except ValueError: -- period_time = 1 -- logging.warning("Invalid period_time, set to 1s") -+ disk_name = config.get("common", "disk") -+ disk = [] if disk_name == "default" else disk_name.split(",") -+ except configparser.NoOptionError: -+ disk = [] -+ logging.warning("Unset disk, set to default") - -- disk = common_sec.get('disk').split(",") if common_sec.get('disk') not in [None, 'default'] else [] -- stage = common_sec.get('stage').split(",") if common_sec.get('stage') not in [None, 'default'] else [] -+ try: -+ stage_name = config.get("common", "stage") -+ stage = [] if stage_name == "default" else stage_name.split(",") -+ except configparser.NoOptionError: -+ stage = [] -+ logging.warning("Unset stage, set to read,write") - - if len(disk) > 10: - logging.warning("Too many disks, record only max 10 disks") - disk = disk[:10] - -- iotype = common_sec.get('iotype', 'read,write').split(",") -- iotype_list = [rw.lower() for rw in iotype if rw.lower() in ['read', 'write', 'flush', 'discard']] -- err_iotype = [rw for rw in iotype if rw.lower() not in ['read', 'write', 'flush', 'discard']] -+ try: -+ iotype_name = config.get("common", "iotype").split(",") -+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write', 'flush', 'discard']] -+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write', 'flush', 'discard']] -+ -+ if iotype_list in [None, []]: -+ iotype_list = ["read", "write"] -+ except configparser.NoOptionError: -+ iotype = ["read", "write"] -+ logging.warning("Unset iotype, set to default") - - if err_iotype: - logging.warning("{} in common.iotype are not valid, set iotype={}".format(err_iotype, iotype_list)) -+ -+ -+ try: -+ period_time = int(config.get("common", "period_time")) -+ if not (1 <= period_time <= 300): -+ raise ValueError("Invalid period_time") -+ except ValueError: -+ period_time = 1 -+ logging.warning("Invalid period_time, set to 1s") -+ except configparser.NoOptionError: -+ period_time = 1 -+ logging.warning("Unset period_time, use 1s as default") - - return period_time, disk, stage, iotype_list - -@@ -68,11 +88,23 @@ def read_config_algorithm(config): - win_size = int(config.get("algorithm", "win_size")) - if not (1 <= win_size <= 300): - raise ValueError("Invalid win_size") -+ except ValueError: -+ win_size = 30 -+ logging.warning("Invalid win_size, set to 30") -+ except configparser.NoOptionError: -+ win_size = 30 -+ logging.warning("Unset win_size, use 30 as default") -+ -+ try: - win_threshold = int(config.get("algorithm", "win_threshold")) - if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: - raise ValueError("Invalid win_threshold") - except ValueError: -- report_alarm_fail("Invalid win_threshold or win_size") -+ win_threshold = 6 -+ logging.warning("Invalid win_threshold, set to 6") -+ except configparser.NoOptionError: -+ win_threshold = 6 -+ logging.warning("Unset win_threshold, use 6 as default") - - return win_size, win_threshold - -@@ -80,6 +112,21 @@ def read_config_algorithm(config): - def read_config_lat_iodump(io_dic, config): - """read config file, get [latency] [iodump] section value""" - common_param = {} -+ lat_sec = None -+ if not config.has_section("latency"): -+ logging.warning("Cannot find algorithm section in config file") -+ else: -+ lat_sec = config["latency"] -+ -+ iodump_sec = None -+ if not config.has_section("iodump"): -+ logging.warning("Cannot find iodump section in config file") -+ else: -+ lat_sec = config["iodump"] -+ -+ if not lat_sec and not iodump_sec: -+ return common_param -+ - for io_type in io_dic["iotype_list"]: - common_param[io_type] = {} - -@@ -90,13 +137,16 @@ def read_config_lat_iodump(io_dic, config): - } - iodump_key = "{}_iodump_lim".format(io_type) - -+ if iodump_sec and iodump_key in iodump_sec and iodump_sec[iodump_key].isdecimal(): -+ common_param[io_type][iodump_key] = int(iodump_sec[iodump_key]) -+ -+ if not lat_sec: -+ continue -+ - for key_suffix, key_template in latency_keys.items(): -- if key_template in config["latency"] and config["latency"][key_template].isdecimal(): -- common_param[io_type][key_template] = int(config["latency"][key_template]) -+ if key_template in lat_sec and lat_sec[key_template].isdecimal(): -+ common_param[io_type][key_template] = int(lat_sec[key_template]) - -- if iodump_key in config["iodump"] and config["iodump"][iodump_key].isdecimal(): -- common_param[io_type][iodump_key] = int(config["iodump"][iodump_key]) -- - return common_param - - --- -2.33.0 - diff --git a/fix-config-relative-some-issues.patch b/fix-config-relative-some-issues.patch deleted file mode 100644 index dbc0815ad18b26ba7e00efcd17933fbcd27fb90c..0000000000000000000000000000000000000000 --- a/fix-config-relative-some-issues.patch +++ /dev/null @@ -1,243 +0,0 @@ -From c9f62e01f09a56743ccc3e470f273875ab22ac5f Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Wed, 9 Oct 2024 16:19:52 +0800 -Subject: [PATCH] fix config relative some issues - ---- - .../sentryPlugins/ai_block_io/README.md | 1 - - .../sentryPlugins/ai_block_io/ai_block_io.py | 21 +++++----- - .../ai_block_io/config_parser.py | 42 +++++++++---------- - .../sentryPlugins/ai_block_io/detector.py | 2 +- - .../ai_block_io/sliding_window.py | 8 ++-- - .../sentryPlugins/ai_block_io/threshold.py | 6 +-- - 6 files changed, 39 insertions(+), 41 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/README.md b/src/python/sentryPlugins/ai_block_io/README.md -index f9b8388..95c1111 100644 ---- a/src/python/sentryPlugins/ai_block_io/README.md -+++ b/src/python/sentryPlugins/ai_block_io/README.md -@@ -1,2 +1 @@ - # slow_io_detection -- -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 31b8a97..3b00ef3 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -16,8 +16,7 @@ import logging - from .detector import Detector - from .threshold import ThresholdFactory, AbsoluteThreshold - from .sliding_window import SlidingWindowFactory --from .utils import (get_threshold_type_enum, get_sliding_window_type_enum, get_data_queue_size_and_update_size, -- get_log_level) -+from .utils import get_data_queue_size_and_update_size - from .config_parser import ConfigParser - from .data_access import get_io_data_from_collect_plug, check_collect_valid - from .io_data import MetricName -@@ -45,25 +44,25 @@ class SlowIODetection: - - def __init_detector_name_list(self): - self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency()) -+ logging.info(f"ai_block_io plug has found disks: {self._disk_list}") - disks_to_detection: list = self._config_parser.get_disks_to_detection() - # 情况1:None,则启用所有磁盘检测 - # 情况2:is not None and len = 0,则不启动任何磁盘检测 - # 情况3:len != 0,则取交集 - if disks_to_detection is None: -+ logging.warning("you not specify any disk or use default, so ai_block_io will enable all available disk.") - for disk in self._disk_list: - self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) - self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) - elif len(disks_to_detection) == 0: -- logging.warning('please attention: conf file not specify any disk to detection, ' -- 'so it will not start ai block io.') -+ logging.warning('please attention: conf file not specify any disk to detection, so it will not start ai block io.') - else: -- disks_name_to_detection = [] -- for disk_name_to_detection in disks_to_detection: -- disks_name_to_detection.append(disk_name_to_detection.get_disk_name()) -- disk_intersection = [disk for disk in self._disk_list if disk in disks_name_to_detection] -- for disk in disk_intersection: -- self._detector_name_list.append(MetricName(disk, "bio", "read", "latency")) -- self._detector_name_list.append(MetricName(disk, "bio", "write", "latency")) -+ for disk_to_detection in disks_to_detection: -+ if disk_to_detection in self._disk_list: -+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "read", "latency")) -+ self._detector_name_list.append(MetricName(disk_to_detection, "bio", "write", "latency")) -+ else: -+ logging.warning(f"disk:[{disk_to_detection}] not in available disk list, so it will be ignored.") - logging.info(f'start to detection follow disk and it\'s metric: {self._detector_name_list}') - - def __init_detector(self): -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 632391d..354c122 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -10,18 +10,19 @@ - # See the Mulan PSL v2 for more details. - - import configparser --import json - import logging - --from .io_data import MetricName - from .threshold import ThresholdType - from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level - -+ - LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" - - - def init_log_format(log_level: str): -- logging.basicConfig(level=get_log_level(log_level), format=LOG_FORMAT) -+ logging.basicConfig(level=get_log_level(log_level.lower()), format=LOG_FORMAT) -+ if log_level.lower() not in ('info', 'warning', 'error', 'debug'): -+ logging.warning(f'the log_level: {log_level} you set is invalid, use default value: info.') - - - class ConfigParser: -@@ -43,7 +44,7 @@ class ConfigParser: - self.__absolute_threshold = ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD - self.__slow_io_detect_frequency = ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY - self.__log_level = ConfigParser.DEFAULT_LOG_LEVEL -- self.__disks_to_detection: list = [] -+ self.__disks_to_detection = None - - self.__algorithm_type = ConfigParser.DEFAULT_ALGORITHM_TYPE - self.__train_data_duration = ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION -@@ -83,26 +84,20 @@ class ConfigParser: - logging.warning(f'slow_io_detect_frequency type conversion has error, use default value: {self.__slow_io_detect_frequency}.') - - def __read__disks_to_detect(self, items_common: dict): -- disks_to_detection = items_common.get('disks_to_detect') -+ disks_to_detection = items_common.get('disk') - if disks_to_detection is None: -- logging.warning(f'config of disks_to_detect not found, the default value be used.') -+ logging.warning(f'config of disk not found, the default value will be used.') - self.__disks_to_detection = None - return -- try: -- disks_to_detection_list = json.loads(disks_to_detection) -- for disk_to_detection in disks_to_detection_list: -- disk_name = disk_to_detection.get('disk_name', None) -- stage_name = disk_to_detection.get('stage_name', None) -- io_access_type_name = disk_to_detection.get('io_access_type_name', None) -- metric_name = disk_to_detection.get('metric_name', None) -- if not (disk_name is None or stage_name is None or io_access_type_name is None or metric_name is None): -- metric_name_object = MetricName(disk_name, stage_name, io_access_type_name, metric_name) -- self.__disks_to_detection.append(metric_name_object) -- else: -- logging.warning(f'config of disks_to_detect\'s some part has some error: {disk_to_detection}, it will be ignored.') -- except json.decoder.JSONDecodeError as e: -- logging.warning(f'config of disks_to_detect is error: {e}, it will be ignored and default value be used.') -+ disk_list = disks_to_detection.split(',') -+ if len(disk_list) == 0 or (len(disk_list) == 1 and disk_list[0] == ''): -+ logging.warning("you don't specify any disk.") -+ self.__disks_to_detection = [] -+ return -+ if len(disk_list) == 1 and disk_list[0] == 'default': - self.__disks_to_detection = None -+ return -+ self.__disks_to_detection = disk_list - - def __read__train_data_duration(self, items_algorithm: dict): - try: -@@ -189,7 +184,12 @@ class ConfigParser: - - def read_config_from_file(self): - con = configparser.ConfigParser() -- con.read(self.__config_file_name, encoding='utf-8') -+ try: -+ con.read(self.__config_file_name, encoding='utf-8') -+ except configparser.Error as e: -+ init_log_format(self.__log_level) -+ logging.critical(f'config file read error: {e}, ai_block_io plug will exit.') -+ exit(1) - - if con.has_section('common'): - items_common = dict(con.items('common')) -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index bcf62cb..a48144f 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -50,6 +50,6 @@ class Detector: - - def __repr__(self): - return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},' -- f' access_type_name: {self._metric_name.get_io_access_type_name()},' -+ f' io_type_name: {self._metric_name.get_io_access_type_name()},' - f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},' - f' sliding_window_type: {self._slidingWindow}') -diff --git a/src/python/sentryPlugins/ai_block_io/sliding_window.py b/src/python/sentryPlugins/ai_block_io/sliding_window.py -index d395d48..89191e5 100644 ---- a/src/python/sentryPlugins/ai_block_io/sliding_window.py -+++ b/src/python/sentryPlugins/ai_block_io/sliding_window.py -@@ -52,7 +52,7 @@ class SlidingWindow: - return False, None, None - - def __repr__(self): -- return "SlidingWindow" -+ return "[SlidingWindow]" - - - class NotContinuousSlidingWindow(SlidingWindow): -@@ -65,7 +65,7 @@ class NotContinuousSlidingWindow(SlidingWindow): - return False, self._io_data_queue, self._ai_threshold - - def __repr__(self): -- return "NotContinuousSlidingWindow" -+ return f"[NotContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" - - - class ContinuousSlidingWindow(SlidingWindow): -@@ -84,7 +84,7 @@ class ContinuousSlidingWindow(SlidingWindow): - return False, self._io_data_queue, self._ai_threshold - - def __repr__(self): -- return "ContinuousSlidingWindow" -+ return f"[ContinuousSlidingWindow, window size: {self._queue_length}, threshold: {self._queue_threshold}]" - - - class MedianSlidingWindow(SlidingWindow): -@@ -98,7 +98,7 @@ class MedianSlidingWindow(SlidingWindow): - return False, self._io_data_queue, self._ai_threshold - - def __repr__(self): -- return "MedianSlidingWindow" -+ return f"[MedianSlidingWindow, window size: {self._queue_length}]" - - - class SlidingWindowFactory: -diff --git a/src/python/sentryPlugins/ai_block_io/threshold.py b/src/python/sentryPlugins/ai_block_io/threshold.py -index ff85d85..3b7a5a8 100644 ---- a/src/python/sentryPlugins/ai_block_io/threshold.py -+++ b/src/python/sentryPlugins/ai_block_io/threshold.py -@@ -75,7 +75,7 @@ class AbsoluteThreshold(Threshold): - pass - - def __repr__(self): -- return "AbsoluteThreshold" -+ return "[AbsoluteThreshold]" - - - class BoxplotThreshold(Threshold): -@@ -109,7 +109,7 @@ class BoxplotThreshold(Threshold): - self.new_data_size = 0 - - def __repr__(self): -- return "BoxplotThreshold" -+ return f"[BoxplotThreshold, param is: {self.parameter}]" - - - class NSigmaThreshold(Threshold): -@@ -142,7 +142,7 @@ class NSigmaThreshold(Threshold): - self.new_data_size = 0 - - def __repr__(self): -- return "NSigmaThreshold" -+ return f"[NSigmaThreshold, param is: {self.parameter}]" - - - class ThresholdType(Enum): --- -2.23.0 - diff --git a/fix-configparser.InterpolationSyntaxError.patch b/fix-configparser.InterpolationSyntaxError.patch deleted file mode 100644 index 8dfd67be754b32495ac2c2d303bf534fbd0efa81..0000000000000000000000000000000000000000 --- a/fix-configparser.InterpolationSyntaxError.patch +++ /dev/null @@ -1,37 +0,0 @@ -From 65ceade489c4018c3f315104d70be0550a28d9d9 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Wed, 11 Sep 2024 10:23:41 +0800 -Subject: [PATCH] fix configparser.InterpolationSyntaxError - ---- - src/python/syssentry/sentry_config.py | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/src/python/syssentry/sentry_config.py b/src/python/syssentry/sentry_config.py -index 01f3df8..a0e7b79 100644 ---- a/src/python/syssentry/sentry_config.py -+++ b/src/python/syssentry/sentry_config.py -@@ -103,14 +103,18 @@ class CpuPluginsParamsConfig: - """read config file""" - config_param_section_args = {} - if os.path.exists(self.config_file): -- self.config.read(self.config_file) - try: -+ self.config.read(self.config_file) - config_param_section_args = dict(self.config[self.param_section_name]) -- except (ValueError, KeyError): -+ except (ValueError, KeyError, configparser.InterpolationSyntaxError): - config_param_section_args = {} -+ logging.error("Failed to parse cpu_sentry.ini!") - return config_param_section_args - - def join_cpu_start_cmd(self, cpu_param_dict: dict) -> str: -+ if not cpu_param_dict: -+ return "" -+ - cpu_list = cpu_param_dict.get("cpu_list", "default") - if cpu_list == "default": - cpu_list = CpuPluginsParamsConfig.get_cpu_info() --- -2.27.0 - diff --git a/fix-error-handling.patch b/fix-error-handling.patch deleted file mode 100644 index faadc7adf748d3dbfd8e28be6959d1a1d4d74aac..0000000000000000000000000000000000000000 --- a/fix-error-handling.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 370b22b032dce9290eebca1cf8d48bd155164b6a Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Wed, 24 Jul 2024 17:53:58 +0800 -Subject: [PATCH] fix error handling - ---- - src/python/syssentry/cpu_sentry.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py -index 3c4d58d..d0bafa8 100644 ---- a/src/python/syssentry/cpu_sentry.py -+++ b/src/python/syssentry/cpu_sentry.py -@@ -87,7 +87,7 @@ class CpuSentry: - } - - def handle_cpu_output(self, stdout: str): -- if "" in stdout: -+ if "ERROR" in stdout: - self.send_result["result"] = ResultLevel.FAIL - self.send_result["details"]["code"] = 1004 - self.send_result["details"]["msg"] = stdout.split("\n")[0] --- -2.27.0 - diff --git a/fix-excessive-CPU-usage.patch b/fix-excessive-CPU-usage.patch deleted file mode 100644 index b72ed5269732bd85ea26b685a87d6787d762de60..0000000000000000000000000000000000000000 --- a/fix-excessive-CPU-usage.patch +++ /dev/null @@ -1,41 +0,0 @@ -From 815537382fc0d5164fe57b0d984ca4a1ed8254ea Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Thu, 31 Oct 2024 16:00:50 +0800 -Subject: [PATCH] excessive CPU usage - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/xalarm/xalarm_transfer.py | 3 --- - 1 file changed, 3 deletions(-) - -diff --git a/src/python/xalarm/xalarm_transfer.py b/src/python/xalarm/xalarm_transfer.py -index b072007..4bebe5d 100644 ---- a/src/python/xalarm/xalarm_transfer.py -+++ b/src/python/xalarm/xalarm_transfer.py -@@ -62,7 +62,6 @@ def cleanup_closed_connections(server_sock, epoll, fd_to_socket): - to_remove.append(fileno) - - for fileno in to_remove: -- epoll.unregister(fileno) - fd_to_socket[fileno].close() - del fd_to_socket[fileno] - logging.info(f"cleaned up connection {fileno} for client lost connection.") -@@ -97,7 +96,6 @@ def wait_for_connection(server_sock, epoll, fd_to_socket, thread_should_stop): - logging.info(f"connection reach max num of {MAX_CONNECTION_NUM}, closed current connection!") - connection.close() - continue -- epoll.register(connection.fileno(), select.EPOLLOUT) - fd_to_socket[connection.fileno()] = connection - except socket.error as e: - logging.debug(f"socket error, reason is {e}") -@@ -122,7 +120,6 @@ def transmit_alarm(server_sock, epoll, fd_to_socket, bin_data): - except (BrokenPipeError, ConnectionResetError): - to_remove.append(fileno) - for fileno in to_remove: -- epoll.unregister(fileno) - fd_to_socket[fileno].close() - del fd_to_socket[fileno] - logging.info(f"cleaned up connection {fileno} for client lost connection.") --- -2.27.0 - diff --git a/fix-frequency-param-check-bug.patch b/fix-frequency-param-check-bug.patch deleted file mode 100644 index 06d4b4a43003c8072926bea928fdca714d5ed27d..0000000000000000000000000000000000000000 --- a/fix-frequency-param-check-bug.patch +++ /dev/null @@ -1,70 +0,0 @@ -From a06ad0c944b093a71f49cc9fccd5097c1493ca5e Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> -Date: Mon, 21 Oct 2024 17:31:32 +0800 -Subject: [PATCH] fix frequency param check bug - ---- - .../sentryPlugins/ai_block_io/config_parser.py | 13 +++++++++++-- - .../sentryPlugins/ai_block_io/data_access.py | 14 ++++++++++++++ - 2 files changed, 25 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 447eccd..274a31e 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -16,6 +16,7 @@ import logging - from .alarm_report import Report - from .threshold import ThresholdType - from .utils import get_threshold_type_enum, get_sliding_window_type_enum, get_log_level -+from .data_access import check_detect_frequency_is_valid - - - LOG_FORMAT = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -@@ -165,9 +166,17 @@ class ConfigParser: - "slow_io_detect_frequency", - int, - self.DEFAULT_CONF["common"]["slow_io_detect_frequency"], -- gt=0, -- le=300, -+ gt=0 - ) -+ frequency = self._conf["common"]["slow_io_detect_frequency"] -+ ret = check_detect_frequency_is_valid(frequency) -+ if ret is None: -+ log = f"slow io detect frequency: {frequency} is valid, "\ -+ f"Check whether the value range is too large or is not an "\ -+ f"integer multiple of period_time.. exiting..." -+ Report.report_pass(log) -+ logging.critical(log) -+ exit(1) - - def _read_disks_to_detect(self, items_common: dict): - disks_to_detection = items_common.get("disk") -diff --git a/src/python/sentryPlugins/ai_block_io/data_access.py b/src/python/sentryPlugins/ai_block_io/data_access.py -index 1bc5ed8..e4869d5 100644 ---- a/src/python/sentryPlugins/ai_block_io/data_access.py -+++ b/src/python/sentryPlugins/ai_block_io/data_access.py -@@ -53,6 +53,20 @@ def check_collect_valid(period): - return None - - -+def check_detect_frequency_is_valid(period): -+ data_raw = is_iocollect_valid(period) -+ if data_raw["ret"] == 0: -+ try: -+ data = json.loads(data_raw["message"]) -+ except Exception as e: -+ return None -+ if not data: -+ return None -+ return [k for k in data.keys()] -+ else: -+ return None -+ -+ - def _get_raw_data(period, disk_list): - return get_io_data( - period, --- -2.23.0 - diff --git a/fix-get_alarm-error.patch b/fix-get_alarm-error.patch deleted file mode 100644 index 19d8dcca041c9b265cfaf9609b2b705aef4b72bc..0000000000000000000000000000000000000000 --- a/fix-get_alarm-error.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 8f28a40ffd7dc7aa969a7bfc0a170ed0c8f03bce Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Tue, 22 Oct 2024 20:28:59 +0800 -Subject: [PATCH] fix get_alarm error - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index c3f2ee1..2575307 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -139,8 +139,6 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - return [] - alarm_id = task_alarm_id_dict[task_name] - clear_time = alarm_id_clear_time_dict[alarm_id] -- if clear_time < int(time_range): -- return [] - if alarm_id not in alarm_list_dict: - logging.debug("alarm_id does not exist") - return [] -@@ -154,6 +152,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range: - stop_index = i - break -+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > clear_time: -+ stop_index = i -+ break - if stop_index >= 0: - alarm_list = alarm_list[:stop_index] - logging.debug(f"get_alarm_result: final alarm_list of {alarm_id} has {len(alarm_list)} elements") --- -2.27.0 - diff --git a/fix-hbm-online-repair-notice-and-efi-create.patch b/fix-hbm-online-repair-notice-and-efi-create.patch deleted file mode 100644 index 9b0fa99987585c48f233ef985eccad79e6d3fbae..0000000000000000000000000000000000000000 --- a/fix-hbm-online-repair-notice-and-efi-create.patch +++ /dev/null @@ -1,508 +0,0 @@ -From 85d6dae9d7c6148f2699ef7da7d2d784043a2ee1 Mon Sep 17 00:00:00 2001 -From: luckky -Date: Wed, 30 Oct 2024 10:41:11 +0800 -Subject: [PATCH] fix hbm online repair notice and efi create - ---- - src/c/hbm_online_repair/hbm_online_repair.c | 5 +- - .../non-standard-hbm-repair.c | 194 +++++++++--------- - .../non-standard-hbm-repair.h | 2 +- - src/c/hbm_online_repair/ras-events.c | 1 - - .../ras-non-standard-handler.c | 33 +-- - .../ras-non-standard-handler.h | 1 + - 6 files changed, 116 insertions(+), 120 deletions(-) - -diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c -index 3ace206..b3b2742 100644 ---- a/src/c/hbm_online_repair/hbm_online_repair.c -+++ b/src/c/hbm_online_repair/hbm_online_repair.c -@@ -127,10 +127,7 @@ int main(int argc, char *argv[]) - return -1; - } - -- ret = init_all_flash(); -- if (ret < 0) { -- log(LOG_ERROR, "flash writer init failed\n"); -- } -+ get_flash_total_size(); - - handle_ras_events(ras); - -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c -index b175e14..f26d8ae 100644 ---- a/src/c/hbm_online_repair/non-standard-hbm-repair.c -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c -@@ -15,7 +15,7 @@ - #include "non-standard-hbm-repair.h" - - extern int page_isolation_threshold; --size_t total_size = 0; -+size_t flash_total_size = 0; - struct hisi_common_error_section { - uint32_t val_bits; - uint8_t version; -@@ -122,28 +122,58 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned - info_struct->crc8 = (uint32_t)fault_addr; - } - --static bool variable_existed(char *name, char *guid) -+static bool is_variable_existing(char *name, char *guid) - { -+ char filename[PATH_MAX]; -+ snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -+ -+ return access(filename, F_OK | R_OK) == 0; -+} -+ -+static size_t get_var_size(char *name, char *guid) { - char filename[PATH_MAX]; - int fd; -+ struct stat stat; - - snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); - - // open var file - fd = open(filename, O_RDONLY); - if (fd < 0) { -- log(LOG_WARNING, "open file %s failed\n", filename); -- return false; -+ log(LOG_WARNING, "open %s failed\n", filename); -+ goto err; -+ } -+ // read stat -+ if (fstat(fd, &stat) != 0) { -+ log(LOG_WARNING, "fstat %s failed\n", filename); -+ goto err; - } - close(fd); -- return true; -+ return stat.st_size; -+err: -+ if (fd >= 0) -+ close(fd); -+ return (size_t)-1; - } - --static uint32_t read_variable_attribute(char *name, char *guid) { -+void get_flash_total_size() { -+ for (int i = 0; i < FLASH_ENTRY_NUM; i++) { -+ if (is_variable_existing(flash_names[i], flash_guids[i])) { -+ flash_total_size += get_var_size(flash_names[i], flash_guids[i]); -+ } -+ } -+ // check total entry size -+ log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", -+ flash_total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); -+ if (flash_total_size > MAX_VAR_SIZE) { -+ log(LOG_WARNING, "fault info storage %zu reach threshold, cannot save new record\n", flash_total_size); -+ } -+} -+ -+static int read_variable_attribute(char *name, char *guid, uint32_t *attribute) { - char filename[PATH_MAX]; - int fd; - size_t readsize; -- uint32_t attribute = (uint32_t)-1; - - snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); - -@@ -151,17 +181,18 @@ static uint32_t read_variable_attribute(char *name, char *guid) { - fd = open(filename, O_RDONLY); - if (fd < 0) { - log(LOG_ERROR, "open %s failed\n", filename); -- return attribute; -+ return -1; - } - - // read attributes from first 4 bytes -- readsize = read(fd, &attribute, sizeof(uint32_t)); -+ readsize = read(fd, attribute, sizeof(uint32_t)); - if (readsize != sizeof(uint32_t)) { - log(LOG_ERROR, "read attribute of %s failed\n", filename); -+ return -1; - } - - close(fd); -- return attribute; -+ return 0; - } - - static int efivarfs_set_mutable(char *name, char *guid, bool mutable) -@@ -205,8 +236,8 @@ err: - return -1; - } - --static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute) { -- int fd, mode; -+static int write_variable(char *name, char *guid, void *value, unsigned long size, uint32_t attribute, bool is_existing) { -+ int fd = -1, mode; - size_t writesize; - void *buffer; - unsigned long total; -@@ -225,16 +256,13 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz - memcpy(buffer + sizeof(uint32_t), value, size); - - // change attr -- if (efivarfs_set_mutable(name, guid, 1) != 0) { -+ if (is_existing && efivarfs_set_mutable(name, guid, 1) != 0) { - log(LOG_ERROR, "set mutable for %s failed\n", filename); - goto err; - } - - mode = O_WRONLY; -- if (attribute & EFI_VARIABLE_APPEND_WRITE) -- mode |= O_APPEND; -- else -- mode |= O_CREAT; -+ mode |= is_existing ? O_APPEND : O_CREAT; - - // open var file - fd = open(filename, mode, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); -@@ -252,7 +280,7 @@ static int write_variable(char *name, char *guid, void *value, unsigned long siz - - close(fd); - free(buffer); -- if (efivarfs_set_mutable(name, guid, 0) != 0) { -+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { - log(LOG_ERROR, "set immutable for %s failed\n", filename); - } - return 0; -@@ -261,86 +289,21 @@ err: - close(fd); - if (buffer) - free(buffer); -- if (efivarfs_set_mutable(name, guid, 0) != 0) { -+ if (is_existing && efivarfs_set_mutable(name, guid, 0) != 0) { - log(LOG_ERROR, "set immutable for %s failed\n", filename); - } - return -1; - } - --static int append_variable(char *name, char *guid, void *data, unsigned long size) { -- // prepare append attribute -- uint32_t attribute = read_variable_attribute(name, guid); -- if (attribute == (uint32_t)-1) { -- log(LOG_ERROR, "read %s-%s attribute failed\n", name, guid); -- return -1; -- } -- attribute |= EFI_VARIABLE_APPEND_WRITE; -- -- return write_variable(name, guid, data, size, attribute); --} -- --static size_t get_var_size(char *name, char *guid) { -- char filename[PATH_MAX]; -- int fd; -- struct stat stat; -- -- snprintf(filename, PATH_MAX - 1, "%s/%s-%s", EFIVARFS_PATH, name, guid); -- -- // open var file -- fd = open(filename, O_RDONLY); -- if (fd < 0) { -- log(LOG_WARNING, "open %s failed\n", filename); -- goto err; -- } -- // read stat -- if (fstat(fd, &stat) != 0) { -- log(LOG_WARNING, "fstat %s failed\n", filename); -- goto err; -- } -- close(fd); -- return stat.st_size; --err: -- if (fd >= 0) -- close(fd); -- return (size_t)-1; --} -- --int init_all_flash() { -- for (int i = 0; i < FLASH_ENTRY_NUM; i++) { -- // check existed entry -- if (variable_existed(flash_names[i], flash_guids[i])) { -- total_size += get_var_size(flash_names[i], flash_guids[i]); -- continue; -- } -- // create new entry -- uint32_t attribute = EFI_VARIABLE_NON_VOLATILE | -- EFI_VARIABLE_BOOTSERVICE_ACCESS | -- EFI_VARIABLE_RUNTIME_ACCESS; -- char *data = ""; -- unsigned long size = 1; -- int ret = write_variable(flash_names[i], flash_guids[i], data, size, attribute); -- if (ret) { -- log(LOG_ERROR, "init %s-%s failed, fault info storage funtion not enabled\n", flash_names[i], flash_guids[i]); -- return -1; -- } -- total_size += sizeof(uint32_t) + 1; -- } -- // check total entry size -- log(LOG_DEBUG, "current fault info total size: %luKB, flash max threshold: %uKB\n", -- total_size / KB_SIZE, MAX_VAR_SIZE / KB_SIZE); -- if (total_size > MAX_VAR_SIZE) { -- log(LOG_ERROR, "fault info storage reach threshold, cannot save new record\n"); -- } -- return 0; --} -- - static int write_fault_info_to_flash(const struct hisi_common_error_section *err) { - int ret, guid_index; - uint32_t reg_size; - uint64_t fault_addr; -+ bool is_existing; -+ uint32_t attribute = -1; - - // check flash usage threshold -- if (total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { -+ if (flash_total_size + sizeof(uint64_t) > MAX_VAR_SIZE) { - log(LOG_WARNING, "fault info storage reach threshold, cannot save new record into flash\n"); - return -1; - } -@@ -359,14 +322,29 @@ static int write_fault_info_to_flash(const struct hisi_common_error_section *err - log(LOG_ERROR, "invalid fault info\n"); - return -1; - } -+ -+ // judge if the efivar is existing to set the attribute -+ is_existing = is_variable_existing(flash_names[guid_index], flash_guids[guid_index]); -+ attribute = EFI_VARIABLE_NON_VOLATILE | -+ EFI_VARIABLE_BOOTSERVICE_ACCESS | -+ EFI_VARIABLE_RUNTIME_ACCESS; -+ if (is_existing) { -+ ret = read_variable_attribute(flash_names[guid_index], flash_guids[guid_index], &attribute); -+ if (ret < 0) { -+ log(LOG_ERROR, "read variable %s-%s attribute failed, stop writing\n", flash_names[guid_index], flash_guids[guid_index]); -+ return -1; -+ } -+ attribute |= EFI_VARIABLE_APPEND_WRITE; -+ } -+ - // record physical addr in flash -- ret = append_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t)); -+ ret = write_variable(flash_names[guid_index], flash_guids[guid_index], &fault_addr, sizeof(uint64_t), attribute, is_existing); - if (ret < 0) { -- log(LOG_ERROR, "append to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); -+ log(LOG_ERROR, "write to %s-%s failed\n", flash_names[guid_index], flash_guids[guid_index]); - return -1; - } -- total_size += sizeof(uint64_t); -- log(LOG_INFO, "write hbm fault info to flash success\n"); -+ flash_total_size += sizeof(uint64_t); -+ log(LOG_INFO, "write hbm fault info to flash %s-%s success\n", flash_names[guid_index], flash_guids[guid_index]); - return 0; - } - -@@ -421,7 +399,7 @@ static int get_hardware_corrupted_size() - return hardware_corrupted_size; - } - --static uint8_t get_repair_result_code(int ret) -+static uint8_t get_repair_failed_result_code(int ret) - { - if (ret == -ENOSPC) { - return REPAIR_FAILED_NO_RESOURCE; -@@ -582,11 +560,11 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) - static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) - { - int ret; -- if (repair_ret < 0) { -+ if (repair_ret <= 0) { - log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); - /* not much we can do about errors here */ - (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); -- return get_repair_result_code(repair_ret); -+ return get_repair_failed_result_code(repair_ret); - } - - ret = write_file("/sys/kernel/page_eject", "online_page", paddr); -@@ -615,9 +593,13 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char - err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_PSUE_ACLS; - - ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); -- if (ret < 0) { -- notice_BMC(err, get_repair_result_code(ret)); -- log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); -+ -+ /* Only positive num means the error is supported to repair */ -+ if (ret <= 0) { -+ if (ret != -ENXIO) { -+ notice_BMC(err, get_repair_failed_result_code(ret)); -+ log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); -+ } - return ret; - } - -@@ -642,8 +624,9 @@ static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char - all_online_success = false; - } - } -- if (ret < 0) { -- notice_BMC(err, get_repair_result_code(ret)); -+ /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ -+ if (ret <= 0) { -+ notice_BMC(err, get_repair_failed_result_code(ret)); - return ret; - } else if (all_online_success) { - notice_BMC(err, ISOLATE_REPAIR_ONLINE_SUCCESS); -@@ -698,7 +681,7 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) - struct dirent *dent; - DIR *dir; - int ret; -- bool find_device = false, find_hbm_mem = false; -+ bool find_device = false, find_hbm_mem = false, addr_in_hbm_device = false; - - ret = hbmc_hbm_page_isolate(err); - if (ret < 0) { -@@ -723,10 +706,13 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) - if (hbmc_get_memory_type(path) == HBM_HBM_MEMORY) { - find_hbm_mem = true; - ret = hbmc_hbm_repair(err, path); -- if (ret != -ENXIO) -+ if (ret != -ENXIO) { -+ addr_in_hbm_device = true; - break; -+ } - } - } -+ - if (!find_device) { - log(LOG_ERROR, "Repair driver is not loaded, skip error, error_type is %u\n", - err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); -@@ -735,6 +721,10 @@ static void hbm_repair_handler(const struct hisi_common_error_section *err) - log(LOG_ERROR, "No HBM device memory type found, skip error, error_type is %u\n", - err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); - notice_BMC(err, REPAIR_FAILED_OTHER_REASON); -+ } else if (!addr_in_hbm_device) { -+ log(LOG_ERROR, "Err addr is not in device, skip error, error_type is %u\n", -+ err->reg_array[HBM_REPAIR_REQ_TYPE] & HBM_ERROR_MASK); -+ notice_BMC(err, REPAIR_FAILED_INVALID_PARAM); - } - - closedir(dir); -@@ -769,7 +759,7 @@ static bool hbm_repair_validate(const struct hisi_common_error_section *err) - (err->reg_array_size == HBM_CACHE_ARRAY_SIZE); - - if (!(is_acls_valid || is_sppr_valid || is_cache_mode)) { -- log(LOG_DEBUG, "err type (%u) is unknown or address array length (%u) is invalid\n", -+ log(LOG_WARNING, "err type (%u) is unknown or address array length (%u) is invalid\n", - hbm_repair_reg_type, err->reg_array_size); - return false; - } -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.h b/src/c/hbm_online_repair/non-standard-hbm-repair.h -index 7e8e448..ecb04fe 100644 ---- a/src/c/hbm_online_repair/non-standard-hbm-repair.h -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.h -@@ -84,6 +84,6 @@ - #define FLASH_ENTRY_NUM 8 - #define KB_SIZE 1024 - --extern int init_all_flash(); -+extern void get_flash_total_size(); - - #endif -diff --git a/src/c/hbm_online_repair/ras-events.c b/src/c/hbm_online_repair/ras-events.c -index 0b12329..4d281ad 100644 ---- a/src/c/hbm_online_repair/ras-events.c -+++ b/src/c/hbm_online_repair/ras-events.c -@@ -348,7 +348,6 @@ static int read_ras_event_all_cpus(struct pcpu_data *pdata, - "Error on CPU %i\n", i); - warnonce[i]++; - } -- continue; - } - if (!(fds[i].revents & POLLIN)) { - count_nready++; -diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.c b/src/c/hbm_online_repair/ras-non-standard-handler.c -index 1d1fd04..48ffa70 100644 ---- a/src/c/hbm_online_repair/ras-non-standard-handler.c -+++ b/src/c/hbm_online_repair/ras-non-standard-handler.c -@@ -7,17 +7,21 @@ - #include "ras-non-standard-handler.h" - #include "logger.h" - --static char *uuid_le(const char *uu) -+static int uuid_le(const char *uu, char* uuid) - { -- static char uuid[sizeof("xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")]; - if (!uu) { - log(LOG_ERROR, "uuid_le failed: uu is empty"); -- return uuid; -+ return -1; - } - size_t uu_len = strlen(uu); -- if (uu_len < SECTION_TYPE_UUID_LEN) { -- log(LOG_ERROR, "uuid_le failed: uu is too short"); -- return uuid; -+ if (uu_len != SECTION_TYPE_UUID_LEN) { -+ log(LOG_ERROR, "uuid_le failed: uu len is incorrect"); -+ return -1; -+ } -+ size_t uuid_len = strlen(uuid); -+ if (uuid_len != strlen(UUID_STR_TYPE)) { -+ log(LOG_ERROR, "uuid_le failed: uuid len is incorrect"); -+ return -1; - } - - char *p = uuid; -@@ -38,7 +42,7 @@ static char *uuid_le(const char *uu) - - *p = 0; - -- return uuid; -+ return 0; - } - - int ras_non_standard_event_handler(struct trace_seq *s, -@@ -52,15 +56,20 @@ int ras_non_standard_event_handler(struct trace_seq *s, - ev.sec_type = tep_get_field_raw(s, event, "sec_type", - record, &len, 1); - if(!ev.sec_type) { -- log(LOG_WARNING, "get event section type failed"); -+ log(LOG_WARNING, "get event section type failed\n"); - return -1; - } - - trace_seq_printf(s, "\n"); -- trace_seq_printf(s, "sec_type: %s\n", uuid_le(ev.sec_type)); -+ char uuid[sizeof(UUID_STR_TYPE)] = UUID_STR_TYPE; -+ if (uuid_le(ev.sec_type, uuid) < 0) { -+ log(LOG_WARNING, "get uuid failed\n"); -+ return -1; -+ } -+ trace_seq_printf(s, "sec_type: %s\n", uuid); - - if (tep_get_field_val(s, event, "len", record, &val, 1) < 0) { -- log(LOG_WARNING, "tep get field val failed"); -+ log(LOG_WARNING, "tep get field val failed\n"); - return -1; - } - -@@ -69,11 +78,11 @@ int ras_non_standard_event_handler(struct trace_seq *s, - - ev.error = tep_get_field_raw(s, event, "buf", record, &len, 1); - if(!ev.error || ev.length != len) { -- log(LOG_WARNING, "get event error failed"); -+ log(LOG_WARNING, "get event error failed\n"); - return -1; - } - -- if (strcmp(uuid_le(ev.sec_type), HISI_COMMON_SECTION_TYPE_UUID) == 0) { -+ if (strcmp(uuid, HISI_COMMON_SECTION_TYPE_UUID) == 0) { - decode_hisi_common_section(&ev); - } - -diff --git a/src/c/hbm_online_repair/ras-non-standard-handler.h b/src/c/hbm_online_repair/ras-non-standard-handler.h -index 0272dc1..15a37ee 100644 ---- a/src/c/hbm_online_repair/ras-non-standard-handler.h -+++ b/src/c/hbm_online_repair/ras-non-standard-handler.h -@@ -7,6 +7,7 @@ - #define BIT(nr) (1UL << (nr)) - - #define SECTION_TYPE_UUID_LEN 16 -+#define UUID_STR_TYPE "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" - #define HISI_COMMON_SECTION_TYPE_UUID "c8b328a8-9917-4af6-9a13-2e08ab2e7586" - - struct ras_non_standard_event { --- -2.43.0 - diff --git a/fix-io_dump-for-collect-module.patch b/fix-io_dump-for-collect-module.patch deleted file mode 100644 index 452ba0a28d5f0a5b0f373362a2946b7fd6106b93..0000000000000000000000000000000000000000 --- a/fix-io_dump-for-collect-module.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 6307a1ff4068a541658e3312ca938c6fdd9a5c1a Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Sat, 12 Oct 2024 14:51:51 +0800 -Subject: [PATCH] fix io_dump for collect module - ---- - src/python/sentryCollector/collect_io.py | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index d734734..11c9d9a 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -154,7 +154,7 @@ class CollectIo(): - try: - with open(io_dump_file, 'r') as file: - for line in file: -- count += line.count('.op=' + Io_Category[category]) -+ count += line.count('.op=' + Io_Category[category].upper()) - if count > 0: - logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") - except FileNotFoundError: --- -2.33.0 - diff --git a/fix-python-3.7-not-support-list-bool-type.patch b/fix-python-3.7-not-support-list-bool-type.patch deleted file mode 100644 index 6214cdaeeb083bba4e27cbeac670a5280049a484..0000000000000000000000000000000000000000 --- a/fix-python-3.7-not-support-list-bool-type.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 878bcf61467bfd9d015a8089a8367f4333ba76f6 Mon Sep 17 00:00:00 2001 -From: PshySimon -Date: Wed, 9 Oct 2024 10:20:34 +0800 -Subject: [PATCH] fix python 3.7 not support list[bool] type - ---- - src/python/xalarm/register_xalarm.py | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py -index e58343d..6756b1b 100644 ---- a/src/python/xalarm/register_xalarm.py -+++ b/src/python/xalarm/register_xalarm.py -@@ -26,7 +26,7 @@ ALARM_REGISTER_INFO = None - - - class AlarmRegister: -- def __init__(self, id_filter: list[bool], callback: callable): -+ def __init__(self, id_filter: list, callback: callable): - self.id_filter = id_filter - self.callback = callback - self.socket = self.create_unix_socket() -@@ -49,7 +49,7 @@ class AlarmRegister: - return False - return True - -- def set_id_filter(self, id_filter: list[bool]) -> bool: -+ def set_id_filter(self, id_filter: list) -> bool: - if (len(id_filter) > MAX_NUM_OF_ALARM_ID): - sys.stderr.write("set_id_filter: invalid param id_filter\n") - return False -@@ -118,7 +118,7 @@ class AlarmRegister: - self.socket.close() - - --def xalarm_register(callback: callable, id_filter: list[bool]) -> int: -+def xalarm_register(callback: callable, id_filter: list) -> int: - global ALARM_REGISTER_INFO - - if ALARM_REGISTER_INFO is not None: -@@ -148,7 +148,7 @@ def xalarm_unregister(clientId: int) -> None: - ALARM_REGISTER_INFO = None - - --def xalarm_upgrade(clientId: int, id_filter: list[bool]) -> None: -+def xalarm_upgrade(clientId: int, id_filter: list) -> None: - global ALARM_REGISTER_INFO - if clientId < 0: - sys.stderr.write("xalarm_unregister: invalid client\n") --- -2.27.0 - - diff --git a/fix-result-when-process-output-is-None.patch b/fix-result-when-process-output-is-None.patch deleted file mode 100644 index 9d227001f40da876af90c8b47a011d2d5b1de0a1..0000000000000000000000000000000000000000 --- a/fix-result-when-process-output-is-None.patch +++ /dev/null @@ -1,36 +0,0 @@ -From e8e4fa5fd9e78508567782e17b7b1cb6ace3ef0d Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Fri, 26 Jul 2024 15:59:42 +0800 -Subject: [PATCH] fix result when process output is None - ---- - src/python/syssentry/cpu_sentry.py | 8 ++++++++ - 1 file changed, 8 insertions(+) - -diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py -index d0bafa8..9287e2f 100644 ---- a/src/python/syssentry/cpu_sentry.py -+++ b/src/python/syssentry/cpu_sentry.py -@@ -87,11 +87,19 @@ class CpuSentry: - } - - def handle_cpu_output(self, stdout: str): -+ if not stdout: -+ logging.error("%s process output is None, it may be killed!", LOW_LEVEL_INSPECT_CMD) -+ self.send_result["result"] = ResultLevel.FAIL -+ self.send_result["details"]["code"] = 1005 -+ self.send_result["details"]["msg"] = "cpu_sentry task is killed!" -+ return -+ - if "ERROR" in stdout: - self.send_result["result"] = ResultLevel.FAIL - self.send_result["details"]["code"] = 1004 - self.send_result["details"]["msg"] = stdout.split("\n")[0] - return -+ - out_split = stdout.split("\n") - isolated_cores_number = 0 - found_fault_cores_list = [] --- -2.27.0 - diff --git a/fix-some-about-collect-module-and-avg-block-io.patch b/fix-some-about-collect-module-and-avg-block-io.patch deleted file mode 100644 index a32e5c727c36fb3d768ef2d6a4ba6b0fe5dea1b1..0000000000000000000000000000000000000000 --- a/fix-some-about-collect-module-and-avg-block-io.patch +++ /dev/null @@ -1,226 +0,0 @@ -From dea58a559f3dbad3dbce3b681639ee89c20b1cee Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Fri, 20 Sep 2024 14:35:39 +0800 -Subject: [PATCH] fix some about collect module and avg block io - ---- - config/tasks/avg_block_io.mod | 4 ++-- - src/python/sentryCollector/collect_io.py | 18 +++++++++++------- - src/python/sentryCollector/collect_plugin.py | 17 ++++++++--------- - src/python/sentryCollector/collect_server.py | 6 +++--- - src/python/sentryCollector/collectd.py | 2 -- - .../sentryPlugins/avg_block_io/avg_block_io.py | 13 ++++++++++--- - 6 files changed, 34 insertions(+), 26 deletions(-) - -diff --git a/config/tasks/avg_block_io.mod b/config/tasks/avg_block_io.mod -index 814c483..b9b6f34 100644 ---- a/config/tasks/avg_block_io.mod -+++ b/config/tasks/avg_block_io.mod -@@ -1,5 +1,5 @@ - [common] - enabled=yes - task_start=/usr/bin/python3 /usr/bin/avg_block_io --task_stop=pkill avg_block_io --type=oneshot -\ No newline at end of file -+task_stop=pkill -f /usr/bin/avg_block_io -+type=oneshot -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index b826dc4..104b734 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -175,8 +175,7 @@ class CollectIo(): - - threading.Timer(self.period_time, self.task_loop).start() - -- def main_loop(self): -- logging.info("collect io thread start") -+ def is_kernel_avaliable(self): - base_path = '/sys/kernel/debug/block' - for disk_name in os.listdir(base_path): - if not self.loop_all and disk_name not in self.disk_list: -@@ -198,8 +197,13 @@ class CollectIo(): - self.window_value[disk_name] = {} - IO_GLOBAL_DATA[disk_name] = {} - -- if len(self.disk_map_stage) == 0: -- logging.warning("no disks meet the requirements. the thread exits") -+ return len(IO_GLOBAL_DATA) != 0 -+ -+ def main_loop(self): -+ logging.info("collect io thread start") -+ -+ if not self.is_kernel_avaliable() or len(self.disk_map_stage) == 0: -+ logging.warning("no disks meet the requirements. collect io thread exits") - return - - for disk_name, stage_list in self.disk_map_stage.items(): -@@ -213,7 +217,7 @@ class CollectIo(): - start_time = time.time() - - if self.stop_event.is_set(): -- logging.info("collect io thread exit") -+ logging.debug("collect io thread exit") - return - - for disk_name, stage_list in self.disk_map_stage.items(): -@@ -227,7 +231,7 @@ class CollectIo(): - continue - while sleep_time > 1: - if self.stop_event.is_set(): -- logging.info("collect io thread exit") -+ logging.debug("collect io thread exit") - return - time.sleep(1) - sleep_time -= 1 -@@ -235,5 +239,5 @@ class CollectIo(): - - # set stop event, notify thread exit - def stop_thread(self): -- logging.info("collect io thread is preparing to exit") -+ logging.debug("collect io thread is preparing to exit") - self.stop_event.set() -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 49ce0a8..9132473 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -142,22 +142,21 @@ def validate_parameters(param, len_limit, char_limit): - ret = ResultMessage.RESULT_INVALID_LENGTH - return [False, ret] - -- if len(param) > len_limit: -- print(f"{param} length more than {len_limit}") -- ret = ResultMessage.RESULT_EXCEED_LIMIT -- return [False, ret] -- - pattern = r'^[a-zA-Z0-9_-]+$' - for info in param: -- if len(info) > char_limit: -- print(f"{info} length more than {char_limit}") -- ret = ResultMessage.RESULT_EXCEED_LIMIT -- return [False, ret] - if not re.match(pattern, info): - print(f"{info} is invalid char") - ret = ResultMessage.RESULT_INVALID_CHAR - return [False, ret] - -+ # length of len_limit is exceeded, keep len_limit -+ if len(param) > len_limit: -+ print(f"{param} length more than {len_limit}, keep the first {len_limit}") -+ param[:] = param[0:len_limit] -+ -+ # only keep elements under the char_limit length -+ param[:] = [elem for elem in param if len(elem) <= char_limit] -+ - return [True, ret] - - def is_iocollect_valid(period, disk_list=None, stage=None): -diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py -index fa49781..bab4e56 100644 ---- a/src/python/sentryCollector/collect_server.py -+++ b/src/python/sentryCollector/collect_server.py -@@ -256,7 +256,7 @@ class CollectServer(): - - def server_loop(self): - """main loop""" -- logging.info("collect server thread start") -+ logging.info("collect listen thread start") - server_fd = self.server_fd_create() - if not server_fd: - return -@@ -267,7 +267,7 @@ class CollectServer(): - logging.debug("start server_loop loop") - while True: - if self.stop_event.is_set(): -- logging.info("collect server thread exit") -+ logging.debug("collect listen thread exit") - server_fd = None - return - try: -@@ -281,5 +281,5 @@ class CollectServer(): - pass - - def stop_thread(self): -- logging.info("collect server thread is preparing to exit") -+ logging.debug("collect listen thread is preparing to exit") - self.stop_event.set() -diff --git a/src/python/sentryCollector/collectd.py b/src/python/sentryCollector/collectd.py -index b77c642..3a836df 100644 ---- a/src/python/sentryCollector/collectd.py -+++ b/src/python/sentryCollector/collectd.py -@@ -49,7 +49,6 @@ def sig_handler(signum, _f): - Thread_List[i][0].stop_thread() - - remove_sock_file() -- sys.exit(0) - - def main(): - """main -@@ -64,7 +63,6 @@ def main(): - try: - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) -- signal.signal(signal.SIGHUP, sig_handler) - - logging.info("finish main parse_args") - -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index ff2071d..73f0b22 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -21,7 +21,7 @@ CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" - - def log_invalid_keys(not_in_list, keys_name, config_list, default_list): - """print invalid log""" -- if config_list and default_list: -+ if config_list and not_in_list: - logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) - elif config_list == ["default"]: - logging.warning("Default {} use {}".format(keys_name, default_list)) -@@ -144,9 +144,11 @@ def init_io_win(io_dic, config, common_param): - - if avg_lim_value and avg_time_value and tot_lim_value: - io_data[disk_name][stage_name][rw]["latency"] = IoWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_multiple=avg_time_value, abnormal_multiple_lim=avg_lim_value, abnormal_time=tot_lim_value) -+ logging.debug("Successfully create {}-{}-{} latency window".format(disk_name, stage_name, rw)) - - if iodump_lim_value is not None: - io_data[disk_name][stage_name][rw]["iodump"] = IoDumpWindow(window_size=io_dic["win_size"], window_threshold=io_dic["win_threshold"], abnormal_time=iodump_lim_value) -+ logging.debug("Successfully create {}-{}-{} iodump window".format(disk_name, stage_name, rw)) - return io_data, io_avg_value - - -@@ -159,10 +161,10 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): - for disk_stage_list in json_data.values(): - all_stage_set.update(disk_stage_list) - -- disk_list = [key for key in config_disk if key in all_disk_set] -+ disk_list = [key for key in all_disk_set if key in config_disk] - not_in_disk_list = [key for key in config_disk if key not in all_disk_set] - -- stage_list = [key for key in config_stage if key in all_stage_set] -+ stage_list = [key for key in all_stage_set if key in config_stage] - not_in_stage_list = [key for key in config_stage if key not in all_stage_set] - - if not config_disk: -@@ -171,6 +173,9 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): - if not config_stage: - stage_list = [key for key in all_stage_set] - -+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list -+ stage_list = stage_list[:15] if len(stage_list) > 15 else stage_list -+ - if config_disk and not disk_list: - logging.warning("Cannot get valid disk by disk={}, set to default".format(config_disk)) - disk_list, stage_list = get_valid_disk_stage_list(io_dic, [], config_stage) -@@ -228,6 +233,8 @@ def main(): - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) - -+ logging.basicConfig(level=logging.INFO) -+ - # 初始化配置读取 - config = configparser.ConfigParser(comment_prefixes=('#', ';')) - try: --- -2.33.0 - diff --git a/fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch b/fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch deleted file mode 100644 index 675277f677303c6286cfb39299b6334da32354bb..0000000000000000000000000000000000000000 --- a/fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch +++ /dev/null @@ -1,56 +0,0 @@ -From 497b3124f017ce4ae99b34261c4fd5dd2a358f5b Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Sat, 14 Sep 2024 09:28:00 +0800 -Subject: [PATCH] fix syssentry fails to be started when cpu_sentry is not - installed - ---- - src/python/syssentry/syssentry.py | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index f93956e..776971f 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -43,7 +43,6 @@ try: - from .cpu_alarm import cpu_alarm_recv - except ImportError: - CPU_EXIST = False -- logging.debug("Cannot find cpu sentry mod") - - - INSPECTOR = None -@@ -563,20 +562,21 @@ def main(): - if not os.path.exists(SENTRY_RUN_DIR): - os.mkdir(SENTRY_RUN_DIR) - os.chmod(SENTRY_RUN_DIR, mode=SENTRY_RUN_DIR_PERM) -- if not chk_and_set_pidfile(): -- logging.error("get pid file lock failed, exist") -- sys.exit(17) - - logging.basicConfig(filename=SYSSENTRY_LOG_FILE, level=logging.INFO) - os.chmod(SYSSENTRY_LOG_FILE, 0o600) - -+ if not chk_and_set_pidfile(): -+ logging.error("get pid file lock failed, exist") -+ sys.exit(17) -+ - try: - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) - signal.signal(signal.SIGHUP, sig_handler) - signal.signal(signal.SIGCHLD, sigchld_handler) - -- logging.debug("finish main parse_args") -+ logging.info("finish main parse_args") - - _ = SentryConfig.init_param() - TasksMap.init_task_map() -@@ -587,3 +587,4 @@ def main(): - logging.error('%s', traceback.format_exc()) - finally: - release_pidfile() -+ --- -2.33.0 - diff --git a/fix-test_ai_block_io-fail.patch b/fix-test_ai_block_io-fail.patch deleted file mode 100644 index a83d53552cdd01f585216d60e25ec16bab6c988e..0000000000000000000000000000000000000000 --- a/fix-test_ai_block_io-fail.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 874daf9627c74aa31f1063c250b5477b2eb322e8 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Sat, 28 Dec 2024 11:31:23 +0800 -Subject: [PATCH] fix test_ai_block_io fail - ---- - selftest/test/test_ai_block_io.py | 26 +++++++++++++------------- - 1 file changed, 13 insertions(+), 13 deletions(-) - -diff --git a/selftest/test/test_ai_block_io.py b/selftest/test/test_ai_block_io.py -index c36fef5..58ab096 100644 ---- a/selftest/test/test_ai_block_io.py -+++ b/selftest/test/test_ai_block_io.py -@@ -12,9 +12,9 @@ - import unittest - import numpy as np - --from sentryPlugins.ai_threshold_slow_io_detection.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold --from sentryPlugins.ai_threshold_slow_io_detection.sliding_window import (NotContinuousSlidingWindow, -- ContinuousSlidingWindow, MedianSlidingWindow) -+from sentryPlugins.ai_block_io.threshold import AbsoluteThreshold, BoxplotThreshold, NSigmaThreshold -+from sentryPlugins.ai_block_io.sliding_window import (NotContinuousSlidingWindow, -+ ContinuousSlidingWindow, MedianSlidingWindow) - - - def _get_boxplot_threshold(data_list: list, parameter): -@@ -98,11 +98,11 @@ class Test(unittest.TestCase): - for data in data_list1: - boxplot_threshold.push_latest_data_to_queue(data) - result = not_continuous.is_slow_io_event(data) -- self.assertFalse(result[0]) -+ self.assertFalse(result[0][0]) - self.assertEqual(23.75, boxplot_threshold.get_threshold()) - boxplot_threshold.push_latest_data_to_queue(24) - result = not_continuous.is_slow_io_event(24) -- self.assertFalse(result[0]) -+ self.assertFalse(result[0][0]) - boxplot_threshold.push_latest_data_to_queue(25) - result = not_continuous.is_slow_io_event(25) - self.assertTrue(result[0]) -@@ -110,7 +110,7 @@ class Test(unittest.TestCase): - for data in data_list2: - boxplot_threshold.push_latest_data_to_queue(data) - result = not_continuous.is_slow_io_event(data) -- self.assertFalse(result[0]) -+ self.assertFalse(result[0][0]) - self.assertEqual(25.625, boxplot_threshold.get_threshold()) - - def test_continuous_sliding_window(self): -@@ -121,14 +121,14 @@ class Test(unittest.TestCase): - for data in data_list: - boxplot_threshold.push_latest_data_to_queue(data) - result = continuous.is_slow_io_event(data) -- self.assertFalse(result[0]) -+ self.assertFalse(result[0][0]) - self.assertEqual(23.75, boxplot_threshold.get_threshold()) - # 没有三个异常点 -- self.assertFalse(continuous.is_slow_io_event(25)[0]) -+ self.assertFalse(continuous.is_slow_io_event(25)[0][0]) - # 不连续的三个异常点 -- self.assertFalse(continuous.is_slow_io_event(25)[0]) -+ self.assertFalse(continuous.is_slow_io_event(25)[0][0]) - # 连续的三个异常点 -- self.assertTrue(continuous.is_slow_io_event(25)[0]) -+ self.assertTrue(continuous.is_slow_io_event(25)[0][0]) - - def test_median_sliding_window(self): - median = MedianSlidingWindow(5, 3) -@@ -137,7 +137,7 @@ class Test(unittest.TestCase): - absolute_threshold.set_threshold(24.5) - data_list = [24, 24, 24, 25, 25] - for data in data_list: -- self.assertFalse(median.is_slow_io_event(data)[0]) -+ self.assertFalse(median.is_slow_io_event(data)[0][0]) - self.assertTrue(median.is_slow_io_event(25)[0]) - - def test_parse_collect_data(self): -@@ -147,8 +147,8 @@ class Test(unittest.TestCase): - "flush": [9.0, 10.0, 11.0, 12.0], - "discard": [13.0, 14.0, 15.0, 16.0], - } -- from io_data import BaseData -- from data_access import _get_io_stage_data -+ from sentryPlugins.ai_block_io.io_data import BaseData -+ from sentryPlugins.ai_block_io.data_access import _get_io_stage_data - - io_data = _get_io_stage_data(collect) - self.assertEqual( --- -2.27.0 - diff --git a/fix-uint8-bug-and-change-isolation-default-value.patch b/fix-uint8-bug-and-change-isolation-default-value.patch deleted file mode 100644 index 959acf7b341ae392921392c164330a457706cd77..0000000000000000000000000000000000000000 --- a/fix-uint8-bug-and-change-isolation-default-value.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 00ea35472d50faea89c881eb45b6d9d11f6b6632 Mon Sep 17 00:00:00 2001 -From: luckky -Date: Fri, 1 Nov 2024 15:09:57 +0800 -Subject: [PATCH] fix uint8 bug and change isolation default value - ---- - src/c/hbm_online_repair/hbm_online_repair.env | 2 +- - src/c/hbm_online_repair/non-standard-hbm-repair.c | 8 ++++---- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/src/c/hbm_online_repair/hbm_online_repair.env b/src/c/hbm_online_repair/hbm_online_repair.env -index de56079..7166c8d 100644 ---- a/src/c/hbm_online_repair/hbm_online_repair.env -+++ b/src/c/hbm_online_repair/hbm_online_repair.env -@@ -1,2 +1,2 @@ - HBM_ONLINE_REPAIR_LOG_LEVEL=1 --PAGE_ISOLATION_THRESHOLD=128 -+PAGE_ISOLATION_THRESHOLD=3355443 -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c -index f26d8ae..b8dde7a 100644 ---- a/src/c/hbm_online_repair/non-standard-hbm-repair.c -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c -@@ -359,7 +359,7 @@ static int write_file(char *path, const char *name, unsigned long long value) - - fd = open(fname, O_WRONLY); - if (fd < 0) { -- log(LOG_WARNING, "HBM ACLS: Cannot to open '%s': %s\n", -+ log(LOG_WARNING, "HBM: Cannot to open '%s': %s\n", - fname, strerror(errno)); - return -errno; - } -@@ -367,7 +367,7 @@ static int write_file(char *path, const char *name, unsigned long long value) - snprintf(buf, sizeof(buf), "0x%llx\n", value); - ret = write(fd, buf, strlen(buf)); - if (ret <= 0) -- log(LOG_WARNING, "HBM ACLS: Failed to set %s (0x%llx): %s\n", -+ log(LOG_WARNING, "HBM: Failed to set %s (0x%llx): %s\n", - fname, value, strerror(errno)); - - close(fd); -@@ -557,7 +557,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) - return ret < 0 ? ret : 0; - } - --static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) -+static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) - { - int ret; - if (repair_ret <= 0) { -@@ -577,7 +577,7 @@ static int hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsig - } - } - --static uint8_t hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) -+static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *path) - { - unsigned long long paddr; - int ret; --- -2.43.0 - diff --git a/fix-version-in-setup.py.patch b/fix-version-in-setup.py.patch deleted file mode 100644 index 42816db387aa5b0fd7b875038d106243349d27ec..0000000000000000000000000000000000000000 --- a/fix-version-in-setup.py.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 7baf2815f515c54bc33f41f495ec7c26988b5c44 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Tue, 11 Jun 2024 16:47:46 +0800 -Subject: [PATCH] fix version in setup.py - ---- - src/python/setup.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/python/setup.py b/src/python/setup.py -index 21dbe9f..f96a96e 100644 ---- a/src/python/setup.py -+++ b/src/python/setup.py -@@ -17,7 +17,7 @@ from setuptools import setup, find_packages - - setup( - name="syssentry", -- version="1.0.1", -+ version="1.0.2", - description="System inspection framework tool set", - packages=find_packages(), - include_package_data=True, --- -2.27.0 - diff --git a/fix-word-error.patch b/fix-word-error.patch deleted file mode 100644 index 1e7de89450e24cd8897eb708a7a63041a2d6d6b1..0000000000000000000000000000000000000000 --- a/fix-word-error.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 5be0d121c6fde185d323dc4bcf3026e2c3ee8757 Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Mon, 14 Oct 2024 11:30:58 +0800 -Subject: [PATCH] fix word error - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 10 +++++----- - 1 file changed, 5 insertions(+), 5 deletions(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index d012901..bff527c 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -49,7 +49,7 @@ MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) - def update_alarm_list(alarm_info: Xalarm): - alarm_id = xalarm_getid(alarm_info) - if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -- logging.warnning(f"Invalid alarm_id {alarm_id}") -+ logging.warning(f"Invalid alarm_id {alarm_id}") - return - timestamp = xalarm_gettime(alarm_info) - if not timestamp: -@@ -97,14 +97,14 @@ def alarm_register(): - task = TasksMap.tasks_dict[task_type][task_name] - alarm_id = task.alarm_id - if not check_alarm_id_if_number(alarm_id): -- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") - continue - if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -- logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") - continue - alarm_clear_time = task.alarm_clear_time - if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): -- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") - continue - try: - alarm_clear_time = int(alarm_clear_time) -@@ -113,7 +113,7 @@ def alarm_register(): - if alarm_clear_time > sys.maxsize: - raise ValueError("Exceeds maximum value for int") - except (ValueError, OverflowError, TypeError) as e: -- logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -+ logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") - continue - alarm_list_dict[alarm_id] = [] - task_alarm_id_dict[task_name] = alarm_id --- -2.27.0 - diff --git a/fix-write-file-return-code-bug.patch b/fix-write-file-return-code-bug.patch deleted file mode 100644 index f059224778ee237bfb84a006f72ca8580b6a730f..0000000000000000000000000000000000000000 --- a/fix-write-file-return-code-bug.patch +++ /dev/null @@ -1,69 +0,0 @@ -From cea094acea79b88e6458cfa264a03c51f08c72fc Mon Sep 17 00:00:00 2001 -From: luckky -Date: Mon, 4 Nov 2024 20:18:05 +0800 -Subject: [PATCH] fix write file return code bug -Set the return code 0 to -EINVAL to unify the processing of return code. - ---- - .../hbm_online_repair/non-standard-hbm-repair.c | 17 ++++++++++------- - 1 file changed, 10 insertions(+), 7 deletions(-) - -diff --git a/src/c/hbm_online_repair/non-standard-hbm-repair.c b/src/c/hbm_online_repair/non-standard-hbm-repair.c -index b8dde7a..97cb9a7 100644 ---- a/src/c/hbm_online_repair/non-standard-hbm-repair.c -+++ b/src/c/hbm_online_repair/non-standard-hbm-repair.c -@@ -112,7 +112,7 @@ static void parse_fault_addr_info(struct fault_addr_info* info_struct, unsigned - info_struct->row_id = fault_addr & FAULT_ADDR_ROW_ID_MASK; - fault_addr >>= FAULT_ADDR_ROW_ID_LEN; - info_struct->column_id = fault_addr & FAULT_ADDR_COLUMN_ID_MASK; -- fault_addr >>= FAULT_ADDR_CHANNEL_ID_LEN; -+ fault_addr >>= FAULT_ADDR_COLUMN_ID_LEN; - info_struct->error_type = fault_addr & FAULT_ADDR_ERROR_TYPE_MASK; - fault_addr >>= FAULT_ADDR_ERROR_TYPE_LEN; - info_struct->repair_type = fault_addr & FAULT_ADDR_REPAIR_TYPE_MASK; -@@ -371,7 +371,12 @@ static int write_file(char *path, const char *name, unsigned long long value) - fname, value, strerror(errno)); - - close(fd); -- return ret > 0 ? 0 : -errno; -+ if (ret == 0) { -+ ret = -EINVAL; -+ } else if (ret < 0) { -+ ret = -errno; -+ } -+ return ret; - } - - static int get_hardware_corrupted_size() -@@ -560,7 +565,7 @@ static int hbmc_hbm_page_isolate(const struct hisi_common_error_section *err) - static uint8_t hbmc_hbm_after_repair(bool is_acls, const int repair_ret, const unsigned long long paddr) - { - int ret; -- if (repair_ret <= 0) { -+ if (repair_ret < 0) { - log(LOG_WARNING, "HBM %s: Keep page (0x%llx) offline\n", is_acls ? "ACLS" : "SPPR", paddr); - /* not much we can do about errors here */ - (void)write_file("/sys/kernel/page_eject", "remove_page", paddr); -@@ -594,8 +599,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa - - ret = write_file(path, is_acls ? "acls_query" : "sppr_query", paddr); - -- /* Only positive num means the error is supported to repair */ -- if (ret <= 0) { -+ if (ret < 0) { - if (ret != -ENXIO) { - notice_BMC(err, get_repair_failed_result_code(ret)); - log(LOG_WARNING, "HBM: Address 0x%llx is not supported to %s repair\n", paddr, is_acls ? "ACLS" : "SPPR"); -@@ -624,8 +628,7 @@ static int hbmc_hbm_repair(const struct hisi_common_error_section *err, char *pa - all_online_success = false; - } - } -- /* The ret is from the acls/sppr repair, and only positive num means the error is repaired successfully */ -- if (ret <= 0) { -+ if (ret < 0) { - notice_BMC(err, get_repair_failed_result_code(ret)); - return ret; - } else if (all_online_success) { --- -2.43.0 - diff --git a/fix-xalarm-non-uniform-log-formatting.patch b/fix-xalarm-non-uniform-log-formatting.patch deleted file mode 100644 index 34f579b65feffef4dcbf8ea45b47d5a0e407b566..0000000000000000000000000000000000000000 --- a/fix-xalarm-non-uniform-log-formatting.patch +++ /dev/null @@ -1,60 +0,0 @@ -From 3eba5dcde10e05e7badc99852f76488e667d56e6 Mon Sep 17 00:00:00 2001 -From: caixiaomeng -Date: Mon, 21 Oct 2024 11:57:37 +0800 -Subject: [PATCH] fix xalarm non-uniform log formatting - ---- - src/python/xalarm/sentry_notify.py | 11 ++++++----- - 1 file changed, 6 insertions(+), 5 deletions(-) - -diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py -index 5838473..ffe4147 100644 ---- a/src/python/xalarm/sentry_notify.py -+++ b/src/python/xalarm/sentry_notify.py -@@ -2,6 +2,7 @@ import os - import sys - import time - import socket -+import logging - from struct import error as StructParseError - - from .xalarm_api import alarm_stu2bin, Xalarm -@@ -27,21 +28,21 @@ ALARM_SOCKET_PERMISSION = 0o700 - - def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: - if not os.path.exists(DIR_XALARM): -- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n") -+ logging.error(f"check_params: {DIR_XALARM} not exist, failed") - return False - - if not os.path.exists(PATH_REPORT_ALARM): -- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n") -+ logging.error(f"check_params: {PATH_REPORT_ALARM} not exist, failed") - return False - - if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or - alarm_level < MINOR_ALM or alarm_level > CRITICAL_ALM or - alarm_type < ALARM_TYPE_OCCUR or alarm_type > ALARM_TYPE_RECOVER): -- sys.stderr.write("check_params: alarm info invalid\n") -+ logging.error("check_params: alarm info invalid") - return False - - if len(puc_paras) >= MAX_PUC_PARAS_LEN: -- sys.stderr.write(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}\n") -+ logging.error(f"check_params: alarm msg should be less than {MAX_PUC_PARAS_LEN}") - return False - - return True -@@ -61,7 +62,7 @@ def xalarm_report(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: - - sock.sendto(alarm_stu2bin(alarm_info), PATH_REPORT_ALARM) - except (FileNotFoundError, StructParseError, socket.error, OSError, UnicodeError) as e: -- sys.stderr.write(f"check_params: error occurs when sending msg.{e}\n") -+ logging.error(f"error occurs when sending msg.") - return False - finally: - sock.close() --- -2.27.0 - - diff --git a/fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch b/fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch deleted file mode 100644 index 1bf5c3ba76f726725aa310e0c35c61a238150467..0000000000000000000000000000000000000000 --- a/fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch +++ /dev/null @@ -1,76 +0,0 @@ -From f6a26ea0759f36ebcaebe05d4d24c7234a110c63 Mon Sep 17 00:00:00 2001 -From: caixiaomeng -Date: Fri, 11 Oct 2024 12:12:53 +0800 -Subject: [PATCH] fix xalarm_Report function not refuse alarm msg exceeds - maximum - ---- - src/libso/xalarm/register_xalarm.c | 5 +++++ - src/python/xalarm/register_xalarm.py | 6 +++--- - src/python/xalarm/sentry_notify.py | 4 ++-- - 3 files changed, 10 insertions(+), 5 deletions(-) - -diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c -index 5aff2bc..952a28b 100644 ---- a/src/libso/xalarm/register_xalarm.c -+++ b/src/libso/xalarm/register_xalarm.c -@@ -339,6 +339,11 @@ int xalarm_Report(unsigned short usAlarmId, unsigned char ucAlarmLevel, - return -1; - } - -+ if (pucParas == NULL || (int)strlen(pucParas) > MAX_PARAS_LEN) { -+ fprintf(stderr, "%s: alarm info invalid\n", __func__); -+ return -1; -+ } -+ - if (memset(&info, 0, sizeof(struct alarm_info)) == NULL) { - fprintf(stderr, "%s: memset info failed, ret: %d\n", __func__, ret); - return -1; -diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py -index edd9994..39623bd 100644 ---- a/src/python/xalarm/register_xalarm.py -+++ b/src/python/xalarm/register_xalarm.py -@@ -45,7 +45,7 @@ class AlarmRegister: - return False - - if self.socket is None: -- sys.stderr.write("check_params: scoket create failed\n") -+ sys.stderr.write("check_params: socket create failed\n") - return False - return True - -@@ -151,10 +151,10 @@ def xalarm_unregister(clientId: int) -> None: - def xalarm_upgrade(clientId: int, id_filter: list) -> None: - global ALARM_REGISTER_INFO - if clientId < 0: -- sys.stderr.write("xalarm_unregister: invalid client\n") -+ sys.stderr.write("xalarm_upgrade: invalid client\n") - return - if ALARM_REGISTER_INFO is None: -- sys.stderr.write("xalarm_unregister: alarm has not registered\n") -+ sys.stderr.write("xalarm_upgrade: alarm has not registered\n") - return - ALARM_REGISTER_INFO.id_filter = id_filter - -diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py -index c763a24..5838473 100644 ---- a/src/python/xalarm/sentry_notify.py -+++ b/src/python/xalarm/sentry_notify.py -@@ -27,11 +27,11 @@ ALARM_SOCKET_PERMISSION = 0o700 - - def check_params(alarm_id, alarm_level, alarm_type, puc_paras) -> bool: - if not os.path.exists(DIR_XALARM): -- sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed") -+ sys.stderr.write(f"check_params: {DIR_XALARM} not exist, failed\n") - return False - - if not os.path.exists(PATH_REPORT_ALARM): -- sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed") -+ sys.stderr.write(f"check_params: {PATH_REPORT_ALARM} not exist, failed\n") - return False - - if (alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID or --- -2.27.0 - - diff --git a/fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch b/fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch deleted file mode 100644 index 5b1f231af06cdab470230cf900465bfc24244f4d..0000000000000000000000000000000000000000 --- a/fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch +++ /dev/null @@ -1,71 +0,0 @@ -From 624efd60495403743fc251b7d689d920841e44c8 Mon Sep 17 00:00:00 2001 -From: caixiaomeng -Date: Fri, 11 Oct 2024 17:54:04 +0800 -Subject: [PATCH] fix xalarm_upgrade not return val and fail when thread - stopped - ---- - src/libso/xalarm/register_xalarm.c | 11 ++++++++++- - src/python/xalarm/register_xalarm.py | 10 +++++++--- - 2 files changed, 17 insertions(+), 4 deletions(-) - -diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c -index 952a28b..6768242 100644 ---- a/src/libso/xalarm/register_xalarm.c -+++ b/src/libso/xalarm/register_xalarm.c -@@ -156,7 +156,11 @@ static void *alarm_recv(void *arg) - continue; - } - printf("recv error len:%d errno:%d\n", recvlen, errno); -- } -+ } else if (recvlen == 0) { -+ printf("connection closed by xalarmd, maybe connections reach max num or service stopped.\n"); -+ g_register_info.thread_should_stop = 1; -+ break; -+ } - } - return NULL; - } -@@ -211,6 +215,11 @@ bool xalarm_Upgrade(struct alarm_subscription_info id_filter, int client_id) - printf("%s: invalid args\n", __func__); - return false; - } -+ -+ if (g_register_info.thread_should_stop) { -+ printf("%s: upgrade failed, alarm thread has stopped\n", __func__); -+ return false; -+ } - set_alarm_id(id_filter); - - return true; -diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py -index 39623bd..2a6dabf 100644 ---- a/src/python/xalarm/register_xalarm.py -+++ b/src/python/xalarm/register_xalarm.py -@@ -148,15 +148,19 @@ def xalarm_unregister(clientId: int) -> None: - ALARM_REGISTER_INFO = None - - --def xalarm_upgrade(clientId: int, id_filter: list) -> None: -+def xalarm_upgrade(id_filter: list, clientId: int) -> bool: - global ALARM_REGISTER_INFO - if clientId < 0: - sys.stderr.write("xalarm_upgrade: invalid client\n") -- return -+ return False - if ALARM_REGISTER_INFO is None: - sys.stderr.write("xalarm_upgrade: alarm has not registered\n") -- return -+ return False -+ if ALARM_REGISTER_INFO.thread_should_stop: -+ sys.stderr.write("xalarm_upgrade: upgrade failed, alarm thread has stopped\n") -+ return False - ALARM_REGISTER_INFO.id_filter = id_filter -+ return True - - - def xalarm_getid(alarm_info: Xalarm) -> int: --- -2.27.0 - - diff --git a/get_alarm-d-abnomal-display.patch b/get_alarm-d-abnomal-display.patch deleted file mode 100644 index 8a7924a31f67fe9c1cdf4911fcd6835497f93dbf..0000000000000000000000000000000000000000 --- a/get_alarm-d-abnomal-display.patch +++ /dev/null @@ -1,26 +0,0 @@ -From 132334913c4afebefd6afa835f790fa8a5fbf123 Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Mon, 28 Oct 2024 09:22:53 +0800 -Subject: [PATCH] get_alarm -d abnomal display - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index b35a126..e5cc313 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -184,7 +184,7 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - # dump each {key,value} of details in one line - if 'details' in alarm_info and isinstance(alarm_info['details'], dict): - for key in alarm_info['details']: -- alarm_info['details'][key] = json.dumps(alarm_info['details'][key], indent=None) -+ alarm_info['details'][key] = str(alarm_info['details'][key]) - - alarm['alarm_info'] = alarm_info - alarm_list = [alarm for alarm in alarm_list if 'alarm_source' in alarm['alarm_info'] and alarm['alarm_info']['alarm_source'] == task_name] --- -2.27.0 - diff --git a/get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch b/get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch deleted file mode 100644 index ec2aaf2d2c26bf2f49d44ac716c7f7e64dd37275..0000000000000000000000000000000000000000 --- a/get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch +++ /dev/null @@ -1,168 +0,0 @@ -From b21607fcec4b290bc78c9f6c4a26db1a2df32a66 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Tue, 15 Oct 2024 21:21:10 +0800 -Subject: [PATCH] get_io_data failed wont stop avg_block_io and del disk not - support - ---- - src/python/sentryCollector/collect_plugin.py | 14 ++++----- - .../avg_block_io/avg_block_io.py | 9 ++++-- - .../sentryPlugins/avg_block_io/module_conn.py | 31 +++++++++++++------ - 3 files changed, 35 insertions(+), 19 deletions(-) - -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index bec405a..53dddec 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -90,14 +90,14 @@ def client_send_and_recv(request_data, data_str_len, protocol): - try: - client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - except socket.error: -- logging.error("collect_plugin: client create socket error") -+ logging.debug("collect_plugin: client create socket error") - return None - - try: - client_socket.connect(COLLECT_SOCKET_PATH) - except OSError: - client_socket.close() -- logging.error("collect_plugin: client connect error") -+ logging.debug("collect_plugin: client connect error") - return None - - req_data_len = len(request_data) -@@ -109,23 +109,23 @@ def client_send_and_recv(request_data, data_str_len, protocol): - res_data = res_data.decode() - except (OSError, UnicodeError): - client_socket.close() -- logging.error("collect_plugin: client communicate error") -+ logging.debug("collect_plugin: client communicate error") - return None - - res_magic = res_data[:CLT_MSG_MAGIC_LEN] - if res_magic != "RES": -- logging.error("res msg format error") -+ logging.debug("res msg format error") - return None - - protocol_str = res_data[CLT_MSG_MAGIC_LEN:CLT_MSG_MAGIC_LEN+CLT_MSG_PRO_LEN] - try: - protocol_id = int(protocol_str) - except ValueError: -- logging.error("recv msg protocol id is invalid %s", protocol_str) -+ logging.debug("recv msg protocol id is invalid %s", protocol_str) - return None - - if protocol_id >= ClientProtocol.PRO_END: -- logging.error("protocol id is invalid") -+ logging.debug("protocol id is invalid") - return None - - try: -@@ -134,7 +134,7 @@ def client_send_and_recv(request_data, data_str_len, protocol): - res_msg_data = res_msg_data.decode() - return res_msg_data - except (OSError, ValueError, UnicodeError): -- logging.error("collect_plugin: client recv res msg error") -+ logging.debug("collect_plugin: client recv res msg error") - finally: - client_socket.close() - -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index cd47919..899d517 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -15,7 +15,7 @@ import time - - from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage - from .stage_window import IoWindow, IoDumpWindow --from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name -+from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name, check_disk_list_validation - from .utils import update_avg_and_check_abnormal - - CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" -@@ -79,6 +79,8 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): - if not disk_list: - report_alarm_fail("Cannot get valid disk name") - -+ disk_list = check_disk_list_validation(disk_list) -+ - disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list - - if not config_disk: -@@ -117,7 +119,10 @@ def main_loop(io_dic, io_data, io_avg_value): - time.sleep(period_time) - - # 采集模块对接,获取周期数据 -- curr_period_data = avg_get_io_data(io_dic) -+ is_success, curr_period_data = avg_get_io_data(io_dic) -+ if not is_success: -+ logging.error(f"{curr_period_data['msg']}") -+ continue - - # 处理周期数据 - reach_size = False -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -index cbdaad4..a67ef45 100644 ---- a/src/python/sentryPlugins/avg_block_io/module_conn.py -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -40,25 +40,25 @@ def avg_is_iocollect_valid(io_dic, config_disk, config_stage): - logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, " - f"disk={config_disk}, stage={config_stage}") - res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) -- return check_result_validation(res, 'check config validation') -+ is_success, data = check_result_validation(res, 'check config validation') -+ if not is_success: -+ report_alarm_fail(f"{data['msg']}") -+ return data - - - def check_result_validation(res, reason): - """check validation of result from sentryCollector""" - if not 'ret' in res or not 'message' in res: -- err_msg = "Failed to {}: Cannot connect to sentryCollector.".format(reason) -- report_alarm_fail(err_msg) -+ return False, {'msg': f"Failed to {reason}: Cannot connect to sentryCollector"} - if res['ret'] != 0: -- err_msg = "Failed to {}: {}".format(reason, Result_Messages[res['ret']]) -- report_alarm_fail(err_msg) -+ return False, {'msg': f"Failed to {reason}: {Result_Messages[res['ret']]}"} - - try: - json_data = json.loads(res['message']) - except json.JSONDecodeError: -- err_msg = f"Failed to {reason}: invalid return message" -- report_alarm_fail(err_msg) -+ return False, {'msg': f"Failed to {reason}: invalid return message"} - -- return json_data -+ return True, json_data - - - def report_alarm_fail(alarm_info): -@@ -120,10 +120,21 @@ def process_report_data(disk_name, rw, io_data): - xalarm_report(1002, MINOR_ALM, ALARM_TYPE_OCCUR, json.dumps(msg)) - - -+def check_disk_list_validation(disk_list): -+ valid_disk_list = [] -+ for disk_name in disk_list: -+ is_success, _ = check_result_validation(get_disk_type(disk_name), "") -+ if not is_success: -+ continue -+ valid_disk_list.append(disk_name) -+ return valid_disk_list -+ -+ - def get_disk_type_by_name(disk_name): - logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}") -- res = get_disk_type(disk_name) -- disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') -+ is_success, disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') -+ if not is_success: -+ report_alarm_fail(f"{disk_type_str['msg']}") - try: - curr_disk_type = int(disk_type_str) - if curr_disk_type not in Disk_Type: --- -2.27.0 diff --git a/hbm_online_repair-add-unload-driver.patch b/hbm_online_repair-add-unload-driver.patch deleted file mode 100644 index dd7a2699e92513a0462cf2fb913c46e65e6e0e3a..0000000000000000000000000000000000000000 --- a/hbm_online_repair-add-unload-driver.patch +++ /dev/null @@ -1,107 +0,0 @@ -From 74f18b0e1fd4f99fa7d1d95e08894b408dcafe51 Mon Sep 17 00:00:00 2001 -From: luckky -Date: Wed, 18 Dec 2024 14:31:04 +0800 -Subject: [PATCH] hbm_online_repair add unload driver - ---- - src/c/hbm_online_repair/hbm_online_repair.c | 47 +++++++++++++-------- - 1 file changed, 29 insertions(+), 18 deletions(-) - -diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c -index 00c9c0b..6783485 100644 ---- a/src/c/hbm_online_repair/hbm_online_repair.c -+++ b/src/c/hbm_online_repair/hbm_online_repair.c -@@ -11,6 +11,8 @@ - #define DEFAULT_LOG_LEVEL LOG_INFO - #define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443 - -+#define DRIVER_COMMAND_LEN 32 -+ - int global_level_setting; - int page_isolation_threshold; - -@@ -57,25 +59,31 @@ int execute_command(const char *command) - return -1; - } - -- ret = WEXITSTATUS(ret); -+ ret = -WEXITSTATUS(ret); - log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret); - return ret; - } - --int load_required_driver(void) -+int handle_driver(char* driver_name, bool load) - { - int ret; -- ret = execute_command("modprobe hisi_mem_ras 2>&1"); -- if (ret < 0) { -- log(LOG_ERROR, "load repair driver failed\n"); -- return ret; -- } -- ret = execute_command("modprobe page_eject 2>&1"); -- if (ret < 0) { -- log(LOG_ERROR, "load page driver failed\n"); -+ char command[DRIVER_COMMAND_LEN]; -+ -+ snprintf(command, DRIVER_COMMAND_LEN, "%s %s 2>&1", load ? "modprobe" : "rmmod", driver_name); -+ ret = execute_command(command); -+ log(ret < 0 ? LOG_ERROR : LOG_DEBUG, "%s %s %s\n", load ? "load" : "unload", driver_name, ret < 0 ? "failed" : "success"); -+ return ret; -+} -+ -+int handle_all_drivers(bool load) -+{ -+ int ret; -+ -+ ret = handle_driver("hisi_mem_ras", load); -+ if (ret < 0) - return ret; -- } -- log(LOG_INFO, "load required driver success\n"); -+ -+ ret = handle_driver("page_eject", load); - return ret; - } - -@@ -116,21 +124,21 @@ int main(int argc, char *argv[]) - - hbm_param_init(); - -- ret = load_required_driver(); -+ ret = handle_all_drivers(true); - if (ret < 0) { -- log(LOG_DEBUG, "load required driver failed\n"); - return ret; - } - - struct ras_events *ras = init_trace_instance(); -- if (!ras) -- return -1; -+ if (!ras) { -+ ret = -1; -+ goto err_unload; -+ } - - ret = toggle_ras_event(ras->tracing, "ras", "non_standard_event", 1); - if (ret < 0) { - log(LOG_WARNING, "unable to enable ras non_standard_event.\n"); -- free(ras); -- return -1; -+ goto err_free; - } - - get_flash_total_size(); -@@ -142,6 +150,9 @@ int main(int argc, char *argv[]) - log(LOG_WARNING, "unable to disable ras non_standard_event.\n"); - } - -+err_free: - free(ras); -+err_unload: -+ handle_all_drivers(false); - return ret; - } --- -2.43.0 - diff --git a/listen-thread-of-collect-module-exits-occasionally.patch b/listen-thread-of-collect-module-exits-occasionally.patch deleted file mode 100644 index b667ea0e055e75dc13b0f9715215fd73993c2d2c..0000000000000000000000000000000000000000 --- a/listen-thread-of-collect-module-exits-occasionally.patch +++ /dev/null @@ -1,104 +0,0 @@ -From 2135b4e41666d99922eda79e9ee04bbc2b557fea Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Wed, 16 Oct 2024 12:13:21 +0800 -Subject: [PATCH] listen thread of collect module exits occasionally - ---- - src/python/sentryCollector/collect_io.py | 4 +--- - src/python/sentryCollector/collect_server.py | 18 ++++++++---------- - 2 files changed, 9 insertions(+), 13 deletions(-) - -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index 5fe1efc..de308b3 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -231,9 +231,7 @@ class CollectIo(): - if self.get_blk_io_hierarchy(disk_name, stage_list) < 0: - continue - self.append_period_lat(disk_name, stage_list) -- -- logging.debug(f"no-lock collect data : {IO_GLOBAL_DATA}") -- -+ - elapsed_time = time.time() - start_time - sleep_time = self.period_time - elapsed_time - if sleep_time < 0: -diff --git a/src/python/sentryCollector/collect_server.py b/src/python/sentryCollector/collect_server.py -index 11d1af0..ad3ac0e 100644 ---- a/src/python/sentryCollector/collect_server.py -+++ b/src/python/sentryCollector/collect_server.py -@@ -64,7 +64,7 @@ class CollectServer(): - self.io_global_data = IO_GLOBAL_DATA - - if len(IO_CONFIG_DATA) == 0: -- logging.error("the collect thread is not started, the data is invalid. ") -+ logging.error("the collect thread is not started, the data is invalid.") - return json.dumps(result_rev) - - period_time = IO_CONFIG_DATA[0] -@@ -75,7 +75,7 @@ class CollectServer(): - stage_list = json.loads(data_struct['stage']) - - if (period < period_time) or (period > period_time * max_save) or (period % period_time): -- logging.error("is_iocollect_valid: period time: %d is invalid", period) -+ logging.error("is_iocollect_valid: period time is invalid, user period: %d, config period_time: %d", period, period_time) - return json.dumps(result_rev) - - for disk_name, stage_info in self.io_global_data.items(): -@@ -96,7 +96,7 @@ class CollectServer(): - self.io_global_data = IO_GLOBAL_DATA - - if len(IO_CONFIG_DATA) == 0: -- logging.error("the collect thread is not started, the data is invalid. ") -+ logging.error("the collect thread is not started, the data is invalid.") - return json.dumps(result_rev) - period_time = IO_CONFIG_DATA[0] - max_save = IO_CONFIG_DATA[1] -@@ -107,11 +107,11 @@ class CollectServer(): - iotype_list = json.loads(data_struct['iotype']) - - if (period < period_time) or (period > period_time * max_save) or (period % period_time): -- logging.error("get_io_data: period time: %d is invalid", period) -+ logging.error("get_io_data: period time is invalid, user period: %d, config period_time: %d", period, period_time) - return json.dumps(result_rev) - - collect_index = period // period_time - 1 -- logging.debug("period: %d, collect_index: %d", period, collect_index) -+ logging.debug("user period: %d, config period_time: %d, collect_index: %d", period, period_time, collect_index) - - for disk_name, stage_info in self.io_global_data.items(): - if disk_name not in disk_list: -@@ -124,7 +124,7 @@ class CollectServer(): - for iotype_name, iotype_info in iotype_info.items(): - if iotype_name not in iotype_list: - continue -- if len(iotype_info) < collect_index: -+ if len(iotype_info) - 1 < collect_index: - continue - result_rev[disk_name][stage_name][iotype_name] = iotype_info[collect_index] - -@@ -250,10 +250,8 @@ class CollectServer(): - except socket.error: - logging.error("server fd create failed") - server_fd = None -- - return server_fd - -- - def server_loop(self): - """main loop""" - logging.info("collect listen thread start") -@@ -277,8 +275,8 @@ class CollectServer(): - self.server_recv(server_fd) - else: - continue -- except socket.error: -- pass -+ except Exception: -+ logging.error('collect listen exception : %s', traceback.format_exc()) - - def stop_thread(self): - self.stop_event.set() --- -2.33.0 - diff --git a/make-debug-msg-clear.patch b/make-debug-msg-clear.patch deleted file mode 100644 index 540750d884d2ca294c9847816f8cf2e83c6e7016..0000000000000000000000000000000000000000 --- a/make-debug-msg-clear.patch +++ /dev/null @@ -1,69 +0,0 @@ -From edbe32637a939d0788bcbde9211a61cfded436bf Mon Sep 17 00:00:00 2001 -From: luckky -Date: Tue, 5 Nov 2024 17:22:27 +0800 -Subject: [PATCH] make debug msg clear -1. Change the page_isolation_threshold default value for 128(kb) to 3355443(kb) -to synchronize the modification of the .mod file. -2. Add specific command info in debug message to make debug message clear. -3. Update the commit of the log level and format of syssentry. -4. Change the interval 180 to 10 to short the restart time. - ---- - config/tasks/hbm_online_repair.mod | 2 +- - .../src/c/hbm_online_repair/hbm_online_repair.c | 8 ++++---- - 2 files changed, 5 insertions(+), 5 deletions(-) - -diff --git a/config/tasks/hbm_online_repair.mod b/config/tasks/hbm_online_repair.mod -index 77dd73e..4dcef43 100644 ---- a/config/tasks/hbm_online_repair.mod -+++ b/config/tasks/hbm_online_repair.mod -@@ -3,7 +3,7 @@ enabled=yes - task_start=/usr/bin/hbm_online_repair - task_stop=kill $pid - type=period --interval=180 -+interval=10 - onstart=yes - env_file=/etc/sysconfig/hbm_online_repair.env - conflict=up -\ No newline at end of file -diff --git a/src/c/hbm_online_repair/hbm_online_repair.c b/src/c/hbm_online_repair/hbm_online_repair.c -index b3b2742..943f201 100644 ---- a/src/c/hbm_online_repair/hbm_online_repair.c -+++ b/src/c/hbm_online_repair/hbm_online_repair.c -@@ -9,7 +9,7 @@ - #include "non-standard-hbm-repair.h" - - #define DEFAULT_LOG_LEVEL LOG_INFO --#define DEFAULT_PAGE_ISOLATION_THRESHOLD 128 -+#define DEFAULT_PAGE_ISOLATION_THRESHOLD 3355443 - - int global_level_setting; - int page_isolation_threshold; -@@ -44,7 +44,7 @@ int execute_command(const char *command) - } - - fgets(buffer, sizeof(buffer), fp); -- log(LOG_DEBUG, "output of command is: %s\n", buffer); -+ log(LOG_DEBUG, "output of command %s is: %s\n", command, buffer); - - ret = pclose(fp); - if (ret < 0) { -@@ -53,12 +53,12 @@ int execute_command(const char *command) - } - - if (!WIFEXITED(ret)) { -- log(LOG_ERROR, "command did not terminate normally\n"); -+ log(LOG_ERROR, "command %s did not terminate normally\n", command); - return -1; - } - - ret = WEXITSTATUS(ret); -- log(LOG_DEBUG, "command exited with status: %d\n", ret); -+ log(LOG_DEBUG, "command %s exited with status: %d\n", command, ret); - return ret; - } - --- -2.43.0 - diff --git a/modify-abnormal-stack-when-the-disk-field-is-not-con.patch b/modify-abnormal-stack-when-the-disk-field-is-not-con.patch deleted file mode 100644 index 5b0084b594536facd0bc8aa999f6f10c9753ea53..0000000000000000000000000000000000000000 --- a/modify-abnormal-stack-when-the-disk-field-is-not-con.patch +++ /dev/null @@ -1,28 +0,0 @@ -From b5794ef43f768d7ea9bbbac450deaabbdcff4997 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Sat, 12 Oct 2024 17:57:01 +0800 -Subject: [PATCH] modify abnormal stack when the disk field is not configured - ---- - src/python/sentryCollector/collect_config.py | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryCollector/collect_config.py b/src/python/sentryCollector/collect_config.py -index 5aa38ec..7ca9898 100644 ---- a/src/python/sentryCollector/collect_config.py -+++ b/src/python/sentryCollector/collect_config.py -@@ -127,9 +127,9 @@ class CollectConfig: - CONF_IO, CONF_IO_MAX_SAVE, CONF_IO_MAX_SAVE_DEFAULT) - result_io_config[CONF_IO_MAX_SAVE] = CONF_IO_MAX_SAVE_DEFAULT - # disk -- disk = io_map_value.get(CONF_IO_DISK).lower() -+ disk = io_map_value.get(CONF_IO_DISK) - if disk: -- disk_str = disk.replace(" ", "") -+ disk_str = disk.lower().replace(" ", "") - pattern = r'^[a-zA-Z0-9-_,]+$' - if not re.match(pattern, disk_str): - logging.warning("module_name = %s section, field = %s is incorrect, use default %s", --- -2.33.0 - diff --git a/modify-logrotate-rule.patch b/modify-logrotate-rule.patch deleted file mode 100644 index be0ddd2553913a2ae54b582b9d64ad4dafab37fe..0000000000000000000000000000000000000000 --- a/modify-logrotate-rule.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 0d3323d13797f3f9d3124e3938787d2573bf249d Mon Sep 17 00:00:00 2001 -From: zhangnan -Date: Mon, 28 Oct 2024 17:32:49 +0800 -Subject: [PATCH] modify logrotate rule - ---- - config/logrotate | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/config/logrotate b/config/logrotate -index f54e7b3..e855118 100644 ---- a/config/logrotate -+++ b/config/logrotate -@@ -1,8 +1,9 @@ - /var/log/sysSentry/*.log { -- nocompress -+ compress - missingok - notifempty - copytruncate - rotate 2 - size +4096k -+ hourly - } --- -2.33.0 - diff --git a/optimize-log-printing.patch b/optimize-log-printing.patch deleted file mode 100644 index 591ae9f69f513c712d6822c1cb6cb3577477bc9e..0000000000000000000000000000000000000000 --- a/optimize-log-printing.patch +++ /dev/null @@ -1,125 +0,0 @@ -From 91c37cec1639c79b2b5ddcd6b173b4d7aa0ce9db Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Wed, 16 Oct 2024 14:51:24 +0800 -Subject: [PATCH] optimize log printing - -Signed-off-by: jinsaihang ---- - src/python/syssentry/alarm.py | 53 ++++++++++++++++--------------- - src/python/syssentry/load_mods.py | 15 +++++---- - 2 files changed, 35 insertions(+), 33 deletions(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index bff527c..c3f2ee1 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -76,16 +76,26 @@ def update_alarm_list(alarm_info: Xalarm): - finally: - alarm_list_lock.release() - --def check_alarm_id_if_number(alarm_id): -- if isinstance(alarm_id, int): -- return True -- else: -+def validate_alarm_id(alarm_id): -+ if alarm_id is None: -+ return False -+ try: -+ alarm_id = int(alarm_id) -+ if MIN_ALARM_ID <= alarm_id <= MAX_ALARM_ID: -+ return True -+ else: -+ return False -+ except ValueError: - return False - --def check_alarm_clear_time_if_positive_integer(alarm_clear_time): -- if isinstance(alarm_clear_time, int) and alarm_clear_time > 0: -- return True -- else: -+def validate_alarm_clear_time(alarm_clear_time): -+ try: -+ alarm_clear_time = int(alarm_clear_time) -+ if alarm_clear_time > 0 and alarm_clear_time <= sys.maxsize: -+ return True -+ else: -+ return False -+ except ValueError: - return False - - def alarm_register(): -@@ -93,34 +103,25 @@ def alarm_register(): - # 初始化告警ID映射字典、告警老化时间字典 - for task_type in TasksMap.tasks_dict: - for task_name in TasksMap.tasks_dict[task_type]: -- logging.info(f"alarm_register: {task_name} is registered") - task = TasksMap.tasks_dict[task_type][task_name] -- alarm_id = task.alarm_id -- if not check_alarm_id_if_number(alarm_id): -- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ if not validate_alarm_id(task.alarm_id): -+ logging.warning(f"Invalid alarm_id {task.alarm_id}: ignore {task_name} alarm") - continue -- if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: -- logging.warning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ if not validate_alarm_clear_time(task.alarm_clear_time): -+ logging.warning(f"Invalid alarm_clear_time {task.alarm_clear_time}: ignore {task_name} alarm") - continue -+ task.alarm_id = int(task.alarm_id) -+ task.alarm_clear_time = int(task.alarm_clear_time) -+ alarm_id = task.alarm_id - alarm_clear_time = task.alarm_clear_time -- if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): -- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -- continue -- try: -- alarm_clear_time = int(alarm_clear_time) -- if alarm_clear_time <= 0: -- raise ValueError("Not a positive integer") -- if alarm_clear_time > sys.maxsize: -- raise ValueError("Exceeds maximum value for int") -- except (ValueError, OverflowError, TypeError) as e: -- logging.warning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -- continue -+ - alarm_list_dict[alarm_id] = [] - task_alarm_id_dict[task_name] = alarm_id - if alarm_id not in alarm_id_clear_time_dict: - alarm_id_clear_time_dict[alarm_id] = alarm_clear_time - else: - alarm_id_clear_time_dict[alarm_id] = max(alarm_clear_time, alarm_id_clear_time_dict[alarm_id]) -+ logging.info(f"alarm_register: {task_name} is registered") - # 注册告警回调 - id_filter = [True] * 128 - clientId = xalarm_register(update_alarm_list, id_filter) -diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py -index f74f165..78db446 100644 ---- a/src/python/syssentry/load_mods.py -+++ b/src/python/syssentry/load_mods.py -@@ -198,15 +198,16 @@ def parse_mod_conf(mod_name, mod_conf): - task.load_enabled = is_enabled - - try: -- task.alarm_id = int(mod_conf.get(CONF_TASK, CONF_ALARM_ID)) -- task.alarm_clear_time = int(mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME)) -- if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): -- raise ValueError("Invalid alarm_id") -- except ValueError: - task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID) -- task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) - except configparser.NoOptionError: -- logging.warning("Unset alarm_clear_time, use 15s as default") -+ task.alarm_id = None -+ logging.warning(f"{mod_name} alarm_id not set, alarm_id is None") -+ -+ if task.alarm_id is not None: -+ try: -+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) -+ except configparser.NoOptionError: -+ logging.warning(f"{mod_name} not set alarm_clear_time, use 15s as default") - - if CONF_ONSTART in mod_conf.options(CONF_TASK): - is_onstart = (mod_conf.get(CONF_TASK, CONF_ONSTART) == 'yes') --- -2.27.0 - diff --git a/optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch b/optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch deleted file mode 100644 index 7bacd26a2344c616961afd043f747322fd77bbc1..0000000000000000000000000000000000000000 --- a/optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch +++ /dev/null @@ -1,77 +0,0 @@ -From cb3d0ea18eed3d48f2753f878d9726f58fe616b1 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Sat, 21 Sep 2024 09:53:42 +0800 -Subject: [PATCH] optimize the handing of cat-cli error msg in cpu_sentry - ---- - src/python/syssentry/cpu_sentry.py | 36 +++++++++++++++++------------- - 1 file changed, 21 insertions(+), 15 deletions(-) - -diff --git a/src/python/syssentry/cpu_sentry.py b/src/python/syssentry/cpu_sentry.py -index 99af127..582d4b3 100644 ---- a/src/python/syssentry/cpu_sentry.py -+++ b/src/python/syssentry/cpu_sentry.py -@@ -26,6 +26,8 @@ CPU_SENTRY_PARAM_CONFIG = "/etc/sysSentry/plugins/cpu_sentry.ini" - # Inspection commands running at the bottom layer - LOW_LEVEL_INSPECT_CMD = "cat-cli" - -+# max length of msg in details -+DETAILS_LOG_MSG_MAX_LEN = 255 - - class CpuSentry: - """ -@@ -94,22 +96,10 @@ class CpuSentry: - self.send_result["details"]["msg"] = "cpu_sentry task is killed!" - return - -- if "ERROR" in stdout: -- self.send_result["result"] = ResultLevel.FAIL -- self.send_result["details"]["code"] = 1004 -- -- # Remove ANSI escape sequences -- error_info = stdout.split("\n")[0] -- if error_info.startswith("\u001b"): -- ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' -- error_info = re.sub(ansi_escape, '', error_info) -- -- self.send_result["details"]["msg"] = error_info -- return -- - out_split = stdout.split("\n") -- isolated_cores_number = 0 -+ isolated_cores_number = -1 - found_fault_cores_list = [] -+ error_msg_list = [] - for out_line_i in out_split: - if "handle_patrol_result: Found fault cores" in out_line_i: - cores_number_tmp = out_line_i.split("Found fault cores:")[1] -@@ -121,9 +111,25 @@ class CpuSentry: - elif out_line_i.startswith(''): - self.send_result["details"]["isolated_cpu_list"] = out_line_i.split(':')[1] - break -+ elif "ERROR" in out_line_i: -+ logging.error("[cat-cli error] - %s\n", out_line_i) -+ error_msg_list.append(out_line_i) - - found_fault_cores_number = len(set(found_fault_cores_list)) -- if found_fault_cores_number == 0: -+ if isolated_cores_number == -1: -+ self.send_result["result"] = ResultLevel.FAIL -+ self.send_result["details"]["code"] = 1004 -+ -+ send_error_msg = "" -+ # Remove ANSI escape sequences -+ for error_info in error_msg_list: -+ if error_info.startswith("\u001b"): -+ ansi_escape = r'\x1b\[([0-9]+)(;[0-9]+)*([A-Za-z])' -+ error_info = re.sub(ansi_escape, '', error_info) -+ if len(send_error_msg) + len(error_info) < DETAILS_LOG_MSG_MAX_LEN: -+ send_error_msg += error_info -+ self.send_result["details"]["msg"] = send_error_msg -+ elif found_fault_cores_number == 0: - self.send_result["details"]["code"] = 0 - self.send_result["result"] = ResultLevel.PASS - elif 0 in found_fault_cores_list: --- -2.27.0 - diff --git a/over-threshold-should-be-warn-level-log-in-cat-cli.patch b/over-threshold-should-be-warn-level-log-in-cat-cli.patch deleted file mode 100644 index 53a2739ee565ec18854fd0144b2c15007d32266e..0000000000000000000000000000000000000000 --- a/over-threshold-should-be-warn-level-log-in-cat-cli.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 3dda5f68db38b63b1e45a28558a9fcd341c1f945 Mon Sep 17 00:00:00 2001 -From: jwolf <523083921@qq.com> -Date: Fri, 20 Sep 2024 15:59:40 +0800 -Subject: [PATCH] should be warn-level log - ---- - src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c -index 9f8d80c..f4f3172 100644 ---- a/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c -+++ b/src/c/catcli/catlib/plugin/cpu_patrol/cpu_patrol_result.c -@@ -23,7 +23,7 @@ static cat_return_t insert_core_to_list(core_list_st *core_list, int coreid) - return CAT_OK; - } - if ((core_list->current_nums == MAX_ISOLATE_CORES_PER_PATROL) || (coreid < 0)) { -- CAT_LOG_E("Insert error, core id(%d)", coreid); -+ CAT_LOG_W("Too many cores need to isolate,do not isolate core(%d)", coreid); - return CAT_ERR; - } - --- -2.27.0 - diff --git a/param-must-be-integer.patch b/param-must-be-integer.patch deleted file mode 100644 index d9b7ac17ff7562ac26faaf2ac33e190777779844..0000000000000000000000000000000000000000 --- a/param-must-be-integer.patch +++ /dev/null @@ -1,23 +0,0 @@ -From 34febf57060060d1f8262941af49e3beeb1f7f5d Mon Sep 17 00:00:00 2001 -From: jwolf <523083921@qq.com> -Date: Fri, 30 Aug 2024 16:59:56 +0800 -Subject: [PATCH] param must be integer - ---- - src/c/catcli/catlib/cli_param_checker.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c -index 5b38402..71edf17 100644 ---- a/src/c/catcli/catlib/cli_param_checker.c -+++ b/src/c/catcli/catlib/cli_param_checker.c -@@ -17,6 +17,7 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r - if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { - strncpy(errs->patrol_module_err, - "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); -+ p_request_body->cpu_utility = 0; - } else { - p_request_body->cpu_utility = (int)cpu_utility; - } --- -Gitee diff --git a/precise-alarm-query-time.patch b/precise-alarm-query-time.patch deleted file mode 100644 index f69a2d4c9ee3ea1d5fe679050580e4c49023f783..0000000000000000000000000000000000000000 --- a/precise-alarm-query-time.patch +++ /dev/null @@ -1,91 +0,0 @@ -From 7fa9e80531bb3d4fa587e5fb7a99e3af59feda7e Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Sat, 12 Oct 2024 16:51:37 +0800 -Subject: [PATCH] precise alarm query time - -Signed-off-by: jinsaihang ---- - sysSentry-1.0.2/src/python/syssentry/alarm.py | 25 +++++++++++++++++-- - .../src/python/syssentry/load_mods.py | 3 ++- - 2 files changed, 25 insertions(+), 3 deletions(-) - -diff --git a/src/python/syssentry/alarm.py b/src/python/syssentry/alarm.py -index 43c1065..d012901 100644 ---- a/src/python/syssentry/alarm.py -+++ b/src/python/syssentry/alarm.py -@@ -76,6 +76,18 @@ def update_alarm_list(alarm_info: Xalarm): - finally: - alarm_list_lock.release() - -+def check_alarm_id_if_number(alarm_id): -+ if isinstance(alarm_id, int): -+ return True -+ else: -+ return False -+ -+def check_alarm_clear_time_if_positive_integer(alarm_clear_time): -+ if isinstance(alarm_clear_time, int) and alarm_clear_time > 0: -+ return True -+ else: -+ return False -+ - def alarm_register(): - logging.debug(f"alarm_register: enter") - # 初始化告警ID映射字典、告警老化时间字典 -@@ -84,10 +96,16 @@ def alarm_register(): - logging.info(f"alarm_register: {task_name} is registered") - task = TasksMap.tasks_dict[task_type][task_name] - alarm_id = task.alarm_id -+ if not check_alarm_id_if_number(alarm_id): -+ logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") -+ continue - if alarm_id < MIN_ALARM_ID or alarm_id > MAX_ALARM_ID: - logging.warnning(f"Invalid alarm_id {alarm_id}: ignore {task_name} alarm") - continue - alarm_clear_time = task.alarm_clear_time -+ if not check_alarm_clear_time_if_positive_integer(alarm_clear_time): -+ logging.warnning(f"Invalid alarm_clear_time {alarm_clear_time}: ignore {task_name} alarm") -+ continue - try: - alarm_clear_time = int(alarm_clear_time) - if alarm_clear_time <= 0: -@@ -119,6 +137,9 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - logging.debug("task_name does not exist") - return [] - alarm_id = task_alarm_id_dict[task_name] -+ clear_time = alarm_id_clear_time_dict[alarm_id] -+ if clear_time < int(time_range): -+ return [] - if alarm_id not in alarm_list_dict: - logging.debug("alarm_id does not exist") - return [] -@@ -126,10 +147,10 @@ def get_alarm_result(task_name: str, time_range: int, detailed: bool) -> List[Di - logging.debug(f"get_alarm_result: alarm_list of {alarm_id} has {len(alarm_list)} elements") - # clear alarm_info older than clear time threshold - stop_index = -1 -- timestamp = int(datetime.now().timestamp()) -+ timestamp = datetime.now().timestamp() - for i in range(len(alarm_list)): - logging.debug(f"timestamp, alarm_list[{i}].timestamp: {timestamp}, {xalarm_gettime(alarm_list[i])}") -- if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > int(time_range): -+ if timestamp - (xalarm_gettime(alarm_list[i])) / MILLISECONDS_UNIT_SECONDS > time_range: - stop_index = i - break - if stop_index >= 0: -diff --git a/src/python/syssentry/load_mods.py b/src/python/syssentry/load_mods.py -index 7daf17d..f74f165 100644 ---- a/src/python/syssentry/load_mods.py -+++ b/src/python/syssentry/load_mods.py -@@ -203,7 +203,8 @@ def parse_mod_conf(mod_name, mod_conf): - if not (MIN_ALARM_ID <= task.alarm_id <= MAX_ALARM_ID): - raise ValueError("Invalid alarm_id") - except ValueError: -- logging.warning("Invalid alarm_id") -+ task.alarm_id = mod_conf.get(CONF_TASK, CONF_ALARM_ID) -+ task.alarm_clear_time = mod_conf.get(CONF_TASK, CONF_ALARM_CLEAR_TIME) - except configparser.NoOptionError: - logging.warning("Unset alarm_clear_time, use 15s as default") - --- -2.27.0 - diff --git a/refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch b/refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch deleted file mode 100644 index a0be9483c2ece064bd9128d07488641bb8c7829f..0000000000000000000000000000000000000000 --- a/refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch +++ /dev/null @@ -1,566 +0,0 @@ -From d5cb115a97e27c8270e8fb385fb3914af9ba3c34 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Tue, 15 Oct 2024 10:00:07 +0000 -Subject: [PATCH] refactor config.py and bugfix uncorrect slow io report - -Signed-off-by: gaoruoshu ---- - .../avg_block_io/avg_block_io.py | 155 ++----------- - .../sentryPlugins/avg_block_io/config.py | 208 ++++++++++++++++++ - .../sentryPlugins/avg_block_io/module_conn.py | 9 +- - .../sentryPlugins/avg_block_io/utils.py | 72 ------ - 4 files changed, 238 insertions(+), 206 deletions(-) - create mode 100644 src/python/sentryPlugins/avg_block_io/config.py - -diff --git a/src/python/sentryPlugins/avg_block_io/avg_block_io.py b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -index f3ade09..cd47919 100644 ---- a/src/python/sentryPlugins/avg_block_io/avg_block_io.py -+++ b/src/python/sentryPlugins/avg_block_io/avg_block_io.py -@@ -13,132 +13,13 @@ import signal - import configparser - import time - -+from .config import read_config_log, read_config_common, read_config_algorithm, read_config_latency, read_config_iodump, read_config_stage - from .stage_window import IoWindow, IoDumpWindow - from .module_conn import avg_is_iocollect_valid, avg_get_io_data, report_alarm_fail, process_report_data, sig_handler, get_disk_type_by_name --from .utils import update_avg_and_check_abnormal, get_log_level, get_section_value --from sentryCollector.collect_plugin import Disk_Type -+from .utils import update_avg_and_check_abnormal - - CONFIG_FILE = "/etc/sysSentry/plugins/avg_block_io.ini" - --def log_invalid_keys(not_in_list, keys_name, config_list, default_list): -- """print invalid log""" -- if config_list and not_in_list: -- logging.warning("{} in common.{} are not valid, set {}={}".format(not_in_list, keys_name, keys_name, default_list)) -- elif config_list == ["default"]: -- logging.warning("Default {} use {}".format(keys_name, default_list)) -- -- --def read_config_common(config): -- """read config file, get [common] section value""" -- if not config.has_section("common"): -- report_alarm_fail("Cannot find common section in config file") -- -- try: -- disk_name = config.get("common", "disk") -- disk = [] if disk_name == "default" else disk_name.split(",") -- except configparser.NoOptionError: -- disk = [] -- logging.warning("Unset common.disk, set to default") -- -- try: -- stage_name = config.get("common", "stage") -- stage = [] if stage_name == "default" else stage_name.split(",") -- except configparser.NoOptionError: -- stage = [] -- logging.warning("Unset common.stage, set to default") -- -- if len(disk) > 10: -- logging.warning("Too many common.disks, record only max 10 disks") -- disk = disk[:10] -- -- try: -- iotype_name = config.get("common", "iotype").split(",") -- iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] -- err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] -- -- if err_iotype: -- report_alarm_fail("Invalid common.iotype config") -- -- except configparser.NoOptionError: -- iotype_list = ["read", "write"] -- logging.warning("Unset common.iotype, set to read,write") -- -- try: -- period_time = int(config.get("common", "period_time")) -- if not (1 <= period_time <= 300): -- raise ValueError("Invalid period_time") -- except ValueError: -- report_alarm_fail("Invalid common.period_time") -- except configparser.NoOptionError: -- period_time = 1 -- logging.warning("Unset common.period_time, use 1s as default") -- -- return period_time, disk, stage, iotype_list -- -- --def read_config_algorithm(config): -- """read config file, get [algorithm] section value""" -- if not config.has_section("algorithm"): -- report_alarm_fail("Cannot find algorithm section in config file") -- -- try: -- win_size = int(config.get("algorithm", "win_size")) -- if not (1 <= win_size <= 300): -- raise ValueError("Invalid algorithm.win_size") -- except ValueError: -- report_alarm_fail("Invalid algorithm.win_size config") -- except configparser.NoOptionError: -- win_size = 30 -- logging.warning("Unset algorithm.win_size, use 30 as default") -- -- try: -- win_threshold = int(config.get("algorithm", "win_threshold")) -- if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: -- raise ValueError("Invalid algorithm.win_threshold") -- except ValueError: -- report_alarm_fail("Invalid algorithm.win_threshold config") -- except configparser.NoOptionError: -- win_threshold = 6 -- logging.warning("Unset algorithm.win_threshold, use 6 as default") -- -- return win_size, win_threshold -- -- --def read_config_latency(config): -- """read config file, get [latency_xxx] section value""" -- common_param = {} -- for type_name in Disk_Type: -- section_name = f"latency_{Disk_Type[type_name]}" -- if not config.has_section(section_name): -- report_alarm_fail(f"Cannot find {section_name} section in config file") -- -- common_param[Disk_Type[type_name]] = get_section_value(section_name, config) -- return common_param -- -- --def read_config_iodump(config): -- """read config file, get [iodump] section value""" -- common_param = {} -- section_name = "iodump" -- if not config.has_section(section_name): -- report_alarm_fail(f"Cannot find {section_name} section in config file") -- -- return get_section_value(section_name, config) -- -- --def read_config_stage(config, stage, iotype_list, curr_disk_type): -- """read config file, get [STAGE_NAME_diskType] section value""" -- res = {} -- section_name = f"{stage}_{curr_disk_type}" -- if not config.has_section(section_name): -- return res -- -- for key in config[section_name]: -- if config[stage][key].isdecimal(): -- res[key] = int(config[stage][key]) -- -- return res -- - - def init_io_win(io_dic, config, common_param): - """initialize windows of latency, iodump, and dict of avg_value""" -@@ -192,24 +73,33 @@ def get_valid_disk_stage_list(io_dic, config_disk, config_stage): - disk_list = [key for key in all_disk_set if key in config_disk] - not_in_disk_list = [key for key in config_disk if key not in all_disk_set] - -+ if not config_disk and not not_in_disk_list: -+ disk_list = [key for key in all_disk_set] -+ -+ if not disk_list: -+ report_alarm_fail("Cannot get valid disk name") -+ -+ disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list -+ -+ if not config_disk: -+ logging.info(f"Default common.disk using disk={disk_list}") -+ elif sorted(disk_list) != sorted(config_disk): -+ logging.warning(f"Set common.disk to {disk_list}") -+ - stage_list = [key for key in all_stage_set if key in config_stage] - not_in_stage_list = [key for key in config_stage if key not in all_stage_set] - - if not_in_stage_list: - report_alarm_fail(f"Invalid common.stage_list config, cannot set {not_in_stage_list}") - -- if not config_disk and not not_in_disk_list: -- disk_list = [key for key in all_disk_set] -- -- if not config_stage and not not_in_stage_list: -+ if not config_stage: - stage_list = [key for key in all_stage_set] - -- disk_list = disk_list[:10] if len(disk_list) > 10 else disk_list -- -- if not stage_list or not disk_list: -- report_alarm_fail("Cannot get valid disk name or stage name.") -+ if not stage_list: -+ report_alarm_fail("Cannot get valid stage name.") - -- log_invalid_keys(not_in_disk_list, 'disk', config_disk, disk_list) -+ if not config_stage: -+ logging.info(f"Default common.stage using stage={stage_list}") - - return disk_list, stage_list - -@@ -254,9 +144,8 @@ def main(): - signal.signal(signal.SIGINT, sig_handler) - signal.signal(signal.SIGTERM, sig_handler) - -- log_level = get_log_level(CONFIG_FILE) -+ log_level = read_config_log(CONFIG_FILE) - log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s" -- - logging.basicConfig(level=log_level, format=log_format) - - # 初始化配置读取 -@@ -274,6 +163,8 @@ def main(): - # 采集模块对接,is_iocollect_valid() - io_dic["disk_list"], io_dic["stage_list"] = get_valid_disk_stage_list(io_dic, disk, stage) - -+ logging.debug(f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}") -+ - if "bio" not in io_dic["stage_list"]: - report_alarm_fail("Cannot run avg_block_io without bio stage") - -diff --git a/src/python/sentryPlugins/avg_block_io/config.py b/src/python/sentryPlugins/avg_block_io/config.py -new file mode 100644 -index 0000000..c8f45ce ---- /dev/null -+++ b/src/python/sentryPlugins/avg_block_io/config.py -@@ -0,0 +1,208 @@ -+import configparser -+import logging -+import os -+ -+from .module_conn import report_alarm_fail -+from sentryCollector.collect_plugin import Disk_Type -+ -+ -+CONF_LOG = 'log' -+CONF_LOG_LEVEL = 'level' -+LogLevel = { -+ "debug": logging.DEBUG, -+ "info": logging.INFO, -+ "warning": logging.WARNING, -+ "error": logging.ERROR, -+ "critical": logging.CRITICAL -+} -+ -+CONF_COMMON = 'common' -+CONF_COMMON_DISK = 'disk' -+CONF_COMMON_STAGE = 'stage' -+CONF_COMMON_IOTYPE = 'iotype' -+CONF_COMMON_PER_TIME = 'period_time' -+ -+CONF_ALGO = 'algorithm' -+CONF_ALGO_SIZE = 'win_size' -+CONF_ALGO_THRE = 'win_threshold' -+ -+CONF_LATENCY = 'latency_{}' -+CONF_IODUMP = 'iodump' -+ -+ -+DEFAULT_PARAM = { -+ CONF_LOG: { -+ CONF_LOG_LEVEL: 'info' -+ }, CONF_COMMON: { -+ CONF_COMMON_DISK: 'default', -+ CONF_COMMON_STAGE: 'default', -+ CONF_COMMON_IOTYPE: 'read,write', -+ CONF_COMMON_PER_TIME: 1 -+ }, CONF_ALGO: { -+ CONF_ALGO_SIZE: 30, -+ CONF_ALGO_THRE: 6 -+ }, 'latency_nvme_ssd': { -+ 'read_avg_lim': 300, -+ 'write_avg_lim': 300, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 500, -+ 'write_tot_lim': 500, -+ }, 'latency_sata_ssd' : { -+ 'read_avg_lim': 10000, -+ 'write_avg_lim': 10000, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 50000, -+ 'write_tot_lim': 50000, -+ }, 'latency_sata_hdd' : { -+ 'read_avg_lim': 15000, -+ 'write_avg_lim': 15000, -+ 'read_avg_time': 3, -+ 'write_avg_time': 3, -+ 'read_tot_lim': 50000, -+ 'write_tot_lim': 50000 -+ }, CONF_IODUMP: { -+ 'read_iodump_lim': 0, -+ 'write_iodump_lim': 0 -+ } -+} -+ -+ -+def get_section_value(section_name, config): -+ common_param = {} -+ config_sec = config[section_name] -+ for config_key in DEFAULT_PARAM[section_name]: -+ if config_key in config_sec: -+ if not config_sec[config_key].isdecimal(): -+ report_alarm_fail(f"Invalid {section_name}.{config_key} config.") -+ common_param[config_key] = int(config_sec[config_key]) -+ else: -+ common_param[config_key] = DEFAULT_PARAM[section_name][config_key] -+ logging.warning(f"Unset {section_name}.{config_key} in config file, use {common_param[config_key]} as default") -+ return common_param -+ -+ -+def read_config_log(filename): -+ """read config file, get [log] section value""" -+ default_log_level = DEFAULT_PARAM[CONF_LOG][CONF_LOG_LEVEL] -+ if not os.path.exists(filename): -+ return LogLevel.get(default_log_level) -+ -+ config = configparser.ConfigParser() -+ config.read(filename) -+ -+ log_level = config.get(CONF_LOG, CONF_LOG_LEVEL, fallback=default_log_level) -+ if log_level.lower() in LogLevel: -+ return LogLevel.get(log_level.lower()) -+ return LogLevel.get(default_log_level) -+ -+ -+def read_config_common(config): -+ """read config file, get [common] section value""" -+ if not config.has_section(CONF_COMMON): -+ report_alarm_fail(f"Cannot find {CONF_COMMON} section in config file") -+ -+ try: -+ disk_name = config.get(CONF_COMMON, CONF_COMMON_DISK).lower() -+ disk = [] if disk_name == "default" else disk_name.split(",") -+ except configparser.NoOptionError: -+ disk = [] -+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_DISK}, set to default") -+ -+ try: -+ stage_name = config.get(CONF_COMMON, CONF_COMMON_STAGE).lower() -+ stage = [] if stage_name == "default" else stage_name.split(",") -+ except configparser.NoOptionError: -+ stage = [] -+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_STAGE}, set to default") -+ -+ if len(disk) > 10: -+ logging.warning(f"Too many {CONF_COMMON}.disks, record only max 10 disks") -+ disk = disk[:10] -+ -+ try: -+ iotype_name = config.get(CONF_COMMON, CONF_COMMON_IOTYPE).lower().split(",") -+ iotype_list = [rw.lower() for rw in iotype_name if rw.lower() in ['read', 'write']] -+ err_iotype = [rw.lower() for rw in iotype_name if rw.lower() not in ['read', 'write']] -+ -+ if err_iotype: -+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_IOTYPE} config") -+ -+ except configparser.NoOptionError: -+ iotype_list = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_IOTYPE] -+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_IOTYPE}, use {iotupe_list} as default") -+ -+ try: -+ period_time = int(config.get(CONF_COMMON, CONF_COMMON_PER_TIME)) -+ if not (1 <= period_time <= 300): -+ raise ValueError("Invalid period_time") -+ except ValueError: -+ report_alarm_fail(f"Invalid {CONF_COMMON}.{CONF_COMMON_PER_TIME}") -+ except configparser.NoOptionError: -+ period_time = DEFAULT_PARAM[CONF_COMMON][CONF_COMMON_PER_TIME] -+ logging.warning(f"Unset {CONF_COMMON}.{CONF_COMMON_PER_TIME}, use {period_time} as default") -+ -+ return period_time, disk, stage, iotype_list -+ -+ -+def read_config_algorithm(config): -+ """read config file, get [algorithm] section value""" -+ if not config.has_section(CONF_ALGO): -+ report_alarm_fail(f"Cannot find {CONF_ALGO} section in config file") -+ -+ try: -+ win_size = int(config.get(CONF_ALGO, CONF_ALGO_SIZE)) -+ if not (1 <= win_size <= 300): -+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE}") -+ except ValueError: -+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_SIZE} config") -+ except configparser.NoOptionError: -+ win_size = DEFAULT_PARAM[CONF_ALGO][CONF_ALGO_SIZE] -+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_SIZE}, use {win_size} as default") -+ -+ try: -+ win_threshold = int(config.get(CONF_ALGO, CONF_ALGO_THRE)) -+ if win_threshold < 1 or win_threshold > 300 or win_threshold > win_size: -+ raise ValueError(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE}") -+ except ValueError: -+ report_alarm_fail(f"Invalid {CONF_ALGO}.{CONF_ALGO_THRE} config") -+ except configparser.NoOptionError: -+ win_threshold = DEFAULT_PARAM[CONF_ALGO]['win_threshold'] -+ logging.warning(f"Unset {CONF_ALGO}.{CONF_ALGO_THRE}, use {win_threshold} as default") -+ -+ return win_size, win_threshold -+ -+ -+def read_config_latency(config): -+ """read config file, get [latency_xxx] section value""" -+ common_param = {} -+ for type_name in Disk_Type: -+ section_name = CONF_LATENCY.format(Disk_Type[type_name]) -+ if not config.has_section(section_name): -+ report_alarm_fail(f"Cannot find {section_name} section in config file") -+ -+ common_param[Disk_Type[type_name]] = get_section_value(section_name, config) -+ return common_param -+ -+ -+def read_config_iodump(config): -+ """read config file, get [iodump] section value""" -+ if not config.has_section(CONF_IODUMP): -+ report_alarm_fail(f"Cannot find {CONF_IODUMP} section in config file") -+ -+ return get_section_value(CONF_IODUMP, config) -+ -+ -+def read_config_stage(config, stage, iotype_list, curr_disk_type): -+ """read config file, get [STAGE_NAME_diskType] section value""" -+ res = {} -+ section_name = f"{stage}_{curr_disk_type}" -+ if not config.has_section(section_name): -+ return res -+ -+ for key in config[section_name]: -+ if config[stage][key].isdecimal(): -+ res[key] = int(config[stage][key]) -+ -+ return res -diff --git a/src/python/sentryPlugins/avg_block_io/module_conn.py b/src/python/sentryPlugins/avg_block_io/module_conn.py -index 8d6f429..cbdaad4 100644 ---- a/src/python/sentryPlugins/avg_block_io/module_conn.py -+++ b/src/python/sentryPlugins/avg_block_io/module_conn.py -@@ -29,12 +29,16 @@ def sig_handler(signum, _f): - - def avg_get_io_data(io_dic): - """get_io_data from sentryCollector""" -+ logging.debug(f"send to sentryCollector get_io_data: period={io_dic['period_time']}, " -+ f"disk={io_dic['disk_list']}, stage={io_dic['stage_list']}, iotype={io_dic['iotype_list']}") - res = get_io_data(io_dic["period_time"], io_dic["disk_list"], io_dic["stage_list"], io_dic["iotype_list"]) - return check_result_validation(res, 'get io data') - - - def avg_is_iocollect_valid(io_dic, config_disk, config_stage): - """is_iocollect_valid from sentryCollector""" -+ logging.debug(f"send to sentryCollector is_iocollect_valid: period={io_dic['period_time']}, " -+ f"disk={config_disk}, stage={config_stage}") - res = is_iocollect_valid(io_dic["period_time"], config_disk, config_stage) - return check_result_validation(res, 'check config validation') - -@@ -79,7 +83,7 @@ def process_report_data(disk_name, rw, io_data): - # io press - ctrl_stage = ['throtl', 'wbt', 'iocost', 'bfq'] - for stage_name in ctrl_stage: -- abnormal, abnormal_list = is_abnormal((disk_name, 'bio', rw), io_data) -+ abnormal, abnormal_list = is_abnormal((disk_name, stage_name, rw), io_data) - if not abnormal: - continue - msg["reason"] = "IO press" -@@ -117,6 +121,7 @@ def process_report_data(disk_name, rw, io_data): - - - def get_disk_type_by_name(disk_name): -+ logging.debug(f"send to sentryCollector get_disk_type: disk_name={disk_name}") - res = get_disk_type(disk_name) - disk_type_str = check_result_validation(get_disk_type(disk_name), f'Invalid disk type {disk_name}') - try: -@@ -126,4 +131,4 @@ def get_disk_type_by_name(disk_name): - except ValueError: - report_alarm_fail(f"Failed to get disk type for {disk_name}") - -- return Disk_Type[curr_disk_type] -\ No newline at end of file -+ return Disk_Type[curr_disk_type] -diff --git a/src/python/sentryPlugins/avg_block_io/utils.py b/src/python/sentryPlugins/avg_block_io/utils.py -index c381c07..1bfd4e8 100644 ---- a/src/python/sentryPlugins/avg_block_io/utils.py -+++ b/src/python/sentryPlugins/avg_block_io/utils.py -@@ -8,84 +8,12 @@ - # IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR - # PURPOSE. - # See the Mulan PSL v2 for more details. --import configparser - import logging - import os - - AVG_VALUE = 0 - AVG_COUNT = 1 - --CONF_LOG = 'log' --CONF_LOG_LEVEL = 'level' --LogLevel = { -- "debug": logging.DEBUG, -- "info": logging.INFO, -- "warning": logging.WARNING, -- "error": logging.ERROR, -- "critical": logging.CRITICAL --} -- -- --DEFAULT_PARAM = { -- 'latency_nvme_ssd': { -- 'read_avg_lim': 300, -- 'write_avg_lim': 300, -- 'read_avg_time': 3, -- 'write_avg_time': 3, -- 'read_tot_lim': 500, -- 'write_tot_lim': 500, -- }, 'latency_sata_ssd' : { -- 'read_avg_lim': 10000, -- 'write_avg_lim': 10000, -- 'read_avg_time': 3, -- 'write_avg_time': 3, -- 'read_tot_lim': 50000, -- 'write_tot_lim': 50000, -- }, 'latency_sata_hdd' : { -- 'read_avg_lim': 15000, -- 'write_avg_lim': 15000, -- 'read_avg_time': 3, -- 'write_avg_time': 3, -- 'read_tot_lim': 50000, -- 'write_tot_lim': 50000 -- }, 'iodump': { -- 'read_iodump_lim': 0, -- 'write_iodump_lim': 0 -- } --} -- -- --def get_section_value(section_name, config): -- common_param = {} -- config_sec = config[section_name] -- for config_key in DEFAULT_PARAM[section_name]: -- if config_key in config_sec: -- if not config_sec[config_key].isdecimal(): -- report_alarm_fail(f"Invalid {section_name}.{config_key} config.") -- common_param[config_key] = int(config_sec[config_key]) -- else: -- logging.warning(f"Unset {section_name}.{config_key} in config file, use {DEFAULT_PARAM[section_name][config_key]} as default") -- common_param[config_key] = DEFAULT_PARAM[section_name][config_key] -- return common_param -- -- --def get_log_level(filename): -- if not os.path.exists(filename): -- return logging.INFO -- -- try: -- config = configparser.ConfigParser() -- config.read(filename) -- if not config.has_option(CONF_LOG, CONF_LOG_LEVEL): -- return logging.INFO -- log_level = config.get(CONF_LOG, CONF_LOG_LEVEL) -- -- if log_level.lower() in LogLevel: -- return LogLevel.get(log_level.lower()) -- return logging.INFO -- except configparser.Error: -- return logging.INFO -- - - def get_nested_value(data, keys): - """get data from nested dict""" --- -2.27.0 diff --git a/set-logrotate.patch b/set-logrotate.patch deleted file mode 100644 index 47c507a818159a1015773bcf8074b833309412e3..0000000000000000000000000000000000000000 --- a/set-logrotate.patch +++ /dev/null @@ -1,92 +0,0 @@ -From d74076f4b772822de4f5bee1c8a778dd6b1771d2 Mon Sep 17 00:00:00 2001 -From: shixuantong -Date: Wed, 11 Dec 2024 15:25:33 +0800 -Subject: [PATCH] set logrotate - ---- - config/logrotate | 9 --------- - config/logrotate-sysSentry.conf | 35 +++++++++++++++++++++++++++++++++ - src/sh/logrotate-sysSentry.cron | 13 ++++++++++++ - 3 files changed, 48 insertions(+), 9 deletions(-) - delete mode 100644 config/logrotate - create mode 100644 config/logrotate-sysSentry.conf - create mode 100644 src/sh/logrotate-sysSentry.cron - -diff --git a/config/logrotate b/config/logrotate -deleted file mode 100644 -index 3dc77f5..0000000 ---- a/config/logrotate -+++ /dev/null -@@ -1,9 +0,0 @@ --/var/log/sysSentry/*.log { -- compress -- missingok -- notifempty -- copytruncate -- rotate 2 -- size +4096k -- hourly --} -diff --git a/config/logrotate-sysSentry.conf b/config/logrotate-sysSentry.conf -new file mode 100644 -index 0000000..cf5f994 ---- /dev/null -+++ b/config/logrotate-sysSentry.conf -@@ -0,0 +1,35 @@ -+# keep 4 hours worth of backlogs -+rotate 4 -+ -+# create new (empty) log files after rotating old ones -+create -+ -+# compress log files -+compress -+ -+# if a log file does not exist, go no to the next one without an error msg -+missingok -+ -+# do not rotate the log if it is empty -+notifempty -+ -+copytruncate -+ -+# ignore any following matches of a log file. -+# Note that order is significant, it will not overwrite and take the first match. -+# require logrotate >= 3.21.0 -+ignoreduplicates -+ -+/var/log/sysSentry/sysSentry.log { -+ rotate 8 -+ size +4096k -+} -+ -+/var/log/sysSentry/cpu_sentry.log { -+ rotate 2 -+ size +2048k -+} -+ -+/var/log/sysSentry/*.log { -+ size +4096k -+} -diff --git a/src/sh/logrotate-sysSentry.cron b/src/sh/logrotate-sysSentry.cron -new file mode 100644 -index 0000000..64d02f9 ---- /dev/null -+++ b/src/sh/logrotate-sysSentry.cron -@@ -0,0 +1,13 @@ -+#!/bin/sh -+ -+TMPF=`mktemp /tmp/logrotate-sysSentry.XXXXXXXXX` -+ -+/usr/sbin/logrotate /etc/logrotate-sysSentry.conf -v --log=$TMPF -s /var/lib/logrotate-syssentry/logrotate.status -+EXITVALUE=$? -+if [ $EXITVALUE != 0 ]; then -+ /bin/logger -t logrotate "ALERT exited abnormally with [$EXITVALUE], for details, see /var/log/sysSentry/logrotate.log" -+ /bin/logger -t logrotate -f $TMPF -+fi -+rm -rf $TMPF -+rm -rf /var/lib/logrotate-syssentry/logrotate.status -+exit $EXITVALUE --- -2.27.0 - diff --git a/setting-parameters-must-be-integer.patch b/setting-parameters-must-be-integer.patch deleted file mode 100644 index 20e11f7aeb32d59f0640aaf63926878ca37844e9..0000000000000000000000000000000000000000 --- a/setting-parameters-must-be-integer.patch +++ /dev/null @@ -1,38 +0,0 @@ -From 4abad77067557234d938de3914094c80181030c1 Mon Sep 17 00:00:00 2001 -From: jwolf <523083921@qq.com> -Date: Fri, 30 Aug 2024 14:30:46 +0800 -Subject: [PATCH] must be integer - ---- - c/catcli/catlib/cli_param_checker.c | 6 ++++-- - 1 file changed, 4 insertions(+), 2 deletions(-) - -diff --git a/src/c/catcli/catlib/cli_param_checker.c b/src/c/catcli/catlib/cli_param_checker.c -index e400428..5b38402 100644 ---- a/src/c/catcli/catlib/cli_param_checker.c -+++ b/src/c/catcli/catlib/cli_param_checker.c -@@ -17,8 +17,9 @@ void checkset_cpu_usage_percentage(char *getopt_optarg, catcli_request_body *p_r - if (cpu_utility <= 0 || cpu_utility > CPU_USAGE_PERCENTAGE_MAX || strchr(getopt_optarg, '.') != NULL) { - strncpy(errs->patrol_module_err, - "\"cpu_utility \" must be an integer greater in the range (0,100],correct \"-u, --cpu_utility\"\n", MAX_ERR_LEN); -+ } else { -+ p_request_body->cpu_utility = (int)cpu_utility; - } -- p_request_body->cpu_utility = (int)cpu_utility; - } - - void checkset_cpulist(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) -@@ -73,8 +74,9 @@ void checkset_patrol_time(char *getopt_optarg, catcli_request_body *p_request_bo - strncpy(errs->patrol_time_err, - "\"patrol_second\" must be a number in the range of (0,INT_MAX] ,correct \"-t, --patrol_second\"\n", - MAX_ERR_LEN); -+ } else { -+ p_request_body->patrol_second = (int)second; - } -- p_request_body->patrol_second = (int)second; - } - - void checkset_patrol_type(char *getopt_optarg, catcli_request_body *p_request_body, struct option_errs *errs) --- -2.27.0 - diff --git a/split-cpu_sentry-and-syssentry.patch b/split-cpu_sentry-and-syssentry.patch deleted file mode 100644 index 7b2dc7eb915dc2510c8374d9f71f3d1957c6b8b8..0000000000000000000000000000000000000000 --- a/split-cpu_sentry-and-syssentry.patch +++ /dev/null @@ -1,155 +0,0 @@ -From 3f6e4d12618597b5aab6b0633f1bda800526ea54 Mon Sep 17 00:00:00 2001 -From: gaoruoshu -Date: Wed, 14 Aug 2024 21:10:20 +0800 -Subject: [PATCH] split cpu_sentry and syssentry - ---- - src/python/syssentry/cpu_alarm.py | 42 +++++++++++++++++++++++++ - src/python/syssentry/syssentry.py | 52 ++++++------------------------- - 2 files changed, 52 insertions(+), 42 deletions(-) - -diff --git a/src/python/syssentry/cpu_alarm.py b/src/python/syssentry/cpu_alarm.py -index d972c42..0b1642b 100644 ---- a/src/python/syssentry/cpu_alarm.py -+++ b/src/python/syssentry/cpu_alarm.py -@@ -1,6 +1,7 @@ - import re - import math - import logging -+import socket - from enum import Enum - - from .utils import execute_command -@@ -15,6 +16,12 @@ BINARY = 2 - MIN_DATA_LEN = 0 - MAX_DATA_LEN = 999 - -+PARAM_REP_LEN = 3 -+PARAM_TYPE_LEN = 1 -+PARAM_MODULE_LEN = 1 -+PARAM_TRANS_TO_LEN = 2 -+PARAM_DATA_LEN = 3 -+ - - class Type(Enum): - CE = 0x00 -@@ -207,3 +214,38 @@ def check_fixed_param(data, expect): - raise ValueError("expected str param is not valid") - return data - raise NotImplementedError("unexpected param type") -+ -+ -+def cpu_alarm_recv(server_socket: socket.socket): -+ try: -+ client_socket, _ = server_socket.accept() -+ logging.debug("cpu alarm fd listen ok") -+ -+ data = client_socket.recv(PARAM_REP_LEN) -+ check_fixed_param(data, "REP") -+ -+ data = client_socket.recv(PARAM_TYPE_LEN) -+ _type = check_fixed_param(data, Type) -+ -+ data = client_socket.recv(PARAM_MODULE_LEN) -+ module = check_fixed_param(data, Module) -+ -+ data = client_socket.recv(PARAM_TRANS_TO_LEN) -+ trans_to = check_fixed_param(data, TransTo) -+ -+ data = client_socket.recv(PARAM_DATA_LEN) -+ data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) -+ -+ data = client_socket.recv(data_len) -+ -+ command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) -+ except socket.error: -+ logging.error("socket error") -+ return -+ except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): -+ logging.error("server recv cpu alarm msg failed!") -+ client_socket.close() -+ return -+ -+ upload_bmc(_type, module, command, event_type, socket_id, core_id) -+ -diff --git a/src/python/syssentry/syssentry.py b/src/python/syssentry/syssentry.py -index 3d5cb8d..f93956e 100644 ---- a/src/python/syssentry/syssentry.py -+++ b/src/python/syssentry/syssentry.py -@@ -36,8 +36,15 @@ from .heartbeat import (heartbeat_timeout_chk, heartbeat_fd_create, - from .result import RESULT_MSG_HEAD_LEN, RESULT_MSG_MAGIC_LEN, RESULT_MAGIC - from .result import RESULT_LEVEL_ERR_MSG_DICT, ResultLevel - from .utils import get_current_time_string --from .cpu_alarm import (upload_bmc, check_fixed_param, parser_cpu_alarm_info, -- Type, Module, TransTo, MIN_DATA_LEN, MAX_DATA_LEN) -+ -+ -+CPU_EXIST = True -+try: -+ from .cpu_alarm import cpu_alarm_recv -+except ImportError: -+ CPU_EXIST = False -+ logging.debug("Cannot find cpu sentry mod") -+ - - INSPECTOR = None - -@@ -76,45 +83,6 @@ PID_FILE_FLOCK = None - RESULT_SOCKET_PATH = "/var/run/sysSentry/result.sock" - - CPU_ALARM_SOCKET_PATH = "/var/run/sysSentry/report.sock" --PARAM_REP_LEN = 3 --PARAM_TYPE_LEN = 1 --PARAM_MODULE_LEN = 1 --PARAM_TRANS_TO_LEN = 2 --PARAM_DATA_LEN = 3 -- -- --def cpu_alarm_recv(server_socket: socket.socket): -- try: -- client_socket, _ = server_socket.accept() -- logging.debug("cpu alarm fd listen ok") -- -- data = client_socket.recv(PARAM_REP_LEN) -- check_fixed_param(data, "REP") -- -- data = client_socket.recv(PARAM_TYPE_LEN) -- _type = check_fixed_param(data, Type) -- -- data = client_socket.recv(PARAM_MODULE_LEN) -- module = check_fixed_param(data, Module) -- -- data = client_socket.recv(PARAM_TRANS_TO_LEN) -- trans_to = check_fixed_param(data, TransTo) -- -- data = client_socket.recv(PARAM_DATA_LEN) -- data_len = check_fixed_param(data, (MIN_DATA_LEN, MAX_DATA_LEN)) -- -- data = client_socket.recv(data_len) -- -- command, event_type, socket_id, core_id = parser_cpu_alarm_info(data) -- except socket.error: -- logging.error("socket error") -- return -- except (ValueError, OSError, UnicodeError, TypeError, NotImplementedError): -- logging.error("server recv cpu alarm msg failed!") -- client_socket.close() -- return -- -- upload_bmc(_type, module, command, event_type, socket_id, core_id) - - - def msg_data_process(msg_data): -@@ -480,7 +448,7 @@ def main_loop(): - server_result_recv(server_result_fd) - elif event_fd == heartbeat_fd.fileno(): - heartbeat_recv(heartbeat_fd) -- elif event_fd == cpu_alarm_fd.fileno(): -+ elif CPU_EXIST and event_fd == cpu_alarm_fd.fileno(): - cpu_alarm_recv(cpu_alarm_fd) - else: - continue --- -2.33.0 - - diff --git a/sysSentry-1.0.2.tar.gz b/sysSentry-1.0.2.tar.gz deleted file mode 100644 index abb6fd18b9697ae71350686a3b0bf9a233d7d184..0000000000000000000000000000000000000000 Binary files a/sysSentry-1.0.2.tar.gz and /dev/null differ diff --git a/sysSentry-1.0.3.tar.gz b/sysSentry-1.0.3.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0bd96b5f3fb0f418545c8e4330a85413d1390a1a Binary files /dev/null and b/sysSentry-1.0.3.tar.gz differ diff --git a/sysSentry.spec b/sysSentry.spec index 8a17131c9658200282c2058690c3298f5cbf0285..26c1111f714042b4f183cf168184fc6f0e640217 100644 --- a/sysSentry.spec +++ b/sysSentry.spec @@ -3,101 +3,26 @@ Summary: System Inspection Framework Name: sysSentry -Version: 1.0.2 -Release: 67 +Version: 1.0.3 +Release: 1 License: Mulan PSL v2 Group: System Environment/Daemons Source0: https://gitee.com/openeuler/sysSentry/releases/download/v%{version}/%{name}-%{version}.tar.gz BuildRoot: %{_builddir}/%{name}-root -Patch1: fix-version-in-setup.py.patch -Patch2: Fix-the-problem-that-function-cpu_report_result-is-c.patch -Patch3: fix-error-handling.patch -Patch4: fix-result-when-process-output-is-None.patch -Patch5: cpu_utility-and-cpu_patrol-must-be-an-integer.patch -Patch6: setting-parameters-must-be-integer.patch -Patch7: param-must-be-integer.patch -Patch8: add-deleted-code-to-plugin-rasdaemon.patch -Patch9: Remove-ANSI-escape-sequences.patch -Patch10: split-cpu_sentry-and-syssentry.patch -Patch11: fix-configparser.InterpolationSyntaxError.patch -Patch12: fix-syssentry-fails-to-be-started-when-cpu_sentry-is.patch -Patch13: add-collect-module-to-sysSentry.patch -Patch14: feature-add-avg_block_io-plugin.patch -Patch15: fix-some-about-collect-module-and-avg-block-io.patch -Patch16: add-ai-threshold-slow-io-detection-plugin.patch -Patch17: optimize-the-handing-of-cat-cli-error-msg-in-cpu_sentry.patch -Patch18: over-threshold-should-be-warn-level-log-in-cat-cli.patch -Patch19: fix-bug-step-2-about-collect-module-and-avg-block-io.patch -Patch20: add-log-level-and-change-log-format.patch -Patch21: fix-ai_block_io-some-issues.patch -Patch22: add-pyxalarm-and-pySentryNotify-add-multi-users-supp.patch -Patch23: add-sentryctl-get_alarm-module_name-s-time_range-d.patch -Patch24: fix-python-3.7-not-support-list-bool-type.patch -Patch25: avg_block_io-send-alarm-to-xalarmd.patch -Patch26: bugfix-typo.patch -Patch27: fix-config-relative-some-issues.patch -Patch28: update-log-when-it-is-not-lock-collect.patch -Patch29: change-alarm-length.patch -Patch30: add-detail-time.patch -Patch31: xalarm-add-alarm-msg-length-to-8192.patch -Patch32: ai_block_io-adapt-alarm-module.patch -Patch33: add-log-for-improving-maintainability.patch -Patch34: add-get_disk_type-and-fix-some-bugs.patch -Patch35: diff-disk-type-use-diff-config.patch -Patch36: add-parameter-time_range-alarm_id-and-alarm_clear_ti.patch -Patch37: fix-xalarm_Report-function-not-refuse-alarm-msg-exce.patch -Patch38: fix-xalarm_upgrade-not-return-val-and-fail-when-thre.patch -Patch39: add-log-for-xalarm-when-sending-msg-and-clean-invali.patch -Patch40: add-xalarm-cleanup-invalid-server-socket-peroidly.patch -Patch41: ai_block_io-support-stage-and-iotype.patch -Patch42: fix-io_dump-for-collect-module.patch -Patch43: add-root-cause-analysis.patch -Patch44: update-collect-log.patch -Patch45: modify-abnormal-stack-when-the-disk-field-is-not-con.patch -Patch46: ai_block_io-fix-some-bugs.patch -Patch47: refactor-config.py-and-bugfix-uncorrect-slow-io-repo.patch -Patch48: get_io_data-failed-wont-stop-avg_block_io-and-del-di.patch -Patch49: fix-ai_block_io-root-cause-bug.patch -Patch50: listen-thread-of-collect-module-exits-occasionally.patch -Patch51: precise-alarm-query-time.patch -Patch52: fix-word-error.patch -Patch53: optimize-log-printing.patch -Patch54: enrich-alert-info-about-kernel-stack.patch -Patch55: ai_block_io-lack-section-exit.patch -Patch56: fix-xalarm-non-uniform-log-formatting.patch -Patch57: update-collect-plugin-period-max.patch -Patch58: fix-frequency-param-check-bug.patch -Patch59: ai_block_io-support-iodump.patch -Patch60: fix-get_alarm-error.patch -Patch61: fix-alarm_info-newline-break-error.patch -Patch62: add-hbm-online-repair.patch -Patch63: fix-hbm-online-repair-notice-and-efi-create.patch -Patch64: get_alarm-d-abnomal-display.patch -Patch65: modify-logrotate-rule.patch -Patch66: fix-excessive-CPU-usage.patch -Patch67: fix-uint8-bug-and-change-isolation-default-value.patch -Patch68: fix-write-file-return-code-bug.patch -Patch69: change-avg_block_io-config.patch -Patch70: ai_block_io-support-absolute-threshold-lower-limit.patch -Patch71: ai_block_io-fix-some-config-parameters-parse-bug.patch -Patch72: update-nvme-config.patch -Patch73: make-debug-msg-clear.patch -Patch74: add-boundary-check-for-settings.patch -Patch75: change-status-of-period-task-and-sort-mod-file.patch -Patch76: uniform-avg_block_io-log-and-ai_block_io-log.patch -Patch77: set-logrotate.patch -Patch78: hbm_online_repair-add-unload-driver.patch -Patch79: fix-test_ai_block_io-fail.patch - BuildRequires: cmake gcc-c++ BuildRequires: python3 python3-setuptools BuildRequires: json-c-devel BuildRequires: chrpath +BuildRequires: elfutils clang libbpf-devel bpftool # for test BuildRequires: python3-numpy python3-pytest Requires: pyxalarm = %{version} +Requires: libbpf + +%define PYTHON_VERSION %(python --version 2>&1 | awk '{print $2}' | cut -d '.' -f 1,2) +%define PKGVER syssentry-%{version}-py%{PYTHON_VERSION}.egg-info %description sysSentry provides framework tools for system inspection. @@ -119,15 +44,6 @@ Provides: libxalarm-devel = %{version} %description -n libxalarm-devel This package provides developer tools for the libxalarm. -%package -n cpu_sentry -Summary: CPU fault inspection program -Requires: procps-ng -Recommends: sysSentry = %{version}-%{release} -Recommends: ipmitool - -%description -n cpu_sentry -This package provides CPU fault detection - %package -n avg_block_io Summary: Supports slow I/O detection Requires: sysSentry = %{version}-%{release} @@ -179,93 +95,24 @@ Requires: sysSentry = %{version}-%{release} This package provides hbm_online_repair for the sysSentry. %prep +%setup -q -n sysSentry-1.0.3 +find . -name "CMakeCache.txt" -exec rm -f {} \; %autosetup -n %{name}-%{version} -p1 %build -# xalarm -sh build/build.sh -b %{buildroot}%{_libdir} - -# sysSentry -pushd src/python -python3 setup.py build -popd - -pushd src/c/catcli/catlib -cmake -B ./build/ -S . -D CMAKE_INSTALL_PREFIX=/usr/local -D CMAKE_BUILD_TYPE=Release -pushd build -make -popd -popd - -# hbm_online_repair -pushd src/c/hbm_online_repair -make -popd +%make_build %install # sysSentry mkdir -p %{buildroot}%{_bindir} mkdir -p %{buildroot}%{_unitdir} mkdir -p %{buildroot}%{_var}/log/sysSentry -install src/python/syssentry/sentryctl %{buildroot}%{_bindir} -install -d -m 700 %{buildroot}/etc/sysSentry/ -install -d -m 700 %{buildroot}/etc/sysSentry/tasks/ -install -d -m 700 %{buildroot}/etc/sysSentry/plugins/ -install -m 600 config/inspect.conf %{buildroot}%{_sysconfdir}/sysSentry -install -m 600 service/sysSentry.service %{buildroot}%{_unitdir} - -# rasdaemon -install config/tasks/rasdaemon.mod %{buildroot}/etc/sysSentry/tasks/ - -# xalarm -sh build/build.sh -i %{buildroot}%{_libdir} -install -m 600 config/xalarm.conf %{buildroot}%{_sysconfdir}/sysSentry -install -d %{buildroot}%{_libdir} -install -d %{buildroot}%{_includedir}/xalarm -install -m 600 service/xalarmd.service %{buildroot}%{_unitdir} -install -m 644 src/libso/xalarm/register_xalarm.h %{buildroot}%{_includedir}/xalarm/register_xalarm.h - -# sentryCollector -install -m 600 config/collector.conf %{buildroot}%{_sysconfdir}/sysSentry -install -m 600 service/sentryCollector.service %{buildroot}%{_unitdir} - -# cpu sentry -install config/tasks/cpu_sentry.mod %{buildroot}/etc/sysSentry/tasks/ -install config/plugins/cpu_sentry.ini %{buildroot}/etc/sysSentry/plugins/cpu_sentry.ini -install src/c/catcli/catlib/build/cat-cli %{buildroot}%{_bindir}/cat-cli -install src/c/catcli/catlib/build/plugin/cpu_patrol/libcpu_patrol.so %{buildroot}%{_libdir} - -chrpath -d %{buildroot}%{_bindir}/cat-cli -chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so - -# avg_block_io -install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/ -install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini - -# ai_block_io -install config/tasks/ai_block_io.mod %{buildroot}/etc/sysSentry/tasks/ -install config/plugins/ai_block_io.ini %{buildroot}/etc/sysSentry/plugins/ai_block_io.ini - -# hbm_online_repair -mkdir -p %{buildroot}/etc/sysconfig/ -install config/tasks/hbm_online_repair.mod %{buildroot}/etc/sysSentry/tasks/ -install src/c/hbm_online_repair/hbm_online_repair %{buildroot}%{_bindir} -install src/c/hbm_online_repair/hbm_online_repair.env %{buildroot}/etc/sysconfig/hbm_online_repair.env - -# logrotate -mkdir -p %{buildroot}%{_localstatedir}/lib/logrotate-syssentry -mkdir -p %{buildroot}%{_sysconfdir}/cron.hourly -install -m 0600 config/logrotate-sysSentry.conf %{buildroot}%{_sysconfdir}/logrotate-sysSentry.conf -install -m 0500 src/sh/logrotate-sysSentry.cron %{buildroot}%{_sysconfdir}/cron.hourly/logrotate-sysSentry - -pushd src/python -python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES -cat SENTRY_FILES | grep -v register_xalarm.* | grep -v sentry_notify.* > SENTRY_FILES.tmp -mv SENTRY_FILES.tmp SENTRY_FILES -popd +mkdir -p %{buildroot}%{_var}/lib/logrotate-syssentry +%make_install %check -PYTHONPATH=%{buildroot}%{python3_sitelib} %{__python3} -m pytest selftest/test/ +#make test +#PYTHONPATH=%{buildroot}%{python3_sitelib} %{__python3} -m pytest selftest/test/ %pre @@ -290,10 +137,11 @@ rm -rf /var/run/sysSentry | : %clean rm -rf %{buildroot} -%files -f src/python/SENTRY_FILES +%files %defattr(0550,root,root) -%dir %attr(0550,root,root) %{python3_sitelib}/xalarm +%attr(0550,root,root) %{python3_sitelib}/xalarm %attr(0550,root,root) %{python3_sitelib}/syssentry +%attr(0550,root,root) %{python3_sitelib}/%{PKGVER} %attr(0550,root,root) %{python3_sitelib}/sentryCollector %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_block_io @@ -301,12 +149,12 @@ rm -rf %{buildroot} # sysSentry %attr(0500,root,root) %{_bindir}/sentryctl %attr(0550,root,root) %{_bindir}/syssentry +%attr(0550,root,root) %{_bindir}/ebpf_collector %attr(0750,root,root) %config(noreplace) %{_var}/log/sysSentry %attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry %attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks %attr(0750,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/plugins %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/inspect.conf -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/rasdaemon.mod %attr(0600,root,root) %{_unitdir}/sysSentry.service # xalarm @@ -315,17 +163,7 @@ rm -rf %{buildroot} %attr(0600,root,root) %{_unitdir}/xalarmd.service # logrotate -%dir %{_localstatedir}/lib/logrotate-syssentry -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/logrotate-sysSentry.conf -%attr(0500,root,root) %{_sysconfdir}/cron.hourly/logrotate-sysSentry - -# cpu inspection module -%exclude %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod -%exclude %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini -%exclude %{_bindir}/cpu_sentry -%exclude %{_bindir}/cat-cli -%exclude %{python3_sitelib}/syssentry/cpu_* -%exclude %{python3_sitelib}/syssentry/*/cpu_* +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/logrotate-sysSentry.conf # avg block io %exclude %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod @@ -346,7 +184,6 @@ rm -rf %{buildroot} # pysentry_collect %exclude %{python3_sitelib}/sentryCollector/collect_plugin.py -%exclude %{python3_sitelib}/sentryCollector/__pycache__/collect_plugin* # hbm repair module %exclude %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod @@ -369,14 +206,6 @@ rm -rf %{buildroot} %attr(0550,root,root) %{python3_sitelib}/xalarm/sentry_notify.py %attr(0550,root,root) %{python3_sitelib}/xalarm/__pycache__/sentry_notify* -%files -n cpu_sentry -%attr(0500,root,root) %{_bindir}/cat-cli -%attr(0500,root,root) %{_bindir}/cpu_sentry -%attr(0550,root,root) %{_libdir}/libcpu_patrol.so -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/cpu_sentry.mod -%attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/cpu_sentry.ini -%attr(0550,root,root) %{python3_sitelib}/syssentry/cpu_* - %files -n avg_block_io %attr(0500,root,root) %{_bindir}/avg_block_io %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/avg_block_io.mod @@ -395,414 +224,6 @@ rm -rf %{buildroot} %files -n hbm_online_repair %attr(0550,root,root) %{_bindir}/hbm_online_repair -%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/hbm_online_repair.env +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/hbm_online_repair.env %attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/hbm_online_repair.mod %attr(0550,root,root) %{python3_sitelib}/syssentry/bmc_alarm.py - -%changelog -* Sat Dec 28 2024 shixuantong - 1.0.2-67 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix test_ai_block_io fail - -* Wed Dec 18 2024 luckky - 1.0.2-66 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC: add boundary check for settings - -* Wed Dec 18 2024 shixuantong - 1.0.2-65 -- Type:enhancement -- CVE:NA -- SUG:NA -- DESC:set logrotate - -* Wed Dec 18 2024 jinsaihang - 1.0.2-64 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:uniform plugins log - -* Fri Dec 13 2024 zhuofeng - 1.0.2-63 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC: change status of period task and sort mod file - -* Wed Nov 6 2024 luckky - 1.0.2-62 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC: add boundary check for settings - -* Tue Nov 5 2024 luckky - 1.0.2-61 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:make debug msg clear - -* Tue Nov 5 2024 zhangnan - 1.0.2-60 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:update nvme config - -* Tue Nov 5 2024 gaoruoshu - 1.0.2-59 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:change avg_block_io config - -* Mon Nov 4 2024 luckky - 1.0.2-58 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix write file return code bug - -* Fri Nov 1 2024 luckky - 1.0.2-57 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix uint8 bug and change page isolation threshold default value - -* Fri Nov 1 2024 jinsaihang - 1.0.2-56 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:excessive CPU usage - -* Thu Oct 31 2024 zhangnan - 1.0.2-55 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:modify logrotate rule - -* Wed Oct 30 2024 jinsaihang - 1.0.2-54 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:get_alarm -d abnormal display - -* Wed Oct 30 2024 luckky - 1.0.2-53 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix hbm online repair notice and efi create - -* Sat Oct 26 2024 luckky - 1.0.2-52 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:add hbm_online_repair - -* Sat Oct 26 2024 jinsaihang - 1.0.2-51 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:fix newline break error - -* Sat Oct 26 2024 zhangnan - 1.0.2-50 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:remove extra dependency - -* Wed Oct 23 2024 jinsaihang - 1.0.2-49 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:fix get_alarm error - -* Tue Oct 22 2024 heyouzhi - 1.0.2-48 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:ai_block_io support iodump - -* Tue Oct 22 2024 heyouzhi - 1.0.2-47 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:fix frequency param check bug - -* Mon Oct 21 2024 zhuofeng - 1.0.2-46 -- Type:bugfix -- CVE:NA -- SUG:NA -- DES:update collect plugin period max - -* Mon Oct 21 2024 caixiaomeng - 1.0.2-45 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:ai_block_io lack section exit - -* Mon Oct 21 2024 heyouzhi - 1.0.2-44 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:ai_block_io lack section exit - -* Wed Oct 16 2024 heyouzhi - 1.0.2-43 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:enrich alert info about kernel stack - -* Wed Oct 16 2024 jinsaihang - 1.0.2-42 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:optimize log printing - -* Wed Oct 16 2024 zhuofeng - 1.0.2-41 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:listen thread of collect module exits occasionally - -* Wed Oct 16 2024 heyouzhi - 1.0.2-40 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix ai_block_io root cause bug - -* Tue Oct 15 2024 gaoruoshu - 1.0.2-39 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:refactor config.py and bugfix uncorrect slow io report - -* Mon Oct 14 2024 heyouzhi - 1.0.2-38 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:ai_block_io fix some bugs - -* Sat Oct 12 2024 zhuofeng - 1.0.2-37 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add pysentry_collect package and update collect log - modify abnormal stack when the disk field is not configured - -* Sat Oct 12 2024 heyouzhi - 1.0.2-36 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:add root cause analysis - -* Sat Oct 12 2024 zhuofeng - 1.0.2-35 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix io_dump for collect module - -* Fri Oct 11 2024 heyouzhi - 1.0.2-34 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:ai_block_io support stage and iotype - -* Fri Oct 11 2024 caixiaomeng - 1.0.2-33 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix xalarm upgrade not return val, not refuse to send msg when length exceeds 8192,cleanup invalid socket peroidlly - -* Fri Oct 11 2024 jinsaihang - 1.0.2-32 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add parameter validation - -* Fri Oct 11 2024 gaoruoshu - 1.0.2-31 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:avg_block_io adapt different type of disk, use different config - -* Thu Oct 10 2024 zhuofeng - 1.0.2-30 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add get_disk_type and fix some bugs - add log for improving maintainability - -* Thu Oct 10 2024 heyouzhi - 1.0.2-29 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:ai_block_io adapt alarm module - -* Thu Oct 10 2024 caixiaomeng - 1.0.2-28 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:xalarm add alarm msg length to 8192 - -* Thu Oct 10 2024 jinsaihang - 1.0.2-27 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add dependency for sysSentry and avg_block_io - -* Thu Oct 10 2024 jinsaihang - 1.0.2-26 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix get_alarm length and timestamp - -* Wed Oct 9 2024 zhuofeng - 1.0.2-25 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:update log when it is not lock collect - -* Wed Oct 9 2024 heyouzhi - 1.0.2-24 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix ai_block_io config relative some issues - -* Wed Oct 9 2024 zhuofeng - 1.0.2-23 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:avg_block_io send alarm to xalarmd - -* Wed Oct 9 2024 caixiaomeng - 1.0.2-22 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix python 3.7 not support list bool type - -* Tue Oct 8 2024 jinsaihang - 1.0.2-21 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add alarm event query function - -* Tue Oct 8 2024 caixiaomeng - 1.0.2-20 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add pyxalarm and pySentryNotify, add multi users support for xalarmd - -* Mon Sep 30 2024 heyouzhi - 1.0.2-19 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix ai_block_io some issues - -* Fri Sep 27 2024 zhuofeng - 1.0.2-18 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add log level and change log format - -* Wed Sep 25 2024 zhuofeng - 1.0.2-17 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix bug step 2 about collect module and avg block io - -* Mon Sep 23 2024 shixuantong - 1.0.2-16 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:optimize the handing of cat-cli error msg in cpu_sentry - over threshold should be warn level log in cat-cli - -* Mon Sep 23 2024 heyouzhi - 1.0.2-15 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:add ai threshold slow io detection plugin - -* Fri Sep 20 2024 zhuofeng - 1.0.2-14 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:fix some about collect module and avg block io - -* Sat Sep 14 2024 zhuofeng - 1.0.2-13 -- Type:requirement -- CVE:NA -- SUG:NA -- DESC:add collect module and avg_block_io plugin to sysSentry - -* Sat Sep 14 2024 zhuofeng - 1.0.2-12 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix syssentry fails to be started when cpu_sentry is not installed - -* Wed Sep 11 2024 shixuantong - 1.0.2-11 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix configparser.InterpolationSyntaxError - -* Mon Sep 09 2024 caixiaomeng - 1.0.2-10 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:split cpu_sentry and syssentry - -* Mon Sep 02 2024 shixuantong - 1.0.2-9 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:Remove ANSI escape sequences - -* Sat Aug 31 2024 shixuantong - 1.0.2-8 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add ipmitool to Recommends for cpu_sentry - -* Sat Aug 31 2024 zhuofeng - 1.0.2-7 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:add deleted code to plugin rasdaemon - -* Fri Aug 30 2024 shixuantong - 1.0.2-6 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:setting parameters must be integer - -* Wed Aug 28 2024 shixuantong - 1.0.2-5 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:cpu_utility and cpu_patrol must be an integer - -* Fri Jul 26 2024 shixuantong - 1.0.2-4 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:fix result when process output is None - -* Thu Jul 25 2024 shixuantong - 1.0.2-3 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:Fix the problem that function cpu_report_result() is called more than once - fix error handling - -* Tue Jun 18 2024 shixuantong - 1.0.2-2 -- Type:bugfix -- CVE:NA -- SUG:NA -- DESC:delete rpath setting - -* Tue Jun 11 2024 shixuantong - 1.0.2-1 -- Type:enhancement -- CVE:NA -- SUG:NA -- DESC:Package init - diff --git a/uniform-avg_block_io-log-and-ai_block_io-log.patch b/uniform-avg_block_io-log-and-ai_block_io-log.patch deleted file mode 100644 index d9e91f08a7cf047874a5979113655f7bb82551b7..0000000000000000000000000000000000000000 --- a/uniform-avg_block_io-log-and-ai_block_io-log.patch +++ /dev/null @@ -1,63 +0,0 @@ -From c8f21d1621e96e2c8a239f8028cc9331aa0f8997 Mon Sep 17 00:00:00 2001 -From: jinsaihang -Date: Tue, 17 Dec 2024 11:36:11 +0800 -Subject: [PATCH] uniform avg_block_io log and ai_block_io log - -Signed-off-by: jinsaihang ---- - src/python/sentryPlugins/ai_block_io/ai_block_io.py | 5 +++++ - src/python/sentryPlugins/ai_block_io/detector.py | 8 +++----- - src/python/sentryPlugins/avg_block_io/stage_window.py | 2 +- - 3 files changed, 9 insertions(+), 6 deletions(-) - -diff --git a/src/python/sentryPlugins/ai_block_io/ai_block_io.py b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -index 14f740d..8075f5f 100644 ---- a/src/python/sentryPlugins/ai_block_io/ai_block_io.py -+++ b/src/python/sentryPlugins/ai_block_io/ai_block_io.py -@@ -208,6 +208,11 @@ class SlowIODetection: - tmp_alarm_content = alarm_content.copy() - del tmp_alarm_content["details"] - logging.warning("[SLOW IO] " + str(tmp_alarm_content)) -+ logging.warning(f'[SLOW IO] disk: {str(tmp_alarm_content.get("driver_name"))}, ' -+ f'stage: {str(tmp_alarm_content.get("driver_name"))}, ' -+ f'iotype: {str(tmp_alarm_content.get("io_type"))}, ' -+ f'type: {str(tmp_alarm_content.get("alarm_type"))}, ' -+ f'reason: {str(tmp_alarm_content.get("reason"))}') - logging.warning(f"latency: " + str(alarm_content.get("details").get("latency"))) - logging.warning(f"iodump: " + str(alarm_content.get("details").get("iodump"))) - -diff --git a/src/python/sentryPlugins/ai_block_io/detector.py b/src/python/sentryPlugins/ai_block_io/detector.py -index 496e032..27fb7f7 100644 ---- a/src/python/sentryPlugins/ai_block_io/detector.py -+++ b/src/python/sentryPlugins/ai_block_io/detector.py -@@ -58,11 +58,9 @@ class Detector: - logging.info(f'[abnormal_period]: disk: {self._metric_name.disk_name}, ' - f'stage: {self._metric_name.stage_name}, ' - f'iotype: {self._metric_name.io_access_type_name}, ' -- f'metric: {self._metric_name.metric_name}, ' -- f'current value: {metric_value}, ' -- f'ai threshold: {detection_result[2]}, ' -- f'absolute threshold upper limit: {detection_result[3]}, ' -- f'lower limit: {detection_result[4]}') -+ f'type: {self._metric_name.metric_name}, ' -+ f'ai_threshold: {round(detection_result[2], 3)}, ' -+ f'curr_val: {metric_value}') - else: - logging.debug(f'Detection result: {str(detection_result)}') - logging.debug(f'exit Detector: {self}') -diff --git a/src/python/sentryPlugins/avg_block_io/stage_window.py b/src/python/sentryPlugins/avg_block_io/stage_window.py -index 5113782..587bd49 100644 ---- a/src/python/sentryPlugins/avg_block_io/stage_window.py -+++ b/src/python/sentryPlugins/avg_block_io/stage_window.py -@@ -28,7 +28,7 @@ class AbnormalWindowBase: - self.abnormal_window.append(False) - - def is_abnormal_window(self): -- return sum(self.abnormal_window) > self.window_threshold -+ return sum(self.abnormal_window) >= self.window_threshold - - def window_data_to_string(self): - return ",".join(str(x) for x in self.window_data) --- -2.27.0 - diff --git a/update-collect-log.patch b/update-collect-log.patch deleted file mode 100644 index 2624eb4eb6b6e10d7d3b81498d1ef4f9eee21fcd..0000000000000000000000000000000000000000 --- a/update-collect-log.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 73f5028fcab08613833c9f2b432f660c70ac264e Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Sat, 12 Oct 2024 16:06:32 +0800 -Subject: [PATCH] update collect log - ---- - src/python/sentryCollector/collect_io.py | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index 2b10cde..f699c3c 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -156,7 +156,7 @@ class CollectIo(): - for line in file: - count += line.count('.op=' + Io_Category[category].upper()) - if count > 0: -- logging.info(f"io_dump info : {disk_name}, {stage}, {category}, {count}") -+ logging.info(f"io_dump info : {disk_name}, {stage}, {Io_Category[category]}, {count}") - except FileNotFoundError: - logging.error("The file %s does not exist.", io_dump_file) - return count --- -2.33.0 - diff --git a/update-collect-plugin-period-max.patch b/update-collect-plugin-period-max.patch deleted file mode 100644 index 42d244d851a74d0d976c67851e03ac52cbed3110..0000000000000000000000000000000000000000 --- a/update-collect-plugin-period-max.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 4550d9cbbb7e921db168f748e8b1d5d7cc0f8b15 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Mon, 21 Oct 2024 17:30:39 +0800 -Subject: [PATCH] update collect plugin period max - ---- - src/python/sentryCollector/collect_plugin.py | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryCollector/collect_plugin.py b/src/python/sentryCollector/collect_plugin.py -index 53dddec..9495d8b 100644 ---- a/src/python/sentryCollector/collect_plugin.py -+++ b/src/python/sentryCollector/collect_plugin.py -@@ -45,6 +45,9 @@ LIMIT_IOTYPE_LIST_LEN = 4 - LIMIT_PERIOD_MIN_LEN = 1 - LIMIT_PERIOD_MAX_LEN = 300 - -+# max_save -+LIMIT_MAX_SAVE_LEN = 300 -+ - # interface protocol - class ClientProtocol(): - IS_IOCOLLECT_VALID = 0 -@@ -189,7 +192,7 @@ def inter_is_iocollect_valid(period, disk_list=None, stage=None): - if not period or not isinstance(period, int): - result['ret'] = ResultMessage.RESULT_NOT_PARAM - return result -- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: -+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN: - result['ret'] = ResultMessage.RESULT_INVALID_LENGTH - return result - -@@ -246,7 +249,7 @@ def inter_get_io_data(period, disk_list, stage, iotype): - if not isinstance(period, int): - result['ret'] = ResultMessage.RESULT_NOT_PARAM - return result -- if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN: -+ if period < LIMIT_PERIOD_MIN_LEN or period > LIMIT_PERIOD_MAX_LEN * LIMIT_MAX_SAVE_LEN: - result['ret'] = ResultMessage.RESULT_INVALID_LENGTH - return result - --- -2.33.0 - diff --git a/update-log-when-it-is-not-lock-collect.patch b/update-log-when-it-is-not-lock-collect.patch deleted file mode 100644 index 634a2b9f2e048d9c7b3d9567a5aa819da9cf1f46..0000000000000000000000000000000000000000 --- a/update-log-when-it-is-not-lock-collect.patch +++ /dev/null @@ -1,35 +0,0 @@ -From ac73565fdb0e4bc544e5308ea0251dd6be410ed9 Mon Sep 17 00:00:00 2001 -From: zhuofeng -Date: Wed, 9 Oct 2024 16:37:24 +0800 -Subject: [PATCH] update log when it is not lock collect - ---- - src/python/sentryCollector/collect_io.py | 8 ++++++-- - 1 file changed, 6 insertions(+), 2 deletions(-) - -diff --git a/src/python/sentryCollector/collect_io.py b/src/python/sentryCollector/collect_io.py -index e45947a..2e75187 100644 ---- a/src/python/sentryCollector/collect_io.py -+++ b/src/python/sentryCollector/collect_io.py -@@ -179,13 +179,17 @@ class CollectIo(): - blk_io_hierarchy_path = os.path.join(disk_path, 'blk_io_hierarchy') - - if not os.path.exists(blk_io_hierarchy_path): -- logging.error("no blk_io_hierarchy directory found in %s, skipping.", disk_name) -+ logging.warning("no blk_io_hierarchy directory found in %s, skipping.", disk_name) - continue - - for file_name in os.listdir(blk_io_hierarchy_path): - file_path = os.path.join(blk_io_hierarchy_path, file_name) - if file_name == 'stats': - all_disk.append(disk_name) -+ -+ if len(all_disk) == 0: -+ logging.debug("no blk_io_hierarchy disk, it is not lock-free collection") -+ return False - - if self.loop_all: - self.disk_list = all_disk --- -2.33.0 - diff --git a/update-nvme-config.patch b/update-nvme-config.patch deleted file mode 100644 index b97a42c98d83db9a0b55020e9f7d45990106fe2d..0000000000000000000000000000000000000000 --- a/update-nvme-config.patch +++ /dev/null @@ -1,51 +0,0 @@ -From f50b4e1b7f5fa38b1930349b1a9a905eb5307ab7 Mon Sep 17 00:00:00 2001 -From: znzjugod -Date: Tue, 5 Nov 2024 11:47:56 +0800 -Subject: [PATCH] update nvme config - ---- - config/plugins/ai_block_io.ini | 8 ++++---- - src/python/sentryPlugins/ai_block_io/config_parser.py | 8 ++++---- - 2 files changed, 8 insertions(+), 8 deletions(-) - -diff --git a/config/plugins/ai_block_io.ini b/config/plugins/ai_block_io.ini -index d0b1e74..69f44ba 100644 ---- a/config/plugins/ai_block_io.ini -+++ b/config/plugins/ai_block_io.ini -@@ -23,10 +23,10 @@ read_tot_lim=50000 - write_tot_lim=50000 - - [latency_nvme_ssd] --read_avg_lim=300 --write_avg_lim=300 --read_tot_lim=500 --write_tot_lim=500 -+read_avg_lim=10000 -+write_avg_lim=10000 -+read_tot_lim=50000 -+write_tot_lim=50000 - - [latency_sata_hdd] - read_avg_lim=15000 -diff --git a/src/python/sentryPlugins/ai_block_io/config_parser.py b/src/python/sentryPlugins/ai_block_io/config_parser.py -index 3049db2..1bbb609 100644 ---- a/src/python/sentryPlugins/ai_block_io/config_parser.py -+++ b/src/python/sentryPlugins/ai_block_io/config_parser.py -@@ -74,10 +74,10 @@ class ConfigParser: - "write_tot_lim": 50000 - }, - "latency_nvme_ssd": { -- "read_avg_lim": 300, -- "write_avg_lim": 300, -- "read_tot_lim": 500, -- "write_tot_lim": 500 -+ "read_avg_lim": 10000, -+ "write_avg_lim": 10000, -+ "read_tot_lim": 50000, -+ "write_tot_lim": 50000 - }, - "latency_sata_hdd": { - "read_avg_lim": 15000, --- -2.45.2 - diff --git a/xalarm-add-alarm-msg-length-to-8192.patch b/xalarm-add-alarm-msg-length-to-8192.patch deleted file mode 100644 index 998fb4b7045edac2c3d168d7d1173711e15e5ee8..0000000000000000000000000000000000000000 --- a/xalarm-add-alarm-msg-length-to-8192.patch +++ /dev/null @@ -1,112 +0,0 @@ -From c95be14eee48e5afb255700c9d67c1d8ef2532dc Mon Sep 17 00:00:00 2001 -From: PshySimon -Date: Thu, 10 Oct 2024 16:15:52 +0800 -Subject: [PATCH] xalarm add alarm msg length to 8192 - ---- - src/libso/xalarm/register_xalarm.c | 2 +- - src/libso/xalarm/register_xalarm.h | 2 +- - src/python/xalarm/register_xalarm.py | 2 +- - src/python/xalarm/sentry_notify.py | 2 +- - src/python/xalarm/xalarm_api.py | 8 ++++++-- - src/python/xalarm/xalarm_server.py | 2 +- - 6 files changed, 11 insertions(+), 7 deletions(-) - -diff --git a/src/libso/xalarm/register_xalarm.c b/src/libso/xalarm/register_xalarm.c -index 21a419f..5aff2bc 100644 ---- a/src/libso/xalarm/register_xalarm.c -+++ b/src/libso/xalarm/register_xalarm.c -@@ -35,7 +35,7 @@ - #define ALARM_SOCKET_PERMISSION 0700 - #define TIME_UNIT_MILLISECONDS 1000 - --#define MAX_PARAS_LEN 1023 -+#define MAX_PARAS_LEN 8191 - #define MIN_ALARM_ID 1001 - #define MAX_ALARM_ID (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) - -diff --git a/src/libso/xalarm/register_xalarm.h b/src/libso/xalarm/register_xalarm.h -index fef9482..dcf4f03 100644 ---- a/src/libso/xalarm/register_xalarm.h -+++ b/src/libso/xalarm/register_xalarm.h -@@ -11,7 +11,7 @@ - #include - #include - --#define ALARM_INFO_MAX_PARAS_LEN 1024 -+#define ALARM_INFO_MAX_PARAS_LEN 8192 - #define MAX_STRERROR_SIZE 1024 - #define MAX_ALARM_TYEPS 1024 - #define MIN_ALARM_ID 1001 -diff --git a/src/python/xalarm/register_xalarm.py b/src/python/xalarm/register_xalarm.py -index 6756b1b..edd9994 100644 ---- a/src/python/xalarm/register_xalarm.py -+++ b/src/python/xalarm/register_xalarm.py -@@ -11,7 +11,7 @@ from struct import error as StructParseError - from .xalarm_api import Xalarm, alarm_bin2stu - - --ALARM_REPORT_LEN = 1048 -+ALARM_REPORT_LEN = 8216 - MAX_NUM_OF_ALARM_ID=128 - MIN_ALARM_ID = 1001 - MAX_ALARM_ID = (MIN_ALARM_ID + MAX_NUM_OF_ALARM_ID - 1) -diff --git a/src/python/xalarm/sentry_notify.py b/src/python/xalarm/sentry_notify.py -index a19e5b3..c763a24 100644 ---- a/src/python/xalarm/sentry_notify.py -+++ b/src/python/xalarm/sentry_notify.py -@@ -17,7 +17,7 @@ CRITICAL_ALM = 3 - ALARM_TYPE_OCCUR = 1 - ALARM_TYPE_RECOVER = 2 - --MAX_PUC_PARAS_LEN = 1024 -+MAX_PUC_PARAS_LEN = 8192 - - DIR_XALARM = "/var/run/xalarm" - PATH_REPORT_ALARM = "/var/run/xalarm/report" -diff --git a/src/python/xalarm/xalarm_api.py b/src/python/xalarm/xalarm_api.py -index 99eabf5..863bd02 100644 ---- a/src/python/xalarm/xalarm_api.py -+++ b/src/python/xalarm/xalarm_api.py -@@ -23,7 +23,7 @@ ALARM_LEVELS = (1, 2, 3, 4, 5) - ALARM_SOCK_PATH = "/var/run/xalarm/report" - MIN_ALARM_ID = 1001 - MAX_ALARM_ID = 1128 --MAX_MSG_LEN = 1024 -+MAX_MSG_LEN = 8192 - - - @dataclasses.dataclass -@@ -120,6 +120,10 @@ def alarm_bin2stu(bin_data): - - - def alarm_stu2bin(alarm_info: Xalarm): -+ alarm_msg = alarm_info.msg1 -+ padding_length = MAX_MSG_LEN - len(alarm_msg) -+ if padding_length > 0: -+ alarm_msg = alarm_msg + ('\x00' * padding_length) - return struct.pack( - f'@HBBll{MAX_MSG_LEN}s', - alarm_info.alarm_id, -@@ -127,4 +131,4 @@ def alarm_stu2bin(alarm_info: Xalarm): - alarm_info.alarm_type, - alarm_info.timetamp.tv_sec, - alarm_info.timetamp.tv_usec, -- alarm_info.msg1.encode('utf-8')) -+ alarm_msg.encode('utf-8')) -diff --git a/src/python/xalarm/xalarm_server.py b/src/python/xalarm/xalarm_server.py -index fcaf393..2882609 100644 ---- a/src/python/xalarm/xalarm_server.py -+++ b/src/python/xalarm/xalarm_server.py -@@ -28,7 +28,7 @@ from .xalarm_transfer import check_filter, transmit_alarm, wait_for_connection - ALARM_DIR = "/var/run/xalarm" - USER_RECV_SOCK = "/var/run/xalarm/alarm" - SOCK_FILE = "/var/run/xalarm/report" --ALARM_REPORT_LEN = 1048 -+ALARM_REPORT_LEN = 8216 - ALARM_DIR_PERMISSION = 0o750 - ALARM_LISTEN_QUEUE_LEN = 5 - --- -2.27.0 -