diff --git a/add-ai-threshold-slow-io-detection-plugin.patch b/add-ai-threshold-slow-io-detection-plugin.patch new file mode 100644 index 0000000000000000000000000000000000000000..6f707a4aa50ec37ee7d0fddc5e53a3aadef839db --- /dev/null +++ b/add-ai-threshold-slow-io-detection-plugin.patch @@ -0,0 +1,1201 @@ +From 3d72fa7f517e6e99af1205e965c3775dc23461f4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E8=B4=BA=E6=9C=89=E5=BF=97?= <1037617413@qq.com> +Date: Mon, 23 Sep 2024 11:03:26 +0800 +Subject: [PATCH] add ai threshold slow io detection to sysSentry + +--- + .../ai_threshold_slow_io_detection.ini | 16 ++ + .../tasks/ai_threshold_slow_io_detection.mod | 5 + + .../test_ai_threshold_slow_io_detection.py | 165 ++++++++++++++++++ + .../ai_threshold_slow_io_detection/README.md | 2 + + .../__init__.py | 0 + .../alarm_report.py | 49 ++++++ + .../config_parser.py | 141 +++++++++++++++ + .../data_access.py | 91 ++++++++++ + .../detector.py | 48 +++++ + .../ai_threshold_slow_io_detection/io_data.py | 74 ++++++++ + .../sliding_window.py | 113 ++++++++++++ + .../slow_io_detection.py | 133 ++++++++++++++ + .../threshold.py | 160 +++++++++++++++++ + .../ai_threshold_slow_io_detection/utils.py | 67 +++++++ + src/python/setup.py | 3 +- + 15 files changed, 1066 insertions(+), 1 deletion(-) + create mode 100644 config/plugins/ai_threshold_slow_io_detection.ini + create mode 100644 config/tasks/ai_threshold_slow_io_detection.mod + create mode 100644 selftest/test/test_ai_threshold_slow_io_detection.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/config_parser.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/data_access.py + create mode 100644 
src/python/sentryPlugins/ai_threshold_slow_io_detection/detector.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/io_data.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/sliding_window.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/slow_io_detection.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/threshold.py + create mode 100644 src/python/sentryPlugins/ai_threshold_slow_io_detection/utils.py + +diff --git a/config/plugins/ai_threshold_slow_io_detection.ini b/config/plugins/ai_threshold_slow_io_detection.ini +new file mode 100644 +index 0000000..44eb928 +--- /dev/null ++++ b/config/plugins/ai_threshold_slow_io_detection.ini +@@ -0,0 +1,16 @@ ++[common] ++absolute_threshold=40 ++slow_io_detect_frequency=1 ++log_level=info ++ ++[algorithm] ++train_data_duration=0.1 ++train_update_duration=0.02 ++algorithm_type=n_sigma ++boxplot_parameter=1.5 ++n_sigma_parameter=3 ++ ++[sliding_window] ++sliding_window_type=not_continuous ++window_size=30 ++window_minimum_threshold=6 +\ No newline at end of file +diff --git a/config/tasks/ai_threshold_slow_io_detection.mod b/config/tasks/ai_threshold_slow_io_detection.mod +new file mode 100644 +index 0000000..2729f72 +--- /dev/null ++++ b/config/tasks/ai_threshold_slow_io_detection.mod +@@ -0,0 +1,5 @@ ++[common] ++enabled=yes ++task_start=/usr/bin/python3 /usr/bin/ai_threshold_slow_io_detection ++task_stop=pkill -f /usr/bin/ai_threshold_slow_io_detection ++type=oneshot +\ No newline at end of file +diff --git a/selftest/test/test_ai_threshold_slow_io_detection.py b/selftest/test/test_ai_threshold_slow_io_detection.py +new file mode 100644 +index 0000000..c36fef5 +--- /dev/null ++++ b/selftest/test/test_ai_threshold_slow_io_detection.py +@@ -0,0 +1,165 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. 
def _get_boxplot_threshold(data_list: list, parameter):
    """Return the reference boxplot upper bound ``Q3 + parameter * IQR``."""
    q1 = np.percentile(data_list, 25)
    q3 = np.percentile(data_list, 75)
    iqr = q3 - q1
    return q3 + parameter * iqr


def _get_n_sigma_threshold(data_list: list, parameter):
    """Return the reference n-sigma upper bound ``mean + parameter * std``."""
    mean = np.mean(data_list)
    std = np.std(data_list)
    return mean + parameter * std


class Test(unittest.TestCase):
    """Unit tests for the threshold, sliding-window and data-parsing helpers."""

    @classmethod
    def setUpClass(cls):
        print("UnitTest Begin...")

    @classmethod
    def tearDownClass(cls):
        print("UnitTest End...")

    def setUp(self):
        print("Begin...")

    def tearDown(self):
        print("End...")

    def test_absolute_threshold(self):
        absolute = AbsoluteThreshold()
        # Without an explicit threshold nothing can be flagged as abnormal.
        self.assertIsNone(absolute.get_threshold())
        self.assertFalse(absolute.is_abnormal(5000))
        absolute.set_threshold(40)
        self.assertEqual(40, absolute.get_threshold())
        self.assertTrue(absolute.is_abnormal(50))

    def test_boxplot_threshold(self):
        boxplot = BoxplotThreshold(1.5, 5, 1)
        # Stage 1: not initialized yet.
        self.assertIsNone(boxplot.get_threshold())
        self.assertFalse(boxplot.is_abnormal(5000))
        # A threshold is generated once 5 elements have been pushed.
        data_list = [20, 20, 20, 30, 10]
        for data in data_list:
            boxplot.push_latest_data_to_queue(data)
        # Stage 2: initialized.
        boxplot_threshold = boxplot.get_threshold()
        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)
        self.assertTrue(boxplot.is_abnormal(5000))
        data_list.pop(0)
        data_list.append(100)
        boxplot.push_latest_data_to_queue(100)
        # Stage 3: threshold updated.
        boxplot_threshold = boxplot.get_threshold()
        self.assertEqual(_get_boxplot_threshold(data_list, 1.5), boxplot_threshold)

    def test_n_sigma_threshold(self):
        n_sigma = NSigmaThreshold(3, 5, 1)
        # Stage 1: not initialized yet.
        self.assertIsNone(n_sigma.get_threshold())
        self.assertFalse(n_sigma.is_abnormal(5000))
        data_list = [20, 20, 20, 30, 10]
        for data in data_list:
            n_sigma.push_latest_data_to_queue(data)
        # Stage 2: initialized.
        n_sigma_threshold = n_sigma.get_threshold()
        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)
        self.assertTrue(n_sigma.is_abnormal(5000))
        data_list.pop(0)
        data_list.append(100)
        n_sigma.push_latest_data_to_queue(100)
        # Stage 3: threshold updated.
        n_sigma_threshold = n_sigma.get_threshold()
        self.assertEqual(_get_n_sigma_threshold(data_list, 3), n_sigma_threshold)

    def test_not_continuous_sliding_window(self):
        not_continuous = NotContinuousSlidingWindow(5, 3)
        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
        boxplot_threshold.attach_observer(not_continuous)
        data_list1 = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
        for data in data_list1:
            boxplot_threshold.push_latest_data_to_queue(data)
            result = not_continuous.is_slow_io_event(data)
            self.assertFalse(result[0])
        self.assertEqual(23.75, boxplot_threshold.get_threshold())
        boxplot_threshold.push_latest_data_to_queue(24)
        result = not_continuous.is_slow_io_event(24)
        self.assertFalse(result[0])
        boxplot_threshold.push_latest_data_to_queue(25)
        result = not_continuous.is_slow_io_event(25)
        self.assertTrue(result[0])
        data_list2 = [20, 20, 20, 20, 20, 20]
        for data in data_list2:
            boxplot_threshold.push_latest_data_to_queue(data)
            result = not_continuous.is_slow_io_event(data)
            self.assertFalse(result[0])
        self.assertEqual(25.625, boxplot_threshold.get_threshold())

    def test_continuous_sliding_window(self):
        continuous = ContinuousSlidingWindow(5, 3)
        boxplot_threshold = BoxplotThreshold(1.5, 10, 8)
        boxplot_threshold.attach_observer(continuous)
        data_list = [19, 20, 20, 20, 20, 20, 22, 24, 23, 20]
        for data in data_list:
            boxplot_threshold.push_latest_data_to_queue(data)
            result = continuous.is_slow_io_event(data)
            self.assertFalse(result[0])
        self.assertEqual(23.75, boxplot_threshold.get_threshold())
        # Fewer than three abnormal points in the window.
        self.assertFalse(continuous.is_slow_io_event(25)[0])
        # Three abnormal points, but not consecutive.
        self.assertFalse(continuous.is_slow_io_event(25)[0])
        # Three consecutive abnormal points.
        self.assertTrue(continuous.is_slow_io_event(25)[0])

    def test_median_sliding_window(self):
        median = MedianSlidingWindow(5, 3)
        absolute_threshold = AbsoluteThreshold(10, 8)
        absolute_threshold.attach_observer(median)
        absolute_threshold.set_threshold(24.5)
        data_list = [24, 24, 24, 25, 25]
        for data in data_list:
            self.assertFalse(median.is_slow_io_event(data)[0])
        self.assertTrue(median.is_slow_io_event(25)[0])

    def test_parse_collect_data(self):
        collect = {
            "read": [1.0, 2.0, 3.0, 4.0],
            "write": [5.0, 6.0, 7.0, 8.0],
            "flush": [9.0, 10.0, 11.0, 12.0],
            "discard": [13.0, 14.0, 15.0, 16.0],
        }
        # Import through the package, like the module-level imports of this
        # file: data_access uses a relative import (``from .io_data import``)
        # and therefore cannot be loaded as a top-level module.
        from sentryPlugins.ai_threshold_slow_io_detection.io_data import BaseData
        from sentryPlugins.ai_threshold_slow_io_detection.data_access import _get_io_stage_data

        io_data = _get_io_stage_data(collect)
        self.assertEqual(
            io_data.read, BaseData(latency=1.0, io_dump=2.0, io_length=3.0, iops=4.0)
        )
        self.assertEqual(
            io_data.write, BaseData(latency=5.0, io_dump=6.0, io_length=7.0, iops=8.0)
        )
        self.assertEqual(
            io_data.flush, BaseData(latency=9.0, io_dump=10.0, io_length=11.0, iops=12.0)
        )
        self.assertEqual(
            io_data.discard, BaseData(latency=13.0, io_dump=14.0, io_length=15.0, iops=16.0)
        )
a/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md +new file mode 100644 +index 0000000..f9b8388 +--- /dev/null ++++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/README.md +@@ -0,0 +1,2 @@ ++# slow_io_detection ++ +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py +new file mode 100644 +index 0000000..3f4f34e +--- /dev/null ++++ b/src/python/sentryPlugins/ai_threshold_slow_io_detection/alarm_report.py +@@ -0,0 +1,49 @@ ++# coding: utf-8 ++# Copyright (c) 2024 Huawei Technologies Co., Ltd. ++# sysSentry is licensed under the Mulan PSL v2. ++# You can use this software according to the terms and conditions of the Mulan PSL v2. ++# You may obtain a copy of Mulan PSL v2 at: ++# http://license.coscl.org.cn/MulanPSL2 ++# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++# PURPOSE. ++# See the Mulan PSL v2 for more details. 
class AlarmReport:
    """Report results of this plugin through ``report_result`` and log them.

    Every public ``report_*`` method forwards ``info`` wrapped as the JSON
    object ``{"msg": info}`` under the task name ``TASK_NAME`` with the
    matching ``ResultLevel``, then writes an INFO log line for the report.
    """

    TASK_NAME = "SLOW_IO_DETECTION"

    @staticmethod
    def _emit(level, label: str, info: str):
        """Send one report at ``level`` and log it using ``label`` as the tag."""
        payload = json.dumps({"msg": info})
        report_result(AlarmReport.TASK_NAME, level, payload)
        logging.info(f'Report {AlarmReport.TASK_NAME} {label}: {info}')

    @staticmethod
    def report_pass(info: str):
        AlarmReport._emit(ResultLevel.PASS, "PASS", info)

    @staticmethod
    def report_fail(info: str):
        AlarmReport._emit(ResultLevel.FAIL, "FAIL", info)

    @staticmethod
    def report_skip(info: str):
        AlarmReport._emit(ResultLevel.SKIP, "SKIP", info)

    @staticmethod
    def report_minor_alm(info: str):
        AlarmReport._emit(ResultLevel.MINOR_ALM, "MINOR_ALM", info)

    @staticmethod
    def report_major_alm(info: str):
        AlarmReport._emit(ResultLevel.MAJOR_ALM, "MAJOR_ALM", info)

    @staticmethod
    def report_critical_alm(info: str):
        AlarmReport._emit(ResultLevel.CRITICAL_ALM, "CRITICAL_ALM", info)
class ConfigParser:
    """Load the plugin's INI configuration and expose it through getters.

    Three sections are read: ``[common]``, ``[algorithm]`` and
    ``[sliding_window]``. Every option has a class-level default that is used
    when the option (or its whole section) is missing, or when its value
    cannot be converted to the expected numeric type.
    """

    DEFAULT_ABSOLUTE_THRESHOLD = 40
    DEFAULT_SLOW_IO_DETECTION_FREQUENCY = 1
    DEFAULT_LOG_LEVEL = 'info'
    DEFAULT_TRAIN_DATA_DURATION = 24
    DEFAULT_TRAIN_UPDATE_DURATION = 2
    DEFAULT_ALGORITHM_TYPE = 'boxplot'
    DEFAULT_N_SIGMA_PARAMETER = 3
    DEFAULT_BOXPLOT_PARAMETER = 1.5
    DEFAULT_SLIDING_WINDOW_TYPE = 'not_continuous'
    DEFAULT_WINDOW_SIZE = 30
    DEFAULT_WINDOW_MINIMUM_THRESHOLD = 6

    def __init__(self, config_file_name):
        """Remember the file path; nothing is read until ``read_config_from_file``."""
        self.__boxplot_parameter = None
        self.__window_minimum_threshold = None
        self.__window_size = None
        self.__sliding_window_type = None
        self.__n_sigma_parameter = None
        self.__algorithm_type = None
        self.__train_update_duration = None
        self.__log_level = None
        self.__slow_io_detect_frequency = None
        self.__absolute_threshold = None
        self.__train_data_duration = None
        self.__config_file_name = config_file_name

    @staticmethod
    def _section_items(con, section):
        """Return the options of ``section`` as a dict, or ``{}`` if it is absent.

        ``con.items(section)`` raises ``NoSectionError`` for a missing section;
        since every option has a default, a missing section only warrants a
        warning.
        """
        if not con.has_section(section):
            logging.warning(f'section [{section}] is missing in config file, use default values.')
            return {}
        return dict(con.items(section))

    @staticmethod
    def _typed_option(items, key, caster, default, label=None):
        """Return ``caster(items[key])``, falling back to ``default``.

        When the key is absent the default is passed through ``caster`` (so a
        float option yields a float). When conversion raises ``ValueError`` a
        warning is logged and the raw default is returned.
        """
        try:
            return caster(items.get(key, default))
        except ValueError:
            logging.warning(f'{label or key} type conversion has error, use default value.')
            return default

    def read_config_from_file(self):
        """Parse the configuration file and populate all settings."""
        con = configparser.ConfigParser()
        # ConfigParser.read() silently skips a missing file; the section
        # lookups below then fall back to the defaults.
        con.read(self.__config_file_name, encoding='utf-8')

        items_common = self._section_items(con, 'common')
        items_algorithm = self._section_items(con, 'algorithm')
        items_sliding_window = self._section_items(con, 'sliding_window')

        self.__absolute_threshold = self._typed_option(
            items_common, 'absolute_threshold', int,
            ConfigParser.DEFAULT_ABSOLUTE_THRESHOLD, label='absolute threshold')
        self.__slow_io_detect_frequency = self._typed_option(
            items_common, 'slow_io_detect_frequency', int,
            ConfigParser.DEFAULT_SLOW_IO_DETECTION_FREQUENCY)
        self.__log_level = items_common.get('log_level', ConfigParser.DEFAULT_LOG_LEVEL)

        self.__train_data_duration = self._typed_option(
            items_algorithm, 'train_data_duration', float,
            ConfigParser.DEFAULT_TRAIN_DATA_DURATION)
        self.__train_update_duration = self._typed_option(
            items_algorithm, 'train_update_duration', float,
            ConfigParser.DEFAULT_TRAIN_UPDATE_DURATION)
        # A plain string lookup cannot fail with ValueError, so no fallback
        # handling is needed here.
        self.__algorithm_type = items_algorithm.get('algorithm_type', ConfigParser.DEFAULT_ALGORITHM_TYPE)

        # Only the parameter of the selected algorithm is parsed; the other
        # one stays None.
        if self.__algorithm_type == 'n_sigma':
            self.__n_sigma_parameter = self._typed_option(
                items_algorithm, 'n_sigma_parameter', float,
                ConfigParser.DEFAULT_N_SIGMA_PARAMETER)
        elif self.__algorithm_type == 'boxplot':
            self.__boxplot_parameter = self._typed_option(
                items_algorithm, 'boxplot_parameter', float,
                ConfigParser.DEFAULT_BOXPLOT_PARAMETER)

        self.__sliding_window_type = items_sliding_window.get('sliding_window_type',
                                                              ConfigParser.DEFAULT_SLIDING_WINDOW_TYPE)
        self.__window_size = self._typed_option(
            items_sliding_window, 'window_size', int,
            ConfigParser.DEFAULT_WINDOW_SIZE)
        self.__window_minimum_threshold = self._typed_option(
            items_sliding_window, 'window_minimum_threshold', int,
            ConfigParser.DEFAULT_WINDOW_MINIMUM_THRESHOLD)

    def get_slow_io_detect_frequency(self):
        return self.__slow_io_detect_frequency

    def get_algorithm_type(self):
        return self.__algorithm_type

    def get_sliding_window_type(self):
        return self.__sliding_window_type

    def get_train_data_duration_and_train_update_duration(self):
        return self.__train_data_duration, self.__train_update_duration

    def get_window_size_and_window_minimum_threshold(self):
        return self.__window_size, self.__window_minimum_threshold

    def get_absolute_threshold(self):
        return self.__absolute_threshold

    def get_log_level(self):
        return self.__log_level

    def get_n_sigma_parameter(self):
        """Return the n-sigma multiplier, or None unless algorithm_type is 'n_sigma'."""
        return self.__n_sigma_parameter

    def get_boxplot_parameter(self):
        """Return the boxplot multiplier, or None unless algorithm_type is 'boxplot'."""
        return self.__boxplot_parameter
COLLECT_STAGES = [
    "throtl",
    "wbt",
    "gettag",
    "plug",
    "bfq",
    "hctx",
    "requeue",
    "rq_driver",
    "bio",
    "iocost",
]

# IO access types requested from the collector and parsed per stage.
_IO_ACCESS_TYPES = ("read", "write", "flush", "discard")


def check_collect_valid(period):
    """Return the top-level keys reported by ``is_iocollect_valid(period)``.

    The caller treats the returned keys as the list of disks to monitor.
    Returns an empty list when the collector reports a non-zero ``ret`` or
    when its ``message`` is not valid JSON.
    """
    data_raw = is_iocollect_valid(period)
    if data_raw["ret"] != 0:
        return []
    try:
        data = json.loads(data_raw["message"])
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a
        # non-string message.
        logging.warning(f"get io data failed, {e}")
        return []
    return list(data.keys())


def _get_raw_data(period, disk_list):
    """Fetch raw IO data for ``disk_list`` over all stages and access types."""
    return get_io_data(
        period,
        disk_list,
        COLLECT_STAGES,
        list(_IO_ACCESS_TYPES),
    )


def _get_io_stage_data(data):
    """Convert one stage's raw mapping into an ``IOStageData``.

    ``data`` maps an access type to a sequence whose first four items are
    read as latency, io_dump, io_length and iops (in that order). Access
    types missing from ``data`` keep their default values.
    """
    io_stage_data = IOStageData()
    for data_type in _IO_ACCESS_TYPES:
        if data_type not in data:
            continue
        base_data = getattr(io_stage_data, data_type)
        values = data[data_type]
        base_data.latency = values[0]
        base_data.io_dump = values[1]
        base_data.io_length = values[2]
        base_data.iops = values[3]
    return io_stage_data


def get_io_data_from_collect_plug(period, disk_list):
    """Return ``{disk: IOData}`` parsed from the collector, or None on failure.

    None is returned (after logging a warning) when the collector reports a
    non-zero ``ret`` or when its ``message`` is not valid JSON. Stage names
    that ``IOData`` has no attribute for are skipped with a debug log.
    """
    data_raw = _get_raw_data(period, disk_list)
    if data_raw["ret"] != 0:
        logging.warning(f'get io data failed with message: {data_raw["message"]}')
        return None

    try:
        data = json.loads(data_raw["message"])
    except (ValueError, TypeError) as e:
        logging.warning(f"get io data failed, {e}")
        return None

    ret = {}
    for disk, disk_data in data.items():
        disk_ret = IOData()
        for k, v in disk_data.items():
            if not hasattr(disk_ret, k):
                logging.debug(f'no attr {k}')
                continue
            setattr(disk_ret, k, _get_io_stage_data(v))
        ret[disk] = disk_ret
    return ret
class Detector:
    """Tie one metric to a threshold and a sliding window.

    On construction the sliding window is attached to the threshold as an
    observer. Each detection round feeds the metric's latest value to the
    threshold and asks the sliding window whether a slow-IO event occurred.
    """

    _metric_name: MetricName = None
    _threshold: Threshold = None
    _slidingWindow: SlidingWindow = None

    def __init__(self, metric_name: MetricName, threshold: Threshold, sliding_window: SlidingWindow):
        self._metric_name = metric_name
        self._threshold = threshold
        self._slidingWindow = sliding_window
        self._threshold.attach_observer(self._slidingWindow)

    def get_metric_name(self):
        """Return the ``MetricName`` this detector watches."""
        return self._metric_name

    def is_slow_io_event(self, io_data_dict_with_disk_name: dict):
        """Run one detection round and return the sliding window's verdict.

        Returns the tuple produced by the sliding window's
        ``is_slow_io_event``, or ``(False, None, None)`` when no value is
        available for this metric.
        """
        logging.debug(f'Enter Detector: {self}')
        metric_value = get_metric_value_from_io_data_dict_by_metric_name(io_data_dict_with_disk_name, self._metric_name)
        # NOTE(review): the helper presumably yields None when the disk/stage
        # has no sample (BaseData fields default to None) - confirm in utils.
        # Comparing None with a float would raise TypeError, so skip the round.
        if metric_value is None:
            logging.debug('No metric value available, skip this detection round.')
            logging.debug(f'Exit Detector: {self}')
            return False, None, None
        # Only values above the 1e-6 cut-off are fed to the threshold's
        # training data; the sliding window still receives every value.
        if metric_value > 1e-6:
            logging.debug(f'Input metric value: {str(metric_value)}')
            self._threshold.push_latest_data_to_queue(metric_value)
        detection_result = self._slidingWindow.is_slow_io_event(metric_value)
        logging.debug(f'Detection result: {str(detection_result)}')
        logging.debug(f'Exit Detector: {self}')
        return detection_result

    def __repr__(self):
        return (f'disk_name: {self._metric_name.get_disk_name()}, stage_name: {self._metric_name.get_stage_name()},'
                f' access_type_name: {self._metric_name.get_io_access_type_name()},'
                f' metric_name: {self._metric_name.get_metric_name()}, threshold_type: {self._threshold},'
                f' sliding_window_type: {self._slidingWindow}')
@dataclass
class BaseData:
    """Metrics of one IO access type; every field is None until populated."""
    latency: Optional[float] = None
    io_dump: Optional[int] = None
    io_length: Optional[int] = None
    iops: Optional[int] = None


@dataclass
class IOStageData:
    """Metrics of one IO stage, split by access type."""
    read: BaseData = field(default_factory=BaseData)
    write: BaseData = field(default_factory=BaseData)
    flush: BaseData = field(default_factory=BaseData)
    discard: BaseData = field(default_factory=BaseData)


@dataclass
class IOData:
    """Metrics of one disk, split by IO stage, stamped at creation time."""
    throtl: IOStageData = field(default_factory=IOStageData)
    wbt: IOStageData = field(default_factory=IOStageData)
    gettag: IOStageData = field(default_factory=IOStageData)
    iocost: IOStageData = field(default_factory=IOStageData)
    plug: IOStageData = field(default_factory=IOStageData)
    bfq: IOStageData = field(default_factory=IOStageData)
    hctx: IOStageData = field(default_factory=IOStageData)
    requeue: IOStageData = field(default_factory=IOStageData)
    rq_driver: IOStageData = field(default_factory=IOStageData)
    bio: IOStageData = field(default_factory=IOStageData)
    # POSIX timestamp taken when the instance is created.
    time_stamp: float = field(default_factory=lambda: datetime.now().timestamp())


class MetricName:
    """Identify one metric by disk, stage, IO access type and metric name."""

    _disk_name: str = None
    _stage_name: str = None
    _io_access_type_name: str = None
    _metric_name: str = None

    def __init__(self, disk_name: str, stage_name: str, io_access_type_name: str, metric_name: str):
        self._disk_name = disk_name
        self._stage_name = stage_name
        self._io_access_type_name = io_access_type_name
        self._metric_name = metric_name

    def get_disk_name(self):
        return self._disk_name

    def get_stage_name(self):
        return self._stage_name

    def get_io_access_type_name(self):
        return self._io_access_type_name

    def get_metric_name(self):
        return self._metric_name

    def __repr__(self):
        # Fields are separated by ", " throughout (the separator before
        # "metric" used to lack the space).
        return (f'disk: {self._disk_name}, stage: {self._stage_name}, io_access_type: {self._io_access_type_name},'
                f' metric: {self._metric_name}')
@unique
class SlidingWindowType(Enum):
    NotContinuousSlidingWindow = 0
    ContinuousSlidingWindow = 1
    MedianSlidingWindow = 2


class SlidingWindow:
    """Fixed-length window of recent samples, each tagged normal/abnormal.

    A sample is abnormal when it is greater than or equal to the current
    threshold. While no threshold is set (None) every sample is tagged
    normal. ``update`` installs a new threshold and re-tags the stored
    samples.
    """

    _ai_threshold = None
    _queue_length = None
    _queue_threshold = None
    _io_data_queue: list = None
    _io_data_queue_abnormal_tag: list = None

    def __init__(self, queue_length: int, threshold: int):
        """
        :param queue_length: number of samples kept in the window.
        :param threshold: number of abnormal samples the subclasses compare
            against (not the abnormality threshold, which comes via ``update``).
        """
        self._queue_length = queue_length
        self._queue_threshold = threshold
        self._io_data_queue = []
        self._io_data_queue_abnormal_tag = []

    def _is_abnormal(self, data) -> bool:
        """Return True when ``data`` reaches the current threshold."""
        return self._ai_threshold is not None and bool(data >= self._ai_threshold)

    def _is_ready(self) -> bool:
        """Return True once the window is full and a threshold is available."""
        return len(self._io_data_queue) >= self._queue_length and self._ai_threshold is not None

    def push(self, data: float):
        """Append ``data``, dropping the oldest sample when the window is full."""
        if len(self._io_data_queue) == self._queue_length:
            self._io_data_queue.pop(0)
            self._io_data_queue_abnormal_tag.pop(0)
        self._io_data_queue.append(data)
        self._io_data_queue_abnormal_tag.append(self._is_abnormal(data))

    def update(self, threshold):
        """Install ``threshold`` and re-tag every stored sample.

        A threshold of None is accepted and resets all tags to normal
        (comparing samples with None would raise TypeError).
        """
        if self._ai_threshold == threshold:
            return
        self._ai_threshold = threshold
        self._io_data_queue_abnormal_tag[:] = [self._is_abnormal(data) for data in self._io_data_queue]

    def is_slow_io_event(self, data):
        """Return ``(is_slow, window, threshold)``; the base class never fires."""
        return False, None, None

    def __repr__(self):
        return "SlidingWindow"


class NotContinuousSlidingWindow(SlidingWindow):
    """Fire when the window holds at least ``threshold`` abnormal samples."""

    def is_slow_io_event(self, data):
        super().push(data)
        if not self._is_ready():
            return False, self._io_data_queue, self._ai_threshold
        if self._io_data_queue_abnormal_tag.count(True) >= self._queue_threshold:
            return True, self._io_data_queue, self._ai_threshold
        return False, self._io_data_queue, self._ai_threshold

    def __repr__(self):
        return "NotContinuousSlidingWindow"


class ContinuousSlidingWindow(SlidingWindow):
    """Fire when ``threshold`` abnormal samples appear consecutively."""

    def is_slow_io_event(self, data):
        super().push(data)
        if not self._is_ready():
            return False, self._io_data_queue, self._ai_threshold
        consecutive_count = 0
        for tag in self._io_data_queue_abnormal_tag:
            if tag:
                consecutive_count += 1
                if consecutive_count >= self._queue_threshold:
                    return True, self._io_data_queue, self._ai_threshold
            else:
                consecutive_count = 0
        return False, self._io_data_queue, self._ai_threshold

    def __repr__(self):
        return "ContinuousSlidingWindow"


class MedianSlidingWindow(SlidingWindow):
    """Fire when the median of the window reaches the threshold."""

    def is_slow_io_event(self, data):
        super().push(data)
        if not self._is_ready():
            return False, self._io_data_queue, self._ai_threshold
        median = np.median(self._io_data_queue)
        if median >= self._ai_threshold:
            return True, self._io_data_queue, self._ai_threshold
        return False, self._io_data_queue, self._ai_threshold

    def __repr__(self):
        return "MedianSlidingWindow"


class SlidingWindowFactory:
    """Build the sliding window matching a ``SlidingWindowType``."""

    _WINDOW_CLASSES = {
        SlidingWindowType.NotContinuousSlidingWindow: NotContinuousSlidingWindow,
        SlidingWindowType.ContinuousSlidingWindow: ContinuousSlidingWindow,
        SlidingWindowType.MedianSlidingWindow: MedianSlidingWindow,
    }

    def get_sliding_window(self, sliding_window_type: SlidingWindowType, *args, **kwargs):
        """Return a new window; unknown types fall back to the not-continuous one."""
        window_cls = self._WINDOW_CLASSES.get(sliding_window_type, NotContinuousSlidingWindow)
        return window_cls(*args, **kwargs)
def sig_handler(signum, frame):
    """Report the received signal as a task failure and terminate the process.

    Installed by main() for SIGINT and SIGTERM.

    Args:
        signum: number of the received signal; also used as the exit status.
        frame: current stack frame (unused).
    """
    logging.info("receive signal: %d", signum)
    AlarmReport().report_fail(f"receive signal: {signum}")
    # Raise SystemExit directly instead of calling the ``exit`` helper, which
    # is only injected by the ``site`` module for interactive use.
    raise SystemExit(signum)


class SlowIODetection:
    """Periodic slow-IO detector driven by the plugin configuration.

    One Detector (threshold + sliding window) is created per disk and per
    read/write latency metric; launch() then polls the collector forever and
    reports every detected slow-IO event.
    """

    _config_parser = None
    _disk_list = None
    # The containers are created per instance in __init__; class-level
    # list/dict objects would be shared (and mutated) by every instance.
    _detector_name_list = None
    _detectors = None

    def __init__(self, config_parser: ConfigParser):
        """Configure logging and build all detectors from *config_parser*."""
        self._config_parser = config_parser
        self._detector_name_list = []
        self._detectors = {}
        self.__set_log_format()
        self.__init_detector_name_list()
        self.__init_detector()

    def __set_log_format(self):
        """Initialise the root logger with the configured level and format."""
        log_format = "%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s"
        log_level = get_log_level(self._config_parser.get_log_level())
        logging.basicConfig(level=log_level, format=log_format)

    def __init_detector_name_list(self):
        """Create the read and write "bio" latency metric names for every disk."""
        self._disk_list = check_collect_valid(self._config_parser.get_slow_io_detect_frequency())
        for disk in self._disk_list:
            self._detector_name_list.append(MetricName(disk, "bio", "read", "latency"))
            self._detector_name_list.append(MetricName(disk, "bio", "write", "latency"))

    def __init_detector(self):
        """Create one Detector per metric name from the configured algorithm."""
        train_data_duration, train_update_duration = (self._config_parser.
                                                      get_train_data_duration_and_train_update_duration())
        slow_io_detection_frequency = self._config_parser.get_slow_io_detect_frequency()
        threshold_type = get_threshold_type_enum(self._config_parser.get_algorithm_type())
        data_queue_size, update_size = get_data_queue_size_and_update_size(train_data_duration,
                                                                           train_update_duration,
                                                                           slow_io_detection_frequency)
        sliding_window_type = get_sliding_window_type_enum(self._config_parser.get_sliding_window_type())
        window_size, window_threshold = self._config_parser.get_window_size_and_window_minimum_threshold()

        for detector_name in self._detector_name_list:
            # NOTE(review): only the queue sizes are forwarded, so the
            # boxplot / n-sigma thresholds run with their built-in default
            # ``parameter``; the boxplot_parameter / n_sigma_parameter values
            # from the ini file do not reach them here - confirm and wire in.
            threshold = ThresholdFactory().get_threshold(threshold_type, data_queue_size=data_queue_size,
                                                         data_queue_update_size=update_size)
            sliding_window = SlidingWindowFactory().get_sliding_window(sliding_window_type, queue_length=window_size,
                                                                       threshold=window_threshold)
            detector = Detector(detector_name, threshold, sliding_window)
            # An absolute threshold is not learned from data: seed it from the configuration.
            if isinstance(threshold, AbsoluteThreshold):
                threshold.set_threshold(self._config_parser.get_absolute_threshold())
            self._detectors[detector_name] = detector
            logging.info(f"add detector: {detector}")

    def launch(self):
        """Run the detection loop forever: collect, detect, report, sleep."""
        while True:
            logging.debug('step0. AI threshold slow io event detection is looping.')

            # Step 1: fetch the latest IO data from the collector plugin.
            io_data_dict_with_disk_name = get_io_data_from_collect_plug(
                self._config_parser.get_slow_io_detect_frequency(), self._disk_list
            )
            logging.debug(f'step1. Get io data: {str(io_data_dict_with_disk_name)}')
            if io_data_dict_with_disk_name is None:
                # Wait one detection period before retrying so that a
                # collector which keeps returning nothing cannot turn this
                # loop into a busy spin.
                time.sleep(self._config_parser.get_slow_io_detect_frequency())
                continue

            # Step 2: run every detector on the new sample.
            logging.debug('step2. Start to detection slow io event.')
            slow_io_event_list = []
            for detector in self._detectors.values():
                result = detector.is_slow_io_event(io_data_dict_with_disk_name)
                if result[0]:
                    slow_io_event_list.append((detector.get_metric_name(), result))
            logging.debug('step2. End to detection slow io event.')

            # Step 3: report every slow-IO event to sysSentry.
            logging.debug('step3. Report slow io event to sysSentry.')
            for slow_io_event in slow_io_event_list:
                metric_name: MetricName = slow_io_event[0]
                result = slow_io_event[1]
                # "stage" must come from get_stage_name(); get_metric_name()
                # is already reported in the "metric" field below.
                AlarmReport.report_major_alm(f"disk {metric_name.get_disk_name()} has slow io event."
                                             f"stage: {metric_name.get_stage_name()},"
                                             f"type: {metric_name.get_io_access_type_name()},"
                                             f"metric: {metric_name.get_metric_name()},"
                                             f"current window: {result[1]},"
                                             f"threshold: {result[2]}")
                logging.error(f"slow io event happen: {str(slow_io_event)}")

            # Step 4: wait for the next detection period.
            logging.debug('step4. Wait to start next slow io event detection loop.')
            time.sleep(self._config_parser.get_slow_io_detect_frequency())


def main():
    """Entry point: install signal handlers, load the config, start detection."""
    # Step 1: register the signal handlers.
    signal.signal(signal.SIGINT, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)
    # Step 2: resume from a checkpoint.
    # TODO: not implemented yet.

    # Step 3: read the configuration.
    config_file_name = CONFIG_FILE
    config = ConfigParser(config_file_name)
    config.read_config_from_file()

    # Step 4: start slow-IO detection (never returns).
    slow_io_detection = SlowIODetection(config)
    slow_io_detection.launch()
class ThresholdState(Enum):
    """Life cycle of a threshold: INIT until a value exists, START afterwards."""
    INIT = 0
    START = 1


class Threshold:
    """Base class holding a threshold value and the sample queue used to learn it.

    NOTE: ``queue.Queue`` treats ``maxsize <= 0`` as unbounded. With such a
    ``data_queue_size`` the queue never reports full, so the self-learning
    subclasses would never leave the INIT state.
    """

    threshold = None
    data_queue: queue.Queue = None
    data_queue_update_size: int = None
    new_data_size: int = None
    threshold_state: ThresholdState = None

    def __init__(self, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
        """Create an uninitialised threshold.

        Args:
            data_queue_size: capacity of the training sample queue.
            data_queue_update_size: number of new samples after which an
                already started threshold is recomputed.
        """
        self._observer = None
        self.data_queue = queue.Queue(data_queue_size)
        self.data_queue_update_size = data_queue_update_size
        self.new_data_size = 0
        self.threshold_state = ThresholdState.INIT
        self.threshold = math.inf

    def set_threshold(self, threshold):
        """Set the threshold explicitly, mark it started and notify the observer."""
        self.threshold = threshold
        self.threshold_state = ThresholdState.START
        self.notify_observer()

    def get_threshold(self):
        """Return the current threshold, or None while still in the INIT state."""
        if self.threshold_state == ThresholdState.INIT:
            return None
        return self.threshold

    def is_abnormal(self, data):
        """Return whether *data* reaches the threshold (always False in INIT)."""
        if self.threshold_state == ThresholdState.INIT:
            return False
        return data >= self.threshold

    # Observer pattern: when the threshold changes, the attached sliding
    # window is refreshed automatically.
    def attach_observer(self, observer: "SlidingWindow"):
        """Register the object whose ``update(threshold)`` is called on changes."""
        self._observer = observer

    def notify_observer(self):
        """Push the current threshold to the attached observer, if any."""
        if self._observer is not None:
            self._observer.update(self.threshold)

    def push_latest_data_to_queue(self, data):
        """Feed one sample to the threshold; the base class ignores it."""
        pass

    def _update_threshold(self):
        """Recompute ``self.threshold`` from ``data_queue`` (self-learning subclasses)."""
        raise NotImplementedError

    def _push_and_refresh(self, data):
        """Append *data* to the bounded queue and recompute the threshold when due.

        The oldest sample is dropped once the queue is full. The threshold is
        recomputed when the queue is full and either no threshold exists yet
        (INIT) or ``data_queue_update_size`` new samples have arrived since
        the last computation.
        """
        try:
            self.data_queue.put(data, block=False)
        except queue.Full:
            # Full queue: evict the oldest sample to make room.
            self.data_queue.get_nowait()
            self.data_queue.put(data)
        self.new_data_size += 1
        if not self.data_queue.full():
            return
        first_fill = self.threshold_state == ThresholdState.INIT
        refresh_due = (self.threshold_state == ThresholdState.START and
                       self.new_data_size >= self.data_queue_update_size)
        if first_fill or refresh_due:
            self._update_threshold()
            self.new_data_size = 0

    def __repr__(self):
        return "Threshold"


class AbsoluteThreshold(Threshold):
    """Fixed threshold set through set_threshold(); samples are not learned from."""

    def push_latest_data_to_queue(self, data):
        """Ignore the sample: an absolute threshold never changes with data."""
        pass

    def __repr__(self):
        return "AbsoluteThreshold"


class BoxplotThreshold(Threshold):
    """Self-learning threshold ``Q3 + parameter * IQR`` over the sample queue."""

    def __init__(self, parameter: float = 1.5, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
        super().__init__(data_queue_size, data_queue_update_size)
        self.parameter = parameter

    def _update_threshold(self):
        """Recompute the boxplot upper fence and notify the observer."""
        data = list(self.data_queue.queue)
        q1 = np.percentile(data, 25)
        q3 = np.percentile(data, 75)
        iqr = q3 - q1
        self.threshold = q3 + self.parameter * iqr
        if self.threshold_state == ThresholdState.INIT:
            self.threshold_state = ThresholdState.START
        self.notify_observer()

    def push_latest_data_to_queue(self, data):
        """Record *data* and recompute the threshold when an update is due."""
        self._push_and_refresh(data)

    def __repr__(self):
        return "BoxplotThreshold"


class NSigmaThreshold(Threshold):
    """Self-learning threshold ``mean + parameter * std`` over the sample queue."""

    def __init__(self, parameter: float = 2.0, data_queue_size: int = 10000, data_queue_update_size: int = 1000):
        super().__init__(data_queue_size, data_queue_update_size)
        self.parameter = parameter

    def _update_threshold(self):
        """Recompute the n-sigma bound and notify the observer."""
        data = list(self.data_queue.queue)
        mean = np.mean(data)
        std = np.std(data)
        self.threshold = mean + self.parameter * std
        if self.threshold_state == ThresholdState.INIT:
            self.threshold_state = ThresholdState.START
        self.notify_observer()

    def push_latest_data_to_queue(self, data):
        """Record *data* and recompute the threshold when an update is due."""
        self._push_and_refresh(data)

    def __repr__(self):
        return "NSigmaThreshold"


class ThresholdType(Enum):
    """Selectable threshold algorithms."""
    AbsoluteThreshold = 0
    BoxplotThreshold = 1
    NSigmaThreshold = 2


class ThresholdFactory:
    """Builds the threshold implementation matching a ThresholdType."""

    def get_threshold(self, threshold_type: ThresholdType, *args, **kwargs):
        """Return a new threshold of *threshold_type*.

        Extra positional and keyword arguments are forwarded to the
        constructor of the selected class.

        Raises:
            ValueError: if *threshold_type* is not a known ThresholdType.
        """
        if threshold_type == ThresholdType.AbsoluteThreshold:
            return AbsoluteThreshold(*args, **kwargs)
        elif threshold_type == ThresholdType.BoxplotThreshold:
            return BoxplotThreshold(*args, **kwargs)
        elif threshold_type == ThresholdType.NSigmaThreshold:
            return NSigmaThreshold(*args, **kwargs)
        else:
            raise ValueError(f"Invalid threshold type: {threshold_type}")
def get_threshold_type_enum(algorithm_type: str):
    """Map the configured algorithm name (case-insensitive) to a ThresholdType.

    Recognised names are 'absolute', 'boxplot' and 'n_sigma'; any other name
    is logged and falls back to the boxplot threshold.
    """
    known_types = {
        'absolute': ThresholdType.AbsoluteThreshold,
        'boxplot': ThresholdType.BoxplotThreshold,
        'n_sigma': ThresholdType.NSigmaThreshold,
    }
    threshold_type = known_types.get(algorithm_type.lower())
    if threshold_type is None:
        logging.info('not found correct algorithm type, use default: boxplot.')
        return ThresholdType.BoxplotThreshold
    return threshold_type


def get_sliding_window_type_enum(sliding_window_type: str):
    """Map the configured window name (case-insensitive) to a SlidingWindowType.

    Recognised names are 'not_continuous', 'continuous' and 'median'; any
    other name is logged and falls back to the not-continuous window.
    """
    known_types = {
        'not_continuous': SlidingWindowType.NotContinuousSlidingWindow,
        'continuous': SlidingWindowType.ContinuousSlidingWindow,
        'median': SlidingWindowType.MedianSlidingWindow,
    }
    window_type = known_types.get(sliding_window_type.lower())
    if window_type is None:
        logging.info('not found correct sliding window type, use default: not_continuous.')
        return SlidingWindowType.NotContinuousSlidingWindow
    return window_type


def get_metric_value_from_io_data_dict_by_metric_name(io_data_dict: dict, metric_name: "MetricName"):
    """Look up the value addressed by *metric_name* inside *io_data_dict*.

    The lookup path is disk name -> stage name -> IO access type -> metric
    name, where the per-disk entry is a dataclass converted with ``asdict``.

    Returns:
        The metric value, or None when any key along the path is missing.
    """
    try:
        io_data: IOData = io_data_dict[metric_name.get_disk_name()]
        io_stage_data = asdict(io_data)[metric_name.get_stage_name()]
        base_data = io_stage_data[metric_name.get_io_access_type_name()]
        metric_value = base_data[metric_name.get_metric_name()]
        return metric_value
    except KeyError:
        return None


def get_data_queue_size_and_update_size(training_data_duration: float, train_update_duration: float,
                                        slow_io_detect_frequency: int):
    """Convert training durations into sample counts.

    Both durations are multiplied by 60 * 60 and divided by the detection
    period, then truncated to int.

    Returns:
        A tuple ``(data_queue_size, update_size)``.

    Raises:
        ZeroDivisionError: if *slow_io_detect_frequency* is 0.
    """
    data_queue_size = int(training_data_duration * 60 * 60 / slow_io_detect_frequency)
    update_size = int(train_update_duration * 60 * 60 / slow_io_detect_frequency)
    return data_queue_size, update_size


def get_log_level(log_level: str):
    """Translate a level name (case-insensitive) into a ``logging`` level.

    Supports 'debug', 'info', 'warning', 'error', 'critical' and 'fatal'.

    Returns:
        The matching ``logging`` constant, or None for an unknown name.
    """
    levels = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        # 'error' and 'critical' are standard level names; without them the
        # configured level would be silently dropped.
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
        'fatal': logging.FATAL,
    }
    return levels.get(log_level.lower())
Supports slow I/O detection based on AI + %prep %autosetup -n %{name}-%{version} -p1 @@ -126,6 +135,10 @@ chrpath -d %{buildroot}%{_libdir}/libcpu_patrol.so install config/tasks/avg_block_io.mod %{buildroot}/etc/sysSentry/tasks/ install config/plugins/avg_block_io.ini %{buildroot}/etc/sysSentry/plugins/avg_block_io.ini +# ai_threshold_slow_io_detection +install config/tasks/ai_threshold_slow_io_detection.mod %{buildroot}/etc/sysSentry/tasks/ +install config/plugins/ai_threshold_slow_io_detection.ini %{buildroot}/etc/sysSentry/plugins/ai_threshold_slow_io_detection.ini + pushd src/python python3 setup.py install -O1 --root=$RPM_BUILD_ROOT --record=SENTRY_FILES popd @@ -159,6 +172,7 @@ rm -rf %{buildroot} %attr(0550,root,root) %{python3_sitelib}/syssentry %attr(0550,root,root) %{python3_sitelib}/sentryCollector %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io +%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_threshold_slow_io_detection # sysSentry %attr(0500,root,root) %{_bindir}/sentryctl @@ -190,6 +204,12 @@ rm -rf %{buildroot} %exclude %{_bindir}/avg_block_io %exclude %{python3_sitelib}/sentryPlugins/* +# ai_threshold_slow_io_detection +%exclude %{_sysconfdir}/sysSentry/tasks/ai_threshold_slow_io_detection.mod +%exclude %{_sysconfdir}/sysSentry/plugins/ai_threshold_slow_io_detection.ini +%exclude %{_bindir}/ai_threshold_slow_io_detection +%exclude %{python3_sitelib}/sentryPlugins/* + # sentryCollector %attr(0550,root,root) %{_bindir}/sentryCollector %attr(0600,root,root) %{_sysconfdir}/sysSentry/collector.conf @@ -217,7 +237,19 @@ rm -rf %{buildroot} %attr(0600,root,root) %{_sysconfdir}/sysSentry/plugins/avg_block_io.ini %attr(0550,root,root) %{python3_sitelib}/sentryPlugins/avg_block_io +%files -n ai_threshold_slow_io_detection +%attr(0500,root,root) %{_bindir}/ai_threshold_slow_io_detection +%attr(0600,root,root) %config(noreplace) %{_sysconfdir}/sysSentry/tasks/ai_threshold_slow_io_detection.mod +%attr(0600,root,root) 
%{_sysconfdir}/sysSentry/plugins/ai_threshold_slow_io_detection.ini +%attr(0550,root,root) %{python3_sitelib}/sentryPlugins/ai_threshold_slow_io_detection + %changelog +* Mon Sep 23 2024 heyouzhi - 1.0.2-15 +- Type:requirement +- CVE:NA +- SUG:NA +- DESC:add ai threshold slow io detection plugin + * Fri Sep 20 2024 zhuofeng - 1.0.2-14 - Type:requirement - CVE:NA