diff --git a/config/.env.yaml b/config/.env.yaml index ec0d99792650b645919a0f7edced6c17d445ed4b..691a92249ef5526334e94530c037d6cb4ebbc058 100644 --- a/config/.env.yaml +++ b/config/.env.yaml @@ -28,6 +28,17 @@ servers: # max_retries: 3 # delay: 1.0 +# servers: +# - ip: "" +# host_user: "" +# password: "" +# port: +# app: "gaussdb" +# target_process_name: "gaussdb" +# business_context: "高并发数据库服务,CPU负载主要集中在用户态处理" +# max_retries: 3 +# delay: 1.0 + #servers: # - ip: "" # host_user: "" diff --git a/config/app_config.yaml b/config/app_config.yaml index 5e03efa8565356ab58c76cad9108b23f1e250ad8..724d376680ff7dbb779cc2a6706cc9ab721b0445 100644 --- a/config/app_config.yaml +++ b/config/app_config.yaml @@ -38,6 +38,18 @@ nginx: benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/nginx/parse_benchmark.sh $host_ip $port" performance_metric: "QPS" +gaussdb: + user: "" + password: "" + config_file: "/path/of/tpchnode" + port: 5432 + set_param_template: 'gs_guc reload -Z datanode -D /data2/zjh/gitcode/data/tpchnode -c "$param_name=$param_value"' + get_param_template: 'grep -oP "^\s*$param_name\s*=\s*\K.*" "$config_file"' + stop_workload: "gs_ctl stop -D /data2/zjh/gitcode/data/tpchnode" + start_workload: "gs_ctl start -D /data2/zjh/gitcode/data/tpchnode -l logfile" + benchmark: "$EXECUTE_MODE:local sh /home/wsy/gaussdb_benchmark.sh" + performance_metric: "DURATION" + system: set_param_template: 'sysctl -w $param_name=$param_value' get_param_template: 'sysctl $param_name' diff --git a/src/knowledge_base/knob_params/gaussdb.json b/src/knowledge_base/knob_params/gaussdb.json new file mode 100644 index 0000000000000000000000000000000000000000..8a1bb758c2e540f82d1a013608e54fe5f4215da0 --- /dev/null +++ b/src/knowledge_base/knob_params/gaussdb.json @@ -0,0 +1,239 @@ +{ + "shared_buffers": { + "desc": "设置GaussDB KernelGaussDB使用的共享内存大小。增加此参数的值会使GaussDB KernelGaussDB比系统默认设置需要更多的System V共享内存。", + "type": "continuous", + "range": [ + 16, + 1073741823 + ], + "dtype": "Integer", + "default_value": 
32768 + }, + "max_process_memory": { + "desc": "设置一个数据库节点可用的最大物理内存。", + "type": "continuous", + "range": [ + 2097152, + 2147483647 + ], + "dtype": "Integer", + "default_value": 12582912 + }, + "work_mem": { + "desc": "设置内部排序操作和Hash表在开始写入临时磁盘文件之前使用的内存大小。ORDER BY,DISTINCT和merge joins都要用到排序操作。Hash表在散列连接、散列为基础的聚集、散列为基础的IN子查询处理中都要用到。", + "type": "continuous", + "range": [ + 64, + 2147483647 + ], + "dtype": "Integer", + "default_value": 65536 + }, + "enable_bloom_filter": { + "desc": "标识是否允许使用BloomFilter优化。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "scan_wait_for_bloom_filter": { + "desc": "扫描算子是否等待BloomFilter创建完成。多租场景下,该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "bloom_filter_build_max_rows": { + "desc": "启用BloomFilter优化时,Hashjoin的Build侧可以创建BloomFilter的最大数据量。如果build侧的数据量高于这个值,则不会建立BloomFilter。多租场景下,该参数可在PDB级别设置。", + "type": "continuous", + "range": [ + 1, + 2147483647 + ], + "dtype": "Integer", + "default_value": 2000000 + }, + "bloom_filter_apply_threshold": { + "desc": "启用BloomFilter优化时,Hashjoin的Apply侧可以创建BloomFilter的最小数据量。如果apply侧的数据量低于这个值,则不会建立BloomFilter。多租场景下,该参数可在PDB级别设置。", + "type": "continuous", + "range": [ + 1, + 2147483647 + ], + "dtype": "Integer", + "default_value": 10000 + }, + "enable_smp_partitionwise": { + "desc": "控制是否在SMP场景下,开启Partition-wise Join的能力。多租场景下,该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "force_smp_partitionwise_scan": { + "desc": "在SMP场景下,打开Partition-wise Join开关enable_smp_partitionwise后,是否在全局范围内开启Partition-wise Scan的并行扫描方式。多租场景下,该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "enable_htap": { + "desc": "是否开启HTAP特性,开启HTAP特性后,会加载COLVIEW关键字指定列的存量行存数据到内存IMCV(In-memory Column View,列式内存引擎)中,用于提高AP(Analytical 
Processing,分析处理)查询效率。使用多租数据库特性时需要关闭该参数。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "enable_parallel_populate": { + "desc": "依赖HTAP特性开启状态,设置是否开启存量行存转内存列IMCV的并行加载特性,以加速IMCV表创建流程。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "enable_imcvscan": { + "desc": "依赖HTAP特性开启状态,设置是否打开HTAP内存列扫描算子IMCVScan。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "htap_max_mem_size": { + "desc": "依赖HTAP特性开启状态,设置HTAP特性可用的内存上限。当GaussDB动态内存使用达到上限后,即使HTAP内存使用未达到上限,也无法继续分配内存。", + "type": "continuous", + "range": [ + 102400, + 1073741824 + ], + "dtype": "Integer", + "default_value": 1048576 + }, + "htap_router_mode": { + "desc": "依赖HTAP特性开启状态,设置HTAP透明路由的模式。", + "type": "discrete", + "range": [ + "row", + "column", + "auto" + ], + "dtype": "string", + "default_value": "row" + }, + "query_dop": { + "desc": "用户自定义的查询并行度。开启SMP功能后,系统会使用设定的并行度执行。该参数可在PDB级别设置。", + "type": "continuous", + "range": [ + 1, + 64 + ], + "dtype": "Integer", + "default_value": 1 + }, + "enable_force_smp": { + "desc": "控制是否强制开启SMP计划。参数开启后,代价模型默认拉起stream线程的代价为空,当设置并行度并且算子支持并行时强制选择并行路径。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "enable_nestloop": { + "desc": "控制优化器对内表全表扫描嵌套循环连接规划类型的使用。完全消除嵌套循环连接是不可能的,但是,若关闭这个参数,当存在其他方法时,优化器会优先选择其他方法。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "enable_numeric_optimization": { + "desc": "设置是否开启Numeric类型数据运算优化。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "enable_sonic_hashagg": { + "desc": "标识是否依据规则约束使用基于面向列的hash表设计的Hash Agg算子。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "enable_sonic_hashjoin": 
{ + "desc": "标识是否依据规则约束使用基于面向列的hash表设计的Hash Join算子。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "enable_thread_pool": { + "desc": "控制是否使用线程池功能。多租数据库特性(enable_mtd)需要开启该参数。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "off" + }, + "enable_early_free": { + "desc": "控制是否可以进行算子内存的提前释放。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "on", + "off" + ], + "dtype": "boolean", + "default_value": "on" + }, + "vector_engine_features": { + "desc": "标识向量化引擎可选特性的开关。开启这些特性不一定对执行性能有提升效果,在特定的业务场景中,通过此GUC参数对向量化引擎相关特性进行设置,使得执行性能最优。该参数可在PDB级别设置。", + "type": "discrete", + "range": [ + "", + "enable_ca_hashagg", + "enable_ca_hashjoin", + "enable_late_materialization", + "enable_ca_hashagg, enable_ca_hashjoin", + "enable_ca_hashjoin, enable_late_materialization", + "enable_ca_hashagg, enable_late_materialization", + "enable_ca_hashagg, enable_ca_hashjoin, enable_late_materialization" + ], + "dtype": "string", + "default_value": "enable_ca_hashagg, enable_ca_hashjoin, enable_late_materialization" + } +} \ No newline at end of file diff --git a/src/performance_analyzer/application/gaussdb_analyzer.py b/src/performance_analyzer/application/gaussdb_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..02403a9d4838c6b13bc40a33083d5979dfa2df18 --- /dev/null +++ b/src/performance_analyzer/application/gaussdb_analyzer.py @@ -0,0 +1,41 @@ +from ..base_analyzer import BaseAnalyzer + +from src.utils.llm import get_llm_response + + +class GaussdbAnalyzer(): + def __init__(self, data, **kwargs): + self.data = data + + def run(self) -> str: + # 要有一个报告模板,指明包含哪些信息,以及报告格式 + if not self.data: + return None + report_prompt = f""" + # CONTEXT # + linux系统中正在运行GaussDB应用, 以下内容是GaussDB相关的性能信息: + {self.data} + 信息中所涉及到的数据准确无误,真实可信。 + + # OBJECTIVE # + 请根据上述信息,分析GaussDB应用的性能状况。 + 要求: + 1.答案中不要包含任何优化建议。 + 2.答案中尽可能保留信息中真实有效的数据。 + 3.不要遗漏任何值得分析的信息。 + + # 
STYLE # + 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 + + # Tone # + 你应该尽可能秉承严肃、认真、严谨的态度 + + # AUDIENCE # + 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 + + # RESPONSE FORMAT # + 回答以"GaussDB分析如下:"开头,然后另起一行逐条分析。 + 如果有多条分析结论,请用数字编号分点作答。 + + """ + return get_llm_response(report_prompt) + "\n" diff --git a/src/performance_collector/application/gaussdb_collector.py b/src/performance_collector/application/gaussdb_collector.py new file mode 100644 index 0000000000000000000000000000000000000000..037724d71de6d380a49efd4f3339a17960811c33 --- /dev/null +++ b/src/performance_collector/application/gaussdb_collector.py @@ -0,0 +1,188 @@ +import logging +import pandas as pd +from io import StringIO +from src.utils.collector.metric_collector import ( + period_task, + snapshot_task, + CollectMode, +) + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +GAUSS_INTERVAL = 180 + + +# -------------------- 1. 后台写入与检查点(两次采样) -------------------- +@period_task( + cmd='gsql -d tpch -A -F , -c "SELECT * FROM pg_stat_bgwriter;"', + collect_mode=CollectMode.ASYNC, + tag="GaussDB后台写入与检查点", + delay=0, + sample_count=2, + interval=GAUSS_INTERVAL, +) +def gauss_bgwriter_parser(output: list[str]) -> dict: + if len(output) < 2: + return {} + df1 = pd.read_csv(StringIO(output[0])) + df2 = pd.read_csv(StringIO(output[1])) + if df1.empty or df2.empty: + return {} + + r1, r2 = df1.iloc[0].to_dict(), df2.iloc[0].to_dict() + mapping = { + "checkpoints_timed": "定时检查点", + "checkpoints_req": "请求检查点", + "checkpoint_write_time": "检查点写入耗时(ms)", + "checkpoint_sync_time": "检查点同步耗时(ms)", + "buffers_checkpoint": "检查点写出页数", + "buffers_clean": "后台清理写出页数", + "maxwritten_clean": "后台清理超限次数", + "buffers_backend": "后端写出页数", + "buffers_backend_fsync": "后端 fsync 次数", + "buffers_alloc": "分配新缓冲区页数", + } + result = {} + for key, label in mapping.items(): + try: + delta = int(r2.get(key, 0)) - int(r1.get(key, 0)) + except (ValueError, TypeError): + delta = 0 + 
result[f"{GAUSS_INTERVAL // 60}分钟内{label}"] = max(delta, 0)
+    logging.info("GaussDB后台写入与检查点: %s", result)
+    return result
+
+
+# -------------------- 2. 事务与IO(两次采样) --------------------
+@period_task(
+    cmd='''gsql -d tpch -A -F , -c "
+    SELECT sum(xact_commit) as commits,
+        sum(xact_rollback) as rollbacks,
+        sum(blks_read) as blks_read,
+        sum(blks_hit) as blks_hit,
+        sum(tup_returned) as tup_returned,
+        sum(tup_fetched) as tup_fetched
+    FROM pg_stat_database;"''',
+    collect_mode=CollectMode.ASYNC,
+    tag="GaussDB事务与IO",
+    delay=0,
+    sample_count=2,
+    interval=GAUSS_INTERVAL,
+)
+def gauss_dbstat_parser(output: list[str]) -> dict:
+    if len(output) < 2:
+        return {}
+    df1 = pd.read_csv(StringIO(output[0]))
+    df2 = pd.read_csv(StringIO(output[1]))
+    if df1.empty or df2.empty:
+        return {}
+
+    r1, r2 = df1.iloc[0].to_dict(), df2.iloc[0].to_dict()
+    res = {}
+    for col in ("commits", "rollbacks", "blks_read", "blks_hit", "tup_returned", "tup_fetched"):
+        try:
+            delta = int(r2[col]) - int(r1[col])
+        except (ValueError, TypeError):
+            delta = 0
+        res[f"{GAUSS_INTERVAL // 60}分钟内{col}"] = max(delta, 0)
+
+    # 计算命中率
+    hit_delta = res[f"{GAUSS_INTERVAL // 60}分钟内blks_hit"]
+    read_delta = res[f"{GAUSS_INTERVAL // 60}分钟内blks_read"]
+    res[f"{GAUSS_INTERVAL // 60}分钟内Buffer命中率"] = (
+        round(hit_delta * 100 / (hit_delta + read_delta), 2) if (hit_delta + read_delta) else 0
+    )
+    return res
+
+
+# -------------------- 3. 会话信息(实时快照) --------------------
+@snapshot_task(
+    cmd='''gsql -d tpch -A -F , -c "
+SELECT datname, state, waiting, enqueue
+FROM pg_stat_activity;"''',
+    collect_mode=CollectMode.ASYNC,
+    tag="GaussDB会话信息",
+)
+def gauss_activity_parser(output: str) -> dict:
+    df = pd.read_csv(StringIO(output))
+    mapping = {
+        "datname": "数据库名",
+        "state": "连接状态",
+        "waiting": "是否等待",
+        "enqueue": "排队/锁信息",
+    }
+    return {
+        "会话信息": [
+            {mapping.get(k, k): v for k, v in row.items()}
+            for _, row in df.iterrows()
+        ]
+    }
+
+
+# -------------------- 4. 
锁信息(实时快照) --------------------
+@snapshot_task(
+    cmd='gsql -d tpch -A -F , -c "SELECT mode, granted, COUNT(*) AS count FROM pg_locks GROUP BY mode, granted;"',
+    collect_mode=CollectMode.ASYNC,
+    tag="GaussDB锁信息",
+)
+def gauss_locks_parser(output: str) -> dict:
+    df = pd.read_csv(StringIO(output))
+    mapping = {"mode": "锁模式", "granted": "是否已授予", "count": "锁数量"}
+    return {
+        "锁信息": [
+            {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows()
+        ]
+    }
+
+
+# -------------------- 5. 数据库级统计(实时快照) --------------------
+@snapshot_task(
+    cmd='''gsql -d tpch -A -F , -c "SELECT datname, numbackends, xact_commit, xact_rollback,
+    blks_read, blks_hit, pg_database_size(datname) AS db_size_bytes
+    FROM pg_stat_database WHERE datname NOT IN ('template0', 'template1');"''',
+    collect_mode=CollectMode.ASYNC,
+    tag="GaussDB数据库级指标",
+)
+def gauss_database_snapshot_parser(output: str) -> dict:
+    df = pd.read_csv(StringIO(output))
+    mapping = {
+        "datname": "数据库名",
+        "numbackends": "连接数",
+        "xact_commit": "提交事务数",
+        "xact_rollback": "回滚事务数",
+        "blks_read": "磁盘读块数",
+        "blks_hit": "缓冲命中块数",
+        "db_size_bytes": "数据库大小(Bytes)",
+    }
+    return {
+        "数据库统计": [
+            {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows()
+        ]
+    }
+
+
+# -------------------- 6. 内存使用(实时快照) --------------------
+@snapshot_task(
+    cmd='''gsql -d tpch -A -F , -c "
+    SELECT
+        'localhost' AS node_name,
+        SUM(usedsize) AS dynamic_used_memory_bytes,
+        MAX(usedsize) AS dynamic_peak_memory_bytes
+    FROM gs_session_memory_detail;"''',
+    collect_mode=CollectMode.ASYNC,
+    tag="GaussDB内存使用",
+)
+def gauss_memory_parser(output: str) -> dict:
+    df = pd.read_csv(StringIO(output))
+    mapping = {
+        "node_name": "节点名",
+        "dynamic_used_memory_bytes": "已使用动态内存(Bytes)",
+        "dynamic_peak_memory_bytes": "动态内存峰值(Bytes)",
+    }
+    return {
+        "内存信息": [
+            {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows()
+        ]
+    }