diff --git a/deps/4_grafana/sysom-cluster-dashboard.json b/deps/4_grafana/sysom-cluster-dashboard.json index 31c667be72889d5b30964421c621045e90cc5e51..f74f3e2bb3fa3db88f960562dfe8ad3f23daf07f 100644 --- a/deps/4_grafana/sysom-cluster-dashboard.json +++ b/deps/4_grafana/sysom-cluster-dashboard.json @@ -1679,7 +1679,7 @@ { "targetBlank": true, "title": "\u96c6\u7fa4\u5185\u5b58\u8bca\u65ad\u4e2d\u5fc3", - "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&pod_name=${__field.labels.podname}&time=${__data.fields.Time}&diagnosis_type=\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad" + "url": "../diagnose/memory/clustermem?instance=${__field.labels.instance}&pod_name=${__field.labels.pod}&time=${__data.fields.Time}&diagnosis_type=\u5185\u5b58\u5ef6\u65f6\u8bca\u65ad" } ], "mappings": [], @@ -2171,8 +2171,7 @@ "mode": "absolute", "steps": [ { - "color": "super-light-blue", - "value": null + "color": "super-light-blue" }, { "color": "super-light-yellow", @@ -2297,8 +2296,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2385,8 +2383,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2522,8 +2519,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2649,8 +2645,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2757,8 +2752,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2912,8 +2906,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3020,8 +3013,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3176,8 +3168,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3295,8 +3286,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -3498,7 +3488,7 @@ "timezone": "", "title": "\u96c6\u7fa4\u89c6\u89d2", "uid": "F4UBT8w4k", - "version": 8, + "version": 2, "weekStart": "" } } \ No newline at end of file diff --git a/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py b/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py index 4b71a0331dfb4f26960b29f4aa927f6903d0e2bc..6ac7b8aa7bb9042956bcf235266ed32ae5a66413 100644 --- a/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py +++ b/sysom_server/sysom_diagnosis/service_scripts/clustermem_post.py @@ -10,15 +10,15 @@ class PostProcessor(DiagnosisPostProcessor): ret_errmsg = "" ret = json.loads(ret_str) - try: - if ret["success"] != True: - ret_code = 1 - ret_errmsg = ret["errmsg"] - except: - ret_code = 2 - ret_errmsg = "clustermem error!" - pass - + if ret["success"] != True: + ret_code = 1 + ret_errmsg = ret["errmsg"] + return PostProcessResult( + code=ret_code, + err_msg=ret_errmsg, + result={} + ) + postprocess_result = PostProcessResult( code=ret_code, err_msg=ret_errmsg, @@ -45,14 +45,14 @@ class PostProcessor(DiagnosisPostProcessor): stat_data.append({"key": "节点内核内存(kernel/total)", "value":round(node_mem_data["kernel_used"] * 100 / node_mem_data["MemTotal"], 1)}) if len(ret["pod_name"]) > 0: - pod_mem_data = ret["pod_data"]["sysom_cg_memUtil"] + pod_mem_data = ret["pod_data"]["sysom_container_memUtil"] pod_anon_ratio = (pod_mem_data["usage"] - pod_mem_data["active_file"] - pod_mem_data["inactive_file"]) * 100 / pod_mem_data["usage"] # active_file + inactive_file可能大于usage pod_anon_ratio = pod_anon_ratio if pod_anon_ratio > 0 else 0.1 - stat_data.append({"key": "Pod内存使用(usage/limit)", "value": pod_mem_data["mem_util"]}) + stat_data.append({"key": "Pod内存使用(usage/limit)", "value": round(pod_mem_data["mem_util"])}) stat_data.append({"key": "Pod应用内存(app/usage)", "value": round(pod_anon_ratio, 1)}) - stat_data.append({"key": "Pod文件缓存\n(cache/usage)", "value": pod_mem_data["cache_ratio"]}) + stat_data.append({"key": "Pod文件缓存\n(cache/usage)", "value": round(pod_mem_data["cache_ratio"])}) # stat输出内存占比 postprocess_result.result["UsageResult"] = {"data": stat_data} diff --git a/sysom_server/sysom_diagnosis/service_scripts/clustermem_pre.py b/sysom_server/sysom_diagnosis/service_scripts/clustermem_pre.py index 0b199ec5ce39cf36e91a5b1510f219421136b7b8..cd2986a648e22d08db91670f0556a8971412384f 100644 --- a/sysom_server/sysom_diagnosis/service_scripts/clustermem_pre.py +++ b/sysom_server/sysom_diagnosis/service_scripts/clustermem_pre.py @@ -7,6 +7,8 @@ from metric_reader import MetricReader, dispatch_metric_reader,RangeQueryTask metric_reader = dispatch_metric_reader("prometheus://localhost:9090") +CGROUP_MEM_LIMIT = 9223372036854771712 + def format_time_to_timestamp(t): ret = {"success":True, "timestamp":""} ret["timestamp"] = t @@ -38,8 +40,8 @@ def pull_monidata(warn_time, machine_ip, pod_name): } pod_needed_metrics = { - 'sysom_cg_memUtil':{'cache': 0, 'usage': 0, 'mem_util': 0, 'cache_ratio': 0, - 'inactive_file': 0, 'active_file': 0} + 'sysom_container_memUtil':{'cache': 0, 'usage': 0, 'limit': 0, + 'inactive_file': 0, 'active_file': 0} } # podmem_metrics = { @@ -70,7 +72,6 @@ def pull_monidata(warn_time, machine_ip, pod_name): for table_key, table_value in node_needed_metrics.items(): # value = "total"... for metric_key, _ in table_value.items(): - #logger.info(table_key + "\t" + metric_key) table_task = RangeQueryTask(table_key, warn_time - 30, warn_time) \ .append_equal_filter("instance", instance) \ .append_equal_filter("value", metric_key) @@ -92,9 +93,8 @@ def pull_monidata(warn_time, machine_ip, pod_name): # get podmem metrics podmem_task = RangeQueryTask("sysom_podmem", warn_time - 30, warn_time) \ .append_equal_filter("instance", instance) \ - .append_equal_filter("podns", "default") \ .append_equal_filter("value", "cached") \ - .append_wildcard_filter("podname", "*") # all pods' podmem in instacne + .append_wildcard_filter("pod", "*") # all pods' podmem in instacne podmem_metrics_res = metric_reader.range_query([podmem_task]) if len(podmem_metrics_res.data) <= 0: ret["success"] = False @@ -107,7 +107,7 @@ def pull_monidata(warn_time, machine_ip, pod_name): if len(values) > 0: val = float(values[-1][1]) filename_cached = (tmp_dist["labels"]["file"], val) - podname = tmp_dist["labels"]["podname"] + podname = tmp_dist["labels"]["pod"] if podname not in podmem_metrics: podmem_metrics[podname] = [] podmem_metrics[podname].append(filename_cached) @@ -119,7 +119,7 @@ def pull_monidata(warn_time, machine_ip, pod_name): for metric_key, _ in table_value.items(): pod_task = RangeQueryTask(table_key, warn_time - 30, warn_time) \ .append_equal_filter("instance", instance) \ - .append_equal_filter("podname", pod_name) \ + .append_equal_filter("pod", pod_name) \ .append_equal_filter("value", metric_key) pod_metrics_res = metric_reader.range_query([pod_task]) if len(pod_metrics_res.data) <= 0: @@ -133,10 +133,16 @@ def pull_monidata(warn_time, machine_ip, pod_name): # 只取指标在异常时间的瞬时向量值, 即区间向量最后一个值 # 指定pod名,也有可能有多个data(pod中有多个容器),累加起来 val = float(values[-1][1]) + if metric_key == "limit": + if val == CGROUP_MEM_LIMIT: + val = node_needed_metrics["sysom_proc_meminfo"]["MemTotal"] pod_needed_metrics[table_key][metric_key] += val - # 如果是利用率数据,需要除容器数量(len(data)),求平均值 - if metric_key == "mem_util" or metric_key == "cache_ratio": - pod_needed_metrics[table_key][metric_key] /= len(pod_metrics_res.data) + + pod_usage = pod_needed_metrics['sysom_container_memUtil']['usage'] + pod_limit = pod_needed_metrics['sysom_container_memUtil']['limit'] + pod_cache = pod_needed_metrics['sysom_container_memUtil']['cache'] + pod_needed_metrics['sysom_container_memUtil']['mem_util'] = (pod_usage / pod_limit) * 100 + pod_needed_metrics['sysom_container_memUtil']['cache_ratio'] = (pod_cache / pod_usage) * 100 ret["pod_data"] = pod_needed_metrics return ret @@ -188,7 +194,7 @@ def check_pod_memory_high(pod_metrics): ret["root_cause"] = "" ret["suggestion"] = "" - cg_memutils = pod_metrics["sysom_cg_memUtil"] + cg_memutils = pod_metrics["sysom_container_memUtil"] mem_usage = cg_memutils["usage"] anon_mem = mem_usage - cg_memutils["active_file"] - cg_memutils["inactive_file"] cache_mem = cg_memutils["cache"] @@ -245,7 +251,7 @@ def diagnosis_pod_mem(node_metrics, pod_metrics, diagnosis_type): node_proc_meminfo = node_metrics["sysom_proc_meminfo"] mem_total = node_proc_meminfo["MemTotal"] mem_used = node_proc_meminfo["MemTotal"] - node_proc_meminfo["MemFree"] - cg_memutil = pod_metrics["sysom_cg_memUtil"]["mem_util"] + cg_memutil = pod_metrics["sysom_container_memUtil"]["mem_util"] mem_high_watermark = mem_total * 0.95 util_watermark = 95