diff --git a/deps/4_grafana/sysom-cluster-dashboard.json b/deps/4_grafana/sysom-cluster-dashboard.json index 1166c24358ce1814281fe06f521daa7673e3f2ef..07f78cb16ee0fb57817f37be3175109065c72086 100644 --- a/deps/4_grafana/sysom-cluster-dashboard.json +++ b/deps/4_grafana/sysom-cluster-dashboard.json @@ -188,7 +188,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"ErrorMetric\"}", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"error\"}", "format": "time_series", "hide": false, "instant": false, @@ -280,7 +280,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"LatencyMetric\"}", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"latency\"}", "format": "time_series", "hide": false, "instant": false, @@ -372,7 +372,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"CapacityMetric\"}", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\", type=\"capacity\"}", "format": "time_series", "hide": false, "instant": false, @@ -464,7 +464,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_cluster_health_score{cluster=~\"$cluster\",type=\"LoadMetric\"}", + "expr": "sysom_cluster_health_score{cluster=~\"$cluster\",type=\"load\"}", "format": "time_series", "hide": false, "instant": false, @@ -479,8 +479,7 @@ "type": "gauge" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -489,7 +488,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -497,8 +495,20 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + 
"value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } @@ -506,36 +516,16 @@ "overrides": [ { "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Node|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Score/" + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { "id": "custom.displayMode", - "value": "color-background" + "value": "lcd-gauge" + }, + { + "id": "color" }, { "id": "thresholds", @@ -543,91 +533,83 @@ "mode": "absolute", "steps": [ { - "color": "red", + "color": "dark-red", "value": null }, { - "color": "orange", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ + }, { - "id": "custom.displayMode", - "value": "lcd-gauge" + "id": "custom.width", + "value": 274 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { "id": "custom.width", - "value": 123 + "value": 199 } ] }, { "matcher": { "id": "byName", - "options": "type" + "options": "node" }, "properties": [ { "id": "custom.width", - "value": 119 + "value": 229 } ] }, { "matcher": { "id": "byName", - "options": "Node" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 128 + "value": 135 } ] }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u7c7b\u522b" }, "properties": [ { "id": "custom.width", - "value": 66 + "value": 120 } ] } ] }, "gridPos": { - "h": 8, - "w": 12, + "h": 9, + "w": 10, "x": 0, "y": 6 }, @@ -635,91 +617,92 @@ "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", 
"reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { - "desc": true, - "displayName": "Value" + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_node_health_metric{cluster=\"$cluster\", mode=\"score\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_node_health_metric{cluster=\"$cluster\", mode=\"score\"}) * 0) + on(description, exported_instance,type) sysom_node_health_metric{cluster=\"$cluster\", mode=\"value\"}", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nmetric_type,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE cluster = '$cluster'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Nodes Overview", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "type": false + "instance": false, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, - "pod": 2, - "type": 9 + "instance": 1, + "metric_id": 0, + "metric_type": 2, + "score": 3, + "value": 4 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "exported_instance": "Node", - "pod": 
"Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "metric_type": "\u6307\u6807\u7c7b\u522b", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206", + "value": "\u6307\u6807\u503c" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -728,7 +711,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -736,8 +718,20 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } @@ -745,36 +739,16 @@ "overrides": [ { "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Node|Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Score/" + "id": "byName", + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { "id": "custom.displayMode", - "value": "color-background" + "value": "lcd-gauge" + }, + { + "id": "color" }, { "id": "thresholds", @@ -782,189 +756,172 @@ "mode": "absolute", "steps": [ { - "color": "red", + "color": "dark-red", "value": null }, { - "color": "orange", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "custom.displayMode", - "value": "lcd-gauge" - } - ] - }, - { - "matcher": { - 
"id": "byName", - "options": "description" - }, - "properties": [ + }, { "id": "custom.width", - "value": 123 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "type" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { "id": "custom.width", - "value": 87 + "value": 199 } ] }, { "matcher": { "id": "byName", - "options": "Node" + "options": "node" }, "properties": [ { "id": "custom.width", - "value": 128 + "value": 229 } ] }, { "matcher": { "id": "byName", - "options": "Score" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 66 + "value": 135 } ] }, { "matcher": { "id": "byName", - "options": "Pod" + "options": "\u6307\u6807\u7c7b\u522b" }, "properties": [ { "id": "custom.width", - "value": 132 + "value": 120 } ] } ] }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, + "h": 9, + "w": 14, + "x": 10, "y": 6 }, - "id": 92, + "id": 95, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": true, - "displayName": "Value" + "displayName": "pod" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_pod_health_metric{cluster=\"$cluster\", mode=\"score\"})", "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_pod_health_metric{cluster=\"$cluster\", mode=\"score\"}) * 0) + on(description, pod, namespace, type) sysom_pod_health_metric{cluster=\"$cluster\", mode=\"value\"}", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\npod,\nnamespace,\nmetric_type,\nvalue\nFROM 
sysom.sys_abnormal_metrics_pod\nWHERE cluster = '$cluster'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], - "title": "Pods/Containers Overview", + "title": "Pods Overview", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "type": false + "instance": false, + "namespace": false, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "cluster": 12, - "description": 2, - "exported_instance": 3, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, - "pod": 4, - "type": 9 + "instance": 1, + "metric_id": 0, + "metric_type": 4, + "namespace": 3, + "pod": 2, + "score": 5, + "value": 6 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "exported_instance": "Node", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "metric_type": "\u6307\u6807\u7c7b\u522b", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206", + "value": "\u6307\u6807\u503c" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" @@ -975,7 +932,7 @@ "h": 1, "w": 24, "x": 0, - "y": 14 + "y": 15 }, "id": 60, "panels": [], @@ -1010,7 +967,7 @@ "h": 7, "w": 4, "x": 0, - "y": 15 + "y": 16 }, "id": 4, "options": { @@ -1185,7 +1142,7 @@ "h": 7, "w": 8, "x": 4, - "y": 15 + "y": 16 }, "id": 41, "options": { @@ -1414,7 +1371,7 @@ "h": 7, "w": 7, "x": 12, - "y": 15 + "y": 16 }, "id": 15, "options": { @@ -1517,7 +1474,7 @@ "h": 7, "w": 5, "x": 19, - "y": 15 + "y": 16 }, "id": 17, "options": { @@ -1645,7 +1602,7 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 23 }, "id": 61, "options": { @@ 
-1775,7 +1732,7 @@ "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 23 }, "id": 62, "interval": "30s", @@ -1885,7 +1842,7 @@ "h": 8, "w": 12, "x": 0, - "y": 30 + "y": 31 }, "id": 72, "options": { @@ -2012,7 +1969,7 @@ "h": 9, "w": 12, "x": 12, - "y": 30 + "y": 31 }, "id": 8, "options": { @@ -2114,7 +2071,7 @@ "h": 9, "w": 12, "x": 0, - "y": 38 + "y": 39 }, "id": 45, "options": { @@ -2155,7 +2112,7 @@ "h": 1, "w": 24, "x": 0, - "y": 47 + "y": 48 }, "id": 58, "panels": [], @@ -2264,7 +2221,7 @@ "h": 9, "w": 12, "x": 0, - "y": 48 + "y": 49 }, "id": 9, "options": { @@ -2356,7 +2313,7 @@ "h": 9, "w": 12, "x": 12, - "y": 48 + "y": 49 }, "id": 48, "options": { @@ -2444,7 +2401,7 @@ "h": 9, "w": 12, "x": 0, - "y": 57 + "y": 58 }, "id": 7, "options": { @@ -2582,7 +2539,7 @@ "h": 9, "w": 12, "x": 12, - "y": 57 + "y": 58 }, "id": 31, "options": { @@ -2725,7 +2682,7 @@ "h": 8, "w": 12, "x": 0, - "y": 66 + "y": 67 }, "id": 70, "options": { @@ -2817,7 +2774,7 @@ "h": 8, "w": 12, "x": 12, - "y": 66 + "y": 67 }, "id": 33, "options": { @@ -2955,7 +2912,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2986,7 +2944,7 @@ "h": 8, "w": 12, "x": 0, - "y": 74 + "y": 75 }, "id": 20, "options": { @@ -3062,7 +3020,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3078,7 +3037,7 @@ "h": 8, "w": 12, "x": 12, - "y": 74 + "y": 75 }, "id": 40, "options": { @@ -3217,7 +3176,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3248,7 +3208,7 @@ "h": 8, "w": 12, "x": 0, - "y": 82 + "y": 83 }, "id": 55, "options": { @@ -3335,7 +3295,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3351,7 +3312,7 @@ "h": 8, "w": 12, "x": 12, - "y": 82 + "y": 83 }, "id": 39, "options": { @@ -3399,15 +3360,10 @@ "h": 2, "w": 24, "x": 0, - "y": 90 + "y": 91 }, 
"id": 54, - "links": [ - { - "title": "\u5bb9\u5668\u5927\u76d8(\u94fe\u63a5)", - "url": "http://sysom_test.qjm253.cn/grafana/d/rYdddlPWW/rong-qi-jian-kong?var-node=192.168.0.12:8889&var-podname=All&var-podns=All&orgId=1&refresh=5s" - } - ], + "links": [], "options": { "code": { "language": "plaintext", @@ -3428,15 +3384,10 @@ "h": 2, "w": 24, "x": 0, - "y": 92 + "y": 93 }, "id": 53, - "links": [ - { - "title": "\u5bb9\u5668\u5927\u76d8(\u94fe\u63a5)", - "url": "http://sysom_test.qjm253.cn/grafana/d/rYdddlPWk/sysom_base?orgId=1&refresh=5s" - } - ], + "links": [], "options": { "code": { "language": "plaintext", @@ -3547,7 +3498,7 @@ "timezone": "", "title": "\u96c6\u7fa4\u89c6\u89d2", "uid": "F4UBT8w4k", - "version": 2, + "version": 10, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-container-dashboard.json b/deps/4_grafana/sysom-container-dashboard.json index 322bccf564fec0df9e6220b770ae77e79f58ebe4..6e4b9ccf7d5a85e6d5c889366fb8557367fa3cbc 100644 --- a/deps/4_grafana/sysom-container-dashboard.json +++ b/deps/4_grafana/sysom-container-dashboard.json @@ -187,7 +187,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"ErrorMetric\"}", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"error\"}", "format": "time_series", "hide": false, "instant": false, @@ -278,7 +278,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"LatencyMetric\"}", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"latency\"}", "format": "time_series", "hide": false, "instant": false, @@ -369,7 +369,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"CapacityMetric\"}", + "expr": "sysom_pod_health_score{pod=~\"$pod\", 
namespace=~\"$podns\",type=\"capacity\"}", "format": "time_series", "hide": false, "instant": false, @@ -460,7 +460,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"LoadMetric\"}", + "expr": "sysom_pod_health_score{pod=~\"$pod\", namespace=~\"$podns\",type=\"load\"}", "format": "time_series", "hide": false, "instant": false, @@ -475,7 +475,7 @@ "type": "gauge" }, { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "description": "", "fieldConfig": { "defaults": { @@ -485,7 +485,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -493,51 +492,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -549,19 +534,15 @@ "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -569,40 +550,43 @@ }, { "id": "custom.width", - "value": 107 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "Value" + 
"options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, - { - "id": "color", - "value": { - "mode": "thresholds" - } - }, + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "custom.width" + "id": "custom.width", + "value": 123 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 176 + "value": 135 } ] } @@ -614,95 +598,95 @@ "x": 0, "y": 6 }, - "id": 464, + "id": 472, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"CapacityMetric\", mode=\"score\"}) by (description, type)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\",type=\"CapacityMetric\", mode=\"score\"}) * 0) + on(description, pod, namespace, type) sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"CapacityMetric\", mode=\"value\"}", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = '$pod' AND namespace = '$podns' AND metric_type = 'capacity'\nLIMIT 50 ", + "refId": "A", +
"sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Saturation Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "pod": true, - "type": true + "instance": true, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -711,7 +695,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -719,51 +702,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { 
"matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -775,19 +744,15 @@ "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -795,40 +760,43 @@ }, { "id": "custom.width", - "value": 98 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, - { - "id": "color", - "value": { - "mode": "thresholds" - } - }, + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "custom.width" + "id": "custom.width", + "value": 123 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 215 + "value": 135 } ] } @@ -840,95 +808,95 @@ "x": 6, "y": 6 }, - "id": 468, + "id": 473, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"LoadMetric\", mode=\"score\"}) by (description, type)", "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - 
}, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\",type=\"LoadMetric\", mode=\"score\"}) * 0) + on(description, pod, namespace, type) sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"LoadMetric\", mode=\"value\"}", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = '$pod' AND namespace = '$podns' AND metric_type = 'load'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Load Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "pod": true, - "type": true + "instance": true, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": {
"color": { @@ -937,7 +905,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -945,51 +912,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -997,23 +950,19 @@ "mode": "absolute", "steps": [ { - "color": "semi-dark-red", + "color": "dark-red", "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -1021,39 +970,43 @@ }, { "id": "custom.width", - "value": 98 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, - { - "id": "color", - "value": { - "mode": "thresholds" - } - }, + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "custom.width" + "id": "custom.width", + "value": 123 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { 
- "id": "custom.width" + "id": "custom.width", + "value": 135 } ] } @@ -1065,95 +1018,95 @@ "x": 12, "y": 6 }, - "id": 469, + "id": 474, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"LatencyMetric\", mode=\"score\"}) by (description, type)", "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\",type=\"LatencyMetric\", mode=\"score\"}) * 0) + on(description, pod, namespace, type) sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"LatencyMetric\", mode=\"value\"}", - "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = '$pod' AND namespace = '$podns' AND metric_type = 'latency'\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Latency Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "pod": true, - "type": true + "instance": true, + "namespace": true, + "value": false }, "indexByName":
{ - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -1162,7 +1115,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -1170,51 +1122,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -1222,23 +1160,19 @@ "mode": "absolute", "steps": [ { - "color": "semi-dark-red", + "color": "dark-red", "value": null }, { - "color": 
"#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -1246,39 +1180,43 @@ }, { "id": "custom.width", - "value": 98 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, - { - "id": "color", - "value": { - "mode": "thresholds" - } - }, + "id": "custom.width", + "value": 126 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "custom.width" + "id": "custom.width", + "value": 123 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { - "id": "custom.width" + "id": "custom.width", + "value": 135 } ] } @@ -1290,88 +1228,89 @@ "x": 18, "y": 6 }, - "id": 470, + "id": 475, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"ErrorMetric\", mode=\"score\"}) by (description, type)", "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\",type=\"ErrorMetric\", mode=\"score\"}) * 0) + on(description, pod, namespace, type) sysom_pod_health_metric{pod=~\"$pod\", namespace=~\"$podns\", type=\"ErrorMetric\", mode=\"value\"}", - 
"format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\npod,\ninstance,\nnamespace,\nvalue\nFROM sysom.sys_abnormal_metrics_pod\nWHERE pod = $pod AND namespace = $podns AND metric_type = \"error\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], - "title": "Errors Health", + "title": "Error Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "Value #A": false, - "pod": true, - "type": true + "instance": true, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" @@ -1606,7 +1545,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1768,7 +1708,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1859,7 +1800,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1977,7 +1919,8 @@ "mode": "absolute", "steps": [ { - 
"color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -3514,8 +3457,8 @@ { "current": { "selected": false, - "text": "ack-mse-ingress-controller-d977bcdfb-7x8jr", - "value": "ack-mse-ingress-controller-d977bcdfb-7x8jr" + "text": "file-daemonset-fxkd9", + "value": "file-daemonset-fxkd9" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_container_memory_oomcnt,pod)", @@ -3569,7 +3512,7 @@ "timezone": "browser", "title": "\u5bb9\u5668\u76d1\u63a7", "uid": "rYdddlPWW", - "version": 2, + "version": 8, "weekStart": "" } } \ No newline at end of file diff --git a/deps/4_grafana/sysom-sysak-base-dashboard.json b/deps/4_grafana/sysom-sysak-base-dashboard.json index 0aa4608a314c1e52ef5bd21f7c8461b223569170..fb3a0f8d70e9aa67a1074a82e59c537945e9e292 100644 --- a/deps/4_grafana/sysom-sysak-base-dashboard.json +++ b/deps/4_grafana/sysom-sysak-base-dashboard.json @@ -1053,7 +1053,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"ErrorMetric\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"error\"}", "format": "time_series", "hide": false, "instant": false, @@ -1144,7 +1144,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"LatencyMetric\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"latency\"}", "format": "time_series", "hide": false, "instant": false, @@ -1235,7 +1235,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"CapacityMetric\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"capacity\"}", "format": "time_series", "hide": false, "instant": false, @@ -1326,7 +1326,7 @@ { "datasource": "sysom-prometheus", "editorMode": "code", - "expr": "sysom_node_health_score{exported_instance=~\"$node\", 
type=\"LoadMetric\"}", + "expr": "sysom_node_health_score{exported_instance=~\"$node\", type=\"load\"}", "format": "time_series", "hide": false, "instant": false, @@ -1412,8 +1412,7 @@ "type": "gauge" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -1422,7 +1421,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -1430,51 +1428,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "continuous-RdYlGr" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -1486,19 +1470,15 @@ "value": null }, { - "color": "#E24D42", - "value": 20 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -1506,41 +1486,43 @@ }, { "id": "custom.width", - "value": 89 + "value": 144 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, - { - "id": "color", - "value": { - "mode": "thresholds" - } - }, + "id": "custom.width", + "value": 147 + } + 
] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { "id": "custom.width", - "value": 140 + "value": 175 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 130 + "value": 135 } ] } @@ -1552,89 +1534,95 @@ "x": 0, "y": 18 }, - "id": 419, + "id": 427, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, - "sortBy": [] + "sortBy": [ + { + "desc": false, + "displayName": "\u6307\u6807\u5f97\u5206" + } + ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_node_health_metric{exported_instance=~\"$node\", type=\"CapacityMetric\", mode=\"score\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "(bottomk(5, sysom_node_health_metric{exported_instance=~\"$node\", type=\"CapacityMetric\", mode=\"score\"}) * 0) + on(description, exported_instance,type) sysom_node_health_metric{exported_instance=~\"$node\", type=\"CapacityMetric\", mode=\"value\"}", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"capacity\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Saturation Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", 
"options": { "excludeByName": { - "Time": true, - "Value #A": false, - "type": true + "instance": false, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -1643,7 +1631,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -1651,51 +1638,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "continuous-RdYlGr" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, 
{ "id": "thresholds", @@ -1707,19 +1680,15 @@ "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -1727,37 +1696,43 @@ }, { "id": "custom.width", - "value": 92 + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, + "id": "custom.width", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 127 + "value": 135 } ] } @@ -1769,93 +1744,95 @@ "x": 6, "y": 18 }, - "id": 421, + "id": 428, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_node_health_metric{exported_instance=~\"$node\", type=\"LoadMetric\", mode=\"score\"}) by (description, type)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "sum(sysom_node_health_metric{exported_instance=~\"$node\", type=\"LoadMetric\",mode=\"value\"}) by (description, type)", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": 
"B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"load\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], - "title": "Load(Traffic) Health", + "title": "Load Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "type": true + "instance": false, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + "datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -1864,7 +1841,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -1872,51 +1848,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - 
"id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "continuous-RdYlGr" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -1928,19 +1890,15 @@ "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EF843C", + "color": "red", "value": 60 }, { "color": "#EAB839", - "value": 80 + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -1948,37 +1906,43 @@ }, { "id": "custom.width", - "value": 93 + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, + "id": "custom.width", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": "custom.width", - "value": 133 + "value": 135 } ] } @@ -1990,93 +1954,95 @@ "x": 12, "y": 18 }, - "id": 423, + "id": 440, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, 
sysom_node_health_metric{exported_instance=~\"$node\", type=\"LatencyMetric\", mode=\"score\"}) by (description, type)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "sum(sysom_node_health_metric{exported_instance=~\"$node\", type=\"LatencyMetric\",mode=\"value\"}) by (description, type)", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"latency\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], - "title": "latency Health", + "title": "Latency Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "type": true + "instance": false, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" }, { - "datasource": "sysom-prometheus", - "description": "", + 
"datasource": "sysom-mysql", "fieldConfig": { "defaults": { "color": { @@ -2085,7 +2051,6 @@ "custom": { "align": "auto", "displayMode": "auto", - "filterable": false, "inspect": false }, "mappings": [], @@ -2093,51 +2058,37 @@ "mode": "absolute", "steps": [ { - "color": "green", + "color": "dark-red", "value": null + }, + { + "color": "red", + "value": 60 + }, + { + "color": "#EAB839", + "value": 90 + }, + { + "color": "dark-green", + "value": 100 } ] } }, "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "/.*/" - }, - "properties": [ - { - "id": "custom.hidden", - "value": true - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "/Pod|description|type|Value|Score/" - }, - "properties": [ - { - "id": "custom.hidden" - } - ] - }, { "matcher": { "id": "byName", - "options": "Score" + "options": "\u6307\u6807\u5f97\u5206" }, "properties": [ { - "id": "color", - "value": { - "mode": "continuous-RdYlGr" - } + "id": "custom.displayMode", + "value": "lcd-gauge" }, { - "id": "custom.displayMode", - "value": "color-background" + "id": "color" }, { "id": "thresholds", @@ -2149,19 +2100,15 @@ "value": null }, { - "color": "#E24D42", - "value": 0 - }, - { - "color": "#EAB839", + "color": "red", "value": 60 }, { - "color": "#EF843C", - "value": 80 + "color": "#EAB839", + "value": 90 }, { - "color": "green", + "color": "dark-green", "value": 100 } ] @@ -2169,37 +2116,43 @@ }, { "id": "custom.width", - "value": 103 + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "Value" + "options": "\u5f02\u5e38\u6307\u6807" }, "properties": [ { - "id": "custom.displayMode", - "value": "lcd-gauge" - }, + "id": "custom.width", + "value": 150 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "node" + }, + "properties": [ { - "id": "color", - "value": { - "mode": "thresholds" - } + "id": "custom.width", + "value": 155 } ] }, { "matcher": { "id": "byName", - "options": "description" + "options": "pod" }, "properties": [ { "id": 
"custom.width", - "value": 129 + "value": 135 } ] } @@ -2211,86 +2164,89 @@ "x": 18, "y": 18 }, - "id": 425, + "id": 441, "interval": "30s", "options": { "footer": { - "enablePagination": false, "fields": "", "reducer": [ "sum" ], "show": false }, - "frameIndex": 1, "showHeader": true, "sortBy": [ { "desc": false, - "displayName": "Score" + "displayName": "\u6307\u6807\u5f97\u5206" } ] }, "pluginVersion": "9.2.2", "targets": [ { - "datasource": "sysom-prometheus", - "editorMode": "code", - "exemplar": false, - "expr": "bottomk(5, sysom_node_health_metric{exported_instance=~\"$node\", type=\"ErrorMetric\", mode=\"score\"}) by (description, type)", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "A" - }, - { - "datasource": "sysom-prometheus", + "datasource": "sysom-mysql", "editorMode": "code", - "exemplar": false, - "expr": "sum(sysom_node_health_metric{exported_instance=~\"$node\", type=\"ErrorMetric\",mode=\"value\"}) by (description, type)", "format": "table", - "hide": false, - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "B" + "rawQuery": true, + "rawSql": "SELECT \nmetric_id,\nscore,\ninstance,\nvalue\nFROM sysom.sys_abnormal_metrics_node\nWHERE instance = '$node' AND metric_type = \"error\"\nLIMIT 50 ", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } } ], "title": "Errors Health", "transformations": [ - { - "id": "merge", - "options": {} - }, { "id": "organize", "options": { "excludeByName": { - "Time": true, - "type": true + "instance": false, + "namespace": true, + "value": false }, "indexByName": { - "Time": 0, - "Value #A": 10, - "Value #B": 11, - "__name__": 1, - "description": 3, - "exported_instance": 4, - "instance": 5, - "job": 6, - "mode": 7, - "namespace": 8, + "instance": 1, + "metric_id": 0, + "namespace": 3, "pod": 
2, - "type": 9 + "score": 4, + "value": 5 }, "renameByName": { - "Time": "", - "Value #A": "Score", - "Value #B": "Value", - "pod": "Pod" + "instance": "node", + "metric_id": "\u5f02\u5e38\u6307\u6807", + "pod": "", + "score": "\u6307\u6807\u5f97\u5206" } } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "number", + "targetField": "\u6307\u6807\u5f97\u5206" + } + ], + "fields": {} + } } ], "type": "table" @@ -16327,8 +16283,8 @@ { "current": { "selected": false, - "text": "192.168.0.136:8400", - "value": "192.168.0.136:8400" + "text": "192.168.10.149:8400", + "value": "192.168.10.149:8400" }, "datasource": "sysom-prometheus", "definition": "label_values(sysom_proc_meminfo, instance)", @@ -16385,7 +16341,7 @@ "timezone": "browser", "title": "sysom_base", "uid": "rYdddlPWk", - "version": 1, + "version": 5, "weekStart": "" } } \ No newline at end of file diff --git a/environment/1_sdk/gcache_base/gcache.py b/environment/1_sdk/gcache_base/gcache.py index 48123d40495b09172932bd965dcf890ddfa20731..67c56a4861b1e4643779b464303f62398905b2eb 100644 --- a/environment/1_sdk/gcache_base/gcache.py +++ b/environment/1_sdk/gcache_base/gcache.py @@ -8,7 +8,7 @@ Description: """ import importlib from abc import ABCMeta, abstractmethod -from typing import Union, Optional, Dict +from typing import Union, Optional, Dict, List from threading import Lock from clogger import logger from .exceptions import GCacheProtoAlreadyExistsException, \ @@ -39,6 +39,20 @@ class GCache(metaclass=ABCMeta): """ pass + + @abstractmethod + def push_list(self, key: str, value: Union[int, float, dict, str], + front: int = 0) -> int: + pass + + @abstractmethod + def pop_list(self, key: str, front: int = 0) -> Union[int, float, dict, str]: + pass + + @abstractmethod + def get_list(self, key: str, start: int = 0, + end: int = -1) -> List[Union[None, int, float, dict, str]]: + pass @abstractmethod def load(self, key: str) -> Union[None, int, float, dict, str]: @@ 
-47,7 +61,11 @@ class GCache(metaclass=ABCMeta): @abstractmethod def load_all(self) -> Dict[str, Union[int, float, dict, str]]: pass - + + @abstractmethod + def delete_list(self, key: str) -> bool: + pass + @abstractmethod def delete(self, key: str) -> bool: """ diff --git a/environment/1_sdk/gcache_redis/redis_gcache.py b/environment/1_sdk/gcache_redis/redis_gcache.py index 431e4e5247917414bed3f2d9ae9652141d4b9e3e..6e3aea407c1ffc4a352db416c0a15849f8091bbe 100644 --- a/environment/1_sdk/gcache_redis/redis_gcache.py +++ b/environment/1_sdk/gcache_redis/redis_gcache.py @@ -7,9 +7,10 @@ File redis_gcache.py Description: """ import json -from typing import Union, Dict +from typing import Union, Dict, List from gcache_base import GCache, GCacheUrl, GCacheException from redis_lua import XRedisHashTable +from clogger import logger from .common import ClientBase, StaticConst SEPARATOR = "_GCache_" @@ -60,6 +61,49 @@ class RedisGCache(GCache, ClientBase): expire=expire ) + def push_list(self, key: str, value: Union[int, float, dict, str], + front: int = 0) -> int: + try: + if front not in [0, 1]: + raise GCacheException(f"Got not supported front = {front}, expect one of [0, 1]") + + method = self.redis_client.lpush if front else self.redis_client.rpush + res = method(f"{self._table_name}:{key}", + f"{self._get_store_value(value)}") + return res + except Exception as e: + logger.exception(e) + return -1 + + def pop_list(self, key: str, front: int = 0) -> Union[None, int, float, dict, str]: + try: + actual_key = f"{self._table_name}:{key}" + if front: + res = self.redis_client.lpop(actual_key) + elif front == 0: + res = self.redis_client.rpop(actual_key) + else: + raise GCacheException( + f"Got not supported front = {front}, expect one of [0, 1]" + ) + if res is None: + return None + return self._get_format_value(res) + except Exception as e: + logger.exception(e) + return None + + def get_list(self, key: str, start: int = 0, + end: int = -1) -> List[Union[None, int, float, 
dict, str]]: + try: + res = self.redis_client.lrange(f"{self._table_name}:{key}",start, end) + if res is None: + return [] + return [self._get_format_value(r) for r in res] + except Exception as e: + logger.exception(e) + return [] + def _get_format_value(self, value: str) -> Union[None, int, float, dict, str]: type_value = value.split(SEPARATOR) if len(type_value) < 2: @@ -92,6 +136,15 @@ class RedisGCache(GCache, ClientBase): def clean(self): self._x_redis_hash_table.hdrop_table(self._table_name) + self._x_redis_hash_table.hdrop_list(self._table_name) + + def delete_list(self, key: str) -> bool: + try: + self.redis_client.delete(key) + return True + except Exception as e: + logger.exception(e) + return False def delete(self, key: str) -> bool: return self._x_redis_hash_table.hdel(self._table_name, key) diff --git a/environment/1_sdk/redis_lua/x_hdrop_list.lua b/environment/1_sdk/redis_lua/x_hdrop_list.lua new file mode 100644 index 0000000000000000000000000000000000000000..9ad4b9cfc7aeb7761a1d256f91fe7413cab630de --- /dev/null +++ b/environment/1_sdk/redis_lua/x_hdrop_list.lua @@ -0,0 +1,9 @@ +local cursor = 0 +repeat + local result = redis.call('SCAN', cursor, 'MATCH', KEYS[1].."*", 'COUNT', 1000) + cursor = tonumber(result[1]) + local keys = result[2] + for i = 1, #keys do + redis.call('DEL', keys[i]) + end +until cursor == 0 \ No newline at end of file diff --git a/environment/1_sdk/redis_lua/xreadis_hash_table.py b/environment/1_sdk/redis_lua/xreadis_hash_table.py index 92092368adefd544654188691cc7f5ec42a7fc70..a9a31eff0d955bbf5cea89d94cafd451c21f1cfb 100644 --- a/environment/1_sdk/redis_lua/xreadis_hash_table.py +++ b/environment/1_sdk/redis_lua/xreadis_hash_table.py @@ -90,6 +90,10 @@ class XRedisHashTable: res = self._evalsha("x_hdrop_table", 2, table_name, self._get_expire_table(table_name)) return res == "OK" + + def hdrop_list(self, table_name: str) -> bool: + res = self._evalsha("x_hdrop_list", 1, table_name) + return res == "OK" def hdel(self, 
table_name: str, *fields: str) -> bool: res = self._evalsha("x_hdel", 2, table_name, diff --git a/environment/1_sdk/sysom_utils/framework.py b/environment/1_sdk/sysom_utils/framework.py index 89cc3dc2e0121ab59c0d172e76381f823111515d..d023f0aa01c97848779fb279cfac2241ea97f743 100644 --- a/environment/1_sdk/sysom_utils/framework.py +++ b/environment/1_sdk/sysom_utils/framework.py @@ -36,7 +36,7 @@ class SysomFramework: _config: Optional[ConfigParser] = None _gcache_map: Dict[str, GCache] = {} _framework_plug_mag: Optional[FrameworkPlugMag] = None - _alarm_producer: Optional[Producer] = None + _cec_producer: Optional[Producer] = None @classmethod def init(cls, config: ConfigParser): @@ -202,16 +202,16 @@ class SysomFramework: ) default_channel_job_executor.start() return cls + + @classmethod + def _get_cec_producer(cls): + if cls._cec_producer is None: + cls._cec_producer = cls.cec_producer() + return cls._cec_producer ################################################################################ # Alarm ################################################################################ - @classmethod - def _get_alarm_producer(cls): - if cls._alarm_producer is None: - cls._alarm_producer = cls.cec_producer() - return cls._alarm_producer - @classmethod def alarm(cls, alert_data): """Dispatch one SAD alert data to event center @@ -219,7 +219,7 @@ class SysomFramework: Args: alert_data (_type_): _description_ """ - cls._get_alarm_producer().produce("SYSOM_SAD_ALERT", alert_data) + cls._get_cec_producer().produce("SYSOM_SAD_ALERT", alert_data) @classmethod def alarm_application( @@ -269,9 +269,21 @@ class SysomFramework: """ if action not in ["ADD_ANNOTATION", "ADD_OPT", "MERGE"]: raise Exception(f"Not support alarm action: {action}") - cls._get_alarm_producer().produce( + cls._get_cec_producer().produce( "SYSOM_ALARM_ACTION", {"action": action, "data": action_data} ) + + ################################################################################ + # CLUSTER HEALTH + 
################################################################################ + @classmethod + def abnormal_metric(cls, metric_data): + """Dispatch one abnormal data to event center + + Args: + alert_data (_type_): _description_ + """ + cls._get_cec_producer().produce("SYSOM_HEALTH_METRIC", metric_data) @classmethod def start(cls): diff --git a/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py b/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py new file mode 100644 index 0000000000000000000000000000000000000000..bf5a0a44f9b5f8ee37ee53a371e1a19e7dfc1309 --- /dev/null +++ b/sysom_server/sysom_cluster_health/alembic/versions/f2217aef0227_cluster_health.py @@ -0,0 +1,66 @@ +"""cluster_health + +Revision ID: f2217aef0227 +Revises: +Create Date: 2024-03-11 14:16:58.653466 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = 'f2217aef0227' +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('sys_abnormal_metrics_cluster', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + op.create_table('sys_abnormal_metrics_node', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.Column('instance', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + op.create_table('sys_abnormal_metrics_pod', + sa.Column('uuid', sa.String(length=128), nullable=False), + sa.Column('metric_id', sa.String(length=256), nullable=True), + sa.Column('metric_type', sa.String(length=128), nullable=True), + sa.Column('score', sa.Float(), nullable=True), + sa.Column('value', sa.Float(), nullable=True), + sa.Column('timestamp', sa.Float(), nullable=True), + sa.Column('cluster', sa.String(length=256), nullable=True), + sa.Column('instance', sa.String(length=256), nullable=True), + sa.Column('pod', sa.String(length=256), nullable=True), + sa.Column('namespace', sa.String(length=256), nullable=True), + sa.PrimaryKeyConstraint('uuid'), + sa.UniqueConstraint('uuid') + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('sys_abnormal_metrics_pod') + op.drop_table('sys_abnormal_metrics_node') + op.drop_table('sys_abnormal_metrics_cluster') + # ### end Alembic commands ### diff --git a/sysom_server/sysom_cluster_health/app/calculator/analyzer.py b/sysom_server/sysom_cluster_health/app/calculator/analyzer.py deleted file mode 100644 index d1ffc5a54f21b751e1297eadc350155e3c9ef105..0000000000000000000000000000000000000000 --- a/sysom_server/sysom_cluster_health/app/calculator/analyzer.py +++ /dev/null @@ -1,556 +0,0 @@ -import time -import json -import math -import conf.settings as settings -from multiprocessing import Queue, Process -from threading import Thread -from schedule import Scheduler -from os import getpid, kill -from typing import Dict, List -from clogger import logger -from sysom_utils import SysomFramework -from lib.algorithm.weight_algorithm import WeightsCalculator,\ - TypeMetricWeights, TYPES -from lib.score_result import ScoreResult, ScoreType, TypeResult, LevelResults -from lib.metric_manager import MetricManager, Level -from lib.utils import collect_all_clusters, collect_instances_of_cluster, \ - collect_pods_of_instance -from lib.metric_exception import MetricProcessException - - -class Analyzer(Process): - def __init__( - self, - clusterhealth_interval: int = 60, - queue: Queue = None, - metric_manager: MetricManager = None, - weight_cal: WeightsCalculator = None, - parent_pid: int = None - ) -> None: - super().__init__(daemon=True) - self.clusterhealth_interval = clusterhealth_interval - self.clusterhealth_host_schedule: Scheduler = Scheduler() - self.metric_manager = metric_manager - self.weight_cal = weight_cal - self.last_end_time = time.time() - self.clusterhealth_interval - self.last_alarm_table: Dict[str, int] = {} - self.queue = queue - self.parent_pid = parent_pid - - def check_if_parent_is_alive(self): - try: - kill(self.parent_pid, 0) - except OSError: - logger.info(f"Analyzer's parent {self.parent_pid} is exit") - exit(0) - - def 
deliver_one_alarm(self, cluster: str, instance: str, pod: str, - type: str, level: Level, score: float, - value: float, metric): - - if metric.settings.alarm is None: - return - - threshold = metric.settings.alarm.threshold - description = metric.settings.description - - key = f"{cluster}-{instance}-{pod}-{description}" - - # score lower than threshold, deliver alarm - if score <= threshold: - if key not in self.last_alarm_table: - self.last_alarm_table[key] = 0 - - continue_alarm = self.last_alarm_table[key] - # first alarm, deliver it and raise diagnose - if continue_alarm == 0: - alart_id = metric.deliver_alarm(value, type) - metric.deliver_diagnose(alart_id, level, type, self.queue) - - self.last_alarm_table[key] += 1 - - # if alarm list is longer than MERGE_NUM, resend the alarms - if continue_alarm > settings.ALARM_MERGE_NUM: - self.last_alarm_table[key] = 0 - else: - # if continuesly alarm end, reset the alarm list - if key in self.last_alarm_table: - del self.last_alarm_table[key] - - def _get_metric_score(self, level: Level, - labels: Dict[str, str]) -> TypeResult: - type_res = TypeResult({}) - type_weights_list = self.weight_cal.type_weights[level] - registed_metric = self.metric_manager.registed_metric[level] - - for type_weights in type_weights_list: - metric_type = type_weights.type - if type_weights.weight == 0: - continue - - pod = labels.get("pod", "") - instance = labels.get("instance", "") - cluster = labels.get("cluster", "") - type_res[metric_type] = [] - - for metric in registed_metric[metric_type]: - try: - # get metric's value and score - value, score = metric.metric_score(pod, instance, - cluster, - self.last_end_time) - # deliver alarm - self.deliver_one_alarm(cluster, instance, pod, metric_type, - level, score, value, metric) - except Exception as e: - raise e - - # construct metric labels - metric_labels = labels.copy() - metric_labels["type"] = metric_type - metric_labels["description"] = metric.settings.description - - 
type_res[metric_type].append( - ScoreResult( - metric_labels, score, value, ScoreType.MetricScore - ).to_dict() - ) - - return type_res - - def _cal_one(self, level: Level, - labels: Dict[str, str]) -> List[ScoreResult]: - final_score = 0 - result = [] - pod = labels.get("pod", "") - instance = labels.get("instance", "") - type_weights_list = self.weight_cal.type_weights[level] - weights_method = self.weight_cal.weights_method[level] - - for type_weights in type_weights_list: - type = type_weights.type - type_weight = type_weights.weight - metrics_score = [] - type_score = 0 - - if type_weight == 0: - continue - - for metric in self.metric_manager.registed_metric[level][type]: - try: - value, score = metric.metric_score(pod, instance, - labels["cluster"], - self.last_end_time) - except MetricProcessException as e: - logger.info(f"Calculate Metric: {metric.settings.description} " - f"of Pod: {pod} of Node: {instance} failed {e}") - continue - - self.deliver_one_alarm(labels["cluster"], instance, pod, type, - level, score, value, metric) - - metric_labels = labels.copy() - metric_labels["type"] = type - metric_labels["description"] = metric.settings.description - - result.append( - ScoreResult(metric_labels, score, value, ScoreType - .MetricScore).to_dict() - ) - - metrics_score.append(score) - if weights_method == "WeightedSum": - type_score += score * metric.settings.score.weight - if weights_method == "Equal": - type_score = sum(metrics_score) / len(metrics_score) - elif weights_method == "Worst": - type_score = min(metrics_score) - - type_labels = labels.copy() - type_labels["type"] = type - result.append( - ScoreResult(type_labels, type_score, 0, ScoreType. 
- MetricTypeScore).to_dict() - ) - # final score of the pod is the weight sum of each type of metric - # score - final_score += type_score * type_weight - - # final score store in the last elements of the result list - final_score = math.floor(final_score) - result.append( - ScoreResult( - labels, final_score, 0, ScoreType.InstanceScore - ).to_dict() - ) - - return result - - def _cal_one_auto_weights( - self, level: Level, - item: LevelResults, - metric_weights: TypeMetricWeights - ) -> List[ScoreResult]: - - type_weights = self.weight_cal.type_weights[level] - - final_result: List[ScoreResult] = [] - base_labels = item.labels - item_score = 0 - - for type_weight in type_weights: - type = type_weight.type - weight = type_weight.weight - metric_scores = item.results[type] - - type_score = sum( - [metric_scores[i]["score"] * metric_weights[type][i] - for i in range(len(metric_scores))] - ) - - type_labels = base_labels.copy() - type_labels["type"] = type - final_result.extend(metric_scores) - final_result.append( - ScoreResult( - type_labels, type_score, 0, ScoreType.MetricTypeScore - ).to_dict() - ) - - item_score += type_score * weight - - item_score = math.floor(item_score) - final_result.append( - ScoreResult( - base_labels, item_score, 0, ScoreType.InstanceScore - ).to_dict() - ) - - return final_result - - def _cal_pods_auto_weights( - self, cluster: str, instance: str, - pod_list, g_cache - ) -> List[float]: - - level = Level.Pod - pods_score = [] - pods_result = [] - - for pod, podns in pod_list: - labels = { - "pod": pod, - "namespace": podns, - "instance": instance, - "cluster": cluster, - } - - try: - pods_result.append( - LevelResults( - labels, - self._get_metric_score(level, labels) - ) - ) - except Exception as e: - logger.error(f"Collect metric of Pod: {pod} of " - f"Node: {instance} failed: {e}") - return [] - - metric_weights = self.weight_cal.cal_metric_weights(level, pods_result) - for pod in pods_result: - pod_result = 
self._cal_one_auto_weights(level, pod, metric_weights) - pod_key = pod.labels["pod"] + f"-{pod.labels['namespace']}" - pods_score.append(pod_result[-1]["score"]) - g_cache.store(pod_key, json.dumps(pod_result)) - - return pods_score - - def _cal_one_pod(self, cluster: str, pod: str, - pod_ns: str, instance: str, g_cache_pod) -> float: - result = [] - final_pod_score = 0 - - try: - labels = { - "pod": pod, - "namespace": pod_ns, - "instance": instance, - "cluster": cluster, - } - - result = self._cal_one(Level.Pod, labels) - final_pod_score = result[-1]["score"] - - pod_key = pod + f"-{pod_ns}" - g_cache_pod.store(pod_key, json.dumps(result)) - - except Exception as e: - logger.error(f"Calculate score of Pod: {pod} of " - f"Node: {instance} failed: {e}") - return None - - return final_pod_score - - def _cal_pods(self, cluster: str, instance: str) -> List[float]: - g_cache_pod = SysomFramework.gcache("pod_metrics") - pod_method = self.weight_cal.weights_method[Level.Pod] - pod_results = [] - - pod_list = collect_pods_of_instance(instance, - self.metric_manager.metric_reader, - self.clusterhealth_interval) - - if pod_method in ["WeightedSum", "Equal", "Worst"]: - for pod, pod_ns in pod_list: - pod_result = self._cal_one_pod(cluster, pod, - pod_ns, instance, g_cache_pod) - if pod_result is not None: - pod_results.append(pod_result) - else: - pod_results = self._cal_pods_auto_weights(cluster, instance, - pod_list, g_cache_pod) - - return pod_results - - def _combine_pods_node(self, pod_res: List[float], - node_score: float) -> float: - weights = [0.7, 0.3] - - if len(pod_res) == 0: - return node_score - - pod_avg = sum(pod_res) / len(pod_res) - - return node_score * weights[0] + pod_avg * weights[1] - - def _cal_nodes_auto_weights(self, cluster: str, - instance_list: List[str], g_cache): - - level = Level.Node - instances_result = [] - - for instance in instance_list: - labels = { - "instance": instance, - "cluster": cluster, - } - - try: - instances_result.append( - 
LevelResults( - labels, - self._get_metric_score(level, labels) - ) - ) - except Exception as e: - logger.error(f"Collect metric of Node: {instance} failed: {e}") - return - - metric_weights = self.weight_cal.cal_metric_weights( - level, instances_result) - for instance in instances_result: - pod_res = self._cal_pods(cluster, instance.labels["instance"]) - node_res = self._cal_one_auto_weights( - level, instance, metric_weights) - # combine pod score and node score and update it - node_score = math.floor( - self._combine_pods_node(pod_res, node_res[-1]["score"]) - ) - node_res[-1]["score"] = node_score - - g_cache.store(instance.labels["instance"], - json.dumps(node_res)) - - def _cal_one_node(self, cluster: str, instance: str, - g_cache_instance) -> float: - node_score = 0 - node_result = [] - - pod_results = self._cal_pods(cluster, instance) - - try: - labels = { - "instance": instance, - "cluster": cluster, - } - - node_result = self._cal_one(Level.Node, labels) - node_score = math.floor( - self._combine_pods_node(pod_results, node_result[-1]["score"]) - ) - node_result[-1]["score"] = node_score - - # todo pod的健康分如何纳入node健康分的计算 - g_cache_instance.store(instance, json.dumps(node_result)) - except Exception as e: - logger.error(f"Calculating score of Node: {instance} failed") - logger.exception(e) - return None - - return node_score - - def _cal_nodes_task(self, i: int, cluster: str, instance_list: List[str]): - g_cache_instance = SysomFramework.gcache("instance_metrics") - node_method = self.weight_cal.weights_method[Level.Node] - metric_per_processor = len(instance_list) / \ - settings.ANALYZER_PROCESS_NUM - - if i == settings.ANALYZER_PROCESS_NUM: - assigned_max = len(instance_list) - else: - assigned_max = int(metric_per_processor * i) - - assigned_min = assigned_max - int(metric_per_processor) - assigned_node = range(assigned_min, assigned_max) - - if node_method in ["WeightedSum", "Equal", "Worst"]: - for j in assigned_node: - instance = instance_list[j] - 
self._cal_one_node(cluster, instance, - g_cache_instance) - else: - self._cal_nodes_auto_weights( - cluster, - instance_list[assigned_min:assigned_max], - g_cache_instance - ) - - def _cal_one_cluster(self, cluster: str): - def __cal_nodes_multi_thread(): - threads = [] - for i in range(1, settings.ANALYZER_PROCESS_NUM + 1): - if i > len(instances_list): - logger.warning("process num is set to be" - " larger than instance num!") - break - - t = Thread(target=self._cal_nodes_task, - args=(i, cluster, instances_list)) - threads.append(t) - t.start() - - for t in threads: - t.join() - - def __cal_nodes_normal(): - g_cache_instance = SysomFramework.gcache("instance_metrics") - node_method = self.weight_cal.weights_method[Level.Node] - - if node_method in ["WeightedSum", "Equal", "Worst"]: - for instance in instances_list: - self._cal_one_node(cluster, instance, - g_cache_instance) - else: - self._cal_nodes_auto_weights(cluster, - instances_list, g_cache_instance) - - def __nodes_to_cluster(labels: Dict[str, str]) -> List[ScoreResult]: - final_score = 0 - cluster_res = [] - type_score = {} - nodes_score = [] - g_cache_instance = SysomFramework.gcache("instance_metrics") - - for type in TYPES: - type_score[type] = [] - - instances = g_cache_instance.load_all() - for _, instance_res in instances.items(): - res = json.loads(instance_res) - for metric in res: - if metric["type"] == ScoreType.MetricTypeScore.value: - type_score[metric["labels"]["type"]].append( - metric["score"]) - elif metric["type"] == ScoreType.InstanceScore.value: - nodes_score.append(metric["score"]) - - for type in TYPES: - type_labels = labels.copy() - type_labels["type"] = type - - if len(type_score[type]) == 0: - logger.warning(f"No Nodes's {type} score") - continue - # cluster type score = avg(nodes' type score) - avg_score = sum(type_score[type]) / len(type_score[type]) - cluster_res.append( - ScoreResult( - type_labels, avg_score, 0, - ScoreType.MetricTypeScore - ).to_dict() - ) - - try: - # 
cluster score = avg(nodes' score) - final_score = math.floor(sum(nodes_score) / len(nodes_score)) - cluster_res.append( - ScoreResult( - labels, final_score, 0, ScoreType.InstanceScore - ).to_dict() - ) - except ZeroDivisionError as e: - logger.info("no nodes in cluster!") - raise e - - return cluster_res - - cluster_result = [] - g_cache_cluster = SysomFramework.gcache("cluster_metrics") - instances_list = collect_instances_of_cluster( - cluster, - self.metric_manager.metric_reader, - self.clusterhealth_interval - ) - - labels = { - "cluster": cluster, - } - - if settings.ENABLE_MULTI_THREAD is True: - __cal_nodes_multi_thread() - else: - __cal_nodes_normal() - - try: - # cluster_result = self._cal_one(Level.Cluster, labels) - cluster_result = __nodes_to_cluster(labels) - g_cache_cluster.store(cluster, json.dumps(cluster_result)) - - except Exception as e: - logger.error(f"Calculating score of Cluster: {cluster} failed") - logger.exception(e) - pass - - def _register_task(self): - cluster_list = [] - - cluster_list = collect_all_clusters(self.metric_manager.metric_reader) - # no cluster label, we assume just one, and names it "dafault" - if len(cluster_list) == 0 or settings.NO_CLUSTER_LABEL is True: - cluster_list.append("default") - - start_time = time.time() - - for cluster in cluster_list: - self._cal_one_cluster(cluster) - - self.last_end_time = time.time() - end_time = time.time() - logger.info(f"Excutaion time: {end_time - start_time}") - - def run(self) -> None: - logger.info(f'健康度计算守护进程PID: {getpid()}') - - self._register_task() - self.clusterhealth_host_schedule.every(self.clusterhealth_interval)\ - .seconds.do(self._register_task) - - while True: - self.check_if_parent_is_alive(); - - if self.is_alive(): - self.clusterhealth_host_schedule.run_pending() - else: - break - time.sleep(max(1, int(self.clusterhealth_interval / 2))) diff --git a/sysom_server/sysom_cluster_health/app/collector/collector.py 
b/sysom_server/sysom_cluster_health/app/collector/collector.py new file mode 100644 index 0000000000000000000000000000000000000000..e6a4282dabeb7898e4f3e5d7f5f8bcab5d64553f --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/collector/collector.py @@ -0,0 +1,172 @@ +import time +from conf.settings import * +from multiprocessing import Queue, Process +from schedule import Scheduler +from os import getpid, kill +from typing import Dict +from clogger import logger +from app.collector.metric_manager import MetricManager +from app.collector.metric_exception import MetricProcessException +from lib.common_type import Labels, Level +from lib.utils import collect_all_clusters, collect_instances_of_cluster, \ + collect_pods_of_instance + + +class Collector(Process): + def __init__( + self, + queue: Queue = None, + metric_manager: MetricManager = None, + parent_pid: int = None + ) -> None: + super().__init__(daemon=True) + self.collect_interval = COLLECT_INTERVAL + self.collector_host_schedule: Scheduler = Scheduler() + self.metric_manager = metric_manager + self.last_end_time = time.time() - self.collect_interval + self.last_alarm_table: Dict[str, int] = {} + self.queue = queue + self.parent_pid = parent_pid + + def _check_if_parent_is_alive(self): + try: + kill(self.parent_pid, 0) + except OSError: + logger.info(f"Analyzer's parent {self.parent_pid} is exit") + exit(0) + + def _deliver_one_alarm(self, metric, labels: Labels, level: Level, + score: float, value: float): + if metric.settings.alarm is None: + return + + type = metric.settings.type + threshold = metric.settings.alarm.threshold + metric_id = metric.settings.metric_id + + key = f"{labels.cluster}-{labels.instance}-{labels.namespace}" \ + f"-{labels.pod}-{metric_id}" + + # score lower than threshold, deliver alarm + if score <= threshold: + if key not in self.last_alarm_table: + self.last_alarm_table[key] = 0 + + continue_alarm = self.last_alarm_table[key] + # first alarm, deliver it and raise diagnose + if 
continue_alarm == 0: + alart_id = metric.deliver_alarm(value, type) + metric.deliver_diagnose(alart_id, level, type, self.queue) + + self.last_alarm_table[key] += 1 + + # if alarm list is longer than MERGE_NUM, resend the alarms + if continue_alarm > ALARM_MERGE_NUM: + self.last_alarm_table[key] = 0 + else: + # if continuesly alarm end, reset the alarm list + if key in self.last_alarm_table: + del self.last_alarm_table[key] + + def _collect_process_one(self, level: Level, labels: Labels): + """Collect and process one cluster/node/pod's all metrics + + Args: + level (Level): cluster/node/pod + labels (Labels): cluster/node/pod labels + """ + + for metric in self.metric_manager.registed_metric[level]: + try: + value, score = metric.metric_score(labels, self.last_end_time) + except MetricProcessException as e: + logger.info(f"Calculate Metric: {metric.settings.metric_id} " + f"of Pod: {labels.pod} of Node: {labels.instance} " + f"of Cluster: {labels.cluster} failed {e}") + continue + + metric.deliver_health_metric(value, score) + self._deliver_one_alarm(metric, labels, level, score, value) + + def _collect_process_pod_metric(self, name:str, ns:str, + level: Level, labels: Labels): + pod_label = Labels( + cluster=labels.cluster, + instance=labels.instance, + pod=name, + namespace=ns + ) + + # collect and process a pod's all metrics + self._collect_process_one(level, pod_label) + + def _collect_process_node_metric(self, name:str, level: Level, + labels: Labels): + pods_list = [] + node_label = Labels(cluster=labels.cluster, instance=name) + + # collect and process a node's all metrics + self._collect_process_one(level, node_label) + + pods_list = collect_pods_of_instance( + name, + self.metric_manager.metric_reader, + self.collect_interval + ) + + for pod, ns in pods_list: + self._collect_process_pod_metric(pod, ns, Level.Pod, node_label) + + + def _collect_process_cluster_metric(self, cluster: str): + level = Level.Cluster + labels = Labels(cluster=cluster) + 
nodes_list = [] + + # collect and process a cluster's all metrics + self._collect_process_one(level, labels) + + nodes_list = collect_instances_of_cluster( + cluster, + self.metric_manager.metric_reader, + self.collect_interval + ) + + for node in nodes_list: + self._collect_process_node_metric(node, Level.Node, labels) + + + def _register_task(self): + cluster_list = [] + + cluster_list = collect_all_clusters(self.metric_manager.metric_reader) + # no cluster label, we assume just one, and names it "dafault" + if len(cluster_list) == 0 or NO_CLUSTER_LABEL is True: + cluster_list.append("default") + + start_time = time.time() + + for cluster in cluster_list: + self._collect_process_cluster_metric(cluster) + + end_time = time.time() + self.last_end_time = end_time + logger.info(f"Excutaion time: {end_time - start_time}") + + + def run(self) -> None: + logger.info(f'健康度内置指标采集分析守护进程PID: {getpid()}') + + self._register_task() + self.collector_host_schedule.every(self.collect_interval)\ + .seconds.do(self._register_task) + + while True: + self._check_if_parent_is_alive(); + + if self.is_alive(): + self.collector_host_schedule.run_pending() + else: + break + time.sleep(max(1, int(self.collect_interval / 2))) + diff --git a/sysom_server/sysom_cluster_health/custom_metric/node_cpu_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py similarity index 91% rename from sysom_server/sysom_cluster_health/custom_metric/node_cpu_util.py rename to sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py index 06aca4aefeffd3e14e50842230d761d31b0348e0..076f7d2599e72abb8fc74d4d873e892ad66b31d8 100644 --- a/sysom_server/sysom_cluster_health/custom_metric/node_cpu_util.py +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_cpu_util.py @@ -1,7 +1,8 @@ import conf.settings as settings from metric_reader import MetricReader -from lib.metric_type.capacity import CapacityMetric, Level -from lib.metric_exception 
import MetricSettingsException,\ +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ MetricCollectException, MetricProcessException NODE_LABEL = settings.NODE_LABEL diff --git a/sysom_server/sysom_cluster_health/custom_metric/node_fd_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py similarity index 90% rename from sysom_server/sysom_cluster_health/custom_metric/node_fd_util.py rename to sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py index 38d7150d3ceb2e96804480ba731efb39a87396f5..06fb39896e3a832597a1f85e50e2b8630e57a235 100644 --- a/sysom_server/sysom_cluster_health/custom_metric/node_fd_util.py +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_fd_util.py @@ -1,8 +1,9 @@ from typing import Dict from metric_reader import MetricReader from sysom_utils import SysomFramework -from lib.metric_type.capacity import Level, CapacityMetric -from lib.metric_type.metric_type import DiagnoseInfo +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_type.metric_type import DiagnoseInfo class NodeFdUtil(CapacityMetric): diff --git a/sysom_server/sysom_cluster_health/custom_metric/node_load_avg.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py similarity index 87% rename from sysom_server/sysom_cluster_health/custom_metric/node_load_avg.py rename to sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py index e0e68424732e6b0e7dfe22ea4d276dad649e658d..f3c13854c8c2e8a2efe7691b1bfdbf16523c242b 100644 --- a/sysom_server/sysom_cluster_health/custom_metric/node_load_avg.py +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_load_avg.py @@ -1,7 +1,8 @@ import conf.settings as settings from metric_reader import MetricReader -from 
lib.metric_exception import MetricCollectException -from lib.metric_type.load import LoadMetric, Level +from lib.common_type import Level +from app.collector.metric_exception import MetricCollectException +from app.collector.metric_type.load import LoadMetric NODE_LABEL = settings.NODE_LABEL CPU_COUNT_METRIC = "sysom_proc_cpus" diff --git a/sysom_server/sysom_cluster_health/custom_metric/node_rootfs_inode_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py similarity index 92% rename from sysom_server/sysom_cluster_health/custom_metric/node_rootfs_inode_util.py rename to sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py index f3a3ab15dba3fb32d6e44d9ad5744df70e49802e..07e710008d2664e5b96f16fcfb529041981a7107 100644 --- a/sysom_server/sysom_cluster_health/custom_metric/node_rootfs_inode_util.py +++ b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_inode_util.py @@ -1,7 +1,8 @@ import conf.settings as settings from metric_reader import MetricReader -from lib.metric_type.capacity import CapacityMetric, Level -from lib.metric_exception import MetricSettingsException,\ +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ MetricCollectException, MetricProcessException MOUNTPOINT = "/" diff --git a/sysom_server/sysom_cluster_health/custom_metric/node_rootfs_util.py b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py similarity index 94% rename from sysom_server/sysom_cluster_health/custom_metric/node_rootfs_util.py rename to sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py index f1d7dca1faf6bf3b468255ae7acabb13e3517115..52b7f6177a3e1a21edec28350a3adf53675b94d5 100644 --- a/sysom_server/sysom_cluster_health/custom_metric/node_rootfs_util.py +++ 
b/sysom_server/sysom_cluster_health/app/collector/custom_metric/node_rootfs_util.py @@ -1,7 +1,8 @@ import conf.settings as settings from metric_reader import MetricReader -from lib.metric_type.capacity import CapacityMetric, Level -from lib.metric_exception import MetricSettingsException,\ +from lib.common_type import Level +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_exception import MetricSettingsException,\ MetricCollectException, MetricProcessException MOUNTPOINT = "/" diff --git a/sysom_server/sysom_cluster_health/lib/metric_exception.py b/sysom_server/sysom_cluster_health/app/collector/metric_exception.py similarity index 100% rename from sysom_server/sysom_cluster_health/lib/metric_exception.py rename to sysom_server/sysom_cluster_health/app/collector/metric_exception.py diff --git a/sysom_server/sysom_cluster_health/lib/metric_manager.py b/sysom_server/sysom_cluster_health/app/collector/metric_manager.py similarity index 51% rename from sysom_server/sysom_cluster_health/lib/metric_manager.py rename to sysom_server/sysom_cluster_health/app/collector/metric_manager.py index fa69919b62c3a821cbbc602784b72e9cbdf38cc6..f7ad068d471e0384cc5d9815d6725d344a65b3f6 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_manager.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_manager.py @@ -1,15 +1,14 @@ import importlib -import conf.settings as settings -from lib.metric_exception import MetricSettingsException -from lib.metric_type.capacity import CapacityMetric -from lib.metric_type.load import LoadMetric -from lib.metric_type.latency import LatencyMetric -from lib.metric_type.error import ErrorMetric -from lib.metric_type.metric_type import Level +from conf.settings import * +from app.collector.metric_exception import MetricSettingsException +from app.collector.metric_type.capacity import CapacityMetric +from app.collector.metric_type.load import LoadMetric +from app.collector.metric_type.latency import 
LatencyMetric +from app.collector.metric_type.error import ErrorMetric +from lib.common_type import Level from metric_reader import dispatch_metric_reader -CUSTOM_METRIC_DIR = "custom_metric" -PROMETHEUS_CONFIG = settings.PROMETHEUS_CONFIG +CUSTOM_METRIC_DIR = "app.collector.custom_metric" METRIC_TYPE = { "CapacityMetric": CapacityMetric, @@ -18,43 +17,17 @@ METRIC_TYPE = { "ErrorMetric": ErrorMetric } -WEIGHT_METHODS = { - Level.Cluster: settings.CLUSTER_WEIGHT_METHOD, - Level.Node: settings.NODE_WEIGHT_METHOD, - Level.Pod: settings.POD_WEIGHT_METHOD -} - - class MetricManager(): def __init__(self): self.registed_metric = {} - self.interval = settings.CALCULATE_INTERVAL self.metric_reader = dispatch_metric_reader( "prometheus://" + PROMETHEUS_CONFIG.host + ":" + str(PROMETHEUS_CONFIG.port)) def _metric_register(self, all_metrics, level): - def __check_metric_weight(metric_list): - """ - Make sure sum of same type of metric equal to 1 - """ - if len(metric_list) <= 0: - return - - sum_weight = sum( - item.settings.score.weight for item in metric_list - ) - - if sum_weight != 1: - raise MetricSettingsException( - f"weight of {metric_type} level {level} setting error!" - ) - - self.registed_metric[level] = {} - - for metric_type, metrics in all_metrics.items(): - self.registed_metric[level][metric_type] = [] - - for metric in metrics: + self.registed_metric[level] = [] + + for metric in all_metrics: + try: if "filename" in metric["Collect"] \ and metric["Collect"]["filename"] != "": # load non_standard metric from file @@ -66,7 +39,7 @@ class MetricManager(): if hasattr(metric_module, class_name): metric_class = getattr(metric_module, class_name) metric_instance = metric_class(self.metric_reader, - metric, level) + metric, level) except ModuleNotFoundError as exc: raise MetricSettingsException( f"{module_name} not exist!" 
@@ -75,24 +48,26 @@ class MetricManager(): raise exc else: - metric_class = METRIC_TYPE.get(metric_type) + metric_class = METRIC_TYPE.get(metric["Type"]) if metric_class is not None: metric_instance = metric_class( self.metric_reader, metric, level) - self.registed_metric[level][metric_type].append( - metric_instance) + except Exception as exc: + raise MetricSettingsException( + f"Collector setting error of metric {metric}: {exc}" + ) from exc - if WEIGHT_METHODS.get(level) == "WeightedSum": - __check_metric_weight(self.registed_metric[level][metric_type]) + self.registed_metric[level].append( + metric_instance) def metric_register(self): """ Register all metrics to metric manager from settings """ try: - self._metric_register(settings.CLUSTER_METRICS, Level.Cluster) - self._metric_register(settings.POD_METRICS, Level.Pod) - self._metric_register(settings.NODE_METRICS, Level.Node) + self._metric_register(CLUSTER_METRICS, Level.Cluster) + self._metric_register(POD_METRICS, Level.Pod) + self._metric_register(NODE_METRICS, Level.Node) except MetricSettingsException as exc: raise exc diff --git a/sysom_server/sysom_cluster_health/lib/metric_type/capacity.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py similarity index 93% rename from sysom_server/sysom_cluster_health/lib/metric_type/capacity.py rename to sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py index 3e3a9a260a86f7afc3394f9a58fc091ea85b1b51..5640b88b12687a6df6c71f64c53e384969c35928 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_type/capacity.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/capacity.py @@ -1,6 +1,7 @@ -from lib.metric_type.metric_type import Metric, MetricReader,\ - Level, InsAggregationType -from lib.metric_exception import MetricCollectException, \ +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + InsAggregationType +from app.collector.metric_exception 
import MetricCollectException, \ MetricSettingsException from conf.settings import NODE_LABEL, POD_LABEL,\ CLUSTER_LABEL, POD_METRIC_TAG @@ -128,6 +129,4 @@ class CapacityMetric(Metric): f'illegal standard type:{standard_type}' ) - def metric_score(self, pod: str, node: str, - cluster: str, last_end_time: float) -> (float, float): - return super().metric_score(pod, node, cluster, last_end_time) + \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/lib/metric_type/error.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/error.py similarity index 60% rename from sysom_server/sysom_cluster_health/lib/metric_type/error.py rename to sysom_server/sysom_cluster_health/app/collector/metric_type/error.py index 2738ef8b80b2cf6a7de1116a168a5120a500bac3..25d70d94f88ea0138f219a82d25c7dcd6b90e060 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_type/error.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/error.py @@ -1,6 +1,7 @@ -from lib.metric_type.metric_type import Metric, MetricReader,\ - Level, RangeAggregationType, InsAggregationType -from lib.metric_exception import MetricSettingsException +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + RangeAggregationType, InsAggregationType +from app.collector.metric_exception import MetricSettingsException class ErrorMetric(Metric): @@ -17,7 +18,3 @@ class ErrorMetric(Metric): range_agg_type=RangeAggregationType.Increase, ins_agg_type=InsAggregationType.Sum ) - - def metric_score(self, pod: str, node: str, - cluster: str, last_end_time: float) -> (float, float): - return super().metric_score(pod, node, cluster, last_end_time) diff --git a/sysom_server/sysom_cluster_health/lib/metric_type/latency.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py similarity index 87% rename from sysom_server/sysom_cluster_health/lib/metric_type/latency.py rename to 
sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py index baaa029f4c4d978f70cb91b61ce37d0ef9a10c64..437b95bfb50b70ceee603d502f76ec8b735812b8 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_type/latency.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/latency.py @@ -1,5 +1,6 @@ -from lib.metric_type.metric_type import Metric, MetricReader,\ - MetricSettingsException, Level, RangeAggregationType, InsAggregationType +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + MetricSettingsException, RangeAggregationType, InsAggregationType class LatencyMetric(Metric): @@ -68,6 +69,3 @@ class LatencyMetric(Metric): f'illegal standard type:{standard_type}' ) - def metric_score(self, pod: str, node: str, - cluster: str, last_end_time: float) -> (float, float): - return super().metric_score(pod, node, cluster, last_end_time) diff --git a/sysom_server/sysom_cluster_health/lib/metric_type/load.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/load.py similarity index 72% rename from sysom_server/sysom_cluster_health/lib/metric_type/load.py rename to sysom_server/sysom_cluster_health/app/collector/metric_type/load.py index b5721e160d85b0075ba7083014e60ef4aea54bcd..97d7cfa87edc6babeb1a25d3e3079d39d7e45522 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_type/load.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/load.py @@ -1,6 +1,7 @@ -from lib.metric_type.metric_type import Metric, MetricReader,\ - Level, RangeAggregationType, InsAggregationType -from lib.metric_exception import MetricSettingsException +from lib.common_type import Level +from app.collector.metric_type.metric_type import Metric, MetricReader,\ + RangeAggregationType, InsAggregationType +from app.collector.metric_exception import MetricSettingsException class LoadMetric(Metric): @@ -31,6 +32,3 @@ class LoadMetric(Metric): f'illegal standard type:{standard_type}' ) - 
def metric_score(self, pod: str, node: str, - cluster: str, last_end_time: float) -> (float, float): - return super().metric_score(pod, node, cluster, last_end_time) diff --git a/sysom_server/sysom_cluster_health/lib/metric_type/metric_type.py b/sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py similarity index 85% rename from sysom_server/sysom_cluster_health/lib/metric_type/metric_type.py rename to sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py index aa2aa0b81bf33b43dded995539ac553c9e91783e..f3465bc611e8dc8436aa6d5655dd1b128154cb1b 100644 --- a/sysom_server/sysom_cluster_health/lib/metric_type/metric_type.py +++ b/sysom_server/sysom_cluster_health/app/collector/metric_type/metric_type.py @@ -5,29 +5,24 @@ import conf.settings as settings from clogger import logger from enum import Enum from time import time -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Tuple from multiprocessing import Queue from queue import Full from scipy.interpolate import interp1d from sysom_utils import SysomFramework from metric_reader import RangeQueryTask, InstantQueryTask, MetricReader from dataclasses import dataclass -from lib.metric_exception import MetricSettingsException -from lib.metric_exception import MetricCollectException -from lib.metric_exception import MetricProcessException +from lib.common_type import Level, Labels +from app.diagnose.diagnose_info import DiagnoseInfo +from app.collector.metric_exception import MetricSettingsException +from app.collector.metric_exception import MetricCollectException +from app.collector.metric_exception import MetricProcessException CLUSTER_LABEL = settings.CLUSTER_LABEL POD_LABEL = settings.POD_LABEL NODE_LABEL = settings.NODE_LABEL POD_METRIC_TAG = settings.POD_METRIC_TAG - -class Level(Enum): - Cluster = 0 - Node = 1 - Pod = 2 - - class RangeAggregationType(Enum): Increase = 0 Rate = 1 @@ -50,13 +45,6 @@ class Collect: node_tag_name: Optional[str] 
= None filename: Optional[str] = None - -@dataclass -class Score: - weight: float - score: Dict[str, int] - - @dataclass class Alarm: threshold: int @@ -67,26 +55,13 @@ class Alarm: @dataclass class MetricSettings: - description: str + metric_id: str + type: str collect: Collect - score: Score + score: Dict[str, int] alarm: Optional[Alarm] -@dataclass -class DiagnoseInfo: - alarm_id: str - service_name: str - type: str - level: Level - metric_description: str - instance: str - pod: Optional[str] = None - container: Optional[str] = None - time: Optional[str] = None - diagnose_type: Optional[str] = None - - class Metric(): def __init__(self, metric_reader: MetricReader, metric_settings, level: Level): @@ -96,11 +71,7 @@ class Metric(): self.last_end_time = 0 self.score_interp = None self.settings = None - - try: - self._initalize_settings(metric_settings) - except BaseException: - raise + self._initalize_settings(metric_settings) ########################################################################## # Inner funtions @@ -109,14 +80,14 @@ class Metric(): def _initalize_settings(self, settings): try: self.settings = MetricSettings( - description=settings["Description"], - collect=Collect(**settings["Collect"]), - score=Score(**settings["Score"]), - alarm=Alarm( - **settings["Alarm"]) if settings.get("Alarm") else None + metric_id = settings["MetricID"], + type = settings["Type"], + collect = Collect(**settings["Collect"]), + score = settings["Score"], + alarm = Alarm(**settings["Alarm"]) if settings.get("Alarm") else None ) - if self.level == Level.Node or self.level == Level.Cluster: + if self.level == Level.Node: if not self.settings.collect.node_tag_name: raise MetricSettingsException( f"node_tag_name must set " @@ -138,7 +109,7 @@ class Metric(): f"in {self.settings.description}!" 
) - self._initalize_score_settings(self.settings.score.score) + self._initalize_score_settings(self.settings.score) except Exception as exc: raise MetricSettingsException() from exc @@ -318,7 +289,30 @@ class Metric(): ########################################################################## # Outer funtions ########################################################################## + def deliver_health_metric(self, metric_value: float, score: float): + """Deliver health metric to sysom health score calculator + + Args: + metric_value: metric value + score: score after calculation + """ + health_metric = { + "metric_id": self.settings.metric_id, + "process_time": time(), + "event_time": time(), + "score": score, + "value": metric_value, + "layer": self.level.value, + "cluster": self.name[Level.Cluster], + } + if self.level == Level.Node: + health_metric["node"] = self.name[Level.Node] + if self.level == Level.Pod: + health_metric["pod"] = self.name[Level.Pod] + health_metric["namespace"] = self.name['namespace'] + SysomFramework.abnormal_metric(health_metric) + def deliver_alarm(self, metric_value: float, type: str) -> str: alarm_uuid = uuid.uuid4() metric_value = round(metric_value, 2) @@ -326,7 +320,7 @@ class Metric(): SysomFramework.alarm({ "alert_id": str(alarm_uuid), "instance": self.name[self.level], - "alert_item": self.settings.description, + "alert_item": self.settings.metric_id, "alert_category": "MONITOR", "alert_source_type": "health check", "alert_time": int(round(time() * 1000)), @@ -338,7 +332,7 @@ class Metric(): "metric_type": type, }, "annotations": { - "summary": f"{self.settings.description} has low score with" + "summary": f"{self.settings.metric_id} has low score with" f" value {metric_value}" } }) @@ -362,8 +356,8 @@ class Metric(): SysomFramework.alarm_action("ADD_OPT", { "alert_id": str(alarm_id), "opt": { - "key": self.settings.description, - "label": self.settings.description, + "key": self.settings.metric_id, + "label": 
self.settings.metric_id, "type": "LINK", "url": self.settings.alarm.diagnose_url } @@ -376,7 +370,7 @@ class Metric(): service_name=self.settings.alarm.service_name, type=type, level=level, - metric_description=self.settings.description, + metric_description=self.settings.metric_id, instance=self.name[Level.Node], ), block=False @@ -409,14 +403,11 @@ class Metric(): """ raise NotImplementedError("process_diagnose_req not implememted!") - def metric_score(self, pod: str, node: str, - cluster: str, last_end_time: float) -> (float, float): + def metric_score(self, labels: Labels, last_end_time: float) -> Tuple[float, float]: """Calculate the final score of this metric Args: - pod: pod name - node: node name - cluster: cluster name + labels: cluster/node/pod labels last_end_time: end time of last calculate interval Raises: @@ -427,9 +418,10 @@ class Metric(): Returns: (metric_value, score): metric value and score after calculation """ - self.name[Level.Pod] = pod - self.name[Level.Node] = node - self.name[Level.Cluster] = cluster + self.name[Level.Pod] = labels.pod + self.name['namespace'] = labels.namespace + self.name[Level.Node] = labels.instance + self.name[Level.Cluster] = labels.cluster self.last_end_time = last_end_time metric_value = None diff --git a/sysom_server/sysom_cluster_health/app/consumer/consumer.py b/sysom_server/sysom_cluster_health/app/consumer/consumer.py new file mode 100644 index 0000000000000000000000000000000000000000..3484daafa7ab9c4fdec7b13121ae3264c666fb8f --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/consumer/consumer.py @@ -0,0 +1,74 @@ +from cec_base.event import Event +from cec_base.consumer import Consumer +from cec_base.producer import Producer +from cec_base.cec_client import MultiConsumer, CecAsyncConsumeTask +from clogger import logger +from lib.common_type import Level +from app.health_calculator.health_metric import HealthMetric +from conf.settings import * +from sysom_utils import CecTarget, SysomFramework + + +class 
HealthMetricListener(MultiConsumer): + def __init__(self) -> None: + super().__init__( + YAML_CONFIG.get_cec_url(CecTarget.PRODUCER), + custom_callback=self.on_receive_event, + ) + self.append_group_consume_task( + CEC_TOPIC_SYSOM_HEALTH_METRIC, + "sysom_cluster_health", + Consumer.generate_consumer_id(), + ensure_topic_exist=True, + ) + + self.gcache_cluster_metrics = SysomFramework.gcache(CLUSTER_HEALTH_METRIC_GCACHE) + self.gcache_node_metrics = SysomFramework.gcache(NODE_HEALTH_METRIC_GCACHE) + self.gcache_pod_metrics = SysomFramework.gcache(POD_HEALTH_METRIC_GCACHE) + + def _delivery(self, topic: str, value: dict): + self._producer.produce(topic, value) + self._producer.flush() + + def _deal_health_metric(self, health_metric: HealthMetric): + health_metric = health_metric.dict() + layer = health_metric["layer"] + + if layer not in [Level.Cluster.value, Level.Node.value, Level.Pod.value]: + raise Exception(f"Invalid layer: {layer} of metric: {health_metric}") + + try: + if layer == Level.Cluster.value: + cluster = health_metric[cluster] + self.gcache_cluster_metrics.push_list(cluster, health_metric) + elif layer == Level.Node.value: + # use cluster and node as key in case of same node name in different cluster + key = f"{health_metric['cluster']}:{health_metric['node']}" + self.gcache_node_metrics.push_list(key, health_metric) + elif layer == Level.Pod.value: + key = f"{health_metric['cluster']}:{health_metric['pod']}:{health_metric['namespace']}" + self.gcache_pod_metrics.push_list(key, health_metric) + except Exception as e: + raise Exception( + f"Failed to deal with health metric: {health_metric}, error: {e}" + ) + + def on_receive_event(self, event: Event, task: CecAsyncConsumeTask): + """ + 处理每个单独的任务 + """ + event_value = event.value + try: + assert isinstance(event_value, dict) + if task.topic_name == CEC_TOPIC_SYSOM_HEALTH_METRIC: + health_metric = HealthMetric(**event_value) + self._deal_health_metric(health_metric) + else: + logger.warning( + 
f"Received not expect topic data, topic = {task.topic_name}" + ) + except Exception as e: + logger.exception(e) + finally: + # 执行消息确认 + task.ack(event) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/crud.py b/sysom_server/sysom_cluster_health/app/crud.py index 2c852d2cb775661a9d42d94a0e0bb94b748367ce..dff5cb83016a9fca1a68448366be17aebbd1dcb5 100644 --- a/sysom_server/sysom_cluster_health/app/crud.py +++ b/sysom_server/sysom_cluster_health/app/crud.py @@ -6,6 +6,8 @@ Email mfeng@linux.alibaba.com File crud.py Description: """ +import uuid +from clogger import logger from typing import Optional, List from sqlalchemy.orm import Session from app import models, schemas, query @@ -34,3 +36,48 @@ from app import models, schemas, query # query_params.get_query_exp(db) # .all() # ) + + +def create_abnormal_metrics_data( + db: Session, abnormal_metrics: schemas.AbnormalMetricsBase +) -> Optional[models.BaseModel]: + base = { + "uuid": str(uuid.uuid4()), + "metric_id": abnormal_metrics.metric_id, + "metric_type": abnormal_metrics.metric_type, + "score": abnormal_metrics.score, + "value": abnormal_metrics.value, + "cluster": abnormal_metrics.cluster, + "timestamp": abnormal_metrics.timestamp + } + + if abnormal_metrics.instance == "": + db_abnormal_metrics = models.AbnormalMetricsCluster(**base) + elif abnormal_metrics.pod == "" and abnormal_metrics.instance != "": + db_abnormal_metrics = models.AbnormalMetricsNode( + instance=abnormal_metrics.instance, + **base + ) + elif abnormal_metrics.pod != "" and abnormal_metrics.namespace != "": + db_abnormal_metrics = models.AbnormalMetricsPod( + instance=abnormal_metrics.instance, + pod=abnormal_metrics.pod, + namespace=abnormal_metrics.namespace, + **base, + ) + else: + logger.error(f"Inserting Invalid abnormal_metrics " + f"to mysql: {abnormal_metrics}") + return None + + db.add(db_abnormal_metrics) + db.commit() + db.refresh(db_abnormal_metrics) + return db_abnormal_metrics + +def 
del_all_abnormal_metrics_data(db: Session): + db.query(models.AbnormalMetricsCluster).delete() + db.query(models.AbnormalMetricsNode).delete() + db.query(models.AbnormalMetricsPod).delete() + db.commit() + return \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py new file mode 100644 index 0000000000000000000000000000000000000000..cd2ef308d8c5a7027db83f89cb794841628f785f --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_info.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass +from typing import Optional, Dict, List +from lib.common_type import Level + +@dataclass +class DiagnoseInfo: + alarm_id: str + service_name: str + type: str + level: Level + metric_description: str + instance: str + pod: Optional[str] = None + container: Optional[str] = None + time: Optional[str] = None + diagnose_type: Optional[str] = None \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/calculator/diagnose_worker.py b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py similarity index 96% rename from sysom_server/sysom_cluster_health/app/calculator/diagnose_worker.py rename to sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py index 49c677d3945b35f88a2675f8dc6e0e6f238b75ef..46c8c90294895fbf7878f17554e7f02fc8d7e62a 100644 --- a/sysom_server/sysom_cluster_health/app/calculator/diagnose_worker.py +++ b/sysom_server/sysom_cluster_health/app/diagnose/diagnose_worker.py @@ -6,8 +6,9 @@ from clogger import logger from conf.settings import YAML_CONFIG from datetime import datetime from sysom_utils import SysomFramework, GClient -from lib.metric_type.metric_type import Metric, DiagnoseInfo -from lib.metric_manager import MetricManager +from app.diagnose.diagnose_info import DiagnoseInfo +from app.collector.metric_type.metric_type import Metric +from app.collector.metric_manager import 
MetricManager SYSOM_POLL_TIMEOUT = 20 SYSOM_POLL_INTERVAL = 1 diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py b/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..99267119dc0f08ba37554b4f35cd18fb489d1820 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/algorithm/health_algorithm.py @@ -0,0 +1,292 @@ +import json +import conf.settings as settings +from abc import abstractmethod +from typing import List, Dict, Tuple +from clogger import logger +from lib.common_type import Level +from app.health_calculator.health_metric import HealthMetric, HealthMetricsMap + +METRIC_TYPES = ["capacity", "load", "latency", "error"] + +class HealthAlgorithm: + def __init__(self, level: Level): + self.level = level + self.registed_metric = {} # metric_id -> metric_setting map + self.type_metrics = { + metric_type: [] for metric_type in METRIC_TYPES + } # type -> [metric_setting] map + self.output_abnormal_metrics = { + metric_type: HealthMetricsMap() for metric_type in METRIC_TYPES + } + + @abstractmethod + def preprocessing(self, metrics: Dict[str, HealthMetric]): + """ + Preprocess the data + + args: + metrics: the metrics data receive from gcache metric set + metrics = { + "metric_id1": HealthMetric + "metric_id2": HealthMetric + ... 
+ } + + """ + raise NotImplementedError("Subclass must implement abstract method") + + @abstractmethod + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + """ + The method to calculate the health of this level + + return: (capacity_score, load_score, latency_score, error_score, instance_score) + """ + raise NotImplementedError("Subclass must implement abstract method") + + + def calculate_lower_level(self, values: List[float]) -> float: + """ + The method to calculate the lower level health to this level + + args: + values: the health score of lower level instances + assume we are calculating a node's score, the values = + [pod1_score, pod2_score, pod3_score3] + + return: the health score of this instance(cluster, node, pod) + """ + if len(values) <= 0: + return 100 + + bad_instances = [] + for value in values: + if value < 0 or value > 100: + raise ValueError(f"Score: {value} is invalid") + if value < 60: + bad_instances.append(value) + + # no bad instances + if len(bad_instances) <= 0: + return sum(values) / len(values) + + if len(bad_instances) == len(values): + return 0 + + bad_instances_ratio = len(bad_instances) / len(values) + # if the ratio of bad instances is more than 10%, the health score will be under 60 + if bad_instances_ratio >= 0.1: + return 60 * (1 - (len(bad_instances) / len(values) - 0.1)) + if bad_instances_ratio < 0.1: + return 90 - 30 * (len(bad_instances) / (len(values) * 0.1)) + + return 100 + + + def register_metric_from_settings(self): + metrics_mapping = { + Level.Cluster: settings.CLUSTER_HEALTH_METRICS, + Level.Node: settings.NODE_HEALTH_METRICS, + Level.Pod: settings.POD_HEALTH_METRICS + } + + metrics = metrics_mapping.get(self.level) + if metrics is not None: + for metric in metrics: + if "MetricID" not in metric or "Type" not in metric: + logger.warning(f"Setting: metric {json.dumps(metric)}" + f"is invalid, skip it") + continue + + metric_id = metric["MetricID"] + metric_type = metric["Type"] + + if 
metric_type not in METRIC_TYPES: + logger.warning(f"Setting: metric {metric_id}" + f"type {metric_type} is invalid, skip it") + continue + + self.registed_metric[metric_id] = metric + self.type_metrics[metric_type].append(metric) + + + def get_abnormal_metrics(self) -> Dict[str, HealthMetricsMap]: + return self.output_abnormal_metrics + + +class DefaultHealthAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + categories = ["critical_abnormal", "suspect_abnormal", "trend_abnormal"] + types = METRIC_TYPES + ["instance"] + score_ranges = [(0, 60), (60, 90), (90, 100)] + + self.data = {type: {category: [] for category in categories} for type in types} + + def __add_metric_to_data(metric, category): + self.data[metric_type][category].append(metric) + self.data["instance"][category].append(metric) + self.output_abnormal_metrics[metric_type].add_metric(metric) + + for metric in metrics.values(): + if metric.metric_id not in self.registed_metric: + logger.warning(f"Receive metric {metric.metric_id} not in registed metrics") + continue + + metric_type = self.registed_metric[metric.metric_id]["Type"] + + score = metric.score + if score < 0 or score > 100: + logger.warning(f"Metric {metric.metric_id} score {score} is invalid") + continue + + for i, (low, high) in enumerate(score_ranges): + if low <= score < high: + __add_metric_to_data(metric, categories[i]) + break + + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + def _calculate_score(abnormal_dict, metric_num) -> float: + if metric_num <= 0: + return 100 + + if len(abnormal_dict["critical_abnormal"]) > 0: + return 60 * (1 - len(abnormal_dict["critical_abnormal"]) / metric_num) + + if len(abnormal_dict["suspect_abnormal"]) > 0: + return 90 - 30 * (len(abnormal_dict["suspect_abnormal"]) / metric_num) + + if len(abnormal_dict["trend_abnormal"]) > 0: + return 100 - 10 * 
(len(abnormal_dict["trend_abnormal"]) / metric_num) + + return 100 + + res = [] + for type in METRIC_TYPES: + type_registed_metrics = self.type_metrics.get(type, {}) + res.append(_calculate_score(self.data[type], len(type_registed_metrics))) + + res.append(_calculate_score(self.data["instance"], len(self.registed_metric))) + return tuple(res) + + +class WeightedSumAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + + def register_metric_from_settings(self): + def __check_weight(metrics): + for type, metrics in metrics.items(): + if len(metrics) <= 0: + continue + + weight = 0 + for metric in metrics: + weight += metric["Weight"] + if weight != 1: + raise Exception(f"Sum of weight of {type} metrics of " + f"level {self.level} is not equal 1") + + super().register_metric_from_settings() + __check_weight(self.type_metrics) + + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + self.data = { + metric_type: {} for metric_type in METRIC_TYPES + } + for metric in metrics.values(): + if metric.metric_id not in self.registed_metric: + logger.warning(f"Receive metric {metric.metric_id} not in registed metrics") + continue + + metric_type = self.registed_metric[metric.metric_id]["Type"] + self.data[metric_type][metric.metric_id] = metric + + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + """ + Calculate the health score of this level using weighted sum algorithm: + type_score = sum(metric_score * metric_weight) + instance_score = avg(type_score) + """ + + res = [] + for type, type_metrics in self.type_metrics.items(): + if len(type_metrics) <= 0: + res.append(100) + continue + + type_score = 0 + if type not in self.data: + logger.warning("WeightedSumAlgorithm: Type {type} not in receive data") + type_score = 100 + res.append(type_score) + continue + + for metric in type_metrics: + metric_score = 100 + metric_id = metric["MetricID"] + if metric_id not in self.data[type]: + 
logger.warning(f"WeightedSumAlgorithm: Metric {metric} not in data") + else: + metric_score = self.data[type][metric_id].score + + if metric_score < 100: + self.output_abnormal_metrics[type].add_metric(self.data[type][metric_id]) + + type_score += metric_score * metric["Weight"] + + res.append(type_score) + + instance_score = sum(res) / len(res) + res.append(instance_score) + print("WeightedSumAlgorithm: calculated score: ", res) + return tuple(res) + + +class EwmAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + pass + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + pass + +class CriticAlgorithm(HealthAlgorithm): + def __init__(self, level: Level): + super().__init__(level) + + def preprocessing(self, metrics: Dict[str, HealthMetric]): + pass + + def calculate_this_level(self) -> Tuple[float, float, float, float, float]: + pass + +def choose_algorithm(alg_setting: str, level: Level) -> HealthAlgorithm: + algorithm_mapping = { + "default": DefaultHealthAlgorithm, + "weightedSum": WeightedSumAlgorithm, + "ewm": EwmAlgorithm, + "critic": CriticAlgorithm + } + + algorithm_class = algorithm_mapping.get(alg_setting, DefaultHealthAlgorithm) + algorithm_instance = algorithm_class(level) + + try: + algorithm_instance.register_metric_from_settings() + except Exception as e: + logger.error(f"Algorithm {alg_setting} init failed: {e}") + raise e + + logger.info(f"选择算法: {algorithm_class.__name__},层级: {level}") + + return algorithm_instance + diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py b/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..16ef14a828c92d99dba812c2d61e1dfc97130a93 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/calculator.py @@ -0,0 +1,129 @@ +import time +import copy +from 
typing import Dict, Optional +from conf.settings import * +from multiprocessing import Process +from schedule import Scheduler +from os import getpid, kill +from clogger import logger +from sysom_utils import SysomFramework, GCache +from lib.common_type import Level +from app.health_calculator.instance import construct_cluster_infos +from app.health_calculator.algorithm.health_algorithm import choose_algorithm, HealthAlgorithm +from app.crud import del_all_abnormal_metrics_data +from app.database import SessionLocal +from metric_reader import dispatch_metric_reader + + +class HealthCalculator(Process): + gcache: Optional[Dict[Level, GCache]] = None + algorithms: Optional[Dict[Level, HealthAlgorithm]] = None + + def __init__( + self, + parent_pid: int = None + ) -> None: + super().__init__(daemon=True) + self.clusterhealth_interval = CALCULATE_INTERVAL + self.clusterhealth_host_schedule: Scheduler = Scheduler() + self.parent_pid = parent_pid + self.cluster_infos = {} + self.metric_reader = dispatch_metric_reader( + "prometheus://" + PROMETHEUS_CONFIG.host + ":" + str(PROMETHEUS_CONFIG.port)) + + @classmethod + def get_gcache(cls) -> Dict[Level, GCache]: + if cls.gcache is None: + cls.gcache = { + Level.Cluster: SysomFramework.gcache(CLUSTER_HEALTH_METRIC_GCACHE), + Level.Node: SysomFramework.gcache(NODE_HEALTH_METRIC_GCACHE), + Level.Pod: SysomFramework.gcache(POD_HEALTH_METRIC_GCACHE) + } + return cls.gcache + + @classmethod + def get_algorithms(cls) -> Dict[Level, HealthAlgorithm]: + if cls.algorithms is None: + cls.algorithms = { + Level.Cluster: choose_algorithm(CLUSTER_ALGORITHM, Level.Cluster), + Level.Node: choose_algorithm(NODE_ALGORITHM, Level.Node), + Level.Pod: choose_algorithm(POD_ALGORITHM, Level.Pod) + } + return cls.algorithms + + def _check_if_parent_is_alive(self): + try: + kill(self.parent_pid, 0) + except OSError: + logger.info(f"Analyzer's parent {self.parent_pid} is exit") + exit(0) + + def _calculate_health(self): + for cluster in 
self.cluster_infos.values(): + cluster.collect_metrics( + HealthCalculator().get_gcache()[Level.Cluster] + ) + for node in cluster.nodes.values(): + node.collect_metrics( + HealthCalculator().get_gcache()[Level.Node] + ) + for pod in node.pods.values(): + # collect metrics from gcache + pod.collect_metrics( + HealthCalculator().get_gcache()[Level.Pod] + ) + # calculate pod health score + pod.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Pod] + )) + # calculate node health score + node.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Node] + )) + # calculate cluster health score + cluster.calculate_health(copy.deepcopy( + HealthCalculator().get_algorithms()[Level.Cluster] + )) + + def calculating_task(self): + start_time = time.time() + + # cleanup abnormal metrics of last round data from mysql + if ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + del_all_abnormal_metrics_data(db) + + try: + self.cluster_infos = construct_cluster_infos(self.metric_reader, + self.clusterhealth_interval) + except Exception as e: + logger.error(f"Failed to construct cluster infos: {e}") + return + + self._calculate_health() + + # cleanup metric set from gcache + HealthCalculator().get_gcache()[Level.Cluster].clean() + HealthCalculator().get_gcache()[Level.Node].clean() + HealthCalculator().get_gcache()[Level.Pod].clean() + + self.last_end_time = time.time() + end_time = time.time() + logger.info(f"Excutaion time: {end_time - start_time}") + + + def run(self) -> None: + logger.info(f'健康度计算守护进程PID: {getpid()}') + + self.calculating_task() + self.clusterhealth_host_schedule.every(self.clusterhealth_interval)\ + .seconds.do(self.calculating_task) + + while True: + self._check_if_parent_is_alive(); + + if self.is_alive(): + self.clusterhealth_host_schedule.run_pending() + else: + break + time.sleep(max(1, int(self.clusterhealth_interval / 2))) \ No newline at end of file diff --git 
a/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py b/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..72cefe343f3b015c22bcedcee200391e5cdfae96 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/health_metric.py @@ -0,0 +1,31 @@ +from pydantic import BaseModel +from collections import OrderedDict +from typing import Optional, Dict, List + +class HealthMetric(BaseModel): + metric_id: str + process_time: float + event_time: float + score: float + value: float + layer: str + cluster: str + node: Optional[str] + pod: Optional[str] + namespace: Optional[str] + +class HealthMetricsMap: + def __init__(self, capacity: int = 5): + self.capacity = capacity + self.metrics_map = OrderedDict() + + def add_metric(self, metric: HealthMetric): + if metric.metric_id in self.metrics_map: + self.metrics_map[metric.metric_id] = metric + else: + if len(self.metrics_map) >= self.capacity: + self.metrics_map.popitem(last=False) + self.metrics_map[metric.metric_id] = metric + self.metrics_map = OrderedDict( + sorted(self.metrics_map.items(), key=lambda item: item[1].score, reverse=True) + ) \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/health_calculator/instance.py b/sysom_server/sysom_cluster_health/app/health_calculator/instance.py new file mode 100644 index 0000000000000000000000000000000000000000..3c49172cf7af4c209c823f344852eb6f1118a5b9 --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/health_calculator/instance.py @@ -0,0 +1,305 @@ +import json +import time +from conf.settings import * +from typing import Dict, List +from datetime import datetime +from clogger import logger +from sysom_utils import SysomFramework +from lib.utils import collect_all_clusters, \ + collect_instances_of_cluster, collect_pods_of_instance +from app.health_calculator.health_metric import HealthMetric, HealthMetricsMap +from 
app.health_calculator.score_result import ScoreResult, ScoreType +from app.schemas import AbnormalMetricsBase +from app.crud import create_abnormal_metrics_data +from app.database import SessionLocal +from abc import ABC, abstractmethod + +gcache_cluster_exporter = SysomFramework.gcache(CLUSTER_METRIC_EXPORTER) +gcache_node_exporter = SysomFramework.gcache(NODE_METRIC_EXPORTER) +gcache_pod_exporter = SysomFramework.gcache(POD_METRIC_EXPORTER) + +class Instance(ABC): + def __init__(self, name: str): + self.name = name + self.metrics = {} # type: Dict[str, HealthMetric] + self.type_score = {"capacity": 100, "load": 100, "latency": 100, "error": 100} + self.score = 100 + + + def _push_score_result(self): + """ + Insert the health score result to gcache + + args: + result: the health score result + """ + raise NotImplementedError("Must implement _push_score_result") + + def _insert_score_result( + self, + level_labels: Dict[str, str], + abnormal_metrics: Dict[str, HealthMetricsMap] + ) -> List[dict]: + score_result = [] + for type, metrics_map in abnormal_metrics.items(): + # store abnormal metrics to as prometheus metrics + if ABNORMAL_METRIC_STORAGE == "prometheus": + for abnormal_metric in metrics_map.metrics_map.values(): + labels = level_labels.copy() + labels["description"] = abnormal_metric.metric_id + labels["type"] = type + score_result.append( + ScoreResult( + labels, abnormal_metric.score, + abnormal_metric.value, ScoreType.MetricScore + ).to_dict() + ) + # store abnormal metrics to as mysql data + elif ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + for metric_id, abnormal_metric in metrics_map.metrics_map.items(): + abnormal_metric_data = AbnormalMetricsBase( + metric_id=metric_id, + metric_type=type, + cluster=level_labels["cluster"], + instance=level_labels.get("instance", ""), + namespace=level_labels.get("namespace", ""), + pod=level_labels.get("pod", ""), + score=abnormal_metric.score, + value=abnormal_metric.value, + 
timestamp=time.time() + ) + create_abnormal_metrics_data(db, abnormal_metric_data) + + for type, score in self.type_score.items(): + labels = level_labels.copy() + labels["type"] = type + score_result.append( + ScoreResult( + labels, score, 0, ScoreType.MetricTypeScore + ).to_dict() + ) + + score_result.append( + ScoreResult( + level_labels, self.score, 0, ScoreType.InstanceScore + ).to_dict() + ) + + return score_result + + def _validate_metric_time(self, metric: HealthMetric): + now = time.time() + if (now - metric.event_time) > 2 * CALCULATE_INTERVAL: + event_time = datetime.fromtimestamp(metric.event_time) + now_datetime = datetime.fromtimestamp(now) + logger.warning(f"Metric {metric.metric_id} is too old, " + f"event_time: {event_time} now: {now_datetime}") + return False + + return True + + def _add_metric(self, metric: HealthMetric): + if not self._validate_metric_time(metric): + return + + metric_id = metric.metric_id + # multiple metrics with the same metric_id, use the worst one + if metric_id in self.metrics: + if metric.score >= self.metrics[metric_id].score: + return + + self.metrics[metric.metric_id] = metric + + def _collect_metrics_from_gcacge(self, key, gcache): + metrics_list = gcache.get_list(key) + for metric_data in metrics_list: + health_metric = HealthMetric(**metric_data) + self._add_metric(health_metric) + + def _lower_level_instances_score(self) -> List[float]: + return [] + + def calculate_health(self, algorithm): + try: + algorithm.preprocessing(self.metrics) + + ( + self.type_score["capacity"], + self.type_score["load"], + self.type_score["latency"], + self.type_score["error"], + this_level_score + ) = algorithm.calculate_this_level() + + lower_instances_score = self._lower_level_instances_score() + lower_level_score = algorithm.calculate_lower_level( + lower_instances_score + ) + + abnormal_metrics = algorithm.get_abnormal_metrics() + except Exception as e: + logger.error(f"Calculate {self.name} health failed: {e}") + # set score to -1 
to indicate the health score is invalid + self.score = -1 + return + + self.score = min(lower_level_score, this_level_score) + self._push_score_result(abnormal_metrics) + + @abstractmethod + def collect_metrics(self, gcache): + pass + + +# Container Level is not implemented yet +class Container(Instance): + def __init__(self, name: str, pod: Instance): + self.pod = pod + super().__init__(name) + +class Pod(Instance): + def __init__(self, name: str, namespace: str, node: Instance): + self.node = node + self.namespace = namespace + self.containers = {} + super().__init__(name) + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.node.cluster.name, + "instance": self.node.name, + "pod": self.name, + "namespace": self.namespace, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + gcache_pod_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.node.cluster.name + ":" + self.name + ":" + self.namespace + self._collect_metrics_from_gcacge(key, gcache) + + def _lower_level_instances_score(self) -> List[float]: + return super()._lower_level_instances_score() + + def add_container(self, container: Container): + self.containers[container.name] = container + + +class Node(Instance): + def __init__(self, name: str, cluster: Instance): + self.pods = {} + self.cluster = cluster + super().__init__(name) + + def add_pod(self, pod: Pod): + self.pods[pod.name] = pod + + def find_pod(self, pod_name: str) -> Pod: + return self.pods[pod_name] + + def _lower_level_instances_score(self) -> List[float]: + return [ + pod.score + for pod in self.pods.values() + if 0 <= pod.score <= 100 + ] + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.cluster.name, + "instance": self.name, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + 
gcache_node_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.cluster.name + ":" + self.name + self._collect_metrics_from_gcacge(key, gcache) + + +class Cluster(Instance): + def __init__(self, name: str): + self.nodes = {} + super().__init__(name) + + def add_node(self, node: Node): + self.nodes[node.name] = node + + def find_node(self, node_name: str) -> Node: + return self.nodes[node_name] + + def _lower_level_instances_score(self) -> List[float]: + return [ + node.score + for node in self.nodes.values() + if 0 <= node.score <= 100 + ] + + def _lower_level_type_score(self, type: str) -> List[float]: + return [ + node.type_score[type] + for node in self.nodes.values() + if 0 <= node.type_score[type] <= 100 + ] + + def _push_score_result(self, abnormal_metrics: Dict[str, HealthMetricsMap]): + level_labels = { + "cluster": self.name, + } + + score_result = self._insert_score_result(level_labels, abnormal_metrics) + gcache_cluster_exporter.store(self.name, json.dumps(score_result)) + + def collect_metrics(self, gcache): + key = self.name + self._collect_metrics_from_gcacge(key, gcache) + + def calculate_health(self, algorithm): + try: + for type in self.type_score.keys(): + self.type_score[type] = algorithm.calculate_lower_level( + self._lower_level_type_score(type) + ) + + self.score = algorithm.calculate_lower_level( + self._lower_level_instances_score() + ) + except Exception as e: + logger.error(f"Calculate {self.name} health failed: {e}") + # set score to -1 to indicate the health score is invalid + self.score = -1 + return + + self._push_score_result({}) + + +def construct_cluster_infos(metric_reader, interval) -> Dict[str, Cluster]: + """ + Construct cluster infos from prometheus metrics + """ + res = {} + clusters = collect_all_clusters(metric_reader) + if len(clusters) == 0 or NO_CLUSTER_LABEL is True: + clusters.append("default") + + for cluster in clusters: + cluster_instance = Cluster(cluster) + nodes 
= collect_instances_of_cluster(cluster, + metric_reader, interval) + for node in nodes: + node_instance = Node(node, cluster_instance) + pods = collect_pods_of_instance(node, + metric_reader, interval) + for pod, ns in pods: + pod_instance = Pod(pod, ns, node_instance) + node_instance.add_pod(pod_instance) + cluster_instance.add_node(node_instance) + + res[cluster] = cluster_instance + + return res + + \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/lib/score_result.py b/sysom_server/sysom_cluster_health/app/health_calculator/score_result.py similarity index 93% rename from sysom_server/sysom_cluster_health/lib/score_result.py rename to sysom_server/sysom_cluster_health/app/health_calculator/score_result.py index 341a3157e553198d3e3f62adf85cb5108504ca00..13a891c5d260dc939299cd3b64f85abd35753868 100644 --- a/sysom_server/sysom_cluster_health/lib/score_result.py +++ b/sysom_server/sysom_cluster_health/app/health_calculator/score_result.py @@ -1,7 +1,6 @@ -from typing import List, Dict, NewType -from enum import Enum from dataclasses import dataclass - +from enum import Enum +from typing import Dict, NewType, List class ScoreType(Enum): MetricScore = 1 @@ -41,4 +40,4 @@ TypeResult = NewType('TypeResult', Dict[str, List[ScoreResult]]) @dataclass class LevelResults: labels: Dict[str, str] - results: TypeResult + results: TypeResult \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/models.py b/sysom_server/sysom_cluster_health/app/models.py index aebfe897ce473c3a527ff9352a0f79429da068e1..04160af0bdd7a62b738821bd21579f08bdfd0d4d 100644 --- a/sysom_server/sysom_cluster_health/app/models.py +++ b/sysom_server/sysom_cluster_health/app/models.py @@ -6,7 +6,7 @@ Email mfeng@linux.alibaba.com File models.py Description: """ -from sqlalchemy import Column, Integer, String, DateTime +from sqlalchemy import Column, Integer, String, DateTime, Float from sqlalchemy.sql import func from app.database import Base @@ -15,10 +15,30 
@@ from app.database import Base # Define databse model here ########################################################################### -# @reference https://fastapi.tiangolo.com/zh/tutorial/sql-databases/ -# class Person(Base): -# __tablename__ = "sys_person" -# id = Column(Integer, primary_key=True) -# name = Column(String(254), unique=True) -# age = Column(Integer) -# created_at = Column(DateTime(timezone=True), server_default=func.now()) \ No newline at end of file +class BaseModel: + uuid = Column(String(128), primary_key=True, unique=True) + metric_id = Column(String(256)) + metric_type = Column(String(128)) + score = Column(Float) + value = Column(Float) + timestamp = Column(Float, default=func.time()) + +class AbnormalMetricsCluster(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_cluster" + + cluster = Column(String(256)) + +class AbnormalMetricsNode(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_node" + + cluster = Column(String(256)) + instance = Column(String(256)) + +class AbnormalMetricsPod(Base, BaseModel): + __tablename__ = "sys_abnormal_metrics_pod" + + cluster = Column(String(256)) + instance = Column(String(256)) + pod = Column(String(256)) + namespace = Column(String(256)) + diff --git a/sysom_server/sysom_cluster_health/app/routers/metrics.py b/sysom_server/sysom_cluster_health/app/routers/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..fe28918989ee27e5c9291299aa56116f7e78b7ca --- /dev/null +++ b/sysom_server/sysom_cluster_health/app/routers/metrics.py @@ -0,0 +1,122 @@ +import json +from conf.settings import * +from prometheus_client import Gauge, CollectorRegistry, generate_latest +from fastapi import APIRouter, HTTPException, Response +from app.health_calculator.score_result import ScoreType +from sysom_utils import SysomFramework +from clogger import logger + +CLUSTER_HEALTH_SCORE_LABEL = ["cluster", "type"] +CLUSTER_HEALTH_METRIC_LABEL = ["cluster", "type", "description", "mode"] 
+NODE_HEALTH_SCORE_LABEL = ["cluster", "instance", "type"] +NODE_HEALTH_METRIC_LABEL = [ + "cluster", + "instance", + "type", + "description", + "mode"] +POD_HEALTH_SCORE_LABEL = ["cluster", "instance", "pod", "namespace", "type"] +POD_HEALTH_METRIC_LABEL = [ + "cluster", + "instance", + "type", + "pod", + "namespace", + "description", + "mode"] + + +registry = CollectorRegistry() +cluster_health_score = Gauge('sysom_cluster_health_score', + 'sysom cluster health score', + CLUSTER_HEALTH_SCORE_LABEL, + registry=registry) +cluster_health_metric = Gauge('sysom_cluster_health_metric', + 'sysom cluster health metric', + CLUSTER_HEALTH_METRIC_LABEL, + registry=registry) +node_health_score = Gauge('sysom_node_health_score', + 'sysom node health score', + NODE_HEALTH_SCORE_LABEL, + registry=registry) +node_health_metric = Gauge('sysom_node_health_metric', + 'sysom node health metric', + NODE_HEALTH_METRIC_LABEL, + registry=registry) +pod_health_score = Gauge('sysom_pod_health_score', + 'sysom pod health score', + POD_HEALTH_SCORE_LABEL, + registry=registry) +pod_health_metric = Gauge('sysom_pod_health_metric', + 'sysom pod health score', + POD_HEALTH_METRIC_LABEL, + registry=registry) + + +router = APIRouter() + +@router.get("/metrics") +def get_metrics(): + # pull health score metric from redis and push to prometheus + g_cache_cluster = SysomFramework.gcache(CLUSTER_METRIC_EXPORTER) + g_cache_instance = SysomFramework.gcache(NODE_METRIC_EXPORTER) + g_cache_pod = SysomFramework.gcache(POD_METRIC_EXPORTER) + + try: + cluster_all = g_cache_cluster.load_all() + nodes_all = g_cache_instance.load_all() + pods_all = g_cache_pod.load_all() + + if len(cluster_all) <= 0 or len(nodes_all) <= 0: + return Response(generate_latest(registry), media_type="text/plain") + + def process_metrics(metrics_all, score_labels, metric_labels, + health_score, health_metric, cache): + for item, results in metrics_all.items(): + metrics = json.loads(results) + # the last element is the health score 
+ for metric in metrics: + if metric["type"] == ScoreType.MetricScore.value: + labels = [metric["labels"][label] + for label in metric_labels[:-1]] + ["score"] + # return score of each metric + health_metric.labels(*labels).set(metric["score"]) + # return value of each metric + labels[-1] = "value" + health_metric.labels(*labels).set(metric["value"]) + elif metric["type"] == ScoreType.MetricTypeScore.value: + labels = [metric["labels"][label] + for label in score_labels] + # return score of each metric + health_score.labels(*labels).set(metric["score"]) + elif metric["type"] == ScoreType.InstanceScore.value: + labels = [metric["labels"][label] + for label in score_labels[:-1]] + ["total"] + # return score of each metric + health_score.labels(*labels).set(metric["score"]) + + # delete metrics from redis + cache.delete(item) + + process_metrics(cluster_all, CLUSTER_HEALTH_SCORE_LABEL, + CLUSTER_HEALTH_METRIC_LABEL, + cluster_health_score, + cluster_health_metric, g_cache_cluster) + process_metrics(nodes_all, NODE_HEALTH_SCORE_LABEL, + NODE_HEALTH_METRIC_LABEL, + node_health_score, + node_health_metric, g_cache_instance) + process_metrics(pods_all, POD_HEALTH_SCORE_LABEL, + POD_HEALTH_METRIC_LABEL, + pod_health_score, + pod_health_metric, g_cache_pod) + + except Exception as e: + logger.error("Exception: ", e) + raise HTTPException(status_code=400, detail=str(e)) + finally: + g_cache_cluster.clean() + g_cache_instance.clean() + g_cache_pod.clean() + + return Response(generate_latest(registry), media_type="text/plain") \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/app/schemas.py b/sysom_server/sysom_cluster_health/app/schemas.py index 5d260eaf218780f4049e06080c1d474d51455570..86949263d20b3dd195aff133d1485d4daf3b2156 100644 --- a/sysom_server/sysom_cluster_health/app/schemas.py +++ b/sysom_server/sysom_cluster_health/app/schemas.py @@ -7,7 +7,7 @@ File schemas.py Description: """ from pydantic import BaseModel -from datetime import datetime 
+from typing import Optional ########################################################################### # Define schemas here @@ -21,4 +21,16 @@ from datetime import datetime # created_at: datetime # class Config: -# orm_mode = True \ No newline at end of file +# orm_mode = True + + +class AbnormalMetricsBase(BaseModel): + metric_id: str + metric_type: str + cluster: str + instance: Optional[str] + namespace: Optional[str] + pod: Optional[str] + score: float + value: float + timestamp: float \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py b/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..60a4e73e4db57cd39ac16b7dfe04d48906b300d5 --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/clusterhealth_settings.py @@ -0,0 +1,148 @@ +CALCULATE_INTERVAL = 30 + +# where to store the abnormal metrics list +# mysql: store in mysql database +# prometheus: expose as prometheus metrics +ABNORMAL_METRIC_STORAGE = "mysql" + +# Health Algorithm for calculating the specific level +# default: default algorithm +# weightedSum: weighted sum algorithm +CLUSTER_ALGORITHM = "default" +NODE_ALGORITHM = "default" +POD_ALGORITHM = "default" + + +################################################################################# +# Cluster Health Metrics +################################################################################# + +CLUSTER_HEALTH_METRICS = [] + +################################################################################# +# Node Health Metrics +################################################################################# + +NODE_HEALTH_METRICS = [ + ########################################## + # Node Saturation Metrics + ######################################### + { + "MetricID": "Node file descriptor util", + "Type": "capacity", + "Weight": 0.2, + }, + { + "MetricID": "Node memory util", + "Type": "capacity", + 
"Weight": 0.1, + }, + { + "MetricID": "Node cpu util", + "Type": "capacity", + "Weight": 0.1, + }, + { + "MetricID": "Node sys util", + "Type": "capacity", + "Weight": 0.3, + }, + { + "MetricID": "Node rootfs util", + "Type": "capacity", + "Weight": 0.1, + }, + { + "MetricID": "Node rootfs inode util", + "Type": "capacity", + "Weight": 0.2, + }, + ######################################### + # Node load Metrics + ######################################### + { + "MetricID": "Node load average", + "Type": "load", + "Weight": 1.0, + }, + ######################################### + # Node latency Metrics + ######################################### + { + "MetricID": "Node sched latency", + "Type": "latency", + "Weight": 1.0, + }, + ######################################### + # Node Error Metrics + ######################################### + { + "MetricID": "Node OOM count", + "Type": "error", + "Weight": 1.0, + }, +] + +################################################################################# +# Pod Health Metrics +################################################################################# + +POD_HEALTH_METRICS = [ + ######################################### + # Pod Capacity Metrics + ######################################### + { + "MetricID": "Pod memory util", + "Type": "capacity", + "Weight": 0.3, + }, + { + "MetricID": "Pod cpu util", + "Type": "capacity", + "Weight": 0.2, + }, + { + "MetricID": "Pod sys util", + "Type": "capacity", + "Weight": 0.5, + }, + #{ + # "MetricID": "Pod rootfs util", + # "Type": "capacity", + # "Weight": 0.1, + #}, + #{ + # "MetricID": "Pod rootfs inode util", + # "Type": "capacity", + # "Weight": 0.1, + #}, + ######################################### + # Pod Load Metrics + ######################################### + { + "MetricID": "Pod load average", + "Type": "load", + "Weight": 1.0, + }, + ######################################### + # Pod Latency Metrics + ######################################### + { + "MetricID": "Pod 
memory reclaim latency", + "Type": "latency", + "Weight": 1.0, + }, + ######################################### + # Pod Error Metrics + ######################################### + { + "MetricID": "Pod OOM count", + "Type": "error", + "Weight": 0.5, + }, + { + "MetricID": "Pod memory fail count", + "Type": "error", + "Weight": 0.5, + }, +] \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/collector_settings.py b/sysom_server/sysom_cluster_health/conf/collector_settings.py new file mode 100644 index 0000000000000000000000000000000000000000..853541606f31194f6e9bf583a3f250caf3e9df9f --- /dev/null +++ b/sysom_server/sysom_cluster_health/conf/collector_settings.py @@ -0,0 +1,343 @@ +from .common import * + +################################################################################# +# Base Settings +################################################################################# + +# Interval to collect metric from prometheus +COLLECT_INTERVAL = 30 +# If True, use multi-thread to collect and calculate +ENABLE_MULTI_THREAD = False +# Number or thread to use if ENABLE_MULTI_THREAD enable +ANALYZER_PROCESS_NUM = 1 + +################################################################################# +# Global Metric Collecting Settings +################################################################################# + +# the following settings is to specify some label name, in case metric labels changing +CLUSTER_LABEL = "cluster" +POD_LABEL = "pod" +NODE_LABEL = "instance" +NAMESPACE_LABEL = "namespace" +POD_METRIC_TAG = "value" + + +################################################################################# +# Global Alarm and Diagnose Settings +################################################################################# + +# The size of queue which used to send diagnose request between analyzer and diagnose worker +MAX_QUEUE_SIZE = 500 +# Used to merge alarms +ALARM_MERGE_NUM = 10 + 
+################################################################################# +# Cluster Metrics Settings +################################################################################# + +CLUSTER_METRICS = [ +] + +################################################################################# +# Pod Metrics Settings +################################################################################# + +POD_METRICS = [ + { + "MetricID": "Pod memory util", # description of the metric + "Type": "CapacityMetric", + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memUtil", # table name + "related_value": ["usage", "limit"], # specific metric + "standard_type": 2, # 0 = non-standard(custom), 1 = already usage, 2 = (usage/total*100) + }, + "Score": { # settings for calculating metric score + "100": 70, # mem usage >= 70% -- 100分(good) + "70": 80, # mem usage >= 80% -- 70分(warning) + "60": 90, # mem usage >= 90% -- 60分(error) + "0": 100 # mem usage >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Pod cpu util", # description of the metric + "Type": "CapacityMetric", + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_cpuacct_stat", + "related_value": ["total"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 70, # cpu total util >= 70% -- 100分(good) + "70": 80, # cpu total util >= 85% -- 70分(warning) + "60": 90, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Pod sys util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_cpuacct_stat", + "related_value": ["system"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 5, + "70": 10, + "60": 20, + "0": 30, + } + }, + { + "MetricID": "Pod load average", # 
description of the metric + "Type": "LoadMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_proc_stat", + "related_value": ["r_load1min"], + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 0.7, + "70": 1, + "60": 5, + "0": 10 + } + }, + { + "MetricID": "Pod memory reclaim latency", # description of the metric + "Type": "LatencyMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memdrcm_latency", + # 延时区间需要按从小到达填写 + "related_value": ["memDrcm_lat_10to100ms", "memDrcm_lat_100to500ms", "memDrcm_lat_500to1000ms", + "memDrcm_lat_1000ms"], + "standard_type": 2, + }, + "Score": { # settings for calculating metric score + "100": 0, + "70": 100, + "60": 10000, + "0": 100000 + } + }, + #{ + # "MetricID": "Pod memory compact latency", # description of the metric + # "Type": "LatencyMetric", # metric type + # "Collect": { # settings for collecting and preprocessing metric + # "metric_name": "sysom_container_memmcmp_latency", + # 延时区间需要按从小到达填写 + # "related_value": ["memDcmp_lat_10to100ms", "memDcmp_lat_100to500ms", "memDcmp_lat_500to1000ms", + # "memDcmp_lat_1000ms"], + # "standard_type": 2, + # }, + # "Score": { # settings for calculating metric score + # "100": 0, + # "70": 100, + # "60": 10000, + # } + #}, + { + "MetricID": "Pod OOM count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memory_oomcnt", + "related_value": ["oom_kill"], + "standard_type": 1, + }, + "Score": { + "100": 0, + "60": 1, + "0": 5, + } + }, + { + "MetricID": "Pod memory fail count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_container_memfail_cnt", + "related_value": ["fail_cnt"], + 
"standard_type": 1, + }, + "Score": { + "100": 0, + "80": 10, + "60": 50, + "0": 100, + } + }, + #{ + # "MetricID": "Pod cpu throttled count", # description of the metric + # "Type": "ErrorMetric", # metric type + # "Collect": { # settings for collecting and preprocessing metric + # "metric_name": "sysom_container_cpu_stat", + # "related_value": ["nr_throttled"], + # "standard_type": 1, + # }, + # "Score": { + # "100": 0, + # "60": 1, + # "0": 5, + # } + #} +] + +################################################################################# +# Nodes Metrics Settings +################################################################################# + +NODE_METRICS = [ + { + "MetricID": "Node file descriptor util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_file_descriptor", + "related_value": ["file-nr", "file-max"], + "node_tag_name": "type", + "standard_type": 0, + "filename": "node_fd_util", + }, + "Score": { # settings for calculating metric score + "100": 40, # fd util >= 50% -- 100分(good) + "60": 60, # fd util >= 85% -- 70分(warning) + "30": 80, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + }, + "Alarm": { # settings for alerting and diagnosing + "threshold": 30, + "diagnose_type": "custom", + "service_name": "command" + } + }, + { + "MetricID": "Node memory util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_meminfo", # table name + "related_value": ["MemAvailable", "MemTotal"], # specific metric + "node_tag_name": "value", + "standard_type": 3, # 0 = non-standard, 1 = already usage, 2 = (usage/total*100) + }, + "Score": { # settings for calculating metric score + "100": 70, # mem usage >= 70% -- 100分(good) + "70": 80, # mem usage >= 80% -- 70分(warning) + "60": 90, # mem usage >= 90% 
-- 60分(error) + "0": 100 # mem usage >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node cpu util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_cpu_total", + "related_value": ["idle"], + "node_tag_name": "mode", + "standard_type": 0, + "filename": "node_cpu_util", + }, + "Score": { # settings for calculating metric score + "100": 70, # cpu total util >= 70% -- 100分(good) + "70": 80, # cpu total util >= 85% -- 70分(warning) + "60": 90, # cpu total util >= 90% -- 60分(error) + "0": 100 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node sys util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_cpu_total", + "related_value": ["sys"], + "node_tag_name": "mode", + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 5, + "70": 10, + "60": 20, + "0": 30 + } + }, + { + "MetricID": "Node rootfs util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_fs_stat", + "related_value": ["f_bavail", "f_blocks", "f_bfree"], + "node_tag_name": "counter", + "standard_type": 0, + "filename": "node_rootfs_util", + }, + "Score": { # settings for calculating metric score + "100": 50, + "70": 70, + "60": 90, + "0": 95 + } + }, + { + "MetricID": "Node rootfs inode util", # description of the metric + "Type": "CapacityMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_fs_stat", + "related_value": ["f_favail", "f_files"], + "node_tag_name": "counter", + "standard_type": 0, + "filename": "node_rootfs_inode_util", + }, + "Score": { # settings for calculating metric score + "100": 50, + "70": 70, + "60": 90, + "0": 95 + } + }, + { + 
"MetricID": "Node load average", # description of the metric + "Type": "LoadMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_loadavg", + "related_value": ["load1"], + "node_tag_name": "value", + "standard_type": 0, + "filename": "node_load_avg", + }, + "Score": { # settings for calculating metric score + "100": 1, # cpu total util >= 70% -- 100分(good) + "70": 5, # cpu total util >= 85% -- 70分(warning) + "60": 10, # cpu total util >= 90% -- 60分(error) + "0": 20 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node sched latency", # description of the metric + "Type": "LatencyMetric", # metric type + "Collect": { + "metric_name": "sysom_cpu_dist", + "related_value": ["ms10","ms100","s1"], + "node_tag_name": "value", + "standard_type": 2, + }, + "Score": { + "100": 40, # cpu total util >= 70% -- 100分(good) + "70": 100, # cpu total util >= 85% -- 70分(warning) + "30": 150, # cpu total util >= 90% -- 60分(error) + "0": 200 # cpu total util >= 95% -- 0分(fatel) + } + }, + { + "MetricID": "Node OOM count", # description of the metric + "Type": "ErrorMetric", # metric type + "Collect": { # settings for collecting and preprocessing metric + "metric_name": "sysom_proc_vmstat", + "related_value": ["oom_kill"], + "node_tag_name": "value", + "standard_type": 1, + }, + "Score": { # settings for calculating metric score + "100": 0, + "60": 1, + "0": 5, + } + } +] \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/common.py b/sysom_server/sysom_cluster_health/conf/common.py index 974e00045ef23040b85c1af4561058470555fa3f..4b280c6496d96e783d2f3b30bc27de0a584981c0 100644 --- a/sysom_server/sysom_cluster_health/conf/common.py +++ b/sysom_server/sysom_cluster_health/conf/common.py @@ -30,4 +30,20 @@ SysomFramework.init(YAML_CONFIG) SQLALCHEMY_DATABASE_URL = ( f"{mysql_config.dialect}+{mysql_config.engine}://{mysql_config.user}:{mysql_config.password}@" 
f"{mysql_config.host}:{mysql_config.port}/{mysql_config.database}" -) \ No newline at end of file +) + +################################################################## +# Cec settings +################################################################## +# 健康度接收SYSOM_HEALTH_METRIC格式异常指标主题 +CEC_TOPIC_SYSOM_HEALTH_METRIC = "SYSOM_HEALTH_METRIC" + +################################################################## +# gcache settings +################################################################## +CLUSTER_HEALTH_METRIC_GCACHE = "cluster_health_metrics" +NODE_HEALTH_METRIC_GCACHE = "node_health_metrics" +POD_HEALTH_METRIC_GCACHE = "pod_health_metrics" +CLUSTER_METRIC_EXPORTER = "cluster_metric_exporter" +NODE_METRIC_EXPORTER = "node_metric_exporter" +POD_METRIC_EXPORTER = "pod_metric_exporter" \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/metric_settings.py b/sysom_server/sysom_cluster_health/conf/metric_settings.py deleted file mode 100644 index 8ee56e57bfb9054ab7f8183c87a99dc97e3381c7..0000000000000000000000000000000000000000 --- a/sysom_server/sysom_cluster_health/conf/metric_settings.py +++ /dev/null @@ -1,418 +0,0 @@ -from .common import * - -################################################################################# -# Base Settings -################################################################################# - -# Prometheus to collect metrics -PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus -# Interval to collect metric from prometheus and calculate health score -CALCULATE_INTERVAL = 30 -# If True, use multi-thread to collect and calculate -ENABLE_MULTI_THREAD = False -# Number or thread to use if ENABLE_MULTI_THREAD enable -ANALYZER_PROCESS_NUM = 1 -# No Cluster Label in metric, assume all metric is in one cluster -NO_CLUSTER_LABEL = True - -################################################################################# -# Global Metric Collecting Settings 
-################################################################################# - -# the following settings is to specify some label name, in case metric labels changing -CLUSTER_LABEL = "cluster" -POD_LABEL = "pod" -NODE_LABEL = "instance" -NAMESPACE_LABEL = "namespace" -POD_METRIC_TAG = "value" - -################################################################################# -# Global Metric Weights Settings -################################################################################# - -# the following settings is to specify weight calculating method of each level -# Worst: the type score is the lowest socre of a metric of this metric type -# Equal: all metric of one metric type has equal weight -# WeightedSum: custom weight of in metric settings, should make sure sum of all metrics' -# weight equal to 1 -# Auto: use built-in weight algorithm -POD_WEIGHT_METHOD = "Worst" # Worst, Equal, WeightedSum, Auto -NODE_WEIGHT_METHOD = "Worst" # Worst, Equal, WeightedSum, Auto -CLUSTER_WEIGHT_METHOD = "Worst" # Worst, Equal, WeightedSum, Auto - -################################################################################# -# Global Alarm and Diagnose Settings -################################################################################# - -# The size of queue which used to send diagnose request between analyzer and diagnose worker -MAX_QUEUE_SIZE = 500 -# Used to merge alarms -ALARM_MERGE_NUM = 10 - - -################################################################################# -# Cluster Metrics Settings -################################################################################# - -CLUSTER_METRICS = { - "CapacityMetric": [], - "LoadMetric": [], - "LatencyMetric": [], - "ErrorMetric": [] -} - -################################################################################# -# Pod Metrics Settings -################################################################################# - -POD_METRICS = { - "CapacityMetric": [ # the weight of all 
capacity metrci must euqal to 1 - { - "Description": "Pod memory util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_memUtil", # table name - "related_value": ["usage", "limit"], # specific metric - "standard_type": 2, # 0 = non-standard(custom), 1 = already usage, 2 = (usage/total*100) - }, - "Score": { # settings for calculating metric score - "weight": 0.3, # weight of the metric - "score": { # 分数:指标值 - "100": 70, # mem usage >= 70% -- 100分(good) - "70": 80, # mem usage >= 80% -- 70分(warning) - "60": 90, # mem usage >= 90% -- 60分(error) - "0": 100 # mem usage >= 95% -- 0分(fatel) - } - } - }, - { - "Description": "Pod cpu util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_cpuacct_stat", - "related_value": ["total"], - "standard_type": 1, - }, - "Score": { # settings for calculating metric score - "weight": 0.2, - "score": { - "100": 70, # cpu total util >= 70% -- 100分(good) - "70": 80, # cpu total util >= 85% -- 70分(warning) - "60": 90, # cpu total util >= 90% -- 60分(error) - "0": 100 # cpu total util >= 95% -- 0分(fatel) - } - } - }, - { - "Description": "Pod sys util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_cpuacct_stat", - "related_value": ["system"], - "standard_type": 1, - }, - "Score": { # settings for calculating metric score - "weight": 0.5, - "score": { - "100": 5, - "70": 10, - "60": 20, - "0": 30, - } - } - }, - ], - "LoadMetric": [ - { - "Description": "Pod load average", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_proc_stat", - "related_value": ["r_load1min"], - "standard_type": 1, - }, - "Score": { # settings for calculating metric score - "weight": 1.0, - "score": { - "100": 0.7, - "70": 1, - "60": 5, - "0": 10 - } 
- } - } - ], - "LatencyMetric": [ - { - "Description": "Pod memory reclaim latency", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_memdrcm_latency", - # 延时区间需要按从小到达填写 - "related_value": ["memDrcm_lat_10to100ms", "memDrcm_lat_100to500ms", "memDrcm_lat_500to1000ms", - "memDrcm_lat_1000ms"], - "standard_type": 2, - }, - "Score": { # settings for calculating metric score - "weight": 1.0, - "score": { - "100": 0, - "70": 100, - "60": 10000, - "0": 100000 - } - } - }, - { - "Description": "Pod memory compact latency", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_memmcmp_latency", - # 延时区间需要按从小到达填写 - "related_value": ["memDcmp_lat_10to100ms", "memDcmp_lat_100to500ms", "memDcmp_lat_500to1000ms", - "memDcmp_lat_1000ms"], - "standard_type": 2, - }, - "Score": { # settings for calculating metric score - "weight": 1.0, - "score": { - "100": 0, - "70": 100, - "60": 10000, - "0": 100000 - } - } - } - ], - "ErrorMetric": [ - { - "Description": "Pod OOM count", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_memory_oomcnt", - "related_value": ["oom_kill"], - "standard_type": 1, - }, - "Score": { - "weight": 1.0, - "score": { - "100": 0, - "60": 1, - "0": 5, - } - } - }, - { - "Description": "Pod memory fail count", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_memfail_cnt", - "related_value": ["fail_cnt"], - "standard_type": 1, - }, - "Score": { - "weight": 1.0, - "score": { - "100": 0, - "80": 10, - "60": 50, - "0": 100, - } - } - }, - { - "Description": "Pod cpu throttled count", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_container_cpu_stat", - "related_value": ["nr_throttled"], - 
"standard_type": 1, - }, - "Score": { - "weight": 1.0, - "score": { - "100": 0, - "60": 1, - "0": 5, - } - } - } - ] -} - -################################################################################# -# Nodes Metrics Settings -################################################################################# - -NODE_METRICS = { - "CapacityMetric": [ # the weight of all capacity metrci must euqal to 1 - { - "Description": "Node file descriptor util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_file_descriptor", - "related_value": ["file-nr", "file-max"], - "node_tag_name": "type", - "standard_type": 0, - "filename": "node_fd_util", - }, - "Score": { # settings for calculating metric score - "weight": 0.2, - "score": { - "100": 40, # fd util >= 50% -- 100分(good) - "60": 60, # fd util >= 85% -- 70分(warning) - "30": 80, # cpu total util >= 90% -- 60分(error) - "0": 100 # cpu total util >= 95% -- 0分(fatel) - } - }, - "Alarm": { # settings for alerting and diagnosing - "threshold": 30, - "diagnose_type": "custom", - "service_name": "command" - }, - }, - { - "Description": "Node memory util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_proc_meminfo", # table name - "related_value": ["MemAvailable", "MemTotal"], # specific metric - "node_tag_name": "value", - "standard_type": 3, # 0 = non-standard, 1 = already usage, 2 = (usage/total*100) - }, - "Score": { # settings for calculating metric score - "weight": 0.1, - "score": { # 分数:指标值 - "100": 70, # mem usage >= 70% -- 100分(good) - "70": 80, # mem usage >= 80% -- 70分(warning) - "60": 90, # mem usage >= 90% -- 60分(error) - "0": 100 # mem usage >= 95% -- 0分(fatel) - } - } - }, - { - "Description": "Node cpu util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_proc_cpu_total", - "related_value": ["idle"], - 
"node_tag_name": "mode", - "standard_type": 0, - "filename": "node_cpu_util", - }, - "Score": { # settings for calculating metric score - "weight": 0.2, - "score": { - "100": 70, # cpu total util >= 70% -- 100分(good) - "70": 80, # cpu total util >= 85% -- 70分(warning) - "60": 90, # cpu total util >= 90% -- 60分(error) - "0": 100 # cpu total util >= 95% -- 0分(fatel) - } - } - }, - { - "Description": "Node sys util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_proc_cpu_total", - "related_value": ["sys"], - "node_tag_name": "mode", - "standard_type": 1, - }, - "Score": { # settings for calculating metric score - "weight": 0.5, - "score": { - "100": 5, - "70": 10, - "60": 20, - "0": 30 - } - } - }, - { - "Description": "Node rootfs util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_fs_stat", - "related_value": ["f_bavail", "f_blocks", "f_bfree"], - "node_tag_name": "counter", - "standard_type": 0, - "filename": "node_rootfs_util", - }, - "Score": { # settings for calculating metric score - "weight": 0.2, - "score": { - "100": 50, - "70": 70, - "60": 90, - "0": 95 - } - } - }, - { - "Description": "Node rootfs inode util", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_fs_stat", - "related_value": ["f_favail", "f_files"], - "node_tag_name": "counter", - "standard_type": 0, - "filename": "node_rootfs_inode_util", - }, - "Score": { # settings for calculating metric score - "weight": 0.2, - "score": { - "100": 50, - "70": 70, - "60": 90, - "0": 95 - } - } - } - ], - "LoadMetric": [ - { - "Description": "Node load average", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_proc_loadavg", - "related_value": ["load1"], - "node_tag_name": "value", - "standard_type": 0, - "filename": 
"node_load_avg", - }, - "Score": { # settings for calculating metric score - "weight": 1.0, - "score": { - "100": 1, # cpu total util >= 70% -- 100分(good) - "70": 5, # cpu total util >= 85% -- 70分(warning) - "60": 10, # cpu total util >= 90% -- 60分(error) - "0": 20 # cpu total util >= 95% -- 0分(fatel) - } - } - } - ], - "LatencyMetric": [ - { - "Description": "Node sched latency", # description of the metric - "Collect": { - "metric_name": "sysom_cpu_dist", - "related_value": ["ms10","ms100","s1"], - "node_tag_name": "value", - "standard_type": 2, - }, - "Score": { - "weight": 1.0, - "score": { - "100": 40, # cpu total util >= 70% -- 100分(good) - "70": 100, # cpu total util >= 85% -- 70分(warning) - "30": 150, # cpu total util >= 90% -- 60分(error) - "0": 200 # cpu total util >= 95% -- 0分(fatel) - } - } - } - ], - "ErrorMetric": [ - { - "Description": "Node OOM count", # description of the metric - "Collect": { # settings for collecting and preprocessing metric - "metric_name": "sysom_proc_vmstat", - "related_value": ["oom_kill"], - "node_tag_name": "value", - "standard_type": 1, - }, - "Score": { # settings for calculating metric score - "weight": 1.0, - "score": { - "100": 0, - "60": 1, - "0": 5, - } - } - } - ] -} \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/conf/settings.py b/sysom_server/sysom_cluster_health/conf/settings.py index cc76abe102639441f4c267750a69a38bf96c3c41..4bc3d893dfab41ce9118b656e3f26b36ca6f419b 100644 --- a/sysom_server/sysom_cluster_health/conf/settings.py +++ b/sysom_server/sysom_cluster_health/conf/settings.py @@ -17,5 +17,11 @@ elif env == "testing": from .testing import * elif env == "product": from .product import * - -from .metric_settings import * \ No newline at end of file + +from .collector_settings import * +from .clusterhealth_settings import * + +# Prometheus to collect metrics +PROMETHEUS_CONFIG = YAML_CONFIG.get_server_config().db.prometheus +# No Cluster Label in metric, assume all metric is in one 
cluster +NO_CLUSTER_LABEL = True \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/lib/algorithm/weight_algorithm.py b/sysom_server/sysom_cluster_health/lib/algorithm/weight_algorithm.py deleted file mode 100644 index b744eb81f89a3705cbeddae7d55efb08cd7599b2..0000000000000000000000000000000000000000 --- a/sysom_server/sysom_cluster_health/lib/algorithm/weight_algorithm.py +++ /dev/null @@ -1,144 +0,0 @@ -import math -import numpy as np -import pandas as pd -from numpy import array -from typing import List, Dict, NewType -from dataclasses import dataclass -from lib.metric_manager import WEIGHT_METHODS, MetricManager -from lib.metric_type.metric_type import Level -from lib.score_result import LevelResults - -TYPES = ["CapacityMetric", "LoadMetric", - "LatencyMetric", "ErrorMetric"] - -TYPEWEIGHTS = [0.2, 0.2, 0.3, 0.3] - - -@dataclass -class TypeWeight: - type: str - weight: float - - -# 用于存储每个level下每个metric type的权重 -LevelTypeWeights = NewType('LevelTypeWeights', - Dict[Level, List[TypeWeight]]) -# 用于存储每个metric type下每个metric的权重 -TypeMetricWeights = NewType('TypeMetricWeights', - Dict[str, List[float]]) - - -class WeightsCalculator: - def __init__(self, metric_manager: MetricManager): - self.metric_manager = metric_manager - self.weights_method = WEIGHT_METHODS - self.type_weights = self.setup_type_weights(metric_manager) - - def setup_type_weights( - self, - metric_manager: MetricManager - ) -> LevelTypeWeights: - - type_weights = {} - types = TYPES - - # 如果某个level没有注册某个metric type,其他metric的权重需要重新计算(等比例放大) - for member in Level.__members__.values(): - # for now, skip cluster level - if member == Level.Cluster: - type_weights[member] = [ - TypeWeight(type, weight) - for type, weight in zip(types, TYPEWEIGHTS) - ] - continue - - weights = TYPEWEIGHTS.copy() - for type in types: - if len(metric_manager.registed_metric[member][type]) <= 0: - missing_index = types.index(type) - weights[missing_index] = 0 - - remaining_weights = sum(weights) - 
adjusted_weights = [ - weight / remaining_weights for weight in weights - ] - combined = [ - TypeWeight( - type, weight) for type, weight in zip( - types, adjusted_weights)] - - type_weights[member] = combined - - return type_weights - - def cal_metric_weights( - self, - level: Level, - data: List[LevelResults] - ) -> TypeMetricWeights: - """ Calculate weight of each metric - - Args: - data: all pods/ins's LevelResults - - Returns: - Dict of metric and its weight - """ - - res = TypeMetricWeights({}) - type_weights = self.type_weights[level] - for type_weight in type_weights: - rows = [] - for item in data: - row = [sr["value"] for sr in item.results[type_weight.type]] - rows.append(row) - df = pd.DataFrame(rows) - weights = self.cal_metric_weight_ewm(df) - res[type_weight.type] = weights[0].tolist() - - return res - - def cal_metric_weight_ewm(self, x: pd.DataFrame) -> pd.DataFrame: - '''熵值法计算变量的权重''' - x = x.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x)))) - - rows = x.index.size - cols = x.columns.size - k = 1.0 / math.log(rows) - - lnf = [[None] * cols for i in range(rows)] - - x = array(x) - lnf = [[None] * cols for i in range(rows)] - lnf = array(lnf) - for i in range(0, rows): - for j in range(0, cols): - if x[i][j] == 0: - lnfij = 0.0 - else: - p = x[i][j] / x.sum(axis=0)[j] - lnfij = math.log(p) * p * (-k) - lnf[i][j] = lnfij - lnf = pd.DataFrame(lnf) - E = lnf - - d = 1 - E.sum(axis=0) - - w = [[None] * 1 for i in range(cols)] - for j in range(0, cols): - wj = d[j] / sum(d) - w[j] = wj - - w = pd.DataFrame(w) - return w - - def cal_weights_critic(self, df: pd.DataFrame) -> pd.DataFrame: - X = df.values - X_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) - - sigma = np.std(X_norm, axis=0) - corr = np.corrcoef(X_norm.T) - C = sigma * np.sum(1 - corr, axis=0) - weights = C / np.sum(C) - - return weights diff --git a/sysom_server/sysom_cluster_health/lib/common_type.py b/sysom_server/sysom_cluster_health/lib/common_type.py new file 
mode 100644 index 0000000000000000000000000000000000000000..c1698ad62b39aba3c8ae7d619d223578e227da3d --- /dev/null +++ b/sysom_server/sysom_cluster_health/lib/common_type.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Optional + +@dataclass +class Labels: + cluster: str + instance: Optional[str] + namespace: Optional[str] + pod: Optional[str] + + def __init__(self, cluster, instance=None, namespace=None, pod=None): + self.cluster = cluster + self.instance = instance + self.namespace = namespace + self.pod = pod + +class Level(Enum): + Cluster = "cluster" + Node = "node" + Pod = "pod" \ No newline at end of file diff --git a/sysom_server/sysom_cluster_health/main.py b/sysom_server/sysom_cluster_health/main.py index c6865bd521e9914281ead53e79c3f5e36be7afd1..7bc013db2f012fa7c00222af910334ca8c3d7e0d 100644 --- a/sysom_server/sysom_cluster_health/main.py +++ b/sysom_server/sysom_cluster_health/main.py @@ -1,74 +1,24 @@ # -*- coding: utf-8 -*- # - -import json -import conf.settings as settings +from conf.settings import * from os import getpid from multiprocessing import Queue from clogger import logger -from fastapi import FastAPI, HTTPException, Response -from prometheus_client import Gauge, CollectorRegistry, generate_latest +from fastapi import FastAPI from conf.settings import YAML_CONFIG from sysom_utils import CmgPlugin, SysomFramework -from lib.score_result import ScoreType -from lib.metric_manager import MetricManager -from lib.algorithm.weight_algorithm import WeightsCalculator -from app.routers import health -from app.calculator.analyzer import Analyzer -from app.calculator.diagnose_worker import DiagnoseWorker +from app.collector.metric_manager import MetricManager +from app.routers import health, metrics +from app.collector.collector import Collector +from app.health_calculator.calculator import HealthCalculator +from app.diagnose.diagnose_worker import DiagnoseWorker +from app.consumer.consumer import 
HealthMetricListener +from app.crud import del_all_abnormal_metrics_data +from app.database import SessionLocal app = FastAPI() app.include_router(health.router, prefix="/api/v1/cluster_health/health") -# app.include_router(health.router, prefix="/api/v1/cluster_health/person") - -CLUSTER_HEALTH_SCORE_LABEL = ["cluster", "type"] -CLUSTER_HEALTH_METRIC_LABEL = ["cluster", "type", "description", "mode"] -NODE_HEALTH_SCORE_LABEL = ["cluster", "instance", "type"] -NODE_HEALTH_METRIC_LABEL = [ - "cluster", - "instance", - "type", - "description", - "mode"] -POD_HEALTH_SCORE_LABEL = ["cluster", "instance", "pod", "namespace", "type"] -POD_HEALTH_METRIC_LABEL = [ - "cluster", - "instance", - "type", - "pod", - "namespace", - "description", - "mode"] - -registry = CollectorRegistry() -cluster_health_score = Gauge('sysom_cluster_health_score', - 'sysom cluster health score', - CLUSTER_HEALTH_SCORE_LABEL, - registry=registry) -cluster_health_metric = Gauge('sysom_cluster_health_metric', - 'sysom cluster health metric', - CLUSTER_HEALTH_METRIC_LABEL, - registry=registry) -node_health_score = Gauge('sysom_node_health_score', - 'sysom node health score', - NODE_HEALTH_SCORE_LABEL, - registry=registry) -node_health_metric = Gauge('sysom_node_health_metric', - 'sysom node health metric', - NODE_HEALTH_METRIC_LABEL, - registry=registry) -pod_health_score = Gauge('sysom_pod_health_score', - 'sysom pod health score', - POD_HEALTH_SCORE_LABEL, - registry=registry) -pod_health_metric = Gauge('sysom_pod_health_metric', - 'sysom pod health score', - POD_HEALTH_METRIC_LABEL, - registry=registry) - -g_cache_cluster = SysomFramework.gcache("cluster_metrics") -g_cache_instance = SysomFramework.gcache("instance_metrics") -g_cache_pod = SysomFramework.gcache("pod_metrics") +app.include_router(metrics.router) ############################################################################# # Write your API interface here, or add to app/routes @@ -81,38 +31,60 @@ def init_framwork(): 
.load_plugin_cls(CmgPlugin) \ .start() logger.info("SysomFramework init finished!") - + + +def cleanup_gcache_data(): + gcache_names = [ + CLUSTER_HEALTH_METRIC_GCACHE, + NODE_HEALTH_METRIC_GCACHE, + POD_HEALTH_METRIC_GCACHE, + CLUSTER_METRIC_EXPORTER, + NODE_METRIC_EXPORTER, + POD_METRIC_EXPORTER + ] + + for name in gcache_names: + gcache = SysomFramework.gcache(name) + gcache.clean() + + if ABNORMAL_METRIC_STORAGE == "mysql": + with SessionLocal() as db: + del_all_abnormal_metrics_data(db) @app.on_event("startup") async def on_start(): init_framwork() - - # cleanup history data befor startup - g_cache_cluster.clean() - g_cache_instance.clean() - g_cache_pod.clean() - + + cleanup_gcache_data() + # load all registered metrics from settings metric_manager = MetricManager() metric_manager.metric_register() - weight_calculator = WeightsCalculator(metric_manager) - diagnose_queue = Queue(maxsize=settings.MAX_QUEUE_SIZE) + diagnose_queue = Queue(maxsize=MAX_QUEUE_SIZE) pid = getpid(); # start analyzer to collect and calculate health score - Analyzer(clusterhealth_interval=settings.CALCULATE_INTERVAL, - queue=diagnose_queue, - metric_manager=metric_manager, - weight_cal=weight_calculator, - parent_pid=pid).start() - - DiagnoseWorker( - metric_manager=metric_manager, - queue=diagnose_queue, - parent_pid=pid).start() - - logger.info("集群健康度计算定时任务已启动") + try: + Collector( + queue=diagnose_queue, + metric_manager=metric_manager, + parent_pid=pid + ).start() + + HealthCalculator( + parent_pid=pid + ).start() + + DiagnoseWorker( + metric_manager=metric_manager, + queue=diagnose_queue, + parent_pid=pid).start() + + HealthMetricListener().start() + logger.info("集群健康度定时任务已启动") + except Exception as e: + logger.exception(e) ########################################################################## # Perform some microservice initialization operations over here @@ -121,69 +93,4 @@ async def on_start(): @app.on_event("shutdown") async def on_shutdown(): - pass - - 
-@app.get("/metrics") -def get_metrics(): - # pull health score metric from redis and push to prometheus - try: - cluster_all = g_cache_cluster.load_all() - nodes_all = g_cache_instance.load_all() - pods_all = g_cache_pod.load_all() - - if len(cluster_all) <= 0 or len(nodes_all) <= 0: - return Response(generate_latest(registry), media_type="text/plain") - - def process_metrics(metrics_all, score_labels, metric_labels, - health_score, health_metric, cache): - for item, results in metrics_all.items(): - metrics = json.loads(results) - - # the last element is the health score - for metric in metrics: - if metric["type"] == ScoreType.MetricScore.value: - labels = [metric["labels"][label] - for label in metric_labels[:-1]] + ["score"] - # return score of each metric - health_metric.labels(*labels).set(metric["score"]) - # return value of each metric - labels[-1] = "value" - health_metric.labels(*labels).set(metric["value"]) - elif metric["type"] == ScoreType.MetricTypeScore.value: - labels = [metric["labels"][label] - for label in score_labels] - # return score of each metric - health_score.labels(*labels).set(metric["score"]) - elif metric["type"] == ScoreType.InstanceScore.value: - labels = [metric["labels"][label] - for label in score_labels[:-1]] + ["total"] - # return score of each metric - health_score.labels(*labels).set(metric["score"]) - - # delete metrics from redis - cache.delete(item) - - # Then call the function with the appropriate arguments - process_metrics(cluster_all, CLUSTER_HEALTH_SCORE_LABEL, - CLUSTER_HEALTH_METRIC_LABEL, - cluster_health_score, - cluster_health_metric, g_cache_cluster) - process_metrics(nodes_all, NODE_HEALTH_SCORE_LABEL, - NODE_HEALTH_METRIC_LABEL, - node_health_score, - node_health_metric, g_cache_instance) - process_metrics(pods_all, POD_HEALTH_SCORE_LABEL, - POD_HEALTH_METRIC_LABEL, - pod_health_score, - pod_health_metric, g_cache_pod) - - except Exception as e: - print("Exception: ", e) - raise HTTPException(status_code=400, 
detail=str(e)) - finally: - g_cache_cluster.clean() - g_cache_instance.clean() - g_cache_pod.clean() - - return Response(generate_latest(registry), media_type="text/plain") + cleanup_gcache_data()