From c3c02bd6111d71ce5ab43bc2a672405695f82c7c Mon Sep 17 00:00:00 2001 From: liaozhaoyan Date: Sun, 4 Jun 2023 15:03:22 +0800 Subject: [PATCH] add more info for metrics. --- .../monitor/unity/beaver/guide/metrics.md | 230 +++++++++++++----- source/tools/monitor/unity/etc/base.yaml | 2 +- .../tools/monitor/unity/test/curl/postOSS.py | 9 + source/tools/monitor/unity/yamls/group.yaml | 73 ------ 4 files changed, 175 insertions(+), 139 deletions(-) create mode 100644 source/tools/monitor/unity/test/curl/postOSS.py delete mode 100644 source/tools/monitor/unity/yamls/group.yaml diff --git a/source/tools/monitor/unity/beaver/guide/metrics.md b/source/tools/monitor/unity/beaver/guide/metrics.md index fd667b84..186a0b87 100644 --- a/source/tools/monitor/unity/beaver/guide/metrics.md +++ b/source/tools/monitor/unity/beaver/guide/metrics.md @@ -6,28 +6,128 @@ ------------- -### uptime 表 +### cpu\_total 表 + +* 对应 export 指标: sysak\_proc\_cpu\_total +* 属性标签: mode + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | ---: | :---- | :---- | :--- | -| uptime | 秒 | 从系统启动到现在的时间 | | collector/proc\_uptime.lua | -| idletime | 秒 | 系统总空闲的时间 | | collector/proc\_uptime.lua | -| stamp | 秒 | 系统时间戳 | unix 时间 | collector/proc\_uptime.lua | +| softirq | % | 软中断百分比 | | collector/proc\_stat.lua | +| user | % | 用户态占用率百分比 | | collector/proc\_stat.lua | +| guestnice | % | guestnice百分比 | | collector/proc\_stat.lua | +| guest | % | guest百分比 | | collector/proc\_stat.lua | +| steal | % | steal百分比 | | collector/proc\_stat.lua | +| hardirq | % | 硬中断百分比 | | collector/proc\_stat.lua | +| nice | % | nice百分比 | | collector/proc\_stat.lua | +| idle | % | idle百分比 | | collector/proc\_stat.lua | +| sys | % | sys百分比 | | collector/proc\_stat.lua | +| iowait | % | iowait百分比 | | collector/proc\_stat.lua | + +### cpus 表 + +* 对应 export 指标: sysak\_proc\_cpus +* 属性标签: mode -### uname 表 -每小时获取一次 +| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | +| :--- | ---: | :---- | :---- | :--- | +| cpu_name | | CPU 名称 | | collector/proc\_stat.lua | +| softirq | % | 
软中断百分比 | | collector/proc\_stat.lua | +| user | % | 用户态占用率百分比 | | collector/proc\_stat.lua | +| guestnice | % | guestnice百分比 | | collector/proc\_stat.lua | +| guest | % | guest百分比 | | collector/proc\_stat.lua | +| steal | % | steal百分比 | | collector/proc\_stat.lua | +| hardirq | % | 硬中断百分比 | | collector/proc\_stat.lua | +| nice | % | nice百分比 | | collector/proc\_stat.lua | +| idle | % | idle百分比 | | collector/proc\_stat.lua | +| sys | % | sys百分比 | | collector/proc\_stat.lua | +| iowait | % | iowait百分比 | | collector/proc\_stat.lua | + + +### cpus 表 + +* 对应 export 指标: sysak\_proc\_cpus +* 属性标签: mode + + +| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | +| :--- | ---: | :---- | :---- | :--- | +| cpu_name | | CPU 名称 | | collector/proc\_stat.lua | +| softirq | % | 软中断百分比 | | collector/proc\_stat.lua | +| user | % | 用户态占用率百分比 | | collector/proc\_stat.lua | +| guestnice | % | guestnice百分比 | | collector/proc\_stat.lua | +| guest | % | guest百分比 | | collector/proc\_stat.lua | +| steal | % | steal百分比 | | collector/proc\_stat.lua | +| hardirq | % | 硬中断百分比 | | collector/proc\_stat.lua | +| nice | % | nice百分比 | | collector/proc\_stat.lua | +| idle | % | idle百分比 | | collector/proc\_stat.lua | +| sys | % | sys百分比 | | collector/proc\_stat.lua | +| iowait | % | iowait百分比 | | collector/proc\_stat.lua | + + +### stat\_counters 表 + +* 对应 export 指标: sysak\_proc\_stat\_counters +* 属性标签: counter + +| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | +| :--- | ---: | :---- | :---- | :--- | +| procs_blocked | | D状态任务数量 | | collector/proc\_stat.lua | +| processes_forks | | fork 任务数量 | | collector/proc\_stat.lua | +| btime | | 系统启动时间(unix 时间戳) | | collector/proc\_stat.lua | +| procs_running | | 可运行(R状态)任务数量 | | collector/proc\_stat.lua | +| ctxt | | 上下文切换次数 | | collector/proc\_stat.lua | + + +### proc\_loadavg 表 + +* 对应 export 指标: sysak\_proc\_loadavg +* 属性标签:value | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | |:---------| ---: | :---- | :---- | :--- | -| nodename | - | uname -r | | collector/proc\_uptime.lua | -| version | - | uname -r | | collector/proc\_uptime.lua | -| 
release | - | uname -r | | collector/proc\_uptime.lua | -| machine | - | uname -r | | collector/proc\_uptime.lua | -| sysname | - | uname -r | | collector/proc\_uptime.lua | +| runq | - | rq队列长度 | | collector/proc\_load.lua | +| load1 | - | load1 | | collector/proc\_load.lua | +| load5 | - | load5 | | collector/proc\_load.lua | +| load10 | - | load10 | | collector/proc\_load.lua | +| plit | - | plit | | collector/proc\_load.lua | + + +### proc\_loadavg 表 + +* 对应 export 指标: sysak\_proc\_loadavg +* 属性标签:value + +| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | +|:---------| ---: | :---- | :---- | :--- | +| runq | - | rq队列长度 | | collector/proc\_load.lua | +| load1 | - | load1 | | collector/proc\_load.lua | +| load5 | - | load5 | | collector/proc\_load.lua | +| load10 | - | load10 | | collector/proc\_load.lua | +| plit | - | plit | | collector/proc\_load.lua | + + +### meminfo 表 + +* 对应 export 指标: sysak\_proc\_meminfo +* 属性标签:value + +指标说明参考[/proc/meminfo内存文件详解](https://zhuanlan.zhihu.com/p/145524701) + +### vmstat 表 + +* 对应 export 指标: sysak\_proc\_vmstate +* 属性标签:value + +指标说明参考[/proc/vmstat输出含义](https://blog.csdn.net/kaka__55/article/details/125236633) ### cgroups 表 +* 对应 export 指标: sysak\_cgroups +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | type | - | subsys类型 | | collector/proc\_cgroups.lua | @@ -46,6 +146,9 @@ ### interrupts 表 +* 对应 export 指标: sysak\_interrupts +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | cpu | - | CPU ID | | collector/proc\_interrupts.lua | @@ -53,6 +156,9 @@ ### mounts 表 +* 对应 export 指标: sysak\_fs\_stat +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | fs | - | sysfs | | collector/proc\_mounts.lua | @@ -67,6 +173,9 @@ ### softirqs 表 +* 对应 export 指标: sysak\_softirqs +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | cpu | - | CPU ID | | collector/proc\_softirqs.lua | @@ -82,7 +191,10 @@ | RCU | 次 | 
RCU软中断触发次数 | | collector/proc\_softirqs.lua | ### self_statm 表 -统计监控进程的statm信息 + +* 统计监控进程的statm信息 +* 对应 export 指标: sysak\_self\_statm +* 属性标签:value | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | @@ -100,13 +212,18 @@ ### arp +* 对应 export 指标: sysak\_arp +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | ---: | :---- | :---- | :--- | | count | 个 | 网卡名 | 网卡上对应arp表数量 | collector/proc\_arp.lua | ### networks -这是网卡流量统计信息,已做差值处理 +* 这是网卡流量统计信息,已做差值处理 +* 对应 export 指标: sysak\_proc\_networks +* 属性标签:value | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | ---: | :---- | :---- | :--- | @@ -129,7 +246,9 @@ ### pkt_status -这里统计所有包状态,详细可以通过 pkt_logs 获取 +* 对应 export 指标: sysak\_proc\_pkt\_status +* 属性标签:counter +* 这里统计所有包状态,详细可以通过 pkt\_logs 获取 | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | ---: | :---- | :---- | :--- | @@ -143,6 +262,9 @@ ### sock_stat +* 对应 export 指标: sysak\_sock\_stat +* 属性标签:value + 统计所有包状态。[参考连接](https://developer.aliyun.com/article/484451) | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | @@ -163,7 +285,8 @@ ### softnets -This parser parses the stats from network devices. These stats includes events per cpu\(in row\), number of packets processed i.e packet_process \(first column\), number of packet drops packet\_drops \(second column\), time squeeze eg net\_rx\_action performed time_squeeze\(third column\), cpu collision eg collision occur while obtaining device lock while transmitting cpu\_collision packets \(eighth column\), received_rps number of times cpu woken up received\_rps \(ninth column\), number of times reached flow limit count flow\_limit\_count \(tenth column\), backlog status \(eleventh column\), core id \(twelfth column\). +* 对应 export 指标: sysak\_softnets +* 属性标签:value | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | ---: | :---- | :---- | :--- | @@ -176,6 +299,9 @@ This parser parses the stats from network devices. 
These stats includes events p ### cgroups 表 +* 对应 export 指标: sysak\_cgroups +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | type | - | subsys类型 | | collector/proc\_cgroups.lua | @@ -192,56 +318,6 @@ This parser parses the stats from network devices. These stats includes events p | perf\_event | 个 | perf_event cgroup数量 | | collector/proc\_cgroups.lua | | memory | 个 | memory cgroup数量 | | collector/proc\_cgroups.lua | -### interrupts 表 - -| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | -| :--- | --- | :---- | :---- | :--- | -| cpu | - | CPU ID | | collector/proc\_interrupts.lua | -| 中断名称 | 次 | 中断触发次数 | | collector/proc\_interrupts.lua | - -### mounts 表 - -| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | -| :--- | --- | :---- | :---- | :--- | -| fs | - | sysfs | | collector/proc\_mounts.lua | -| mount | - | 挂载目录 | | collector/proc\_mounts.lua | -| f\_bsize | - | Filesystem block size | | collector/proc\_mounts.lua | -| f\_blocks | - | Size of fs in f_frsize units | | collector/proc\_mounts.lua | -| f\_bfree | - | Number of free blocks | | collector/proc\_mounts.lua | -| f\_bavail | - | Number of free blocks for unprivileged users | | collector/proc\_mounts.lua | -| f\_files | - | Number of inodes | | collector/proc\_mounts.lua | -| f\_ffree | - | Number of free inodes | | collector/proc\_mounts.lua | -| f\_favail | - | Number of free inodes for unprivileged users | | collector/proc\_mounts.lua | - -### softirqs 表 - -| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | -| :--- | --- | :---- | :---- | :--- | -| cpu | - | CPU ID | | collector/proc\_softirqs.lua | -| HI | 次 | HI软中断触发次数 | | collector/proc\_softirqs.lua | -| TIMER | 次 | TIMER软中断触发次数 | | collector/proc\_softirqs.lua | -| NET\_TX | 次 | NET\_TX软中断触发次数 | | collector/proc\_softirqs.lua | -| NET\_RX | 次 | NET\_RX软中断触发次数 | | collector/proc\_softirqs.lua | -| BLOCK | 次 | BLOCK软中断触发次数 | | collector/proc\_softirqs.lua | -| IRQ_POLL | 次 | IRQ\_POLL软中断触发次数 | | collector/proc\_softirqs.lua | -| TASKLET | 次 | TASKLET软中断触发次数 | | 
collector/proc\_softirqs.lua | -| SCHED | 次 | SCHED软中断触发次数 | | collector/proc\_softirqs.lua | -| HRTIMER | 次 | HRTIMER软中断触发次数 | | collector/proc\_softirqs.lua | -| RCU | 次 | RCU软中断触发次数 | | collector/proc\_softirqs.lua | - -### self_statm 表 -统计监控进程的statm信息 - -| 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | -| :--- | --- | :---- | :---- | :--- | -| size | - | total program size | | collector/proc\_statm.lua | -| resident | - | resident set size | | collector/proc\_statm.lua | -| shared | - | number of resident shared pages | | collector/proc\_statm.lua | -| text | - | text (code) | | collector/proc\_statm.lua | -| lib | - | library | | collector/proc\_statm.lua | -| data | - | data + stack | | collector/proc\_statm.lua | -| dt | - | dirty pages | | collector/proc\_statm.lua | - ## IO指标 @@ -275,12 +351,20 @@ This parser parses the stats from network devices. These stats includes events p ----------- ### cg_cpu_stat 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | nr_throttled | - | total throttled number | | collector/container/cg\_cpu\_stat.lua | | throttled_time | ms | total throttled time | | collector/container/cg\_cpu\_stat.lua | ### cg_proc_stat 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | user | % | usr cpu util | | collector/container/cg\_cpuacct\_proc\_stat.lua | @@ -302,11 +386,19 @@ This parser parses the stats from network devices. 
These stats includes events p | nr_uninterruptible | - | number of deep sleep tasks | | collector/container/cg\_cpuacct\_proc\_stat.lua | ### cg_memfail_cnt 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | fail_cnt | - | number of mem fail counts | | collector/container/cg\_memory\_fail\_cnt.lua | ### cg_memdrcm_latency 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + This table show the hist of the latency of direct memory reclamation | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | @@ -318,6 +410,10 @@ This table show the hist of the latency of direct memory reclamation | memDrcm_lat_1000ms | - | times more than 1s | | collector/container/cg\_memory\_drcm\_latency.lua | ### cg_memmcmp_latency 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + This table show the hist of the latency of direct memory compaction | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | @@ -329,6 +425,10 @@ This table show the hist of the latency of direct memory compaction | memDcmp_lat_1000ms | - | times more than 1s | | collector/container/cg\_memory\_dcmp\_latency.lua | ### pmu_events 表 + +* 对应 export 指标: sysak\_ +* 属性标签:value + | 指标名 | 单位 | 标签说明 | 备注 | 源码路径 | | :--- | --- | :---- | :---- | :--- | | cpu_cycles | - | cycles | | collector/plugin/pmu_events/pmu\_events.c | diff --git a/source/tools/monitor/unity/etc/base.yaml b/source/tools/monitor/unity/etc/base.yaml index 9c780766..c784e642 100644 --- a/source/tools/monitor/unity/etc/base.yaml +++ b/source/tools/monitor/unity/etc/base.yaml @@ -1,6 +1,6 @@ config: freq: 15 # unit second - port: 8405 # bind port + port: 8400 # bind port bind_addr: 0.0.0.0 # bind ip backlog: 32 # listen backlog identity: # support hostip, curl(need url arg), hostname, file(need path arg), specify(need name arg) diff --git a/source/tools/monitor/unity/test/curl/postOSS.py b/source/tools/monitor/unity/test/curl/postOSS.py new file mode 100644 index 
00000000..fb1f74da --- /dev/null +++ b/source/tools/monitor/unity/test/curl/postOSS.py @@ -0,0 +1,9 @@ + +import requests +import uuid +import json + +url = "http://127.0.0.1:8400/api/oss" +d = {"stream": "hello oss", "uuid": str(uuid.uuid4())} +res = requests.post(url, json=d) +print(res) diff --git a/source/tools/monitor/unity/yamls/group.yaml b/source/tools/monitor/unity/yamls/group.yaml deleted file mode 100644 index ec7217b8..00000000 --- a/source/tools/monitor/unity/yamls/group.yaml +++ /dev/null @@ -1,73 +0,0 @@ -config: - freq: 20 # unit second - daemon: true - port: 8400 # bind port - bind_addr: 127.0.0.1 # bind ip - backlog: 32 # listen backlog - identity: # support hostip, curl(need url arg), hostname, file(need path arg), specify(need name arg), env(need name arg) - mode: hostip - proc_path: / # in container mode, like -v /:/mnt/host , should use /mnt/host/ - db: - rotate: 7 # tsdb file retention time, unit day - budget: 200 # max query buffer from tsdb. - limit: - cpu: 30 # unit % - mem: 50 # unit mb - tasks: 10 # monitor 10 pid max. - -outline: - - /var/sysom/outline - -pushTo: - to: "Influx" - host: "ld-8vb0s2ih252f53pv4-proxy-tsdb.lindorm.rds.aliyuncs.com" - port: 8242 - url: "/api/v2/write?db=sysom" - -luaPlugins: ["proc_buddyinfo", "proc_diskstats", "proc_meminfo", "proc_mounts", "proc_netdev", - "proc_snmp_stat", "proc_sockstat", "proc_stat", "proc_statm", "proc_vmstat", - "proc_uptime"] - -plugins: - - so: kmsg - description: "collect dmesg info." - - so: net_health - description: "tcp net health." - - so: net_retrans - description: "tcp retrans monitor." - - so: virtout - description: "virt status out put." - - so: sum_retrans - description: "summary retrans out put." - -metrics: - - title: sysak_proc_pkt_status - from: pkt_status - head: counter - help: "net status info from /proc/net/snmp and /proc/net/status." 
- type: "gauge" - - title: sysak_net_health_hist - from: net_health_hist - head: value - help: "net_health_hist" - type: "gauge" - - title: sysak_net_health_count - from: net_health_count - head: value - help: "net_health_count" - type: "gauge" - - title: sysak_net_retrans_count - from: net_retrans_count - head: value - help: "net_retrans_count" - type: "gauge" - - title: sysak_virtout_dist - from: virtout_dist - head: value - help: "sysak_virtout_dist" - type: "gauge" - - title: sysak_retrans - from: retrans - head: value - help: "sysak_retrans" - type: "gauge" -- Gitee