From 82bdff3d23d1f7ce5c18f93e0edb8225615bd83f Mon Sep 17 00:00:00 2001 From: luzhihao Date: Tue, 1 Aug 2023 14:24:32 +0800 Subject: [PATCH] Update the gala-gopher observable range. --- gopher_tech.md | 1364 ++++++++++++++++++++++++++++-------------------- 1 file changed, 806 insertions(+), 558 deletions(-) diff --git a/gopher_tech.md b/gopher_tech.md index fb99b8c..698ecc5 100644 --- a/gopher_tech.md +++ b/gopher_tech.md @@ -1,558 +1,806 @@ -# TCP(entity_name:tcp_link) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------------- | ----------------- | ------------ | ------------------ | ---- | ------------------------------------------------------------ | -| tgid | | key | | | 进程ID | -| role | | key | | | 客户端/服务端 | -| client_ip | | key | | | 客户端:本地IP;服务端:对端IP | -| server_ip | | key | | | 客户端:对端IP;服务端:本地IP | -| client_port | | key | | | 客户端:本地端口;服务端:对端端口 | -| server_port | | key | | | 客户端:对端端口;服务端:本地端口 | -| protocol | | key | | | 协议族(IPv4、IPv6) | -| rx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | Y | rx bytes | -| tx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | Y | tx bytes | -| rto(P50/P90/P99) | tcp_rate(0x20) | histogram | | | Retransmission timeOut(us) | -| ato(P50/P90/P99) | tcp_rate(0x20) | histogram | | | Estimated value of delayed ACK(us) | -| srtt(P50/P90/P99) | tcp_rtt(0x4) | histogram | us | Y | Smoothed Round Trip Time(us). | -| snd_cwnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Congestion Control Window Size. | -| reordering(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Segments to be reordered. | -| rcv_rtt(P50/P90/P99) | tcp_rtt(0x4) | histogram | us | | Receive end RTT (unidirectional measurement). | -| notsent_bytes(P50/P90/P99) | tcp_windows(0x2) | histogram | bytes | | Number of bytes not sent currently. | -| notack_bytes(P50/P90/P99) | tcp_windows(0x2) | histogram | bytes | | Number of bytes not ack currently. | -| snd_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP send window. 
| -| rcv_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP receive window. | -| zero_snd_wnd_ratio | tcp_windows(0x2) | Gauge | | | Ratio of the number of times of sending window 0 to the number of sent bytes | -| zero_rcv_wnd_ratio | tcp_windows(0x2) | Gauge | | | Ratio of the number of receive window 0 windows to the number of received bytes | -| avl_snd_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP available send window. | -| syn_srtt(P50/P90/P99) | tcp_srtt | histogram | us | | RTT of syn packet(us). | -| syn_srtt_max | tcp_srtt | Gauge | us | Y | RTT of syn packet(us). | -| sk_rcvbuf(P50/P90/P99) | tcp_sockbuf(0x10) | histogram | bytes | | Byte length of the RX buffer. | -| sk_sndbuf(P50/P90/P99) | tcp_sockbuf(0x10) | histogram | bytes | | Byte length of the TX buffer. | -| segs_in | tcp_tx_rx(0x8) | Counter | segs | | total number of segments received | -| segs_out | tcp_tx_rx(0x8) | Counter | segs | | total number of segments sent | -| retran_packets | tcp_abn(0x01) | Gauge | | Y | total number of retrans | -| retran_ratio | tcp_abn(0x01) | Gauge | | Y | retran ratio | -| backlog_drops | tcp_abn(0x01) | Gauge | | Y | drops caused by backlog queue full | -| sk_drops | tcp_abn(0x01) | Counter | | Y | Number of lost packets in the TCP protocol stack | -| lost_out | tcp_abn(0x01) | Gauge | segs | | Number of lost segments estimated by TCP congestion | -| sacked_out | tcp_abn(0x01) | Gauge | segs | | Number of out-of-order TCP packets (SACK) or number of repeated TCP ACKs (NO SACK) | -| filter_drops | tcp_abn(0x01) | Gauge | | | drops caused by socket filter | -| tmout_count | tcp_abn(0x01) | Gauge | | | counter of tcp link timeout | -| snd_buf_limit_count | tcp_abn(0x01) | Gauge | | | counter of limits when allocate wmem | -| rmem_scheduls | tcp_abn(0x01) | Gauge | | | rmem is not enough | -| tcp_oom | tcp_abn(0x01) | Gauge | | | tcp out of memory | -| send_rsts | tcp_abn(0x01) | Gauge | | | send_rsts | -| receive_rsts | 
tcp_abn(0x01) | Gauge | | | receive_rsts | - -# ENDPOINT - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------------- | ---------- | ------------ | ----- | ---- | ------------------------------------------------ | -| tgid | | key | | | 进程ID | -| s_addr | | key | | | udp/tcp 本地地址 | -| s_port | | key | | | listen port(只有listen对象存在该label) | -| ep_type | | key | | | listen/connect/udp/bind | -| listendrop | listen | Gauge | | Y | TCP accept丢弃次数(只有listen对象存在) | -| accept_overflow | listen | Gauge | | Y | TCP accept队列溢出次数 | -| syn_overflow | listen | Gauge | | Y | TCP syn队列溢出次数 | -| passive_open | listen | Gauge | | Y | tcp被动发起的建链次数(只有listen对象存在) | -| passive_open_failed | listen | Gauge | | Y | tcp被动发起的建链失败次数(只有listen对象存在) | -| retran_synacks | listen | Gauge | | | tcp synack重传报文数 | -| lost_synacks | listen | Gauge | | | TCP synack报文丢失导致的建链失败次数 | -| active_open | connect | Gauge | | | tcp主动发起的建链次数(只有connect对象存在) | -| active_open_failed | connect | Gauge | | | tcp主动发起的建链失败次数(只有connect对象存在) | -| bind_rcv_drops | bind | Gauge | | Y | UDP接收失败次数(udp/bind对象存在) | -| bind_sends | bind | Gauge | bytes | Y | UDP发送长度(udp/bind对象存在) | -| bind_rcvs | bind | Gauge | bytes | Y | UDP接收长度(udp/bind对象存在) | -| bind_err | bind | Gauge | | | UDP接收失败错误码(udp/bind对象存在) | -| udp_rcv_drops | udp | Gauge | | Y | UDP接收失败次数(udp/bind对象存在) | -| udp_sends | udp | Gauge | bytes | Y | UDP发送长度(udp/bind对象存在) | -| udp_rcvs | udp | Gauge | bytes | Y | UDP接收长度(udp/bind对象存在) | -| udp_err | udp | Gauge | | | UDP接收失败错误码(udp/bind对象存在) | - -# QDISC - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | -------------------------- | -| dev_name | qdisc | key | | | 网卡设备名 | -| handle | qdisc | key | | | 设备句柄 | -| ifindex | qdisc | key | | | Interface index of qidsc | -| kind | qdisc | label | | | Kind of qidsc | -| netns | qdisc | label | | | net namespace | -| qlen | qdisc | Gauge | | | 
队列长度 | -| backlog | qdisc | Gauge | | Y | backlog队列长度 | -| drops | qdisc | Counter | | Y | 丢包数量 | -| requeues | qdisc | Counter | | Y | Requeues count egress | -| overlimits | qdisc | Counter | | Y | 溢出数量 | - -# THREAD(entity_name:task) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| --------------- | ---------- | ------------ | ---- | ---- | ------------------------------------------------------------ | -| pid | thread | key | | | 线程PID | -| tgid | thread | label | | | 所属进程ID | -| comm | thread | label | | | 线程所属进程名称 | -| off_cpu_ns | thread | Gauge | ns | Y | task调度offcpu的最大时间,统计方式: 1. KPROBE finish_task_switch 获取入参prev task(pid)以及当前时间,当前CPU信息(bpf_get_smp_processor_id()),记录MAP(pid/cpu作为key); 2. finish_task_switch 中bpf_get_current_pid_tgid获取当前pid,以及当前CPU信息(bpf_get_smp_processor_id()),匹配步骤1中的数据以及计算时间差,得出一次offcpu时间。 注意: 1. 过滤idle(pid=0) 2. 只记录offcpu最大值 | -| migration_count | thread | Gauge | | | task CPU之间迁移次数 | - -# Process(entity_name:proc) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| --------------------- | ------------------------ | ------------ | ---- | ---- | ------------------------------------------------------------ | -| tgid | | key | | | 进程ID | -| ppid | system_proc | label | | | 父进程ID | -| pgid | system_proc | label | | | 进程组ID | -| comm | | label | | | 执行程序名称 | -| cmdline | system_proc | label | | | 执行程序命令(包括配置) | -| container_id | system_proc | label | | | 进程归属的容器实例ID(简写) | -| shared_dirty_size | system_proc | Gauge | | | 进程共享属性的dirty page size | -| shared_clean_size | system_proc | Gauge | | | 进程共享属性的clean page size | -| private_dirty_size | system_proc | Gauge | | | 进程私有属性的dirty page size | -| private_clean_size | system_proc | Gauge | | | 进程私有属性的clean page size | -| referenced_size | system_proc | Gauge | | | 进程当前已引用的page size | -| lazyfree_size | system_proc | Gauge | | | 进程延迟释放内存的size | -| swap_data_size | system_proc | Gauge | | | 进程swap区间数据size | -| swap_data_pss_size 
| system_proc | Gauge | | | 进程物理内存swap区间数据size | -| fd_count | system_proc | Gauge | | Y | 进程文件句柄 | -| fd_free_per | system_proc | Gauge | | | 进程剩余FD资源占比% | -| utime_jiffies | system_proc | Gauge | | Y | 进程用户运行时间 | -| stime_jiffies | system_proc | Gauge | | Y | 进程系统态运行时间 | -| minor pagefault_count | system_proc | Gauge | | | 进程轻微pagefault次数(无需从磁盘拷贝) | -| major pagefault_count | system_proc | Gauge | | | 进程严重pagefault次数(需从磁盘拷贝) | -| vm_size | system_proc | Gauge | | Y | 进程当前虚拟地址空间大小 | -| pm_size | system_proc | Gauge | | Y | 进程当前物理地址空间大小 | -| rchar_bytes | system_proc | Gauge | | | 进程系统调用至FS的读字节数 | -| wchar_bytes | system_proc | Gauge | | | 进程系统调用至FS的写字节数 | -| syscr_count | system_proc | Gauge | | | 进程read()/pread()执行次数 | -| syscw_count | system_proc | Gauge | | | 进程write()/pwrite()执行次数 | -| read_bytes | system_proc | Gauge | | | 进程实际从磁盘读取的字节数 | -| write_bytes | system_proc | Gauge | | | 进程实际从磁盘写入的字节数 (page cache情况下,该字段进表示设置dirty page的size) | -| cancelled_write_bytes | system_proc | Gauge | | | 参考proc_write_bytes,因为存在page cache 如果write操作结束后,又发生文件被删除事件,会导致diry page并未写入磁盘,所以存在取消写的字节数统计 | -| ns_ext4_read | proc_ext4(0x20) | Gauge | ns | | ext4文件系统读操作时间,单位ns | -| ns_ext4_write | proc_ext4(0x20) | Gauge | ns | | ext4文件系统写操作时间,单位ns | -| ns_ext4_flush | proc_ext4(0x20) | Gauge | ns | | ext4文件系统flush操作时间,单位ns | -| ns_ext4_open | proc_ext4(0x20) | Gauge | ns | | ext4文件系统open操作时间,单位ns | -| ns_overlay_read | proc_overlay(0x40) | Gauge | ns | | overlayfs文件系统读操作时间,单位ns | -| ns_overlay_write | proc_overlay(0x40) | Gauge | ns | | overlayfs文件系统写操作时间,单位ns | -| ns_overlay_flush | proc_overlay(0x40) | Gauge | ns | | overlayfs文件系统flush操作时间,单位ns | -| ns_overlay_open | proc_overlay(0x40) | Gauge | ns | | overlayfs文件系统open操作时间,单位ns | -| ns_tmpfs_read | proc_tmpfs(0x80) | Gauge | ns | | tmpfs文件系统读操作时间,单位ns | -| ns_tmpfs_write | proc_tmpfs(0x80) | Gauge | ns | | tmpfs文件系统写操作时间,单位ns | -| ns_tmpfs_flush | proc_tmpfs(0x80) | Gauge | ns | | tmpfs文件系统flush操作时间,单位ns | -| reclaim_ns | 
proc_page(0x100) | Gauge | ns | | 进程触发的page回收时间(执行SWAP操作),单位ns | -| access_pagecache | proc_page(0x100) | Gauge | | | 进程触发的页面访问次数 | -| mark_buffer_dirty | proc_page(0x100) | Gauge | | | 进程触发的 page buffer置脏次数 | -| load_page_cache | proc_page(0x100) | Gauge | | | 进程触发的 page 加入page cache次数 | -| mark_page_dirty | proc_page(0x100) | Gauge | | | 进程触发的 page 置脏次数 | -| ns_gethostname | proc_dns(0x200) | Gauge | ns | | 进程获取DNS域名对应的地址,单位ns | -| gethostname_failed | proc_dns(0x200) | Gauge | | | 进程获取DNS域名失败次数 | -| ns_mount | proc_syscall_io(0x02) | Gauge | ns | | 进程系统调用mount时长,单位ns | -| ns_umount | proc_syscall_io(0x02) | Gauge | ns | | 进程系统调用umount时长,单位ns | -| ns_read | proc_syscall_io(0x02) | Gauge | ns | | 进程系统调用read时长,单位ns | -| ns_write | proc_syscall_io(0x02) | Gauge | ns | | 进程系统调用write时长,单位ns | -| ns_fsync | proc_syscall_io(0x02) | Gauge | ns | | 进程系统调用fsync时长,单位ns | -| ns_sendmsg | proc_syscall_net(0x04) | Gauge | ns | | 进程系统调用sendmsg时长,单位ns | -| ns_recvmsg | proc_syscall_net(0x04) | Gauge | ns | | 进程系统调用recvmsg时长,单位ns | -| ns_sched_yield | proc_syscall_sched(0x08) | Gauge | ns | | 进程系统调用sched_yield时长,单位ns | -| ns_futex | proc_syscall_sched(0x08) | Gauge | ns | | 进程系统调用futex时长,单位ns | -| ns_epoll_wait | proc_syscall_sched(0x08) | Gauge | ns | | 进程系统调用epoll_wait时长,单位ns | -| ns_epoll_pwait | proc_syscall_sched(0x08) | Gauge | ns | | 进程系统调用epoll_pwait时长,单位ns | -| ns_fork | proc_syscall_fork(0x10) | Gauge | ns | | 进程系统调用fork时长,单位ns | -| ns_vfork | proc_syscall_fork(0x10) | Gauge | ns | | 进程系统调用vfork时长,单位ns | -| ns_clone | proc_syscall_fork(0x10) | Gauge | ns | | 进程系统调用clone时长,单位ns | -| syscall_failed | proc_syscall (0x01) | Gauge | | Y | 进程系统调用失败次数 | -| less_4k_io_read | proc_io(0x400) | Gauge | | | Number of small I/O (less than 4 KB) read operations at the BIO layer. | -| less_4k_io_write | proc_io(0x400) | Gauge | | | Number of small I/O (less than 4 KB) write operations at the BIO layer. 
| -| greater_4k_io_read | proc_io(0x400) | Gauge | | | Number of big I/O (greater than 4 KB) read operations at the BIO layer. | -| greater_4k_io_write | proc_io(0x400) | Gauge | | | Number of big I/O (greater than 4 KB) write operations at the BIO layer. | -| bio_latency | proc_io(0x400) | Gauge | ns | | I/O operation delay at the BIO layer (unit: us). | -| bio_err_count | proc_io(0x400) | Gauge | | | Number of I/O operation failures at the BIO layer. | -| hang_count | proc_io(0x400) | Gauge | | | Number of process hang times. | -| iowait_us | proc_io(0x400) | Gauge | us | | Process IO_wait time (unit: us). | -| sched_systime | sched_systime | Gauge | us | | Duration of the process in the system state due to scheduling delay. | -| sched_syscall | sched_syscall | Gauge | us | | Process syscall delay due to scheduling preemption. | - -# BLOCK - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| --------------------- | ------------------ | ------------ | ----- | ---- | ------------------------------- | -| major | block | key | | | 块对象编号 | -| first_minor | block | key | | | 块对象编号 | -| blk_type | block | label | | | 块对象类型(比如disk, part) | -| blk_name | block | label | | | 块对象名称 | -| disk_name | block | label | | | 所属磁盘名称 | -| latency_req_max | io_latency(0x01) | Gauge | us | Y | block层I/O操作时延最大值 | -| latency_req_last | io_latency(0x01) | Gauge | us | | block层I/O操作时延最近值 | -| latency_req_sum | io_latency(0x01) | Gauge | us | | block层I/O操作时延总计值 | -| latency_req_jitter | io_latency(0x01) | Gauge | us | | block层I/O操作时延抖动 | -| count_latency_req | io_latency(0x01) | Gauge | | | block层I/O操作操作次数 | -| latency_driver_max | io_latency(0x01) | Gauge | us | | 驱动层时延最大值 | -| latency_driver_last | io_latency(0x01) | Gauge | us | | 驱动层时延最近值 | -| latency_driver_sum | io_latency(0x01) | Gauge | us | | 驱动层时延最总计值 | -| latency_driver_jitter | io_latency(0x01) | Gauge | us | | 驱动层时延抖动 | -| count_latency_driver | io_latency(0x01) | Gauge | | | 驱动层操作次数 | -| 
latency_device_max | io_latency(0x01) | Gauge | us | Y | 设备层时延最大值 | -| latency_device_last | io_latency(0x01) | Gauge | us | | 设备层时延最近值 | -| latency_device_sum | io_latency(0x01) | Gauge | us | | 设备层时延最总计值 | -| latency_device_jitter | io_latency(0x01) | Gauge | us | | 设备层时延抖动 | -| count_latency_device | io_latency(0x01) | Gauge | | | 设备层操作次数 | -| err_code | io_err(0x02) | Gauge | | | block层I/O操作错误码 | -| read_bytes | io_count(0x04) | Gauge | bytes | | I/O操作读字节数 | -| write_bytes | io_count(0x04) | Gauge | bytes | | I/O操作写字节数 | -| access_pagecache | io_pagecache(0x08) | Gauge | | | Block页面访问次数 | -| mark_buffer_dirty | io_pagecache(0x08) | Gauge | | | Block page buffer置脏次数 | -| load_page_cache | io_pagecache(0x08) | Gauge | | | Block page 加入page cache次数 | -| mark_page_dirty | io_pagecache(0x08) | Gauge | | | Block page 置脏次数 | - -# Container - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| -------------------------------------- | ----------------- | ------------ | ------- | ---- | ------------------------------------------------------------ | -| container_id | container | key | | | 容器ID(简写) | -| name | container | label | | | 容器名称 | -| cpucg_inode | container | label | | | cpu,cpuacct cgroup ID(容器实例内cgroup目录对应的inode id) | -| memcg_inode | container | label | | | memory cgroup ID(容器实例内cgroup目录对应的inode id) | -| pidcg_inode | container | label | | | pids cgroup ID(容器实例内cgroup目录对应的inode id) | -| mnt_ns_id | container | label | | | mount namespace | -| net_ns_id | container | label | | | net namespace | -| proc_id | container | label | | | 容器主进程ID | -| blkio_device_usage_total | container_blkio | Gauge | bytes | | Blkio device bytes usage, unit bytes | -| cpu_load_average_10s | container_cpu | Gauge | | | Value of container cpu load average over the last 10 seconds | -| cpu_system_seconds_total | container_cpu | Gauge | seconds | Y | Cumulative system cpu time consumed, unit second | -| cpu_usage_seconds_total | container_cpu | Gauge | 
seconds | Y | Cumulative cpu time consumed, unit second | -| cpu_user_seconds_total | container_cpu | Gauge | seconds | | Cumulative user cpu time consumed, unit second | -| fs_inodes_free | container_fs | Gauge | | | Number of available Inodes | -| fs_inodes_total | container_fs | Gauge | | | Total number of Inodes | -| fs_io_current | container_fs | Gauge | | | Number of I/Os currently in progress | -| fs_io_time_seconds_total | container_fs | Gauge | seconds | | Cumulative count of seconds spent doing I/Os, unit second | -| fs_io_time_weighted_seconds_total | container_fs | Gauge | seconds | | Cumulative weighted I/O time, unit second | -| fs_limit_bytes | container_fs | Gauge | bytes | | Number of bytes that can be consumed by the container on this filesystem, unit bytes | -| fs_read_seconds_total | container_fs | Gauge | bytes | | Cumulative count of bytes read, unit bytes | -| fs_reads_bytes_total | container_fs | Gauge | bytes | | Cumulative count of bytes read | -| fs_reads_merged_total | container_fs | Gauge | | | Cumulative count of reads merged | -| fs_reads_total | container_fs | Gauge | | | Cumulative count of reads completed | -| fs_sector_reads_total | container_fs | Gauge | | | Cumulative count of sector reads completed | -| fs_sector_writes_total | container_fs | Gauge | | | Cumulative count of sector writes completed | -| fs_usage_bytes | container_fs | Gauge | bytes | | Number of bytes that are consumed by the container on this filesystem | -| fs_write_seconds_total | container_fs | Gauge | seconds | | Cumulative count of seconds spent writing | -| fs_writes_bytes_total | container_fs | Gauge | bytes | | Cumulative count of bytes written | -| fs_writes_merged_total | container_fs | Gauge | | | Cumulative count of writes merged | -| fs_writes_total | container_fs | Gauge | | | Cumulative count of writes completed | -| memory_cache | container_memory | Gauge | bytes | Y | Total page cache memory | -| memory_failcnt | container_memory | Gauge | | | 
Number of memory usage hits limits | -| memory_failures_total | container_memory | Gauge | | | Cumulative count of memory allocation failures | -| memory_mapped_file | container_memory | Gauge | bytes | | Size of memory mapped files | -| memory_max_usage_bytes | container_memory | Gauge | bytes | | Maximum memory usage recorded | -| memory_rss | container_memory | Gauge | bytes | Y | Size of RSS | -| memory_swap | container_memory | Gauge | bytes | Y | Container swap usage | -| memory_usage_bytes | container_memory | Gauge | bytes | Y | Current memory usage, including all memory regardless of when it was accessed | -| memory_working_set_bytes | container_memory | Gauge | bytes | | Current working set | -| network_receive_bytes_total | container_network | Gauge | bytes | | Cumulative count of bytes received | -| network_receive_errors_total | container_network | Gauge | | | Cumulative count of errors encountered while receiving | -| network_receive_packets_dropped_total | container_network | Gauge | | | Cumulative count of packets dropped while receiving | -| network_receive_packets_total | container_network | Gauge | | Y | Cumulative count of packets received | -| network_transmit_bytes_total | container_network | Gauge | bytes | Y | Cumulative count of bytes transmitted | -| network_transmit_errors_total | container_network | Gauge | | | Cumulative count of errors encountered while transmitting | -| network_transmit_packets_dropped_total | container_network | Gauge | | | Cumulative count of packets dropped while transmitting | -| network_transmit_packets_total | container_network | Gauge | | | Cumulative count of packets transmitted | -| oom_events_total | container_oom | Gauge | | | Count of out of memory events observed for the container | -| spec_cpu_period | container_spec | Gauge | | | CPU period of the container | -| spec_cpu_shares | container_spec | Gauge | | | CPU share of the container | -| spec_memory_limit_bytes | container_spec | Gauge | bytes | | 
Memory limit for the container | -| spec_memory_reservation_limit_bytes | container_spec | Gauge | bytes | | Memory reservation limit for the container | -| spec_memory_swap_limit_bytes | container_spec | Gauge | bytes | | Memory swap limit for the container | -| start_time_seconds | container_start | Gauge | seconds | | Start time of the container since unix epoch | -| tasks_state | container_tasks | Gauge | | | Number of tasks in given state (sleeping, running, stopped, uninterruptible, or ioawaiting) | -| | | | | | | - -# Redis SLI - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ------------- | ------------ | ---- | ---- | -------------------------------- | -| tgid | | key | | | 进程ID | -| ins_id | | key | | | 实例ID | -| app | | key | | | 应用名 | -| method | | key | | | 请求方法 | -| server_ip | | label | | | 服务端IP | -| server_port | | label | | | 服务端端口 | -| client_ip | | label | | | 客户端IP | -| client_port | | label | | | 客户端端口 | -| rtt_nsec | redis_sli | gauge | ns | Y | Redis协议请求RTT | -| max_rtt_nsec | redis_max_sli | gauge | ns | Y | Redis协议采样周期内最大请求RTT | - -# Postgre SLI - -支持版本:openssl 1.1.1 - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | ------------------------------------- | -| tgid | | key | | | 进程ID | -| ins_id | | key | | | 实例ID | -| app | | key | | | 应用名 | -| method | | key | | | 请求方法 | -| server_ip | | label | | | 服务端IP | -| server_port | | label | | | 服务端端口 | -| client_ip | | label | | | 客户端IP | -| client_port | | label | | | 客户端端口 | -| rtt_nsec | pg_sli | gauge | ns | Y | Postgre协议请求RTT | -| max_rtt_nsec | pg_max_sli | gauge | ns | Y | Postgre协议采样周期内最大请求RTT | -| tps | pg_tps | gauge | | Y | 数据库吞吐量,目前只支持openGauss 2.0 | - -# HTTP SLI - -待补充支持的版本 - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ------------ | ------------ | ---- | ---- | 
------------------------------ | -| tgid | | key | | | 进程ID | -| ins_id | | key | | | 实例ID | -| app | | key | | | 应用名 | -| method | | key | | | 请求方法 | -| server_ip | | label | | | 服务端IP | -| server_port | | label | | | 服务端端口 | -| client_ip | | label | | | 客户端IP | -| client_port | | label | | | 客户端端口 | -| rtt_nsec | http_sli | gauge | ns | | 单个HTTP请求RTT | -| max_rtt_nsec | http_max_sli | gauge | ns | | 当前TCP连接所有HTTP请求最长RTT | - -# DISK - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ------------- | ------------ | --------------------- | ---- | --------------------------------------- | -| disk_name | system_iostat | key | | | blk所在的物理磁盘名称 | -| rspeed | system_iostat | gauge | read times/second | Y | 读速率(IOPS) | -| rspeed_kB | system_iostat | gauge | read kbytes/second | Y | 吞吐量 | -| r_await | system_iostat | gauge | ms | Y | 读响应时间 | -| rareq | system_iostat | gauge | | Y | 饱和度(rareq-sz 和 wareq-sz+响应时间) | -| wspeed | system_iostat | gauge | write times/second | Y | 写速率(IOPS) | -| wspeed_kB | system_iostat | gauge | write kbytes/second | Y | 吞吐量 | -| w_await | system_iostat | gauge | ms | Y | 写响应时间 | -| wareq | system_iostat | gauge | | | 饱和度(rareq-sz 和 wareq-sz+响应时间) | -| aqu | system_iostat | gauge | | | 平均队列深度 | -| util | system_iostat | gauge | % | Y | 磁盘使用率 | - -# NIC - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------------ | ---------- | ------------ | -------- | ---- | ---------------------- | -| dev_name | nic | key | | | 网卡名称 | -| rx_bytes | nic | gauge | bytes | | 网卡接收字节数 | -| rx_packets | nic | gauge | | | 网卡接收的总数据包数 | -| rx_errs | nic | gauge | | | 网卡接收错误的数据包数 | -| rx_dropped | nic | gauge | | | 网卡接收丢弃的数据包数 | -| tx_bytes | nic | gauge | bytes | | 网卡发送字节数 | -| tx_packets | nic | gauge | | | 网卡发送的总数据包数 | -| tx_errs | nic | gauge | | | 网卡发送错误的数据包数 | -| tx_dropped | nic | gauge | | | 网卡发送丢弃的数据包数 | -| rxspeed_KB | nic | gauge | Kbytes/s | | 网卡上行速率 | -| txspeed_KB 
| nic | gauge | Kbytes/s | | 网卡下行速率 | -| tc_sent_drop | nic | gauge | | | TC发送丢包 | -| tc_sent_overlimits | nic | gauge | | | TC发送队列溢出 | -| tc_backlog | nic | gauge | | | TC backlog队列包数量 | -| tc_ecn_mark | nic | gauge | | | TC 拥塞标记 | - -# CPU - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| -------------------- | --------------- | ------------ | ------- | ---- | ----------------------------- | -| cpu | system_cpu | key | | | CPU编号 | -| rcu | system_cpu | gauge | | | RCU锁软中断次数 | -| timer | system_cpu | gauge | | | 定时器软中断次数 | -| sched | system_cpu | gauge | | | 调度中断次数 | -| net_rx | system_cpu | gauge | | | 网卡收包中断次数 | -| user_msec | system_cpu | gauge | ms | | 用户态cpu占用时间(不包括nice) | -| nice_msec | system_cpu | gauge | ms | | nice用户态cpu占用时间(低优先级) | -| system_msec | system_cpu | gauge | ms | | 内核态cpu占用时间 | -| iowait_msec | system_cpu | gauge | ms | | 等待I/O完成的时间 | -| irq_msec | system_cpu | gauge | ms | | 硬中断时间 | -| softirq_msec | system_cpu | gauge | ms | | 软中断时间 | -| backlog_drops | system_cpu | gauge | | | softnet_data队列满而丢弃报文数量 | -| rps_count | system_cpu | gauge | | | CPU收到的RPS次数 | -| total_used_per | system_cpu_util | gauge | % | | CPU总利用率 | - -# MEM - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | -------------- | ------------ | ---- | ---- | ------------------- | -| mem | system_meminfo | key | | | /proc/meminfo | -| mem_total | system_meminfo | gauge | KB | | 系统总的可用物理内存 | -| mem_free | system_meminfo | gauge | KB | | 系统还可用的物理内存 | -| mem_available| system_meminfo | gauge | KB | | 用户还可用内存 | -| mem_util | system_meminfo | gauge | % | | 系统内存使用率 | -| mem_buffers | system_meminfo | gauge | KB | | 被 buffer使用的物理内存| -| mem_cache | system_meminfo | gauge | KB | | 被 cache使用的物理内存 | -| mem_active | system_meminfo | gauge | KB | | 经常使用的cache页面大小| -| mem_inactive | system_meminfo | gauge | KB | | 非活跃内存大小,可回收 | -| swap_total | system_meminfo | gauge | KB | | 交换区总量 | -| swap_free | 
system_meminfo | gauge | KB | | 空闲交换区总量 | -| swap_util | system_meminfo | gauge | % | | 交换区的使用率 | -| dentry | system_dentry | gauge | | | The total number of dentries allocated. | -| unused_dentry | system_dentry | gauge | | | The number of dentries that are not actively used.(注意该指标如果过低通常会引发系统CPU冲高) | -| age_limit | system_dentry | gauge | s | | The age in seconds after which dcache entries can be reclaimed when memory is short. | - -# FS - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | ----------------------- | -| MountOn | system_df | key | | | 文件系统的挂载点 | -| Fsname | system_df | label | | | 文件系统名称 | -| Fstype | system_df | label | | | 文件系统类型 | -| Inodes | system_df | label | | | 分区内inode数量 | -| IUsed | system_df | gauge | | Y | 分区内已使用的inode数量 | -| IFree | system_df | gauge | | | 分区内空闲的inode数量 | -| IUsePer | system_df | gauge | % | | 分区内已使用的inode占比 | -| Blocks | system_df | label | KB | | 分区内Block数量 | -| Used | system_df | gauge | | Y | 分区内已使用的Block数量 | -| Free | system_df | gauge | | Y | 分区内空闲的Block数量 | -| UsePer | system_df | gauge | % | Y | 分区内已使用的Block占比 | - -# NET - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ----------------- | ---------- | ------------ | ---- | ---- | ------------------- | -| origin | | key | | | /proc/dev/snmp | -| tcp_curr_estab | system_tcp | gauge | | | 当前的TCP连接数 | -| tcp_in_segs | system_tcp | gauge | segs | | TCP接收的分片数 | -| tcp_out_segs | system_tcp | gauge | segs | | TCP发送的分片数 | -| tcp_retrans_segs | system_tcp | gauge | segs | Y | TCP重传的分片数 | -| tcp_in_errs | system_tcp | gauge | | | TCP入包错误包数 | -| udp_indata_grams | system_udp | gauge | segs | | UDP接收包量 | -| udp_outdata_grams | system_udp | gauge | segs | | UDP发送包量 | - -# Host - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | -------------------------------- 
| -| os_version | | key | | | 操作系统版本信息 | -| hostname | system_os | label | | | 主机名 | -| kversion | system_os | label | | | 内核版本信息 | -| cpu_num | system_os | label | | | CPU核数 | -| memory_MB | system_os | label | | | 内存总量(MB) | -| ip_addr | system_os | label | | | 所有的IP地址 | -| value | system_os | gauge | | | 一个固定值作为metric,无实际意义 | - -# Dnsmasq(entity_name:dnsmasq_link) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ------------ | ------------ | ---- | ---- | ------------------- | -| client_ip | dnsmasq_link | key | | | 客户端IP | -| virtual_ip | dnsmasq_link | key | | | DNS服务器IP | -| family | dnsmasq_link | label | | | 协议族 | -| link_count | dnsmasq_link | gauge | | | 连接数 | - -# LVS(entity_name:ipvs_link) - -支持的软件版本:>=EulerOS 2.9,且加载ipvs KO - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | ------------------- | -| client_ip | ipvs_link | key | | | 客户端IP | -| virtual_ip | ipvs_link | key | | | 虚拟IP | -| local_ip | ipvs_link | key | | | 本地IP | -| server_ip | ipvs_link | key | | | 真实的服务端IP | -| server_port | ipvs_link | key | | | 真实的服务端端口 | -| virtual_port | ipvs_link | key | | | 虚拟地址端口 | -| protocol | ipvs_link | label | | | 协议类型 | -| link_count | ipvs_link | gauge | | | 连接数 | - -# Nginx(entity_name:nginx_link) - -支持的软件版本:1.12.1 - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ------------ | ---------- | ------------ | ---- | ---- | ------------------- | -| client_ip | nginx_link | key | | | 客户端IP | -| virtual_ip | nginx_link | key | | | 虚拟服务器IP | -| server_ip | nginx_link | key | | | 真实服务端IP | -| virtual_port | nginx_link | key | | | 虚拟服务器端口 | -| server_port | nginx_link | key | | | 真实服务端端口 | -| is_l7 | nginx_link | label | | | 1—七层LB / 0—四层LB | -| link_count | nginx_link | gauge | | | 连接数 | - -# Haproxy(entity_name:haproxy_link) - -支持的软件版本:2.5-dev0 - -| metrics_name | table_name | 
metrics_type | unit | KPI | metrics description | -| ------------ | ------------ | ------------ | ---- | ---- | ------------------- | -| client_ip | haproxy_link | key | | | 客户端IP | -| virtual_ip | haproxy_link | key | | | 虚拟服务器IP | -| server_ip | haproxy_link | key | | | 真实服务端IP | -| virtual_port | haproxy_link | key | | | 虚拟服务器端口 | -| server_port | haproxy_link | key | | | 真实服务端端口 | -| protocol | haproxy_link | label | | | 协议类型(TCP/HTTP) | -| link_count | haproxy_link | gauge | | | 连接数 | - -# JVM(entity_name:jvm) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ----------------------------- | ------------ | ------------ | ----- | ---- | ----------------------------- | -| tgid | | key | | | Java 虚拟机的进程ID | -| runtime | jvm_info | label | | | JVM 运行时信息 | -| vendor | jvm_info | label | | | JVM 创建者/维护者 | -| version | jvm_info | label | | | JVM 版本 | -| info | jvm_info | gauge | | | 固定值1 | -| proc_start_time_secs | jvm_proc | gauge | s | | 进程起始时间 | -| proc_cpu_secs_total | jvm_proc | counter | s | | 进程已使用的CPU时间 | -| class_current_loaded | jvm_class | gauge | | | JVM当前已加载类的数量 | -| class_loaded_total | jvm_class | counter | | | JVM自执行以来加载的类的总数量 | -| threads_current | jvm_thread | gauge | | | JVM当前线程数 | -| threads_daemon | jvm_thread | gauge | | | JVM的守护线程数 | -| threads_peak | jvm_thread | gauge | | | JVM的峰值线程数 | -| threads_started_total | jvm_thread | counter | | | JVM的已启动线程数 | -| threads_deadlocked | jvm_thread | gauge | | | JVM的死锁的线程数 | -| area | jvm_mem | label | | | JVM内存类型:heap/noheap | -| mem_bytes_used | jvm_mem | gauge | bytes | | 给定JVM内存区域的已使用字节数 | -| mem_bytes_commit | jvm_mem | gauge | bytes | | 给定JVM内存区域的已提交字节数 | -| mem_bytes_max | jvm_mem | gauge | bytes | | 给定JVM内存区域的最大字节数 | -| mem_bytes_init | jvm_mem | gauge | bytes | | 给定JVM内存区域的初始字节数 | -| pool | jvm_mem_pool | label | | | 内存池类型 | -| mem_pool_bytes_used | jvm_mem_pool | gauge | bytes | | 给定JVM内存池的已使用字节数 | -| mem_pool_bytes_commit | jvm_mem_pool | gauge | bytes | | 
给定JVM内存池的已提交字节数 | -| mem_pool_bytes_max | jvm_mem_pool | gauge | bytes | | 给定JVM内存池的最大字节数 | -| mem_pool_coll_used_bytes | jvm_mem_pool | gauge | bytes | | 给定JVM内存池最后一次垃圾回收使用的字节数 | -| mem_pool_coll_commit_bytes | jvm_mem_pool | gauge | bytes | | 上一次GC内存池的大小 | -| mem_pool_coll_max_bytes | jvm_mem_pool | gauge | bytes | | 上一次GC内存池的最大字节数 | -| pool | jvm_buf_pool | label | | | 缓冲池类型 | -| buffer_pool_used_bytes | jvm_buf_pool | gauge | bytes | | 给定JVM缓冲池的已用字节数 | -| buffer_pool_used_buffers | jvm_buf_pool | gauge | | | 给定JVM缓冲池的已用缓冲区数 | -| buffer_pool_capacity_bytes | jvm_buf_pool | gauge | bytes | | 给定JVM缓冲池的字节容量 | -| gc | jvm_gc | label | | | 垃圾回收器名字 | -| gc_coll_secs_count | jvm_gc | summary | | | 给定的垃圾回收器已发生的GC总次数 | -| gc_coll_secs_sum | jvm_gc | summary | s | | 在给定的垃圾回收器花费的总时间 | - - - -# kafka (entity_name: kafka) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| ----------------------------- | ------------ | ------------ | ----- | ---- | ----------------------------- | -| msg_type | kafkaprobe | key | | | 访问类型,producer或consumer | -| client_ip | kafkaprobe | key | | | 客户端IP | -| client_port | kafkaprobe | key | | | 客户端port | -| num | kafkaprobe | gauge | | | 在一次采样周期中producer发布或consumer消费的消息数量 | -| topic | kafkaprobe | key | | | 消息的topic | -| server_ip | kafkaprobe | key | | | kafka server所在主机的网卡IP | -| server_port | kafkaprobe | key | | | kafka server所绑定的端口号 | - -# L7层会话(entity_name: l7) - -| metrics_name | table_name | metrics_type | unit | KPI | metrics description | -| --------------- | ---------- | ------------ | ---- | ---- | ------------------------------------------------------ | -| tgid | | key | | | Process ID of l7 session. | -| remote_ip | | key | | | Remote IP address of l7 session. | -| remote_port | | key | | | Remote port of l7 session. 
| -| l4_role | | key | | | Role of l4 protocol(TCP Client/Server or UDP) | -| l7_role | | key | | | Role of l7 protocol(Client or Server) | -| protocol | | key | | | Name of l7 protocol(http/http2/mysql...) | -| pod_ip | | label | | | IP address of pod which l7 session belongs. | -| ssl | | label | | | Indicates whether an SSL-encrypted l7 session is used. | -| bytes_sent | l7_link | gauge | | | Number of bytes sent by a l7 session. | -| bytes_recv | l7_link | gauge | | | Number of bytes recv by a l7 session. | -| throughput_req | l7_rpc | histogram | qps | | Request throughput of l7 session. | -| throughput_resp | l7_rpc | histogram | qps | | Response throughput of l7 session. | -| latency_avg | l7_rpc | gauge | ns | | Average l7 session latency. | -| latency_p50 | l7_rpc | histogram | ns | | L7 session P50 latency. | -| latency_p90 | l7_rpc | histogram | ns | | L7 session P90 latency. | -| latency_p99 | l7_rpc | histogram | ns | | L7 session P99 latency. | -| err_ratio | l7_rpc | gauge | % | | L7 session error rate. 
| +# 系统性能 + +## 主机概要 + +实体名:host + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------ | ---------- | ------------ | ---- | -------------------------------- | +| os_version | | key | | 操作系统版本信息 | +| hostname | system_os | label | | 主机名 | +| kversion | system_os | label | | 内核版本信息 | +| cpu_num | system_os | label | | CPU核数 | +| memory_MB | system_os | label | | 内存总量(MB) | +| ip_addr | system_os | label | | 所有的IP地址 | +| value | system_os | gauge | | 一个固定值作为metric,无实际意义 | + +## CPU性能 + +实体名:cpu + +| metrics_name | table_name | metrics_type | unit | metrics description | +| -------------- | --------------- | ------------ | ---- | --------------------------------- | +| cpu | system_cpu | key | | CPU编号 | +| rcu | system_cpu | gauge | | RCU锁软中断次数 | +| timer | system_cpu | gauge | | 定时器软中断次数 | +| sched | system_cpu | gauge | | 调度中断次数 | +| net_rx | system_cpu | gauge | | 网卡收包中断次数 | +| user_msec | system_cpu | gauge | ms | 用户态cpu占用时间(不包括nice) | +| nice_msec | system_cpu | gauge | ms | nice用户态cpu占用时间(低优先级) | +| system_msec | system_cpu | gauge | ms | 内核态cpu占用时间 | +| iowait_msec | system_cpu | gauge | ms | 等待I/O完成的时间 | +| irq_msec | system_cpu | gauge | ms | 硬中断时间 | +| softirq_msec | system_cpu | gauge | ms | 软中断时间 | +| backlog_drops | system_cpu | gauge | | softnet_data队列满而丢弃报文数量 | +| rps_count | system_cpu | gauge | | CPU收到的RPS次数 | +| total_used_per | system_cpu_util | gauge | % | CPU总利用率 | + +## 内存性能 + +实体名:mem + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------- | -------------- | ------------ | ---- | ------------------------------------------------------ | +| mem | system_meminfo | key | | /proc/meminfo | +| mem_total | system_meminfo | gauge | KB | 系统总的可用物理内存 | +| mem_free | system_meminfo | gauge | KB | 系统还可用的物理内存 | +| mem_available | system_meminfo | gauge | KB | 用户还可用内存 | +| mem_util | system_meminfo | gauge | % | 系统内存使用率 | +| mem_buffers | system_meminfo | gauge | KB | 被 buffer使用的物理内存 | 
+| mem_cache | system_meminfo | gauge | KB | 被 cache使用的物理内存 | +| mem_active | system_meminfo | gauge | KB | 经常使用的cache页面大小 | +| mem_inactive | system_meminfo | gauge | KB | 非活跃内存大小,可回收 | +| swap_total | system_meminfo | gauge | KB | 交换区总量 | +| swap_free | system_meminfo | gauge | KB | 空闲交换区总量 | +| swap_util | system_meminfo | gauge | % | 交换区的使用率 | +| dentry | system_dentry | gauge | | dentry已占用的数量(注意dentry数量过多会引起系统卡顿) | +| unused_dentry | system_dentry | gauge | | dentry未使用的数量 | + +## 网络性能 + +### 协议栈统计 + +实体名:net + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ----------------- | ---------- | ------------ | ---- | ------------------- | +| origin | | key | | /proc/net/snmp | +| tcp_curr_estab | system_tcp | gauge | | 当前的TCP连接数 | +| tcp_in_segs | system_tcp | gauge | segs | TCP接收的分片数 | +| tcp_out_segs | system_tcp | gauge | segs | TCP发送的分片数 | +| tcp_retrans_segs | system_tcp | gauge | segs | TCP重传的分片数 | +| tcp_in_errs | system_tcp | gauge | | TCP入包错误包数 | +| udp_indata_grams | system_udp | gauge | segs | UDP接收包量 | +| udp_outdata_grams | system_udp | gauge | segs | UDP发送包量 | + +### 网卡统计 + +实体名:nic + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------ | ---------- | ------------ | -------- | ---------------------- | +| dev_name | nic | key | | 网卡名称 | +| rx_bytes | nic | gauge | bytes | 网卡接收字节数 | +| rx_packets | nic | gauge | | 网卡接收的总数据包数 | +| rx_errs | nic | gauge | | 网卡接收错误的数据包数 | +| rx_dropped | nic | gauge | | 网卡接收丢弃的数据包数 | +| tx_bytes | nic | gauge | bytes | 网卡发送字节数 | +| tx_packets | nic | gauge | | 网卡发送的总数据包数 | +| tx_errs | nic | gauge | | 网卡发送错误的数据包数 | +| tx_dropped | nic | gauge | | 网卡发送丢弃的数据包数 | +| rxspeed_KB | nic | gauge | Kbytes/s | 网卡接收速率 | +| txspeed_KB | nic | gauge | Kbytes/s | 网卡发送速率 | +| tc_sent_drop | nic | gauge | | TC发送丢包 | +| tc_sent_overlimits | nic | gauge | | TC发送队列溢出 | +| tc_backlog | nic | gauge | | TC backlog队列包数量 | +| tc_ecn_mark | nic | gauge | | TC 拥塞标记 | + +##
I/O性能 + +### 磁盘统计 + +实体名:disk + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------ | ------------- | ------------ | --------------------- | --------------------------------------- | +| disk_name | system_iostat | key | | blk所在的物理磁盘名称 | +| rspeed | system_iostat | gauge | read times/second | 读速率(IOPS) | +| rspeed_kB | system_iostat | gauge | read kbytes/second | 读吞吐量 | +| r_await | system_iostat | gauge | ms | 读响应时间 | +| rareq | system_iostat | gauge | | 饱和度(rareq-sz 和 wareq-sz+响应时间) | +| wspeed | system_iostat | gauge | write times/second | 写速率(IOPS) | +| wspeed_kB | system_iostat | gauge | write kbytes/second | 写吞吐量 | +| w_await | system_iostat | gauge | ms | 写响应时间 | +| wareq | system_iostat | gauge | | 饱和度(rareq-sz 和 wareq-sz+响应时间) | +| aqu | system_iostat | gauge | | 平均队列深度 | +| util | system_iostat | gauge | % | 磁盘使用率 | + +### Block统计 + +实体名:block + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| --------------------- | ---------------- | ------------ | ----- | ------------------------------ | -------------------------------------- | +| major | block | key | | 块对象编号 | 支持NVME、SCSI、VirtBlock三种类型Block | +| first_minor | block | key | | 块对象编号 | | +| blk_type | block | label | | 块对象类型(比如disk, part) | | +| blk_name | block | label | | 块对象名称 | | +| disk_name | block | label | | 所属磁盘名称 | | +| latency_req_max | io_latency(0x01) | Gauge | us | block层I/O操作时延最大值 | | +| latency_req_last | io_latency(0x01) | Gauge | us | block层I/O操作时延最近值 | | +| latency_req_sum | io_latency(0x01) | Gauge | us | block层I/O操作时延总计值 | | +| latency_req_jitter | io_latency(0x01) | Gauge | us | block层I/O操作时延抖动 | | +| count_latency_req | io_latency(0x01) | Gauge | | block层I/O操作次数 | | +| latency_driver_max | io_latency(0x01) | Gauge | us | 驱动层时延最大值 | | +| latency_driver_last | io_latency(0x01) | Gauge | us | 驱动层时延最近值 | | +| latency_driver_sum | io_latency(0x01) | Gauge | us | 驱动层时延总计值 | | +| latency_driver_jitter |
io_latency(0x01) | Gauge | us | 驱动层时延抖动 | | +| count_latency_driver | io_latency(0x01) | Gauge | | 驱动层操作次数 | | +| latency_device_max | io_latency(0x01) | Gauge | us | 设备层时延最大值 | | +| latency_device_last | io_latency(0x01) | Gauge | us | 设备层时延最近值 | | +| latency_device_sum | io_latency(0x01) | Gauge | us | 设备层时延总计值 | | +| latency_device_jitter | io_latency(0x01) | Gauge | us | 设备层时延抖动 | | +| count_latency_device | io_latency(0x01) | Gauge | | 设备层操作次数 | | +| err_code | io_err(0x02) | Gauge | | block层I/O操作错误码 | | +| read_bytes | io_count(0x04) | Gauge | bytes | I/O操作读字节数 | | +| write_bytes | io_count(0x04) | Gauge | bytes | I/O操作写字节数 | | + +## 容器性能 + +实体名:container + +| metrics_name | table_name | metrics_type | unit | metrics description | +| -------------------------------------- | ----------------- | ------------ | ------- | ------------------------------------------------------------ | +| container_id | container | key | | 容器ID(简写) | +| name | container | label | | 容器名称 | +| cpucg_inode | container | label | | cpu,cpuacct cgroup ID(容器实例内cgroup目录对应的inode id) | +| memcg_inode | container | label | | memory cgroup ID(容器实例内cgroup目录对应的inode id) | +| pidcg_inode | container | label | | pids cgroup ID(容器实例内cgroup目录对应的inode id) | +| mnt_ns_id | container | label | | mount namespace | +| net_ns_id | container | label | | net namespace | +| proc_id | container | label | | 容器主进程ID | +| blkio_device_usage_total | container_blkio | Gauge | bytes | Blkio device bytes usage, unit bytes | +| cpu_load_average_10s | container_cpu | Gauge | | Value of container cpu load average over the last 10 seconds | +| cpu_system_seconds_total | container_cpu | Gauge | seconds | Cumulative system cpu time consumed, unit second | +| cpu_usage_seconds_total | container_cpu | Gauge | seconds | Cumulative cpu time consumed, unit second | +| cpu_user_seconds_total | container_cpu | Gauge | seconds | Cumulative user cpu time consumed, unit second | +| fs_inodes_free | container_fs | Gauge | | Number
of available Inodes | +| fs_inodes_total | container_fs | Gauge | | Total number of Inodes | +| fs_io_current | container_fs | Gauge | | Number of I/Os currently in progress | +| fs_io_time_seconds_total | container_fs | Gauge | seconds | Cumulative count of seconds spent doing I/Os, unit second | +| fs_io_time_weighted_seconds_total | container_fs | Gauge | seconds | Cumulative weighted I/O time, unit second | +| fs_limit_bytes | container_fs | Gauge | bytes | Number of bytes that can be consumed by the container on this filesystem, unit bytes | +| fs_read_seconds_total | container_fs | Gauge | seconds | Cumulative count of seconds spent reading | +| fs_reads_bytes_total | container_fs | Gauge | bytes | Cumulative count of bytes read | +| fs_reads_merged_total | container_fs | Gauge | | Cumulative count of reads merged | +| fs_reads_total | container_fs | Gauge | | Cumulative count of reads completed | +| fs_sector_reads_total | container_fs | Gauge | | Cumulative count of sector reads completed | +| fs_sector_writes_total | container_fs | Gauge | | Cumulative count of sector writes completed | +| fs_usage_bytes | container_fs | Gauge | bytes | Number of bytes that are consumed by the container on this filesystem | +| fs_write_seconds_total | container_fs | Gauge | seconds | Cumulative count of seconds spent writing | +| fs_writes_bytes_total | container_fs | Gauge | bytes | Cumulative count of bytes written | +| fs_writes_merged_total | container_fs | Gauge | | Cumulative count of writes merged | +| fs_writes_total | container_fs | Gauge | | Cumulative count of writes completed | +| memory_cache | container_memory | Gauge | bytes | Total page cache memory | +| memory_failcnt | container_memory | Gauge | | Number of memory usage hits limits | +| memory_failures_total | container_memory | Gauge | | Cumulative count of memory allocation failures | +| memory_mapped_file | container_memory | Gauge | bytes | Size of memory mapped files | +| memory_max_usage_bytes |
container_memory | Gauge | bytes | Maximum memory usage recorded | +| memory_rss | container_memory | Gauge | bytes | Size of RSS | +| memory_swap | container_memory | Gauge | bytes | Container swap usage | +| memory_usage_bytes | container_memory | Gauge | bytes | Current memory usage, including all memory regardless of when it was accessed | +| memory_working_set_bytes | container_memory | Gauge | bytes | Current working set | +| network_receive_bytes_total | container_network | Gauge | bytes | Cumulative count of bytes received | +| network_receive_errors_total | container_network | Gauge | | Cumulative count of errors encountered while receiving | +| network_receive_packets_dropped_total | container_network | Gauge | | Cumulative count of packets dropped while receiving | +| network_receive_packets_total | container_network | Gauge | | Cumulative count of packets received | +| network_transmit_bytes_total | container_network | Gauge | bytes | Cumulative count of bytes transmitted | +| network_transmit_errors_total | container_network | Gauge | | Cumulative count of errors encountered while transmitting | +| network_transmit_packets_dropped_total | container_network | Gauge | | Cumulative count of packets dropped while transmitting | +| network_transmit_packets_total | container_network | Gauge | | Cumulative count of packets transmitted | +| oom_events_total | container_oom | Gauge | | Count of out of memory events observed for the container | +| spec_cpu_period | container_spec | Gauge | | CPU period of the container | +| spec_cpu_shares | container_spec | Gauge | | CPU share of the container | +| spec_memory_limit_bytes | container_spec | Gauge | bytes | Memory limit for the container | +| spec_memory_reservation_limit_bytes | container_spec | Gauge | bytes | Memory reservation limit for the container | +| spec_memory_swap_limit_bytes | container_spec | Gauge | bytes | Memory swap limit for the container | +| start_time_seconds | container_start | Gauge | 
seconds | Start time of the container since unix epoch | +| tasks_state | container_tasks | Gauge | | Number of tasks in given state (sleeping, running, stopped, uninterruptible, or iowaiting) | +| | | | | | + +# 网络监控 + +## TCP流量监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------ | -------------- | ------------ | ----- | ----------------------------------- | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| rx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | rx bytes | +| tx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | tx bytes | +| segs_in | tcp_tx_rx(0x8) | Counter | segs | total number of segments received | +| segs_out | tcp_tx_rx(0x8) | Counter | segs | total number of segments sent | + +## DNS访问监控 + +实体名:dns + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ---------- | ------------ | ---- | ------------------- | ------- | +| tgid | dns | key | | 进程ID | | +| domain | dns | key | | 进程访问的DNS域名 | | +| dns_ip | dns | label | | DNS域名对应的IP地址 | | +| delay_avg | dns | Gauge | ms | DNS访问平均时延 | TO BE | +| max_delay | dns | Gauge | ms | DNS访问最大时延 | TO BE | +| error_ratio | dns | Gauge | % | DNS访问错误率 | TO BE | +| count | dns | Gauge | | DNS访问次数 | TO BE | + +## TCP/IP监控 + +### TCP异常监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ------------- | ------------ | ---- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| retran_packets | tcp_abn(0x01) | Gauge | | total
number of retrans | +| retran_ratio | tcp_abn(0x01) | Gauge | | retran ratio | +| backlog_drops | tcp_abn(0x01) | Gauge | | drops caused by backlog queue full | +| sk_drops | tcp_abn(0x01) | Counter | | Number of lost packets in the TCP protocol stack | +| lost_out | tcp_abn(0x01) | Gauge | segs | Number of lost segments estimated by TCP congestion | +| sacked_out | tcp_abn(0x01) | Gauge | segs | Number of out-of-order TCP packets (SACK) or number of repeated TCP ACKs (NO SACK) | +| filter_drops | tcp_abn(0x01) | Gauge | | drops caused by socket filter | +| tmout_count | tcp_abn(0x01) | Gauge | | counter of tcp link timeout | +| snd_buf_limit_count | tcp_abn(0x01) | Gauge | | counter of limits when allocate wmem | +| rmem_scheduls | tcp_abn(0x01) | Gauge | | rmem is not enough | +| tcp_oom | tcp_abn(0x01) | Gauge | | tcp out of memory | +| send_rsts | tcp_abn(0x01) | Gauge | | send_rsts | +| receive_rsts | tcp_abn(0x01) | Gauge | | receive_rsts | + +### Socket监控 + +实体名:endpoint + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ---------- | ------------ | ----- | ------------------------------------------------ | +| tgid | | key | | 进程ID | +| s_addr | | key | | udp/tcp 本地地址 | +| s_port | | key | | listen port(只有listen对象存在该label) | +| ep_type | | key | | listen/connect/udp/bind | +| listendrop | listen | Gauge | | TCP accept丢弃次数(只有listen对象存在) | +| accept_overflow | listen | Gauge | | TCP accept队列溢出次数 | +| syn_overflow | listen | Gauge | | TCP syn队列溢出次数 | +| passive_open | listen | Gauge | | tcp被动发起的建链次数(只有listen对象存在) | +| passive_open_failed | listen | Gauge | | tcp被动发起的建链失败次数(只有listen对象存在) | +| retran_synacks | listen | Gauge | | tcp synack重传报文数 | +| lost_synacks | listen | Gauge | | TCP synack报文丢失导致的建链失败次数 | +| active_open | connect | Gauge | | tcp主动发起的建链次数(只有connect对象存在) | +| active_open_failed | connect | Gauge | | tcp主动发起的建链失败次数(只有connect对象存在) | +| bind_rcv_drops | bind | Gauge | | UDP接收失败次数(udp/bind对象存在) | +| 
bind_sends | bind | Gauge | bytes | UDP发送长度(udp/bind对象存在) | +| bind_rcvs | bind | Gauge | bytes | UDP接收长度(udp/bind对象存在) | +| bind_err | bind | Gauge | | UDP接收失败错误码(udp/bind对象存在) | +| udp_rcv_drops | udp | Gauge | | UDP接收失败次数(udp/bind对象存在) | +| udp_sends | udp | Gauge | bytes | UDP发送长度(udp/bind对象存在) | +| udp_rcvs | udp | Gauge | bytes | UDP接收长度(udp/bind对象存在) | +| udp_err | udp | Gauge | | UDP接收失败错误码(udp/bind对象存在) | + + + +# 应用(微服务)访问性能 + +实体名:l7 + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| --------------- | ---------- | ------------ | ---- | ------------------------------------------------------ | -------------------------- | +| tgid | | key | | Process ID of l7 session. | openSSL 1.1.1, Go SSL,JSSE | +| remote_ip | | key | | Remote IP address of l7 session. | | +| remote_port | | key | | Remote port of l7 session. | | +| l4_role | | key | | Role of l4 protocol(TCP Client/Server or UDP) | | +| l7_role | | key | | Role of l7 protocol(Client or Server) | | +| protocol | | key | | Name of l7 protocol(http/http2/mysql...) | | +| pod_ip | | label | | IP address of pod which l7 session belongs. | | +| ssl | | label | | Indicates whether an SSL-encrypted l7 session is used. | | +| bytes_sent | l7_link | gauge | | Number of bytes sent by a l7 session. | | +| bytes_recv | l7_link | gauge | | Number of bytes recv by a l7 session. | | +| throughput_req | l7_rpc | histogram | qps | Request throughput of l7 session. | | +| throughput_resp | l7_rpc | histogram | qps | Response throughput of l7 session. | | +| latency_avg | l7_rpc | gauge | ns | Average l7 session latency. | | +| latency | l7_rpc | histogram | ns | L7 session latency. | | +| err_ratio | l7_rpc | gauge | % | L7 session error rate.
| | + +# 应用性能监控 + +## TCP性能 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------ | ----------------- | ------------ | ----- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| rto | tcp_rate(0x20) | histogram | | Retransmission timeOut(us) | +| ato | tcp_rate(0x20) | histogram | | Estimated value of delayed ACK(us) | +| srtt | tcp_rtt(0x4) | histogram | us | Smoothed Round Trip Time(us). | +| snd_cwnd | tcp_windows(0x2) | histogram | | Congestion Control Window Size. | +| reordering | tcp_windows(0x2) | histogram | | Segments to be reordered. | +| rcv_rtt | tcp_rtt(0x4) | histogram | us | Receive end RTT (unidirectional measurement). | +| notsent_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not sent currently. | +| notack_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not ack currently. | +| snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP send window. | +| rcv_wnd | tcp_windows(0x2) | histogram | | Size of TCP receive window. | +| zero_snd_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of times of sending window 0 to the number of sent bytes | +| zero_rcv_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of receive window 0 windows to the number of received bytes | +| avl_snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP available send window. | +| syn_srtt | tcp_srtt | histogram | us | RTT of syn packet(us). | +| syn_srtt_max | tcp_srtt | Gauge | us | RTT of syn packet(us). | +| sk_rcvbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the RX buffer. | +| sk_sndbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the TX buffer. 
| + +## 应用性能 + +### 基于流的进程性能 + +实体名:proc_flow_perf + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------- | -------------- | ------------ | ---- | ----------------------------------------------------------- | ------- | +| tgid | | key | | Process ID | | +| remote_ip | | key | | 对端IP地址 | | +| port | | key | | 客户端:对端Port;服务端:本地Port; | | +| role | | key | | 客户端/服务端 | | +| tx_delay | proc_flow_perf | histogram | us | Delay in the Tx direction of the application TCP link. | TO BE | +| rx_delay | proc_flow_perf | histogram | us | Delay in the Rx direction of the application TCP link. | TO BE | +| tx_throughput | proc_flow_perf | histogram | bps | Throughput in the Tx direction of the application TCP link. | TO BE | +| rx_throughput | proc_flow_perf | histogram | bps | Throughput in the Rx direction of the application TCP link. | TO BE | + +### 进程性能 + +实体名:proc_perf + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------- | ---------- | ------------ | ---- | ----------------------------------- | ------- | +| tgid | | key | | Process ID | | +| tx_delay | proc_perf | histogram | us | TCP delay in the Tx direction. | TO BE | +| rx_delay | proc_perf | histogram | us | TCP delay in the Rx direction. | TO BE | +| tx_throughput | proc_perf | histogram | bps | TCP throughput in the Tx direction. | TO BE | +| rx_throughput | proc_perf | histogram | bps | TCP throughput in the Rx direction. 
| TO BE | + + + +## I/O性能 + +实体名:proc + +| metrics_name | table_name | metrics_type | unit | metrics description | +| --------------------- | ------------------ | ------------ | ---- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| ppid | system_proc | label | | 父进程ID | +| pgid | system_proc | label | | 进程组ID | +| comm | | label | | 执行程序名称 | +| cmdline | system_proc | label | | 执行程序命令(包括配置) | +| fd_count | system_proc | Gauge | | 进程文件句柄 | +| fd_free_per | system_proc | Gauge | | 进程剩余FD资源占比% | +| rchar_bytes | system_proc | Gauge | | 进程系统调用至FS的读字节数 | +| wchar_bytes | system_proc | Gauge | | 进程系统调用至FS的写字节数 | +| syscr_count | system_proc | Gauge | | 进程read()/pread()执行次数 | +| syscw_count | system_proc | Gauge | | 进程write()/pwrite()执行次数 | +| read_bytes | system_proc | Gauge | | 进程实际从磁盘读取的字节数 | +| write_bytes | system_proc | Gauge | | 进程实际向磁盘写入的字节数 (page cache情况下,该字段仅表示设置dirty page的size) | +| cancelled_write_bytes | system_proc | Gauge | | 参考proc_write_bytes,因为存在page cache 如果write操作结束后,又发生文件被删除事件,会导致dirty page并未写入磁盘,所以存在取消写的字节数统计 | +| ns_ext4_read | proc_ext4(0x20) | Gauge | ns | ext4文件系统读操作时间,单位ns | +| ns_ext4_write | proc_ext4(0x20) | Gauge | ns | ext4文件系统写操作时间,单位ns | +| ns_ext4_flush | proc_ext4(0x20) | Gauge | ns | ext4文件系统flush操作时间,单位ns | +| ns_ext4_open | proc_ext4(0x20) | Gauge | ns | ext4文件系统open操作时间,单位ns | +| ns_overlay_read | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统读操作时间,单位ns | +| ns_overlay_write | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统写操作时间,单位ns | +| ns_overlay_flush | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统flush操作时间,单位ns | +| ns_overlay_open | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统open操作时间,单位ns | +| ns_tmpfs_read | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统读操作时间,单位ns | +| ns_tmpfs_write | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统写操作时间,单位ns | +| ns_tmpfs_flush | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统flush操作时间,单位ns | +| less_4k_io_read | proc_io(0x400) | Gauge | | Number of small
I/O (less than 4 KB) read operations at the BIO layer. | +| less_4k_io_write | proc_io(0x400) | Gauge | | Number of small I/O (less than 4 KB) write operations at the BIO layer. | +| greater_4k_io_read | proc_io(0x400) | Gauge | | Number of big I/O (greater than 4 KB) read operations at the BIO layer. | +| greater_4k_io_write | proc_io(0x400) | Gauge | | Number of big I/O (greater than 4 KB) write operations at the BIO layer. | +| bio_latency | proc_io(0x400) | Gauge | ns | I/O operation delay at the BIO layer (unit: us). (备注:虚拟化场景针对qemu进程才有意义) | +| bio_err_count | proc_io(0x400) | Gauge | | Number of I/O operation failures at the BIO layer.(备注:虚拟化场景针对qemu进程才有意义) | +| hang_count | proc_io(0x400) | Gauge | | Number of process hang times. | +| iowait_us | proc_io(0x400) | Gauge | us | Process IO_wait time (unit: us). | + +## 内存 + +实体名:proc + +| metrics_name | table_name | metrics_type | unit | metrics description | +| --------------------- | ----------- | ------------ | ---- | --------------------------------------- | +| tgid | | key | | 进程ID | +| ppid | system_proc | label | | 父进程ID | +| pgid | system_proc | label | | 进程组ID | +| comm | | label | | 执行程序名称 | +| cmdline | system_proc | label | | 执行程序命令(包括配置) | +| shared_dirty_size | system_proc | Gauge | | 进程共享属性的dirty page size | +| shared_clean_size | system_proc | Gauge | | 进程共享属性的clean page size | +| private_dirty_size | system_proc | Gauge | | 进程私有属性的dirty page size | +| private_clean_size | system_proc | Gauge | | 进程私有属性的clean page size | +| referenced_size | system_proc | Gauge | | 进程当前已引用的page size | +| lazyfree_size | system_proc | Gauge | | 进程延迟释放内存的size | +| swap_data_size | system_proc | Gauge | | 进程swap区间数据size | +| swap_data_pss_size | system_proc | Gauge | | 进程物理内存swap区间数据size | +| minor pagefault_count | system_proc | Gauge | | 进程轻微pagefault次数(无需从磁盘拷贝) | +| major pagefault_count | system_proc | Gauge | | 进程严重pagefault次数(需从磁盘拷贝) | +| vm_size | system_proc | Gauge | | 进程当前虚拟地址空间大小 | +| pm_size | 
system_proc | Gauge | | 进程当前物理地址空间大小 | + +## 调度&系统调用 + +实体名:proc + +| metrics_name | table_name | metrics_type | unit | metrics description | +| -------------- | ------------------------ | ------------ | ---- | ----------------------------------- | +| tgid | | key | | 进程ID | +| ppid | system_proc | label | | 父进程ID | +| pgid | system_proc | label | | 进程组ID | +| comm | | label | | 执行程序名称 | +| cmdline | system_proc | label | | 执行程序命令(包括配置) | +| utime_jiffies | system_proc | Gauge | | 进程用户运行时间 | +| stime_jiffies | system_proc | Gauge | | 进程系统态运行时间 | +| ns_mount | proc_syscall_io(0x02) | Gauge | ns | 进程系统调用mount时长,单位ns | +| ns_umount | proc_syscall_io(0x02) | Gauge | ns | 进程系统调用umount时长,单位ns | +| ns_read | proc_syscall_io(0x02) | Gauge | ns | 进程系统调用read时长,单位ns | +| ns_write | proc_syscall_io(0x02) | Gauge | ns | 进程系统调用write时长,单位ns | +| ns_fsync | proc_syscall_io(0x02) | Gauge | ns | 进程系统调用fsync时长,单位ns | +| ns_sendmsg | proc_syscall_net(0x04) | Gauge | ns | 进程系统调用sendmsg时长,单位ns | +| ns_recvmsg | proc_syscall_net(0x04) | Gauge | ns | 进程系统调用recvmsg时长,单位ns | +| ns_sched_yield | proc_syscall_sched(0x08) | Gauge | ns | 进程系统调用sched_yield时长,单位ns | +| ns_futex | proc_syscall_sched(0x08) | Gauge | ns | 进程系统调用futex时长,单位ns | +| ns_epoll_wait | proc_syscall_sched(0x08) | Gauge | ns | 进程系统调用epoll_wait时长,单位ns | +| ns_epoll_pwait | proc_syscall_sched(0x08) | Gauge | ns | 进程系统调用epoll_pwait时长,单位ns | +| ns_fork | proc_syscall_fork(0x10) | Gauge | ns | 进程系统调用fork时长,单位ns | +| ns_vfork | proc_syscall_fork(0x10) | Gauge | ns | 进程系统调用vfork时长,单位ns | +| ns_clone | proc_syscall_fork(0x10) | Gauge | ns | 进程系统调用clone时长,单位ns | +| syscall_failed | proc_syscall (0x01) | Gauge | | 进程系统调用失败次数 | + +## JVM监控 + +实体名:jvm + +| metrics_name | table_name | metrics_type | unit | metrics description | +| -------------------------- | ------------ | ------------ | ----- | ----------------------------------------- | +| tgid | | key | | Java 虚拟机的进程ID | +| runtime | jvm_info | label | | JVM 运行时信息 | +| vendor | 
jvm_info | label | | JVM 创建者/维护者 | +| version | jvm_info | label | | JVM 版本 | +| info | jvm_info | gauge | | 固定值1 | +| proc_start_time_secs | jvm_proc | gauge | s | 进程起始时间 | +| proc_cpu_secs_total | jvm_proc | counter | s | 进程已使用的CPU时间 | +| class_current_loaded | jvm_class | gauge | | JVM当前已加载类的数量 | +| class_loaded_total | jvm_class | counter | | JVM自执行以来加载的类的总数量 | +| threads_current | jvm_thread | gauge | | JVM当前线程数 | +| threads_daemon | jvm_thread | gauge | | JVM的守护线程数 | +| threads_peak | jvm_thread | gauge | | JVM的峰值线程数 | +| threads_started_total | jvm_thread | counter | | JVM的已启动线程数 | +| threads_deadlocked | jvm_thread | gauge | | JVM的死锁的线程数 | +| area | jvm_mem | label | | JVM内存类型:heap/noheap | +| mem_bytes_used | jvm_mem | gauge | bytes | 给定JVM内存区域的已使用字节数 | +| mem_bytes_commit | jvm_mem | gauge | bytes | 给定JVM内存区域的已提交字节数 | +| mem_bytes_max | jvm_mem | gauge | bytes | 给定JVM内存区域的最大字节数 | +| mem_bytes_init | jvm_mem | gauge | bytes | 给定JVM内存区域的初始字节数 | +| pool | jvm_mem_pool | label | | 内存池类型 | +| mem_pool_bytes_used | jvm_mem_pool | gauge | bytes | 给定JVM内存池的已使用字节数 | +| mem_pool_bytes_commit | jvm_mem_pool | gauge | bytes | 给定JVM内存池的已提交字节数 | +| mem_pool_bytes_max | jvm_mem_pool | gauge | bytes | 给定JVM内存池的最大字节数 | +| mem_pool_coll_used_bytes | jvm_mem_pool | gauge | bytes | 给定JVM内存池最后一次垃圾回收使用的字节数 | +| mem_pool_coll_commit_bytes | jvm_mem_pool | gauge | bytes | 上一次GC内存池的大小 | +| mem_pool_coll_max_bytes | jvm_mem_pool | gauge | bytes | 上一次GC内存池的最大字节数 | +| pool | jvm_buf_pool | label | | 缓冲池类型 | +| buffer_pool_used_bytes | jvm_buf_pool | gauge | bytes | 给定JVM缓冲池的已用字节数 | +| buffer_pool_used_buffers | jvm_buf_pool | gauge | | 给定JVM缓冲池的已用缓冲区数 | +| buffer_pool_capacity_bytes | jvm_buf_pool | gauge | bytes | 给定JVM缓冲池的字节容量 | +| gc | jvm_gc | label | | 垃圾回收器名字 | +| gc_coll_secs_count | jvm_gc | summary | | 给定的垃圾回收器已发生的GC总次数 | +| gc_coll_secs_sum | jvm_gc | summary | s | 在给定的垃圾回收器花费的总时间 | + +# Kafka监控 + +## Topic流监控 + +实体名:kafka_topic_flow + +| metrics_name | table_name | 
metrics_type | unit | metrics description | Support | +| ------------ | ---------------- | ------------ | ---- | ---------------------------------------------------- | -------------- | +| msg_type | kafka_topic_flow | key | | 访问类型,producer或consumer | 需要修改实体名 | +| client_ip | kafka_topic_flow | key | | 客户端IP | | +| num | kafka_topic_flow | gauge | | 在一次采样周期中producer发布或consumer消费的消息数量 | | +| topic | kafka_topic_flow | key | | 消息的topic | | +| server_ip | kafka_topic_flow | key | | kafka server所在主机的网卡IP | | +| server_port | kafka_topic_flow | key | | kafka server所绑定的端口号 | | + +## Topic性能监控 + +实体名:kafka_topic_metrics + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ------------------- | ------------ | ---- | ---------------------------- | ------- | +| topic | kafka_topic_metrics | key | | 消息的topic | TO BE | +| server_ip | kafka_topic_metrics | key | | kafka server所在主机的网卡IP | TO BE | +| server_port | kafka_topic_metrics | key | | kafka server所绑定的端口号 | TO BE | +| throughput | kafka_topic_metrics | histogram | | topic 吞吐量 | TO BE | + +# Nginx/Haproxy监控 + +## Nginx 负载分担监控 + +实体名:nginx_link + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ---------- | ------------ | ---- | ------------------- | -------------------------- | +| client_ip | nginx_link | key | | 客户端IP | 当前仅支持nginx 1.12.1版本 | +| virtual_ip | nginx_link | key | | 虚拟服务器IP | | +| server_ip | nginx_link | key | | 真实服务端IP | | +| virtual_port | nginx_link | key | | 虚拟服务器端口 | | +| server_port | nginx_link | key | | 真实服务端端口 | | +| is_l7 | nginx_link | label | | 1—七层LB / 0—四层LB | | +| link_count | nginx_link | gauge | | 连接数 | | + +## Haproxy负载分担监控 + +实体名:haproxy_link + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ------------ | ------------ | ---- | ------------------- | ------------------------------ | +| client_ip | haproxy_link | key | | 客户端IP | 
当前仅支持haproxy 2.5-dev0版本 | +| virtual_ip | haproxy_link | key | | 虚拟服务器IP | | +| server_ip | haproxy_link | key | | 真实服务端IP | | +| virtual_port | haproxy_link | key | | 虚拟服务器端口 | | +| server_port | haproxy_link | key | | 真实服务端端口 | | +| protocol | haproxy_link | label | | 协议类型(TCP/HTTP) | | +| link_count | haproxy_link | gauge | | 连接数 | | + +## TCP性能监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------ | ----------------- | ------------ | ----- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| rto | tcp_rate(0x20) | histogram | | Retransmission timeOut(us) | +| ato | tcp_rate(0x20) | histogram | | Estimated value of delayed ACK(us) | +| srtt | tcp_rtt(0x4) | histogram | us | Smoothed Round Trip Time(us). | +| snd_cwnd | tcp_windows(0x2) | histogram | | Congestion Control Window Size. | +| reordering | tcp_windows(0x2) | histogram | | Segments to be reordered. | +| rcv_rtt | tcp_rtt(0x4) | histogram | us | Receive end RTT (unidirectional measurement). | +| notsent_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not sent currently. | +| notack_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not ack currently. | +| snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP send window. | +| rcv_wnd | tcp_windows(0x2) | histogram | | Size of TCP receive window. | +| zero_snd_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of times of sending window 0 to the number of sent bytes | +| zero_rcv_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of receive window 0 windows to the number of received bytes | +| avl_snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP available send window. 
| +| syn_srtt | tcp_srtt | histogram | us | RTT of syn packet(us). | +| syn_srtt_max | tcp_srtt | Gauge | us | RTT of syn packet(us). | +| sk_rcvbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the RX buffer. | +| sk_sndbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the TX buffer. | + +## TCP异常监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ------------- | ------------ | ---- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| retran_packets | tcp_abn(0x01) | Gauge | | total number of retrans | +| retran_ratio | tcp_abn(0x01) | Gauge | | retran ratio | +| backlog_drops | tcp_abn(0x01) | Gauge | | drops caused by backlog queue full | +| sk_drops | tcp_abn(0x01) | Counter | | Number of lost packets in the TCP protocol stack | +| lost_out | tcp_abn(0x01) | Gauge | segs | Number of lost segments estimated by TCP congestion | +| sacked_out | tcp_abn(0x01) | Gauge | segs | Number of out-of-order TCP packets (SACK) or number of repeated TCP ACKs (NO SACK) | +| filter_drops | tcp_abn(0x01) | Gauge | | drops caused by socket filter | +| tmout_count | tcp_abn(0x01) | Gauge | | counter of tcp link timeout | +| snd_buf_limit_count | tcp_abn(0x01) | Gauge | | counter of limits when allocate wmem | +| rmem_scheduls | tcp_abn(0x01) | Gauge | | rmem is not enough | +| tcp_oom | tcp_abn(0x01) | Gauge | | tcp out of memory | +| send_rsts | tcp_abn(0x01) | Gauge | | send_rsts | +| receive_rsts | tcp_abn(0x01) | Gauge | | receive_rsts | + +## Socket监控 + +实体名:endpoint + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ---------- | ------------ | ---- | 
------------------------------------------------ | +| tgid | | key | | 进程ID | +| s_addr | | key | | udp/tcp 本地地址 | +| s_port | | key | | listen port(只有listen对象存在该label) | +| ep_type | | key | | listen/connect/udp/bind | +| listendrop | listen | Gauge | | TCP accept丢弃次数(只有listen对象存在) | +| accept_overflow | listen | Gauge | | TCP accept队列溢出次数 | +| syn_overflow | listen | Gauge | | TCP syn队列溢出次数 | +| passive_open | listen | Gauge | | tcp被动发起的建链次数(只有listen对象存在) | +| passive_open_failed | listen | Gauge | | tcp被动发起的建链失败次数(只有listen对象存在) | +| retran_synacks | listen | Gauge | | tcp synack重传报文数 | +| lost_synacks | listen | Gauge | | TCP synack报文丢失导致的建链失败次数 | +| active_open | connect | Gauge | | tcp主动发起的建链次数(只有connect对象存在) | +| active_open_failed | connect | Gauge | | tcp主动发起的建链失败次数(只有connect对象存在) | + + + +# Redis/PostgreSQL监控 + +## Redis性能监控 + +实体名:sli + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ------------- | ------------ | ---- | ------------------------------ | ---------------- | +| tgid | | key | | 进程ID | 仅支持非加密场景 | +| ins_id | | key | | 实例ID | | +| app | | key | | 应用名 | | +| method | | key | | 请求方法 | | +| server_ip | | label | | 服务端IP | | +| server_port | | label | | 服务端端口 | | +| client_ip | | label | | 客户端IP | | +| client_port | | label | | 客户端端口 | | +| rtt_nsec | redis_sli | gauge | ns | Redis协议请求RTT | | +| max_rtt_nsec | redis_max_sli | gauge | ns | Redis协议采样周期内最大请求RTT | | + +## PostgreSQL性能监控 + +实体名:sli + +| metrics_name | table_name | metrics_type | unit | metrics description | Support | +| ------------ | ---------- | ------------ | ---- | -------------------------------- | ------------------------------- | +| tgid | | key | | 进程ID | 支持加密场景,openssl 1.1.1版本 | +| ins_id | | key | | 实例ID | | +| app | | key | | 应用名 | | +| method | | key | | 请求方法 | | +| server_ip | | label | | 服务端IP | | +| server_port | | label | | 服务端端口 | | +| client_ip | | label | | 客户端IP | | +| client_port | | label | | 客户端端口 | | +| 
rtt_nsec | pg_sli | gauge | ns | PostgreSQL协议请求RTT | | +| max_rtt_nsec | pg_max_sli | gauge | ns | PostgreSQL协议采样周期内最大请求RTT | | +| tps | pg_tps | gauge | | 数据库吞吐量 | 仅支持openGauss 2.0 | + + + +## TCP性能监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------ | ----------------- | ------------ | ----- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| rto | tcp_rate(0x20) | histogram | | Retransmission timeOut(us) | +| ato | tcp_rate(0x20) | histogram | | Estimated value of delayed ACK(us) | +| srtt | tcp_rtt(0x4) | histogram | us | Smoothed Round Trip Time(us). | +| snd_cwnd | tcp_windows(0x2) | histogram | | Congestion Control Window Size. | +| reordering | tcp_windows(0x2) | histogram | | Segments to be reordered. | +| rcv_rtt | tcp_rtt(0x4) | histogram | us | Receive end RTT (unidirectional measurement). | +| notsent_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not sent currently. | +| notack_bytes | tcp_windows(0x2) | histogram | bytes | Number of bytes not ack currently. | +| snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP send window. | +| rcv_wnd | tcp_windows(0x2) | histogram | | Size of TCP receive window. | +| zero_snd_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of times of sending window 0 to the number of sent bytes | +| zero_rcv_wnd_ratio | tcp_windows(0x2) | Gauge | | Ratio of the number of receive window 0 windows to the number of received bytes | +| avl_snd_wnd | tcp_windows(0x2) | histogram | | Size of TCP available send window. | +| syn_srtt | tcp_srtt | histogram | us | RTT of syn packet(us). | +| syn_srtt_max | tcp_srtt | Gauge | us | RTT of syn packet(us). 
| +| sk_rcvbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the RX buffer. | +| sk_sndbuf | tcp_sockbuf(0x10) | histogram | bytes | Byte length of the TX buffer. | + +## TCP异常监控 + +实体名:tcp_link + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ------------- | ------------ | ---- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| role | | key | | 客户端/服务端 | +| client_ip | | key | | 客户端:本地IP;服务端:对端IP | +| server_ip | | key | | 客户端:对端IP;服务端:本地IP | +| server_port | | key | | 客户端:对端端口;服务端:本地端口 | +| protocol | | key | | 协议族(IPv4、IPv6) | +| retran_packets | tcp_abn(0x01) | Gauge | | total number of retrans | +| retran_ratio | tcp_abn(0x01) | Gauge | | retran ratio | +| backlog_drops | tcp_abn(0x01) | Gauge | | drops caused by backlog queue full | +| sk_drops | tcp_abn(0x01) | Counter | | Number of lost packets in the TCP protocol stack | +| lost_out | tcp_abn(0x01) | Gauge | segs | Number of lost segments estimated by TCP congestion | +| sacked_out | tcp_abn(0x01) | Gauge | segs | Number of out-of-order TCP packets (SACK) or number of repeated TCP ACKs (NO SACK) | +| filter_drops | tcp_abn(0x01) | Gauge | | drops caused by socket filter | +| tmout_count | tcp_abn(0x01) | Gauge | | counter of tcp link timeout | +| snd_buf_limit_count | tcp_abn(0x01) | Gauge | | counter of limits when allocate wmem | +| rmem_scheduls | tcp_abn(0x01) | Gauge | | rmem is not enough | +| tcp_oom | tcp_abn(0x01) | Gauge | | tcp out of memory | +| send_rsts | tcp_abn(0x01) | Gauge | | send_rsts | +| receive_rsts | tcp_abn(0x01) | Gauge | | receive_rsts | + +## Socket监控 + +实体名:endpoint + +| metrics_name | table_name | metrics_type | unit | metrics description | +| ------------------- | ---------- | ------------ | ---- | ------------------------------------------------ | +| tgid | | key | | 进程ID | +| s_addr | | key | | udp/tcp 本地地址 | +| s_port | | key | | listen port(只有listen对象存在该label) | +| 
ep_type | | key | | listen/connect/udp/bind | +| listendrop | listen | Gauge | | TCP accept丢弃次数(只有listen对象存在) | +| accept_overflow | listen | Gauge | | TCP accept队列溢出次数 | +| syn_overflow | listen | Gauge | | TCP syn队列溢出次数 | +| passive_open | listen | Gauge | | tcp被动发起的建链次数(只有listen对象存在) | +| passive_open_failed | listen | Gauge | | tcp被动发起的建链失败次数(只有listen对象存在) | +| retran_synacks | listen | Gauge | | tcp synack重传报文数 | +| lost_synacks | listen | Gauge | | TCP synack报文丢失导致的建链失败次数 | +| active_open | connect | Gauge | | tcp主动发起的建链次数(只有connect对象存在) | +| active_open_failed | connect | Gauge | | tcp主动发起的建链失败次数(只有connect对象存在) | + +## I/O性能 + +实体名:proc + +| metrics_name | table_name | metrics_type | unit | metrics description | +| --------------------- | ------------------ | ------------ | ---- | ------------------------------------------------------------ | +| tgid | | key | | 进程ID | +| ppid | system_proc | label | | 父进程ID | +| pgid | system_proc | label | | 进程组ID | +| comm | | label | | 执行程序名称 | +| cmdline | system_proc | label | | 执行程序命令(包括配置) | +| fd_count | system_proc | Gauge | | 进程文件句柄数量 | +| fd_free_per | system_proc | Gauge | | 进程剩余FD资源占比% | +| rchar_bytes | system_proc | Gauge | | 进程系统调用至FS的读字节数 | +| wchar_bytes | system_proc | Gauge | | 进程系统调用至FS的写字节数 | +| syscr_count | system_proc | Gauge | | 进程read()/pread()执行次数 | +| syscw_count | system_proc | Gauge | | 进程write()/pwrite()执行次数 | +| read_bytes | system_proc | Gauge | | 进程实际从磁盘读取的字节数 | +| write_bytes | system_proc | Gauge | | 进程实际向磁盘写入的字节数 (page cache情况下,该字段仅表示设置dirty page的size) | +| cancelled_write_bytes | system_proc | Gauge | | 参考proc_write_bytes,因为存在page cache,如果write操作结束后,又发生文件被删除事件,会导致dirty page并未写入磁盘,所以存在取消写的字节数统计 | +| ns_ext4_read | proc_ext4(0x20) | Gauge | ns | ext4文件系统读操作时间,单位ns | +| ns_ext4_write | proc_ext4(0x20) | Gauge | ns | ext4文件系统写操作时间,单位ns | +| ns_ext4_flush | proc_ext4(0x20) | Gauge | ns | ext4文件系统flush操作时间,单位ns | +| ns_ext4_open | proc_ext4(0x20) | Gauge | ns | ext4文件系统open操作时间,单位ns | +| 
ns_overlay_read | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统读操作时间,单位ns | +| ns_overlay_write | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统写操作时间,单位ns | +| ns_overlay_flush | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统flush操作时间,单位ns | +| ns_overlay_open | proc_overlay(0x40) | Gauge | ns | overlayfs文件系统open操作时间,单位ns | +| ns_tmpfs_read | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统读操作时间,单位ns | +| ns_tmpfs_write | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统写操作时间,单位ns | +| ns_tmpfs_flush | proc_tmpfs(0x80) | Gauge | ns | tmpfs文件系统flush操作时间,单位ns | +| less_4k_io_read | proc_io(0x400) | Gauge | | Number of small I/O (less than 4 KB) read operations at the BIO layer. | +| less_4k_io_write | proc_io(0x400) | Gauge | | Number of small I/O (less than 4 KB) write operations at the BIO layer. | +| greater_4k_io_read | proc_io(0x400) | Gauge | | Number of big I/O (greater than 4 KB) read operations at the BIO layer. | +| greater_4k_io_write | proc_io(0x400) | Gauge | | Number of big I/O (greater than 4 KB) write operations at the BIO layer. | +| hang_count | proc_io(0x400) | Gauge | | Number of process hang times. | +| iowait_us | proc_io(0x400) | Gauge | us | Process IO_wait time (unit: us). | + -- Gitee