From 221cfeed720227eae193b4832168fdcd598bc43f Mon Sep 17 00:00:00 2001 From: luzhihao Date: Thu, 20 Jul 2023 11:19:32 +0800 Subject: [PATCH] Upload l7 metrics and bugfix tcp metrics. --- gopher_tech.md | 51 +++++++++++++++++++++++++++++++++++-------------- io_diag.md | 2 +- network_diag.md | 2 +- 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/gopher_tech.md b/gopher_tech.md index a6b37ca..fb99b8c 100644 --- a/gopher_tech.md +++ b/gopher_tech.md @@ -11,22 +11,23 @@ | protocol | | key | | | 协议族(IPv4、IPv6) | | rx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | Y | rx bytes | | tx_bytes | tcp_tx_rx(0x8) | Gauge | bytes | Y | tx bytes | -| rto(P50/P90/P99) | tcp_rate(0x20) | Gauge | | | Retransmission timeOut(us) | -| ato(P50/P90/P99) | tcp_rate(0x20) | Gauge | | | Estimated value of delayed ACK(us) | -| srtt(P50/P90/P99) | tcp_rtt(0x4) | Gauge | us | Y | Smoothed Round Trip Time(us). | -| snd_cwnd(P50/P90/P99) | tcp_windows(0x2) | Gauge | | | Congestion Control Window Size. | -| reordering(P50/P90/P99) | tcp_windows(0x2) | Gauge | | | Segments to be reordered. | -| rcv_rtt(P50/P90/P99) | tcp_rtt(0x4) | Gauge | us | | Receive end RTT (unidirectional measurement). | -| notsent_bytes(P50/P90/P99) | tcp_windows(0x2) | Gauge | bytes | | Number of bytes not sent currently. | -| notack_bytes(P50/P90/P99) | tcp_windows(0x2) | Gauge | bytes | | Number of bytes not ack currently. | -| snd_wnd(P50/P90/P99) | tcp_windows(0x2) | Gauge | | | Size of TCP send window. | -| rcv_wnd(P50/P90/P99) | tcp_windows(0x2) | Gauge | | | Size of TCP receive window. | +| rto(P50/P90/P99) | tcp_rate(0x20) | histogram | | | Retransmission timeOut(us) | +| ato(P50/P90/P99) | tcp_rate(0x20) | histogram | | | Estimated value of delayed ACK(us) | +| srtt(P50/P90/P99) | tcp_rtt(0x4) | histogram | us | Y | Smoothed Round Trip Time(us). | +| snd_cwnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Congestion Control Window Size. 
| +| reordering(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Segments to be reordered. | +| rcv_rtt(P50/P90/P99) | tcp_rtt(0x4) | histogram | us | | Receive end RTT (unidirectional measurement). | +| notsent_bytes(P50/P90/P99) | tcp_windows(0x2) | histogram | bytes | | Number of bytes not sent currently. | +| notack_bytes(P50/P90/P99) | tcp_windows(0x2) | histogram | bytes | | Number of bytes not acked currently. | +| snd_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP send window. | +| rcv_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP receive window. | | zero_snd_wnd_ratio | tcp_windows(0x2) | Gauge | | | Ratio of the number of times of sending window 0 to the number of sent bytes | | zero_rcv_wnd_ratio | tcp_windows(0x2) | Gauge | | | Ratio of the number of receive window 0 windows to the number of received bytes | -| avl_snd_wnd(P50/P90/P99) | tcp_windows(0x2) | Gauge | | | Size of TCP available send window. | -| syn_srtt(P50/P90/P99) | tcp_srtt | Gauge | us | Y | RTT of syn packet(us). | -| sk_rcvbuf(P50/P90/P99) | tcp_sockbuf(0x10) | Gauge | bytes | | Byte length of the RX buffer. | -| sk_sndbuf(P50/P90/P99) | tcp_sockbuf(0x10) | Gauge | bytes | | Byte length of the TX buffer. | +| avl_snd_wnd(P50/P90/P99) | tcp_windows(0x2) | histogram | | | Size of TCP available send window. | +| syn_srtt(P50/P90/P99) | tcp_srtt | histogram | us | | RTT of syn packet(us). | +| syn_srtt_max | tcp_srtt | Gauge | us | Y | Maximum RTT of syn packet(us). | +| sk_rcvbuf(P50/P90/P99) | tcp_sockbuf(0x10) | histogram | bytes | | Byte length of the RX buffer. | +| sk_sndbuf(P50/P90/P99) | tcp_sockbuf(0x10) | histogram | bytes | | Byte length of the TX buffer. 
| | segs_in | tcp_tx_rx(0x8) | Counter | segs | | total number of segments received | | segs_out | tcp_tx_rx(0x8) | Counter | segs | | total number of segments sent | | retran_packets | tcp_abn(0x01) | Gauge | | Y | total number of retrans | @@ -533,3 +534,25 @@ | topic | kafkaprobe | key | | | 消息的topic | | server_ip | kafkaprobe | key | | | kafka server所在主机的网卡IP | | server_port | kafkaprobe | key | | | kafka server所绑定的端口号 | + +# L7层会话(entity_name: l7) + +| metrics_name | table_name | metrics_type | unit | KPI | metrics description | +| --------------- | ---------- | ------------ | ---- | ---- | ------------------------------------------------------ | +| tgid | | key | | | Process ID of l7 session. | +| remote_ip | | key | | | Remote IP address of l7 session. | +| remote_port | | key | | | Remote port of l7 session. | +| l4_role | | key | | | Role of l4 protocol(TCP Client/Server or UDP) | +| l7_role | | key | | | Role of l7 protocol(Client or Server) | +| protocol | | key | | | Name of l7 protocol(http/http2/mysql...) | +| pod_ip | | label | | | IP address of the pod to which the l7 session belongs. | +| ssl | | label | | | Indicates whether an SSL-encrypted l7 session is used. | +| bytes_sent | l7_link | gauge | | | Number of bytes sent by an l7 session. | +| bytes_recv | l7_link | gauge | | | Number of bytes received by an l7 session. | +| throughput_req | l7_rpc | histogram | qps | | Request throughput of l7 session. | +| throughput_resp | l7_rpc | histogram | qps | | Response throughput of l7 session. | +| latency_avg | l7_rpc | gauge | ns | | Average l7 session latency. | +| latency_p50 | l7_rpc | histogram | ns | | L7 session P50 latency. | +| latency_p90 | l7_rpc | histogram | ns | | L7 session P90 latency. | +| latency_p99 | l7_rpc | histogram | ns | | L7 session P99 latency. | +| err_ratio | l7_rpc | gauge | % | | L7 session error rate. 
| diff --git a/io_diag.md b/io_diag.md index 6577ddc..a908b2e 100644 --- a/io_diag.md +++ b/io_diag.md @@ -12,7 +12,7 @@ I/O问题按照场景划分成虚拟化存储、裸机本地存储、GuestOS等 可观测的诊断过程依赖运维人员对系统、业务熟悉,并具备常见问题的判断能力,gala系统的可观测是辅助运维人员更好的定位、决策整个诊断过程。 -但是其存在问题诊断过程清晰、可回溯,可信度高的优点。下面我们介绍通常的定位过程: +其具备问题诊断过程清晰、可回溯,可信度高的优点。下面我们介绍通常的定位过程: - **问题1**:在虚拟化存储场景中,Qemu进程(VM)之间会共享访问虚拟化存储,现有运维体系中通常只具备磁盘的监控,缺乏VM粒度(实际就是Qemu进程粒度)的监控能力。gala提供VM粒度的I/O访问时延、错误。 diff --git a/network_diag.md b/network_diag.md index ff6726b..89d802a 100644 --- a/network_diag.md +++ b/network_diag.md @@ -15,7 +15,7 @@ 可观测的诊断过程依赖运维人员对系统、业务熟悉,并具备常见问题的判断能力,gala系统的可观测是辅助运维人员更好的定位、决策整个诊断过程。 -但是其存在问题诊断过程清晰、可回溯,可信度高的优点。下面我们介绍通常的定位过程: +其具备问题诊断过程清晰、可回溯,可信度高的优点。下面我们介绍通常的定位过程: - **问题发现**:容器内应用访问吞吐量、时延下降的情况,可以查看Prometheus metrics获取信息如下: -- Gitee