diff --git a/docs/en/server/_toc.yaml b/docs/en/server/_toc.yaml index 4b7e497be9c8f0c3a563dcc298b36a279e61ab8b..cd574cd5800b5d91356b5efb5f3344bee06ef0d3 100644 --- a/docs/en/server/_toc.yaml +++ b/docs/en/server/_toc.yaml @@ -1,10 +1,8 @@ label: Server sections: - - label: Release Notes + - label: Getting Started sections: - href: ./quickstart/releasenotes/_toc.yaml - - label: Quick Start - sections: - href: ./quickstart/quickstart/_toc.yaml - label: Installation and Upgrade sections: diff --git a/docs/en/server/maintenance/gala/_toc.yaml b/docs/en/server/maintenance/gala/_toc.yaml deleted file mode 100644 index ce9b28a79e01eb70bd2c1099e6859f3c8f10d2ce..0000000000000000000000000000000000000000 --- a/docs/en/server/maintenance/gala/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: gala_anteater User Guide -isManual: true -description: Smart fault detection, performance data gathering and analysis, and resource monitoring and management -sections: - - label: gala_anteater User Guide - href: ./using_gala_anteater.md - - label: gala_gopher User Guide - href: ./using_gala_gopher.md - - label: gala_spider User Guide - href: ./using_gala_spider.md diff --git a/docs/en/server/maintenance/gala/figures/attach-process.png b/docs/en/server/maintenance/gala/figures/attach-process.png deleted file mode 100644 index f76e8f4513cb45fbece12e6237039c41786b0467..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/attach-process.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/deadlock.png b/docs/en/server/maintenance/gala/figures/deadlock.png deleted file mode 100644 index d4f863a1a87d7aad3128481c763ee715aefd0a9f..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/deadlock.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/deadlock2.png b/docs/en/server/maintenance/gala/figures/deadlock2.png deleted file mode 100644 index 
3be42a5a34f90c2f3b351c7077635c580ea847a7..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/deadlock2.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/deadlock3.png b/docs/en/server/maintenance/gala/figures/deadlock3.png deleted file mode 100644 index 5ef1a08394daf6433e10f85a5b3c57df25c3e303..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/deadlock3.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/flame_muti_ins.png b/docs/en/server/maintenance/gala/figures/flame_muti_ins.png deleted file mode 100644 index 5943c7fda223a7fde4d2987ad56af4ffa776bd81..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/flame_muti_ins.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/gala-gopher-start-success.png b/docs/en/server/maintenance/gala/figures/gala-gopher-start-success.png deleted file mode 100644 index ab16e9d3661db3fd4adc6c605b2d2d08e79fdc1c..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/gala-gopher-start-success.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/gala-spider-arch.png b/docs/en/server/maintenance/gala/figures/gala-spider-arch.png deleted file mode 100644 index c5a0768be63a98ef7ccc4a56996a8c715f7090af..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/gala-spider-arch.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/gopher-arch.png b/docs/en/server/maintenance/gala/figures/gopher-arch.png deleted file mode 100644 index f151965a21d11dd7a3e215cc4ef23d70d059f4b1..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/gopher-arch.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete1.png 
b/docs/en/server/maintenance/gala/figures/lockcompete1.png deleted file mode 100644 index 5848b114e02d09f23303da8cff7aef56216f655f..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete1.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete2.png b/docs/en/server/maintenance/gala/figures/lockcompete2.png deleted file mode 100644 index ed02a882a145dafeafb76469f328085edecc6775..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete2.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete3.png b/docs/en/server/maintenance/gala/figures/lockcompete3.png deleted file mode 100644 index 3992edc5b7ea61d8a2aa08ce47f0876b7d2e8cf3..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete3.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete4.png b/docs/en/server/maintenance/gala/figures/lockcompete4.png deleted file mode 100644 index 049ac49bcc1fb71ea9fe6866bd27e84d0acf42b1..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete4.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete5.png b/docs/en/server/maintenance/gala/figures/lockcompete5.png deleted file mode 100644 index 8b5cf5aaef43f125abdf3adb8a7f798dd2c86b54..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete5.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/lockcompete6.png b/docs/en/server/maintenance/gala/figures/lockcompete6.png deleted file mode 100644 index c3b1f5f097b9e9bcabf75229eabc6ce8fe126a71..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/lockcompete6.png and /dev/null differ diff --git 
a/docs/en/server/maintenance/gala/figures/spider_topology.png b/docs/en/server/maintenance/gala/figures/spider_topology.png deleted file mode 100644 index 5823a116f384801e1197350f151b4d04ef519ac4..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/spider_topology.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/tprofiling-dashboard-detail.png b/docs/en/server/maintenance/gala/figures/tprofiling-dashboard-detail.png deleted file mode 100644 index 2093808bc4e1654956f6143393757c1244f08f98..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/tprofiling-dashboard-detail.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/tprofiling-dashboard.png b/docs/en/server/maintenance/gala/figures/tprofiling-dashboard.png deleted file mode 100644 index 15f4917f5a0bfcf5dee1f8fe68e65635ffebd85e..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/tprofiling-dashboard.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/figures/tprofiling-run-arch.png b/docs/en/server/maintenance/gala/figures/tprofiling-run-arch.png deleted file mode 100644 index 0ad835125a5e7b7f66938543de1e1c9d53706ce4..0000000000000000000000000000000000000000 Binary files a/docs/en/server/maintenance/gala/figures/tprofiling-run-arch.png and /dev/null differ diff --git a/docs/en/server/maintenance/gala/using_gala_anteater.md b/docs/en/server/maintenance/gala/using_gala_anteater.md deleted file mode 100644 index 178759af8813f271854e8ed1eea7bcf2e879c58a..0000000000000000000000000000000000000000 --- a/docs/en/server/maintenance/gala/using_gala_anteater.md +++ /dev/null @@ -1,391 +0,0 @@ -# Using gala-anteater - -gala-anteater is an AI-based operating system exception detection platform. It provides functions such as time series data preprocessing, exception detection, and exception reporting. 
Based on offline pre-training, online model incremental learning and model update, it can be well adapted to multi-dimensional and multi-modal data fault diagnosis. - -This chapter describes how to deploy and use the gala-anteater service. - -## Installation - -Mount the repositories. - -```conf -[everything] -name=everything -baseurl=http://121.36.84.172/dailybuild/EBS-openEuler-24.09/EBS-openEuler-24.09/everything/$basearch/ -enabled=1 -gpgcheck=0 -priority=1 - -[EPOL] -name=EPOL -baseurl=http://repo.openeuler.org/openEuler-22.03-LTS-SP4/EPOL/main/$basearch/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -Install gala-anteater. - -```bash -yum install gala-anteater -``` - -## Configuration - -> Note: gala-anteater uses a configuration file (**/etc/gala-anteater/config/gala-anteater.yaml**) for its startup settings. - -### Configuration Parameters - -```yaml -Global: - data_source: "prometheus" - -Arangodb: - url: "http://localhost:8529" - db_name: "spider" - -Kafka: - server: "192.168.122.100" - port: "9092" - model_topic: "gala_anteater_hybrid_model" - rca_topic: "gala_cause_inference" - meta_topic: "gala_gopher_metadata" - group_id: "gala_anteater_kafka" - # auth_type: plaintext/sasl_plaintext, please set "" for no auth - auth_type: "" - username: "" - password: "" - -Prometheus: - server: "localhost" - port: "9090" - steps: "5" - -Aom: - base_url: "" - project_id: "" - auth_type: "token" - auth_info: - iam_server: "" - iam_domain: "" - iam_user_name: "" - iam_password: "" - ssl_verify: 0 - -Schedule: - duration: 1 -``` - -| Parameter | Description | Default Value | -| ----------- | --------------------------------------------------------------------------------------------- | ---------------------------- | -| Global | | | -| data_source | Data source | "prometheus" | -| Arangodb | | | -| url | IP address of the ArangoDB graph database | "" | -| db_name | Name of the ArangoDB database | "spider" | -| Kafka | | | -| server | IP address of the Kafka server. 
Configure according to the installation node IP address. | | -| port | Port of the Kafka server (for example, 9092) | | -| model_topic | Topic for reporting fault detection results | "gala_anteater_hybrid_model" | -| rca_topic | Topic for reporting root cause analysis results | "gala_cause_inference" | -| meta_topic | Topic for gopher to collect metric data | "gala_gopher_metadata" | -| group_id | Kafka group ID | "gala_anteater_kafka" | -| Prometheus | | | -| server | IP address of the Prometheus server. Configure according to the installation node IP address. | | -| port | Port of the Prometheus server (for example, 9090) | | -| steps | Metric sampling interval | | -| Schedule | | | -| duration | Interval (in minutes) between anomaly detection model executions | 1 | - -## Start - -Start gala-anteater. - -```bash -systemctl start gala-anteater -``` - -### Fault Injection - -gala-anteater is a fault detection and root cause locating module. In the testing phase, you need to inject faults to construct fault scenarios. This allows gala-anteater to obtain information about faulty nodes and the root cause nodes of fault propagation. - -- Fault injection (for reference only) - - ```bash - chaosblade create disk burn --size 10 --read --write --path /var/lib/docker/overlay2/cf0a469be8a84cabe1d057216505f8d64735e9c63159e170743353a208f6c268/merged --timeout 120 - ``` - - ChaosBlade is a fault injection tool that can simulate various faults, including but not limited to drive faults, network faults, and I/O faults. - Note: Injecting different faults will cause corresponding fluctuations in related metrics monitored and reported to the Prometheus module by metric collectors (such as gala-gopher). These fluctuations will be visible in the Prometheus graph. - -### gala-anteater Service Status Query - -If the following information is displayed, the service is started successfully. The startup log is saved to the **logs/anteater.log** file in the current running directory. 
- -```log -2022-09-01 17:52:54,435 - root - INFO - Run gala_anteater main function... -2022-09-01 17:52:54,436 - root - INFO - Start to try updating global configurations by querying data from Kafka! -2022-09-01 17:52:54,994 - root - INFO - Loads metric and operators from file: xxx\metrics.csv -2022-09-01 17:52:54,997 - root - INFO - Loads metric and operators from file: xxx\metrics.csv -2022-09-01 17:52:54,998 - root - INFO - Start to re-train the model based on last day metrics dataset! -2022-09-01 17:52:54,998 - root - INFO - Get training data during 2022-08-31 17:52:00+08:00 to 2022-09-01 17:52:00+08:00! -2022-09-01 17:53:06,994 - root - INFO - Spends: 11.995422840118408 seconds to get unique machine_ids! -2022-09-01 17:53:06,995 - root - INFO - The number of unique machine ids is: 1! -2022-09-01 17:53:06,996 - root - INFO - Fetch metric values from machine: xxxx. -2022-09-01 17:53:38,385 - root - INFO - Spends: 31.3896164894104 seconds to get get all metric values! -2022-09-01 17:53:38,392 - root - INFO - The shape of training data: (17281, 136) -2022-09-01 17:53:38,444 - root - INFO - Start to execute vae model training... -2022-09-01 17:53:38,456 - root - INFO - Using cpu device -2022-09-01 17:53:38,658 - root - INFO - Epoch(s): 0 train Loss: 136.68 validate Loss: 117.00 -2022-09-01 17:53:38,852 - root - INFO - Epoch(s): 1 train Loss: 113.73 validate Loss: 110.05 -2022-09-01 17:53:39,044 - root - INFO - Epoch(s): 2 train Loss: 110.60 validate Loss: 108.76 -2022-09-01 17:53:39,235 - root - INFO - Epoch(s): 3 train Loss: 109.39 validate Loss: 106.93 -2022-09-01 17:53:39,419 - root - INFO - Epoch(s): 4 train Loss: 106.48 validate Loss: 103.37 -... -2022-09-01 17:53:57,744 - root - INFO - Epoch(s): 98 train Loss: 97.63 validate Loss: 96.76 -2022-09-01 17:53:57,945 - root - INFO - Epoch(s): 99 train Loss: 97.75 validate Loss: 96.58 -2022-09-01 17:53:57,969 - root - INFO - Schedule recurrent job with time interval 1 minute(s). 
-2022-09-01 17:53:57,973 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts -2022-09-01 17:53:57,974 - apscheduler.scheduler - INFO - Added job "partial" to job store "default" -2022-09-01 17:53:57,974 - apscheduler.scheduler - INFO - Scheduler started -2022-09-01 17:53:57,975 - apscheduler.scheduler - DEBUG - Looking for jobs to run -2022-09-01 17:53:57,975 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2022-09-01 17:54:57.973533+08:00 (in 59.998006 seconds) -``` - -## Output Data of Fault Detection - -If gala-anteater detects an exception, it sends the result to `model_topic` of Kafka. The output data format is as follows: - -```json -{ - "Timestamp":1659075600000, - "Attributes":{ - "entity_id":"xxxxxx_sli_1513_18", - "event_id":"1659075600000_1fd37742xxxx_sli_1513_18", - "event_type":"app" - }, - "Resource":{ - "anomaly_score":1.0, - "anomaly_count":13, - "total_count":13, - "duration":60, - "anomaly_ratio":1.0, - "metric_label":{ - "machine_id":"1fd37742xxxx", - "tgid":"1513", - "conn_fd":"18" - }, - "recommend_metrics":{ - "gala_gopher_tcp_link_notack_bytes":{ - "label":{ - "__name__":"gala_gopher_tcp_link_notack_bytes", - "client_ip":"x.x.x.165", - "client_port":"51352", - "hostname":"localhost.localdomain", - "instance":"x.x.x.172:8888", - "job":"prometheus-x.x.x.172", - "machine_id":"xxxxxx", - "protocol":"2", - "role":"0", - "server_ip":"x.x.x.172", - "server_port":"8888", - "tgid":"3381701" - }, - "score":0.24421279500639545 - }, - ... - }, - "metrics":"gala_gopher_ksliprobe_recent_rtt_nsec" - }, - "SeverityText":"WARN", - "SeverityNumber":14, - "Body":"TimeStamp, WARN, APP may be impacting sli performance issues." -} -``` - -## Output Data of Root Cause Locating - -Each faulty node detected triggers root cause locating. Results of root cause locating are sent to `rca_topic` of Kafka. 
The output data format is as follows: - -```yaml -{ - "Timestamp": 1724287883452, - "event_id": "1721125159975_475ae627-7e88-41ed-8bb8-ff0fee95a69d_l7_3459438_192.168.11.103_192.168.11.102_26_tcp_server_server_http", - "Attributes": { - "event_id": "1721125159975_475ae627-7e88-41ed-8bb8-ff0fee95a69d_l7_3459438_192.168.11.103_192.168.11.102_26_tcp_server_server_http", - "event_source": "root-cause-inference" - }, - "Resource": { - "abnormal_kpi": { - "metric_id": "gala_gopher_l7_latency_sum", - "entity_id": "", - "metric_labels": { - "client_ip": "192.168.11.103", - "comm": "python", - "container_id": "83d0c2f4a7f4", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-01-5bcb47fd7c-4jxxs_default_475ae627", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "l4_role": "tcp_server", - "l7_role": "server", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod_namespace": "default", - "protocol": "http", - "server_ip": "192.168.11.102", - "server_port": "26", - "ssl": "no_ssl", - "tgid": "3459438" - }, - "desc": "L7 session averaged latency.", - "score": 0.3498585816683402 - }, - "cause_metrics": [ - { - "metric_id": "gala_gopher_container_cpu_user_seconds_total@4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "entity_id": "", - "metric_labels": { - "container_id": "1319ff912a6f", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node3-02-654dd97bf9-s8jg5_default_4a9fcc23", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "machine_id": "494a61be-23cc-4c97-a871-902866e43747-192.168.122.103", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod_namespace": "default" - }, - "desc": "\u5bb9\u56681s\u5185\u7528\u6237\u6001CPU\u8d1f\u8f7d", - "keyword": "process", - "score": 0.1194249668036936, - 
"path": [ - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - }, - { - "metric_id": "gala_gopher_proc_wchar_bytes@67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "entity_id": "", - "metric_labels": { - "cmdline": "python ./backend.py ", - "comm": "python", - "container_id": "de570c7328bb", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-02-548c79d989-bnl9g_default_67134fb4", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pgid": "3459969", - "pod": "default/backend-node2-02-548c79d989-bnl9g", - "pod_id": "67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "pod_namespace": "default", - "ppid": "3459936", - "start_time": "1139543501", - "tgid": "3459969" - }, - "desc": "\u8fdb\u7a0b\u7cfb\u7edf\u8c03\u7528\u81f3FS\u7684\u5199\u5b57\u8282\u6570", - "keyword": "process", - "score": 0.37121879175399997, - "path": [ - { - "pod_id": "67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "pod": "default/backend-node2-02-548c79d989-bnl9g", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "normal" - }, - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - }, - { - "metric_id": 
"gala_gopher_l7_latency_avg@956c70a2-9918-459c-a0a8-39396251f952", - "entity_id": "", - "metric_labels": { - "client_ip": "192.168.11.103", - "comm": "python", - "container_id": "eef1ca1082a7", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-03-584f4c6cfd-w4d2b_default_956c70a2", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "l4_role": "tcp_server", - "l7_role": "server", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pod": "default/backend-node2-03-584f4c6cfd-w4d2b", - "pod_id": "956c70a2-9918-459c-a0a8-39396251f952", - "pod_namespace": "default", - "protocol": "http", - "server_ip": "192.168.11.113", - "server_port": "26", - "ssl": "no_ssl", - "tgid": "3460169" - }, - "desc": "L7 session averaged latency.", - "keyword": null, - "score": 0.5624857367147617, - "path": [ - { - "pod_id": "956c70a2-9918-459c-a0a8-39396251f952", - "pod": "default/backend-node2-03-584f4c6cfd-w4d2b", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - }, - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - } - ] - }, - "desc": "L7 session averaged latency.", - "top1": "gala_gopher_container_cpu_user_seconds_total@4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929\u5f02\u5e38", - "top2": "gala_gopher_proc_wchar_bytes@67134fb4-b2a3-43c5-a5b3-b3b463ad7d43\u5f02\u5e38", - "top3": "gala_gopher_l7_latency_avg@956c70a2-9918-459c-a0a8-39396251f952\u5f02\u5e38", - "keywords": [ - "process", - null - ], - "SeverityText": "WARN", - "SeverityNumber": 13, - "Body": "A cause inferring event for an abnormal event" -} -``` diff --git 
a/docs/en/server/maintenance/gala/using_gala_gopher.md b/docs/en/server/maintenance/gala/using_gala_gopher.md deleted file mode 100644 index 9ac9624058d07d5c88528059a876ea5a6cc2c1b4..0000000000000000000000000000000000000000 --- a/docs/en/server/maintenance/gala/using_gala_gopher.md +++ /dev/null @@ -1,1119 +0,0 @@ -# Using gala-gopher - -As a data collection module, gala-gopher provides OS-level monitoring capabilities, supports dynamic probe installation and uninstallation, and integrates third-party probes in a non-intrusive manner to quickly expand the monitoring scope. - -This chapter describes how to deploy and use the gala-gopher service. - -# Installation - -Mount the repositories. - -```basic -[oe-2209] # openEuler 23.09 officially released repository -name=oe2209 -baseurl=http://119.3.219.20:82/openEuler:/23.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2209:Epol] # openEuler 23.09: Epol officially released repository -name=oe2209_epol -baseurl=http://119.3.219.20:82/openEuler:/23.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -Install gala-gopher. - -```bash -# yum install gala-gopher -``` - -# Configuration - -## Configuration Description - -The configuration file of gala-gopher is **/opt/gala-gopher/gala-gopher.conf**. The configuration items in the file are described as follows (the parts that do not need to be manually configured are not described): - -The following configurations can be modified as required: - -- `global`: global configuration for gala-gopher. - - `log_file_name`: name of the gala-gopher log file. - - `log_level`: gala-gopher log level (currently not enabled). - - `pin_path`: path for storing the map shared by the eBPF probe (keep the default configuration). -- `metric`: configuration for metric data output. - - `out_channel`: output channel for metrics (`web_server`, `logs`, or `kafka`). If empty, the output channel is disabled. - - `kafka_topic`: topic configuration for Kafka output. 
-- `event`: configuration for abnormal event output. - - `out_channel`: output channel for events (`logs` or `kafka`). If empty, the output channel is disabled. - - `kafka_topic`: topic configuration for Kafka output. - - `timeout`: reporting interval for the same event. - - `desc_language`: language for event descriptions (`zh_CN` or `en_US`). -- `meta`: configuration for metadata output. - - `out_channel`: output channel for metadata (`logs` or `kafka`). If empty, the output channel is disabled. - - `kafka_topic`: topic configuration for Kafka output. -- `ingress`: probe data reporting configuration (currently unused). - - `interval`: unused. -- `egress`: database reporting configuration (currently unused). - - `interval`: unused. - - `time_range`: unused. -- `imdb`: cache configuration. - - `max_tables_num`: maximum number of cache tables. Each meta file in **/opt/gala-gopher/meta** corresponds to a table. - - `max_records_num`: maximum records per cache table. Each probe typically generates at least one record per observation period. - - `max_metrics_num`: maximum number of metrics per record. - - `record_timeout`: cache table aging time (seconds). Records not updated within this time are deleted. -- `web_server`: `web_server` output channel configuration. - - `port`: listening port. -- `rest_api_server`: - - `port`: listening port for the REST API. - - `ssl_auth`: enables HTTPS encryption and authentication for the REST API (`on` or `off`). Enable in production. - - `private_key`: absolute path to the server's private key file for HTTPS encryption (required if `ssl_auth` is `on`). - - `cert_file`: absolute path to the server's certificate file for HTTPS encryption (required if `ssl_auth` is `on`). - - `ca_file`: absolute path to the CA certificate for client authentication (required if `ssl_auth` is `on`). -- `kafka`: Kafka output channel configuration. - - `kafka_broker`: IP address and port of the Kafka server. 
- - `batch_num_messages`: number of messages per batch. - - `compression_codec`: message compression type. - - `queue_buffering_max_messages`: maximum number of messages in the producer buffer. - - `queue_buffering_max_kbytes`: maximum size (KB) of the producer buffer. - - `queue_buffering_max_ms`: maximum time (ms) the producer waits for more messages before sending a batch. -- `logs`: `logs` output channel configuration. - - `metric_dir`: path for metric data logs. - - `event_dir`: path for abnormal event logs. - - `meta_dir`: path for metadata logs. - - `debug_dir`: path for gala-gopher runtime logs. - -## Configuration File Example - -- Select the data output channels. - - ```yaml - metric = - { - out_channel = "web_server"; - kafka_topic = "gala_gopher"; - }; - - event = - { - out_channel = "kafka"; - kafka_topic = "gala_gopher_event"; - }; - - meta = - { - out_channel = "kafka"; - kafka_topic = "gala_gopher_metadata"; - }; - ``` - -- Configure Kafka and Web Server. - - ```yaml - web_server = - { - port = 8888; - }; - - kafka = - { - kafka_broker = ":9092"; - }; - ``` - -- Select the probe to be enabled. The following is an example. - - ```yaml - probes = - ( - { - name = "system_infos"; - param = "-t 5 -w /opt/gala-gopher/task_whitelist.conf -l warn -U 80"; - switch = "on"; - }, - ); - extend_probes = - ( - { - name = "tcp"; - command = "/opt/gala-gopher/extend_probes/tcpprobe"; - param = "-l warn -c 1 -P 7"; - switch = "on"; - } - ); - ``` - -# Start - -After the configuration is complete, start gala-gopher. - -```bash -# systemctl start gala-gopher.service -``` - -Query the status of the gala-gopher service. - -```bash -# systemctl status gala-gopher.service -``` - -If the following information is displayed, the service is started successfully: Check whether the enabled probe is started. If the probe thread does not exist, check the configuration file and gala-gopher run log file. 
- -![](./figures/gala-gopher-start-success.png) - -> Note: The root permission is required for deploying and running gala-gopher. - -# How to Use - -## Deployment of External Dependent Software - -![](./figures/gopher-arch.png) - -As shown in the preceding figure, the green parts are external dependent components of gala-gopher. gala-gopher outputs metric data to Prometheus, metadata and abnormal events to Kafka. gala-anteater and gala-spider in gray rectangles obtain data from Prometheus and Kafka. - -> Note: Obtain the installation packages of Kafka and Prometheus from the official websites. - -### REST Dynamic Configuration Interface - -The web server port is configurable (default is 9999). The URL format is `http://[gala-gopher-node-ip-address]:[port]/[function (collection feature)]`. For example, the URL for the flamegraph is `http://localhost:9999/flamegraph` (the following documentation uses the flamegraph as an example). - -#### Configuring the Probe Monitoring Scope - -Probes are disabled by default and can be dynamically enabled and configured via the API. Taking the flamegraph as an example, the REST API can be used to enable `oncpu`, `offcpu`, and `mem` flamegraph capabilities. The monitoring scope can be configured based on four dimensions: process ID, process name, container ID, and pod. 
- -Below is an example of an API that simultaneously enables the oncpu and offcpu collection features for the flamegraph: - -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "cmd": { - "bin": "/opt/gala-gopher/extend_probes/stackprobe", - "check_cmd": "", - "probe": [ - "oncpu", - "offcpu" - ] - }, - "snoopers": { - "proc_id": [ - 101, - 102 - ], - "proc_name": [ - { - "comm": "app1", - "cmdline": "", - "debugging_dir": "" - }, - { - "comm": "app2", - "cmdline": "", - "debugging_dir": "" - } - ], - "pod_id": [ - "pod1", - "pod2" - ], - "container_id": [ - "container1", - "container2" - ] - } -}' -``` - -A full description of the collection features is provided below: - -| Collection Feature | Description | Sub-item Scope | Monitoring Targets | Startup File | Startup Condition | -| ------------------ | -------------------------------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------------------- | ---------------------------------- | ------------------------- | -| flamegraph | Online performance flamegraph observation | oncpu, offcpu, mem | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/stackprobe | NA | -| l7 | Application layer 7 protocol observation | l7_bytes_metrics, l7_rpc_metrics, l7_rpc_trace | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/l7probe | NA | -| tcp | TCP exception and state observation | tcp_abnormal, tcp_rtt, tcp_windows, tcp_rate, tcp_srtt, tcp_sockbuf, tcp_stats, tcp_delay | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/tcpprobe | NA | -| socket | Socket (TCP/UDP) exception observation | tcp_socket, udp_socket | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/endpoint | NA | -| io | Block layer I/O observation | io_trace, io_err, io_count, page_cache | NA | $gala-gopher-dir/ioprobe | NA | -| proc | Process system calls, I/O, DNS, VFS observation | base_metrics, 
proc_syscall, proc_fs, proc_io, proc_dns, proc_pagecache | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/taskprobe | NA | -| jvm | JVM layer GC, threads, memory, cache observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/jvmprobe | NA | -| ksli | Redis performance SLI (access latency) observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/ksliprobe | NA | -| postgre_sli | PG DB performance SLI (access latency) observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/pgsliprobe | NA | -| opengauss_sli | openGauss access throughput observation | NA | \[ip, port, dbname, user, password] | $gala-gopher-dir/pg_stat_probe.py | NA | -| dnsmasq | DNS session observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/rabbitmq_probe.sh | NA | -| lvs | LVS session observation | NA | NA | $gala-gopher-dir/trace_lvs | lsmod\|grep ip_vs\| wc -l | -| nginx | Nginx L4/L7 layer session observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/nginx_probe | NA | -| haproxy | Haproxy L4/7 layer session observation | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/trace_haproxy | NA | -| kafka | Kafka producer/consumer topic observation | NA | dev, port | $gala-gopher-dir/kafkaprobe | NA | -| baseinfo | System basic information | cpu, mem, nic, disk, net, fs, proc, host | proc_id, proc_name, pod_id, container_id | system_infos | NA | -| virt | Virtualization management information | NA | NA | virtualized_infos | NA | -| tprofiling | Thread-level performance profiling observation | oncpu, syscall_file, syscall_net, syscall_lock, syscall_sched | proc_id, proc_name | | | - -### Configuring Probe Runtime Parameters - -Probes require additional parameter settings during runtime, such as configuring the sampling period and reporting period for flamegraphs. 
- -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "params": { - "report_period": 180, - "sample_period": 180, - "metrics_type": [ - "raw", - "telemetry" - ] - } -}' -``` - -Detailed runtime parameters are as follows: - -| Parameter | Description | Default & Range | Unit | Supported Monitoring Scope | Supported by gala-gopher | -| ------------------- | ---------------------------------------- | -------------------------------------------------------------- | ------- | ------------------------------------------- | ------------------------ | -| sample_period | Sampling period | 5000, \[100~10000] | ms | io, tcp | Y | -| report_period | Reporting period | 60, \[5~600] | s | ALL | Y | -| latency_thr | Latency reporting threshold | 0, \[10~100000] | ms | tcp, io, proc, ksli | Y | -| offline_thr | Process offline reporting threshold | 0, \[10~100000] | ms | proc | Y | -| drops_thr | Packet loss reporting threshold | 0, \[10~100000] | package | tcp, nic | Y | -| res_lower_thr | Resource percentage lower limit | 0%, \[0%~100%] | percent | ALL | Y | -| res_upper_thr | Resource percentage upper limit | 0%, \[0%~100%] | percent | ALL | Y | -| report_event | Report abnormal events | 0, \[0, 1] | NA | ALL | Y | -| metrics_type | Report telemetry metrics | raw, \[raw, telemetry] | NA | ALL | N | -| env | Working environment type | node, \[node, container, kubenet] | NA | ALL | N | -| report_source_port | Report source port | 0, \[0, 1] | NA | tcp | Y | -| l7_protocol | Layer 7 protocol scope | http, \[http, pgsql, mysql, redis, kafka, mongo, rocketmq, dns] | NA | l7 | Y | -| support_ssl | Support SSL encrypted protocol observation | 0, \[0, 1] | NA | l7 | Y | -| multi_instance | Output separate flamegraphs for each process | 0, \[0, 1] | NA | flamegraph | Y | -| native_stack | Display native language stack (for JAVA processes) | 0, \[0, 1] | NA | flamegraph | Y | -| cluster_ip_backend | Perform Cluster IP backend conversion | 0, \[0, 1] | NA | 
tcp, l7 | Y | -| pyroscope_server | Set flamegraph UI server address | localhost:4040 | NA | flamegraph | Y | -| svg_period | Flamegraph SVG file generation period | 180, \[30, 600] | s | flamegraph | Y | -| perf_sample_period | Period for collecting stack info in oncpu flamegraph | 10, \[10, 1000] | ms | flamegraph | Y | -| svg_dir | Directory for storing flamegraph SVG files | "/var/log/gala-gopher/stacktrace" | NA | flamegraph | Y | -| flame_dir | Directory for storing raw stack info in flamegraphs | "/var/log/gala-gopher/flamegraph" | NA | flamegraph | Y | -| dev_name | Observed network card/disk device name | "" | NA | io, kafka, ksli, postgre_sli, baseinfo, tcp | Y | -| continuous_sampling | Enable continuous sampling | 0, \[0, 1] | NA | ksli | Y | -| elf_path | Path to the executable file to observe | "" | NA | nginx, haproxy, dnsmasq | Y | -| kafka_port | Kafka port number to observe | 9092, \[1, 65535] | NA | kafka | Y | -| cadvisor_port | Port number for starting cadvisor | 8080, \[1, 65535] | NA | cadvisor | Y | - -### Starting and Stopping Probes - -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "state": "running" // optional: running, stopped -}' -``` - -### Constraints and Limitations - -1. The interface is stateless. The settings uploaded each time represent the final runtime configuration for the probe, including state, parameters, and monitoring scope. -2. Monitoring targets can be combined arbitrarily, and the monitoring scope is the union of all specified targets. -3. The startup file must be valid and accessible. -4. Collection features can be enabled partially or fully as needed, but disabling a feature requires disabling it entirely. -5. The monitoring target for opengauss is a DB instance (IP/Port/dbname/user/password). -6. The interface can receive a maximum of 2048 characters per request. 
- -#### Querying Probe Configurations and Status - -```sh -curl -X GET http://localhost:9999/flamegraph -{ - "cmd": { - "bin": "/opt/gala-gopher/extend_probes/stackprobe", - "check_cmd": "", - "probe": [ - "oncpu", - "offcpu" - ] - }, - "snoopers": { - "proc_id": [ - 101, - 102 - ], - "proc_name": [ - { - "comm": "app1", - "cmdline": "", - "debugging_dir": "" - }, - { - "comm": "app2", - "cmdline": "", - "debugging_dir": "" - } - ], - "pod_id": [ - "pod1", - "pod2" - ], - "container_id": [ - "container1", - "container2" - ] - }, - "params": { - "report_period": 180, - "sample_period": 180, - "metrics_type": [ - "raw", - "telemetry" - ] - }, - "state": "running" -} -``` - -## Introduction to stackprobe - -A performance flamegraph tool designed for cloud-native environments. - -### Features - -- Supports observation of applications written in C/C++, Go, Rust, and Java. -- Call stack supports container and process granularity: For processes within containers, the workload Pod name and container name are marked with `[Pod]` and `[Con]` prefixes at the bottom of the call stack. Process names are prefixed with `[]`, while threads and functions (methods) have no prefix. -- Supports generating SVG format flamegraphs locally or uploading call stack data to middleware. -- Supports generating/uploading flamegraphs for multiple instances based on process granularity. -- For Java processes, flamegraphs can simultaneously display native methods and Java methods. -- Supports multiple types of flamegraphs, including oncpu, offcpu, and mem. -- Supports custom sampling periods. - -### Usage Instructions - -Basic startup command example: Start the performance flamegraph with default parameters. - -```sh -curl -X PUT http://localhost:9999/flamegraph -d json='{ "cmd": {"probe": ["oncpu"] }, "snoopers": {"proc_name": [{ "comm": "cadvisor"}] }, "state": "running"}' -``` - -Advanced startup command example: Start the performance flamegraph with custom parameters. 
For a complete list of configurable parameters, refer to [Configuring Probe Runtime Parameters](#configuring-probe-runtime-parameters). - -```sh -curl -X PUT http://localhost:9999/flamegraph -d json='{ "cmd": { "check_cmd": "", "probe": ["oncpu", "offcpu", "mem"] }, "snoopers": { "proc_name": [{ "comm": "cadvisor", "cmdline": "", "debugging_dir": "" }, { "comm": "java", "cmdline": "", "debugging_dir": "" }] }, "params": { "perf_sample_period": 100, "svg_period": 300, "svg_dir": "/var/log/gala-gopher/stacktrace", "flame_dir": "/var/log/gala-gopher/flamegraph", "pyroscope_server": "localhost:4040", "multi_instance": 1, "native_stack": 0 }, "state": "running"}' -``` - -Key configuration options explained: - -- **Enabling flamegraph types**: - - Set via the `probe` parameter. Values include `oncpu`, `offcpu`, and `mem`, representing CPU usage time, blocked time, and memory allocation statistics, respectively. - - Example: - - `"probe": ["oncpu", "offcpu", "mem"]` - -- **Setting the period for generating local SVG flamegraphs**: - - Configured via the `svg_period` parameter, in seconds. Default is 180, with an optional range of \[30, 600]. - - Example: - - `"svg_period": 300` - -- **Enabling/disabling stack information upload to Pyroscope**: - - Set via the `pyroscope_server` parameter. The value must include the address and port. If empty or incorrectly formatted, the probe will not attempt to upload stack information. The upload period is 30 seconds. - - Example: - - `"pyroscope_server": "localhost:4040"` - -- **Setting the call stack sampling period**: - - Configured via the `perf_sample_period` parameter, in milliseconds. Default is 10, with an optional range of \[10, 1000]. This parameter only applies to oncpu flamegraphs. - - Example: - - `"perf_sample_period": 100` - -- **Enabling/disabling multi-instance flamegraph generation**: - - Set via the `multi_instance` parameter, with values 0 or 1. Default is 0. 
A value of 0 merges flamegraphs for all processes, while 1 generates separate flamegraphs for each process. - - Example: - - `"multi_instance": 1` - -- **Enabling/disabling native call stack collection**: - - Set via the `native_stack` parameter, with values 0 or 1. Default is 0. This parameter only applies to Java processes. A value of 0 disables collection of the JVM's native call stack, while 1 enables it. - - Example: - - `"native_stack": 1` - - Visualization: (Left: `"native_stack": 1`, Right: `"native_stack": 0`) - - ![image-20230804172905729](./figures/flame_muti_ins.png) - -### Implementation Plan - -#### 1. User-Space Program Logic - -The program periodically (every 30 seconds) converts kernel-reported stack information from addresses to symbols using the symbol table. It then uses the flamegraph plugin or pyroscope to generate a flame graph from the symbolized call stack. - -The approach to obtaining the symbol table differs based on the code segment type. - -- Kernel Symbol Table: Access **/proc/kallsyms**. - -- Native Language Symbol Table: Query the process virtual memory mapping file (**/proc/{pid}/maps**) to retrieve address mappings for each code segment in the process memory. The libelf library is then used to load the symbol table of the corresponding module for each segment. - -- Java Language Symbol Table: - - Since Java methods are not statically mapped to the process virtual address space, alternative methods are used to obtain the symbolized Java call stack. - -##### Method 1: Perf Observation - -A JVM agent dynamic library is loaded into the Java process to monitor JVM method compilation and loading events. This allows real-time recording of memory address-to-Java symbol mappings, generating the Java process symbol table. This method requires the Java process to be launched with the `-XX:+PreserveFramePointer` option. 
Its key advantage is that the flame graph can display the JVM call stack, and the resulting Java flame graph can be merged with those of other processes for unified visualization. - -##### Method 2: JFR Observation - -The JVM built-in profiler, Java Flight Recorder (JFR), is dynamically enabled to monitor various events and metrics of the Java application. This is accomplished by loading a Java agent into the Java process, which internally calls the JFR API. This method offers the advantage of more precise and comprehensive collection of Java method call stacks. - -Both Java performance analysis methods can be loaded in real time (without restarting the Java process) and feature low overhead. When stackprobe startup parameters are configured as `"multi_instance": 1` and `"native_stack": 0`, it uses Method 2 to generate the Java process flame graph; otherwise, it defaults to Method 1. - -#### 2. Kernel-Space Program Logic - -The kernel-space functionality is implemented using eBPF. Different flame graph types correspond to distinct eBPF programs. These programs periodically or through event triggers traverse the current user-space and kernel-space call stacks, reporting the results to user space. - -##### 2.1 On-CPU Flame Graph - -A sampling eBPF program is attached to perf software event `PERF_COUNT_SW_CPU_CLOCK` to periodically sample the call stack. - -##### 2.2 Off-CPU Flame Graph - -A sampling eBPF program is attached to process scheduling tracepoint `sched_switch`. This program records the time and process ID when a process is scheduled out and samples the call stack when the process is scheduled back in. - -##### 2.3 Memory Flame Graph - -A sampling eBPF program is attached to page fault tracepoint `page_fault_user`. The call stack is sampled whenever this event is triggered. - -#### 3. Java Language Support - -- stackprobe main process: - - 1. Receives an IPC message to identify the Java process to be observed. - 2. 
Utilizes the Java agent loading module to inject the JVM agent program into the target Java process: `jvm_agent.so` (for [Method 1](#method-1-perf-observation)) or `JstackProbeAgent.jar` (for [Method 2](#method-2-jfr-observation)). - 3. For Method 1, the main process loads the `java-symbols.bin` file of the corresponding Java process to facilitate address-to-symbol conversion. For Method 2, it loads the `stacks-{flame_type}.txt` file of the corresponding Java process, which can be directly used to generate flame graphs. - -- Java agent loading module: - - 1. Detects a new Java process and copies the JVM agent program to `/proc/<pid>/root/tmp` in the process space (to ensure visibility to the JVM inside the container during attachment). - 2. Adjusts the ownership of the directory and JVM agent program to match the observed Java process. - 3. Launches the `jvm_attach` subprocess and passes the relevant parameters of the observed Java process. - -- JVM agent program: - - - jvm_agent.so: Registers JVMTI callback functions. - - When the JVM loads a Java method or dynamically compiles a native method, it triggers the callback function. The callback records the Java class name, method name, and corresponding memory address in `/proc/<pid>/root/tmp/java-data-<pid>/java-symbols.bin` within the observed Java process space. - - JstackProbeAgent.jar: Invokes the JFR API. - - Activates JFR for 30 seconds and transforms the JFR statistics into a stack format suitable for flame graphs. The output is saved to `/proc/<pid>/root/tmp/java-data-<pid>/stacks-{flame_type}.txt` in the observed Java process space. For more information, refer to [JstackProbe Introduction](https://gitee.com/openeuler/gala-gopher/blob/dev/src/probes/extends/java.probe/jstack.probe/readme.md). - -- jvm_attach: Dynamically loads the JVM agent program into the JVM of the observed process (based on `sun.tools.attach.LinuxVirtualMachine` from the JDK source code and the `jattach` tool). - - 1. 
Configures its own namespace (the JVM requires the attaching process and the observed process to share the same namespace for agent loading). - 2. Verifies if the JVM attach listener is active (by checking for the existence of the UNIX socket file `/proc/<pid>/root/tmp/.java_pid<pid>`). - 3. If inactive, creates `/proc/<pid>/cwd/.attach_pid<pid>` and sends a SIGQUIT signal to the JVM. - 4. Establishes a connection to the UNIX socket. - 5. Interprets the response; a value of 0 indicates successful attachment. - - Attachment process diagram: - - ![Attachment process](./figures/attach-process.png) - -### Precautions - -- To achieve the best observation results for Java applications, configure the stackprobe startup options to `"multi_instance": 1` and `"native_stack": 0` to enable JFR observation (JDK8u262+). Otherwise, stackprobe will use perf to generate Java flame graphs. When using perf, ensure that the JVM option `-XX:+PreserveFramePointer` is enabled (JDK8 or later). - -### Constraints - -- Supports observation of Java applications based on the HotSpot JVM. - -## Introduction to tprofiling - -tprofiling, a thread-level application performance diagnostic tool provided by gala-gopher, leverages eBPF technology. It monitors key system performance events at the thread level, associating them with detailed event content. This enables real-time recording of thread states and key activities, helping users quickly pinpoint application performance bottlenecks. - -### Features - -From the OS perspective, a running application comprises multiple processes, each containing multiple running threads. tprofiling monitors and records key activities (referred to as **events**) performed by these threads. The tool then presents these events on a timeline in the front-end interface, providing an intuitive view of what each thread is doing at any given moment—whether it is executing on the CPU or blocked by file or network I/O operations. 
When performance issues arise, analyzing the sequence of key performance events for a given thread enables rapid problem isolation and localization. - -Currently, with its implemented event monitoring capabilities, tprofiling can identify application performance issues such as: - -- File I/O latency and blocking -- Network I/O latency and blocking -- Lock contention -- Deadlocks - -As more event types are added and refined, tprofiling will cover a broader range of application performance problems. - -### Event Observation Scope - -tprofiling currently supports two main categories of system performance events: syscall events and on-CPU events. - -**Syscall Events** - -Application performance often suffers from system resource bottlenecks like excessive CPU usage or I/O wait times. Applications typically access these resources through syscalls. Observing key syscall events helps identify time-consuming or blocking resource access operations. - -The syscall events currently observed by tprofiling are detailed in the [Supported Syscall Events](#supported-system-call-events) section. These events fall into categories such as file operations, network operations, lock operations, and scheduling operations. Examples of observed syscall events include: - -- File Operations - - `read`/`write`: Reading from or writing to disk files or network connections; these operations can be time-consuming or blocking. - - `sync`/`fsync`: Synchronizing file data to disk, which blocks the thread until completion. -- Network Operations - - `send`/`recv`: Reading from or writing to network connections; these operations can be time-consuming or blocking. -- Lock Operations - - `futex`: A syscall related to user-mode lock implementations. A `futex` call often indicates lock contention, potentially causing threads to block. -- Scheduling Operations: These syscall events can change a thread's state, such as yielding the CPU, sleeping, or waiting for other threads. 
- - `nanosleep`: The thread enters a sleep state. - - `epoll_wait`: The thread waits for I/O events, blocking until an event arrives. - -**on-CPU Events** - -A thread's running state can be categorized as either on-CPU (executing on a CPU core) or off-CPU (not executing). Observing on-CPU events helps identify threads performing time-consuming CPU-bound operations. - -### Event Content - -Thread profiling events include the following information: - -- Event Source: This includes the thread ID, thread name, process ID, process name, container ID, container name, host ID, and host name associated with the event. - - - `thread.pid`: The thread ID. - - `thread.comm`: The thread name. - - `thread.tgid`: The process ID. - - `proc.name`: The process name. - - `container.id`: The container ID. - - `container.name`: The container name. - - `host.id`: The host ID. - - `host.name`: The host name. - -- Event Attributes: These include common attributes and extended attributes. - - - Common Attributes: These include the event name, event type, start time, end time, and duration. - - - `event.name`: The event name. - - `event.type`: The event type, which can be `oncpu`, `file`, `net`, `lock`, or `sched`. - - `start_time`: The event start time, which is the start time of the first event in an aggregated event. See [Aggregated Events](#aggregated-events) for more information. - - `end_time`: The event end time, which is the end time of the last event in an aggregated event. - - `duration`: The event duration, calculated as (`end_time` - `start_time`). - - `count`: The number of aggregated events. - - - Extended Attributes: These provide more detailed information specific to each syscall event. For example, `read` and `write` events for files or network connections include the file path, network connection details, and function call stack. - - - `func.stack`: The function call stack. - - `file.path`: The file path for file-related events. 
- - `sock.conn`: The TCP connection information for network-related events. - - `futex.op`: The `futex` operation type, which can be `wait` or `wake`. - - Refer to the [Supported Syscall Events](#supported-system-call-events) section for details on the extended attributes supported by each event type. - -### Event Output - -As an eBPF probe extension provided by gala-gopher, tprofiling sends generated system events to gala-gopher for processing. gala-gopher then outputs these events in the openTelemetry format and publishes them as JSON messages to a Kafka queue. Front-end applications can consume these tprofiling events by subscribing to the Kafka topic. - -Here's an example of a thread profiling event output: - -```json -{ - "Timestamp": 1661088145000, - "SeverityText": "INFO", - "SeverityNumber": 9, - "Body": "", - "Resource": { - "host.id": "", - "host.name": "", - "thread.pid": 10, - "thread.tgid": 10, - "thread.comm": "java", - "proc.name": "xxx.jar", - "container.id": "", - "container.name": "", - }, - "Attributes": { - values: [ - { - // common info - "event.name": "read", - "event.type": "file", - "start_time": 1661088145000, - "end_time": 1661088146000, - "duration": 0.1, - "count": 1, - // extend info - "func.stack": "read;", - "file.path": "/test.txt" - }, - { - "event.name": "oncpu", - "event.type": "oncpu", - "start_time": 1661088146000, - "end_time": 1661088147000, - "duration": 0.1, - "count": 1, - } - ] - } -} -``` - -Key fields: - -- `Timestamp`: The timestamp when the event was reported. -- `Resource`: Information about the event source. -- `Attributes`: Event attribute information, containing a `values` list. Each item in the list represents a tprofiling event from the same source and includes the event's attributes. - -### Quick Start - -#### Installation - -tprofiling is an eBPF probe extension for gala-gopher, so you must first install gala-gopher before enabling tprofiling. 
- -[gala-ops](https://gitee.com/openeuler/gala-docs) provides a demo UI for tprofiling based on Kafka, Logstash, Elasticsearch, and Grafana. You can use the gala-ops deployment tools for quick setup. - -#### Architecture - -![](./figures/tprofiling-run-arch.png) - -Software components: - -- Kafka: An open-source message queue that receives and stores tprofiling events collected by gala-gopher. -- Logstash: A real-time, open-source log collection engine that consumes tprofiling events from Kafka, processes them (filtering, transformation, etc.), and sends them to Elasticsearch. -- Elasticsearch: An open, distributed search and analytics engine that stores the processed tprofiling events for querying and visualization in Grafana. -- Grafana: An open-source visualization tool to query and visualize the collected tprofiling events. Users interact with tprofiling through the Grafana UI to analyze application performance. - -#### Deploying the tprofiling Probe - -First, install gala-gopher as described in the [gala-gopher documentation](https://gitee.com/openeuler/gala-gopher#快速开始). Because tprofiling events are sent to Kafka, configure the Kafka service address during deployment. - -After installing and running gala-gopher, start the tprofiling probe using gala-gopher's HTTP-based dynamic configuration API: - -```sh -curl -X PUT http://<gala-gopher-node-ip>:9999/tprofiling -d json='{"cmd": {"probe": ["oncpu", "syscall_file", "syscall_net", "syscall_sched", "syscall_lock"]}, "snoopers": {"proc_name": [{"comm": "java"}]}, "state": "running"}' -``` - -Configuration parameters: - -- `<gala-gopher-node-ip>`: The IP address of the node where gala-gopher is deployed. -- `probe`: Under `cmd`, the `probe` configuration specifies the system events that the tprofiling probe monitors. `oncpu`, `syscall_file`, `syscall_net`, `syscall_sched`, and `syscall_lock` correspond to on-CPU events and file, network, scheduling, and lock syscall events, respectively. You can enable only the desired tprofiling event types. 
-- `proc_name`: Under `snoopers`, the `proc_name` configuration filters the processes to monitor by process name. You can also filter by process ID using the `proc_id` configuration. See [REST Dynamic Configuration Interface](#rest-dynamic-configuration-interface) for details. - -To stop the tprofiling probe, run: - -```sh -curl -X PUT http://<gala-gopher-node-ip>:9999/tprofiling -d json='{"state": "stopped"}' -``` - -#### Deploying the Front-End Software - -The tprofiling UI requires Kafka, Logstash, Elasticsearch, and Grafana. Install these components on a management node. You can use the gala-ops deployment tools for quick installation; see the [Online Deployment Documentation](https://gitee.com/openeuler/gala-docs#%E5%9C%A8%E7%BA%BF%E9%83%A8%E7%BD%B2). - -On the management node, obtain the deployment script from the [Online Deployment Documentation](https://gitee.com/openeuler/gala-docs#%E5%9C%A8%E7%BA%BF%E9%83%A8%E7%BD%B2) and run the following command to install Kafka, Logstash, and Elasticsearch with one command: - -```sh -sh deploy.sh middleware -K -E -A -p -``` - -Run the following command to install Grafana: - -```sh -sh deploy.sh grafana -P -E -``` - -#### Usage - -After completing the deployment, access A-Ops by browsing to `http://[deployment_node_management_IP_address]:3000` and logging into Grafana. The default username and password are both **admin**. - -After logging in, find the **ThreadProfiling** dashboard. - -![image-20230628155002410](./figures/tprofiling-dashboard.png) - -Click to enter the tprofiling UI and explore its features. - -![image-20230628155249009](./figures/tprofiling-dashboard-detail.png) - -### Use Cases - -#### Case 1: Deadlock Detection - -![image-20230628095802499](./figures/deadlock.png) - -The above diagram shows the thread profiling results of a deadlock demo process. The pie chart shows that `lock` events (in gray) consume a significant portion of the execution time. 
The lower section displays the thread profiling results for the entire process, with the vertical axis representing the sequence of profiling events for different threads. The `java` main thread remains blocked. The `LockThd1` and `LockThd2` service threads execute `oncpu` and `file` events, followed by simultaneous, long-duration `lock` events. Hovering over a `lock` event reveals that it triggers a `futex` syscall lasting 60 seconds. - -![image-20230628101056732](./figures/deadlock2.png) - -This suggests potential issues with `LockThd1` and `LockThd2`. We can examine their thread profiling results in the thread view. - -![image-20230628102138540](./figures/deadlock3.png) - -This view displays the profiling results for each thread, with the vertical axis showing the sequence of events. `LockThd1` and `LockThd2` normally execute `oncpu` events, including `file` and `lock` events, periodically. However, around 10:17:00, they both execute a long `futex` event without any intervening `oncpu` events, indicating a blocked state. `futex` is a syscall related to user-space lock implementation, and its invocation often signals lock contention and potential blocking. - -Based on this analysis, a deadlock likely exists between `LockThd1` and `LockThd2`. - -#### Case 2: Lock Contention Detection - -![image-20230628111119499](./figures/lockcompete1.png) - -The above diagram shows the thread profiling results for a lock contention demo process. The process primarily executes `lock`, `net`, and `oncpu` events, involving three service threads. Between 11:05:45 and 11:06:45, the event execution times for all three threads increase significantly, indicating a potential performance problem. We can examine each thread's profiling results in the thread view, focusing on this period. 
- -![image-20230628112709827](./figures/lockcompete2.png) - -By examining the event sequence for each thread, we can understand their activities: - -- Thread `CompeteThd1`: Periodically triggers short `oncpu` events, performing a calculation task. However, around 11:05:45, it begins triggering long `oncpu` events, indicating a time-consuming calculation. - - ![image-20230628113336435](./figures/lockcompete3.png) - -- Thread `CompeteThd2`: Periodically triggers short `net` events. Clicking on an event reveals that the thread is sending network messages via the `write` syscall, along with the TCP connection details. Similarly, around 11:05:45, it starts executing long `futex` events and becomes blocked, increasing the interval between `write` events. - - ![image-20230628113759887](./figures/lockcompete4.png) - - ![image-20230628114340386](./figures/lockcompete5.png) - -- Thread `tcp-server`: A TCP server that continuously reads client requests via the `read` syscall. Starting around 11:05:45, the `read` event execution time increases, indicating that it is waiting to receive network requests. - - ![image-20230628114659071](./figures/lockcompete6.png) - -Based on this analysis, whenever `CompeteThd1` performs a long `oncpu` operation, `CompeteThd2` calls `futex` and enters a blocked state. Once `CompeteThd1` completes the `oncpu` operation, `CompeteThd2` acquires the CPU and performs the network `write` operation. This strongly suggests lock contention between `CompeteThd1` and `CompeteThd2`. Because `CompeteThd2` is waiting for a lock and cannot send network requests, the `tcp-server` thread spends most of its time waiting for `read` requests. - -### Topics - -#### Supported System Call Events - -When selecting system call events for monitoring, consider these principles: - -1. Choose potentially time-consuming or blocking events, such as file, network, or lock operations, as they involve system resource access. -2. Choose events that affect a thread's running state. 
- -| Event/Syscall Name | Description | Default Type | Extended Content | -| ------------------ | ------------------------------------------------------------------------------ | ------------ | -------------------------------------- | -| `read` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `write` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `readv` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `writev` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `preadv` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `pwritev` | Reads/writes to drive files or the network; may be time-consuming or blocking. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `sync` | Synchronously flushes files to the drive; blocks the thread until completion. | `file` | `func.stack` | -| `fsync` | Synchronously flushes files to the drive; blocks the thread until completion. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `fdatasync` | Synchronously flushes files to the drive; blocks the thread until completion. | `file` | `file.path`, `sock.conn`, `func.stack` | -| `sched_yield` | Thread voluntarily relinquishes the CPU for rescheduling. | `sched` | `func.stack` | -| `nanosleep` | Thread enters a sleep state. | `sched` | `func.stack` | -| `clock_nanosleep` | Thread enters a sleep state. | `sched` | `func.stack` | -| `wait4` | Thread blocks. | `sched` | `func.stack` | -| `waitpid` | Thread blocks. | `sched` | `func.stack` | -| `select` | Thread blocks and waits for an event. 
| `sched` | `func.stack` | -| `pselect6` | Thread blocks and waits for an event. | `sched` | `func.stack` | -| `poll` | Thread blocks and waits for an event. | `sched` | `func.stack` | -| `ppoll` | Thread blocks and waits for an event. | `sched` | `func.stack` | -| `epoll_wait` | Thread blocks and waits for an event. | `sched` | `func.stack` | -| `sendto` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `recvfrom` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `sendmsg` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `recvmsg` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `sendmmsg` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `recvmmsg` | Reads/writes to the network; may be time-consuming or blocking. | `net` | `sock.conn`, `func.stack` | -| `futex` | Often indicates lock contention; the thread may block. | `lock` | `futex.op`, `func.stack` | - -#### Aggregated Events - -tprofiling currently supports two main categories of system performance events: system call events and `oncpu` events. In certain scenarios, `oncpu` events and some system call events (like `read` and `write`) can trigger frequently, generating a large volume of system events. This can negatively impact both the performance of the application being observed and the tprofiling probe itself. - -To improve performance, tprofiling aggregates multiple system events with the same name from the same thread within a one-second interval into a single reported event. Therefore, a tprofiling event is actually an aggregated event containing one or more identical system events. 
Some attribute meanings differ between aggregated events and real system events: - -- `start_time`: The start time of the first system event in the aggregation. -- `end_time`: Calculated as `start_time + duration`. -- `duration`: The sum of the actual execution times of all system events in the aggregation. -- `count`: The number of system events aggregated. When `count` is 1, the aggregated event is equivalent to a single system event. -- Extended event attributes: The extended attributes of the first system event in the aggregation. - -## Introduction to L7Probe - -Purpose: L7 traffic observation, covering common protocols like HTTP1.X, PG, MySQL, Redis, Kafka, HTTP2.0, MongoDB, and RocketMQ. Supports observation of encrypted streams. - -Scope: Node, container, and Kubernetes pod environments. - -### Code Framework Design - -```text -L7Probe - | --- included // Public header files - | --- connect.h // L7 connect object definition - | --- pod.h // pod/container object definition - | --- conn_tracker.h // L7 protocol tracking object definition - | --- protocol // L7 protocol parsing - | --- http // HTTP1.X L7 message structure definition and parsing - | --- mysql // mysql L7 message structure definition and parsing - | --- pgsql // pgsql L7 message structure definition and parsing - | --- bpf // Kernel bpf code - | --- L7.h // BPF program parses L7 protocol types - | --- kern_sock.bpf.c // Kernel socket layer observation - | --- libssl.bpf.c // OpenSSL layer observation - | --- gossl.bpf.c // Go SSL layer observation - | --- cgroup.bpf.c // Pod lifecycle observation - | --- pod_mng.c // pod/container instance management (detects pod/container lifecycle) - | --- conn_mng.c // L7 Connect instance management (handles BPF observation events, such as Open/Close events, Stats statistics) - | --- conn_tracker.c // L7 traffic tracking (tracks data from BPF observation, such as data generated by send/write, read/recv system events) - | --- bpf_mng.c // BPF program lifecycle 
management (dynamically opens, loads, attaches, and unloads BPF programs, including uprobe BPF programs) - | --- session_conn.c // Manages JSSE sessions (records the mapping between JSSE sessions and socket connections, and reports JSSE connection information) - | --- L7Probe.c // Main probe program -``` - -### Probe Output - -| Metric Name | Table Name | Metric Type | Unit | Metric Description | -| --------------- | ---------- | ----------- | ---- | ------------------------------------------------------------------------------------------------------------------------------ | -| tgid | N/A | Key | N/A | Process ID of the L7 session. | -| client_ip | N/A | Key | N/A | Client IP address of the L7 session. | -| server_ip | N/A | Key | N/A | Server IP address of the L7 session.
Note: In Kubernetes, Cluster IP addresses can be translated to Backend IP addresses. | -| server_port | N/A | Key | N/A | Server port of the L7 session.
Note: In Kubernetes, Cluster Ports can be translated to Backend Ports. | -| l4_role | N/A | Key | N/A | Role of the L4 protocol (TCP Client/Server or UDP). | -| l7_role | N/A | Key | N/A | Role of the L7 protocol (Client or Server). | -| protocol | N/A | Key | N/A | Name of the L7 protocol (HTTP/HTTP2/MySQL...). | -| ssl | N/A | Label | N/A | Indicates whether the L7 session uses SSL encryption. | -| bytes_sent | l7_link | Gauge | N/A | Number of bytes sent by the L7 session. | -| bytes_recv | l7_link | Gauge | N/A | Number of bytes received by the L7 session. | -| segs_sent | l7_link | Gauge | N/A | Number of segments sent by the L7 session. | -| segs_recv | l7_link | Gauge | N/A | Number of segments received by the L7 session. | -| throughput_req | l7_rpc | Gauge | QPS | Request throughput of the L7 session. | -| throughput_resp | l7_rpc | Gauge | QPS | Response throughput of the L7 session. | -| req_count | l7_rpc | Gauge | N/A | Request count of the L7 session. | -| resp_count | l7_rpc | Gauge | N/A | Response count of the L7 session. | -| latency_avg | l7_rpc | Gauge | ns | Average latency of the L7 session. | -| latency | l7_rpc | Histogram | ns | Latency histogram of the L7 session. | -| latency_sum | l7_rpc | Gauge | ns | Total latency of the L7 session. | -| err_ratio | l7_rpc | Gauge | % | Error rate of the L7 session. | -| err_count | l7_rpc | Gauge | N/A | Error count of the L7 session. | - -### Dynamic Control - -#### Controlling the Scope of Pod Observation - -1. REST request sent to gala-gopher. -2. gala-gopher forwards the request to L7Probe. -3. L7Probe identifies relevant containers based on the Pod information. -4. L7Probe retrieves the CGroup ID (`cpuacct_cgrp_id`) of each container and writes it to the object module (using the `cgrp_add` API). -5. During socket system event processing, the CGroup (`cpuacct_cgrp_id`) of the process is obtained, referencing the Linux kernel code (`task_cgroup`). -6. 
Filtering occurs during observation via the object module (using the `is_cgrp_exist` API). - -#### Controlling Observation Capabilities - -1. REST request sent to gala-gopher. -2. gala-gopher forwards the request to L7Probe. -3. L7Probe dynamically enables or disables BPF-based observation features (including throughput, latency, tracing, and protocol type detection) based on the request parameters. - -### Observation Points - -#### Kernel Socket System Calls - -TCP-related system calls: - -```c -// int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen); -// int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen); -// int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags); -// ssize_t write(int fd, const void *buf, size_t count); -// ssize_t send(int sockfd, const void *buf, size_t len, int flags); -// ssize_t read(int fd, void *buf, size_t count); -// ssize_t recv(int sockfd, void *buf, size_t len, int flags); -// ssize_t writev(int fd, const struct iovec *iov, int iovcnt); -// ssize_t readv(int fd, const struct iovec *iov, int iovcnt); -``` - -TCP and UDP-related system calls: - -```c -// ssize_t sendto(int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen); -// ssize_t recvfrom(int sockfd, void *buf, size_t len, int flags, struct sockaddr *src_addr, socklen_t *addrlen); -// ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags); -// ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags); -// int close(int fd); -``` - -Important notes: - -1. `read`/`write` and `readv`/`writev` can be confused with regular file I/O. The kernel function `security_socket_sendmsg` is observed to determine if a file descriptor (FD) refers to a socket operation. -2. `sendto`/`recvfrom` and `sendmsg`/`recvmsg` are used by both TCP and UDP. Refer to the manuals below. -3. `sendmmsg`/`recvmmsg` and `sendfile` are not currently supported. 
- -[sendto manual](https://man7.org/linux/man-pages/man2/send.2.html): If sendto() is used on a connection-mode (SOCK_STREAM, SOCK_SEQPACKET) socket, the arguments dest_addr and addrlen are ignored (and the error EISCONN may be returned when they are not NULL and 0), and the error ENOTCONN is returned when the socket was not actually connected. Otherwise, the address of the target is given by dest_addr with addrlen specifying its size. - -`sendto` determines that the protocol is TCP if the `dest_addr` parameter is NULL; otherwise, it is UDP. - -[recvfrom manual](https://linux.die.net/man/2/recvfrom): The recvfrom() and recvmsg() calls are used to receive messages from a socket, and may be used to receive data on a socket whether or not it is connection-oriented. - -`recvfrom` determines that the protocol is TCP if the `src_addr` parameter is NULL; otherwise, it is UDP. - -[sendmsg manual](https://man7.org/linux/man-pages/man3/sendmsg.3p.html): The sendmsg() function shall send a message through a connection-mode or connectionless-mode socket. If the socket is a connectionless-mode socket, the message shall be sent to the address specified by msghdr if no pre-specified peer address has been set. If a peer address has been pre-specified, either the message shall be sent to the address specified in msghdr (overriding the pre-specified peer address), or the function shall return -1 and set errno to \[EISCONN]. If the socket is connection-mode, the destination address in msghdr shall be ignored. - -`sendmsg` determines that the protocol is TCP if `msghdr->msg_name` is NULL; otherwise, it is UDP. - -[recvmsg manual](https://man7.org/linux/man-pages/man3/recvmsg.3p.html): The recvmsg() function shall receive a message from a connection-mode or connectionless-mode socket. It is normally used with connectionless-mode sockets because it permits the application to retrieve the source address of received data. 
- -`recvmsg` determines that the protocol is TCP if `msghdr->msg_name` is NULL; otherwise, it is UDP. - -#### libSSL API - -SSL_write - -SSL_read - -#### Go SSL API - -#### JSSE API - -sun/security/ssl/SSLSocketImpl$AppInputStream - -sun/security/ssl/SSLSocketImpl$AppOutputStream - -### JSSE Observation Scheme - -#### Loading the JSSEProbe - -The `l7_load_jsse_agent` function in `main` loads the JSSEProbe. - -It polls processes in the whitelist (`g_proc_obj_map_fd`). If a process is a Java process, it uses `jvm_attach` to load **JSSEProbeAgent.jar** into it. After loading, the Java process outputs observation information to **/tmp/java-data-\/jsse-metrics.txt** at specific points (see [JSSE API](#jsse-api)). - -#### Processing JSSEProbe Messages - -The `l7_jsse_msg_handler` thread handles JSSEProbe messages. - -It polls processes in the whitelist (`g_proc_obj_map_fd`). If a process has a `jsse-metrics` output file, it reads the file line by line, then parses, converts, and reports JSSE read/write information. - -##### 1. Parsing JSSE Read/Write Information - -The `jsse-metrics.txt` output format is: - -```text -|jsse_msg|662220|Session(1688648699909|TLS_AES_256_GCM_SHA384)|1688648699989|Write|127.0.0.1|58302|This is test message| -``` - -It parses the process ID, session ID, time, read/write operation, IP address, port, and payload. - -The parsed information is stored in `session_data_args_s`. - -##### 2. Converting JSSE Read/Write Information - -It converts the information in `session_data_args_s` into `sock_conn` and `conn_data`. - -This conversion queries two hash maps: - -`session_head`: Records the mapping between the JSSE session ID and the socket connection ID. If the process ID and 4-tuple information match, the session and socket connection are linked. - -`file_conn_head`: Records the last session ID of the Java process, in case L7Probe doesn't start reading from the beginning of a request and can't find the session ID. - -##### 3. 
Reporting JSSE Read/Write Information - -It reports `sock_conn` and `conn_data` to the map. - -## sliprobe Introduction - -`sliprobe` uses eBPF to collect and report container-level service-level indicator (SLI) metrics periodically. - -### Features - -- Collects the total latency and statistical histogram of CPU scheduling events per container. Monitored events include scheduling wait, active sleep, lock/IO blocking, scheduling delay, and long system calls. -- Collects the total latency and statistical histogram of memory allocation events per container. Monitored events include memory reclamation, swapping, and memory compaction. -- Collects the total latency and statistical histogram of BIO layer I/O operations per container. - -### Usage Instructions - -Example command to start `sliprobe`: Specifies a reporting period of 15 seconds and observes SLI metrics for containers `abcd12345678` and `abcd87654321`. - -```shell -curl -X PUT http://localhost:9999/sli -d json='{"params":{"report_period":15}, "snoopers":{"container_id":[{"container_id": "abcd12345678"},{"container_id": "abcd87654321"}]}, "state":"running"}' -``` - -### Code Logic - -#### Overview - -1. The user-space application receives a list of containers to monitor and stores the inode of each container's `cpuacct` subsystem directory in an eBPF map, sharing it with the kernel. -2. The kernel traces relevant kernel events using eBPF kprobes/tracepoints, determines if the event belongs to a monitored container, and records the event type and timestamp. It aggregates and reports SLI metrics for processes in the same cgroup at regular intervals. -3. The user-space application receives and prints the SLI metrics reported by the kernel. - -#### How SLI Metrics Are Calculated - -##### CPU SLI - -1. **cpu_wait** - - At the `sched_stat_wait` tracepoint, get the `delay` value (second parameter). - -2. **cpu_sleep** - - At the `sched_stat_sleep` tracepoint, get the `delay` value (second parameter). - -3. 
**cpu_iowait** - - At the `sched_stat_blocked` tracepoint, if the current process is `in_iowait`, get the `delay` value (second parameter). - -4. **cpu_block** - - At the `sched_stat_blocked` tracepoint, if the current process is not `in_iowait`, get the `delay` value (second parameter). - -5. **cpu_rundelay** - - At the `sched_switch` tracepoint, get the `run_delay` value of the next scheduled process (`next->sched_info.run_delay`) from the third parameter `next` and store it in `task_sched_map`. Calculate the difference in `run_delay` between two scheduling events of the same process. - -6. **cpu_longsys** - - At the `sched_switch` tracepoint, get the `task` structure of the next scheduled process from the third parameter `next`. Obtain the number of context switches (`nvcsw+nivcsw`) and user-space execution time (`utime`) from the `task` structure. If the number of context switches and user-space execution time remain the same between two scheduling events of the same process, the process is assumed to be executing a long system call. Accumulate the time the process spends in kernel mode. - -##### MEM SLI - -1. **mem_reclaim** - - Calculate the difference between the return and entry timestamps of the `mem_cgroup_handle_over_high` function. - - Calculate the difference between the timestamps of the `mm_vmscan_memcg_reclaim_end` and `mm_vmscan_memcg_reclaim_begin` tracepoints. - -2. **mem_swapin** - - Calculate the difference between the return and entry timestamps of the `do_swap_page` function. - -3. **mem_compact** - - Calculate the difference between the return and entry timestamps of the `try_to_compact_pages` function. - -##### IO SLI - -1. **bio_latency** - - Calculate the timestamp difference between entering the `bio_endio` function and triggering the `block_bio_queue` tracepoint. - - Calculate the timestamp difference between entering the `bio_endio` function and exiting the `generic_make_request_checks` function. 
- -## Output Data - -- **Metric** - - Prometheus Server has a built-in Express Browser UI. You can use PromQL statements to query metric data. For details, see [Using the expression browser](https://prometheus.io/docs/prometheus/latest/getting_started/#using-the-expression-browser) in the official document. The following is an example. - - If the specified metric is `gala_gopher_tcp_link_rcv_rtt`, the metric data displayed on the UI is as follows: - - ```basic - gala_gopher_tcp_link_rcv_rtt{client_ip="x.x.x.165",client_port="1234",hostname="openEuler",instance="x.x.x.172:8888",job="prometheus",machine_id="1fd3774xx",protocol="2",role="0",server_ip="x.x.x.172",server_port="3742",tgid="1516"} 1 - ``` - -- **Metadata** - - You can directly consume data from the Kafka topic `gala_gopher_metadata`. The following is an example. - - ```bash - # Input request - ./bin/kafka-console-consumer.sh --bootstrap-server x.x.x.165:9092 --topic gala_gopher_metadata - # Output data - {"timestamp": 1655888408000, "meta_name": "thread", "entity_name": "thread", "version": "1.0.0", "keys": ["machine_id", "pid"], "labels": ["hostname", "tgid", "comm", "major", "minor"], "metrics": ["fork_count", "task_io_wait_time_us", "task_io_count", "task_io_time_us", "task_hang_count"]} - ``` - -- **Abnormal events** - - You can directly consume data from the Kafka topic `gala_gopher_event`. The following is an example. 
- - ```bash - # Input request - ./bin/kafka-console-consumer.sh --bootstrap-server x.x.x.165:9092 --topic gala_gopher_event - # Output data - {"timestamp": 1655888408000, "meta_name": "thread", "entity_name": "thread", "version": "1.0.0", "keys": ["machine_id", "pid"], "labels": ["hostname", "tgid", "comm", "major", "minor"], "metrics": ["fork_count", "task_io_wait_time_us", "task_io_count", "task_io_time_us", "task_hang_count"]} - ``` diff --git a/docs/en/server/maintenance/gala/using_gala_spider.md b/docs/en/server/maintenance/gala/using_gala_spider.md deleted file mode 100644 index 8052bd18b3b010d5c1634967a81bcf3f5c9b3dee..0000000000000000000000000000000000000000 --- a/docs/en/server/maintenance/gala/using_gala_spider.md +++ /dev/null @@ -1,527 +0,0 @@ -# Using gala-spider - -This chapter describes how to deploy and use gala-spider and gala-inference. - -## gala-spider - -gala-spider provides the OS-level topology drawing function. It periodically obtains the data of all observed objects collected by gala-gopher (an OS-level data collection software) at a certain time point and calculates the topology relationship between them. The generated topology is saved to the graph database ArangoDB. - -### Installation - -Mount the Yum sources. - -```basic -[oe-2209] # openEuler 22.09 officially released repository -name=oe2209 -baseurl=http://119.3.219.20:82/openEuler:/22.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2209:Epol] # openEuler 22.09: Epol officially released repository -name=oe2209_epol -baseurl=http://119.3.219.20:82/openEuler:/22.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -Install gala-spider. - -```sh -# yum install gala-spider -``` - -### Configuration - -#### Configuration File Description - -The configuration file of gala-spider is **/etc/gala-spider/gala-spider.yaml**. The configuration items in this file are described as follows: - -- `global`: global configuration information. 
- - `data_source`: database for collecting observation metrics. Currently, only `prometheus` is supported. - - `data_agent`: agent for collecting observation metrics. Currently, only `gala_gopher` is supported. -- `spider`: spider configuration information. - - `log_conf`: log configuration information. - - `log_path`: log file path. - - `log_level`: level of the logs to be printed. The value can be `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL`. - - `max_size`: log file size, in MB. - - `backup_count`: number of backup log files. -- `storage`: configuration information about the topology storage service. - - `period`: storage period, in seconds, indicating the interval for storing the topology. - - `database`: graph database for storage. Currently, only `arangodb` is supported. - - `db_conf`: configuration information of the graph database. - - `url`: IP address of the graph database server. - - `db_name`: name of the database where the topology is stored. -- `kafka`: Kafka configuration information. - - `server`: Kafka server address. - - `metadata_topic`: topic name of the observed metadata messages. - - `metadata_group_id`: consumer group ID of the observed metadata messages. -- `prometheus`: Prometheus database configuration information. - - `base_url`: IP address of the Prometheus server. - - `instant_api`: API for collecting data at a single time point. - - `range_api`: API for collecting data in a time range. - - `step`: collection time step, which is configured for `range_api`. 
- -#### Configuration File Example - -```yaml -global: - data_source: "prometheus" - data_agent: "gala_gopher" - -prometheus: - base_url: "http://localhost:9090/" - instant_api: "/api/v1/query" - range_api: "/api/v1/query_range" - step: 1 - -spider: - log_conf: - log_path: "/var/log/gala-spider/spider.log" - # log level: DEBUG/INFO/WARNING/ERROR/CRITICAL - log_level: INFO - # unit: MB - max_size: 10 - backup_count: 10 - -storage: - # unit: second - period: 60 - database: arangodb - db_conf: - url: "http://localhost:8529" - db_name: "spider" - -kafka: - server: "localhost:9092" - metadata_topic: "gala_gopher_metadata" - metadata_group_id: "metadata-spider" -``` - -### Start - -- Run the following command to start gala-spider. - - ```sh - # spider-storage - ``` - -- Use the systemd service to start gala-spider. - - ```sh - # systemctl start gala-spider - ``` - -### How to Use - -#### Deployment of External Dependent Software - -The running of gala-spider depends on multiple external software for interaction. Therefore, before starting gala-spider, you need to deploy the software on which gala-spider depends. The following figure shows the software dependency of gala-spider. - -![gala-spider-arch](./figures/gala-spider-arch.png) - -The dotted box on the right indicates the two functional components of gala-spider. The green parts indicate the external components that gala-spider directly depends on, and the gray rectangles indicate the external components that gala-spider indirectly depends on. - -- **spider-storage**: core component of gala-spider, which provides the topology storage function. - 1. Obtains the metadata of the observation object from Kafka. - 2. Obtains information about all observation object instances from Prometheus. - 3. Saves the generated topology to the graph database ArangoDB. -- **gala-inference**: core component of gala-spider, which provides the root cause locating function. 
It subscribes to abnormal KPI events from Kafka to trigger the root cause locating process of abnormal KPIs, constructs a fault propagation graph based on the topology obtained from the ArangoDB, and outputs the root cause locating result to Kafka. -- **prometheus**: time series database. The observation metric data collected by the gala-gopher component is reported to Prometheus for further processing. -- **kafka**: messaging middleware, which is used to store the observation object metadata reported by gala-gopher, exception events reported by the exception detection component gala-anteater, and root cause locating results reported by the cause-inference component. -- **arangodb**: graph database, which is used to store the topology generated by spider-storage. -- **gala-gopher**: data collection component. It must be deployed in advance. -- **arangodb-ui**: UI provided by ArangoDB, which can be used to query topologies. - -The two functional components in gala-spider are released as independent software packages. - -**spider-storage**: corresponds to the gala-spider software package in this section. - -**gala-inference**: corresponds to the gala-inference software package. - -For details about how to deploy the gala-gopher software, see [Using gala-gopher](using_gala_gopher.md). This section only describes how to deploy ArangoDB. - -The current ArangoDB version is 3.8.7, which has the following requirements on the operating environment: - -- Only the x86 system is supported. -- GCC 10 or later - -For details about ArangoDB deployment, see [Deployment](https://www.arangodb.com/docs/3.9/deployment.html) in the ArangoDB official document. - -The RPM-based ArangoDB deployment process is as follows: - -1. Configure the Yum sources. 
- - ```basic - [oe-2209] # openEuler 22.09 officially released repository - name=oe2209 - baseurl=http://119.3.219.20:82/openEuler:/22.09/standard_x86_64 - enabled=1 - gpgcheck=0 - priority=1 - - [oe-2209:Epol] # openEuler 22.09: Epol officially released repository - name=oe2209_epol - baseurl=http://119.3.219.20:82/openEuler:/22.09:/Epol/standard_x86_64/ - enabled=1 - gpgcheck=0 - priority=1 - ``` - -2. Install arangodb3. - - ```sh - # yum install arangodb3 - ``` - -3. Modify the configurations. - - The configuration file of the arangodb3 server is **/etc/arangodb3/arangod.conf**. You need to modify the following configurations: - - - `endpoint`: IP address of the arangodb3 server. - - `authentication`: whether identity authentication is required for accessing the arangodb3 server. Currently, gala-spider does not support identity authentication. Therefore, set `authentication` to `false`. - - The following is an example. - - ```yaml - [server] - endpoint = tcp://0.0.0.0:8529 - authentication = false - ``` - -4. Start arangodb3. - - ```sh - # systemctl start arangodb3 - ``` - -#### Modifying gala-spider Configuration Items - -After the dependent software is started, you need to modify some configuration items in the gala-spider configuration file. The following is an example. - -Configure the Kafka server address. - -```yaml -kafka: - server: "localhost:9092" -``` - -Configure the Prometheus server address. - -```yaml -prometheus: - base_url: "http://localhost:9090/" -``` - -Configure the IP address of the ArangoDB server. - -```yaml -storage: - db_conf: - url: "http://localhost:8529" -``` - -#### Starting the Service - -Run `systemctl start gala-spider` to start the service. Run `systemctl status gala-spider` to check the startup status. 
If the following information is displayed, the startup is successful: - -```sh -[root@openEuler ~]# systemctl status gala-spider -● gala-spider.service - a-ops gala spider service - Loaded: loaded (/usr/lib/systemd/system/gala-spider.service; enabled; vendor preset: disabled) - Active: active (running) since Tue 2022-08-30 17:28:38 CST; 1 day 22h ago - Main PID: 2263793 (spider-storage) - Tasks: 3 (limit: 98900) - Memory: 44.2M - CGroup: /system.slice/gala-spider.service - └─2263793 /usr/bin/python3 /usr/bin/spider-storage -``` - -#### Output Example - -You can query the topology generated by gala-spider on the UI provided by ArangoDB. The procedure is as follows: - -1. Enter the IP address of the ArangoDB server in the address box of the browser, for example, **<http://localhost:8529>**. The ArangoDB UI is displayed. - -2. Click **DB** in the upper right corner of the page to switch to the spider database. - -3. On the **COLLECTIONS** page, you can view the collections of observation object instances and topology relationships stored in different time segments, as shown in the following figure. - - ![spider_topology](./figures/spider_topology.png) - -4. You can query the stored topology using the AQL statements provided by ArangoDB. For details, see the [AQL Documentation](https://www.arangodb.com/docs/3.8/aql/). - -## gala-inference - -gala-inference provides the capability of locating root causes of abnormal KPIs. It uses the exception detection result and topology as the input and outputs the root cause locating result to Kafka. The gala-inference component is archived in the gala-spider project. - -### Installation - -Mount the Yum sources. 
- -```basic -[oe-2209] # openEuler 22.09 officially released repository -name=oe2209 -baseurl=http://119.3.219.20:82/openEuler:/22.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2209:Epol] # openEuler 22.09: Epol officially released repository -name=oe2209_epol -baseurl=http://119.3.219.20:82/openEuler:/22.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -Install gala-inference. - -```sh -# yum install gala-inference -``` - -### Configuration - -#### Configuration File Description - -The configuration items in the gala-inference configuration file **/etc/gala-inference/gala-inference.yaml** are described as follows: - -- `inference`: configuration information about the root cause locating algorithm. - - `tolerated_bias`: tolerable time offset for querying the topology at the exception time point, in seconds. - - `topo_depth`: maximum depth for topology query. - - `root_topk`: top *K* root cause metrics generated in the root cause locating result. - - `infer_policy`: root cause derivation policy, which can be `dfs` or `rw`. - - `sample_duration`: sampling period of historical metric data, in seconds. - - `evt_valid_duration`: valid period of abnormal system metric events during root cause locating, in seconds. - - `evt_aging_duration`: aging period of abnormal metric events during root cause locating, in seconds. -- `kafka`: Kafka configuration information. - - `server`: IP address of the Kafka server. - - `metadata_topic`: configuration information about the observed metadata messages. - - `topic_id`: topic name of the observed metadata messages. - - `group_id`: consumer group ID of the observed metadata messages. - - `abnormal_kpi_topic`: configuration information about abnormal KPI event messages. - - `topic_id`: topic name of the abnormal KPI event messages. - - `group_id`: consumer group ID of the abnormal KPI event messages. - - `abnormal_metric_topic`: configuration information about abnormal metric event messages. 
- - `topic_id`: topic name of the abnormal metric event messages. - - `group_id`: consumer group ID of the abnormal system metric event messages. - - `consumer_to`: timeout interval for consuming abnormal system metric event messages, in seconds. - - `inference_topic`: configuration information about the output event messages of the root cause locating result. - - `topic_id`: topic name of the output event messages of the root cause locating result. -- `arangodb`: configuration information about the ArangoDB graph database, which is used to query sub-topologies required for root cause locating. - - `url`: IP address of the graph database server. - - `db_name`: name of the database where the topology is stored. -- `log_conf`: log configuration information. - - `log_path`: log file path. - - `log_level`: level of the logs to be printed. The value can be `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL`. - - `max_size`: log file size, in MB. - - `backup_count`: number of backup log files. -- `prometheus`: Prometheus database configuration information, which is used to obtain historical time series data of metrics. - - `base_url`: IP address of the Prometheus server. - - `range_api`: API for collecting data in a time range. - - `step`: collection time step, which is configured for `range_api`. - -#### Configuration File Example - -```yaml -inference: - # Tolerable time offset for querying the topology at the exception time point, in seconds. - tolerated_bias: 120 - topo_depth: 10 - root_topk: 3 - infer_policy: "dfs" - # Unit: second - sample_duration: 600 - # Valid period of abnormal metric events during root cause locating, in seconds. - evt_valid_duration: 120 - # Aging period of abnormal metric events, in seconds. 
- evt_aging_duration: 600 - -kafka: - server: "localhost:9092" - metadata_topic: - topic_id: "gala_gopher_metadata" - group_id: "metadata-inference" - abnormal_kpi_topic: - topic_id: "gala_anteater_hybrid_model" - group_id: "abn-kpi-inference" - abnormal_metric_topic: - topic_id: "gala_anteater_metric" - group_id: "abn-metric-inference" - consumer_to: 1 - inference_topic: - topic_id: "gala_cause_inference" - -arangodb: - url: "http://localhost:8529" - db_name: "spider" - -log: - log_path: "/var/log/gala-inference/inference.log" - # log level: DEBUG/INFO/WARNING/ERROR/CRITICAL - log_level: INFO - # unit: MB - max_size: 10 - backup_count: 10 - -prometheus: - base_url: "http://localhost:9090/" - range_api: "/api/v1/query_range" - step: 5 -``` - -### Start - -- Run the following command to start gala-inference. - - ```sh - # gala-inference - ``` - -- Use the systemd service to start gala-inference. - - ```sh - # systemctl start gala-inference - ``` - -### How to Use - -#### Dependent Software Deployment - -The running dependency of gala-inference is the same as that of gala-spider. For details, see [Deployment of External Dependent Software](#deployment-of-external-dependent-software). In addition, gala-inference indirectly depends on the running of [gala-spider](#gala-spider) and [gala-anteater](using_gala_anteater.md). Deploy gala-spider and gala-anteater in advance. - -#### Modify configuration items - -Modify some configuration items in the gala-inference configuration file. The following is an example. - -Configure the Kafka server address. - -```yaml -kafka: - server: "localhost:9092" -``` - -Configure the Prometheus server address. - -```yaml -prometheus: - base_url: "http://localhost:9090/" -``` - -Configure the IP address of the ArangoDB server. - -```yaml -arangodb: - url: "http://localhost:8529" -``` - -#### Starting the Service - -Run `systemctl start gala-inference` to start the service. Run `systemctl status gala-inference` to check the startup status. 
If the following information is displayed, the startup is successful: - -```sh -[root@openEuler ~]# systemctl status gala-inference -● gala-inference.service - a-ops gala inference service - Loaded: loaded (/usr/lib/systemd/system/gala-inference.service; enabled; vendor preset: disabled) - Active: active (running) since Tue 2022-08-30 17:55:33 CST; 1 day 22h ago - Main PID: 2445875 (gala-inference) - Tasks: 10 (limit: 98900) - Memory: 48.7M - CGroup: /system.slice/gala-inference.service - └─2445875 /usr/bin/python3 /usr/bin/gala-inference -``` - -#### Output Example - -When the exception detection module gala-anteater detects a KPI exception, it exports the corresponding abnormal KPI event to Kafka. The gala-inference keeps monitoring the message of the abnormal KPI event. If gala-inference receives the message of the abnormal KPI event, root cause locating is triggered. The root cause locating result is exported to Kafka. You can view the root cause locating result on the Kafka server. The basic procedure is as follows: - -1. If Kafka is installed using the source code, go to the Kafka installation directory. - - ```sh - cd /root/kafka_2.13-2.8.0 - ``` - -2. Run the command for consuming the topic to obtain the output of root cause locating. 
- - ```sh - ./bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic gala_cause_inference - ``` - - Output example: - - ```json - { - "Timestamp": 1661853360000, - "event_id": "1661853360000_1fd37742xxxx_sli_12154_19", - "Attributes": { - "event_id": "1661853360000_1fd37742xxxx_sli_12154_19" - }, - "Resource": { - "abnormal_kpi": { - "metric_id": "gala_gopher_sli_rtt_nsec", - "entity_id": "1fd37742xxxx_sli_12154_19", - "timestamp": 1661853360000, - "metric_labels": { - "machine_id": "1fd37742xxxx", - "tgid": "12154", - "conn_fd": "19" - } - }, - "cause_metrics": [ - { - "metric_id": "gala_gopher_proc_write_bytes", - "entity_id": "1fd37742xxxx_proc_12154", - "metric_labels": { - "__name__": "gala_gopher_proc_write_bytes", - "cmdline": "/opt/redis/redis-server x.x.x.172:3742", - "comm": "redis-server", - "container_id": "5a10635e2c43", - "hostname": "openEuler", - "instance": "x.x.x.172:8888", - "job": "prometheus", - "machine_id": "1fd37742xxxx", - "pgid": "12154", - "ppid": "12126", - "tgid": "12154" - }, - "timestamp": 1661853360000, - "path": [ - { - "metric_id": "gala_gopher_proc_write_bytes", - "entity_id": "1fd37742xxxx_proc_12154", - "metric_labels": { - "__name__": "gala_gopher_proc_write_bytes", - "cmdline": "/opt/redis/redis-server x.x.x.172:3742", - "comm": "redis-server", - "container_id": "5a10635e2c43", - "hostname": "openEuler", - "instance": "x.x.x.172:8888", - "job": "prometheus", - "machine_id": "1fd37742xxxx", - "pgid": "12154", - "ppid": "12126", - "tgid": "12154" - }, - "timestamp": 1661853360000 - }, - { - "metric_id": "gala_gopher_sli_rtt_nsec", - "entity_id": "1fd37742xxxx_sli_12154_19", - "metric_labels": { - "machine_id": "1fd37742xxxx", - "tgid": "12154", - "conn_fd": "19" - }, - "timestamp": 1661853360000 - } - ] - } - ] - }, - "SeverityText": "WARN", - "SeverityNumber": 13, - "Body": "A cause inferring event for an abnormal event" - } - ``` diff --git a/docs/en/server/security/secgear/_toc.yaml 
b/docs/en/server/security/secgear/_toc.yaml deleted file mode 100644 index 6acb98e2e468b7984dd26a859cb55ae8b25a0321..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/_toc.yaml +++ /dev/null @@ -1,16 +0,0 @@ -label: secGear Developer Guide -isManual: true -description: Build applications with secGear to safeguard data during cloud operations. -sections: - - label: Introduction to secGear - href: ./introduction_to_secgear.md - - label: secGear Installation - href: ./secgear_installation.md - - label: API Reference - href: ./api_reference.md - - label: Developer Guide - href: ./developer_guide.md - - label: secGear Tools - href: ./using_secgear_tools.md - - label: Application Scenarios - href: ./application_scenarios.md diff --git a/docs/en/server/security/secgear/api_reference.md b/docs/en/server/security/secgear/api_reference.md deleted file mode 100644 index 13f2eb6c26223ea3339aaac9d0854c22c2a31602..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/api_reference.md +++ /dev/null @@ -1,357 +0,0 @@ -# API Reference - -The secGear unified programming framework for confidential computing consists of the TEE and REE. This section describes the APIs required for developing applications. In addition to these APIs, the TEE inherits the open-source POSIC APIs of ARM TrustZone and Intel SGX. - -## cc_enclave_create - -Creates an enclave API. - -**Function:** - -Initialization API. The function calls different TEE creation functions based on the type to initialize the enclave context in different TEE solutions. This API is called by the REE. - -> [!NOTE]NOTE -> Due to Intel SGX restrictions, memory mapping contention exists when multiple thread invoke cc_enclave_create concurrently. As a result, the creation of the enclave API may fail. Avoid concurrent invocations of cc_enclave_create in your code. 
- -**Function Declaration:** - -```c -cc_enclave_result_t cc_enclave_create(const char* path, enclave_type_t type, uint32_t version,uint32_t flags,const enclave_features_t* features,uint32_t features_count, - cc_enclave_t ** enclave); -``` - -**Parameters:** - -- Path: input parameter, which specifies a path of the enclave to be loaded. -- Type: input parameter, which specifies the TEE solution, for example, SGX_ENCLAVE_TYPE, GP_ENCLAVE_TYPE and AUTO_ENCLAVE_TYPE. -- version: input parameter, which specifies the enclave engine version. Currently, there is only one version, and the value is 0. -- Flags: input parameter, which specifies the running status of the enclave. For example, SECGEAR_DEBUG_FLAG indicates the debugging status, and SECGEAR_SIMULATE_FLAG indicates the simulation status (not supported currently). -- features: input parameter, which specifies some features supported by the enclave, for example, PCL and switchless of the SGX. This parameter is not supported currently. Set it to NULL. -- features_count: input parameter, which specifies the number of features. This parameter is not supported currently. Set it to 0. -- enclave: output parameter, which specifies the created enclave context. - -**Return Values:** - -- CE_SUCCESS: The authentication information is verified successfully. -- CE_ERROR_INVALID_PARAMETER: The input parameter is incorrect. -- CE_ERROR_OUT_OF_MEMORY: No memory is available. -- CC_FAIL: Common failure. -- CC_ERROR_UNEXPECTED: Unexpected error. -- CC_ERROR_ENCLAVE_MAXIMUM: The number of enclaves created by a single app reaches the maximum. -- CC_ERROR_INVALID_PATH: The secure binary path is invalid. -- CC_ERROR_NO_FIND_REGFUNC: The enclave search fails. - -## cc_enclave_destroy - -Destroys the enclave API. - -**Function:** - -This API is called by the REE to call the exit functions of different TEEs to release the created enclave entities. 
- -**Function Declaration:** - -```c -cc_enclave_result_t cc_enclave_destroy (cc_enclave_t ** enclave); -``` - -**Parameter:** - -- enclave: input parameter, which specifies the context of the created enclave. - -**Return Values:** - -- CE_SUCCESS: The authentication information is verified successfully. -- CE_ERROR_INVALID_PARAMETER: The input parameter is incorrect. -- CE_ERROR_OUT_OF_MEMORY: No memory is available. -- CC_ERROR_NO_FIND_UNREGFUNC: The enclave search fails. -- CC_FAIL: common failure. -- CC_ERROR_UNEXPECTED: unexpected error. - -## cc_malloc_shared_memory - -Creates the shared memory. - -**Functions** - -After the switchless feature is enabled, this API is called by the REE to create the shared memory that can be accessed by both the TEE and REE. - -**Function Declaration:** - -```c -void *cc_malloc_shared_memory(cc_enclave_t *enclave, size_t size); -``` - -**Parameters:** - -- enclave: input parameter, which indicates the context handle of the secure environment. Different platforms have different shared memory models. To ensure cross-platform interface consistency, this parameter is used only on the ARM platform and is ignored on the SGX platform. -- size: input parameter, which indicates the size of the shared memory. - -**Return Values:** - -- NULL: Failed to apply for the shared memory. -- Other values: start address of the created shared memory. - -## cc_free_shared_memory - -Releases the shared memory. - -**Functions** - -This API is called by the REE to release the shared memory after the switchless feature is enabled. - -**Function Declaration:** - -```c -cc_enclave_result_t cc_free_shared_memory(cc_enclave_t *enclave, void *ptr); -``` - -**Parameters:** - -- enclave: input parameter, which indicates the context handle of the secure environment. Different platforms have different shared memory models. 
To ensure cross-platform interface consistency, this parameter is used only on the ARM platform (the value of this parameter must be the same as the value of enclave passed when cc_malloc_shared_memory is invoked). It is ignored on the SGX platform. -- ptr: input parameter, which indicates the shared memory address returned by cc_malloc_shared_memory. - -**Return Values:** - -- CC_ERROR_BAD_PARAMETERS: invalid input parameter. -- CC_ERROR_INVALID_HANDLE: The enclave is invalid or the input enclave does not match the enclave corresponding to the ptr. (It takes effect only on the ARM platform. The SGX platform ignores the enclave and therefore does not check the enclave.) -- CC_ERROR_NOT_IMPLEMENTED: The API is not implemented. -- CC_ERROR_SHARED_MEMORY_START_ADDR_INVALID: ptr is not the shared memory address returned by cc_malloc_shared_memory (valid only on the ARM platform). -- CC_ERROR_OUT_OF_MEMORY: insufficient memory (valid only on the ARM platform). -- CC_FAIL: common failure. -- CC_SUCCESS: success - -## cc_enclave_generate_random - -Generates random numbers. - -**Function:** - -Generate a secure random number for the password on the TEE. - -**Function Declaration:** - -```c -cc_enclave_result_t cc_enclave_generate_random(void *buffer, size_t size) -``` - -**Parameters:** - -- buffer: input parameter, which specifies the buffer for generating random numbers. -- size: input parameter, which specifies the buffer length. - -**Return Values:** - -- CE_OK: Authentication information is verified successfully. -- CE_ERROR_INVALID_PARAMETER: incorrect input parameter. -- CE_ERROR_OUT_OF_MEMORY: no memory is available. - -## cc_enclave_seal_data - -Ensures data persistence. - -**Function:** - -This API is called by the TEE to encrypt the internal data of the enclave so that the data can be persistently stored outside the enclave. 
- -**Function Declaration:** - -```c -cc_enclave_result_t cc_enclave_seal_data(uint8_t *seal_data, uint32_t seal_data_len, - - cc_enclave_sealed_data_t *sealed_data, uint32_t sealed_data_len, - - uint8_t *additional_text, uint32_t additional_text_len) -``` - -**Parameters:** - -- seal_data: input parameter, which specifies the data to be encrypted. -- seal_data_len: input parameter, which specifies the length of the data to be encrypted. -- sealed_data: output parameter, which specifies the encrypted data processing handle. -- sealed_data_len: output parameter, which specifies the length of the encrypted ciphertext. -- additional_text: input parameter, which specifies the additional message required for encryption. -- additional_text_len: input parameter, which specifies the additional message length. - -**Return Values:** - -- CE_SUCCESS: Data encryption succeeds. -- CE_ERROR_INVALID_PARAMETER: incorrect input parameter. -- CE_ERROR_OUT_OF_MEMORY: no memory is available. -- CC_ERROR_SHORT_BUFFER: The input buffer is too small. -- CC_ERROR_GENERIC: Common bottom-layer hardware error. - -## cc_enclave_unseal_data - -Decrypts data. - -**Function:** - -This API is called by the TEE to decrypt the data sealed by the enclave and import the external persistent data back to the enclave. - -**Function Declaration:** - -```c -cc_enclave_result_t cc_enclave_unseal_data(cc_enclave_sealed_data_t *sealed_data, - - uint8_t *decrypted_data, uint32_t *decrypted_data_len, - - uint8_t *additional_text, uint32_t *additional_text_len) -``` - -**Parameters:** - -- sealed_data: input parameter, which specifies the handle of the encrypted data. -- decrypted_data: output parameter, which specifies the buffer of the decrypted ciphertext data. -- decrypted_data_len: output parameter, which specifies the length of the decrypted ciphertext. -- additional_text: output parameter, which specifies an additional message after decryption. 
-- additional_text_len: output parameter, which specifies the length of the additional message after decryption. - -**Return Values:** - -- CE_SUCCESS: Data decryption is successful. -- CE_ERROR_INVALID_PARAMETER: incorrect input parameter. -- CE_ERROR_OUT_OF_MEMORY: no memory is available. -- CC_ERROR_SHORT_BUFFER: The input buffer is too small. -- CC_ERROR_GENERIC: common bottom-layer hardware error. - -## cc_enclave_get_sealed_data_size - -Obtains the size of the encrypted data. - -**Function:** - -Obtain the size of the sealed_data data. This API can be called by the TEE and REE to allocate the decrypted data space. - -**Function Declaration:** - -```c -uint32_t cc_enclave_get_sealed_data_size(const uint32_t add_len, const uint32_t seal_data_len); -``` - -**Parameters:** - -- add_len: input parameter, which specifies the additional message length. -- sealed_data_len: input parameter, which specifies the length of the encrypted information. - -**Return Values:** - -- UINT32_MAX: Parameter error or function execution error. -- others: The function is successfully executed, and the return value is the size of the sealed_data structure. - -## cc_enclave_get_encrypted_text_size - -Obtains the length of an encrypted message. - -**Function:** - -This API is called by the TEE to obtain the length of the encrypted message in the encrypted data. - -**Function Declaration:** - -```c -uint32_t cc_enclave_get_encrypted_text_size(const cc_enclave_sealed_data_t *sealed_data); -``` - -**Parameter:** - -- sealed_data: input parameter, which specifies the handle of the encrypted data - -**Return Values:** - -- UINT32_MAX: Parameter error or function execution error. -- others: The function is executed successfully, and the return value is the length of the encrypted message in sealed_data. - -## cc_enclave_get_add_text_size - -Obtains the length of an additional message. 
- -**Function:** - -This API is called by the TEE to obtain the length of the additional message in the encrypted data. - -**Function Declaration:** - -```c -uint32_t cc_enclave_get_add_text_size(const cc_enclave_sealed_data_t *sealed_data); -``` - -**Parameter:** - -- sealed_data: input parameter, handle of the encrypted data. - -**Return Values:** - -- UINT32_MAX: Parameter error or function execution error. -- others: The function is successfully executed, and the return value is the length of the additional message in sealed_data. - -## cc_enclave_memory_in_enclave - -Performs security memory check. - -**Function:** - -This API is called by the TEE to check whether the memory addresses of the specified length belong to the TEE. - -**Function Declaration:** - -```c -bool cc_enclave_memory_in_enclave(const void *addr, size_t size) -``` - -**Parameters:** - -- *addr: input parameter, which specifies the memory address to be verified. -- size: input parameter, which specifies the length to be verified starting from the memory address. - -**Return Values:** - -- true: The memory in the specified zone is in the secure zone. -- false: Some or all memory in the specified area is not within the secure range. - -## cc_enclave_memory_out_enclave - -Performs security memory check. - -**Function:** - -This API is called by the TEE to check whether the memory addresses of the specified length belong to the REE. - -**Function Declaration:** - -```c -bool cc_enclave_memory_out_enclave(const void *addr, size_t size) -``` - -**Parameters:** - -- *addr: input parameter, which specifies the memory address to be verified. -- size: input parameter, length to be verified starting from the memory address. - -**Return Values:** - -- true: The memory of the specified area is in the non-secure area. -- false: Some or all of the memory in the specified zone is in the secure area. - -## PrintInfo - -Prints messages. - -**Function:** - -Print TEE logs. 
This API outputs the information that the TEE user wants to print. The input logs are stored in the REE /var/log/secgear/secgear.log. - -**Function Declaration:** - -```c -void PrintInfo(int level, const char *fmt, ...); -``` - -**Parameters:** - -- level: log print level, which is an input parameter. The value can be PRINT_ERROR, PRINT_WARNING, PRINT_STRACE, and PRINT_DEBUG. -- fmt: Input parameter, and a character to be output. - -**Return Value:** - -- None diff --git a/docs/en/server/security/secgear/application_scenarios.md b/docs/en/server/security/secgear/application_scenarios.md deleted file mode 100644 index 13a9b7588f9b77c4a2c36b9a366ad54016bbf2dc..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/application_scenarios.md +++ /dev/null @@ -1,96 +0,0 @@ -# Application Scenarios - -This chapter describes confidential computing solutions in typical scenarios with examples, helping you understand the application scenarios of secGear and build confidential computing solutions based on your services. - -## TEE-based BJCA Cryptographic Module - -Driven by policies and services, the cryptographic application assurance infrastructure has been evolving towards virtualization. As services are migrated to the cloud, a brand-new cryptographic delivery mode needs to be built to integrate cryptographic services, cloud services, and service applications. Under such circumstance, Beijing Certificate Authority (BJCA) launches a TEE-based cryptographic module. BJCA can not only use the Kunpeng-based TEEs to build compliant cryptographic computing modules to support cryptographic cloud service platforms, but also build a confidential computing platform based on Kunpeng hosts to provide high-speed ubiquitous, elastically deployed, and flexibly scheduled cryptographic services for various scenarios such as cloud computing, privacy computing, and edge computing. 
The endogenous cryptographic module based on Kunpeng processors has become a revolutionary innovative solution in the cryptographic industry, and becomes a new starting point for endogenous trusted cryptographic computing. - -### Status Quo - -In conventional cryptographic modules, algorithm protocols and processed data are privacy data. Migrating cryptographic modules to the cloud has security risks. - -### Solution - -![](./figures/BJCA_Crypto_Module.png) - -The figure shows a TEE-based cryptographic module solution. secGear can divide the cryptographic module into two parts: management service and algorithm protocol. - -- Management service: runs on the REE to provide cryptographic services for the external world and forward requests to the TEE for processing. -- Algorithm protocol: runs on the TEE to encrypt and decrypt user data. - -Cryptographic services may have highly concurrent requests with large data volumes. The switchless feature of secGear reduces the context switches and data copies typically required for processing a large number of requests between the REE and TEE. - -## TEE-based Fully-Encrypted GaussDB - -Cloud databases have become an important growth point for database services in the future. Most traditional database service vendors are accelerating the provision of better cloud database services. However, cloud databases face more complex and diversified risks than traditional databases. Application vulnerabilities, system configuration errors, and malicious administrators may pose great risks to data security and privacy. - -### Status Quo - -The deployment network of cloud databases changes from a private environment to an open environment. The system O&M role is divided into service administrators and O&M administrators. Service administrators have service management permissions and belong to the enterprise service provider. O&M administrators belong to the cloud service provider. 
Although being defined to be responsible only for system O&M management, the database O&M administrator still has full permissions to use data. The database O&M administrator can access or even tamper with data with O&M management permissions or privilege escalation. In addition, due to the open environment and blurring of network boundaries, user data is more fully exposed to attackers in the entire service process, no matter in transfer, storage, O&M, or running. Therefore, in cloud database scenarios, how to solve the third-party trust problem and how to protect data security more reliably are facing greater challenges than traditional databases. Data security and privacy leakage are top concerns of cloud databases. - -### Solution - -To address the preceding challenges, the TEE-based fully-encrypted GaussDB (openGauss) is designed as follows: Users hold data encryption and decryption keys, data is stored in ciphertext in the entire life cycle of the database service, and query operations are completed in the TEE of the database service. - -![](./figures/secret_gaussdb.png) - -The figure shows the TEE-based fully-encrypted database solution. The fully-encrypted database has the following features: - -1. Data files are stored in ciphertext and plaintext key information is not stored. -2. The database data key is stored on the client. -3. When the client initiates a query request, the REE executes the encrypted SQL syntax on the server to obtain related ciphertext records and sends them to the TEE. -4. The client encrypts and transfers the database data key to the server TEE through the secure channel of secGear. The database data key is decrypted in the TEE and used to decrypt the ciphertext records into plaintext records. The SQL statement is executed to obtain the query result. Then the query result is encrypted using the database data key and sent back to the client. 
- -In step 3, when a large number of concurrent database requests are sent, frequent calls between the REE and TEE will be triggered and a large amount of data needs to be transferred. As a result, the performance deteriorates sharply. The switchless feature of secGear helps reduce context switches in calls and data copies, improving the performance. - -## TEE-based openLooKeng Federated SQL - -openLooKeng federated SQL is a type of cross-DC query. The typical scenario is as follows. There are three DCs: central DC A, edge DC B, and edge DC C. The openLooKeng cluster is deployed in the three DCs. When receiving a cross-domain query request, DC A delivers an execution plan to each DC. After the openLookeng clusters in edge DCs B and C complete computing, the result is transferred to the openLookeng cluster in DC A over the network to complete aggregation computing. - -### Status Quo - -In the preceding solution, the computing result is transferred between openLookeng clusters in different DCs, avoiding insufficient network bandwidth and solving the cross-domain query problem to some extent. However, the computing result is obtained from the original data and may contain sensitive information. As a result, security and compliance risks exist when data is transferred out of the domain. How do we protect the computing results of the edge DCs during aggregation computing and ensure that the computing results are available but invisible in the central DC? - -### Solution - -In DC A, the openLookeng cluster splits the aggregation computing logic and operators into independent modules and deploys them in the Kunpeng-based TEE. The computing results of the edge DCs are transferred to the TEE of DC A through the secure channel. All data is finally aggregated and computed in the TEE. In this way, the computing results of the edge DCs are protected from being obtained or tampered with by privileged or malicious programs in the REE of DC A during aggregation computing. 
- -![](./figures/openLooKeng.png) - -The figure shows the TEE-based federated SQL solution. The query process is as follows: - -1. A user delivers a cross-domain query request in DC A. The coordinator of openLooKeng splits and delivers the execution plan to its worker nodes and the coordinators of edge DCs based on the query SQL statement and data distribution. Then the coordinators of edge DCs deliver the execution plan to their worker nodes. -2. Each worker node executes the plan to obtain the local computing result. -3. Edge DCs encrypt their computing results through the secure channel of secGear, transfer the results to the REE of DC A over the Internet, forward the results to the TEE, and decrypt the results in the TEE. -4. DC A performs aggregation computing on the computing results of DCs A, B, and C in the TEE, obtains a final execution result, and returns the result to the user. - -In step 4, when there are a large number of query requests, the REE and TEE will be frequently invoked and a large amount of data is copied. As a result, the performance deteriorates. The switchless feature of secGear is optimized to reduce context switches and data copies to improve the performance. - -## TEE-based MindSpore Feature Protection - -Vertical federated learning (VFL) is an important branch of federated learning. When multiple parties have features about the same set of users, VFL can be used for collaborative training. - -![](./figures/Mindspore_original.png) - -### Status Quo - -The figure shows the data processing flow of the traditional solution. - -1. A party that has features is also called a follower, while a party that has labels is also called a leader. Each follower inputs its features to its bottom model to obtain the intermediate result, and then sends the intermediate result to the leader. -2. 
The leader uses its labels and the intermediate results of followers to train the top model, and then sends the computed gradient back to the followers to train their bottom models. - -This solution prevents followers from directly uploading their raw data out of the domain, thereby protecting data privacy. However, attackers may derive user information from the uploaded intermediate results, causing privacy leakage risks. Therefore, a stronger privacy protection solution is required for intermediate results and gradients to meet security compliance requirements. - -### Solution - -Based on the security risks and solutions in the previous three scenarios, confidential computing is a good choice to make intermediate results "available but invisible" out of the domain. - -![](./figures/Mindspore.png) - -The figure shows the TEE-based VFL feature protection solution. The data processing process is as follows: - -1. Followers encrypt their intermediate results through the secure channel of secGear and transfer the results to the leader. After receiving the results, the leader transfers them to the TEE and decrypts them through the secure channel in the TEE. -2. In the TEE, the intermediate results are input to the computing module at the federated split layer to compute the result. - -In this process, the plaintext intermediate results of followers exist only in the TEE memory, which is inaccessible to the leader, like a black box. diff --git a/docs/en/server/security/secgear/developer_guide.md b/docs/en/server/security/secgear/developer_guide.md deleted file mode 100644 index 74fea4d81c08f4616a7633256ff7292b09583630..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/developer_guide.md +++ /dev/null @@ -1,93 +0,0 @@ -# Developer Guide - -This chapter provides an example of using secGear to develop a C language program helloworld, helping you understand how to use secGear to develop applications. 
- -## Downloading Examples - -```shell -git clone https://gitee.com/openeuler/secGear.git -``` - -## Directory Structure - -```shell -cd examples/helloworld - -#Directory structure: -├── helloworld -│ ├── CMakeLists.txt -│ ├── enclave -│ │ ├── CMakeLists.txt -│ │ ├── Enclave.config.xml -│ │ ├── Enclave.lds -│ │ ├── hello.c -│ │ ├── manifest.txt -│ │ └── config_cloud.ini -│ ├── helloworld.edl -│ └── host -│ ├── CMakeLists.txt -│ └── main.c -``` - -The code body consists of three parts: - -- **main.c**: REE program -- **helloworld.edl**: header file of the APIs called by the REE and TEE -- **hello.c**: TEE program - -## Preparations - -In addition to the preceding three parts, there are compilation project file (**CMakeLists.txt**) and developer licenses (**Enclave.config.xml**/**Enclave.lds** of Intel SGX and **manifest.txt**/**config_cloud.ini** of Kunpeng). - -> [!NOTE]NOTE -> -> - The Kunpeng developer license needs to be [applied for from the Huawei service owner](https://www.hikunpeng.com/document/detail/en/kunpengcctrustzone/fg-tz/kunpengtrustzone_04_0009.html). -> - Because Intel SGX is debugged in debug mode, you do not need to apply for a developer license currently. If the remote attestation service of Intel is required for commercial use, you need to [apply for a license from Intel](https://www.intel.com/content/www/us/en/developer/tools/software-guard-extensions/request-license.html). - -After the application is successful, the developer license file is obtained and needs to be stored in the corresponding code directory. - -## Development Procedure - -Reconstructing a confidential computing application based on secGear is similar to independently extracting functional modules. The procedure is as follows: Identify sensitive data processing logic, extract it into an independent library, deploy it in the TEE, and define APIs provided by the REE in the EDL file. - -The following figure shows the development procedure. - -1. 
Develop the main function and APIs in the REE, manage the enclave, and call functions in the TEE. -2. Develop the EDL file (similar to the C language header file that defines the interaction APIs between the REE and TEE). -3. Develop TEE APIs. -4. Call the code generation tool codegener to automatically generate the interaction source code between the REE and TEE based on the EDL file and compile the source code to the binary files of the REE and TEE. The REE logic directly calls the corresponding API of the TEE without considering the automatically generated interaction code, reducing the development cost. -5. Call the signing tool to sign binary files in the TEE to implement trusted boot of the TEE program. - -![](./figures/develop_step.png) - -## Build and Run - -### Arm Environment - -```shell -// clone secGear repository -git clone https://gitee.com/openeuler/secGear.git - -// build secGear and examples -cd secGear -source environment -mkdir debug && cd debug && cmake -DENCLAVE=GP .. && make && sudo make install - -// run helloworld -/vendor/bin/secgear_helloworld -``` - -### x86 Environment - -```shell -// clone secGear repository -git clone https://gitee.com/openeuler/secGear.git - -// build secGear and examples -cd secGear -source /opt/intel/sgxsdk/environment && source environment -mkdir debug && cd debug && cmake .. 
&& make && sudo make install - -// run helloworld -./examples/helloworld/host/secgear_helloworld -``` diff --git a/docs/en/server/security/secgear/figures/BJCA_Crypto_Module.png b/docs/en/server/security/secgear/figures/BJCA_Crypto_Module.png deleted file mode 100644 index 3144e38d02872d2618ac6b8e4473504613b57261..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/BJCA_Crypto_Module.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/Mindspore.png b/docs/en/server/security/secgear/figures/Mindspore.png deleted file mode 100644 index 5a1fd1788d4ada4166c44444e14672cf8c14bd15..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/Mindspore.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/Mindspore_original.png b/docs/en/server/security/secgear/figures/Mindspore_original.png deleted file mode 100644 index 280c050df982d8806b5ed4293a3c0aec90042906..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/Mindspore_original.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/develop_step.png b/docs/en/server/security/secgear/figures/develop_step.png deleted file mode 100644 index a7b6e1842c61382cf9bad136eebab04eb6750c0e..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/develop_step.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/openLooKeng.png b/docs/en/server/security/secgear/figures/openLooKeng.png deleted file mode 100644 index 9f9e249830140fd65e26ebbc637d60adf8a20901..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/openLooKeng.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/secGear_arch.png b/docs/en/server/security/secgear/figures/secGear_arch.png deleted file mode 100644 index 
06620dfd30614ded62190354c1ec0b0b2c497c01..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/secGear_arch.png and /dev/null differ diff --git a/docs/en/server/security/secgear/figures/secret_gaussdb.png b/docs/en/server/security/secgear/figures/secret_gaussdb.png deleted file mode 100644 index 3556c3727f6f723f620e5c451e5fec563dfde1aa..0000000000000000000000000000000000000000 Binary files a/docs/en/server/security/secgear/figures/secret_gaussdb.png and /dev/null differ diff --git a/docs/en/server/security/secgear/introduction_to_secgear.md b/docs/en/server/security/secgear/introduction_to_secgear.md deleted file mode 100644 index 71d44581881f4160d7689c6379526fdac2769b84..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/introduction_to_secgear.md +++ /dev/null @@ -1,165 +0,0 @@ -# Introduction to secGear - -## Overview - -With the rapid development of cloud computing, more and more enterprises deploy computing services on the cloud. The security of user data on the third-party cloud infrastructure is facing great challenges. Confidential computing is a technology that uses hardware-based trusted execution environments (TEEs) to protect confidentiality and integrity of data in use. It relies on the bottom-layer hardware to build the minimum trust dependency, which removes the OS, hypervisor, infrastructure, system administrator, and service provider from the trusted entity list as unauthorized entities to reduce potential risks. There are various confidential computing technologies (such as Intel SGX, Arm TrustZone, and RISC-V Keystone) and software development kits (SDKs) in the industry and the application ecosystem of different TEEs are isolated from each other, which brings high development and maintenance costs to confidential computing application developers. 
To help developers quickly build confidential computing solutions that protect data security on the cloud, openEuler launches the unified confidential computing programming framework secGear. - -## Architecture - -![](./figures/secGear_arch.png) - -The architecture of secGear consists of three layers that form the foundation of openEuler confidential computing software. - -- Base layer: The unified layer of the confidential computing SDK provides APIs for different TEEs, enabling different architectures to share the same set of source code. -- Middleware layer: The general component layer provides confidential computing software for users to quickly build confidential computing solutions. -- Server layer: The confidential computing service layer runs dedicated solutions for typical situations. - -## Key Features - -### Switchless - -#### Pain Points - -After a conventional application is reconstructed using confidential computing, the rich execution environment (REE) logic frequently invokes the TEE logic or the REE frequently exchanges large data blocks with the TEE. Each call between the REE and TEE requires context switching among the REE user mode, REE kernel mode, driver, TEE kernel mode, and TEE user mode. When large blocks of data are exchanged during the call, multiple memory copies are generated. In addition, the interaction performance between the REE and TEE deteriorates due to factors such as the size limit of underlying data blocks, which severely affects the implementation of confidential computing applications. - -#### Solution - -Switchless is a technology that uses shared memory to reduce the number of context switches and data copies between the REE and TEE to optimize the interaction performance. - -#### How to Use - -1. Enable switchless when creating an enclave. 
- - The configuration items of switchless are described as follows: - - ```c - typedef struct { - uint32_t num_uworkers; - uint32_t num_tworkers; - uint32_t switchless_calls_pool_size; - uint32_t retries_before_fallback; - uint32_t retries_before_sleep; - uint32_t parameter_num; - uint32_t workers_policy; - uint32_t rollback_to_common; - cpu_set_t num_cores; - } cc_sl_config_t; - ``` - - | Configuration Item | Description | - | -------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | num_uworkers | Number of proxy worker threads in the REE, which are used to make switchless out calls (OCALLs). Currently, this field takes effect only on the SGX platform and can be configured on the Arm platform. However, because the Arm platform does not support OCALLs, the configuration does not take effect on the Arm platform.
Specifications:
Arm: maximum value: **512**; minimum value: **1**; default value: **8** (used when this field is set to **0**).
SGX: maximum value: **4294967295**; minimum value: **1**. | - | num_tworkers | Number of proxy worker threads in the TEE, which are used to make switchless enclave calls (ECALLs).
Specifications:
Arm: maximum value: **512**; minimum value: **1**; default value: **8** (used when this field is set to **0**).
SGX: maximum value: **4294967295**; minimum value: **1**. | - | switchless_calls_pool_size | Size of the switchless call pool. The pool can contain **switchless_calls_pool_size** x 64 switchless calls. For example, if **switchless_calls_pool_size=1**, 64 switchless calls are contained in the pool.
Specifications:
Arm: maximum value: **8**; minimum value: **1**; default value: **1** (used when this field is set to **0**).
SGX: maximum value: **8**; minimum value: **1**; default value: **1** (used when **switchless_calls_pool_size** is set to **0**). | - | retries_before_fallback | After the **pause** assembly instruction is executed for **retries_before_fallback** times, if the switchless call is not made by the proxy worker thread on the other side, the system rolls back to the switch call mode. This field takes effect only on the SGX platform.
Specifications:
SGX: maximum value: **4294967295**; minimum value: **1**; default value: **20000** (used when this field is set to **0**). | - | retries_before_sleep | After the **pause** assembly instruction is executed for **retries_before_sleep** times, if the proxy worker thread does not receive any task, the proxy worker thread enters the sleep state. This field takes effect only on the SGX platform.
Specifications:
SGX: maximum value: **4294967295**; minimum value: **1**; default value: **20000** (used when this field is set to **0**). | - | parameter_num | Maximum number of parameters supported by a switchless function. This field takes effect only on the Arm platform.
Specifications:
Arm: maximum value: **16**; minimum value: **0**. | - | workers_policy | Running mode of the switchless proxy thread. This field takes effect only on the Arm platform.
Specifications:
Arm:
**WORKERS_POLICY_BUSY**: The proxy thread always occupies CPU resources regardless of whether there are tasks to be processed. This mode applies to scenarios that require high performance and extensive system software and hardware resources.
**WORKERS_POLICY_WAKEUP**: The proxy thread wakes up only when there is a task. After the task is processed, the proxy thread enters the sleep state and waits to be woken up by a new task. | - | rollback_to_common | Whether to roll back to a common call when an asynchronous switchless call fails. This field takes effect only on the Arm platform.
Specifications:
Arm:
**0**: No. If the operation fails, only the error code is returned.
Other values: Yes. If the operation fails, an asynchronous switchless call is rolled back to a common call and the return value of the common call is returned. | - | num_cores | Number of cores for TEE core binding
Specifications: The maximum value is the number of cores in the environment. | - -1. Add the **transition_using_threads** flag when defining the API in the enclave description language (EDL) file. - - ```ocaml - enclave { - include "secgear_urts.h" - from "secgear_tstdc.edl" import *; - from "secgear_tswitchless.edl" import *; - trusted { - public int get_string([out, size=32]char *buf); - public int get_string_switchless([out, size=32]char *buf) transition_using_threads; - }; - }; - ``` - -### Secure Channel - -#### Pain Points - -When requesting the confidential computing service on the cloud, the data owner needs to upload the data to be processed to the TEE on the cloud for processing. Because the TEE is not connected to the network, the data needs to be transferred to the REE over the network in plaintext and then transferred to the TEE from the REE. The data plaintext is exposed in the REE memory, which poses security risks. - -#### Solution - -A secure channel is a technology that combines confidential computing remote attestation to implement secure key negotiation between the data owner and the TEE on the cloud. It negotiates a sessionkey owned only by the data owner and the TEE on the cloud. Then the sessionkey is used to encrypt user data transferred over the network. After receiving the ciphertext data, the REE transfers the data to the TEE for decryption and processing. - -#### How to Use - -The secure channel is provided as a library and consists of the client, host, and enclave, which are icalled by the client, server client application (CA), and server trusted application (TA) of the service program respectively. 
- -| Module | Header File | Library File | Dependency | -|------------|--------------------------|-----------------------|---------| -| Client | secure_channel_client.h | libcsecure_channel.so | OpenSSL| -| Host | secure_channel_host.h | libusecure_channel.so | OpenSSL| -| Enclave| secure_channel_enclave.h | libtsecure_channel.so | TEE and TEE software stack | - -##### APIs - -| API | Header File and Library | Function | Remarks| -|----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------|--------------|----| -| cc_sec_chl_client_init | secure_channel_client.h libcsecure_channel.so | Initializes the secure channel on the client. | Before calling this API, initialize the network connection and message sending hook function in the **ctx** parameter. | -| cc_sec_chl_client_fini | secure_channel_client.h libcsecure_channel.so | Destroys the secure channel on the client. | Instructs the server to destroy the local client information and local secure channel information. | -| cc_sec_chl_client_callback | secure_channel_client.h libcsecure_channel.so | Function for processing secure channel negotiation messages.| Processes messages sent from the server to the client during secure channel negotiation. This API is called when messages are received on the client. | -| cc_sec_chl_client_encrypt | secure_channel_client.h libcsecure_channel.so | Encryption API of the secure channel on the client. | None | -| cc_sec_chl_client_decrypt | secure_channel_client.h libcsecure_channel.so | Decryption API of the secure channel on the client. | None | -| int (*cc_conn_opt_funcptr_t)(void*conn, void *buf, size_t count); | secure_channel.h | Prototype of the message sending hook function. | Implemented by the client and server to specify the secure channel negotiation message type. It sends secure channel negotiation messages to the peer end. 
| -| cc_sec_chl_svr_init | secure_channel_host.h libusecure_channel.so | Initializes the secure channel on the server. | Before calling this API, initialize **enclave_ctx** in **ctx**. | -| cc_sec_chl_svr_fini | secure_channel_host.h libusecure_channel.so | Destroys the secure channel on the server. | Destroys information about the secure channel on the server and all clients. | -| cc_sec_chl_svr_callback | secure_channel_host.h libusecure_channel.so | Function for processing secure channel negotiation messages. | Processes messages sent from the client to the server during security channel negotiation. This API is called when messages are received on the server. Before calling this API, you need to initialize the network connection to the client and the message sending hook function. For details, see [examples](https://gitee.com/openeuler/secGear/blob/master/examples/secure_channel/host/server.c#:~:text=conn_ctx.conn_kit.send). | -| cc_sec_chl_enclave_encrypt | secure_channel_enclave.h libtsecure_channel.so | Encryption API of the secure channel on the enclave. | None | -| cc_sec_chl_enclave_decrypt | secure_channel_enclave.h libtsecure_channel.so | Decryption API of the secure channel on the enclave. | None| - -##### Precautions - -A secure channel encapsulates only the key negotiation process and encryption and decryption APIs, but does not establish any network connection. The negotiation process reuses the network connection of the service. The network connection between the client and server is established and maintained by the service. The message sending hook function and network connection pointer are transferred during the initialization of the secure channel on the client and the server. -For details, see [secure channel examples](https://gitee.com/openeuler/secGear/tree/master/examples/secure_channel). 
- -### Remote Attestation - -#### Challenges - -As confidential computing technologies advance, several major platforms have emerged, including Arm Trustzone/CCA, Intel SGX/TDX, QingTian Enclave, and Hygon CSV. Solutions often involve multiple confidential computing hardware platforms, sometimes requiring collaboration between different TEEs. Remote attestation is a crucial part of the trust chain in any confidential computing technology. However, each technology has its own attestation report format and verification process. This forces users to integrate separate verification workflows for each TEE, increasing complexity and hindering the adoption of new TEE types. - -#### Solution - -The unified remote attestation framework of secGear addresses the key components related to remote attestation in confidential computing, abstracting away the differences between different TEEs. It provides two components: attestation agent and attestation service. The agent is integrated by users to obtain attestation reports and connect to the attestation service. The service can be deployed independently and supports the verification of iTrustee and virtCCA remote attestation reports. - -#### Feature Description - -The unified remote attestation framework focuses on confidential computing functionalities, while service deployment and operation capabilities are provided by third-party deployment services. The key features of the unified remote attestation framework are as follows: - -- Report verification plugin framework: Supports runtime compatibility with attestation report verification for different TEE platforms, such as iTrustee, virtCCA, and CCA. It also supports the extension of new TEE report verification plugins. -- Certificate baseline management: Supports the management of baseline values of Trusted Computing Bases (TCB) and Trusted Applications (TA) as well as public key certificates for different TEE types. 
Centralized deployment on the server ensures transparency for users. -- Policy management: Provides default policies for ease of use and customizable policies for flexibility. -- Identity token: Issues identity tokens for different TEEs, endorsed by a third party for mutual authentication between different TEE types. -- Attestation agent: Supports connection to attestation service/peer-to-peer attestation, compatible with TEE report retrieval and identity token verification. It is easy to integrate, allowing users to focus on their service logic. - -Two modes are supported depending on the usage scenario: peer-to-peer verification and attestation service verification. - -Attestation service verification process: - -1. The user (regular node or TEE) initiates a challenge to the TEE platform. -2. The TEE platform obtains the TEE attestation report through the attestation agent and returns it to the user. -3. The user-side attestation agent forwards the report to the remote attestation service. -4. The remote attestation service verifies the report and returns an identity token in a unified format endorsed by a third party. -5. The attestation agent verifies the identity token and parses the attestation report verification result. -6. Upon successful verification, a secure connection is established. - -Peer-to-peer verification process (without the attestation service): - -1. The user initiates a challenge to the TEE platform, which then returns the attestation report to the user. -2. The user uses a local peer-to-peer TEE verification plugin to verify the report. - -> [!NOTE]NOTE -> The attestation agent varies depending on whether peer-to-peer verification or remote attestation service verification is used. Users can select the desired mode during compilation by specifying the appropriate option, enabling the attestation agent to support either the attestation service or peer-to-peer mode. 
- -#### Application Scenarios - -In scenarios like finance and AI, where confidential computing is used to protect the security of privacy data during runtime, remote attestation is a technical means to verify the legitimacy of the confidential computing environment and applications. secGear provides components that are easy to integrate and deploy, helping users quickly enable confidential computing remote attestation capabilities. - -## Acronyms and Abbreviations - -| Acronym/Abbreviation | Full Name | -| -------------------- | ----------------------------- | -| REE | rich execution environment | -| TEE | trusted execution environment | -| EDL | enclave description language | diff --git a/docs/en/server/security/secgear/secgear_installation.md b/docs/en/server/security/secgear/secgear_installation.md deleted file mode 100644 index b6435b7f96843ac1b98d8721eef12db1a3444ec7..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/secgear_installation.md +++ /dev/null @@ -1,109 +0,0 @@ -# secGear Installation - -## Arm Environment - -### Environment Requirements - -#### Hardware - -| Item | Version | -| ------ | --------------------------------------------------- | -| Server| TaiShan 200 server (model 2280) | -| Mainboard | Kunpeng board | -| BMC | 1711 board (model BC82SMMAB); firmware version: 3.01.12.49 or later| -| CPU | Kunpeng 920 processor (model 7260, 5250, or 5220) | -| Chassis | No special requirements; an 8- or 12-drive chassis recommended | - -> [!NOTE]NOTE -> Ensure that the TrustZone feature kit has been preconfigured on the server. That is, the TEE OS, TEE OS boot key, BMC, BIOS, and license have been preconfigured on the server. -> For common servers, the TrustZone feature cannot be enabled only by upgrading the BMC, BIOS, and TEE OS firmware. -> By default, the TrustZone feature is disabled on the server. For details about how to enable the TrustZone feature on the server, see BIOS settings. 
- -### Environment Preparation - -For details, see [Environment Requirements](https://www.hikunpeng.com/document/detail/en/kunpengcctrustzone/fg-tz/kunpengtrustzone_20_0018.html) and [Procedure](https://www.hikunpeng.com/document/detail/en/kunpengcctrustzone/fg-tz/kunpengtrustzone_20_0019.html) on the Kunpeng official website. - -### Installation - -1. Configure the openEuler Yum repository. You can configure an online Yum repository (see the example below) or configure a local Yum repository by mounting an ISO file. - - ```shell - vi /etc/yum.repo/openEuler.repo - [osrepo] - name=osrepo - baseurl=http://repo.openeuler.org/openEuler-22.03-LTS/everything/aarch64/ - enabled=1 - gpgcheck=1 - gpgkey=http://repo.openeuler.org/openEuler-22.03-LTS/everything/aarch64/RPM-GPG-KEY-openEuler - ``` - -2. Install secGear. - - ```shell - #Install the compiler. - yum install cmake ocaml-dune - - #Install secGear. - yum install secGear-devel - - #Check whether the installations are successful. If the command output is as follows, the installations are successful. - rpm -qa | grep -E 'secGear|itrustee|ocaml-dune' - itrustee_sdk-xxx - itrustee_sdk-devel-xxx - secGear-xxx - secGear-devel-xxx - ocaml-dune-xxx - ``` - -## x86 Environment - -### Environment Requirements - -#### Hardware - -Processor that supports the Intel SGX feature - -### Environment Preparation - -Purchase a device that supports the Intel SGX feature and enable the SGX feature by referring to the BIOS setting manual of the device. - -### Installation - -1. Configure the openEuler Yum repository. You can configure an online Yum repository (see the example below) or configure a local Yum repository by mounting an ISO file. - - ```shell - vi openEuler.repo - [osrepo] - name=osrepo - baseurl=http://repo.openeuler.org/openEuler-22.03-LTS/everything/x86_64/ - enabled=1 - gpgcheck=1 - gpgkey=http://repo.openeuler.org/openEuler-22.03-LTS/everything/x86_64/RPM-GPG-KEY-openEuler - ``` - -2. Install secGear. 
- - ```shell - # Install the compiler. - yum install cmake ocaml-dune - - # Install secGear. - yum install secGear-devel - - # Check whether the installations are successful. If the command output is as follows, the installations are successful. - rpm -qa | grep -E 'secGear|ocaml-dune|sgx' - secGear-xxx - secGear-devel-xxx - ocaml-dune-xxx - libsgx-epid-xxx - libsgx-enclave-common-xxx - libsgx-quote-ex-xxx - libsgx-aesm-launch-plugin-xxx - libsgx-uae-service-xxx - libsgx-ae-le-xxx - libsgx-urts-xxx - sgxsdk-xxx - sgx-aesm-service-xxx - linux-sgx-driver-xxx - libsgx-launch-xxx - ``` diff --git a/docs/en/server/security/secgear/using_secgear_tools.md b/docs/en/server/security/secgear/using_secgear_tools.md deleted file mode 100644 index 9650186ef995de4b54023c55de61c4b0e1291861..0000000000000000000000000000000000000000 --- a/docs/en/server/security/secgear/using_secgear_tools.md +++ /dev/null @@ -1,149 +0,0 @@ -# secGear Tools - -secGear provides a tool set to facilitate application development. This document describes the tools and how to use them. - -## Code Generation Tool: codegener - -### Overview - -secGear codegener is a tool developed based on Intel SGX SDK edger8r. It is used to parse the EDL file to generate intermediate C code, that is, to assist in generating code that is called between the TEE and REE. - -The EDL file format defined by secGear codegener is the same as that defined by Intel SGX SDK edger8r, but the complete syntax definition of Intel is not supported: - -- The public can be used only in methods. Functions without public are declared as private by default. -- Switchless calls from the REE to the TEE and from the TEE to the REE are not supported. -- The Outside Call (OCALL) does not support some calling modes (such as cdecl, stdcall, and fastcall). - -The EDL file syntax is similar to the C language syntax. 
The following describes parts different from the C language syntax: - -| Member | Description | -| ----------------------- | ------------------------------------------------------------ | -| include "my_type.h" | Uses the type defined in the external inclusion file. | -| trusted | Declares that secure functions are available on the trusted application (TA) side. | -| untrusted | Declares that insecure functions are available on the TA side. | -| return_type | Defines the return value type. | -| parameter_type | Defines the parameter type. | -| \[in, size = len] | For the ECALL, this parameter indicates that data needs to be transferred from the REE to the TEE. For the OCALL, this parameter is required for the pointer type, and size indicates the buffer that is actually used. | -| \[out, size = len] | For the ECALL, this parameter indicates that data needs to be transferred from the TEE to the REE. For the OCALL, this parameter needs to be used for the pointer type, and size indicates the buffer that is actually used.| - -### Usage Instructions - -#### Command Format - -The format of the codegen command is as follows: - -- x86_64 architecture: - -**codegen_x86_64** < --trustzone \| --sgx > \[--trusted-dir \ \| **--untrusted-dir** \\| --trusted \| --untrusted ] edlfile - -ARM architecture - -**codegen_arm64** < --trustzone \| --sgx > \[--trusted-dir \ \| **--untrusted-dir** \\| --trusted \| --untrusted ] edlfile - -#### Parameter Description - -The parameters are described as follows: - -| Parameter | Mandatory/Optional | Description | -| ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| --trustzone \| --sgx | Mandatory | Generates the API function corresponding to the confidential computing architecture only in the current command directory. 
If no parameter is specified, the SGX API function is generated by default. | -| --search-path \ | Optional | Specifies the search path of the file that the EDL file to be converted depends on. | -| --use-prefix | Optional | Adds a prefix to the proxy function name. The prefix is the name of the EDL file. | -| --header-only | Optional | Specifies that the code generation tool generates only header files. | -| --trusted-dir \ | Optional | Specifies the directory where the generated TEE auxiliary code is stored. If this parameter is not specified, the current path is used by default. | -| --untrusted-dir \ | Optional | Specifies the directory where the auxiliary code for generating insecure functions is located. | -| --trusted | Optional | Generates TEE auxiliary code. | -| --untrusted | Optional | Generates REE auxiliary code. | -| edlfile | Mandatory | EDL file to be converted, for example, hello.edl. | - -#### Examples - -- Convert *helloworld.edl* to generate TEE auxiliary code in *enclave-directory* and generate REE auxiliary code in *host-directory*. An example command is as follows: - -```shell -codegen_x86_64 --sgx --trusted-dir enclave-directory --untrusted-dir host-directory helloworld.edl -``` - -- Convert *helloworld.edl* to generate TEE auxiliary code in the current directory. The following is a command example for not generating REE auxiliary code: - -```shell -codegen_x86_64 --sgx --trusted helloworld.edl -``` - -- Convert *helloworld.edl* to generate REE auxiliary code in the current directory. The following is a command example that does not generate TEE auxiliary code: - -```shell -codegen_x86_64 --sgx --untrusted helloworld.edl -``` - -- Convert *helloworld.edl*. 
An example of the command for generating TEE and REE auxiliary code in the current directory is as follows: - -```shell -codegen_x86_64 --sgx helloworld.edl -``` - -## Signature Tool: sign_tool - -### Overview - -secGear sign_tool is a command line tool, including the compilation tool chain and signature tool, which are used for enclave signing. The sign_tool has two signature modes: - -- Single-step signature: applies only to the debugging mode. -- Two-step signature: applies to the commercial scenario. Obtain the signature private key from a third-party platform or an independent security device to sign the enclave. - -### Operation Instructions - -#### Format - -The sign_tool contains the sign command (for signing the enclave) and the digest command (for generating the digest value). Command format: - -**sign_tool.sh -d** \[sign \| digest] **-x** \ **-i** \ **-p** \ **-s** \ \[OPTIONS] **-o** \ - -#### Parameter Description - -| sign Command Parameter | Description | Mandatory/Optional | -| -------------- | -------------------------------------------------------------| -------------------------------------------- | -| -a \ | api_level, which identifies the GP API version of the iTrustee TA. The default value is 1. | Optional | -| -c \ | Configuration file | Optional | -| -d \ | Specifies the operation (sign or digest) to be performed by the signature tool. | Only the sign operation is performed in single-step mode. In two-step mode, the digest operation must be performed before the sign operation. | -| -e \ | Public key certificate of the device, which is used to protect the AES key for encrypting rawdata (mandatory for iTrustee). | This parameter is mandatory only for the iTrustee type. | -| -f \ | OTRP_FLAG, which determines whether to support the OTRP standard protocol. The default value is 0. | Optional | -| -i \ | Library file to be signed. | Mandatory | -| -k \ | Private key (PEM file) required for one-step signature. 
| This parameter is mandatory only for the SGX type. | -| -m \ | Security configuration file manifest.txt, which is configured by users. | Only the iTrustee type is mandatory. | -| -o \ | Output file. | Mandatory | -| -p \ | Public key certificate (PEM file) of the signature server required for two-step signing. | Mandatory | -| -s \ | Signed digest value required for two-step signing. | Mandatory | -| -t \ | TA_TYPA, which identifies TA binary format of the iTrustee. The default value is 1. | Optional | -| -x \ | enclave type (sgx or trustzone) | Mandatory | -| -h | Prints the help information. | Optional | - -#### Single-Step Signature - -Set the enclave type is SGX, sign the test.enclave, and generate the signature file signed.enclave. The following is an example: - -```shell -sign_tool.sh -d sign -x sgx -i test.enclave -k private_test.pem -o signed.enclave -``` - -#### Two-Step Signature - -The following uses SGX as an example to describe the two-step signature procedure: - -1. Generate digest value. - - Use the sign_tool to generate the digest value digest.data and the temporary intermediate file signdata. The file is used when the signature file is generated and is automatically deleted after being signed. Example: - - ```shell - sign_tool.sh -d digest -x sgx -i input -o digest.data - ``` - -2. Send digest.data to the signature authority or platform and obtain the corresponding signature. - -3. Use the obtained signature to generate the signed dynamic library signed.enclave. - - ```shell - sign_tool.sh -d sign -x sgx-i input -p pub.pem -s signature -o signed.enclave - ``` - -Note: To release an official version of applications supported by Intel SGX, you need to apply for an Intel whitelist. For details about the process, see the Intel document at . 
diff --git a/docs/en/tools/application/_toc.yaml b/docs/en/tools/application/_toc.yaml index 7feac471a35fd08bb17b846655b57ea6fbfb6d8c..22033c907e980a12e9eac7107c97528ee9b6f71f 100644 --- a/docs/en/tools/application/_toc.yaml +++ b/docs/en/tools/application/_toc.yaml @@ -2,3 +2,4 @@ label: Application sections: - href: upstream: https://gitee.com/openeuler/ros/blob/master/docs/en/_toc.yaml + path: ./ros diff --git a/docs/en/tools/cloud/_toc.yaml b/docs/en/tools/cloud/_toc.yaml index 2322c04e3da2b38986a30e577addfdbeae3f2249..b044c0e6d3ab508898f488cacd1dd7a8a5c6a37a 100644 --- a/docs/en/tools/cloud/_toc.yaml +++ b/docs/en/tools/cloud/_toc.yaml @@ -1,6 +1,8 @@ label: Cloud Native sections: - - href: ./ctinspector/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/CTinspector/blob/master/docs/en/_toc.yaml + path: ./ctinspector - href: upstream: https://gitee.com/openeuler/Cpds/blob/master/docs/en/_toc.yaml path: ./cpds diff --git a/docs/en/tools/cloud/ctinspector/_toc.yaml b/docs/en/tools/cloud/ctinspector/_toc.yaml deleted file mode 100644 index 97cda4ea490c194d9d948303c1327e089f8a78d4..0000000000000000000000000000000000000000 --- a/docs/en/tools/cloud/ctinspector/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: CTinspector Introduction -isManual: true -description: CTinspector enables precise diagnosis of runtime performance bottlenecks and system faults. 
-sections: - - label: Installation and Deployment - href: ./ctinspector_introduction.md - - label: Installation and Deployment - href: ./installation_and_deployment.md - - label: Usage Instructions - href: ./usage_instructions.md diff --git a/docs/en/tools/cloud/ctinspector/ctinspector_introduction.md b/docs/en/tools/cloud/ctinspector/ctinspector_introduction.md deleted file mode 100644 index 3fe26197753f175c18f491c28ced200755037952..0000000000000000000000000000000000000000 --- a/docs/en/tools/cloud/ctinspector/ctinspector_introduction.md +++ /dev/null @@ -1,48 +0,0 @@ -# CTinspector Introduction - -## Overview - -CTinspector is a language VM framework developed by China Telecom e-Cloud Technology Co., Ltd. based on the eBPF instruction set. CTinspector enables quick expansion of application instances to diagnose network performance bottlenecks, storage I/O hotspots, and load balancing issues, ensuring stable and timely diagnosis during system running. - -Before CTinspector introduces the O&M and problem analysis of the cloud base system, the OVS O&M and ACL configuration efficiency is relatively low, and some functions are not supported. - -* The filtering field needed by the maintenance personnel is not implemented, or the AND or NOT condition expression is not supported. - -* Many commands in the system have similar filtering requirements, such as CT flow tables, OpenFlow flow tables, and offload flow tables. Developing command parameters for each flow table is a heavy development burden. - -* Stateful filtering, for example, viewing the flow table that matches the most packets, cannot be implemented based on command parameters. Traditional filtering rules are for individual flow tables. The relationships between flow tables cannot be established. - -## Architecture - -CTinspector uses a packet VM of the eBPF instruction set. The minimum size of the packet VM is 256 bytes, covering registers, segments (stack, code, and data), and page tables. 
The packet VM supports independent migration, in which the packet VM code can invoke the migrate kernel function to migrate to a specified node. It also supports resumable execution, that is, once migrated, the packet VM continues to execute the next instruction from the position where it has been interrupted on the previous node. - -![](./figures/CT-package-vm.png) - -The overall architecture of CTinspector is as follows: - -![](./figures/CTinspector-arch.png) - -The CTinspector framework comprises the following components: - -* **eBPF compiler/JIT**: - The eBPF compiler compiles C code into eBPF binary code, and JIT compiles eBPF instructions into -machine code. - -* **eBPF linker/loader**: - loads and links library functions, that is, kernel functions. - -* **Runner**: - executes the eBPF VM, including loading registers, code segments, and stacks, and mapping data segments. - -* **Scheduler**: - determines when to execute the eBPF VM, including determining the VM status and dependency wait conditions. - -* **Basic kernel functions**: - basic library functions, such as transporter, memory mapper, fork, and join_meeting. - -* **Extended kernel functions**: - custom library functions provided by each hook point in addition to the core functions -provided by the eBPF VM runner. - -* **Memory mapper**: - maps application data to the eBPF VM to ensure the eBPF program can read and write application data. 
diff --git a/docs/en/tools/cloud/ctinspector/figures/CT-package-vm.png b/docs/en/tools/cloud/ctinspector/figures/CT-package-vm.png deleted file mode 100644 index bb1ad48a6f28f39b73776b67804332036c32bdce..0000000000000000000000000000000000000000 Binary files a/docs/en/tools/cloud/ctinspector/figures/CT-package-vm.png and /dev/null differ diff --git a/docs/en/tools/cloud/ctinspector/figures/CTinspector-arch.png b/docs/en/tools/cloud/ctinspector/figures/CTinspector-arch.png deleted file mode 100644 index 82f647b7c0a311c8af597ce3fabb3cdf93e5afcc..0000000000000000000000000000000000000000 Binary files a/docs/en/tools/cloud/ctinspector/figures/CTinspector-arch.png and /dev/null differ diff --git a/docs/en/tools/cloud/ctinspector/figures/migrate_node_1.png b/docs/en/tools/cloud/ctinspector/figures/migrate_node_1.png deleted file mode 100644 index 3d7ddb16959cf83235703f564d002f95396f1963..0000000000000000000000000000000000000000 Binary files a/docs/en/tools/cloud/ctinspector/figures/migrate_node_1.png and /dev/null differ diff --git a/docs/en/tools/cloud/ctinspector/figures/migrate_node_2.png b/docs/en/tools/cloud/ctinspector/figures/migrate_node_2.png deleted file mode 100644 index 99448ced22a6cd34d393ea31cff0ef67d43ec028..0000000000000000000000000000000000000000 Binary files a/docs/en/tools/cloud/ctinspector/figures/migrate_node_2.png and /dev/null differ diff --git a/docs/en/tools/cloud/ctinspector/installation_and_deployment.md b/docs/en/tools/cloud/ctinspector/installation_and_deployment.md deleted file mode 100644 index 03cabec817d4207ea32b18793d45712d988c98c2..0000000000000000000000000000000000000000 --- a/docs/en/tools/cloud/ctinspector/installation_and_deployment.md +++ /dev/null @@ -1,42 +0,0 @@ -# Installation and Deployment - -## Software Requirements - -* OS: openEuler 23.09 - -## Hardware Requirements - -* x86_64 architecture - -## Environment Preparation - -* Install openEuler by referring to [Installation 
Guide](../../../server/installation_upgrade/installation/installation_on_servers.md) - -* CTinspector installation requires **root** permissions. - -## CTinspector Installation - -* Install the CTinspector framework software package. - -```shell -yum install ctinspector -``` - -* Check whether the installation is successful. If the corresponding software package is displayed in the output, the installation is successful. - -```shell -rpm -q ctinspector -``` - -* Check whether the core dynamic library **libebpf_vm_executor.so** or main program **vm_test** is installed. - -```shell -rpm -ql ctinspector -/usr/bin/vm_test -/usr/include/ctinspector/ebpf_vm_functions.h -/usr/include/ctinspector/ebpf_vm_simulator.h -/usr/include/ctinspector/ebpf_vm_transport_rdma.h -/usr/include/ctinspector/list.h -/usr/include/ctinspector/ub_list.h -/usr/lib64/libebpf_vm_executor.so -``` diff --git a/docs/en/tools/cloud/ctinspector/usage_instructions.md b/docs/en/tools/cloud/ctinspector/usage_instructions.md deleted file mode 100644 index f2aae495057bd4fb920a4e6f9f0506d2193c5a0c..0000000000000000000000000000000000000000 --- a/docs/en/tools/cloud/ctinspector/usage_instructions.md +++ /dev/null @@ -1,45 +0,0 @@ -# Usage - -## NIC Configuration - -```shell -# Change the MTU of the NIC. -ifconfig ens33 mtu 4200 - -# Add an RXE interface to ens33 for the IB function. -rdma link add rxe_0 type rxe netdev ens33 - -``` - -## Application Development - -Use relevant APIs to develop a scenario-specific application. Build the application as a binary ELF file based on the eBPF instruction set. Take **vm_migrate** of the provided **ebpf_example** for example. **vm_migrate** calls the CTinspector framework and can migrate package VMs between nodes in a resumable manner. - -```text -# Compose the Makefile and set the eBPF instruction set. 
- -CFLAGS=-O2 -fno-inline -emit-llvm -I/usr/include/ctinspector/ -LINKFLAGS=-march=bpf -filetype=obj - -all: vm_migrate.o - -vm_migrate.o: - clang $(CFLAGS) -c migrate.c -o - | llc $(LINKFLAGS) -o vm_migrate.o - -clean: - rm -f vm_migrate.o -``` - -```shell -# Run make to build the application. -make -clang -O2 -fno-inline -emit-llvm -I/usr/include/ctinspector/ -c migrate.c -o - | llc -march=bpf -filetype=obj -o vm_migrate.o -``` - -## Application Running - -Running **vm_migrate** on node 1. -![](./figures/migrate_node_1.png) - -Running the CTinspector main program on node 2. -![](./figures/migrate_node_2.png) diff --git a/docs/en/tools/community_tools/_toc.yaml b/docs/en/tools/community_tools/_toc.yaml index cc02d0e0c31d2ea3925ec3a53a928a0218946491..fe298ecd5c564e41730c7ad84363e95d7e0d72ea 100644 --- a/docs/en/tools/community_tools/_toc.yaml +++ b/docs/en/tools/community_tools/_toc.yaml @@ -12,7 +12,7 @@ sections: sections: - href: upstream: https://gitee.com/openeuler/compiler-docs/blob/master/docs/zh/gcc/_toc.yaml - path: ./development/gcc + path: ./gcc - label: Performance Optimization sections: - href: @@ -20,7 +20,7 @@ sections: path: ./atune - href: upstream: https://gitee.com/openeuler/oeAware-manager/blob/master/docs/zh/master/_toc.yaml - path: ./performance/oeaware + path: ./oeaware - label: Migration sections: - href: diff --git a/docs/en/tools/maintenance/_toc.yaml b/docs/en/tools/maintenance/_toc.yaml index 3237328c7950f0e0ec284999f495820c85d2a4d0..3d721b8186d0723add3a97bae83c98b43e7a0339 100644 --- a/docs/en/tools/maintenance/_toc.yaml +++ b/docs/en/tools/maintenance/_toc.yaml @@ -4,9 +4,9 @@ sections: sections: - href: upstream: https://gitee.com/openeuler/syscare/blob/master/docs/en/_toc.yaml - path: ./maintenance/syscare + path: ./syscare - label: System Monitoring sections: - href: upstream: https://gitee.com/openeuler/sysmonitor/blob/master/docs/en/_toc.yaml - path: ./maintenance/sysmonitor \ No newline at end of file + path: ./sysmonitor \ No 
newline at end of file diff --git a/docs/en/tools/security/_toc.yaml b/docs/en/tools/security/_toc.yaml index 4e9915d45350280f1646f35ce34e715c2a915f17..13e0a46ed6ef0a0dbaef1429d93ce382a28d9c4b 100644 --- a/docs/en/tools/security/_toc.yaml +++ b/docs/en/tools/security/_toc.yaml @@ -2,4 +2,4 @@ label: Security sections: - href: upstream: https://gitee.com/openeuler/secGear/blob/master/docs/en/master/_toc.yaml - path: ./security/secgear + path: ./secgear diff --git a/docs/en/virtualization/_toc.yaml b/docs/en/virtualization/_toc.yaml index b2bb39733ebbb447cc69909372a7b1288259d40b..2bfa2c89166c4c79e1d009ec5be4d1de1f2a38d4 100644 --- a/docs/en/virtualization/_toc.yaml +++ b/docs/en/virtualization/_toc.yaml @@ -9,6 +9,5 @@ sections: upstream: https://gitee.com/openeuler/Virt-docs/blob/master/docs/en/virtualization_platform/stratovirt/_toc.yaml path: ./virtulization_platform/stratovirt - label: openStack User Guide - href: >- - https://openstack-sig.readthedocs.io/zh/latest/ + href: https://openstack-sig.readthedocs.io/zh/latest/ description: Open source platform for cloud computing management \ No newline at end of file diff --git a/docs/zh/server/_toc.yaml b/docs/zh/server/_toc.yaml index cbb09bcb11327efdf85f56254513d26f6d90df01..95a905aefb11df753ef0664828e67a896f31a5b9 100644 --- a/docs/zh/server/_toc.yaml +++ b/docs/zh/server/_toc.yaml @@ -94,9 +94,6 @@ sections: - href: upstream: https://gitee.com/openeuler/A-Tune/blob/master/docs/zh/24.03_LTS_SP2/_toc.yaml path: ./atune - - label: 内存调优 - sections: - - href: ./performance/tlbi/_toc.yaml - label: 应用开发 sections: - href: ./development/application_dev/_toc.yaml diff --git a/docs/zh/server/maintenance/gala/_toc.yaml b/docs/zh/server/maintenance/gala/_toc.yaml deleted file mode 100644 index b9257519df8d35269df1df2c970701addffa08c9..0000000000000000000000000000000000000000 --- a/docs/zh/server/maintenance/gala/_toc.yaml +++ /dev/null @@ -1,10 +0,0 @@ -label: gala用户指南 -isManual: true -description: 故障智能检测、性能数据采集分析以及资源监测管理 
-sections: - - label: 使用gala_anteater - href: ./using_gala_anteater.md - - label: 使用gala_gopher - href: ./using_gala_gopher.md - - label: 使用gala_spider - href: ./using_gala_spider.md diff --git "a/docs/zh/server/maintenance/gala/figures/attach\346\265\201\347\250\213.png" "b/docs/zh/server/maintenance/gala/figures/attach\346\265\201\347\250\213.png" deleted file mode 100644 index 73b548cc332212f3ae2eec4dcec34c8af6e0e55a..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/maintenance/gala/figures/attach\346\265\201\347\250\213.png" and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/deadlock.png b/docs/zh/server/maintenance/gala/figures/deadlock.png deleted file mode 100644 index d4f863a1a87d7aad3128481c763ee715aefd0a9f..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/deadlock.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/deadlock2.png b/docs/zh/server/maintenance/gala/figures/deadlock2.png deleted file mode 100644 index 3be42a5a34f90c2f3b351c7077635c580ea847a7..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/deadlock2.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/deadlock3.png b/docs/zh/server/maintenance/gala/figures/deadlock3.png deleted file mode 100644 index 5ef1a08394daf6433e10f85a5b3c57df25c3e303..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/deadlock3.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/flame_muti_ins.png b/docs/zh/server/maintenance/gala/figures/flame_muti_ins.png deleted file mode 100644 index 5943c7fda223a7fde4d2987ad56af4ffa776bd81..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/flame_muti_ins.png and /dev/null differ diff --git 
"a/docs/zh/server/maintenance/gala/figures/gala-gopher\346\210\220\345\212\237\345\220\257\345\212\250\347\212\266\346\200\201.png" "b/docs/zh/server/maintenance/gala/figures/gala-gopher\346\210\220\345\212\237\345\220\257\345\212\250\347\212\266\346\200\201.png" deleted file mode 100644 index ab16e9d3661db3fd4adc6c605b2d2d08e79fdc1c..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/maintenance/gala/figures/gala-gopher\346\210\220\345\212\237\345\220\257\345\212\250\347\212\266\346\200\201.png" and /dev/null differ diff --git "a/docs/zh/server/maintenance/gala/figures/gala-spider\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" "b/docs/zh/server/maintenance/gala/figures/gala-spider\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" deleted file mode 100644 index c5a0768be63a98ef7ccc4a56996a8c715f7090af..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/maintenance/gala/figures/gala-spider\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" and /dev/null differ diff --git "a/docs/zh/server/maintenance/gala/figures/gopher\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" "b/docs/zh/server/maintenance/gala/figures/gopher\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" deleted file mode 100644 index f151965a21d11dd7a3e215cc4ef23d70d059f4b1..0000000000000000000000000000000000000000 Binary files "a/docs/zh/server/maintenance/gala/figures/gopher\350\275\257\344\273\266\346\236\266\346\236\204\345\233\276.png" and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete1.png b/docs/zh/server/maintenance/gala/figures/lockcompete1.png deleted file mode 100644 index 5848b114e02d09f23303da8cff7aef56216f655f..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete1.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete2.png 
b/docs/zh/server/maintenance/gala/figures/lockcompete2.png deleted file mode 100644 index ed02a882a145dafeafb76469f328085edecc6775..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete2.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete3.png b/docs/zh/server/maintenance/gala/figures/lockcompete3.png deleted file mode 100644 index 3992edc5b7ea61d8a2aa08ce47f0876b7d2e8cf3..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete3.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete4.png b/docs/zh/server/maintenance/gala/figures/lockcompete4.png deleted file mode 100644 index 049ac49bcc1fb71ea9fe6866bd27e84d0acf42b1..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete4.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete5.png b/docs/zh/server/maintenance/gala/figures/lockcompete5.png deleted file mode 100644 index 8b5cf5aaef43f125abdf3adb8a7f798dd2c86b54..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete5.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/lockcompete6.png b/docs/zh/server/maintenance/gala/figures/lockcompete6.png deleted file mode 100644 index c3b1f5f097b9e9bcabf75229eabc6ce8fe126a71..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/lockcompete6.png and /dev/null differ diff --git "a/docs/zh/server/maintenance/gala/figures/spider\346\213\223\346\211\221\345\205\263\347\263\273\345\233\276.png" "b/docs/zh/server/maintenance/gala/figures/spider\346\213\223\346\211\221\345\205\263\347\263\273\345\233\276.png" deleted file mode 100644 index 5823a116f384801e1197350f151b4d04ef519ac4..0000000000000000000000000000000000000000 Binary files 
"a/docs/zh/server/maintenance/gala/figures/spider\346\213\223\346\211\221\345\205\263\347\263\273\345\233\276.png" and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard-detail.png b/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard-detail.png deleted file mode 100644 index 2093808bc4e1654956f6143393757c1244f08f98..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard-detail.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard.png b/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard.png deleted file mode 100644 index 15f4917f5a0bfcf5dee1f8fe68e65635ffebd85e..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/tprofiling-dashboard.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/figures/tprofiling-run-arch.png b/docs/zh/server/maintenance/gala/figures/tprofiling-run-arch.png deleted file mode 100644 index 0ad835125a5e7b7f66938543de1e1c9d53706ce4..0000000000000000000000000000000000000000 Binary files a/docs/zh/server/maintenance/gala/figures/tprofiling-run-arch.png and /dev/null differ diff --git a/docs/zh/server/maintenance/gala/using_gala_anteater.md b/docs/zh/server/maintenance/gala/using_gala_anteater.md deleted file mode 100644 index d0f09a915db81fff5c92f9b228dcc573e0c62660..0000000000000000000000000000000000000000 --- a/docs/zh/server/maintenance/gala/using_gala_anteater.md +++ /dev/null @@ -1,394 +0,0 @@ -# gala-anteater使用手册 - -gala-anteater是一款基于AI的操作系统异常检测平台。主要提供时序数据预处理、异常点发现、异常上报等功能。基于线下预训练、线上模型的增量学习与模型更新,能够很好地适用于多维多模态数据故障诊断。 - -本文主要介绍如何部署和使用gala-anteater服务。 - -## 安装 - -挂载repo源: - -```basic -[everything] -name=everything -baseurl=https://dl-cdn.openeuler.openatom.cn/openEuler-{version}/everything/ -enabled=1 -gpgcheck=0 -priority=1 - -[EPOL] -name=EPOL -baseurl=https://dl-cdn.openeuler.openatom.cn/openEuler-{version}/EPOL/main/ 
-enabled=1 -gpgcheck=0 -priority=1 -``` - -安装gala-anteater: - -```bash -yum install gala-anteater -``` - -## 配置 - -> 说明:gala-anteater采用配置的config文件设置参数启动,配置文件位置: /etc/gala-anteater/config/gala-anteater.yaml。 - -### 配置文件默认参数 - -```yaml -Global: - data_source: "prometheus" - -Arangodb: - url: "http://localhost:8529" - db_name: "spider" - -Kafka: - server: "192.168.122.100" - port: "9092" - model_topic: "gala_anteater_hybrid_model" - rca_topic: "gala_cause_inference" - meta_topic: "gala_gopher_metadata" - group_id: "gala_anteater_kafka" - # auth_type: plaintext/sasl_plaintext, please set "" for no auth - auth_type: "" - username: "" - password: "" - -Prometheus: - server: "localhost" - port: "9090" - steps: "5" - -Aom: - base_url: "" - project_id: "" - auth_type: "token" - auth_info: - iam_server: "" - iam_domain: "" - iam_user_name: "" - iam_password: "" - ssl_verify: 0 - -Schedule: - duration: 1 - -``` - -| 参数 | 含义 | 默认值 | -| ----------- | --------------------------------------------------- | ---------------------------- | -| Global | | | -| data_source | 设置数据来源 | “prometheus” | -| Arangodb | | | -| url | 图数据库Arangodb的ip地址 | "" | -| db_name | 图数据库名 | "spider" | -| Kafka | | | -| server | Kafka Server的ip地址,根据安装节点ip配置 | | -| port | Kafka Server的port,如:9092 | | -| model_topic | 故障检测结果上报topic | "gala_anteater_hybrid_model" | -| rca_topic | 根因定位结果上报topic | "gala_cause_inference" | -| meta_topic | gopher采集指标数据topic | "gala_gopher_metadata" | -| group_id | kafka设置组名 | "gala_anteater_kafka" | -| Prometheus | | | -| server | Prometheus Server的ip地址,根据安装节点ip配置 | | -| port | Prometheus Server的port,如:9090 | | -| steps | 指标采样间隔 | | -| Schedule | | | -| duration | 异常检测模型执行频率(单位:分),每x分钟,检测一次 | 1 | - -## 启动 - -执行如下命令启动gala-anteater - -```shell -systemctl start gala-anteater -``` - -**注意**:gala-anteater支持启动一个进程实例,启动多个会导致内存占用过大,日志混乱。 - -### 故障注入 - -gala-anteater为故障检测与根因定位模块,测试阶段需要通过故障注入来构造故障,从而通过故障检测和根因定位模块获得故障节点信息和故障传播根因节点信息。 - -* 故障注入(仅提供参考) - - ```bash - chaosblade create disk burn 
--size 10 --read --write --path /var/lib/docker/overlay2/cf0a469be8a84cabe1d057216505f8d64735e9c63159e170743353a208f6c268/merged --timeout 120 - ``` - - *chaosblade 为故障注入工具, 可以模拟各种故障, 包括但不限于磁盘故障、网络故障、IO故障等待 - 备注: 通过注入不一样的故障, 指标采集器(例如 gala-gopher) 监控关联指标并上报到 prometheus 模块, prometheus graph 指标图部分关联指标会存在明显波动。 - -### 查询gala-anteater服务状态 - -若日志显示如下内容,说明服务启动成功,启动日志也会保存到当前运行目录下`logs/anteater.log`文件中。 - -```log -2022-09-01 17:52:54,435 - root - INFO - Run gala_anteater main function... -2022-09-01 17:52:54,436 - root - INFO - Start to try updating global configurations by querying data from Kafka! -2022-09-01 17:52:54,994 - root - INFO - Loads metric and operators from file: xxx\metrics.csv -2022-09-01 17:52:54,997 - root - INFO - Loads metric and operators from file: xxx\metrics.csv -2022-09-01 17:52:54,998 - root - INFO - Start to re-train the model based on last day metrics dataset! -2022-09-01 17:52:54,998 - root - INFO - Get training data during 2022-08-31 17:52:00+08:00 to 2022-09-01 17:52:00+08:00! -2022-09-01 17:53:06,994 - root - INFO - Spends: 11.995422840118408 seconds to get unique machine_ids! -2022-09-01 17:53:06,995 - root - INFO - The number of unique machine ids is: 1! -2022-09-01 17:53:06,996 - root - INFO - Fetch metric values from machine: xxxx. -2022-09-01 17:53:38,385 - root - INFO - Spends: 31.3896164894104 seconds to get get all metric values! -2022-09-01 17:53:38,392 - root - INFO - The shape of training data: (17281, 136) -2022-09-01 17:53:38,444 - root - INFO - Start to execute vae model training... 
-2022-09-01 17:53:38,456 - root - INFO - Using cpu device -2022-09-01 17:53:38,658 - root - INFO - Epoch(s): 0 train Loss: 136.68 validate Loss: 117.00 -2022-09-01 17:53:38,852 - root - INFO - Epoch(s): 1 train Loss: 113.73 validate Loss: 110.05 -2022-09-01 17:53:39,044 - root - INFO - Epoch(s): 2 train Loss: 110.60 validate Loss: 108.76 -2022-09-01 17:53:39,235 - root - INFO - Epoch(s): 3 train Loss: 109.39 validate Loss: 106.93 -2022-09-01 17:53:39,419 - root - INFO - Epoch(s): 4 train Loss: 106.48 validate Loss: 103.37 -... -2022-09-01 17:53:57,744 - root - INFO - Epoch(s): 98 train Loss: 97.63 validate Loss: 96.76 -2022-09-01 17:53:57,945 - root - INFO - Epoch(s): 99 train Loss: 97.75 validate Loss: 96.58 -2022-09-01 17:53:57,969 - root - INFO - Schedule recurrent job with time interval 1 minute(s). -2022-09-01 17:53:57,973 - apscheduler.scheduler - INFO - Adding job tentatively -- it will be properly scheduled when the scheduler starts -2022-09-01 17:53:57,974 - apscheduler.scheduler - INFO - Added job "partial" to job store "default" -2022-09-01 17:53:57,974 - apscheduler.scheduler - INFO - Scheduler started -2022-09-01 17:53:57,975 - apscheduler.scheduler - DEBUG - Looking for jobs to run -2022-09-01 17:53:57,975 - apscheduler.scheduler - DEBUG - Next wakeup is due at 2022-09-01 17:54:57.973533+08:00 (in 59.998006 seconds) -``` - -## 异常检测输出数据 - -gala-anteater如果检测到异常点,会将结果输出至kafka的model_topic,输出数据格式如下: - -```json -{ - "Timestamp":1659075600000, - "Attributes":{ - "entity_id":"xxxxxx_sli_1513_18", - "event_id":"1659075600000_1fd37742xxxx_sli_1513_18", - "event_type":"app" - }, - "Resource":{ - "anomaly_score":1.0, - "anomaly_count":13, - "total_count":13, - "duration":60, - "anomaly_ratio":1.0, - "metric_label":{ - "machine_id":"1fd37742xxxx", - "tgid":"1513", - "conn_fd":"18" - }, - "recommend_metrics":{ - "gala_gopher_tcp_link_notack_bytes":{ - "label":{ - "__name__":"gala_gopher_tcp_link_notack_bytes", - "client_ip":"x.x.x.165", - "client_port":"51352", - 
"hostname":"localhost.localdomain", - "instance":"x.x.x.172:8888", - "job":"prometheus-x.x.x.172", - "machine_id":"xxxxxx", - "protocol":"2", - "role":"0", - "server_ip":"x.x.x.172", - "server_port":"8888", - "tgid":"3381701" - }, - "score":0.24421279500639545 - }, - ... - }, - "metrics":"gala_gopher_ksliprobe_recent_rtt_nsec" - }, - "SeverityText":"WARN", - "SeverityNumber":14, - "Body":"TimeStamp, WARN, APP may be impacting sli performance issues." -} -``` - -## 根因定位输出数据 - -异常检测结果的每个异常节点都会触发根因定位,根因定位的结果会上报至kafka的rca_topic。输出数据格式如下: - -```yaml -{ - "Timestamp": 1724287883452, - "event_id": "1721125159975_475ae627-7e88-41ed-8bb8-ff0fee95a69d_l7_3459438_192.168.11.103_192.168.11.102_26_tcp_server_server_http", - "Attributes": { - "event_id": "1721125159975_475ae627-7e88-41ed-8bb8-ff0fee95a69d_l7_3459438_192.168.11.103_192.168.11.102_26_tcp_server_server_http", - "event_source": "root-cause-inference" - }, - "Resource": { - "abnormal_kpi": { - "metric_id": "gala_gopher_l7_latency_sum", - "entity_id": "", - "metric_labels": { - "client_ip": "192.168.11.103", - "comm": "python", - "container_id": "83d0c2f4a7f4", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-01-5bcb47fd7c-4jxxs_default_475ae627", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "l4_role": "tcp_server", - "l7_role": "server", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod_namespace": "default", - "protocol": "http", - "server_ip": "192.168.11.102", - "server_port": "26", - "ssl": "no_ssl", - "tgid": "3459438" - }, - "desc": "L7 session averaged latency.", - "score": 0.3498585816683402 - }, - "cause_metrics": [ - { - "metric_id": "gala_gopher_container_cpu_user_seconds_total@4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "entity_id": "", - "metric_labels": { - "container_id": "1319ff912a6f", - "container_image": 
"ba2d060a624e", - "container_name": "/k8s_backend_backend-node3-02-654dd97bf9-s8jg5_default_4a9fcc23", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "machine_id": "494a61be-23cc-4c97-a871-902866e43747-192.168.122.103", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod_namespace": "default" - }, - "desc": "\u5bb9\u56681s\u5185\u7528\u6237\u6001CPU\u8d1f\u8f7d", - "keyword": "process", - "score": 0.1194249668036936, - "path": [ - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - }, - { - "metric_id": "gala_gopher_proc_wchar_bytes@67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "entity_id": "", - "metric_labels": { - "cmdline": "python ./backend.py ", - "comm": "python", - "container_id": "de570c7328bb", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-02-548c79d989-bnl9g_default_67134fb4", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pgid": "3459969", - "pod": "default/backend-node2-02-548c79d989-bnl9g", - "pod_id": "67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "pod_namespace": "default", - "ppid": "3459936", - "start_time": "1139543501", - "tgid": "3459969" - }, - "desc": "\u8fdb\u7a0b\u7cfb\u7edf\u8c03\u7528\u81f3FS\u7684\u5199\u5b57\u8282\u6570", - "keyword": "process", - "score": 0.37121879175399997, - "path": [ - { - "pod_id": "67134fb4-b2a3-43c5-a5b3-b3b463ad7d43", - "pod": "default/backend-node2-02-548c79d989-bnl9g", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": 
"normal" - }, - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - }, - { - "metric_id": "gala_gopher_l7_latency_avg@956c70a2-9918-459c-a0a8-39396251f952", - "entity_id": "", - "metric_labels": { - "client_ip": "192.168.11.103", - "comm": "python", - "container_id": "eef1ca1082a7", - "container_image": "ba2d060a624e", - "container_name": "/k8s_backend_backend-node2-03-584f4c6cfd-w4d2b_default_956c70a2", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "l4_role": "tcp_server", - "l7_role": "server", - "machine_id": "66086618-3bad-489e-b17d-05245224f29a-192.168.122.102", - "pod": "default/backend-node2-03-584f4c6cfd-w4d2b", - "pod_id": "956c70a2-9918-459c-a0a8-39396251f952", - "pod_namespace": "default", - "protocol": "http", - "server_ip": "192.168.11.113", - "server_port": "26", - "ssl": "no_ssl", - "tgid": "3460169" - }, - "desc": "L7 session averaged latency.", - "keyword": null, - "score": 0.5624857367147617, - "path": [ - { - "pod_id": "956c70a2-9918-459c-a0a8-39396251f952", - "pod": "default/backend-node2-03-584f4c6cfd-w4d2b", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - }, - { - "pod_id": "4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929", - "pod": "default/backend-node3-02-654dd97bf9-s8jg5", - "instance": "192.168.122.103:8888", - "job": "192.168.122.103", - "pod_state": "normal" - }, - { - "pod_id": "475ae627-7e88-41ed-8bb8-ff0fee95a69d", - "pod": "default/backend-node2-01-5bcb47fd7c-4jxxs", - "instance": "192.168.122.102:8888", - "job": "192.168.122.102", - "pod_state": "abnormal" - } - ] - } - ] - }, - "desc": "L7 session averaged latency.", - 
"top1": "gala_gopher_container_cpu_user_seconds_total@4a9fcc23-8ba2-4b0a-bcb0-b1bfd89ed929\u5f02\u5e38", - "top2": "gala_gopher_proc_wchar_bytes@67134fb4-b2a3-43c5-a5b3-b3b463ad7d43\u5f02\u5e38", - "top3": "gala_gopher_l7_latency_avg@956c70a2-9918-459c-a0a8-39396251f952\u5f02\u5e38", - "keywords": [ - "process", - null - ], - "SeverityText": "WARN", - "SeverityNumber": 13, - "Body": "A cause inferring event for an abnormal event" -} -``` diff --git a/docs/zh/server/maintenance/gala/using_gala_gopher.md b/docs/zh/server/maintenance/gala/using_gala_gopher.md deleted file mode 100644 index 7201dc1bdeb637bce19a50b059a12932218cce8a..0000000000000000000000000000000000000000 --- a/docs/zh/server/maintenance/gala/using_gala_gopher.md +++ /dev/null @@ -1,1130 +0,0 @@ -# gala-gopher使用手册 - -gala-gopher作为数据采集模块提供OS级的监控能力,支持动态加 /卸载探针,可无侵入式地集成第三方探针,快速扩展监控范围。 - -本文介绍如何部署和使用gala-gopher服务。 - -## 安装 - -挂载repo源: - -```basic -[oe-2309] # openEuler 2309 官方发布源 -name=oe2309 -baseurl=http://119.3.219.20:82/openEuler:/23.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2309:Epol] # openEuler 2309:Epol 官方发布源 -name=oe2309_epol -baseurl=http://119.3.219.20:82/openEuler:/23.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -安装gala-gopher: - -```bash -# yum install gala-gopher -``` - -## 配置 - -### 配置介绍 - -gala-gopher配置文件为`/opt/gala-gopher/gala-gopher.conf`,该文件配置项说明如下(省略无需用户配置的部分)。 - -如下配置可以根据需要进行修改: - -- global:gala-gopher全局配置信息 - - log_file_name:gala-gopher日志文件名 - - log_level:gala-gopher日志级别(暂未开放此功能) - - pin_path:ebpf探针共享map存放路径(建议维持默认配置) -- metric:指标数据metrics输出方式配置 - - out_channel:metrics输出通道,支持配置web_server|logs|kafka,配置为空则输出通道关闭 - - kafka_topic:若输出通道为kafka,此为topic配置信息 -- event:异常事件event输出方式配置 - - out_channel:event输出通道,支持配置logs|kafka,配置为空则输出通道关闭 - - kafka_topic:若输出通道为kafka,此为topic配置信息 - - timeout:同一异常事件上报间隔设置 - - desc_language:异常事件描述信息语言选择,当前支持配置zh_CN|en_US -- meta:元数据metadata输出方式配置 - - out_channel:metadata输出通道,支持logs|kafka,配置为空则输出通道关闭 - - 
kafka_topic:若输出通道为kafka,此为topic配置信息 -- ingress:探针数据上报相关配置 - - interval:暂未使用 -- egress:上报数据库相关配置 - - interval:暂未使用 - - time_range:暂未使用 -- imdb:cache缓存规格配置 - - max_tables_num:最大的cache表个数,/opt/gala-gopher/meta目录下每个meta对应一个表 - - max_records_num:每张cache表最大记录数,通常每个探针在一个观测周期内产生至少1条观测记录 - - max_metrics_num:每条观测记录包含的最大的metric指标个数 - - record_timeout:cache表老化时间,若cache表中某条记录超过该时间未刷新则删除记录,单位为秒 -- web_server:输出通道web_server配置 - - port:监听端口 -- rest_api_server - - port:RestFul API监听端口 - - ssl_auth:设置RestFul API开启https加密以及鉴权,on为开启,off为不开启,建议用户在实际生产环境开启 - - private_key:用于RestFul API https加密的服务端私钥文件绝对路径,当ssl_auth为“on”必配 - - cert_file:用于RestFul API https加密的服务端证书绝对路径,当ssl_auth为“on”必配 - - ca_file:用于RestFul API对客户端进行鉴权的CA中心证书绝对路径,当ssl_auth为“on”必配 -- kafka:输出通道kafka配置 - - kafka_broker:kafka服务器的IP和port - - batch_num_messages:每个批次发送的消息数量 - - compression_codec:消息压缩类型 - - queue_buffering_max_messages:生产者缓冲区中允许的最大消息数 - - queue_buffering_max_kbytes:生产者缓冲区中允许的最大字节数 - - queue_buffering_max_ms:生产者在发送批次之前等待更多消息加入的最大时间 -- logs:输出通道logs配置 - - metric_dir:metrics指标数据日志路径 - - event_dir:异常事件数据日志路径 - - meta_dir:metadata元数据日志路径 - - debug_dir:gala-gopher运行日志路径 - -#### 配置文件示例 - -- 配置选择数据输出通道: - - ```yaml - metric = - { - out_channel = "web_server"; - kafka_topic = "gala_gopher"; - }; - - event = - { - out_channel = "kafka"; - kafka_topic = "gala_gopher_event"; - }; - - meta = - { - out_channel = "kafka"; - kafka_topic = "gala_gopher_metadata"; - }; - ``` - -- 配置kafka和webServer: - - ```yaml - web_server = - { - port = 8888; - }; - - kafka = - { - kafka_broker = ":9092"; - }; - ``` - -### 启动 - -配置完成后,执行如下命令启动gala-gopher。 - -```bash -# systemctl start gala-gopher.service -``` - -查询gala-gopher服务状态。 - -```bash -# systemctl status gala-gopher.service -``` - -若显示结果如下,说明服务启动成功。需要关注开启的探针是否已启动,如果探针线程不存在,请检查配置文件及gala-gopher运行日志文件。 - -![gala-gopher成功启动状态](./figures/gala-gopher成功启动状态.png) - -> 说明:gala-gopher部署和运行均需要root权限。 - -### REST 动态配置接口 - -WEB server端口可配置(缺省9999),URL组织方式 ,比如火焰图的URL:(以下文档均以火焰图举例)。 - -#### 配置探针监控范围 - 
-探针默认关闭,可以通过API动态开启、设置监控范围。以火焰图为例,通过REST分别开启oncpu/offcpu/mem火焰图能力。并且监控范围支持进程ID、进程名、容器ID、POD四个维度来设置。 - -下面是火焰图同时开启oncpu, offcpu采集特性的API举例: - -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "cmd": { - "bin": "/opt/gala-gopher/extend_probes/stackprobe", - "check_cmd": "", - "probe": [ - "oncpu", - "offcpu" - ] - }, - "snoopers": { - "proc_id": [ - 101, - 102 - ], - "proc_name": [ - { - "comm": "app1", - "cmdline": "", - "debugging_dir": "" - }, - { - "comm": "app2", - "cmdline": "", - "debugging_dir": "" - } - ], - "pod_id": [ - "pod1", - "pod2" - ], - "container_id": [ - "container1", - "container2" - ] - } -}' -``` - -全量采集特性说明如下: - -| 采集特性 | 采集特性说明 | 采集子项范围 | 监控对象 | 启动文件 | 启动条件 | -| ------------- | ------------------------------------- | ------------------------------------------------------------ | ---------------------------------------- | ---------------------------------- | ------------------------- | -| flamegraph | 在线性能火焰图观测能力 | oncpu, offcpu, mem | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/stackprobe | NA | -| l7 | 应用7层协议观测能力 | l7_bytes_metrics、l7_rpc_metrics、l7_rpc_trace | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/l7probe | NA | -| tcp | TCP异常、状态观测能力 | tcp_abnormal, tcp_rtt, tcp_windows, tcp_rate, tcp_srtt, tcp_sockbuf, tcp_stats,tcp_delay | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/tcpprobe | NA | -| socket | Socket(TCP/UDP)异常观测能力 | tcp_socket, udp_socket | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/endpoint | NA | -| io | Block层I/O观测能力 | io_trace, io_err, io_count, page_cache | NA | $gala-gopher-dir/ioprobe | NA | -| proc | 进程系统调用、I/O、DNS、VFS等观测能力 | base_metrics, proc_syscall, proc_fs, proc_io, proc_dns,proc_pagecache | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/taskprobe | NA | -| jvm | JVM层GC, 线程, 内存, 缓存等观测能力 | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/jvmprobe | NA | -| ksli | Redis性能SLI(访问时延)观测能力 | NA | 
proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/ksliprobe | NA | -| postgre_sli | PG DB性能SLI(访问时延)观测能力 | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/pgsliprobe | NA | -| opengauss_sli | openGauss访问吞吐量观测能力 | NA | [ip, port, dbname, user,password] | $gala-gopher-dir/pg_stat_probe.py | NA | -| dnsmasq | DNS会话观测能力 | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/rabbitmq_probe.sh | NA | -| lvs | lvs会话观测能力 | NA | NA | $gala-gopher-dir/trace_lvs | lsmod\|grep ip_vs\| wc -l | -| nginx | Nginx L4/L7层会话观测能力 | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/nginx_probe | NA | -| haproxy | Haproxy L4/7层会话观测能力 | NA | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/trace_haproxy | NA | -| kafka | kafka 生产者/消费者topic观测能力 | NA | dev, port | $gala-gopher-dir/kafkaprobe | NA | -| baseinfo | 系统基础信息 | cpu, mem, nic, disk, net, fs, proc,host | proc_id, proc_name, pod_id, container_id | system_infos | NA | -| virt | 虚拟化管理信息 | NA | NA | virtualized_infos | NA | -| tprofiling | 线程级性能profiling观测能力 | oncpu, syscall_file, syscall_net, syscall_lock, syscall_sched | proc_id, proc_name, pod_id, container_id | $gala-gopher-dir/tprofiling | NA | -| container | 容器信息 | NA | proc_id, proc_name, container_id | $gala-gopher-dir/cadvisor_probe.py | NA | - -#### 配置探针运行参数 - -探针在运行期间还需要设置一些参数设置,例如:设置火焰图的采样周期、上报周期。 - -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "params": { - "report_period": 180, - "sample_period": 180, - "metrics_type": [ - "raw", - "telemetry" - ] - } -}' -``` - -详细参数运行参数如下: - -| 参数 | 含义 | 缺省值&范围 | 单位 | 支持的监控范围 | gala-gopher是否支持 | -| ------------------ | ------------------------------------------------------ | ------------------------------------------------------------ | ------- | ------------------------ | ------------------- | -| sample_period | 采样周期 | 5000, [100~10000] | ms | io, tcp | Y | -| report_period | 上报周期 | 60, [5~600] | s | ALL | Y | -| latency_thr | 时延上报门限 | 
0, [10~100000] | ms | tcp, io, proc, ksli | Y | -| offline_thr | 进程离线上报门限 | 0, [10~100000] | ms | proc | Y | -| drops_thr | 丢包上送门限 | 0, [10~100000] | package | tcp, nic | Y | -| res_lower_thr | 资源百分比下限 | 0%, [0%~100%] | percent | ALL | Y | -| res_upper_thr | 资源百分比上限 | 0%, [0%~100%] | percent | ALL | Y | -| report_event | 上报异常事件 | 0, [0, 1] | NA | ALL | Y | -| metrics_type | 上报telemetry metrics | raw, [raw, telemetry] | NA | ALL | N | -| env | 工作环境类型 | node, [node, container, kubenet] | NA | ALL | N | -| report_source_port | 是否上报源端口 | 0, [0, 1] | NA | tcp | Y | -| l7_protocol | L7层协议范围 | http, [http, pgsql, mysql, redis, kafka, mongo, rocketmq, dns] | NA | l7 | Y | -| support_ssl | 支持SSL加密协议观测 | 0, [0, 1] | NA | l7 | Y | -| multi_instance | 是否每个进程输出独立火焰图 | 0, [0, 1] | NA | flamegraph | Y | -| native_stack | 是否显示本地语言堆栈(针对JAVA进程) | 0, [0, 1] | NA | flamegraph | Y | -| cluster_ip_backend | 执行Cluster IP backend转换 | 0, [0, 1] | NA | tcp,l7 | Y | -| pyroscope_server | 设置火焰图UI服务端地址 | localhost:4040 | NA | flamegraph | Y | -| svg_period | 火焰图svg文件生成周期 | 180, [30, 600] | s | flamegraph | Y | -| perf_sample_period | oncpu火焰图采集堆栈信息的周期 | 10, [10, 1000] | ms | flamegraph | Y | -| svg_dir | 火焰图svg文件存储目录 | "/var/log/gala-gopher/stacktrace" | NA | flamegraph | Y | -| flame_dir | 火焰图原始堆栈信息存储目录 | "/var/log/gala-gopher/flamegraph" | NA | flamegraph | Y | -| dev_name | 观测的网卡/磁盘设备名 | "" | NA | io, kafka, ksli, postgre_sli,baseinfo, tcp | Y | -| continuous_sampling | 是否持续采样 | 0, [0, 1] | NA | ksli | Y | -| elf_path | 要观测的可执行文件的路径 | "" | NA | nginx, haproxy, dnsmasq | Y | -| kafka_port | 要观测的kafka端口号 | 9092, [1, 65535] | NA | kafka | Y | -| cadvisor_port | 启动的cadvisor端口号 | 8080, [1, 65535] | NA | cadvisor | Y | - -#### 启动、停止探针 - -```sh -curl -X PUT http://localhost:9999/flamegraph --data-urlencode json=' -{ - "state": "running" // optional: running,stopped -}' -``` - -#### 约束与限制说明 - -1. 接口为无状态形式,每次上传的设置为该探针的最终运行结果,包括状态、参数、监控范围。 -2. 监控对象可以任意组合,监控范围取合集。 -3. 启动文件必须真实有效。 -4. 
采集特性可以按需开启全部/部分能力,关闭时只能整体关闭某个采集特性。 -5. opengauss监控对象是DB实例(IP/Port/dbname/user/password)。 -6. 接口每次最多接收2048长度的数据。 - -#### 获取探针配置与运行状态 - -```sh -curl -X GET http://localhost:9999/flamegraph -{ - "cmd": { - "bin": "/opt/gala-gopher/extend_probes/stackprobe", - "check_cmd": "" - "probe": [ - "oncpu", - "offcpu" - ] - }, - "snoopers": { - "proc_id": [ - 101, - 102 - ], - "proc_name": [ - { - "comm": "app1", - "cmdline": "", - "debugging_dir": "" - }, - { - "comm": "app2", - "cmdline": "", - "debugging_dir": "" - } - ], - "pod_id": [ - "pod1", - "pod2" - ], - "container_id": [ - "container1", - "container2" - ] - }, - "params": { - "report_period": 180, - "sample_period": 180, - "metrics_type": [ - "raw", - "telemetry" - ] - }, - "state": "running" -} -``` - -## stackprobe 介绍 - -适用于云原生环境的性能火焰图。 - -### 特性 - -- 支持观测C/C++、Go、Rust、Java语言应用。 - -- 调用栈支持容器、进程粒度:对于容器内进程,在调用栈底部分别以[Pod]和[Con]前缀标记工作负载Pod名称、容器Container名称。进程名以[\]前缀标识,线程及函数(方法)无前缀。 - -- 支持本地生成svg格式火焰图或上传调用栈数据到中间件。 - -- 支持依照进程粒度多实例生成/上传火焰图。 - -- 对于Java进程的火焰图,支持同时显示本地方法和Java方法。 - -- 支持oncpu/offcpu/mem等多类型火焰图。 - -- 支持自定义采样周期。 - -### 使用说明 - -启动命令示例(基本):使用默认参数启动性能火焰图。 - -```sh -curl -X PUT http://localhost:9999/flamegraph -d json='{ "cmd": {"probe": ["oncpu"] }, "snoopers": {"proc_name": [{ "comm": "cadvisor"}] }, "state": "running"}' -``` - -启动命令示例(进阶):使用自定义参数启动性能火焰图。完整可配置参数列表参见[配置探针运行参数](#配置探针运行参数)。 - -```sh -curl -X PUT http://localhost:9999/flamegraph -d json='{ "cmd": { "check_cmd": "", "probe": ["oncpu", "offcpu", "mem"] }, "snoopers": { "proc_name": [{ "comm": "cadvisor", "cmdline": "", "debugging_dir": "" }, { "comm": "java", "cmdline": "", "debugging_dir": "" }] }, "params": { "perf_sample_period": 100, "svg_period": 300, "svg_dir": "/var/log/gala-gopher/stacktrace", "flame_dir": "/var/log/gala-gopher/flamegraph", "pyroscope_server": "localhost:4040", "multi_instance": 1, "native_stack": 0 }, "state": "running"}' -``` - -下面说明主要配置项: - -- 设置开启的火焰图类型 - - 
通过probe参数设置,参数值为`oncpu`,`offcpu`,`mem`,分别代表进程cpu占用时间,进程被阻塞时间,进程申请内存大小的统计。 - - 示例: - - `"probe": ["oncpu", "offcpu", "mem"]` - -- 设置生成本地火焰图svg文件的周期 - - 通过svg_period参数设置,单位为秒,默认值180,可选设置范围为[30, 600]的整数。 - - 示例: - - `"svg_period": 300` - -- 开启/关闭堆栈信息上传到pyroscope - - 通过pyroscope_server参数设置,参数值需要包含addr和port,参数为空或格式错误则探针不会尝试上传堆栈信息。 - - 上传周期30s。 - - 示例: - - `"pyroscope_server": "localhost:4040"` - -- 设置调用栈采样周期 - - 通过perf_sample_period设置,单位为毫秒,默认值10,可选设置范围为[10, 1000]的整数,此参数仅对oncpu类型的火焰图有效。 - - 示例: - - `"perf_sample_period": 100` - -- 开启/关闭多实例生成火焰图 - - 通过multi_instance设置,参数值为0或1,默认值为0。值为0表示所有进程的火焰图会合并在一起,值为1表示分开生成每个进程的火焰图。 - - 示例: - - `"multi_instance": 1` - -- 开启/关闭本地调用栈采集 - - 通过native_stack设置,参数值为0或1,默认值为0。此参数仅对JAVA进程有效。值为0表示不采集JVM自身的调用栈,值为1表示采集JVM自身的调用栈。 - - 示例: - - `"native_stack": 1` - - 显示效果:(左"native_stack": 1,右"native_stack": 0) - - ![image-20230804172905729](./figures/flame_muti_ins.png) - -### 实现方案 - -#### 1. 用户态程序逻辑 - -周期性地(30s)根据符号表将内核态上报的堆栈信息从地址转换为符号。然后使用flamegraph插件或pyroscope将符号化的调用栈转换为火焰图。 - -其中,对于代码段类型获取符号表的方法不同。 - -- 内核符号表获取:读取/proc/kallsyms。 - -- 本地语言符号表获取:查询进程的虚拟内存映射文件(/proc/{pid}/maps),获取进程内存中各个代码段的地址映射,然后利用libelf库加载每个代码段对应模块的符号表。 - -- Java语言符号表获取: - - 由于 Java 方法没有静态映射到进程的虚拟地址空间,因此我们采用其他方式获取符号化的Java调用栈。 - -##### 方式一:perf观测 - -通过往Java进程加载JVM agent动态库来跟踪JVM的方法编译加载事件,获取并记录内存地址到Java符号的映射,从而实时生成Java进程的符号表。这种方法需要Java进程开启-XX:+PreserveFramePointer启动参数。本方式的优点是火焰图中可显示JVM自身的调用栈,而且这种方式生成的Java火焰图可以和其他进程的火焰图合并显示。 - -##### 方式二:JFR观测 - -通过动态开启JVM内置分析器JFR来跟踪Java应用程序的各种事件和指标。开启JFR的方式为往Java进程加载Java agent,Java agent中会调用JFR API。本方式的优点是对Java方法调用栈的采集会更加准确详尽。 - -上述两种针对Java进程的性能分析方法都可以实时加载(不需要重启Java进程)且具有低底噪的优点。当stackprobe的启动参数为"multi_instance": 1且"native_stack": 0时,stackprobe会使用方法二生成Java进程火焰图,否则会使用方法一。 - -#### 2. 
内核态程序逻辑 - -内核态基于eBPF实现。不同火焰图类型对应不同的eBPF程序。eBPF程序会周期性地或通过事件触发的方式遍历当前用户态和内核态的调用栈,并上报用户态。 - -##### 2.1 oncpu火焰图 - -在perf SW事件PERF_COUNT_SW_CPU_CLOCK上挂载采样eBPF程序,周期性采样调用栈。 - -##### 2.2 offcpu火焰图 - -在进程调度的tracepoint(sched_switch)上挂载采样eBPF程序,采样eBPF程序中记录进程被调度出去时间和进程id,在进程被调度回来时采样调用栈。 - -#### 2.3 mem火焰图 - -在缺页异常的tracepoint(page_fault_user)上挂载采样eBPF程序,事件触发时采样调用栈。 - -#### 3. Java语言支持 - -- stackprobe主进程: - - 1. 接收到ipc消息获取要观测的Java进程。 - 2. 使用Java代理加载模块向待观测的Java进程加载JVM代理程序:jvm_agent.so(对应[方式一](#方式一-perf观测))或JstackProbeAgent.jar(对应[方式二](#方式二-jfr观测))。 - 3. 方式一主进程会加载对应java进程的java-symbols.bin文件,供地址转换符号时查询。方式二主进程会加载对应java进程的stacks-{flame_type}.txt文件,可直接供火焰图生成。 - -- Java代理加载模块 - - 1. 发现新增java进程则将JVM代理程序复制到该进程空间下/proc/\/root/tmp(因为attach时容器内JVM需要可见此代理程序)。 - 2. 设置上述目录和JVM代理程序的owner和被观测java进程一致。 - 3. 启动jvm_attach子进程,并传入被观测java进程相关参数。 - -- JVM代理程序 - - - jvm_agent.so:注册JVMTI回调函数 - - 当JVM加载一个Java方法或者动态编译一个本地方法时JVM会调用回调函数,回调函数会将java类名和方法名以及对应的内存地址写入到被观测java进程空间下(/proc/\/root/tmp/java-data-\/java-symbols.bin)。 - - JstackProbeAgent.jar:调用JFR API - - 开启持续30s的JFR功能,并转换JFR统计结果为火焰图可用的堆栈格式,结果输出到到被观测java进程空间下(/proc/\/root/tmp/java-data-\/stacks-\.txt)。详见[JstackProbe简介](https://gitee.com/openeuler/gala-gopher/blob/dev/src/probes/extends/java.probe/jstack.probe/readme.md)。 - -- jvm_attach:用于实时加载JVM代理程序到被观测进程的JVM上 - (参考jdk源码中sun.tools.attach.LinuxVirtualMachine和jattach工具)。 - - 1. 设置自身的namespace(JVM加载agent时要求加载进程和被观测进程的namespace一致)。 - - 2. 检查JVM attach listener是否启动(是否存在UNIX socket文件:/proc/\/root/tmp/.java_pid\)。 - - 3. 未启动则创建/proc/\/cwd/.attach_pid\,并发送SIGQUIT信号给JVM。 - - 4. 连接UNIX socket。 - - 5. 
读取响应为0表示attach成功。 - - attach agent流程图示: - - ![attach流程](./figures/attach流程.png) - -### 注意事项 - -- 对于Java应用的观测,为获取最佳观测效果,请设置stackprobe启动选项为"multi_instance": 1, "native_stack": 0来使能JFR观测(JDK8u262+)。否则stackprobe会以perf方式来生成Java火焰图。perf方式下,请开启JVM选项XX:+PreserveFramePointer(JDK8以上)。 - -### 约束条件 - -- 支持基于hotspot JVM的Java应用观测。 - -## tprofiling 介绍 - -tprofiling 是 gala-gopher 提供的一个基于 ebpf 的线程级应用性能诊断工具,它使用 ebpf 技术观测线程的关键系统性能事件,并关联丰富的事件内容,从而实时地记录线程的运行状态和关键行为,帮助用户快速识别应用性能问题。 - -### 功能特性 - -从操作系统的视角来看,一个运行的应用程序是由多个进程组成,每个进程是由多个运行的线程组成。tprofiling 通过观测这些线程运行过程中执行的一些关键行为(后面称之为**事件**)并记录下来,然后在前端界面以时间线的方式进行展示,进而就可以很直观地分析这些线程在某段时间内正在做什么,是在 CPU 上执行还是阻塞在某个文件、网络操作上。当应用程序出现性能问题时,通过分析对应线程的关键性能事件的执行序列,快速地进行定界定位。 - -基于当前已实现的事件观测范围, tprofiling 能够定位的应用性能问题场景主要包括: - -- 文件 I/O 耗时、阻塞问题 -- 网络 I/O 耗时、阻塞问题 -- 锁竞争问题 -- 死锁问题 - -随着更多类型的事件不断地补充和完善,tprofiling 将能够覆盖更多类型的应用性能问题场景。 - -### 事件观测范围 - -tprofiling 当前支持的系统性能事件包括两大类:系统调用事件和 oncpu 事件。 - -**系统调用事件** - -应用性能问题通常是由于系统资源出现瓶颈导致,比如 CPU 资源占用过高、I/O 资源等待。应用程序往往通过系统调用访问这些系统资源,因此可以对关键的系统调用事件进行观测来识别耗时、阻塞的资源访问操作。 - -tprofiling 当前已观测的系统调用事件参见章节: [支持的系统调用事件](#支持的系统调用事件) ,大致分为几个类型:文件操作(file)、网络操作(net)、锁操作(lock)和调度操作(sched)。下面列出部分已观测的系统调用事件: - -- 文件操作(file) - - read/write:读写磁盘文件或网络,可能会耗时、阻塞。 - - sync/fsync:对文件进行同步刷盘操作,完成前线程会阻塞。 -- 网络操作(net) - - send/recv:读写网络,可能会耗时、阻塞。 -- 锁操作(lock) - - futex:用户态锁实现相关的系统调用,触发 futex 往往意味出现锁竞争,线程可能进入阻塞状态。 -- 调度操作(sched):这里泛指那些可能会引起线程状态变化的系统调用事件,如线程让出 cpu 、睡眠、或等待其他线程等。 - - nanosleep:线程进入睡眠状态。 - - epoll_wait:等待 I/O 事件到达,事件到达之前线程会阻塞。 - -**oncpu 事件** - -此外,根据线程是否在 CPU 上运行可以将线程的运行状态分为两种:oncpu 和 offcpu ,前者表示线程正在 CPU 上运行,后者表示线程不在 CPU 上运行。通过观测线程的 oncpu 事件,可以识别线程是否正在执行耗时的 cpu 操作。 - -### 事件内容 - -线程 profiling 事件主要包括以下几部分内容。 - -- 事件来源信息:包括事件所属的线程ID、线程名、进程ID、进程名、容器ID、容器名、主机ID、主机名等信息。 - - - `thread.pid`:事件所属的线程ID。 - - `thread.comm`:事件所属的线程名。 - - `thread.tgid`:事件所属的进程ID。 - - `proc.name`:事件所属的进程名。 - - `container.id`:事件所属的容器ID。 - - `container.name`:事件所属的容器名。 - - `host.id`:事件所属的主机ID。 - - `host.name`:事件所属的主机名。 - -- 事件属性信息:包括公共的事件属性和扩展的事件属性。 - - - 
公共的事件属性:包括事件名、事件类型、事件开始时间、事件结束时间、事件执行时间等。 - - - `event.name`:事件名。 - - `event.type`:事件类型,目前支持 oncpu、file、net、lock、sched 五种。 - - `start_time`:事件开始时间,聚合事件中第一个事件的开始时间,关于聚合事件的说明参见章节:[聚合事件](#聚合事件) 。 - - `end_time`:事件结束时间,聚合事件中最后一个事件的结束时间。 - - `duration`:事件执行时间,值为(end_time - start_time)。 - - `count`:事件聚合数量。 - - - 扩展的事件属性:针对不同的系统调用事件,补充更加丰富的事件内容。如 read/write 文件或网络时,提供文件路径、网络连接以及函数调用栈等信息。 - - - `func.stack`:事件的函数调用栈信息。 - - `file.path`:文件类事件的文件路径信息。 - - `sock.conn`:网络类事件的tcp连接信息。 - - `futex.op`:futex系统调用事件的操作类型,取值为 wait 或 wake 。 - - 不同事件类型支持的扩展事件属性的详细情况参见章节:[支持的系统调用事件](#支持的系统调用事件) 。 - -### 事件输出 - -tprofiling 作为 gala-gopher 提供的一个扩展的 ebpf 探针程序,产生的系统事件会发送至 gala-gopher 处理,并由 gala-gopher 按照开源的 openTelemetry 事件格式对外输出,并通过 json 格式发送到 kafka 消息队列中。前端可以通过对接 kafka 消费 tprofiling 事件。 - -下面是线程 profiling 事件的一个输出示例: - -```json -{ - "Timestamp": 1661088145000, - "SeverityText": "INFO", - "SeverityNumber": 9, - "Body": "", - "Resource": { - "host.id": "", - "host.name": "", - "thread.pid": 10, - "thread.tgid": 10, - "thread.comm": "java", - "proc.name": "xxx.jar", - "container.id": "", - "container.name": "", - }, - "Attributes": { - values: [ - { - // common info - "event.name": "read", - "event.type": "file", - "start_time": 1661088145000, - "end_time": 1661088146000, - "duration": 0.1, - "count": 1, - // extend info - "func.stack": "read;", - "file.path": "/test.txt" - }, - { - "event.name": "oncpu", - "event.type": "oncpu", - "start_time": 1661088146000, - "end_time": 1661088147000, - "duration": 0.1, - "count": 1, - } - ] - } -} -``` - -部分字段说明: - -- `Timestamp`:事件上报的事件点。 -- `Resource`:包括事件来源信息。 -- `Attributes`:包括事件属性信息,它包含一个 `values` 列表字段,列表中的每一项表示一个属于相同来源的 tprofiling 事件,其中包含该事件的属性信息。 - -### 快速开始 - -#### 安装部署 - -tprofiling 是 gala-gopher 提供的一个扩展的 ebpf 探针程序,因此,需要先安装部署好 gala-gopher 软件,然后再开启 tprofiling 功能。 - -另外,为了能够在前端用户界面使用 tprofiling 的能力,[gala-ops](https://gitee.com/openeuler/gala-docs) 基于开源的 `kafka + logstash + elasticsearch + grafana` 可观测软件搭建了用于演示的 tprofiling 功能的用户界面,用户可以使用 gala-ops 
提供的部署工具进行快速部署。 - -#### 运行架构 - -![](./figures/tprofiling-run-arch.png) - -前端软件说明: - -- kafka:一个开源的消息队列中间件,用于接收并存储 gala-gopher 采集的 tprofiling 事件。 -- logstash:一个实时的开源日志收集引擎,用于从 kafka 消费 tprofiling 事件,经过过滤、转换等处理后发送至 elasticsearch 。 -- elasticsearch:一个开放的分布式搜索和分析引擎,用于储存经过处理后的 tprofiling 事件,供 grafana 查询和可视化展示。 -- grafana:一个开源的可视化工具,用于查询并可视化展示采集的 tprofiling 事件。用户最终通过 grafana 提供的用户界面来使用 tprofiling 的功能,分析应用性能问题。 - -#### 部署 tprofiling 探针 - -用户需要先安装好 gala-gopher,具体的安装部署说明可参考 [gala-gopher文档](https://gitee.com/openeuler/gala-gopher#快速开始) 。由于 tprofiling 事件会发送到 kafka 中,因此部署时需要配置好 kafka 的服务地址。 - -安装并运行 gala-gopher 后,使用 gala-gopher 提供的基于 HTTP 的动态配置接口启动 tprofiling 探针。 - -```sh -curl -X PUT http://:9999/tprofiling -d json='{"cmd": {"probe": ["oncpu", "syscall_file", "syscall_net", "syscall_sched", "syscall_lock"]}, "snoopers": {"proc_name": [{"comm": "java"}]}, "state": "running"}' -``` - -配置参数说明: - -- ``:部署 gala-gopher 的节点 IP。 -- `probe`:`cmd` 下的 `probe` 配置项指定了 tprofiling 探针观测的系统事件范围。其中,oncpu、syscall_file、syscall_net、syscall_sched、syscall_lock 分别对应 oncpu 事件、以及 file、net、sched、lock 四类系统调用事件。用户可根据需要只开启部分 tprofiling 事件类型的观测。 -- `proc_name`:`snoopers` 下的 `proc_name` 配置项用于过滤要观测的进程名。另外也可以通过 `proc_id` 配置项来过滤要观测的进程ID,详情参考:[REST 动态配置接口](#rest-动态配置接口)。 - -要关闭 tprofiling 探针,执行如下命令: - -```sh -curl -X PUT http://:9999/tprofiling -d json='{"state": "stopped"}' -``` - -#### 部署前端软件 - -使用 tprofiling 功能的用户界面需要用到的软件包括:kafka、logstash、elasticsearch、grafana。这些软件安装在管理节点,用户可以使用 gala-ops 提供的部署工具进行快速安装部署,参考:[在线部署文档](https://gitee.com/openeuler/gala-docs#%E5%9C%A8%E7%BA%BF%E9%83%A8%E7%BD%B2)。 - -在管理节点上,通过 [在线部署文档](https://gitee.com/openeuler/gala-docs#%E5%9C%A8%E7%BA%BF%E9%83%A8%E7%BD%B2) 获取部署脚本后,执行如下命令一键安装中间件:kafka、logstash、elasticsearch。 - -```sh -sh deploy.sh middleware -K <部署节点管理IP> -E <部署节点管理IP> -A -p -``` - -执行如下命令一键安装 grafana 。 - -```sh -sh deploy.sh grafana -P -E -``` - -#### 使用 - -完成上述部署动作后,即可通过浏览器访问 `http://[部署节点管理IP]:3000` 登录 grafana 来使用 A-Ops,登录用户名、密码默认均为 admin。 - -登录 grafana 界面后,找到名为 
`ThreadProfiling` 的 dashboard。 - -![image-20230628155002410](./figures/tprofiling-dashboard.png) - -点击进入 tprofiling 功能的前端界面,接下来就可以探索 tprofiling 的功能了。 - -![image-20230628155249009](./figures/tprofiling-dashboard-detail.png) - -### 使用案例 - -#### 案例1:死锁问题定位 - -![image-20230628095802499](./figures/deadlock.png) - -上图是一个死锁 Demo 进程的线程 profiling 运行结果,从饼图中进程事件执行时间的统计结果可以看到,这段时间内 lock 类型事件(灰色部分)占比比较高。下半部分是整个进程的线程 profiling 展示结果,纵轴展示了进程内不同线程的 profiling 事件的执行序列。其中,线程 `java` 为主线程一直处于阻塞状态,业务线程 `LockThd1` 和 `LockThd2` 在执行一些 oncpu 事件和 file 类事件后会间歇性的同时执行一段长时间的 lock 类事件。将光标悬浮到 lock 类型事件上可以查看事件内容,(如下图所示)它触发了 futex 系统调用事件,执行时间为 60 秒。 - -![image-20230628101056732](./figures/deadlock2.png) - -基于上述观测,我们可以发现业务线程 `LockThd1` 和 `LockThd2` 可能存在异常行为。接下来,我们可以进入线程视图,查看这两个业务线程 `LockThd1` 和 `LockThd2` 的线程 profiling 结果。 - -![image-20230628102138540](./figures/deadlock3.png) - -上图是每个线程的 profiling 结果展示,纵轴展示线程内不同事件类型的执行序列。从图中可以看到,线程 `LockThd1` 和 `LockThd2` 正常情况下会定期执行 oncpu 事件,其中包括执行一些 file 类事件和 lock 类事件。但是在某个时间点(10:17:00附近)它们会同时执行一个长时间的 lock 类型的 futex 事件,而且这段时间内都没有 oncpu 事件发生,说明它们都进入了阻塞状态。futex 是用户态锁实现相关的系统调用,触发 futex 往往意味出现锁竞争,线程可能进入阻塞状态。 - -基于上述分析,线程 `LockThd1` 和 `LockThd2` 很可能是出现了死锁问题。 - -#### 案例2:锁竞争问题定位 - -![image-20230628111119499](./figures/lockcompete1.png) - -上图是一个锁竞争 Demo 进程的线程 profiling 运行结果。从图中可以看到,该进程在这段时间内主要执行了 lock、net、oncpu 三类事件,该进程包括 3 个运行的业务线程。在11:05:45 - 11:06:45 这段时间内,我们发现这 3 个业务线程的事件执行时间都变得很长了,这里面可能存在性能问题。同样,我们进入线程视图,查看每个线程的线程 profiling 结果,同时我们将时间范围缩小到可能有异常的时间点附近。 - -![image-20230628112709827](./figures/lockcompete2.png) - -通过查看每个线程的事件执行序列,可以大致了解每个线程这段时间在执行什么功能。 - -- 线程 CompeteThd1:每隔一段时间触发短时的 oncpu 事件,执行一次计算任务;但是在 11:05:45 时间点附近开始触发长时的 oncpu 事件,说明正在执行耗时的计算任务。 - - ![image-20230628113336435](./figures/lockcompete3.png) - -- 线程 CompeteThd2:每隔一段时间触发短时的 net 类事件,点击事件内容可以看到,该线程正在通过 write 系统调用发送网络消息,且可以看到对应的 tcp 连接信息;同样在 11:05:45 时间点附近开始执行长时的 futex 事件并进入阻塞状态,此时 write 网络事件的执行间隔变长了。 - - ![image-20230628113759887](./figures/lockcompete4.png) - - 
![image-20230628114340386](./figures/lockcompete5.png) - -- 线程 tcp-server:tcp 服务器,不断通过 read 系统调用读取客户端发送的请求;同样在 11:05:45 时间点附近开始,read 事件执行时间变长,说明此时正在等待接收网络请求。 - - ![image-20230628114659071](./figures/lockcompete6.png) - -基于上述分析,我们可以发现,每当线程 CompeteThd1 在执行耗时较长的 oncpu 操作时,线程 CompeteThd2 都会调用 futex 系统调用进入阻塞状态,一旦线程 CompeteThd1 完成 oncpu 操作时,线程 CompeteThd2 将获取 cpu 并执行网络 write 操作。因此,大概率是因为线程 CompeteThd1 和线程 CompeteThd2 之间存在锁竞争的问题。而线程 tcp-server 与线程 CompeteThd2 之间存在 tcp 网络通信,由于线程 CompeteThd2 等待锁资源无法发送网络请求,从而导致线程 tcp-server 大部分时间都在等待 read 网络请求。 - -### topics - -#### 支持的系统调用事件 - -选择需要加入观测的系统调用事件的基本原则为: - -1. 选择可能会比较耗时、阻塞的事件(如文件操作、网络操作、锁操作等),这类事件通常涉及对系统资源的访问。 -2. 选择影响线程运行状态的事件。 - -| 事件名/系统调用名 | 描述 | 默认的事件类型 | 扩展的事件内容 | -| ----------------- | ----------------------------------------------------- | -------------- | -------------------------------- | -| read | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| write | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| readv | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| writev | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| preadv | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| pwritev | 读写磁盘文件或网络,线程可能会耗时、阻塞 | file | file.path, sock.conn, func.stack | -| sync | 对文件进行同步刷盘操作,完成前线程会阻塞 | file | func.stack | -| fsync | 对文件进行同步刷盘操作,完成前线程会阻塞 | file | file.path, sock.conn, func.stack | -| fdatasync | 对文件进行同步刷盘操作,完成前线程会阻塞 | file | file.path, sock.conn, func.stack | -| sched_yield | 线程主动让出 CPU 重新进行调度 | sched | func.stack | -| nanosleep | 线程进入睡眠状态 | sched | func.stack | -| clock_nanosleep | 线程进入睡眠状态 | sched | func.stack | -| wait4 | 线程阻塞 | sched | func.stack | -| waitpid | 线程阻塞 | sched | func.stack | -| select | 无事件到达时线程会阻塞等待 | sched | func.stack | -| pselect6 | 无事件到达时线程会阻塞等待 | sched | func.stack | -| poll | 无事件到达时线程会阻塞等待 | sched | func.stack | -| ppoll | 无事件到达时线程会阻塞等待 | sched | func.stack | -| epoll_wait | 无事件到达时线程会阻塞等待 | sched | func.stack 
| -| sendto | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| recvfrom | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| sendmsg | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| recvmsg | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| sendmmsg | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| recvmmsg | 读写网络时,线程可能会耗时、阻塞 | net | sock.conn, func.stack | -| futex | 触发 futex 往往意味着出现锁等待,线程可能进入阻塞状态 | lock | futex.op, func.stack | - -#### 聚合事件 - -tprofiling 当前支持的系统性能事件包括两大类:系统调用事件和 oncpu 事件。其中,oncpu 事件以及部分系统调用事件(比如read/write)在特定的应用场景下可能会频繁触发,从而产生大量的系统事件,这会对观测的应用程序性能以及 tprofiling 探针本身的性能造成较大的影响。 - -为了优化性能,tprofiling 将一段时间内(1s)属于同一个线程的具有相同事件名的多个系统事件聚合为一个事件进行上报。因此,一个 tprofiling 事件实际上指的是一个聚合事件,它包含一个或多个相同的系统事件。相比于一个真实的系统事件,一个聚合事件的部分属性的含义有如下变化, - -- `start_time`:事件开始时间,在聚合事件中是指第一个系统事件的开始时间。 -- `end_time`:事件结束时间,在聚合事件中是指(`start_time + duration`)。 -- `duration`:事件执行时间,在聚合事件中是指所有系统事件实际执行时间的累加值。 -- `count`:聚合事件中系统事件的数量,当值为 1 时,聚合事件就等价于一个系统事件。 -- 扩展的事件属性:在聚合事件中是指第一个系统事件的扩展属性。 - -## L7Probe 介绍 - -定位:L7流量观测,覆盖常见的HTTP1.X、PG、MySQL、Redis、Kafka、HTTP2.0、MongoDB、RocketMQ协议,支持加密流观测。 - -场景:覆盖Node、Container、Pod(K8S)三类场景。 - -### 代码框架设计 - -```shell -L7Probe - | --- included // 公共头文件 - -​ | --- connect.h // L7 connect对象定义 - -​ | --- pod.h // pod/container对象定义 - -​ | --- conn_tracker.h // L7协议跟踪对象定义 - - | --- protocol // L7协议解析 - -​ | --- http // HTTP1.X L7 message结构定义及解析 - -​ | --- mysql // mysql L7 message结构定义及解析 - -​ | --- pgsql // pgsql L7 message结构定义及解析 - - | --- bpf // 内核bpf代码 - -​ | --- L7.h // BPF程序解析L7层协议类型 - -​ | --- kern_sock.bpf.c // 内核socket层观测 - -​ | --- libssl.bpf.c // openSSL层观测 - -​ | --- gossl.bpf.c // GO SSL层观测 - -​ | --- cgroup.bpf.c // pod 生命周期观测 - - | --- pod_mng.c // pod/container实例管理(感知pod/container生命周期) - - | --- conn_mng.c // L7 Connect实例管理(处理BPF观测事件,比如Open/Close事件、Stats统计) - - | --- conn_tracker.c // L7 流量跟踪(跟踪BPF观测数据,比如send/write、read/recv等系统事件产生的数据) - - | --- bpf_mng.c // BPF程序生命周期管理(按需、实时open、load、attach、unload BPF程序,包括uprobe BPF程序) - - | --- 
session_conn.c // 管理jsse Session(记录jsse Session和sock连接的对应关系,上报jsse连接信息) - - | --- L7Probe.c // 探针主程序 -``` - -### 探针输出 - -| metrics_name | table_name | metrics_type | unit | metrics description | -| --------------- | ---------- | ------------ | ---- | ------------------------------------------------------------ | -| tgid | NA | key | NA | Process ID of l7 session. | -| client_ip | NA | key | NA | Client IP address of l7 session. | -| server_ip | NA | key | NA | Server IP address of l7 session.
备注:K8S场景支持Cluster IP转换成Backend IP | -| server_port | NA | key | NA | Server Port of l7 session.
备注:K8S场景支持Cluster Port转换成Backend Port | -| l4_role | NA | key | NA | Role of l4 protocol(TCP Client/Server or UDP) | -| l7_role | NA | key | NA | Role of l7 protocol(Client or Server) | -| protocol | NA | key | NA | Name of l7 protocol(http/http2/mysql...) | -| ssl | NA | label | NA | Indicates whether an SSL-encrypted l7 session is used. | -| bytes_sent | l7_link | gauge | NA | Number of bytes sent by a l7 session. | -| bytes_recv | l7_link | gauge | NA | Number of bytes recv by a l7 session. | -| segs_sent | l7_link | gauge | NA | Number of segs sent by a l7 session. | -| segs_recv | l7_link | gauge | NA | Number of segs recv by a l7 session. | -| throughput_req | l7_rpc | gauge | qps | Request throughput of l7 session. | -| throughput_resp | l7_rpc | gauge | qps | Response throughput of l7 session. | -| req_count | l7_rpc | gauge | NA | Request num of l7 session. | -| resp_count | l7_rpc | gauge | NA | Response num of l7 session. | -| latency_avg | l7_rpc | gauge | ns | L7 session averaged latency. | -| latency | l7_rpc | histogram | ns | L7 session histogram latency. | -| latency_sum | l7_rpc | gauge | ns | L7 session sum latency. | -| err_ratio | l7_rpc | gauge | % | L7 session error rate. | -| err_count | l7_rpc | gauge | NA | L7 session error count. | - -### 动态控制 - -#### 控制观测Pod范围 - -1. REST->gala-gopher。 -2. gala-gopher->L7Probe。 -3. L7Probe 基于Pod获取相关Container。 -4. L7Probe 基于Container获取其 CGroup id(cpuacct_cgrp_id),并写入object模块(API: cgrp_add)。 -5. Socket系统事件上下文中,获取进程所属CGroup(cpuacct_cgrp_id),参考Linux代码(task_cgroup)。 -6. 观测过程中,通过object模块过滤(API: is_cgrp_exist)。 - -#### 控制观测能力 - -1. REST->gala-gopher。 -2. gala-gopher->L7Probe。 -3. 
L7Probe根据输入参数动态的开启、关闭BPF观测能力(包括吞吐量、时延、Trace、协议类型)。 - -### 观测点 - -#### 内核Socket系统调用 - -TCP相关系统调用 - -// int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen); - -// int accept(int sockfd, struct sockaddr \*addr, socklen_t \*addrlen); - -// int accept4(int sockfd, struct sockaddr \*addr, socklen_t \*addrlen, int flags); - -// ssize_t write(int fd, const void *buf, size_t count); - -// ssize_t send(int sockfd, const void *buf, size_t len, int flags); - -// ssize_t read(int fd, void *buf, size_t count); - -// ssize_t recv(int sockfd, void *buf, size_t len, int flags); - -// ssize_t writev(int fd, const struct iovec *iov, int iovcnt); - -// ssize_t readv(int fd, const struct iovec *iov, int iovcnt); - -TCP&UDP相关系统调用 - -// ssize_t sendto(int sockfd, const void \*buf, size_t len, int flags, const struct sockaddr \*dest_addr, socklen_t addrlen); - -// ssize_t recvfrom(int sockfd, void \*buf, size_t len, int flags, struct sockaddr \*src_addr, socklen_t \*addrlen); - -// ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags); - -// ssize_t recvmsg(int sockfd, struct msghdr *msg, int flags); - -// int close(int fd); - -注意点: - -1. read/write、readv/writev 与普通的文件I/O操作会混淆,通过观测内核security_socket_sendmsg函数区分FD是否属于socket操作。 -2. sendto/recvfrom、sendmsg/recvmsg TCP/UDP均会使用,参考下面手册的介绍。 -3. sendmmsg/recvmmsg、sendfile 暂不支持。 - -[sendto manual](https://man7.org/linux/man-pages/man2/send.2.html) :If sendto() is used on a connection-mode (SOCK_STREAM, SOCK_SEQPACKET) socket, the arguments dest_addr and addrlen are ignored (and the error EISCONN may be returned when they are not NULL and 0), and the error ENOTCONN is returned when the socket was not actually connected. otherwise, the address of the target is given by dest_addr with addrlen specifying its size. 
- -sendto 判断dest_addr参数为NULL则为TCP,否则为UDP。 - -[recvfrom manual](https://linux.die.net/man/2/recvfrom):The recvfrom() and recvmsg() calls are used to receive messages from a socket, and may be used to receive data on a socket whether or not it is connection-oriented. - -recvfrom判断src_addr参数为NULL则为TCP,否则为UDP。 - -[sendmsg manual](https://man7.org/linux/man-pages/man3/sendmsg.3p.html):The sendmsg() function shall send a message through a connection-mode or connectionless-mode socket. If the socket is a connectionless-mode socket, the message shall be sent to the address specified by msghdr if no pre-specified peer address has been set. If a peer address has been pre-specified, either themessage shall be sent to the address specified in msghdr (overriding the pre-specified peer address), or the function shall return -1 and set errno to [EISCONN]. If the socket is connection-mode, the destination address in msghdr shall be ignored. - -sendmsg判断msghdr->msg_name参数为NULL则为TCP,否则为UDP。 - -[recvmsg manual](https://man7.org/linux/man-pages/man3/recvmsg.3p.html): The recvmsg() function shall receive a message from a connection-mode or connectionless-mode socket. It is normally used with connectionless-mode sockets because it permits the application to retrieve the source address of received data. - -recvmsg判断msghdr->msg_name参数为NULL则为TCP,否则为UDP。 - -#### libSSL API - -SSL_write - -SSL_read - -#### Go SSL API - -#### JSSE API - -sun/security/ssl/SSLSocketImpl$AppInputStream - -sun/security/ssl/SSLSocketImpl$AppOutputStream - -### JSSE观测方案 - -#### 加载JSSEProbe探针 - -main函数中通过l7_load_jsse_agent加载JSSEProbe探针。 - -轮询观测白名单(g_proc_obj_map_fd)中的进程,若为java进程,则通过jvm_attach将JSSEProbeAgent.jar加载到此观测进程上。加载成功后,该java进程会在指定观测点(参见[JSSE API](#jsse-api))将观测信息输出到jsse-metrics输出文件(/tmp/java-data-\/jsse-metrics.txt)中。 - -#### 处理JSSEProbe消息 - -l7_jsse_msg_handler线程中处理JSSEProbe消息。 - -轮询观测白名单(g_proc_obj_map_fd)中的进程,若该进程有对应的jsse-metrics输出文件,则按行读取此文件并解析、转换、上报jsse读写信息。 - -##### 1. 
解析jsse读写信息 - -jsse-metrics.txt的输出格式如下,从中解析出一次jsse请求的pid, sessionId, time, read/write操作, IP, port, payload信息: -`|jsse_msg|662220|Session(1688648699909|TLS_AES_256_GCM_SHA384)|1688648699989|Write|127.0.0.1|58302|This is test message|` - -解析出的原始信息存储于session_data_args_s中。 - -##### 2. 转换jsse读写信息 - -将session_data_args_s中的信息转换为sock_conn和conn_data。 - -转化时需要查询如下两个hash map: - -session_head:记录jsse连接的session Id和sock connection Id的对应关系。若进程id和四元组信息一致,则认为session和sock connection对应。 - -file_conn_head:记录java进程的最后一个sessionId,以备L7probe读jsseProbe输出时,没有从请求开头开始读取,找不到sessionId信息。 - -##### 3. 上报jsse读写信息 - -将sock_conn和conn_data上报到map中。 - -## sliprobe 介绍 - -基于 ebpf 采集并周期性上报容器粒度的 SLI 指标。 - -### 特性 - -- 按照容器粒度采集周期内CPU调度事件的时延总计和统计直方图,关注的事件包括:调度等待,主动睡眠,锁/IO引起的阻塞,调度延迟,长系统调用等 -- 按照容器粒度采集周期内Memory分配事件的时延总计和统计直方图,关注的事件包括:内存回收,换页,内存规整等 -- 按照容器粒度采集周期内BIO层IO操作的时延总计和统计直方图 - -### 使用说明 - -启动命令示例:指定上报周期为15秒,观测容器id为abcd12345678和abcd87654321的两个容器的SLI指标。 - -`curl -X PUT http://localhost:9999/sli -d json='{"params":{"report_period":15}, "snoopers":{"container_id":[{"container_id": "abcd12345678","abcd87654321"}]}, "state":"running"}'` - -### 代码逻辑 - -#### 总体思路 - -1. 用户态接收待观测的容器列表,将容器的cpuacct子系统目录inode记录在ebpf map中,共享给内核态。 - -2. 通过ebpf kprobe/tracepoint跟踪相关内核事件,判断当前进程是否属于待观测范围,记录事件类型,时间戳等信息。每隔一定周期将同一cgroup内进程的SLI指标进行聚合上报。 -3. 用户态接收并打印内核态上报的SLI指标信息。 - -#### SLI指标计算方式 - -##### CPU SLI - -1. **cpu_wait** - - 在sched_stat_wait观测点,获取第二个参数delay的值 - -2. **cpu_sleep** - - 在sched_stat_sleep观测点,获取第二个参数delay的值 - -3. **cpu_iowait** - - 在sched_stat_blocked观测点,判断当前进程in_iowait,则获取第二个参数delay的值 - -4. **cpu_block** - - 在sched_stat_blocked观测点,判断当前进程非in_iowait,则获取第二个参数delay的值 - -5. **cpu_rundelay** - - 在sched_switch观测点,通过第三个参数next获取将被调度进程的run_delay值:next->sched_info.run_delay,记录在task_sched_map中。计算同一进程两次被调度时run_delay的差值 - -6. **cpu_longsys** - - 在sched_switch观测点,通过第三个参数next获取将被调度进程的task结构体,从task结构体中获取上下文切换次数(nvcsw+nivcsw)和用户态执行时间utime。如果同一进程两次被调度时的上下文切换次数和用户态执行时间都不变,则说明在该进程在执行一个较长的系统调用,累积该进程处在内核态的时间 - -##### MEM SLI - -1. 
**mem_reclaim** - - 计算mem_cgroup_handle_over_high函数返回时间戳和进入时间戳的差值 - - 计算mm_vmscan_memcg_reclaim_end观测点和mm_vmscan_memcg_reclaim_begin观测点时间戳的差值 - -2. **mem_swapin** - - 计算do_swap_page函数返回时间戳和进入时间戳的差值 - -3. **mem_compact** - - 计算try_to_compact_pages函数返回时间戳和进入时间戳的差值 - -##### IO SLI - -1. **bio_latency** - - 计算进入bio_endio函数和触发block_bio_queue观测点的时间戳差值 - - 计算进入bio_endio函数和退出generic_make_request_checks函数的时间戳差值 - -## 使用方法 - -### 外部依赖软件部署 - -![gopher软件架构图](./figures/gopher软件架构图.png) - -如上图所示,绿色部分为gala-gopher的外部依赖组件。gala-gopher会将指标数据metrics输出到prometheus,将元数据metadata、异常事件event输出到kafka,灰色部分的gala-anteater和gala-spider会从prometheus和kafka获取数据。 - -> 说明:安装kafka、prometheus软件包时,需要从官网获取安装包进行部署。 - -### 输出数据 - -- **指标数据metrics** - - Prometheus Server内置了Express Browser UI,用户可以通过PromQL查询语句查询指标数据内容。详细教程参见官方文档:[Using the expression browser](https://prometheus.io/docs/prometheus/latest/getting_started/#using-the-expression-browser)。示例如下: - - 指定指标名称为`gala_gopher_tcp_link_rcv_rtt`,UI显示的指标数据为: - - `gala_gopher_tcp_link_rcv_rtt{client_ip="x.x.x.165",client_port="1234",hostname="openEuler",instance="x.x.x.172:8888",job="prometheus",machine_id="1fd3774xx",protocol="2",role="0",server_ip="x.x.x.172",server_port="3742",tgid="1516"} 1` - -- **元数据metadata** - - 可以直接从kafka消费topic为`gala_gopher_metadata`的数据来看。示例如下: - - ```bash - # 输入请求 - ./bin/kafka-console-consumer.sh --bootstrap-server x.x.x.165:9092 --topic gala_gopher_metadata - # 输出数据 - {"timestamp": 1655888408000, "meta_name": "thread", "entity_name": "thread", "version": "1.0.0", "keys": ["machine_id", "pid"], "labels": ["hostname", "tgid", "comm", "major", "minor"], "metrics": ["fork_count", "task_io_wait_time_us", "task_io_count", "task_io_time_us", "task_hang_count"]} - ``` - -- **异常事件event** - - 可以直接从kafka消费topic为`gala_gopher_event`的数据来看。示例如下: - - ```bash - # 输入请求 - ./bin/kafka-console-consumer.sh --bootstrap-server x.x.x.165:9092 --topic gala_gopher_event - # 输出数据 - {"timestamp": 1655888408000, "meta_name": "thread", "entity_name": "thread", 
"version": "1.0.0", "keys": ["machine_id", "pid"], "labels": ["hostname", "tgid", "comm", "major", "minor"], "metrics": ["fork_count", "task_io_wait_time_us", "task_io_count", "task_io_time_us", "task_hang_count"]} - ``` diff --git a/docs/zh/server/maintenance/gala/using_gala_spider.md b/docs/zh/server/maintenance/gala/using_gala_spider.md deleted file mode 100644 index b09525db81e895595f030255d76e507274bb2b83..0000000000000000000000000000000000000000 --- a/docs/zh/server/maintenance/gala/using_gala_spider.md +++ /dev/null @@ -1,527 +0,0 @@ -# gala-spider使用手册 - -本文档主要介绍如何部署和使用gala-spider和gala-inference。 - -## gala-spider - -gala-spider 提供 OS 级别的拓扑图绘制功能,它将定期获取 gala-gopher (一个 OS 层面的数据采集软件)在某个时间点采集的所有观测对象的数据,并计算它们之间的拓扑关系,最终将生成的拓扑图保存到图数据库 arangodb 中。 - -### 安装 - -挂载 yum 源: - -```basic -[oe-2309] # openEuler 2309 官方发布源 -name=oe2309 -baseurl=http://119.3.219.20:82/openEuler:/23.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2309:Epol] # openEuler 2309:Epol 官方发布源 -name=oe2309_epol -baseurl=http://119.3.219.20:82/openEuler:/23.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -安装 gala-spider: - -```sh -# yum install gala-spider -``` - -### 配置 - -#### 配置文件说明 - -gala-spider 配置文件为 `/etc/gala-spider/gala-spider.yaml` ,该文件配置项说明如下。 - -- global:全局配置信息。 - - data_source:指定观测指标采集的数据库,当前只支持 prometheus。 - - data_agent:指定观测指标采集代理,当前只支持 gala_gopher。 -- spider:spider配置信息。 - - log_conf:日志配置信息。 - - log_path:日志文件路径。 - - log_level:日志打印级别,值包括 DEBUG/INFO/WARNING/ERROR/CRITICAL 。 - - max_size:日志文件大小,单位为兆字节(MB)。 - - backup_count:日志备份文件数量。 -- storage:拓扑图存储服务的配置信息。 - - period:存储周期,单位为秒,表示每隔多少秒存储一次拓扑图。 - - database:存储的图数据库,当前只支持 arangodb。 - - db_conf:图数据库的配置信息。 - - url:图数据库的服务器地址。 - - db_name:拓扑图存储的数据库名称。 -- kafka:kafka配置信息。 - - server:kafka服务器地址。 - - metadata_topic:观测对象元数据消息的topic名称。 - - metadata_group_id:观测对象元数据消息的消费者组ID。 -- prometheus:prometheus数据库配置信息。 - - base_url:prometheus服务器地址。 - - instant_api:单个时间点采集API。 - - range_api:区间采集API。 - - step:采集时间步长,用于区间采集API。 - 
-#### 配置文件示例 - -```yaml -global: - data_source: "prometheus" - data_agent: "gala_gopher" - -prometheus: - base_url: "http://localhost:9090/" - instant_api: "/api/v1/query" - range_api: "/api/v1/query_range" - step: 1 - -spider: - log_conf: - log_path: "/var/log/gala-spider/spider.log" - # log level: DEBUG/INFO/WARNING/ERROR/CRITICAL - log_level: INFO - # unit: MB - max_size: 10 - backup_count: 10 - -storage: - # unit: second - period: 60 - database: arangodb - db_conf: - url: "http://localhost:8529" - db_name: "spider" - -kafka: - server: "localhost:9092" - metadata_topic: "gala_gopher_metadata" - metadata_group_id: "metadata-spider" -``` - -### 启动 - -1. 通过命令启动。 - - ```sh - # spider-storage - ``` - -2. 通过 systemd 服务启动。 - - ```sh - # systemctl start gala-spider - ``` - -### 使用方法 - -#### 外部依赖软件部署 - -gala-spider 运行时需要依赖多个外部软件进行交互。因此,在启动 gala-spider 之前,用户需要将gala-spider依赖的软件部署完成。下图为 gala-spider 项目的软件依赖图。 - -![gala-spider软件架构图](./figures/gala-spider软件架构图.png) - -其中,右侧虚线框内为 gala-spider 项目的 2 个功能组件,绿色部分为 gala-spider 项目直接依赖的外部组件,灰色部分为 gala-spider 项目间接依赖的外部组件。 - -- **spider-storage**:gala-spider 核心组件,提供拓扑图存储功能。 - 1. 从 kafka 中获取观测对象的元数据信息。 - 2. 从 Prometheus 中获取所有的观测实例信息。 - 3. 
将生成的拓扑图存储到图数据库 arangodb 中。 -- **gala-inference**:gala-spider 核心组件,提供根因定位功能。它通过订阅 kafka 的异常 KPI 事件触发异常 KPI 的根因定位流程,并基于 arangodb 获取的拓扑图来构建故障传播图,最终将根因定位的结果输出到 kafka 中。 -- **prometheus**:时序数据库,gala-gopher 组件采集的观测指标数据会上报到 prometheus,再由 gala-spider 做进一步处理。 -- **kafka**:消息中间件,用于存储 gala-gopher 上报的观测对象元数据信息,异常检测组件上报的异常事件,以及 cause-inference 组件上报的根因定位结果。 -- **arangodb**:图数据库,用于存储 spider-storage 生成的拓扑图。 -- **gala-gopher**:数据采集组件,请提前部署gala-gopher。 -- **arangodb-ui**:arangodb 提供的 UI 界面,可用于查询拓扑图。 - -gala-spider 项目中的 2 个功能组件会作为独立的软件包分别发布。 - -​**spider-storage** 组件对应本节中的 gala-spider 软件包。 - -​**gala-inference** 组件对应 gala-inference 软件包。 - -gala-gopher软件的部署参见[gala-gopher使用手册](./using_gala_gopher.md),此处只介绍 arangodb 的部署。 - -当前使用的 arangodb 版本是 3.8.7 ,该版本对运行环境有如下要求: - -- 只支持 x86 系统 -- gcc10 以上 - -arangodb 官方部署文档参见:[arangodb部署](https://www.arangodb.com/docs/3.9/deployment.html) 。 - -arangodb 基于 rpm 的部署流程如下: - -1. 配置 yum 源。 - - ```basic - [oe-2309] # openEuler 2309 官方发布源 - name=oe2309 - baseurl=http://119.3.219.20:82/openEuler:/23.09/standard_x86_64 - enabled=1 - gpgcheck=0 - priority=1 - - [oe-2309:Epol] # openEuler 2309:Epol 官方发布源 - name=oe2309_epol - baseurl=http://119.3.219.20:82/openEuler:/23.09:/Epol/standard_x86_64/ - enabled=1 - gpgcheck=0 - priority=1 - ``` - -2. 安装 arangodb3。 - - ```sh - # yum install arangodb3 - ``` - -3. 配置修改。 - - arangodb3 服务器的配置文件路径为 `/etc/arangodb3/arangod.conf` ,需要修改如下的配置信息: - - - endpoint:配置 arangodb3 的服务器地址 - - authentication:访问 arangodb3 服务器是否需要进行身份认证,当前 gala-spider 还不支持身份认证,故此处将authentication设置为 false。 - - 示例配置如下: - - ```yaml - [server] - endpoint = tcp://0.0.0.0:8529 - authentication = false - ``` - -4. 
启动 arangodb3。 - - ```sh - # systemctl start arangodb3 - ``` - -#### gala-spider配置项修改 - -依赖软件启动后,需要修改 gala-spider 配置文件的部分配置项内容。示例如下: - -配置 kafka 服务器地址: - -```yaml -kafka: - server: "localhost:9092" -``` - -配置 prometheus 服务器地址: - -```yaml -prometheus: - base_url: "http://localhost:9090/" -``` - -配置 arangodb 服务器地址: - -```yaml -storage: - db_conf: - url: "http://localhost:8529" -``` - -#### 启动服务 - -运行 `systemctl start gala-spider` 。查看启动状态可执行 `systemctl status gala-spider` ,输出如下信息说明启动成功。 - -```sh -[root@openEuler ~]# systemctl status gala-spider -● gala-spider.service - a-ops gala spider service - Loaded: loaded (/usr/lib/systemd/system/gala-spider.service; enabled; vendor preset: disabled) - Active: active (running) since Tue 2022-08-30 17:28:38 CST; 1 day 22h ago - Main PID: 2263793 (spider-storage) - Tasks: 3 (limit: 98900) - Memory: 44.2M - CGroup: /system.slice/gala-spider.service - └─2263793 /usr/bin/python3 /usr/bin/spider-storage -``` - -#### 输出示例 - -用户可以通过 arangodb 提供的 UI 界面来查询 gala-spider 输出的拓扑图。使用流程如下: - -1. 在浏览器输入 arangodb 服务器地址,如: ,进入 arangodb 的 UI 界面。 - -2. 界面右上角切换至 `spider` 数据库。 - -3. 在 `Collections` 面板可以看到在不同时间段存储的观测对象实例的集合、拓扑关系的集合,如下图所示: - - ![spider拓扑关系图](./figures/spider拓扑关系图.png) - -4. 
可进一步根据 arangodb 提供的 AQL 查询语句查询存储的拓扑关系图,详细教程参见官方文档: [aql文档](https://www.arangodb.com/docs/3.8/aql/)。 - -## gala-inference - -gala-inference 提供异常 KPI 根因定位能力,它将基于异常检测的结果和拓扑图作为输入,根因定位的结果作为输出,输出到 kafka 中。gala-inference 组件在 gala-spider 项目下进行归档。 - -### 安装 - -挂载 yum 源: - -```basic -[oe-2309] # openEuler 2309 官方发布源 -name=oe2309 -baseurl=http://119.3.219.20:82/openEuler:/23.09/standard_x86_64 -enabled=1 -gpgcheck=0 -priority=1 - -[oe-2309:Epol] # openEuler 2309:Epol 官方发布源 -name=oe2309_epol -baseurl=http://119.3.219.20:82/openEuler:/23.09:/Epol/standard_x86_64/ -enabled=1 -gpgcheck=0 -priority=1 -``` - -安装 gala-inference: - -```sh -# yum install gala-inference -``` - -### 配置 - -#### 配置文件说明 - -gala-inference 配置文件 `/etc/gala-inference/gala-inference.yaml` 配置项说明如下。 - -- inference:根因定位算法的配置信息。 - - tolerated_bias:异常时间点的拓扑图查询所容忍的时间偏移,单位为秒。 - - topo_depth:拓扑图查询的最大深度。 - - root_topk:根因定位结果输出前 K 个根因指标。 - - infer_policy:根因推导策略,包括 dfs 和 rw 。 - - sample_duration:指标的历史数据的采样周期,单位为秒。 - - evt_valid_duration:根因定位时,有效的系统异常指标事件周期,单位为秒。 - - evt_aging_duration:根因定位时,系统异常指标事件的老化周期,单位为秒。 -- kafka:kafka配置信息。 - - server:kafka服务器地址。 - - metadata_topic:观测对象元数据消息的配置信息。 - - topic_id:观测对象元数据消息的topic名称。 - - group_id:观测对象元数据消息的消费者组ID。 - - abnormal_kpi_topic:异常 KPI 事件消息的配置信息。 - - topic_id:异常 KPI 事件消息的topic名称。 - - group_id:异常 KPI 事件消息的消费者组ID。 - - abnormal_metric_topic:系统异常指标事件消息的配置信息。 - - topic_id:系统异常指标事件消息的topic名称。 - - group_id:系统异常指标事件消息的消费者组ID。 - - consumer_to:消费系统异常指标事件消息的超时时间,单位为秒。 - - inference_topic:根因定位结果输出事件消息的配置信息。 - - topic_id:根因定位结果输出事件消息的topic名称。 -- arangodb:arangodb图数据库的配置信息,用于查询根因定位所需要的拓扑子图。 - - url:图数据库的服务器地址。 - - db_name:拓扑图存储的数据库名称。 -- log_conf:日志配置信息。 - - log_path:日志文件路径。 - - log_level:日志打印级别,值包括 DEBUG/INFO/WARNING/ERROR/CRITICAL。 - - max_size:日志文件大小,单位为兆字节(MB)。 - - backup_count:日志备份文件数量。 -- prometheus:prometheus数据库配置信息,用于获取指标的历史时序数据。 - - base_url:prometheus服务器地址。 - - range_api:区间采集API。 - - step:采集时间步长,用于区间采集API。 - -#### 配置文件示例 - -```yaml -inference: - # 异常时间点的拓扑图查询所容忍的时间偏移,单位:秒 - 
tolerated_bias: 120 - topo_depth: 10 - root_topk: 3 - infer_policy: "dfs" - # 单位: 秒 - sample_duration: 600 - # 根因定位时,有效的异常指标事件周期,单位:秒 - evt_valid_duration: 120 - # 异常指标事件的老化周期,单位:秒 - evt_aging_duration: 600 - -kafka: - server: "localhost:9092" - metadata_topic: - topic_id: "gala_gopher_metadata" - group_id: "metadata-inference" - abnormal_kpi_topic: - topic_id: "gala_anteater_hybrid_model" - group_id: "abn-kpi-inference" - abnormal_metric_topic: - topic_id: "gala_anteater_metric" - group_id: "abn-metric-inference" - consumer_to: 1 - inference_topic: - topic_id: "gala_cause_inference" - -arangodb: - url: "http://localhost:8529" - db_name: "spider" - -log: - log_path: "/var/log/gala-inference/inference.log" - # log level: DEBUG/INFO/WARNING/ERROR/CRITICAL - log_level: INFO - # unit: MB - max_size: 10 - backup_count: 10 - -prometheus: - base_url: "http://localhost:9090/" - range_api: "/api/v1/query_range" - step: 5 -``` - -### 启动 - -1. 通过命令启动。 - - ```sh - # gala-inference - ``` - -2. 通过 systemd 服务启动。 - - ```sh - # systemctl start gala-inference - ``` - -### 使用方法 - -#### 依赖软件部署 - -gala-inference 的运行依赖和 gala-spider一样,请参见[外部依赖软件部署](#外部依赖软件部署)。此外,gala-inference 还间接依赖 [gala-spider](#gala-spider) 和 [gala-anteater](./using_gala_anteater.md) 软件的运行,请提前部署gala-spider和gala-anteater软件。 - -#### 配置项修改 - -修改 gala-inference 的配置文件中部分配置项。示例如下: - -配置 kafka 服务器地址: - -```yaml -kafka: - server: "localhost:9092" -``` - -配置 prometheus 服务器地址: - -```yaml -prometheus: - base_url: "http://localhost:9090/" -``` - -配置 arangodb 服务器地址: - -```yaml -arangodb: - url: "http://localhost:8529" -``` - -#### 启动服务 - -直接运行 `systemctl start gala-inference` 即可。可通过执行 `systemctl status gala-inference` 查看启动状态,如下打印表示启动成功。 - -```sh -[root@openEuler ~]# systemctl status gala-inference -● gala-inference.service - a-ops gala inference service - Loaded: loaded (/usr/lib/systemd/system/gala-inference.service; enabled; vendor preset: disabled) - Active: active (running) since Tue 2022-08-30 17:55:33 CST; 1 day 22h ago - 
Main PID: 2445875 (gala-inference) - Tasks: 10 (limit: 98900) - Memory: 48.7M - CGroup: /system.slice/gala-inference.service - └─2445875 /usr/bin/python3 /usr/bin/gala-inference -``` - -#### 输出示例 - -当异常检测模块 gala-anteater 检测到 KPI 异常后,会将对应的异常 KPI 事件输出到 kafka 中,gala-inference 会一直监测该异常 KPI 事件的消息,如果收到异常 KPI 事件的消息,就会触发根因定位。根因定位会将定位结果输出到 kafka 中,用户可以在 kafka 服务器中查看根因定位的输出结果,基本步骤如下: - -1. 若通过源码安装 kafka ,需要进入 kafka 的安装目录下。 - - ```sh - cd /root/kafka_2.13-2.8.0 - ``` - -2. 执行消费 topic 的命令获取根因定位的输出结果。 - - ```sh - ./bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic gala_cause_inference - ``` - - 输出示例如下: - - ```json - { - "Timestamp": 1661853360000, - "event_id": "1661853360000_1fd37742xxxx_sli_12154_19", - "Attributes": { - "event_id": "1661853360000_1fd37742xxxx_sli_12154_19" - }, - "Resource": { - "abnormal_kpi": { - "metric_id": "gala_gopher_sli_rtt_nsec", - "entity_id": "1fd37742xxxx_sli_12154_19", - "timestamp": 1661853360000, - "metric_labels": { - "machine_id": "1fd37742xxxx", - "tgid": "12154", - "conn_fd": "19" - } - }, - "cause_metrics": [ - { - "metric_id": "gala_gopher_proc_write_bytes", - "entity_id": "1fd37742xxxx_proc_12154", - "metric_labels": { - "__name__": "gala_gopher_proc_write_bytes", - "cmdline": "/opt/redis/redis-server x.x.x.172:3742", - "comm": "redis-server", - "container_id": "5a10635e2c43", - "hostname": "openEuler", - "instance": "x.x.x.172:8888", - "job": "prometheus", - "machine_id": "1fd37742xxxx", - "pgid": "12154", - "ppid": "12126", - "tgid": "12154" - }, - "timestamp": 1661853360000, - "path": [ - { - "metric_id": "gala_gopher_proc_write_bytes", - "entity_id": "1fd37742xxxx_proc_12154", - "metric_labels": { - "__name__": "gala_gopher_proc_write_bytes", - "cmdline": "/opt/redis/redis-server x.x.x.172:3742", - "comm": "redis-server", - "container_id": "5a10635e2c43", - "hostname": "openEuler", - "instance": "x.x.x.172:8888", - "job": "prometheus", - "machine_id": "1fd37742xxxx", - "pgid": "12154", - "ppid": "12126", - 
"tgid": "12154" - }, - "timestamp": 1661853360000 - }, - { - "metric_id": "gala_gopher_sli_rtt_nsec", - "entity_id": "1fd37742xxxx_sli_12154_19", - "metric_labels": { - "machine_id": "1fd37742xxxx", - "tgid": "12154", - "conn_fd": "19" - }, - "timestamp": 1661853360000 - } - ] - } - ] - }, - "SeverityText": "WARN", - "SeverityNumber": 13, - "Body": "A cause inferring event for an abnormal event" - } - ``` diff --git a/docs/zh/server/performance/tlbi/_toc.yaml b/docs/zh/server/performance/tlbi/_toc.yaml deleted file mode 100644 index adeef9d46790306c3d88cfb1182c5044c3a2accc..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/tlbi/_toc.yaml +++ /dev/null @@ -1,6 +0,0 @@ -label: tlbi优化特性使用指南 -isManual: true -description: 优化了TLB的广播范围,从而在众核多虚机场景下,降低TLB刷新低噪。 -sections: - - label: tlbi优化特性使用指南 - href: ./tlbi_optimization_feature_usage_guide.md diff --git a/docs/zh/server/performance/tlbi/tlbi_optimization_feature_usage_guide.md b/docs/zh/server/performance/tlbi/tlbi_optimization_feature_usage_guide.md deleted file mode 100644 index 11a02893ea66ad973c55af8ac1fd1cbb6df9eef9..0000000000000000000000000000000000000000 --- a/docs/zh/server/performance/tlbi/tlbi_optimization_feature_usage_guide.md +++ /dev/null @@ -1,69 +0,0 @@ -# TLBi优化 - -[1.1 概述](#概述) - -[1.2 约束限制](#约束限制) - -[1.3 使用方法](#使用方法) - -## 概述 - -本特性通过在HostOS中关闭TLBi指令的全域广播,以及在GuestOS中将TLB的tlb刷新范围限定在该虚机进程执行过的CPU上,优化了TLB的广播范围,从而在众核多虚机场景下,降低TLB刷新低噪,有效提升unixbench的跑分线性度。 - -## 约束限制 - -1. 本特性目前仅适用于ARM64。 -2. 使能本特性,需要增加内核编译参数:`CONFIG_ARM64_TLBI_IPI`和`CONFIG_ARM64_KVM_HCR_NOFB`。 -3. 使能本特性,需要依赖启动参数`kvm-arm.hcr_nofb=1`。 - -## 使用方法 - -### 编译并更换内核 - -1. 在内核根目录下执行`make openeuler_defconfig`生成.config文件。 - -2. 通过`make menuconfig`指定内核编译参数,打开`CONFIG_ARM64_TLBI_IPI`和`CONFIG_ARM64_KVM_HCR_NOFB`参数,打开后通过查看.config文件中对应的配置项,确保设置生效。 - -3. 编译rpm包:`make binrpm-pkg -j`,生成的rpm包位于rpmbuild/RPMS/aarch64目录下。 - -4. 在rpm包所在目录执行`rpm -ivh --force kernel*.rpm`安装编译的新内核。 - -5. 
通过命令`grub2-set-default`设置默认启动内核,以24.03 LTS-SP1版本为例: - - ``` shell - # 查询当前设置的默认启动内核 - [root@localhost ~]# grub2-editenv list - saved_entry=openEuler (6.12.0) 24.03 (LTS-SP1) - - # 查询当前安装的所有内核版本 - [root@localhost ~]# grep "^menuentry" /etc/grub2-efi.cfg - menuentry 'openEuler (6.12.0) 24.03 (LTS-SP1)' --class openeuler --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-6.12.0-advanced-887bd997-932e-4cd0-84e3-ec71abbe368b' { - menuentry 'openEuler (6.6.0-77.0.0.81.oe2403sp1.aarch64) 24.03 (LTS-SP1)' --class openeuler --class gnu-linux --class gnu --class os --unrestricted $menuentry_id_option 'gnulinux-6.6.0-77.0.0.81.oe2403sp1.aarch64-advanced-887bd997-932e-4cd0-84e3-ec71abbe368b' { - - # 设置默认启动内核为'openEuler (6.6.0-77.0.0.81.oe2403sp1.aarch64) 24.03 (LTS-SP1)' - [root@localhost ~]# grub2-set-default 'openEuler (6.6.0-77.0.0.81.oe2403sp1.aarch64) 24.03 (LTS-SP1)' - - # 再次查询当前设置的默认启动内核 - [root@localhost ~]# grub2-editenv list - saved_entry=openEuler (6.6.0-77.0.0.81.oe2403sp1.aarch64) 24.03 (LTS-SP1) - ``` - -### 设置内核启动参数 - -1. 编辑/etc/default/grub文件,在`GRUB_CMDLINE_LINUX`中添加`kvm-arm.hcr_nofb=1`。 - -2. 查看修改结果,示例结果如下: - - ``` shell - [root@localhost ~]# cat /etc/default/grub | grep GRUB_CMDLINE_LINUX - GRUB_CMDLINE_LINUX="rd.lvm.lv=openeuler/root rd.lvm.lv=openeuler/swap video=VGA-1:640x480-32@60me cgroup_disable=files apparmor=0 crashkernel=1024M,high smmu.bypassdev=0x1000:0x17 smmu.bypassdev=0x1000:0x15 arm64.nopauth console=tty0 kvm-arm.hcr_nofb=1" - ``` - -3. 使用命令`grub2-mkconfig -o /boot/efi/EFI/openEuler/grub.cfg`更新grub配置文件。 - -### 重启主机,检查参数设置 - -1. 重启主机。 -2. 执行命令`zcat /proc/config.gz | grep CONFIG_ARM64_TLBI_IPI`,确认回显为`CONFIG_ARM64_TLBI_IPI=y`。 -3. 执行命令`zcat /proc/config.gz | grep CONFIG_ARM64_KVM_HCR_NOFB`,确认回显为`CONFIG_ARM64_KVM_HCR_NOFB=y`。 -4. 
执行命令`cat /proc/cmdline`,确保回显包含`kvm-arm.hcr_nofb=1`。 diff --git a/docs/zh/tools/application/_toc.yaml b/docs/zh/tools/application/_toc.yaml index 8a828e3ef4a56a30399f4e2a3aa2c3eefd751e4c..6ff3556bc1ba59e7de2c169c061f45392b67ca46 100644 --- a/docs/zh/tools/application/_toc.yaml +++ b/docs/zh/tools/application/_toc.yaml @@ -2,3 +2,4 @@ label: 拓展应用 sections: - href: upstream: https://gitee.com/openeuler/ros/blob/master/docs/zh/_toc.yaml + path: ./ros diff --git a/docs/zh/tools/cloud/_toc.yaml b/docs/zh/tools/cloud/_toc.yaml index ecc8238a6500ea33637ce4b68378bf0c516c7e52..83bfe67c82844ca6f0e074561c3432c390662ce0 100644 --- a/docs/zh/tools/cloud/_toc.yaml +++ b/docs/zh/tools/cloud/_toc.yaml @@ -1,6 +1,8 @@ label: 云原生工具 sections: - - href: ./ctinspector/_toc.yaml + - href: + upstream: https://gitee.com/openeuler/CTinspector/blob/master/docs/zh/_toc.yaml + path: ./ctinspector - href: upstream: https://gitee.com/openeuler/Cpds/blob/master/docs/zh/_toc.yaml path: ./cpds diff --git a/docs/zh/tools/community_tools/_toc.yaml b/docs/zh/tools/community_tools/_toc.yaml index f80914dd5cec3cb4bc1234e57dbf7b5741ef5af3..f2e1a101a9c7b5f8574a3744318893beeca40252 100644 --- a/docs/zh/tools/community_tools/_toc.yaml +++ b/docs/zh/tools/community_tools/_toc.yaml @@ -12,7 +12,7 @@ sections: sections: - href: upstream: https://gitee.com/openeuler/compiler-docs/blob/master/docs/zh/gcc/_toc.yaml - path: ./development/gcc + path: ./gcc - label: 性能优化 sections: - href: @@ -20,7 +20,7 @@ sections: path: ./atune - href: upstream: https://gitee.com/openeuler/oeAware-manager/blob/master/docs/zh/master/_toc.yaml - path: ./performance/oeaware + path: ./oeaware - label: 迁移 sections: - href: diff --git a/docs/zh/tools/maintenance/_toc.yaml b/docs/zh/tools/maintenance/_toc.yaml index e7b1afb2f7ec29df0f65fe33c94ed7b1e6d34f79..60240b0cd6195e05614f59aea5b91a4344092cbd 100644 --- a/docs/zh/tools/maintenance/_toc.yaml +++ b/docs/zh/tools/maintenance/_toc.yaml @@ -4,6 +4,7 @@ sections: sections: - href: 
upstream: https://gitee.com/openeuler/syscare/blob/master/docs/zh/_toc.yaml + path: ./syscare - label: 系统监控 sections: - href: diff --git a/docs/zh/tools/security/_toc.yaml b/docs/zh/tools/security/_toc.yaml index b7135cf52f3b397aad3c1abcd80bcbc922d4eda8..09c891797d07907a0407403a9c3f74ec969431f9 100644 --- a/docs/zh/tools/security/_toc.yaml +++ b/docs/zh/tools/security/_toc.yaml @@ -2,4 +2,4 @@ label: 安全 sections: - href: upstream: https://gitee.com/openeuler/secGear/blob/master/docs/zh/master/_toc.yaml - path: ./security/secgear + path: ./secgear