diff --git a/evaluation/README.md b/evaluation/README.md index d461eeb02d148fb57489e2cfcce09bf0a8ebf721..61148781a653bb032083517c22bca83970dc4c32 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -17,7 +17,7 @@ evaluation主要由evaluation进程和runner进程构成。evaluation进程主 - 下载[eval-controller.yaml](./eval-charts/evaluation-controller/eval-controller.yaml)文件 - 执行helm命令 ``` - helm repo add evaluation https://df-evaluation.oss-cn-beijing.aliyuncs.com/chart + helm repo add evaluation https://df-evaluation.oss-cn-beijing.aliyuncs.com/chart/stable helm update helm install evaluation evaluation/evaluation -n evaluation -f eval-controller.yaml ``` diff --git a/evaluation/eval-bench/cmd/eb/main.go b/evaluation/eval-bench/cmd/eb/main.go index 75ff2a45a6cbea559b9bc71fef7c1aa8b42c07bd..0393c5e13688c7ab2dcbf26dbdc04798c11b0643 100644 --- a/evaluation/eval-bench/cmd/eb/main.go +++ b/evaluation/eval-bench/cmd/eb/main.go @@ -50,7 +50,6 @@ func main() { flag.PrintDefaults() fmt.Println(engineTemplateCmd) } - flag.Usage() flag.Parse() // check flag if *fhost == "" { diff --git a/evaluation/eval-charts/evaluation-controller/Chart.yaml b/evaluation/eval-charts/evaluation-controller/Chart.yaml index cbdcfdf5882c94453ecfc846335f40b13df1c117..1dc27bc4a3e50379f5a5205f7964c8077abd09b0 100644 --- a/evaluation/eval-charts/evaluation-controller/Chart.yaml +++ b/evaluation/eval-charts/evaluation-controller/Chart.yaml @@ -15,13 +15,14 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 0.1.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. 
# It is recommended to use it with quotes. -appVersion: "1.16.0" +appVersion: "0.1.1" + dependencies: - name: mysql repository: "" diff --git a/evaluation/eval-charts/evaluation-controller/charts/mysql/values.yaml b/evaluation/eval-charts/evaluation-controller/charts/mysql/values.yaml index 7b304d99a9ec07a4989ec165a2c319faf2e9ca97..0893e8eba900d828a10ffa74540e3c78c17a9f4c 100644 --- a/evaluation/eval-charts/evaluation-controller/charts/mysql/values.yaml +++ b/evaluation/eval-charts/evaluation-controller/charts/mysql/values.yaml @@ -52,7 +52,7 @@ storageConfig: generateType: "{{ if $.Values.global.allInOneLocalStorage }}hostPath{{ else }}{{$.Values.storageConfig.type}}{{end}}" #Please ignore this ## persistentVolumeClaim/hostPath type: persistentVolumeClaim - hostPath: /opt/evaluation-mysql + hostPath: /opt/{{ $.Release.Namespace }}/evaluation-mysql persistence: storageClass: "" annotations: diff --git a/evaluation/eval-charts/evaluation-controller/charts/web/values.yaml b/evaluation/eval-charts/evaluation-controller/charts/web/values.yaml index a839e39f4339106b015c1b1108437e39a34adbdd..6acd35497a10c15190971ba769968a664f9e0367 100644 --- a/evaluation/eval-charts/evaluation-controller/charts/web/values.yaml +++ b/evaluation/eval-charts/evaluation-controller/charts/web/values.yaml @@ -40,7 +40,7 @@ service: - name: tcp port: 20804 targetPort: 20804 - nodePort: 30080 + nodePort: "{{ $.Values.global.web.service.nodePort }}" protocol: TCP diff --git a/evaluation/eval-charts/evaluation-controller/eval-controller.yaml b/evaluation/eval-charts/evaluation-controller/eval-controller.yaml index 33e485653b466484c5121f37da68175ee8a6c460..8b7eb13d9e7d5a676ee6c7bcf9eee0fe010e7cf8 100644 --- a/evaluation/eval-charts/evaluation-controller/eval-controller.yaml +++ b/evaluation/eval-charts/evaluation-controller/eval-controller.yaml @@ -9,23 +9,22 @@ image: controllerConfig: max_runner_num: 2 - # [必填] controller以及执行测试例主机ssh信息 - global_ssh_port: - global_ssh_username: - 
global_ssh_password: + # [必填] 测试例主机ssh信息 + runner_ssh_port: + runner_ssh_username: + runner_ssh_password: # HTTP Listen Port listen_port: 10083 # controller组件日志目录(挂载本地目录) log_dir: /var/evaluation # [必填] controller组件位于的主机ip - local_host_ip: "" # 数据存储目录(挂载本地目录) runner_data_dir: /var/log/evalutation + # allure server: http://x.x.x.x:port + allure_server: agent-tools: deepflowce: name: "Deepflow-Agent(CE)" - # deploy_type: k8s / workload - deploy_type: k8s # deepflow是否添加了云平台信息 server_add_cloud: 0 cloud_info: @@ -41,7 +40,7 @@ controllerConfig: config: max_cpus: 1 max_memory: 1024 - # [必填]若未填写platform-tools 则fixed_host必填,目前只支持fixed_host,ssh信息需与global_shh相同 + # [必填]若未填写platform-tools 则fixed_host必填,目前只支持fixed_host,ssh信息需与runner_ssh相同 fixed_host: # performance_analysis用例组所用打流机器ip,需提前安装wrk2 performance_analysis_traffic_ip: diff --git a/evaluation/eval-charts/evaluation-controller/templates/controller-deployment.yaml b/evaluation/eval-charts/evaluation-controller/templates/controller-deployment.yaml index aa1c06c1b414e7f7ae29023ddc89e4268caa8397..b2ac6e3d5947f9c0994d57eb680c5676e85b7d4c 100644 --- a/evaluation/eval-charts/evaluation-controller/templates/controller-deployment.yaml +++ b/evaluation/eval-charts/evaluation-controller/templates/controller-deployment.yaml @@ -20,6 +20,7 @@ spec: labels: {{- include "evaluation-controller.selectorLabels" . | nindent 8 }} spec: + serviceAccountName: {{ include "evaluation-controller.name" . }}-sa hostNetwork: {{ tpl .Values.hostNetwork . }} dnsPolicy: {{ tpl .Values.dnsPolicy . 
}} dnsConfig: @@ -50,7 +51,11 @@ spec: - "-c" - | source /root/venv/bin/activate && python3 -u /root/eval-controller/eval-controller.py - + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace volumeMounts: - name: controller-config mountPath: /etc/eval-controller.yaml diff --git a/evaluation/eval-charts/evaluation-controller/templates/role.yaml b/evaluation/eval-charts/evaluation-controller/templates/role.yaml new file mode 100644 index 0000000000000000000000000000000000000000..102937e973aa4253e85e508e7dbcb31eb74a5ed8 --- /dev/null +++ b/evaluation/eval-charts/evaluation-controller/templates/role.yaml @@ -0,0 +1,8 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "evaluation-controller.name" . }}-role +rules: +- apiGroups: ["*"] + resources: ["*"] + verbs: ["*"] diff --git a/evaluation/eval-charts/evaluation-controller/templates/rolebinding.yaml b/evaluation/eval-charts/evaluation-controller/templates/rolebinding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a425e892d6c2262fd1c493dd33161cd3a4ac88b9 --- /dev/null +++ b/evaluation/eval-charts/evaluation-controller/templates/rolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "evaluation-controller.name" . }}-rb +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "evaluation-controller.name" . }}-role +subjects: +- kind: ServiceAccount + name: {{ include "evaluation-controller.name" . 
}}-sa diff --git a/evaluation/eval-charts/evaluation-controller/templates/serviceaccount.yaml b/evaluation/eval-charts/evaluation-controller/templates/serviceaccount.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0bea4bc58a60861ad82e846a3071b1499d920a7b --- /dev/null +++ b/evaluation/eval-charts/evaluation-controller/templates/serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "evaluation-controller.name" . }}-sa diff --git a/evaluation/eval-charts/evaluation-controller/values.yaml b/evaluation/eval-charts/evaluation-controller/values.yaml index 72e3b06a4974223fb133a697aa1f424b75930fb6..894276be0206a237030dfcc55d134084693a17b4 100644 --- a/evaluation/eval-charts/evaluation-controller/values.yaml +++ b/evaluation/eval-charts/evaluation-controller/values.yaml @@ -1,4 +1,8 @@ global: + web: + service: + nodePort: 30080 + mysql: password: "deepflow" db: "evaluation" @@ -54,15 +58,12 @@ controllerConfig: listen_port: 10083 # HTTP Listen Port log_dir: /var/log/evalutation # log dir runner_data_dir: /var/evaluation # runner data dir - local_host_ip: "" - global_ssh_port: 22 - global_ssh_username: - global_ssh_password: + runner_ssh_port: 22 + runner_ssh_username: + runner_ssh_password: agent-tools: deepflowce: name: "Deepflow-Agent(CE)" - # deploy_type: k8s / workload - deploy_type: k8s docking_platform: 0 cloud_info: vpc_name: diff --git a/evaluation/eval-charts/evaluation-runner/values.yaml b/evaluation/eval-charts/evaluation-runner/values.yaml index 1aeea1e08441febe49c641650feea49af340b2e4..6d99d1c6a7ef84c2dc4f9f2c5a96d8ea898441ef 100644 --- a/evaluation/eval-charts/evaluation-runner/values.yaml +++ b/evaluation/eval-charts/evaluation-runner/values.yaml @@ -33,9 +33,9 @@ securityContext: {} runnerConfig: runner_data_dir: listen_port: - global_ssh_port: 22 - global_ssh_username: - global_ssh_password: + runner_ssh_port: 22 + runner_ssh_username: + runner_ssh_password: case_params: uuid: 
case_name: diff --git a/evaluation/eval-controller/README.md b/evaluation/eval-controller/README.md index 8f1f32e5829c67c151d040a9f0d022492ce5bbec..041e523efcd0959b9993bf2a57c90eac8e9ac158 100644 --- a/evaluation/eval-controller/README.md +++ b/evaluation/eval-controller/README.md @@ -91,7 +91,6 @@ reponse: "CASE_NAME": "performance_analysis_nginx_http", "CASE_PARAMS": "", "USER": null, - "RUNNER_COMMIT_ID": null, "RUNNER_IMAGE_TAG": null, "STATUS": 11, "DELETED": 0, diff --git a/evaluation/eval-controller/etc/eval-controller.yaml b/evaluation/eval-controller/etc/eval-controller.yaml index 31f71354849a72a13780a3861ebb0d36ec142751..40269cea0b9eb4b243894efcc4658a0e71eb0fd8 100644 --- a/evaluation/eval-controller/etc/eval-controller.yaml +++ b/evaluation/eval-controller/etc/eval-controller.yaml @@ -1,18 +1,15 @@ listen_port: 10083 # HTTP Listen Port runner_data_dir: /var/evaluation # runner data dir log_dir: /var/log/evalutation # log dir -local_host_ip: -global_ssh_port: 22 -global_ssh_username: -global_ssh_password: +runner_ssh_port: 22 +runner_ssh_username: +runner_ssh_password: max_runner_num: 3 +allure_server: agent-tools: deepflowce: name: Deepflow-Agent(CE) - # deploy_type: k8s / workload - deploy_type: k8s - # deepflow-server 是否对接了云平台 docking_platform: 0 cloud_info: diff --git a/evaluation/eval-controller/eval-controller/common/const.py b/evaluation/eval-controller/eval-controller/common/const.py index 3693135ec6bf1fbce3f6dbd16645c14b7b165d0b..874aaaa43ea8f8939c723297c6b905c8cab47981 100644 --- a/evaluation/eval-controller/eval-controller/common/const.py +++ b/evaluation/eval-controller/eval-controller/common/const.py @@ -1,8 +1,16 @@ CONTROLLER_CONFIG_PATH = "/etc/eval-controller.yaml" +RUNNER_CONFIG_PATH = "/etc/eval-runner.yaml" API_PREFIX = "/v1/evaluation" +FORWOAD_API_PREFIX = "/api/v1/evaluation" -POD_MAX_ABNORMAL_STATUS_NUMBER = 10 +POD_MAX_ABNORMAL_STATUS_NUMBER = 5 WAIT_MYSQL_RUNNING_TIMEOUT = 600 ALLURE_SERVER = "http://10.1.19.19:20080" + +# kube 
默认变量 +RUNNER_LABEL_APP = "evaluation" +RUNNER_LABEL_COMPONENT = "runner" +DNS_NAME_SERVER = "114.114.114.114" +RUNNER_IMAGE = "hub.deepflow.yunshan.net/public/eval-runner" \ No newline at end of file diff --git a/evaluation/eval-controller/eval-controller/common/kube.py b/evaluation/eval-controller/eval-controller/common/kube.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a686c49364e4deace3631d4dba9adbcf726ce0 --- /dev/null +++ b/evaluation/eval-controller/eval-controller/common/kube.py @@ -0,0 +1,120 @@ +import os +import re +import json + + +from kubernetes import client, config +from eval_lib.common.logger import get_logger +from common import const +log = get_logger() + +class KubeController(): + + def __init__(self, namespace=None) -> None: + config.load_incluster_config() + self.namespace = namespace if namespace else os.environ.get('POD_NAMESPACE') + self.core_api = client.CoreV1Api() + self.batch_api = client.BatchV1Api() + + def get_pods_status(self, name_pattern: str) -> dict: + result = {} + pattern = re.compile(name_pattern) + pod_list = self.core_api.list_namespaced_pod(self.namespace) + for pod in pod_list.items: + if re.fullmatch(pattern, pod.metadata.name): + result[pod.metadata.name] = pod.status.phase + return result + + def delete_pod(self, name_pattern): + pattern = re.compile(name_pattern) + pod_list = self.core_api.list_namespaced_pod(self.namespace) + for pod in pod_list.items: + if re.fullmatch(pattern, pod.metadata.name): + try: + self.core_api.delete_namespaced_pod(pod.metadata.name, namespace=self.namespace) + except client.exceptions.ApiException as e: + log.error(f"Error deleting Pod '{pod.metadata.name}': {e}") + + def create_configmap(self, cm_name: str, data: dict) -> None: + labels = { + "app": const.RUNNER_LABEL_APP, + "component": const.RUNNER_LABEL_COMPONENT + } + body = client.V1ConfigMap( + metadata=client.V1ObjectMeta(name=cm_name, labels=labels), + data=data + ) + try: + 
self.core_api.create_namespaced_config_map(self.namespace, body) + except client.exceptions.ApiException as e: + log.error(f"Error creating ConfigMap '{cm_name}': {e}") + + def delete_configmap(self, cm_name: str) -> None: + try: + self.core_api.delete_namespaced_config_map(name=cm_name, namespace=self.namespace) + except client.exceptions.ApiException as e: + log.error(f"Error deleting ConfigMap '{cm_name}': {e}") + + def create_runner_template(self, name, image, configmap_name): + labels = { + "app": const.RUNNER_LABEL_APP, + "component": const.RUNNER_LABEL_COMPONENT + } + config_map = client.V1ConfigMapVolumeSource( + name=configmap_name, + items=[{"key":"runnerConfig","path": "eval-runner.yaml"}], + ) + config_map_volume = client.V1Volume( + name="runner-config-v", + config_map=config_map + ) + volume_mount = client.V1VolumeMount( + name="runner-config-v", + mount_path=const.RUNNER_CONFIG_PATH, + sub_path=os.path.basename(const.RUNNER_CONFIG_PATH) + ) + container = client.V1Container( + name=name, + image=image, + volume_mounts=[volume_mount], + image_pull_policy="Always" + ) + spec = client.V1PodSpec( + containers=[container], + volumes=[config_map_volume], + restart_policy="Never", + dns_config={"nameservers": [const.DNS_NAME_SERVER]} + ) + template = client.V1PodTemplateSpec( + metadata=client.V1ObjectMeta(labels=labels), + spec=spec + ) + return template + + def create_job_with_configmap(self, job_name: str, image_tag: str, configmap_name: str) -> None: + labels = { + "app": const.RUNNER_LABEL_APP, + "component": const.RUNNER_LABEL_COMPONENT + } + image = f"{const.RUNNER_IMAGE}:{image_tag}" + template = self.create_runner_template(job_name, image, configmap_name) + spec = client.V1JobSpec( + template=template, + backoff_limit=0 + ) + job = client.V1Job( + api_version="batch/v1", + kind="Job", + metadata=client.V1ObjectMeta(name=job_name,labels=labels), + spec=spec + ) + try: + self.batch_api.create_namespaced_job(self.namespace, job) + except 
client.exceptions.ApiException as e: + log.error(f"Error creating Job '{job_name}': {e}") + + def delete_job(self, job_name: str) -> None: + try: + self.batch_api.delete_namespaced_job(name=job_name, namespace=self.namespace) + except client.exceptions.ApiException as e: + log.error(f"Error deleting Job '{job_name}': {e}") diff --git a/evaluation/eval-controller/eval-controller/common/mysql.py b/evaluation/eval-controller/eval-controller/common/mysql.py index 9c34e7032e5e0d594ccb90c5f99730d3c5b570eb..413838bd572a9892e192c82c68d00d9f3cc493e8 100644 --- a/evaluation/eval-controller/eval-controller/common/mysql.py +++ b/evaluation/eval-controller/eval-controller/common/mysql.py @@ -2,7 +2,7 @@ import time from . import const from eval_lib.databases.mysql.db import db from eval_lib.common.logger import get_logger -from eval_lib.databases.mysql.models.models import CaseRecord, CaseReport, Component +from eval_lib.model.mysql.models import CaseRecord, CaseReport, Component, CronJob log = get_logger() @@ -16,7 +16,7 @@ def init_mysql(): while True: try: db.connect() - db.create_tables([CaseRecord, CaseReport, Component]) + db.create_tables([CaseRecord, CaseReport, Component, CronJob]) break # 如果成功连接并创建表,则退出循环 except Exception as e: if time.time() - start_time > const.WAIT_MYSQL_RUNNING_TIMEOUT: diff --git a/evaluation/eval-controller/eval-controller/common/utils.py b/evaluation/eval-controller/eval-controller/common/utils.py index 022f5d435ac61dc7aaecb93cfb9fc14cb6888b2a..79b1e6c495fff61ef9f2b38b9d9b4d1eb85977e7 100644 --- a/evaluation/eval-controller/eval-controller/common/utils.py +++ b/evaluation/eval-controller/eval-controller/common/utils.py @@ -7,15 +7,10 @@ from functools import wraps from eval_lib.common.ssh import SSHPool from eval_lib.common import logger from eval_lib.common.exceptions import BadRequestException, InternalServerErrorException -from eval_lib.databases.mysql.models.base import BaseModel +from eval_lib.model.mysql.base import BaseModel log = 
logger.get_logger() -ssh_pool_default = SSHPool( - conf.global_ssh_port, - conf.global_ssh_username, - conf.global_ssh_password, -) class Paginator: diff --git a/evaluation/eval-controller/eval-controller/config.py b/evaluation/eval-controller/eval-controller/config.py index ec10ca377823afd45c12c83d40b104b4b93d01ff..b285f7f60dc81fcd737c5f87bda5c8514d06ffed 100644 --- a/evaluation/eval-controller/eval-controller/config.py +++ b/evaluation/eval-controller/eval-controller/config.py @@ -10,7 +10,6 @@ class EvaluationConf(): self.listen_port = None self.log_dir = None self.runner_data_dir = None - self.local_host_ip = None self.max_runner_num = None self.agent_tools = {} self.platform_tools = {} @@ -21,16 +20,16 @@ class EvaluationConf(): with open(CONTROLLER_CONFIG_PATH, 'r') as y: yml = yaml.safe_load(y) self.listen_port = yml.get('listen_port', 10083) - self.local_host_ip = yml.get('local_host_ip', "127.0.0.1") self.log_dir = yml.get('log_dir', "/var/log/evaluation") self.runner_data_dir = yml.get( 'runner_data_dir', "/var/evaluation" ) self.max_runner_num = yml.get('max_runner_num', 10) - self.global_ssh_port = yml.get('global_ssh_port', 22) - self.global_ssh_username = yml.get('global_ssh_username', "") - self.global_ssh_password = yml.get('global_ssh_password', "") + self.runner_ssh_port = yml.get('runner_ssh_port', 22) + self.runner_ssh_username = yml.get('runner_ssh_username', "") + self.runner_ssh_password = yml.get('runner_ssh_password', "") self.fixed_host = yml.get('fixed_host', "") + self.allure_server = yml.get('allure_server', "") self.parse_agent_tools(yml) self.parse_platform_tools(yml) self.parse_mysql(yml) @@ -66,6 +65,17 @@ class EvaluationConf(): self.redis_port = self.redis.get("port", 6379) self.redis_password = self.redis.get("password", "root") self.redis_db = self.redis.get("db", "0") + self.redis_max_connections = self.redis.get( + "max_connections", 10 + ) + + # def parse_df_env(self, yml): + # self.df_env = yml.get("df_env", {}) + # 
self.df_env_ssh_port = self.df_env.get("ssh_port", 22) + # self.df_env_ssh_username = self.df_env.get("ssh_username", "root") + # self.df_env_ssh_password = self.df_env.get( + # "ssh_password", "" + # ) def is_valid(self): return self.listen_port and self.log_dir and self.runner_data_dir diff --git a/evaluation/eval-controller/eval-controller/manager/manager.py b/evaluation/eval-controller/eval-controller/manager/manager.py index 57c82574ca779a0ebb1e5b7644f9b99e09f69e5c..24331235cbeed7e9f3725040962dac705eae2881 100644 --- a/evaluation/eval-controller/eval-controller/manager/manager.py +++ b/evaluation/eval-controller/eval-controller/manager/manager.py @@ -11,9 +11,9 @@ from typing import List from multiprocessing import Process from eval_lib.model.base import CaseParams -from eval_lib.databases.mysql.models.models import CaseRecord +from eval_lib.model.mysql.models import CaseRecord from eval_lib.databases.mysql.db import db -from eval_lib.databases.mysql import const as db_const +from eval_lib.model.mysql import const as db_const from manager.runner import Runner from eval_lib.common.logger import get_logger from eval_lib.model import const as model_const diff --git a/evaluation/eval-controller/eval-controller/manager/runner.py b/evaluation/eval-controller/eval-controller/manager/runner.py index 4114c00b5a588307e86c10327f87ba8adc75f959..d916418af32e8cf05bed1f081b2d9bb7f57b7ee5 100644 --- a/evaluation/eval-controller/eval-controller/manager/runner.py +++ b/evaluation/eval-controller/eval-controller/manager/runner.py @@ -1,20 +1,20 @@ -import datetime +import os import threading import time -import requests -import yaml -from config import conf -from common.const import POD_MAX_ABNORMAL_STATUS_NUMBER, ALLURE_SERVER -from eval_lib.databases.redis.runner_info import RedisRunnerInfo -from common.utils import ssh_pool_default -from eval_lib.common.logger import get_logger +import json + +from report.report import ReportManager +from report.allure import 
AllureServer +from common.kube import KubeController from eval_lib.model.base import CaseParams -from eval_lib.databases.mysql.db import db -from eval_lib.databases.mysql import const as db_const +from eval_lib.model.mysql import const as db_const from eval_lib.databases.redis import const as redis_const +from eval_lib.databases.redis.runner_info import RedisRunnerInfo +from eval_lib.databases.mysql.db import db +from config import conf +from common.const import POD_MAX_ABNORMAL_STATUS_NUMBER from common.mysql import update_case_record -from report.report import ReportManager -import os +from eval_lib.common.logger import get_logger log = get_logger() @@ -27,13 +27,11 @@ class Runner(threading.Thread): self.uuid = params.uuid self.image_tag = params.runner_image_tag self.start_time = int(time.time()) - self.redis_db = RedisRunnerInfo( - host=conf.redis_host, port=conf.redis_port, - password=conf.redis_password, db=conf.redis_db, max_connections=10 - ) - self.local_host_ip = conf.local_host_ip + self.redis_runner = RedisRunnerInfo() + self.kube = KubeController() self.runner_data_path = f"{conf.runner_data_dir}/runner-{self.uuid}" - self.release_name = f"runner-{self.uuid[:8]}" + self.runner_name = f"runner-{self.uuid[:8]}" + self.pod_name_pattern = self.runner_name + ".*" self.callback = None self.signal_lock = threading.Lock() @@ -80,42 +78,29 @@ class Runner(threading.Thread): ) def exec_env(self): - # TODO: leyi 创建pod, 写入redis - runner_yaml_path = f"{self.runner_data_path}/{self.release_name}.yaml" - self.create_runner_yaml_file(runner_yaml_path) - cmds = [ - "sudo helm repo update evaluation", - f"sudo helm install {self.release_name} evaluation/evaluation-runner -n evaluation --create-namespace -f {runner_yaml_path}", - ] - ssh_client = ssh_pool_default.get(self.local_host_ip) - try: - for cmd in cmds: - _, stdout, stderr = ssh_client.exec_command(cmd) - output = stdout.read().decode() - error = stderr.read().decode() - if error: - log.error(f"exec cmd {cmd} 
error: {error}") - return - log.info(f"exec cmd {cmd} output: {output}") - except Exception as e: - log.error(f"exec_env: error: {e}") + self.create_runner_cm() + self.kube.create_job_with_configmap( + job_name=self.runner_name, + image_tag=self.image_tag, + configmap_name=self.runner_name, + ) # redis 添加信息 - self.redis_db.init_runner_info(uuid=self.uuid) - time.sleep(10) + self.redis_runner.init_runner_info(uuid=self.uuid) + time.sleep(3) def check_runner_pod_running(self): - command = f"sudo kubectl get pod -n evaluation |grep {self.release_name}-evaluation-runner " - ssh_client = ssh_pool_default.get(self.local_host_ip) - _, stdout, _ = ssh_client.exec_command(command) - output = stdout.read().decode() - if "Running" in output: - return True + pod_status_dict = self.kube.get_pods_status( + name_pattern=self.pod_name_pattern + ) + for name, status in pod_status_dict.items(): + if self.runner_name in name and status == "Running": + return True else: return False def check_runner_pod_completed(self): - runner_info = self.redis_db.get_runner_info(uuid=self.uuid) - if runner_info["runner-status"] == redis_const.CASE_STATUS_COMPLETED: + runner_info = self.redis_runner.get_runner_info(uuid=self.uuid) + if runner_info["runner-status"] == redis_const.RUNNER_STATUS_COMPLETED: return True else: return False @@ -164,9 +149,8 @@ class Runner(threading.Thread): continue else: # 如果 Runner Pod 完成执行,记录相关信息并返回 - log.info( - f"case exec finished, runner_status: {self.redis_db.get_runner_info(uuid=self.uuid)}" - ) + runner_info = self.redis_runner.get_runner_info(uuid=self.uuid) + log.info(f"case exec finished, runner_status: {runner_info}") return # 如果达到最大异常状态次数,更新用例记录为错误状态,并抛出异常 @@ -178,23 +162,16 @@ class Runner(threading.Thread): raise Exception("runner pod status not ready") def remove_env(self): - command = f"sudo helm uninstall {self.release_name} -n evaluation" - ssh_client = ssh_pool_default.get(self.local_host_ip) - try: - self.redis_db.delete_runner_info(uuid=self.uuid) 
- _, _, stderr = ssh_client.exec_command(command) - error = stderr.read().decode() - if error: - log.error(f"uninstall env {self.release_name} error: {error}") - except Exception as e: - log.error(f"remove_env: error: {e}") - raise e + log.info("remove runner env: job,pod,configmap") + self.kube.delete_job(self.runner_name) + self.kube.delete_pod(self.pod_name_pattern) + self.kube.delete_configmap(self.runner_name) def cancel(self): update_case_record( uuid=self.uuid, status=db_const.CASE_RECORD_STATUS_STOPPING ) - self.redis_db.cancel_case(uuid=self.uuid) + self.redis_runner.cancel_case(uuid=self.uuid) log.info("cancel case") self.wait_case_sync() update_case_record( @@ -205,7 +182,7 @@ class Runner(threading.Thread): update_case_record( uuid=self.uuid, status=db_const.CASE_RECORD_STATUS_PAUSING ) - self.redis_db.pause_case(uuid=self.uuid) + self.redis_runner.pause_case(uuid=self.uuid) log.info("pause case") # TODO:leyi 检查是否完成暂停 self.wait_case_sync() @@ -217,7 +194,7 @@ class Runner(threading.Thread): update_case_record( uuid=self.uuid, status=db_const.CASE_RECORD_STATUS_STARTING ) - self.redis_db.resume_case(uuid=self.uuid) + self.redis_runner.resume_case(uuid=self.uuid) log.info("resume case") self.wait_case_sync() update_case_record( @@ -225,7 +202,7 @@ class Runner(threading.Thread): ) def force_end(self): - self.redis_db.end_case(uuid=self.uuid) + self.redis_runner.end_case(uuid=self.uuid) log.info("force end case") update_case_record( uuid=self.uuid, status=db_const.CASE_RECORD_STATUS_FORCE_END @@ -239,7 +216,7 @@ class Runner(threading.Thread): def wait_case_sync(self): while True: time.sleep(5) - runner_info = self.redis_db.get_runner_info(uuid=self.uuid) + runner_info = self.redis_runner.get_runner_info(uuid=self.uuid) if runner_info["case-control-status"] == runner_info[ "case-status"] or runner_info[ "case-status"] == redis_const.CASE_STATUS_COMPLETED: @@ -264,29 +241,26 @@ class Runner(threading.Thread): except FileExistsError: pass - def 
create_runner_yaml_file(self, file_path): - helm_value_dict = {} + def create_runner_cm(self): runner_config_dict = {} runner_config_dict["case_params"] = self.case_params.to_json() runner_config_dict["redis"] = conf.redis runner_config_dict["mysql"] = conf.mysql runner_config_dict["listen_port"] = conf.listen_port - runner_config_dict["global_ssh_port"] = conf.global_ssh_port - runner_config_dict["global_ssh_username"] = conf.global_ssh_username - runner_config_dict["global_ssh_password"] = conf.global_ssh_password + runner_config_dict["runner_ssh_port"] = conf.runner_ssh_port + runner_config_dict["runner_ssh_username"] = conf.runner_ssh_username + runner_config_dict["runner_ssh_password"] = conf.runner_ssh_password runner_config_dict["fixed_host"] = conf.fixed_host runner_config_dict["agent-tools"] = conf.agent_tools runner_config_dict["platform-tools"] = conf.platform_tools runner_config_dict["runner_data_dir"] = conf.runner_data_dir - helm_value_dict["runnerConfig"] = runner_config_dict - helm_value_dict["image"] = {"tag": self.image_tag} - with open(file_path, 'w') as file: - yaml.dump(helm_value_dict, file) - if not os.path.exists(file_path): - log.error(f"file :{file_path} not found") + data = {"runnerConfig": json.dumps(runner_config_dict)} + self.kube.create_configmap(cm_name=self.runner_name, data=data) def get_results(self): - self.push_allure_results() + AllureServer( + host=conf.allure_server + ).push_allure_results(path=self.runner_allure_path, uuid=self.uuid) self.get_performance_results() def get_performance_results(self): @@ -298,36 +272,9 @@ class Runner(threading.Thread): return log.info("generate test report") try: - # TODO: luyao 查看是否收到performance文件,收到则生成报告 rm = ReportManager( report_path=self.runner_report_path, report_engines=None ) rm.run() except Exception as e: log.error(f"get performance results error: {e}") - - def push_allure_results(self): - allure_file_zip = f"{self.runner_allure_path}/allure-report.zip" - if not 
os.path.exists(allure_file_zip): - return - headers = {"accept": "*/*"} - files = { - 'allureReportArchive': ( - "allure-report.zip", open(allure_file_zip, - 'rb'), 'application/x-zip-compressed' - ) - } - current_timestamp = int(time.time()) - result_url = ALLURE_SERVER + "/api/report/" + self.uuid + "-" + str( - current_timestamp - )[-7:] - try: - resp = requests.post(result_url, files=files, headers=headers) - log.info(resp.text) - if resp.status_code == 201: - return resp.json() - else: - log.error("Unknown Error !!!") - except Exception as e: - log.error(f"upload allure file error: {e}") - return False diff --git a/evaluation/eval-controller/eval-controller/report/allure.py b/evaluation/eval-controller/eval-controller/report/allure.py new file mode 100644 index 0000000000000000000000000000000000000000..ac42d9d8e9fef5c3e35ac21498c88d2d778f5147 --- /dev/null +++ b/evaluation/eval-controller/eval-controller/report/allure.py @@ -0,0 +1,65 @@ +import os +import requests +import time + +from config import conf +from eval_lib.common.logger import get_logger + +log = get_logger() + + +class AllureServer(object): + + def __init__(self, host=None, headers=None): + self.headers = {"accept": "*/*"} if not headers else headers + self.host = conf.allure_server if not host else host + + def push_allure_results(self, path, uuid): + """ + 将Allure测试结果上传到服务器。 + + :param path: 存储Allure测试结果的目录路径。 + :param uuid: 上传结果的唯一标识符。 + :return: 如果上传成功,返回服务器返回的JSON响应;否则返回False。 + """ + # 检查是否配置了服务器地址,如果没有配置,则不执行上传操作 + if not self.host: + log.warning("no allure server") + return + # 构建Allure报告ZIP文件的路径 + allure_file_zip = f"{path}/allure-report.zip" + # 检查ZIP文件是否存在,如果不存在,则不执行上传操作 + if not os.path.exists(allure_file_zip): + log.warning(f"allure zip: `{allure_file_zip}` not exists") + return + # 准备上传的文件,这里只上传Allure报告的ZIP文件 + files = { + 'allureReportArchive': ( + "allure-report.zip", open(allure_file_zip, + 'rb'), 'application/x-zip-compressed' + ) + } + # 获取当前时间戳,用于生成结果URL的唯一后缀 + 
current_timestamp = int(time.time()) + # 构建结果URL,包括服务器地址、API路径、UUID和时间戳 + result_url = self.host + "/api/report/" + uuid + "-" + str( + current_timestamp + )[-7:] + try: + # 发起POST请求,上传报告ZIP文件 + resp = requests.post(result_url, files=files, headers=self.headers) + # 记录上传结果的响应文本 + log.info(resp.text) + # 如果响应状态码为201,表示上传成功,返回服务器返回的JSON响应 + if resp.status_code == 201: + return resp.json() + # 如果响应状态码不是201,记录错误信息 + else: + log.error( + f"Allure Server Unknown Error !!! {resp.status_code}" + ) + log.error(resp.json()) + except Exception as e: + # 如果上传过程中发生异常,记录异常信息 + log.error(f"upload allure file error: {e}") + return False diff --git a/evaluation/eval-controller/eval-controller/report/markdown.py b/evaluation/eval-controller/eval-controller/report/markdown.py index e02ec5c77c78f9debcd5dc2c0a3447687948a34e..e675ee792ec97d81cc1100cf90137fb207db18fa 100644 --- a/evaluation/eval-controller/eval-controller/report/markdown.py +++ b/evaluation/eval-controller/eval-controller/report/markdown.py @@ -1,7 +1,5 @@ import os import re -import time -import datetime import yaml from jinja2 import Environment, BaseLoader, Undefined @@ -16,13 +14,17 @@ from .base import ReportBase # key is case_name pattern, value is TEMPLATE file path and case_group abbreviations REPORT_TEMPLATE_LIST = { "performance_analysis_nginx.*": ( - "./report/templates/agent_performance_report_nginx.md", + "./report/templates/agent_performance_analysis_report_nginx.md", "performance_analysis_nginx" ), "performance_analysis_istio.*": ( - "./report/templates/agent_performance_report_istio.md", + "./report/templates/agent_performance_analysis_report_istio.md", "performance_analysis_istio" ), + "performance_analysis_traefik.*": ( + "./report/templates/agent_performance_analysis_report_traefik.md", + "performance_analysis_traefik" + ), } diff --git a/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_istio.md 
b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_istio.md new file mode 100644 index 0000000000000000000000000000000000000000..a106cd7c5bff1db8b4da01dddb2ab357816eebdd --- /dev/null +++ b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_istio.md @@ -0,0 +1,66 @@ +# 采集器性能测试报告(应用性能监控) - {{ data.version }} +## 摘要 +本文档为采集器(deepflow-agent)应用性能监控部分的性能测试报告,测试版本为 {{ data.version }},测试完成时间为{{ data.datetime }}。 + +## 应用性能监控 +本章评估社区版 Agent 在应用性能监控场景下的自身资源消耗,以及对被监控服务的影响。 +测试过程中 Agent 开启了如下功能: +- 应用指标(cBPF + eBPF,1m + 1s) +- 应用调用日志(cBPF + eBPF) +- 系统事件(eBPF File IO) +- 网络指标(cBPF,1m + 1s) +- 网络流日志(cBPF) +未开启如下功能: +- TCP 时序图 +- PCAP 下载 +- 流量分发 +DeepFlow 通过 cBPF 采集 Packet Data 获取网络和应用的指标和日志,通过 eBPF 采集 Socket Data 获取应用和文件 IO 的指标和日志。关于 eBPF、cBPF 采集位置的示意图如下: + +在 {{ data.version }} 中,DeepFlow Agent 开启的 eBPF Probe 列表可参考DeepFlow GitHub Repo 中的文档。 + +### 测试环境 +- 虚拟机内核:{{ data.vm_kernal }} +- 虚拟机规格:{{ data.vm_cpu }}{{ data.vm_mem }} +- 采集器限制:{{ data.agent_limit_cpu }}{{ data.agent_limit_mem }} +- 采集器commitId:{{ data.commit_id }} + +### 典型云原生微服务场景(istio-bookinfo-demo) +#### 测试方法 +这一节我们希望测试典型云原生微服务业务场景下 deepflow-agent 的性能表现。我们找到了 Istio Bookinfo Demo。Istio 是一种流行的服务网格解决方案,在 GitHub 上拥有 32.9K Star。这个 Demo 的应用拓扑见下图,我们可以看到它由 Python、Java、Ruby、Node.JS 实现的四个微服务组成,每个微服务所在的 Pod 中运行着 Envoy 代理。这个 Demo 中的一个事务对应着访问 4 个微服务的 4 个调用,由于 Envoy 的存在实际调用链深度会被延长约两倍。 +我们使用 wrk2 来注入稳定的 QPS 负载,wrk2 会直接请求 Productpage 服务。所有的服务(包括 wrk2)部署在一个 8C16GB 的 K8s 节点上(CentOS 7、Kernel 4.19),我们会在该节点上部署 deepflow-agent Daemonset 来对所有调用进行采集,测试过程中限制了 deepflow-agent 资源消耗为 1C768MB。 +为了使得 Bookinfo 能够承受 50+% CPU 的高负载,我们调整了两个瓶颈服务的副本数:将 Productpage 调整为 4 副本、将 Details 调整为 2 副本。 + +```mermaid +graph LR; + Wrk2[Wrk2] --> Ingress[Ingress Envoy]; + Ingress --> Productpage[Productpage\npython]; + Productpage --> Details[Details\nRuby]; + Productpage --> Reviews-v1[Reviews-v1\njava]; + Productpage --> Reviews-v2[Reviews-v2\njava]; + Productpage -->
Reviews-v3[Reviews-v3\njava]; + Reviews-v2 --> Ratings[Ratings\nNodejs]; + Reviews-v3 --> Ratings[Ratings\nNodejs]; + +``` + +具体的 wrk2 测试命令: +- wrk2 -c50 -t4 -R$rate -d60 -L http://$productpage_ip:9080/productpage +#### 详细数据 +无采集器运行时的测试数据(基线): +| 期望QPS | 实际QPS | P50 时延 (us) | P90 时延 (us) | productpage CPU | details CPU | reviews CPU | ratings CPU | Envoy CPU | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| {{ data.performance_analysis_istio_without_agent.server.rate[0] }} | {{ data.performance_analysis_istio_without_agent.server.rps[0] }} | {{ data.performance_analysis_istio_without_agent.server.latency_p50[0] }} | {{ data.performance_analysis_istio_without_agent.server.latency_p90[0] }} | {{ data.performance_analysis_istio_without_agent.productpage.max_cpu[0] }} | {{ data.performance_analysis_istio_without_agent.details.max_cpu[0] }} | {{ data.performance_analysis_istio_without_agent.ws_javaagent_jar.max_cpu[0] }} | {{ data.performance_analysis_istio_without_agent.ratings.max_cpu[0] }} | {{ data.performance_analysis_istio_without_agent.envoy.max_cpu[0] }} | +| {{ data.performance_analysis_istio_without_agent.server.rate[1] }} | {{ data.performance_analysis_istio_without_agent.server.rps[1] }} | {{ data.performance_analysis_istio_without_agent.server.latency_p50[1] }} | {{ data.performance_analysis_istio_without_agent.server.latency_p90[1] }} | {{ data.performance_analysis_istio_without_agent.productpage.max_cpu[1] }} | {{ data.performance_analysis_istio_without_agent.details.max_cpu[1] }} | {{ data.performance_analysis_istio_without_agent.ws_javaagent_jar.max_cpu[1] }} | {{ data.performance_analysis_istio_without_agent.ratings.max_cpu[1] }} | {{ data.performance_analysis_istio_without_agent.envoy.max_cpu[1] }} | +| {{ data.performance_analysis_istio_without_agent.server.rate[2] }} | {{ data.performance_analysis_istio_without_agent.server.rps[2] }} | {{ data.performance_analysis_istio_without_agent.server.latency_p50[2] }} | {{ 
data.performance_analysis_istio_without_agent.server.latency_p90[2] }} | {{ data.performance_analysis_istio_without_agent.productpage.max_cpu[2] }} | {{ data.performance_analysis_istio_without_agent.details.max_cpu[2] }} | {{ data.performance_analysis_istio_without_agent.ws_javaagent_jar.max_cpu[2] }} | {{ data.performance_analysis_istio_without_agent.ratings.max_cpu[2] }} | {{ data.performance_analysis_istio_without_agent.envoy.max_cpu[2] }} | + + + +有采集器运行时的测试数据: + +| 期望QPS | 实际QPS | P50 时延 (us) | P90 时延 (us) | productpage CPU | details CPU | reviews CPU | ratings CPU | Envoy CPU | Agent CPU | Agent 内存(byte) | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| {{ data.performance_analysis_istio_with_agent.server.rate[0] }} | {{ data.performance_analysis_istio_with_agent.server.rps[0] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p50[0] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p90[0] }} | {{ data.performance_analysis_istio_with_agent.productpage.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.details.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.ws_javaagent_jar.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.ratings.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.envoy.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.agent.max_cpu[0] }} | {{ data.performance_analysis_istio_with_agent.agent.max_mem[0] }} | +| {{ data.performance_analysis_istio_with_agent.server.rate[1] }} | {{ data.performance_analysis_istio_with_agent.server.rps[1] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p50[1] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p90[1] }} | {{ data.performance_analysis_istio_with_agent.productpage.max_cpu[1] }} | {{ data.performance_analysis_istio_with_agent.details.max_cpu[1] }} | {{ data.performance_analysis_istio_with_agent.ws_javaagent_jar.max_cpu[1] }} | {{ 
data.performance_analysis_istio_with_agent.ratings.max_cpu[1] }} | {{ data.performance_analysis_istio_with_agent.envoy.max_cpu[1] }} | {{ data.performance_analysis_istio_with_agent.agent.max_cpu[1] }} | {{ data.performance_analysis_istio_with_agent.agent.max_mem[1] }} | +| {{ data.performance_analysis_istio_with_agent.server.rate[2] }} | {{ data.performance_analysis_istio_with_agent.server.rps[2] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p50[2] }} | {{ data.performance_analysis_istio_with_agent.server.latency_p90[2] }} | {{ data.performance_analysis_istio_with_agent.productpage.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.details.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.ws_javaagent_jar.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.ratings.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.envoy.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.agent.max_cpu[2] }} | {{ data.performance_analysis_istio_with_agent.agent.max_mem[2] }} | + + diff --git a/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_nginx.md b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_nginx.md new file mode 100644 index 0000000000000000000000000000000000000000..fd12b7c4515bf0c038164d3abfe0215cf23b3612 --- /dev/null +++ b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_nginx.md @@ -0,0 +1,56 @@ +# 采集器性能测试报告(应用性能监控) - {{ data.version }} +## 摘要 +本文档为采集器(deepflow-agent)应用性能监控部分的性能测试报告,测试版本为 {{ data.version }},测试完成时间为{{ data.datetime }}。 + +## 应用性能监控 +本章评估社区版 Agent 在应用性能监控场景下的自身资源消耗,以及对被监控服务的影响。 +测试过程中 Agent 开启了如下功能: +- 应用指标(cBPF + eBPF,1m + 1s) +- 应用调用日志(cBPF + eBPF) +- 系统事件(eBPF File IO) +- 网络指标(cBPF,1m + 1s) +- 网络流日志(cBPF) +未开启如下功能: +- TCP 时序图 +- PCAP 下载 +- 流量分发 +DeepFlow 通过 cBPF 采集 Packet Data 获取网络和应用的指标和日志,通过 eBPF 采集 Socket Data 获取应用和文件 IO 的指标和日志。关于 eBPF、cBPF
采集位置的示意图如下: + +在 {{ data.version }} 中,DeepFlow Agent 开启的 eBPF Probe 列表可参考DeepFlow GitHub Repo 中的文档。 + +### 测试环境 +- 虚拟机内核:{{ data.vm_kernal }} +- 虚拟机规格:{{ data.vm_cpu }}{{ data.vm_mem }} +- 采集器限制:{{ data.agent_limit_cpu }}{{ data.agent_limit_mem }} +- 采集器commitId:{{ data.commit_id }} + +### 极端高性能的业务场景(nginx-default-page) +#### 测试方法 +这一节我们希望测试一个极端高性能(极简业务逻辑、极低资源开销)的业务场景下 deepflow-agent 的性能表现。我们选择了 Nginx,我们知道它以性能强悍著称,它用 C 语言实现,而且我们在此 Demo 中让它只是简单地回复一个默认静态页。我们相信这个 Nginx Demo 的性能表现远超过任何一个实际的生产业务,我们希望使用这个 Demo 来说明两个问题:1)deepflow-agent 的采集性能如何;2)deepflow-agent 的采集对极端高性能业务的影响如何。 +我们使用 wrk2 来注入稳定的 QPS 负载,wrk2 会直接请求 Nginx 提供的 Default Page 服务。为了减少其他业务的干扰,我们将 Nginx 和 wrk2 部署在两个单独的虚拟机上(8C16GB、CentOS 7、Kernel 4.19),并且在 Nginx 所在虚拟机上部署了 deepflow-agent。测试过程中限制了 deepflow-agent 资源消耗为 1C768MB。 + +```mermaid +graph LR; + Wrk2[Vm1\nWrk2] --> Nginx[Vm2\nNginx]; +``` + +具体的 wrk2 测试命令: +- wrk2 -c1 -t1 -R$rate -d60 -L http://$nginx_ip:80/index.html +#### 详细数据 +无采集器运行时的测试数据(基线): +| 期望 QPS | 实际 QPS | P50 时延 | P90 时延 | Nginx CPU | +| --- | --- | --- | --- | --- | +| {{ data.performance_analysis_nginx_http_without_agent.server.rate[0] }} | {{ data.performance_analysis_nginx_http_without_agent.server.rps[0] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p50[0] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p90[0] }} | {{ data.performance_analysis_nginx_http_without_agent.nginx.max_cpu[0] }} | {{ data.performance_analysis_nginx_http_without_agent.agent.max_cpu[0] }} | {{ data.performance_analysis_nginx_http_without_agent.agent.max_mem[0] }} | +| {{ data.performance_analysis_nginx_http_without_agent.server.rate[1] }} | {{ data.performance_analysis_nginx_http_without_agent.server.rps[1] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p50[1] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p90[1] }} | {{ data.performance_analysis_nginx_http_without_agent.nginx.max_cpu[1] }} | {{
data.performance_analysis_nginx_http_without_agent.agent.max_cpu[1] }} | {{ data.performance_analysis_nginx_http_without_agent.agent.max_mem[1] }} | +| {{ data.performance_analysis_nginx_http_without_agent.server.rate[2] }} | {{ data.performance_analysis_nginx_http_without_agent.server.rps[2] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p50[2] }} | {{ data.performance_analysis_nginx_http_without_agent.server.latency_p90[2] }} | {{ data.performance_analysis_nginx_http_without_agent.nginx.max_cpu[2] }} | {{ data.performance_analysis_nginx_http_without_agent.agent.max_cpu[2] }} | {{ data.performance_analysis_nginx_http_without_agent.agent.max_mem[2] }} | + + + +有采集器运行时的测试数据: +| 期望 QPS | 实际 QPS | P50 时延 | P90 时延 | Nginx CPU | Agent CPU | Agent 内存 | +| --- | --- | --- | --- | --- | --- | --- | +| {{ data.performance_analysis_nginx_http_with_agent.server.rate[0] }} | {{ data.performance_analysis_nginx_http_with_agent.server.rps[0] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p50[0] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p90[0] }} | {{ data.performance_analysis_nginx_http_with_agent.nginx.max_cpu[0] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_cpu[0] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_mem[0] }} | +| {{ data.performance_analysis_nginx_http_with_agent.server.rate[1] }} | {{ data.performance_analysis_nginx_http_with_agent.server.rps[1] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p50[1] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p90[1] }} | {{ data.performance_analysis_nginx_http_with_agent.nginx.max_cpu[1] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_cpu[1] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_mem[1] }} | +| {{ data.performance_analysis_nginx_http_with_agent.server.rate[2] }} | {{ 
data.performance_analysis_nginx_http_with_agent.server.rps[2] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p50[2] }} | {{ data.performance_analysis_nginx_http_with_agent.server.latency_p90[2] }} | {{ data.performance_analysis_nginx_http_with_agent.nginx.max_cpu[2] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_cpu[2] }} | {{ data.performance_analysis_nginx_http_with_agent.agent.max_mem[2] }} | + + diff --git a/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_traefik.md b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_traefik.md new file mode 100644 index 0000000000000000000000000000000000000000..30fccebc4712fc41b4608652bef05edc12c58ec9 --- /dev/null +++ b/evaluation/eval-controller/eval-controller/report/templates/agent_performance_analysis_report_traefik.md @@ -0,0 +1,54 @@ +# 采集器性能测试报告(应用性能监控) - {{ data.version }} +## 摘要 +本文档为采集器(deepflow-agent)应用性能监控部分的性能测试报告,测试版本为 {{ data.version }},测试完成时间为{{ data.datetime }}。 + +## 应用性能监控 +本章评估社区版 Agent 在应用性能监控场景下的自身资源消耗,以及对被监控服务的影响。 +测试过程中 Agent 开启了如下功能: +- 应用指标(cBPF + eBPF,1m + 1s) +- 应用调用日志(cBPF + eBPF) +- 系统事件(eBPF File IO) +- 网络指标(cBPF,1m + 1s) +- 网络流日志(cBPF) +- 启用dispatcher-queue +未开启如下功能: +- TCP 时序图 +- PCAP 下载 +- 流量分发 +DeepFlow 通过 cBPF 采集 Packet Data 获取网络和应用的指标和日志,通过 eBPF 采集 Socket Data 获取应用和文件 IO 的指标和日志。关于 eBPF、cBPF 采集位置的示意图如下: + +在 {{ data.version }} 中,DeepFlow Agent 开启的 eBPF Probe 列表可参考DeepFlow GitHub Repo 中的文档。 + +### 测试环境 +- 虚拟机内核:{{ data.vm_kernal }} +- 虚拟机规格:{{ data.vm_cpu }}{{ data.vm_mem }} +- 采集器限制:{{ data.agent_limit_cpu }}{{ data.agent_limit_mem }} +- 采集器commitId:{{ data.commit_id }} + +### 极端压力查询traefik场景 +#### 测试方法 +这一节我们希望测试deepflow-agent在极端压力traefik场景下的性能表现。 + +[工具地址](https://github.com/deepflowio/deepflow-auto-test/tree/main/app-traffic) + +#### 测试结果 + +无采集器运行时的测试数据: + +打流命令:eb -h http://whoami.fw.com:80 {param} -e http -d 100 + +| TPS | Traefik cpu | Traefik mem | P50
时延(us) | P90 时延(us) | +| --- | --- | --- | --- | --- | +| {{ data.performance_analysis_traefik_without_agent.server.rps[0] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_cpu[0] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_mem[0] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p50[0] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p90[0] }} | +| {{ data.performance_analysis_traefik_without_agent.server.rps[1] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_cpu[1] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_mem[1] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p50[1] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p90[1] }} | +| {{ data.performance_analysis_traefik_without_agent.server.rps[2] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_cpu[2] }} | {{ data.performance_analysis_traefik_without_agent.traefik.max_mem[2] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p50[2] }} | {{ data.performance_analysis_traefik_without_agent.server.latency_p90[2] }} | + +有采集器运行时的测试数据: + +打流命令:eb -h http://whoami.fw.com:80 {param} -e http -d 100 + +| TPS | Traefik cpu | Traefik mem | agent max_cpu | agent max_mem | P50 时延(us) | P90 时延(us) | drop_pack | +| --- | --- | --- | --- | --- | --- | --- | --- | +| {{ data.performance_analysis_traefik_with_agent.server.rps[0] }} | {{ data.performance_analysis_traefik_with_agent.traefik.max_cpu[0] }} | {{ data.performance_analysis_traefik_with_agent.traefik.max_mem[0] }} | {{ data.performance_analysis_traefik_with_agent.agent.max_cpu[0] }} | {{ data.performance_analysis_traefik_with_agent.agent.max_mem[0] }} | {{ data.performance_analysis_traefik_with_agent.server.latency_p50[0] }} | {{ data.performance_analysis_traefik_with_agent.server.latency_p90[0] }} | {{ 
# Defaults applied by AutoTestCreate.init when the request leaves them empty.
RUNNER_IMAGE_TAG_DEFAULT_LATEST = "stable"
AGENT_TYPE_DEFAULT_DEEPFLOWCE = "deepflowce"


class AutoTestCreate(BaseStruct):
    """Request body for creating an auto-test run."""

    KEYS = [
        "uuid",
        "case_name",
        "process_num",
        "runner_image_tag",
        "agent_type",
        "agent_config",
        "case_variables",
        "user",
        "cron_id",
    ]

    def init(self, **kwargs):
        """Populate fields, assign a fresh uuid and fill in empty defaults."""
        super().init(**kwargs)
        # The uuid is always generated here, whatever the caller supplied.
        self.uuid = str(uuid.uuid4())
        self.runner_image_tag = (
            self.runner_image_tag or RUNNER_IMAGE_TAG_DEFAULT_LATEST
        )
        self.agent_type = self.agent_type or AGENT_TYPE_DEFAULT_DEEPFLOWCE

    def is_valid(self):
        """Raise BadRequestException on a missing uuid or unknown agent type."""
        # TODO
        if not self.uuid:
            raise BadRequestException("bad request")
        if self.agent_type not in conf.agent_tools:
            raise BadRequestException(
                f"bad request agent_type {self.agent_type}"
            )


class AutoTestUpdate(BaseStruct):
    """Request body for changing the status of existing runs."""

    KEYS = ["uuids", "status"]

    def is_valid(self):
        """Require uuids; when a status is given it must be a known one."""
        # TODO
        if not self.uuids:
            raise BadRequestException("bad request no uuids")
        allowed_status = [
            CASE_PARAMS_STATUS_CREATE,
            CASE_PARAMS_STATUS_PAUSE,
            CASE_PARAMS_STATUS_CANCEL,
            CASE_PARAMS_STATUS_RESUME,
            CASE_PARAMS_STATUS_FROCE_END,
        ]
        if self.status is not None and self.status not in allowed_status:
            raise BadRequestException(f"bad request status {self.status}")


class AutoTestDelete(BaseStruct):
    """Request body for deleting runs by uuid."""

    KEYS = ["uuids"]

    def is_valid(self):
        """Require a non-empty uuids field."""
        # TODO
        if not self.uuids:
            raise BadRequestException("bad request")


class AutoTestFilter(BaseStruct):
    """Query parameters for listing runs, with optional paging."""

    KEYS = ["uuid", "uuids", "status", "page_size", "page_index"]

    def init(self, **kwargs):
        """Populate fields and coerce paging values to int (None when empty)."""
        super().init(**kwargs)
        if self.page_size:
            self.page_size = int(self.page_size)
        else:
            self.page_size = None
        if self.page_index:
            self.page_index = int(self.page_index)
        else:
            self.page_index = None
# ---- schema/cron_job.py ----


class Cron(BaseStruct):
    """Five-field cron expression: ``minute hour day month week``."""

    KEYS = ["minute", "hour", "day", "month", "week"]

    def init(self, **kwargs):
        """
        Parse the ``cron`` keyword argument into its five fields.

        :raises BadRequestException: when ``cron`` is empty, is not a
            string, or does not contain exactly five fields.
        """
        cron = kwargs.get("cron", "")
        if not cron:
            raise BadRequestException("cron is empty")
        if not isinstance(cron, str):
            # JSON bodies may carry a non-string here; report it as a bad
            # request instead of failing with AttributeError on split().
            raise BadRequestException("cron format error")
        # split() without a separator tolerates repeated, leading and
        # trailing whitespace between fields.
        cron_split = cron.split()
        if len(cron_split) != 5:
            raise BadRequestException("cron format error")
        minute, hour, day, month, week = cron_split
        super().init(minute=minute, hour=hour, day=day, month=month, week=week)

    def __str__(self):
        return f"{self.minute} {self.hour} {self.day} {self.month} {self.week}"

    def is_valid(self):
        """
        Check that every field is either ``*`` or an integer literal.

        :raises BadRequestException: when any field is neither.
        """
        for field in (self.minute, self.hour, self.day, self.month, self.week):
            if field == "*":
                continue
            try:
                int(field)
            except (TypeError, ValueError) as e:
                raise BadRequestException("cron format error") from e


class CronJobCreate(BaseStruct):
    """Request body for creating a cron job."""

    KEYS = ["name", "cron", "variables", "branch", "active"]

    def init(self, **kwargs):
        """
        Wrap ``cron`` in a Cron and ``variables`` in a CaseParams.

        A fresh uuid and the CREATE status are always set on the case
        parameters; values for those two keys supplied by the caller are
        overridden rather than causing a duplicate-keyword TypeError.
        """
        kwargs["cron"] = Cron(cron=kwargs.get("cron", ""))
        # `or {}` also covers an explicit null in the request body.
        variables = dict(kwargs.get("variables") or {})
        variables["uuid"] = str(uuid.uuid4())
        variables["status"] = CASE_PARAMS_STATUS_CREATE
        kwargs["variables"] = CaseParams(**variables)
        super().init(**kwargs)

    def is_valid(self):
        """
        Require name, cron, branch and active, then validate the nested
        case parameters and cron expression.

        :raises BadRequestException: when a required field is missing.
        """
        if not self.name:
            raise BadRequestException("name is required")
        if not self.cron:
            raise BadRequestException("cron is required")
        if not self.branch:
            raise BadRequestException("branch is required")
        if self.active is None:
            raise BadRequestException("active is required")
        self.variables.is_valid()
        self.cron.is_valid()


class CronJobUpdate(BaseStruct):
    """Request body for updating a cron job."""

    KEYS = ["uuid", "name", "cron", "variables", "branch", "active"]


class CronJobDelete(BaseStruct):
    """Request body for deleting cron jobs by uuid."""

    KEYS = ["uuids"]


class CronJobFilter(BaseStruct):
    """Query parameters for listing cron jobs, with optional paging."""

    KEYS = ["uuid", "uuids", "page_size", "page_index"]

    def init(self, **kwargs):
        """Populate fields and coerce paging values to int (None when empty)."""
        super().init(**kwargs)
        self.page_size = int(self.page_size) if self.page_size else None
        self.page_index = int(self.page_index) if self.page_index else None


# ---- schema/result.py ----


class ResultPostLog(BaseStruct):
    """Request body for posting a result log."""

    KEYS = ["uuid", "type", "data"]

    def is_valid(self):
        """Require uuid and type; return True when both are present."""
        # TODO
        if not self.uuid or self.type is None:
            raise BadRequestException("bad request")
        return True


class ResultGetLog(BaseStruct):
    """Query parameters for reading a window of a result log."""

    KEYS = ["uuid", "type", "line_index", "line_size"]

    def init(self, **kwargs):
        """
        Populate fields and coerce them to int.

        ``line_index`` defaults to 1 and ``line_size`` to 0 when empty.
        """
        super().init(**kwargs)
        if self.type is not None:
            self.type = int(self.type)
        self.line_index = int(self.line_index) if self.line_index else 1
        self.line_size = int(self.line_size) if self.line_size else 0

    def is_valid(self):
        """Require uuid and type; return True when both are present."""
        # TODO
        if not self.uuid or self.type is None:
            raise BadRequestException("bad request")
        return True


class ResultLogResponse(BaseStruct):
    """Response payload for a result-log query."""

    KEYS = ["uuid", "logs", "line_index", "line_size", "line_count"]


class ResultGetFile(BaseStruct):
    """Query parameters for fetching result files."""

    KEYS = ["uuid", "type"]

    def init(self, **kwargs):
        """Populate fields and coerce ``type`` to int when given."""
        super().init(**kwargs)
        if self.type is not None:
            self.type = int(self.type)

    def is_valid(self):
        """Require uuid and type; return True when both are present."""
        # TODO
        if not self.uuid or self.type is None:
            raise BadRequestException("bad request")
        return True


class ResultFileResponse(BaseStruct):
    """Response payload for a result-file query."""

    KEYS = ["uuid", "files"]
@cron_app_app.route("/cron-job", methods=["POST"])
@exception_decorate
def post_jobs():
    """Create a cron job from the JSON request body."""
    create_req = CronJobCreate(request.json)
    create_req.is_valid()

    data, _ = CronJobWorker.Post(info=create_req)
    return json_response(data=data), 200


@cron_app_app.route("/cron-job", methods=["PATCH"])
@exception_decorate
def update_jobs():
    """Update a cron job from the JSON request body."""
    update_req = CronJobUpdate(request.json)
    update_req.is_valid()

    data, _ = CronJobWorker.Update(info=update_req)
    return json_response(data=data), 200


@cron_app_app.route("/cron-job", methods=["DELETE"])
@exception_decorate
def delete_jobs():
    """Delete the cron jobs named in the JSON request body."""
    delete_req = CronJobDelete(request.json)
    delete_req.is_valid()

    data, _ = CronJobWorker.Delete(info=delete_req)
    return json_response(data=data), 200


@cron_app_app.route("/cron-job", methods=["GET"])
@exception_decorate
def get_jobs():
    """List cron jobs selected by the query-string parameters."""
    query = CronJobFilter(**request.args)

    jobs, page = CronJobWorker.Get(info=query)
    return json_response(data=jobs, page=page.to_json()), 200
import json_response, exception_decorate -from common.const import API_PREFIX from eval_lib.common import logger from eval_lib.common.exceptions import BadRequestException from eval_lib.model.const import RESULT_TYPE_LOG_RAW, RESULT_TYPE_PERFORMANCE_MD from service.result import ResultWorker -result_app = Blueprint('result_app', __name__, url_prefix=API_PREFIX) +result_app = Blueprint('result_app', __name__) log = logger.get_logger() diff --git a/evaluation/eval-controller/eval-controller/server/server.py b/evaluation/eval-controller/eval-controller/server/server.py index f5ccdc366dbe0dfdb023078f2e38f218d7c24fac..d8075f198e12dec120b4929a5767800a342d7f0c 100644 --- a/evaluation/eval-controller/eval-controller/server/server.py +++ b/evaluation/eval-controller/eval-controller/server/server.py @@ -4,14 +4,21 @@ from eval_lib.common import logger from .auto_test import auto_test_app from .result import result_app from .dictionary import dictionary_app +from .cron_job import cron_app_app from eval_lib.databases.mysql.db import db from config import conf +from common.const import API_PREFIX, FORWOAD_API_PREFIX +from service.cron_job import CronJobWorker app = Flask(__name__) log = logger.get_logger() -app.register_blueprint(auto_test_app) -app.register_blueprint(result_app) -app.register_blueprint(dictionary_app) +app.register_blueprint(auto_test_app, url_prefix=API_PREFIX) +app.register_blueprint(result_app, url_prefix=API_PREFIX) +app.register_blueprint(dictionary_app, url_prefix=API_PREFIX) +app.register_blueprint(auto_test_app, url_prefix=FORWOAD_API_PREFIX) +app.register_blueprint(result_app, url_prefix=FORWOAD_API_PREFIX) +app.register_blueprint(dictionary_app, url_prefix=FORWOAD_API_PREFIX) +app.register_blueprint(cron_app_app, url_prefix=FORWOAD_API_PREFIX) @app.before_request @@ -33,4 +40,5 @@ class ServerProcess(Process): super().__init__() def run(self): + CronJobWorker.init() app.run(host="0.0.0.0", port=conf.listen_port) diff --git 
a/evaluation/eval-controller/eval-controller/service/auto_test.py b/evaluation/eval-controller/eval-controller/service/auto_test.py index d971af91f1ff1bf7a9d85e0e23c30ee6c93b1dbe..c99777873001a9beefbbb9b11a61f663619b7cb0 100644 --- a/evaluation/eval-controller/eval-controller/service/auto_test.py +++ b/evaluation/eval-controller/eval-controller/service/auto_test.py @@ -9,11 +9,11 @@ from eval_lib.common.exceptions import BadRequestException from eval_lib.common import logger from eval_lib.model import const as model_const from eval_lib.model.base import CaseParams -from eval_lib.databases.mysql.models.models import CaseRecord -from eval_lib.databases.mysql import const as db_const +from eval_lib.model.mysql.models import CaseRecord +from eval_lib.model.mysql import const as db_const from config import conf -from common.model import AutoTestCreate, AutoTestUpdate, AutoTestDelete, AutoTestFilter +from schema.auto_test import AutoTestCreate, AutoTestUpdate, AutoTestDelete, AutoTestFilter log = logger.get_logger() POST_TIMEOUT = 10 @@ -110,17 +110,18 @@ class AutoTest(object): for cr in crs: if cr.case_name == info.case_name: raise BadRequestException( - "{info.case_name} already in running" + f"{info.case_name} already in running" ) # 创建一个新的测试用例记录,并保存到数据库 msg = CaseParams(info.to_json()) cr = CaseRecord( uuid=msg.uuid, name=f"{msg.case_name}-{msg.uuid[:8]}", case_name=msg.case_name, process_num=msg.process_num, - agent_type=msg.agent_type, + agent_type=msg.agent_type, case_variables=msg.case_variables, + agent_config=msg.agent_config,runner_image_tag=msg.runner_image_tag, + user=info.user,cron_id=info.cron_id, status=db_const.CASE_RECORD_STATUS_INIT, - created_at=datetime.datetime.now() + - datetime.timedelta(hours=8) + created_at=datetime.datetime.now() + datetime.timedelta(hours=8) ) cr.save() log.info(f"put msg to manager: {msg}") diff --git a/evaluation/eval-controller/eval-controller/service/cron_job.py 
import threading
import uuid
import json
import requests

from apscheduler.schedulers.background import BackgroundScheduler

from schema.cron_job import CronJobCreate, CronJobUpdate, CronJobDelete, CronJobFilter, Cron
from eval_lib.common import logger
from eval_lib.model.mysql.models import CronJob
from eval_lib.model.mysql import const as db_const
from schema.auto_test import AutoTestCreate
from common.utils import Paginator

log = logger.get_logger()
CRON_JOB_LOCK = threading.RLock()
# Upper bound (seconds) for the auto-test HTTP call, so that a stuck
# controller cannot block a scheduler worker thread forever.
AUTO_TEST_POST_TIMEOUT = 10


class CronJobService(object):
    """Keeps the persisted CronJob rows and the in-process APScheduler in sync.

    ``scheds`` maps a CronJob uuid to the id of the APScheduler job that was
    registered for it.
    """

    def __init__(self):
        self.scheduler = None
        self.scheds = {}

    @staticmethod
    def _is_active(value) -> bool:
        """Return True when ``value`` denotes an active job (1, "1", True)."""
        try:
            return int(value) == 1
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _load_variables(cj: CronJob) -> dict:
        """Decode ``cj.variables`` (stored as a JSON string) into a dict."""
        raw = cj.variables
        if not raw:
            return {}
        if isinstance(raw, dict):
            return raw
        loaded = json.loads(raw)
        return loaded if isinstance(loaded, dict) else {}

    def init(self):
        """Start the background scheduler and register every active job."""
        self.scheduler = BackgroundScheduler()
        self.scheduler.start()
        cjs, _ = self.Get()
        for cj in cjs:
            if self._is_active(cj.active):
                self.add_job(cj)

    def Post(self, info: CronJobCreate):
        """Create and persist a new cron job.

        The job is registered with the scheduler only when it is active,
        matching what ``init`` does on start-up.

        :param info: name, cron expression, variables, branch and active flag.
        :return: ``(records, page)`` as returned by ``Get`` for the new uuid.
        """
        with CRON_JOB_LOCK:
            job_uuid = str(uuid.uuid4())
            cj = CronJob(
                uuid=job_uuid, name=info.name, cron=info.cron,
                variables=json.dumps(info.variables.to_json()),
                branch=info.branch, active=info.active
            )
            cj.save()
            if self._is_active(cj.active):
                self.add_job(cj)
            return self.Get(CronJobFilter(uuid=job_uuid))

    def Update(self, info: CronJobUpdate):
        """Update a cron job and re-align its scheduler registration.

        The row is updated first and then re-read, so that the scheduled
        callable is built from the new cron expression / variables.

        :param info: fields to change; ``uuid`` selects the row.
        :return: ``(records, page)`` for ``info.uuid``; ``records`` is empty
            when no such (non-deleted) job exists.
        """
        with CRON_JOB_LOCK:
            data = info.to_json()
            if info.variables:
                data["variables"] = json.dumps(info.variables.to_json())
            # uuid is the selector, not a column to overwrite.
            data.pop("uuid", None)
            # PATCH semantics: fields the caller did not set stay untouched.
            data = {k: v for k, v in data.items() if v is not None}
            if data:
                # Compare against the job's uuid, not the ``uuid`` module.
                CronJob.update(**data).where(
                    CronJob.uuid == info.uuid
                ).execute()

            # Get returns (records, page); unpack before using the record.
            jobs, _ = self.Get(CronJobFilter(uuid=info.uuid))
            if jobs:
                cj = jobs[0]
                if cj.uuid in self.scheds:
                    self.remove_job(cj.uuid)
                if self._is_active(cj.active):
                    self.add_job(cj)
            elif info.uuid in self.scheds:
                # Row is gone or deleted: do not keep firing it.
                self.remove_job(info.uuid)

            return self.Get(CronJobFilter(uuid=info.uuid))

    def Delete(self, info: CronJobDelete):
        """Soft-delete the given jobs and unschedule them.

        :param info: carries ``uuids``, the list of job uuids to delete.
        :return: ``(records, page)`` with all remaining jobs.
        """
        with CRON_JOB_LOCK:
            for job_uuid in info.uuids:
                if job_uuid in self.scheds:
                    self.remove_job(job_uuid)
                # Mark exactly this job as deleted.
                CronJob.update(deleted=db_const.CRON_JOB_DELETED).where(
                    CronJob.uuid == job_uuid
                ).execute()
            return self.Get()

    def Get(self, info: CronJobFilter = None) -> tuple:
        """Query cron jobs, newest first.

        :param info: optional filter and pagination settings. Without it all
            non-deleted jobs are returned.
        :return: ``(records, page)`` where ``records`` is a list of CronJob
            rows and ``page`` is a Paginator.
        """
        not_deleted_where_clause = (
            CronJob.deleted == db_const.CRON_JOB_NOT_DELETED
        )
        order_by = CronJob.created_at.desc()

        if not info:
            cjs = CronJob.select().where(
                not_deleted_where_clause
            ).order_by(order_by)
            return [cj for cj in cjs], Paginator()

        json_where = info.to_json()
        where_clause = CronJob.visible_where_clause(json_where)
        # Hide deleted rows unless the caller explicitly filters on "deleted".
        if not json_where.get("deleted"):
            if where_clause is None:
                where_clause = not_deleted_where_clause
            else:
                where_clause = (where_clause) & (not_deleted_where_clause)

        # Paginate over the filtered query so the page metadata describes the
        # rows that are actually returned.
        page = Paginator(
            CronJob.select().where(where_clause),
            info.page_index, info.page_size
        )
        cjs = CronJob.select().where(where_clause).order_by(order_by).limit(
            page.limit
        ).offset(page.offset)
        return [cj for cj in cjs], page

    def send_auto_test_msg(self, cj: CronJob):
        """Build the callable the scheduler runs for ``cj``.

        The callable posts an auto-test creation request to the local
        controller API and raises when the response status is not 200.
        """

        def _send_auto_test_msg():
            variables = self._load_variables(cj)
            msg = AutoTestCreate(
                case_name=variables.get("case_name", ""),
                process_num=variables.get("process_num", 1),
                runner_image_tag=cj.branch,
                agent_type=variables.get("agent_type", ""),
                agent_config=variables.get("agent_config", {}),
                case_variables=variables.get("case_variables", {}),
                user=cj.user, cron_id=cj.id
            )
            log.info(f"cron job auto-test msg: {msg}")
            url = "http://localhost:10083/v1/evaluation/auto-test"
            body = msg.to_json()
            headers = {"Content-Type": "application/json"}
            res = requests.post(
                url, data=json.dumps(body), headers=headers,
                timeout=AUTO_TEST_POST_TIMEOUT
            )
            if res.status_code == 200:
                log.info(f"Cron Job Post Auto-Test Success {res.json()}")
            else:
                raise Exception(
                    f"Cron Job Post Auto-Test error: {res.status_code}"
                )

        return _send_auto_test_msg

    def add_job(self, cj: CronJob):
        """Register ``cj`` with the scheduler, replacing any previous entry."""
        if cj.uuid in self.scheds:
            # Avoid leaving an orphaned scheduler job behind.
            self.remove_job(cj.uuid)
        cron = Cron(cron=str(cj.cron))
        # NOTE(review): APScheduler's ``week`` is the ISO week number; if
        # Cron.week holds the crontab day-of-week field this should be
        # ``day_of_week`` - confirm against schema.cron_job.Cron.
        job_id = self.scheduler.add_job(
            self.send_auto_test_msg(cj), 'cron', minute=cron.minute,
            hour=cron.hour, day=cron.day, month=cron.month, week=cron.week
        ).id
        self.scheds[cj.uuid] = job_id

    def remove_job(self, job_uuid):
        """Unregister the scheduler job recorded for ``job_uuid``, if any."""
        if job_uuid not in self.scheds:
            log.warning(f"cron job uuid({job_uuid}) not in scheds")
            return
        job_id = self.scheds[job_uuid]
        self.scheduler.remove_job(job_id)
        del self.scheds[job_uuid]


CronJobWorker = CronJobService()
from .redis_db import RedisDB
import redis
import time

KEY_NAME_DF_ENV_NAMES = "df-env-names"
DF_ENV_TIMEOUT = 3600 * 24
ENVS_LIST_KEY = "env_uuids"


class DFEnvInfo(RedisDB):
    """Redis-backed registry of test environments, grouped by prefix.

    Layout used by the methods below:
      * list ``env_uuids``                 -> known prefixes
      * list ``<prefix>-df-env-names``     -> env names under a prefix
      * hash ``<env name>``                -> metadata of one env
    """

    @staticmethod
    def _touch(conn, env_name):
        """Store the current epoch second as the env's ``updated_time``."""
        conn.hset(env_name, "updated_time", int(time.time()))

    def get_envs_key_info(self):
        '''
        Return every prefix with the env names registered under it:
        [{
            "prefix": "12346",
            "names": [$env_name]
        },]
        '''
        envs = []
        conn = redis.Redis(connection_pool=self.conn_pool)
        envs_prefixs = conn.lrange(ENVS_LIST_KEY, 0, -1)
        for prefix in envs_prefixs:
            env = {}
            env["prefix"] = prefix.decode('utf-8')
            list_key_name = f"{env['prefix']}-{KEY_NAME_DF_ENV_NAMES}"
            env_names = conn.lrange(list_key_name, 0, -1)
            env["names"] = [name.decode('utf-8') for name in env_names]
            envs.append(env)
        return envs

    def get_prefix_envs(self, prefix=""):
        """Return ``{env_name: metadata}`` for the envs under ``prefix``.

        Names whose hash no longer exists (e.g. expired) are skipped.
        """
        list_key_name = f"{prefix}-{KEY_NAME_DF_ENV_NAMES}"
        conn = redis.Redis(connection_pool=self.conn_pool)
        env_names = conn.lrange(list_key_name, 0, -1)
        envs = {}
        for name in env_names:
            name = name.decode('utf-8')
            env = conn.hgetall(name)
            if env:
                env_decode = {}
                for k, v in env.items():
                    env_decode[k.decode('utf-8')] = v.decode('utf-8')
                envs[name] = env_decode
        return envs

    def init_envs(self, envs, prefix=""):
        """
        Register (or refresh) envs under ``prefix``.

        envs = [{
            "name": "df-ce-0",
            "status": "0",
            "mgt_ip": "10.1.19.1",
            "updated_time": "1600000000",
            "type": "deepflow",
        }]
        """
        conn = redis.Redis(connection_pool=self.conn_pool)
        list_key_name = f"{prefix}-{KEY_NAME_DF_ENV_NAMES}"
        env_names_key = conn.lrange(list_key_name, 0, -1)
        for env in envs:
            env_name: str = env['name']
            # env list
            if env_name.encode('utf-8') not in env_names_key:
                conn.lpush(list_key_name, env_name)
            # env metadata
            conn.hmset(env_name, env)
            # env timeout
            conn.expire(env_name, DF_ENV_TIMEOUT)
            # List timeout reset
            conn.expire(list_key_name, DF_ENV_TIMEOUT)
        envs_prefixs = conn.lrange(ENVS_LIST_KEY, 0, -1)
        if prefix.encode('utf-8') not in envs_prefixs:
            conn.lpush(ENVS_LIST_KEY, prefix)

    def update(self, env_name, info: dict):
        """Write the fields of ``info`` that differ from what is stored.

        ``updated_time`` is refreshed only when at least one field changed.
        """
        changed = False
        conn = redis.Redis(connection_pool=self.conn_pool)
        for k, v in info.items():
            current = conn.hget(env_name, k)
            # hget yields bytes (or None); normalise before comparing,
            # otherwise every field always looks modified.
            if isinstance(current, bytes):
                current = current.decode('utf-8')
            if current != str(v):
                conn.hset(env_name, k, v)
                changed = True
        if changed:
            self._touch(conn, env_name)

    def update_env_reserved(self, env_name, reserved):
        """Set the ``reserved`` field and refresh ``updated_time``."""
        conn = redis.Redis(connection_pool=self.conn_pool)
        conn.hset(env_name, "reserved", reserved)
        self._touch(conn, env_name)

    def update_env_status(self, env_name, status):
        """Set the ``status`` field and refresh ``updated_time``."""
        conn = redis.Redis(connection_pool=self.conn_pool)
        conn.hset(env_name, "status", status)
        # Store an absolute timestamp, like update()/update_env_reserved();
        # an elapsed delta would corrupt get_update_time().
        self._touch(conn, env_name)

    def update_env_concurrency(self, env_name, concurrency):
        """Set the ``concurrency`` field and refresh ``updated_time``."""
        conn = redis.Redis(connection_pool=self.conn_pool)
        conn.hset(env_name, "concurrency", concurrency)
        self._touch(conn, env_name)

    def get_update_time(self, env_name):
        """Return the stored ``updated_time`` as an int.

        Raises TypeError when the env has no ``updated_time`` field.
        """
        conn = redis.Redis(connection_pool=self.conn_pool)
        updated = conn.hget(env_name, "updated_time")
        return int(updated)

    def delete_env(self, prefix):
        """Remove every env under ``prefix``, its name list and the prefix."""
        print(f"delete env {prefix}")
        conn = redis.Redis(connection_pool=self.conn_pool)
        # get envs list
        list_key_name = f"{prefix}-{KEY_NAME_DF_ENV_NAMES}"
        env_names = conn.lrange(list_key_name, 0, -1)
        for name in env_names:
            name = name.decode('utf-8')
            conn.delete(name)

        # delete env key list
        conn.delete(list_key_name)

        # delete env uuid
        conn.lrem(ENVS_LIST_KEY, 0, prefix)

    def delete_by_instance_name(self, prefix, instance_name):
        """Remove one env (hash and list entry) from ``prefix``."""
        list_key_name = f"{prefix}-{KEY_NAME_DF_ENV_NAMES}"
        conn = redis.Redis(connection_pool=self.conn_pool)
        env_names = conn.lrange(list_key_name, 0, -1)
        for name in env_names:
            name = name.decode('utf-8')
            if name == instance_name:
                conn.delete(name)
                conn.lrem(list_key_name, 0, instance_name)

    def clear(self):
        """Flush the whole Redis database this pool points at."""
        conn = redis.Redis(connection_pool=self.conn_pool)
        conn.flushdb()
b/evaluation/eval-lib/databases/redis/runner_info.py @@ -2,28 +2,17 @@ from .redis_db import RedisDB import redis from . import const - +GLOBAL_LOCK = "get_runner_info" class RedisRunnerInfo(RedisDB): - def __init__( - self, host, port, - password, db, - max_connections, - ) -> None: - super().__init__( - host, port, - password, db, - max_connections, - ) - def init_runner_info(self, uuid): runner_info = { "uuid": uuid, "case-control-status": const.CASE_STATUS_RUNNING, - "runner-status": const.CASE_STATUS_INIT, + "runner-status": const.RUNNER_STATUS_INIT, "case-status": const.CASE_STATUS_INIT, } conn = redis.Redis(connection_pool=self.conn_pool) @@ -31,64 +20,104 @@ class RedisRunnerInfo(RedisDB): conn.hmset(runner_key_name, runner_info) # runner timeout conn.expire(runner_key_name, const.RUNNER_TIMEOUT) - + def update_runner_info(self, uuid, info: dict): key_name = f"{const.RUNNER_KEY}-{uuid}" - lock = self.acquire_lock(const.GLOBAL_LOCK) + lock = self.acquire_lock(GLOBAL_LOCK) conn = redis.Redis(connection_pool=self.conn_pool) - # update = False for k, v in info.items(): if conn.hget(key_name, k) != v: conn.hset(key_name, k, v) - # update = True - # if update: - # updated = int(time.time()) - # conn.hset(key_name, "updated_time", updated) - self.release_lock(const.GLOBAL_LOCK, lock) + self.release_lock(GLOBAL_LOCK, lock) def get_runner_info(self, uuid) -> dict: key_name = f"{const.RUNNER_KEY}-{uuid}" - lock = self.acquire_lock(const.GLOBAL_LOCK) + lock = self.acquire_lock(GLOBAL_LOCK) runner_info = {} conn = redis.Redis(connection_pool=self.conn_pool) hash_all = conn.hgetall(key_name) if hash_all: for k, v in hash_all.items(): runner_info[k.decode()] = v.decode() - self.release_lock(const.GLOBAL_LOCK, lock) + self.release_lock(GLOBAL_LOCK, lock) return runner_info def delete_runner_info(self, uuid): key_name = f"{const.RUNNER_KEY}-{uuid}" - lock = self.acquire_lock(const.GLOBAL_LOCK) + lock = self.acquire_lock(GLOBAL_LOCK) conn = 
def sync_case_status(self, uuid) -> str:
    '''
    Align "case-status" with "case-control-status" for one runner.

    Method of RedisRunnerInfo. Reads both fields of the runner hash under
    the global lock and overwrites "case-status" when they differ.

    return: the case status after the sync (decoded), or None when the
            runner hash has no "case-control-status" to sync from.
    '''
    key_name = f"{const.RUNNER_KEY}-{uuid}"
    lock = self.acquire_lock(GLOBAL_LOCK)
    try:
        conn = redis.Redis(connection_pool=self.conn_pool)
        case_status = conn.hget(key_name, "case-status")
        case_control_status = conn.hget(key_name, "case-control-status")
        # Nothing to sync from (runner info missing or expired). Writing
        # None would be rejected by the client and decoding it would raise.
        if case_control_status is None:
            return None
        if case_status != case_control_status:
            conn.hset(key_name, "case-status", case_control_status)
    finally:
        # Always hand the lock back, even when a Redis call fails.
        self.release_lock(GLOBAL_LOCK, lock)
    return case_control_status.decode()
redis.Redis(connection_pool=self.conn_pool) conn.hset(key_name, "case-control-status", const.CASE_STATUS_FORCE_END) - self.release_lock(const.GLOBAL_LOCK, lock) \ No newline at end of file + self.release_lock(GLOBAL_LOCK, lock) + + def set_runner_complete(self, uuid): + key_name = f"{const.RUNNER_KEY}-{uuid}" + lock = self.acquire_lock(GLOBAL_LOCK) + conn = redis.Redis(connection_pool=self.conn_pool) + conn.hset(key_name, "runner-status", const.RUNNER_STATUS_COMPLETED) + self.release_lock(GLOBAL_LOCK, lock) + + def set_case_complete(self, uuid): + key_name = f"{const.RUNNER_KEY}-{uuid}" + lock = self.acquire_lock(GLOBAL_LOCK) + conn = redis.Redis(connection_pool=self.conn_pool) + conn.hset(key_name, "case-status", const.CASE_STATUS_COMPLETED) + self.release_lock(GLOBAL_LOCK, lock) + + def set_case_running(self, uuid): + key_name = f"{const.RUNNER_KEY}-{uuid}" + lock = self.acquire_lock(GLOBAL_LOCK) + conn = redis.Redis(connection_pool=self.conn_pool) + conn.hset(key_name, "case-status", const.CASE_STATUS_RUNNING) + self.release_lock(GLOBAL_LOCK, lock) + + def set_case_end(self, uuid): + key_name = f"{const.RUNNER_KEY}-{uuid}" + lock = self.acquire_lock(GLOBAL_LOCK) + conn = redis.Redis(connection_pool=self.conn_pool) + conn.hset(key_name, "case-status", const.CASE_STATUS_FORCE_END) + self.release_lock(GLOBAL_LOCK, lock) + \ No newline at end of file diff --git a/evaluation/eval-lib/model/base.py b/evaluation/eval-lib/model/base.py index 516ebe4eba0f2776b1e03f0e0c9463f358ff6ba6..3e204a58fb0b69063ec2fad7033de7292f966399 100644 --- a/evaluation/eval-lib/model/base.py +++ b/evaluation/eval-lib/model/base.py @@ -28,28 +28,42 @@ class BaseStruct: return " ".join([f"{key}:{getattr(self, key)}" for key in self.KEYS]) def to_json(self): - return {key: getattr(self, key) for key in self.KEYS} + data = {} + for key in self.KEYS: + value = getattr(self, key) + if isinstance(value, BaseStruct): + value = value.to_json() + data[key] = value + return data def keys(self): yield 
from typing import Union
from peewee import PrimaryKeyField
from peewee import Model

from ...databases.mysql.db import db
from ..base import BaseStruct


class BaseModel(Model):
    """Common base for the MySQL models: primary key, JSON dump, filters."""

    id = PrimaryKeyField()

    class Meta:
        database = db

    def to_json(self):
        """Return ``{COLUMN_NAME_UPPERCASE: value}`` for every field."""
        serialized = {}
        for field in self._meta.sorted_fields:
            column = field.column_name
            serialized[column.upper()] = self._get_trans_value(column)
        return serialized

    def _get_trans_value(self, key):
        """Value used by ``to_json`` for ``key``; subclasses may translate it."""
        return getattr(self, key, None)

    @classmethod
    def visible_where_clause(cls, filter: Union[dict, BaseStruct], **kwargs):
        """
        AND together all conditions derived from ``filter`` and ``kwargs``.

        :param filter: dict or BaseStruct holding the filter values.
        :param kwargs: extra filter values, handled the same way.
        :return: the combined expression, or None when nothing applies.
        """
        combined = None
        # Conditions from ``filter`` come first, then those from ``kwargs``.
        for source in (filter, kwargs):
            for clause in cls.where_clause(source):
                combined = clause if combined is None else (combined) & clause
        return combined

    @classmethod
    def where_clause(cls, filter):
        """
        Yield one query condition per usable entry of ``filter``.

        * entries whose value is None are ignored;
        * a key naming a class attribute gives ``==`` (or ``IN`` for a list);
        * a key that is such a name plus a trailing "s" gives ``IN``;
        * any other key is ignored.
        """
        for key in filter.keys():
            value = filter.get(key)
            if value is None:
                continue
            if hasattr(cls, key):
                field = getattr(cls, key)
                if isinstance(value, list):
                    yield field.in_(value)
                else:
                    yield field == value
                continue
            # Plural form: "uuids" filters on the "uuid" attribute.
            if key[-1] == "s" and hasattr(cls, key[:-1]):
                values = value if isinstance(value, list) else [value]
                yield getattr(cls, key[:-1]).in_(values)
import datetime

from peewee import CharField, DateTimeField, IntegerField, TextField

from .base import BaseModel
from .const import COMPONENT_TYPE_UNKNOWN, COMPONENT_TYPE_DF_AGENT, COMPONENT_TYPE_DF_SERVER
from ...databases.mysql.db import db
from ...source.dictonary import Dictionary

# peewee expects a list of strptime patterns here.
CREATED_AT_FORMATS = ['%Y-%m-%d %H:%M:%S']


def _created_at_default():
    """Row creation time: local ``now()`` shifted by +8 hours.

    Passed to peewee as a callable so it is evaluated for every new row
    instead of once when this module is imported.
    """
    return datetime.datetime.now() + datetime.timedelta(hours=8)


class CaseRecord(BaseModel):
    """
    One execution record of a test case.

    Fields:
        uuid: unique identifier of the record, max 64 chars, required
        name: display alias of the record, max 64 chars, required
        case_name: name of the test case, max 64 chars, required
        process_num: number of processes used for the run, required
        agent_type: agent type, max 64 chars, required
        user: user who started the run, optional
        runner_image_tag: runner image tag, max 64 chars, optional
        case_variables: case variables as text, optional
        agent_config: agent configuration as text, optional
        status: execution status, required
        deleted: deletion flag, required, defaults to 0
        cron_id: id of the cron job that triggered the run, optional
        created_at: creation time, filled in when the row is created
    """

    uuid = CharField(max_length=64, unique=True, null=False)
    name = CharField(max_length=64, null=False)
    case_name = CharField(max_length=64, null=False)
    process_num = IntegerField(null=False)
    agent_type = CharField(max_length=64, null=False)
    user = CharField(max_length=64, null=True)
    runner_image_tag = CharField(max_length=64, null=True)
    case_variables = TextField(null=True)
    agent_config = TextField(null=True)
    status = IntegerField(null=False)
    deleted = IntegerField(null=False, default=0)
    cron_id = IntegerField(null=True)
    created_at = DateTimeField(
        formats=CREATED_AT_FORMATS,
        default=_created_at_default
    )

    class Meta:
        table_name = 'case_record'
        database = db

    def _get_trans_value(self, key):
        """Translate ``case_name`` / ``agent_type`` through the dictionaries.

        Falls back to the raw stored value when no translation exists.
        """
        value = super()._get_trans_value(key)
        if key == "case_name":
            if value:
                trans_case_name = Dictionary().CASE_DICTIONARY.get(value)
                if not trans_case_name:
                    trans_case_name = Dictionary(
                    ).CASE_GROUP_DICTIONARY.get(value)
                if trans_case_name:
                    return trans_case_name[1]
        elif key == "agent_type":
            if value:
                trans_value = Dictionary().AGENT_TYPE_DICTIONARY.get(value)
                if trans_value:
                    return trans_value[0]
        return value


class CaseReport(BaseModel):
    # Not in use yet.
    """
    Report produced for a test case.

    Fields:
        case_uuid: uuid of the test case record, max 64 chars, required
        report_path: path of the report, max 64 chars, required
        created_at: creation time, filled in when the row is created
    """

    case_uuid = CharField(max_length=64, null=False)
    report_path = CharField(max_length=64, null=False)
    created_at = DateTimeField(
        formats=CREATED_AT_FORMATS,
        default=_created_at_default
    )

    class Meta:
        table_name = 'case_report'
        database = db


class Component(BaseModel):
    # Not in use yet.
    """
    Component taking part in a test case.

    Fields:
        case_uuid: uuid of the related test case record, max 64 chars, required
        name: component name, max 64 chars, required
        type: component type (one of the COMPONENT_TYPE_* constants), required
        config: component configuration, max 1024 chars, optional
        commit_id: commit id of the component, optional
        image_tag: image tag of the component, optional
        created_at: creation time, filled in when the row is created
    """

    case_uuid = CharField(max_length=64, null=False)
    name = CharField(max_length=64, null=False)
    type = IntegerField(
        null=False, default=COMPONENT_TYPE_UNKNOWN, choices=[
            COMPONENT_TYPE_UNKNOWN, COMPONENT_TYPE_DF_AGENT,
            COMPONENT_TYPE_DF_SERVER
        ]
    )
    config = CharField(max_length=1024, null=True)
    commit_id = CharField(null=True)
    image_tag = CharField(null=True)
    created_at = DateTimeField(
        formats=CREATED_AT_FORMATS,
        default=_created_at_default
    )


class CronJob(BaseModel):
    """
    A cron-style scheduled task.

    Fields:
        uuid: unique identifier of the job, required
        name: job name, required
        variables: job variables as text, optional
        branch: branch the job runs against, required
        cron: cron expression, required
        user: owner of the job, optional
        active: activation flag, required, defaults to 0 (inactive)
        deleted: deletion flag, required, defaults to 0
        created_at: creation time, filled in when the row is created
    """
    uuid = CharField(max_length=64, unique=True, null=False)
    name = CharField(null=False)
    variables = TextField(null=True)
    branch = CharField(null=False)
    cron = CharField(null=False)
    user = CharField(max_length=64, null=True)
    active = IntegerField(null=False, default=0)
    deleted = IntegerField(null=False, default=0)
    created_at = DateTimeField(
        formats=CREATED_AT_FORMATS,
        default=_created_at_default
    )
--trusted-host mirrors.aliyun.com --index-url https://mirrors.aliyun.com/pypi/simple/ @@ -40,6 +43,8 @@ RUN --mount=type=bind,target=/temp,from=builder-venv-custom,source=/ \ cp -raf /temp/usr/lib/${BUILD_ARCH}-linux-gnu/libmariadb3/ /usr/lib/${BUILD_ARCH}-linux-gnu/libmariadb3/ && \ cp -raf /temp/usr/lib/${BUILD_ARCH}-linux-gnu/libmysql* /usr/lib/${BUILD_ARCH}-linux-gnu/ && \ cp -raf /temp/root/venv /root/venv && \ + cp -raf /temp/usr/local/allure-2.18.1 /usr/local/ && \ + echo -e 'export ALLURE=/usr/local/allure-2.18.1/bin\nexport PATH=$PATH:$ALLURE' >> /etc/profile && \ apt-get update && \ apt-get install --no-install-suggests \ --no-install-recommends --yes \ @@ -48,4 +53,4 @@ RUN --mount=type=bind,target=/temp,from=builder-venv-custom,source=/ \ COPY ./etc/eval-runner.yaml /etc/ COPY ./eval-runner/ /root/eval-runner/ # Run -CMD /root/venv/bin/python3 -u /root/runner/eval-runner.py \ No newline at end of file +CMD ["/bin/bash", "-c", "source /root/venv/bin/activate && python3 -u /root/eval-runner/eval-runner.py"] \ No newline at end of file diff --git a/evaluation/eval-runner/etc/eval-runner.yaml b/evaluation/eval-runner/etc/eval-runner.yaml index 9f747f6371b876157df33d1b127260bb3e5f5a11..71c04b0356b041e8b28c4d3d0b926b418206fb6b 100644 --- a/evaluation/eval-runner/etc/eval-runner.yaml +++ b/evaluation/eval-runner/etc/eval-runner.yaml @@ -1,8 +1,8 @@ listen_port: 10083 # HTTP Listen Port runner_data_dir: "" -global_ssh_port: 22 -global_ssh_username: root -global_ssh_password: "" +runner_ssh_port: 22 +runner_ssh_username: root +runner_ssh_password: "" case_params: uuid: @@ -52,4 +52,5 @@ redis: host: port: password: - db: \ No newline at end of file + db: + redis_max_connections: \ No newline at end of file diff --git a/evaluation/eval-runner/eval-runner/agent_tools/agent_manage.py b/evaluation/eval-runner/eval-runner/agent_tools/agent_manage.py new file mode 100644 index 0000000000000000000000000000000000000000..aba3678a3f5b7a0f95b9ed1ba482d714593e15dd --- 
/dev/null +++ b/evaluation/eval-runner/eval-runner/agent_tools/agent_manage.py @@ -0,0 +1,43 @@ +from typing import List + +from common import const +from eval_lib.common.logger import get_logger +from config import conf +from common.parse_config import ParseConfig +from agent_tools.deepflow_agent.deepflow_agent import DeeepflowAgent +from agent_tools.base import AgentBase + +log = get_logger() + + +class AgentManage(): + + def __init__(self): + self.agents: List[AgentBase] = [] + + def create_agent(self, case_type=const.CASE_TYPE_CONCURRENCY) -> AgentBase: + agent_type = conf.case_params.agent_type + if agent_type == 'deepflowce': + agent = DeeepflowAgent() + if not hasattr(self, "server_meta"): + self.server_meta = ParseConfig.get_df_server_meta( + case_type=case_type) + log.info( + f"deploy deepflow-agent, deepflow-server meta: {self.server_meta}" + ) + agent.init_df_server(self.server_meta) + self.agents.append(agent) + return agent + else: + # 如果没有选择有效的 agent,则记录错误 + log.error("Invalid agent type specified.") + assert False + + def clear_all_agent(self): + for agent in self.agents: + agent.clear() + self.agents = [] + agent_type = conf.case_params.agent_type + if agent_type == 'deepflowce' and hasattr(self, "server_meta"): + ParseConfig.release_df_env(self.server_meta.server_ip) + log.info(f"freed df env {self.server_meta.to_json()}") diff --git a/evaluation/eval-runner/eval-runner/agent_tools/base.py b/evaluation/eval-runner/eval-runner/agent_tools/base.py index 1278832ad1edac2a776579f529f566cb398b7bb6..7cc2cc9429ff30bf9726666dc1a8ce9e3080d5b0 100644 --- a/evaluation/eval-runner/eval-runner/agent_tools/base.py +++ b/evaluation/eval-runner/eval-runner/agent_tools/base.py @@ -1,6 +1,6 @@ from eval_lib.common.ssh import SSHPool from common.module import AgentMeta -from common.config import conf +from config import conf class Base(object): @@ -23,9 +23,10 @@ class Base(object): self.agent_ip = meta.agent_ip self.agent_version = meta.version self.agent_name = 
agent_name - self.agent_type = conf.case_params.agent_type + self.deploy_type = None # 配置文件中,agent_tools包含的参数 - self.custom_param: dict = conf.agent_tools[self.agent_type] + agent_type = conf.case_params.agent_type + self.custom_param: dict = conf.agent_tools[agent_type] class AgentBase(Base): @@ -33,6 +34,7 @@ class AgentBase(Base): def __init__(self) -> None: super().__init__() + # ----------------以下属性待继承类副职---------------- # 1.采集器进程名,用于telegraf获取进程负载数据 diff --git a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_agent.py b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_agent.py index 60e628e5110ee1ba74b45a4d1574d7ef696b0042..c4294da85b7b8753030b32e158706945ed64a056 100644 --- a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_agent.py +++ b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_agent.py @@ -5,6 +5,7 @@ import copy from datetime import datetime, timedelta +from common import const from agent_tools.base import AgentBase from agent_tools.deepflow_agent import url from agent_tools.deepflow_agent.deepflow_server import DeepflowServer @@ -19,6 +20,7 @@ class DeeepflowAgent(AgentBase): def __init__(self): super().__init__() + self.deploy_type: str = None self.deepflow_server = DeepflowServer() self.group_name: str = None self.vtap_lcuuid: str = None @@ -28,30 +30,25 @@ class DeeepflowAgent(AgentBase): "v6.3": "--version 6.3.9", "v6.4": "--version 6.4.9", } - self.deploy_status = False + + def init_df_server(self, server_meta): + self.deepflow_server.init(server_meta) def deploy(self): """部署agent """ - # 初始化deepflow_server - server_ip = self.custom_param.get("server_ip", None) - if server_ip is None: - log.error("deepflow_server_ip is None") - assert False self.group_name = self.agent_name - self.deepflow_server.init( - server_ip, - self.custom_param.get("server_ssh_port", 22), - self.custom_param.get("server_ssh_username"), - 
self.custom_param.get("server_ssh_password"), - ) # 部署agent - if self.custom_param["deploy_type"] == "k8s": + if "deploy_type" in self.custom_param: + self.deploy_type = self.custom_param["deploy_type"] + if self.deploy_type == const.AGENT_DEPLOY_TYPE_K8S: self.deploy_k8s_agent() - elif self.custom_param["deploy_type"] == "workload": + elif self.deploy_type == const.AGENT_DEPLOY_TYPE_WORKLOAD: self.deploy_workload_agent() - self.deploy_status = True + else: + log.error(f"deploy_type is not supported {self.deploy_type}") + assert False def deploy_k8s_agent(self): """部署k8s类型采集器 @@ -62,14 +59,13 @@ class DeeepflowAgent(AgentBase): ssh_pool=self._ssh_pool, ) if common_utils.check_helm_chart( - vm_ip=self.agent_ip, - chart_name="deepflow-agent", - namespace="deepflow", - ssh_pool=self._ssh_pool, + vm_ip=self.agent_ip, + chart_name="deepflow-agent", + namespace="deepflow", + ssh_pool=self._ssh_pool, ): log.info( - "helm chart is already installed, re-install deepflow-agent" - ) + "helm chart is already installed, re-install deepflow-agent") self.uninstall() time.sleep(30) common_utils.upload_files( @@ -81,8 +77,7 @@ class DeeepflowAgent(AgentBase): # self.replace_registry_to_public() ssh_client = self._ssh_pool.get(self.agent_ip) ssh_client.exec_command( - f"sed -i '2i\ tag: {self.agent_version}' deepflow-agent.yaml" - ) + f"sed -i '2i\ tag: {self.agent_version}' deepflow-agent.yaml") version = self.version_commands.get(self.agent_version, "") if self.custom_param.get("docking_platform", ""): cloud_info = self.custom_param["cloud_info"] @@ -115,21 +110,24 @@ class DeeepflowAgent(AgentBase): def deploy_workload_agent(self): """部署workload类型采集器 """ + if common_utils.check_ssh_command(self.agent_ip, "deepflow-agent -v", + self._ssh_pool): + log.info( + "deepflow-agent is already installed, re-install deepflow-agent" + ) + self.uninstall() if not self.custom_param.get("docking_platform", ""): self.platform_enabled() + log.info("Start to deploy deepflow-agent") system_name, 
system_version = common_utils.get_system_info( vm_ip=self.agent_ip, ssh_pool=self._ssh_pool, ) - if common_utils.check_ssh_command( - self.agent_ip, "deepflow-agent -v", self._ssh_pool - ): - self.uninstall() + common_utils.install_unzip(self.agent_ip, self._ssh_pool) if 'CentOS' in system_name or 'Alibaba' in system_name: agent_url = url.deepflow_agent_rpm_lastest_url.replace( - "latest", self.agent_version - ) + "latest", self.agent_version) install_cmd = f'''curl -O {agent_url} &&\ sudo unzip deepflow-agent-rpm.zip &&\ sudo rpm -ivh x86_64/deepflow-agent-1*.rpm &&\ @@ -137,8 +135,7 @@ class DeeepflowAgent(AgentBase): ''' elif 'Ubuntu' in system_name and "14." in system_version: agent_url = url.deepflow_agent_deb_lastest_url.replace( - "latest", self.agent_version - ) + "latest", self.agent_version) install_cmd = f'''curl -O {agent_url} &&\ sudo unzip deepflow-agent-deb.zip &&\ sudo dpkg -i x86_64/deepflow-agent-*.upstart.deb &&\ @@ -146,8 +143,7 @@ class DeeepflowAgent(AgentBase): ''' elif 'Ubuntu' in system_name and "14." 
not in system_version: agent_url = url.deepflow_agent_deb_lastest_url.replace( - "latest", self.agent_version - ) + "latest", self.agent_version) install_cmd = f'''curl -O {agent_url} &&\ sudo unzip deepflow-agent-deb.zip &&\ sudo dpkg -i x86_64/deepflow-agent-*.systemd.deb &&\ @@ -155,8 +151,7 @@ class DeeepflowAgent(AgentBase): ''' elif 'Debian' in system_name: agent_url = url.deepflow_agent_deb_lastest_url.replace( - "latest", self.agent_version - ) + "latest", self.agent_version) install_cmd = f'''curl -O {agent_url} &&\ sudo unzip deepflow-agent-deb.zip &&\ sudo dpkg -i x86_64/deepflow-agent-*.systemd.deb &&\ @@ -164,8 +159,7 @@ class DeeepflowAgent(AgentBase): ''' elif 'Anolis' in system_name: agent_url = url.deepflow_agent_arm_rpm_lastest_url.replace( - "latest", self.agent_version - ) + "latest", self.agent_version) install_cmd = f'''curl -O {agent_url} &&\ sudo unzip deepflow-agent*.zip &&\ sudo rpm -ivh aarch64/deepflow-agent-1*.rpm &&\ @@ -180,8 +174,7 @@ class DeeepflowAgent(AgentBase): exit_status = stdout.channel.recv_exit_status() if exit_status == 0: log.info( - f"deepflow-agent is installation successful. please start it" - ) + f"deepflow-agent is installation successful. please start it") else: log.error(f"deepflow-agent is installation failed. 
err: {errs}") assert False @@ -204,7 +197,7 @@ class DeeepflowAgent(AgentBase): assert False def uninstall(self): - if self.custom_param["deploy_type"] == "k8s": + if self.deploy_type == "k8s": command = f"sudo helm uninstall deepflow-agent -n deepflow" ssh_client = self._ssh_pool.get(self.agent_ip) _, _, stderr = ssh_client.exec_command(command) @@ -239,7 +232,7 @@ class DeeepflowAgent(AgentBase): ) def start(self): - if self.custom_param["deploy_type"] == "k8s": + if self.deploy_type == "k8s": log.info("k8s agent start is not supported") return ssh_client = self._ssh_pool.get(self.agent_ip) @@ -251,12 +244,11 @@ class DeeepflowAgent(AgentBase): return True else: log.error( - f"deepflow-agent start failed, err: {stderr.read().decode()}" - ) + f"deepflow-agent start failed, err: {stderr.read().decode()}") return False def stop(self): - if self.custom_param["deploy_type"] == "k8s": + if self.deploy_type == "k8s": log.info("k8s agent stop is not supported") return ssh_client = self._ssh_pool.get(self.agent_ip) @@ -268,12 +260,11 @@ class DeeepflowAgent(AgentBase): return True else: log.error( - f"deepflow-agent stop failed, err: {stderr.read().decode()}" - ) + f"deepflow-agent stop failed, err: {stderr.read().decode()}") return False def restart(self): - if self.custom_param["deploy_type"] == "k8s": + if self.deploy_type == "k8s": log.info("k8s agent restart is not supported") return ssh_client = self._ssh_pool.get(self.agent_ip) @@ -290,8 +281,6 @@ class DeeepflowAgent(AgentBase): return False def clear(self): - if self.deploy_status: - self.uninstall() if self.vtap_lcuuid: self.deepflow_server.delete_vtap_by_lcuuid(self.vtap_lcuuid) if self.group_id: @@ -299,8 +288,7 @@ class DeeepflowAgent(AgentBase): def ensure_agent_status_available(self): self.vtap_lcuuid = self.deepflow_server.check_vtaps_list_by_ip( - agent_ip=self.agent_ip - ) + agent_ip=self.agent_ip) self.deepflow_server.check_analyzer_ip(agent_ip=self.agent_ip) def replace_registry_to_public(self): @@ 
-317,14 +305,13 @@ class DeeepflowAgent(AgentBase): :param end_time: 结束时间戳 :return: True or False, True表示出现了重启,False表示没有出现重启。 """ - if self.custom_param["deploy_type"] == "k8s": + if self.deploy_type == "k8s": log.info("k8s agent check abnormal restart time is not supported") return ssh_client = self._ssh_pool.get(self.agent_ip) _, stdout, _ = ssh_client.exec_command( ''' sudo grep restart /var/log/deepflow-agent/deepflow-agent.log \ - |awk '{log.info substr($1, index($1, "2024")),$2}' ''' - ) + |awk '{log.info substr($1, index($1, "2024")),$2}' ''') # 这里需要把时间转换为时间戳,然后判断时间戳是否在指定的时间范围内 for line in stdout: line = line.strip() @@ -332,8 +319,8 @@ class DeeepflowAgent(AgentBase): continue time_str = line.split(' ')[0] # 日志时间是北京时间,需要减去8小时,才能转换为时间戳。 - time_obj = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f' - ) - timedelta(hours=8) + time_obj = datetime.strptime( + time_str, '%Y-%m-%d %H:%M:%S.%f') - timedelta(hours=8) time_stamp = int(time_obj.timestamp()) if start_time <= time_stamp <= end_time: log.warning(f"deepflow-agent restarted at {time_str}") @@ -341,9 +328,8 @@ class DeeepflowAgent(AgentBase): log.info(f"deepflow-agent did not restart") return False - def create_config_file_yaml( - self, config_dict: dict, file_path: str - ) -> str: + def create_config_file_yaml(self, config_dict: dict, + file_path: str) -> str: try: existing_data = {"vtap_group_id": ""} if "config" in self.custom_param: @@ -359,8 +345,7 @@ class DeeepflowAgent(AgentBase): server_ip = self.deepflow_server.server_ip if self.group_id is None: self.group_id = self.deepflow_server.create_group( - group_name=self.group_name - ) + group_name=self.group_name) ssh_client = self.deepflow_server._ssh_pool.get(server_ip) # 通过deepflow-ctl载入配置 config_cmd = [ @@ -375,8 +360,7 @@ class DeeepflowAgent(AgentBase): log.info(f"deepflow-agent configure successful") else: log.error( - f"deepflow-agent configure failed {stderr.read().decode()}" - ) + f"deepflow-agent configure failed {stderr.read().decode()}") 
assert False def update_group_config(self, config_dict: dict = None): @@ -385,9 +369,8 @@ class DeeepflowAgent(AgentBase): else: config_dict = copy.deepcopy(config_dict) tmp_file_path = f"{self.group_name}.yaml" - self.create_config_file_yaml( - config_dict=config_dict, file_path=tmp_file_path - ) + self.create_config_file_yaml(config_dict=config_dict, + file_path=tmp_file_path) common_utils.upload_files( vm_ip=self.deepflow_server.server_ip, local_path=tmp_file_path, @@ -404,8 +387,7 @@ class DeeepflowAgent(AgentBase): """ self.update_group_config(config_dict=config_dict) self.vtap_lcuuid = self.deepflow_server.check_vtaps_list_by_ip( - agent_ip=self.agent_ip - ) + agent_ip=self.agent_ip) self.deepflow_server.agent_join_in_group( vtap_lcuuid=self.vtap_lcuuid, group_name=self.group_name, @@ -419,14 +401,11 @@ class DeeepflowAgent(AgentBase): def get_metric_data_by_agent(self, start_time, end_time): vtap_info = {} vtap_full_name = self.deepflow_server.get_vtap_full_name_by_ip( - self.agent_ip - ) + self.agent_ip) max_cpu = self.deepflow_server.get_vtap_max_cpu_usage( - vtap_full_name, start_time, end_time - ) + vtap_full_name, start_time, end_time) max_mem = self.deepflow_server.get_vtap_max_mem_usage( - vtap_full_name, start_time, end_time - ) + vtap_full_name, start_time, end_time) vtap_info = { "agent.max_cpu": max_cpu, "agent.max_mem": max_mem, diff --git a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_server.py b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_server.py index d0bfa53e7ece519873bcdfc821834a6258f27084..cad159a734c1006ad82aa727959bbc6239d4aceb 100644 --- a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_server.py +++ b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/deepflow_server.py @@ -7,6 +7,7 @@ from urllib.parse import urlencode from eval_lib.common.logger import get_logger from eval_lib.common.ssh import SSHPool from agent_tools.deepflow_agent import url 
+from common.module import DFServerMeta log = get_logger() @@ -19,21 +20,19 @@ class DeepflowServer(): self.query_port = None self._ssh_pool = SSHPool() - def init( - self, server_ip, ssh_port=22, ssh_username=None, ssh_password=None - ): + def init(self, server_meta: DFServerMeta): """初始化控制器参数的函数 - :param meta: 控制器参数的元数据 + :server meta: 控制器参数的元数据 :return: """ - self._ssh_pool.default_port = ssh_port - self._ssh_pool.default_username = ssh_username - self._ssh_pool.default_password = ssh_password - self.server_ip = server_ip - self.control_port = self.get_control_port() - self.query_port = self.get_query_port() + self._ssh_pool.default_port = server_meta.ssh_port + self._ssh_pool.default_username = server_meta.ssh_username + self._ssh_pool.default_password = server_meta.ssh_password + self.server_ip = server_meta.server_ip + self.control_port = server_meta.control_port if server_meta.control_port else self.get_control_port() + self.query_port = server_meta.query_port if server_meta.query_port else self.get_query_port() - def get_control_port(self, server_ip=None, retry_count=100): + def get_control_port(self, server_ip=None, retry_count=10): server_ip = server_ip if server_ip else self.server_ip ssh_client = self._ssh_pool.get(server_ip) for _ in range(retry_count): @@ -52,11 +51,11 @@ class DeepflowServer(): log.error( f"stderr: {stderr.read().decode('utf-8')}, Failed to get port: {e}" ) - time.sleep(3) + time.sleep(5) log.error("Failed to get control port after retrying.") - return None + assert False - def get_query_port(self, server_ip=None, retry_count=100): + def get_query_port(self, server_ip=None, retry_count=10): server_ip = server_ip if server_ip else self.server_ip ssh_client = self._ssh_pool.get(server_ip) for _ in range(retry_count): @@ -75,9 +74,9 @@ class DeepflowServer(): log.error( f"stderr: {stderr.read().decode('utf-8')}, Failed to get port: {e}" ) - time.sleep(3) + time.sleep(5) log.error("Failed to get query port after retrying.") - return None 
+ assert False def create_group(self, group_name, vtap_lcuuid=""): '''Move the agent to the specified group diff --git a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/file/deepflow-agent.yaml b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/file/deepflow-agent.yaml index 4c5b194aa57022a52a97f8b4fb4a37d6e3ec908c..8a4a51e18a9c1e34ddb477128bd5f0de9260a9db 100644 --- a/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/file/deepflow-agent.yaml +++ b/evaluation/eval-runner/eval-runner/agent_tools/deepflow_agent/file/deepflow-agent.yaml @@ -1,2 +1,2 @@ image: - repository: deepflowce/deepflow-agent + repository: registry-vpc.cn-beijing.aliyuncs.com/deepflow-ce/deepflow-agent diff --git a/evaluation/eval-runner/eval-runner/case/case_base.py b/evaluation/eval-runner/eval-runner/case/case_base.py new file mode 100644 index 0000000000000000000000000000000000000000..783120c14d38c68ae5ca84b3fc14123ce9c64684 --- /dev/null +++ b/evaluation/eval-runner/eval-runner/case/case_base.py @@ -0,0 +1,46 @@ +import time + +from common.parse_config import ParseConfig +from common.const import CASE_TYPE_CONCURRENCY +from common.utils import step as allure_step +from eval_lib.common.logger import get_logger +log = get_logger() + + +class BaseCase(object): + + CASE_TYPE = CASE_TYPE_CONCURRENCY + + @classmethod + def setup_class(cls): + cls.server_meta = ParseConfig.get_df_server_meta(case_type=cls.CASE_TYPE) + log.info( + f"start to run case {cls.class_name()}, df env: {cls.server_meta.to_json()}" + ) + cls.start = time.time() + with allure_step('The setup of the class'): + pass + cls._setup_class() + + @classmethod + def teardown_class(cls): + cls._teardown_class() + log.info( + f"case {cls.class_name()} spend time {time.time()-cls.start}s" + ) + ParseConfig.release_df_env(cls.server_meta.server_ip) + log.info(f"freed df env {cls.server_meta.to_json()}") + with allure_step('The teardown of the class'): + pass + + @classmethod + def class_name(cls): + 
return cls.__name__ + + @classmethod + def _setup_class(cls): + pass + + @classmethod + def _teardown_class(cls): + pass diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_with_agent.py b/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_with_agent.py index ba4d490b747677085c1c7abd0105c565c72d8567..75a2e002cb57bf0420c01d876baa1797b2737edf 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_with_agent.py +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_with_agent.py @@ -1,41 +1,34 @@ import pytest -import allure,time +import allure, time from common.utils import step as allure_step from common.utils import choose_platform -from common.utils import choose_agent from eval_lib.common.logger import get_logger from common.results import AgentResults +from common import const +from common.traffic import Traffic from agent_tools.base import AgentBase from common import utils as common_utils from case.performance_analysis import utils as performance_analysis_utils from platform_tools.aliyun import ali_const -from common.utils import ssh_pool_default +from agent_tools.agent_manage import AgentManage log = get_logger() case_name = "performance_analysis_istio_with_agent" case_info = {} -tool_params = ["260", "220", "180"] server_process_names = [ "envoy", "ratings", "ws-javaagent.jar", "details", "productpage" ] -def create_http_traffic_action( - istio_ip, traffic_ip, productpage_port ,param -): - ssh = ssh_pool_default.get(traffic_ip) - start_time = int(time.time()) - log.info("start generating http traffic") - _, _, stderr = ssh.exec_command( - f'''wrk2 -c50 -t4 -R {param} -d 100 -L http://{istio_ip}:{productpage_port}/productpage | grep -E "(Latency Distribution|Requests/sec)" -A 8 | grep -E "^( 50.000| 90.000|Requests/sec:)"| awk '{{print $2}}' > traffic_result.log''' - ) - err = stderr.readlines() - if err: - log.error(f"wrk2 err, 
log:{err}") - log.info("complete http traffic generation") - end_time = int(time.time()) - return start_time, end_time +traffic_default_cmd = [ + "eb -r 260 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", + "eb -r 220 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", + "eb -r 180 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", +] +fixed_host_traffic = "performance_analysis_traffic_ip" +fixed_host_agent = "performance_analysis_istio_ip" + -class TestPerformanceAnalysisIstioWithAgent(): +class TestIstioWithAgent(): @classmethod def setup_class(cls): @@ -45,17 +38,15 @@ class TestPerformanceAnalysisIstioWithAgent(): cls.agent_name = f"{case_name}_{uuid}" cls.result = AgentResults(case_name=case_name) cls.result.add_case_info(info=case_info) - cls.agent:AgentBase = None + cls.am = AgentManage() @classmethod def teardown_class(cls): cls.result.generate_yaml_file() - cls.agent.clear() - + cls.am.clear_all_agent() @allure.suite('performance analysis') - @allure.epic('Agent performance analysis') - @allure.feature('') + @allure.epic('performance analysis istio') @allure.title('Agent性能分析 - istio') @allure.description('Test the performance of the agent on istio') @pytest.mark.medium @@ -64,32 +55,35 @@ class TestPerformanceAnalysisIstioWithAgent(): Platform = choose_platform() if Platform: instance_info = Platform.create_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic], - image_id=ali_const.ali_image_id_performance_analysis, + instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ], + image_id=ali_const. 
+ ali_image_id_performance_analysis_default, ) else: log.info("no platform, use default ip") instance_info = { - self.instance_name_agent: common_utils.get_fixed_host_ip(self.instance_name_agent), - self.instance_name_traffic: common_utils.get_fixed_host_ip(self.instance_name_traffic) + self.instance_name_agent: + common_utils.get_fixed_host_ip(fixed_host_agent), + self.instance_name_traffic: + common_utils.get_fixed_host_ip(fixed_host_traffic) } - agent_ip = instance_info[self.instance_name_agent] traffic_ip = instance_info[self.instance_name_traffic] + traffic_tool = Traffic(traffic_ip) + traffic_tool.set_traffic_commands(traffic_default_cmd) with allure_step('step 2: install agent'): - Agent = choose_agent() - TestPerformanceAnalysisIstioWithAgent.agent = Agent() + agent: AgentBase = TestIstioWithAgent.am.create_agent() agent_meta = common_utils.get_meta_data(agent_ip) - TestPerformanceAnalysisIstioWithAgent.agent.init( - agent_name=self.agent_name, - meta=agent_meta - ) - self.agent.deploy() + agent.init(agent_name=self.agent_name, meta=agent_meta) + agent.deploy_type = const.AGENT_DEPLOY_TYPE_K8S + agent.deploy() with allure_step('step 3: sync agent'): - self.agent.start() - self.agent.ensure_agent_status_available() - self.agent.configure_agent() + agent.start() + agent.ensure_agent_status_available() + agent.configure_agent() log.info("wait 120s") time.sleep(120) @@ -98,49 +92,45 @@ class TestPerformanceAnalysisIstioWithAgent(): vm_ip=agent_ip, local_path="case/performance_analysis/tools/telegraf.conf", remote_path="telegraf.conf", - ssh_pool=self.agent.get_ssh_pool(), + ssh_pool=agent.get_ssh_pool(), ) performance_analysis_utils.reload_telegraf_conf( - vm_ip=agent_ip, - ssh_pool=self.agent.get_ssh_pool() - ) + vm_ip=agent_ip, ssh_pool=agent.get_ssh_pool()) + traffic_tool.install_eb_rpm(traffic_ip) + with allure_step("step 5: install istio"): if Platform: performance_analysis_utils.install_istio( vm_ip=agent_ip, - ssh_pool=self.agent.get_ssh_pool(), + 
ssh_pool=agent.get_ssh_pool(), ) performance_analysis_utils.init_istio( vm_ip=agent_ip, - ssh_pool=self.agent.get_ssh_pool(), + ssh_pool=agent.get_ssh_pool(), ) else: log.info("no platform, no install istio") - + productpage_port = performance_analysis_utils.get_istio_productpage_server_port( vm_ip=agent_ip, - ssh_pool=self.agent.get_ssh_pool(), + ssh_pool=agent.get_ssh_pool(), ) - server_process_names.append(self.agent.AGENT_PROCESS_NAME) - for i in range(len(tool_params)): - param = tool_params[i] + server_process_names.append(agent.AGENT_PROCESS_NAME) + for i in range(len(traffic_tool.traffic_commands)): + cmd = traffic_tool.traffic_commands[i] result_data = {} - with allure_step(f'step 6.{i}: start wrk2 traffic tool, rate {param}'): - result_data["case.command"] = f"wrk2 -c50 -t4 -R {param} -d 100 -L http://{agent_ip}:{productpage_port}/productpage" - result_data["server.rate"] = param - start_time, end_time = create_http_traffic_action( - istio_ip=agent_ip, - traffic_ip=traffic_ip, + with allure_step(f'step 5.{i}: start traffic tool'): + start_time = int(time.time()) + traffic_result_data = traffic_tool.generate_traffic( + traffic_command=cmd, + dip=agent_ip, productpage_port=productpage_port, - param=param, ) + end_time = int(time.time()) time.sleep(30) # 打流工具的数据 - wrk2_result_data = performance_analysis_utils.get_traffic_tool_data( - vm_ip=traffic_ip, - ) - result_data.update(wrk2_result_data) - log.info(wrk2_result_data) + result_data.update(traffic_result_data) + log.info(traffic_result_data) # telegraf采集的数据 telegraf_result_data = performance_analysis_utils.get_process_usage_by_telegraf( vm_ip=agent_ip, @@ -154,13 +144,12 @@ class TestPerformanceAnalysisIstioWithAgent(): # self.result.add_result_data(data=agent_result_data) # log.info(agent_result_data) self.result.add_result_data(data=result_data, index=i) - with allure_step('step n: delete instance and clear'): + + with allure_step('step n: uninstall agent and delete instance'): + agent.uninstall() if 
Platform: - Platform.delete_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic] - ) + Platform.delete_instances(instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ]) else: log.info("no platform, use default ip, no delete") - - - diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_without_agent.py b/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_without_agent.py index bc90a3cd3f9f05baad281a6ed6e24cd9a3c025e3..b3cd846620287f7765e556b69774e656a94dc0bc 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_without_agent.py +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/istio/test_istio_without_agent.py @@ -1,8 +1,9 @@ import pytest -import allure,time +import allure, time from common.utils import step as allure_step from common.utils import choose_platform from eval_lib.common.logger import get_logger +from common.traffic import Traffic from common.results import AgentResults from common import utils as common_utils from case.performance_analysis import utils as performance_analysis_utils @@ -13,27 +14,20 @@ log = get_logger() case_name = "performance_analysis_istio_without_agent" case_info = {} -tool_params = ["260", "220", "180"] server_process_names = [ "envoy", "ratings", "ws-javaagent.jar", "details", "productpage" ] -def create_http_traffic_action( - istio_ip, traffic_ip, productpage_port ,param -): - ssh = ssh_pool_default.get(traffic_ip) - start_time = int(time.time()) - log.info("start generating http traffic") - _, _, stderr = ssh.exec_command( - f'''wrk2 -c50 -t4 -R {param} -d 100 -L http://{istio_ip}:{productpage_port}/productpage | grep -E "(Latency Distribution|Requests/sec)" -A 8 | grep -E "^( 50.000| 90.000|Requests/sec:)"| awk '{{print $2}}' > traffic_result.log''' - ) - err = stderr.readlines() - if err: - log.error(f"wrk2 err, log:{err}") - log.info("complete http traffic 
generation") - end_time = int(time.time()) - return start_time, end_time +traffic_default_cmd = [ + "eb -r 260 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", + "eb -r 220 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", + "eb -r 180 -t 1 -e http -d 100 -h http://$dip:$productpage_port/productpage", +] + +fixed_host_traffic = "performance_analysis_traffic_ip" +fixed_host_agent = "performance_analysis_istio_ip" + -class TestPerformanceAnalysisIstioWithAgent(): +class TestIstioWithAgent(): @classmethod def setup_class(cls): @@ -47,11 +41,8 @@ class TestPerformanceAnalysisIstioWithAgent(): def teardown_class(cls): cls.result.generate_yaml_file() - - @allure.suite('performance analysis') - @allure.epic('Agent performance analysis') - @allure.feature('') + @allure.epic('performance analysis istio') @allure.title('Agent性能分析 - istio') @allure.description('Test the performance of the agent on istio') @pytest.mark.medium @@ -60,18 +51,24 @@ class TestPerformanceAnalysisIstioWithAgent(): Platform = choose_platform() if Platform: instance_info = Platform.create_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic], - image_id=ali_const.ali_image_id_performance_analysis, + instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ], + image_id=ali_const. 
+ ali_image_id_performance_analysis_default, ) else: log.info("no platform, use default ip") instance_info = { - self.instance_name_agent: common_utils.get_fixed_host_ip(self.instance_name_agent), - self.instance_name_traffic: common_utils.get_fixed_host_ip(self.instance_name_traffic) + self.instance_name_agent: + common_utils.get_fixed_host_ip(fixed_host_agent), + self.instance_name_traffic: + common_utils.get_fixed_host_ip(fixed_host_traffic) } agent_ip = instance_info[self.instance_name_agent] traffic_ip = instance_info[self.instance_name_traffic] - + traffic_tool = Traffic(traffic_ip) + traffic_tool.set_traffic_commands(traffic_default_cmd) with allure_step('step 2: update telegraf config'): common_utils.upload_files( vm_ip=agent_ip, @@ -80,9 +77,8 @@ class TestPerformanceAnalysisIstioWithAgent(): ssh_pool=ssh_pool_default, ) performance_analysis_utils.reload_telegraf_conf( - vm_ip=agent_ip, - ssh_pool=ssh_pool_default - ) + vm_ip=agent_ip, ssh_pool=ssh_pool_default) + traffic_tool.install_eb_rpm(traffic_ip) with allure_step("step 3: install istio"): if Platform: performance_analysis_utils.install_istio( @@ -99,25 +95,21 @@ class TestPerformanceAnalysisIstioWithAgent(): vm_ip=agent_ip, ssh_pool=ssh_pool_default, ) - for i in range(len(tool_params)): - param = tool_params[i] + for i in range(len(traffic_tool.traffic_commands)): + cmd = traffic_tool.traffic_commands[i] result_data = {} - with allure_step(f'step 4.{i}: start wrk2 traffic tool, rate {param}'): - result_data["case.command"] = f"wrk2 -c50 -t4 -R {param} -d 100 -L http://{agent_ip}:{productpage_port}/productpage" - result_data["server.rate"] = param - start_time, end_time = create_http_traffic_action( - istio_ip=agent_ip, - traffic_ip=traffic_ip, + with allure_step(f'step 5.{i}: start traffic tool'): + start_time = int(time.time()) + traffic_result_data = traffic_tool.generate_traffic( + traffic_command=cmd, + dip=agent_ip, productpage_port=productpage_port, - param=param, ) + end_time = 
int(time.time()) time.sleep(30) # 打流工具的数据 - wrk2_result_data = performance_analysis_utils.get_traffic_tool_data( - vm_ip=traffic_ip, - ) - result_data.update(wrk2_result_data) - log.info(wrk2_result_data) + result_data.update(traffic_result_data) + log.info(traffic_result_data) # telegraf采集的数据 telegraf_result_data = performance_analysis_utils.get_process_usage_by_telegraf( vm_ip=agent_ip, @@ -130,11 +122,8 @@ class TestPerformanceAnalysisIstioWithAgent(): self.result.add_result_data(data=result_data, index=i) with allure_step('step n: delete instance and clear'): if Platform: - Platform.delete_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic] - ) + Platform.delete_instances(instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ]) else: log.info("no platform, use default ip, no delete") - - - diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_with_agent.py b/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_with_agent.py index fa49e6889a87c590f44160c7c7c02712c9a516fd..e016fad4ecd3e37604a847463d830823a0eef9ae 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_with_agent.py +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_with_agent.py @@ -1,39 +1,33 @@ import pytest -import allure,time +import allure, time + from common.utils import step as allure_step from common.utils import choose_platform -from common.utils import choose_agent -from eval_lib.common.logger import get_logger +from common import const +from common.traffic import Traffic from common.results import AgentResults from agent_tools.base import AgentBase from common import utils as common_utils from case.performance_analysis import utils as performance_analysis_utils from platform_tools.aliyun import ali_const -from common.utils import ssh_pool_default +from agent_tools.agent_manage import AgentManage +from 
eval_lib.common.logger import get_logger log = get_logger() case_name = "performance_analysis_nginx_http_with_agent" case_info = {} -tool_params = ["42000", "38000", "34000"] +server_process_names = ["nginx"] +traffic_default_cmd = [ + "eb -r 10000 -t 10 -e http -d 100 -h http://$dip:80/index.html", + "eb -r 8000 -t 10 -e http -d 100 -h http://$dip:80/index.html", + "eb -r 6000 -t 10 -e http -d 100 -h http://$dip:80/index.html", +] +fixed_host_traffic = "performance_analysis_traffic_ip" +fixed_host_agent = "performance_analysis_nginx_ip" -def create_http_traffic_action( - nginx_ip, traffic_ip, param -): - ssh = ssh_pool_default.get(traffic_ip) - start_time = int(time.time()) - log.info("start generating http traffic") - _, _, stderr = ssh.exec_command( - f'''wrk2 -c20 -t20 -R {param} -d 100 -L http://{nginx_ip}:80/index.html | grep -E "(Latency Distribution|Requests/sec)" -A 8 | grep -E "^( 50.000| 90.000|Requests/sec:)"| awk '{{print $2}}' > traffic_result.log''' - ) - err = stderr.readlines() - if err: - log.error(f"wrk2 err, log:{err}") - log.info("complete http traffic generation") - end_time = int(time.time()) - return start_time, end_time -class TestPerformanceAnalysisNginxHttpWithAgent(): +class TestNginxHttpWithAgent(): @classmethod def setup_class(cls): @@ -43,49 +37,53 @@ class TestPerformanceAnalysisNginxHttpWithAgent(): cls.agent_name = f"{case_name}_{uuid}" cls.result = AgentResults(case_name=case_name) cls.result.add_case_info(info=case_info) - cls.agent:AgentBase = None + cls.am = AgentManage() @classmethod def teardown_class(cls): cls.result.generate_yaml_file() - cls.agent.clear() + cls.am.clear_all_agent() @allure.suite('performance analysis') - @allure.epic('Agent performance analysis') - @allure.feature('') + @allure.epic('performance analysis nginx') @allure.title('Agent性能分析 - http') - @allure.description('Test the performance of the agent on the http protocol') + @allure.description( + 'Test the performance of the agent on the http protocol') 
@pytest.mark.medium def test_performance_analysis_nginx_http_with_agent(self): with allure_step('step 1: create instance'): Platform = choose_platform() if Platform: instance_info = Platform.create_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic], - image_id=ali_const.ali_image_id_performance_analysis, + instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ], + image_id=ali_const. + ali_image_id_performance_analysis_default, ) else: log.info("no platform, use default ip") instance_info = { - self.instance_name_agent: common_utils.get_fixed_host_ip(self.instance_name_agent), - self.instance_name_traffic: common_utils.get_fixed_host_ip(self.instance_name_traffic) + self.instance_name_agent: + common_utils.get_fixed_host_ip(fixed_host_agent), + self.instance_name_traffic: + common_utils.get_fixed_host_ip(fixed_host_traffic) } agent_ip = instance_info[self.instance_name_agent] traffic_ip = instance_info[self.instance_name_traffic] + traffic_tool = Traffic(traffic_ip) + traffic_tool.set_traffic_commands(traffic_default_cmd) with allure_step('step 2: install agent'): - Agent = choose_agent() - TestPerformanceAnalysisNginxHttpWithAgent.agent = Agent() + agent: AgentBase = TestNginxHttpWithAgent.am.create_agent() agent_meta = common_utils.get_meta_data(agent_ip) - TestPerformanceAnalysisNginxHttpWithAgent.agent.init( - agent_name=self.agent_name, - meta=agent_meta - ) - self.agent.deploy() + agent.init(agent_name=self.agent_name, meta=agent_meta) + agent.deploy_type = const.AGENT_DEPLOY_TYPE_WORKLOAD + agent.deploy() with allure_step('step 3: sync agent'): - self.agent.start() - self.agent.ensure_agent_status_available() - self.agent.configure_agent() + agent.start() + agent.ensure_agent_status_available() + agent.configure_agent() log.info("wait 120s") time.sleep(120) @@ -94,42 +92,34 @@ class TestPerformanceAnalysisNginxHttpWithAgent(): vm_ip=agent_ip, local_path="case/performance_analysis/tools/telegraf.conf", 
remote_path="telegraf.conf", - ssh_pool=self.agent.get_ssh_pool(), + ssh_pool=agent.get_ssh_pool(), ) performance_analysis_utils.reload_telegraf_conf( - vm_ip=agent_ip, - ssh_pool=self.agent.get_ssh_pool() - ) - + vm_ip=agent_ip, ssh_pool=agent.get_ssh_pool()) common_utils.ensure_process_running( vm_ip=agent_ip, process_name="nginx", - ssh_pool=self.agent.get_ssh_pool(), + ssh_pool=agent.get_ssh_pool(), ) + traffic_tool.install_eb_rpm(traffic_ip) - for i in range(len(tool_params)): - param = tool_params[i] + server_process_names.append(agent.AGENT_PROCESS_NAME) + for i in range(len(traffic_tool.traffic_commands)): + cmd = traffic_tool.traffic_commands[i] result_data = {} - with allure_step(f'step 5.{i}: start wrk2 traffic tool, rate {param}'): - result_data["case.command"] = f"wrk2 -c20 -t20 -R {param} -d 100 -L http://{agent_ip}:80/index.html" - result_data["server.rate"] = param - start_time, end_time = create_http_traffic_action( - nginx_ip=agent_ip, - traffic_ip=traffic_ip, - param=param, - ) + with allure_step(f'step 5.{i}: start traffic tool'): + start_time = int(time.time()) + traffic_result_data = traffic_tool.generate_traffic( + traffic_command=cmd, dip=agent_ip) + end_time = int(time.time()) time.sleep(30) # 打流工具的数据 - wrk2_result_data = performance_analysis_utils.get_traffic_tool_data( - vm_ip=traffic_ip, - ) - result_data.update(wrk2_result_data) - log.info(wrk2_result_data) + result_data.update(traffic_result_data) + log.info(traffic_result_data) # telegraf采集的数据 - monitored_process_name = ["nginx", self.agent.AGENT_PROCESS_NAME] telegraf_result_data = performance_analysis_utils.get_process_usage_by_telegraf( vm_ip=agent_ip, - process_name_list=monitored_process_name, + process_name_list=server_process_names, start_time=start_time, end_time=end_time, ) @@ -139,13 +129,12 @@ class TestPerformanceAnalysisNginxHttpWithAgent(): # self.result.add_result_data(data=agent_result_data) # log.info(agent_result_data) self.result.add_result_data(data=result_data, 
index=i) - with allure_step('step n: delete instance and clear'): + + with allure_step('step n: uninstall agent and delete instance'): + agent.uninstall() if Platform: - Platform.delete_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic] - ) + Platform.delete_instances(instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ]) else: log.info("no platform, use default ip, no delete") - - - diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_without_agent.py b/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_without_agent.py index 3296cdebe47ff1f89e65543e342ba8d55e3b040d..aa724455e785ea2feed4e0c39c5cc37c2e0ed34b 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_without_agent.py +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/nginx/test_nginx_without_agent.py @@ -1,8 +1,9 @@ import pytest -import allure,time +import allure, time from common.utils import step as allure_step from common.utils import choose_platform from eval_lib.common.logger import get_logger +from common.traffic import Traffic from common.results import AgentResults from common import utils as common_utils from case.performance_analysis import utils as performance_analysis_utils @@ -13,25 +14,17 @@ log = get_logger() case_name = "performance_analysis_nginx_http_without_agent" case_info = {} -tool_params = ["42000", "38000", "34000"] +server_process_names = ["nginx"] +traffic_default_cmd = [ + "eb -r 10000 -t 10 -e http -d 100 -h http://$dip:80/index.html", + "eb -r 8000 -t 10 -e http -d 100 -h http://$dip:80/index.html", + "eb -r 6000 -t 10 -e http -d 100 -h http://$dip:80/index.html", +] +fixed_host_traffic = "performance_analysis_traffic_ip" +fixed_host_agent = "performance_analysis_nginx_ip" -def create_http_traffic_action( - nginx_ip, traffic_ip, param -): - ssh = ssh_pool_default.get(traffic_ip) - start_time = 
int(time.time()) - log.info("start generating http traffic") - _, _, stderr = ssh.exec_command( - f'''wrk2 -c20 -t20 -R {param} -d 100 -L http://{nginx_ip}:80/index.html | grep -E "(Latency Distribution|Requests/sec)" -A 8 | grep -E "^( 50.000| 90.000|Requests/sec:)"| awk '{{print $2}}' > traffic_result.log''' - ) - err = stderr.readlines() - if err: - log.error(f"wrk2 err, log:{err}") - log.info("complete http traffic generation") - end_time = int(time.time()) - return start_time, end_time -class TestPerformanceAnalysisNginxHttpWithoutAgent(): +class TestNginxHttpWithoutAgent(): @classmethod def setup_class(cls): @@ -46,28 +39,34 @@ class TestPerformanceAnalysisNginxHttpWithoutAgent(): cls.result.generate_yaml_file() @allure.suite('performance analysis') - @allure.epic('Agent performance analysis') - @allure.feature('') + @allure.epic('performance analysis nginx') @allure.title('Agent性能分析无agent - http') - @allure.description('Test the performance of the agent on the http protocol') + @allure.description( + 'Test the performance of the agent on the http protocol') @pytest.mark.medium def test_performance_analysis_nginx_http_without_agent(self): with allure_step('step 1: create instance'): Platform = choose_platform() if Platform: instance_info = Platform.create_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic], - image_id=ali_const.ali_image_id_performance_analysis, + instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ], + image_id=ali_const. 
+ ali_image_id_performance_analysis_default, ) else: log.info("no platform, use default ip") instance_info = { - self.instance_name_agent: common_utils.get_fixed_host_ip(self.instance_name_agent), - self.instance_name_traffic: common_utils.get_fixed_host_ip(self.instance_name_traffic) + self.instance_name_agent: + common_utils.get_fixed_host_ip(fixed_host_agent), + self.instance_name_traffic: + common_utils.get_fixed_host_ip(fixed_host_traffic) } agent_ip = instance_info[self.instance_name_agent] traffic_ip = instance_info[self.instance_name_traffic] - + traffic_tool = Traffic(traffic_ip) + traffic_tool.set_traffic_commands(traffic_default_cmd) with allure_step('step 2: update telegraf config and start nginx'): common_utils.upload_files( vm_ip=agent_ip, @@ -76,38 +75,30 @@ class TestPerformanceAnalysisNginxHttpWithoutAgent(): ssh_pool=ssh_pool_default, ) performance_analysis_utils.reload_telegraf_conf( - vm_ip=agent_ip, - ssh_pool=ssh_pool_default - ) + vm_ip=agent_ip, ssh_pool=ssh_pool_default) common_utils.ensure_process_running( vm_ip=agent_ip, process_name="nginx", ssh_pool=ssh_pool_default, ) + traffic_tool.install_eb_rpm(traffic_ip) - for i in range(len(tool_params)): - param = tool_params[i] + for i in range(len(traffic_tool.traffic_commands)): + cmd = traffic_tool.traffic_commands[i] result_data = {} - with allure_step(f'step 3.{i}: start wrk2 traffic tool, rate {param}'): - result_data["case.command"] = f"wrk2 -c20 -t20 -R {param} -d 100 -L http://{agent_ip}:80/index.html" - result_data["server.rate"] = param - start_time, end_time = create_http_traffic_action( - nginx_ip=agent_ip, - traffic_ip=traffic_ip, - param=param, - ) + with allure_step(f'step 3.{i}: start traffic tool'): + start_time = int(time.time()) + traffic_result_data = traffic_tool.generate_traffic( + traffic_command=cmd, dip=agent_ip) + end_time = int(time.time()) time.sleep(30) # 打流工具的数据 - wrk2_result_data = performance_analysis_utils.get_traffic_tool_data( - vm_ip=traffic_ip, - ) - 
result_data.update(wrk2_result_data) - log.info(wrk2_result_data) + result_data.update(traffic_result_data) + log.info(traffic_result_data) # telegraf采集的数据 - monitored_process_name = ["nginx"] telegraf_result_data = performance_analysis_utils.get_process_usage_by_telegraf( vm_ip=agent_ip, - process_name_list=monitored_process_name, + process_name_list=server_process_names, start_time=start_time, end_time=end_time, ) @@ -116,10 +107,8 @@ class TestPerformanceAnalysisNginxHttpWithoutAgent(): self.result.add_result_data(data=result_data, index=i) with allure_step('step n: delete instance and clear'): if Platform: - Platform.delete_instances( - instance_names=[self.instance_name_agent,self.instance_name_traffic] - ) + Platform.delete_instances(instance_names=[ + self.instance_name_agent, self.instance_name_traffic + ]) else: log.info("no platform, use default ip, no delete") - - diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/tools/telegraf.conf b/evaluation/eval-runner/eval-runner/case/performance_analysis/tools/telegraf.conf index 56e50b7a8d6efd0ba2cf2e31dc9387eff95c0cb5..a428f22b1d8cd66342b092019e04d3bfe9b4c81d 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/tools/telegraf.conf +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/tools/telegraf.conf @@ -4195,13 +4195,18 @@ pid_finder = "pgrep" pid_tag = true +[[inputs.procstat]] + pattern = "traefik" + pid_finder = "pgrep" + pid_tag = true + [[inputs.procstat]] pattern = "wrk2" pid_finder = "pgrep" pid_tag = true [[inputs.procstat]] - pattern = "redis" + pattern = "redis-server" pid_finder = "pgrep" pid_tag = true @@ -4250,11 +4255,6 @@ pid_finder = "pgrep" pid_tag = true -[[inputs.procstat]] - pattern = "postgres" - pid_finder = "pgrep" - pid_tag = true - [[inputs.procstat]] pattern = "mysqld" pid_finder = "pgrep" diff --git a/evaluation/eval-runner/eval-runner/case/performance_analysis/utils.py 
b/evaluation/eval-runner/eval-runner/case/performance_analysis/utils.py index afe1cd3f62a7e38e245fe33645897b76418d900e..e65b674ca4e2fcd6af7116112f48f8a6015d35f8 100644 --- a/evaluation/eval-runner/eval-runner/case/performance_analysis/utils.py +++ b/evaluation/eval-runner/eval-runner/case/performance_analysis/utils.py @@ -1,34 +1,14 @@ -import re,time -from common.utils import ssh_pool_default +import time +from common import utils from eval_lib.common.ssh import SSHPool from eval_lib.databases.influx.influx_db import InfulxDB from common.const import TELEGRAF_TABLE_NAME_IN_INFLUX +from common.utils import ssh_pool_default from eval_lib.common.logger import get_logger log = get_logger() -def format_latency(time_str, target_unit): - units = {'us': 0.000001, 'µs': 0.000001, 'ms': 0.001, 's': 1} - time_str = time_str.strip() - try: - pattern = r"[^\d]*$" - match = re.search(pattern, time_str) - matched_position = match.start() - time_value, current_unit = time_str[:matched_position], match.group() - time_value = float(time_value) - - if current_unit not in units: - log.error(f"Invalid current time unit: {current_unit}") - return None - converted_time = round( - time_value * units[current_unit] / units[target_unit], 3 - ) - return str(converted_time) + target_unit - except ValueError as e: - log.info(f"Error: {e}") - return None - def get_traffic_tool_data( vm_ip, ssh_pool: SSHPool=ssh_pool_default ): @@ -39,10 +19,10 @@ def get_traffic_tool_data( logs = stdout.readlines() try: if logs: - result["server.lantency_p50"] = format_latency( + result["server.latency_p50"] = format_latency( logs[0].split()[0], "ms" ) - result["server.lantency_p90"] = format_latency( + result["server.latency_p90"] = format_latency( logs[1].split()[0], "ms" ) result["server.rps"] = logs[2].split()[0] @@ -106,10 +86,14 @@ def get_process_usage_by_telegraf(vm_ip, process_name_list, start_time, end_time def install_istio( vm_ip, ssh_pool: SSHPool=ssh_pool_default ): + utils.install_k8s( + 
vm_ip=vm_ip, + ssh_pool=ssh_pool, + ) ssh = ssh_pool.get(vm_ip) cmd1 = "sudo istio-1.17.1/bin/istioctl install --set profile=demo -y --set components.cni.enabled=true" log.info(f"exec cmd: {cmd1}") - stdin, stdout, stderr = ssh.exec_command(cmd1) + _, stdout, stderr = ssh.exec_command(cmd1) log.info(stdout.readlines()) err = stderr.readlines() log.error(err) @@ -130,11 +114,11 @@ def install_istio( cmd2 = '''sudo kubectl label namespace default istio-injection=enabled && \ sudo kubectl apply -f istio-1.17.1/samples/bookinfo/platform/kube/bookinfo.yaml''' log.info(f"exec cmd: {cmd2}") - stdin, stdout, stderr = ssh.exec_command(cmd2) + _, stdout, stderr = ssh.exec_command(cmd2) log.info(stdout.readlines()) err = stderr.readlines() if err: - log.info(err) + log.error(err) assert False def init_istio( @@ -147,7 +131,7 @@ def init_istio( log.info( 'Wait for istio service status to be normal,about 300s, timeout is 600' ) - stdin, stdout, stderr = ssh.exec_command('kubectl get pods') + _, stdout, stderr = ssh.exec_command('kubectl get pods') logs = stdout.readlines() log.info(logs) res = True @@ -168,7 +152,7 @@ def init_istio( sudo kubectl apply -f istio-1.17.1/samples/bookinfo/networking/bookinfo-gateway.yaml && \ sudo istio-1.17.1/bin/istioctl analyze''' log.info(f"exec cmd: {cmd}") - stdin, stdout, stderr = ssh.exec_command(cmd) + _, stdout, stderr = ssh.exec_command(cmd) log.info(stdout.readlines()) log.error(stderr.readlines()) @@ -184,4 +168,41 @@ def get_istio_productpage_server_port(vm_ip, ssh_pool: SSHPool=ssh_pool_default) log.error( f"get port failed, err: {stderr.read().decode()}" ) - assert False \ No newline at end of file + assert False + +def deploy_traefik_by_docker_compose(vm_ip, ssh_pool: SSHPool=ssh_pool_default): + ssh_client = ssh_pool.get(vm_ip) + _, stdout, _ = ssh_client.exec_command("test -f docker-compose.yaml ") + exit_status = stdout.channel.recv_exit_status() + if exit_status == 0: + _, stdout, stderr = ssh_client.exec_command("sudo 
docker-compose -f docker-compose.yaml up -d") + output = stdout.read().decode() + err = stderr.read().decode() + exit_status = stdout.channel.recv_exit_status() + if exit_status == 0: + log.info(f"deploy traefik success, out:{output} {err}") + else: + log.error(f"deploy traefik failed, err: {err}") + assert False + else: + log.error(f"no found docker-compose.yaml") + assert False + +def add_whoami_host(vm_ip, dip, ssh_pool: SSHPool=ssh_pool_default): + ssh_client = ssh_pool.get(vm_ip) + _, stdout, _ = ssh_client.exec_command(f"echo '{dip} whoami.fw.com' | sudo tee -a /etc/hosts") + exit_status = stdout.channel.recv_exit_status() + if exit_status == 0: + log.info(f"add host success") + else: + log.error(f"add host failed") + +def init_go_server(vm_ip, ssh_pool: SSHPool=ssh_pool_default): + ssh = ssh_pool.get(vm_ip) + cmd = "cd go-server-sample-master&&docker-compose up -d&&sleep 3&&docker ps" + log.info(f"exec cmd: {cmd}") + _, stdout, stderr = ssh.exec_command(cmd) + log.info(stdout.readlines()) + err = stderr.readlines() + if err: + log.error(f"init go server error: {err}") diff --git a/evaluation/eval-runner/eval-runner/case/runner_test/test_print.py b/evaluation/eval-runner/eval-runner/case/runner_test/test_print.py index aec2be1487fca99d92a38ed946eafcc6d930cbcb..3c03d60cd15a5cb0ca0447ae5ab61353e41a8f6f 100644 --- a/evaluation/eval-runner/eval-runner/case/runner_test/test_print.py +++ b/evaluation/eval-runner/eval-runner/case/runner_test/test_print.py @@ -3,21 +3,21 @@ import allure,time from common.utils import step as allure_step from eval_lib.common.logger import get_logger from common.results import AgentResults - +from case.case_base import BaseCase case_info={} case_name = "performance_analysis_nginx_http_with_agent" log = get_logger() -class TestPrint(): +class TestPrint(BaseCase): @classmethod - def setup_class(cls): + def _setup_class(cls): cls.result = AgentResults(case_name=case_name) cls.result.add_case_info(info=case_info) pass @classmethod - def 
teardown_class(cls): + def _teardown_class(cls): cls.result.generate_yaml_file() pass diff --git a/evaluation/eval-runner/eval-runner/common/client.py b/evaluation/eval-runner/eval-runner/common/client.py index 84a7405122860ce51621b25ae16fbb27b11a5866..d8adb19731b20e307477b180c68324786f96db67 100644 --- a/evaluation/eval-runner/eval-runner/common/client.py +++ b/evaluation/eval-runner/eval-runner/common/client.py @@ -53,13 +53,13 @@ class LogClient(threading.Thread): if new_log_data: response = requests.request("POST", self.server_url, headers=headers, data=payload) if response.status_code == 200: - log.info("New log sent successfully.") + # log.info("New log sent successfully.") # 更新上一次发送的位置 self.last_position = file.tell() else: log.error(f"Failed to send new log error:{response.text}") - else: - log.info("No new log to send.") + # else: + # log.info("No new log to send.") def run(self): while not self._stop_event.is_set(): diff --git a/evaluation/eval-runner/eval-runner/common/const.py b/evaluation/eval-runner/eval-runner/common/const.py index 16f8d393da8bd7192aa7f69bae951b7bbf43ae78..fa44dba9ef9539f12eb4b6579374cd57dc726cae 100644 --- a/evaluation/eval-runner/eval-runner/common/const.py +++ b/evaluation/eval-runner/eval-runner/common/const.py @@ -7,6 +7,16 @@ API_PREFIX_RESULT_LOG = "/v1/evaluation/result/log" API_PREFIX_RESULT_ZIP = "/v1/evaluation/result/zip" CONTROLLER_HOST = "evaluation-controller" +AGENT_DEPLOY_TYPE_WORKLOAD = "workload" +AGENT_DEPLOY_TYPE_K8S = "k8s" +EB_RPM_URL = 'https://df-evaluation.oss-cn-beijing.aliyuncs.com/eval-bench/eval-bench-1.0-91.el7.x86_64.rpm' +# case occupiesdf environment type +CASE_TYPE_CONCURRENCY = 0 +CASE_TYPE_MONOPOLIZE = 1 +# deploy const +ali_dns_ip = "10.1.0.1" +ext_dns_server = "114.114.114.114" +ali_name_default = 'aliyun' \ No newline at end of file diff --git a/evaluation/eval-runner/eval-runner/common/module.py b/evaluation/eval-runner/eval-runner/common/module.py index 
34f4182868804bd0e4f8e23bded02275e67d5cb4..b8e952431cab07c8be0f5354c3019f0f30046b6a 100644 --- a/evaluation/eval-runner/eval-runner/common/module.py +++ b/evaluation/eval-runner/eval-runner/common/module.py @@ -4,3 +4,20 @@ from eval_lib.model.base import BaseStruct class AgentMeta(BaseStruct): KEYS = ["agent_ip", "version", "ssh_port", "ssh_username", "ssh_password"] + + +class DFServerMeta(BaseStruct): + + KEYS = ["server_ip", "ssh_port", "ssh_username", "ssh_password", "control_port", "query_port"] + +class DeployMeta(BaseStruct): + + KEYS = ["uuid", "instance_name", "type", "df_agent_image_tag", "df_server_image_tag", "amount"] + + def init(self, **kwargs): + self.df_agent_image_tag = kwargs.get("df_agent_image_tag", "latest") + self.df_server_image_tag = kwargs.get("df_server_image_tag", "latest") + self.amount = kwargs.get("amount", 1) + self.uuid = kwargs.get("uuid") + self.instance_name = kwargs.get("instance_name") + self.type = kwargs.get("type") diff --git a/evaluation/eval-runner/eval-runner/common/parse_config.py b/evaluation/eval-runner/eval-runner/common/parse_config.py new file mode 100644 index 0000000000000000000000000000000000000000..a3376962751612e1d003f961b3009677f572cc18 --- /dev/null +++ b/evaluation/eval-runner/eval-runner/common/parse_config.py @@ -0,0 +1,129 @@ +import os + +from eval_lib.common.logger import get_logger +from common.module import AgentMeta, DFServerMeta +from config import conf +from common import const +from platform_tools.aliyun.aliyun_sdk import Aliyun +from platform_tools.base import PlatformBase +from agent_tools.deepflow_agent.deepflow_agent import DeeepflowAgent +from agent_tools.base import AgentBase +from deploy.dfenvs import DFEnvs + +log = get_logger() + + +class ParseConfig(): + + @staticmethod + def choose_platform() -> PlatformBase: + platform_type = conf.platform_tools.get("type", "") + if platform_type == 'aliyun': + aliyun_info = conf.platform_tools.get("aliyun", {}) + if 'ALICLOUD_ACCESS_KEY' not in 
os.environ: + os.environ['ALICLOUD_ACCESS_KEY'] = aliyun_info['access_key'] + + if 'ALICLOUD_SECRET_KEY' not in os.environ: + os.environ['ALICLOUD_SECRET_KEY'] = aliyun_info['secret_key'] + + if 'ALICLOUD_REGION' not in os.environ: + os.environ['ALICLOUD_REGION'] = aliyun_info['region'] + return Aliyun + else: + # 如果没有选择有效的平台,则记录错误并返回 None + log.error("Invalid platform type specified.") + return None + + @staticmethod + def choose_agent() -> AgentBase: + agent_type = conf.case_params.agent_type + if agent_type == 'deepflowce': + return DeeepflowAgent + else: + # 如果没有选择有效的 agent,则记录错误并返回 None + log.error("Invalid agent type specified.") + assert False + + @staticmethod + def get_agent_meta_data(agent_ip): + agent_type = conf.case_params.agent_type + agent_conf = conf.agent_tools.get(agent_type) + agent_version = agent_conf['version'] + agent_meta = AgentMeta() + agent_meta.ssh_port = conf.runner_ssh_port + agent_meta.ssh_password = conf.runner_ssh_password + agent_meta.ssh_username = conf.runner_ssh_username + agent_meta.agent_ip = agent_ip + agent_meta.version = agent_version + return agent_meta + + @staticmethod + def create_redis_client_df_envs() -> DFEnvs: + return DFEnvs(conf.case_params.uuid) + + @staticmethod + def get_df_server_meta(case_type) -> DFServerMeta: + if case_type is None: + log.error("case type is none") + assert False + server_meta = DFServerMeta() + agent_type = conf.case_params.agent_type + platform_type = conf.platform_tools.get("type", "") + if platform_type: + if agent_type == 'deepflowce': + redis_df_envs = ParseConfig.create_redis_client_df_envs() + if case_type == const.CASE_TYPE_CONCURRENCY: + df_env_info = redis_df_envs.get_concurrency_df_env() + elif case_type == const.CASE_TYPE_MONOPOLIZE: + df_env_info = redis_df_envs.get_monopolize_df_env() + if not df_env_info: + log.error("get df env info failed") + assert False + server_meta.ssh_port = conf.runner_ssh_port + server_meta.ssh_password = conf.runner_ssh_password + 
server_meta.ssh_username = conf.runner_ssh_username + server_meta.server_ip = df_env_info["mgt_ip"] + server_meta.control_port = df_env_info[ + "server_controller_port"] + server_meta.query_port = df_env_info["server_query_port"] + else: + if agent_type == 'deepflowce': + agent_param = conf.agent_tools[agent_type] + server_meta.ssh_port = agent_param["server_ssh_port"] + server_meta.ssh_password = agent_param["server_ssh_password"] + server_meta.ssh_username = agent_param["server_ssh_username"] + server_meta.server_ip = agent_param["server_ip"] + server_meta.control_port = None + server_meta.query_port = None + if not server_meta.server_ip: + log.error("get server ip failed") + assert False + return server_meta + + @staticmethod + def release_df_env(server_ip): + agent_type = conf.case_params.agent_type + platform_type = conf.platform_tools.get("type", "") + if platform_type: + if agent_type == 'deepflowce': + redis_df_envs = ParseConfig.create_redis_client_df_envs() + redis_df_envs.release_df_env(server_ip) + else: + if agent_type == 'deepflowce': + return + + @staticmethod + def get_fixed_host_ip(instance_name: str): + host_info: dict = conf.fixed_host + index = instance_name.rfind("-") + ip = "" + if instance_name[:index] in host_info.keys(): + ip = host_info[instance_name[:index]] + else: + log.error("fixed host ip not found") + assert False + return ip + + @staticmethod + def get_case_short_uuid(): + return conf.case_params.uuid[:8] diff --git a/evaluation/eval-runner/eval-runner/common/results.py b/evaluation/eval-runner/eval-runner/common/results.py index a597e17216b5613753dcd6ba84bc7975ea438c59..b0d383e167dbbddfae254eda1117c14be069ada9 100644 --- a/evaluation/eval-runner/eval-runner/common/results.py +++ b/evaluation/eval-runner/eval-runner/common/results.py @@ -1,5 +1,7 @@ import yaml -from common.config import conf +from config import conf + + class ResultsBase: pass @@ -9,13 +11,16 @@ class AgentResults(ResultsBase): def __init__(self, case_name): 
class Traffic():
    """Run traffic tools (wrk2, eb, dnsperf, kubectl) on a remote host over SSH.

    Commands are executed through a connection taken from ``ssh_pool`` for
    ``traffic_ip``. For wrk2/eb/dnsperf the command is piped through a
    grep/awk filter so that three lines come back: two latency values and
    the measured requests (queries) per second.
    """

    # (command prefix, rate flag, shell filter appended to the command).
    # The filters are applied AFTER placeholder substitution so that awk's
    # own "$N" field references are never rewritten.
    _TOOLS = (
        (
            "wrk2", "-R",
            '''|grep -E "(Latency Distribution|Requests/sec)" -A 8 | grep -E "^( 50.000| 90.000|Requests/sec:)"| awk '{print $2}'\n'''
        ),
        (
            "eb", "-r",
            '''|awk 'END{print $14; print $16; print $8}'\n'''
        ),
        (
            "dnsperf", "-Q",
            '''|awk '/Queries per second/{rps=$4} /Average Latency/{avg_latecy=$4; max_latecy=substr($8, 1, length($8)-1)} END{print avg_latecy"s"; print max_latecy"s"; print rps}'\n'''
        ),
        ("kubectl", None, ''),
    )

    def __init__(self, traffic_ip, ssh_pool: SSHPool = ssh_pool_default):
        self.traffic_ip = traffic_ip
        self.ssh_pool = ssh_pool
        # Defined up front so the attribute exists before
        # set_traffic_commands() is called.
        self.traffic_commands = []

    def set_traffic_commands(self, traffic_commands: list):
        """Store the list of traffic commands on the instance."""
        self.traffic_commands = traffic_commands

    def _find_tool(self, command: str):
        """Return the ``_TOOLS`` entry whose prefix starts ``command``.

        Logs an error and returns None for an unsupported tool.
        """
        stripped = command.strip()
        for tool in self._TOOLS:
            if stripped.startswith(tool[0]):
                return tool
        log.error("traffic tool not support")
        return None

    def extract_command_rate(self, command: str) -> int:
        """Return the request rate given on the command line, or 0.

        The rate is the integer following ``-R`` (wrk2), ``-r`` (eb) or
        ``-Q`` (dnsperf). 0 is returned for kubectl, for unsupported tools
        and when the flag carries no literal number (e.g. an unsubstituted
        placeholder).
        """
        tool = self._find_tool(command)
        if tool is None or tool[1] is None:
            return 0
        value = self.extract_number_after_field(tool[1], command)
        # Always honour the declared int return type.
        return int(value) if value is not None else 0

    def analyze_command(self, command: str, **kwargs) -> str:
        """Build the shell command that is actually executed.

        Every ``$<key>`` placeholder in ``command`` is replaced by the
        matching keyword argument, then the tool-specific output filter is
        appended (empty for kubectl and unsupported tools).
        """
        tool = self._find_tool(command)
        log_filter = tool[2] if tool is not None else ''
        # Substitute longer keys first so "$rate" cannot clobber the head
        # of a "$rate_limit" placeholder.
        for key in sorted(kwargs, key=len, reverse=True):
            command = command.replace(f"${key}", str(kwargs[key]))
        return command + log_filter

    def get_traffic_tool_result(self, command: str, data: list) -> dict:
        """Turn the filtered tool output into a result dict.

        ``data`` holds the output lines of the filtered command: two
        latency values followed by the rps line. For wrk2/eb/dnsperf the
        dict carries ``server.latency_p50``, ``server.latency_p90`` (both
        converted to ms by ``utils.format_latency``), ``server.rps``,
        ``server.rate`` and ``case.command``. For kubectl, unsupported
        tools, or fewer than three output lines an empty dict is returned.
        """
        cmd = command.strip()
        rate = self.extract_command_rate(cmd)
        result = {}
        if cmd.startswith(("wrk2", "eb", "dnsperf")):
            # The pipeline's exit status is that of the trailing awk, so a
            # failed tool can still "succeed" with too little output.
            if len(data) < 3:
                log.error(
                    f"traffic tool output incomplete, expected 3 lines, got: {data}"
                )
                return result
            result["server.latency_p50"] = utils.format_latency(
                data[0].strip(), "ms")
            result["server.latency_p90"] = utils.format_latency(
                data[1].strip(), "ms")
            result["server.rps"] = data[2].strip()
            result["server.rate"] = rate
            result["case.command"] = cmd
        return result

    def generate_traffic(self, traffic_command, **kwargs):
        """Run ``traffic_command`` on the traffic host and return its result.

        Returns the dict built by ``get_traffic_tool_result`` when the
        remote command exits with status 0, otherwise an empty dict.
        """
        ssh = self.ssh_pool.get(self.traffic_ip)
        cmd = self.analyze_command(traffic_command, **kwargs)
        log.info(f"traffic command: {cmd}")
        _, stdout, stderr = ssh.exec_command(cmd)
        output = stdout.readlines()
        error = stderr.readlines()
        exit_status = stdout.channel.recv_exit_status()
        if exit_status != 0:
            log.error(f"traffic generate failed, err: {error}")
            return {}
        log.info("traffic generate success")
        # NOTE(review): the rate is parsed from the un-substituted template,
        # so a "-r $rate" style placeholder reports rate 0 - confirm intent.
        return self.get_traffic_tool_result(traffic_command, output)

    def install_eb_rpm(self, vm_ip=None):
        """Install the eval-bench (eb) rpm on ``vm_ip`` unless already there.

        ``vm_ip`` defaults to the traffic host. Raises AssertionError when
        the rpm output does not mention "eval-bench".
        """
        if not vm_ip:
            vm_ip = self.traffic_ip
        # Probe the host we are about to install on, not always traffic_ip.
        if common_utils.check_ssh_command(vm_ip, "eb --help", self.ssh_pool):
            log.info("eb is already installed")
            return
        log.info("start to install eb")
        ssh_client = self.ssh_pool.get(vm_ip)
        _, stdout, stderr = ssh_client.exec_command(
            f"curl -O {EB_RPM_URL} && rpm -ivh eval-bench-1.0-*")
        output = stdout.read().decode()
        error = stderr.read().decode()
        if "eval-bench" in output:
            log.info("Install eval-bench successfully")
        else:
            log.error(f"Install eval-bench failed, error is {error}")
            raise AssertionError(f"Install eval-bench failed: {error}")

    def extract_number_after_field(self, field, text):
        """Return the digits following ``field`` in ``text`` as a string.

        The first occurrence anywhere in ``text`` is used; optional
        whitespace may separate the field and the number. Returns None when
        there is no match.
        """
        # Escape the flag so it is always matched literally.
        match = re.search(r"{}\s*(\d+)".format(re.escape(field)), text)
        return match.group(1) if match else None

    def create_performance_pods(self, vm_ip):
        '''Login to the vtaps by SSH,create pod for UDP performance

        Returns a tuple (client pod name, client pod ip, server pod name,
        server pod ip); entries stay '' when the pod is not listed.
        '''
        utils.set_nameserver(vm_ip, self.ssh_pool)
        ssh_client = self.ssh_pool.get(vm_ip)
        # Raw strings: "\/" must reach sed verbatim and is not a valid
        # Python escape sequence.
        cmds = [
            "curl -O http://nexus.yunshan.net/repository/tools/automation/performance/performance_client.yaml",
            "curl -O http://nexus.yunshan.net/repository/tools/automation/performance/performance_server.yaml",
            r"sed -i 's/image: performance:lastest/image: dfcloud-image-registry.cn-beijing.cr.aliyuncs.com\/public\/agent_performance_test:latest/g' performance_client.yaml",
            r"sed -i 's/image: performance:lastest/image: dfcloud-image-registry.cn-beijing.cr.aliyuncs.com\/public\/agent_performance_test:latest/g' performance_server.yaml",
            "kubectl apply -f performance_client.yaml -f performance_server.yaml ",
            "kubectl wait --for=condition=Ready pod -l app=performance-client-pod --timeout=300s",
            "kubectl wait --for=condition=Ready pod -l app=performance-server-pod --timeout=300s",
        ]
        cmd = " && ".join(cmds)
        log.info("start create perfomance pods")
        _, stdout, _ = ssh_client.exec_command(cmd)
        logs = stdout.readlines()
        log.info(f"log: {logs}")
        pod_client_name = ''
        pod_client_ip = ''
        pod_server_name = ''
        pod_server_ip = ''
        _, stdout, _ = ssh_client.exec_command(
            "kubectl get pods -o wide | grep -E 'client|server'")
        pods_info = stdout.readlines()
        log.info(f"pods info: {pods_info}")
        for line in pods_info:
            fields = line.split()
            # Column 0 is the pod name and column 5 the pod IP; skip lines
            # too short to carry both instead of raising IndexError.
            if len(fields) < 6:
                continue
            if 'client' in line:
                pod_client_name, pod_client_ip = fields[0], fields[5]
            elif 'server' in line:
                pod_server_name, pod_server_ip = fields[0], fields[5]
        return pod_client_name, pod_client_ip, pod_server_name, pod_server_ip
conf.runner_ssh_username, + conf.runner_ssh_password, ) +redis_runner = RedisRunnerInfo() log = get_logger() + def get_case_uuid(): return conf.case_params.uuid[:8] + def step(title): """ 执行一个步骤,并根据Redis中存储的运行状态来决定步骤的执行流程。 @@ -41,46 +42,14 @@ def step(title): log.info(title) # 记录步骤开始的日志 while True: # 从Redis获取运行者信息 - runner_info_dict = redis_db.get_runner_info(uuid=conf.case_params.uuid) - log.info(runner_info_dict) - case_status = runner_info_dict.get("case-status", None) - case_control_status = runner_info_dict.get("case-control-status", None) - # 检查是否需要主动暂停用例 - if case_control_status == redis_const.CASE_STATUS_PAUSED: - log.info(f"case pause proactively") - # 如果当前状态不是暂停状态,则更新状态为暂停 - if case_status != redis_const.CASE_STATUS_PAUSED: - redis_db.update_runner_info( - uuid=conf.case_params.uuid, - info={"case-status": redis_const.CASE_STATUS_PAUSED} - ) - case_status = redis_const.CASE_STATUS_PAUSED - - # 检查是否需要主动取消用例 - elif case_control_status == redis_const.CASE_STATUS_CANCELLED: - log.info(f"case cancel proactively") - # 如果当前状态不是取消状态,则更新状态为取消 - if case_status != redis_const.CASE_STATUS_CANCELLED: - redis_db.update_runner_info( - uuid=conf.case_params.uuid, - info={"case-status": redis_const.CASE_STATUS_CANCELLED} - ) - case_status = redis_const.CASE_STATUS_CANCELLED + sync_status = redis_runner.sync_case_status(uuid=conf.case_params.uuid) + if sync_status == redis_const.CASE_STATUS_CANCELLED: log.info("case cancel success") - assert False - - elif case_control_status == redis_const.CASE_STATUS_RUNNING: - # 如果当前状态不是运行状态,则更新状态为运行 - if case_status != redis_const.CASE_STATUS_RUNNING: - redis_db.update_runner_info( - uuid=conf.case_params.uuid, - info={"case-status": redis_const.CASE_STATUS_RUNNING} - ) - case_status = redis_const.CASE_STATUS_RUNNING - - # 如果用例状态不是运行中,则每隔20秒检查一次;如果是,则结束循环 - if case_status != redis_const.CASE_STATUS_RUNNING: + assert False + # 如果同步后状态不是running,则每隔20秒检查一次;如果是,则结束循环 + if sync_status != redis_const.CASE_STATUS_RUNNING: time.sleep(20) + 
def get_fixed_host_ip(instance_name: str):
    """Look up the fixed host IP configured for ``instance_name``.

    The lookup key is ``instance_name`` with its last ``-<suffix>`` part
    removed (the whole name when it contains no '-'). The key is searched
    in ``conf.fixed_host``.

    Raises:
        AssertionError: when no entry exists for the key.
    """
    host_info = conf.fixed_host
    # The config default for fixed_host is an empty string, not a dict.
    if not isinstance(host_info, dict):
        host_info = {}
    host_key = instance_name
    if '-' in instance_name:
        host_key = instance_name.rpartition("-")[0]
    if host_key not in host_info:
        log.error("fixed host ip not found")
        # Raised explicitly: a bare `assert False` disappears under -O.
        raise AssertionError(f"fixed host ip not found for {host_key}")
    return host_info[host_key]
ssh_pool_default +) -> bool: ssh_client = ssh_pool.get(vm_ip) _, _, stderr = ssh_client.exec_command(command) err = stderr.read().decode() @@ -291,7 +268,9 @@ def check_ssh_command(vm_ip, command, ssh_pool: SSHPool = ssh_pool_default) -> b return True -def check_helm_chart(vm_ip, chart_name, namespace, ssh_pool: SSHPool = ssh_pool_default)->bool: +def check_helm_chart( + vm_ip, chart_name, namespace, ssh_pool: SSHPool = ssh_pool_default +) -> bool: ssh_client = ssh_pool.get(vm_ip) command = f"sudo helm list --short -n {namespace}" _, stdout, stderr = ssh_client.exec_command(command) @@ -306,6 +285,24 @@ def check_helm_chart(vm_ip, chart_name, namespace, ssh_pool: SSHPool = ssh_pool_ log.info(f"Chart '{chart_name}' not found in namespace '{namespace}'") return False + +def set_nameserver(vm_ip, ssh_pool: SSHPool = ssh_pool_default): + """ + 设置虚拟机的DNS服务器为阿里云的DNS服务器。 + + 参数: + - vm_ip: 虚拟机的IP地址。 + - ssh_pool: SSH连接池,默认为ssh_pool_default。 + """ + log.info(f"set nameserver to {ali_const.ali_dns_nameserver}") + command = f"echo 'nameserver {ali_const.ali_dns_nameserver}' > /etc/resolv.conf" + ssh_client = ssh_pool.get(vm_ip) + _, _, stderr = ssh_client.exec_command(command) + err = stderr.read().decode() + if err: + log.error(f"Error setting nameserver: {err}") + + def install_k8s(vm_ip, ssh_pool: SSHPool = ssh_pool_default): """ 安装Kubernetes集群。 @@ -329,9 +326,7 @@ def install_k8s(vm_ip, ssh_pool: SSHPool = ssh_pool_default): log.info("k8s installation completed") return else: - log.error( - f'k8s is not isntalled, will install k8s' - ) # k8s 已经安装 + log.error(f'k8s is not isntalled, will install k8s') # k8s 已经安装 try: # 构造安装Kubernetes和Calico的命令 cmd_install = '''sudo sealos run localhost/labring/kubernetes:v1.25.0 localhost/calico:v3.24.1 --single && \ @@ -436,3 +431,27 @@ def zip_dir(folder_path, output_path): file_path, os.path.relpath(file_path, os.path.dirname(folder_path)) ) + + +def format_latency(time_str, target_unit): + units = {'us': 0.000001, 'µs': 0.000001, 
'ms': 0.001, 's': 1} + time_str = time_str.strip() + try: + pattern = r"[^\d]*$" + match = re.search(pattern, time_str) + matched_position = match.start() + time_value, current_unit = time_str[:matched_position], match.group() + time_value = float(time_value) + + if current_unit not in units: + log.error(f"Invalid current time unit: {current_unit}") + return None + converted_time = round( + time_value * units[current_unit] / units[target_unit], 3 + ) + return str(converted_time) + target_unit + + except ValueError as e: + log.info(f"Error: {e}") + return None + \ No newline at end of file diff --git a/evaluation/eval-runner/eval-runner/config.py b/evaluation/eval-runner/eval-runner/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c72ce772223442a6cbe776bcede8c42fdb50e330 --- /dev/null +++ b/evaluation/eval-runner/eval-runner/config.py @@ -0,0 +1,77 @@ +import yaml +from common.const import RUNNER_CONFIG_PATH +from eval_lib.common.logger import get_logger +from eval_lib.model.base import CaseParams + +log = get_logger() + + +class CaseConf(): + + def __init__(self): + self.agent_tools = {} + self.platform_tools = {} + self.runner_data_dir = None + self.listen_port = None + self.case_params: CaseParams = None + self.parse() + + def parse(self): + try: + with open(RUNNER_CONFIG_PATH, 'r') as f: + yml: dict = yaml.safe_load(f) + self.listen_port = yml.get('listen_port', 10083) + self.agent_tools = yml.get("agent-tools") + self.platform_tools = yml.get("platform-tools") + self.runner_data_dir = yml.get("runner_data_dir") + self.runner_ssh_port = yml.get('runner_ssh_port', 22) + self.runner_ssh_username = yml.get('runner_ssh_username', "") + self.runner_ssh_password = yml.get('runner_ssh_password', "") + self.fixed_host = yml.get('fixed_host', "") + self.case_params = self.parse_case_params(yml) + self.parse_mysql(yml) + self.parse_redis(yml) + except Exception as e: + log.error(f"file:eval-runner.yaml, yaml parser Error: {e}") + + def 
def parse_redis(self, yml):
    """Read the ``redis`` section of the runner config into ``redis_*`` attributes.

    Each key falls back to its default when absent. A missing or empty
    ``redis`` section is treated as an empty mapping so that all defaults
    apply, instead of failing on ``None.get``.
    """
    self.redis = yml.get("redis") or {}
    self.redis_host = self.redis.get("host", "127.0.0.1")
    self.redis_port = self.redis.get("port", 6379)
    self.redis_password = self.redis.get("password", "root")
    self.redis_db = self.redis.get("db", "0")
    self.redis_max_connections = self.redis.get('redis_max_connections', 10)
"deepflow-ce": ENV_TYPE_DEEPFLOW_CE, + "instance": ENV_TYPE_INSTANCE, + "deepflow-ee": ENV_TYPE_DEEPFLOW_EE, +} \ No newline at end of file diff --git a/evaluation/eval-runner/eval-runner/deploy/deploy.py b/evaluation/eval-runner/eval-runner/deploy/deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..92a4452bcfd669ea5a5745436a4f4a33305af2f1 --- /dev/null +++ b/evaluation/eval-runner/eval-runner/deploy/deploy.py @@ -0,0 +1,79 @@ +import time + +from . import const +from deploy.workers.deploy_base import DeployBase +from eval_lib.common.logger import get_logger +from .dfenvs import DFEnvs +from common import utils +from .workers.deepflow_ce import DeployDeepflowCE +from common.module import DeployMeta + +log = get_logger() + + +class DeployWorker(): + + def __init__(self, meta: DeployMeta): + self.deploys = [] + self.meta = meta + self.env_uuid = self.meta.uuid + self.deploy_timeout = 30 * 60 + self.start_time = int(time.time()) + self.redis_envs = DFEnvs(self.env_uuid) + self.Platform = utils.choose_platform() + + def deploy(self) -> bool: + for index in range(self.meta.amount): + if self.meta.type == const.ENV_TYPE_DEEPFLOW_CE: + instance_name = f"{self.meta.instance_name}-{index}" + deploy_tool = DeployDeepflowCE( + uuid=self.env_uuid, + instance_name=instance_name, + df_server_image_tag=self.meta.df_server_image_tag, + df_agent_image_tag=self.meta.df_agent_image_tag + ) + deploy_tool.start() + self.deploys.append(deploy_tool) + else: + log.error(f"UNKNOWN TYPE: {self.meta.type}") + log.info( + f'{self.env_uuid} deepflow start deploy, number is {self.meta.amount}' + ) + while self.deploys: + for index, deploy in enumerate(self.deploys): + if deploy.is_alive(): + time_usage = int(time.time()) - self.start_time + if time_usage > self.deploy_timeout: + log.info(f"timeout! 
def release(self):
    """Remove this worker's environments from redis and delete instances.

    Every environment returned by ``redis_envs.get_all_envs()`` is removed
    from redis by name. Those whose ``deploy_status`` is not "complete" are
    additionally passed to ``Platform.delete_instances``. Finally the
    uuid-level redis entry is deleted.
    """
    # get_all_envs() returns None when the env uuid is unset.
    envs = self.redis_envs.get_all_envs() or {}
    instance_names = []
    for env in envs.values():
        self.redis_envs.delete_by_instance_name(env["name"])
        # NOTE(review): only not-yet-complete deployments are deleted on the
        # platform; confirm completed ones are released elsewhere.
        if env["deploy_status"] != "complete":
            instance_names.append(env["name"])
    if instance_names:
        log.info(f"delete instances: {instance_names}")
        if self.Platform is None:
            # choose_platform() returns None when no platform is configured.
            log.error(
                f"no platform configured, cannot delete instances: {instance_names}"
            )
        else:
            self.Platform.delete_instances(instance_names)
    self.redis_envs.delete_envs_by_uuid()
class DFEnvs(object):
    """Bookkeeping of deepflow environments stored in redis for one uuid.

    Redis access goes through ``DFEnvInfo``. Single redis operations are
    wrapped in the ``GLOBAL_LOCK`` lock; the allocate/release methods
    additionally take a lock named after the uuid. Every lock is released
    in a ``finally`` clause so that early returns and exceptions cannot
    leave it held.
    """

    def __init__(self, uuid=None):
        self.uuid = uuid
        self.df_envs = []
        self.redis = DFEnvInfo()

    def _with_global_lock(self, func, *args):
        """Call ``func(*args)`` while holding GLOBAL_LOCK and return its result."""
        lock = self.redis.acquire_lock(GLOBAL_LOCK)
        try:
            return func(*args)
        finally:
            self.redis.release_lock(GLOBAL_LOCK, lock)

    def update(self, name, **kwargs):
        """Apply ``kwargs`` to the env called ``name`` locally and in redis."""
        for env in self.df_envs:
            if env["name"] == name:
                env.update(kwargs)
        self._with_global_lock(self.redis.update, name, kwargs)

    def init(self, name, type):
        """Register a new env record in its initial state.

        The record is appended to the local list only for deepflow CE/EE
        types (given as int constant or as name); the local list is then
        written to redis under this uuid.
        """
        df_info = {
            "name": name,
            "deploy_status": ENV_DEPLOY_STATUS_INIT,
            "status": ENV_STATUS_FREE,
            "concurrency": "0",
            "mgt_ip": "",
            "server_query_port": "",
            "server_controller_port": "",
            "updated_time": str(int(time.time())),
            "reserved": "0",  # Reserve environments to prevent concurrent use cases from being blocked when all environments are occupied
            "type": type,
        }
        if type in [
            deploy_const.ENV_TYPE_DEEPFLOW_CE,
            deploy_const.ENV_TYPE_DEEPFLOW_EE,
            deploy_const.ENV_NAME_MAP[deploy_const.ENV_TYPE_DEEPFLOW_CE],
            deploy_const.ENV_NAME_MAP[deploy_const.ENV_TYPE_DEEPFLOW_EE]
        ]:
            self.df_envs.append(df_info)
        self._with_global_lock(self.redis.init_envs, self.df_envs, self.uuid)

    def update_reserved(self) -> int:
        '''Assign the "reserved" role of each completely deployed env.

        With more than two complete envs the first is marked free, the
        second concurrency-only and all others monopolize-only. The
        "reserved" value of every complete env is written to redis.

        return int: the number of available (completely deployed) df envs
        '''
        # get_all_envs() returns None when the uuid is unset.
        df_envs = self.get_all_envs() or {}
        envs = [
            df_env for df_env in df_envs.values()
            if df_env["deploy_status"] == ENV_DEPLOY_STATUS_COMPLETE
        ]
        if len(envs) > 2:
            envs[0]["reserved"] = ENV_RESERVED_FREE
            envs[1]["reserved"] = ENV_RESERVED_CONCURRENCY
            for env in envs[2:]:
                env["reserved"] = ENV_RESERVED_MONOPOLIZE
        for env in envs:
            self._with_global_lock(
                self.redis.update_env_reserved, env["name"], env["reserved"]
            )
        return len(envs)

    def get_df_envs(self):
        """Return this uuid's deepflow CE/EE envs ordered by load.

        Simple load balancing: envs are sorted by concurrency from lowest
        to highest, monopolized envs last. Returns None when the uuid is
        unset.
        """
        if not self.uuid:
            return
        envs = self._with_global_lock(self.redis.get_prefix_envs, self.uuid)
        deepflow_types = [
            deploy_const.ENV_NAME_MAP[deploy_const.ENV_TYPE_DEEPFLOW_CE],
            deploy_const.ENV_NAME_MAP[deploy_const.ENV_TYPE_DEEPFLOW_EE]
        ]
        envs = {
            name: env
            for name, env in envs.items() if env["type"] in deepflow_types
        }
        keys = sorted(envs, key=lambda x: get_concurrency(envs, x))
        return {key: envs[key] for key in keys}

    def get_all_envs(self):
        """Return every env stored for this uuid, or None when uuid is unset."""
        if not self.uuid:
            return
        return self._with_global_lock(self.redis.get_prefix_envs, self.uuid)

    def update_env_status(self, env_name, status):
        """Write ``status`` for ``env_name`` to redis."""
        self._with_global_lock(self.redis.update_env_status, env_name, status)

    def update_env_concurrency(self, env_name, concurrency):
        """Write ``concurrency`` for ``env_name`` to redis."""
        self._with_global_lock(
            self.redis.update_env_concurrency, env_name, concurrency
        )

    def list_env_uuids(self):
        """Return the result of ``DFEnvInfo.get_envs_key_info()``."""
        return self._with_global_lock(self.redis.get_envs_key_info)

    def delete_envs_by_uuid(self):
        """Delete the redis entry of this uuid."""
        self._with_global_lock(self.redis.delete_env, self.uuid)

    def delete_by_instance_name(self, instance_name):
        """Delete one env by name; drop the uuid entry once no env is left."""
        lock = self.redis.acquire_lock(GLOBAL_LOCK)
        try:
            self.redis.delete_by_instance_name(self.uuid, instance_name)
            if not self.redis.get_prefix_envs(self.uuid):
                self.redis.delete_env(self.uuid)
        finally:
            self.redis.release_lock(GLOBAL_LOCK, lock)

    def get_monopolize_df_env(self):
        """Wait for an idle env and mark it as monopolized.

        An env qualifies when it is completely deployed, not reserved for
        concurrent use, has status free and concurrency "0". Polls every
        10 seconds.

        Returns:
            The env dict as read before the status update, or None when no
            env exists, none is completely deployed, or
            ``GET_DF_ENV_TIMEOUT`` seconds have elapsed.
        """
        start_time = int(time.time())
        while True:
            time_consumed = int(time.time()) - start_time
            identifier = self.redis.acquire_lock(self.uuid)
            try:
                envs = self.get_df_envs()
                if not envs:
                    log.error(f"No normal deepflow found !")
                    return None
                available_env = False
                for name, env in envs.items():
                    if env["deploy_status"] != ENV_DEPLOY_STATUS_COMPLETE:
                        continue
                    available_env = True
                    if env["reserved"] == ENV_RESERVED_CONCURRENCY:
                        continue
                    # Skip envs that are monopolized or already occupied by
                    # concurrent use cases.
                    if env["status"] != ENV_STATUS_FREE or env[
                            "concurrency"] != "0":
                        continue
                    self.update_env_status(name, ENV_STATUS_MONOPOLIZE)
                    return env
            finally:
                # Released on every path, including the returns above.
                self.redis.release_lock(self.uuid, identifier)
            if not available_env:
                log.error(f"wait env error, No normal deepflow found !")
                return None
            if time_consumed >= GET_DF_ENV_TIMEOUT / 2:
                log.info(
                    f"wait env {time_consumed}s! deepflow maybe deploy failed"
                )
            if time_consumed >= GET_DF_ENV_TIMEOUT:
                log.error(f"Time out! No normal deepflow found !")
                return None
            time.sleep(10)

    def get_concurrency_df_env(self):
        """Wait for a shareable env and increase its concurrency by one.

        An env qualifies when it is completely deployed, not reserved for
        monopolized use and not currently monopolized. Polls every 10
        seconds.

        Returns:
            The env dict as read before the concurrency update, or None
            when no env exists, none is completely deployed, or
            ``GET_DF_ENV_TIMEOUT`` seconds have elapsed.
        """
        start_time = int(time.time())
        while True:
            time_consumed = int(time.time()) - start_time
            identifier = self.redis.acquire_lock(self.uuid)
            try:
                envs = self.get_df_envs()
                if not envs:
                    log.error(f"No normal deepflow found !")
                    return None
                available_env = False
                for name, env in envs.items():
                    if env["deploy_status"] != ENV_DEPLOY_STATUS_COMPLETE:
                        continue
                    available_env = True
                    if env["reserved"] == ENV_RESERVED_MONOPOLIZE:
                        continue
                    # The env has been monopolized
                    if env["status"] == ENV_STATUS_MONOPOLIZE:
                        continue
                    # concurrency+1
                    concurrency = str(int(env["concurrency"]) + 1)
                    self.update_env_concurrency(name, concurrency)
                    return env
            finally:
                # Released on every path, including the returns above.
                self.redis.release_lock(self.uuid, identifier)
            if not available_env:
                log.error(f"wait env error, No normal deepflow found !")
                return None
            if time_consumed >= GET_DF_ENV_TIMEOUT:
                log.error(f"Time out! No normal deepflow found !")
                return None
            if time_consumed >= GET_DF_ENV_TIMEOUT / 2:
                log.info(
                    f"wait env {time_consumed}s! deepflow maybe deploy failed"
                )
            time.sleep(10)

    def release_df_env(self, mgt_ip):
        """Give back the env whose ``mgt_ip`` matches.

        A monopolized env becomes free; otherwise its concurrency is
        decreased by one. Only the first matching env is processed.
        """
        identifier = self.redis.acquire_lock(self.uuid)
        try:
            # get_df_envs() returns None when the uuid is unset.
            envs = self.get_df_envs() or {}
            for name, env in envs.items():
                if env["mgt_ip"] != mgt_ip:
                    continue
                env_concurrency = int(env["concurrency"])
                if env["status"] == ENV_STATUS_MONOPOLIZE:
                    self.update_env_status(name, ENV_STATUS_FREE)
                elif env_concurrency > 0:
                    self.update_env_concurrency(name, str(env_concurrency - 1))
                else:
                    log.error("env release error, check it")
                break
        finally:
            self.redis.release_lock(self.uuid, identifier)
class Message(Model):
    """Deploy request: an env uuid, an action code and the envs to act on."""

    env_uuid = StringType(serialized_name='ENV_UUID', required=True)
    action = IntType(serialized_name='ACTION', required=True)
    envs = ListType(ModelType(Env), serialized_name='ENVS', default=[])

    def validate_envs(self, data, value):
        """Reject duplicate instance names on a deploy action.

        For ``ACTION_DEPLOY`` each env is validated and its effective
        instance name computed: the given name when the env is of type
        instance or already contains the env uuid, otherwise
        ``<instance_name>-<env_uuid>``. A ValidationError is raised when
        that name repeats within the message or already exists in redis
        for this uuid.
        """
        if value and data["action"] == const.ACTION_DEPLOY:
            # DFEnvs exposes get_all_envs(); it returns None when the uuid
            # is unset.
            existing_envs = DFEnvs(data["env_uuid"]).get_all_envs() or {}
            redis_env_names = existing_envs.keys()
            instance_names = []
            for env in value:
                env.validate()
                if env.type == const.ENV_TYPE_INSTANCE or data[
                        'env_uuid'] in env.instance_name:
                    name = env.instance_name
                else:
                    name = f"{env.instance_name}-{data['env_uuid']}"
                if name in instance_names or name in redis_env_names:
                    raise ValidationError(f"instance_name: {name} duplicate")
                instance_names.append(name)
        return value
def exec_cmd(self, ssh, cmd="", err_assert=False):
    """Run ``cmd`` on ``ssh`` and return ``(stdout_lines, stderr_lines)``.

    Args:
        ssh: connection object providing ``exec_command``.
        cmd: shell command; must not be empty.
        err_assert: when True, any stderr output raises AssertionError.

    Raises:
        AssertionError: for an empty command, or for stderr output when
            ``err_assert`` is True.
    """
    # Failures are raised explicitly: `assert` statements are removed when
    # Python runs with -O, which would silently disable these checks.
    if not cmd:
        log.error("The command is None")
        raise AssertionError("The command is None")
    log.info("exec_cmd::cmd ==> {}".format(cmd))
    _, stdout, stderr = ssh.exec_command(cmd)
    logs = stdout.readlines()
    log.info("exec_cmd::logs ==> {}".format(logs))
    err = stderr.readlines()
    if err:
        log.error(f"exec_cmd::err ==> {err}")
        if err_assert:
            raise AssertionError(f"exec_cmd::err ==> {err}")
    return logs, err
except Exception as e: + log.error(f"get port error:{e}") + time.sleep(3) + loop_num -= 1 + self.df_server_query_port = query_api_port + return query_api_port + + def get_controller_port(self): + '''Login to the deepflow by SSH, Deepflow gets controller_port, parameter: + self.df_mgt_ip; The ip of deepflow + username; login username + password; login password + ssh_port; port + ''' + ssh = self._ssh_pool.get(self.df_mgt_ip) + controller_api_port = None + loop_num = 100 + while loop_num: + try: + log.info("get controller port") + stdin, stdout, stderr = ssh.exec_command( + '''kubectl get svc -n deepflow|grep -o "20417:[0-9]*" | cut -d ":" -f 2 + ''', timeout=10 + ) + controller_api_port = stdout.readlines()[0].strip() + log.info(f"controller_port :{controller_api_port}") + if controller_api_port: + break + except Exception as e: + log.error(f"get port error:{e}") + time.sleep(3) + loop_num -= 1 + self.df_server_controller_port = controller_api_port + return controller_api_port + + def add_deepflow_server_dns( + self, dns_server=const.ext_dns_server + ): + '''add dns server for deepflow_server, parameter; + dns_server; default, type is string + username; default, type is string + password; default, type is string + ssh_port; default, type is string + self.df_mgt_ip; default, type is string + ''' + ssh = self._ssh_pool.get(self.df_mgt_ip) + stdin, stdout, stderr = ssh.exec_command( + '''kubectl get deployment deepflow-server -n deepflow -o yaml > deepflow-server-dns.yaml && \ + sed -i "/dnsPolicy: ClusterFirst/a\ dnsConfig:" deepflow-server-dns.yaml && \ + sed -i "/ dnsConfig:/a\ nameservers:" deepflow-server-dns.yaml && \ + sed -i "/ nameservers:/a\ - {}" deepflow-server-dns.yaml &&\ + kubectl apply -f deepflow-server-dns.yaml''' + .format(dns_server) + ) + logs = stdout.readlines() + log.info(logs) + err = stderr.readlines() + if err: + log.error(f"dns add error: {err}") + return + + def install_deepflow_ctl( + self + ): + cmds = [ + "curl -o /usr/bin/deepflow-ctl 
https://deepflow-ce.oss-cn-beijing.aliyuncs.com/bin/ctl/stable/linux/$(arch | sed 's|x86_64|amd64|' | sed 's|aarch64|arm64|')/deepflow-ctl", + "chmod a+x /usr/bin/deepflow-ctl", + "deepflow_server_pod_ip=$(kubectl -n deepflow get pods -o wide | grep deepflow-server | awk '{print $6}')", + "deepflow-ctl -i $deepflow_server_pod_ip ingester profiler on" + ] + log.info("start install deepflow-ctl") + ssh = self._ssh_pool.get(self.df_mgt_ip) + for cmd in cmds: + log.info(f"exec cmd: {cmd}") + _, stdout, stderr = ssh.exec_command(cmd) + err = stderr.readlines() + if err: + log.error(err) + log.info(stdout.readlines()) + return + + def check_aliyun_cloud_isexist(self): + '''determine whether Aliyun Cloud platform exists, parameter; + self.df_mgt_ip; default, type is string + ''' + result = False + domain_url = 'http://' + self.df_mgt_ip + ':' + str( + self.df_server_controller_port + ) + '/v2/domains' + loop_num = 30 + while loop_num: + try: + res = requests.get(url=domain_url) + if res.status_code == 200: + res_json = res.json()['DATA'] + for i in res_json: + if i['TYPE'] == 9: + result = True + log.info('aliyun cloud is exist') + break + else: + log.info("aliyun cloud doesn't exist") + break + except Exception as e: + loop_num -= 1 + log.error(f"get domain error: {e}") + time.sleep(10) + return result + + def check_aliyun_cloud_status( + self, cloud_name=const.ali_name_default + ): + '''check aliyun cloud status, parameter; + cloud_name, required, type is string + self.df_mgt_ip, default, type is string + ''' + result = False + loop_num = 120 + while loop_num: + try: + domain_url = 'http://' + self.df_mgt_ip + ':' + str( + self.df_server_controller_port + ) + '/v2/domains' + res = requests.get(url=domain_url) + if res.status_code == 200: + res_json = res.json()['DATA'] + log.info(f'aliyun domain info: {res_json}') + for i in res_json: + if i['NAME'] == cloud_name and i['TYPE'] == 9 and i[ + 'ENABLED'] == 1 and len(i['SYNCED_AT']) > 0: + if i['STATE'] == 1: + log.info( + 
def add_aliyun_cloud_platform(self, cloud_name=const.ali_name_default):
    '''Register an Aliyun cloud platform (domain) on the controller.

    POSTs a domain definition of ``TYPE`` 9 to ``/v1/domains/`` on
    ``self.df_mgt_ip`` / ``self.df_server_controller_port``.  Credentials
    are read from the ``ALICLOUD_ACCESS_KEY`` / ``ALICLOUD_SECRET_KEY``
    environment variables.

    Up to 30 attempts are made, 10 s apart.  Every failed attempt consumes
    exactly one retry: previously an undecodable error body was counted
    twice, which could make the counter skip zero and loop forever.

    Args:
        cloud_name (str): name given to the new domain.

    Returns:
        bool: True once the controller answers 200, False if every attempt
        failed.
    '''
    loop_num = 30
    while loop_num > 0:
        try:
            domain_url = 'http://' + self.df_mgt_ip + ':' + str(
                self.df_server_controller_port
            ) + '/v1/domains/'
            header = {'content-type': 'application/json'}
            data = {
                "TYPE": 9,
                "NAME": "{}".format(cloud_name),
                "ICON_ID": 8,
                "CONFIG": {
                    "region_uuid": "ffffffff-ffff-ffff-ffff-ffffffffffff",
                    "controller_ip": "{}".format(self.df_mgt_ip),
                    "secret_id": "{}".format(
                        os.getenv('ALICLOUD_ACCESS_KEY')
                    ),
                    "secret_key": "{}".format(
                        os.getenv('ALICLOUD_SECRET_KEY')
                    ),
                    "include_regions": "华北2(北京)",
                    "exclude_regions": "华南3(广州),欧洲中部 1 (法兰克福),中东东部 1 (迪拜),英国 (伦敦),美国西部 1 (硅谷),美国东部 1 (弗吉尼亚),亚太南部 1 (孟买),亚太东南 3 (吉隆坡),亚太东南 5 (雅加达),亚太东南 2 (悉尼),亚太东南 1 (新加坡),亚太东北 1 (东京),香港,华北6(乌兰察布),华东5(南京-本地地域)",
                    "k8s_confs": ""
                }
            }
            data = json.dumps(data)
            res = requests.post(url=domain_url, headers=header, data=data)
            if res.status_code == 200:
                log.info('add aliyun cloud successfully')
                return True
            log.info(f'add aliyun cloud failed, status_code err: {res}')
            try:
                log.info(res.json())
            except ValueError:
                # Error body is not JSON (e.g. a proxy error page); log it raw
                # instead of letting the decode error count as a second failure.
                log.info(res.text)
        except Exception as err:
            log.error(
                'add aliyun cloud failed, log info is {}'.format(err)
            )
        # Single bookkeeping point so the counter always reaches zero.
        loop_num -= 1
        time.sleep(10)
    return False
def check_pod_running(
    self, pod_name=None, namespace="deepflow", ip=None, timeout=3 * 60
):
    '''Poll ``kubectl get pods`` over SSH until the selected pods are ready.

    Args:
        pod_name (str | None): substring used to filter pods with ``grep``;
            when omitted every pod in the namespace is checked.
        namespace (str): Kubernetes namespace to inspect.
        ip (str | None): host to run kubectl on; defaults to
            ``self.df_mgt_ip``.
        timeout (int): overall budget in seconds, polled in 10 s steps.

    Returns:
        bool: True as soon as at least one pod is listed and every listed
        pod has STATUS ``Running`` and a READY count that does not start
        with ``0/``; False when the budget is exhausted.

    Fixes over the previous version:
      * the command was a plain string, so a literal ``{namespace}`` was sent;
      * the ``pod_name`` test was inverted (it grepped for ``None``);
      * ``'0/' in ready`` rejected healthy pods such as ``10/10``;
      * an empty listing was reported as healthy;
      * with ``grep`` the header line was already gone, so ``logs[1:]``
        skipped the first pod -- ``--no-headers`` is used instead;
      * a timeout that is not a multiple of 10 never reached zero.
    '''
    import shlex

    if not ip:
        ip = self.df_mgt_ip
    cmd = f"kubectl get pods -n {shlex.quote(namespace)} --no-headers"
    if pod_name:
        cmd = f"{cmd}|grep {shlex.quote(pod_name)}"
    ssh = self._ssh_pool.get(ip)

    logs = []
    while timeout > 0:
        log.info(
            f'Wait for {namespace} pod {pod_name} status to be normal, timeout is {timeout}'
        )
        timeout -= 10
        time.sleep(10)
        _, stdout, _ = ssh.exec_command(cmd)
        logs = stdout.readlines()
        rows = [line.split() for line in logs if line.strip()]
        for line in logs:
            log.info("get pod ========= > {}".format(line))
        # Columns are NAME READY STATUS ...; require at least one pod so a
        # failed or empty listing is never mistaken for "everything running".
        all_ready = bool(rows) and all(
            len(cols) >= 3
            and 'Running' in cols[2]
            and not cols[1].startswith('0/')
            for cols in rows
        )
        if all_ready:
            log.info(f'{namespace} pod {pod_name} is normal')
            return True
    log.error(f"{namespace} pod {pod_name} is not running \n {logs}")
    return False
100644 index 0000000000000000000000000000000000000000..642cacd8e589aff7bb443c07fc2a7d5ab6c6fe9e --- /dev/null +++ b/evaluation/eval-runner/eval-runner/deploy/workers/deepflow_ce.py @@ -0,0 +1,359 @@ + +import time +import requests + +from datetime import datetime +from urllib.parse import urlencode + +from platform_tools.aliyun import ali_const +from common import utils +from deploy.utils import DeployUtils +from deploy.workers.deploy_base import DeployBase +from deploy import const +from common import const as common_const +from eval_lib.common.logger import get_logger +from common.utils import ssh_pool_default + +log = get_logger() + + +class DeployDeepflowCE(DeployBase): + + type = const.ENV_NAME_MAP[const.ENV_TYPE_DEEPFLOW_CE] + + def __init__( + self, uuid, instance_name="", + df_server_image_tag="latest", + df_agent_image_tag="latest" + ): + super().__init__(uuid) + self._ssh_pool = ssh_pool_default + self.Platform = utils.choose_platform() + self.mgt_ip = "" + self.server_query_port = 0 + self.server_controller_port = 0 + self.instance_name = f"{instance_name}-{self.uuid[:8]}" if self.uuid[:8] not in instance_name else instance_name + self.df_server_image_tag = df_server_image_tag + self.df_agent_image_tag = df_agent_image_tag + self.deploy_utils = DeployUtils( + df_mgt_ip=self.mgt_ip, + df_server_image_tag=df_server_image_tag, + df_agent_image_tag=df_agent_image_tag, + ssh_pool=self._ssh_pool + ) + self.status = "init" + + def create_instance(self, instance_name: str) -> str: + self.status = "creating aliyun instance" + instance_info=self.Platform.create_instances( + instance_names=[instance_name], + image_id=ali_const.ali_image_id_deepflow_default, + instance_type=ali_const.ali_instance_type_t5_c1m2_2x_large + ) + self.mgt_ip = instance_info[instance_name] + self.deploy_utils.df_mgt_ip = self.mgt_ip + self.status = f"create aliyun instance complete" + log.info(f"create instance, ip: {self.mgt_ip}") + return self.mgt_ip + + def to_redis_envs_info(self): + 
return { + "name": self.instance_name, + "deploy_status": self.status, + "mgt_ip": self.mgt_ip, + "server_query_port": self.server_query_port, + "server_controller_port": self.server_controller_port, + "type": self.type, + } + + def install_deepflow_ce_latest( + self, deepflow_mgt_ip= "", + df_server_image_tag="", + df_agent_image_tag="", + ): + res = False + deepflow_mgt_ip = deepflow_mgt_ip if deepflow_mgt_ip else self.mgt_ip + # assigned feature branch + utils.upload_files( + vm_ip=deepflow_mgt_ip, + local_path=f"{common_const.LOCAL_PATH}/deploy/file/values-custom-latest.yaml", + remote_path="values-custom-latest.yaml", + ssh_pool=self._ssh_pool + ) + ssh_client = self._ssh_pool.get(deepflow_mgt_ip) + if df_server_image_tag: + ssh_client.exec_command( + f''' sed -i "7s/latest/{df_server_image_tag}/" values-custom-latest.yaml''' + ) + if df_agent_image_tag: + ssh_client.exec_command( + f'''sed -i "10s/latest/{df_agent_image_tag}/" values-custom-latest.yaml''' + ) + self.status = "start install deepflow-ce" + version = "" + if "v6.1" in df_server_image_tag or "v6.1" in df_agent_image_tag: + version = "--version 6.1.8" + elif "v6.2" in df_server_image_tag or "v6.2" in df_agent_image_tag: + version = "--version 6.2.6" + elif "v6.3" in df_server_image_tag or "v6.3" in df_agent_image_tag: + version = "--version 6.3.9" + elif "v6.4" in df_server_image_tag or "v6.4" in df_agent_image_tag: + version = "--version 6.4.9" + try: + log.info(f'Start install deepflow-ce ip:{deepflow_mgt_ip}') + cmds = [ + "helm repo update deepflow_stable", + f"helm install deepflow -n deepflow deepflow_stable/deepflow {version} --create-namespace --set mysql.service.type=NodePort -f values-custom-latest.yaml" + ] + for cmd in cmds: + _, stdout, stderr = ssh_client.exec_command(cmd) + err = stderr.readlines() + log.info(f"exec cmd: {cmd}") + if len(err) > 0: + log.error(f'Install Deepflow-ce Error: {err}') + else: + log.info(stdout.readlines()) + except Exception as e: + log.error(f'Install 
Error: {e}') + self.status = "waiting deepflow services normal" + # if df_agent_image_tag in [ + # "v6.1", "v6.2", "v6.3" + # ] or df_server_image_tag in ["v6.1", "v6.2", "v6.3"]: + # self.deploy_utils.add_deepflow_server_dns() + try: + log.info( + 'DeepFlow is completed, waiting for the service status to be normal' + ) + wait_num = int(10 * 60 / 10) + while wait_num: + log.info( + 'Wait for DeepFlow service status to be normal,about 10s, timeout is 600' + ) + wait_num -= 1 + time.sleep(10) + stdin, stdout, stderr = ssh_client.exec_command( + 'kubectl get pods -n deepflow' + ) + logs = stdout.readlines() + res = True + for k in logs[1:]: + log.info("get pod ========= > {}".format(k)) + if 'Running' not in k.split( + )[2] and 'deepflow-server' not in k.split()[2]: + pass + if 'Running' not in k.split()[2] or '0/' in k.split()[1]: + res = False + break + if res == True: + log.info('DeepFlow services is normal') + self.status = "install deepflow-ce complete" + self.end_wait_running_time = datetime.now() + break + except Exception as err: + log.error(err) + assert False + return res + + def upgrade_deepflow_ce( + self, + df_server_image_tag=None, + df_agent_image_tag=None + ): + res = False + if not df_server_image_tag: + df_server_image_tag = "latest" + if not df_agent_image_tag: + df_agent_image_tag = "latest" + version = "" + if "v6.1" in df_server_image_tag or "v6.1" in df_agent_image_tag: + version = "--version 6.1.8" + elif "v6.2" in df_server_image_tag or "v6.2" in df_agent_image_tag: + version = "--version 6.2.6" + elif "v6.3" in df_server_image_tag or "v6.3" in df_agent_image_tag: + version = "--version 6.3.9" + elif "v6.4" in df_server_image_tag or "v6.4" in df_agent_image_tag: + version = "--version 6.4.9" + ssh_client = self._ssh_pool.get(self.mgt_ip) + _, _, stderr = ssh_client.exec_command( + f'''sed -i "18s/v6.1/latest/g" values-custom-stable.yaml''' + ) + err = stderr.readlines() + if err: + log.error(err) + + _, _, stderr = ssh_client.exec_command( + 
f'''sed -i "16s/v6.1/{df_server_image_tag}/g" values-custom-stable.yaml''' + ) + err = stderr.readlines() + if err: + log.error(err) + + _, _, stderr = ssh_client.exec_command( + f'''sed -i "21s/v6.1/{df_agent_image_tag}/g" values-custom-stable.yaml''' + ) + err = stderr.readlines() + if err: + log.error(err) + _, _, stderr = ssh_client.exec_command( + f'''sed -i "\$a\server:\\n nameservers:\\n - {common_const.ext_dns_server}" values-custom-stable.yaml''' + ) + err = stderr.readlines() + if err: + log.error(err) + extraVolumeMounts = ''' extraVolumeMounts:\\n - name: log-volume\\n mountPath: /var/log/deepflow\\n readOnly: false\\n hostPath: /root/deepflow''' + _, _, stderr_log = ssh_client.exec_command( + f'''sed -i "\$a\{extraVolumeMounts}" values-custom-stable.yaml && mkdir deepflow''' + ) + err_log = stderr_log.readlines() + if err_log: + log.error(err_log) + log.info(f'Start upgrade deepflow-ce ip:{self.mgt_ip}') + _, stdout, stderr = ssh_client.exec_command( + f'''helm repo update deepflow_stable && helm upgrade deepflow {version} -n deepflow deepflow_stable/deepflow -f values-custom-stable.yaml''' + ) + err = stderr.readlines() + if err: + log.error(f'Install Deepflow-ce Error: {err}') + assert False + try: + if 'Grafana auth: admin:deepflow' in stdout.readlines()[-1]: + log.info( + 'DeepFlow is completed, waiting for the service status to be normal' + ) + wait_num = int(30 * 60 / 10) + while wait_num: + log.info( + 'Wait for DeepFlow service status to be normal,about 1s, timeout is 1800' + ) + wait_num -= 1 + time.sleep(10) + _, stdout, stderr = ssh_client.exec_command( + 'kubectl get pods -n deepflow' + ) + logs = stdout.readlines() + res = True + for k in logs[1:-1]: + log.info("get pod ========= > {}".format(k)) + if 'Running' not in k.split( + )[2] and 'deepflow-server' not in k.split()[2]: + pass + if 'Running' not in k.split()[2]: + res = False + break + if res == True: + log.info('DeepFlow services is normal') + # self.end_wait_running_time = 
def query_first_data(self, filters='', timeout=30):
    """Ask the querier for the oldest ``vtap_flow_port`` row.

    Sends a form-encoded POST to ``/v1/query/`` on ``self.mgt_ip`` /
    ``self.server_query_port`` against the ``flow_metrics`` database.

    Args:
        filters (str): SQL fragment (for example a ``where`` clause) that
            is inserted between the table name and ``order by``.
        timeout (float): seconds to wait for the querier before the
            request is abandoned.  Without a timeout a stalled connection
            would block the caller's retry loop indefinitely.

    Returns:
        tuple: ``(decoded JSON body, HTTP status code)``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            ``timeout`` expires.
        ValueError: if the response body is not valid JSON.
    """
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    data = {
        'db': "flow_metrics",
        'sql': f"select pod_node from vtap_flow_port {filters} order by time limit 1",
        'data_precision': "1s"
    }
    data = urlencode(data, encoding='utf-8')
    response = requests.post(
        url='http://%s:%s/v1/query/' % (self.mgt_ip, self.server_query_port),
        headers=headers,
        data=data,
        timeout=timeout,
    )
    return response.json(), response.status_code
self.deploy_utils.add_aliyun_cloud_platform() + self.status = "check aliyun cloud status" + return self.deploy_utils.check_aliyun_cloud_status() + else: + return True + + def install_deepflow_ce(self): + with self.step("create instance"): + self.mgt_ip = self.create_instance(self.instance_name) + log.info(f"DeepFlow IP is {self.mgt_ip}") + time.sleep(20) + + with self.step("install k8s"): + self.status = "install k8s" + utils.install_k8s(self.mgt_ip) + + with self.step("install deepflow ce"): + self.install_deepflow_ce_latest( + deepflow_mgt_ip=self.mgt_ip, + df_server_image_tag=self.df_server_image_tag, + df_agent_image_tag=self.df_agent_image_tag + ) + with self.step("check first data"): + if self.check_first_data() is False: + log.error("check_first_data failed! deploy failed!") + self.status = "check first data failed" + return + with self.step("install deepflow ctl"): + self.deploy_utils.install_deepflow_ctl() + with self.step("add aliyun platform"): + if self.add_aliyun_platform() is False: + log.error("add_aliyun_platform failed! 
class DeployBase(Process):
    """Base class for deployment workers that run in their own process.

    Every assignment to ``status`` is mirrored to the environment record
    kept by ``DFEnvs``; ``step`` gates each deployment stage on the case
    status reported by ``RedisRunnerInfo``.
    """

    # Environment type reported to Redis; subclasses override it.
    type = ""

    def __init__(self, uuid):
        super().__init__()
        self.uuid = uuid
        self._status = ""
        self.redis_runner = RedisRunnerInfo()
        self.redis_envs = DFEnvs(uuid)

    @property
    def status(self):
        """Most recently assigned deployment status string."""
        return self._status

    @status.setter
    def status(self, new_status):
        # Store locally first so to_redis_envs_info() already sees the new
        # value, then publish: "init" creates the record, anything else
        # updates the record identified by its name.
        self._status = new_status
        deploy_info = self.to_redis_envs_info()
        if new_status == "init":
            self.redis_envs.init(
                name=deploy_info["name"], type=deploy_info["type"]
            )
        else:
            self.redis_envs.update(deploy_info.pop("name"), **deploy_info)

    def to_redis_envs_info(self):
        """Return the dict published on status changes (must hold "name" and "type")."""
        return {
            "name": "",
            "type": self.type,
        }

    def step(self, title):
        """Wait until the case may proceed, then open an allure step.

        Polls ``redis_runner.sync_case_status`` every 20 s while the case
        is neither running nor cancelled.

        Args:
            title (str): step title, logged and passed to ``allure.step``.

        Returns:
            The object returned by ``allure.step(title)``.

        Raises:
            AssertionError: if the case has been cancelled.  Raised
                explicitly (rather than via ``assert False``) so that the
                cancellation still aborts the deployment under ``python -O``.
        """
        log.info(title)  # log the start of the step
        while True:
            # Fetch the current case status for this runner from Redis.
            sync_status = self.redis_runner.sync_case_status(uuid=self.uuid)
            if sync_status == redis_const.CASE_STATUS_CANCELLED:
                log.info("case cancel success")
                raise AssertionError("case cancelled")
            if sync_status == redis_const.CASE_STATUS_RUNNING:
                break
            # Any other status: announce the wait, then check again in 20 s.
            log.info(f"stop execute case, wait 20s, ctrl-status is {sync_status}")
            time.sleep(20)
        # Open the allure step and hand it back to the caller.
        return allure.step(title)
newline at end of file diff --git a/evaluation/eval-runner/eval-runner/eval-runner.py b/evaluation/eval-runner/eval-runner/eval-runner.py index 53e1f3d0b5c65d6493ef1d48f300fe7701fc010f..ebf5ddaac795f44d16ae1d05f88f0abb75180a63 100644 --- a/evaluation/eval-runner/eval-runner/eval-runner.py +++ b/evaluation/eval-runner/eval-runner/eval-runner.py @@ -2,17 +2,20 @@ import time import os import subprocess import sys -import shutil +import traceback from common import const -from common.config import conf +from deploy import const as deploy_const +from config import conf from eval_lib.common.logger import get_logger from eval_lib.common.logger import LoggerManager -from common.utils import redis_db +from common.utils import redis_runner from common.utils import zip_dir from eval_lib.databases.redis import const as redis_const from eval_lib.source.dictonary import Dictionary from common.client import ResultClient, LogClient +from deploy.deploy import DeployWorker +from common.module import DeployMeta log = get_logger() @@ -30,86 +33,67 @@ class Runner(): self.runner_report_path = f"{self.runner_data_path}/report" self.runner_log_path = f"{self.runner_data_path}/log" self.runner_allure_path = f"{self.runner_data_path}/allure-result" + self.log_path = f"{self.runner_log_path}/runner.log" + LoggerManager(log_file=self.log_path) + self.lc = self.start_forward_log(log_path=self.log_path) + self.deploy_worker = None - def get_case_path(self): - case_path = Dictionary().CASE_DICTIONARY.get( - self.case_params.case_name - ) - if not case_path: - log.error(f"case_name: {self.case_params.case_name} not support") - raise Exception( - f"case_name: {self.case_params.case_name} not support" - ) - return case_path[0] - def run(self): + def run(self): try: self.init_env() + self.deploy_deepflow_server() self.exec_pytest() self.wait() - # self.get_results() except Exception as e: log.error(f"Runner {self.uuid} run error: {e}") + log.error(traceback.format_exc()) finally: + if 
self.deploy_worker: + self.deploy_worker.release() + self.lc.stop() self.push_results() - redis_db.update_runner_info( - uuid=self.uuid, - info={"runner-status": redis_const.CASE_STATUS_COMPLETED} - ) + redis_runner.set_runner_complete(uuid=self.uuid) time.sleep(300) def init_env(self): """初始化环境目录 """ # 创建数据目录 - log.info(f"data_dir is : {self.runner_data_path}") folder_paths = [ - conf.runner_data_dir, - self.runner_data_path, self.runner_report_path, self.runner_log_path, self.runner_allure_path, ] for folder_path in folder_paths: - try: - os.makedirs(folder_path) - log.info(f"Runner {self.uuid} create folder: {folder_path}") - except FileExistsError: - pass + os.makedirs(folder_path, exist_ok=True) log.info(f"Runner {self.uuid} init env success.") + log.info(f"data_dir is : {self.runner_data_path}") def exec_pytest(self): # 执行测试用例 envs = os.environ.copy() envs["PYTHONPATH"] = f":{self.runner_dir}" - # TODO: leyi 修改log文件 - log_path = f"{self.runner_log_path}/pytest-{self.uuid}.log" - + relative_case_path = self.get_relative_case_path() try: - command = f"pytest -vs ./case/{self.get_case_path()} --alluredir {self.runner_allure_path} --workers {self.case_params.process_num} > {log_path}" + command = f"pytest -vs {relative_case_path} --alluredir {self.runner_allure_path} --workers {self.case_params.process_num}" # 执行 pytest 命令 log.info(f"exec pytest command: {command}") - self.pytest_process = subprocess.Popen( - command, - shell=True, - cwd=self.runner_dir, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=envs, - ) + with open(self.log_path, "a") as log_file: + self.pytest_process = subprocess.Popen( + command, + shell=True, + cwd=self.runner_dir, + stdout=log_file, + stderr=log_file, + env=envs, + ) except subprocess.CalledProcessError as e: log.error("exec pytest error:", e) - redis_db.update_runner_info( - uuid=self.uuid, info={ - "runner-status": redis_const.CASE_STATUS_RUNNING, - "case-status": redis_const.CASE_STATUS_RUNNING - } - ) + 
redis_runner.set_case_running(uuid=self.uuid) def wait(self): - log_path = f"{self.runner_log_path}/pytest-{self.uuid}.log" - lc = self.start_forward_log(log_path=log_path) while True: # 检查进程状态 time.sleep(5) @@ -122,30 +106,54 @@ class Runner(): log.error("pytest process occurred error") if pytest_stderr is not None: log.error(f"error_log: {pytest_stderr.decode()}") - lc.stop() - redis_db.update_runner_info( - uuid=self.uuid, - info={"case-status": redis_const.CASE_STATUS_COMPLETED} - ) + redis_runner.set_case_complete(uuid=self.uuid) break - runner_info_dict = redis_db.get_runner_info(uuid=self.uuid) + runner_info_dict = redis_runner.get_runner_info(uuid=self.uuid) if runner_info_dict["case-control-status" ] == redis_const.CASE_STATUS_FORCE_END: # 主动取消case执行 - redis_db.update_runner_info( - uuid=self.uuid, - info={"case-status": redis_const.CASE_STATUS_FORCE_END} - ) + redis_runner.set_case_end(uuid=self.uuid) log.error("case force end") self.interrupt() - lc.stop() break def interrupt(self): self.pytest_process.kill() log.error(f"Runner {self.uuid} interrupt.") + def deploy_deepflow_server(self): + agent_type = conf.case_params.agent_type + if agent_type != 'deepflowce': + return + platform_type = conf.platform_tools.get("type", "") + if platform_type != 'aliyun': + return + meta = DeployMeta() + meta.init( + instance_name=deploy_const.ENV_INSTANCE_NAME_DEFAULT, + type=deploy_const.ENV_TYPE_DEEPFLOW_CE, + uuid=self.case_params.uuid + ) + self.deploy_worker = DeployWorker(meta=meta) + if not self.deploy_worker.deploy(): + log.error("deploy deepflow server failed") + raise Exception("deploy deepflow server failed") + + def get_relative_case_path(self): + case_path = Dictionary().CASE_DICTIONARY.get( + self.case_params.case_name + ) + if not case_path: + log.error(f"case_name: {self.case_params.case_name} not support") + raise Exception( + f"case_name: {self.case_params.case_name} not support" + ) + relative_case_path = os.path.join("./case/", case_path[0]) + return 
relative_case_path + def start_forward_log(self, log_path): + if not os.path.exists(log_path): + os.mknod(log_path) log.info("start log forwarding") server_url = f"http://{const.CONTROLLER_HOST}:{conf.listen_port}{const.API_PREFIX_RESULT_LOG}" lc = LogClient( @@ -158,10 +166,6 @@ class Runner(): def push_results(self): log.info("start push result to controller") runner_data_zip = f"runner-{self.uuid}.zip" - shutil.move( - src=f"{conf.runner_data_dir}/runner.log", - dst=f"{self.runner_log_path}/runner.log" - ) zip_dir(folder_path=self.runner_data_path, output_path=runner_data_zip) server_url = f"http://{const.CONTROLLER_HOST}:{conf.listen_port}{const.API_PREFIX_RESULT_ZIP}" rc = ResultClient(server_url=server_url) @@ -201,6 +205,4 @@ if __name__ == '__main__': if not conf.is_valid(): print('Invalid conf value, error exit.') sys.exit(1) - # TODO: 初始化log文件 - LoggerManager(log_file=f"{conf.runner_data_dir}/runner.log") Runner().run() diff --git a/evaluation/eval-runner/eval-runner/platform_tools/aliyun/ali_const.py b/evaluation/eval-runner/eval-runner/platform_tools/aliyun/ali_const.py index cdfa77914fc4317df22f3c999cb411b64bc27131..0a8b61737c1d547c21bb758df8e12b89c60b7cf0 100644 --- a/evaluation/eval-runner/eval-runner/platform_tools/aliyun/ali_const.py +++ b/evaluation/eval-runner/eval-runner/platform_tools/aliyun/ali_const.py @@ -1,3 +1,4 @@ +ali_dns_nameserver = "10.1.0.1" # ------ Aliyun Public Cloud------------ # Common Variable Definition ali_resource_group_id_default = 'rg-aekzm564q2edrsi' @@ -9,6 +10,14 @@ ali_zone_id_beijing_k = 'cn-beijing-k' ali_key_pair_name_default = 'automation' ali_instance_type_c6_2x_large = 'ecs.c6.2xlarge' ali_instance_type_c6r_2x_large = 'ecs.c6r.2xlarge' -ali_image_id_x86_centos = 'm-2zec520yiix6ihla0r7b' -ali_image_id_arm = 'm-2ze8315uz1wvw8tshrv3' -ali_image_id_performance_analysis = "m-2ze04udh1zzjc6813fep" +ali_instance_type_t5_c1m2_2x_large = "ecs.t5-c1m2.2xlarge" + +# image id +ali_image_id_x86_centos_default = 
'm-2ze8ebfmny4t903xm3gm' +ali_image_id_arm_default = 'm-2ze8315uz1wvw8tshrv3' +ali_image_id_deepflow_default = 'm-2zeih42vvjylq8ega50w' +ali_image_id_performance_analysis_default = "m-2zeid7425f50jbv4nnsb" #telegraf + influxdb + sealos +ali_image_id_performance_analysis_docker = 'm-2ze8y74nrxw27l63e0ut' # 已安装docker 不可部署k8s +ali_image_id_performance_analysis_goserver = 'm-2ze1fpbidqrtxtgbtoij' +ali_image_id_performance_default = 'm-2ze8ebfmny4t903xm3gm' +ali_image_id_performance_k8s = 'm-2ze9zy6izp9sgb44k31u' # 已安装sealos, helm 可部署k8s \ No newline at end of file diff --git a/evaluation/eval-runner/eval-runner/platform_tools/aliyun/aliyun_sdk.py b/evaluation/eval-runner/eval-runner/platform_tools/aliyun/aliyun_sdk.py index dd89b38269c04711fb85bb6536f6a4641750df4e..9e2a125797928e9a85c8d116d3f17364bb35a7be 100644 --- a/evaluation/eval-runner/eval-runner/platform_tools/aliyun/aliyun_sdk.py +++ b/evaluation/eval-runner/eval-runner/platform_tools/aliyun/aliyun_sdk.py @@ -1,8 +1,8 @@ import os -from platform_tools.aliyun import ali_const -from common.config import conf +import datetime +import pytz + from typing import List -from platform_tools.base import PlatformBase from Tea.core import TeaCore from alibabacloud_tea_util import models as util_models from alibabacloud_tea_openapi import models as open_api_models @@ -12,6 +12,9 @@ from alibabacloud_ecs20140526.client import Client as EcsClient from alibabacloud_darabonba_number.client import Client as NumberClient from eval_lib.common.logger import get_logger +from platform_tools.aliyun import ali_const +from platform_tools.base import PlatformBase +from config import conf log = get_logger() @@ -62,6 +65,7 @@ class Aliyun(PlatformBase): client: EcsClient, region_id: str, instance_ids: List[str], + force_stop: bool = False, stopped_mode: str = 'KeepCharging', dry_run: bool = False, ) -> None: @@ -73,6 +77,7 @@ class Aliyun(PlatformBase): instance_id=instance_ids, stopped_mode=stopped_mode, dry_run=dry_run, + 
force_stop=force_stop, ) runtime = util_models.RuntimeOptions() responce = client.stop_instances_with_options(request, runtime) @@ -94,14 +99,18 @@ class Aliyun(PlatformBase): zone_id: str, key_pair_name: str, amount: int, - )-> List[str]: + auto_release_time: int, + ) -> List[str]: """ [批量] 实例创建-> str: 实例id + auto_release_time: 自动释放实例时间,单位h """ - tag_0 = ecs_models.RunInstancesRequestTag( - key='财务单元', - value='自动化测试' - ) + utc_timezone = pytz.utc + current_time_utc = datetime.datetime.now(tz=utc_timezone) + # Add hours to the current time + updated_time_utc = current_time_utc + datetime.timedelta(hours=int(auto_release_time)) + iso_format_time_utc = updated_time_utc.strftime('%Y-%m-%dT%H:%M:%SZ') + tag_0 = ecs_models.RunInstancesRequestTag(key='财务单元', value='自动化测试') request = ecs_models.RunInstancesRequest( region_id=region_id, instance_name=instance_name, @@ -115,6 +124,7 @@ class Aliyun(PlatformBase): key_pair_name=key_pair_name, amount=amount, tag=[tag_0], + auto_release_time=str(iso_format_time_utc), ) runtime = util_models.RuntimeOptions() response = client.run_instances_with_options(request, runtime) @@ -126,7 +136,7 @@ class Aliyun(PlatformBase): f'-----------create instance successful, instance ID:{instance_ids}--------------' ) return instance_ids - + @staticmethod def _delete_instances( client: EcsClient, @@ -137,10 +147,15 @@ class Aliyun(PlatformBase): """ [批量] 实例删除-> None """ - Aliyun._stop_instances(client, region_id, instance_ids) - Aliyun._await_instances_status( + Aliyun._stop_instances(client, region_id, instance_ids, True) + status = Aliyun._await_instances_status( client, region_id, instance_ids, "Stopped" ) + if not status: + log.error( + 'instance stop failed, force delete' + ) + force = True request = ecs_models.DeleteInstancesRequest( region_id=region_id, instance_id=instance_ids, @@ -167,7 +182,7 @@ class Aliyun(PlatformBase): flag = True while flag and NumberClient.lt(time, 10): flag = False - instances_info= Aliyun._get_instances_info( 
+ instances_info = Aliyun._get_instances_info( client, region_id, instance_ids ) for instance in instances_info: @@ -184,7 +199,7 @@ class Aliyun(PlatformBase): return NumberClient.lt(time, 10) @staticmethod - def _get_instances_info( + def _get_instances_info( client: EcsClient, region_id: str, instance_ids: List[str] ) -> List[dict]: """ @@ -199,27 +214,24 @@ class Aliyun(PlatformBase): response = client.describe_instances_with_options(request, runtime) instance_data = response.body.instances.instance for instance in instance_data: - instance_info.append( - { - "instanceid": instance.instance_id, - "ip": instance.vpc_attributes.private_ip_address.ip_address[0], - "status": instance.status, - } - ) + instance_info.append({ + "instanceid": instance.instance_id, + "ip": instance.vpc_attributes.private_ip_address.ip_address[0], + "status": instance.status, + }) return instance_info - + @staticmethod def _get_instance_id_by_name( - client: EcsClient, - region_id: str, - instance_name: str + client: EcsClient, region_id: str, instance_name: str ) -> str: describe_instances_request = ecs_models.DescribeInstancesRequest( - region_id=region_id, - instance_name=instance_name + region_id=region_id, instance_name=instance_name ) runtime = util_models.RuntimeOptions() - response = client.describe_instances_with_options(describe_instances_request, runtime) + response = client.describe_instances_with_options( + describe_instances_request, runtime + ) if response.body.instances.instance: return response.body.instances.instance[0].instance_id else: @@ -228,14 +240,14 @@ class Aliyun(PlatformBase): @staticmethod def create_instances( instance_names: list, - image_id=ali_const.ali_image_id_x86_centos, + image_id=ali_const.ali_image_id_x86_centos_default, instance_type=ali_const.ali_instance_type_c6_2x_large, ) -> dict: '''创建通用镜像的实例 - 密码固定为CASE_SSH_PASSWORD_DEFAULT + 密码固定为runner_ssh_password ''' client = Aliyun.create_client() - region_id=os.environ['ALICLOUD_REGION'] + region_id = 
os.environ['ALICLOUD_REGION'] instances_ip = {} for instance_name in instance_names: instance_ids = Aliyun._create_instances( @@ -247,10 +259,11 @@ class Aliyun(PlatformBase): security_group_id=ali_const.ali_security_group_id_default, v_switch_id=ali_const.ali_v_switch_id_beijing_a, resource_group_id=ali_const.ali_resource_group_id_default, - password=conf.global_ssh_password, + password=conf.runner_ssh_password, zone_id=ali_const.ali_zone_id_beijing_a, key_pair_name=ali_const.ali_key_pair_name_default, amount=1, + auto_release_time=2, ) Aliyun._await_instances_status( client, region_id, instance_ids, "Running" @@ -268,17 +281,16 @@ class Aliyun(PlatformBase): def delete_instances(instance_names: list): instance_ids = [] client = Aliyun.create_client() - region_id=os.environ['ALICLOUD_REGION'] + region_id = os.environ['ALICLOUD_REGION'] for instance_name in instance_names: instance_id = Aliyun._get_instance_id_by_name( - client=client, - region_id=region_id, - instance_name=instance_name + client=client, region_id=region_id, instance_name=instance_name ) if instance_id: instance_ids.append(instance_id) - Aliyun._delete_instances( - client=client, - region_id=region_id, - instance_ids=instance_ids, - ) \ No newline at end of file + if instance_ids: + Aliyun._delete_instances( + client=client, + region_id=region_id, + instance_ids=instance_ids, + ) diff --git a/evaluation/eval-runner/eval-runner/pytest.ini b/evaluation/eval-runner/eval-runner/pytest.ini index d3fe78aae407ccaeedef09b8d1b14d5e7995ca70..dc3c35f1c6faa2ac9742d0d88a9fadf5f7a9eb76 100644 --- a/evaluation/eval-runner/eval-runner/pytest.ini +++ b/evaluation/eval-runner/eval-runner/pytest.ini @@ -1,7 +1,7 @@ [pytest] addopts = -p no:warnings log_cli=true -log_cli_level=DEBUG +log_cli_level=INFO log_cli_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s) log_cli_date_format=%Y-%m-%d %H:%M:%S markers = diff --git a/evaluation/evaluation.yaml b/evaluation/evaluation.yaml new file mode 
120000 index 0000000000000000000000000000000000000000..df00a0b9fe8f75c1804187c29e11c9a57b56b980 --- /dev/null +++ b/evaluation/evaluation.yaml @@ -0,0 +1 @@ +eval-charts/evaluation-controller/eval-controller.yaml \ No newline at end of file diff --git a/evaluation/requirements.txt b/evaluation/requirements.txt index 82f94b9858aecb16f2300a73b72ff452465e4662..98cf39d52f9eff690b6007091db3bdde010aa9d3 100644 --- a/evaluation/requirements.txt +++ b/evaluation/requirements.txt @@ -14,4 +14,8 @@ influxdb==5.3.2 pymysql==1.0.2 peewee==3.17.3 redis==4.3.5 - +kubernetes==25.3.0 +allure-pytest==2.13.3 +pytest-multithreading-allure==1.0.8 +pytest-parallel==0.1.1 +APScheduler==3.10.4