From 51a8d8c04722d462db3717cafe0c706c8a0d8caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Wed, 15 Mar 2023 03:00:10 +0000 Subject: [PATCH 1/8] =?UTF-8?q?update=20tools/accesscontrol-oepkgs-managem?= =?UTF-8?q?ent/doc/=E8=AE=BE=E8=AE=A1=E9=80=BB=E8=BE=91.md.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> update tools/accesscontrol-oepkgs-management/src/git.py. Signed-off-by: 汪重阳 <15757101689@163.com> 新建 handle_xml add tools/handle_xml/handle_xml.py. Signed-off-by: 汪重阳 <15757101689@163.com> 重命名 tools/handle_xml 为 tools/analyze_xml 删除文件 tools/analyze_xml/.keep add tools/analyze_xml. Signed-off-by: 汪重阳 <15757101689@163.com> update tools/analyze_xml/analyze_xml.py. Signed-off-by: 汪重阳 <15757101689@163.com> add tools/analyze_xml/READE.md. Signed-off-by: 汪重阳 <15757101689@163.com> 重命名 tools/analyze_xml/READE.md 为 tools/analyze_xml/README.md update tools/analyze_xml/README.md. Signed-off-by: 汪重阳 <15757101689@163.com> update tools/analyze_xml/analyze_xml.py. Signed-off-by: 汪重阳 <15757101689@163.com> update tools/analyze_xml/handle_xml.py. Signed-off-by: 汪重阳 <15757101689@163.com> --- ...76\350\256\241\351\200\273\350\276\221.md" | 14 +- .../src/git.py | 171 ++++++++++++------ tools/analyze_xml/README.md | 7 + tools/analyze_xml/analyze_xml.py | 43 +++++ tools/analyze_xml/handle_xml.py | 45 +++++ 5 files changed, 225 insertions(+), 55 deletions(-) create mode 100644 tools/analyze_xml/README.md create mode 100644 tools/analyze_xml/analyze_xml.py create mode 100644 tools/analyze_xml/handle_xml.py diff --git "a/tools/accesscontrol-oepkgs-management/doc/\350\256\276\350\256\241\351\200\273\350\276\221.md" "b/tools/accesscontrol-oepkgs-management/doc/\350\256\276\350\256\241\351\200\273\350\276\221.md" index 019ba1d..0b9ecd4 100644 --- "a/tools/accesscontrol-oepkgs-management/doc/\350\256\276\350\256\241\351\200\273\350\276\221.md" +++ "b/tools/accesscontrol-oepkgs-management/doc/\350\256\276\350\256\241\351\200\273\350\276\221.md" @@ -1,3 +1,15 @@ + +# 总流程 +1.本地编译测试,安装测试 + +2.上传到gitee仓库中,webhook触发 + +3.进行obs编译构建、安装测试 + +4.测试通过则评论pr + + + ### 设计逻辑 - 部署x86-64和aarch64架构下的k8s集群 @@ -44,4 +56,4 @@ 1.在所属的大工程下,新建一个package,该名字要与gitee上创建的软件包名字一致。 2.创建好之后,点击add file,上传_service文件,上传之前,修改该文件中的url地址为gitee软件包地址。 -- 上传到Gitee组织仓oepkgs \ No newline at end of file +- 上传到Gitee组织仓oepkgs diff --git a/tools/accesscontrol-oepkgs-management/src/git.py b/tools/accesscontrol-oepkgs-management/src/git.py index b0ab235..c23cf4c 100644 --- a/tools/accesscontrol-oepkgs-management/src/git.py +++ b/tools/accesscontrol-oepkgs-management/src/git.py @@ -1,14 +1,13 @@ # -*- encoding=utf-8 -*- import os -import logging import re -from cmd import shell_cmd import retrying +from shell_cmd import shell_cmd_live +from shell_cmd import shell_cmd +from build_log import Log - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger("common") +logger = Log() class GitProxy(object): @@ -34,26 +33,26 @@ class GitProxy(object): repo_dir = os.path.join(work_dir, sub_dir) if work_dir else sub_dir clone_cmd = "cd /var/jenkins_home/; git clone {}".format(repo_url) - ret, _, _ = shell_cmd(clone_cmd) + ret, _, _ = shell_cmd_live(clone_cmd) if not ret: - logger.warning("clone repository failed, %s", ret) + logger.warning("clone repository failed, {}".format(ret)) return 2 return cls(repo_dir) @retrying.retry(retry_on_result=lambda result: result is False, - stop_max_attempt_number=3, wait_fixed=2) + 
stop_max_attempt_number=3, wait_fixed=2000) def update_repo(self): """ 更新本地仓库 @return: """ update_cmd = "cd /var/jenkins_home/{}; git pull".format(self._repo_dir) - ret, _, _ = shell_cmd(update_cmd) + ret, _, _ = shell_cmd_live(update_cmd) if ret: - logger.warning("update repository failed, %s", ret) - return 2 + logger.warning("update repository failed, {}".format(ret)) + return False return True def fetch_pull_request(self, url, pr_num): @@ -63,11 +62,11 @@ class GitProxy(object): :param pr_num: pr编号 :return: """ - fetch_cmd = "cd /var/jenkins_home/{}; git fetch --depth {} {} +refs/pull/{}/MERGE:refs/pull/{}/MERGE".format( - self._repo_dir, 4, url, pr_num, pr_num) - ret, out, _ = shell_cmd(fetch_cmd, cap_out=True, cmd_verbose=False) + fetch_cmd = "cd /var/jenkins_home/{}; git fetch {} pull/{}/head:pr_{}".format( + self._repo_dir, url, pr_num, pr_num) + ret, out, _ = shell_cmd_live(fetch_cmd, cap_out=True, cmd_verbose=False) if ret: - logger.error("git fetch failed,%s ,%s", ret, out) + logger.error("Git fetch failed,{} {}".format(ret, out)) return False return True @@ -78,11 +77,11 @@ class GitProxy(object): :param file_path: 文件完整路径 sig/hpc/src-oepkgs/m/mopac.yaml :return: StringIO """ - get_content_cmd = "cd /var/jenkins_home/{}; git show refs/pull/{}/MERGE:{}".format( - self._repo_dir, pr_num, file_path) - ret, out, _ = shell_cmd(get_content_cmd, cap_out=True) + get_content_cmd = "cd /var/jenkins_home/{}; git show pr_{}:{}".format( + self._repo_dir, pr_num, file_path) + ret, out, _ = shell_cmd_live(get_content_cmd, cap_out=True) if ret: - logger.warning("get file content of commit failed, %s", ret) + logger.error("Get file content of commit failed, {}".format(ret)) return 2 return out @@ -93,139 +92,203 @@ class GitProxy(object): :param pr_num: refs/pull/{pr_num}/MERGE :return: list<string> """ - diff_files_cmd = "cd /var/jenkins_home/{}; git diff --name-only refs/pull/{}/MERGE".format( + diff_files_cmd = "cd /var/jenkins_home/{}; git diff master --name-only --diff-filter=ACM pr_{}".format( self._repo_dir, pr_num) - ret, out, _ = shell_cmd(diff_files_cmd, cap_out=True) + ret, out, _ = shell_cmd_live(diff_files_cmd, cap_out=True) if ret: - logger.error("get diff files of commits failed, %s", ret) + logger.error("Get diff files of commits failed, {}!".format(ret)) return [] return out + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, IndexError), + stop_max_attempt_number=3, wait_fixed=2000) def check_file_path(self, pr_num): """ 验证路径 :param pr_num: """ - logger.info("Start check repo path is true ?") + logger.info("Check the repo file name and path are correct ?") files_path = self.diff_files_between_commits(pr_num) for file_path in files_path: + if "sig-info.yaml" in file_path: + continue file_list = file_path.split('/') - if file_list[0].strip() != "sig".strip() or file_list[2].strip() != "src-oepkgs".strip(): - logger.error("file_list path error") + logger.error("Path error, first dir not sig or third dir not src-oepkgs !") return 2 elif file_list[3].strip().lower() != file_list[4].strip()[0].lower(): - logger.error("{} last path error".format(file_list[-1])) + logger.error( + "{} repo name and dir name not same, please change it !".format(file_list[-1])) return 2 elif file_list[4].split(".")[-1] != "yaml".strip(): - logger.error("yaml name error") + logger.error("{} Yaml name error, without .yaml please add !".format(file_list)) return 2 logger.info("----END CHECK----") return 0 + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, 
IndexError), + stop_max_attempt_number=3, wait_fixed=2000) def check_repo_num(self, pr_num): """ 验证此次建仓库数量是否小于100 @param pr_num: pr序数 @return: """ - logger.info("Start check files number is more than 100 ?") + logger.info("Check whether the number of files is greater than 100 ?") files_path = self.diff_files_between_commits(pr_num) if len(files_path) > 101: - logger.error("create repo more 100, please delete some") + logger.error("Create repo more 100, please delete some !") return 2 logger.info("----END CHECK----") return 0 + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, IndexError), + stop_max_attempt_number=3, wait_fixed=2000) def check_invalid_file(self, pr_num, invalid_file=".keep"): """ check invalid file """ - logger.info("Start check files have invalid file ?") + logger.info("Check for invalid files ?") filenames = self.diff_files_between_commits(pr_num) for file_name in filenames: if invalid_file in file_name: - logger.error("'.keep' is invalid file, please delete") + logger.error("'.keep' is invalid file, please delete !") return 2 logger.info("----END CHECK----") return 0 + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, IndexError), + stop_max_attempt_number=3, wait_fixed=2000) def check_name_isdigit(self, pr_num): """ check name not start with a number @param pr_num: @return: """ - logger.info("Start check is the repo name complies with specifications ?") + logger.info("Check whether the repo name conforms to the specification ?") files_path = self.diff_files_between_commits(pr_num) for file_path in files_path: yaml_name = file_path.split('/')[-1] if yaml_name == "sig-info.yaml" or "yaml" not in yaml_name: continue + change_type = self.check_a_m_d(yaml_name, pr_num) + if change_type == "M": + continue + elif change_type == "D": + logger.error("Can not delete yaml files") + return 2 f = self.get_content_of_file_with_commit(file_path, pr_num) f = f[0] repo_name = f.split(":")[-1].strip() result = re.search("\A(?!_)(?!.*?_$)(?!-)(?!.*-$)(?!\+)([^.])[-a-zA-Z0-9_.\u4e00-\u9fa5\uff00-\uffff+ ]+\Z", repo_name) if not result: - logger.error("repo_name: {} has an error".format(repo_name)) + logger.error( + "{} is not complies with specifications, please change it !".format(repo_name)) return 2 logger.info("----END CHECK----") return 0 + def check_a_m_d(self, yaml_name, pr_num): + analyzed_cmd = "cd /var/jenkins_home/{}; git log pr_{} -1 --pretty=format: --name-status --no-merges" \ + "| grep {} | awk -F' ' '{{print $1}}' | cat".format(self._repo_dir, pr_num, + yaml_name) + out = shell_cmd(analyzed_cmd) + return out + + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, IndexError), + stop_max_attempt_number=3, wait_fixed=2000) def check_name_is_exist(self, pr_num): """ check name is exist? 
@param pr_num: @return: """ - logger.info("start check name is exist in repo ?") + logger.info("Check whether the repo already exists ?") files_path = self.diff_files_between_commits(pr_num) for file_path in files_path: yaml_name = file_path.split('/')[-1] if yaml_name == "sig-info.yaml" or "yaml" not in yaml_name: continue - get_files_num_cmd = "cd /var/jenkins_home/{}; git ls-files | grep {}".format(self._repo_dir, yaml_name) +\ - "| awk -F '/' '{print $NF}'" + "| grep ^{}$".format(yaml_name) - ret, out, _ = shell_cmd(get_files_num_cmd, cap_out=True) + change_type = self.check_a_m_d(yaml_name, pr_num) + if change_type == "M": + continue + elif change_type == "D": + logger.error("Can not delete yaml files") + return 2 + get_files_num_cmd = "cd /var/jenkins_home/{}; git ls-files | grep -i {}".format(self._repo_dir, yaml_name) \ + + "| awk -F '/' '{print $NF}'" + "| grep -i ^{}$".format(yaml_name) + ret, out, _ = shell_cmd_live(get_files_num_cmd, cap_out=True) if not ret: - logger.error("{} is exist".format(yaml_name)) + logger.error("{} is exist, please find in repo !".format(yaml_name)) return 2 logger.info("----END CHECK----") return 0 - @retrying.retry(retry_on_result=lambda result: result is IndexError, - stop_max_attempt_number=3, wait_fixed=2) - def compare_yaml_repo_name(self, url, pr_num, depth=4, progress=False): + @retrying.retry(retry_on_exception=lambda exception: isinstance(exception, IndexError), + stop_max_attempt_number=3, wait_fixed=2000) + def compare_yaml_repo_name(self, pr_num): """ - check name is exist? + check name @param pr_num: - @param url: - @param depth - @param progress @return: """ - logger.info("Start compare yaml name and repo name is same ?") + logger.info("Check whether yaml name and repo name are the same ?") files_path = self.diff_files_between_commits(pr_num) for file_path in files_path: yaml_name = file_path.split('/')[-1] - if yaml_name == "sig-info.yaml" or "yaml" not in yaml_name: + if "yaml" not in yaml_name: + continue + if yaml_name == "sig-info.yaml": + dir_name = file_path.split('/')[-2] + f = self.get_content_of_file_with_commit(file_path, pr_num) + f = f[0] + if f == 2: + return 2 + repo_name = f.split(":")[-1] + logger.info(yaml_name) + if repo_name.strip() != dir_name: + logger.error("sig-info.yaml [name] {} and dir [name] {} not same".format(repo_name, dir_name)) + return 2 continue - fetch_cmd = "cd /var/jenkins_home/{}; git fetch {} --depth {} {} +refs/pull/{}/MERGE:refs/pull/{}/MERGE".\ - format(self._repo_dir, "--progress" if progress else "", depth, url, pr_num, pr_num) - ret, out, _ = shell_cmd(fetch_cmd, cap_out=True, cmd_verbose=False) - if ret: - logger.error("git fetch failed,%s ,%s", ret, out) + change_type = self.check_a_m_d(yaml_name, pr_num) + if change_type == "D": + logger.error("Can not delete yaml files") return 2 f = self.get_content_of_file_with_commit(file_path, pr_num) f = f[0] + if f == 2: + return 2 repo_name = f.split(":")[-1] yaml_name = yaml_name.split('.yaml')[0] logger.info(yaml_name) if repo_name.strip() != yaml_name.strip(): - logger.error("repo_name: {} and yaml_name: {} name not same, please change".format(repo_name, - yaml_name)) + logger.error( + "repo_name: {} and yaml_name: {} name not same, please change !".format(repo_name, + yaml_name)) return 2 + sig_info_yaml_check_res = self.check_siginfo(pr_num) + if sig_info_yaml_check_res == 2: + return 2 logger.info("----END CHECK----") return 0 + + def check_siginfo(self, pr_num): + """ + check sig-info.yaml change + """ + logger.info("Check sig-info.yaml ?") 
+ files_path = self.diff_files_between_commits(pr_num) + for file_path in files_path: + yaml_name = file_path.split('/')[-1] + change_type = self.check_a_m_d(yaml_name, pr_num) + if yaml_name == "sig-info.yaml" and change_type == "M": + dir_name = file_path.split('/')[-2] + f = self.get_content_of_file_with_commit(file_path, pr_num) + f = f[0] + repo_name = f.split(":")[-1] + if repo_name.strip() != dir_name.strip(): + logger.error("You can not change sig-info.yaml [name] {} or [description]".format(dir_name)) + return 2 + return 0 diff --git a/tools/analyze_xml/README.md b/tools/analyze_xml/README.md new file mode 100644 index 0000000..0b2a31c --- /dev/null +++ b/tools/analyze_xml/README.md @@ -0,0 +1,7 @@ +### 工具作用 +用来解析rpm源码包repodata下的primary.xml文件,找出其中group节点未被分类的软件包,并且预测未分类的软件包的类别。 +### 使用环境 +python3.10 +### 使用方法 +需要准备xml文件,并且将生成的xlsx文件另存为csv文件,将字符设置为utf-8 + diff --git a/tools/analyze_xml/analyze_xml.py b/tools/analyze_xml/analyze_xml.py new file mode 100644 index 0000000..5623ba8 --- /dev/null +++ b/tools/analyze_xml/analyze_xml.py @@ -0,0 +1,43 @@ +import pandas as pd +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import make_pipeline +from build_log import Log + +logger = Log() + +# 加载数据 +FILE_PATH = 'all.csv' # 将此路径替换为你的csv文件的路径 +data = pd.read_csv(FILE_PATH) + +# 检查数据是否包含NaN值 +missing_values = data.isnull().sum() +unspecified_counts = data.apply(lambda x: x.str.count("Unspecified")).sum() + +# 如果有未分类的软件包 (NaN在'b'列,一下abc列皆为表头) +if missing_values['group'] > 0: + # 分割数据为训练集和未分类集 + train_data = data[data['group'].notna()] + unclassified_data = data[data['group'].isna()] + + # 创建一个模型来预测类别 + model = make_pipeline(CountVectorizer(), MultinomialNB()) + + # 使用列a和c作为特征,列b作为目标变量来训练模型 + X_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] + y_train = train_data['group'] + model.fit(X_train, y_train) + + # 预测未分类的软件包的类别 + X_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data['description'] + predicted_categories = model.predict(X_unclassified) + # print(X_unclassified) +# +# # 将预测的类别添加到表格 + data.loc[data['group'].isna(), 'group'] = predicted_categories + + # 保存更新后的表格到新的csv文件 + data.to_csv('updated_file1.csv', index=False) + logger.info("Categories have been predicted and the updated file has been saved as 'updated_file.csv'") +else: + logger.info("All packages are already categorized.") \ No newline at end of file diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py new file mode 100644 index 0000000..959e360 --- /dev/null +++ b/tools/analyze_xml/handle_xml.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 + +from xml.dom.minidom import parse +import xml.dom.minidom +from openpyxl import Workbook +from build_log import Log + +logger = Log() + +# 使用minidom解析器打开 XML 文档 +DOMTree = xml.dom.minidom.parse("suse-primary.xml") +collection = DOMTree.documentElement +if collection.hasAttribute("shelf"): + logger.info("Root element : %s" % collection.getAttribute("shelf")) + +# 创建工作簿 +wb = Workbook() +ws = wb.active + +# 写入表头 +ws.append(['name', 'group', 'summary', 'description']) + +# 在集合中获取所有rpm包 +packages = collection.getElementsByTagName("package") + +# 打印每个rpm包所需内容的详细信息 +for package in packages: + if package.hasAttribute("package"): + logger.info("Title: %s" % package.getAttribute("package")) + + name = package.getElementsByTagName('name')[0] + name1 = name.childNodes[0].data + # 所需要的包名文件 + with open("all", 'r', 
encoding="utf-8") as f: + for line in f: + if line.strip() == name1: + logger.info(name1) + description = package.getElementsByTagName('description')[0] + group = package.getElementsByTagName('rpm:group')[0] + summary = package.getElementsByTagName('summary')[0] + ws.append([name1, group.childNodes[0].data, summary.childNodes[0].data, + description.childNodes[0].data.replace('\n', '')]) + +# 所需要保存的表的路径 +wb.save('test-11.xlsx') \ No newline at end of file -- Gitee From 65b0880025aa6b6d9a5e06393b414ec61dd98dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:26:09 +0000 Subject: [PATCH 2/8] update tools/analyze_xml/README.md. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/analyze_xml/README.md b/tools/analyze_xml/README.md index 0b2a31c..93d7aed 100644 --- a/tools/analyze_xml/README.md +++ b/tools/analyze_xml/README.md @@ -3,5 +3,14 @@ ### 使用环境 python3.10 ### 使用方法 -需要准备xml文件,并且将生成的xlsx文件另存为csv文件,将字符设置为utf-8 +python handle_xml.py -x xml_file_name -r rpm_file_name +xml_file_name 为repodata下xml文件 +rpm_file_name 为所需要的rpm包名 +例如: +0ad +0ad-data +2048-cli +2ping +389-ds +3omns \ No newline at end of file -- Gitee From e1603c344f49fb0fc25b439c31e3887f00a5a4c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:26:17 +0000 Subject: [PATCH 3/8] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20tool?= =?UTF-8?q?s/analyze=5Fxml/analyze=5Fxml.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/analyze_xml/analyze_xml.py | 43 -------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 tools/analyze_xml/analyze_xml.py diff --git a/tools/analyze_xml/analyze_xml.py b/tools/analyze_xml/analyze_xml.py deleted file mode 100644 index 5623ba8..0000000 --- a/tools/analyze_xml/analyze_xml.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.naive_bayes import MultinomialNB -from sklearn.pipeline import make_pipeline -from build_log import Log - -logger = Log() - -# 加载数据 -FILE_PATH = 'all.csv' # 将此路径替换为你的csv文件的路径 -data = pd.read_csv(FILE_PATH) - -# 检查数据是否包含NaN值 -missing_values = data.isnull().sum() -unspecified_counts = data.apply(lambda x: x.str.count("Unspecified")).sum() - -# 如果有未分类的软件包 (NaN在'b'列,一下abc列皆为表头) -if missing_values['group'] > 0: - # 分割数据为训练集和未分类集 - train_data = data[data['group'].notna()] - unclassified_data = data[data['group'].isna()] - - # 创建一个模型来预测类别 - model = make_pipeline(CountVectorizer(), MultinomialNB()) - - # 使用列a和c作为特征,列b作为目标变量来训练模型 - X_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] - y_train = train_data['group'] - model.fit(X_train, y_train) - - # 预测未分类的软件包的类别 - X_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data['description'] - predicted_categories = model.predict(X_unclassified) - # print(X_unclassified) -# -# # 将预测的类别添加到表格 - data.loc[data['group'].isna(), 'group'] = predicted_categories - - # 保存更新后的表格到新的csv文件 - data.to_csv('updated_file1.csv', index=False) - logger.info("Categories have been predicted and the updated file has been saved as 'updated_file.csv'") -else: - logger.info("All packages are already categorized.") \ 
No newline at end of file -- Gitee From a7cded58bb9e4aae4a3a2e49f9ddba12f68ab053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:26:38 +0000 Subject: [PATCH 4/8] update tools/analyze_xml/handle_xml.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/handle_xml.py | 146 ++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py index 959e360..12946c2 100644 --- a/tools/analyze_xml/handle_xml.py +++ b/tools/analyze_xml/handle_xml.py @@ -1,45 +1,117 @@ #!/usr/bin/python3 - +import argparse from xml.dom.minidom import parse import xml.dom.minidom from openpyxl import Workbook +import pandas as pd +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import make_pipeline from build_log import Log logger = Log() -# 使用minidom解析器打开 XML 文档 -DOMTree = xml.dom.minidom.parse("suse-primary.xml") -collection = DOMTree.documentElement -if collection.hasAttribute("shelf"): - logger.info("Root element : %s" % collection.getAttribute("shelf")) - -# 创建工作簿 -wb = Workbook() -ws = wb.active - -# 写入表头 -ws.append(['name', 'group', 'summary', 'description']) - -# 在集合中获取所有rpm包 -packages = collection.getElementsByTagName("package") - -# 打印每个rpm包所需内容的详细信息 -for package in packages: - if package.hasAttribute("package"): - logger.info("Title: %s" % package.getAttribute("package")) - - name = package.getElementsByTagName('name')[0] - name1 = name.childNodes[0].data - # 所需要的包名文件 - with open("all", 'r', encoding="utf-8") as f: - for line in f: - if line.strip() == name1: - logger.info(name1) - description = package.getElementsByTagName('description')[0] - group = package.getElementsByTagName('rpm:group')[0] - summary = package.getElementsByTagName('summary')[0] - ws.append([name1, group.childNodes[0].data, summary.childNodes[0].data, - description.childNodes[0].data.replace('\n', '')]) - -# 所需要保存的表的路径 -wb.save('test-11.xlsx') \ No newline at end of file + +def create_csv(xml_file, all_rpm_name_file): + # 使用minidom解析器打开 XML 文档 + DOMTree = xml.dom.minidom.parse(xml_file) + collection = DOMTree.documentElement + if collection.hasAttribute("shelf"): + logger.info("Root element : %s" % collection.getAttribute("shelf")) + + # 创建工作簿 + wb = Workbook() + ws = wb.active + + # 写入表头 + ws.append(['name', 'group', 'summary', 'description']) + + # 在集合中获取所有rpm包 + packages = collection.getElementsByTagName("package") + + # 打印每个rpm包所需内容的详细信息 + for package in packages: + if package.hasAttribute("package"): + logger.info("Title: %s" % package.getAttribute("package")) + + name = package.getElementsByTagName('name')[0] + rpm_name = name.childNodes[0].data + # 所需要的包名文件 + with open(all_rpm_name_file, 'r', encoding="utf-8") as f: + for line in f: + if line.strip() == rpm_name: + logger.info(rpm_name) + description = package.getElementsByTagName('description')[0] + group = package.getElementsByTagName('rpm:group')[0] + summary = package.getElementsByTagName('summary')[0] + ws.append([rpm_name, group.childNodes[0].data, summary.childNodes[0].data, + description.childNodes[0].data.replace('\n', '')]) + + for cell in ws['B']: + if cell.value is not None and isinstance(cell.value, str): + cell.value = cell.value.replace('Unspecified', '') + + # 所需要保存的表的路径 + wb.save('file_num.xlsx') + # 将xlsx转换成csv + 
data_xls = pd.read_excel('file_num.xlsx', engine='openpyxl') + data_xls.to_csv('file_num.csv', encoding='utf-8') + + return "file_num.csv" + + +def analyza_csv(xml_file, all_rpm_name_file): + csv_file = create_csv(xml_file, all_rpm_name_file) + # 加载数据 + FILE_PATH = csv_file # 将此路径替换为你的csv文件的路径 + data = pd.read_csv(FILE_PATH) + + # 检查数据是否包含NaN值 + missing_values = data.isnull().sum() + # unspecified_counts = data.apply(lambda x: x.str.count("Unspecified")).sum() + + # 如果有未分类的软件包 (NaN在'b'列,一下abc列皆为表头) + if missing_values['group'] > 0: + # 分割数据为训练集和未分类集 + train_data = data[data['group'].notna()] + unclassified_data = data[data['group'].isna()] + + # 创建一个模型来预测类别 + model = make_pipeline(CountVectorizer(), MultinomialNB()) + + # 使用列a和c作为特征,列b作为目标变量来训练模型 + X_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] + y_train = train_data['group'] + model.fit(X_train, y_train) + + # 预测未分类的软件包的类别 + X_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ + 'description'] + predicted_categories = model.predict(X_unclassified) + # print(X_unclassified) + # + # # 将预测的类别添加到表格 + data.loc[data['group'].isna(), 'group'] = predicted_categories + + # 保存更新后的表格到新的csv文件 + data.to_csv('updated_file1.csv', index=False) + logger.info("Categories have been predicted and the updated file has been saved as 'updated_file.csv'") + else: + logger.info("All packages are already categorized.") + + +def init_args(): + """ + init args + :return: + """ + parser = argparse.ArgumentParser() + parser.add_argument("-x", type=str, dest="xml_file_name", help="xml file name") + parser.add_argument("-r", type=str, dest="rpm_file_name", help="所需要解析的包名") + + return parser.parse_args() + + +if "__main__" == __name__: + args = init_args() + analyza_csv(args.xml_file_name, args.rpm_file_name) -- Gitee From 0611fc713a93e34bf5ef0a57e88c86327262b590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:27:30 +0000 Subject: [PATCH 5/8] update tools/analyze_xml/handle_xml.py. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/handle_xml.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py index 12946c2..7885565 100644 --- a/tools/analyze_xml/handle_xml.py +++ b/tools/analyze_xml/handle_xml.py @@ -68,7 +68,6 @@ def analyza_csv(xml_file, all_rpm_name_file): # 检查数据是否包含NaN值 missing_values = data.isnull().sum() - # unspecified_counts = data.apply(lambda x: x.str.count("Unspecified")).sum() # 如果有未分类的软件包 (NaN在'b'列,一下abc列皆为表头) if missing_values['group'] > 0: @@ -88,9 +87,7 @@ def analyza_csv(xml_file, all_rpm_name_file): X_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ 'description'] predicted_categories = model.predict(X_unclassified) - # print(X_unclassified) - # - # # 将预测的类别添加到表格 + # 将预测的类别添加到表格 data.loc[data['group'].isna(), 'group'] = predicted_categories # 保存更新后的表格到新的csv文件 -- Gitee From d3de4eda3a53f4838e820a542cc3ed18cbf4b7dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:36:53 +0000 Subject: [PATCH 6/8] update tools/analyze_xml/handle_xml.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/handle_xml.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py index 7885565..f852423 100644 --- a/tools/analyze_xml/handle_xml.py +++ b/tools/analyze_xml/handle_xml.py @@ -14,8 +14,8 @@ logger = Log() def create_csv(xml_file, all_rpm_name_file): # 使用minidom解析器打开 XML 文档 - DOMTree = xml.dom.minidom.parse(xml_file) - collection = DOMTree.documentElement + DOM_TREE = xml.dom.minidom.parse(xml_file) + collection = DOM_TREE.documentElement if collection.hasAttribute("shelf"): logger.info("Root element : %s" % collection.getAttribute("shelf")) @@ -61,9 +61,10 @@ def create_csv(xml_file, all_rpm_name_file): def analyza_csv(xml_file, all_rpm_name_file): - csv_file = create_csv(xml_file, all_rpm_name_file) + CSV_FILE = create_csv(xml_file, all_rpm_name_file) # 加载数据 - FILE_PATH = csv_file # 将此路径替换为你的csv文件的路径 + # 将此路径替换为你的csv文件的路径 + FILE_PATH = CSV_FILE data = pd.read_csv(FILE_PATH) # 检查数据是否包含NaN值 @@ -79,14 +80,14 @@ def analyza_csv(xml_file, all_rpm_name_file): model = make_pipeline(CountVectorizer(), MultinomialNB()) # 使用列a和c作为特征,列b作为目标变量来训练模型 - X_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] + X_TRAIN = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] y_train = train_data['group'] - model.fit(X_train, y_train) + model.fit(X_TRAIN, y_train) # 预测未分类的软件包的类别 - X_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ + X_UNCLASSIFIED = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ 'description'] - predicted_categories = model.predict(X_unclassified) + predicted_categories = model.predict(X_UNCLASSIFIED) # 将预测的类别添加到表格 data.loc[data['group'].isna(), 'group'] = predicted_categories @@ -111,4 +112,4 @@ def init_args(): if "__main__" == __name__: args = init_args() - analyza_csv(args.xml_file_name, args.rpm_file_name) + analyza_csv(args.xml_file_name, args.rpm_file_name) \ No newline at end of file -- Gitee From c48a63b743c600e7c91720cb1ed19c30788c8813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Thu, 6 Jul 2023 08:47:06 +0000 Subject: [PATCH 7/8] update tools/analyze_xml/handle_xml.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/handle_xml.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py index f852423..b4908ac 100644 --- a/tools/analyze_xml/handle_xml.py +++ b/tools/analyze_xml/handle_xml.py @@ -14,8 +14,8 @@ logger = Log() def create_csv(xml_file, all_rpm_name_file): # 使用minidom解析器打开 XML 文档 - DOM_TREE = xml.dom.minidom.parse(xml_file) - collection = DOM_TREE.documentElement + dom_tree = xml.dom.minidom.parse(xml_file) + collection = dom_tree.documentElement if collection.hasAttribute("shelf"): logger.info("Root element : %s" % collection.getAttribute("shelf")) @@ -61,11 +61,11 @@ def create_csv(xml_file, all_rpm_name_file): def analyza_csv(xml_file, all_rpm_name_file): - CSV_FILE = create_csv(xml_file, all_rpm_name_file) + csv_file = create_csv(xml_file, all_rpm_name_file) # 加载数据 # 将此路径替换为你的csv文件的路径 - FILE_PATH = CSV_FILE - data = pd.read_csv(FILE_PATH) + file_path = csv_file + data = pd.read_csv(file_path) # 检查数据是否包含NaN值 missing_values = data.isnull().sum() @@ -80,14 +80,14 @@ def analyza_csv(xml_file, all_rpm_name_file): model = make_pipeline(CountVectorizer(), MultinomialNB()) # 使用列a和c作为特征,列b作为目标变量来训练模型 - X_TRAIN = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] + x_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] y_train = train_data['group'] - model.fit(X_TRAIN, y_train) + model.fit(x_train, y_train) # 预测未分类的软件包的类别 - X_UNCLASSIFIED = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ + x_unclassified = unclassified_data['name'] + ' ' + unclassified_data['summary'] + unclassified_data[ 'description'] - predicted_categories = model.predict(X_UNCLASSIFIED) + predicted_categories = model.predict(x_unclassified) # 将预测的类别添加到表格 data.loc[data['group'].isna(), 'group'] = predicted_categories -- Gitee From 50d0fffb23102c03d63e5a3c1a3eb5b20abb2dfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E9=87=8D=E9=98=B3?= <15757101689@163.com> Date: Mon, 10 Jul 2023 12:10:10 +0000 Subject: [PATCH 8/8] update tools/analyze_xml/handle_xml.py. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪重阳 <15757101689@163.com> --- tools/analyze_xml/handle_xml.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/analyze_xml/handle_xml.py b/tools/analyze_xml/handle_xml.py index b4908ac..dbccca8 100644 --- a/tools/analyze_xml/handle_xml.py +++ b/tools/analyze_xml/handle_xml.py @@ -24,7 +24,7 @@ def create_csv(xml_file, all_rpm_name_file): ws = wb.active # 写入表头 - ws.append(['name', 'group', 'summary', 'description']) + ws.append(['name', 'group-new', 'group-old', 'summary', 'description']) # 在集合中获取所有rpm包 packages = collection.getElementsByTagName("package") @@ -44,7 +44,7 @@ def create_csv(xml_file, all_rpm_name_file): description = package.getElementsByTagName('description')[0] group = package.getElementsByTagName('rpm:group')[0] summary = package.getElementsByTagName('summary')[0] - ws.append([rpm_name, group.childNodes[0].data, summary.childNodes[0].data, + ws.append([rpm_name, group.childNodes[0].data, group.childNodes[0].data, summary.childNodes[0].data, description.childNodes[0].data.replace('\n', '')]) for cell in ws['B']: @@ -71,17 +71,17 @@ def analyza_csv(xml_file, all_rpm_name_file): missing_values = data.isnull().sum() # 如果有未分类的软件包 (NaN在'b'列,一下abc列皆为表头) - if missing_values['group'] > 0: + if missing_values['group-new'] > 0: # 分割数据为训练集和未分类集 - train_data = data[data['group'].notna()] - unclassified_data = data[data['group'].isna()] + train_data = data[data['group-new'].notna()] + unclassified_data = data[data['group-new'].isna()] # 创建一个模型来预测类别 model = make_pipeline(CountVectorizer(), MultinomialNB()) # 使用列a和c作为特征,列b作为目标变量来训练模型 x_train = train_data['name'] + ' ' + train_data['summary'] + train_data['description'] - y_train = train_data['group'] + y_train = train_data['group-new'] model.fit(x_train, y_train) # 预测未分类的软件包的类别 @@ -89,7 +89,7 @@ def analyza_csv(xml_file, all_rpm_name_file): 'description'] predicted_categories = model.predict(x_unclassified) # 将预测的类别添加到表格 - data.loc[data['group'].isna(), 'group'] = predicted_categories + data.loc[data['group-new'].isna(), 'group-new'] = predicted_categories # 保存更新后的表格到新的csv文件 data.to_csv('updated_file1.csv', index=False) -- Gitee
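
Patch 1 reworks git.py so that a pull request is fetched into a local pr_<num> branch and every check walks the output of git diff master --name-only --diff-filter=ACM pr_<num>. The stripped-down sketch below shows only that fetch-and-diff flow, without the retry decorators and logging of the real GitProxy; subprocess.run stands in for the tool's shell_cmd_live helper (whose implementation is not part of this series), and the checkout path, repository URL and PR number are placeholders, not values taken from the patches.

# Sketch of the PR fetch-and-diff flow used by GitProxy in patch 1:
# fetch pull/<num>/head into a local pr_<num> branch, then list the files
# the PR adds, copies or modifies relative to master.
import subprocess


def run(cmd, cwd):
    # subprocess.run stands in for the tool's shell_cmd_live helper.
    result = subprocess.run(cmd, cwd=cwd, shell=True, text=True,
                            capture_output=True, check=False)
    return result.returncode, result.stdout


def fetch_pull_request(repo_dir, url, pr_num):
    ret, _ = run("git fetch {} pull/{}/head:pr_{}".format(url, pr_num, pr_num), repo_dir)
    return ret == 0


def diff_files(repo_dir, pr_num):
    ret, out = run("git diff master --name-only --diff-filter=ACM pr_{}".format(pr_num), repo_dir)
    return out.splitlines() if ret == 0 else []


if __name__ == "__main__":
    repo_dir = "/var/jenkins_home/community"                    # hypothetical checkout path
    pr_url = "https://gitee.com/src-oepkgs/community"           # hypothetical repository URL
    if fetch_pull_request(repo_dir, pr_url, 123):               # 123 is a placeholder PR number
        print(diff_files(repo_dir, 123))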
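
The handle_xml.py tool introduced in this series walks a repodata primary.xml with xml.dom.minidom and copies each wanted package's name, rpm:group, summary and description into a spreadsheet. A minimal, self-contained sketch of that extraction step follows; the inline sample document is an assumption modelled on the usual primary.xml layout, not content taken from the patches.

#!/usr/bin/python3
# Sketch of the extraction step in handle_xml.py: read a primary.xml-style
# document and collect name / rpm:group / summary / description per package.
import xml.dom.minidom

SAMPLE_XML = """<?xml version="1.0" encoding="UTF-8"?>
<metadata xmlns="http://linux.duke.edu/metadata/common"
          xmlns:rpm="http://linux.duke.edu/metadata/rpm">
  <package type="rpm">
    <name>2ping</name>
    <summary>Bi-directional ping utility</summary>
    <description>2ping is a bi-directional ping utility.</description>
    <format>
      <rpm:group>Unspecified</rpm:group>
    </format>
  </package>
</metadata>"""


def extract_packages(xml_text):
    dom_tree = xml.dom.minidom.parseString(xml_text)
    rows = []
    for package in dom_tree.documentElement.getElementsByTagName("package"):
        # minidom matches on the qualified tag name, so 'rpm:group' works as
        # long as the document uses the rpm: prefix, as primary.xml does.
        name = package.getElementsByTagName("name")[0].childNodes[0].data
        group = package.getElementsByTagName("rpm:group")[0].childNodes[0].data
        summary = package.getElementsByTagName("summary")[0].childNodes[0].data
        description = package.getElementsByTagName("description")[0].childNodes[0].data
        rows.append([name, group, summary, description.replace("\n", "")])
    return rows


if __name__ == "__main__":
    for row in extract_packages(SAMPLE_XML):
        print(row)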
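
The group prediction in analyze_xml.py (later folded into analyza_csv() in handle_xml.py) trains a CountVectorizer + MultinomialNB pipeline on the rows that already carry a group, using name, summary and description as the text features, and then fills in the rows that do not. The sketch below reproduces that idea on an in-memory DataFrame instead of the generated file_num.csv; the three sample rows and their group values are invented for illustration.

# Sketch of the category-prediction step: fit a bag-of-words Naive Bayes
# model on already-classified packages, then predict the missing groups.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Toy stand-in for the csv produced by create_csv(); None marks an
# unclassified package (an empty group cell in the real file).
data = pd.DataFrame({
    "name": ["2ping", "389-ds", "3omns"],
    "group": ["Networking", "System/Servers", None],
    "summary": ["ping utility", "LDAP directory server", "arcade game"],
    "description": ["bi-directional ping", "enterprise directory", "old-school bomb game"],
})

train = data[data["group"].notna()]
unclassified = data[data["group"].isna()]

# name + summary + description form one text feature per package.
model = make_pipeline(CountVectorizer(), MultinomialNB())
x_train = train["name"] + " " + train["summary"] + " " + train["description"]
model.fit(x_train, train["group"])

# Predict the missing groups and write them back into the table, as the
# tool does before saving updated_file1.csv.
x_unknown = unclassified["name"] + " " + unclassified["summary"] + " " + unclassified["description"]
data.loc[data["group"].isna(), "group"] = model.predict(x_unknown)
print(data)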