From 69d43d47ad83fdb8a998e4b97fbc2d5c47c3691e Mon Sep 17 00:00:00 2001 From: pxp1 <958876660@qq.com> Date: Thu, 29 Aug 2024 17:41:39 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=94=BE=E5=BC=80config=5Ffile=5Fchecker?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config_checking/checkers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_checking/checkers/__init__.py b/config_checking/checkers/__init__.py index ebb6cef..d8e2755 100644 --- a/config_checking/checkers/__init__.py +++ b/config_checking/checkers/__init__.py @@ -5,7 +5,7 @@ import config_checking.checkers.pip_checker import config_checking.checkers.checkpoint_checker import config_checking.checkers.dataset_checker import config_checking.checkers.weights_checker -# import config_checking.checkers.config_file_checker +import config_checking.checkers.config_file_checker from config_checking.checkers.base_checker import BaseChecker -- Gitee From f483486cd75b18c36d2dbd27db0fb4349edf0e00 Mon Sep 17 00:00:00 2001 From: pxp1 <958876660@qq.com> Date: Tue, 3 Sep 2024 17:14:55 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=90=88=E5=B9=B6=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E6=A3=80=E6=9F=A5=E5=92=8C=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 13 +- config_checking/checkers/__init__.py | 1 - config_checking/checkers/base_checker.py | 1 - config_checking/checkers/code_checker.py | 79 ++++++- .../checkers/config_file_checker.py | 207 ------------------ config_checking/utils/dir_cmp.py | 56 ----- config_checking/utils/packing.py | 2 +- 7 files changed, 76 insertions(+), 283 deletions(-) delete mode 100644 config_checking/checkers/config_file_checker.py delete mode 100644 config_checking/utils/dir_cmp.py diff --git a/README.md b/README.md index 2fd9108..be8c9f0 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,6 @@ - 文件内容 - 启动命令(需要提前写到文件里) -- 结构化配置文件(ini、xml、yaml、yml、json等) - 静态checkpoint(state dict形式) - 环境变量 - 三方库版本 @@ -32,12 +31,11 @@ ``` { - "code path": "str", - "exec cmd filepath": "str", - "config_home_dirs": { - "dir1": "path_to_config_dir1", - "dir2": "path_to_config_dir2" + "code path": { + "path1": "code_path1", + "path2": "code_path2" }, + "exec cmd filepath": "str", "ckpt path": "str", "env args": true, "pip data": true, @@ -46,9 +44,8 @@ ``` 说明: -- code path:需要对比文件内容的路径。可以是单个文件,也可以是目录,是目录的话就会采集目录下所有文件。 +- code path:需要对比文件内容的路径。可以是单个文件,也可以是目录,是目录的话就会采集目录下所有文件。可传入多个路径,按path1、path2、path3排序即可;npu和gpu上须保证对应。 - exec cmd filepath:启动命令所在文件路径。 -- config_home_dirs:参数指定了多个目录,代码会自动采集这些目录下所有结构化文件(ini、xml、yaml、yml、json等)。字段名dir1、dir2自定义,区分开即可。 - ckpt path:权重文件存放路径。 - env args:环境变量。传true就表示采集,不传或缺省就表示不采集。 - pip data:三方库信息。传true就表示采集,不传或缺省就表示不采集。 diff --git a/config_checking/checkers/__init__.py b/config_checking/checkers/__init__.py index d8e2755..831d5d1 100644 --- a/config_checking/checkers/__init__.py +++ b/config_checking/checkers/__init__.py @@ -5,7 +5,6 @@ import config_checking.checkers.pip_checker import config_checking.checkers.checkpoint_checker import config_checking.checkers.dataset_checker import config_checking.checkers.weights_checker -import config_checking.checkers.config_file_checker from config_checking.checkers.base_checker import BaseChecker diff --git a/config_checking/checkers/base_checker.py b/config_checking/checkers/base_checker.py index 2b0a672..acc2a50 100644 --- a/config_checking/checkers/base_checker.py +++ b/config_checking/checkers/base_checker.py @@ -7,7 +7,6 @@ class PackInput: def __init__(self, config_dict=None, model=None): self.code_path = config_dict.get("code path", None) self.exec_cmd_filepath = config_dict.get("exec cmd filepath", None) - self.config_home_dirs = config_dict.get("config_home_dirs", None) self.ckpt_path = config_dict.get("ckpt path", None) self.need_env_args = config_dict.get("env args", None) self.need_pip_data = config_dict.get("pip data", None) diff --git a/config_checking/checkers/code_checker.py b/config_checking/checkers/code_checker.py index 5ba3815..a20ce69 100644 --- a/config_checking/checkers/code_checker.py +++ b/config_checking/checkers/code_checker.py @@ -1,13 +1,73 @@ import os - -from config_checking.utils.dir_cmp import compare_directories +import json +import filecmp +import difflib from config_checking.checkers.base_checker import BaseChecker from config_checking.utils.packing import DirPacker, add_file_to_zip from config_checking.config_checker import register_checker_item from config_checking.utils.utils import write_list_to_file from config_checking.utils.utils import config_checking_print +from config_checking.utils.config_compare import CONFIG_EXTENSIONS, ConfigComparator + + +def is_constructed_file(filepath): + for suffix in CONFIG_EXTENSIONS: + if filepath.endswith(suffix): + return True + return False + +def file_is_identical(filepath1, filepath2): + return filecmp.cmp(filepath1, filepath2) + +def compare_directories(dir1, dir2, output_dir): + # Create the output directory if it doesn't exist + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # Walk through the directory trees + for root1, dirs1, files1 in os.walk(dir1): + for root2, dirs2, files2 in os.walk(dir2): + # Find the corresponding directory in the other tree + rel_path1 = os.path.relpath(root1, dir1) + rel_path2 = os.path.relpath(root2, dir2) + if rel_path1 == rel_path2: + # Create the corresponding directory in the output dir + output_root = os.path.join(output_dir, rel_path1) + if not os.path.exists(output_root): + os.makedirs(output_root) + # Compare files in the current directory + for file1 in files1: + file_path1 = os.path.join(root1, file1) + file_path2 = os.path.join(root2, file1) + if os.path.exists(file_path2): + if is_constructed_file(file_path1): + diff, has_diff = ConfigComparator().compare(file_path1, file_path2) + diff = json.dumps(diff, indent=4) + # Compare file contents + else: + has_diff = not file_is_identical(file_path1, file_path2) + if has_diff: + with open(file_path1, 'r', encoding='utf-8') as f1, open(file_path2, 'r', encoding='utf-8') as f2: + diff = difflib.ndiff(f1.readlines(), f2.readlines()) + if has_diff: + diff_file = os.path.join(output_root, f"{file1}.diff") + with open(diff_file, 'w', encoding='utf-8') as f: + f.writelines(diff) + else: + # File only exists in dir1, mark as deleted + deleted_file = os.path.join(output_root, f"{file1}.deleted") + with open(deleted_file, 'w', encoding='utf-8') as f: + f.write("File deleted") + for file2 in files2: + file_path2 = os.path.join(root2, file2) + if not os.path.exists(os.path.join(root1, file2)): + # File only exists in dir2, mark as added + added_file = os.path.join(output_root, f"{file2}.added") + with open(added_file, 'w', encoding='utf-8') as f: + f.write("File added") + def get_all_files(directory): file_list = [] for root, dirs, files in os.walk(directory): @@ -43,13 +103,14 @@ class CodeChecker(BaseChecker): def pack(pack_input): code_path = pack_input.code_path output_zip_path = pack_input.output_zip_path - if os.path.isdir(code_path): - DirPacker(code_path, output_zip_path, CodeChecker.target_name_in_zip) - config_checking_print(f"add code dir {code_path} to zip") - elif os.path.isfile(code_path): - dest_path_in_zip = os.path.join(CodeChecker.target_name_in_zip, os.path.basename(code_path)) - add_file_to_zip(output_zip_path, code_path, dest_path_in_zip) - config_checking_print(f"add code file {code_path} to zip") + for dirname, pathname in code_path.items(): + if os.path.isdir(pathname): + DirPacker(pathname, output_zip_path, os.path.join(CodeChecker.target_name_in_zip, dirname)) + config_checking_print(f"add code {dirname} {pathname} to zip") + elif os.path.isfile(pathname): + dest_path_in_zip = os.path.join(CodeChecker.target_name_in_zip, dirname, os.path.basename(pathname)) + add_file_to_zip(output_zip_path, pathname, dest_path_in_zip) + config_checking_print(f"add code {dirname} {pathname} to zip") def compare(bench_dir, cmp_dir, output_path): diff --git a/config_checking/checkers/config_file_checker.py b/config_checking/checkers/config_file_checker.py deleted file mode 100644 index fce5cee..0000000 --- a/config_checking/checkers/config_file_checker.py +++ /dev/null @@ -1,207 +0,0 @@ -import json -import os -import shutil -from pathlib import Path - -import yaml - -from config_checking.checkers.base_checker import BaseChecker -from config_checking.config_checker import register_checker_item -from config_checking.utils.config_compare import CONFIG_EXTENSIONS, ConfigComparator -from config_checking.utils.hash import string_hash -from config_checking.utils.packing import zip_folder_with_option - - -class Filter: - """过滤器基类""" - - def apply(self, files): - pass - - -class FileSizeFilter(Filter): - """根据文件大小过滤""" - MAX_FILE_SIZE = 1 << 20 # 1MB - - def __init__(self, min_size=None, max_size=None): - self.min_size = min_size - self.max_size = max_size - if not max_size: - self.max_size = FileSizeFilter.MAX_FILE_SIZE - - def apply(self, files): - output_files = [] - for file in files: - if os.path.exists(file): - size = os.path.getsize(file) - if (self.min_size is None or size >= self.min_size) and ( - self.max_size is None or size <= self.max_size): - output_files.append(file) - return output_files - - -class FileExtensionFilter(Filter): - """根据文件扩展名过滤""" - - def __init__(self, extensions): - self.extensions = extensions - - def apply(self, files): - output_files = [] - for file in files: - ext = Path(file).suffix - if ext.lower() in self.extensions: - output_files.append(file) - return output_files - - -def apply_filter_chain(filters, files): - """应用一系列过滤器对文件列表做筛选""" - for f in filters: - files = f.apply(files) - return files - - -def _copy_and_rename_files_with_given_extension(src_folder, dest_folder, file_extension=None, prefix='', - ext_filters=None): - """ - 构建统一方法,传入过滤chain对文件进行过滤,保存到dest文件夹 - Args: - src_folder: - dest_folder: - file_extension: - prefix: - ext_filters: 额外文件过滤函数,对文件列表做过滤 - Returns: - """ - if not file_extension: - return - - filters = [ - FileExtensionFilter(extensions=file_extension), - FileSizeFilter() - ] - if ext_filters: - filters.extend(ext_filters) - - output_file_dict = dict() - - # TODO 递归优化 - for root, _, files in os.walk(src_folder): - files = [os.path.join(root, x) for x in files] - files = apply_filter_chain(filters, files) - - for file in files: - file_name = os.path.basename(file) - # 生成文件路径hash: prefix_相对路径_文件名 - relative_file_path = os.path.join(str(os.path.relpath(root, src_folder)), file_name) - file_hash = string_hash(prefix + relative_file_path) - # 生成新的文件名: 哈希值_文件名 - new_file_name = file_hash + '_' + file_name - - src_file = file - dest_file = os.path.join(dest_folder, new_file_name) - output_file_dict[new_file_name] = src_file - - # 确保目标目录存在 - os.makedirs(os.path.dirname(dest_file), exist_ok=True) - shutil.copy2(src_file, dest_file) - # TODO change to logger - # config_checking_print(f"Copied and renamed: {src_file} -> {dest_file}") - - return output_file_dict - - -@register_checker_item("config_file") -class ConfigFileChecker(BaseChecker): - """ - 配置文件对比,pack路径(TODO 待补充) - """ - input_needed = "config_home_dirs" - target_name_in_zip = "config_files" - - # 支持的文件类型检查 - EXTENSIONS = CONFIG_EXTENSIONS - # 配置文件存储路径 - TARGET_CFG_DIR = "all_configs" - TARGET_FILE_DICT = "total_file_dict.json" - comparator = ConfigComparator() - - @staticmethod - def _load_config_files_dict(root_dir): - dest_dir = os.path.join(root_dir, ConfigFileChecker.target_name_in_zip) - - file_dict_path = os.path.join(dest_dir, ConfigFileChecker.TARGET_FILE_DICT) - cfg_files_dir = os.path.join(dest_dir, ConfigFileChecker.TARGET_CFG_DIR) - files = dict() - for entry in os.listdir(cfg_files_dir): - full_path = os.path.join(cfg_files_dir, entry) - if os.path.isdir(full_path): - continue - files[entry] = full_path - with open(file_dict_path, 'r', encoding='utf-8') as f: - src_files_dict = json.load(f) - - return src_files_dict, files - - @staticmethod - def _compare_cfgs(bench_conf_dict, cmp_conf_dict, bench_src_dict, cmp_src_dict): - output = dict() - for key in bench_conf_dict.keys(): - diff_item = dict() - conf1 = bench_conf_dict[key] - conf2 = cmp_conf_dict[key] - diff, has_diff = ConfigFileChecker.comparator.compare(conf1, conf2) - if not has_diff: - continue - # record this diff - diff_item['diff'] = diff - diff_item['bench_src_file'] = bench_src_dict.get(key) - diff_item['cmp_src_file'] = cmp_src_dict.get(key) - output[key] = diff_item - return output - - @staticmethod - def _write_files(diff_results, output_dir, file_name='config_file_checker.diff.yaml'): - out_file_path = os.path.join(output_dir, file_name) - with open(out_file_path, 'w', encoding='utf-8') as file: - yaml.dump(diff_results, file, default_flow_style=False, allow_unicode=True) - - @staticmethod - def _write_file_dict_file(dest_dir, file_dict): - dest_file = os.path.join(dest_dir, ConfigFileChecker.TARGET_FILE_DICT) - with open(dest_file, 'w', encoding='utf-8') as file: - json.dump(file_dict, file) - - @staticmethod - def pack(configs): - output_zip_path = configs.output_zip_path - training_config_dirs = configs.config_home_dirs - - # should be deleted by end of this func - dest_dir = os.path.join(os.path.dirname(output_zip_path), ConfigFileChecker.target_name_in_zip) - dest_config_file_dir = os.path.join(dest_dir, ConfigFileChecker.TARGET_CFG_DIR) - - total_file_dict = dict() - for _name, _conf_dir in training_config_dirs.items(): - file_dict = _copy_and_rename_files_with_given_extension(_conf_dir, dest_config_file_dir, - file_extension=ConfigFileChecker.EXTENSIONS, - prefix=_name) - total_file_dict.update(file_dict) - - # add total dict to dest dir - ConfigFileChecker._write_file_dict_file(dest_dir, total_file_dict) - - # add current dir to zip file - zip_folder_with_option(output_zip_path, dest_dir, prefix=ConfigFileChecker.target_name_in_zip, - keep_structure=True) - if os.path.exists(output_zip_path): - shutil.rmtree(dest_dir) - - @staticmethod - def compare(bench_dir, cmp_dir, output_path): - bench_src_dict, bench_files_dict = ConfigFileChecker._load_config_files_dict(bench_dir) - cmp_src_dict, cmp_files_dict = ConfigFileChecker._load_config_files_dict(cmp_dir) - diff_results = ConfigFileChecker._compare_cfgs(bench_files_dict, cmp_files_dict, bench_src_dict, cmp_src_dict) - - ConfigFileChecker._write_files(diff_results, output_path) diff --git a/config_checking/utils/dir_cmp.py b/config_checking/utils/dir_cmp.py deleted file mode 100644 index 795f646..0000000 --- a/config_checking/utils/dir_cmp.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import filecmp -import difflib - -def compare_directories(dir1, dir2, output_dir): - # Create the output directory if it doesn't exist - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # Walk through the directory trees - for root1, dirs1, files1 in os.walk(dir1): - for root2, dirs2, files2 in os.walk(dir2): - # Find the corresponding directory in the other tree - rel_path1 = os.path.relpath(root1, dir1) - rel_path2 = os.path.relpath(root2, dir2) - if rel_path1 == rel_path2: - # Create the corresponding directory in the output dir - output_root = os.path.join(output_dir, rel_path1) - if not os.path.exists(output_root): - os.makedirs(output_root) - - # Compare files in the current directory - for file1 in files1: - file_path1 = os.path.join(root1, file1) - file_path2 = os.path.join(root2, file1) - if os.path.exists(file_path2): - # Compare file contents - if filecmp.cmp(file_path1, file_path2): - # Files are identical, do nothing - pass - else: - # Files are different, generate diff file - diff_file = os.path.join(output_root, f"{file1}.diff") - with open(file_path1, 'r', encoding='utf-8') as f1, open(file_path2, 'r', encoding='utf-8') as f2: - diff = difflib.ndiff(f1.readlines(), f2.readlines()) - with open(diff_file, 'w', encoding='utf-8') as f: - f.writelines(diff) - else: - # File only exists in dir1, mark as deleted - deleted_file = os.path.join(output_root, f"{file1}.deleted") - with open(deleted_file, 'w', encoding='utf-8') as f: - f.write("File deleted") - - for file2 in files2: - file_path2 = os.path.join(root2, file2) - if not os.path.exists(os.path.join(root1, file2)): - # File only exists in dir2, mark as added - added_file = os.path.join(output_root, f"{file2}.added") - with open(added_file, 'w', encoding='utf-8') as f: - f.write("File added") - -if __name__ == '__main__': - dir1 = '/path/to/dir1' - dir2 = '/path/to/dir2' - output_dir = '/path/to/output_dir' - compare_directories(dir1, dir2, output_dir) \ No newline at end of file diff --git a/config_checking/utils/packing.py b/config_checking/utils/packing.py index 5c44fb9..c7c9a9a 100644 --- a/config_checking/utils/packing.py +++ b/config_checking/utils/packing.py @@ -96,7 +96,7 @@ class DirPacker: self.root_dir = root_dir self.zip_file = zip_file self.result_dirname = result_dirname - self.zip_handler = zipfile.ZipFile(zip_file, 'w') + self.zip_handler = zipfile.ZipFile(zip_file, 'a') self.parse_directory() self.close_zip() -- Gitee