diff --git a/config_checking/checkers/__init__.py b/config_checking/checkers/__init__.py
index ccb5bc2aafd72ea4d68e6a4bdb7bf5e62bc9703d..3f8f9a5eb1b3d6bebdea4cf958f94cb5553cc1cf 100644
--- a/config_checking/checkers/__init__.py
+++ b/config_checking/checkers/__init__.py
@@ -4,6 +4,8 @@ import config_checking.checkers.pip_checker
 import config_checking.checkers.checkpoint_checker
 import config_checking.checkers.dataset_checker
 import config_checking.checkers.weights_checker
+import config_checking.checkers.hyperparameter_checker
+import config_checking.checkers.random_instruction_checker
 
 from config_checking.checkers.base_checker import BaseChecker
 
diff --git a/config_checking/checkers/base_checker.py b/config_checking/checkers/base_checker.py
index e986b2623d06ab771f8f3f4789cabd2c8570c5b7..f3df998d8090616790bb863069573c818ec8488c 100644
--- a/config_checking/checkers/base_checker.py
+++ b/config_checking/checkers/base_checker.py
@@ -9,6 +9,7 @@ class PackInput:
         self.ckpt_path = config_dict.get("ckpt path", None)
         self.need_env_args = config_dict.get("env args", None)
         self.need_pip_data = config_dict.get("pip data", None)
+        self.hyperparameter_path = config_dict.get("hyperparameter path", None)
         self.output_zip_path = config_dict.get("output zip path", "./config_check_pack.zip")
         self.model = model
 
diff --git a/config_checking/checkers/hyperparameter_checker.py b/config_checking/checkers/hyperparameter_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1ec7fe7f014ad5298e7594c812e6bd7cd1720d8
--- /dev/null
+++ b/config_checking/checkers/hyperparameter_checker.py
@@ -0,0 +1,119 @@
+import os
+import json
+from config_checking.checkers.base_checker import BaseChecker
+from config_checking.config_checker import register_checker_item
+from config_checking.utils.packing import add_file_to_zip
+from config_checking.utils.utils import load_json, compare_dict, write_list_to_file
+from config_checking.utils.utils import config_checking_print
+
+
+@register_checker_item("hyperparameter")
+class HyperparameterChecker(BaseChecker):
+    """Pack hyperparameter files into the check archive and diff them between two runs."""
+
+    # Name of the PackInput attribute read by pack(): a dict mapping a
+    # sub-directory name to a file path, a list of file paths, or one path.
+    input_needed = "hyperparameter_path"
+
+    target_name_in_zip = "hyperparameters"  # directory created inside the zip
+    result_filename = "hyperparameter_diff.txt"
+
+    @staticmethod
+    def pack(pack_input):
+        """Add every configured hyperparameter file to the output zip.
+
+        Args:
+            pack_input: object providing ``hyperparameter_path`` and
+                ``output_zip_path``.
+
+        Raises:
+            TypeError: if ``hyperparameter_path`` is not a dict, a list or
+                tuple of paths, or a single path string.
+        """
+        hyperparameter_path = pack_input.hyperparameter_path
+        output_zip_path = pack_input.output_zip_path
+
+        # Normalise every accepted form to (sub-directory or None, file path).
+        if isinstance(hyperparameter_path, dict):
+            entries = list(hyperparameter_path.items())
+        elif isinstance(hyperparameter_path, (list, tuple)):
+            entries = [(None, pathname) for pathname in hyperparameter_path]
+        elif isinstance(hyperparameter_path, str):
+            entries = [(None, hyperparameter_path)]
+        else:
+            raise TypeError("hyperparameter_path should be a dict or a list of file paths.")
+
+        for dirname, pathname in entries:
+            if not os.path.isfile(pathname):
+                config_checking_print(f"Warning: Hyperparameter path {pathname} is not a file.")
+                continue
+            parts = [HyperparameterChecker.target_name_in_zip, os.path.basename(pathname)]
+            label = pathname
+            if dirname is not None:
+                parts.insert(1, dirname)
+                label = f"{dirname} {pathname}"
+            add_file_to_zip(output_zip_path, pathname, os.path.join(*parts))
+            config_checking_print(f"add hyperparameter {label} to zip")
+
+    @staticmethod
+    def _load_hyperparameter_files(hyperparameter_dir):
+        """Parse the JSON files found under ``hyperparameter_dir``.
+
+        Returns:
+            dict mapping each file's path relative to ``hyperparameter_dir``
+            to its parsed content; empty if the directory does not exist.
+        """
+        loaded = {}
+        for root, _, files in os.walk(hyperparameter_dir):
+            for filename in files:
+                filepath = os.path.join(root, filename)
+                if filename.endswith(('.yaml', '.yml')):
+                    # No YAML parser is available here; report the file
+                    # instead of silently leaving it out of the comparison.
+                    config_checking_print(f"Warning: YAML hyperparameter file {filepath} is not supported, skipped.")
+                    continue
+                if not filename.endswith('.json'):
+                    continue
+                try:
+                    with open(filepath, 'r', encoding='utf-8') as f:
+                        content = json.load(f)
+                except (OSError, ValueError) as e:
+                    # ValueError covers malformed JSON and undecodable bytes.
+                    config_checking_print(f"Error loading hyperparameter file {filepath}: {e}")
+                    continue
+                loaded[os.path.relpath(filepath, hyperparameter_dir)] = content
+        return loaded
+
+    @staticmethod
+    def compare(bench_dir, cmp_dir, output_path):
+        """Diff the packed hyperparameters of two runs and write a report.
+
+        Args:
+            bench_dir: root directory of the benchmark data.
+            cmp_dir: root directory of the data being compared.
+            output_path: directory receiving ``result_filename``.
+        """
+        target = HyperparameterChecker.target_name_in_zip
+        bench_hyperparameters = HyperparameterChecker._load_hyperparameter_files(os.path.join(bench_dir, target))
+        cmp_hyperparameters = HyperparameterChecker._load_hyperparameter_files(os.path.join(cmp_dir, target))
+        output_filepath = os.path.join(output_path, HyperparameterChecker.result_filename)
+
+        all_diffs = []
+        # Sorted so that the report does not depend on set iteration order.
+        for filename in sorted(set(bench_hyperparameters) | set(cmp_hyperparameters)):
+            bench_data = bench_hyperparameters.get(filename)
+            cmp_data = cmp_hyperparameters.get(filename)
+
+            if bench_data is not None and cmp_data is not None:
+                diff = compare_dict(bench_data, cmp_data)
+                if diff:
+                    # Name the file so entries from different files can be told apart.
+                    all_diffs.append(f"[Different] File: {filename}")
+                    all_diffs.extend(diff)
+            elif bench_data is not None:
+                all_diffs.append(f"[Only in benchmark] File: {filename}")
+            elif cmp_data is not None:
+                all_diffs.append(f"[Only in compare] File: {filename}")
+
+        write_list_to_file(all_diffs, output_filepath)
+        config_checking_print(f"Hyperparameter comparison result written to {output_filepath}")
diff --git a/config_checking/checkers/random_instruction_checker.py b/config_checking/checkers/random_instruction_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..d231bd7c561f4b07d741b559a0105754fca0e491
--- /dev/null
+++ b/config_checking/checkers/random_instruction_checker.py
@@ -0,0 +1,168 @@
+import os
+import json
+import functools
+import torch
+import inspect
+
+from config_checking.checkers.base_checker import BaseChecker
+from config_checking.config_checker import register_checker_item, register_pre_forward_fun_list
+from config_checking.utils.packing import create_file_in_zip
+from config_checking.utils.utils import write_list_to_file, config_checking_print, get_rank, write_content_to_file
+from config_checking.utils.utils import read_rank_result_to_dict, compare_dicts
+
+# Recorded random operations, one list per rank.
+random_operations_history = {}
+
+
+def _describe_argument(value):
+    """Return a short string for ``value``; tensors are summarised by shape and dtype."""
+    if isinstance(value, torch.Tensor):
+        return f"Tensor[shape={list(value.shape)}, dtype={value.dtype}]"
+    return str(value)
+
+
+def get_random_op_info(op_name, *args, **kwargs):
+    """Describe one call of a random operation.
+
+    Returns:
+        dict with ``op_name`` plus, when present, ``args`` and ``kwargs``
+        rendered as strings.
+    """
+    info = {"op_name": op_name}
+    if args:
+        info["args"] = [_describe_argument(arg) for arg in args]
+    if kwargs:
+        info["kwargs"] = {k: _describe_argument(v) for k, v in kwargs.items()}
+    return info
+
+
+def capture_random_state():
+    """Return the current CPU and CUDA random generator states as plain lists."""
+    cuda_states = None
+    if torch.cuda.is_available():
+        # One state tensor per device; converted like the CPU state so the
+        # whole snapshot can be passed to json.dumps.
+        cuda_states = [state.tolist() for state in torch.cuda.get_rng_state_all()]
+    return {
+        "torch.random.get_rng_state": torch.random.get_rng_state().tolist(),
+        "torch.cuda.get_rng_state_all": cuda_states,
+    }
+
+
+# Random functions to hook; extend as needed.
+RANDOM_FUNCTIONS_TO_HOOK = [
+    (torch, 'rand'),
+    (torch, 'randn'),
+    (torch, 'randint'),
+    (torch, 'randperm'),
+    (torch.nn.functional, 'dropout'),  # dropout is random as well
+]
+
+# Originals of the functions that are currently hooked, keyed by
+# (module, function name). Empty while no hook is installed.
+original_functions = {}
+
+
+def _make_recording_wrapper(func_name, original_func, history):
+    """Build a wrapper that logs each call into ``history`` before delegating.
+
+    A factory is required: a closure created directly in the hooking loop
+    would read the loop variables late and every wrapper would call the
+    last hooked function.
+    """
+    @functools.wraps(original_func)
+    def hooked_func(*args, **kwargs):
+        info = get_random_op_info(func_name, *args, **kwargs)
+        # Name of the calling function, to help locate the random call.
+        frame = inspect.currentframe()
+        caller = frame.f_back if frame is not None else None
+        info['called_by'] = caller.f_code.co_name if caller is not None else None
+        history.append(info)
+        return original_func(*args, **kwargs)
+
+    return hooked_func
+
+
+def hook_random_functions():
+    """Replace the functions in RANDOM_FUNCTIONS_TO_HOOK with recording wrappers."""
+    if original_functions:
+        # Already hooked: wrapping again would save a wrapper as the
+        # "original" and the real function could never be restored.
+        return
+    history = random_operations_history.setdefault(get_rank(), [])
+    for module, func_name in RANDOM_FUNCTIONS_TO_HOOK:
+        original_func = getattr(module, func_name)
+        original_functions[(module, func_name)] = original_func
+        setattr(module, func_name, _make_recording_wrapper(func_name, original_func, history))
+
+
+def unhook_random_functions():
+    """Restore every hooked function and forget the saved originals."""
+    for (module, func_name), original_func in original_functions.items():
+        setattr(module, func_name, original_func)
+    original_functions.clear()
+
+
+@register_checker_item("random_instruction")
+class RandomInstructionChecker(BaseChecker):
+    """Record the random operations run during forward and diff them between two runs."""
+
+    input_needed = "model"
+    multi_rank = True
+
+    target_name_in_zip = "random_instructions"
+    result_filename = "random_instruction_check_result.txt"
+
+    @staticmethod
+    def pack(pack_input):
+        """Install hooks that record random calls during forward and save them to the zip.
+
+        Args:
+            pack_input: object providing ``model`` and ``output_zip_path``.
+        """
+        output_zip_path = pack_input.output_zip_path
+
+        def pre_forward_hook(module, input):
+            hook_random_functions()
+            return input
+
+        def post_forward_hook(module, input, output):
+            unhook_random_functions()
+            # Save the history captured so far for this rank into the zip.
+            rank = get_rank()
+            filepath = os.path.join(RandomInstructionChecker.target_name_in_zip, f"rank{rank}.json")
+            create_file_in_zip(output_zip_path, filepath, json.dumps(random_operations_history.get(rank, []), indent=4))
+            config_checking_print("add random instructions info to zip")
+
+        register_pre_forward_fun_list(pre_forward_hook)
+        # The post hook takes (module, input, output), so it cannot go into the
+        # pre-forward list; it is attached with torch's forward-hook API.
+        # NOTE(review): assumes pack_input.model is an nn.Module or a list/tuple
+        # of them - confirm against the callers of PackInput.
+        model = pack_input.model
+        models = model if isinstance(model, (list, tuple)) else [model]
+        for sub_model in models:
+            sub_model.register_forward_hook(post_forward_hook)
+
+    @staticmethod
+    def compare(bench_dir, cmp_dir, output_path):
+        """Compare the recorded random operations of two runs rank by rank.
+
+        Args:
+            bench_dir: root directory of the benchmark data.
+            cmp_dir: root directory of the data being compared.
+            output_path: directory receiving ``result_filename``.
+        """
+        target = RandomInstructionChecker.target_name_in_zip
+        bench_random_instructions = read_rank_result_to_dict(os.path.join(bench_dir, target))
+        cmp_random_instructions = read_rank_result_to_dict(os.path.join(cmp_dir, target))
+
+        comparison_results = {}
+        for rank in sorted(bench_random_instructions.keys() | cmp_random_instructions.keys()):
+            bench_ops = bench_random_instructions.get(rank, [])
+            cmp_ops = cmp_random_instructions.get(rank, [])
+            comparison_results[rank] = {"diff": compare_dicts(bench_ops, cmp_ops)}
+
+        output_filepath = os.path.join(output_path, RandomInstructionChecker.result_filename)
+        write_content_to_file(json.dumps(comparison_results, indent=4), output_filepath)
+        config_checking_print(f"Random instruction comparison result written to {output_filepath}")