From c4a0ac57d1833d24908812d20faebff7507607dc Mon Sep 17 00:00:00 2001
From: sunyiming
Date: Sat, 22 Feb 2025 16:17:45 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E6=94=AF=E6=8C=81=E8=B6=85=E5=8F=82?=
 =?UTF-8?q?=E6=95=B0=E6=AF=94=E5=AF=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../config_checking/checkers/__init__.py      |   1 +
 .../config_checking/checkers/base_checker.py  |   1 +
 .../checkers/hyperparameter_checker.py        | 199 ++++++++++++++++++
 3 files changed, 201 insertions(+)
 create mode 100644 debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py

diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py
index 403d01e43..d0218b5fd 100644
--- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py
+++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py
@@ -21,6 +21,7 @@ import msprobe.pytorch.config_checking.checkers.pip_checker
 import msprobe.pytorch.config_checking.checkers.checkpoint_checker
 import msprobe.pytorch.config_checking.checkers.dataset_checker
 import msprobe.pytorch.config_checking.checkers.weights_checker
+import msprobe.pytorch.config_checking.checkers.hyperparameter_checker
 
 from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker
 
diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py
index 45b0cfcc1..d295cb580 100644
--- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py
+++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py
@@ -24,6 +24,7 @@ class PackInput:
         self.ckpt_path = config_dict.get("ckpt path", None)
         self.need_env_args = config_dict.get("env args", None)
         self.need_pip_data = config_dict.get("pip data", None)
+        self.shell_path = config_dict.get("shell path", None)
         self.output_zip_path = config_dict.get("output zip path", "./config_check_pack.zip")
         self.model = model
 
diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py
new file mode 100644
index 000000000..a59395102
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import json
+from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker
+from msprobe.pytorch.config_checking.config_checker import register_checker_item
+from msprobe.pytorch.config_checking.utils.packing import add_file_to_zip
+from msprobe.pytorch.config_checking.utils.utils import load_json, compare_dict, write_list_to_file
+from msprobe.pytorch.config_checking.utils.utils import config_checking_print
+from typing import Union, List, Dict, Any
+from difflib import SequenceMatcher
+import tempfile
+import re
+
+@register_checker_item("hyperparameter")
+class HyperparameterChecker(BaseChecker):
+    input_needed = "shell_path"
+    target_name_in_zip = "hyperparameters"
+    result_filename = "hyperparameter_diff.txt"
+
+    PARAMETER_NAME_MAPPING = {
+        "learning_rate": ["lr", "learningrate"],
+        "batch_size": ["batch", "bs", "batch_size_per_gpu"],
+        "epochs": ["num_epochs", "max_epochs", "epoch"],
+        "weight_decay": ["wd", "weightdecay"],
+        "dropout_rate": ["dropout", "drop_rate"],
+    }
+
+    @staticmethod
+    def pack(pack_input):
+        shell_path = pack_input.shell_path
+        output_zip_path = pack_input.output_zip_path
+
+        if not isinstance(shell_path, list):
+            raise TypeError("shell_path should be a list of file paths.")
+
+        for script_path in shell_path:
+            if os.path.isfile(script_path):
+                hyperparameters = HyperparameterChecker._extract_hyperparameters_from_script(script_path)
+                if hyperparameters:
+                    dest_path_in_zip = os.path.join(HyperparameterChecker.target_name_in_zip, os.path.splitext(os.path.basename(script_path))[0] + ".json")
+                    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as tmp_file:
+                        json.dump(hyperparameters, tmp_file, indent=4)
+                        tmp_file_path = tmp_file.name
+                    add_file_to_zip(output_zip_path, tmp_file_path, dest_path_in_zip)
+                    os.remove(tmp_file_path)
+                    config_checking_print(f"add hyperparameters args to zip")
+                else:
+                    config_checking_print(f"Warning: Failed to extract hyperparameters from script {script_path}")
+            else:
+                config_checking_print(f"Warning: Script path {script_path} is not a file.")
+
+    @staticmethod
+    def _extract_hyperparameters_from_script(script_path: str) -> Dict[str, Any]:
+        """
+        Extracts arguments from bash script used to run a model training.
+        """
+        hyperparameters = {}
+        with open(script_path, 'r') as file:
+            script_content = file.read()
+
+        command_line = re.search(r'torchrun\s+(.*?)\s*\|', script_content, re.DOTALL)
+        if command_line:
+            command_line = command_line.group(1)
+
+            blocks = re.findall(r'(\w+_ARGS)="(.*?)"', script_content, re.DOTALL)
+            block_contents = {}
+            for block_name, block_content in blocks:
+                block_content = block_content.replace('\n', ' ')
+                block_contents[block_name] = block_content
+                command_line = command_line.replace(f"${block_name}", block_content)
+
+            matches = re.findall(r'--([\w-]+)(?:\s+([^\s\\]+))?', command_line)
+            for match in matches:
+                key, value = match
+                if value and value.startswith('$'):
+                    env_var = re.search(rf'{value[1:]}="?(.*?)"?\s', script_content)
+                    if env_var:
+                        value = env_var.group(1)
+                hyperparameters[key] = value if value else True
+
+        return hyperparameters
+
+    @staticmethod
+    def _fuzzy_match_parameter(param_name: str, available_params: Dict[str, Any]) -> Union[str, None]:
+        """
+        Fuzzy matches a parameter name against available parameter names using predefined mappings and string similarity.
+        """
+        if param_name in available_params:
+            return param_name
+
+        canonical_name = None
+        for standard_name, aliases in HyperparameterChecker.PARAMETER_NAME_MAPPING.items():
+            if param_name == standard_name or param_name in aliases:
+                canonical_name = standard_name
+                break
+
+        if canonical_name:
+            if canonical_name in available_params:
+                return canonical_name
+            for alias in HyperparameterChecker.PARAMETER_NAME_MAPPING[canonical_name]:
+                if alias in available_params:
+                    config_checking_print(f"Matched '{param_name}' to alias '{alias}' via canonical name '{canonical_name}'")
+                    return alias
+
+        best_match_name = None
+        best_match_ratio = 0.8
+        for available_param_name in available_params:
+            ratio = SequenceMatcher(None, param_name.lower(), available_param_name.lower()).ratio()
+            if ratio > best_match_ratio:
+                best_match_ratio = ratio
+                best_match_name = available_param_name
+
+        if best_match_name:
+            config_checking_print(f"Fuzzy matched parameter '{param_name}' to '{best_match_name}' (similarity: {best_match_ratio:.2f})")
+            return best_match_name
+
+        return None
+
+    def compare(bench_dir, cmp_dir, output_path):
+        bench_model_dir = os.path.join(bench_dir, HyperparameterChecker.target_name_in_zip)
+        cmp_model_dir = os.path.join(cmp_dir, HyperparameterChecker.target_name_in_zip)
+        output_filepath = os.path.join(output_path, HyperparameterChecker.result_filename)
+
+        bench_hyperparameters = {}
+        cmp_hyperparameters = {}
+
+        if os.path.exists(bench_model_dir):
+            for root, _, files in os.walk(bench_model_dir):
+                for file in files:
+                    if file.endswith('.json'):
+                        filepath = os.path.join(root, file)
+                        relative_filepath = os.path.relpath(filepath, bench_model_dir)
+                        params = load_json(filepath)
+                        if params:
+                            bench_hyperparameters[relative_filepath] = params
+
+        if os.path.exists(cmp_model_dir):
+            for root, _, files in os.walk(cmp_model_dir):
+                for file in files:
+                    if file.endswith('.json'):
+                        filepath = os.path.join(root, file)
+                        relative_filepath = os.path.relpath(filepath, cmp_model_dir)
+                        params = load_json(filepath)
+                        if params:
+                            cmp_hyperparameters[relative_filepath] = params
+
+        all_diffs = []
+        all_files = set(bench_hyperparameters.keys()) | set(cmp_hyperparameters.keys())
+
+        for filename in all_files:
+            bench_params = bench_hyperparameters.get(filename, None)
+            cmp_params = cmp_hyperparameters.get(filename, None)
+
+            if bench_params is not None and cmp_params is not None:
+                file_diffs = []
+                bench_param_names = set(bench_params.keys())
+                cmp_param_names = set(cmp_params.keys())
+
+                for bench_param_name in bench_param_names:
+                    matched_cmp_param_name = HyperparameterChecker._fuzzy_match_parameter(bench_param_name, cmp_params)
+                    if matched_cmp_param_name:
+                        bench_param_value = bench_params[bench_param_name]
+                        cmp_param_value = cmp_params[matched_cmp_param_name]
+                        if bench_param_value != cmp_param_value:
+                            diff = compare_dict({bench_param_name: bench_param_value},
+                                                {matched_cmp_param_name: cmp_param_value})
+                            if diff:
+                                file_diffs.extend([f" Parameter '{bench_param_name}' (matched with '{matched_cmp_param_name}'): {d}" for d in diff])
+                        del cmp_params[matched_cmp_param_name]
+                    else:
+                        file_diffs.append(f" [Only in benchmark] Parameter: '{bench_param_name}': {bench_params[bench_param_name]}")
+
+                for cmp_param_name, cmp_param_value in cmp_params.items():
+                    file_diffs.append(f" [Only in compare] Parameter: '{cmp_param_name}': {cmp_param_value}")
+
+                if file_diffs:
+                    all_diffs.append(f"File: {filename}")
+                    all_diffs.extend(file_diffs)
+
+            elif bench_params is not None:
+                all_diffs.append(f"[Only in benchmark] File: {filename}")
+            elif cmp_params is not None:
+                all_diffs.append(f"[Only in compare] File: {filename}")
+
+        write_list_to_file(all_diffs, output_filepath)
\ No newline at end of file
-- 
Gitee


From 23339582c2bfd363e7b841f7cce6b19b7b533626 Mon Sep 17 00:00:00 2001
From: sunyiming
Date: Fri, 28 Feb 2025 10:45:53 +0800
Subject: [PATCH 2/2] update

---
Review notes (text in this section is ignored when the patch is applied):
- FileChecker was called but never imported; it is now imported together
  with os_walk_for_files (assumed to be exported by
  msprobe.core.common.file_utils -- confirm).
- load_hyperparameters was defined in the class body but called by its bare
  name from compare, which raises NameError at run time. It is now a
  @staticmethod and is called as HyperparameterChecker.load_hyperparameters;
  compare is marked @staticmethod as well, matching pack.
- FileChecker(..., FileCheckConst.FILE, FileCheckConst.READ_ABLE) was applied
  to paths that are not readable files at that point: the archive member name
  in pack, the two "hyperparameters" directories and the not-yet-written
  result file in compare. Those revert to the plain os.path.join used in
  PATCH 1/2 (so the pack hunk is gone); the check is kept on every JSON file
  that is actually read.
- TODO confirm: the loop still unpacks os_walk_for_files(...) items as
  (root, dirs, files); verify that this matches what the helper returns.
- Hunk headers and the diffstat were recomputed for the changes above; the
  blob hashes on the "index" line were not and are stale.

 .../checkers/hyperparameter_checker.py        | 55 ++++++++++---------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py
index a59395102..91a12ff0e 100644
--- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py
+++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
 # All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,15 +15,20 @@
 
 import os
 import json
+import re
+import tempfile
+from difflib import SequenceMatcher
+
+from typing import Union, List, Dict, Any
+
 from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker
 from msprobe.pytorch.config_checking.config_checker import register_checker_item
 from msprobe.pytorch.config_checking.utils.packing import add_file_to_zip
 from msprobe.pytorch.config_checking.utils.utils import load_json, compare_dict, write_list_to_file
 from msprobe.pytorch.config_checking.utils.utils import config_checking_print
-from typing import Union, List, Dict, Any
-from difflib import SequenceMatcher
-import tempfile
-import re
+from msprobe.core.common.file_utils import FileChecker, os_walk_for_files
+from msprobe.pytorch.parse_tool.lib.config import Const
+from msprobe.core.common.const import FileCheckConst
 
 @register_checker_item("hyperparameter")
 class HyperparameterChecker(BaseChecker):
@@ -130,33 +135,29 @@ class HyperparameterChecker(BaseChecker):
 
         return None
 
-    def compare(bench_dir, cmp_dir, output_path):
-        bench_model_dir = os.path.join(bench_dir, HyperparameterChecker.target_name_in_zip)
-        cmp_model_dir = os.path.join(cmp_dir, HyperparameterChecker.target_name_in_zip)
-        output_filepath = os.path.join(output_path, HyperparameterChecker.result_filename)
-
-        bench_hyperparameters = {}
-        cmp_hyperparameters = {}
-
-        if os.path.exists(bench_model_dir):
-            for root, _, files in os.walk(bench_model_dir):
+    @staticmethod
+    def load_hyperparameters(model_dir):
+        hyperparameters = {}
+        if os.path.exists(model_dir):
+            subfiles = os_walk_for_files(model_dir, Const.MAX_TRAVERSAL_DEPTH)
+            for root, _, files in subfiles:
                 for file in files:
                     if file.endswith('.json'):
-                        filepath = os.path.join(root, file)
-                        relative_filepath = os.path.relpath(filepath, bench_model_dir)
+                        filepath = FileChecker(os.path.join(root, file), FileCheckConst.FILE,
+                                               FileCheckConst.READ_ABLE).common_check()
+                        relative_filepath = os.path.relpath(filepath, model_dir)
                         params = load_json(filepath)
                         if params:
-                            bench_hyperparameters[relative_filepath] = params
+                            hyperparameters[relative_filepath] = params
+        return hyperparameters
 
-        if os.path.exists(cmp_model_dir):
-            for root, _, files in os.walk(cmp_model_dir):
-                for file in files:
-                    if file.endswith('.json'):
-                        filepath = os.path.join(root, file)
-                        relative_filepath = os.path.relpath(filepath, cmp_model_dir)
-                        params = load_json(filepath)
-                        if params:
-                            cmp_hyperparameters[relative_filepath] = params
+    @staticmethod
+    def compare(bench_dir, cmp_dir, output_path):
+        bench_model_dir = os.path.join(bench_dir, HyperparameterChecker.target_name_in_zip)
+        cmp_model_dir = os.path.join(cmp_dir, HyperparameterChecker.target_name_in_zip)
+        output_filepath = os.path.join(output_path, HyperparameterChecker.result_filename)
+        bench_hyperparameters = HyperparameterChecker.load_hyperparameters(bench_model_dir)
+        cmp_hyperparameters = HyperparameterChecker.load_hyperparameters(cmp_model_dir)
 
         all_diffs = []
         all_files = set(bench_hyperparameters.keys()) | set(cmp_hyperparameters.keys())
-- 
Gitee