diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/__init__.py b/debug/accuracy_tools/msprobe/core/config_check/__init__.py similarity index 85% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/__init__.py rename to debug/accuracy_tools/msprobe/core/config_check/__init__.py index 7d60f07881d378bb0a7a9c6faf6147af07a915b2..621122ffa00ba40a868853ccb46ff582c3e5fdda 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/__init__.py +++ b/debug/accuracy_tools/msprobe/core/config_check/__init__.py @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -import msprobe.pytorch.config_checking.checkers +import msprobe.core.config_check.checkers +from msprobe.core.config_check.config_checker import ConfigChecker diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/__init__.py similarity index 55% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/__init__.py index c1bed99b3a091f7163431e4cc3e6c02cad69530d..9b9024b862f1f60655d2f71a47ab401546a86076 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/__init__.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/__init__.py @@ -15,13 +15,11 @@ __all__ = ['BaseChecker', 'apply_patches'] -import msprobe.pytorch.config_checking.checkers.env_args_checker -import msprobe.pytorch.config_checking.checkers.pip_checker -import msprobe.pytorch.config_checking.checkers.dataset_checker -import msprobe.pytorch.config_checking.checkers.weights_checker -import msprobe.pytorch.config_checking.checkers.hyperparameter_checker -import msprobe.pytorch.config_checking.checkers.random_checker +import msprobe.core.config_check.checkers.env_args_checker +import msprobe.core.config_check.checkers.pip_checker +import msprobe.core.config_check.checkers.dataset_checker +import msprobe.core.config_check.checkers.weights_checker +import msprobe.core.config_check.checkers.hyperparameter_checker +import msprobe.core.config_check.checkers.random_checker -from msprobe.pytorch.config_checking.checkers.random_checker import apply_patches - -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker +from msprobe.core.config_check.checkers.base_checker import BaseChecker diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/base_checker.py similarity index 77% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/base_checker.py index e61a4ec56344cfc199df7d084264de83a98e51c0..ee598c7f0b010653c07c11028c6adbc9b1894e0c 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/base_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/base_checker.py @@ -14,10 +14,8 @@ # limitations under the License. 
import os -from abc import ABC, abstractmethod - -import torch +from msprobe.core.common.framework_adapter import FmkAdp from msprobe.core.common.const import FileCheckConst @@ -30,31 +28,33 @@ class PackInput: self.check_input_params() def check_input_params(self): - if self.model and not isinstance(self.model, torch.nn.Module): - raise Exception(f"model is not torch.nn.Module or module list.") + if self.model and not FmkAdp.is_nn_module(self.model): + raise Exception(f"model is not torch.nn.Module/mindspore.nn.Cell or module list.") if not isinstance(self.output_zip_path, str) or not self.output_zip_path.endswith(FileCheckConst.ZIP_SUFFIX): raise Exception(f"output zip path must be a string and ends with '.zip'") -class BaseChecker(ABC): +class BaseChecker: input_needed = None target_name_in_zip = None multi_rank = False @staticmethod - @abstractmethod def pack(pack_input): pass @staticmethod - @abstractmethod - def compare(bench_dir, cmp_dir, output_path): + def compare(bench_dir, cmp_dir, output_path, fmk): + pass + + @staticmethod + def apply_patches(fmk): pass @classmethod - def compare_ex(cls, bench_dir, cmp_dir, output_path): + def compare_ex(cls, bench_dir, cmp_dir, output_path, fmk): bench_filepath = os.path.join(bench_dir, cls.target_name_in_zip) cmp_filepath = os.path.join(cmp_dir, cls.target_name_in_zip) if not os.path.exists(bench_filepath) or not os.path.exists(cmp_filepath): return None, None, None - return cls.compare(bench_dir, cmp_dir, output_path) + return cls.compare(bench_dir, cmp_dir, output_path, fmk) diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/dataset_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/dataset_checker.py similarity index 77% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/dataset_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/dataset_checker.py index 89217ac18ba4fb2fffe0eaaf325a634ee5d32eb3..96ff4809f81b8db20bc5bb26ecbf1d2e8f6e874b 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/dataset_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/dataset_checker.py @@ -15,29 +15,19 @@ import os import json -import torch import pandas as pd from msprobe.core.common.file_utils import create_file_in_zip, load_json -from msprobe.pytorch.common.utils import get_rank_id -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker -from msprobe.pytorch.config_checking.config_checker import register_checker_item, register_pre_forward_fun_list -from msprobe.pytorch.config_checking.utils.utils import config_checking_print +from msprobe.core.config_check.checkers.base_checker import BaseChecker +from msprobe.core.config_check.config_checker import register_checker_item, register_pre_forward_fun_list +from msprobe.core.config_check.utils.utils import config_checking_print, get_tensor_features from msprobe.core.common.decorator import recursion_depth_decorator - - -def process_tensor(tensor): - return { - 'max': float(tensor.max().item()), - 'min': float(tensor.min().item()), - 'mean': float(tensor.mean().item()), - 'norm': float(torch.norm(tensor).item()) - } +from msprobe.core.common.framework_adapter import FmkAdp @recursion_depth_decorator("config_check: process_obj") def process_obj(obj): - if isinstance(obj, torch.Tensor): - return process_tensor(obj) + if FmkAdp.is_tensor(obj): + return get_tensor_features(obj) elif isinstance(obj, (tuple, list)): return {i: process_obj(x) for i, x in 
enumerate(obj)}
     elif isinstance(obj, dict):
@@ -59,24 +49,34 @@ def parse_args_and_kargs(args, kwargs):
 @recursion_depth_decorator("config_check: compare_dataset_dicts")
 def compare_dataset_dicts(dict1, dict2, tag=''):
     results = []
-    for key, value1 in dict1.items():
+    # handle the keys in dict1
+    for key in dict1:
         new_tag = f"{tag}.{key}" if tag else key
+        if key not in dict2:
+            result = {'tag': new_tag, 'equal': False, 'status': 'delete'}
+            results.append(result)
+            continue
+        value1 = dict1[key]
         value2 = dict2[key]
-        # if the value is a dict with the four specified keys, do not recurse further
         if not isinstance(value1, dict):
             continue
         if set(value1.keys()) == {'max', 'min', 'mean', 'norm'}:
             equal = value1 == value2
             relative_diffs = {
-                f"{k}_relative_diff": (abs(value1[k] - value2[k]) / value1[k]) \
-                    if value1[k] != 0 else None \
+                f"{k}_relative_diff": (abs(value1[k] - value2[k]) / value1[k]) if value1[k] != 0 else None
                 for k in ['max', 'min', 'mean', 'norm']
             }
-            result = {'tag': new_tag, 'equal': equal}
+            result = {'tag': new_tag, 'equal': equal, 'status': 'unchanged'}
             result.update(relative_diffs)
             results.append(result)
         else:
             results.extend(compare_dataset_dicts(value1, value2, new_tag))
+    # handle the keys that exist only in dict2
+    for key in dict2:
+        if key not in dict1:
+            new_tag = f"{tag}.{key}" if tag else key
+            result = {'tag': new_tag, 'equal': False, 'status': 'added'}
+            results.append(result)
     return results
@@ -97,8 +97,8 @@ def compare_dataset(bench_dir, cmp_dir):
             dict2 = load_json(rank_path_cmp)
             results = compare_dataset_dicts(dict1, dict2)
             for result in results:
-                result['step'] = step
-                result['rank'] = rank
+                result['step'] = int(step.replace("step", ""))
+                result['rank'] = int(rank.replace("rank", ""))
             all_results.extend(results)
     df = pd.DataFrame(all_results, columns=DatasetChecker.result_header)
@@ -122,14 +122,14 @@ class DatasetChecker(BaseChecker):
         def collect_input(model, args, kwargs, step):
             features = parse_args_and_kargs(args, kwargs)
             dataset_filepath = os.path.join(DatasetChecker.target_name_in_zip,
-                                            f"step{step}", f"rank{get_rank_id()}", "dataset.json")
+                                            f"step{step}", f"rank{FmkAdp.get_rank_id()}", "dataset.json")
             create_file_in_zip(output_zip_path, dataset_filepath, json.dumps(features, indent=4))
             config_checking_print(f"add first dataset input features to zip")
         register_pre_forward_fun_list(collect_input)

     @staticmethod
-    def compare(bench_dir, cmp_dir, output_path):
+    def compare(bench_dir, cmp_dir, output_path, fmk):
         bench_dataset_pack_path = os.path.join(bench_dir, DatasetChecker.target_name_in_zip)
         cmp_dataset_pack_path = os.path.join(cmp_dir, DatasetChecker.target_name_in_zip)
diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/env_args_checker.py
similarity index 57%
rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py
rename to debug/accuracy_tools/msprobe/core/config_check/checkers/env_args_checker.py
index 9eaeb1a05729f7e9ae4ff8c9727d8f3589b3a166..d4f72a6b26850322aa5c7685745cfe5b54bdb8a1 100644
--- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py
+++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/env_args_checker.py
@@ -19,10 +19,10 @@ import json
 import pandas as pd

 from msprobe.core.common.file_utils import load_json, load_yaml, create_file_with_content, create_file_in_zip
-from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker
-from msprobe.pytorch.config_checking.config_checker import register_checker_item
-from
msprobe.pytorch.config_checking.utils.utils import config_checking_print -from msprobe.core.common.file_utils import save_excel +from msprobe.core.config_check.checkers.base_checker import BaseChecker +from msprobe.core.config_check.config_checker import register_checker_item +from msprobe.core.config_check.utils.utils import config_checking_print +from msprobe.core.common.const import Const dirpath = os.path.dirname(__file__) @@ -36,21 +36,40 @@ def collect_env_data(): return result +def get_device_type(env_json): + for key in env_json.keys(): + if Const.ASCEND in key: + return Const.NPU_LOWERCASE + return Const.GPU_LOWERCASE + + def compare_env_data(npu_path, bench_path): necessary_env = load_yaml(env_yaml_path) - npu_data = load_json(npu_path) + cmp_data = load_json(npu_path) + cmp_type = get_device_type(cmp_data) bench_data = load_json(bench_path) + bench_type = get_device_type(bench_data) data = [] for _, value in necessary_env.items(): - npu_env_name = value[0]["name"] - npu_value = npu_data.get(npu_env_name) if npu_data.get(npu_env_name) else value[0]["default_value"] - if len(value) == 1: - data.append([npu_env_name, "only npu has this env", npu_value, "", "warning"]) + cmp_env = value.get(cmp_type) + bench_env = value.get(bench_type) + if not bench_env and not cmp_env: continue - bench_env_name = value[1]["name"] - bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[1]["default_value"] - if npu_value != bench_value: - data.append([npu_env_name, bench_env_name, npu_value, bench_value, "error"]) + elif cmp_env: + cmp_env_name = cmp_env["name"] + cmp_value = cmp_data.get(cmp_env_name, value[cmp_type]["default_value"]) + if not bench_env: + data.append(["only cmp has this env", cmp_env["name"], "", cmp_value, "warning"]) + continue + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name, value[bench_type]["default_value"]) + if cmp_value != bench_value: + data.append([bench_env_name, cmp_env_name, bench_value, cmp_value, "error"]) + else: + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][ + "default_value"] + data.append([bench_env_name, "only bench has this env", bench_value, "", "warning"]) df = pd.DataFrame(data, columns=EnvArgsChecker.result_header) return df @@ -69,7 +88,7 @@ class EnvArgsChecker(BaseChecker): config_checking_print(f"add env args to zip") @staticmethod - def compare(bench_dir, cmp_dir, output_path): + def compare(bench_dir, cmp_dir, output_path, fmk): bench_env_data = os.path.join(bench_dir, EnvArgsChecker.target_name_in_zip) cmp_env_data = os.path.join(cmp_dir, EnvArgsChecker.target_name_in_zip) df = compare_env_data(bench_env_data, cmp_env_data) diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/hyperparameter_checker.py similarity index 30% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/hyperparameter_checker.py index 9ac1cd61fc5483c1a002bf0109d56a341aeab120..774abef4877786268bf700bbb695586800ef64d0 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/hyperparameter_checker.py @@ -15,190 +15,144 @@ import os import json -import re -import tempfile from difflib import 
SequenceMatcher from typing import Union, List, Dict, Any +import pandas as pd -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker -from msprobe.pytorch.config_checking.config_checker import register_checker_item -from msprobe.pytorch.config_checking.utils.utils import compare_dict, config_checking_print +from msprobe.core.config_check.checkers.base_checker import BaseChecker +from msprobe.core.config_check.config_checker import register_checker_item +from msprobe.core.config_check.utils.utils import compare_dict, config_checking_print, update_dict +from msprobe.core.config_check.utils.hyperparameter_parser import ParserFactory from msprobe.core.common.file_utils import (os_walk_for_files, create_file_in_zip, load_json, create_file_with_list, - FileOpen) -from msprobe.core.common.const import FileCheckConst, Const + FileOpen, load_yaml) +from msprobe.core.common.const import Const + + +dirpath = os.path.dirname(__file__) +hyperparameters_path = os.path.join(dirpath, "../resource/hyperparameter.yaml") +parameter_name_mapping = load_yaml(os.path.realpath(hyperparameters_path)) +hyperparameters_dict = {} @register_checker_item("hyperparameter") class HyperparameterChecker(BaseChecker): - input_needed = "shell_path" target_name_in_zip = "hyperparameters" - - PARAMETER_NAME_MAPPING = { - "learning_rate": ["lr", "learningrate"], - "batch_size": ["batch", "bs", "batch_size_per_gpu"], - "epochs": ["num_epochs", "max_epochs", "epoch"], - "weight_decay": ["wd", "weightdecay"], - "dropout_rate": ["dropout", "drop_rate"], - } + result_header = ["file_name", "bench_para", "cmp_para", "bench_value", "cmp_value", "matched_with", "level"] + hyperparameters_file_list = ["hyperparameters_static.json", "hyperparameters_dynamic.json"] @staticmethod def pack(pack_input): shell_path = pack_input.shell_path output_zip_path = pack_input.output_zip_path - if not isinstance(shell_path, list): - raise TypeError("shell_path should be a list of file paths.") - - for index, script_path in enumerate(shell_path): - if os.path.isfile(script_path): - hyperparameters = HyperparameterChecker._extract_hyperparameters_from_script(script_path) - if hyperparameters: - create_file_in_zip(output_zip_path, os.path.join(HyperparameterChecker.target_name_in_zip, - HyperparameterChecker.target_name_in_zip + - Const.REPLACEMENT_CHARACTER + str(index) - + FileCheckConst.JSON_SUFFIX), - json.dumps(hyperparameters, indent=4)) - config_checking_print(f"add hyperparameters args to zip") + if shell_path: + if not isinstance(shell_path, list): + raise TypeError("shell_path should be a list of file paths.") + + hyperparameters = {} + parser_factory = ParserFactory() + for script_path in shell_path: + if os.path.isfile(script_path): + parser = parser_factory.get_parser(os.path.splitext(script_path)[1]) + update_dict(hyperparameters, parser.run(os.path.realpath(script_path))) else: - config_checking_print(f"Warning: Failed to extract hyperparameters from script {script_path}") + config_checking_print(f"Warning: Script path {script_path} is not a file.") + if hyperparameters: + create_file_in_zip(output_zip_path, + os.path.join(HyperparameterChecker.target_name_in_zip, + HyperparameterChecker.hyperparameters_file_list[0]), + json.dumps(hyperparameters, indent=4)) + config_checking_print(f"add static hyperparameters args to zip") else: - config_checking_print(f"Warning: Script path {script_path} is not a file.") + config_checking_print(f"Warning: Failed to extract hyperparameters from script {shell_path}") + if 
hyperparameters_dict: + create_file_in_zip(output_zip_path, + os.path.join(HyperparameterChecker.target_name_in_zip, + HyperparameterChecker.hyperparameters_file_list[1]), + json.dumps(vars(hyperparameters_dict), default=lambda x: None, indent=4)) + config_checking_print(f"add dynamic hyperparameters args to zip") @staticmethod - def compare(bench_dir, cmp_dir, output_path): - bench_model_dir = os.path.join(bench_dir, HyperparameterChecker.target_name_in_zip) - cmp_model_dir = os.path.join(cmp_dir, HyperparameterChecker.target_name_in_zip) - bench_hyperparameters = HyperparameterChecker.load_hyperparameters(bench_model_dir) - cmp_hyperparameters = HyperparameterChecker.load_hyperparameters(cmp_model_dir) - - if len(bench_hyperparameters) != len(cmp_hyperparameters): - config_checking_print("The shell path length dose not match!") - raise Exception("The shell path length dose not match!") - + def compare(bench_dir, cmp_dir, output_path, fmk): all_diffs = [] - all_files = set(bench_hyperparameters.keys()) | set(cmp_hyperparameters.keys()) - - for filename in all_files: - bench_params = bench_hyperparameters.get(filename, {}) - cmp_params = cmp_hyperparameters.get(filename, {}) - - if bench_params and cmp_params: - all_diffs.extend(HyperparameterChecker.compare_param(bench_params, cmp_params, filename)) - - elif bench_params is not None: - all_diffs.append(f"[Only in benchmark] File: {filename}") - else: - all_diffs.append(f"[Only in compare] File: {filename}") - return HyperparameterChecker.target_name_in_zip, True, None + for file_name in HyperparameterChecker.hyperparameters_file_list: + bench_model_dir = os.path.join(bench_dir, HyperparameterChecker.target_name_in_zip, file_name) + cmp_model_dir = os.path.join(cmp_dir, HyperparameterChecker.target_name_in_zip, file_name) + if os.path.isfile(bench_model_dir) and os.path.isfile(cmp_model_dir): + bench_hyperparameters = load_json(bench_model_dir) + cmp_hyperparameters = load_json(cmp_model_dir) + all_diffs.extend( + HyperparameterChecker.compare_param(bench_hyperparameters, cmp_hyperparameters, file_name)) + df = pd.DataFrame(all_diffs, columns=HyperparameterChecker.result_header) + pass_check = "error" not in df["level"].values + return HyperparameterChecker.target_name_in_zip, pass_check, df @staticmethod - def compare_param(bench_params, cmp_params, filename): + def compare_param(bench_params, cmp_params, file_name): all_diffs = [] - file_diffs = [] bench_param_names = bench_params.keys() for bench_param_name in bench_param_names: - matched_cmp_param_name = HyperparameterChecker._fuzzy_match_parameter(bench_param_name, cmp_params) + matched_cmp_param_name, matched_with = HyperparameterChecker._fuzzy_match_parameter(bench_param_name, + cmp_params) + bench_param_value = bench_params[bench_param_name] if matched_cmp_param_name: - bench_param_value = bench_params[bench_param_name] cmp_param_value = cmp_params[matched_cmp_param_name] if bench_param_value != cmp_param_value: - diff = compare_dict({bench_param_name: bench_param_value}, - {matched_cmp_param_name: cmp_param_value}) - if diff: - file_diffs.extend( - [f" Parameter '{bench_param_name}' (matched with '{matched_cmp_param_name}'): {d}" - for d in diff]) + all_diffs.append( + [file_name, bench_param_name, matched_cmp_param_name, bench_param_value, cmp_param_value, + matched_with, "error"]) del cmp_params[matched_cmp_param_name] else: - file_diffs.append( - f" [Only in benchmark] Parameter: '{bench_param_name}': {bench_params[bench_param_name]}") + all_diffs.append( + [file_name, 
bench_param_name, "Only in benchmark", bench_param_value, "", "", "warning"])
         for cmp_param_name, cmp_param_value in cmp_params.items():
-            file_diffs.append(f" [Only in compare] Parameter: '{cmp_param_name}': {cmp_param_value}")
-        if file_diffs:
-            file_diffs.sort()
-            all_diffs.append(f"File: {filename}")
-            all_diffs.extend(file_diffs)
+            all_diffs.append([file_name, "Only in comparison", cmp_param_name, "", cmp_param_value, "", "warning"])
+        all_diffs.sort()
         return all_diffs

     @staticmethod
-    def load_hyperparameters(model_dir):
-        hyperparameters = {}
-        if not os.path.exists(model_dir):
-            return hyperparameters
-        subfiles = os_walk_for_files(model_dir, Const.MAX_TRAVERSAL_DEPTH)
-        for files in subfiles:
-            if files["file"].endswith(FileCheckConst.JSON_SUFFIX):
-                filepath = os.path.join(files["root"], files["file"])
-                relative_filepath = os.path.relpath(filepath, model_dir)
-                params = load_json(filepath)
-                if params:
-                    hyperparameters[relative_filepath] = params
-        return hyperparameters
-
-    @staticmethod
-    def _extract_hyperparameters_from_script(script_path: str) -> Dict[str, Any]:
-        """
-        Extracts arguments from bash script used to run a model training.
-        """
-        hyperparameters = {}
-        script_content_list = []
-        with FileOpen(script_path, 'r') as file:
-            for line in file:
-                stripped_line = line.lstrip()
-                if not stripped_line.startswith('#'):
-                    line = line.split('#')[0].rstrip() + '\n'
-                    if line.strip():
-                        script_content_list.append(line)
-        script_content = ''.join(script_content_list)
-
-        command_line = re.search(r'torchrun\s[^|]*|python -m torch.distributed.launch\s[^|]*', script_content,
-                                 re.DOTALL)
-        if command_line:
-            command_line = command_line.group()
-
-            blocks = re.findall(r'([a-zA-Z0-9_]{1,20}_ARGS)="(.*?)"', script_content, re.DOTALL)
-            block_contents = {}
-            for block_name, block_content in blocks:
-                block_content = block_content.replace('\n', ' ')
-                block_contents[block_name] = block_content
-                command_line = command_line.replace(f"${block_name}", block_content)
-
-            matches = re.findall(r'--([\w-]+)(?:\s+([^\s\\]+))?', command_line)
-            for match in matches:
-                key, value = match
-                args_key = re.match(r'\$\{?(\w+)}?', value)
-                if args_key:
-                    env_vars = re.findall(rf'{args_key.group(1)}=\s*(.+)', script_content)
-                    if env_vars:
-                        value = env_vars[-1]
-                hyperparameters[key] = value if value else True
-
-        return hyperparameters
+    def apply_patches(fmk):
+        try:
+            from megatron import training
+
+            def collect_hyperparameter_wrapper(func):
+                def wrapper(*args, **kwargs):
+                    global hyperparameters_dict
+                    result = func(*args, **kwargs)
+                    if not hyperparameters_dict:
+                        hyperparameters_dict = result
+                    return result
+                return wrapper
+            training.get_args = collect_hyperparameter_wrapper(training.get_args)
+        except ImportError:
+            config_checking_print("No megatron found.")
+        except Exception as e:
+            config_checking_print(f"Patch megatron method failed, detail: {str(e)}")

     @staticmethod
-    def _fuzzy_match_parameter(param_name: str, available_params: Dict[str, Any]) -> Union[str, None]:
+    def _fuzzy_match_parameter(param_name: str, available_params: Dict[str, Any]):
         """
         Fuzzy matches a parameter name against available parameter names using predefined mappings and
         string similarity.
""" if param_name in available_params: - return param_name + return param_name, Const.MATCH_MODE_NAME canonical_name = None - for standard_name, aliases in HyperparameterChecker.PARAMETER_NAME_MAPPING.items(): + for standard_name, aliases in parameter_name_mapping.items(): if param_name == standard_name or param_name in aliases: canonical_name = standard_name break if canonical_name: if canonical_name in available_params: - return canonical_name - for alias in HyperparameterChecker.PARAMETER_NAME_MAPPING[canonical_name]: + return canonical_name, Const.MATCH_MODE_MAPPING + for alias in parameter_name_mapping[canonical_name]: if alias in available_params: config_checking_print( f"Matched '{param_name}' to alias '{alias}' via canonical name '{canonical_name}'") - return alias + return alias, Const.MATCH_MODE_MAPPING best_match_name = None best_match_ratio = 0.8 @@ -211,6 +165,6 @@ class HyperparameterChecker(BaseChecker): if best_match_name: config_checking_print( f"Fuzzy matched parameter '{param_name}' to '{best_match_name}' (similarity: {best_match_ratio:.2f})") - return best_match_name + return best_match_name, f"{Const.MATCH_MODE_SIMILARITY}:{best_match_ratio}" - return None + return None, None diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/pip_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/pip_checker.py similarity index 87% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/pip_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/pip_checker.py index 15c02d16843d72103293834a10d6952c59cf73f3..a35bc3e00cd5bf5ed6601ce9983bad390f4b989f 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/pip_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/pip_checker.py @@ -21,9 +21,9 @@ except ImportError: import importlib_metadata as metadata from msprobe.core.common.file_utils import load_yaml, create_file_in_zip -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker -from msprobe.pytorch.config_checking.config_checker import register_checker_item -from msprobe.pytorch.config_checking.utils.utils import config_checking_print +from msprobe.core.config_check.checkers.base_checker import BaseChecker +from msprobe.core.config_check.config_checker import register_checker_item +from msprobe.core.config_check.utils.utils import config_checking_print from msprobe.core.common.file_utils import FileOpen, save_excel dirpath = os.path.dirname(__file__) @@ -49,8 +49,9 @@ def collect_pip_data(): return result -def compare_pip_data(bench_pip_path, cmp_pip_path): +def compare_pip_data(bench_pip_path, cmp_pip_path, fmk): necessary_dependency = load_yaml(depend_path)["dependency"] + necessary_dependency.append(fmk) bench_data = load_pip_txt(bench_pip_path) cmp_data = load_pip_txt(cmp_pip_path) data = [] @@ -81,9 +82,9 @@ class PipPackageChecker(BaseChecker): config_checking_print(f"add pip info to zip") @staticmethod - def compare(bench_dir, cmp_dir, output_path): + def compare(bench_dir, cmp_dir, output_path, fmk): bench_pip_path = os.path.join(bench_dir, PipPackageChecker.target_name_in_zip) cmp_pip_path = os.path.join(cmp_dir, PipPackageChecker.target_name_in_zip) - df = compare_pip_data(bench_pip_path, cmp_pip_path) + df = compare_pip_data(bench_pip_path, cmp_pip_path, fmk) pass_check = "error" not in df['level'].values return PipPackageChecker.target_name_in_zip, pass_check, df diff --git 
a/debug/accuracy_tools/msprobe/core/config_check/checkers/random_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/random_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f018922dbde2906e1fbd0927b763b382d26dbf67
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/random_checker.py
@@ -0,0 +1,367 @@
+# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import random
+from functools import wraps
+from typing import Callable, List, Dict, Tuple, Optional
+import inspect
+import os
+import json
+from collections import defaultdict
+import difflib
+
+import numpy as np
+import pandas as pd
+from msprobe.core.config_check.config_checker import register_checker_item, register_pre_forward_fun_list
+from msprobe.core.common.file_utils import create_file_in_zip, load_json
+from msprobe.core.config_check.checkers.base_checker import BaseChecker
+from msprobe.core.config_check.utils.utils import config_checking_print
+from msprobe.core.common.framework_adapter import FmkAdp
+from msprobe.core.common.const import Const
+from msprobe.core.common.log import logger
+
+
+# data structure: {random op name: [{count: number of calls, stack: call stack list}]}
+random_op_stats = defaultdict(list)
+
+
+def get_call_stack(frame) -> List[str]:
+    """Collect detailed call stack info; each entry contains the full path, line number, function name and code line."""
+    stack = []
+    current_frame = frame.f_back  # skip the current function
+
+    while current_frame:
+        frame_info = inspect.getframeinfo(current_frame)
+        filename = os.path.abspath(frame_info.filename)
+        code_line = frame_info.code_context[0].strip() if frame_info.code_context else ""
+
+        # format as a detailed stack frame entry
+        stack_entry = f"File {filename}, line {frame_info.lineno}, in {frame_info.function}, {code_line}"
+        stack.append(stack_entry)
+
+        current_frame = current_frame.f_back
+
+    # reverse the stack so it reads in call order (bottom of the stack to the top)
+    return stack[::-1]
+
+
+def track_random_call(func: Callable, name: str):
+    """Record call information for a random function."""
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        frame = inspect.currentframe()
+        stack = get_call_stack(frame)
+
+        # update call statistics: op name -> [{count: number of calls, stack: call stack list}]
+        # check whether a record with the same call stack already exists
+        for entry in random_op_stats[name]:
+            if entry['stack'] == stack:
+                entry['count'] += 1
+                break
+        else:
+            # add a new call stack record
+            random_op_stats[name].append({'count': 1, 'stack': stack})
+
+        try:
+            result = func(*args, **kwargs)
+            return result
+        except Exception as e:
+            raise e
+        finally:
+            del frame
+
+    return wrapper
+
+
+def load_stats_files(directory: str) -> Dict[str, Dict[str, List[Dict]]]:
+    """Load all statistics files in the directory and organize the data by rank."""
+    rank_data = {}
+    for file in os.listdir(directory):
+        file_path = os.path.join(directory, file)
+        if file.startswith('rank') and file.endswith('.json'):
+            rank = os.path.basename(file.split('.')[0])[4:]
+            if not rank or not rank.isdigit():
+                logger.error(f"extract rank id from {file} failed")
+                raise ValueError
+
+            # load and store the data
+            data = load_json(file_path)
+            rank_data[int(rank)] = data
+
+    return rank_data
+
+
+def stack_match(stack1: List[str], stack2: List[str], threshold: float = 0.8) -> bool:
+    """
+    Check whether two call stacks are similar, weighting path, function name and code line
+    equally (1/3 each); every frame must reach the similarity threshold of 0.8.
+
+    Args:
+    - stack1: the first call stack list
+    - stack2: the second call stack list
+    - threshold: similarity threshold, 0.8 by default
+
+    Returns:
+    - bool indicating whether the two call stacks are similar
+    """
+    if len(stack1) != len(stack2):
+        return False
+
+    for frame1, frame2 in zip(stack1, stack2):
+        # extract the path, function name and code line
+        path1, func1, code1 = _parse_frame(frame1)
+        path2, func2, code2 = _parse_frame(frame2)
+
+        # compute the similarity score (path, function name and code line each weighted 1/3)
+        path_score = _compare_path(path1, path2)
+        func_score = 1.0 if func1 == func2 else 0.0
+        # code line similarity
+        code_score = difflib.SequenceMatcher(None, code1, code2).ratio()
+
+        frame_score = (path_score + func_score + code_score) / 3.0
+        if frame_score < threshold:
+            return False
+
+    return True
+
+
+def _parse_frame(frame: str) -> Tuple[str, str, str]:
+    """
+    Parse a stack frame string and extract the path, function name and code line.
+
+    Args:
+    - frame: stack frame string in the format "File {path}, line {line}, in {func}, {code}"
+
+    Returns:
+    - path, func, code
+    """
+    path = func = code = ''
+    stack_info = frame.split(' ')
+    if len(stack_info) > 6:
+        path = stack_info[1][:-1]
+        func = stack_info[5][:-1]
+        code = ' '.join(stack_info[6:])
+    return path, func, code
+
+
+def _compare_path(path1: str, path2: str) -> float:
+    """Compare the similarity of two paths, considering only the file name."""
+    if not path1 or not path2:
+        return 0.0
+
+    # extract the file name (ignore the directory path)
+    file1 = os.path.basename(path1)
+    file2 = os.path.basename(path2)
+
+    return 1.0 if file1 == file2 else 0.0
+
+
+def find_matching_stack(bench_stack: List[str], cmp_stacks: List[Dict]) -> Optional[Dict]:
+    """
+    Find the matching call stack.
+
+    Args:
+    - bench_stack: call stack list from the bench side
+    - cmp_stacks: call stack entries from the cmp side, each entry is {'count': times, 'stack': call stack list}
+
+    Returns:
+    - the matching call stack entry, or None
+    """
+    for cmp_entry in cmp_stacks:
+        if stack_match(cmp_entry['stack'], bench_stack):
+            return cmp_entry
+
+    return None
+
+
+def stack_list_to_string(stack_list):
+    """
+    Convert a call stack list into a newline-separated string.
+    If the input is a special marker (such as "no match stack"), return it unchanged.
+    """
+    if isinstance(stack_list, list):
+        return '\n'.join(stack_list)
+    return stack_list
+
+
+def compare_random_calls(bench_dir: str = 'bench', cmp_dir: str = 'cmp') -> pd.DataFrame:
+    """Compare random call statistics under two directories and build a detailed comparison result."""
+    bench_rank_data = load_stats_files(bench_dir)
+    cmp_rank_data = load_stats_files(cmp_dir)
+
+    # collect all ranks
+    all_ranks = sorted(set(bench_rank_data.keys()) | set(cmp_rank_data.keys()))
+
+    results = []
+
+    for rank in all_ranks:
+        bench_data = bench_rank_data.get(rank, {})
+        cmp_data = cmp_rank_data.get(rank, {})
+
+        # collect all ops
+        all_ops = set(bench_data.keys()) | set(cmp_data.keys())
+
+        for op in all_ops:
+            bench_stacks = bench_data.get(op, [])
+            cmp_stacks = cmp_data.get(op, [])
+
+            # process every call stack on the bench side
+            for bench_entry in bench_stacks:
+                bench_stack = bench_entry['stack']
+                bench_count = bench_entry['count']
+
+                # look for a matching call stack on the cmp side
+                cmp_entry = find_matching_stack(bench_stack, cmp_stacks)
+
+                if cmp_entry:
+                    cmp_count = cmp_entry['count']
+                    check_result = bench_count == cmp_count
+                    results.append([op, rank, bench_stack, cmp_entry['stack'], bench_count, cmp_count, check_result])
+                else:
+                    # no matching call stack
+                    results.append([op, rank, bench_stack, "no match stack", bench_count, 0, False])
+
+            # process call stacks on the cmp side that never appear on the bench side
+            for cmp_entry in cmp_stacks:
+                cmp_stack = cmp_entry['stack']
+                # skip stacks that were already handled above
+                if not any(stack_match(bench_entry['stack'], cmp_stack) for bench_entry in bench_stacks):
+                    results.append([op, rank, "no match stack", cmp_stack, 0, cmp_entry['count'], False])
+
+    # build the DataFrame
+    df = pd.DataFrame(results, columns=RandomChecker.result_header)
+
+    # apply the conversion function to render stacks as readable strings
+    df['bench_stack'] = df['bench_stack'].apply(stack_list_to_string)
+    df['cmp_stack'] = df['cmp_stack'].apply(stack_list_to_string)
+
+    return df
+
+
+def torch_patchs():
+    """Patch torch random functions."""
+    import torch
+    torch_patches = {
+        'rand': torch.rand,
+        'randint': torch.randint,
+        'randn': torch.randn,
+        'rand_like': torch.rand_like,
+        'randint_like': torch.randint_like,
+        'randn_like': torch.randn_like,
+        'manual_seed': torch.manual_seed
+    }
+    for name, func in torch_patches.items():
+        setattr(torch, name, track_random_call(func, f"torch.{name}"))
+
+    tensor_patches = {
+        'exponential_': torch.Tensor.exponential_,
+        'geometric_': torch.Tensor.geometric_,
+        'log_normal_': torch.Tensor.log_normal_,
+        'cauchy_': torch.Tensor.cauchy_
+    }
+    for name, func in tensor_patches.items():
+        setattr(torch.Tensor, name, track_random_call(func, f"torch.Tensor.{name}"))
+
+
+def mindspore_patchs():
+    """Patch MindSpore random functions."""
+    import mindspore
+
+    mindspore_ops_patches = {
+        'rand': mindspore.ops.uniform,
+        'randint': mindspore.ops.randint,
+        'randn': mindspore.ops.normal
+    }
+    for name, func in mindspore_ops_patches.items():
+        setattr(mindspore.ops, name, track_random_call(func, f"mindspore.ops.{name}"))
+
+    mindspore_patches = {
+        'manual_seed': mindspore.set_seed
+    }
+    for name, func in mindspore_patches.items():
+        setattr(mindspore, name, track_random_call(func, f"mindspore.{name}"))
+
+
+@register_checker_item("random")
+class RandomChecker(BaseChecker):
+    input_needed = None
+    target_name_in_zip = "random"
+    result_header = ['op', 'rank', 'bench_stack', 'cmp_stack', 'bench_count', 'cmp_count', 'check_result']
+    write_once = False
+
+    @staticmethod
+    def pack(pack_input):
+        """Pack the random call statistics into the zip file."""
+        output_zip_path = pack_input.output_zip_path
+
+        def collect_input(model, args, kwargs, step):
+            if RandomChecker.write_once:
+                return
+
+            random_stats_dir = os.path.join(RandomChecker.target_name_in_zip)
+            stats_filepath = os.path.join(random_stats_dir, f"rank{FmkAdp.get_rank_id()}.json")
+
+            # convert to JSON format: {op name: [{count: number of calls, stack: call stack list}]}
+            stats_json = {}
+            for op_name, entries in random_op_stats.items():
+                stats_json[op_name] = entries
+
+            create_file_in_zip(output_zip_path, stats_filepath, json.dumps(stats_json, indent=4))
+            config_checking_print(f"add random call statistics to zip: {stats_filepath}")
+            RandomChecker.write_once = True
+
+        register_pre_forward_fun_list(collect_input)
+
+    @staticmethod
+    def compare(bench_dir, cmp_dir, output_path, fmk):
+        """Compare two sets of random call statistics."""
+        bench_stats_path = os.path.join(bench_dir, RandomChecker.target_name_in_zip)
+        cmp_stats_path = os.path.join(cmp_dir, RandomChecker.target_name_in_zip)
+
+        df = compare_random_calls(bench_stats_path, cmp_stats_path)
+        pass_check = False not in df['check_result'].values
+
+        return RandomChecker.target_name_in_zip, pass_check, df
+
+    @staticmethod
+    def apply_patches(fmk=Const.PT_FRAMEWORK):
+        """Apply the random function patches."""
+        # patch the Python random module
+        random_patches = {
+            'random': random.random,
+            'randint': random.randint,
+            'uniform': random.uniform,
+            'choice': random.choice
+        }
+        for name, func in random_patches.items():
+            setattr(random, name, track_random_call(func, f"random.{name}"))
+
+        # patch NumPy random functions
+        np_random_patches = {
+            'rand': np.random.rand,
+            'randint': np.random.randint,
+            'choice': np.random.choice,
+            'normal': np.random.normal
+        }
+        for name, func in np_random_patches.items():
+            setattr(np.random, name, track_random_call(func, f"np.random.{name}"))
+
+        # patch framework-specific random functions
+        if fmk == Const.PT_FRAMEWORK:
+            torch_patchs()
+        elif fmk == Const.MS_FRAMEWORK:
+            mindspore_patchs()
+        else:
+            raise Exception(f"Unsupported framework: {fmk}, supported frameworks: {FmkAdp.supported_fmk}")
diff
--git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/weights_checker.py b/debug/accuracy_tools/msprobe/core/config_check/checkers/weights_checker.py similarity index 88% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/weights_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/checkers/weights_checker.py index 6cfd758f713861241ba55e36574e7e75e2041636..f17c62ff9271fe2bc95207cdf405071cb8289f80 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/weights_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/checkers/weights_checker.py @@ -15,20 +15,19 @@ import os import json -import torch import pandas as pd -from msprobe.core.common.file_utils import create_file_in_zip, save_excel, os_walk_for_files, load_json -from msprobe.pytorch.common.utils import get_rank_id -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker -from msprobe.pytorch.config_checking.config_checker import register_checker_item, register_pre_forward_fun_list -from msprobe.pytorch.config_checking.utils.utils import config_checking_print, get_tensor_features +from msprobe.core.common.file_utils import create_file_in_zip, os_walk_for_files, load_json +from msprobe.core.config_check.checkers.base_checker import BaseChecker +from msprobe.core.config_check.config_checker import register_checker_item, register_pre_forward_fun_list +from msprobe.core.config_check.utils.utils import config_checking_print, get_tensor_features +from msprobe.core.common.framework_adapter import FmkAdp def collect_weights_data(model): weights_data = {} - for name, param in model.named_parameters(): - if param.dtype == torch.bfloat16: + for name, param in FmkAdp.named_parameters(model): + if param.dtype != FmkAdp.dtype("float32"): param = param.float() weights_data[name] = get_tensor_features(param) return weights_data @@ -134,13 +133,13 @@ class WeightsChecker(BaseChecker): def collect_weights(model, args, kwargs, step): weights_data_dict = collect_weights_data(model) weights_data_filepath = os.path.join(WeightsChecker.target_name_in_zip, - f"step{step}", f"rank{get_rank_id()}", "weight.json") + f"step{step}", f"rank{FmkAdp.get_rank_id()}", "weight.json") create_file_in_zip(output_zip_path, weights_data_filepath, json.dumps(weights_data_dict, indent=4)) config_checking_print(f"add weights info to zip") register_pre_forward_fun_list(collect_weights) @staticmethod - def compare(bench_dir, cmp_dir, output_path): + def compare(bench_dir, cmp_dir, output_path, fmk): bench_weight_pack_path = os.path.join(bench_dir, WeightsChecker.target_name_in_zip) cmp_weight_pack_path = os.path.join(cmp_dir, WeightsChecker.target_name_in_zip) df = compare_weight(bench_weight_pack_path, cmp_weight_pack_path) diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/compare_weight.py b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/ckpt_comparator.py similarity index 83% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/compare_weight.py rename to debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/ckpt_comparator.py index b4c49fc3a8e0ed7838de451f9e8dcfbcf4363388..3c088c249a3088a9768accb0b6c2a4d429a6fab0 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/compare_weight.py +++ b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/ckpt_comparator.py @@ -16,10 +16,11 @@ from typing import Dict from tqdm import tqdm -from msprobe.core.common.file_utils import 
save_json, check_file_or_directory_path -from msprobe.pytorch.common.log import logger -from msprobe.pytorch.config_checking.ckpt_compare.megatron_loader import load_megatron_weights -from msprobe.pytorch.config_checking.ckpt_compare.metrics import METRIC_FUNC +from msprobe.core.common.file_utils import save_json, check_path_before_create, check_path_not_exists +from msprobe.core.common.log import logger +from msprobe.core.config_check.ckpt_compare.megatron_loader import load_megatron_weights +from msprobe.core.config_check.ckpt_compare.metrics import METRIC_FUNC + def compare_checkpoints(ckpt_path1, ckpt_path2, output_path) -> Dict: @@ -43,7 +44,8 @@ def compare_checkpoints(ckpt_path1, ckpt_path2, output_path) -> Dict: """ # Load both checkpoints - check_file_or_directory_path(output_path) + check_path_before_create(output_path) + check_path_not_exists(output_path) weights1 = load_megatron_weights(ckpt_path1) weights2 = load_megatron_weights(ckpt_path2) @@ -55,14 +57,15 @@ def compare_checkpoints(ckpt_path1, ckpt_path2, output_path) -> Dict: logger.warning(f'Parameters not in ckpt2: {set(weights1) - set(weights2)}') logger.warning(f'Parameters not in ckpt1: {set(weights2) - set(weights1)}') for key in tqdm(common): - tensor1 = weights1[key].float() - tensor2 = weights2[key].float() + tensor1 = weights1[key] + tensor2 = weights2[key] results[key] = {} for metric, func in METRIC_FUNC.items(): try: results[key][metric] = func(tensor1, tensor2) except Exception as e: + results[key][metric] = 'error' logger.warning(f'Error when calculate {metric} for reason: {e}') # Write results to JSON file diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/megatron_loader.py b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/megatron_loader.py similarity index 78% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/megatron_loader.py rename to debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/megatron_loader.py index 3dea9792360a3253e6e917eb8427a53ca44481e1..af1c5518aacfa5d02f21633d1bb162e41f979917 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/megatron_loader.py +++ b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/megatron_loader.py @@ -17,23 +17,21 @@ import os import re from collections import defaultdict from typing import Dict -import torch -from msprobe.pytorch.common.log import logger +import numpy as np +from msprobe.core.common.log import logger from msprobe.core.common.decorator import recursion_depth_decorator from msprobe.core.common.const import Const from msprobe.core.common.file_utils import FileOpen, load_yaml -from msprobe.pytorch.common.utils import load_pt +from msprobe.core.common.framework_adapter import FmkAdp -try: - import megatron -except ModuleNotFoundError as e: - raise ModuleNotFoundError("No module named 'megatron', which is required to load a megatron ckpt") from e - - -COLUMN_PARALLEL_PARAMS = ['linear_qkv', 'linear_fc1', 'word_embeddings.weight'] +# both weights and bias are partitioned in column parallel +COLUMN_PARALLEL_PARAMS = ['linear_qkv', 'linear_fc1', 'word_embeddings.weight', 'output_layer.weight'] +# only weights are partitioned in column parallel +ROW_PARALLEL_PARAMS = ['linear_fc2.weight', 'linear_proj.weight'] ARGS = 'args' LAYER_IDX_PATTERN = re.compile('layers\.(\d+)\.') EXPERT_IDX_PATTERN = re.compile('experts\.(\d+)\.') +ITER_DIR_PATTERN = re.compile('iter_([\d]{7})') @recursion_depth_decorator('') @@ -42,8 +40,8 @@ def 
_get_parameter(weights, prefix=''): name = Const.SEP.join([prefix, k]).strip(Const.SEP) if isinstance(v, dict): yield from _get_parameter(v, prefix=name) - elif isinstance(v, torch.Tensor): - yield name, v + elif FmkAdp.is_tensor(v): + yield name, FmkAdp.asnumpy(v) def _map_to_mcore_local_names(param_name: str) -> str: @@ -62,7 +60,7 @@ def _parse_real_layer_idx(param_name, num_layers_per_stage, pp_size, pp_rank): The global layer index needs to account for both pipeline stage and virtual stage. Args: - param_name (str): Parameter name containing layer index + param_name (str): Parameter name containing layer index: layers.x./ num_layers_per_stage (int): Number of layers per pipeline stage pp_size (int): Pipeline parallel size @@ -123,13 +121,13 @@ def _consolidate_tp_weights(weights: Dict) -> Dict: for key, tensors in weights.items(): if any([name in key for name in COLUMN_PARALLEL_PARAMS]): # Column parallel - concatenate along input dimension (dim 0) - combined = torch.cat(tensors, dim=0) - elif "linear_proj.weight" in key or "linear_fc2.weight" in key: + combined = np.concatenate(tensors, axis=0) + elif any([name in key for name in ROW_PARALLEL_PARAMS]): # Row parallel - concatenate along output dimension (dim 1) - combined = torch.cat(tensors, dim=1) + combined = np.concatenate(tensors, axis=1) else: # For other params, verify identical and use first - if not all(torch.allclose(tensors[0], t) for t in tensors[1:]): + if not all(np.allclose(tensors[0], t) for t in tensors[1:]): logger.warning(f"Inconsistent values for {key} across TP ranks") combined = tensors[0] @@ -160,7 +158,10 @@ def parse_parallel_size(checkpoint_dir: str): if not rank_dirs: raise ValueError(f"No checkpoint rank directories found in {checkpoint_dir}") - ckpt = load_pt(os.path.join(checkpoint_dir, rank_dirs[0], 'model_optim_rng.pt'), to_cpu=True, weights_only=False) + ckpt = FmkAdp.load_checkpoint( + os.path.join(checkpoint_dir, rank_dirs[0], 'model_optim_rng.pt'), + to_cpu=True, + weights_only=False) args = ckpt[ARGS] return ( args.tensor_model_parallel_size, @@ -171,21 +172,45 @@ def parse_parallel_size(checkpoint_dir: str): def parse_iteration(checkpoint_path: str) -> Dict: + """ + Parse the checkpoint iteration directory from a given checkpoint path. + + If the path is a top-level checkpoint directory, this function reads the + 'latest_checkpointed_iteration.txt' file to determine the latest iteration. + If the path is already an iteration directory (e.g., 'iter_0000005'), it extracts + the iteration number from the path. + + Args: + checkpoint_path (str): Path to the checkpoint directory or iteration directory. + + Returns: + str: The full path to the checkpoint directory for the determined iteration. + + Raises: + ValueError: If the checkpoint directory for the determined iteration does not exist. + """ iteration = None - latest_iteration = None tracker_file = os.path.join(checkpoint_path, "latest_checkpointed_iteration.txt") if os.path.exists(tracker_file): with FileOpen(tracker_file, 'r') as f: - iteration = latest_iteration = int(f.read().strip()) + latest_iteration = f.read().strip() + if latest_iteration != 'release': + try: + iteration = int(latest_iteration) + except Exception: + logger.warning( + f"The latest_checkpointed_iteration is supposed to be `release` or an int. \ + But {latest_iteration} is found." 
+ ) + checkpoint_path = os.path.join(checkpoint_path, f'iter_{iteration:07d}') else: - match = re.findall('iter_([\d]{7})', checkpoint_path) + match = re.findall(ITER_DIR_PATTERN, checkpoint_path) if match: iteration = int(match[0]) # Checkpoint directory for this iteration logger.info(f"Loaded checkpoint from iteration {iteration}") - if latest_iteration: - checkpoint_path = os.path.join(checkpoint_path, f'iter_{iteration:07d}') + if not os.path.exists(checkpoint_path): raise ValueError(f"Checkpoint directory not found: {checkpoint_path}") @@ -194,23 +219,22 @@ def parse_iteration(checkpoint_path: str) -> Dict: def get_weights_from_state_dict(state_dict): weights = {} + vpp_stage = 0 if 'model' in state_dict: model_weights = state_dict['model'] - vpp_stage = 0 - + for key, value in _get_parameter(model_weights): key = _map_to_mcore_local_names(key) weights[f"{key}{Const.SCOPE_SEPARATOR}{vpp_stage}"] = value elif 'model0' in state_dict: #vpp enabled - vpp_size = 0 - while f'model{vpp_size}' in state_dict: + while f'model{vpp_stage}' in state_dict: model_weights = state_dict[f'model{vpp_stage}'] for key, value in _get_parameter(model_weights): key = _map_to_mcore_local_names(key) weights[f"{key}{Const.SCOPE_SEPARATOR}{vpp_stage}"] = value - vpp_size += 1 + vpp_stage += 1 return weights @@ -223,6 +247,11 @@ def load_megatron_weights(checkpoint_path: str) -> Dict: Returns: combined_weights: Dict with weights from all ranks, keys include rank info """ + try: + import megatron + except ModuleNotFoundError as e: + raise ModuleNotFoundError("No module named 'megatron', which is required to load a megatron ckpt") from e + # Find latest iteration if not specified checkpoint_path = parse_iteration(checkpoint_path) @@ -247,7 +276,7 @@ def load_megatron_weights(checkpoint_path: str) -> Dict: ckpt_file = os.path.join(checkpoint_path, rank_dir, 'model_optim_rng.pt') try: - state_dict = load_pt(ckpt_file, to_cpu=True, weights_only=False) + state_dict = FmkAdp.load_checkpoint(ckpt_file, to_cpu=True, weights_only=False) partition = get_weights_from_state_dict(state_dict) for key, weight in partition.items(): tp_partition[key].append(weight) diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/metrics.py b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/metrics.py similarity index 53% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/metrics.py rename to debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/metrics.py index 65b5feb659f2fc515d5f2f57faf107d65937d16c..2e9e1324b33c570033fa4fc29a6a32dff73b64de 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/metrics.py +++ b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/metrics.py @@ -13,12 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch -from torch.nn import functional as F +import numpy as np -from msprobe.pytorch.common.log import logger +from msprobe.core.common.log import logger +from msprobe.core.compare.npy_compare import CompareOps -MAX_SLICE = 1000000 def in_different_shape(a, b): @@ -33,46 +32,37 @@ def l2_distance(a, b): return None if in_different_shape(a, b): return None - return (a - b).square().sum().sqrt().item() + return np.linalg.norm(a - b).item() -def cos_sim(a, b, eps=1e-8): +def cos_sim(a, b): if a is None or b is None: return None - if a.dtype not in [torch.float64, torch.float32, torch.float16, torch.bfloat16]: - return None if in_different_shape(a, b): return None - if a.dim() > 0: + if a.ndim > 0: a = a.flatten().squeeze() b = b.flatten().squeeze() - num_element = a.numel() - if num_element > MAX_SLICE: - logger.info(f'num parameters: {num_element}. Calculate cos by chunks') - n_batch = num_element // MAX_SLICE + 1 - sim = 0 - total_norm_a = eps - total_norm_b = eps - for i in range(n_batch): - slice_a = a[i * MAX_SLICE: min((i + 1) * MAX_SLICE, num_element)] - slice_b = b[i * MAX_SLICE: min((i + 1) * MAX_SLICE, num_element)] - slice_sim = (slice_a * slice_b).sum().item() - total_norm_a += (slice_a ** 2).sum().item() - total_norm_b += (slice_a ** 2).sum().item() - sim += slice_sim - sim = sim / total_norm_a ** 0.5 / total_norm_b ** 0.5 + num = a.dot(b) + a_norm = np.linalg.norm(a) + b_norm = np.linalg.norm(b) + + if a_norm == 0 and b_norm == 0: + return 1. + if a_norm == 0 or b_norm == 0: + logger.warning(f'One tensor norm is zero.') + return None - else: - sim = F.cosine_similarity(a, b, dim=0, eps=eps).item() + sim = num / (a_norm * b_norm) - return sim + return sim.item() def numel(a, b): - n1 = a.numel() - n2 = b.numel() + n1 = a.size + n2 = b.size if n1 != n2: logger.warning('parameters have different number of element') return (n1, n2) @@ -80,11 +70,9 @@ def numel(a, b): def shape(a, b): - s1 = a.shape - s2 = b.shape if in_different_shape(a, b): - return [list(s1), list(s2)] - return list(s1) + return [list(a.shape), list(b.shape)] + return list(a.shape) METRIC_FUNC = { @@ -92,4 +80,4 @@ METRIC_FUNC = { 'cos': cos_sim, 'numel': numel, 'shape': shape - } \ No newline at end of file + } diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/name_mapping.yaml b/debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/name_mapping.yaml similarity index 100% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/ckpt_compare/name_mapping.yaml rename to debug/accuracy_tools/msprobe/core/config_check/ckpt_compare/name_mapping.yaml diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/config_checking.py b/debug/accuracy_tools/msprobe/core/config_check/config_check_cli.py similarity index 51% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/config_checking.py rename to debug/accuracy_tools/msprobe/core/config_check/config_check_cli.py index a8cc15ab6ee36907bdc7a061cd04359b4b83ebf8..cc2db192416517d6b94020441d9edc1eff95f89b 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/config_checking.py +++ b/debug/accuracy_tools/msprobe/core/config_check/config_check_cli.py @@ -13,37 +13,39 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from msprobe.pytorch.common.log import logger -from msprobe.pytorch.config_checking.config_checker import ConfigChecker -from msprobe.pytorch.config_checking.ckpt_compare.compare_weight import compare_checkpoints +from msprobe.core.config_check.config_checker import ConfigChecker +from msprobe.core.config_check.ckpt_compare.ckpt_comparator import compare_checkpoints +from msprobe.core.common.log import logger -def pack(config_filepath): - ConfigChecker(config_filepath) +def pack(shell_path, output_path, framework): + ConfigChecker(shell_path=shell_path, output_zip_path=output_path, fmk=framework) -def compare(bench_zip_path, cmp_zip_path, outpath): - ConfigChecker.compare(bench_zip_path, cmp_zip_path, outpath) +def compare(bench_zip_path, cmp_zip_path, output_path, framework): + ConfigChecker.compare(bench_zip_path, cmp_zip_path, output_path, framework) def _config_checking_parser(parser): - parser.add_argument('-pack', '--pack', help='Pack a directory into a zip file') - parser.add_argument('-c', '--compare', nargs=2, help='Compare two zip files or ckpt dir') - parser.add_argument('-s', '--ckpt-sim', default=False, action='store_true', - help='Calculate the similarity of two ckpt') + parser.add_argument('-d', '--dump', nargs='*', help='Collect the train config into a zip file') + parser.add_argument('-c', '--compare', nargs=2, help='Compare two zip files or checkpoints') parser.add_argument('-o', '--output', help='output path, default is current directory') def _run_config_checking_command(args): - if args.pack: - pack(args.pack) + if args.dump is not None: + output_dirpath = args.output if args.output else "./config_check_pack.zip" + pack(args.dump, output_dirpath, args.framework) elif args.compare: - if args.ckpt_sim: - output_path = args.output if args.output else "./ckpt_compare_out.json" - compare_checkpoints(args.compare[0], args.compare[1], output_path) - else: + if args.compare[0].endswith('zip'): + logger.info('The input paths is zip files, comparing packed config.') output_dirpath = args.output if args.output else "./config_check_result" - compare(args.compare[0], args.compare[1], output_dirpath) + compare(args.compare[0], args.compare[1], output_dirpath, args.framework) + else: + logger.info('Comparing model checkpoint.') + output_dirpath = args.output if args.output else "./ckpt_similarity.json" + compare_checkpoints(args.compare[0], args.compare[1], output_dirpath) + else: - logger.error("The param is not correct, you need to give '-pack' for pack or '-c' for compare.") - raise Exception("The param is not correct, you need to give '-pack' for pack or '-c' for compare.") + logger.error("The param is not correct, you need to give '-d' for dump or '-c' for compare.") + raise Exception("The param is not correct, you need to give '-d' for dump or '-c' for compare.") diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/config_checker.py b/debug/accuracy_tools/msprobe/core/config_check/config_checker.py similarity index 69% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/config_checker.py rename to debug/accuracy_tools/msprobe/core/config_check/config_checker.py index fa5d2ff3b036f0f150b19345ccad51e4b58e95f4..2dc908398b83d1f5c15b5dcefdcc93a4a2ef58a4 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/config_checker.py +++ b/debug/accuracy_tools/msprobe/core/config_check/config_checker.py @@ -16,15 +16,14 @@ import os import shutil -import torch -import torch.distributed as dist import pandas as pd from msprobe.core.common.file_utils 
import save_excel, split_zip_file_path, \ - create_directory, extract_zip, make_dir -from msprobe.pytorch.config_checking.checkers.base_checker import PackInput -from msprobe.pytorch.config_checking.utils.utils import config_checking_print - + create_directory, extract_zip +from msprobe.core.common.framework_adapter import FmkAdp +from msprobe.core.config_check.checkers.base_checker import PackInput +from msprobe.core.config_check.utils.utils import config_checking_print +from msprobe.core.common.const import Const class ConfigChecker: @@ -34,24 +33,20 @@ class ConfigChecker: result_header = ["filename", "pass_check"] step = 0 - def __init__(self, model=None, shell_path=None, output_zip_path="./config_check_pack.zip"): + def __init__(self, model=None, shell_path=None, output_zip_path="./config_check_pack.zip", fmk="pytorch"): + FmkAdp.set_fmk(fmk) self.pack_input = PackInput(output_zip_path, model, shell_path) file_path, file_name = split_zip_file_path(self.pack_input.output_zip_path) if not os.path.exists(file_path): create_directory(file_path) - self.pack() - else: - if os.path.exists(self.pack_input.output_zip_path): - raise Exception("The output file path already exist!") - self.pack() - + self.pack() @staticmethod - def compare(bench_zip_path, cmp_zip_path, outpath): - if os.path.exists(outpath): - shutil.rmtree(outpath) - bench_dir = os.path.join(outpath, "bench") - cmp_dir = os.path.join(outpath, "cmp") + def compare(bench_zip_path, cmp_zip_path, output_path, fmk=Const.PT_FRAMEWORK): + if os.path.exists(output_path): + shutil.rmtree(output_path) + bench_dir = os.path.join(output_path, "bench") + cmp_dir = os.path.join(output_path, "cmp") extract_zip(bench_zip_path, bench_dir) config_checking_print(f"extract zip file {bench_zip_path} to {bench_dir}") extract_zip(cmp_zip_path, cmp_dir) @@ -60,23 +55,23 @@ class ConfigChecker: result = [] summary_result = [] for checker in ConfigChecker.checkers.values(): - checker_name, pass_check, df = checker.compare_ex(bench_dir, cmp_dir, outpath) + checker_name, pass_check, df = checker.compare_ex(bench_dir, cmp_dir, output_path, fmk) if checker_name: summary_result.append([checker_name, pass_check]) if df is not None: result.append((df, checker_name)) summary_result_df = pd.DataFrame(summary_result, columns=ConfigChecker.result_header) result.insert(0, (summary_result_df, "summary")) - save_excel(os.path.join(outpath, ConfigChecker.result_filename), result) - config_checking_print(f"config checking result save to {os.path.realpath(outpath)}") + save_excel(os.path.join(output_path, ConfigChecker.result_filename), result) + config_checking_print(f"config checking result save to {os.path.realpath(output_path)}") + @staticmethod + def apply_patches(fmk=Const.PT_FRAMEWORK): + for checker in ConfigChecker.checkers.values(): + checker.apply_patches(fmk) def pack(self): config_checking_print(f"pack result zip path {os.path.realpath(self.pack_input.output_zip_path)}") - if dist.is_initialized() and dist.get_rank() == 0: - config_checking_print(f"pack result zip path {self.pack_input.output_zip_path}") - if os.path.exists(self.pack_input.output_zip_path): - os.remove(self.pack_input.output_zip_path) def hook(model, args, kwargs): for collect_func in self.pre_forward_fun_list: @@ -84,11 +79,11 @@ class ConfigChecker: ConfigChecker.step += 1 if self.pack_input.model: - self.pack_input.model.register_forward_pre_hook(hook, with_kwargs=True) + FmkAdp.register_forward_pre_hook(self.pack_input.model, hook, with_kwargs=True) for checker in 
ConfigChecker.checkers.values(): if checker.input_needed and not getattr(self.pack_input, checker.input_needed): continue - if dist.is_initialized() and dist.get_rank() != 0 and not checker.multi_rank: + if FmkAdp.is_initialized() and FmkAdp.get_rank() != 0 and not checker.multi_rank: continue checker.pack(self.pack_input) diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/dependency.yaml b/debug/accuracy_tools/msprobe/core/config_check/resource/dependency.yaml similarity index 96% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/resource/dependency.yaml rename to debug/accuracy_tools/msprobe/core/config_check/resource/dependency.yaml index f4f73a5fce97f20608a3c9bacb92e53f1747f092..02c0b565bf59b1b220f16ae17a47f5f4d5b13c1f 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/dependency.yaml +++ b/debug/accuracy_tools/msprobe/core/config_check/resource/dependency.yaml @@ -19,6 +19,4 @@ dependency: - megatron - numpy - datasets - - torch - - torchversion - peft \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml b/debug/accuracy_tools/msprobe/core/config_check/resource/env.yaml similarity index 63% rename from debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml rename to debug/accuracy_tools/msprobe/core/config_check/resource/env.yaml index 13ea0e39f89b4807b72a6322ddc865145d9fde9d..87d663b9d94976c24feb88b181b3ead98905eb5a 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml +++ b/debug/accuracy_tools/msprobe/core/config_check/resource/env.yaml @@ -14,25 +14,44 @@ # limitations under the License. HCCL_DETERMINISTIC: - - name: HCCL_DETERMINISTIC + npu: + name: HCCL_DETERMINISTIC + default_value: False + gpu: + name: NCCL_DETERMINISTIC default_value: False -HCCL_ALGO: - - name: HCCL_ALGO +HCCL_ALGO: + npu: + name: HCCL_ALGO + default_value: None + gpu: + name: NCCL_ALGO default_value: None HCCL_INTRA_ROCE_ENABLE: - - name: HCCL_INTRA_ROCE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 0 + HCCL_INTRA_PICE_ENABLE: - - name: HCCL_INTRA_PICE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 1 ASCEND_LAUNCH_BLOCKING: - - name: ASCEND_LAUNCH_BLOCKING - default_value: False + npu: + name: ASCEND_LAUNCH_BLOCKING + default_value: 0 + gpu: + name: CUDA_LAUNCH_BLOCKING + default_value: 0 -ASCEND_RT_VISIBLE_DEVICE: - - name: ASCEND_RT_VISIBLE_DEVICE +ASCEND_RT_VISIBLE_DEVICES: + npu: + name: ASCEND_RT_VISIBLE_DEVICES + default_value: None + gpu: + name: CUDA_VISIBLE_DEVICES default_value: None \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/core/config_check/resource/hyperparameter.yaml b/debug/accuracy_tools/msprobe/core/config_check/resource/hyperparameter.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5cff815717fc5b668bdd5f99de1a18e0373760fe --- /dev/null +++ b/debug/accuracy_tools/msprobe/core/config_check/resource/hyperparameter.yaml @@ -0,0 +1,21 @@ +learning_rate: + - lr + - learningrate + +batch_size: + - batch + - bs + - batch_size_per_gpu + +epochs: + - num_epochs + - max_epochs + - epoch + +weight_decay: + - wd + - weightdecay + +dropout_rate: + - dropout + - drop_rate \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/core/config_check/utils/hyperparameter_parser.py b/debug/accuracy_tools/msprobe/core/config_check/utils/hyperparameter_parser.py new file mode 100644 index 
0000000000000000000000000000000000000000..6cb540ee49951652b6094f80229da099cfc5afdf --- /dev/null +++ b/debug/accuracy_tools/msprobe/core/config_check/utils/hyperparameter_parser.py @@ -0,0 +1,115 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from abc import ABC, abstractmethod + +from msprobe.core.config_check.utils.utils import config_checking_print +from msprobe.core.common.file_utils import FileOpen, load_yaml +from msprobe.core.common.const import Const, FileCheckConst + + +class Parser(ABC): + @abstractmethod + def parse(self, file_path: str) -> dict: + pass + + def run(self, file_path: str) -> dict: + """ + 统一对外调用接口 + :param file_path: 需解析的文件路径 + :return: + """ + try: + result = self.parse(file_path) + except Exception as exc: + config_checking_print(f"{self.__class__} parsing error, skip file path: {file_path}, error: {exc}") + result = {} + return result + + +class ShellParser(Parser): + def parse(self, file_path: str) -> dict: + """ + Extracts arguments from bash script used to run a model training. + """ + hyperparameters = {} + script_content_list = [] + with FileOpen(file_path, 'r') as file: + for line in file: + stripped_line = line.lstrip() + if not stripped_line.startswith('#'): + line = line.split('#')[0].rstrip() + '\n' + if line.strip(): + script_content_list.append(line) + script_content = ''.join(script_content_list) + + command_line = re.search(r'msrun\s[^|]*|torchrun\s[^|]*|python\d? 
-m torch.distributed.launch\s[^|]*',
+                                 script_content,
+                                 re.DOTALL)
+        if command_line:
+            command_line = command_line.group()
+
+        blocks = re.findall(r'([a-zA-Z0-9_]{1,20}_ARGS)="(.*?)"', script_content, re.DOTALL)
+        block_contents = {}
+        for block_name, block_content in blocks:
+            block_content = block_content.replace('\n', ' ')
+            block_contents[block_name] = block_content
+            command_line = command_line.replace(f"${block_name}", block_content)
+
+        matches = re.findall(r'--([\w-]+)(?:\s+([^\s\\]+))?', command_line)
+        for match in matches:
+            key, value = match
+            args_key = re.match(r'\$\{?(\w+)}?', value)
+            if args_key:
+                env_vars = re.findall(rf'{args_key.group(1)}=\s*(.+)', script_content)
+                if env_vars:
+                    value = env_vars[-1]
+            hyperparameters[key] = value if value else True
+
+        return hyperparameters
+
+
+class YamlParser(Parser):
+    hyperparameters = {}
+
+    def parse(self, file_path: str) -> dict:
+        ori_hyper = load_yaml(file_path)
+        self.recursive_parse_parameters(ori_hyper, "")
+        return self.hyperparameters
+
+    def recursive_parse_parameters(self, parameters, prefix):
+        if isinstance(parameters, dict):
+            for key, value in parameters.items():
+                new_prefix = prefix + Const.SEP + key if prefix else key
+                self.recursive_parse_parameters(value, new_prefix)
+        elif isinstance(parameters, list):
+            for value in parameters:
+                self.recursive_parse_parameters(value, prefix)
+        elif isinstance(parameters, (int, str, bool)):
+            self.hyperparameters.update({prefix: parameters})
+
+
+class ParserFactory:
+    __ParserDict = {
+        FileCheckConst.SHELL_SUFFIX: ShellParser(),
+        FileCheckConst.YAML_SUFFIX: YamlParser()
+    }
+
+    def get_parser(self, file_type: str) -> Parser:
+        parser = self.__ParserDict.get(file_type, None)
+        if not parser:
+            raise ValueError(f'Invalid parser type: {file_type}')
+        return parser
diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/utils/utils.py b/debug/accuracy_tools/msprobe/core/config_check/utils/utils.py
similarity index 80%
rename from debug/accuracy_tools/msprobe/pytorch/config_checking/utils/utils.py
rename to debug/accuracy_tools/msprobe/core/config_check/utils/utils.py
index 3f8cef378ef3479aa0892786e835a66861eb6637..8c3c329cf20e2b6fb890437b3ba9950f14cc8878 100644
--- a/debug/accuracy_tools/msprobe/pytorch/config_checking/utils/utils.py
+++ b/debug/accuracy_tools/msprobe/core/config_check/utils/utils.py
@@ -17,9 +17,8 @@ import os
 import re
 import hashlib
 
-import torch
-
-from msprobe.pytorch.common.log import logger
+from msprobe.core.common.framework_adapter import FmkAdp
+from msprobe.core.common.log import logger
 
 
 def merge_keys(dir_0, dir_1):
@@ -53,15 +52,13 @@ def tensor_to_hash(tensor):
 
 def get_tensor_features(tensor):
     features = {
-        "max": lambda x: torch.max(x).item(),
-        "min": lambda x: torch.min(x).item(),
-        "mean": lambda x: torch.mean(x).item(),
-        "norm": lambda x: torch.norm(x).item(),
+        "max": FmkAdp.tensor_max(tensor),
+        "min": FmkAdp.tensor_min(tensor),
+        "mean": FmkAdp.tensor_mean(tensor),
+        "norm": FmkAdp.tensor_norm(tensor),
     }
 
-    if not tensor.is_floating_point() or tensor.dtype == torch.float64:
-        tensor = tensor.float()
-    return {key: features.get(key)(tensor) for key in features}
+    return features
 
 
 def compare_dicts(dict1, dict2, path=''):
@@ -97,3 +94,14 @@ def bytes_hash(obj: bytes):
     hex_dig = hashlib.sha256(obj).hexdigest()
     short_hash = int(hex_dig, 16) % (2 ** 16)
     return short_hash
+
+
+def update_dict(ori_dict, new_dict):
+    for key, value in new_dict.items():
+        if key in ori_dict and ori_dict[key] != value:
+            if "values" in
ori_dict.keys(): + ori_dict[key]["values"].append(new_dict[key]) + else: + ori_dict[key] = {"description": "duplicate_value", "values": [ori_dict[key], new_dict[key]]} + else: + ori_dict[key] = value diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/random_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/random_checker.py deleted file mode 100644 index 883144d617d8ea33afbedf8d510f7181450fae6c..0000000000000000000000000000000000000000 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/random_checker.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import random -from functools import wraps -from typing import Callable -import inspect -import os -import json -from collections import defaultdict - -import numpy as np -import torch -import pandas as pd -from msprobe.pytorch.config_checking.config_checker import register_checker_item, register_pre_forward_fun_list -from msprobe.pytorch.common.utils import get_rank_id -from msprobe.core.common.file_utils import create_file_in_zip, load_json, save_excel -from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker -from msprobe.pytorch.config_checking.utils.utils import config_checking_print - - -random_log_dict = defaultdict(dict) - - -def load_json_files(directory): - json_data = {} - for file in os.listdir(directory): - file_path = os.path.join(directory, file) - if file.startswith('rank') and file.endswith('.json'): - json_data.update(load_json(file_path)) - return json_data - - -def get_file_and_line(position): - parts = position.rsplit(':', 1) - if len(parts) == 2: - file_name = os.path.basename(parts[0]) - line_num = parts[1] - return f"{file_name}:{line_num}" - return position - - -def compare_json_files(bench_data, cmp_data): - results = [] - for op in set(bench_data) | set(cmp_data): - bench_records = bench_data.get(op, {}) - cmp_records = cmp_data.get(op, {}) - all_positions = set() - for position in set(bench_records) | set(cmp_records): - all_positions.add(get_file_and_line(position)) - - for position in all_positions: - bench_count = 0 - cmp_count = 0 - for original_position, count in bench_records.items(): - if get_file_and_line(original_position) == position: - bench_count += count - for original_position, count in cmp_records.items(): - if get_file_and_line(original_position) == position: - cmp_count += count - results.append([op, position, bench_count == cmp_count, bench_count, cmp_count]) - return results - - -def compare_random(bench_dir='bench', cmp_dir='cmp'): - bench_data = load_json_files(bench_dir) - cmp_data = load_json_files(cmp_dir) - results = compare_json_files(bench_data, cmp_data) - df = pd.DataFrame(results, columns=RandomChecker.result_header) - return df - - -def track_random_call(func: Callable, name: str): - @wraps(func) - def wrapper(*args, **kwargs): - frame = inspect.currentframe() - caller_frame = 
frame.f_back - caller_info = inspect.getframeinfo(caller_frame) - location = f"{os.path.abspath(caller_info.filename)}:{caller_info.lineno}" - - global random_log_dict - random_log_dict.setdefault(name, {}) - random_log_dict[name][location] = random_log_dict[name].get(location, 0) + 1 - - try: - result = func(*args, **kwargs) - return result - except Exception as e: - raise e - finally: - del frame, caller_frame - - return wrapper - - -def apply_patches(): - random_patches = { - 'random': random.random, - 'randint': random.randint, - 'uniform': random.uniform, - 'choice': random.choice - } - for name, func in random_patches.items(): - setattr(random, name, track_random_call(func, f"random.{name}")) - - np_random_patches = { - 'rand': np.random.rand, - 'randint': np.random.randint, - 'choice': np.random.choice, - 'normal': np.random.normal - } - for name, func in np_random_patches.items(): - setattr(np.random, name, track_random_call(func, f"np.random.{name}")) - - torch_patches = { - 'rand': torch.rand, - 'randint': torch.randint, - 'randn': torch.randn, - 'rand_like': torch.rand_like, - 'randint_like': torch.randint_like, - 'randn_like': torch.randn_like, - 'manual_seed': torch.manual_seed - } - for name, func in torch_patches.items(): - setattr(torch, name, track_random_call(func, f"torch.{name}")) - - tensor_patches = { - 'exponential_': torch.Tensor.exponential_, - 'geometric_': torch.Tensor.geometric_, - 'log_normal_': torch.Tensor.log_normal_, - 'cauchy_': torch.Tensor.cauchy_ - } - for name, func in tensor_patches.items(): - setattr(torch.Tensor, name, track_random_call(func, f"torch.Tensor.{name}")) - - - -@register_checker_item("random") -class RandomChecker(BaseChecker): - input_needed = None - - target_name_in_zip = "random" - result_header = ['op', 'position', 'equal', 'bench_count', 'cmp_count'] - write_once = False - - @staticmethod - def pack(pack_input): - output_zip_path = pack_input.output_zip_path - - def collect_input(model, args, kwargs, step): - if RandomChecker.write_once: - return - - random_log_filepath = os.path.join(RandomChecker.target_name_in_zip, f"rank{get_rank_id()}.json") - create_file_in_zip(output_zip_path, random_log_filepath, json.dumps(random_log_dict, indent=4)) - config_checking_print(f"add first random_log input features to zip") - RandomChecker.write_once = True - - register_pre_forward_fun_list(collect_input) - - @staticmethod - def compare(bench_dir, cmp_dir, output_path): - bench_random_log_pack_path = os.path.join(bench_dir, RandomChecker.target_name_in_zip) - cmp_random_log_pack_path = os.path.join(cmp_dir, RandomChecker.target_name_in_zip) - - df = compare_random(bench_random_log_pack_path, cmp_random_log_pack_path) - pass_check = False not in df['equal'].values - return RandomChecker.target_name_in_zip, pass_check, df -
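
The checkpoint metrics above now rely only on numpy. A minimal self-contained sketch of the same metric behaviour, with the functions restated locally (not imported from msprobe) and made-up arrays for illustration:

```python
import numpy as np


def in_different_shape(a, b):
    return a.shape != b.shape


def l2_distance(a, b):
    if a is None or b is None or in_different_shape(a, b):
        return None
    return np.linalg.norm(a - b).item()


def cos_sim(a, b):
    if a is None or b is None or in_different_shape(a, b):
        return None
    a, b = a.flatten(), b.flatten()
    a_norm, b_norm = np.linalg.norm(a), np.linalg.norm(b)
    if a_norm == 0 and b_norm == 0:
        return 1.            # two all-zero tensors are treated as identical
    if a_norm == 0 or b_norm == 0:
        return None          # similarity is undefined when only one side is zero
    return (a.dot(b) / (a_norm * b_norm)).item()


bench = np.array([[1.0, 2.0], [3.0, 4.0]])
cmp_ = bench + 0.01
print(cos_sim(bench, cmp_))      # close to 1.0
print(l2_distance(bench, cmp_))  # 0.02
```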
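A usage sketch of the packing entry point, based on the constructor signature shown in config_checker.py; the model, script path and output path are placeholders, and passing shell_path as a list mirrors how the CLI forwards its nargs='*' arguments:

```python
# Hypothetical training-side usage; paths and model are placeholders.
import torch.nn as nn

from msprobe.core.config_check.config_checker import ConfigChecker

model = nn.Linear(8, 8)

ConfigChecker(
    model=model,                               # hooked so per-step inputs can be collected
    shell_path=["./train.sh"],                 # launch scripts / yaml configs to parse
    output_zip_path="./config_check_pack.zip",
    fmk="pytorch",                             # routed through FmkAdp; "mindspore" is the other option
)
```

From the command line the same two paths are reached through the options added in config_check_cli.py: `-d` packs the training configuration into a zip, while `-c` compares two packed zips when the first argument ends with "zip" and otherwise falls back to checkpoint similarity.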
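On the compare path, both zips are extracted and every registered checker compares its own artifact, contributing a pass/fail row to the summary sheet plus an optional detail DataFrame. A stripped-down sketch of that aggregation pattern, using a stand-in checker rather than the real registry:

```python
import pandas as pd


class DummyChecker:
    target_name_in_zip = "dummy"

    @classmethod
    def compare_ex(cls, bench_dir, cmp_dir, output_path, fmk):
        # Real checkers return (None, None, None) when either side lacks the artifact.
        df = pd.DataFrame([["some_item", True]], columns=["name", "equal"])
        return cls.target_name_in_zip, True, df


checkers = {"dummy": DummyChecker}
result, summary = [], []
for checker in checkers.values():
    name, passed, df = checker.compare_ex("bench", "cmp", "out", "pytorch")
    if name:
        summary.append([name, passed])
    if df is not None:
        result.append((df, name))

summary_df = pd.DataFrame(summary, columns=["filename", "pass_check"])
result.insert(0, (summary_df, "summary"))
print(summary_df)   # one row per checker that produced a result
```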
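framework_adapter.py itself is not part of this hunk, so the following is only a guess at its shape: a hypothetical, pared-down adapter that dispatches the calls used above (is_nn_module, is_initialized, get_rank, register_forward_pre_hook) to torch, which is what lets the checker code stay framework-neutral.

```python
# Hypothetical stand-in for msprobe.core.common.framework_adapter.FmkAdp (not the real class).
import torch
import torch.distributed as dist


class _MiniFmkAdp:
    fmk = "pytorch"

    @classmethod
    def set_fmk(cls, fmk):
        cls.fmk = fmk                                   # "pytorch" or "mindspore"

    @classmethod
    def is_nn_module(cls, obj):
        return isinstance(obj, torch.nn.Module)         # the real adapter also accepts mindspore.nn.Cell

    @classmethod
    def is_initialized(cls):
        return dist.is_available() and dist.is_initialized()

    @classmethod
    def get_rank(cls):
        return dist.get_rank() if cls.is_initialized() else 0

    @classmethod
    def register_forward_pre_hook(cls, model, hook, with_kwargs=True):
        return model.register_forward_pre_hook(hook, with_kwargs=with_kwargs)
```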
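The reworked env.yaml keys now carry an npu/gpu sub-section so one logical setting maps to the right platform variable (HCCL_DETERMINISTIC vs NCCL_DETERMINISTIC, ASCEND_RT_VISIBLE_DEVICES vs CUDA_VISIBLE_DEVICES). The env checker that reads it is outside this hunk; a hedged sketch of how such a table could be consumed:

```python
import os

import yaml  # the real code goes through msprobe's load_yaml wrapper

ENV_TABLE = yaml.safe_load("""
HCCL_DETERMINISTIC:
  npu:
    name: HCCL_DETERMINISTIC
    default_value: False
  gpu:
    name: NCCL_DETERMINISTIC
    default_value: False
""")


def collect_env(platform="npu"):
    # For each logical key, read the platform-specific variable, falling back to its default.
    collected = {}
    for logical_name, per_platform in ENV_TABLE.items():
        entry = per_platform.get(platform)
        if entry is None:
            continue
        collected[logical_name] = os.environ.get(entry["name"], entry["default_value"])
    return collected


print(collect_env("npu"))
```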
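The new hyperparameter.yaml maps canonical names to common aliases (lr to learning_rate, bs to batch_size, and so on). The matching itself lives in hyperparameter_checker.py, which is not shown here, so the canonicalisation step below is an assumed sketch:

```python
# Assumed alias-normalisation step; the alias lists mirror hyperparameter.yaml.
ALIASES = {
    "learning_rate": ["lr", "learningrate"],
    "batch_size": ["batch", "bs", "batch_size_per_gpu"],
    "epochs": ["num_epochs", "max_epochs", "epoch"],
}


def canonicalize(name):
    lowered = name.lower().replace("-", "_")
    for canonical, aliases in ALIASES.items():
        if lowered == canonical or lowered in aliases:
            return canonical
    return name                    # unknown keys are kept as-is


print(canonicalize("lr"))          # learning_rate
print(canonicalize("max-epochs"))  # epochs
print(canonicalize("seq-length"))  # seq-length
```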
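ShellParser pulls `--key value` pairs out of the msrun/torchrun command line after inlining `*_ARGS` blocks. The same regex steps, restated locally on an inline script string so they run without msprobe (the real parser additionally resolves `$VAR` values from earlier assignments in the script):

```python
import re

script_content = '''
GPT_ARGS="--num-layers 24 --hidden-size 1024 --lr 1e-4"

torchrun --nproc_per_node 8 pretrain.py $GPT_ARGS --micro-batch-size 2
'''

# Locate the launch command line.
command_line = re.search(r'msrun\s[^|]*|torchrun\s[^|]*|python\d? -m torch.distributed.launch\s[^|]*',
                         script_content, re.DOTALL).group()

# Inline every FOO_ARGS="..." block before extracting the flags.
for block_name, block_content in re.findall(r'([a-zA-Z0-9_]{1,20}_ARGS)="(.*?)"', script_content, re.DOTALL):
    command_line = command_line.replace(f"${block_name}", block_content.replace('\n', ' '))

hyperparameters = {}
for key, value in re.findall(r'--([\w-]+)(?:\s+([^\s\\]+))?', command_line):
    hyperparameters[key] = value if value else True

print(hyperparameters)
# {'nproc_per_node': '8', 'num-layers': '24', 'hidden-size': '1024', 'lr': '1e-4', 'micro-batch-size': '2'}
```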
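YamlParser flattens nested mappings into separator-joined keys so two configurations can be diffed as flat dictionaries. The same idea in a few standalone lines, assuming Const.SEP is "." and returning a fresh dict instead of accumulating in a shared class attribute:

```python
def flatten(parameters, prefix=""):
    flat = {}
    if isinstance(parameters, dict):
        for key, value in parameters.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            flat.update(flatten(value, new_prefix))
    elif isinstance(parameters, list):
        for value in parameters:
            flat.update(flatten(value, prefix))     # list items share the parent prefix
    elif isinstance(parameters, (int, str, bool)):
        flat[prefix] = parameters
    return flat


cfg = {"training": {"epochs": 10, "optimizer": {"name": "adam"}}, "use_fp16": True}
print(flatten(cfg))
# {'training.epochs': 10, 'training.optimizer.name': 'adam', 'use_fp16': True}
```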
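get_tensor_features now returns the four scalar statistics directly through the framework adapter instead of lazily through torch lambdas. For orientation, the same statistics computed with plain torch on a toy tensor, assuming the adapter's tensor_max/tensor_min/tensor_mean/tensor_norm wrap these calls:

```python
import torch

t = torch.tensor([[1.0, -2.0], [3.0, 4.0]])
features = {
    "max": torch.max(t).item(),    # 4.0
    "min": torch.min(t).item(),    # -2.0
    "mean": torch.mean(t).item(),  # 1.5
    "norm": torch.norm(t).item(),  # sqrt(30) ~ 5.477
}
print(features)
```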
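The deleted pytorch random_checker (its core-side replacement, now hooked in through the per-checker apply_patches entry point, is not shown in this hunk) works by wrapping the random/numpy/torch RNG entry points so every call site is counted per file:line and can later be diffed between bench and cmp runs. A minimal standalone sketch of that patching technique:

```python
import inspect
import os
import random
from collections import defaultdict
from functools import wraps

random_log = defaultdict(dict)


def track_random_call(func, name):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Record the caller's file and line, then delegate to the original function.
        caller = inspect.getframeinfo(inspect.currentframe().f_back)
        location = f"{os.path.basename(caller.filename)}:{caller.lineno}"
        random_log[name][location] = random_log[name].get(location, 0) + 1
        return func(*args, **kwargs)
    return wrapper


random.random = track_random_call(random.random, "random.random")

for _ in range(3):
    random.random()
print(dict(random_log))   # e.g. {'random.random': {'demo.py:24': 3}}
```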