diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index fb2f7f671c5a75ea196297fde07a8576fbc13a06..6bd9129c4e8bfd1e4268d8424fb8fec31f4a94ab 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -134,6 +134,7 @@ class Const: NPU = 'NPU' NPU_LOWERCASE = 'npu' CPU_LOWERCASE = 'cpu' + GPU_LOWERCASE = 'gpu' CUDA_LOWERCASE = 'cuda' DEVICE = 'device' DISTRIBUTED = 'Distributed' @@ -331,6 +332,8 @@ class Const: } } + ASCEND = "ASCEND" + class CompareConst: """ diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py index 9eaeb1a05729f7e9ae4ff8c9727d8f3589b3a166..a5ed1464cf86338e12f725a36b7b8bddbf60471d 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py @@ -22,7 +22,7 @@ from msprobe.core.common.file_utils import load_json, load_yaml, create_file_wit from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker from msprobe.pytorch.config_checking.config_checker import register_checker_item from msprobe.pytorch.config_checking.utils.utils import config_checking_print -from msprobe.core.common.file_utils import save_excel +from msprobe.core.common.const import Const dirpath = os.path.dirname(__file__) @@ -36,21 +36,41 @@ def collect_env_data(): return result +def get_device_type(env_json): + for key in env_json.keys(): + if Const.ASCEND in key: + return Const.NPU_LOWERCASE + return Const.GPU_LOWERCASE + + def compare_env_data(npu_path, bench_path): necessary_env = load_yaml(env_yaml_path) - npu_data = load_json(npu_path) + cmp_data = load_json(npu_path) + cpm_type = get_device_type(cmp_data) bench_data = load_json(bench_path) + bench_type = get_device_type(bench_data) data = [] for _, value in necessary_env.items(): - npu_env_name = value[0]["name"] - npu_value = npu_data.get(npu_env_name) if npu_data.get(npu_env_name) else value[0]["default_value"] - if len(value) == 1: - data.append([npu_env_name, "only npu has this env", npu_value, "", "warning"]) + cmp_env = value.get(cpm_type) + bench_env = value.get(bench_type) + if not bench_env and not cmp_env: continue - bench_env_name = value[1]["name"] - bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[1]["default_value"] - if npu_value != bench_value: - data.append([npu_env_name, bench_env_name, npu_value, bench_value, "error"]) + elif cmp_env: + cmp_env_name = cmp_env["name"] + cmp_value = cmp_data.get(cmp_env_name) if cmp_data.get(cmp_env_name) else value[cpm_type]["default_value"] + if not bench_env: + data.append(["only cmp has this env", cmp_env["name"], "", cmp_value, "warning"]) + continue + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][ + "default_value"] + if cmp_value != bench_value: + data.append([bench_env_name, cmp_env_name, bench_value, cmp_value, "error"]) + else: + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][ + "default_value"] + data.append([bench_env_name, "only bench has this env", bench_value, "", "warning"]) df = pd.DataFrame(data, columns=EnvArgsChecker.result_header) return df diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py index 9ac1cd61fc5483c1a002bf0109d56a341aeab120..90260fc46f72dd936eb720e57d7bea523183b032 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py @@ -20,6 +20,7 @@ import tempfile from difflib import SequenceMatcher from typing import Union, List, Dict, Any +import pandas as pd from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker from msprobe.pytorch.config_checking.config_checker import register_checker_item @@ -42,6 +43,8 @@ class HyperparameterChecker(BaseChecker): "dropout_rate": ["dropout", "drop_rate"], } + result_header = ["file", "bench_para", "cmp_para", "bench_value", "cmp_value", "matched_with", "level"] + @staticmethod def pack(pack_input): shell_path = pack_input.shell_path @@ -82,43 +85,32 @@ class HyperparameterChecker(BaseChecker): for filename in all_files: bench_params = bench_hyperparameters.get(filename, {}) cmp_params = cmp_hyperparameters.get(filename, {}) - if bench_params and cmp_params: all_diffs.extend(HyperparameterChecker.compare_param(bench_params, cmp_params, filename)) - - elif bench_params is not None: - all_diffs.append(f"[Only in benchmark] File: {filename}") - else: - all_diffs.append(f"[Only in compare] File: {filename}") - return HyperparameterChecker.target_name_in_zip, True, None + df = pd.DataFrame(all_diffs, columns=HyperparameterChecker.result_header) + pass_check = "error" not in df['level'].values + return HyperparameterChecker.target_name_in_zip, pass_check, df @staticmethod def compare_param(bench_params, cmp_params, filename): all_diffs = [] - file_diffs = [] bench_param_names = bench_params.keys() for bench_param_name in bench_param_names: matched_cmp_param_name = HyperparameterChecker._fuzzy_match_parameter(bench_param_name, cmp_params) + bench_param_value = bench_params[bench_param_name] if matched_cmp_param_name: - bench_param_value = bench_params[bench_param_name] cmp_param_value = cmp_params[matched_cmp_param_name] if bench_param_value != cmp_param_value: - diff = compare_dict({bench_param_name: bench_param_value}, - {matched_cmp_param_name: cmp_param_value}) - if diff: - file_diffs.extend( - [f" Parameter '{bench_param_name}' (matched with '{matched_cmp_param_name}'): {d}" - for d in diff]) + all_diffs.append( + [filename, bench_param_name, matched_cmp_param_name, bench_param_value, cmp_param_value, + matched_cmp_param_name, "error"]) del cmp_params[matched_cmp_param_name] else: - file_diffs.append( - f" [Only in benchmark] Parameter: '{bench_param_name}': {bench_params[bench_param_name]}") + all_diffs.append( + [filename, bench_param_name, "Only in benchmark", bench_param_value, "", "", "warning"]) for cmp_param_name, cmp_param_value in cmp_params.items(): - file_diffs.append(f" [Only in compare] Parameter: '{cmp_param_name}': {cmp_param_value}") - if file_diffs: - file_diffs.sort() - all_diffs.append(f"File: {filename}") - all_diffs.extend(file_diffs) + all_diffs.append([filename, "Only in comparison", cmp_param_name, "", cmp_param_value, "", "warning"]) + all_diffs.sort() return all_diffs @staticmethod diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml b/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml index 13ea0e39f89b4807b72a6322ddc865145d9fde9d..87d663b9d94976c24feb88b181b3ead98905eb5a 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml @@ -14,25 +14,44 @@ # limitations under the License. HCCL_DETERMINISTIC: - - name: HCCL_DETERMINISTIC + npu: + name: HCCL_DETERMINISTIC + default_value: False + gpu: + name: NCCL_DETERMINISTIC default_value: False -HCCL_ALGO: - - name: HCCL_ALGO +HCCL_ALGO: + npu: + name: HCCL_ALGO + default_value: None + gpu: + name: NCCL_ALGO default_value: None HCCL_INTRA_ROCE_ENABLE: - - name: HCCL_INTRA_ROCE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 0 + HCCL_INTRA_PICE_ENABLE: - - name: HCCL_INTRA_PICE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 1 ASCEND_LAUNCH_BLOCKING: - - name: ASCEND_LAUNCH_BLOCKING - default_value: False + npu: + name: ASCEND_LAUNCH_BLOCKING + default_value: 0 + gpu: + name: CUDA_LAUNCH_BLOCKING + default_value: 0 -ASCEND_RT_VISIBLE_DEVICE: - - name: ASCEND_RT_VISIBLE_DEVICE +ASCEND_RT_VISIBLE_DEVICES: + npu: + name: ASCEND_RT_VISIBLE_DEVICES + default_value: None + gpu: + name: CUDA_VISIBLE_DEVICES default_value: None \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py index 27b6b6e4364ff440a74d9619d1439e349a696efe..555d7a7988b94e8d5c1d29fed4f98a2ac2ef2e02 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py @@ -56,11 +56,12 @@ def get_test_model(): @unittest.mock.patch("msprobe.pytorch.config_checking.checkers.pip_checker.collect_pip_data") @unittest.mock.patch("msprobe.pytorch.config_checking.checkers.env_args_checker.collect_env_data") def train_test(seed, output_zip_path, shell_path, mock_env, mock_pip): - mock_env.return_value = {"HCCL_DETERMINISTIC": False} if seed == 1234: mock_pip.return_value = "transformers=0.0.1" + mock_env.return_value = {"NCCL_DETERMINISTIC": True} else: mock_pip.return_value = "transformers=0.0.2" + mock_env.return_value = {"HCCL_DETERMINISTIC": False, "ASCEND_LAUNCH_BLOCKING": 1} seed_all(seed) loss_fun = nn.CrossEntropyLoss() @@ -104,11 +105,11 @@ class TestConfigChecker(unittest.TestCase): total_check_result = read_xlsx(os.path.join(compare_output_dir, ConfigChecker.result_filename)) self.assertEqual(total_check_result.columns.tolist(), ConfigChecker.result_header) target_total_check_result = [ - ['env', True], + ['env', False], ['pip', False], ['dataset', False], ['weights', False], - ['hyperparameters', True], + ['hyperparameters', False], ['random', False] ] self.assertEqual(total_check_result.values.tolist(), target_total_check_result)