From 05e5d5f7490083552c7b7d48d4482444b0316946 Mon Sep 17 00:00:00 2001 From: lcw Date: Tue, 22 Apr 2025 12:00:20 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90bugfix=E3=80=91config=5Fchecking=20?= =?UTF-8?q?=E6=A8=A1=E5=9D=97=E8=BE=93=E5=87=BA=E4=BB=B6=E6=95=B4=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/common/const.py | 3 ++ .../checkers/env_args_checker.py | 40 ++++++++++++++----- .../checkers/hyperparameter_checker.py | 36 +++++++---------- .../pytorch/config_checking/resource/env.yaml | 37 ++++++++++++----- .../config_checking/test_config_checking.py | 7 ++-- 5 files changed, 79 insertions(+), 44 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 2a99e5158ac..f5d0826f09b 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -136,6 +136,7 @@ class Const: NPU = 'NPU' NPU_LOWERCASE = 'npu' CPU_LOWERCASE = 'cpu' + GPU_LOWERCASE = 'gpu' CUDA_LOWERCASE = 'cuda' DEVICE = 'device' DISTRIBUTED = 'Distributed' @@ -339,6 +340,8 @@ class Const: } } + ASCEND = "ASCEND" + class CompareConst: """ diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py index 9eaeb1a0572..a5ed1464cf8 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/env_args_checker.py @@ -22,7 +22,7 @@ from msprobe.core.common.file_utils import load_json, load_yaml, create_file_wit from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker from msprobe.pytorch.config_checking.config_checker import register_checker_item from msprobe.pytorch.config_checking.utils.utils import config_checking_print -from msprobe.core.common.file_utils import save_excel +from msprobe.core.common.const import Const dirpath = os.path.dirname(__file__) @@ -36,21 +36,41 @@ def collect_env_data(): return result +def get_device_type(env_json): + for key in env_json.keys(): + if Const.ASCEND in key: + return Const.NPU_LOWERCASE + return Const.GPU_LOWERCASE + + def compare_env_data(npu_path, bench_path): necessary_env = load_yaml(env_yaml_path) - npu_data = load_json(npu_path) + cmp_data = load_json(npu_path) + cpm_type = get_device_type(cmp_data) bench_data = load_json(bench_path) + bench_type = get_device_type(bench_data) data = [] for _, value in necessary_env.items(): - npu_env_name = value[0]["name"] - npu_value = npu_data.get(npu_env_name) if npu_data.get(npu_env_name) else value[0]["default_value"] - if len(value) == 1: - data.append([npu_env_name, "only npu has this env", npu_value, "", "warning"]) + cmp_env = value.get(cpm_type) + bench_env = value.get(bench_type) + if not bench_env and not cmp_env: continue - bench_env_name = value[1]["name"] - bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[1]["default_value"] - if npu_value != bench_value: - data.append([npu_env_name, bench_env_name, npu_value, bench_value, "error"]) + elif cmp_env: + cmp_env_name = cmp_env["name"] + cmp_value = cmp_data.get(cmp_env_name) if cmp_data.get(cmp_env_name) else value[cpm_type]["default_value"] + if not bench_env: + data.append(["only cmp has this env", cmp_env["name"], "", cmp_value, "warning"]) + continue + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][ + "default_value"] + if cmp_value != bench_value: + data.append([bench_env_name, cmp_env_name, bench_value, cmp_value, "error"]) + else: + bench_env_name = bench_env["name"] + bench_value = bench_data.get(bench_env_name) if bench_data.get(bench_env_name) else value[bench_type][ + "default_value"] + data.append([bench_env_name, "only bench has this env", bench_value, "", "warning"]) df = pd.DataFrame(data, columns=EnvArgsChecker.result_header) return df diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py index 9ac1cd61fc5..90260fc46f7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/checkers/hyperparameter_checker.py @@ -20,6 +20,7 @@ import tempfile from difflib import SequenceMatcher from typing import Union, List, Dict, Any +import pandas as pd from msprobe.pytorch.config_checking.checkers.base_checker import BaseChecker from msprobe.pytorch.config_checking.config_checker import register_checker_item @@ -42,6 +43,8 @@ class HyperparameterChecker(BaseChecker): "dropout_rate": ["dropout", "drop_rate"], } + result_header = ["file", "bench_para", "cmp_para", "bench_value", "cmp_value", "matched_with", "level"] + @staticmethod def pack(pack_input): shell_path = pack_input.shell_path @@ -82,43 +85,32 @@ class HyperparameterChecker(BaseChecker): for filename in all_files: bench_params = bench_hyperparameters.get(filename, {}) cmp_params = cmp_hyperparameters.get(filename, {}) - if bench_params and cmp_params: all_diffs.extend(HyperparameterChecker.compare_param(bench_params, cmp_params, filename)) - - elif bench_params is not None: - all_diffs.append(f"[Only in benchmark] File: {filename}") - else: - all_diffs.append(f"[Only in compare] File: {filename}") - return HyperparameterChecker.target_name_in_zip, True, None + df = pd.DataFrame(all_diffs, columns=HyperparameterChecker.result_header) + pass_check = "error" not in df['level'].values + return HyperparameterChecker.target_name_in_zip, pass_check, df @staticmethod def compare_param(bench_params, cmp_params, filename): all_diffs = [] - file_diffs = [] bench_param_names = bench_params.keys() for bench_param_name in bench_param_names: matched_cmp_param_name = HyperparameterChecker._fuzzy_match_parameter(bench_param_name, cmp_params) + bench_param_value = bench_params[bench_param_name] if matched_cmp_param_name: - bench_param_value = bench_params[bench_param_name] cmp_param_value = cmp_params[matched_cmp_param_name] if bench_param_value != cmp_param_value: - diff = compare_dict({bench_param_name: bench_param_value}, - {matched_cmp_param_name: cmp_param_value}) - if diff: - file_diffs.extend( - [f" Parameter '{bench_param_name}' (matched with '{matched_cmp_param_name}'): {d}" - for d in diff]) + all_diffs.append( + [filename, bench_param_name, matched_cmp_param_name, bench_param_value, cmp_param_value, + matched_cmp_param_name, "error"]) del cmp_params[matched_cmp_param_name] else: - file_diffs.append( - f" [Only in benchmark] Parameter: '{bench_param_name}': {bench_params[bench_param_name]}") + all_diffs.append( + [filename, bench_param_name, "Only in benchmark", bench_param_value, "", "", "warning"]) for cmp_param_name, cmp_param_value in cmp_params.items(): - file_diffs.append(f" [Only in compare] Parameter: '{cmp_param_name}': {cmp_param_value}") - if file_diffs: - file_diffs.sort() - all_diffs.append(f"File: {filename}") - all_diffs.extend(file_diffs) + all_diffs.append([filename, "Only in comparison", cmp_param_name, "", cmp_param_value, "", "warning"]) + all_diffs.sort() return all_diffs @staticmethod diff --git a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml b/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml index 13ea0e39f89..87d663b9d94 100644 --- a/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml +++ b/debug/accuracy_tools/msprobe/pytorch/config_checking/resource/env.yaml @@ -14,25 +14,44 @@ # limitations under the License. HCCL_DETERMINISTIC: - - name: HCCL_DETERMINISTIC + npu: + name: HCCL_DETERMINISTIC + default_value: False + gpu: + name: NCCL_DETERMINISTIC default_value: False -HCCL_ALGO: - - name: HCCL_ALGO +HCCL_ALGO: + npu: + name: HCCL_ALGO + default_value: None + gpu: + name: NCCL_ALGO default_value: None HCCL_INTRA_ROCE_ENABLE: - - name: HCCL_INTRA_ROCE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 0 + HCCL_INTRA_PICE_ENABLE: - - name: HCCL_INTRA_PICE_ENABLE + npu: + name: HCCL_INTRA_ROCE_ENABLE default_value: 1 ASCEND_LAUNCH_BLOCKING: - - name: ASCEND_LAUNCH_BLOCKING - default_value: False + npu: + name: ASCEND_LAUNCH_BLOCKING + default_value: 0 + gpu: + name: CUDA_LAUNCH_BLOCKING + default_value: 0 -ASCEND_RT_VISIBLE_DEVICE: - - name: ASCEND_RT_VISIBLE_DEVICE +ASCEND_RT_VISIBLE_DEVICES: + npu: + name: ASCEND_RT_VISIBLE_DEVICES + default_value: None + gpu: + name: CUDA_VISIBLE_DEVICES default_value: None \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py index 27b6b6e4364..555d7a7988b 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/config_checking/test_config_checking.py @@ -56,11 +56,12 @@ def get_test_model(): @unittest.mock.patch("msprobe.pytorch.config_checking.checkers.pip_checker.collect_pip_data") @unittest.mock.patch("msprobe.pytorch.config_checking.checkers.env_args_checker.collect_env_data") def train_test(seed, output_zip_path, shell_path, mock_env, mock_pip): - mock_env.return_value = {"HCCL_DETERMINISTIC": False} if seed == 1234: mock_pip.return_value = "transformers=0.0.1" + mock_env.return_value = {"NCCL_DETERMINISTIC": True} else: mock_pip.return_value = "transformers=0.0.2" + mock_env.return_value = {"HCCL_DETERMINISTIC": False, "ASCEND_LAUNCH_BLOCKING": 1} seed_all(seed) loss_fun = nn.CrossEntropyLoss() @@ -104,11 +105,11 @@ class TestConfigChecker(unittest.TestCase): total_check_result = read_xlsx(os.path.join(compare_output_dir, ConfigChecker.result_filename)) self.assertEqual(total_check_result.columns.tolist(), ConfigChecker.result_header) target_total_check_result = [ - ['env', True], + ['env', False], ['pip', False], ['dataset', False], ['weights', False], - ['hyperparameters', True], + ['hyperparameters', False], ['random', False] ] self.assertEqual(total_check_result.values.tolist(), target_total_check_result) -- Gitee