diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 9c659a22df65cc4aa8b2843a950687e5e5fd82aa..b75948b49e29402362a39d036db88b2d1a0f64bc 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -400,6 +400,8 @@ class Const: CONFIG_CHECK_WARNING = "warning" CONFIG_CHECK_ERROR = "error" + MIX_DUMP_NAMES = {'graph', 'pynative'} + class CompareConst: """ diff --git a/debug/accuracy_tools/msprobe/core/compare/check.py b/debug/accuracy_tools/msprobe/core/compare/check.py index a88ddb8f5e088a9f72ef2d2b721b03dbc539c385..acc90ec3d61a1bb3a772a1fd6760889d80beab60 100644 --- a/debug/accuracy_tools/msprobe/core/compare/check.py +++ b/debug/accuracy_tools/msprobe/core/compare/check.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + from msprobe.core.common.log import logger from msprobe.core.common.utils import check_op_str_pattern_valid, CompareException from msprobe.core.common.const import Const diff --git a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py index c5534bdce40666c69340bc68b9291c2cbfbfa6d8..991978250b133710ce893118403fc86727b354a2 100644 --- a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py +++ b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
def mix_compare(args, input_param, depth):
    """Compare a dynamic/static-graph mixed dump, one ``<folder>/<step>`` pair at a time.

    The names returned by ``get_paired_dirs(npu_path, bench_path)`` are intersected
    with ``Const.MIX_DUMP_NAMES``. For each remaining folder, every step name that
    ``get_paired_dirs`` reports for both sides is compared by calling
    ``compare_cli(args, depth + 1)``, with the results directed to
    ``<args.output_path>/<folder>/<step>``.

    Args:
        args: Parsed command-line namespace. ``input_path`` and ``output_path`` are
            overridden for each nested comparison and restored to their original
            values before this function returns or propagates an exception.
        input_param: Dict providing ``npu_path`` and ``bench_path`` and, optionally,
            ``is_print_compare_log`` (treated as ``True`` when absent).
        depth: Recursion depth of the calling ``compare_cli``.

    Returns:
        bool: ``True`` when at least one mixed-dump folder is shared by both paths
        (the nested comparisons have then been run), ``False`` otherwise.
    """
    npu_path = input_param.get("npu_path", None)
    bench_path = input_param.get("bench_path", None)

    mix_folders = set(get_paired_dirs(npu_path, bench_path)) & Const.MIX_DUMP_NAMES
    if not mix_folders:
        return False

    logger.info("Start mix compare.")
    origin_input = args.input_path
    origin_output = args.output_path
    print_compare_log = input_param.get("is_print_compare_log", True)

    try:
        # Neither a set nor a directory listing guarantees an order; sort so that
        # the comparison sequence (and its log output) is reproducible.
        for folder_name in sorted(mix_folders):
            new_npu_path = os.path.join(npu_path, folder_name)
            new_bench_path = os.path.join(bench_path, folder_name)

            for step_name in sorted(get_paired_dirs(new_npu_path, new_bench_path)):
                logger.info(f"[mix compare] Start comparing {folder_name}/{step_name}")
                # compare_cli accepts a ready-made dict in place of a json file path.
                args.input_path = {
                    "npu_path": os.path.join(new_npu_path, step_name),
                    "bench_path": os.path.join(new_bench_path, step_name),
                    "is_print_compare_log": print_compare_log
                }
                args.output_path = os.path.join(origin_output, folder_name, step_name)
                compare_cli(args, depth + 1)
    finally:
        # The namespace belongs to the caller: do not leave it pointing at the
        # last nested step, even if one of the nested comparisons raised.
        args.input_path = origin_input
        args.output_path = origin_output
    return True
a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md @@ -204,7 +204,59 @@ MindSpore静态图场景比对结果: ![compare_result](./img/save_compare_result_sample.png) 具体字段含义同PyTorch目录下的《[PyTorch 场景的精度比对-精度比对结果分析](./10.accuracy_compare_PyTorch.md#3-精度比对结果分析)》章节。 -### 2.9 首差异算子节点识别 +### 2.9 动静态图场景L0混合dump数据比对 + +1. 参见 [msprobe工具MindSpore场景精度数据采集指南](./06.data_dump_MindSpore.md),执行dump操作。
动态图场景下使用 `mindspore.jit` 装饰特定 Cell 或 function 时,被装饰的部分会被编译成静态图执行。采集的数据文件目录结构示例如下: + ```lua + ├── graph + │ ├── step0 + │ | ├── rank + │ | │ ├── dump_tensor_data + | | | | ├── Cell.wrap_net.net.Net.forward.0.input.0.npy + | | | | ├── Cell.wrap_net.net.Net.forward.0.output.0.npy + | | | | ... + │ | | ├── dump.json + │ | | ├── stack.json + │ | | └── construct.json + │ ├── ... + ├── pynative + │ ├── step0 + │ | ├── rank + │ | │ ├── dump_tensor_data + | | | | ├── Cell.dense1.Dense.forward.0.input.0.npy + | | | | ├── Cell.dense1.Dense.forward.0.output.0.npy + | | | | ... + │ | | ├── dump.json + │ | | ├── stack.json + │ | | └── construct.json + │ ├── ... + ``` + +2. 创建比对文件,文件内容及示例请参见[比对文件(动静态图场景L0混合数据)](#48-比对文件动静态图场景l0混合数据)。 + +3. 执行如下示例命令进行比对: + + ```shell + msprobe -f mindspore compare -i ./compare.json -o ./output + ``` + +4. 动静态图场景L0混合dump数据比对结果,示例如下: + ```lua + ├── graph + │ ├── step0 + │ | ├── advisor_rank_20250805043414.txt + │ | ├── compare_result_rank_20250805043411.xlsx + ├── pynative + │ ├── step0 + │ | ├── advisor_rank_20250805043416.txt + │ | ├── compare_result_rank_20250805043414.xlsx + ``` + +output目录下生成graph和pynative两个文件夹,每个文件夹下生成对应step的比对结果。 + +5. 
查看比对结果,请详见PyTorch目录下的《[PyTorch 场景的精度比对-精度比对结果分析](./10.accuracy_compare_PyTorch.md#3-精度比对结果分析)》章节。 + +### 2.10 首差异算子节点识别 参见《[PyTorch 场景的精度比对-首差异算子节点识别](./10.accuracy_compare_PyTorch.md#215-首差异算子节点识别场景)》章节。 ## 3 多卡比对结果提取汇总通信算子数据 @@ -704,3 +756,14 @@ MindSpore静态图场景(不区分单/多卡)示例如下: - `bench_path`表示bench dump文件目录,指定同上。 - `common`表示开启MindSpore静态图单点保存比对,默认关闭。 - `map_dict`可用于当单点保存比对的`npy`文件名称不完全对应时,通过手动指定保证比对正确执行,比对指定名称对应,如{"input": "x"},则`input_float32_1.npy`会对应`x_float32_1.npy`。 + +### 4.8 比对文件(动静态图场景L0混合数据) + ```json +{ +"npu_path": "./npu_dump", +"bench_path": "./bench_dump", +"is_print_compare_log": true +} + ``` +- npu_path表示NPU dump文件目录,上面示例中的 ./npu_dump/ 是npu侧动静态图dump后graph和pynative目录的父目录。 +- bench_path表示Bench dump文件目录,上面示例中的 ./bench_dump/ 是bench侧动静态图dump后graph和pynative目录的父目录。 \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py index 1a0a33f799724ffefe73bf8f024e0146b2925464..58d81209b1db505d599e7430a955cf01a8d9c234 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py @@ -1,6 +1,8 @@ # coding=utf-8 import unittest -from msprobe.core.compare.check import check_dump_json_str, check_json_key_value, valid_key_value, check_stack_json_str + +from msprobe.core.compare.check import check_dump_json_str, check_json_key_value, valid_key_value, \ + check_stack_json_str from msprobe.core.common.utils import CompareException diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 9d418ae5e4e7de74ba81216325a69ee057441236..4b06daf4b5343566aba5237784639758fb9646f9 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ 
class TestGetPairedSteps(unittest.TestCase):
    """Check that get_paired_dirs returns exactly the names present under both roots."""

    def setUp(self):
        self.npu_dir = tempfile.TemporaryDirectory()
        # Register cleanup right away: tearDown is not run when setUp itself fails,
        # which would otherwise leak the directory created above.
        self.addCleanup(self.npu_dir.cleanup)
        self.bench_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.bench_dir.cleanup)

        # Create step *directories*, matching how the helper is used on dump folders.
        self.npu_files = ['step1', 'step2']
        for name in self.npu_files:
            os.mkdir(os.path.join(self.npu_dir.name, name))

        self.bench_files = ['step2', 'step3']
        for name in self.bench_files:
            os.mkdir(os.path.join(self.bench_dir.name, name))

    def test_get_paired_steps(self):
        paired = get_paired_dirs(self.npu_dir.name, self.bench_dir.name)
        # Order-insensitive, but unlike a set comparison it also rejects duplicates.
        self.assertCountEqual(paired, ['step2'])
class TestMixCompare(unittest.TestCase):
    """Unit tests for mix_compare with directory pairing and the nested compare mocked out."""

    @patch('msprobe.core.compare.compare_cli.get_paired_dirs')
    @patch('msprobe.core.compare.compare_cli.compare_cli')
    def test_mix_compare_with_matching_dirs(self, mock_compare_cli, mock_get_paired_dirs):
        mock_args = MagicMock()
        mock_args.output_path = "/output"
        mock_input_param = {"npu_path": "/npu_dump", "bench_path": "/bench_dump", "is_print_compare_log": True}
        mock_get_paired_dirs.side_effect = [
            ["graph", "pynative"],  # first call: entries shared by the two dump roots
            ["step1", "step2"],  # second call: steps shared under one mixed-dump folder
            ["step1", "step2"]  # third call: steps shared under the other folder
        ]

        dispatched = []

        def record_call(args, depth):
            # The same args object is mutated between calls, so capture the
            # per-call output path now; normalise separators for portability.
            dispatched.append((args.output_path.replace("\\", "/"), depth))

        mock_compare_cli.side_effect = record_call

        result = mix_compare(mock_args, mock_input_param, 1)

        self.assertTrue(result)
        self.assertEqual(mock_get_paired_dirs.call_count, 3)
        # One nested compare per (folder, step) pair, each one level deeper.
        self.assertCountEqual(dispatched, [
            ("/output/graph/step1", 2),
            ("/output/graph/step2", 2),
            ("/output/pynative/step1", 2),
            ("/output/pynative/step2", 2),
        ])

    @patch('msprobe.core.compare.compare_cli.get_paired_dirs')
    @patch('msprobe.core.compare.compare_cli.compare_cli')
    def test_mix_compare_no_matching_dirs(self, mock_compare_cli, mock_get_paired_dirs):
        mock_args = MagicMock()
        mock_args.output_path = "/output"
        mock_input_param = {"npu_path": "/npu_dump", "bench_path": "/bench_dump", "is_print_compare_log": True}
        mock_get_paired_dirs.return_value = set()

        result = mix_compare(mock_args, mock_input_param, 1)

        self.assertFalse(result)
        # Without a shared mixed-dump folder no nested comparison may be started.
        mock_compare_cli.assert_not_called()