From d562847ee5bbf0f41a0b2f94839230c12f5b4602 Mon Sep 17 00:00:00 2001 From: i-robot Date: Fri, 15 Aug 2025 01:40:00 +0000 Subject: [PATCH] ms graph mix compare sync --- .../msprobe/core/common/const.py | 2 + .../msprobe/core/compare/check.py | 2 + .../msprobe/core/compare/compare_cli.py | 56 ++++++++++++++-- .../msprobe/core/compare/utils.py | 6 ++ .../docs/11.accuracy_compare_MindSpore.md | 67 +++++++++++++++++++ .../core_ut/compare/test_acc_compare_check.py | 4 +- .../core_ut/compare/test_acc_compare_utils.py | 26 ++++++- .../test/core_ut/compare/test_cmp_cli.py | 34 ++++++++++ 8 files changed, 191 insertions(+), 6 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_cli.py diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index 10ebcfe20..8b2ab5f08 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -394,6 +394,8 @@ class Const: CONFIG_CHECK_WARNING = "warning" CONFIG_CHECK_ERROR = "error" + MIX_DUMP_NAMES = {'graph', 'pynative'} + class CompareConst: """ diff --git a/debug/accuracy_tools/msprobe/core/compare/check.py b/debug/accuracy_tools/msprobe/core/compare/check.py index a88ddb8f5..acc90ec3d 100644 --- a/debug/accuracy_tools/msprobe/core/compare/check.py +++ b/debug/accuracy_tools/msprobe/core/compare/check.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + from msprobe.core.common.log import logger from msprobe.core.common.utils import check_op_str_pattern_valid, CompareException from msprobe.core.common.const import Const diff --git a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py index 08af3aab6..a796a087d 100644 --- a/debug/accuracy_tools/msprobe/core/compare/compare_cli.py +++ b/debug/accuracy_tools/msprobe/core/compare/compare_cli.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,18 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json +import os + from msprobe.core.common.file_utils import check_file_type, load_json, check_file_or_directory_path from msprobe.core.common.const import FileCheckConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.common.log import logger +from msprobe.core.compare.utils import get_paired_dirs + +def compare_cli(args, depth=1): + if depth > 2: + logger.error("Recursive compare error, depth exceeds 2.") + raise CompareException(CompareException.RECURSION_LIMIT_ERROR) -def compare_cli(args): - input_param = load_json(args.input_path) + if isinstance(args.input_path, dict): # special for dyn-graph mix compare + input_param = args.input_path + else: + input_param = load_json(args.input_path) if not isinstance(input_param, dict): logger.error("input_param should be dict, please check!") raise CompareException(CompareException.INVALID_OBJECT_TYPE_ERROR) + npu_path = input_param.get("npu_path", None) bench_path = input_param.get("bench_path", None) if not npu_path: @@ -33,6 +43,7 @@ def compare_cli(args): if not bench_path: logger.error(f"Missing bench_path in input configuration file, please check!") raise 
CompareException(CompareException.INVALID_PATH_ERROR) + frame_name = args.framework auto_analyze = not args.compare_only @@ -77,6 +88,12 @@ def compare_cli(args): elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR: check_file_or_directory_path(npu_path, isdir=True) check_file_or_directory_path(bench_path, isdir=True) + + if depth == 1: + mix_compare_success = mix_compare(args, input_param, depth) + if mix_compare_success: + return + kwargs = { **common_kwargs, "stack_mode": args.stack_mode, @@ -99,3 +116,34 @@ def compare_cli(args): else: logger.error("The npu_path and bench_path need to be of the same type.") raise CompareException(CompareException.INVALID_COMPARE_MODE) + + +def mix_compare(args, input_param, depth): + npu_path = input_param.get("npu_path", None) + bench_path = input_param.get("bench_path", None) + + npu_bench_same_dirs_set = set(get_paired_dirs(npu_path, bench_path)) + compare_cross_set = npu_bench_same_dirs_set & Const.MIX_DUMP_NAMES + + if compare_cross_set: + logger.info("Start mix compare.") + origin_output = args.output_path + + for folder_name in list(compare_cross_set): + new_npu_path = os.path.join(npu_path, folder_name) + new_bench_path = os.path.join(bench_path, folder_name) + paired_steps = get_paired_dirs(new_npu_path, new_bench_path) + + for step_name in paired_steps: + logger.info(f"[mix compare] Start comparing {folder_name}/{step_name}") + npu_dir = os.path.join(new_npu_path, step_name) + bench_dir = os.path.join(new_bench_path, step_name) + args.input_path = { + "npu_path": npu_dir, + "bench_path": bench_dir, + "is_print_compare_log": input_param.get("is_print_compare_log", True) + } + args.output_path = os.path.join(origin_output, folder_name, step_name) + compare_cli(args, depth + 1) + return True + return False diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index e64c40327..5db2241e6 100644 --- 
a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -629,6 +629,12 @@ def gen_api_batches(result: np.ndarray): return api_batches +def get_paired_dirs(npu_path, bench_path): + npu_dirs = set(os.listdir(npu_path)) + bench_dirs = set(os.listdir(bench_path)) + return list(npu_dirs & bench_dirs) + + def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) diff --git a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md index 15020fdc5..a3b6b64f7 100644 --- a/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/11.accuracy_compare_MindSpore.md @@ -48,6 +48,7 @@ msprobe -f mindspore compare -i ./compare.json -o ./output -s | -cm或--cell_mapping | 跨框架比对。配置该参数时表示开启跨框架cell模块比对功能,可以指定自定义映射文件*.yaml,不指定映射文件时按照msprobe定义的默认映射关系进行比对。自定义映射文件的格式请参见[自定义映射文件(cell_mapping)](#44-自定义映射文件cell_mapping)。仅[跨框架的cell模块比对](#26-跨框架的cell模块比对)场景需要配置。 | 否 | | -dm或--data_mapping | 同框架或跨框架比对。通过映射文件指定两个具体参数的对应关系,可以在L0、L1或mix采集场景下使用。配置该参数的同时需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(data_mapping)](#45-自定义映射文件data_mapping)。 | 否 | | -lm或--layer_mapping | 跨框架比对。配置该参数时表示开启跨框架Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer_mapping)](#46-自定义映射文件layer_mapping)。仅[跨框架的Layer层比对](#27-跨框架的layer层比对)场景需要配置。 | 否 | +| -da或--diff_analyze | 自动识别网络中首差异节点,支持md5、统计量等dump数据。支持单卡/多卡场景。 | 否 | 动态图模式没有填写任何mapping时,按照同框架比对的方式进行比对,比对数据和标杆数据的Cell或Api名称需要完全相同才能匹配得上。 @@ -203,6 +204,61 @@ MindSpore静态图场景比对结果: ![compare_result](./img/save_compare_result_sample.png) 具体字段含义同PyTorch目录下的《[PyTorch 场景的精度比对-精度比对结果分析](./10.accuracy_compare_PyTorch.md#3-精度比对结果分析)》章节。 +### 2.9 动静态图场景L0混合dump数据比对 + +1. 参见 [msprobe工具MindSpore场景精度数据采集指南](./06.data_dump_MindSpore.md),执行dump操作。
动态图场景下使用 `mindspore.jit` 装饰特定 Cell 或 function 时,被装饰的部分会被编译成静态图执行。采集的数据文件目录结构示例如下: + ```lua + ├── graph + │ ├── step0 + │ | ├── rank + │ | │ ├── dump_tensor_data + | | | | ├── Cell.wrap_net.net.Net.forward.0.input.0.npy + | | | | ├── Cell.wrap_net.net.Net.forward.0.output.0.npy + | | | | ... + │ | | ├── dump.json + │ | | ├── stack.json + │ | | └── construct.json + │ ├── ... + ├── pynative + │ ├── step0 + │ | ├── rank + │ | │ ├── dump_tensor_data + | | | | ├── Cell.dense1.Dense.forward.0.input.0.npy + | | | | ├── Cell.dense1.Dense.forward.0.output.0.npy + | | | | ... + │ | | ├── dump.json + │ | | ├── stack.json + │ | | └── construct.json + │ ├── ... + ``` + +2. 创建比对文件,文件内容及示例请参见[比对文件(动静态图场景L0混合数据)](#48-比对文件动静态图场景l0混合数据)。 + +3. 执行如下示例命令进行比对: + + ```shell + msprobe -f mindspore compare -i ./compare.json -o ./output + ``` + +4. 动静态图场景L0混合dump数据比对结果,示例如下: + ```lua + ├── graph + │ ├── step0 + │ | ├── advisor_rank_20250805043414.txt + │ | ├── compare_result_rank_20250805043411.xlsx + ├── pynative + │ ├── step0 + │ | ├── advisor_rank_20250805043416.txt + │ | ├── compare_result_rank_20250805043414.xlsx + ``` + +output目录下生成graph和pynative两个文件夹,每个文件夹下生成对应step的比对结果。 + +5. 
查看比对结果,请详见PyTorch目录下的《[PyTorch 场景的精度比对-精度比对结果分析](./10.accuracy_compare_PyTorch.md#3-精度比对结果分析)》章节。 + +### 2.10 首差异算子节点识别 +参见《[PyTorch 场景的精度比对-首差异算子节点识别](./10.accuracy_compare_PyTorch.md#215-首差异算子节点识别场景)》章节。 + ## 3 多卡比对结果提取汇总通信算子数据 本功能是将多卡比对场景的比对结果,进行通信算子数据提取和汇总,输出整理好的通信算子多卡比对精度表。 @@ -700,3 +756,14 @@ MindSpore静态图场景(不区分单/多卡)示例如下: - `bench_path`表示bench dump文件目录,指定同上。 - `common`表示开启MindSpore静态图单点保存比对,默认关闭。 - `map_dict`可用于当单点保存比对的`npy`文件名称不完全对应时,通过手动指定保证比对正确执行,比对指定名称对应,如{"input": "x"},则`input_float32_1.npy`会对应`x_float32_1.npy`。 + +### 4.8 比对文件(动静态图场景L0混合数据) + ```json +{ +"npu_path": "./npu_dump", +"bench_path": "./bench_dump", +"is_print_compare_log": true +} + ``` +- npu_path表示NPU dump文件目录,上面示例中的 ./npu_dump/ 是npu侧动静态图dump后graph和pynative目录的父目录。 +- bench_path表示Bench dump文件目录,上面示例中的 ./bench_dump/ 是bench侧动静态图dump后graph和pynative目录的父目录。 \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py index fdfd12422..aefcd0f34 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_check.py @@ -1,6 +1,8 @@ # coding=utf-8 import unittest -from msprobe.core.compare.check import check_dump_json_str, check_json_key_value, valid_key_value, check_stack_json_str + +from msprobe.core.compare.check import check_dump_json_str, check_json_key_value, valid_key_value, \ + check_stack_json_str from msprobe.core.common.utils import CompareException diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 9d418ae5e..4b06daf4b 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -6,6 +6,7 @@ import shutil import unittest from 
unittest.mock import patch import zlib +import tempfile import numpy as np @@ -13,7 +14,8 @@ from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, get_rela_diff_summary_mode, merge_tensor, op_item_parse, read_op, result_item_init, \ - stack_column_process, table_value_is_valid, reorder_op_name_list, reorder_op_x_list, gen_op_item, ApiBatch + stack_column_process, table_value_is_valid, reorder_op_name_list, reorder_op_x_list, gen_op_item, ApiBatch, \ + get_paired_dirs # test_read_op_1 op_data = { @@ -910,3 +912,25 @@ class TestApiBatch(unittest.TestCase): self.assertEqual(api_batch.params_end_index, 3) self.assertEqual(api_batch.output_end_index, 5) self.assertEqual(api_batch.params_grad_end_index, 5) + + +class TestGetPairedSteps(unittest.TestCase): + def setUp(self): + self.npu_dir = tempfile.TemporaryDirectory() + self.bench_dir = tempfile.TemporaryDirectory() + + self.npu_files = ['step1', 'step2'] + for name in self.npu_files: + open(os.path.join(self.npu_dir.name, name), 'w').close() + + self.bench_files = ['step2', 'step3'] + for name in self.bench_files: + open(os.path.join(self.bench_dir.name, name), 'w').close() + + def tearDown(self): + self.npu_dir.cleanup() + self.bench_dir.cleanup() + + def test_get_paired_steps(self): + paired = get_paired_dirs(self.npu_dir.name, self.bench_dir.name) + self.assertEqual(set(paired), {'step2'}) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_cli.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_cli.py new file mode 100644 index 000000000..62d546713 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_cli.py @@ -0,0 +1,34 @@ +import unittest +from unittest.mock import patch, MagicMock + +from msprobe.core.compare.compare_cli import mix_compare + + +class 
TestMixCompare(unittest.TestCase): + @patch('msprobe.core.compare.compare_cli.get_paired_dirs') + @patch('msprobe.core.compare.compare_cli.compare_cli') + def test_mix_compare_with_matching_dirs(self, mock_compare_cli, mock_get_paired_dirs): + mock_args = MagicMock() + mock_args.output_path = "/output" + mock_input_param = {"npu_path": "/npu_dump", "bench_path": "/bench_dump", "is_print_compare_log": True} + mock_get_paired_dirs.side_effect = [ + ["graph", "pynative"], # 第一次调用的返回值 + ["step1", "step2"], # 第二次调用的返回值 + ["step1", "step2"] # 第三次调用的返回值 + ] + + result = mix_compare(mock_args, mock_input_param, 1) + + self.assertTrue(result) + + @patch('msprobe.core.compare.compare_cli.get_paired_dirs') + @patch('msprobe.core.compare.compare_cli.compare_cli') + def test_mix_compare_no_matching_dirs(self, mock_compare_cli, mock_get_paired_dirs): + mock_args = MagicMock() + mock_args.output_path = "/output" + mock_input_param = {"npu_path": "/npu_dump", "bench_path": "/bench_dump", "is_print_compare_log": True} + mock_get_paired_dirs.return_value = set() + + result = mix_compare(mock_args, mock_input_param, 1) + + self.assertFalse(result) -- Gitee