From fa7a8090ac9ba98e522191aefc0eda6775533301 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 11:03:26 +0800 Subject: [PATCH 01/37] compare read data read improve --- .../msprobe/core/common/const.py | 3 + .../msprobe/core/common/file_utils.py | 53 ++++++++++++-- .../msprobe/core/compare/acc_compare.py | 25 +++---- .../msprobe/mindspore/compare/ms_compare.py | 16 +---- .../msprobe/pytorch/compare/pt_compare.py | 33 +-------- .../test/core_ut/common/test_file_utils.py | 72 ++++++++++++++++++- .../mindspore_ut/compare/test_ms_compare.py | 27 ------- .../pytorch_ut/compare/test_pt_compare.py | 35 +-------- 8 files changed, 138 insertions(+), 126 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index d9623b8071..aa203923c8 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -484,6 +484,9 @@ class CompareConst: INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml' UNREADABLE = 'unreadable data' + CMP_PT_FRAMENAME = 'PTComparator' + CMP_MS_FRAMENAME = 'MSComparator' + class FileCheckConst: """ diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index fdc626ca6a..c153560311 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,12 +23,15 @@ import shutil from datetime import datetime, timezone from dateutil import parser import yaml +import torch import numpy as np import pandas as pd from msprobe.core.common.log import logger from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.const import FileCheckConst +from msprobe.core.common.const import FileCheckConst, CompareConst +from msprobe.core.common.utils import CompareException +from msprobe.pytorch.common.utils import load_pt class FileChecker: @@ -446,8 +449,6 @@ def save_excel(path, data): change_mode(path, FileCheckConst.DATA_FILE_AUTHORITY) - - def move_file(src_path, dst_path): check_file_or_directory_path(src_path) check_path_before_create(dst_path) @@ -671,3 +672,47 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df + + +def read_real_data(dir_path, file_name, frame_name): + """ + 用于比对。 + 根据比对的类名读取真实数据,比对类名为PTComparator,MSComparator。 + 输出均为ndarray。 + PTComparator时读取.pt文件,MSComparator时读取.npy文件。 + """ + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + + if frame_name == CompareConst.CMP_PT_FRAMENAME: + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value + + elif frame_name == CompareConst.CMP_MS_FRAMENAME: + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value + + else: + logger.error(f"Wrong frame_name:{frame_name} when read real data in compare, please check!") + raise CompareException(CompareException.INVALID_FILE_ERROR) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 55229d7265..56b1364f0b 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -24,7 +24,7 @@ from tqdm import tqdm from msprobe.core.advisor.advisor import Advisor from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import load_json, remove_path +from msprobe.core.common.file_utils import load_json, remove_path, read_real_data from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, add_time_with_xlsx, check_op_str_pattern_valid, safe_get_value from msprobe.core.compare.check import check_dump_json_str, check_graph_mode, check_stack_json_str, \ @@ -363,27 +363,28 @@ class Comparator: npu_bench_name_list = op_name_mapping_dict[npu_op_name] data_name = safe_get_value(npu_bench_name_list, 1, "npu_bench_name_list") error_file, relative_err, error_flag = None, None, False - bench_data_name = get_bench_data_name(bench_op_name, bench_data) - if data_name == '-1' or data_name == -1: # 没有真实数据路径 + bench_file_name = get_bench_data_name(bench_op_name, bench_data) + if str(data_name) == '-1': # 没有真实数据路径 n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True - elif not bench_data_name: + elif not bench_file_name: n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True error_file = 'no_bench_data' else: + npu_dir = input_param.get("npu_dump_data_dir") + bench_dir = input_param.get("bench_dump_data_dir") + npu_file_name = npu_op_name + Const.NUMPY_SUFFIX try: - read_npy_data = getattr(self, "read_npy_data") frame_name = getattr(self, "frame_name") - if frame_name == "MSComparator": - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.NUMPY_SUFFIX) + if frame_name == CompareConst.CMP_MS_FRAMENAME: + n_value = read_real_data(npu_dir, npu_file_name, CompareConst.CMP_MS_FRAMENAME) if self.cross_frame: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name, - load_pt_file=True) + b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_PT_FRAMENAME) else: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_MS_FRAMENAME) else: - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.PT_SUFFIX) - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + n_value = read_real_data(npu_dir, npu_op_name + Const.PT_SUFFIX, CompareConst.CMP_PT_FRAMENAME) + b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_PT_FRAMENAME) except IOError as error: error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index 8509a7f38a..db62c37de5 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -22,7 +22,7 @@ import pandas as pd from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_npy, load_yaml +from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_yaml from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \ check_op_str_pattern_valid, get_dump_mode, set_dump_path @@ -202,20 +202,6 @@ class MSComparator(Comparator): npu_op_name = npu_op_name.replace(cell_name, self.cell_mapping_dict[cell_name], 1) return npu_op_name - def read_npy_data(self, dir_path, file_name, load_pt_file=False): - if not file_name: - return None - data_path = os.path.join(dir_path, file_name) - if load_pt_file: - import torch - from msprobe.pytorch.common.utils import load_pt - data_value = load_pt(data_path, True).detach() - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - else: - data_value = load_npy(data_path) - return data_value def process_internal_api_mapping(self, npu_op_name): # get api name & class name from op_name diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 308a82b3d6..7595c866bf 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,19 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os.path - -import torch - -from msprobe.core.common.const import FileCheckConst from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml +from msprobe.core.common.file_utils import create_directory, load_yaml from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ set_dump_path from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.utils import set_stack_json_path from msprobe.pytorch.common.log import logger -from msprobe.pytorch.common.utils import load_pt class PTComparator(Comparator): @@ -55,29 +49,6 @@ class PTComparator(Comparator): mapping_dict = {} return mapping_dict - def read_npy_data(self, dir_path, file_name): - if not file_name: - return None - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value - def compare(input_param, output_path, **kwargs): try: diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index 9ed13f78ae..06957f80b8 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,7 +1,6 @@ +import unittest from unittest.mock import patch, mock_open, MagicMock -import numpy as np -import pandas as pd import pytest from msprobe.core.common.file_utils import * @@ -533,4 +532,71 @@ class TestDirectoryChecks: # Test file path check_file_or_directory_path(self.test_file, isdir=False) # Test directory path - check_file_or_directory_path(self.test_dir, isdir=True) \ No newline at end of file + check_file_or_directory_path(self.test_dir, isdir=True) + + +class TestReadRealData(unittest.TestCase): + + @patch('load_pt') + @patch('FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_real_data_pt(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') + + mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('load_npy') + @patch('FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') + + mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.txt') + @patch('FileChecker') + def test_read_real_data_invalid_framework(self, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.txt' + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.txt', 'InvalidComparator') + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('FileChecker') + @patch('load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') + + @patch('os.path.join', return_value='/fake/path/to/file.npy') + @patch('FileChecker') + @patch('load_npy') + def test_read_real_data_npy_exception(self, mock_load_npy, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.side_effect = IOError("Test Error") + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py index b5cbff9784..0edb55154e 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py @@ -7,7 +7,6 @@ import tempfile import unittest import numpy as np -import torch import yaml from msprobe.core.common.utils import CompareException @@ -466,32 +465,6 @@ class TestUtilsMethods(unittest.TestCase): npu_op_name = ms_comparator.process_cell_mapping(npu_cell_dict.get('op_name')[0]) self.assertEqual(npu_op_name, 'Module.fc1.Linear.forward.0.input.0') - def test_read_npy_data(self): - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - mapping_config = MappingConfig() - - ms_comparator = MSComparator(mode_config, mapping_config) - - self.temp_file = tempfile.NamedTemporaryFile(suffix='.pt') - tensor = torch.Tensor([1, 2, 3]) - filename = self.temp_file.name.split('/')[-1] - torch.save(tensor, self.temp_file.name) - result = ms_comparator.read_npy_data('/tmp', filename, load_pt_file=True) - self.assertTrue(np.array_equal(result, np.array([1, 2, 3]))) - self.temp_file.close() - - self.temp_file = tempfile.NamedTemporaryFile(suffix='.npy') - tensor = np.array([1, 2, 3]) - filename = self.temp_file.name.split('/')[-1] - np.save(self.temp_file.name, tensor) - result = ms_comparator.read_npy_data('/tmp', filename, load_pt_file=False) - self.assertTrue(np.array_equal(result, np.array([1, 2, 3]))) - self.temp_file.close() def test_process_internal_api_mapping(self): stack_mode = True diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py index b079e646c4..4eda1d6d97 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py @@ -3,13 +3,10 @@ import os import shutil import unittest -import numpy as np import torch -from msprobe.core.common.const import Const from msprobe.core.common.utils import CompareException -from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.pytorch.compare.pt_compare import PTComparator, compare +from msprobe.pytorch.compare.pt_compare import compare from msprobe.test.core_ut.compare.test_acc_compare import generate_dump_json, generate_stack_json @@ -40,36 +37,6 @@ class TestUtilsMethods(unittest.TestCase): if os.path.exists(base_dir2): shutil.rmtree(base_dir2) - def test_read_npy_data_bf16(self): - generate_bf16_pt(base_dir1) - - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - - pt_comparator = PTComparator(mode_config) - result = pt_comparator.read_npy_data(base_dir1, 'bf16.pt') - - target_result = torch.tensor([1, 2, 3, 4], dtype=torch.float32).numpy() - self.assertTrue(np.array_equal(result, target_result)) - - def test_read_npy_data_dict(self): - generate_dict_pt(base_dir1) - - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - - pt_comparator = PTComparator(mode_config) - - with self.assertRaises(CompareException) as context: - result = pt_comparator.read_npy_data(base_dir1, 'dict.pt') - self.assertEqual(context.exception.code, CompareException.DETACH_ERROR) - def test_compare(self): generate_dump_json(base_dir2) generate_stack_json(base_dir2) -- Gitee From 505904aefaf67d83c550e0f936ecf3ba80443dd0 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 11:36:34 +0800 Subject: [PATCH 02/37] compare read data read improve --- .../msprobe/core/common/file_utils.py | 49 +----------- .../msprobe/core/compare/acc_compare.py | 4 +- .../msprobe/core/compare/utils.py | 47 +++++++++++- .../test/core_ut/common/test_file_utils.py | 67 ----------------- .../core_ut/compare/test_acc_compare_utils.py | 74 ++++++++++++++++++- 5 files changed, 120 insertions(+), 121 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index c153560311..35b7f2e965 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -23,15 +23,12 @@ import shutil from datetime import datetime, timezone from dateutil import parser import yaml -import torch import numpy as np import pandas as pd from msprobe.core.common.log import logger from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.const import FileCheckConst, CompareConst -from msprobe.core.common.utils import CompareException -from msprobe.pytorch.common.utils import load_pt +from msprobe.core.common.const import FileCheckConst class FileChecker: @@ -672,47 +669,3 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df - - -def read_real_data(dir_path, file_name, frame_name): - """ - 用于比对。 - 根据比对的类名读取真实数据,比对类名为PTComparator,MSComparator。 - 输出均为ndarray。 - PTComparator时读取.pt文件,MSComparator时读取.npy文件。 - """ - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - - if frame_name == CompareConst.CMP_PT_FRAMENAME: - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value - - elif frame_name == CompareConst.CMP_MS_FRAMENAME: - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) - data_path = path_checker.common_check() - data_value = load_npy(data_path) - return data_value - - else: - logger.error(f"Wrong frame_name:{frame_name} when read real data in compare, please check!") - raise CompareException(CompareException.INVALID_FILE_ERROR) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 56b1364f0b..0d8c1d492c 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -24,7 +24,7 @@ from tqdm import tqdm from msprobe.core.advisor.advisor import Advisor from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import load_json, remove_path, read_real_data +from msprobe.core.common.file_utils import load_json, remove_path from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, add_time_with_xlsx, check_op_str_pattern_valid, safe_get_value from msprobe.core.compare.check import check_dump_json_str, check_graph_mode, check_stack_json_str, \ @@ -33,7 +33,7 @@ from msprobe.core.compare.highlight import find_compare_result_error_rows, highl from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _handle_multi_process, _save_cmp_result from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ - print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list + print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list, read_real_data class ModeConfig: diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index a2edf57e5b..46915a36aa 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -19,11 +19,12 @@ import math import zlib from dataclasses import dataclass +import torch import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path +from msprobe.core.common.file_utils import check_file_or_directory_path, load_pt, load_npy, FileChecker def extract_json(dirname, stack_json=False): @@ -597,6 +598,50 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder +def read_real_data(dir_path, file_name, frame_name): + """ + 用于比对。 + 根据比对的类名读取真实数据,比对类名为PTComparator,MSComparator。 + 输出均为ndarray。 + PTComparator时读取.pt文件,MSComparator时读取.npy文件。 + """ + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + + if frame_name == CompareConst.CMP_PT_FRAMENAME: + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value + + elif frame_name == CompareConst.CMP_MS_FRAMENAME: + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value + + else: + logger.error(f"Wrong frame_name:{frame_name} when read real data in compare, please check!") + raise CompareException(CompareException.INVALID_FILE_ERROR) + + def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index 06957f80b8..c4de285c7a 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -533,70 +533,3 @@ class TestDirectoryChecks: check_file_or_directory_path(self.test_file, isdir=False) # Test directory path check_file_or_directory_path(self.test_dir, isdir=True) - - -class TestReadRealData(unittest.TestCase): - - @patch('load_pt') - @patch('FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.pt') - def test_read_real_data_pt(self, mock_os, mock_file_checker, mock_load_pt): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_tensor = MagicMock() - mock_tensor.detach.return_value = mock_tensor - mock_tensor.to.return_value = mock_tensor - mock_tensor.dtype = torch.bfloat16 - mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) - mock_load_pt.return_value = mock_tensor - - result = read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') - - mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) - mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) - mock_tensor.to.assert_called_once_with(torch.float32) - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('load_npy') - @patch('FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.npy') - def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) - - result = read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') - - mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) - mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('os.path.join', return_value='/fake/path/to/file.txt') - @patch('FileChecker') - def test_read_real_data_invalid_framework(self, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.txt' - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.txt', 'InvalidComparator') - - @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('FileChecker') - @patch('load_pt') - def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_load_pt.side_effect = RuntimeError("Test Error") - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') - - @patch('os.path.join', return_value='/fake/path/to/file.npy') - @patch('FileChecker') - @patch('load_npy') - def test_read_real_data_npy_exception(self, mock_load_npy, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.side_effect = IOError("Test Error") - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index ab8703dcd3..113156f64b 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -4,17 +4,18 @@ import json import os import shutil import unittest -from unittest.mock import patch +from unittest.mock import patch, MagicMock import zlib +import torch import numpy as np -from msprobe.core.common.const import CompareConst, Const +from msprobe.core.common.const import CompareConst, Const, FileCheckConst from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \ - table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item + table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_real_data # test_read_op_1 op_data = { @@ -848,3 +849,70 @@ class TestGenOpItem(unittest.TestCase): expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}" self.assertEqual(result['md5'], expected_md5) + + +class TestReadRealData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_real_data_pt(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') + + mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') + + mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.txt') + @patch('msprobe.core.compare.utils.FileChecker') + def test_read_real_data_invalid_framework(self, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.txt' + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.txt', 'InvalidComparator') + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.core.compare.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') + + @patch('os.path.join', return_value='/fake/path/to/file.npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.core.compare.utils.load_npy') + def test_read_real_data_npy_exception(self, mock_load_npy, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.side_effect = IOError("Test Error") + + with self.assertRaises(CompareException): + read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') -- Gitee From 7ac1150df344a5b81b09f9b11285b8b6b917a43d Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:04:00 +0800 Subject: [PATCH 03/37] compare read data read improve --- debug/accuracy_tools/msprobe/core/compare/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 46915a36aa..b8653d0292 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -24,7 +24,8 @@ import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path, load_pt, load_npy, FileChecker +from msprobe.core.common.file_utils import check_file_or_directory_path, load_npy, FileChecker +from msprobe.pytorch.common.utils import load_pt def extract_json(dirname, stack_json=False): -- Gitee From 6b0d3ce5e1add27efe87a2095105b2d6b00d9c96 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:43:54 +0800 Subject: [PATCH 04/37] compare read data read improve --- debug/accuracy_tools/msprobe/core/compare/utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index b8653d0292..ada4c7c967 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -25,7 +25,6 @@ import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value from msprobe.core.common.file_utils import check_file_or_directory_path, load_npy, FileChecker -from msprobe.pytorch.common.utils import load_pt def extract_json(dirname, stack_json=False): @@ -599,6 +598,19 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder +def load_pt(pt_path, to_cpu=False): + pt_path = os.path.realpath(pt_path) + check_file_or_directory_path(pt_path) + try: + if to_cpu: + pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) + else: + pt = torch.load(pt_path, weights_only=True) + except Exception as e: + raise RuntimeError(f"load pt file {pt_path} failed") from e + return pt + + def read_real_data(dir_path, file_name, frame_name): """ 用于比对。 -- Gitee From 518ff4f5870201977d88e641ae387877bb63e1f1 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:47:50 +0800 Subject: [PATCH 05/37] compare read data read improve --- .../msprobe/test/core_ut/compare/test_acc_compare_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 113156f64b..9b4a11ca0c 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -868,7 +868,7 @@ class TestReadRealData(unittest.TestCase): result = read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') - mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) mock_tensor.to.assert_called_once_with(torch.float32) self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) @@ -883,7 +883,7 @@ class TestReadRealData(unittest.TestCase): result = read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') - mock_file_checker.assert_called_once_with('/fake/dir', 'file_name.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) -- Gitee From 9bf733231a57909c8efd233cc0e06cbd9789ad00 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:52:30 +0800 Subject: [PATCH 06/37] compare read data read improve --- .../msprobe/core/common/file_utils.py | 14 ++++++++++++ .../msprobe/core/compare/utils.py | 15 +------------ .../msprobe/pytorch/common/utils.py | 22 +++++++++---------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 35b7f2e965..4994e8e27e 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -22,6 +22,7 @@ import re import shutil from datetime import datetime, timezone from dateutil import parser +import torch import yaml import numpy as np import pandas as pd @@ -669,3 +670,16 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df + + +def load_pt(pt_path, to_cpu=False): + pt_path = os.path.realpath(pt_path) + check_file_or_directory_path(pt_path) + try: + if to_cpu: + pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) + else: + pt = torch.load(pt_path, weights_only=True) + except Exception as e: + raise RuntimeError(f"load pt file {pt_path} failed") from e + return pt diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index ada4c7c967..6a706e7c38 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -24,7 +24,7 @@ import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path, load_npy, FileChecker +from msprobe.core.common.file_utils import check_file_or_directory_path, load_npy, FileChecker, load_pt def extract_json(dirname, stack_json=False): @@ -598,19 +598,6 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder -def load_pt(pt_path, to_cpu=False): - pt_path = os.path.realpath(pt_path) - check_file_or_directory_path(pt_path) - try: - if to_cpu: - pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) - else: - pt = torch.load(pt_path, weights_only=True) - except Exception as e: - raise RuntimeError(f"load pt file {pt_path} failed") from e - return pt - - def read_real_data(dir_path, file_name, frame_name): """ 用于比对。 diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 16067f6d2b..d63db0bffd 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -309,17 +309,17 @@ def print_rank_0(message): logger.info(message) -def load_pt(pt_path, to_cpu=False): - pt_path = os.path.realpath(pt_path) - check_file_or_directory_path(pt_path) - try: - if to_cpu: - pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) - else: - pt = torch.load(pt_path, weights_only=True) - except Exception as e: - raise RuntimeError(f"load pt file {pt_path} failed") from e - return pt +# def load_pt(pt_path, to_cpu=False): +# pt_path = os.path.realpath(pt_path) +# check_file_or_directory_path(pt_path) +# try: +# if to_cpu: +# pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) +# else: +# pt = torch.load(pt_path, weights_only=True) +# except Exception as e: +# raise RuntimeError(f"load pt file {pt_path} failed") from e +# return pt def save_pt(tensor, filepath): -- Gitee From beefd56e0efd95c4cc3aaba2b4c6b9116663af01 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:53:06 +0800 Subject: [PATCH 07/37] compare read data read improve --- .../test/core_ut/compare/test_acc_compare_utils.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 9b4a11ca0c..bc7bd0a450 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -905,14 +905,3 @@ class TestReadRealData(unittest.TestCase): with self.assertRaises(CompareException): read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') - - @patch('os.path.join', return_value='/fake/path/to/file.npy') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('msprobe.core.compare.utils.load_npy') - def test_read_real_data_npy_exception(self, mock_load_npy, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.side_effect = IOError("Test Error") - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') -- Gitee From c9da1c4b606e0d945077b7c9fc90bf7bc67a417d Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 14:58:25 +0800 Subject: [PATCH 08/37] compare read data read improve --- .../data_processor/pytorch_processor.py | 2 +- .../run_ut/data_generate.py | 5 ++- .../msprobe/pytorch/common/utils.py | 13 ------- .../test/core_ut/common/test_file_utils.py | 34 +++++++++++++++++++ .../test/pytorch_ut/common/test_pt_utils.py | 33 ------------------ 5 files changed, 37 insertions(+), 50 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 64253aa426..ed461be134 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -29,7 +29,7 @@ from msprobe.core.common.log import logger from msprobe.core.common.utils import convert_tuple from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \ ModuleForwardInputsOutputs, TensorStatInfo -from msprobe.pytorch.common.utils import save_pt, load_pt +from msprobe.pytorch.common.utils import save_pt from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow from msprobe.core.common.utils import recursion_depth_decorator diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index 9d89b2de32..d70c258192 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,9 +23,8 @@ import numpy from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api from msprobe.pytorch.api_accuracy_checker.common.utils import check_object_type, get_full_data_path, \ CompareException, get_module_and_atttribute_name, get_attribute -from msprobe.core.common.file_utils import FileChecker, load_npy +from msprobe.core.common.file_utils import FileChecker, load_npy, load_pt from msprobe.pytorch.common.log import logger -from msprobe.pytorch.common.utils import load_pt from msprobe.core.common.const import Const, FileCheckConst, CompareConst diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index d63db0bffd..3416661df6 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -309,19 +309,6 @@ def print_rank_0(message): logger.info(message) -# def load_pt(pt_path, to_cpu=False): -# pt_path = os.path.realpath(pt_path) -# check_file_or_directory_path(pt_path) -# try: -# if to_cpu: -# pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) -# else: -# pt = torch.load(pt_path, weights_only=True) -# except Exception as e: -# raise RuntimeError(f"load pt file {pt_path} failed") from e -# return pt - - def save_pt(tensor, filepath): check_path_before_create(filepath) filepath = os.path.realpath(filepath) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index c4de285c7a..303c083eaa 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,5 +1,6 @@ import unittest from unittest.mock import patch, mock_open, MagicMock +import tempfile import pytest @@ -533,3 +534,36 @@ class TestDirectoryChecks: check_file_or_directory_path(self.test_file, isdir=False) # Test directory path check_file_or_directory_path(self.test_dir, isdir=True) + + +class TestLoadPt(unittest.TestCase): + + def setUp(self): + self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') + tensor = torch.tensor([1, 2, 3]) + torch.save(tensor, self.temp_file.name) + + @patch('torch.load') + def test_load_pt_cpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=True) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) + + @patch('torch.load') + def test_load_pt_nogpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=False) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) + + @patch('torch.load') + def test_load_pt_failure(self, mock_load): + mock_load.side_effect = RuntimeError("Load failed") + with self.assertRaises(RuntimeError) as context: + load_pt(self.temp_file.name) + self.assertIn("load pt file", str(context.exception)) + + def tearDown(self): + if os.path.isfile(self.temp_file.name): + os.remove(self.temp_file.name) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index cdc922cc98..00574624dd 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -2,7 +2,6 @@ import os import io import unittest from unittest.mock import MagicMock, patch -import tempfile import torch import torch.distributed as dist @@ -148,38 +147,6 @@ class TestPrintRank0(unittest.TestCase): mock_logger_info.assert_called_once_with(message) -class TestLoadPt(unittest.TestCase): - - def setUp(self): - self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') - tensor = torch.tensor([1, 2, 3]) - torch.save(tensor, self.temp_file.name) - - @patch('torch.load') - def test_load_pt_cpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=True) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) - - @patch('torch.load') - def test_load_pt_nogpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=False) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) - - @patch('torch.load') - def test_load_pt_failure(self, mock_load): - mock_load.side_effect = RuntimeError("Load failed") - with self.assertRaises(RuntimeError) as context: - load_pt(self.temp_file.name) - self.assertIn("load pt file", str(context.exception)) - - def tearDown(self): - if os.path.isfile(self.temp_file.name): - os.remove(self.temp_file.name) - class TestSavePT(unittest.TestCase): def setUp(self): -- Gitee From f4cffe0be2eb1dd84c5cc9569b9fbaa953326068 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 15:20:05 +0800 Subject: [PATCH 09/37] compare read data read improve --- .../msprobe/test/pytorch_ut/common/test_pt_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index 00574624dd..c5e63b6a67 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -10,7 +10,7 @@ from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl + get_tensor_rank, get_rank_id, print_rank_0, save_pt, save_api_data, load_api_data, save_pkl, load_pkl class TestParameterAdapter(unittest.TestCase): -- Gitee From 2a71bc5f1b5aa6c49374d6848e869fa703c8bcb7 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 15:41:32 +0800 Subject: [PATCH 10/37] compare read data read improve --- .../api_accuracy_checker/tensor_transport_layer/test_attl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py index 7d4e6e950d..0320c43d0b 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py @@ -6,6 +6,7 @@ from multiprocessing import Queue from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import * from msprobe.core.common.file_utils import create_directory + class TestATTL(unittest.TestCase): def setUp(self): @@ -48,7 +49,7 @@ class TestATTL(unittest.TestCase): self.assertIsNone(result) @patch('glob.glob') - @patch('msprobe.pytorch.common.utils.load_pt') + @patch('msprobe.core.common.file_utils.load_pt') def test_download_with_exception(self, mock_load_pt, mock_glob): mock_glob.return_value = ['/tmp/start_file.pt'] mock_load_pt.side_effect = Exception('Load error') -- Gitee From 07c61b06258abae8b33a93534b4526d4af5f51d6 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:08:57 +0800 Subject: [PATCH 11/37] compare read data read improve --- .../msprobe/core/common/file_utils.py | 14 ------ .../msprobe/core/compare/utils.py | 47 +------------------ .../msprobe/mindspore/common/utils.py | 17 ++++++- .../msprobe/pytorch/common/utils.py | 44 +++++++++++++++-- .../test/core_ut/common/test_file_utils.py | 35 -------------- .../test/pytorch_ut/common/test_pt_utils.py | 37 ++++++++++++++- 6 files changed, 93 insertions(+), 101 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 4994e8e27e..35b7f2e965 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -22,7 +22,6 @@ import re import shutil from datetime import datetime, timezone from dateutil import parser -import torch import yaml import numpy as np import pandas as pd @@ -670,16 +669,3 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df - - -def load_pt(pt_path, to_cpu=False): - pt_path = os.path.realpath(pt_path) - check_file_or_directory_path(pt_path) - try: - if to_cpu: - pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) - else: - pt = torch.load(pt_path, weights_only=True) - except Exception as e: - raise RuntimeError(f"load pt file {pt_path} failed") from e - return pt diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 6a706e7c38..a2edf57e5b 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -19,12 +19,11 @@ import math import zlib from dataclasses import dataclass -import torch import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path, load_npy, FileChecker, load_pt +from msprobe.core.common.file_utils import check_file_or_directory_path def extract_json(dirname, stack_json=False): @@ -598,50 +597,6 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder -def read_real_data(dir_path, file_name, frame_name): - """ - 用于比对。 - 根据比对的类名读取真实数据,比对类名为PTComparator,MSComparator。 - 输出均为ndarray。 - PTComparator时读取.pt文件,MSComparator时读取.npy文件。 - """ - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - - if frame_name == CompareConst.CMP_PT_FRAMENAME: - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value - - elif frame_name == CompareConst.CMP_MS_FRAMENAME: - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) - data_path = path_checker.common_check() - data_value = load_npy(data_path) - return data_value - - else: - logger.error(f"Wrong frame_name:{frame_name} when read real data in compare, please check!") - raise CompareException(CompareException.INVALID_FILE_ERROR) - - def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py index ded3faaa22..57f3187e07 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -22,7 +22,8 @@ from mindspore import ops from mindspore.mint import nn from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.core.common.file_utils import path_len_exceeds_limit, check_path_exists, save_npy +from msprobe.core.common.file_utils import path_len_exceeds_limit, check_path_exists, load_npy, save_npy, \ + FileChecker, FileCheckConst from msprobe.core.common.log import logger from msprobe.core.common.const import Const from msprobe.core.common.utils import CompareException, check_seed_all @@ -196,4 +197,16 @@ def check_save_param(variable, name, save_backward): logger.warning("PrecisionDebugger.save_backward name not valid, " "should be bool. " "Skip current save process.") - raise ValueError \ No newline at end of file + raise ValueError + + +def read_npy_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 3416661df6..92378c0a26 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -25,10 +25,10 @@ import numpy as np import torch import torch.distributed as dist from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.core.common.file_utils import (FileCheckConst, change_mode, - check_file_or_directory_path, check_path_before_create, FileOpen) +from msprobe.core.common.file_utils import FileCheckConst, change_mode, check_file_or_directory_path, \ + check_path_before_create, FileOpen, FileChecker from msprobe.core.common.log import logger -from msprobe.core.common.utils import check_seed_all +from msprobe.core.common.utils import check_seed_all, CompareException from packaging import version try: @@ -309,6 +309,19 @@ def print_rank_0(message): logger.info(message) +def load_pt(pt_path, to_cpu=False): + pt_path = os.path.realpath(pt_path) + check_file_or_directory_path(pt_path) + try: + if to_cpu: + pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) + else: + pt = torch.load(pt_path, weights_only=True) + except Exception as e: + raise RuntimeError(f"load pt file {pt_path} failed") from e + return pt + + def save_pt(tensor, filepath): check_path_before_create(filepath) filepath = os.path.realpath(filepath) @@ -460,3 +473,28 @@ def replace_last_occurrence(text, old, new): if index != -1: return text[:index] + text[index:].replace(old, new, 1) return text + + +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index 303c083eaa..ac3a859bf4 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,6 +1,4 @@ -import unittest from unittest.mock import patch, mock_open, MagicMock -import tempfile import pytest @@ -534,36 +532,3 @@ class TestDirectoryChecks: check_file_or_directory_path(self.test_file, isdir=False) # Test directory path check_file_or_directory_path(self.test_dir, isdir=True) - - -class TestLoadPt(unittest.TestCase): - - def setUp(self): - self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') - tensor = torch.tensor([1, 2, 3]) - torch.save(tensor, self.temp_file.name) - - @patch('torch.load') - def test_load_pt_cpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=True) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) - - @patch('torch.load') - def test_load_pt_nogpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=False) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) - - @patch('torch.load') - def test_load_pt_failure(self, mock_load): - mock_load.side_effect = RuntimeError("Load failed") - with self.assertRaises(RuntimeError) as context: - load_pt(self.temp_file.name) - self.assertIn("load pt file", str(context.exception)) - - def tearDown(self): - if os.path.isfile(self.temp_file.name): - os.remove(self.temp_file.name) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index c5e63b6a67..216fd5bce1 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -2,6 +2,7 @@ import os import io import unittest from unittest.mock import MagicMock, patch +import tempfile import torch import torch.distributed as dist @@ -10,7 +11,7 @@ from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, save_pt, save_api_data, load_api_data, save_pkl, load_pkl + get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl class TestParameterAdapter(unittest.TestCase): @@ -162,6 +163,40 @@ class TestSavePT(unittest.TestCase): mock_torch_save.assert_called_once_with(self.tensor, self.filepath) mock_change_mode.assert_called_once_with(self.filepath, FileCheckConst.DATA_FILE_AUTHORITY) + +class TestLoadPt(unittest.TestCase): + + def setUp(self): + self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') + tensor = torch.tensor([1, 2, 3]) + torch.save(tensor, self.temp_file.name) + + @patch('torch.load') + def test_load_pt_cpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=True) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) + + @patch('torch.load') + def test_load_pt_nogpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=False) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) + + @patch('torch.load') + def test_load_pt_failure(self, mock_load): + mock_load.side_effect = RuntimeError("Load failed") + with self.assertRaises(RuntimeError) as context: + load_pt(self.temp_file.name) + self.assertIn("load pt file", str(context.exception)) + + def tearDown(self): + if os.path.isfile(self.temp_file.name): + os.remove(self.temp_file.name) + + class TestSavePT(unittest.TestCase): def setUp(self): -- Gitee From 269e92d225d1e40a7f539193cf94d508cb39e9a3 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:14:00 +0800 Subject: [PATCH 12/37] compare read data read improve --- .../accuracy_tools/msprobe/core/common/const.py | 3 --- .../msprobe/core/common/file_utils.py | 2 +- .../msprobe/core/compare/acc_compare.py | 16 +++++++++------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/const.py b/debug/accuracy_tools/msprobe/core/common/const.py index aa203923c8..d9623b8071 100644 --- a/debug/accuracy_tools/msprobe/core/common/const.py +++ b/debug/accuracy_tools/msprobe/core/common/const.py @@ -484,9 +484,6 @@ class CompareConst: INTERNAL_API_MAPPING_FILE = 'ms_to_pt_api.yaml' UNREADABLE = 'unreadable data' - CMP_PT_FRAMENAME = 'PTComparator' - CMP_MS_FRAMENAME = 'MSComparator' - class FileCheckConst: """ diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 35b7f2e965..460af6292a 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 0d8c1d492c..606c25e0db 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -33,7 +33,9 @@ from msprobe.core.compare.highlight import find_compare_result_error_rows, highl from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _handle_multi_process, _save_cmp_result from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ - print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list, read_real_data + print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list +from msprobe.pytorch.common.utils import read_pt_data +from msprobe.mindspore.common.utils import read_npy_data class ModeConfig: @@ -376,15 +378,15 @@ class Comparator: npu_file_name = npu_op_name + Const.NUMPY_SUFFIX try: frame_name = getattr(self, "frame_name") - if frame_name == CompareConst.CMP_MS_FRAMENAME: - n_value = read_real_data(npu_dir, npu_file_name, CompareConst.CMP_MS_FRAMENAME) + if frame_name == 'MSComparator': + n_value = read_npy_data(npu_dir, npu_file_name) if self.cross_frame: - b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_PT_FRAMENAME) + b_value = read_pt_data(bench_dir, bench_file_name) else: - b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_MS_FRAMENAME) + b_value = read_npy_data(bench_dir, bench_file_name) else: - n_value = read_real_data(npu_dir, npu_op_name + Const.PT_SUFFIX, CompareConst.CMP_PT_FRAMENAME) - b_value = read_real_data(bench_dir, bench_file_name, CompareConst.CMP_PT_FRAMENAME) + n_value = read_pt_data(npu_dir, npu_op_name + Const.PT_SUFFIX) + b_value = read_pt_data(bench_dir, bench_file_name) except IOError as error: error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE -- Gitee From ad02a160f22400e292668ce066875dc98a0bf655 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:17:43 +0800 Subject: [PATCH 13/37] compare read data read improve --- debug/accuracy_tools/msprobe/core/common/file_utils.py | 2 +- .../pytorch/api_accuracy_checker/run_ut/data_generate.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 460af6292a..35b7f2e965 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index d70c258192..9d89b2de32 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,8 +23,9 @@ import numpy from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api from msprobe.pytorch.api_accuracy_checker.common.utils import check_object_type, get_full_data_path, \ CompareException, get_module_and_atttribute_name, get_attribute -from msprobe.core.common.file_utils import FileChecker, load_npy, load_pt +from msprobe.core.common.file_utils import FileChecker, load_npy from msprobe.pytorch.common.log import logger +from msprobe.pytorch.common.utils import load_pt from msprobe.core.common.const import Const, FileCheckConst, CompareConst -- Gitee From f8ba0197f21de6a5d02d88623c16dabe0130eeb1 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:26:48 +0800 Subject: [PATCH 14/37] compare read data read improve --- .../tensor_transport_layer/test_attl.py | 2 +- .../test/pytorch_ut/common/test_pt_utils.py | 32 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py index 0320c43d0b..79df231a1a 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py @@ -49,7 +49,7 @@ class TestATTL(unittest.TestCase): self.assertIsNone(result) @patch('glob.glob') - @patch('msprobe.core.common.file_utils.load_pt') + @patch('msprobe.pytorch.common.utils.load_pt') def test_download_with_exception(self, mock_load_pt, mock_glob): mock_glob.return_value = ['/tmp/start_file.pt'] mock_load_pt.side_effect = Exception('Load error') diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index 216fd5bce1..ab9a264c72 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -148,22 +148,6 @@ class TestPrintRank0(unittest.TestCase): mock_logger_info.assert_called_once_with(message) -class TestSavePT(unittest.TestCase): - - def setUp(self): - self.tensor = torch.tensor([1, 2, 3]) - self.filepath = 'temp_tensor.pt' - - @patch('msprobe.pytorch.common.utils.save_pt') - @patch('os.path.realpath', return_value='temp_tensor.pt') - @patch('msprobe.core.common.file_utils.check_path_before_create') - @patch('msprobe.core.common.file_utils.change_mode') - def test_save_pt_success(self, mock_change_mode, mock_check_path, mock_realpath, mock_torch_save): - mock_torch_save(self.tensor, self.filepath) - mock_torch_save.assert_called_once_with(self.tensor, self.filepath) - mock_change_mode.assert_called_once_with(self.filepath, FileCheckConst.DATA_FILE_AUTHORITY) - - class TestLoadPt(unittest.TestCase): def setUp(self): @@ -197,6 +181,22 @@ class TestLoadPt(unittest.TestCase): os.remove(self.temp_file.name) +class TestSavePT(unittest.TestCase): + + def setUp(self): + self.tensor = torch.tensor([1, 2, 3]) + self.filepath = 'temp_tensor.pt' + + @patch('msprobe.pytorch.common.utils.save_pt') + @patch('os.path.realpath', return_value='temp_tensor.pt') + @patch('msprobe.core.common.file_utils.check_path_before_create') + @patch('msprobe.core.common.file_utils.change_mode') + def test_save_pt_success(self, mock_change_mode, mock_check_path, mock_realpath, mock_torch_save): + mock_torch_save(self.tensor, self.filepath) + mock_torch_save.assert_called_once_with(self.tensor, self.filepath) + mock_change_mode.assert_called_once_with(self.filepath, FileCheckConst.DATA_FILE_AUTHORITY) + + class TestSavePT(unittest.TestCase): def setUp(self): -- Gitee From c377e674291cd3fd255c37f5ac6dc18787195818 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:40:36 +0800 Subject: [PATCH 15/37] compare read data read improve --- .../test/mindspore_ut/common/test_ms_utils.py | 32 ++++++++------- .../test/pytorch_ut/common/test_pt_utils.py | 39 ++++++++++++++++++- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py index 1ed3ca0161..a4e9ffa2aa 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py @@ -18,18 +18,11 @@ import unittest from unittest.mock import MagicMock, patch, call import numpy as np import mindspore as ms -import os -import random - -from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.mindspore.common.utils import (get_rank_if_initialized, - convert_bf16_to_fp32, - save_tensor_as_npy, - convert_to_int, - list_lowest_level_directories, - seed_all, - remove_dropout, - MsprobeStep) + +from msprobe.core.common.const import FileCheckConst +from msprobe.mindspore.common.utils import get_rank_if_initialized, convert_bf16_to_fp32, convert_to_int, \ + list_lowest_level_directories, seed_all, remove_dropout, MsprobeStep, read_npy_data + class MockCell: def __init__(self): @@ -138,6 +131,17 @@ class TestMsprobeFunctions(unittest.TestCase): self.assertTrue((dropout(x1d, p=0.5).numpy() == x1d.numpy()).all()) +class TestReadPtData(unittest.TestCase): + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_npy_data('/fake/dir', 'file_name.npy') -if __name__ == "__main__": - unittest.main() \ No newline at end of file + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index ab9a264c72..98fdcd4d77 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -6,12 +6,15 @@ import tempfile import torch import torch.distributed as dist +import numpy as np from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError +from msprobe.core.common.utils import CompareException from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl + get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, \ + load_api_data, save_pkl, load_pkl, read_pt_data class TestParameterAdapter(unittest.TestCase): @@ -301,3 +304,37 @@ class TestSavePkl(unittest.TestCase): load_pkl(self.filepath) self.assertIn("Unsupported object type: os.system", str(context.exception)) os.remove(self.filepath) + + +class TestReadPtData(unittest.TestCase): + + @patch('msprobe.pytorch.common.utils.load_pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_pt_data('/fake/dir', 'file_name.pt') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.pytorch.common.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_pt_data('/fake/dir', 'file_name.pt') -- Gitee From d61262dd4332e15e6f4d5216509866fe23bd1f29 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Thu, 27 Feb 2025 20:42:43 +0800 Subject: [PATCH 16/37] compare read data read improve --- .../core_ut/compare/test_acc_compare_utils.py | 63 +------------------ 1 file changed, 3 insertions(+), 60 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index bc7bd0a450..ab8703dcd3 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -4,18 +4,17 @@ import json import os import shutil import unittest -from unittest.mock import patch, MagicMock +from unittest.mock import patch import zlib -import torch import numpy as np -from msprobe.core.common.const import CompareConst, Const, FileCheckConst +from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \ - table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_real_data + table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item # test_read_op_1 op_data = { @@ -849,59 +848,3 @@ class TestGenOpItem(unittest.TestCase): expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}" self.assertEqual(result['md5'], expected_md5) - - -class TestReadRealData(unittest.TestCase): - - @patch('msprobe.core.compare.utils.load_pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.pt') - def test_read_real_data_pt(self, mock_os, mock_file_checker, mock_load_pt): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_tensor = MagicMock() - mock_tensor.detach.return_value = mock_tensor - mock_tensor.to.return_value = mock_tensor - mock_tensor.dtype = torch.bfloat16 - mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) - mock_load_pt.return_value = mock_tensor - - result = read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) - mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) - mock_tensor.to.assert_called_once_with(torch.float32) - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('msprobe.core.compare.utils.load_npy') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.npy') - def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) - - result = read_real_data('/fake/dir', 'file_name.npy', 'MSComparator') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) - mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('os.path.join', return_value='/fake/path/to/file.txt') - @patch('msprobe.core.compare.utils.FileChecker') - def test_read_real_data_invalid_framework(self, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.txt' - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.txt', 'InvalidComparator') - - @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('msprobe.core.compare.utils.load_pt') - def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_load_pt.side_effect = RuntimeError("Test Error") - - with self.assertRaises(CompareException): - read_real_data('/fake/dir', 'file_name.pt', 'PTComparator') -- Gitee From bba1dc53cbadc9884c3ca15f4fa948dd081817b5 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 10:14:55 +0800 Subject: [PATCH 17/37] compare read data read improve --- .../msprobe/pytorch/compare/distributed_compare.py | 10 +++------- .../msprobe/test/mindspore_ut/common/test_ms_utils.py | 8 ++++---- .../msprobe/test/pytorch_ut/common/test_pt_utils.py | 4 ++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index de62af421b..08e2f897a9 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -15,14 +15,10 @@ import os -from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory -from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ - set_dump_path -from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json, set_stack_json_path +from msprobe.core.common.utils import CompareException +from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json from msprobe.pytorch.common.log import logger -from msprobe.pytorch.compare.pt_compare import PTComparator, compare +from msprobe.pytorch.compare.pt_compare import compare def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py index a4e9ffa2aa..6d2b0b99c7 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py @@ -15,7 +15,7 @@ # limitations under the License. """ import unittest -from unittest.mock import MagicMock, patch, call +from unittest.mock import patch import numpy as np import mindspore as ms @@ -131,9 +131,9 @@ class TestMsprobeFunctions(unittest.TestCase): self.assertTrue((dropout(x1d, p=0.5).numpy() == x1d.numpy()).all()) -class TestReadPtData(unittest.TestCase): - @patch('msprobe.core.compare.utils.load_npy') - @patch('msprobe.core.compare.utils.FileChecker') +class TestReadNpyData(unittest.TestCase): + @patch('msprobe.mindspore.common.utils.load_npy') + @patch('msprobe.mindspore.common.utils.FileChecker') @patch('os.path.join', return_value='/fake/path/to/file.npy') def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index 98fdcd4d77..f0a26a39a0 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -309,7 +309,7 @@ class TestSavePkl(unittest.TestCase): class TestReadPtData(unittest.TestCase): @patch('msprobe.pytorch.common.utils.load_pt') - @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.pytorch.common.utils.FileChecker') @patch('os.path.join', return_value='/fake/path/to/file.pt') def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' @@ -329,7 +329,7 @@ class TestReadPtData(unittest.TestCase): self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.pytorch.common.utils.FileChecker') @patch('msprobe.pytorch.common.utils.load_pt') def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' -- Gitee From 40d3bc532711966ec694c28d4a734cb756269cb7 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 10:51:42 +0800 Subject: [PATCH 18/37] compare read data read improve --- .../msprobe/core/common/file_utils.py | 15 ++++ .../msprobe/core/compare/acc_compare.py | 2 +- .../msprobe/core/compare/utils.py | 40 +++++++++- .../msprobe/mindspore/common/utils.py | 15 +--- .../msprobe/pytorch/common/utils.py | 44 +---------- .../test/core_ut/common/test_file_utils.py | 35 +++++++++ .../core_ut/compare/test_acc_compare_utils.py | 59 ++++++++++++++- .../test/mindspore_ut/common/test_ms_utils.py | 19 +---- .../test/pytorch_ut/common/test_pt_utils.py | 74 +------------------ 9 files changed, 153 insertions(+), 150 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 35b7f2e965..ad59721b54 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -23,6 +23,8 @@ import shutil from datetime import datetime, timezone from dateutil import parser import yaml + +import torch import numpy as np import pandas as pd @@ -669,3 +671,16 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df + + +def load_pt(pt_path, to_cpu=False): + pt_path = os.path.realpath(pt_path) + check_file_or_directory_path(pt_path) + try: + if to_cpu: + pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) + else: + pt = torch.load(pt_path, weights_only=True) + except Exception as e: + raise RuntimeError(f"load pt file {pt_path} failed") from e + return pt diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 606c25e0db..3f428bf14f 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -33,7 +33,7 @@ from msprobe.core.compare.highlight import find_compare_result_error_rows, highl from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _handle_multi_process, _save_cmp_result from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ - print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list + print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list, read_pt_data, read_npy_data from msprobe.pytorch.common.utils import read_pt_data from msprobe.mindspore.common.utils import read_npy_data diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index a2edf57e5b..afabd44d65 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -19,11 +19,12 @@ import math import zlib from dataclasses import dataclass +import torch import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path +from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_pt, load_npy def extract_json(dirname, stack_json=False): @@ -597,6 +598,43 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value + + +def read_npy_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value + + def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py index 57f3187e07..b8ed5e143f 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -22,8 +22,7 @@ from mindspore import ops from mindspore.mint import nn from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.core.common.file_utils import path_len_exceeds_limit, check_path_exists, load_npy, save_npy, \ - FileChecker, FileCheckConst +from msprobe.core.common.file_utils import path_len_exceeds_limit, check_path_exists, save_npy from msprobe.core.common.log import logger from msprobe.core.common.const import Const from msprobe.core.common.utils import CompareException, check_seed_all @@ -198,15 +197,3 @@ def check_save_param(variable, name, save_backward): "should be bool. " "Skip current save process.") raise ValueError - - -def read_npy_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) - data_path = path_checker.common_check() - data_value = load_npy(data_path) - return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 92378c0a26..1f938e5a38 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,9 +26,9 @@ import torch import torch.distributed as dist from msprobe.core.common.exceptions import DistributedNotInitializedError from msprobe.core.common.file_utils import FileCheckConst, change_mode, check_file_or_directory_path, \ - check_path_before_create, FileOpen, FileChecker + check_path_before_create, FileOpen from msprobe.core.common.log import logger -from msprobe.core.common.utils import check_seed_all, CompareException +from msprobe.core.common.utils import check_seed_all from packaging import version try: @@ -309,19 +309,6 @@ def print_rank_0(message): logger.info(message) -def load_pt(pt_path, to_cpu=False): - pt_path = os.path.realpath(pt_path) - check_file_or_directory_path(pt_path) - try: - if to_cpu: - pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) - else: - pt = torch.load(pt_path, weights_only=True) - except Exception as e: - raise RuntimeError(f"load pt file {pt_path} failed") from e - return pt - - def save_pt(tensor, filepath): check_path_before_create(filepath) filepath = os.path.realpath(filepath) @@ -473,28 +460,3 @@ def replace_last_occurrence(text, old, new): if index != -1: return text[:index] + text[index:].replace(old, new, 1) return text - - -def read_pt_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index ac3a859bf4..303c083eaa 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,4 +1,6 @@ +import unittest from unittest.mock import patch, mock_open, MagicMock +import tempfile import pytest @@ -532,3 +534,36 @@ class TestDirectoryChecks: check_file_or_directory_path(self.test_file, isdir=False) # Test directory path check_file_or_directory_path(self.test_dir, isdir=True) + + +class TestLoadPt(unittest.TestCase): + + def setUp(self): + self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') + tensor = torch.tensor([1, 2, 3]) + torch.save(tensor, self.temp_file.name) + + @patch('torch.load') + def test_load_pt_cpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=True) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) + + @patch('torch.load') + def test_load_pt_nogpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=False) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) + + @patch('torch.load') + def test_load_pt_failure(self, mock_load): + mock_load.side_effect = RuntimeError("Load failed") + with self.assertRaises(RuntimeError) as context: + load_pt(self.temp_file.name) + self.assertIn("load pt file", str(context.exception)) + + def tearDown(self): + if os.path.isfile(self.temp_file.name): + os.remove(self.temp_file.name) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index ab8703dcd3..579cd081e0 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -4,17 +4,19 @@ import json import os import shutil import unittest -from unittest.mock import patch +from unittest.mock import patch, MagicMock import zlib +import torch import numpy as np -from msprobe.core.common.const import CompareConst, Const +from msprobe.core.common.const import CompareConst, Const, FileCheckConst from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \ - table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item + table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_pt_data, \ + read_npy_data # test_read_op_1 op_data = { @@ -848,3 +850,54 @@ class TestGenOpItem(unittest.TestCase): expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}" self.assertEqual(result['md5'], expected_md5) + + +class TestReadPtData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_pt_data('/fake/dir', 'file_name.pt') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.core.compare.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_pt_data('/fake/dir', 'file_name.pt') + + +class TestReadNpyData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_npy_data('/fake/dir', 'file_name.npy') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py index 6d2b0b99c7..80f91a53f7 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py @@ -19,9 +19,8 @@ from unittest.mock import patch import numpy as np import mindspore as ms -from msprobe.core.common.const import FileCheckConst from msprobe.mindspore.common.utils import get_rank_if_initialized, convert_bf16_to_fp32, convert_to_int, \ - list_lowest_level_directories, seed_all, remove_dropout, MsprobeStep, read_npy_data + list_lowest_level_directories, seed_all, remove_dropout, MsprobeStep class MockCell: @@ -129,19 +128,3 @@ class TestMsprobeFunctions(unittest.TestCase): from mindspore.mint.nn.functional import dropout self.assertTrue((Dropout(0.5)(x1d).numpy() == x1d.numpy()).all()) self.assertTrue((dropout(x1d, p=0.5).numpy() == x1d.numpy()).all()) - - -class TestReadNpyData(unittest.TestCase): - @patch('msprobe.mindspore.common.utils.load_npy') - @patch('msprobe.mindspore.common.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.npy') - def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) - - result = read_npy_data('/fake/dir', 'file_name.npy') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) - mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index f0a26a39a0..61f7d97b55 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -2,19 +2,16 @@ import os import io import unittest from unittest.mock import MagicMock, patch -import tempfile import torch import torch.distributed as dist -import numpy as np from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.core.common.utils import CompareException from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, \ - load_api_data, save_pkl, load_pkl, read_pt_data + get_tensor_rank, get_rank_id, print_rank_0, save_pt, save_api_data, \ + load_api_data, save_pkl, load_pkl class TestParameterAdapter(unittest.TestCase): @@ -151,39 +148,6 @@ class TestPrintRank0(unittest.TestCase): mock_logger_info.assert_called_once_with(message) -class TestLoadPt(unittest.TestCase): - - def setUp(self): - self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') - tensor = torch.tensor([1, 2, 3]) - torch.save(tensor, self.temp_file.name) - - @patch('torch.load') - def test_load_pt_cpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=True) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) - - @patch('torch.load') - def test_load_pt_nogpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=False) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) - - @patch('torch.load') - def test_load_pt_failure(self, mock_load): - mock_load.side_effect = RuntimeError("Load failed") - with self.assertRaises(RuntimeError) as context: - load_pt(self.temp_file.name) - self.assertIn("load pt file", str(context.exception)) - - def tearDown(self): - if os.path.isfile(self.temp_file.name): - os.remove(self.temp_file.name) - - class TestSavePT(unittest.TestCase): def setUp(self): @@ -304,37 +268,3 @@ class TestSavePkl(unittest.TestCase): load_pkl(self.filepath) self.assertIn("Unsupported object type: os.system", str(context.exception)) os.remove(self.filepath) - - -class TestReadPtData(unittest.TestCase): - - @patch('msprobe.pytorch.common.utils.load_pt') - @patch('msprobe.pytorch.common.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.pt') - def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_tensor = MagicMock() - mock_tensor.detach.return_value = mock_tensor - mock_tensor.to.return_value = mock_tensor - mock_tensor.dtype = torch.bfloat16 - mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) - mock_load_pt.return_value = mock_tensor - - result = read_pt_data('/fake/dir', 'file_name.pt') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) - mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) - mock_tensor.to.assert_called_once_with(torch.float32) - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('msprobe.pytorch.common.utils.FileChecker') - @patch('msprobe.pytorch.common.utils.load_pt') - def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_load_pt.side_effect = RuntimeError("Test Error") - - with self.assertRaises(CompareException): - read_pt_data('/fake/dir', 'file_name.pt') -- Gitee From 7198fab89063fbdb6efaf150917d5913e4444971 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 10:54:30 +0800 Subject: [PATCH 19/37] compare read data read improve --- debug/accuracy_tools/msprobe/core/compare/acc_compare.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 3f428bf14f..3a7f056f7a 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -34,8 +34,6 @@ from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _hand from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list, read_pt_data, read_npy_data -from msprobe.pytorch.common.utils import read_pt_data -from msprobe.mindspore.common.utils import read_npy_data class ModeConfig: @@ -366,19 +364,19 @@ class Comparator: data_name = safe_get_value(npu_bench_name_list, 1, "npu_bench_name_list") error_file, relative_err, error_flag = None, None, False bench_file_name = get_bench_data_name(bench_op_name, bench_data) - if str(data_name) == '-1': # 没有真实数据路径 + if str(data_name) == "-1": # 没有真实数据路径 n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True elif not bench_file_name: n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True - error_file = 'no_bench_data' + error_file = "no_bench_data" else: npu_dir = input_param.get("npu_dump_data_dir") bench_dir = input_param.get("bench_dump_data_dir") npu_file_name = npu_op_name + Const.NUMPY_SUFFIX try: frame_name = getattr(self, "frame_name") - if frame_name == 'MSComparator': + if frame_name == "MSComparator": n_value = read_npy_data(npu_dir, npu_file_name) if self.cross_frame: b_value = read_pt_data(bench_dir, bench_file_name) -- Gitee From d0ff94e8eda732bfde0743772347c279e34e6199 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 11:05:56 +0800 Subject: [PATCH 20/37] compare read data read improve --- .../pytorch/api_accuracy_checker/run_ut/data_generate.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index 9d89b2de32..05da6954cd 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,9 +23,8 @@ import numpy from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api from msprobe.pytorch.api_accuracy_checker.common.utils import check_object_type, get_full_data_path, \ CompareException, get_module_and_atttribute_name, get_attribute -from msprobe.core.common.file_utils import FileChecker, load_npy +from msprobe.core.common.file_utils import FileChecker, load_pt, load_npy from msprobe.pytorch.common.log import logger -from msprobe.pytorch.common.utils import load_pt from msprobe.core.common.const import Const, FileCheckConst, CompareConst -- Gitee From 48ded08b019a94a0ad2ca2bb5a6775192e5d50ea Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 11:25:13 +0800 Subject: [PATCH 21/37] compare read data read improve --- .../api_accuracy_checker/tensor_transport_layer/test_attl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py index 79df231a1a..0320c43d0b 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py @@ -49,7 +49,7 @@ class TestATTL(unittest.TestCase): self.assertIsNone(result) @patch('glob.glob') - @patch('msprobe.pytorch.common.utils.load_pt') + @patch('msprobe.core.common.file_utils.load_pt') def test_download_with_exception(self, mock_load_pt, mock_glob): mock_glob.return_value = ['/tmp/start_file.pt'] mock_load_pt.side_effect = Exception('Load error') -- Gitee From 79bfa261a8588f4bb4ff45e4a9b67f69d24c9efd Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Fri, 28 Feb 2025 17:51:15 +0800 Subject: [PATCH 22/37] compare bench_data_name get improve --- .../msprobe/core/compare/acc_compare.py | 80 +++++-------------- .../core/compare/multiprocessing_compute.py | 10 +-- .../msprobe/core/compare/utils.py | 14 ++-- 3 files changed, 33 insertions(+), 71 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index f0ac97a029..a4c5f23324 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -349,47 +349,48 @@ class Comparator: result_df = self.make_result_table(result) return result_df - def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param, bench_data): + def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param): """ :param npu_op_name: excel中的NPU_Name,例如:MintFunctional.conv2d.0.forward.input.3.0 :param bench_op_name: excel中的Bench_Name,例如:Functional.conv2d.0.forward.input.3.0 :param op_name_mapping_dict: op_name和npy或pt文件的映射关系 :param input_param: npu_json_path/bench_json_path/stack_json_path等参数 - :param bench_data: bench的dump数据中"data"字段 :return: result_list,包含余弦相似度、最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率和错误信息 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - npu_bench_name_list = op_name_mapping_dict[npu_op_name] - data_name = safe_get_value(npu_bench_name_list, 1, "npu_bench_name_list") + frame_name = getattr(self, "frame_name") error_file, relative_err, error_flag = None, None, False - bench_data_name = get_bench_data_name(bench_op_name, bench_data) - if data_name == '-1' or data_name == -1: # 没有真实数据路径 - n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE - error_flag = True - elif not bench_data_name: + + data_name_pair = op_name_mapping_dict(npu_op_name) + npu_data_name = data_name_pair[0] + bench_data_name = data_name_pair[1] + + if str(npu_data_name) == '-1': # 没有npu真实数据路径 + n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True + elif str(bench_data_name) == '-1': # 没有bench真实数据路径 n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True error_file = 'no_bench_data' else: + npu_dir = input_param.get("npu_dump_data_dir") + bench_dir = input_param.get("bench_dump_data_dir") try: read_npy_data = getattr(self, "read_npy_data") - frame_name = getattr(self, "frame_name") if frame_name == "MSComparator": - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.NUMPY_SUFFIX) + n_value = read_npy_data(npu_dir, npu_data_name) if self.cross_frame: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name, - load_pt_file=True) + b_value = read_npy_data(bench_dir, bench_data_name, load_pt_file=True) else: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + b_value = read_npy_data(bench_dir, bench_data_name) else: - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.PT_SUFFIX) - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + n_value = read_npy_data(npu_dir, npu_data_name) + b_value = read_npy_data(bench_dir, bench_data_name) except IOError as error: error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True except (FileCheckException, CompareException): - error_file = data_name + error_file = npu_data_name n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True @@ -472,7 +473,7 @@ class Comparator: logger.info("start compare: {}".format(npu_op_name)) cos_sim, euc_dist, max_abs_err, max_relative_err, one_thousand_err_ratio, five_thousand_err_ratio, err_msg \ - = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param, bench_data) + = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param) if is_print_compare_log: logger.info( @@ -508,46 +509,3 @@ class Comparator: except ValueError as e: logger.error('result dataframe is not found.') raise CompareException(CompareException.INVALID_DATA_ERROR) from e - - -def get_bench_data_name(bench_op_name, bench_data): - bench_name_list = re.split(r'\.(input|output|kwargs|parameters|parameters_grad)\.', bench_op_name) - if len(bench_name_list) > 1 and bench_name_list[1] == Const.PARAMS_GRAD: - bench_data_bundle = bench_data.get(bench_name_list[0] + Const.SEP + bench_name_list[1], {}) - else: - bench_data_bundle = bench_data.get(bench_name_list[0], {}) - if not bench_data_bundle or len(bench_name_list) < 3: - return None - layers = bench_name_list[2].split(Const.SEP) - - def _get(key, container): - if isinstance(container, dict): - return container.get(key) - if isinstance(container, list): - try: - return container[int(key)] - except (ValueError, IndexError): - return None - return None - - def get_by_layer(container, params_grad=False): - data = container - # dump.json中parameters_grad的结构为key:[{}], 如果存在key,有且只有一个列表元素,而op_name中只命名到了key,因此加'0' - if params_grad: - layers.append('0') - for layer in layers: - data = _get(layer, data) - return _get(CompareConst.DATA_NAME.lower(), data) - - if Const.INPUT == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.INPUT, bench_data_bundle.get(Const.INPUT_ARGS))) - elif Const.KWARGS == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.INPUT_KWARGS)) - elif Const.OUTPUT == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.OUTPUT)) - elif Const.PARAMS == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.PARAMS)) - elif Const.PARAMS_GRAD == bench_name_list[1]: - return get_by_layer(bench_data_bundle, params_grad=True) - else: - return None diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index f79671827c..71b0f29d64 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -25,7 +25,7 @@ from msprobe.core.common.utils import CompareException from msprobe.core.common.const import CompareConst -def _handle_multi_process(func, input_parma, result_df, lock): +def _handle_multi_process(func, input_param, result_df, lock): process_num = max(int((multiprocessing.cpu_count() + 1) // 4), 1) op_name_mapping_dict = read_dump_data(result_df) @@ -55,7 +55,7 @@ def _handle_multi_process(func, input_parma, result_df, lock): idx = df_chunk_size * process_idx chunk_size = len(df_chunk) result = pool.apply_async(func, - args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma), + args=(idx, op_name_mapping_dict, df_chunk, lock, input_param), error_callback=err_call, callback=partial(update_progress, chunk_size, lock) ) @@ -97,12 +97,12 @@ def _ms_graph_handle_multi_process(func, result_df, mode): def read_dump_data(result_df): try: npu_dump_name_list = result_df.iloc[0:, 0].tolist() - npu_dump_tensor_list = result_df.iloc[0:, -1].tolist() + dump_tensor_pair_list = result_df.iloc[0:, -1].tolist() op_name_mapping_dict = {} for index, _ in enumerate(npu_dump_name_list): npu_dump_name = npu_dump_name_list[index] - npu_dump_tensor = npu_dump_tensor_list[index] - op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor] + dump_tensor_pair = dump_tensor_pair_list[index] + op_name_mapping_dict[npu_dump_name] = dump_tensor_pair return op_name_mapping_dict except ValueError as e: logger.error('result dataframe is not found.') diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 72b75ab254..6e49d62fc1 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -321,8 +321,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): has_stack = npu_stack_info and bench_stack_info if dump_mode == Const.ALL: - npu_data_name = n_dict.get("data_name", None) - bench_data_name = b_dict.get("data_name", None) + npu_data_name_list = n_dict.get("data_name", None) + bench_data_name_list = b_dict.get("data_name", None) for index in range(min_len): n_name = safe_get_value(n_dict, n_start + index, "n_dict", key="op_name") @@ -353,7 +353,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): result_item.append(err_msg) result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info) if dump_mode == Const.ALL: - result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name")) + npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") + bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") + result_item.append(npu_data_name, bench_data_name) result.append(result_item) @@ -388,7 +390,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): result_item.append(err_msg) result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info) if dump_mode == Const.ALL: - result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name")) + npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") + bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") + result_item.append(npu_data_name, bench_data_name) result.append(result_item) @@ -467,7 +471,7 @@ def get_un_match_accuracy(result, n_dict, dump_mode): result_item.append(err_msg) append_stack_info(result_item, npu_stack_info, index) if dump_mode == Const.ALL and result_item[1] == CompareConst.N_A: - result_item.extend(["-1"]) + result_item.extend(["-1", "-1"]) result.append(result_item) -- Gitee From 7c87b8b27cac16c84aa5b7f0fe1288572226c500 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 11:42:49 +0800 Subject: [PATCH 23/37] compare bench_data_name get improve --- .../msprobe/core/compare/acc_compare.py | 4 +- .../test/core_ut/compare/test_acc_compare.py | 42 +++++-------------- .../test_cmp_multiprocessing_compute.py | 8 ++-- 3 files changed, 17 insertions(+), 37 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index a4c5f23324..06f5932879 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -359,10 +359,9 @@ class Comparator: 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - frame_name = getattr(self, "frame_name") error_file, relative_err, error_flag = None, None, False - data_name_pair = op_name_mapping_dict(npu_op_name) + data_name_pair = op_name_mapping_dict.get(npu_op_name) npu_data_name = data_name_pair[0] bench_data_name = data_name_pair[1] @@ -375,6 +374,7 @@ class Comparator: npu_dir = input_param.get("npu_dump_data_dir") bench_dir = input_param.get("bench_dump_data_dir") try: + frame_name = getattr(self, "frame_name") read_npy_data = getattr(self, "read_npy_data") if frame_name == "MSComparator": n_value = read_npy_data(npu_dir, npu_data_name) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index c882e331f5..81e9ec30b6 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -11,7 +11,7 @@ import torch from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException -from msprobe.core.compare.acc_compare import Comparator, ModeConfig, get_bench_data_name +from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.highlight import find_error_rows, find_compare_result_error_rows, ApiBatch from msprobe.core.compare.utils import get_accuracy from msprobe.pytorch.compare.pt_compare import PTComparator @@ -636,11 +636,11 @@ class TestUtilsMethods(unittest.TestCase): def test_do_multi_process(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', ['-1', '-1']]] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', '-1']] + 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', ['-1', '-1']]] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) @@ -670,7 +670,7 @@ class TestUtilsMethods(unittest.TestCase): mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) pt_comparator = PTComparator(mode_config) - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'No bench data matched.']) @@ -688,43 +688,23 @@ class TestUtilsMethods(unittest.TestCase): pt_comparator = PTComparator(mode_config) pt_name = '-1' - pt_path = os.path.join(base_dir, pt_name) - op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]} + op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]} input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, - {'Functional.linear.0.forward': {'input_args': [ - {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', f'Dump file: {pt_path} not found.']) + 'unsupported', 'No bench data matched.']) pt_name = 'Functional.linear.0.forward.input.0.pt' - pt_path = os.path.join(base_dir, pt_name) - op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]} + op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]} input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'Bench does not have data file.']) + 'unsupported', 'Dump file: Functional.linear.0.forward.input.0.pt not found']) generate_pt(base_dir) - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, - {'Functional.linear.0.forward': {'input_args': [ - {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, '']) - def test_get_bench_data_name_input(self): - bench_op_name = "Functional.linear.0.forward.input.0" - bench_data = {"Functional.linear.0.forward": {"input_args": [{"data_name": "Functional.linear.0.forward.input.0.pt"}], "input_kwargs": {}, "output": []}} - result = get_bench_data_name(bench_op_name, bench_data) - - self.assertEqual(result, "Functional.linear.0.forward.input.0.pt") - - def test_get_bench_data_name_output(self): - bench_op_name = "Functional.linear.0.forward.output.0" - bench_data = {"Functional.linear.0.forward": {"input_args": [], "input_kwargs": {}, "output": [{"data_name": "Functional.linear.0.forward.output.0.pt"}]}} - result = get_bench_data_name(bench_op_name, bench_data) - - self.assertEqual(result, "Functional.linear.0.forward.output.0.pt") - class TestComparator(unittest.TestCase): def setUp(self): diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py index 3fa16b0d9d..49f084ce07 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py @@ -18,12 +18,12 @@ data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.inp 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, - 'Yes', '', '-1']] + 'Yes', '', ['-1', '-1']]] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 1, 1, 1, 1, 1, 1, 1, 1, - 'None', 'No bench data matched.', '-1']] + 'None', 'No bench data matched.', ['-1', '-1']]] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) @@ -54,9 +54,9 @@ class TestUtilsMethods(unittest.TestCase): func = Comparator(mode_config).compare_ops generate_dump_json(base_dir) - input_parma = {'bench_json_path': os.path.join(base_dir, 'dump.json')} + input_param = {'bench_json_path': os.path.join(base_dir, 'dump.json')} lock = multiprocessing.Manager().RLock() - result = _handle_multi_process(func, input_parma, result_df, lock) + result = _handle_multi_process(func, input_param, result_df, lock) self.assertTrue(result.equals(o_result)) def test_read_dump_data(self): -- Gitee From db98d484f4df58fe35ec8b1f2c2723dc97e2a3ce Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 14:17:04 +0800 Subject: [PATCH 24/37] compare bench_data_name get improve --- .../accuracy_tools/msprobe/core/compare/utils.py | 2 +- .../test/core_ut/compare/test_acc_compare.py | 2 +- .../core_ut/compare/test_acc_compare_utils.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 6e49d62fc1..f54ea1f605 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -471,7 +471,7 @@ def get_un_match_accuracy(result, n_dict, dump_mode): result_item.append(err_msg) append_stack_info(result_item, npu_stack_info, index) if dump_mode == Const.ALL and result_item[1] == CompareConst.N_A: - result_item.extend(["-1", "-1"]) + result_item.extend([["-1", "-1"]]) result.append(result_item) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index 81e9ec30b6..1b2f6bb2fd 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -699,7 +699,7 @@ class TestUtilsMethods(unittest.TestCase): input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'Dump file: Functional.linear.0.forward.input.0.pt not found']) + 'unsupported', 'Dump file: Functional.linear.0.forward.input.0.pt not found.']) generate_pt(base_dir) result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 2e9a465726..bf23f4de1d 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -224,31 +224,31 @@ o_result_unmatch_3 = [ ['Functional.conv2d.0.forward.input.0', 'N/A', 'torch.float32', 'N/A', [1, 1, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.input.1', 'N/A', 'torch.float32', 'N/A', [16, 1, 5, 5], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.input.2', 'N/A', 'torch.float32', 'N/A', [16], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.parameters.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.parameters.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.output.0', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.parameters_grad.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.parameters_grad.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'] + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']] ] # test_merge_tensor -- Gitee From db079b99e78913c70237eb76d034bbf45c87a988 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 14:23:39 +0800 Subject: [PATCH 25/37] compare bench_data_name get improve --- debug/accuracy_tools/msprobe/core/compare/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index f54ea1f605..8656daf7ca 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -355,7 +355,7 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): if dump_mode == Const.ALL: npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") - result_item.append(npu_data_name, bench_data_name) + result_item.append([npu_data_name, bench_data_name]) result.append(result_item) @@ -392,7 +392,7 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): if dump_mode == Const.ALL: npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") - result_item.append(npu_data_name, bench_data_name) + result_item.append([npu_data_name, bench_data_name]) result.append(result_item) -- Gitee From 8d407ce038ecfd98ceb962ba2c310495fdb206c8 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 15:11:19 +0800 Subject: [PATCH 26/37] compare bench_data_name get improve --- debug/accuracy_tools/msprobe/core/compare/acc_compare.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 06f5932879..cdc2e9fd84 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -356,7 +356,7 @@ class Comparator: :param op_name_mapping_dict: op_name和npy或pt文件的映射关系 :param input_param: npu_json_path/bench_json_path/stack_json_path等参数 :return: result_list,包含余弦相似度、最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率和错误信息 - 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、 + 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ error_file, relative_err, error_flag = None, None, False @@ -365,9 +365,9 @@ class Comparator: npu_data_name = data_name_pair[0] bench_data_name = data_name_pair[1] - if str(npu_data_name) == '-1': # 没有npu真实数据路径 + if str(npu_data_name) == '-1': # 没有npu真实数据 n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True - elif str(bench_data_name) == '-1': # 没有bench真实数据路径 + elif str(bench_data_name) == '-1': # 没有bench真实数据 n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True error_file = 'no_bench_data' else: @@ -465,7 +465,7 @@ class Comparator: err_mess = [] is_print_compare_log = input_param.get("is_print_compare_log") - bench_data = load_json(input_param.get("bench_json_path")).get('data') + for i in range(len(result_df)): npu_op_name = result_df.iloc[i, 0] bench_op_name = result_df.iloc[i, 1] -- Gitee From f72f586ebddf1bc0b7d964784e399b6962329805 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 17:07:11 +0800 Subject: [PATCH 27/37] compare bench_data_name get improve --- .../msprobe/mindspore/compare/ms_compare.py | 5 ++++ .../mindspore_ut/compare/test_ms_compare.py | 27 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index de507e8766..f91f98182f 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -78,6 +78,11 @@ class MSComparator(Comparator): raise TypeError(f"The type of parameter `data_mapping` must be dict, str or None, but got " f"{type(self.data_mapping)}") + @staticmethod + def process_data_name(match_result): + match_result['data_name_x'] = match_result.apply(lambda row: [row['data_name_x'], row['data_name_y']], axis=1) + return match_result + def calc_accuracy(self, result_df, header): condition_no_bench = result_df[CompareConst.BENCH_NAME] == CompareConst.N_A result_df[condition_no_bench] = result_df[condition_no_bench].fillna(CompareConst.N_A) diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py index b5cbff9784..2c0b2efea7 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py @@ -7,6 +7,7 @@ import tempfile import unittest import numpy as np +import pandas as pd import torch import yaml @@ -533,4 +534,28 @@ class TestUtilsMethods(unittest.TestCase): api_list = ["Mint"] with self.assertRaises(CompareException): - ms_comparator.get_api_name(api_list) \ No newline at end of file + ms_comparator.get_api_name(api_list) + + def test_process_data_name(self): + stack_mode = True + auto_analyze = True + fuzzy_match = False + dump_mode = Const.ALL + + mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + mapping_config = MappingConfig() + ms_comparator = MSComparator(mode_config, mapping_config) + + data = pd.DataFrame({ + 'data_name_x': ['A', 'B', 'C'], + 'data_name_y': ['X', 'Y', 'Z'] + }) + + result = ms_comparator.process_data_name(data.copy()) + + expected = pd.DataFrame({ + 'data_name_x': [['A', 'X'], ['B', 'Y'], ['C', 'Z']], + 'data_name_y': ['X', 'Y', 'Z'] + }) + + pd.testing.assert_frame_equal(result, expected) -- Gitee From f22bd4d3291f069cbfb23696ee0b44a79350b324 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 17:29:26 +0800 Subject: [PATCH 28/37] compare bench_data_name get improve --- debug/accuracy_tools/msprobe/core/compare/acc_compare.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index cdc2e9fd84..f2aa8c479e 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -329,7 +329,9 @@ class Comparator: else: result_item.append(CompareConst.NONE) if self.dump_mode == Const.ALL: - result_item.append(npu_ops_all.get(ms_op_name).get("data_name", None)) + ms_data_name = npu_ops_all.get(ms_op_name).get("data_name", None) + pt_data_name = bench_ops_all.get(bench_op_name).get("data_name", None) + result_item.append([ms_data_name, pt_data_name]) result.append(result_item) elif ms_op_name not in npu_ops_all: logger.warning(f'Can not find npu op name : `{ms_op_name}` in npu dump json file.') -- Gitee From 6033e534891dc36483acdd9cd0879f33efb6be13 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 17:38:55 +0800 Subject: [PATCH 29/37] compare bench_data_name get improve --- .../msprobe/docs/10.accuracy_compare_PyTorch.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index a5f83d8dfc..6f886215b0 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -257,11 +257,11 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算 统计量有 4 种:最大值(max)、最小值(min)、平均值(mean)和 L2-范数(L2 norm)。 -|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)|Data_Name (NPU 真实数据名)| -|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| -|真实数据模式|√|√|√|√|√|√|||√||||√|√|√|√| -|统计数据模式|||||||√|√|√|||√||√|√|| -|MD5 模式||||||||||√|√|√|||√|| +|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)| Data_Name ([NPU真实数据名,Bench真实数据名]) | +|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---------------------------------:| +|真实数据模式|√|√|√|√|√|√|||√||||√|√|√| √ | +|统计数据模式|||||||√|√|√|||√||√|√| | +|MD5 模式||||||||||√|√|√|||√| | 上表中NPU_Stack_Info字段需要配置-s参数生成。 -- Gitee From 149b49a6865b7ba3ed686cb3de3146f10edef9bf Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 19:12:05 +0800 Subject: [PATCH 30/37] compare bench_data_name get improve --- debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index f91f98182f..cb731779b3 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -286,6 +286,7 @@ class MSComparator(Comparator): ((npu_dtype == Const.TORCH_BFLOAT16) & (bench_dtype == Const.TORCH_FLOAT16))) match_result.loc[~gen_dtype_condition(), [i + '_y' for i in bench_df.columns]] = CompareConst.N_A + match_result = self.process_data_name(match_result) return self.make_result_df(match_result) def modify_compare_data_with_user_mapping(self, npu_df, bench_df): -- Gitee From 945db9bd7711b6f0bd80dba6862d32975177c23b Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 19:28:29 +0800 Subject: [PATCH 31/37] compare bench_data_name get improve --- .../msprobe/mindspore/compare/ms_compare.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index cb731779b3..88dd669dfc 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -79,9 +79,9 @@ class MSComparator(Comparator): f"{type(self.data_mapping)}") @staticmethod - def process_data_name(match_result): - match_result['data_name_x'] = match_result.apply(lambda row: [row['data_name_x'], row['data_name_y']], axis=1) - return match_result + def process_data_name(result): + result['data_name_x'] = result.apply(lambda row: [row['data_name_x'], row['data_name_y']], axis=1) + return result def calc_accuracy(self, result_df, header): condition_no_bench = result_df[CompareConst.BENCH_NAME] == CompareConst.N_A @@ -175,6 +175,10 @@ class MSComparator(Comparator): result[npu_summary] = result['summary_x'].apply(set_summary).tolist() result[bench_summary] = result['summary_y'].apply(set_summary).tolist() + + if self.dump_mode == Const.ALL: + result = self.process_data_name(result) + result_df = pd.DataFrame(columns=header) for h in header: if h in result.columns: @@ -286,7 +290,6 @@ class MSComparator(Comparator): ((npu_dtype == Const.TORCH_BFLOAT16) & (bench_dtype == Const.TORCH_FLOAT16))) match_result.loc[~gen_dtype_condition(), [i + '_y' for i in bench_df.columns]] = CompareConst.N_A - match_result = self.process_data_name(match_result) return self.make_result_df(match_result) def modify_compare_data_with_user_mapping(self, npu_df, bench_df): -- Gitee From 10297de3e19ad4abde5052aa33f0d724cf8d0348 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 19:33:18 +0800 Subject: [PATCH 32/37] compare bench_data_name get improve --- debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index 88dd669dfc..1f95dbd234 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -145,6 +145,8 @@ class MSComparator(Comparator): header.append(CompareConst.STACK) if self.dump_mode == Const.ALL: header.append(CompareConst.DATA_NAME) + result = self.process_data_name(result) + result.rename(columns={'op_name_x': CompareConst.NPU_NAME, 'op_name_y': CompareConst.BENCH_NAME, 'dtype_x': CompareConst.NPU_DTYPE, @@ -176,9 +178,6 @@ class MSComparator(Comparator): result[npu_summary] = result['summary_x'].apply(set_summary).tolist() result[bench_summary] = result['summary_y'].apply(set_summary).tolist() - if self.dump_mode == Const.ALL: - result = self.process_data_name(result) - result_df = pd.DataFrame(columns=header) for h in header: if h in result.columns: -- Gitee From 820514cb6b3d352afbdf9150bd3d244687999935 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Mon, 3 Mar 2025 20:31:10 +0800 Subject: [PATCH 33/37] compare read data read improve --- .../msprobe/core/common/file_utils.py | 14 ------- .../msprobe/core/compare/acc_compare.py | 4 +- .../msprobe/core/compare/utils.py | 39 +------------------ .../msprobe/mindspore/compare/ms_compare.py | 15 ++++++- .../run_ut/data_generate.py | 3 +- .../msprobe/pytorch/common/utils.py | 38 ++++++++++++++++++ .../msprobe/pytorch/compare/pt_compare.py | 31 ++++++++++++++- .../test/core_ut/common/test_file_utils.py | 35 ----------------- .../test/pytorch_ut/common/test_pt_utils.py | 39 +++++++++++++++++-- 9 files changed, 124 insertions(+), 94 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index ad59721b54..89d33a6a3e 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -24,7 +24,6 @@ from datetime import datetime, timezone from dateutil import parser import yaml -import torch import numpy as np import pandas as pd @@ -671,16 +670,3 @@ def read_xlsx(file_path): logger.error(f"The xlsx file failed to load. Please check the path: {file_path}.") raise RuntimeError(f"Read xlsx file {file_path} failed.") from e return result_df - - -def load_pt(pt_path, to_cpu=False): - pt_path = os.path.realpath(pt_path) - check_file_or_directory_path(pt_path) - try: - if to_cpu: - pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) - else: - pt = torch.load(pt_path, weights_only=True) - except Exception as e: - raise RuntimeError(f"load pt file {pt_path} failed") from e - return pt diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index bd26f9121e..28a7b5f3a8 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -33,7 +33,9 @@ from msprobe.core.compare.highlight import find_compare_result_error_rows, highl from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _handle_multi_process, _save_cmp_result from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ - print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list, read_pt_data, read_npy_data + print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list +from msprobe.pytorch.compare.pt_compare import read_pt_data +from msprobe.mindspore.compare.ms_compare import read_npy_data class ModeConfig: diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index a2ba55fb46..471951ce4b 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -24,7 +24,7 @@ import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_pt, load_npy +from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_npy def extract_json(dirname, stack_json=False): @@ -602,43 +602,6 @@ def reorder_op_x_list(op_name_list, summary_list, data_name_list): return op_name_reorder, summary_reorder, data_name_reorder -def read_pt_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value - - -def read_npy_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) - data_path = path_checker.common_check() - data_value = load_npy(data_path) - return data_value - - def _compare_parser(parser): parser.add_argument("-i", "--input_path", dest="input_path", type=str, help=" The compare input path, a dict json.", required=True) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index 9abe144659..5344573ad9 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -22,7 +22,8 @@ import pandas as pd from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_yaml +from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_yaml, load_npy, FileChecker, \ + FileCheckConst from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \ check_op_str_pattern_valid, get_dump_mode, set_dump_path @@ -385,6 +386,18 @@ def check_cross_framework(bench_json_path): return False +def read_npy_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value + + def ms_compare(input_param, output_path, **kwargs): try: auto_analyze = kwargs.get('auto_analyze', True) diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index 05da6954cd..ec2a4b7165 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -23,8 +23,9 @@ import numpy from msprobe.pytorch.api_accuracy_checker.run_ut.run_ut_utils import hf_32_standard_api from msprobe.pytorch.api_accuracy_checker.common.utils import check_object_type, get_full_data_path, \ CompareException, get_module_and_atttribute_name, get_attribute -from msprobe.core.common.file_utils import FileChecker, load_pt, load_npy +from msprobe.core.common.file_utils import FileChecker, load_npy from msprobe.pytorch.common.log import logger +from msprobe.pytorch.common.utils import load_pt from msprobe.core.common.const import Const, FileCheckConst, CompareConst diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 1f938e5a38..4021430ed6 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -309,6 +309,19 @@ def print_rank_0(message): logger.info(message) +def load_pt(pt_path, to_cpu=False): + pt_path = os.path.realpath(pt_path) + check_file_or_directory_path(pt_path) + try: + if to_cpu: + pt = torch.load(pt_path, map_location=torch.device("cpu"), weights_only=True) + else: + pt = torch.load(pt_path, weights_only=True) + except Exception as e: + raise RuntimeError(f"load pt file {pt_path} failed") from e + return pt + + def save_pt(tensor, filepath): check_path_before_create(filepath) filepath = os.path.realpath(filepath) @@ -460,3 +473,28 @@ def replace_last_occurrence(text, old, new): if index != -1: return text[:index] + text[index:].replace(old, new, 1) return text + + +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 7595c866bf..7c1670dac7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -12,14 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os + +import torch from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory, load_yaml +from msprobe.core.common.file_utils import create_directory, load_yaml, FileChecker, FileCheckConst from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ set_dump_path from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.utils import set_stack_json_path from msprobe.pytorch.common.log import logger +from msprobe.pytorch.common.utils import load_pt class PTComparator(Comparator): @@ -50,6 +54,31 @@ class PTComparator(Comparator): return mapping_dict +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value + + def compare(input_param, output_path, **kwargs): try: auto_analyze = kwargs.get('auto_analyze', True) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index 303c083eaa..ac3a859bf4 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,6 +1,4 @@ -import unittest from unittest.mock import patch, mock_open, MagicMock -import tempfile import pytest @@ -534,36 +532,3 @@ class TestDirectoryChecks: check_file_or_directory_path(self.test_file, isdir=False) # Test directory path check_file_or_directory_path(self.test_dir, isdir=True) - - -class TestLoadPt(unittest.TestCase): - - def setUp(self): - self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') - tensor = torch.tensor([1, 2, 3]) - torch.save(tensor, self.temp_file.name) - - @patch('torch.load') - def test_load_pt_cpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=True) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) - - @patch('torch.load') - def test_load_pt_nogpu(self, mock_load): - mock_load.return_value = torch.tensor([1, 2, 3]) - result = load_pt(self.temp_file.name, to_cpu=False) - self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) - mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) - - @patch('torch.load') - def test_load_pt_failure(self, mock_load): - mock_load.side_effect = RuntimeError("Load failed") - with self.assertRaises(RuntimeError) as context: - load_pt(self.temp_file.name) - self.assertIn("load pt file", str(context.exception)) - - def tearDown(self): - if os.path.isfile(self.temp_file.name): - os.remove(self.temp_file.name) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index 61f7d97b55..b1ac148ae7 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -2,6 +2,7 @@ import os import io import unittest from unittest.mock import MagicMock, patch +import tempfile import torch import torch.distributed as dist @@ -9,9 +10,8 @@ import torch.distributed as dist from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData -from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, save_pt, save_api_data, \ - load_api_data, save_pkl, load_pkl +from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, get_tensor_rank, get_rank_id, \ + print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl class TestParameterAdapter(unittest.TestCase): @@ -148,6 +148,39 @@ class TestPrintRank0(unittest.TestCase): mock_logger_info.assert_called_once_with(message) +class TestLoadPt(unittest.TestCase): + + def setUp(self): + self.temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pt') + tensor = torch.tensor([1, 2, 3]) + torch.save(tensor, self.temp_file.name) + + @patch('torch.load') + def test_load_pt_cpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=True) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, map_location=torch.device("cpu"), weights_only=True) + + @patch('torch.load') + def test_load_pt_nogpu(self, mock_load): + mock_load.return_value = torch.tensor([1, 2, 3]) + result = load_pt(self.temp_file.name, to_cpu=False) + self.assertTrue(torch.equal(result, torch.tensor([1, 2, 3]))) + mock_load.assert_called_once_with(self.temp_file.name, weights_only=True) + + @patch('torch.load') + def test_load_pt_failure(self, mock_load): + mock_load.side_effect = RuntimeError("Load failed") + with self.assertRaises(RuntimeError) as context: + load_pt(self.temp_file.name) + self.assertIn("load pt file", str(context.exception)) + + def tearDown(self): + if os.path.isfile(self.temp_file.name): + os.remove(self.temp_file.name) + + class TestSavePT(unittest.TestCase): def setUp(self): -- Gitee From 349aed6026b4bd3227130b19e674dfc190c2f067 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Wed, 5 Mar 2025 14:29:37 +0800 Subject: [PATCH 34/37] compare read data read improve --- .../msprobe/core/compare/acc_compare.py | 6 -- .../msprobe/core/compare/utils.py | 3 +- .../msprobe/mindspore/common/utils.py | 2 +- .../msprobe/mindspore/compare/ms_compare.py | 3 +- .../run_ut/data_generate.py | 2 +- .../msprobe/pytorch/common/utils.py | 25 -------- .../pytorch/compare/distributed_compare.py | 2 +- .../core_ut/compare/test_acc_compare_utils.py | 59 +------------------ .../mindspore_ut/compare/test_ms_compare.py | 21 ++++++- .../tensor_transport_layer/test_attl.py | 3 +- .../pytorch_ut/compare/test_pt_compare.py | 39 +++++++++++- 11 files changed, 66 insertions(+), 99 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index d7d197fbdd..bc7c7bd888 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -15,7 +15,6 @@ import multiprocessing import os -import re from copy import deepcopy import pandas as pd @@ -379,18 +378,13 @@ class Comparator: bench_dir = input_param.get("bench_dump_data_dir") try: frame_name = getattr(self, "frame_name") - read_npy_data = getattr(self, "read_npy_data") - if frame_name == "MSComparator": n_value = read_npy_data(npu_dir, npu_data_name) if self.cross_frame: - b_value = read_npy_data(bench_dir, bench_data_name, load_pt_file=True) b_value = read_pt_data(bench_dir, bench_data_name) else: b_value = read_npy_data(bench_dir, bench_data_name) else: - n_value = read_npy_data(npu_dir, npu_data_name) - b_value = read_npy_data(bench_dir, bench_data_name) n_value = read_pt_data(npu_dir, npu_data_name) b_value = read_pt_data(bench_dir, bench_data_name) except IOError as error: diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 471951ce4b..8656daf7ca 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -19,12 +19,11 @@ import math import zlib from dataclasses import dataclass -import torch import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_npy +from msprobe.core.common.file_utils import check_file_or_directory_path def extract_json(dirname, stack_json=False): diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py index dc9da34490..b205dabc6a 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -196,4 +196,4 @@ def check_save_param(variable, name, save_backward): logger.warning("PrecisionDebugger.save_backward name not valid, " "should be bool. " "Skip current save process.") - raise ValueError + raise ValueError \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index b97e82fd30..2a36a7adb4 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -22,8 +22,7 @@ import pandas as pd from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory, load_json, load_npy, load_yaml -from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_yaml, load_npy, FileChecker, \ +from msprobe.core.common.file_utils import create_directory, load_json, load_yaml, load_npy, FileChecker, \ FileCheckConst from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \ diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index ec2a4b7165..9d89b2de32 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 7a3735a529..3fb9474fa9 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -473,28 +473,3 @@ def replace_last_occurrence(text, old, new): if index != -1: return text[:index] + text[index:].replace(old, new, 1) return text - - -def read_pt_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index 08e2f897a9..1b49df0653 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2014-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 5327237066..bf23f4de1d 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -4,19 +4,17 @@ import json import os import shutil import unittest -from unittest.mock import patch, MagicMock +from unittest.mock import patch import zlib -import torch import numpy as np -from msprobe.core.common.const import CompareConst, Const, FileCheckConst +from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \ - table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_pt_data, \ - read_npy_data + table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item # test_read_op_1 op_data = { @@ -856,54 +854,3 @@ class TestGenOpItem(unittest.TestCase): expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}" self.assertEqual(result['md5'], expected_md5) - - -class TestReadPtData(unittest.TestCase): - - @patch('msprobe.core.compare.utils.load_pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.pt') - def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_tensor = MagicMock() - mock_tensor.detach.return_value = mock_tensor - mock_tensor.to.return_value = mock_tensor - mock_tensor.dtype = torch.bfloat16 - mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) - mock_load_pt.return_value = mock_tensor - - result = read_pt_data('/fake/dir', 'file_name.pt') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) - mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) - mock_tensor.to.assert_called_once_with(torch.float32) - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('msprobe.core.compare.utils.load_pt') - def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_load_pt.side_effect = RuntimeError("Test Error") - - with self.assertRaises(CompareException): - read_pt_data('/fake/dir', 'file_name.pt') - - -class TestReadNpyData(unittest.TestCase): - - @patch('msprobe.core.compare.utils.load_npy') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.npy') - def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) - - result = read_npy_data('/fake/dir', 'file_name.npy') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) - mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py index 667fea2241..ad8a47e510 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py @@ -9,12 +9,12 @@ from unittest.mock import patch import numpy as np import pandas as pd -import torch import yaml from msprobe.core.common.utils import CompareException +from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.mindspore.compare.ms_compare import MappingConfig, MSComparator, check_cross_framework +from msprobe.mindspore.compare.ms_compare import MappingConfig, MSComparator, check_cross_framework, read_npy_data from msprobe.core.common.const import Const npu_dict = {'op_name': ['Functional.conv2d.0.forward.input.0', 'Functional.conv2d.0.forward.input.1', @@ -534,3 +534,20 @@ class TestUtilsMethods(unittest.TestCase): }) pd.testing.assert_frame_equal(result, expected) + + +class TestReadNpyData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_npy_data('/fake/dir', 'file_name.npy') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py index 0320c43d0b..7d4e6e950d 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py @@ -6,7 +6,6 @@ from multiprocessing import Queue from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import * from msprobe.core.common.file_utils import create_directory - class TestATTL(unittest.TestCase): def setUp(self): @@ -49,7 +48,7 @@ class TestATTL(unittest.TestCase): self.assertIsNone(result) @patch('glob.glob') - @patch('msprobe.core.common.file_utils.load_pt') + @patch('msprobe.pytorch.common.utils.load_pt') def test_download_with_exception(self, mock_load_pt, mock_glob): mock_glob.return_value = ['/tmp/start_file.pt'] mock_load_pt.side_effect = Exception('Load error') diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py index 4eda1d6d97..157dbc2d03 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py @@ -2,11 +2,14 @@ import os import shutil import unittest +from unittest.mock import patch, MagicMock import torch +import numpy as np from msprobe.core.common.utils import CompareException -from msprobe.pytorch.compare.pt_compare import compare +from msprobe.core.common.file_utils import FileCheckConst +from msprobe.pytorch.compare.pt_compare import compare, read_pt_data from msprobe.test.core_ut.compare.test_acc_compare import generate_dump_json, generate_stack_json @@ -62,3 +65,37 @@ class TestUtilsMethods(unittest.TestCase): with self.assertRaises(CompareException) as context: compare(input_param2, output_path) self.assertEqual(context.exception.code, 1) + + +class TestReadPtData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_pt_data('/fake/dir', 'file_name.pt') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.core.compare.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_pt_data('/fake/dir', 'file_name.pt') -- Gitee From d024503d81134cf9acba1642f1f860d33ce327e0 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Wed, 5 Mar 2025 17:13:07 +0800 Subject: [PATCH 35/37] compare read data read improve --- .../msprobe/core/compare/acc_compare.py | 16 ++----- .../msprobe/mindspore/compare/ms_compare.py | 25 +++++----- .../msprobe/mindspore/compare/utils.py | 30 ++++++++++++ .../msprobe/pytorch/__init__.py | 1 - .../msprobe/pytorch/compare/pt_compare.py | 35 +++----------- .../msprobe/pytorch/compare/utils.py | 47 +++++++++++++++++++ .../mindspore_ut/compare/test_ms_compare.py | 20 +------- .../test/mindspore_ut/compare/test_utils.py | 24 ++++++++++ .../pytorch_ut/compare/test_pt_compare.py | 39 +-------------- .../test/pytorch_ut/compare/test_utils.py | 43 +++++++++++++++++ 10 files changed, 167 insertions(+), 113 deletions(-) create mode 100644 debug/accuracy_tools/msprobe/mindspore/compare/utils.py create mode 100644 debug/accuracy_tools/msprobe/pytorch/compare/utils.py create mode 100644 debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py create mode 100644 debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_utils.py diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index bc7c7bd888..4ffbb225b0 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -33,8 +33,6 @@ from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _hand from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list -from msprobe.pytorch.compare.pt_compare import read_pt_data -from msprobe.mindspore.compare.ms_compare import read_npy_data class ModeConfig: @@ -352,6 +350,9 @@ class Comparator: result_df = self.make_result_table(result) return result_df + def read_real_data(self, npu_dir, npu_data_name, bench_dir, bench_data_name) -> tuple: + pass + def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param): """ :param npu_op_name: excel中的NPU_Name,例如:MintFunctional.conv2d.0.forward.input.3.0 @@ -377,16 +378,7 @@ class Comparator: npu_dir = input_param.get("npu_dump_data_dir") bench_dir = input_param.get("bench_dump_data_dir") try: - frame_name = getattr(self, "frame_name") - if frame_name == "MSComparator": - n_value = read_npy_data(npu_dir, npu_data_name) - if self.cross_frame: - b_value = read_pt_data(bench_dir, bench_data_name) - else: - b_value = read_npy_data(bench_dir, bench_data_name) - else: - n_value = read_pt_data(npu_dir, npu_data_name) - b_value = read_pt_data(bench_dir, bench_data_name) + n_value, b_value = self.read_real_data(npu_dir, npu_data_name, bench_dir, bench_data_name) except IOError as error: error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index 2a36a7adb4..0060a4d1be 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -22,8 +22,7 @@ import pandas as pd from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory, load_json, load_yaml, load_npy, FileChecker, \ - FileCheckConst +from msprobe.core.common.file_utils import create_directory, load_json, load_yaml from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \ check_op_str_pattern_valid, get_dump_mode, set_dump_path, detect_framework_by_dump_json @@ -31,6 +30,8 @@ from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.check import dtype_mapping from msprobe.core.compare.layer_mapping import generate_data_mapping_by_layer_mapping from msprobe.core.compare.utils import set_stack_json_path, reorder_op_x_list +from msprobe.pytorch.compare.utils import read_pt_data +from msprobe.mindspore.compare.utils import read_npy_data class MappingConfig: @@ -376,6 +377,14 @@ class MSComparator(Comparator): result['data_name'].append(data_name_reorder.pop(0)) return pd.DataFrame(result) + def read_real_data(self, npu_dir, npu_data_name, bench_dir, bench_data_name) -> tuple: + n_value = read_npy_data(npu_dir, npu_data_name) + if self.cross_frame: + b_value = read_pt_data(bench_dir, bench_data_name) + else: + b_value = read_npy_data(bench_dir, bench_data_name) + return n_value, b_value + def check_cross_framework(bench_json_path): framework = detect_framework_by_dump_json(bench_json_path) @@ -385,18 +394,6 @@ def check_cross_framework(bench_json_path): return False -def read_npy_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.NUMPY_SUFFIX, False) - data_path = path_checker.common_check() - data_value = load_npy(data_path) - return data_value - - def ms_compare(input_param, output_path, **kwargs): try: auto_analyze = kwargs.get('auto_analyze', True) diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/utils.py b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py new file mode 100644 index 0000000000..737cdb55d2 --- /dev/null +++ b/debug/accuracy_tools/msprobe/mindspore/compare/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from msprobe.core.common.file_utils import load_npy, FileChecker, FileCheckConst + + +def read_npy_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.NUMPY_SUFFIX, False) + data_path = path_checker.common_check() + data_value = load_npy(data_path) + return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/__init__.py b/debug/accuracy_tools/msprobe/pytorch/__init__.py index ce84e6b35b..20fbfeed0f 100644 --- a/debug/accuracy_tools/msprobe/pytorch/__init__.py +++ b/debug/accuracy_tools/msprobe/pytorch/__init__.py @@ -15,7 +15,6 @@ import torch from .compare.distributed_compare import compare_distributed -from .compare.pt_compare import compare from .common.utils import seed_all from .debugger.precision_debugger import PrecisionDebugger, module_dump, module_dump_end diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 7c1670dac7..38176ec57c 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -12,18 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os - -import torch from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory, load_yaml, FileChecker, FileCheckConst +from msprobe.core.common.file_utils import create_directory, load_yaml from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ set_dump_path from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.utils import set_stack_json_path from msprobe.pytorch.common.log import logger -from msprobe.pytorch.common.utils import load_pt +from msprobe.pytorch.compare.utils import read_pt_data class PTComparator(Comparator): @@ -53,30 +50,10 @@ class PTComparator(Comparator): mapping_dict = {} return mapping_dict - -def read_pt_data(dir_path, file_name): - if not file_name: - return None - - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value + def read_real_data(self, npu_dir, npu_data_name, bench_dir, bench_data_name) -> tuple: + n_value = read_pt_data(npu_dir, npu_data_name) + b_value = read_pt_data(bench_dir, bench_data_name) + return n_value, b_value def compare(input_param, output_path, **kwargs): diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/utils.py b/debug/accuracy_tools/msprobe/pytorch/compare/utils.py new file mode 100644 index 0000000000..16473ff386 --- /dev/null +++ b/debug/accuracy_tools/msprobe/pytorch/compare/utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025-2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch + +from msprobe.core.common.utils import logger, CompareException +from msprobe.core.common.file_utils import FileChecker, FileCheckConst +from msprobe.pytorch.common.utils import load_pt + + +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py index ad8a47e510..4a3f01f988 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py @@ -12,9 +12,8 @@ import pandas as pd import yaml from msprobe.core.common.utils import CompareException -from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.mindspore.compare.ms_compare import MappingConfig, MSComparator, check_cross_framework, read_npy_data +from msprobe.mindspore.compare.ms_compare import MappingConfig, MSComparator, check_cross_framework from msprobe.core.common.const import Const npu_dict = {'op_name': ['Functional.conv2d.0.forward.input.0', 'Functional.conv2d.0.forward.input.1', @@ -534,20 +533,3 @@ class TestUtilsMethods(unittest.TestCase): }) pd.testing.assert_frame_equal(result, expected) - - -class TestReadNpyData(unittest.TestCase): - - @patch('msprobe.core.compare.utils.load_npy') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.npy') - def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' - - mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) - - result = read_npy_data('/fake/dir', 'file_name.npy') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) - mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py new file mode 100644 index 0000000000..80da434de6 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py @@ -0,0 +1,24 @@ +import unittest +from unittest.mock import patch + +import numpy as np + +from msprobe.core.common.file_utils import FileCheckConst +from msprobe.mindspore.compare.utils import read_npy_data + + +class TestReadNpyData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_npy_data('/fake/dir', 'file_name.npy') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py index 157dbc2d03..4eda1d6d97 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py @@ -2,14 +2,11 @@ import os import shutil import unittest -from unittest.mock import patch, MagicMock import torch -import numpy as np from msprobe.core.common.utils import CompareException -from msprobe.core.common.file_utils import FileCheckConst -from msprobe.pytorch.compare.pt_compare import compare, read_pt_data +from msprobe.pytorch.compare.pt_compare import compare from msprobe.test.core_ut.compare.test_acc_compare import generate_dump_json, generate_stack_json @@ -65,37 +62,3 @@ class TestUtilsMethods(unittest.TestCase): with self.assertRaises(CompareException) as context: compare(input_param2, output_path) self.assertEqual(context.exception.code, 1) - - -class TestReadPtData(unittest.TestCase): - - @patch('msprobe.core.compare.utils.load_pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('os.path.join', return_value='/fake/path/to/file.pt') - def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_tensor = MagicMock() - mock_tensor.detach.return_value = mock_tensor - mock_tensor.to.return_value = mock_tensor - mock_tensor.dtype = torch.bfloat16 - mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) - mock_load_pt.return_value = mock_tensor - - result = read_pt_data('/fake/dir', 'file_name.pt') - - mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) - mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) - mock_tensor.to.assert_called_once_with(torch.float32) - self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) - - @patch('os.path.join', return_value='/fake/path/to/file.pt') - @patch('msprobe.core.compare.utils.FileChecker') - @patch('msprobe.core.compare.utils.load_pt') - def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): - mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' - - mock_load_pt.side_effect = RuntimeError("Test Error") - - with self.assertRaises(CompareException): - read_pt_data('/fake/dir', 'file_name.pt') diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_utils.py new file mode 100644 index 0000000000..405503d898 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_utils.py @@ -0,0 +1,43 @@ +import unittest +from unittest.mock import patch, MagicMock + +import torch +import numpy as np + +from msprobe.core.common.utils import CompareException +from msprobe.core.common.file_utils import FileCheckConst +from msprobe.pytorch.compare.utils import read_pt_data + + +class TestReadPtData(unittest.TestCase): + + @patch('msprobe.pytorch.compare.utils.load_pt') + @patch('msprobe.pytorch.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_pt_data('/fake/dir', 'file_name.pt') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.pytorch.compare.utils.FileChecker') + @patch('msprobe.pytorch.compare.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_pt_data('/fake/dir', 'file_name.pt') -- Gitee From da302080011a83c64303e231d6aa1464796c08bc Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Wed, 5 Mar 2025 17:13:38 +0800 Subject: [PATCH 36/37] compare read data read improve --- .../msprobe/test/mindspore_ut/compare/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py index 80da434de6..d7fb5e38fb 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_utils.py @@ -9,8 +9,8 @@ from msprobe.mindspore.compare.utils import read_npy_data class TestReadNpyData(unittest.TestCase): - @patch('msprobe.core.compare.utils.load_npy') - @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.mindspore.compare.utils.load_npy') + @patch('msprobe.mindspore.compare.utils.FileChecker') @patch('os.path.join', return_value='/fake/path/to/file.npy') def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' -- Gitee From b91200ea28209ffac0b3ae03e3bc2546da3205c9 Mon Sep 17 00:00:00 2001 From: Linwei-Ying Date: Wed, 5 Mar 2025 17:48:41 +0800 Subject: [PATCH 37/37] compare read data read improve --- debug/accuracy_tools/msprobe/core/compare/acc_compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index 4ffbb225b0..5e646c5352 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -351,7 +351,7 @@ class Comparator: return result_df def read_real_data(self, npu_dir, npu_data_name, bench_dir, bench_data_name) -> tuple: - pass + return None, None def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param): """ -- Gitee