diff --git a/debug/accuracy_tools/msprobe/README.md b/debug/accuracy_tools/msprobe/README.md
index e31490f01e9f9d61504d9ee2311c82497323d886..6b7d483078a6a744ce935591ced0971dea2f5b2f 100644
--- a/debug/accuracy_tools/msprobe/README.md
+++ b/debug/accuracy_tools/msprobe/README.md
@@ -44,6 +44,7 @@ export MSPROBE_LOG_LEVEL={x}
- msprobe支持AscendPyTorch 1.11.0或更高版本,支持的PyTorch和CANN以及PyTorch和python软件版本配套关系请参见《[Ascend Extension for PyTorch插件](https://gitee.com/ascend/pytorch)》。
- msprobe支持MindSpore 2.4.0或更高版本,支持的MindSpore和CANN以及MindSpore和python软件版本配套关系请参见《[MindSpore版本发布列表](https://www.mindspore.cn/versions)》。
+- msprobe支持MSAdapter 2.1.0。
- msprobe支持的固件驱动版本与配套CANN软件支持的固件驱动版本相同,开发者可通过“[昇腾社区-固件与驱动](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fhardware%2Ffirmware-drivers%2Fcommunity%3Fproduct%3D2%26model%3D28%26cann%3D8.0.RC3.alpha003%26driver%3D1.0.25.alpha)”页面根据产品型号与CANN软件版本获取配套的固件与驱动。
@@ -69,15 +70,17 @@ export MSPROBE_LOG_LEVEL={x}
### 1 数据采集
-msprobe 通过在训练脚本中添加 PrecisionDebugger 接口的方式对 API 执行精度数据 dump 操作,对应 config.json 中的 task 为 statistics 或 tensor。
+msprobe 通过在训练脚本中添加 PrecisionDebugger 接口的方式对 API 执行精度数据 dump 操作。对应 config.json 中的 "statistics" 或 "tensor" task。
[PyTorch 场景的数据采集](./docs/05.data_dump_PyTorch.md)
[MindSpore 场景的数据采集](./docs/06.data_dump_MindSpore.md)
+[MSAdapter 场景的数据采集](./docs/29.data_dump_MSAdapter.md)
+
### 2 精度预检
-精度预检旨在昇腾 NPU 上扫描训练模型中的所有 API 进行 API 复现,给出精度情况的诊断和分析。对应 config.json 中的 task 为 run_ut。
+精度预检旨在昇腾 NPU 上扫描训练模型中的所有 API 进行 API 复现,给出精度情况的诊断和分析。对应 config.json 中的 "run_ut" task。
PyTorch 场景的[离线预检](./docs/07.accuracy_checker_PyTorch.md)和[在线预检](./docs/08.accuracy_checker_online_PyTorch.md)
@@ -143,12 +146,14 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore.
### 12 溢出检测与解析
-溢出检测与解析是在执行精度数据 dump 时,判断是否存在输入正常但输出存在溢出的 API,从而判断是否为正常溢出。对应 config.json 中的 overflow_check。
-推荐直接使用[数据采集](#1-数据采集)功能采集统计量信息检测溢出问题。
+溢出检测用于采集溢出 API 或 模块的精度数据,而溢出解析则是通过对溢出数据的分析,进一步判断是否为正常溢出。对应 config.json 中的 "overflow_check" task。
+推荐直接使用[数据采集](#1-数据采集)功能采集统计量信息,检测溢出问题。
[PyTorch 场景的溢出检测与解析](./docs/12.overflow_check_PyTorch.md)
-[MindSpore 场景的溢出检测与解析](./docs/13.overflow_check_MindSpore.md)
+[MindSpore 场景的溢出检测](./docs/13.overflow_check_MindSpore.md)
+
+[MSAdapter 场景的溢出检测](./docs/30.overflow_check_MSAdapter.md)
## 📑 补充材料
diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py
index fdc626ca6a1a90e9060cefa237f9d5d8d7e42844..89d33a6a3e6fe830b981483edbe2aa6a4e5aa41f 100644
--- a/debug/accuracy_tools/msprobe/core/common/file_utils.py
+++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,6 +23,7 @@ import shutil
from datetime import datetime, timezone
from dateutil import parser
import yaml
+
import numpy as np
import pandas as pd
@@ -446,8 +447,6 @@ def save_excel(path, data):
change_mode(path, FileCheckConst.DATA_FILE_AUTHORITY)
-
-
def move_file(src_path, dst_path):
check_file_or_directory_path(src_path)
check_path_before_create(dst_path)
diff --git a/debug/accuracy_tools/msprobe/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py
index c06b5b64927bf47da1573df3b1d4db34dfa24cb1..7ec0490168f3ec3c39afcd0915f85609e39f0030 100644
--- a/debug/accuracy_tools/msprobe/core/common/utils.py
+++ b/debug/accuracy_tools/msprobe/core/common/utils.py
@@ -247,6 +247,10 @@ def md5_find(data):
def detect_framework_by_dump_json(file_path):
+ json_data = load_json(file_path)
+ framework = json_data.get("framework", None)
+ if framework in [Const.PT_FRAMEWORK, Const.MS_FRAMEWORK]:
+ return framework
pattern_ms = r'"type":\s*"mindspore'
pattern_pt = r'"type":\s*"torch'
with FileOpen(file_path, 'r') as file:
diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
index f0ac97a0293b5a7ec95b61a4805af179a087eafc..28a7b5f3a87aa1ff9cac73672f55e5d7e4f5407c 100644
--- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
+++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py
@@ -34,6 +34,8 @@ from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _hand
from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg
from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \
print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list
+from msprobe.pytorch.compare.pt_compare import read_pt_data
+from msprobe.mindspore.compare.ms_compare import read_npy_data
class ModeConfig:
@@ -329,7 +331,9 @@ class Comparator:
else:
result_item.append(CompareConst.NONE)
if self.dump_mode == Const.ALL:
- result_item.append(npu_ops_all.get(ms_op_name).get("data_name", None))
+ ms_data_name = npu_ops_all.get(ms_op_name).get("data_name", None)
+ pt_data_name = bench_ops_all.get(bench_op_name).get("data_name", None)
+ result_item.append([ms_data_name, pt_data_name])
result.append(result_item)
elif ms_op_name not in npu_ops_all:
logger.warning(f'Can not find npu op name : `{ms_op_name}` in npu dump json file.')
@@ -349,47 +353,48 @@ class Comparator:
result_df = self.make_result_table(result)
return result_df
- def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param, bench_data):
+ def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param):
"""
:param npu_op_name: excel中的NPU_Name,例如:MintFunctional.conv2d.0.forward.input.3.0
:param bench_op_name: excel中的Bench_Name,例如:Functional.conv2d.0.forward.input.3.0
:param op_name_mapping_dict: op_name和npy或pt文件的映射关系
:param input_param: npu_json_path/bench_json_path/stack_json_path等参数
- :param bench_data: bench的dump数据中"data"字段
:return: result_list,包含余弦相似度、最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率和错误信息
- 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、
+    用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离、
最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息
"""
- npu_bench_name_list = op_name_mapping_dict[npu_op_name]
- data_name = safe_get_value(npu_bench_name_list, 1, "npu_bench_name_list")
error_file, relative_err, error_flag = None, None, False
- bench_data_name = get_bench_data_name(bench_op_name, bench_data)
- if data_name == '-1' or data_name == -1: # 没有真实数据路径
- n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE
- error_flag = True
- elif not bench_data_name:
+
+ data_name_pair = op_name_mapping_dict.get(npu_op_name)
+ npu_data_name = data_name_pair[0]
+ bench_data_name = data_name_pair[1]
+
+ if str(npu_data_name) == '-1': # 没有npu真实数据
+ n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
+ elif str(bench_data_name) == '-1': # 没有bench真实数据
n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True
error_file = 'no_bench_data'
else:
+ npu_dir = input_param.get("npu_dump_data_dir")
+ bench_dir = input_param.get("bench_dump_data_dir")
try:
- read_npy_data = getattr(self, "read_npy_data")
frame_name = getattr(self, "frame_name")
+
if frame_name == "MSComparator":
- n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.NUMPY_SUFFIX)
+ n_value = read_npy_data(npu_dir, npu_data_name)
if self.cross_frame:
- b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name,
- load_pt_file=True)
+ b_value = read_pt_data(bench_dir, bench_data_name)
else:
- b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name)
+ b_value = read_npy_data(bench_dir, bench_data_name)
else:
- n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.PT_SUFFIX)
- b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name)
+ n_value = read_pt_data(npu_dir, npu_data_name)
+ b_value = read_pt_data(bench_dir, bench_data_name)
except IOError as error:
error_file = error.filename
n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE
error_flag = True
except (FileCheckException, CompareException):
- error_file = data_name
+ error_file = npu_data_name
n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE
error_flag = True
@@ -464,7 +469,7 @@ class Comparator:
err_mess = []
is_print_compare_log = input_param.get("is_print_compare_log")
- bench_data = load_json(input_param.get("bench_json_path")).get('data')
+
for i in range(len(result_df)):
npu_op_name = result_df.iloc[i, 0]
bench_op_name = result_df.iloc[i, 1]
@@ -472,7 +477,7 @@ class Comparator:
logger.info("start compare: {}".format(npu_op_name))
cos_sim, euc_dist, max_abs_err, max_relative_err, one_thousand_err_ratio, five_thousand_err_ratio, err_msg \
- = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param, bench_data)
+ = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param)
if is_print_compare_log:
logger.info(
@@ -508,46 +513,3 @@ class Comparator:
except ValueError as e:
logger.error('result dataframe is not found.')
raise CompareException(CompareException.INVALID_DATA_ERROR) from e
-
-
-def get_bench_data_name(bench_op_name, bench_data):
- bench_name_list = re.split(r'\.(input|output|kwargs|parameters|parameters_grad)\.', bench_op_name)
- if len(bench_name_list) > 1 and bench_name_list[1] == Const.PARAMS_GRAD:
- bench_data_bundle = bench_data.get(bench_name_list[0] + Const.SEP + bench_name_list[1], {})
- else:
- bench_data_bundle = bench_data.get(bench_name_list[0], {})
- if not bench_data_bundle or len(bench_name_list) < 3:
- return None
- layers = bench_name_list[2].split(Const.SEP)
-
- def _get(key, container):
- if isinstance(container, dict):
- return container.get(key)
- if isinstance(container, list):
- try:
- return container[int(key)]
- except (ValueError, IndexError):
- return None
- return None
-
- def get_by_layer(container, params_grad=False):
- data = container
- # dump.json中parameters_grad的结构为key:[{}], 如果存在key,有且只有一个列表元素,而op_name中只命名到了key,因此加'0'
- if params_grad:
- layers.append('0')
- for layer in layers:
- data = _get(layer, data)
- return _get(CompareConst.DATA_NAME.lower(), data)
-
- if Const.INPUT == bench_name_list[1]:
- return get_by_layer(bench_data_bundle.get(Const.INPUT, bench_data_bundle.get(Const.INPUT_ARGS)))
- elif Const.KWARGS == bench_name_list[1]:
- return get_by_layer(bench_data_bundle.get(Const.INPUT_KWARGS))
- elif Const.OUTPUT == bench_name_list[1]:
- return get_by_layer(bench_data_bundle.get(Const.OUTPUT))
- elif Const.PARAMS == bench_name_list[1]:
- return get_by_layer(bench_data_bundle.get(Const.PARAMS))
- elif Const.PARAMS_GRAD == bench_name_list[1]:
- return get_by_layer(bench_data_bundle, params_grad=True)
- else:
- return None
diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
index f79671827c1efc30f3f0a573e23d9d72f2fbd289..71b0f29d64f717adc87b74cf48e891652e9e753f 100644
--- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py
@@ -25,7 +25,7 @@ from msprobe.core.common.utils import CompareException
from msprobe.core.common.const import CompareConst
-def _handle_multi_process(func, input_parma, result_df, lock):
+def _handle_multi_process(func, input_param, result_df, lock):
process_num = max(int((multiprocessing.cpu_count() + 1) // 4), 1)
op_name_mapping_dict = read_dump_data(result_df)
@@ -55,7 +55,7 @@ def _handle_multi_process(func, input_parma, result_df, lock):
idx = df_chunk_size * process_idx
chunk_size = len(df_chunk)
result = pool.apply_async(func,
- args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma),
+ args=(idx, op_name_mapping_dict, df_chunk, lock, input_param),
error_callback=err_call,
callback=partial(update_progress, chunk_size, lock)
)
@@ -97,12 +97,12 @@ def _ms_graph_handle_multi_process(func, result_df, mode):
def read_dump_data(result_df):
try:
npu_dump_name_list = result_df.iloc[0:, 0].tolist()
- npu_dump_tensor_list = result_df.iloc[0:, -1].tolist()
+ dump_tensor_pair_list = result_df.iloc[0:, -1].tolist()
op_name_mapping_dict = {}
for index, _ in enumerate(npu_dump_name_list):
npu_dump_name = npu_dump_name_list[index]
- npu_dump_tensor = npu_dump_tensor_list[index]
- op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor]
+ dump_tensor_pair = dump_tensor_pair_list[index]
+ op_name_mapping_dict[npu_dump_name] = dump_tensor_pair
return op_name_mapping_dict
except ValueError as e:
logger.error('result dataframe is not found.')
diff --git a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py
index 72b75ab254e59a4ec5788e95fde6721df2babe46..471951ce4b5fd12d6fdff26a1584e261ea86d71c 100644
--- a/debug/accuracy_tools/msprobe/core/compare/utils.py
+++ b/debug/accuracy_tools/msprobe/core/compare/utils.py
@@ -19,11 +19,12 @@ import math
import zlib
from dataclasses import dataclass
+import torch
import numpy as np
from msprobe.core.common.const import Const, CompareConst, FileCheckConst
from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value
-from msprobe.core.common.file_utils import check_file_or_directory_path
+from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_npy
def extract_json(dirname, stack_json=False):
@@ -321,8 +322,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
has_stack = npu_stack_info and bench_stack_info
if dump_mode == Const.ALL:
- npu_data_name = n_dict.get("data_name", None)
- bench_data_name = b_dict.get("data_name", None)
+ npu_data_name_list = n_dict.get("data_name", None)
+ bench_data_name_list = b_dict.get("data_name", None)
for index in range(min_len):
n_name = safe_get_value(n_dict, n_start + index, "n_dict", key="op_name")
@@ -353,7 +354,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
result_item.append(err_msg)
result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info)
if dump_mode == Const.ALL:
- result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name"))
+ npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list")
+ bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list")
+ result_item.append([npu_data_name, bench_data_name])
result.append(result_item)
@@ -388,7 +391,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode):
result_item.append(err_msg)
result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info)
if dump_mode == Const.ALL:
- result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name"))
+ npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list")
+ bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list")
+ result_item.append([npu_data_name, bench_data_name])
result.append(result_item)
@@ -467,7 +472,7 @@ def get_un_match_accuracy(result, n_dict, dump_mode):
result_item.append(err_msg)
append_stack_info(result_item, npu_stack_info, index)
if dump_mode == Const.ALL and result_item[1] == CompareConst.N_A:
- result_item.extend(["-1"])
+ result_item.extend([["-1", "-1"]])
result.append(result_item)
diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
index 2cd98b125682434b517f6d70e09ea6a850b3e3bb..c3c0c3aa734bc9effadab8a591e314610218611d 100644
--- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py
@@ -29,7 +29,7 @@ from msprobe.core.common.log import logger
from msprobe.core.common.utils import convert_tuple
from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \
ModuleForwardInputsOutputs, TensorStatInfo
-from msprobe.pytorch.common.utils import save_pt, load_pt
+from msprobe.pytorch.common.utils import save_pt
from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow
from msprobe.core.common.utils import recursion_depth_decorator
@@ -145,7 +145,7 @@ class PytorchDataProcessor(BaseDataProcessor):
if data.is_meta:
return tensor_stat
data_clone = data.detach()
- if data_clone.numel() == 0:
+ if not data_clone.numel() or not data_clone.data_ptr():
return tensor_stat
else:
if data_clone.device.type == Const.CPU_LOWERCASE or not async_dump:
diff --git a/debug/accuracy_tools/msprobe/docs/01.installation.md b/debug/accuracy_tools/msprobe/docs/01.installation.md
index 1ab5f6419ba07ec749bad139f874fbc7301fd8b3..530783e87d0bdadd51856cb1ae08160cb081da80 100644
--- a/debug/accuracy_tools/msprobe/docs/01.installation.md
+++ b/debug/accuracy_tools/msprobe/docs/01.installation.md
@@ -16,7 +16,7 @@ pip install mindstudio-probe
|版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码|
|:--:|:--:|:--:|:--:|:--:|:--:|
-|1.2.2|2025.2.26|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|1db0cf4572bc0305c68705b74775f652c6cb2c2bedb6c6e57f43e31ab273b288|
+|1.2.2|2025.3.03|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|961411bb460d327ea51d6ca4d0c8e8c5565f07c0852d7b8592b781ca35b87212|
|1.2.1|2025.2.07|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.1-py3-none-any.whl)|b64b342118558e0339b39237f88a49b93fd24551b0cb202c872fbfef4260c86b|
|1.2.0|2025.1.13|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.0-py3-none-any.whl)|1e3aeea1706112f6ee52fd1165037936bb209138f0b9ec42ea21e2c1c8942cdc|
|1.1.1|2024.12.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.1-py3-none-any.whl)|577b597555dc155b76ba1a62d575c3546004644e140a456c3ba0824d46283735|
diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
index f134bd4536294d209e7b3e6e73fd80b9be61041d..a5f17637dae817de6eba091e9eef602ca95f091a 100644
--- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
+++ b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md
@@ -12,23 +12,23 @@
| 参数 | 解释 | 是否必选 |
| ----------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- |
-| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 |
+| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控,不支持 MSAdapter 场景;<br>
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 |
| dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 |
| rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 |
| step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 |
-| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch 与 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore场景详细介绍见 [MindSpore 场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 |
+| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch 场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore 动态图场景详细介绍见 [MindSpore 动态图场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);MindSpore 静态图场景详细介绍见《MindSpore 场景的数据采集》中的 ["**8.1 静态图场景**"](./06.data_dump_MindSpore.md#81-静态图场景)小节;
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 |
| enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 |
| async_dump | 异步 dump 开关,bool 类型。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor
的统计量计算。 | 否 |
#### 1.1.1 模块级精度数据 dump 说明
-仅 PyTorch 与 MindSpore 动态图场景支持。
+仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。
大模型场景下,通常不是简单的利用自动迁移能力实现从 GPU 到 NPU 的训练脚本迁移,而是会对 NPU 网络进行一系列针对性的适配,因此,常常会造成迁移后的 NPU 模型存在部分子结构不能与 GPU 原始模型完全对应。模型结构不一致导致 API 调用类型及数量不一致,若直接按照 API 粒度进行精度数据 dump 和比对,则无法完全比对所有的 API。
本小节介绍的功能是对模型中的大粒度模块进行数据 dump,使其比对时,对于无法以 API 粒度比对的模块可以直接以模块粒度进行比对。
-模块指的是继承 nn.Module 类(PyTorch场景)或 nn.Cell 类(MindSpore场景)的子类,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump 数据时以模块为粒度进行 dump。
+模块指的是继承 nn.Module 类(PyTorch 与 MSAdapter 场景)或 nn.Cell 类(MindSpore 场景)的子类,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump 数据时以模块为粒度进行 dump。
@@ -36,21 +36,23 @@
参数 | 解释 | 是否必选 |
- scope | PyTorch 和 MindSpore 动态图场景 dump 范围,list[str] 类型,默认未配置(list 也未配置时表示 dump 所有 API 的数据)。该参数可以在 [ ] 内配置两个模块名或 API 名,要求列表长度必须为2,需要配置按照工具命名格式的完整模块名或API名称,用于锁定区间,dump 该范围内的数据。 配置示例:
+ |
scope | PyTorch、MSAdapter 以及 MindSpore 动态图场景 dump 范围,list[str] 类型,默认未配置(list 也未配置时表示 dump 所有 API 的数据)。该参数可以在 [ ] 内配置两个模块名或 API 名,要求列表长度必须为2,需要配置按照工具命名格式的完整模块名或API名称,用于锁定区间,dump 该范围内的数据。 配置示例:
"scope": ["Module.conv1.Conv2d.forward.0", "Module.fc2.Linear.forward.0"],
或 "scope": ["Cell.conv1.Conv2d.forward.0", "Cell.fc2.Dense.backward.0"], 或"scope": ["Tensor.add.0.forward", "Functional.square.2.forward"]。与 level 参数取值相关,level 为 L0 级别时,可配置模块名;level 为 L1 级别时,可配置 API 名, level为 mix 级别时,可配置为模块名或API名。 | 否 |
list | 自定义采集的算子列表,list[str] 类型,默认未配置(scope 也未配置时表示 dump 所有 API 的数据),包含以下配置方法: | 否 |
- PyTorch 和 MindSpore 动态图场景配置具体的 API 全称,dump 该 API 数据。在 PyTorch 场景,如果 level 配置成 L2,该配置为必填项。 配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。 PyTorch 和 MindSpore 动态图场景在level为 mix 级别时可以配置模块名称,dump该模块展开数据 (dump该模块从执行开始到执行结束期间的所有数据)。
+ |
PyTorch、MSAdapter 以及 MindSpore 动态图场景配置具体的 API 全称,dump 该 API 数据。在 PyTorch 场景,如果 level 配置成 L2,该配置为必填项。 配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。 PyTorch 和 MindSpore 动态图场景在level为 mix 级别时可以配置模块名称,dump该模块展开数据 (dump该模块从执行开始到执行结束期间的所有数据)。
配置示例:"list": ["Module.module.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"], 或 "list": ["Cell.network_with_loss.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"] |
- PyTorch 和 MindSpore 动态图场景指定某一类 API,dump 某一类的 API 级别输入输出数据。 配置示例:"list": ["relu"]。 PyTorch 和 MindSpore 动态图场景在level为 mix 级别时, 会dump名称中包含list中配置的字符串的API数据,还会将名称中包含list中配置的字符串的模块进行展开dump (dump该模块从执行开始到执行结束期间的所有数据)。 |
- MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。 配置示例:list: ["name-regex(Default/.+)"] 可匹配算子名称以“Default/”开头的所有算子。 |
+ PyTorch、MSAdapter 以及 MindSpore 动态图场景指定某一类 API,dump 某一类的 API 级别输入输出数据。 配置示例:"list": ["relu"]。 PyTorch、MSAdapter 以及 MindSpore 动态图场景在level为 mix 级别时, 会dump名称中包含list中配置的字符串的API数据,还会将名称中包含list中配置的字符串的模块进行展开dump (dump该模块从执行开始到执行结束期间的所有数据)。 |
+ MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型(jit_level=O2 时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。 配置示例:list: ["name-regex(Default/.+)"] 可匹配算子名称以“Default/”开头的所有算子。 |
data_mode | dump 数据过滤,str 类型。 | 否 |
- PyTorch 与 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。 配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。 |
+ PyTorch、MSAdapter 以及 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。 配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。 |
MindSpore 静态图场景:仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。 配置示例:"data_mode": ["all"]。 |
- summary_mode | 控制 dump 文件输出的模式,str 类型,仅 PyTorch 与 MindSpore 动态图场景支持,可选参数: md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性; statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。 配置示例:"summary_mode": "md5"。 | 否 |
MindSpore静态图jit_level=O2场景L2级dump,支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。 配置示例:"summary_mode": ["max", "min"]。 |
+ summary_mode | 控制 dump 文件输出的模式,str 类型,支持 PyTorch、MSAdapter、MindSpore 动态图以及 MindSpore 静态图 jit_level=O2 场景。 | 否 |
+ PyTorch、MSAdapter 以及 MindSpore 动态图场景:可选参数为 md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性; statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。 配置示例:"summary_mode": "md5"。 |
+ MindSpore 静态图 jit_level=O2 场景:支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。 配置示例:"summary_mode": ["max", "min"]。 |
-**说明**:"summary_mode"配置为"md5"时,所使用的校验算法为CRC-32算法。
+**说明**:"summary_mode" 配置为 "md5" 时,所使用的校验算法为 CRC-32 算法。
### 1.3 task 配置为 tensor
@@ -86,16 +88,16 @@
### 1.5 task 配置为 overflow_check
-PyTorch 与 MindSpore 动态图场景下,"level"须为"L0"或"L1";MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O2"。
+PyTorch、MSAdapter 以及 MindSpore 动态图场景下,"level"须为"L0"或"L1";MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O2"。
| 参数 | 解释 | 是否必选 |
| ------------- | ---------------------- | -------- |
| overflow_nums | 最大溢出次数,int 类型,默认为 1,仅 PyTorch 与 MindSpore 动态图场景支持。表示第 N 次溢出后,不再进行溢出检测。过程中检测到溢出 API 对应的 输入输出 数据均 dump。
**配置示例**:"overflow_nums": 3。配置为 -1 时,表示持续检测溢出直到训练结束。 | 否 |
-| check_mode | 溢出类型,str 类型,仅 MindSpore 场景支持,可选参数:
"aicore":开启 AI Core 的溢出检测,不支持 MindSpore v2.3.0 以上版本;
"atomic":开启 Atomic 的溢出检测,不支持 MindSpore v2.3.0 以上版本;
"all":开启算子的溢出检测,默认值。
**配置示例**:"check_mode": "all"。 | 否 |
+| check_mode | 溢出类型,str 类型,仅 MindSpore v2.3.0 以下版本的静态图场景支持,可选参数:
"aicore":开启 AI Core 的溢出检测;
"atomic":开启 Atomic 的溢出检测;
"all":开启算子的溢出检测,默认值。
**配置示例**:"check_mode": "all"。 | 否 |
### 1.6 task 配置为 free_benchmark
-仅 PyTorch 场景与 MindSpore 动态图场景支持,且"level"为"L1"。
+仅 PyTorch 与 MindSpore 动态图场景支持,且"level"为"L1"。
- task 配置为 free_benchmark 时,开启**无标杆比对**,在 NPU 环境下通过对当前模型 API 的输入添加扰动因子,二次执行,将得到的输出与未添加扰动因子前的输出进行比对,从而**得出该模型中可能存在因迁移等变化导致精度降低的 API**。
diff --git a/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md b/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md
index db9a989c9d1c731fd9099d311f3ab3b95e5c7d5d..e45be7736b92c1a8b25711fd68b50e2cdec9d53e 100644
--- a/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md
@@ -183,7 +183,7 @@ save(variable, name, save_backward=True)
**参数说明**:
| 参数名称 | 参数含义 | 支持数据类型 | 是否必选|
| ---------- | ------------------| ------------------- | ------------------- |
-| variable | 需要保存的变量 |dict, list, torch.tensor, int, float, str | 是 |
+| variable | 需要保存的变量 |dict, list, tuple, torch.tensor, int, float, str | 是 |
| name | 指定的名称 | str | 是 |
| save_backward | 是否保存反向数据 | boolean | 否 |
@@ -355,7 +355,7 @@ if __name__ == "__main__":
```
* `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。
* `dump_tensor_data`:保存采集到的张量数据。
-* `dump.json`: 保存API或Module前反向数据的统计量信息。包含dump数据的API名称或Module名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#1-dumpjson文件介绍pytorch)。
+* `dump.json`: 保存API或Module前反向数据的统计量信息。包含dump数据的API名称或Module名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#1-pytorch场景下的dumpjson文件)。
* `stack.json`:API/Module的调用栈信息。
* `construct.json`:分层分级结构,level为L1时,construct.json内容为空。
diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
index f7507facd2a92f3acbefdc92fa6cd808a155d6e3..158c5e3011e72a5b7feb1458ca6c2d79bc157606 100644
--- a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md
@@ -144,7 +144,7 @@ save(variable, name, save_backward=True)
**参数说明**:
| 参数名称 | 参数含义 | 支持数据类型 | 是否必选|
| ---------- | ------------------| ------------------- | ------------------- |
-| variable | 需要保存的变量 |dict, list, torch.tensor, int, float, str | 是 |
+| variable | 需要保存的变量 |dict, list, tuple, torch.tensor, int, float, str | 是 |
| name | 指定的名称 | str | 是 |
| save_backward | 是否保存反向数据 | boolean | 否 |
@@ -372,7 +372,7 @@ dump 结果目录结构示例如下:
* `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。
* `dump_tensor_data`:保存采集到的张量数据。
-* `dump.json`: 保存API或Cell前反向数据的统计量信息。包含dump数据的API名称或Cell名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#2-dumpjson文件示例mindspore)。
+* `dump.json`: 保存API或Cell前反向数据的统计量信息。包含dump数据的API名称或Cell名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#2-mindspore场景下的dumpjson文件)。
* `stack.json`:API/Cell的调用栈信息。
* `construct.json`:分层分级结构,level为L1时,construct.json内容为空。
diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
index a5f83d8dfcbc7645691a8753c105fb7552522bf1..6f886215b0a389582bc3cc4c31943f76e6a414a3 100644
--- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md
@@ -257,11 +257,11 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算
统计量有 4 种:最大值(max)、最小值(min)、平均值(mean)和 L2-范数(L2 norm)。
-|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)|Data_Name (NPU 真实数据名)|
-|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
-|真实数据模式|√|√|√|√|√|√|||√||||√|√|√|√|
-|统计数据模式|||||||√|√|√|||√||√|√||
-|MD5 模式||||||||||√|√|√|||√||
+|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)| Data_Name ([NPU真实数据名,Bench真实数据名]) |
+|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---------------------------------:|
+|真实数据模式|√|√|√|√|√|√|||√||||√|√|√| √ |
+|统计数据模式|||||||√|√|√|||√||√|√| |
+|MD5 模式||||||||||√|√|√|||√| |
上表中NPU_Stack_Info字段需要配置-s参数生成。
diff --git a/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md b/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md
index 97b049000c6aca9a69aeca66e1a27a4260b3d142..983477554e138f3e547f2d3efcf14fdfc4a991a0 100644
--- a/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md
+++ b/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md
@@ -28,7 +28,7 @@ msprobe 工具在 PyTorch 场景下提供溢出数据采集功能和溢出数据
溢出数据采集功能在昇腾 NPU 上支持饱和模式(仅支持 Atlas 训练系列产品)和 INF/NAN 模式。
-INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不建议使用 INF/NAN 模式;Atlas A2 训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。
+INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不支持使用 INF/NAN 模式;Atlas A2 训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。
INF/NAN 模式的使能方式如下:
diff --git a/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md b/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md
index 33ff4a0259aef02d122022402966c65358e8efff..ef83aa17237d1cc56b8a67bf4b3ec9f57647fb9c 100644
--- a/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md
@@ -11,7 +11,7 @@ export INF_NAN_MODE_ENABLE=1
export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"
```
-**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不建议使用 INF/NAN 模式;Atlas A2训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。
+**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不支持使用 INF/NAN 模式;Atlas A2训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。
溢出检测任务的配置示例见[MindSpore 静态图场景下 task 配置为 overflow_check](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/03.config_examples.md#23-task-%E9%85%8D%E7%BD%AE%E4%B8%BA-overflow_check)、[MindSpore 动态图场景下 task 配置为 overflow_check](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/03.config_examples.md#33-task-%E9%85%8D%E7%BD%AE%E4%B8%BA-overflow_check)。
diff --git a/debug/accuracy_tools/msprobe/docs/19.monitor.md b/debug/accuracy_tools/msprobe/docs/19.monitor.md
index 1c197ba5496378130d8d04b6f847ee2f35c3e946..4bb82af8c7a767fac99f4d72ea475617334ba315 100644
--- a/debug/accuracy_tools/msprobe/docs/19.monitor.md
+++ b/debug/accuracy_tools/msprobe/docs/19.monitor.md
@@ -487,7 +487,6 @@ actv, actv_grad = monitor.generate_xy_metrics()
```
-
## 详细配置
```json
diff --git a/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md b/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md
index f994dc2301bcae6b23dc7a7503297aa4fe5b3724..bf5998bce0b4cd174b9713d9417d1afb674c2b56 100644
--- a/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md
+++ b/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md
@@ -1,8 +1,8 @@
# dump.json文件说明及示例
-## 1. dump.json文件示例(PyTorch)
+## 1. PyTorch 场景下的 dump.json 文件
-### 1.1 L0级别
+### 1.1 L0 级别
L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以PyTorch的Conv2d模块为例,网络中模块调用代码为:
`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)`
@@ -168,7 +168,7 @@ dump.json文件中包含以下数据名称:
}
```
-### 1.2 L1级别
+### 1.2 L1 级别
L1级别的dump.json文件包括API的前反向的输入输出。以PyTorch的relu函数为例,网络中API调用代码为:
`output = torch.nn.functional.relu(input)`
@@ -264,13 +264,13 @@ dump.json文件中包含以下数据名称:
}
```
-### 1.3 mix级别
+### 1.3 mix 级别
mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
-## 2. dump.json文件示例(MindSpore)
+## 2. MindSpore 场景下的 dump.json 文件
-### 2.1 L0级别
+### 2.1 L0 级别
L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。
以MindSpore的Conv2d模块为例,dump.json文件中使用的模块调用代码为:
@@ -429,7 +429,7 @@ dump.json文件中包含以下数据名称:
}
```
-### 2.2 L1级别
+### 2.2 L1 级别
L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的relu函数为例,网络中API调用代码为:
`output = mindspore.ops.relu(input)`
@@ -521,5 +521,275 @@ L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的
}
```
-### 2.3 mix级别
+### 2.3 mix 级别
+
mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
+
+## 3. MSAdapter 场景下的 dump.json 文件
+
+### 3.1 L0 级别
+
+L0 级别的 dump.json 文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以 Conv2d 模块为例,网络中模块调用代码为:
+`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)`
+
+dump.json文件中包含以下数据名称:
+
+- `Module.conv2.Conv2d.forward.0`:模块的前向数据,其中input_args为模块的输入数据(位置参数),input_kwargs为模块的输入数据(关键字参数),output为模块的输出数据,parameters为模块的参数数据,包括权重(weight)和偏置(bias)。
+- `Module.conv2.Conv2d.parameters_grad`:模块的参数梯度数据,包括权重(weight)和偏置(bias)的梯度。
+- `Module.conv2.Conv2d.backward.0`:模块的反向数据,其中input为模块反向的输入梯度(对应前向输出的梯度),output为模块的反向输出梯度(对应前向输入的梯度)。
+
+**说明**:当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时,模块级数据的命名中包含该模块在列表中的索引index,命名格式为`{Module}.{index}.*`,*表示以上三种模块级数据的命名格式,例如:`Module.0.conv1.Conv2d.forward.0`。
+
+```json
+{
+ "task": "tensor",
+ "level": "L0",
+ "framework": "mindtorch",
+ "dump_data_dir": "/dump/path",
+ "data": {
+ "Module.conv2.Conv2d.forward.0": {
+ "input_args": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 8,
+ 16,
+ 14,
+ 14
+ ],
+ "Max": 1.638758659362793,
+ "Min": 0.0,
+ "Mean": 0.2544615864753723,
+ "Norm": 70.50277709960938,
+ "requires_grad": true,
+ "data_name": "Module.conv2.Conv2d.forward.0.input.0.npy"
+ }
+ ],
+ "input_kwargs": {},
+ "output": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 8,
+ 32,
+ 10,
+ 10
+ ],
+ "Max": 1.6815717220306396,
+ "Min": -1.5120246410369873,
+ "Mean": -0.025344856083393097,
+ "Norm": 149.65576171875,
+ "requires_grad": true,
+ "data_name": "Module.conv2.Conv2d.forward.0.output.0.npy"
+ }
+ ],
+ "parameters": {
+ "weight": {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 5,
+ 5
+ ],
+ "Max": 0.05992485210299492,
+ "Min": -0.05999220535159111,
+ "Mean": -0.0006165213999338448,
+ "Norm": 3.421217441558838,
+ "requires_grad": true,
+ "data_name": "Module.conv2.Conv2d.forward.0.parameters.weight.npy"
+ },
+ "bias": {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32
+ ],
+ "Max": 0.05744686722755432,
+ "Min": -0.04894155263900757,
+ "Mean": 0.006410328671336174,
+ "Norm": 0.17263513803482056,
+ "requires_grad": true,
+ "data_name": "Module.conv2.Conv2d.forward.0.parameters.bias.npy"
+ }
+ }
+ },
+ "Module.conv2.Conv2d.parameters_grad": {
+ "weight": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 5,
+ 5
+ ],
+ "Max": 0.018550323322415352,
+ "Min": -0.008627401664853096,
+ "Mean": 0.0006675920449197292,
+ "Norm": 0.26084786653518677,
+ "requires_grad": false,
+ "data_name": "Module.conv2.Conv2d.parameters_grad.weight.npy"
+ }
+ ],
+ "bias": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32
+ ],
+ "Max": 0.014914230443537235,
+ "Min": -0.006656786892563105,
+ "Mean": 0.002657240955159068,
+ "Norm": 0.029451673850417137,
+ "requires_grad": false,
+ "data_name": "Module.conv2.Conv2d.parameters_grad.bias.npy"
+ }
+ ]
+ },
+ "Module.conv2.Conv2d.backward.0": {
+ "input": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 8,
+ 32,
+ 10,
+ 10
+ ],
+ "Max": 0.0015069986693561077,
+ "Min": -0.001139344065450132,
+ "Mean": 3.3215508210560074e-06,
+ "Norm": 0.020567523315548897,
+ "requires_grad": false,
+ "data_name": "Module.conv2.Conv2d.backward.0.input.0.npy"
+ }
+ ],
+ "output": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 8,
+ 16,
+ 14,
+ 14
+ ],
+ "Max": 0.0007466732058674097,
+ "Min": -0.00044813455315306783,
+ "Mean": 6.814070275140693e-06,
+ "Norm": 0.01474067009985447,
+ "requires_grad": false,
+ "data_name": "Module.conv2.Conv2d.backward.0.output.0.npy"
+ }
+ ]
+ }
+ }
+}
+```
+
+### 3.2 L1 级别
+L1级别的dump.json文件包括API的前反向的输入输出。以 relu API 为例,网络中 API 调用代码为:
+`output = torch.nn.functional.relu(input)`
+
+dump.json文件中包含以下数据名称:
+- `Functional.relu.0.forward`:API的前向数据,其中input_args为API的输入数据(位置参数),input_kwargs为API的输入数据(关键字参数),output为API的输出数据。
+- `Functional.relu.0.backward`:API的反向数据,其中input为API的反向输入梯度(对应前向输出的梯度),output为API的反向输出梯度(对应前向输入的梯度)。
+
+```json
+{
+ "task": "tensor",
+ "level": "L1",
+ "framework": "mindtorch",
+ "dump_data_dir":"/dump/path",
+ "data": {
+ "Functional.relu.0.forward": {
+ "input_args": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 28,
+ 28
+ ],
+ "Max": 1.3864083290100098,
+ "Min": -1.3364859819412231,
+ "Mean": 0.03711778670549393,
+ "Norm": 236.20692443847656,
+ "requires_grad": true,
+ "data_name": "Functional.relu.0.forward.input.0.npy"
+ }
+ ],
+ "input_kwargs": {},
+ "output": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 28,
+ 28
+ ],
+ "Max": 1.3864083290100098,
+ "Min": 0.0,
+ "Mean": 0.16849493980407715,
+ "Norm": 175.23345947265625,
+ "requires_grad": true,
+ "data_name": "Functional.relu.0.forward.output.0.npy"
+ }
+ ]
+ },
+ "Functional.relu.0.backward": {
+ "input": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 28,
+ 28
+ ],
+ "Max": 0.0001815402356442064,
+ "Min": -0.00013352684618439525,
+ "Mean": 0.00011915402356442064,
+ "Norm": 0.007598237134516239,
+ "requires_grad": false,
+ "data_name": "Functional.relu.0.backward.input.0.npy"
+ }
+ ],
+ "output": [
+ {
+ "type": "mindspore.Tensor",
+ "dtype": "Float32",
+ "shape": [
+ 32,
+ 16,
+ 28,
+ 28
+ ],
+ "Max": 0.0001815402356442064,
+ "Min": -0.00012117840378778055,
+ "Mean": 2.0098118724831693e-08,
+ "Norm": 0.006532244384288788,
+ "requires_grad": false,
+ "data_name": "Functional.relu.0.backward.output.0.npy"
+ }
+ ]
+ }
+ }
+}
+```
+
+### 3.3 mix 级别
+
+mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md
index 6b8cc558aa22526158033cfb35f31203d8b04278..4988586c0568b391739f7c14f1a9452461f1a6f1 100644
--- a/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md
+++ b/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md
@@ -1,4 +1,4 @@
-# MindSpore 场景的 kernel dump 说明
+# MindSpore 动态图场景的 kernel dump 说明
当使用 msprobe 数据采集功能时,level 配置为 "L2" 表示采集 kernel 层级的算子数据,仅支持昇腾 NPU 平台。
diff --git a/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md b/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md
new file mode 100644
index 0000000000000000000000000000000000000000..cefcabafbcbbdbb33a3d9d63c17a30396c9e4c52
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md
@@ -0,0 +1,229 @@
+# MSAdapter 场景的精度数据采集
+
+MSAdapter 是一款 MindSpore 生态适配工具,可以将 PyTorch 训练脚本高效迁移至 MindSpore 框架执行,以实现在不改变原有 PyTorch 用户开发习惯的情况下,使得 PyTorch 代码能在昇腾上获得高效性能。
+
+msprobe 工具主要通过在训练脚本内添加 dump 接口、启动训练的方式采集精度数据。
+
+本工具提供固定的 API 支持列表,若需要删除或增加 dump 的 API,可以在 msprobe/pytorch/hook_module/support_wrap_ops.yaml 文件内手动修改,如下示例:
+
+```yaml
+functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API
+ - conv1d
+ - conv2d
+ - conv3d
+```
+
+删除 API 的场景:部分模型代码逻辑会存在 API 原生类型校验,工具执行 dump 操作时,封装后的 API 类型可能与模型的原生 API 类型不一致,此时可能引发校验失败,详见《[FAQ](FAQ.md)》中“异常情况”的第10和11条。
+
+## 1 工具安装
+
+请参见[《msprobe 工具安装指南》](./01.installation.md)。
+
+## 2 接口介绍
+
+### 2.1 msprobe.mindspore.PrecisionDebugger
+
+**功能说明**:通过加载 dump 配置文件的方式来确定 dump 操作的详细配置。
+
+**原型**:
+
+```Python
+PrecisionDebugger(config_path=None, task=None, dump_path=None, level=None, step=None)
+```
+
+**参数说明**:
+
+1. config_path:指定 dump 配置文件路径,string 类型。参数示例:"./config.json"。未配置该路径时,默认使用 [config.json](../config.json) 文件的默认配置,配置选项含义可见 [config.json 介绍](./02.config_introduction.md)。
+
+2. 其他参数与 [config.json](../config.json) 文件中的同名配置字段含义相同,具体可见 [config.json 介绍](./02.config_introduction.md)。当参数值非None时,优先级高于 [config.json](../config.json) 文件中的同名配置。
+
+#### 2.1.1 start
+
+**功能说明**:启动精度数据采集。需要与 [**stop**](#212-stop) 接口一起添加在训练迭代的 for 循环内。
+
+**原型**:
+
+```Python
+start(model=None)
+```
+
+**参数说明**:
+
+1. model:指定需要采集 Module 级数据的模型,支持传入 torch.nn.Module、List[torch.nn.Module] 或 Tuple[torch.nn.Module] 类型,默认未配置。level 配置为 "L0" 或 "mix" 时,必须在该接口中配置该参数。API级别("L1" level)dump 时,传入 model 可以采集 model 内包含 primitive op 对象在内的所有 API 数据,若不传入 model 参数,则只采集非 primitive op 的 API 数据。
+
+#### 2.1.2 stop
+
+**功能说明**:停止精度数据采集。在 **start** 接口调用之后的任意位置添加。若 **stop** 接口添加在反向计算代码之后,则会采集 **start** 和该接口之间的前反向数据。
+若 **stop** 接口添加在反向计算代码之前,则需要将 [**step**](#213-step) 接口添加到反向计算代码之后,才能采集 **start** 和该接口之间的前反向数据。
+
+**注意**:**stop** 接口必须调用,否则可能导致精度数据落盘不全。
+
+**原型**:
+
+```Python
+stop()
+```
+
+#### 2.1.3 step
+
+**功能说明**:进行训练 step 数的自增,完成当前 step 所有数据的落盘并更新 dump 参数。在一个 step 训练结束的位置添加,且必须在 **stop** 接口之后的位置调用。该接口需要配合 **start** 和 **stop** 函数使用,尽量添加在反向计算代码之后,否则可能会导致反向数据丢失。
+
+**原型**:
+
+```Python
+step()
+```
+
+#### 2.1.4 forward_backward_dump_end
+
+**功能说明**:停止精度数据采集。与 **stop** 接口功能相同,该函数在将来会被移除,建议使用 **stop** 接口。
+
+**原型**:
+
+```Python
+forward_backward_dump_end()
+```
+
+#### 2.1.5 save
+
+**功能说明**:单点保存网络执行过程中正反向数值,并以统计值/张量文件落盘。
+
+**原型**:
+```python
+save(variable, name, save_backward=True)
+```
+
+**参数说明**:
+| 参数名称 | 参数含义 | 支持数据类型 | 是否必选|
+| ---------- | ------------------| ------------------- | ------------------- |
+| variable | 需要保存的变量 |dict, list, tuple, torch.Tensor, int, float, str | 是 |
+| name | 指定的名称 | str | 是 |
+| save_backward | 是否保存反向数据 | boolean | 否 |
+
+### 2.2 msprobe.mindspore.seed_all
+
+**功能说明**:用于固定网络中的随机性和开启确定性计算。
+
+**原型**:
+```python
+seed_all(seed=1234, mode=False, rm_dropout=True)
+```
+
+**参数说明**:
+
+1. seed: 随机性种子,默认值:1234,非必选。参数示例: seed=1000。该参数用于 random、numpy.random、mindspore.common.Initializer、mindspore.nn.probability.distribution 的随机数生成以及 Python 中 str、bytes、datetime 对象的 hash 算法。
+
+2. mode:确定性计算使能,可配置 True 或 False,默认值:False,非必选。参数示例:mode=True。该参数设置为 True 后,将会开启算子确定性运行模式与归约类通信算子(AllReduce、ReduceScatter、Reduce)的确定性计算。注意:确定性计算会导致 API 执行性能降低,建议在发现模型多次执行结果不同的情况下开启。
+
+3. rm_dropout:控制 dropout 失效的开关。可配置 True 或 False,默认值:True,非必选。参数示例:rm_dropout=True。该参数设置为 True 后,将会使 mindspore.ops.Dropout,mindspore.ops.Dropout2D,mindspore.ops.Dropout3D,mindspore.mint.nn.Dropout和mindspore.mint.nn.functional.dropout 失效,以避免因随机 dropout 造成的网络随机性。建议在采集数据前调用。
+
+**注意**:通过 rm_dropout 控制 dropout 失效或生效需要在初始化 Dropout 实例前调用才能生效。
+
+## 3 示例代码
+
+以下为添加了 msprobe 工具 dump 接口的示例训练脚本。
+
+```python
+import mindspore as ms
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# 导入工具的数据采集接口
+from msprobe.pytorch import PrecisionDebugger
+
+# 在模型训练开始前实例化PrecisionDebugger
+debugger = PrecisionDebugger(config_path='./config.json')
+
+
+# 定义网络
+class Net(nn.Module):
+ def __init__(self) -> None:
+ super().__init__()
+ self.linear1 = nn.Linear(in_features=8, out_features=4)
+ self.linear2 = nn.Linear(in_features=4, out_features=2)
+
+ def forward(self, x):
+ x1 = self.linear1(x)
+ x2 = self.linear2(x1)
+ logits = F.relu(x2)
+ return logits
+
+
+net = Net()
+
+
+def train_step(inputs):
+ return net(inputs)
+
+
+if __name__ == "__main__":
+ data = (torch.randn(10, 8), torch.randn(10, 8), torch.randn(10, 8))
+ grad_fn = ms.value_and_grad(train_step, grad_position=0)
+
+ for inputs in data:
+ # 开启数据 dump
+ debugger.start(model=net)
+
+ out, grad = grad_fn(inputs)
+
+ # 停止数据 dump
+ debugger.stop()
+ # 更新 step 信息
+ debugger.step()
+```
+
+## 4 dump 结果文件介绍
+
+训练结束后,工具将 dump 的数据保存在 dump_path 参数指定的目录下。目录结构示例如下:
+
+```lua
+├── dump_path
+│ ├── step0
+│ | ├── rank0
+│ | │ ├── dump_tensor_data
+| | | | ├── Tensor.permute.1.forward.npy
+| | | | ├── Functional.linear.5.backward.output.npy # 命名格式为{api_type}.{api_name}.{API调用次数}.{forward/backward}.{input/output}.{参数序号}, 其中,“参数序号”表示该API的第n个输入或输出,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个元素。
+| | | | ...
+| | | | ├── Module.conv1.Conv2d.forward.0.input.0.npy # 命名格式为{Module}.{module_name}.{class_name}.{forward/backward}.{调用次数}.{input/output}.{参数序号}, 其中,“参数序号”表示该Module的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该Module的第1个参数的第1个元素。
+| | | | ├── Module.conv1.Conv2d.forward.0.parameters.bias.npy # 模块参数数据:命名格式为{Module}.{module_name}.{class_name}.forward.{调用次数}.parameters.{parameter_name}。
+| | | | └── Module.conv1.Conv2d.parameters_grad.weight.npy # 模块参数梯度数据:命名格式为{Module}.{module_name}.{class_name}.parameters_grad.{parameter_name}。因为同一模块的参数使用同一梯度进行更新,所以参数梯度文件名不包含调用次数。
+| | | | # 当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时,模块级数据的命名中包含该模块在列表中的索引index,命名格式为{Module}.{index}.*,*表示以上三种模块级数据的命名格式,例如:Module.0.conv1.Conv2d.forward.0.input.0.npy。
+│ | | ├── dump.json
+│ | | ├── stack.json
+│ | | └── construct.json
+│ | ├── rank1
+| | | ├── dump_tensor_data
+| | | | └── ...
+│ | | ├── dump.json
+│ | | ├── stack.json
+| | | └── construct.json
+│ | ├── ...
+│ | |
+| | └── rank7
+│ ├── step1
+│ | ├── ...
+│ ├── step2
+```
+* `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。
+* `dump_tensor_data`:保存采集到的张量数据。
+* `dump.json`: 保存 API 或 Module 前反向数据的统计量信息。包含 dump 数据的 API 名称或 Module 名称,各数据的 dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置 summary_mode="md5" 时的 CRC-32 数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#3-MSAdapter场景下的dump.json文件)。
+* `stack.json`:API/Module 的调用栈信息。
+* `construct.json`:分层分级结构,level 为 L1 时,construct.json 内容为空。
+
+
+当 task 为 tensor 时,dump 过程中,npy 文件在对应算子或者模块被执行后就会落盘,而 json 文件则需要在正常执行 PrecisionDebugger.stop() 后才会写入完整数据。因此如果程序异常终止,终止前被执行算子的相关 npy 文件得以保存,但 json 文件中的数据可能丢失。
+
+其中 rank 为设备上各卡的 ID,每张卡上 dump 的数据会生成对应 dump 目录。非分布式场景下没有 rank ID,目录名称为 rank。
+
+npy 文件名的前缀含义如下:
+
+| 前缀 | 含义 |
+| ----------- | ---------------------------- |
+| Tensor | torch.Tensor API数据 |
+| Torch | torch API数据 |
+| Functional | torch.nn.functional API数据 |
+| NPU | NPU 亲和API数据 |
+| Distributed | torch.distributed API数据 |
+| Jit | 被 "jit" 装饰的模块或函数数据 |
+| Module | torch.nn.Module 类(模块)数据 |
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md b/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md
new file mode 100644
index 0000000000000000000000000000000000000000..01d64c808d40a1e5c4ea2190c028a7c389ffbdc4
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md
@@ -0,0 +1,31 @@
+# MSAdapter 场景的溢出检测
+
+msprobe 工具提供 MSAdapter 场景下的溢出检测功能。其检测对象为 **API** 级别(除 Primitive 和 Jit 类 API)或**模块**级别,分别对应 config.json 配置中的 **"L1"** 、**"L0"** level。
+
+需要注意,本工具仅支持在 INF/NAN 模式a下进行溢出检测。INF/NAN 模式的使能方式如下:
+
+```Shell
+# 使能 CANN 侧 INF/NAN 模式
+export INF_NAN_MODE_ENABLE=1
+# 使能 MindSpore 框架侧 INF/NAN 模式
+export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"
+```
+
+**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不建议使用 INF/NAN 模式;Atlas A2训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。
+
+溢出检测任务的配置示例见["**MindSpore 动态图场景 task 配置为 overflow_check**"](./03.config_examples.md#33-task配置为overflow_check)小节。
+
+
+## 1 接口介绍
+
+溢出检测功能提供的接口与数据采集任务一致,详见 MSAdapter 场景的精度数据采集中的["**2 接口介绍**"](./29.data_dump_MSAdapter.md#2-接口介绍)小节。
+
+需要注意,目前暂不支持 "L1" level 下 primitive op 的溢出检测。
+
+## 2 示例代码
+
+溢出检测功能使用方式与数据采集任务一致,详见 MSAdapter 场景的精度数据采集中的["**3 示例代码**"](./29.data_dump_MSAdapter.md#3-示例代码)小节。
+
+## 3 溢出检测结果文件介绍
+
+溢出检测结果文件的目录结构及含义与数据采集任务一致,但仅保存溢出 API 或模块的真实数据或统计信息。详见 MSAdapter 场景的精度数据采集中的["**4 dump 结果文件介绍**"](./29.data_dump_MSAdapter.md#4-dump-结果文件介绍)小节。
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/docs/img/compare_result.png b/debug/accuracy_tools/msprobe/docs/img/compare_result.png
index 07cdb51707fe43d07723ed976275d99f55b50571..b6d7ec6dfcbc44b4b7056e1297a481f495ceb86e 100644
Binary files a/debug/accuracy_tools/msprobe/docs/img/compare_result.png and b/debug/accuracy_tools/msprobe/docs/img/compare_result.png differ
diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py
index ded3faaa22b565ef35c17a7596782976ddf9125d..dc9da3449099ce90aa2d867a7c5cb6073c0990f6 100644
--- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py
+++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py
@@ -182,9 +182,9 @@ def set_register_backward_hook_functions():
def check_save_param(variable, name, save_backward):
# try catch this api to skip invalid call
- if not isinstance(variable, (list, dict, ms.Tensor, int, float, str)):
+ if not isinstance(variable, (list, dict, tuple, ms.Tensor, int, float, str)):
logger.warning("PrecisionDebugger.save variable type not valid, "
- "should be one of list, dict, ms.Tensor, int, float or string. "
+ "should be one of list, dict, tuple, ms.Tensor, int, float or string. "
"Skip current save process.")
raise ValueError
if not isinstance(name, str):
@@ -196,4 +196,4 @@ def check_save_param(variable, name, save_backward):
logger.warning("PrecisionDebugger.save_backward name not valid, "
"should be bool. "
"Skip current save process.")
- raise ValueError
\ No newline at end of file
+ raise ValueError
diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
index de507e876653d4c4f82e1f96a00c214e2bc6f5f6..e0915f8179b69120306730c66b8ae3f12d0ccffa 100644
--- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
+++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py
@@ -22,10 +22,10 @@ import pandas as pd
from msprobe.core.common.const import CompareConst, Const
from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_npy, load_yaml
+from msprobe.core.common.file_utils import create_directory, load_json, load_npy, load_yaml
from msprobe.core.common.log import logger
from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \
- check_op_str_pattern_valid, get_dump_mode, set_dump_path
+ check_op_str_pattern_valid, get_dump_mode, set_dump_path, detect_framework_by_dump_json
from msprobe.core.compare.acc_compare import Comparator, ModeConfig
from msprobe.core.compare.check import dtype_mapping
from msprobe.core.compare.layer_mapping import generate_data_mapping_by_layer_mapping
@@ -125,8 +125,7 @@ class MSComparator(Comparator):
result_df.loc[warning_flag, CompareConst.RESULT] = CompareConst.WARNING
result_df.loc[warning_flag, CompareConst.ERROR_MESSAGE] = 'Need double check api accuracy.'
else:
- fill_cols = [CompareConst.COSINE, CompareConst.EUC_DIST,
- CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR,
+ fill_cols = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR,
CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO,
CompareConst.ERROR_MESSAGE]
result_df.loc[~condition_no_bench, fill_cols] = ''
@@ -383,12 +382,11 @@ class MSComparator(Comparator):
def check_cross_framework(bench_json_path):
- pattern = r'"data_name":\s*"[^"]+\.pt"'
- with FileOpen(bench_json_path, 'r') as file:
- for line in file:
- if re.search(pattern, line):
- return True
- return False
+ framework = detect_framework_by_dump_json(bench_json_path)
+ if framework == Const.PT_FRAMEWORK:
+ return True
+ else:
+ return False
def ms_compare(input_param, output_path, **kwargs):
diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py b/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py
index 33fd58c7278c6245140e50a984f44e59b90c69de..e8a4739445e83c62e34e16829c5ea94c8ef5177c 100644
--- a/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py
+++ b/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py
@@ -281,7 +281,7 @@ def create_hooks(context, monitor):
global RANK
pre_hooks = []
hooks = []
- RANK = str(get_rank())
+ RANK = get_rank()
if communication.GlobalComm.INITED and RANK not in monitor.module_rank_list and monitor.module_rank_list != []:
return [pre_hooks, hooks]
diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py b/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py
index 506ad6c3f91c7c73e5e12109a6ea617309df72c0..c85e66a65ba26fdbc1d10a8e55c8273236409b36 100644
--- a/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py
+++ b/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py
@@ -98,8 +98,8 @@ def validate_ranks(ranks):
if not isinstance(ranks, list):
raise TypeError("module_ranks should be a list")
for rank in ranks:
- if not isinstance(rank, str):
- raise TypeError(f"element in module_ranks should be a str, get {type(rank)}")
+ if not isinstance(rank, int):
+ raise TypeError(f"element in module_ranks should be a int, get {type(rank)}")
def validate_targets(targets):
diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py
index 9d89b2de32f70c6fa7abf38add49b58a13531d7a..ec2a4b7165f25f8f9a60ea953ee71cdac0f24a03 100644
--- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py
+++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py
index 16067f6d2bee70645bcc337d1809a14f41ae5b96..7a3735a5292885a6e686863088e1186a3ab464ad 100644
--- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py
+++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,8 +25,8 @@ import numpy as np
import torch
import torch.distributed as dist
from msprobe.core.common.exceptions import DistributedNotInitializedError
-from msprobe.core.common.file_utils import (FileCheckConst, change_mode,
- check_file_or_directory_path, check_path_before_create, FileOpen)
+from msprobe.core.common.file_utils import FileCheckConst, change_mode, check_file_or_directory_path, \
+ check_path_before_create, FileOpen
from msprobe.core.common.log import logger
from msprobe.core.common.utils import check_seed_all
from packaging import version
@@ -449,9 +449,9 @@ def is_recomputation():
def check_save_param(variable, name, save_backward):
# try catch this api to skip invalid call
- if not isinstance(variable, (list, dict, torch.Tensor, int, float, str)):
+ if not isinstance(variable, (list, dict, tuple, torch.Tensor, int, float, str)):
logger.warning("PrecisionDebugger.save variable type not valid, "
- "should be one of list, dict, torch.Tensor, int, float or string. "
+ "should be one of list, dict, tuple, torch.Tensor, int, float or string. "
"Skip current save process.")
raise ValueError
if not isinstance(name, str):
@@ -473,3 +473,28 @@ def replace_last_occurrence(text, old, new):
if index != -1:
return text[:index] + text[index:].replace(old, new, 1)
return text
+
+
+def read_pt_data(dir_path, file_name):
+ if not file_name:
+ return None
+
+ data_path = os.path.join(dir_path, file_name)
+ path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
+ FileCheckConst.PT_SUFFIX, False)
+ data_path = path_checker.common_check()
+ try:
+ # detach because numpy can not process gradient information
+ data_value = load_pt(data_path, to_cpu=True).detach()
+ except RuntimeError as e:
+ # 这里捕获 load_pt 中抛出的异常
+ logger.error(f"Failed to load the .pt file at {data_path}.")
+ raise CompareException(CompareException.INVALID_FILE_ERROR) from e
+ except AttributeError as e:
+ # 这里捕获 detach 方法抛出的异常
+ logger.error(f"Failed to detach the loaded tensor.")
+ raise CompareException(CompareException.DETACH_ERROR) from e
+ if data_value.dtype == torch.bfloat16:
+ data_value = data_value.to(torch.float32)
+ data_value = data_value.numpy()
+ return data_value
diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py
index de62af421b5a37e39140a9836fb16853443740d7..08e2f897a9e9ecacad1c0cc2353ebe123a59b2a7 100644
--- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py
+++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py
@@ -15,14 +15,10 @@
import os
-from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.common.file_utils import create_directory
-from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
- set_dump_path
-from msprobe.core.compare.acc_compare import ModeConfig
-from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json, set_stack_json_path
+from msprobe.core.common.utils import CompareException
+from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json
from msprobe.pytorch.common.log import logger
-from msprobe.pytorch.compare.pt_compare import PTComparator, compare
+from msprobe.pytorch.compare.pt_compare import compare
def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs):
diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
index 308a82b3d6e9beb67a669ea05b83d7b8a6eddc90..7c1670dac7133dd3f28c35c7107b3ffea6ed6b38 100644
--- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
+++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd.
+# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,14 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-import os.path
+import os
import torch
-from msprobe.core.common.const import FileCheckConst
from msprobe.core.common.exceptions import FileCheckException
-from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml
+from msprobe.core.common.file_utils import create_directory, load_yaml, FileChecker, FileCheckConst
from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \
set_dump_path
from msprobe.core.compare.acc_compare import Comparator, ModeConfig
@@ -55,28 +53,30 @@ class PTComparator(Comparator):
mapping_dict = {}
return mapping_dict
- def read_npy_data(self, dir_path, file_name):
- if not file_name:
- return None
- data_path = os.path.join(dir_path, file_name)
- path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
- FileCheckConst.PT_SUFFIX, False)
- data_path = path_checker.common_check()
- try:
- # detach because numpy can not process gradient information
- data_value = load_pt(data_path, to_cpu=True).detach()
- except RuntimeError as e:
- # 这里捕获 load_pt 中抛出的异常
- logger.error(f"Failed to load the .pt file at {data_path}.")
- raise CompareException(CompareException.INVALID_FILE_ERROR) from e
- except AttributeError as e:
- # 这里捕获 detach 方法抛出的异常
- logger.error(f"Failed to detach the loaded tensor.")
- raise CompareException(CompareException.DETACH_ERROR) from e
- if data_value.dtype == torch.bfloat16:
- data_value = data_value.to(torch.float32)
- data_value = data_value.numpy()
- return data_value
+
+def read_pt_data(dir_path, file_name):
+ if not file_name:
+ return None
+
+ data_path = os.path.join(dir_path, file_name)
+ path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE,
+ FileCheckConst.PT_SUFFIX, False)
+ data_path = path_checker.common_check()
+ try:
+ # detach because numpy can not process gradient information
+ data_value = load_pt(data_path, to_cpu=True).detach()
+ except RuntimeError as e:
+ # 这里捕获 load_pt 中抛出的异常
+ logger.error(f"Failed to load the .pt file at {data_path}.")
+ raise CompareException(CompareException.INVALID_FILE_ERROR) from e
+ except AttributeError as e:
+ # 这里捕获 detach 方法抛出的异常
+ logger.error(f"Failed to detach the loaded tensor.")
+ raise CompareException(CompareException.DETACH_ERROR) from e
+ if data_value.dtype == torch.bfloat16:
+ data_value = data_value.to(torch.float32)
+ data_value = data_value.numpy()
+ return data_value
def compare(input_param, output_path, **kwargs):
diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
index 0c9efaab999e71d896eaf64d837978bd26f214ad..ad0fba463966b86c2d4ab3f5be5f4d95100a7df3 100644
--- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
@@ -557,9 +557,9 @@ class TrainerMon:
def write_mv_tb(self, opt_context):
if not self.mv_distribution:
return
- self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric,
+ self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric,
opt_context.step, MonitorConst.EXP_AVG)
- self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric,
+ self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric,
opt_context.step, MonitorConst.EXP_AVG_SQ)
def write_grad_tb(self, step):
@@ -1051,7 +1051,7 @@ class TrainerMon:
self.enable_megatron = True
logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0")
except ImportError:
- self.enable_megatron = False | self.enable_megatron
+ self.enable_megatron = False
if not self.enable_megatron:
self._hook_weights()
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json
new file mode 100644
index 0000000000000000000000000000000000000000..63a062d8ffa264a0254fc2bab0208dcf951ae094
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json
@@ -0,0 +1,3 @@
+{
+ "task": "tensor"
+}
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json
new file mode 100644
index 0000000000000000000000000000000000000000..b223c74b2315af1b9454e5f1e70c29502d449c56
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json
@@ -0,0 +1,4 @@
+{
+ "task": "tensor",
+ "type": "mindspore.float16"
+}
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json
new file mode 100644
index 0000000000000000000000000000000000000000..2444ae1fd4096b083a9e8a0e51c9166bb990f51f
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json
@@ -0,0 +1,4 @@
+{
+ "task": "tensor",
+ "type": "torch.float16"
+}
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py
index 9ed13f78aed57fd4d8153e2f005ea14d4fb33643..ac3a859bf4b2da478e92650cfe3267cf90c23146 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py
@@ -1,7 +1,5 @@
from unittest.mock import patch, mock_open, MagicMock
-import numpy as np
-import pandas as pd
import pytest
from msprobe.core.common.file_utils import *
@@ -533,4 +531,4 @@ class TestDirectoryChecks:
# Test file path
check_file_or_directory_path(self.test_file, isdir=False)
# Test directory path
- check_file_or_directory_path(self.test_dir, isdir=True)
\ No newline at end of file
+ check_file_or_directory_path(self.test_dir, isdir=True)
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py
index 3472ca9018e189ffb48e4d26cfeb79e1ba1ff16d..61766ed27c0a58f4fff81fb2f45618de60bb5b48 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
-# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved.
+# Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -18,11 +18,13 @@ import json
import os
import tempfile
from datetime import datetime, timezone
+import unittest
from unittest import TestCase
from unittest.mock import MagicMock, mock_open, patch
import OpenSSL
import numpy as np
+from pathlib import Path
from msprobe.core.common.const import Const
from msprobe.core.common.file_utils import (
@@ -53,7 +55,8 @@ from msprobe.core.common.utils import (CompareException,
recursion_depth_decorator,
MsprobeBaseException,
check_str_param,
- is_json_file)
+ is_json_file,
+ detect_framework_by_dump_json)
class TestUtils(TestCase):
@@ -488,3 +491,42 @@ class TestCheckCrtValid(TestCase):
with self.assertRaises(RuntimeError) as context:
check_crt_valid(self.cert_file_path)
self.assertIn('The SSL certificate is invalid', str(context.exception))
+
+
+class TestDetectFrameworkByDumpJson(unittest.TestCase):
+
+ @patch('msprobe.core.common.utils.load_json')
+ def test_valid_pytorch_framework(self, mock_load_json):
+ mock_load_json.return_value = {"framework": Const.PT_FRAMEWORK}
+
+ result = detect_framework_by_dump_json("dummy_path")
+
+ self.assertEqual(result, Const.PT_FRAMEWORK)
+
+ @patch('msprobe.core.common.utils.load_json')
+ def test_valid_mindspore_framework(self, mock_load_json):
+ mock_load_json.return_value = {"framework": Const.MS_FRAMEWORK}
+
+ result = detect_framework_by_dump_json("dummy_path")
+
+ self.assertEqual(result, Const.MS_FRAMEWORK)
+
+ def test_detect_framework_in_file(self):
+ self.current_dir = Path(__file__).parent
+ file_path = self.current_dir / "test_dump_file/pt_dump_no_framework.json"
+ result = detect_framework_by_dump_json(file_path)
+ self.assertEqual(result, Const.PT_FRAMEWORK)
+
+ self.current_dir = Path(__file__).parent
+ file_path = self.current_dir / "test_dump_file/ms_dump_no_framework.json"
+ result = detect_framework_by_dump_json(file_path)
+ self.assertEqual(result, Const.MS_FRAMEWORK)
+
+ @patch("msprobe.core.common.utils.logger")
+ def test_detect_framework_exception(self, mock_logger):
+ self.current_dir = Path(__file__).parent
+ file_path = self.current_dir / "test_dump_file/dump_no_pt_no_ms.json"
+ with self.assertRaises(CompareException) as context:
+ result = detect_framework_by_dump_json(file_path)
+ self.assertEqual(context.exception.code, CompareException.INVALID_PARAM_ERROR)
+ mock_logger.error.assert_called_once_with(f"{file_path} must be based on the MindSpore or PyTorch framework.")
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
index c882e331f5513ddbd3cbb5baf4c1292079680f4f..1b2f6bb2fde28ebc46a5da09bb22cd89d875edd7 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py
@@ -11,7 +11,7 @@ import torch
from msprobe.core.common.const import CompareConst, Const
from msprobe.core.common.utils import CompareException
-from msprobe.core.compare.acc_compare import Comparator, ModeConfig, get_bench_data_name
+from msprobe.core.compare.acc_compare import Comparator, ModeConfig
from msprobe.core.compare.highlight import find_error_rows, find_compare_result_error_rows, ApiBatch
from msprobe.core.compare.utils import get_accuracy
from msprobe.pytorch.compare.pt_compare import PTComparator
@@ -636,11 +636,11 @@ class TestUtilsMethods(unittest.TestCase):
def test_do_multi_process(self):
data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
'torch.float32', 'torch.float32', [2, 2], [2, 2],
- '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']]
+ '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', ['-1', '-1']]]
o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
'torch.float32', 'torch.float32', [2, 2], [2, 2],
'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
- 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', '-1']]
+ 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', ['-1', '-1']]]
columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
result_df = pd.DataFrame(data, columns=columns)
o_result = pd.DataFrame(o_data, columns=columns)
@@ -670,7 +670,7 @@ class TestUtilsMethods(unittest.TestCase):
mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
pt_comparator = PTComparator(mode_config)
- result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {})
+ result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
'unsupported', 'No bench data matched.'])
@@ -688,43 +688,23 @@ class TestUtilsMethods(unittest.TestCase):
pt_comparator = PTComparator(mode_config)
pt_name = '-1'
- pt_path = os.path.join(base_dir, pt_name)
- op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]}
+ op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]}
input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir}
- result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param,
- {'Functional.linear.0.forward': {'input_args': [
- {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}})
+ result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
- 'unsupported', f'Dump file: {pt_path} not found.'])
+ 'unsupported', 'No bench data matched.'])
pt_name = 'Functional.linear.0.forward.input.0.pt'
- pt_path = os.path.join(base_dir, pt_name)
- op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]}
+ op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]}
input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir}
- result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {})
+ result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
- 'unsupported', 'Bench does not have data file.'])
+ 'unsupported', 'Dump file: Functional.linear.0.forward.input.0.pt not found.'])
generate_pt(base_dir)
- result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param,
- {'Functional.linear.0.forward': {'input_args': [
- {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}})
+ result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param)
self.assertEqual(result, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, ''])
- def test_get_bench_data_name_input(self):
- bench_op_name = "Functional.linear.0.forward.input.0"
- bench_data = {"Functional.linear.0.forward": {"input_args": [{"data_name": "Functional.linear.0.forward.input.0.pt"}], "input_kwargs": {}, "output": []}}
- result = get_bench_data_name(bench_op_name, bench_data)
-
- self.assertEqual(result, "Functional.linear.0.forward.input.0.pt")
-
- def test_get_bench_data_name_output(self):
- bench_op_name = "Functional.linear.0.forward.output.0"
- bench_data = {"Functional.linear.0.forward": {"input_args": [], "input_kwargs": {}, "output": [{"data_name": "Functional.linear.0.forward.output.0.pt"}]}}
- result = get_bench_data_name(bench_op_name, bench_data)
-
- self.assertEqual(result, "Functional.linear.0.forward.output.0.pt")
-
class TestComparator(unittest.TestCase):
def setUp(self):
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
index 2e9a46572662489e861f98f03f25e9e480031bcf..5327237066cd70e13c86a34d0c13f694637a3da9 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py
@@ -4,17 +4,19 @@ import json
import os
import shutil
import unittest
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
import zlib
+import torch
import numpy as np
-from msprobe.core.common.const import CompareConst, Const
+from msprobe.core.common.const import CompareConst, Const, FileCheckConst
from msprobe.core.common.utils import CompareException
from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \
count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \
op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \
- table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item
+ table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_pt_data, \
+ read_npy_data
# test_read_op_1
op_data = {
@@ -224,31 +226,31 @@ o_result_unmatch_3 = [
['Functional.conv2d.0.forward.input.0', 'N/A', 'torch.float32', 'N/A', [1, 1, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
3.029174327850342, -2.926689624786377, -0.06619918346405029, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 'No bench data matched.', 'None', '-1'],
+ 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.forward.input.1', 'N/A', 'torch.float32', 'N/A', [16, 1, 5, 5], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 'No bench data matched.', 'None', '-1'],
+ 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.forward.input.2', 'N/A', 'torch.float32', 'N/A', [16], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 'No bench data matched.', 'None', '-1'],
+ 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.forward.parameters.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'],
+ 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.forward.parameters.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'],
+ 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.forward.output.0', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
2.1166646480560303, -2.190781354904175, -0.003579073818400502, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 'No bench data matched.', 'None', '-1'],
+ 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.parameters_grad.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'],
+ 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']],
['Functional.conv2d.0.parameters_grad.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A',
'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A',
- 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1']
+ 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']]
]
# test_merge_tensor
@@ -854,3 +856,54 @@ class TestGenOpItem(unittest.TestCase):
expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}"
self.assertEqual(result['md5'], expected_md5)
+
+
+class TestReadPtData(unittest.TestCase):
+
+ @patch('msprobe.core.compare.utils.load_pt')
+ @patch('msprobe.core.compare.utils.FileChecker')
+ @patch('os.path.join', return_value='/fake/path/to/file.pt')
+ def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt):
+ mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt'
+
+ mock_tensor = MagicMock()
+ mock_tensor.detach.return_value = mock_tensor
+ mock_tensor.to.return_value = mock_tensor
+ mock_tensor.dtype = torch.bfloat16
+ mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0])
+ mock_load_pt.return_value = mock_tensor
+
+ result = read_pt_data('/fake/dir', 'file_name.pt')
+
+ mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False)
+ mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True)
+ mock_tensor.to.assert_called_once_with(torch.float32)
+ self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0])))
+
+ @patch('os.path.join', return_value='/fake/path/to/file.pt')
+ @patch('msprobe.core.compare.utils.FileChecker')
+ @patch('msprobe.core.compare.utils.load_pt')
+ def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os):
+ mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt'
+
+ mock_load_pt.side_effect = RuntimeError("Test Error")
+
+ with self.assertRaises(CompareException):
+ read_pt_data('/fake/dir', 'file_name.pt')
+
+
+class TestReadNpyData(unittest.TestCase):
+
+ @patch('msprobe.core.compare.utils.load_npy')
+ @patch('msprobe.core.compare.utils.FileChecker')
+ @patch('os.path.join', return_value='/fake/path/to/file.npy')
+ def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy):
+ mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy'
+
+ mock_load_npy.return_value = np.array([1.0, 2.0, 3.0])
+
+ result = read_npy_data('/fake/dir', 'file_name.npy')
+
+ mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False)
+ mock_load_npy.assert_called_once_with('/fake/path/to/file.npy')
+ self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0])))
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
index 3fa16b0d9d487250a7a8d9ec97b5572d3c0b387a..49f084ce07c8e90afb2aa1c3340bb4c3965c8fa7 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py
@@ -18,12 +18,12 @@ data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.inp
'torch.float32', 'torch.float32', [2, 2], [2, 2],
'', '', '', '', '', '',
1, 1, 1, 1, 1, 1, 1, 1,
- 'Yes', '', '-1']]
+ 'Yes', '', ['-1', '-1']]]
o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0',
'torch.float32', 'torch.float32', [2, 2], [2, 2],
'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported',
1, 1, 1, 1, 1, 1, 1, 1,
- 'None', 'No bench data matched.', '-1']]
+ 'None', 'No bench data matched.', ['-1', '-1']]]
columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name']
result_df = pd.DataFrame(data, columns=columns)
o_result = pd.DataFrame(o_data, columns=columns)
@@ -54,9 +54,9 @@ class TestUtilsMethods(unittest.TestCase):
func = Comparator(mode_config).compare_ops
generate_dump_json(base_dir)
- input_parma = {'bench_json_path': os.path.join(base_dir, 'dump.json')}
+ input_param = {'bench_json_path': os.path.join(base_dir, 'dump.json')}
lock = multiprocessing.Manager().RLock()
- result = _handle_multi_process(func, input_parma, result_df, lock)
+ result = _handle_multi_process(func, input_param, result_df, lock)
self.assertTrue(result.equals(o_result))
def test_read_dump_data(self):
diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py
index 34064e7cc2b9d0aa5c0c2e98806b8993137a589c..3d31a1bb51679c28d2cc25ecced891e31ce4dcfd 100644
--- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py
+++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py
@@ -19,6 +19,7 @@ from msprobe.core.data_dump.data_processor.pytorch_processor import (
KernelDumpDataProcessor
)
from torch import distributed as dist
+from torch._subclasses import FakeTensorMode
class TestPytorchDataProcessor(unittest.TestCase):
@@ -62,6 +63,15 @@ class TestPytorchDataProcessor(unittest.TestCase):
result = PytorchDataProcessor.get_stat_info(mock_data)
self.assertIsInstance(result, TensorStatInfo)
+ def test_get_stat_info_with_fake_tensor(self):
+ with FakeTensorMode() as fake_tensor_mode:
+ fake_tensor = fake_tensor_mode.from_tensor(torch.randn(1, 2, 3))
+ result = PytorchDataProcessor.get_stat_info(fake_tensor)
+ self.assertIsNone(result.max)
+ self.assertIsNone(result.min)
+ self.assertIsNone(result.mean)
+ self.assertIsNone(result.norm)
+
def test_get_stat_info_float(self):
tensor = torch.tensor([1.0, 2.0, 3.0])
result = self.processor.get_stat_info(tensor)
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py
index 1ed3ca016108519fb3f643c9d4bb768f63a52d40..80f91a53f79c81c4e79947bc66b7bf932b774bd0 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py
@@ -15,21 +15,13 @@
# limitations under the License.
"""
import unittest
-from unittest.mock import MagicMock, patch, call
+from unittest.mock import patch
import numpy as np
import mindspore as ms
-import os
-import random
-
-from msprobe.core.common.exceptions import DistributedNotInitializedError
-from msprobe.mindspore.common.utils import (get_rank_if_initialized,
- convert_bf16_to_fp32,
- save_tensor_as_npy,
- convert_to_int,
- list_lowest_level_directories,
- seed_all,
- remove_dropout,
- MsprobeStep)
+
+from msprobe.mindspore.common.utils import get_rank_if_initialized, convert_bf16_to_fp32, convert_to_int, \
+ list_lowest_level_directories, seed_all, remove_dropout, MsprobeStep
+
class MockCell:
def __init__(self):
@@ -136,8 +128,3 @@ class TestMsprobeFunctions(unittest.TestCase):
from mindspore.mint.nn.functional import dropout
self.assertTrue((Dropout(0.5)(x1d).numpy() == x1d.numpy()).all())
self.assertTrue((dropout(x1d, p=0.5).numpy() == x1d.numpy()).all())
-
-
-
-if __name__ == "__main__":
- unittest.main()
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json
index 5b954f6d6443c92e6321e5f55e373e99f428653d..48800c0455c6651b146600e61e636d4dc25fac31 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json
@@ -1,6 +1,7 @@
{
"task": "statistics",
"level": "mix",
+ "framework": "mindspore",
"dump_data_dir": null,
"data": {
"Tensor.__add__.0.forward": {
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json
index 150cbd43b169573e48542aa0c46c26e7df69843e..b2704185ff19b961b43453f81247236d77677d83 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json
@@ -1,6 +1,7 @@
{
"task": "statistics",
"level": "mix",
+ "framework": "pytorch",
"dump_data_dir": null,
"data": {
"Tensor.__add__.0.forward": {
diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py
index b5cbff9784a837ea4d64ac9eccdf30175564f712..667fea224120550b0240d8c3dc16d929f2cca72a 100644
--- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py
+++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py
@@ -5,8 +5,10 @@ import random
import shutil
import tempfile
import unittest
+from unittest.mock import patch
import numpy as np
+import pandas as pd
import torch
import yaml
@@ -350,21 +352,21 @@ class TestUtilsMethods(unittest.TestCase):
finally:
shutil.rmtree(data_path)
- def test_check_cross_framework(self):
- ms_data = {
- "data_name": "Cell.model.language_model.encoder.layers.5.input_norm.FusedRMSNorm.forward.0.input.0.npy",
- }
- pt_data = {
- "data_name": "Module.module.module.language_model.encoder.layers.0.input_norm.RMSNorm.forward.0.input.0.pt",
- }
+ @patch('msprobe.mindspore.compare.ms_compare.detect_framework_by_dump_json')
+ def test_check_cross_framework_valid_pytorch(self, mock_detect_framework):
+ mock_detect_framework.return_value = Const.PT_FRAMEWORK
+
+ result = check_cross_framework("dummy_path")
+
+ self.assertTrue(result)
+
+ @patch('msprobe.mindspore.compare.ms_compare.detect_framework_by_dump_json')
+ def test_check_cross_framework_invalid_framework(self, mock_detect_framework):
+ mock_detect_framework.return_value = Const.MS_FRAMEWORK
- def check_data(data):
- with tempfile.NamedTemporaryFile(mode='w+', suffix='.json', encoding='utf-8', delete=True) as temp_file:
- json.dump(data, temp_file, ensure_ascii=False, indent=4)
- temp_file.flush()
- return check_cross_framework(temp_file.name)
- self.assertFalse(check_data(ms_data))
- self.assertTrue(check_data(pt_data))
+ result = check_cross_framework("dummy_path")
+
+ self.assertFalse(result)
def test_comapre_process(self):
data_path = tempfile.mkdtemp(prefix='dump_data', dir='/tmp')
@@ -466,32 +468,6 @@ class TestUtilsMethods(unittest.TestCase):
npu_op_name = ms_comparator.process_cell_mapping(npu_cell_dict.get('op_name')[0])
self.assertEqual(npu_op_name, 'Module.fc1.Linear.forward.0.input.0')
- def test_read_npy_data(self):
- stack_mode = True
- auto_analyze = True
- fuzzy_match = False
- dump_mode = Const.ALL
-
- mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
- mapping_config = MappingConfig()
-
- ms_comparator = MSComparator(mode_config, mapping_config)
-
- self.temp_file = tempfile.NamedTemporaryFile(suffix='.pt')
- tensor = torch.Tensor([1, 2, 3])
- filename = self.temp_file.name.split('/')[-1]
- torch.save(tensor, self.temp_file.name)
- result = ms_comparator.read_npy_data('/tmp', filename, load_pt_file=True)
- self.assertTrue(np.array_equal(result, np.array([1, 2, 3])))
- self.temp_file.close()
-
- self.temp_file = tempfile.NamedTemporaryFile(suffix='.npy')
- tensor = np.array([1, 2, 3])
- filename = self.temp_file.name.split('/')[-1]
- np.save(self.temp_file.name, tensor)
- result = ms_comparator.read_npy_data('/tmp', filename, load_pt_file=False)
- self.assertTrue(np.array_equal(result, np.array([1, 2, 3])))
- self.temp_file.close()
def test_process_internal_api_mapping(self):
stack_mode = True
@@ -533,4 +509,28 @@ class TestUtilsMethods(unittest.TestCase):
api_list = ["Mint"]
with self.assertRaises(CompareException):
- ms_comparator.get_api_name(api_list)
\ No newline at end of file
+ ms_comparator.get_api_name(api_list)
+
+ def test_process_data_name(self):
+ stack_mode = True
+ auto_analyze = True
+ fuzzy_match = False
+ dump_mode = Const.ALL
+
+ mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
+ mapping_config = MappingConfig()
+ ms_comparator = MSComparator(mode_config, mapping_config)
+
+ data = pd.DataFrame({
+ 'data_name_x': ['A', 'B', 'C'],
+ 'data_name_y': ['X', 'Y', 'Z']
+ })
+
+ result = ms_comparator.process_data_name(data.copy())
+
+ expected = pd.DataFrame({
+ 'data_name_x': [['A', 'X'], ['B', 'Y'], ['C', 'Z']],
+ 'data_name_y': ['X', 'Y', 'Z']
+ })
+
+ pd.testing.assert_frame_equal(result, expected)
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py
index 7d4e6e950dc1d3e51ef69ca46895fcf5078c5f67..0320c43d0ba9cd1c1d8e60b9867d770b47dd1715 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py
@@ -6,6 +6,7 @@ from multiprocessing import Queue
from msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import *
from msprobe.core.common.file_utils import create_directory
+
class TestATTL(unittest.TestCase):
def setUp(self):
@@ -48,7 +49,7 @@ class TestATTL(unittest.TestCase):
self.assertIsNone(result)
@patch('glob.glob')
- @patch('msprobe.pytorch.common.utils.load_pt')
+ @patch('msprobe.core.common.file_utils.load_pt')
def test_download_with_exception(self, mock_load_pt, mock_glob):
mock_glob.return_value = ['/tmp/start_file.pt']
mock_load_pt.side_effect = Exception('Load error')
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py
index cdc922cc98d59b59ec0be85833d2000cd38913c8..b1ac148ae742517c389f6de474463468ef90b572 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py
@@ -10,8 +10,8 @@ import torch.distributed as dist
from msprobe.core.common.file_utils import FileCheckConst
from msprobe.core.common.exceptions import DistributedNotInitializedError
from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData
-from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \
- get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl
+from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, get_tensor_rank, get_rank_id, \
+ print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl
class TestParameterAdapter(unittest.TestCase):
@@ -180,6 +180,7 @@ class TestLoadPt(unittest.TestCase):
if os.path.isfile(self.temp_file.name):
os.remove(self.temp_file.name)
+
class TestSavePT(unittest.TestCase):
def setUp(self):
@@ -195,6 +196,7 @@ class TestSavePT(unittest.TestCase):
mock_torch_save.assert_called_once_with(self.tensor, self.filepath)
mock_change_mode.assert_called_once_with(self.filepath, FileCheckConst.DATA_FILE_AUTHORITY)
+
class TestSavePT(unittest.TestCase):
def setUp(self):
diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py
index b079e646c4a8f4098bb233e3e6259ef3ebea9c94..4eda1d6d974bdc4f6699808946fafb4b136cf98e 100644
--- a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py
+++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py
@@ -3,13 +3,10 @@ import os
import shutil
import unittest
-import numpy as np
import torch
-from msprobe.core.common.const import Const
from msprobe.core.common.utils import CompareException
-from msprobe.core.compare.acc_compare import ModeConfig
-from msprobe.pytorch.compare.pt_compare import PTComparator, compare
+from msprobe.pytorch.compare.pt_compare import compare
from msprobe.test.core_ut.compare.test_acc_compare import generate_dump_json, generate_stack_json
@@ -40,36 +37,6 @@ class TestUtilsMethods(unittest.TestCase):
if os.path.exists(base_dir2):
shutil.rmtree(base_dir2)
- def test_read_npy_data_bf16(self):
- generate_bf16_pt(base_dir1)
-
- stack_mode = True
- auto_analyze = True
- fuzzy_match = False
- dump_mode = Const.ALL
- mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
-
- pt_comparator = PTComparator(mode_config)
- result = pt_comparator.read_npy_data(base_dir1, 'bf16.pt')
-
- target_result = torch.tensor([1, 2, 3, 4], dtype=torch.float32).numpy()
- self.assertTrue(np.array_equal(result, target_result))
-
- def test_read_npy_data_dict(self):
- generate_dict_pt(base_dir1)
-
- stack_mode = True
- auto_analyze = True
- fuzzy_match = False
- dump_mode = Const.ALL
- mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode)
-
- pt_comparator = PTComparator(mode_config)
-
- with self.assertRaises(CompareException) as context:
- result = pt_comparator.read_npy_data(base_dir1, 'dict.pt')
- self.assertEqual(context.exception.code, CompareException.DETACH_ERROR)
-
def test_compare(self):
generate_dump_json(base_dir2)
generate_stack_json(base_dir2)
diff --git a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json
index b55f9e0699fe6329ceeb09a51fe20118c65545e7..153d84e7d117b5be89dfdb522edc39dc066929cb 100644
--- a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json
+++ b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json
@@ -1,6 +1,7 @@
{
"task": "statistics",
"level": "mix",
+ "framework": "mindspore",
"dump_data_dir": null,
"data": {
"Cell.network_with_loss.module.language_model.embedding.word_embeddings.VocabParallelEmbedding.forward.0": {
diff --git a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json
index d7dd1c0c38e2d24c8b0d19c346a50eb33437d232..02239176a9d690c4ce70c06cc6ab117a3c122811 100644
--- a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json
+++ b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json
@@ -1,6 +1,7 @@
{
"task": "statistics",
"level": "mix",
+ "framework": "pytorch",
"dump_data_dir": null,
"data": {
"Module.module.module.language_model.embedding.word_embeddings.VocabParallelEmbedding.forward.0": {
diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md
index d6ebd6f7ff04f0fa40601500eaf66b89ed7a7f97..86a23b7f82925079c26623b070936538768d9b8c 100644
--- a/dynolog_npu/README.md
+++ b/dynolog_npu/README.md
@@ -51,6 +51,8 @@ sudo yum install -y cmake ninja
### 3. 编译
+- dynolog编译
+
默认编译生成dyno和dynolog二进制文件, -t参数可以支持将二进制文件打包成deb包或rpm包.
```bash
@@ -64,6 +66,10 @@ bash scripts/build.sh -t deb
bash scripts/build.sh -t rpm
```
+- dynolog_npu_plugin wheel包编译
+
+dynolog_npu_plugin wheel包提供IPCMonitor,MsptiMonitor等公共能力,使用nputrace和npu-monitor功能前必须安装该wheel包,具体编译安装指导可参考dynolog_npu/plugin/README.md。
+
## 使用方式
### Profiler trace dump功能
@@ -112,7 +118,9 @@ nputrace子命令支持的参数选项
- nputrace使用方法
-Step1: 拉起dynolog daemon进程
+Step0: 参考`3.编译`章节完成dynolog的编译,以及dynolog_npu_plugin wheel包的编译和安装。
+
+Step1:拉起dynolog daemon进程
```bash
# 方法1:使用systemd拉起service
# 修改配置文件/etc/dynolog.gflags, 使能ipc_monitor
diff --git a/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp b/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp
index 940f5aae167f088361057fe2a7a389a76f5bb2b4..bba66d7297af1eec929a0149b0b2d1df35eaf843 100644
--- a/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp
+++ b/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp
@@ -1,7 +1,4 @@
#include "DynoLogNpuMonitor.h"
-
-#include
-
#include "utils.h"
namespace dynolog_npu {
@@ -10,13 +7,13 @@ namespace ipc_monitor {
bool DynoLogNpuMonitor::Init()
{
if (isInitialized_) {
- std::cout << "[WRARNING] DynoLog npu monitor already initialized" << std::endl;
+ LOG(ERROR) << "DynoLog npu monitor already initialized";
return true;
}
bool res = ipcClient_.RegisterInstance(npuId_);
if (res) {
isInitialized_ = true;
- std::cout << "[INFO] DynoLog npu monitor initialized success !" << std::endl;
+ LOG(INFO) << "DynoLog npu monitor initialized success!";
}
return res;
}
@@ -24,11 +21,6 @@ bool DynoLogNpuMonitor::Init()
std::string DynoLogNpuMonitor::Poll()
{
std::string res = ipcClient_.IpcClientNpuConfig();
- if (res.empty()) {
- std::cout << "[INFO] Request for dynolog server is empty !" << std::endl;
- return "";
- }
- std::cout << "[INFO] Received NPU configuration successfully" << std::endl;
return res;
}
diff --git a/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp b/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp
index 97966e8eeacc7276426feb237aa122eb8dee046f..ca2429f1e368ad996b8a8a954810ed7439c78bea 100644
--- a/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp
+++ b/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp
@@ -1,6 +1,5 @@
#include "NpuIpcClient.h"
-#include
namespace dynolog_npu {
namespace ipc_monitor {
@@ -15,14 +14,14 @@ bool IpcClient::RegisterInstance(int32_t id)
std::unique_ptr message = Message::ConstructMessage(context, "ctxt");
try {
if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) {
- std::cout << "[WARNING]Failed to send register ctxt for pid " << context.pid << " with dyno" << std::endl;
+ LOG(ERROR) << "Failed to send register ctxt for pid " << context.pid << " with dyno";
return false;
}
} catch (const std::exception &e) {
- std::cout << "[WARNING] Error when SyncSendMessage: " << e.what() << std::endl;
+ LOG(ERROR) << " Error when SyncSendMessage: " << e.what();
return false;
}
- std::cout << "[INFO] Resigter pid " << context.pid << " for dynolog success !" << std::endl;
+ LOG(INFO) << "Resigter pid " << context.pid << " for dynolog success !";
return true;
}
std::string IpcClient::IpcClientNpuConfig()
@@ -37,7 +36,7 @@ std::string IpcClient::IpcClientNpuConfig()
}
std::unique_ptr message = Message::ConstructMessage(*req, "req", size);
if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) {
- std::cout << "[WARNING] Failed to send config to dyno server fail !" << std::endl;
+ LOG(ERROR) << " Failed to send config to dyno server fail !";
free(req);
req = nullptr;
return "";
@@ -45,7 +44,7 @@ std::string IpcClient::IpcClientNpuConfig()
free(req);
message = PollRecvMessage(MAX_IPC_RETRIES, MAX_SLEEP_US);
if (!message) {
- std::cout << "[WARNING] Failed to receive on-demand config !" << std::endl;
+ LOG(ERROR) << " Failed to receive on-demand config !";
return "";
}
std::string res = std::string(ReinterpretConvert(message->buf.get()), message->metadata.size);
@@ -65,7 +64,7 @@ std::unique_ptr IpcClient::ReceiveMessage()
bool IpcClient::SyncSendMessage(const Message &message, const std::string &destName, int numRetry, int seepTimeUs)
{
if (destName.empty()) {
- std::cout << "[WARNING] Can not send to empty socket name !" << std::endl;
+ LOG(ERROR) << " Can not send to empty socket name !";
return false;
}
int i = 0;
@@ -79,7 +78,7 @@ bool IpcClient::SyncSendMessage(const Message &message, const std::string &destN
seepTimeUs *= 2; // 2: double sleep time
}
} catch (const std::exception &e) {
- std::cout << "[ERROR] Error when SyncSendMessage: " << e.what() << std::endl;
+ LOG(ERROR) << " Error when SyncSendMessage: " << e.what();
return false;
}
return i < numRetry;
@@ -94,7 +93,7 @@ bool IpcClient::Recv()
try {
successFlag = ep_.TryPeekMessage(*peekCtxt);
} catch (std::exception &e) {
- std::cout << "[ERROR] Error when TryPeekMessage: " << e.what() << std::endl;
+ LOG(ERROR) << " Error when TryPeekMessage: " << e.what();
return false;
}
if (successFlag) {
@@ -108,7 +107,7 @@ bool IpcClient::Recv()
try {
successFlag = ep_.TryRcvMessage(*recvCtxt);
} catch (std::exception &e) {
- std::cout << "[ERROR] Error when TryRecvMsg: " << e.what() << std::endl;
+ LOG(ERROR) << " Error when TryRecvMsg: " << e.what();
return false;
}
if (successFlag) {
@@ -118,7 +117,7 @@ bool IpcClient::Recv()
}
}
} catch (std::exception &e) {
- std::cout << "[ERROR] Error in Recv(): " << e.what() << std::endl;
+ LOG(ERROR) << " Error in Recv(): " << e.what();
return false;
}
return false;
diff --git a/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h b/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h
index 8b5f88abf9d2cf589bec685cd3a520729afe8dd5..0471a70a3419eeeee2986d1d18710ee112c70313 100644
--- a/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h
+++ b/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h
@@ -1,7 +1,7 @@
#ifndef PYDYNAMIC_MONITOR_PROXY_H
#define PYDYNAMIC_MONITOR_PROXY_H
-#include
+#include
#include
#include "MonitorBase.h"
#include "DynoLogNpuMonitor.h"
@@ -14,15 +14,21 @@ public:
PyDynamicMonitorProxy() = default;
bool InitDyno(int npuId)
{
- try {
- monitor_ = DynoLogNpuMonitor::GetInstance();
- monitor_->SetNpuId(npuId);
- bool res = monitor_->Init();
- return res;
- } catch (const std::exception &e) {
- std::cout << "[ERROR] Error when init dyno " << e.what() << std::endl;
- return false;
- }
+ try {
+ if (!google::IsGoogleLoggingInitialized()) {
+ google::InitGoogleLogging("DynoLogNpuMonitor");
+ google::SetLogDestination(google::GLOG_INFO, "/var/log/dynolog_npu_");
+ google::SetLogFilenameExtension(".log");
+ }
+ monitor_ = DynoLogNpuMonitor::GetInstance();
+ monitor_->SetNpuId(npuId);
+ bool res = monitor_->Init();
+ LOG(ERROR) << res;
+ return res;
+ } catch (const std::exception &e) {
+ LOG(ERROR) << "Error when init dyno " << e.what();
+ return false;
+ }
}
std::string PollDyno()
diff --git a/dynolog_npu/plugin/ipc_monitor/utils.cpp b/dynolog_npu/plugin/ipc_monitor/utils.cpp
index 936821fd34bc34bc9db9e09515132e8af39ba57a..b57942082e0fd52426ddce47bfc70620bf19019f 100644
--- a/dynolog_npu/plugin/ipc_monitor/utils.cpp
+++ b/dynolog_npu/plugin/ipc_monitor/utils.cpp
@@ -68,11 +68,11 @@ std::pair GetParentPidAndCommand(int32_t pid)
if (std::getline(statFile, line)) {
int ret = sscanf(line.c_str(), "%*d (%[^)]) %*c %d", command.data(), &parentPid);
if (ret == 2) { // 2: 接收到2个字符
- std::cout << "[INFO] Success to get parent pid: " << parentPid << std::endl;
+ LOG(INFO) << "Success to get parent pid: " << parentPid;
return std::make_pair(parentPid, command);
}
}
- std::cout << "[WARNING] Failed to parse /proc/" << pid << "/stat" << std::endl;
+ LOG(ERROR) << " Failed to parse /proc/" << pid << "/stat";
return std::make_pair(0, "");
}
diff --git a/dynolog_npu/plugin/ipc_monitor/utils.h b/dynolog_npu/plugin/ipc_monitor/utils.h
index 0d8ceb8cfd0bf81b6d8b807c6ac1b505276ddf83..2374a27d417f91bc23108a892c6eb25cbb5039d8 100644
--- a/dynolog_npu/plugin/ipc_monitor/utils.h
+++ b/dynolog_npu/plugin/ipc_monitor/utils.h
@@ -10,7 +10,7 @@
#include
#include
#include
-#include
+#include
#include
diff --git a/dynolog_npu/plugin/setup.py b/dynolog_npu/plugin/setup.py
index 151b9b3fb3fa1a42e147685f632163c8b3f5a564..55e924c6b6950c2a9f8f466159ea56184f77e1a6 100644
--- a/dynolog_npu/plugin/setup.py
+++ b/dynolog_npu/plugin/setup.py
@@ -13,25 +13,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
+from glob import glob
from setuptools import setup
from pybind11.setup_helpers import Pybind11Extension
BASE_DIR = os.path.dirname(os.path.realpath(__file__))
+DYNOLOG_PATH = os.path.join(os.path.dirname(BASE_DIR), "third_party", "dynolog")
+GLOG_INC_PATH = os.path.join(DYNOLOG_PATH, "third_party", "glog", "src")
+GLOG_LIB_PATH = os.path.join(DYNOLOG_PATH, "build", "third_party", "glog")
# Define the extension module
ext_modules = [
Pybind11Extension(
"IPCMonitor", # Name of the Python module
- sources=["bindings.cpp",
- "ipc_monitor/utils.cpp",
- "ipc_monitor/DynoLogNpuMonitor.cpp",
- "ipc_monitor/NpuIpcClient.cpp",
- ], # Source files
- include_dirs=[os.path.join(BASE_DIR, "ipc_monitor")], # Include Pybind11 headers
+ sources=["bindings.cpp"] + list(glob("ipc_monitor/*.cpp")), # Source files
+ include_dirs=[os.path.join(BASE_DIR, "ipc_monitor"), GLOG_INC_PATH, GLOG_LIB_PATH], # Include Pybind11 headers
+ library_dirs=[GLOG_LIB_PATH],
+ libraries=["glog"],
language="c++", # Specify the language
),
]
+
# Set up the package
setup(
name="dynolog_npu_plugin",
diff --git a/profiler/msprof_analyze/cluster_analyse/README.md b/profiler/msprof_analyze/cluster_analyse/README.md
index 325a0984793297dfac28673f04a582ea7b4316b9..6612d0f1989c1028ae560e0a6e260f3b673a959d 100644
--- a/profiler/msprof_analyze/cluster_analyse/README.md
+++ b/profiler/msprof_analyze/cluster_analyse/README.md
@@ -79,6 +79,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig(
| compute_op_sum | 集群场景性能数据的device运行算子信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/ComputeOpSum目录下输出交付件stats.ipynb;可根据实际情况决定是否是否打开--exclude_op_name。 | 否 |
| hccl_sum | 集合通信算子耗时分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/HcclSum目录下输出交付件stats.ipynb。 | 否 |
| mstx_sum | 集群场景mstx打点信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/MstxSum目录下输出交付件stats.ipynb。 | 否 |
+ | freq_analysis | 集群场景aicore frequency信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。打屏输出是否存在aicore空闲(频率为800MHz)、异常(频率不为1800MHz或800MHz)的现象。如果有,则在输出交付件cluster_analysis.db增加对应的卡和频率信息。 | 否 |
| 自定义分析参数 | 与cann_api_sum、compute_op_sum、hccl_sum等参数功能类似,用户可自定义一套性能数据的分析规则,需要详细了解性能分析的开发人员,具体开发指导请参见“[自定义分析规则开发指导](#自定义分析规则开发指导)”。 | 否 |
--parallel_mode参数示例如下:
diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py
index 2ad5797cc924c8bfb51326387d72dceb043fc2ad..3839fe66aac2cf91f6ef08d38270a3d84143d6ee 100644
--- a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py
+++ b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py
@@ -22,6 +22,8 @@ from msprof_analyze.prof_common.db_manager import DBManager
from msprof_analyze.cluster_analyse.common_func.utils import increase_shared_value
from msprof_analyze.prof_common.constant import Constant
from msprof_analyze.prof_common.logger import get_logger
+from msprof_analyze.cluster_analyse.common_func.utils import double_hash
+from msprof_analyze.prof_common.file_manager import FileManager
logger = get_logger()
@@ -70,30 +72,46 @@ class CommMatrixAnalysis(BaseAnalysis):
self.combine_link_info(step_dict)
def merge_same_links(self, step_dict: dict):
- def process_link_key(rank_id, rank_dict):
+ def update_rank_map(step_dict):
+ for op_name, op_dict in step_dict.items():
+ group_name = op_name.split("@")[-1]
+ for rank_id, rank_dict in op_dict.items():
+ for link_key in rank_dict:
+ if '-' not in link_key:
+ logger.warning("%s has an invalid link key %s!", str(op_name), str(link_key))
+ break
+ src_rank = link_key.split('-')[0]
+ dst_rank = link_key.split('-')[1]
+ if src_rank == dst_rank:
+ if src_rank not in project_local_global_rank_map.get(group_name, {}):
+ project_local_global_rank_map.setdefault(group_name, {})[src_rank] = rank_id
+ elif project_local_global_rank_map.get(group_name, {}).get(src_rank) != rank_id:
+ logger.warning(f"In the same communication group {group_name}, global rank {rank_id} "
+ f"and {project_local_global_rank_map.get(group_name, {}).get(src_rank)} "
+ f"get the same local rank {src_rank}!")
+
+ def process_link_key(rank_dict):
for link_key in rank_dict:
if '-' not in link_key:
logger.warning("%s has an invalid link key %s!", str(op_name), str(link_key))
break
- src_rank = link_key.split('-')[0]
- dst_rank = link_key.split('-')[1]
- if src_rank == dst_rank:
- if src_rank not in project_local_global_rank_map:
- project_local_global_rank_map[src_rank] = rank_id
- elif project_local_global_rank_map.get(src_rank) != rank_id:
- logger.warning("In the same communication group, local ranks projecting to global ranks "
- "repeat!")
self.combine_link(link_info[link_key], rank_dict[link_key])
- def convert_local_to_global_rank():
+ def convert_local_to_global_rank(rank_map):
tmp_link = {}
for link_key, link_dict in link_info.items():
src_rank = link_key.split('-')[0]
dst_rank = link_key.split('-')[1]
- src_rank = project_local_global_rank_map[src_rank] \
- if src_rank in project_local_global_rank_map else src_rank
- dst_rank = project_local_global_rank_map[dst_rank] \
- if dst_rank in project_local_global_rank_map else dst_rank
+ if src_rank not in rank_map:
+ logger.warning(f"The src local rank {src_rank} of the operator {op_name} "
+ f"cannot be mapped to the global rank.")
+ continue
+ if dst_rank not in rank_map:
+ logger.warning(f"The dst local rank {dst_rank} of the operator {op_name} "
+ f"cannot be mapped to the global rank.")
+ continue
+ src_rank = rank_map[src_rank]
+ dst_rank = rank_map[dst_rank]
link_dict[Constant.BANDWIDTH_GB_S] = \
self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0),
link_dict.get(Constant.TRANSIT_TIME_MS, 0))
@@ -106,12 +124,14 @@ class CommMatrixAnalysis(BaseAnalysis):
Constant.TRANSIT_SIZE_MB: 0,
Constant.OP_NAME: ''
}
+ project_local_global_rank_map = self.get_parallel_group_info()
+ update_rank_map(step_dict)
for op_name, op_dict in step_dict.items():
link_info = defaultdict(lambda: copy.deepcopy(default_value))
- project_local_global_rank_map = dict()
- for rank_id, rank_dict in op_dict.items():
- process_link_key(rank_id, rank_dict)
- step_dict[op_name] = convert_local_to_global_rank()
+ group_name = op_name.split("@")[-1]
+ for rank_dict in op_dict.values():
+ process_link_key(rank_dict)
+ step_dict[op_name] = convert_local_to_global_rank(project_local_global_rank_map.get(group_name, {}))
def combine_link_info(self, step_dict: dict):
default_value = {
@@ -131,6 +151,19 @@ class CommMatrixAnalysis(BaseAnalysis):
link_dict.get(Constant.TRANSIT_TIME_MS, 0))
step_dict[Constant.TOTAL_OP_INFO] = total_op_info
+ def get_parallel_group_info(self):
+ parallel_group_info = {}
+ for profiler_path in self.data_map.values():
+ meta_json = os.path.join(profiler_path, "profiler_metadata.json")
+ if os.path.exists(meta_json):
+ meta_data = FileManager.read_json_file(meta_json)
+ for group_name, group_info in meta_data.get("parallel_group_info", {}).items():
+ global_ranks = group_info.get("global_ranks")
+ if isinstance(global_ranks, list) and global_ranks:
+ global_ranks.sort()
+ parallel_group_info[double_hash(group_name)] = dict(enumerate(global_ranks))
+ return parallel_group_info
+
class CommMatrixAnalysisOptimized(CommMatrixAnalysis):
SAVED_JSON = "cluster_communication_matrix.json"
diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/__init__.py b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc7afa393a2a6734104847efb8daaaa9223d9b4
--- /dev/null
+++ b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2024, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import defaultdict
+import pandas as pd
+
+from msprof_analyze.cluster_analyse.recipes.base_recipe_analysis import BaseRecipeAnalysis
+from msprof_analyze.prof_common.constant import Constant
+from msprof_analyze.prof_common.logger import get_logger
+from msprof_analyze.prof_common.database_service import DatabaseService
+
+logger = get_logger()
+
+
+class FreqAnalysis(BaseRecipeAnalysis):
+ COMMON_FREQ = 1800
+ FREE_FREQ = 800
+
+ def __init__(self, params):
+ super().__init__(params)
+ self.free_freq_ranks = []
+ self.abnormal_freq_ranks = []
+ self.abnormal_freq_ranks_map = {}
+
+ @property
+ def base_dir(self):
+ return os.path.basename(os.path.dirname(__file__))
+
+ def reducer_func(self, mapper_res):
+ if self._is_msprof:
+ logger.warning("Freq analysis do not support msprof db now.")
+ return
+
+ mapper_res = list(filter(lambda res: res is not None, mapper_res))
+ if not mapper_res:
+ logger.error("Mapper data is None, load profiling data failed.")
+ return
+
+ for freqs, rank_id in mapper_res:
+ if freqs == [self.COMMON_FREQ]:
+ continue
+ elif set(freqs) == {self.COMMON_FREQ, self.FREE_FREQ}:
+ self.free_freq_ranks.append(rank_id)
+ else:
+ self.abnormal_freq_ranks.append(rank_id)
+ self.abnormal_freq_ranks_map[rank_id] = str(freqs)
+
+ self.free_freq_ranks.sort()
+ self.abnormal_freq_ranks.sort()
+
+ def save_db(self):
+ if len(self.free_freq_ranks) > 0:
+ logger.info(f"Found {len(self.free_freq_ranks)} ranks with free time, "
+ f"aicore frequency in {[self.FREE_FREQ, self.COMMON_FREQ]}.")
+ free_ranks_df = pd.DataFrame()
+ free_ranks_df["rankId"] = self.free_freq_ranks
+ free_ranks_df["aicoreFrequency"] = str([self.FREE_FREQ, self.COMMON_FREQ])
+ free_ranks_df.set_index(["rankId"], inplace=True)
+ self.dump_data(free_ranks_df, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "FreeFrequencyRanks")
+ else:
+ logger.info("No rank found with free time.")
+ if len(self.abnormal_freq_ranks) > 0:
+ logger.info(f"Found {len(self.abnormal_freq_ranks)} ranks with abnormal aicore frequency.")
+
+ abnormal_ranks_df = pd.DataFrame.from_dict(self.abnormal_freq_ranks_map,
+ orient="index", columns=["aicoreFrequency"])
+ abnormal_ranks_df = abnormal_ranks_df.reset_index().rename(columns={"index": "rankId"})
+ abnormal_ranks_df.set_index(["rankId"], inplace=True)
+ self.dump_data(abnormal_ranks_df, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "AbnormalFrequencyRanks")
+ else:
+ logger.info("No rank found with abnormal aicore frequency.")
+ if len(self.free_freq_ranks) > 0 or len(self.abnormal_freq_ranks) > 0:
+ logger.info("Please verify result in output file.")
+
+ def run(self, context):
+ mapper_res = self.mapper_func(context)
+ self.reducer_func(mapper_res)
+ self.save_db()
+
+ def _mapper_func(self, data_map, analysis_class):
+ profiler_db_path = data_map.get(Constant.PROFILER_DB_PATH)
+ service = DatabaseService(profiler_db_path, None)
+ service.add_table_for_query("AICORE_FREQ", ["deviceId", "freq"])
+ service.add_table_for_query("RANK_DEVICE_MAP", ["rankId"])
+ service_res = service.query_data()
+ aic_freq = service_res.get("AICORE_FREQ", None)
+ rank_id = service_res.get("RANK_DEVICE_MAP", None)
+
+ if aic_freq is None or aic_freq.empty:
+ logger.error(f"No aic freq data found in {profiler_db_path}.")
+ return None
+
+ if rank_id is None or rank_id.empty:
+ logger.error(f"No rank_id data found in {profiler_db_path}.")
+ return None
+
+ rank_id = rank_id["rankId"].values[0]
+ freq_arr = aic_freq["freq"].values
+ freqs = list(set(freq_arr))
+ freqs.sort()
+ return freqs, rank_id
diff --git a/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py b/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a559b79178d03879df6901703c66c7cfcd03663
--- /dev/null
+++ b/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2025, Huawei Technologies Co., Ltd.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import random
+import unittest
+
+import pandas as pd
+
+from msprof_analyze.cluster_analyse.recipes.freq_analysis.freq_analysis import FreqAnalysis
+
+
+class TestFreqAnalysis(unittest.TestCase):
+
+ freq = [1800]
+ free_freq = [800, 1800]
+ abnormal_freq = [1200, 1300, 1800]
+
+ def test_no_error_freq(self):
+ params = {}
+ recipe = FreqAnalysis(params)
+ mapper_res = [(self.freq, 0)] * 10
+ recipe.reducer_func(mapper_res)
+ self.assertEqual(recipe.free_freq_ranks, [])
+ self.assertEqual(recipe.abnormal_freq_ranks, [])
+ self.assertEqual(recipe.abnormal_freq_ranks_map, {})
+
+
+ def test_free_rank_map(self):
+ params = {}
+ recipe = FreqAnalysis(params)
+ mapper_res = [
+ (self.freq, 0),
+ (self.free_freq, 1),
+ (self.free_freq, 2),
+ (self.freq, 3)
+ ]
+ recipe.reducer_func(mapper_res)
+ self.assertEqual(recipe.free_freq_ranks, [1, 2])
+ self.assertEqual(recipe.abnormal_freq_ranks, [])
+ self.assertEqual(recipe.abnormal_freq_ranks_map, {})
+
+ def test_abnormal_rank_map(self):
+ params = {}
+ recipe = FreqAnalysis(params)
+ mapper_res = [
+ (self.freq, 0),
+ (self.abnormal_freq, 1),
+ (self.abnormal_freq, 2),
+ (self.freq, 3)
+ ]
+
+ recipe.reducer_func(mapper_res)
+ self.assertEqual(recipe.free_freq_ranks, [])
+ self.assertEqual(recipe.abnormal_freq_ranks, [1, 2])
+
+ def test_mix_freq_case(self):
+ params = {}
+ recipe = FreqAnalysis(params)
+ mapper_res = []
+ rank_case = [[], [], []]
+ random_freq = {0: self.freq, 1: self.free_freq, 2: self.abnormal_freq}
+
+ for i in range(1000):
+ random_num = random.choice([0, 1, 2])
+ mapper_res.append((random_freq.get(random_num, self.freq), i))
+ rank_case[random_num].append(i)
+
+ recipe.reducer_func(mapper_res)
+ self.assertEqual(recipe.free_freq_ranks, rank_case[1])
+ self.assertEqual(recipe.abnormal_freq_ranks, rank_case[2])