diff --git a/debug/accuracy_tools/msprobe/README.md b/debug/accuracy_tools/msprobe/README.md index e31490f01e9f9d61504d9ee2311c82497323d886..6b7d483078a6a744ce935591ced0971dea2f5b2f 100644 --- a/debug/accuracy_tools/msprobe/README.md +++ b/debug/accuracy_tools/msprobe/README.md @@ -44,6 +44,7 @@ export MSPROBE_LOG_LEVEL={x} - msprobe支持AscendPyTorch 1.11.0或更高版本,支持的PyTorch和CANN以及PyTorch和python软件版本配套关系请参见《[Ascend Extension for PyTorch插件](https://gitee.com/ascend/pytorch)》。 - msprobe支持MindSpore 2.4.0或更高版本,支持的MindSpore和CANN以及MindSpore和python软件版本配套关系请参见《[MindSpore版本发布列表](https://www.mindspore.cn/versions)》。 +- msprobe支持MSAdapter 2.1.0。 - msprobe支持的固件驱动版本与配套CANN软件支持的固件驱动版本相同,开发者可通过“[昇腾社区-固件与驱动](https://gitee.com/link?target=https%3A%2F%2Fwww.hiascend.com%2Fhardware%2Ffirmware-drivers%2Fcommunity%3Fproduct%3D2%26model%3D28%26cann%3D8.0.RC3.alpha003%26driver%3D1.0.25.alpha)”页面根据产品型号与CANN软件版本获取配套的固件与驱动。 @@ -69,15 +70,17 @@ export MSPROBE_LOG_LEVEL={x} ### 1 数据采集 -msprobe 通过在训练脚本中添加 PrecisionDebugger 接口的方式对 API 执行精度数据 dump 操作,对应 config.json 中的 task 为 statistics 或 tensor。 +msprobe 通过在训练脚本中添加 PrecisionDebugger 接口的方式对 API 执行精度数据 dump 操作。对应 config.json 中的 "statistics" 或 "tensor" task。 [PyTorch 场景的数据采集](./docs/05.data_dump_PyTorch.md) [MindSpore 场景的数据采集](./docs/06.data_dump_MindSpore.md) +[MSAdapter 场景的数据采集](./docs/29.data_dump_MSAdapter.md) + ### 2 精度预检 -精度预检旨在昇腾 NPU 上扫描训练模型中的所有 API 进行 API 复现,给出精度情况的诊断和分析。对应 config.json 中的 task 为 run_ut。 +精度预检旨在昇腾 NPU 上扫描训练模型中的所有 API 进行 API 复现,给出精度情况的诊断和分析。对应 config.json 中的 "run_ut" task。 PyTorch 场景的[离线预检](./docs/07.accuracy_checker_PyTorch.md)和[在线预检](./docs/08.accuracy_checker_online_PyTorch.md) @@ -143,12 +146,14 @@ MindSpore 动态图场景的[离线预检](./docs/09.accuracy_checker_MindSpore. 
### 12 溢出检测与解析 -溢出检测与解析是在执行精度数据 dump 时,判断是否存在输入正常但输出存在溢出的 API,从而判断是否为正常溢出。对应 config.json 中的 overflow_check。 -推荐直接使用[数据采集](#1-数据采集)功能采集统计量信息检测溢出问题。 +溢出检测用于采集溢出 API 或 模块的精度数据,而溢出解析则是通过对溢出数据的分析,进一步判断是否为正常溢出。对应 config.json 中的 "overflow_check" task。 +推荐直接使用[数据采集](#1-数据采集)功能采集统计量信息,检测溢出问题。 [PyTorch 场景的溢出检测与解析](./docs/12.overflow_check_PyTorch.md) -[MindSpore 场景的溢出检测与解析](./docs/13.overflow_check_MindSpore.md) +[MindSpore 场景的溢出检测](./docs/13.overflow_check_MindSpore.md) + +[MSAdapter 场景的溢出检测](./docs/30.overflow_check_MSAdapter.md) ## 📑 补充材料 diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index fdc626ca6a1a90e9060cefa237f9d5d8d7e42844..89d33a6a3e6fe830b981483edbe2aa6a4e5aa41f 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +23,7 @@ import shutil from datetime import datetime, timezone from dateutil import parser import yaml + import numpy as np import pandas as pd @@ -446,8 +447,6 @@ def save_excel(path, data): change_mode(path, FileCheckConst.DATA_FILE_AUTHORITY) - - def move_file(src_path, dst_path): check_file_or_directory_path(src_path) check_path_before_create(dst_path) diff --git a/debug/accuracy_tools/msprobe/core/common/utils.py b/debug/accuracy_tools/msprobe/core/common/utils.py index c06b5b64927bf47da1573df3b1d4db34dfa24cb1..7ec0490168f3ec3c39afcd0915f85609e39f0030 100644 --- a/debug/accuracy_tools/msprobe/core/common/utils.py +++ b/debug/accuracy_tools/msprobe/core/common/utils.py @@ -247,6 +247,10 @@ def md5_find(data): def detect_framework_by_dump_json(file_path): + json_data = load_json(file_path) + framework = json_data.get("framework", None) + if framework in [Const.PT_FRAMEWORK, Const.MS_FRAMEWORK]: + return framework pattern_ms = r'"type":\s*"mindspore' pattern_pt = r'"type":\s*"torch' with FileOpen(file_path, 'r') as file: diff --git a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py index f0ac97a0293b5a7ec95b61a4805af179a087eafc..28a7b5f3a87aa1ff9cac73672f55e5d7e4f5407c 100644 --- a/debug/accuracy_tools/msprobe/core/compare/acc_compare.py +++ b/debug/accuracy_tools/msprobe/core/compare/acc_compare.py @@ -34,6 +34,8 @@ from msprobe.core.compare.multiprocessing_compute import ComparisonResult, _hand from msprobe.core.compare.npy_compare import compare_ops_apply, get_error_flag_and_msg from msprobe.core.compare.utils import get_accuracy, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ print_compare_ends_info, read_op, get_name_and_state, reorder_op_x_list +from msprobe.pytorch.compare.pt_compare import read_pt_data +from msprobe.mindspore.compare.ms_compare import read_npy_data class ModeConfig: @@ -329,7 
+331,9 @@ class Comparator: else: result_item.append(CompareConst.NONE) if self.dump_mode == Const.ALL: - result_item.append(npu_ops_all.get(ms_op_name).get("data_name", None)) + ms_data_name = npu_ops_all.get(ms_op_name).get("data_name", None) + pt_data_name = bench_ops_all.get(bench_op_name).get("data_name", None) + result_item.append([ms_data_name, pt_data_name]) result.append(result_item) elif ms_op_name not in npu_ops_all: logger.warning(f'Can not find npu op name : `{ms_op_name}` in npu dump json file.') @@ -349,47 +353,48 @@ class Comparator: result_df = self.make_result_table(result) return result_df - def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param, bench_data): + def compare_by_op(self, npu_op_name, bench_op_name, op_name_mapping_dict, input_param): """ :param npu_op_name: excel中的NPU_Name,例如:MintFunctional.conv2d.0.forward.input.3.0 :param bench_op_name: excel中的Bench_Name,例如:Functional.conv2d.0.forward.input.3.0 :param op_name_mapping_dict: op_name和npy或pt文件的映射关系 :param input_param: npu_json_path/bench_json_path/stack_json_path等参数 - :param bench_data: bench的dump数据中"data"字段 :return: result_list,包含余弦相似度、最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率和错误信息 - 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、 + 用于读取excel中的NPU_Name和Bench_Name,根据映射关系找到npy或pt文件,然后读取文件中的数据进行比较,计算余弦相似度、欧式距离 最大绝对误差、最大相对误差、千分之一误差率、千分之五误差率并生成错误信息 """ - npu_bench_name_list = op_name_mapping_dict[npu_op_name] - data_name = safe_get_value(npu_bench_name_list, 1, "npu_bench_name_list") error_file, relative_err, error_flag = None, None, False - bench_data_name = get_bench_data_name(bench_op_name, bench_data) - if data_name == '-1' or data_name == -1: # 没有真实数据路径 - n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE - error_flag = True - elif not bench_data_name: + + data_name_pair = op_name_mapping_dict.get(npu_op_name) + npu_data_name = data_name_pair[0] + bench_data_name = data_name_pair[1] + + if str(npu_data_name) == '-1': # 
没有npu真实数据 + n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True + elif str(bench_data_name) == '-1': # 没有bench真实数据 n_value, b_value, error_flag = CompareConst.READ_NONE, CompareConst.READ_NONE, True error_file = 'no_bench_data' else: + npu_dir = input_param.get("npu_dump_data_dir") + bench_dir = input_param.get("bench_dump_data_dir") try: - read_npy_data = getattr(self, "read_npy_data") frame_name = getattr(self, "frame_name") + if frame_name == "MSComparator": - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.NUMPY_SUFFIX) + n_value = read_npy_data(npu_dir, npu_data_name) if self.cross_frame: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name, - load_pt_file=True) + b_value = read_pt_data(bench_dir, bench_data_name) else: - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + b_value = read_npy_data(bench_dir, bench_data_name) else: - n_value = read_npy_data(input_param.get("npu_dump_data_dir"), npu_op_name + Const.PT_SUFFIX) - b_value = read_npy_data(input_param.get("bench_dump_data_dir"), bench_data_name) + n_value = read_pt_data(npu_dir, npu_data_name) + b_value = read_pt_data(bench_dir, bench_data_name) except IOError as error: error_file = error.filename n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True except (FileCheckException, CompareException): - error_file = data_name + error_file = npu_data_name n_value, b_value = CompareConst.READ_NONE, CompareConst.READ_NONE error_flag = True @@ -464,7 +469,7 @@ class Comparator: err_mess = [] is_print_compare_log = input_param.get("is_print_compare_log") - bench_data = load_json(input_param.get("bench_json_path")).get('data') + for i in range(len(result_df)): npu_op_name = result_df.iloc[i, 0] bench_op_name = result_df.iloc[i, 1] @@ -472,7 +477,7 @@ class Comparator: logger.info("start compare: {}".format(npu_op_name)) cos_sim, euc_dist, max_abs_err, 
max_relative_err, one_thousand_err_ratio, five_thousand_err_ratio, err_msg \ - = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param, bench_data) + = self.compare_by_op(npu_op_name, bench_op_name, dump_path_dict, input_param) if is_print_compare_log: logger.info( @@ -508,46 +513,3 @@ class Comparator: except ValueError as e: logger.error('result dataframe is not found.') raise CompareException(CompareException.INVALID_DATA_ERROR) from e - - -def get_bench_data_name(bench_op_name, bench_data): - bench_name_list = re.split(r'\.(input|output|kwargs|parameters|parameters_grad)\.', bench_op_name) - if len(bench_name_list) > 1 and bench_name_list[1] == Const.PARAMS_GRAD: - bench_data_bundle = bench_data.get(bench_name_list[0] + Const.SEP + bench_name_list[1], {}) - else: - bench_data_bundle = bench_data.get(bench_name_list[0], {}) - if not bench_data_bundle or len(bench_name_list) < 3: - return None - layers = bench_name_list[2].split(Const.SEP) - - def _get(key, container): - if isinstance(container, dict): - return container.get(key) - if isinstance(container, list): - try: - return container[int(key)] - except (ValueError, IndexError): - return None - return None - - def get_by_layer(container, params_grad=False): - data = container - # dump.json中parameters_grad的结构为key:[{}], 如果存在key,有且只有一个列表元素,而op_name中只命名到了key,因此加'0' - if params_grad: - layers.append('0') - for layer in layers: - data = _get(layer, data) - return _get(CompareConst.DATA_NAME.lower(), data) - - if Const.INPUT == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.INPUT, bench_data_bundle.get(Const.INPUT_ARGS))) - elif Const.KWARGS == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.INPUT_KWARGS)) - elif Const.OUTPUT == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.OUTPUT)) - elif Const.PARAMS == bench_name_list[1]: - return get_by_layer(bench_data_bundle.get(Const.PARAMS)) - elif Const.PARAMS_GRAD == 
bench_name_list[1]: - return get_by_layer(bench_data_bundle, params_grad=True) - else: - return None diff --git a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py index f79671827c1efc30f3f0a573e23d9d72f2fbd289..71b0f29d64f717adc87b74cf48e891652e9e753f 100644 --- a/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/core/compare/multiprocessing_compute.py @@ -25,7 +25,7 @@ from msprobe.core.common.utils import CompareException from msprobe.core.common.const import CompareConst -def _handle_multi_process(func, input_parma, result_df, lock): +def _handle_multi_process(func, input_param, result_df, lock): process_num = max(int((multiprocessing.cpu_count() + 1) // 4), 1) op_name_mapping_dict = read_dump_data(result_df) @@ -55,7 +55,7 @@ def _handle_multi_process(func, input_parma, result_df, lock): idx = df_chunk_size * process_idx chunk_size = len(df_chunk) result = pool.apply_async(func, - args=(idx, op_name_mapping_dict, df_chunk, lock, input_parma), + args=(idx, op_name_mapping_dict, df_chunk, lock, input_param), error_callback=err_call, callback=partial(update_progress, chunk_size, lock) ) @@ -97,12 +97,12 @@ def _ms_graph_handle_multi_process(func, result_df, mode): def read_dump_data(result_df): try: npu_dump_name_list = result_df.iloc[0:, 0].tolist() - npu_dump_tensor_list = result_df.iloc[0:, -1].tolist() + dump_tensor_pair_list = result_df.iloc[0:, -1].tolist() op_name_mapping_dict = {} for index, _ in enumerate(npu_dump_name_list): npu_dump_name = npu_dump_name_list[index] - npu_dump_tensor = npu_dump_tensor_list[index] - op_name_mapping_dict[npu_dump_name] = [npu_dump_tensor, npu_dump_tensor] + dump_tensor_pair = dump_tensor_pair_list[index] + op_name_mapping_dict[npu_dump_name] = dump_tensor_pair return op_name_mapping_dict except ValueError as e: logger.error('result dataframe is not found.') diff --git 
a/debug/accuracy_tools/msprobe/core/compare/utils.py b/debug/accuracy_tools/msprobe/core/compare/utils.py index 72b75ab254e59a4ec5788e95fde6721df2babe46..471951ce4b5fd12d6fdff26a1584e261ea86d71c 100644 --- a/debug/accuracy_tools/msprobe/core/compare/utils.py +++ b/debug/accuracy_tools/msprobe/core/compare/utils.py @@ -19,11 +19,12 @@ import math import zlib from dataclasses import dataclass +import torch import numpy as np from msprobe.core.common.const import Const, CompareConst, FileCheckConst from msprobe.core.common.utils import CompareException, check_regex_prefix_format_valid, logger, safe_get_value -from msprobe.core.common.file_utils import check_file_or_directory_path +from msprobe.core.common.file_utils import check_file_or_directory_path, FileChecker, load_npy def extract_json(dirname, stack_json=False): @@ -321,8 +322,8 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): has_stack = npu_stack_info and bench_stack_info if dump_mode == Const.ALL: - npu_data_name = n_dict.get("data_name", None) - bench_data_name = b_dict.get("data_name", None) + npu_data_name_list = n_dict.get("data_name", None) + bench_data_name_list = b_dict.get("data_name", None) for index in range(min_len): n_name = safe_get_value(n_dict, n_start + index, "n_dict", key="op_name") @@ -353,7 +354,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): result_item.append(err_msg) result_item = stack_column_process(result_item, has_stack, index, key, npu_stack_info) if dump_mode == Const.ALL: - result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name")) + npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") + bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") + result_item.append([npu_data_name, bench_data_name]) result.append(result_item) @@ -388,7 +391,9 @@ def get_accuracy(result, n_dict, b_dict, dump_mode): result_item.append(err_msg) result_item = 
stack_column_process(result_item, has_stack, index, key, npu_stack_info) if dump_mode == Const.ALL: - result_item.append(safe_get_value(npu_data_name, n_start + index, "npu_data_name")) + npu_data_name = safe_get_value(npu_data_name_list, n_start + index, "npu_data_name_list") + bench_data_name = safe_get_value(bench_data_name_list, n_start + index, "bench_data_name_list") + result_item.append([npu_data_name, bench_data_name]) result.append(result_item) @@ -467,7 +472,7 @@ def get_un_match_accuracy(result, n_dict, dump_mode): result_item.append(err_msg) append_stack_info(result_item, npu_stack_info, index) if dump_mode == Const.ALL and result_item[1] == CompareConst.N_A: - result_item.extend(["-1"]) + result_item.extend([["-1", "-1"]]) result.append(result_item) diff --git a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py index 2cd98b125682434b517f6d70e09ea6a850b3e3bb..c3c0c3aa734bc9effadab8a591e314610218611d 100644 --- a/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/core/data_dump/data_processor/pytorch_processor.py @@ -29,7 +29,7 @@ from msprobe.core.common.log import logger from msprobe.core.common.utils import convert_tuple from msprobe.core.data_dump.data_processor.base import BaseDataProcessor, ModuleBackwardInputsOutputs, \ ModuleForwardInputsOutputs, TensorStatInfo -from msprobe.pytorch.common.utils import save_pt, load_pt +from msprobe.pytorch.common.utils import save_pt from msprobe.pytorch.free_benchmark import FreeBenchmarkCheck, UnequalRow from msprobe.core.common.utils import recursion_depth_decorator @@ -145,7 +145,7 @@ class PytorchDataProcessor(BaseDataProcessor): if data.is_meta: return tensor_stat data_clone = data.detach() - if data_clone.numel() == 0: + if not data_clone.numel() or not data_clone.data_ptr(): return tensor_stat else: if data_clone.device.type == 
Const.CPU_LOWERCASE or not async_dump: diff --git a/debug/accuracy_tools/msprobe/docs/01.installation.md b/debug/accuracy_tools/msprobe/docs/01.installation.md index 1ab5f6419ba07ec749bad139f874fbc7301fd8b3..530783e87d0bdadd51856cb1ae08160cb081da80 100644 --- a/debug/accuracy_tools/msprobe/docs/01.installation.md +++ b/debug/accuracy_tools/msprobe/docs/01.installation.md @@ -16,7 +16,7 @@ pip install mindstudio-probe |版本|发布日期|支持 PyTorch 版本|支持 MindSpore 版本|下载链接|校验码| |:--:|:--:|:--:|:--:|:--:|:--:| -|1.2.2|2025.2.26|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|1db0cf4572bc0305c68705b74775f652c6cb2c2bedb6c6e57f43e31ab273b288| +|1.2.2|2025.3.03|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.2-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.2-py3-none-any.whl)|961411bb460d327ea51d6ca4d0c8e8c5565f07c0852d7b8592b781ca35b87212| |1.2.1|2025.2.07|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.1-py3-none-any.whl)|b64b342118558e0339b39237f88a49b93fd24551b0cb202c872fbfef4260c86b| |1.2.0|2025.1.13|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.2.0-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.2/mindstudio_probe-1.2.0-py3-none-any.whl)|1e3aeea1706112f6ee52fd1165037936bb209138f0b9ec42ea21e2c1c8942cdc| |1.1.1|2024.12.09|1.11/2.0/2.1/2.2|2.4.0|[mindstudio_probe-1.1.1-py3-none-any.whl](https://ptdbg.obs.myhuaweicloud.com/msprobe/1.1/mindstudio_probe-1.1.1-py3-none-any.whl)|577b597555dc155b76ba1a62d575c3546004644e140a456c3ba0824d46283735| diff --git a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md index f134bd4536294d209e7b3e6e73fd80b9be61041d..a5f17637dae817de6eba091e9eef602ca95f091a 100644 --- a/debug/accuracy_tools/msprobe/docs/02.config_introduction.md +++ 
b/debug/accuracy_tools/msprobe/docs/02.config_introduction.md @@ -12,23 +12,23 @@ | 参数 | 解释 | 是否必选 | | ----------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------- | -| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对;
"grad_probe":梯度监控;
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 | +| task | dump 的任务类型,str 类型。可选参数:
"statistics":仅采集统计信息,默认值;
"tensor":采集统计信息和完全复刻整网的真实数据;
"run_ut":精度预检,仅 PyTorch 场景支持,采集数据时勿选;
"overflow_check":溢出检测;
"free_benchmark":无标杆比对,不支持 MSAdapter 场景;
"grad_probe":梯度监控,不支持 MSAdapter 场景;<br>
"structure":仅采集模型结构以及调用栈信息,不采集具体数据。
根据 task 参数取值的不同,可以配置不同场景参数,详见:
[1.2 task 配置为 statistics](#12-task-配置为-statistics),
[1.3 task 配置为 tensor](#13-task-配置为-tensor),
[1.4 task 配置为 run_ut](#14-task-配置为-run_ut),
[1.5 task 配置为 overflow_check](#15-task-配置为-overflow_check),
[1.6 task 配置为 free_benchmark](#16-task-配置为-free_benchmark),
[1.7 task 配置为 grad_probe](#17-task-配置为-grad_probe)。
**配置示例**:"task": "tensor"。 | 否 | | dump_path | 设置 dump 数据目录路径,str 类型。
**配置示例**:"dump_path": "./dump_path"。 | 是 | | rank | 指定对某张卡上的数据进行采集,list[Union[int, str]] 类型,默认未配置(表示采集所有卡的数据),应配置元素为 ≥0 的整数或类似"4-6"的字符串,且须配置实际可用的 Rank ID。
PyTorch 场景: Rank ID 从 0 开始计数,最大取值为所有节点可用卡总数-1,若所配置的值大于实际训练所运行的卡的 Rank ID,则 dump 数据为空,比如当前环境 Rank ID 为 0 到 7,实际训练运行 0 到 3 卡,此时若配置 Rank ID 为 4 或不存在的 10 等其他值,dump 数据为空。
MindSpore 场景:所有节点的 Rank ID 均从 0 开始计数,最大取值为每个节点可用卡总数-1,config.json 配置一次 rank 参数对所有节点同时生效。
注意,单卡训练时,rank必须为[],即空列表,不能指定rank。
**配置示例**:"rank": [1, "4-6"]。 | 否 | | step | 指定采集某个 step 的数据,list[Union[int, str]] 类型。默认未配置,表示采集所有 step 数据。采集特定 step 时,须指定为训练脚本中存在的 step,可逐个配置,也可以指定范围。
**配置示例**:"step": [0, 1 , 2, "4-6"]。 | 否 | -| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch 与 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch 与 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore场景详细介绍见 [MindSpore 场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch 与 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 | +| level | dump 级别,str 类型,根据不同级别采集不同数据。可选参数:
"L0":dump 模块级精度数据,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持,使用背景详见 [1.1.1 模块级精度数据 dump 说明](#111-模块级精度数据-dump-说明);
"L1":dump API 级精度数据,默认值,仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持;
"L2":dump kernel 级精度数据,PyTorch 场景详细介绍见 [PyTorch 场景的 kernel dump 说明](./04.kernel_dump_PyTorch.md);MindSpore 动态图场景详细介绍见 [MindSpore 动态图场景的 kernel dump 说明](./28.kernel_dump_MindSpore.md);MindSpore 静态图场景详细介绍见《MindSpore 场景的数据采集》中的 ["**8.1 静态图场景**"](./06.data_dump_MindSpore.md#81-静态图场景)小节;
"mix":dump module 模块级和 API 级精度数据,即"L0"+"L1",仅 PyTorch、MSAdapter 以及 MindSpore 动态图场景支持。
"debug":单点保存功能,细节详见[单点保存工具 README](./28.debugger_save_instruction.md)
**配置示例**:"level": "L1"。 | 否 | | enable_dataloader | 自动控制开关,bool 类型,仅 PyTorch 场景支持。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后自动识别 step 参数指定的迭代,并在该迭代执行完成后退出训练,此时 start、stop 和 step 函数可不配置,开启该开关要求训练脚本是通过 torch.utils.data.dataloader 方式加载数据。仅支持 PyTorch 单卡训练使用,分布式训练场景下存在数据 dump 不全问题。 **这个特性下个版本将被废弃** | 否 | | async_dump | 异步 dump 开关,bool 类型。可选参数 true(开启)或 false(关闭),默认为 false。配置为 true 后开启异步 dump,即采集的精度数据会在当前 step 训练结束后统一落盘,训练过程中工具不触发同步操作。由于使用该模式有**显存溢出**的风险,当 task 配置为 tensor 时,即真实数据的异步dump模式,必须配置 [list](#13-task-配置为-tensor) 参数,指定需要 dump 的 tensor 。该模式暂不支持复数类型 tensor
的统计量计算。 | 否 | #### 1.1.1 模块级精度数据 dump 说明 -仅 PyTorch 与 MindSpore 动态图场景支持。 +仅 PyTorch、MSAdapter以及 MindSpore 动态图场景支持。 大模型场景下,通常不是简单的利用自动迁移能力实现从 GPU 到 NPU 的训练脚本迁移,而是会对 NPU 网络进行一系列针对性的适配,因此,常常会造成迁移后的 NPU 模型存在部分子结构不能与 GPU 原始模型完全对应。模型结构不一致导致 API 调用类型及数量不一致,若直接按照 API 粒度进行精度数据 dump 和比对,则无法完全比对所有的 API。 本小节介绍的功能是对模型中的大粒度模块进行数据 dump,使其比对时,对于无法以 API 粒度比对的模块可以直接以模块粒度进行比对。 -模块指的是继承 nn.Module 类(PyTorch场景)或 nn.Cell 类(MindSpore场景)的子类,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump 数据时以模块为粒度进行 dump。 +模块指的是继承 nn.Module 类(PyTorch 与 MSAdapter 场景)或 nn.Cell 类(MindSpore 场景)的子类,通常情况下这类模块就是一个小模型,可以被视为一个整体,dump 数据时以模块为粒度进行 dump。 @@ -36,21 +36,23 @@ - - - - + + - + - + + +
参数解释是否必选
scopePyTorch 和 MindSpore 动态图场景 dump 范围,list[str] 类型,默认未配置(list 也未配置时表示 dump 所有 API 的数据)。该参数可以在 [ ] 内配置两个模块名或 API 名,要求列表长度必须为2,需要配置按照工具命名格式的完整模块名或API名称,用于锁定区间,dump 该范围内的数据。
配置示例: +
scopePyTorch、MSAdapter 以及 MindSpore 动态图场景 dump 范围,list[str] 类型,默认未配置(list 也未配置时表示 dump 所有 API 的数据)。该参数可以在 [ ] 内配置两个模块名或 API 名,要求列表长度必须为2,需要配置按照工具命名格式的完整模块名或API名称,用于锁定区间,dump 该范围内的数据。
配置示例: "scope": ["Module.conv1.Conv2d.forward.0", "Module.fc2.Linear.forward.0"], 或 "scope": ["Cell.conv1.Conv2d.forward.0", "Cell.fc2.Dense.backward.0"], 或"scope": ["Tensor.add.0.forward", "Functional.square.2.forward"]。与 level 参数取值相关,level 为 L0 级别时,可配置模块名;level 为 L1 级别时,可配置 API 名, level为 mix 级别时,可配置为模块名或API名。
list自定义采集的算子列表,list[str] 类型,默认未配置(scope 也未配置时表示 dump 所有 API 的数据),包含以下配置方法:
PyTorch 和 MindSpore 动态图场景配置具体的 API 全称,dump 该 API 数据。在 PyTorch 场景,如果 level 配置成 L2,该配置为必填项。
配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch 和 MindSpore 动态图场景在level为 mix 级别时可以配置模块名称,dump该模块展开数据 (dump该模块从执行开始到执行结束期间的所有数据)。 +
PyTorch、MSAdapter 以及 MindSpore 动态图场景配置具体的 API 全称,dump 该 API 数据。在 PyTorch 场景,如果 level 配置成 L2,该配置为必填项。
配置示例:"list": ["Tensor.permute.1.forward", "Tensor.transpose.2.forward", "Torch.relu.3.backward"]。
PyTorch 和 MindSpore 动态图场景在level为 mix 级别时可以配置模块名称,dump该模块展开数据 (dump该模块从执行开始到执行结束期间的所有数据)。
配置示例:"list": ["Module.module.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"], 或 "list": ["Cell.network_with_loss.language_model.encoder.layers.0.mlp.ParallelMlp.forward.0"]
PyTorch 和 MindSpore 动态图场景指定某一类 API,dump 某一类的 API 级别输入输出数据。
配置示例:"list": ["relu"]。
PyTorch 和 MindSpore 动态图场景在level为 mix 级别时, 会dump名称中包含list中配置的字符串的API数据,还会将名称中包含list中配置的字符串的模块进行展开dump (dump该模块从执行开始到执行结束期间的所有数据)。
MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型("level": "L2"时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。
配置示例:list: ["name-regex(Default/.+)"]
可匹配算子名称以“Default/”开头的所有算子。
PyTorch、MSAdapter 以及 MindSpore 动态图场景指定某一类 API,dump 某一类的 API 级别输入输出数据。
配置示例:"list": ["relu"]。
PyTorch、MSAdapter 以及 MindSpore 动态图场景在level为 mix 级别时, 会dump名称中包含list中配置的字符串的API数据,还会将名称中包含list中配置的字符串的模块进行展开dump (dump该模块从执行开始到执行结束期间的所有数据)。
MindSpore 静态图场景配置 kernel_name,可以是算子的名称列表,也可以指定算子类型(jit_level=O2 时不支持),还可以配置算子名称的正则表达式(当字符串符合“name-regex(xxx)”格式时,后台则会将其作为正则表达式。
配置示例:list: ["name-regex(Default/.+)"]
可匹配算子名称以“Default/”开头的所有算子。
data_modedump 数据过滤,str 类型。
PyTorch 与 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。
配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。
PyTorch、MSAdapter 以及 MindSpore 动态图场景:支持"all"、"forward"、"backward"、"input"和"output",除"all"外,其余参数可以自由组合。默认为["all"],即保存所有 dump 的数据。
配置示例:"data_mode": ["backward"] (仅保存反向数据)或 "data_mode": ["forward", "input"](仅保存前向的输入数据)。
MindSpore 静态图场景:仅支持"all"、"input"和"output"参数,且各参数只能单独配置,不支持自由组合。
配置示例:"data_mode": ["all"]。
summary_mode控制 dump 文件输出的模式,str 类型,仅 PyTorch 与 MindSpore 动态图场景支持,可选参数:
md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性;
statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。
配置示例:"summary_mode": "md5"。
MindSpore静态图jit_level=O2场景L2级dump,支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。
配置示例:"summary_mode": ["max", "min"]。
summary_mode控制 dump 文件输出的模式,str 类型,支持 PyTorch、MSAdapter、MindSpore 动态图以及 MindSpore 静态图 jit_level=O2 场景。
PyTorch、MSAdapter 以及 MindSpore 动态图场景:可选参数为
md5:dump 输出包含 CRC-32 值以及 API 统计信息的 dump.json 文件,用于验证数据的完整性;
statistics:dump 仅输出包含 API 统计信息的 dump.json 文件,默认值。
配置示例:"summary_mode": "md5"。
MindSpore 静态图 jit_level=O2 场景:支持上述配置的同时额外支持配置统计项列表,可选统计项为max、min、mean、l2norm,可从中任意选取组合搭配。其中mean、l2norm的结果为float数据格式。
配置示例:"summary_mode": ["max", "min"]。
-**说明**:"summary_mode"配置为"md5"时,所使用的校验算法为CRC-32算法。 +**说明**:"summary_mode" 配置为 "md5" 时,所使用的校验算法为 CRC-32 算法。 ### 1.3 task 配置为 tensor @@ -86,16 +88,16 @@ ### 1.5 task 配置为 overflow_check -PyTorch 与 MindSpore 动态图场景下,"level"须为"L0"或"L1";MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O2"。 +PyTorch、MSAdapter 以及 MindSpore 动态图场景下,"level"须为"L0"或"L1";MindSpore 静态图场景下,"level"须为"L2",且模型编译优化等级(jit_level)须为"O2"。 | 参数 | 解释 | 是否必选 | | ------------- | ---------------------- | -------- | | overflow_nums | 最大溢出次数,int 类型,默认为 1,仅 PyTorch 与 MindSpore 动态图场景支持。表示第 N 次溢出后,不再进行溢出检测。过程中检测到溢出 API 对应的 输入输出 数据均 dump。
**配置示例**:"overflow_nums": 3。配置为 -1 时,表示持续检测溢出直到训练结束。 | 否 | -| check_mode | 溢出类型,str 类型,仅 MindSpore 场景支持,可选参数:
"aicore":开启 AI Core 的溢出检测,不支持 MindSpore v2.3.0 以上版本;
"atomic":开启 Atomic 的溢出检测,不支持 MindSpore v2.3.0 以上版本;
"all":开启算子的溢出检测,默认值。
**配置示例**:"check_mode": "all"。 | 否 | +| check_mode | 溢出类型,str 类型,仅 MindSpore v2.3.0 以下版本的静态图场景支持,可选参数:
"aicore":开启 AI Core 的溢出检测;
"atomic":开启 Atomic 的溢出检测;
"all":开启算子的溢出检测,默认值。
**配置示例**:"check_mode": "all"。 | 否 | ### 1.6 task 配置为 free_benchmark -仅 PyTorch 场景与 MindSpore 动态图场景支持,且"level"为"L1"。 +仅 PyTorch 与 MindSpore 动态图场景支持,且"level"为"L1"。 - task 配置为 free_benchmark 时,开启**无标杆比对**,在 NPU 环境下通过对当前模型 API 的输入添加扰动因子,二次执行,将得到的输出与未添加扰动因子前的输出进行比对,从而**得出该模型中可能存在因迁移等变化导致精度降低的 API**。 diff --git a/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md b/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md index db9a989c9d1c731fd9099d311f3ab3b95e5c7d5d..e45be7736b92c1a8b25711fd68b50e2cdec9d53e 100644 --- a/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/05.data_dump_PyTorch.md @@ -183,7 +183,7 @@ save(variable, name, save_backward=True) **参数说明**: | 参数名称 | 参数含义 | 支持数据类型 | 是否必选| | ---------- | ------------------| ------------------- | ------------------- | -| variable | 需要保存的变量 |dict, list, torch.tensor, int, float, str | 是 | +| variable | 需要保存的变量 |dict, list, tuple, torch.tensor, int, float, str | 是 | | name | 指定的名称 | str | 是 | | save_backward | 是否保存反向数据 | boolean | 否 | @@ -355,7 +355,7 @@ if __name__ == "__main__": ``` * `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。 * `dump_tensor_data`:保存采集到的张量数据。 -* `dump.json`: 保存API或Module前反向数据的统计量信息。包含dump数据的API名称或Module名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#1-dumpjson文件介绍pytorch)。 +* `dump.json`: 保存API或Module前反向数据的统计量信息。包含dump数据的API名称或Module名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#1-PyTorch场景下的dump.json文件)。 * `stack.json`:API/Module的调用栈信息。 * `construct.json`:分层分级结构,level为L1时,construct.json内容为空。 diff --git a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md index f7507facd2a92f3acbefdc92fa6cd808a155d6e3..158c5e3011e72a5b7feb1458ca6c2d79bc157606 100644 --- 
a/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/06.data_dump_MindSpore.md @@ -144,7 +144,7 @@ save(variable, name, save_backward=True) **参数说明**: | 参数名称 | 参数含义 | 支持数据类型 | 是否必选| | ---------- | ------------------| ------------------- | ------------------- | -| variable | 需要保存的变量 |dict, list, torch.tensor, int, float, str | 是 | +| variable | 需要保存的变量 |dict, list, tuple, torch.tensor, int, float, str | 是 | | name | 指定的名称 | str | 是 | | save_backward | 是否保存反向数据 | boolean | 否 | @@ -372,7 +372,7 @@ dump 结果目录结构示例如下: * `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。 * `dump_tensor_data`:保存采集到的张量数据。 -* `dump.json`: 保存API或Cell前反向数据的统计量信息。包含dump数据的API名称或Cell名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#2-dumpjson文件示例mindspore)。 +* `dump.json`: 保存API或Cell前反向数据的统计量信息。包含dump数据的API名称或Cell名称,各数据的dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置summary_mode="md5"时的CRC-32数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#2-MindSpore场景下的dump.json文件)。 * `stack.json`:API/Cell的调用栈信息。 * `construct.json`:分层分级结构,level为L1时,construct.json内容为空。 diff --git a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md index a5f83d8dfcbc7645691a8753c105fb7552522bf1..6f886215b0a389582bc3cc4c31943f76e6a414a3 100644 --- a/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/10.accuracy_compare_PyTorch.md @@ -257,11 +257,11 @@ PyTorch 精度比对是以 CPU 或 GPU 的计算结果为标杆,通过计算 统计量有 4 种:最大值(max)、最小值(min)、平均值(mean)和 L2-范数(L2 norm)。 -|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, 
mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)|Data_Name (NPU 真实数据名)| -|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| -|真实数据模式|√|√|√|√|√|√|||√||||√|√|√|√| -|统计数据模式|||||||√|√|√|||√||√|√|| -|MD5 模式||||||||||√|√|√|||√|| +|dump 数据模式|Cosine (tensor 余弦相似度)|EucDist (tensor 欧式距离)|MaxAbsErr (tensor 最大绝对误差)|MaxRelativeErr (tensor 最大相对误差)|One Thousandth Err Ratio (tensor 相对误差小于千分之一的比例)|Five Thousandth Err Ratio (tensor 相对误差小于千分之五的比例)|NPU 和 bench 的统计量绝对误差 (max, min, mean, L2 norm) diff| NPU 和 bench 的统计量相对误差 (max, min, mean, L2 norm) RelativeErr |NPU 和 bench 的统计量 (max, min, mean, L2 norm)|NPU MD5 (NPU 数据 CRC-32 值)|BENCH MD5 (bench 数据 CRC-32 值)|Result (比对结果)|Accuracy Reached or Not (计算精度是否达标)|Err_message (错误信息提示)|NPU_Stack_Info (堆栈信息)| Data_Name ([NPU真实数据名,Bench真实数据名]) | +|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---------------------------------:| +|真实数据模式|√|√|√|√|√|√|||√||||√|√|√| √ | +|统计数据模式|||||||√|√|√|||√||√|√| | +|MD5 模式||||||||||√|√|√|||√| | 上表中NPU_Stack_Info字段需要配置-s参数生成。 diff --git a/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md b/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md index 97b049000c6aca9a69aeca66e1a27a4260b3d142..983477554e138f3e547f2d3efcf14fdfc4a991a0 100644 --- a/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/12.overflow_check_PyTorch.md @@ -28,7 +28,7 @@ msprobe 工具在 PyTorch 场景下提供溢出数据采集功能和溢出数据 溢出数据采集功能在昇腾 NPU 上支持饱和模式(仅支持 Atlas 训练系列产品)和 INF/NAN 模式。 -INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不建议使用 INF/NAN 模式;Atlas A2 训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。 +INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 
的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不支持使用 INF/NAN 模式;Atlas A2 训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。 INF/NAN 模式的使能方式如下: diff --git a/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md b/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md index 33ff4a0259aef02d122022402966c65358e8efff..ef83aa17237d1cc56b8a67bf4b3ec9f57647fb9c 100644 --- a/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/13.overflow_check_MindSpore.md @@ -11,7 +11,7 @@ export INF_NAN_MODE_ENABLE=1 export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" ``` -**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不建议使用 INF/NAN 模式;Atlas A2训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。 +**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不支持使用 INF/NAN 模式;Atlas A2训练系列产品,默认为 INF/NAN 模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。 溢出检测任务的配置示例见[MindSpore 静态图场景下 task 配置为 overflow_check](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/03.config_examples.md#23-task-%E9%85%8D%E7%BD%AE%E4%B8%BA-overflow_check)、[MindSpore 动态图场景下 task 配置为 overflow_check](https://gitee.com/ascend/mstt/blob/master/debug/accuracy_tools/msprobe/docs/03.config_examples.md#33-task-%E9%85%8D%E7%BD%AE%E4%B8%BA-overflow_check)。 diff --git a/debug/accuracy_tools/msprobe/docs/19.monitor.md b/debug/accuracy_tools/msprobe/docs/19.monitor.md index 1c197ba5496378130d8d04b6f847ee2f35c3e946..4bb82af8c7a767fac99f4d72ea475617334ba315 100644 --- a/debug/accuracy_tools/msprobe/docs/19.monitor.md +++ b/debug/accuracy_tools/msprobe/docs/19.monitor.md @@ -487,7 +487,6 @@ 
actv, actv_grad = monitor.generate_xy_metrics() ``` - ## 详细配置 ```json diff --git a/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md b/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md index f994dc2301bcae6b23dc7a7503297aa4fe5b3724..bf5998bce0b4cd174b9713d9417d1afb674c2b56 100644 --- a/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md +++ b/debug/accuracy_tools/msprobe/docs/27.dump_json_instruction.md @@ -1,8 +1,8 @@ # dump.json文件说明及示例 -## 1. dump.json文件示例(PyTorch) +## 1. PyTorch 场景下的 dump.json 文件 -### 1.1 L0级别 +### 1.1 L0 级别 L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以PyTorch的Conv2d模块为例,网络中模块调用代码为: `output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)` @@ -168,7 +168,7 @@ dump.json文件中包含以下数据名称: } ``` -### 1.2 L1级别 +### 1.2 L1 级别 L1级别的dump.json文件包括API的前反向的输入输出。以PyTorch的relu函数为例,网络中API调用代码为: `output = torch.nn.functional.relu(input)` @@ -264,13 +264,13 @@ dump.json文件中包含以下数据名称: } ``` -### 1.3 mix级别 +### 1.3 mix 级别 mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。 -## 2. dump.json文件示例(MindSpore) +## 2. MindSpore 场景下的 dump.json 文件 -### 2.1 L0级别 +### 2.1 L0 级别 L0级别的dump.json文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。 以MindSpore的Conv2d模块为例,dump.json文件中使用的模块调用代码为: @@ -429,7 +429,7 @@ dump.json文件中包含以下数据名称: } ``` -### 2.2 L1级别 +### 2.2 L1 级别 L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的relu函数为例,网络中API调用代码为: `output = mindspore.ops.relu(input)` @@ -521,5 +521,275 @@ L1级别的dump.json文件包括API的前反向的输入输出,以MindSpore的 } ``` -### 2.3 mix级别 +### 2.3 mix 级别 + mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。 + +## 3. 
MSAdapter 场景下的 dump.json 文件 + +### 3.1 L0 级别 + +L0 级别的 dump.json 文件包括模块的前反向的输入输出,以及模块的参数和参数梯度。以 Conv2d 模块为例,网络中模块调用代码为: +`output = self.conv2(input) # self.conv2 = torch.nn.Conv2d(64, 128, 5, padding=2, bias=True)` + +dump.json文件中包含以下数据名称: + +- `Module.conv2.Conv2d.forward.0`:模块的前向数据,其中input_args为模块的输入数据(位置参数),input_kwargs为模块的输入数据(关键字参数),output为模块的输出数据,parameters为模块的参数数据,包括权重(weight)和偏置(bias)。 +- `Module.conv2.Conv2d.parameters_grad`:模块的参数梯度数据,包括权重(weight)和偏置(bias)的梯度。 +- `Module.conv2.Conv2d.backward.0`:模块的反向数据,其中input为模块反向的输入梯度(对应前向输出的梯度),output为模块的反向输出梯度(对应前向输入的梯度)。 + +**说明**:当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时,模块级数据的命名中包含该模块在列表中的索引index,命名格式为`{Module}.{index}.*`,*表示以上三种模块级数据的命名格式,例如:`Module.0.conv1.Conv2d.forward.0`。 + +```json +{ + "task": "tensor", + "level": "L0", + "framework": "mindtorch", + "dump_data_dir": "/dump/path", + "data": { + "Module.conv2.Conv2d.forward.0": { + "input_args": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 8, + 16, + 14, + 14 + ], + "Max": 1.638758659362793, + "Min": 0.0, + "Mean": 0.2544615864753723, + "Norm": 70.50277709960938, + "requires_grad": true, + "data_name": "Module.conv2.Conv2d.forward.0.input.0.npy" + } + ], + "input_kwargs": {}, + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 8, + 32, + 10, + 10 + ], + "Max": 1.6815717220306396, + "Min": -1.5120246410369873, + "Mean": -0.025344856083393097, + "Norm": 149.65576171875, + "requires_grad": true, + "data_name": "Module.conv2.Conv2d.forward.0.output.0.npy" + } + ], + "parameters": { + "weight": { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 5, + 5 + ], + "Max": 0.05992485210299492, + "Min": -0.05999220535159111, + "Mean": -0.0006165213999338448, + "Norm": 3.421217441558838, + "requires_grad": true, + "data_name": "Module.conv2.Conv2d.forward.0.parameters.weight.npy" + }, + "bias": { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32 
+ ], + "Max": 0.05744686722755432, + "Min": -0.04894155263900757, + "Mean": 0.006410328671336174, + "Norm": 0.17263513803482056, + "requires_grad": true, + "data_name": "Module.conv2.Conv2d.forward.0.parameters.bias.npy" + } + } + }, + "Module.conv2.Conv2d.parameters_grad": { + "weight": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 5, + 5 + ], + "Max": 0.018550323322415352, + "Min": -0.008627401664853096, + "Mean": 0.0006675920449197292, + "Norm": 0.26084786653518677, + "requires_grad": false, + "data_name": "Module.conv2.Conv2d.parameters_grad.weight.npy" + } + ], + "bias": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32 + ], + "Max": 0.014914230443537235, + "Min": -0.006656786892563105, + "Mean": 0.002657240955159068, + "Norm": 0.029451673850417137, + "requires_grad": false, + "data_name": "Module.conv2.Conv2d.parameters_grad.bias.npy" + } + ] + }, + "Module.conv2.Conv2d.backward.0": { + "input": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 8, + 32, + 10, + 10 + ], + "Max": 0.0015069986693561077, + "Min": -0.001139344065450132, + "Mean": 3.3215508210560074e-06, + "Norm": 0.020567523315548897, + "requires_grad": false, + "data_name": "Module.conv2.Conv2d.backward.0.input.0.npy" + } + ], + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 8, + 16, + 14, + 14 + ], + "Max": 0.0007466732058674097, + "Min": -0.00044813455315306783, + "Mean": 6.814070275140693e-06, + "Norm": 0.01474067009985447, + "requires_grad": false, + "data_name": "Module.conv2.Conv2d.backward.0.output.0.npy" + } + ] + } + } +} +``` + +### 3.2 L1 级别 +L1级别的dump.json文件包括API的前反向的输入输出。以 relu API 为例,网络中 API 调用代码为: +`output = torch.nn.functional.relu(input)` + +dump.json文件中包含以下数据名称: +- `Functional.relu.0.forward`:API的前向数据,其中input_args为API的输入数据(位置参数),input_kwargs为API的输入数据(关键字参数),output为API的输出数据。 +- 
`Functional.relu.0.backward`:API的反向数据,其中input为API的反向输入梯度(对应前向输出的梯度),output为API的反向输出梯度(对应前向输入的梯度)。 + +```json +{ + "task": "tensor", + "level": "L1", + "framework": "mindtorch", + "dump_data_dir":"/dump/path", + "data": { + "Functional.relu.0.forward": { + "input_args": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 28, + 28 + ], + "Max": 1.3864083290100098, + "Min": -1.3364859819412231, + "Mean": 0.03711778670549393, + "Norm": 236.20692443847656, + "requires_grad": true, + "data_name": "Functional.relu.0.forward.input.0.npy" + } + ], + "input_kwargs": {}, + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 28, + 28 + ], + "Max": 1.3864083290100098, + "Min": 0.0, + "Mean": 0.16849493980407715, + "Norm": 175.23345947265625, + "requires_grad": true, + "data_name": "Functional.relu.0.forward.output.0.npy" + } + ] + }, + "Functional.relu.0.backward": { + "input": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 28, + 28 + ], + "Max": 0.0001815402356442064, + "Min": -0.00013352684618439525, + "Mean": 0.00011915402356442064, + "Norm": 0.007598237134516239, + "requires_grad": false, + "data_name": "Functional.relu.0.backward.input.0.npy" + } + ], + "output": [ + { + "type": "mindspore.Tensor", + "dtype": "Float32", + "shape": [ + 32, + 16, + 28, + 28 + ], + "Max": 0.0001815402356442064, + "Min": -0.00012117840378778055, + "Mean": 2.0098118724831693e-08, + "Norm": 0.006532244384288788, + "requires_grad": false, + "data_name": "Functional.relu.0.backward.output.0.npy" + } + ] + } + } +} +``` + +### 3.3 mix 级别 + +mix级别的dump.json文件同时包括L0和L1级别的dump数据,文件格式与上述示例相同。 \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md b/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md index 6b8cc558aa22526158033cfb35f31203d8b04278..4988586c0568b391739f7c14f1a9452461f1a6f1 100644 --- 
a/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/28.kernel_dump_MindSpore.md @@ -1,4 +1,4 @@ -# MindSpore 场景的 kernel dump 说明 +# MindSpore 动态图场景的 kernel dump 说明 当使用 msprobe 数据采集功能时,level 配置为 "L2" 表示采集 kernel 层级的算子数据,仅支持昇腾 NPU 平台。 diff --git a/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md b/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md new file mode 100644 index 0000000000000000000000000000000000000000..cefcabafbcbbdbb33a3d9d63c17a30396c9e4c52 --- /dev/null +++ b/debug/accuracy_tools/msprobe/docs/29.data_dump_MSAdapter.md @@ -0,0 +1,229 @@ +# MSAdapter 场景的精度数据采集 + +MSAdapter 是一款 MindSpore 生态适配工具,可以将 PyTorch 训练脚本高效迁移至 MindSpore 框架执行,以实现在不改变原有 PyTorch 用户开发习惯的情况下,使得 PyTorch 代码能在昇腾上获得高效性能。 + +msprobe 工具主要通过在训练脚本内添加 dump 接口、启动训练的方式采集精度数据。 + +本工具提供固定的 API 支持列表,若需要删除或增加 dump 的 API,可以在 msprobe/pytorch/hook_module/support_wrap_ops.yaml 文件内手动修改,如下示例: + +```yaml +functional: # functional为算子类别,找到对应的类别,在该类别下按照下列格式删除或添加API + - conv1d + - conv2d + - conv3d +``` + +删除 API 的场景:部分模型代码逻辑会存在 API 原生类型校验,工具执行dump操作时,对封装后的模型 API 可能与模型的原生 API 类型不一致,此时可能引发校验失败,详见《[FAQ](FAQ.md)》中“异常情况”的第10和11条。 + +## 1. 工具安装 + +请参见[《msprobe 工具安装指南》](./01.installation.md)。 + +## 2 接口介绍 + +### 2.1 msprobe.mindspore.PrecisionDebugger + +**功能说明**:通过加载 dump 配置文件的方式来确定 dump 操作的详细配置。 + +**原型**: + +```Python +PrecisionDebugger(config_path=None, task=None, dump_path=None, level=None, step=None) +``` + +**参数说明**: + +1. config_path:指定 dump 配置文件路径,string 类型。参数示例:"./config.json"。未配置该路径时,默认使用 [config.json](../config.json) 文件的默认配置,配置选项含义可见 [config.json 介绍](./02.config_introduction.md)。 + +2. 其他参数与 [config.json](../config.json) 文件中的同名配置字段含义相同,具体可见 [config.json 介绍](./02.config_introduction.md)。当参数值非None时,优先级高于 [config.json](../config.json) 文件中的同名配置。 + +#### 2.1.1 start + +**功能说明**:启动精度数据采集。需要与 [**stop**](#212-stop) 接口一起添加在训练迭代的 for 循环内。 + +**原型**: + +```Python +start(model=None) +``` + +**参数说明**: + +1. 
model:指定需要采集 Module 级数据的模型,支持传入 torch.nn.Module、list[torch.nn.Module]或Tuple[torch.nn.Module] 类型,默认未配置。level 配置为 "L0" 或 "mix" 时,必须在该接口中配置该参数。API级别("L1" level)dump 时,传入 model 可以采集 model 内包含 primitive op 对象在内的所有 API 数据,若不传入 model 参数,则只采集非 primitive op 的 API 数据。 + +#### 2.1.2 stop + +**功能说明**:停止精度数据采集。在 **start** 接口调用之后的任意位置添加。若 **stop** 接口添加在反向计算代码之后,则会采集 **start** 和该接口之间的前反向数据。 +若 **stop** 接口添加在反向计算代码之前,则需要将 [**step**](#213-step) 接口添加到反向计算代码之后,才能采集 **start** 和该接口之间的前反向数据。 + +**注意**:**stop** 接口必须调用,否则可能导致精度数据落盘不全。 + +**原型**: + +```Python +stop() +``` + +#### 2.1.3 step + +**功能说明**:进行训练 step 数的自增,完成当前 step 所有数据的落盘并更新 dump 参数。在一个 step 训练结束的位置添加,且必须在 **stop** 接口之后的位置调用。该接口需要配合 **start** 和 **stop** 函数使用,尽量添加在反向计算代码之后,否则可能会导致反向数据丢失。 + +**原型**: + +```Python +step() +``` + +#### 2.1.4 forward_backward_dump_end + +**功能说明**:停止精度数据采集。与 **stop** 接口功能相同,该函数在将来会被移除,建议使用 **stop** 接口。 + +**原型**: + +```Python +forward_backward_dump_end() +``` + +#### 2.1.5 save + +**功能说明**:单点保存网络执行过程中正反向数值,并以统计值/张量文件落盘。 + +**原型**: +```python +save(variable, name, save_backward=True) +``` + +**参数说明**: +| 参数名称 | 参数含义 | 支持数据类型 | 是否必选| +| ---------- | ------------------| ------------------- | ------------------- | +| variable | 需要保存的变量 |dict, list, tuple, torch.tensor, int, float, str | 是 | +| name | 指定的名称 | str | 是 | +| save_backward | 是否保存反向数据 | boolean | 否 | + +### 2.2 msprobe.mindspore.seed_all + +**功能说明**:用于固定网络中的随机性和开启确定性计算。 + +**原型**: +```python +seed_all(seed=1234, mode=False, rm_dropout=True) +``` + +**参数说明**: + +1. seed: 随机性种子,默认值:1234,非必选。参数示例: seed=1000。该参数用于 random、numpy.random, mindspore.common.Initializer、mindspore.nn.probability.distribution的随机数生成以及 Python 中 str、bytes、datetime 对象的 hash 算法。 + +2. mode:确定性计算使能,可配置 True 或 False,默认值:False,非必选。参数示例:mode=True。该参数设置为 True 后,将会开启算子确定性运行模式与归约类通信算子(AllReduce、ReduceScatter、Reduce)的确定性计算。注意:确定性计算会导致 API 执行性能降低,建议在发现模型多次执行结果不同的情况下开启。 + +3. 
rm_dropout:控制 dropout 失效的开关。可配置 True 或 False,默认值:True,非必选。参数示例:rm_dropout=True。该参数设置为 True 后,将会使 mindspore.ops.Dropout,mindspore.ops.Dropout2D,mindspore.ops.Dropout3D,mindspore.mint.nn.Dropout和mindspore.mint.nn.functional.dropout 失效,以避免因随机 dropout 造成的网络随机性。建议在采集数据前调用。 + +**注意**:通过 rm_dropout 控制 dropout 失效或生效需要在初始化 Dropout 实例前调用才能生效。 + +## 3 示例代码 + +以下为添加了 msprobe 工具 dump 接口的示例训练脚本。 + +```python +import mindspore as ms +import torch +import torch.nn as nn +import torch.nn.functional as F + +# 导入工具的数据采集接口 +from msprobe.pytorch import PrecisionDebugger + +# 在模型训练开始前实例化PrecisionDebugger +debugger = PrecisionDebugger(config_path='./config.json') + + +# 定义网络 +class Net(nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear1 = nn.Linear(in_features=8, out_features=4) + self.linear2 = nn.Linear(in_features=4, out_features=2) + + def forward(self, x): + x1 = self.linear1(x) + x2 = self.linear2(x1) + logits = F.relu(x2) + return logits + + +net = Net() + + +def train_step(inputs): + return net(inputs) + + +if __name__ == "__main__": + data = (torch.randn(10, 8), torch.randn(10, 8), torch.randn(10, 8)) + grad_fn = ms.value_and_grad(train_step, grad_position=0) + + for inputs in data: + # 开启数据 dump + debugger.start(model=net) + + out, grad = grad_fn(inputs) + + # 停止数据 dump + debugger.stop() + # 更新 step 信息 + debugger.step() +``` + +## 4 dump 结果文件介绍 + +训练结束后,工具将 dump 的数据保存在 dump_path 参数指定的目录下。目录结构示例如下: + +```lua +├── dump_path +│ ├── step0 +│ | ├── rank0 +│ | │ ├── dump_tensor_data +| | | | ├── Tensor.permute.1.forward.npy +| | | | ├── Functional.linear.5.backward.output.npy # 命名格式为{api_type}.{api_name}.{API调用次数}.{forward/backward}.{input/output}.{参数序号}, 其中,“参数序号”表示该API的第n个输入或输出,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该API的第1个参数的第1个元素。 +| | | | ... 
+| | | | ├── Module.conv1.Conv2d.forward.0.input.0.npy # 命名格式为{Module}.{module_name}.{class_name}.{forward/backward}.{调用次数}.{input/output}.{参数序号}, 其中,“参数序号”表示该Module的第n个参数,例如1,则为第一个参数,若该参数为list格式,则根据list继续排序,例如1.1,表示该Module的第1个参数的第1个元素。 +| | | | ├── Module.conv1.Conv2D.forward.0.parameters.bias.npy # 模块参数数据:命名格式为{Module}.{module_name}.{class_name}.forward.{调用次数}.parameters.{parameter_name}。 +| | | | └── Module.conv1.Conv2D.parameters_grad.weight.npy # 模块参数梯度数据:命名格式为{Module}.{module_name}.{class_name}.parameters_grad.{parameter_name}。因为同一模块的参数使用同一梯度进行更新,所以参数梯度文件名不包含调用次数。 +| | | | # 当dump时传入的model参数为List[torch.nn.Module]或Tuple[torch.nn.Module]时,模块级数据的命名中包含该模块在列表中的索引index,命名格式为{Module}.{index}.*,*表示以上三种模块级数据的命名格式,例如:Module.0.conv1.Conv2d.forward.0.input.0.npy。 +│ | | ├── dump.json +│ | | ├── stack.json +│ | | └── construct.json +│ | ├── rank1 +| | | ├── dump_tensor_data +| | | | └── ... +│ | | ├── dump.json +│ | | ├── stack.json +| | | └── construct.json +│ | ├── ... +│ | | +| | └── rank7 +│ ├── step1 +│ | ├── ... 
+│   ├── step2
+```
+* `rank`:设备 ID,每张卡的数据保存在对应的 `rank{ID}` 目录下。非分布式场景下没有 rank ID,目录名称为 rank。
+* `dump_tensor_data`:保存采集到的张量数据。
+* `dump.json`: 保存 API 或 Module 前反向数据的统计量信息。包含 dump 数据的 API 名称或 Module 名称,各数据的 dtype、 shape、max、min、mean、L2norm(L2范数,平方根)统计信息以及当配置 summary_mode="md5" 时的 CRC-32 数据。具体介绍可参考[dump.json文件说明](./27.dump_json_instruction.md#3-MSAdapter场景下的dump.json文件)。
+* `stack.json`:API/Module 的调用栈信息。
+* `construct.json`:分层分级结构,level 为 L1 时,construct.json 内容为空。
+
+
+当 task 为 tensor 时,dump 过程中,npy 文件在对应算子或者模块被执行后就会落盘,而 json 文件则需要在正常执行 PrecisionDebugger.stop() 后才会写入完整数据。因此如果程序异常终止,终止前被执行算子的相关 npy 文件得以保存,但 json 文件中的数据可能丢失。
+
+其中 rank 为设备上各卡的 ID,每张卡上 dump 的数据会生成对应 dump 目录。非分布式场景下没有 rank ID,目录名称为 rank。
+
+npy 文件名的前缀含义如下:
+
+| 前缀 | 含义 |
+| ----------- | ---------------------------- |
+| Tensor | torch.Tensor API数据 |
+| Torch | torch API数据 |
+| Functional | torch.nn.functional API数据 |
+| NPU | NPU 亲和API数据 |
+| Distributed | torch.distributed API数据 |
+| Jit | 被 "jit" 装饰的模块或函数数据 |
+| Module | torch.nn.Module 类(模块)数据 |
\ No newline at end of file
diff --git a/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md b/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md
new file mode 100644
index 0000000000000000000000000000000000000000..01d64c808d40a1e5c4ea2190c028a7c389ffbdc4
--- /dev/null
+++ b/debug/accuracy_tools/msprobe/docs/30.overflow_check_MSAdapter.md
@@ -0,0 +1,31 @@
+# MSAdapter 场景的溢出检测
+
+msprobe 工具提供 MSAdapter 场景下的溢出检测功能。其检测对象为 **API** 级别(除 Primitive 和 Jit 类 API)或**模块**级别,分别对应 config.json 配置中的 **"L1"** 、**"L0"** level。
+
+需要注意,本工具仅支持在 INF/NAN 模式a下进行溢出检测。INF/NAN 模式的使能方式如下:
+
+```Shell
+# 使能 CANN 侧 INF/NAN 模式
+export INF_NAN_MODE_ENABLE=1
+# 使能 MindSpore 框架侧 INF/NAN 模式
+export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"
+```
+
+**a**:在处理浮点数计算溢出问题时,NPU 当前支持两种溢出模式:INF/NAN 模式与饱和模式。INF/NAN 模式遵循 IEEE 754 标准,根据定义输出 INF/NAN 的计算结果。与之对应的饱和模式在计算出现溢出时,饱和为浮点数极值(+-MAX)。对于 CANN 侧配置,Atlas 训练系列产品,默认为饱和模式,且不支持使用 INF/NAN 
模式,且不建议使用饱和模式。对于 MindSpore 框架侧配置,仅支持对 Atlas A2 训练系列产品进行设置,默认为 INF/NAN 模式。CANN 侧 与 MindSpore 框架侧配置须一致。 + +溢出检测任务的配置示例见["**MindSpore 动态图场景 task 配置为 overflow_check**"](./03.config_examples.md#33-task配置为overflow_check)小节。 + + +## 1 接口介绍 + +溢出检测功能提供的接口与数据采集任务一致,详见 MSAdapter 场景的精度数据采集中的["**2 接口介绍**"](./29.data_dump_MSAdapter.md#2-接口介绍)小节。 + +需要注意,目前暂不支持 "L1" level 下 primitive op 的溢出检测。 + +## 2 示例代码 + +溢出检测功能使用方式与数据采集任务一致,详见 MSAdapter 场景的精度数据采集中的["**3 示例代码**"](./29.data_dump_MSAdapter.md#3-示例代码)小节。 + +## 3 溢出检测结果文件介绍 + +溢出检测结果文件目录结构与含义与数据采集任务一致,但仅保存溢出 API 或 模块 的真实数据或统计信息。详见 MSAdapter 场景的精度数据采集中的["**4 dump 结果文件介绍**"](./29.data_dump_MSAdapter.md#4-dump-结果文件介绍)小节。 \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/docs/img/compare_result.png b/debug/accuracy_tools/msprobe/docs/img/compare_result.png index 07cdb51707fe43d07723ed976275d99f55b50571..b6d7ec6dfcbc44b4b7056e1297a481f495ceb86e 100644 Binary files a/debug/accuracy_tools/msprobe/docs/img/compare_result.png and b/debug/accuracy_tools/msprobe/docs/img/compare_result.png differ diff --git a/debug/accuracy_tools/msprobe/mindspore/common/utils.py b/debug/accuracy_tools/msprobe/mindspore/common/utils.py index ded3faaa22b565ef35c17a7596782976ddf9125d..dc9da3449099ce90aa2d867a7c5cb6073c0990f6 100644 --- a/debug/accuracy_tools/msprobe/mindspore/common/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/common/utils.py @@ -182,9 +182,9 @@ def set_register_backward_hook_functions(): def check_save_param(variable, name, save_backward): # try catch this api to skip invalid call - if not isinstance(variable, (list, dict, ms.Tensor, int, float, str)): + if not isinstance(variable, (list, dict, tuple, ms.Tensor, int, float, str)): logger.warning("PrecisionDebugger.save variable type not valid, " - "should be one of list, dict, ms.Tensor, int, float or string. " + "should be one of list, dict, tuple, ms.Tensor, int, float or string. 
" "Skip current save process.") raise ValueError if not isinstance(name, str): @@ -196,4 +196,4 @@ def check_save_param(variable, name, save_backward): logger.warning("PrecisionDebugger.save_backward name not valid, " "should be bool. " "Skip current save process.") - raise ValueError \ No newline at end of file + raise ValueError diff --git a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py index de507e876653d4c4f82e1f96a00c214e2bc6f5f6..e0915f8179b69120306730c66b8ae3f12d0ccffa 100644 --- a/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py +++ b/debug/accuracy_tools/msprobe/mindspore/compare/ms_compare.py @@ -22,10 +22,10 @@ import pandas as pd from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import FileOpen, create_directory, load_json, load_npy, load_yaml +from msprobe.core.common.file_utils import create_directory, load_json, load_npy, load_yaml from msprobe.core.common.log import logger from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, \ - check_op_str_pattern_valid, get_dump_mode, set_dump_path + check_op_str_pattern_valid, get_dump_mode, set_dump_path, detect_framework_by_dump_json from msprobe.core.compare.acc_compare import Comparator, ModeConfig from msprobe.core.compare.check import dtype_mapping from msprobe.core.compare.layer_mapping import generate_data_mapping_by_layer_mapping @@ -125,8 +125,7 @@ class MSComparator(Comparator): result_df.loc[warning_flag, CompareConst.RESULT] = CompareConst.WARNING result_df.loc[warning_flag, CompareConst.ERROR_MESSAGE] = 'Need double check api accuracy.' 
else: - fill_cols = [CompareConst.COSINE, CompareConst.EUC_DIST, - CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, + fill_cols = [CompareConst.COSINE, CompareConst.MAX_ABS_ERR, CompareConst.MAX_RELATIVE_ERR, CompareConst.ONE_THOUSANDTH_ERR_RATIO, CompareConst.FIVE_THOUSANDTHS_ERR_RATIO, CompareConst.ERROR_MESSAGE] result_df.loc[~condition_no_bench, fill_cols] = '' @@ -383,12 +382,11 @@ class MSComparator(Comparator): def check_cross_framework(bench_json_path): - pattern = r'"data_name":\s*"[^"]+\.pt"' - with FileOpen(bench_json_path, 'r') as file: - for line in file: - if re.search(pattern, line): - return True - return False + framework = detect_framework_by_dump_json(bench_json_path) + if framework == Const.PT_FRAMEWORK: + return True + else: + return False def ms_compare(input_param, output_path, **kwargs): diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py b/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py index 33fd58c7278c6245140e50a984f44e59b90c69de..e8a4739445e83c62e34e16829c5ea94c8ef5177c 100644 --- a/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py +++ b/debug/accuracy_tools/msprobe/mindspore/monitor/distributed/wrap_distributed.py @@ -281,7 +281,7 @@ def create_hooks(context, monitor): global RANK pre_hooks = [] hooks = [] - RANK = str(get_rank()) + RANK = get_rank() if communication.GlobalComm.INITED and RANK not in monitor.module_rank_list and monitor.module_rank_list != []: return [pre_hooks, hooks] diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py b/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py index 506ad6c3f91c7c73e5e12109a6ea617309df72c0..c85e66a65ba26fdbc1d10a8e55c8273236409b36 100644 --- a/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py +++ b/debug/accuracy_tools/msprobe/mindspore/monitor/utils.py @@ -98,8 +98,8 @@ def validate_ranks(ranks): if not isinstance(ranks, list): raise 
TypeError("module_ranks should be a list") for rank in ranks: - if not isinstance(rank, str): - raise TypeError(f"element in module_ranks should be a str, get {type(rank)}") + if not isinstance(rank, int): + raise TypeError(f"element in module_ranks should be a int, get {type(rank)}") def validate_targets(targets): diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py index 9d89b2de32f70c6fa7abf38add49b58a13531d7a..ec2a4b7165f25f8f9a60ea953ee71cdac0f24a03 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/data_generate.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/debug/accuracy_tools/msprobe/pytorch/common/utils.py b/debug/accuracy_tools/msprobe/pytorch/common/utils.py index 16067f6d2bee70645bcc337d1809a14f41ae5b96..7a3735a5292885a6e686863088e1186a3ab464ad 100644 --- a/debug/accuracy_tools/msprobe/pytorch/common/utils.py +++ b/debug/accuracy_tools/msprobe/pytorch/common/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,8 +25,8 @@ import numpy as np import torch import torch.distributed as dist from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.core.common.file_utils import (FileCheckConst, change_mode, - check_file_or_directory_path, check_path_before_create, FileOpen) +from msprobe.core.common.file_utils import FileCheckConst, change_mode, check_file_or_directory_path, \ + check_path_before_create, FileOpen from msprobe.core.common.log import logger from msprobe.core.common.utils import check_seed_all from packaging import version @@ -449,9 +449,9 @@ def is_recomputation(): def check_save_param(variable, name, save_backward): # try catch this api to skip invalid call - if not isinstance(variable, (list, dict, torch.Tensor, int, float, str)): + if not isinstance(variable, (list, dict, tuple, torch.Tensor, int, float, str)): logger.warning("PrecisionDebugger.save variable type not valid, " - "should be one of list, dict, torch.Tensor, int, float or string. " + "should be one of list, dict, tuple, torch.Tensor, int, float or string. 
" "Skip current save process.") raise ValueError if not isinstance(name, str): @@ -473,3 +473,28 @@ def replace_last_occurrence(text, old, new): if index != -1: return text[:index] + text[index:].replace(old, new, 1) return text + + +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py index de62af421b5a37e39140a9836fb16853443740d7..08e2f897a9e9ecacad1c0cc2353ebe123a59b2a7 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/distributed_compare.py @@ -15,14 +15,10 @@ import os -from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import create_directory -from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ - set_dump_path -from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json, set_stack_json_path +from 
msprobe.core.common.utils import CompareException +from msprobe.core.compare.utils import check_and_return_dir_contents, extract_json from msprobe.pytorch.common.log import logger -from msprobe.pytorch.compare.pt_compare import PTComparator, compare +from msprobe.pytorch.compare.pt_compare import compare def compare_distributed(npu_dump_dir, bench_dump_dir, output_path, **kwargs): diff --git a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py index 308a82b3d6e9beb67a669ea05b83d7b8a6eddc90..7c1670dac7133dd3f28c35c7107b3ffea6ed6b38 100644 --- a/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py +++ b/debug/accuracy_tools/msprobe/pytorch/compare/pt_compare.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2024, Huawei Technologies Co., Ltd. +# Copyright (c) 2024-2025, Huawei Technologies Co., Ltd. # All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import os.path +import os import torch -from msprobe.core.common.const import FileCheckConst from msprobe.core.common.exceptions import FileCheckException -from msprobe.core.common.file_utils import FileChecker, create_directory, load_yaml +from msprobe.core.common.file_utils import create_directory, load_yaml, FileChecker, FileCheckConst from msprobe.core.common.utils import CompareException, check_compare_param, check_configuration_param, get_dump_mode, \ set_dump_path from msprobe.core.compare.acc_compare import Comparator, ModeConfig @@ -55,28 +53,30 @@ class PTComparator(Comparator): mapping_dict = {} return mapping_dict - def read_npy_data(self, dir_path, file_name): - if not file_name: - return None - data_path = os.path.join(dir_path, file_name) - path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, - FileCheckConst.PT_SUFFIX, False) - data_path = path_checker.common_check() - try: - # detach because numpy can not process gradient information - data_value = load_pt(data_path, to_cpu=True).detach() - except RuntimeError as e: - # 这里捕获 load_pt 中抛出的异常 - logger.error(f"Failed to load the .pt file at {data_path}.") - raise CompareException(CompareException.INVALID_FILE_ERROR) from e - except AttributeError as e: - # 这里捕获 detach 方法抛出的异常 - logger.error(f"Failed to detach the loaded tensor.") - raise CompareException(CompareException.DETACH_ERROR) from e - if data_value.dtype == torch.bfloat16: - data_value = data_value.to(torch.float32) - data_value = data_value.numpy() - return data_value + +def read_pt_data(dir_path, file_name): + if not file_name: + return None + + data_path = os.path.join(dir_path, file_name) + path_checker = FileChecker(data_path, FileCheckConst.FILE, FileCheckConst.READ_ABLE, + FileCheckConst.PT_SUFFIX, False) + data_path = path_checker.common_check() + try: + # detach because numpy can not process gradient information + data_value = load_pt(data_path, to_cpu=True).detach() + except RuntimeError as e: + # 
这里捕获 load_pt 中抛出的异常 + logger.error(f"Failed to load the .pt file at {data_path}.") + raise CompareException(CompareException.INVALID_FILE_ERROR) from e + except AttributeError as e: + # 这里捕获 detach 方法抛出的异常 + logger.error(f"Failed to detach the loaded tensor.") + raise CompareException(CompareException.DETACH_ERROR) from e + if data_value.dtype == torch.bfloat16: + data_value = data_value.to(torch.float32) + data_value = data_value.numpy() + return data_value def compare(input_param, output_path, **kwargs): diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py index 0c9efaab999e71d896eaf64d837978bd26f214ad..ad0fba463966b86c2d4ab3f5be5f4d95100a7df3 100644 --- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py +++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py @@ -557,9 +557,9 @@ class TrainerMon: def write_mv_tb(self, opt_context): if not self.mv_distribution: return - self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric, + self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_metric, opt_context.step, MonitorConst.EXP_AVG) - self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric, + self.summary_writer.write_metrics(self.ops, opt_context.exp_avg_sq_metric, opt_context.step, MonitorConst.EXP_AVG_SQ) def write_grad_tb(self, step): @@ -1051,7 +1051,7 @@ class TrainerMon: self.enable_megatron = True logger.info("megatron version is > core_r0.8.0 <= core_r0.9.0") except ImportError: - self.enable_megatron = False | self.enable_megatron + self.enable_megatron = False if not self.enable_megatron: self._hook_weights() diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json new file mode 100644 index 0000000000000000000000000000000000000000..63a062d8ffa264a0254fc2bab0208dcf951ae094 --- /dev/null 
+++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/dump_no_pt_no_ms.json @@ -0,0 +1,3 @@ +{ + "task": "tensor" +} \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json new file mode 100644 index 0000000000000000000000000000000000000000..b223c74b2315af1b9454e5f1e70c29502d449c56 --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/ms_dump_no_framework.json @@ -0,0 +1,4 @@ +{ + "task": "tensor", + "type": "mindspore.float16" +} \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json new file mode 100644 index 0000000000000000000000000000000000000000..2444ae1fd4096b083a9e8a0e51c9166bb990f51f --- /dev/null +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_dump_file/pt_dump_no_framework.json @@ -0,0 +1,4 @@ +{ + "task": "tensor", + "type": "torch.float16" +} \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py index 9ed13f78aed57fd4d8153e2f005ea14d4fb33643..ac3a859bf4b2da478e92650cfe3267cf90c23146 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_file_utils.py @@ -1,7 +1,5 @@ from unittest.mock import patch, mock_open, MagicMock -import numpy as np -import pandas as pd import pytest from msprobe.core.common.file_utils import * @@ -533,4 +531,4 @@ class TestDirectoryChecks: # Test file path check_file_or_directory_path(self.test_file, isdir=False) # Test directory path - check_file_or_directory_path(self.test_dir, isdir=True) \ No newline at end of file + 
check_file_or_directory_path(self.test_dir, isdir=True) diff --git a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py index 3472ca9018e189ffb48e4d26cfeb79e1ba1ff16d..61766ed27c0a58f4fff81fb2f45618de60bb5b48 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/common/test_utils.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -# Copyright (C) 2024-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,11 +18,13 @@ import json import os import tempfile from datetime import datetime, timezone +import unittest from unittest import TestCase from unittest.mock import MagicMock, mock_open, patch import OpenSSL import numpy as np +from pathlib import Path from msprobe.core.common.const import Const from msprobe.core.common.file_utils import ( @@ -53,7 +55,8 @@ from msprobe.core.common.utils import (CompareException, recursion_depth_decorator, MsprobeBaseException, check_str_param, - is_json_file) + is_json_file, + detect_framework_by_dump_json) class TestUtils(TestCase): @@ -488,3 +491,42 @@ class TestCheckCrtValid(TestCase): with self.assertRaises(RuntimeError) as context: check_crt_valid(self.cert_file_path) self.assertIn('The SSL certificate is invalid', str(context.exception)) + + +class TestDetectFrameworkByDumpJson(unittest.TestCase): + + @patch('msprobe.core.common.utils.load_json') + def test_valid_pytorch_framework(self, mock_load_json): + mock_load_json.return_value = {"framework": Const.PT_FRAMEWORK} + + result = detect_framework_by_dump_json("dummy_path") + + self.assertEqual(result, Const.PT_FRAMEWORK) + + 
@patch('msprobe.core.common.utils.load_json') + def test_valid_mindspore_framework(self, mock_load_json): + mock_load_json.return_value = {"framework": Const.MS_FRAMEWORK} + + result = detect_framework_by_dump_json("dummy_path") + + self.assertEqual(result, Const.MS_FRAMEWORK) + + def test_detect_framework_in_file(self): + self.current_dir = Path(__file__).parent + file_path = self.current_dir / "test_dump_file/pt_dump_no_framework.json" + result = detect_framework_by_dump_json(file_path) + self.assertEqual(result, Const.PT_FRAMEWORK) + + self.current_dir = Path(__file__).parent + file_path = self.current_dir / "test_dump_file/ms_dump_no_framework.json" + result = detect_framework_by_dump_json(file_path) + self.assertEqual(result, Const.MS_FRAMEWORK) + + @patch("msprobe.core.common.utils.logger") + def test_detect_framework_exception(self, mock_logger): + self.current_dir = Path(__file__).parent + file_path = self.current_dir / "test_dump_file/dump_no_pt_no_ms.json" + with self.assertRaises(CompareException) as context: + result = detect_framework_by_dump_json(file_path) + self.assertEqual(context.exception.code, CompareException.INVALID_PARAM_ERROR) + mock_logger.error.assert_called_once_with(f"{file_path} must be based on the MindSpore or PyTorch framework.") diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py index c882e331f5513ddbd3cbb5baf4c1292079680f4f..1b2f6bb2fde28ebc46a5da09bb22cd89d875edd7 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare.py @@ -11,7 +11,7 @@ import torch from msprobe.core.common.const import CompareConst, Const from msprobe.core.common.utils import CompareException -from msprobe.core.compare.acc_compare import Comparator, ModeConfig, get_bench_data_name +from msprobe.core.compare.acc_compare import Comparator, ModeConfig from 
msprobe.core.compare.highlight import find_error_rows, find_compare_result_error_rows, ApiBatch from msprobe.core.compare.utils import get_accuracy from msprobe.pytorch.compare.pt_compare import PTComparator @@ -636,11 +636,11 @@ class TestUtilsMethods(unittest.TestCase): def test_do_multi_process(self): data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], - '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', '-1']] + '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, 'Yes', '', ['-1', '-1']]] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', '-1']] + 1, 1, 1, 1, 1, 1, 1, 1, 'None', 'No bench data matched.', ['-1', '-1']]] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) @@ -670,7 +670,7 @@ class TestUtilsMethods(unittest.TestCase): mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) pt_comparator = PTComparator(mode_config) - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'No bench data matched.']) @@ -688,43 +688,23 @@ class TestUtilsMethods(unittest.TestCase): pt_comparator = PTComparator(mode_config) pt_name = '-1' - pt_path = os.path.join(base_dir, pt_name) - op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]} + op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]} input_param = 
{'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, - {'Functional.linear.0.forward': {'input_args': [ - {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', f'Dump file: {pt_path} not found.']) + 'unsupported', 'No bench data matched.']) pt_name = 'Functional.linear.0.forward.input.0.pt' - pt_path = os.path.join(base_dir, pt_name) - op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_path, pt_path]} + op_name_mapping_dict = {'Functional.linear.0.forward.input.0': [pt_name, pt_name]} input_param = {'npu_dump_data_dir': base_dir, 'bench_dump_data_dir': base_dir} - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, {}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, ['unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', - 'unsupported', 'Bench does not have data file.']) + 'unsupported', 'Dump file: Functional.linear.0.forward.input.0.pt not found.']) generate_pt(base_dir) - result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param, - {'Functional.linear.0.forward': {'input_args': [ - {'data_name': 'Functional.linear.0.forward.input.0.pt'}]}}) + result = pt_comparator.compare_by_op(npu_op_name, bench_op_name, op_name_mapping_dict, input_param) self.assertEqual(result, [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, '']) - def test_get_bench_data_name_input(self): - bench_op_name = "Functional.linear.0.forward.input.0" - bench_data = {"Functional.linear.0.forward": {"input_args": [{"data_name": "Functional.linear.0.forward.input.0.pt"}], 
"input_kwargs": {}, "output": []}} - result = get_bench_data_name(bench_op_name, bench_data) - - self.assertEqual(result, "Functional.linear.0.forward.input.0.pt") - - def test_get_bench_data_name_output(self): - bench_op_name = "Functional.linear.0.forward.output.0" - bench_data = {"Functional.linear.0.forward": {"input_args": [], "input_kwargs": {}, "output": [{"data_name": "Functional.linear.0.forward.output.0.pt"}]}} - result = get_bench_data_name(bench_op_name, bench_data) - - self.assertEqual(result, "Functional.linear.0.forward.output.0.pt") - class TestComparator(unittest.TestCase): def setUp(self): diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py index 2e9a46572662489e861f98f03f25e9e480031bcf..5327237066cd70e13c86a34d0c13f694637a3da9 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_acc_compare_utils.py @@ -4,17 +4,19 @@ import json import os import shutil import unittest -from unittest.mock import patch +from unittest.mock import patch, MagicMock import zlib +import torch import numpy as np -from msprobe.core.common.const import CompareConst, Const +from msprobe.core.common.const import CompareConst, Const, FileCheckConst from msprobe.core.common.utils import CompareException from msprobe.core.compare.utils import ApiItemInfo, _compare_parser, check_and_return_dir_contents, extract_json, \ count_struct, get_accuracy, append_stack_info, get_rela_diff_summary_mode, get_un_match_accuracy, merge_tensor, \ op_item_parse, read_op, rename_api, resolve_api_special_parameters, result_item_init, stack_column_process, \ - table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item + table_value_is_valid, get_name_and_state, reorder_op_name_list, reorder_op_x_list, gen_op_item, read_pt_data, \ + read_npy_data # 
test_read_op_1 op_data = { @@ -224,31 +226,31 @@ o_result_unmatch_3 = [ ['Functional.conv2d.0.forward.input.0', 'N/A', 'torch.float32', 'N/A', [1, 1, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 3.029174327850342, -2.926689624786377, -0.06619918346405029, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.input.1', 'N/A', 'torch.float32', 'N/A', [16, 1, 5, 5], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 0.19919930398464203, -0.19974489510059357, 0.006269412115216255, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.input.2', 'N/A', 'torch.float32', 'N/A', [16], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 0.19734230637550354, -0.18177609145641327, 0.007903944700956345, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.parameters.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.parameters.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.forward.output.0', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 2.1166646480560303, -2.190781354904175, -0.003579073818400502, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 
'N/A', - 'No bench data matched.', 'None', '-1'], + 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.parameters_grad.weight', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'], + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']], ['Functional.conv2d.0.parameters_grad.bias', 'N/A', 'torch.float32', 'N/A', [1, 16, 28, 28], 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', - 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', '-1'] + 1.0, 1.0, 1.0, 1.0, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'No bench data matched.', 'None', ['-1', '-1']] ] # test_merge_tensor @@ -854,3 +856,54 @@ class TestGenOpItem(unittest.TestCase): expected_md5 = f"{zlib.crc32(str(op_data['value']).encode()):08x}" self.assertEqual(result['md5'], expected_md5) + + +class TestReadPtData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.pt') + def test_read_pt_data(self, mock_os, mock_file_checker, mock_load_pt): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_tensor = MagicMock() + mock_tensor.detach.return_value = mock_tensor + mock_tensor.to.return_value = mock_tensor + mock_tensor.dtype = torch.bfloat16 + mock_tensor.numpy.return_value = np.array([1.0, 2.0, 3.0]) + mock_load_pt.return_value = mock_tensor + + result = read_pt_data('/fake/dir', 'file_name.pt') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.pt', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.PT_SUFFIX, False) + mock_load_pt.assert_called_once_with('/fake/path/to/file.pt', to_cpu=True) + mock_tensor.to.assert_called_once_with(torch.float32) + self.assertTrue(np.array_equal(result, 
np.array([1.0, 2.0, 3.0]))) + + @patch('os.path.join', return_value='/fake/path/to/file.pt') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('msprobe.core.compare.utils.load_pt') + def test_read_real_data_pt_exception(self, mock_load_pt, mock_file_checker, mock_os): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.pt' + + mock_load_pt.side_effect = RuntimeError("Test Error") + + with self.assertRaises(CompareException): + read_pt_data('/fake/dir', 'file_name.pt') + + +class TestReadNpyData(unittest.TestCase): + + @patch('msprobe.core.compare.utils.load_npy') + @patch('msprobe.core.compare.utils.FileChecker') + @patch('os.path.join', return_value='/fake/path/to/file.npy') + def test_read_real_data_ms(self, mock_os, mock_file_checker, mock_load_npy): + mock_file_checker.return_value.common_check.return_value = '/fake/path/to/file.npy' + + mock_load_npy.return_value = np.array([1.0, 2.0, 3.0]) + + result = read_npy_data('/fake/dir', 'file_name.npy') + + mock_file_checker.assert_called_once_with('/fake/path/to/file.npy', FileCheckConst.FILE, FileCheckConst.READ_ABLE, FileCheckConst.NUMPY_SUFFIX, False) + mock_load_npy.assert_called_once_with('/fake/path/to/file.npy') + self.assertTrue(np.array_equal(result, np.array([1.0, 2.0, 3.0]))) \ No newline at end of file diff --git a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py index 3fa16b0d9d487250a7a8d9ec97b5572d3c0b387a..49f084ce07c8e90afb2aa1c3340bb4c3965c8fa7 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/compare/test_cmp_multiprocessing_compute.py @@ -18,12 +18,12 @@ data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.inp 'torch.float32', 'torch.float32', [2, 2], [2, 2], '', '', '', '', '', '', 1, 1, 1, 1, 1, 1, 1, 1, - 'Yes', 
'', '-1']] + 'Yes', '', ['-1', '-1']]] o_data = [['Functional.linear.0.forward.input.0', 'Functional.linear.0.forward.input.0', 'torch.float32', 'torch.float32', [2, 2], [2, 2], 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 'unsupported', 1, 1, 1, 1, 1, 1, 1, 1, - 'None', 'No bench data matched.', '-1']] + 'None', 'No bench data matched.', ['-1', '-1']]] columns = CompareConst.COMPARE_RESULT_HEADER + ['Data_name'] result_df = pd.DataFrame(data, columns=columns) o_result = pd.DataFrame(o_data, columns=columns) @@ -54,9 +54,9 @@ class TestUtilsMethods(unittest.TestCase): func = Comparator(mode_config).compare_ops generate_dump_json(base_dir) - input_parma = {'bench_json_path': os.path.join(base_dir, 'dump.json')} + input_param = {'bench_json_path': os.path.join(base_dir, 'dump.json')} lock = multiprocessing.Manager().RLock() - result = _handle_multi_process(func, input_parma, result_df, lock) + result = _handle_multi_process(func, input_param, result_df, lock) self.assertTrue(result.equals(o_result)) def test_read_dump_data(self): diff --git a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py index 34064e7cc2b9d0aa5c0c2e98806b8993137a589c..3d31a1bb51679c28d2cc25ecced891e31ce4dcfd 100644 --- a/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py +++ b/debug/accuracy_tools/msprobe/test/core_ut/data_dump/data_processor/test_pytorch_processor.py @@ -19,6 +19,7 @@ from msprobe.core.data_dump.data_processor.pytorch_processor import ( KernelDumpDataProcessor ) from torch import distributed as dist +from torch._subclasses import FakeTensorMode class TestPytorchDataProcessor(unittest.TestCase): @@ -62,6 +63,15 @@ class TestPytorchDataProcessor(unittest.TestCase): result = PytorchDataProcessor.get_stat_info(mock_data) self.assertIsInstance(result, TensorStatInfo) + def 
test_get_stat_info_with_fake_tensor(self): + with FakeTensorMode() as fake_tensor_mode: + fake_tensor = fake_tensor_mode.from_tensor(torch.randn(1, 2, 3)) + result = PytorchDataProcessor.get_stat_info(fake_tensor) + self.assertIsNone(result.max) + self.assertIsNone(result.min) + self.assertIsNone(result.mean) + self.assertIsNone(result.norm) + def test_get_stat_info_float(self): tensor = torch.tensor([1.0, 2.0, 3.0]) result = self.processor.get_stat_info(tensor) diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py index 1ed3ca016108519fb3f643c9d4bb768f63a52d40..80f91a53f79c81c4e79947bc66b7bf932b774bd0 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/common/test_ms_utils.py @@ -15,21 +15,13 @@ # limitations under the License. """ import unittest -from unittest.mock import MagicMock, patch, call +from unittest.mock import patch import numpy as np import mindspore as ms -import os -import random - -from msprobe.core.common.exceptions import DistributedNotInitializedError -from msprobe.mindspore.common.utils import (get_rank_if_initialized, - convert_bf16_to_fp32, - save_tensor_as_npy, - convert_to_int, - list_lowest_level_directories, - seed_all, - remove_dropout, - MsprobeStep) + +from msprobe.mindspore.common.utils import get_rank_if_initialized, convert_bf16_to_fp32, convert_to_int, \ + list_lowest_level_directories, seed_all, remove_dropout, MsprobeStep + class MockCell: def __init__(self): @@ -136,8 +128,3 @@ class TestMsprobeFunctions(unittest.TestCase): from mindspore.mint.nn.functional import dropout self.assertTrue((Dropout(0.5)(x1d).numpy() == x1d.numpy()).all()) self.assertTrue((dropout(x1d, p=0.5).numpy() == x1d.numpy()).all()) - - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git 
a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json index 5b954f6d6443c92e6321e5f55e373e99f428653d..48800c0455c6651b146600e61e636d4dc25fac31 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/mindspore_data/dump.json @@ -1,6 +1,7 @@ { "task": "statistics", "level": "mix", + "framework": "mindspore", "dump_data_dir": null, "data": { "Tensor.__add__.0.forward": { diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json index 150cbd43b169573e48542aa0c46c26e7df69843e..b2704185ff19b961b43453f81247236d77677d83 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/dump_file/pytorch_data/dump.json @@ -1,6 +1,7 @@ { "task": "statistics", "level": "mix", + "framework": "pytorch", "dump_data_dir": null, "data": { "Tensor.__add__.0.forward": { diff --git a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py index b5cbff9784a837ea4d64ac9eccdf30175564f712..667fea224120550b0240d8c3dc16d929f2cca72a 100644 --- a/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py +++ b/debug/accuracy_tools/msprobe/test/mindspore_ut/compare/test_ms_compare.py @@ -5,8 +5,10 @@ import random import shutil import tempfile import unittest +from unittest.mock import patch import numpy as np +import pandas as pd import torch import yaml @@ -350,21 +352,21 @@ class TestUtilsMethods(unittest.TestCase): finally: shutil.rmtree(data_path) - def test_check_cross_framework(self): - ms_data = { - 
"data_name": "Cell.model.language_model.encoder.layers.5.input_norm.FusedRMSNorm.forward.0.input.0.npy", - } - pt_data = { - "data_name": "Module.module.module.language_model.encoder.layers.0.input_norm.RMSNorm.forward.0.input.0.pt", - } + @patch('msprobe.mindspore.compare.ms_compare.detect_framework_by_dump_json') + def test_check_cross_framework_valid_pytorch(self, mock_detect_framework): + mock_detect_framework.return_value = Const.PT_FRAMEWORK + + result = check_cross_framework("dummy_path") + + self.assertTrue(result) + + @patch('msprobe.mindspore.compare.ms_compare.detect_framework_by_dump_json') + def test_check_cross_framework_invalid_framework(self, mock_detect_framework): + mock_detect_framework.return_value = Const.MS_FRAMEWORK - def check_data(data): - with tempfile.NamedTemporaryFile(mode='w+', suffix='.json', encoding='utf-8', delete=True) as temp_file: - json.dump(data, temp_file, ensure_ascii=False, indent=4) - temp_file.flush() - return check_cross_framework(temp_file.name) - self.assertFalse(check_data(ms_data)) - self.assertTrue(check_data(pt_data)) + result = check_cross_framework("dummy_path") + + self.assertFalse(result) def test_comapre_process(self): data_path = tempfile.mkdtemp(prefix='dump_data', dir='/tmp') @@ -466,32 +468,6 @@ class TestUtilsMethods(unittest.TestCase): npu_op_name = ms_comparator.process_cell_mapping(npu_cell_dict.get('op_name')[0]) self.assertEqual(npu_op_name, 'Module.fc1.Linear.forward.0.input.0') - def test_read_npy_data(self): - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - mapping_config = MappingConfig() - - ms_comparator = MSComparator(mode_config, mapping_config) - - self.temp_file = tempfile.NamedTemporaryFile(suffix='.pt') - tensor = torch.Tensor([1, 2, 3]) - filename = self.temp_file.name.split('/')[-1] - torch.save(tensor, self.temp_file.name) - result = ms_comparator.read_npy_data('/tmp', 
filename, load_pt_file=True) - self.assertTrue(np.array_equal(result, np.array([1, 2, 3]))) - self.temp_file.close() - - self.temp_file = tempfile.NamedTemporaryFile(suffix='.npy') - tensor = np.array([1, 2, 3]) - filename = self.temp_file.name.split('/')[-1] - np.save(self.temp_file.name, tensor) - result = ms_comparator.read_npy_data('/tmp', filename, load_pt_file=False) - self.assertTrue(np.array_equal(result, np.array([1, 2, 3]))) - self.temp_file.close() def test_process_internal_api_mapping(self): stack_mode = True @@ -533,4 +509,28 @@ class TestUtilsMethods(unittest.TestCase): api_list = ["Mint"] with self.assertRaises(CompareException): - ms_comparator.get_api_name(api_list) \ No newline at end of file + ms_comparator.get_api_name(api_list) + + def test_process_data_name(self): + stack_mode = True + auto_analyze = True + fuzzy_match = False + dump_mode = Const.ALL + + mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) + mapping_config = MappingConfig() + ms_comparator = MSComparator(mode_config, mapping_config) + + data = pd.DataFrame({ + 'data_name_x': ['A', 'B', 'C'], + 'data_name_y': ['X', 'Y', 'Z'] + }) + + result = ms_comparator.process_data_name(data.copy()) + + expected = pd.DataFrame({ + 'data_name_x': [['A', 'X'], ['B', 'Y'], ['C', 'Z']], + 'data_name_y': ['X', 'Y', 'Z'] + }) + + pd.testing.assert_frame_equal(result, expected) diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py index 7d4e6e950dc1d3e51ef69ca46895fcf5078c5f67..0320c43d0ba9cd1c1d8e60b9867d770b47dd1715 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/api_accuracy_checker/tensor_transport_layer/test_attl.py @@ -6,6 +6,7 @@ from multiprocessing import Queue from 
msprobe.pytorch.api_accuracy_checker.tensor_transport_layer.attl import * from msprobe.core.common.file_utils import create_directory + class TestATTL(unittest.TestCase): def setUp(self): @@ -48,7 +49,7 @@ class TestATTL(unittest.TestCase): self.assertIsNone(result) @patch('glob.glob') - @patch('msprobe.pytorch.common.utils.load_pt') + @patch('msprobe.core.common.file_utils.load_pt') def test_download_with_exception(self, mock_load_pt, mock_glob): mock_glob.return_value = ['/tmp/start_file.pt'] mock_load_pt.side_effect = Exception('Load error') diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py index cdc922cc98d59b59ec0be85833d2000cd38913c8..b1ac148ae742517c389f6de474463468ef90b572 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/common/test_pt_utils.py @@ -10,8 +10,8 @@ import torch.distributed as dist from msprobe.core.common.file_utils import FileCheckConst from msprobe.core.common.exceptions import DistributedNotInitializedError from msprobe.pytorch.api_accuracy_checker.common.utils import ApiData -from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, \ - get_tensor_rank, get_rank_id, print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl +from msprobe.pytorch.common.utils import parameter_adapter, get_rank_if_initialized, get_tensor_rank, get_rank_id, \ + print_rank_0, load_pt, save_pt, save_api_data, load_api_data, save_pkl, load_pkl class TestParameterAdapter(unittest.TestCase): @@ -180,6 +180,7 @@ class TestLoadPt(unittest.TestCase): if os.path.isfile(self.temp_file.name): os.remove(self.temp_file.name) + class TestSavePT(unittest.TestCase): def setUp(self): @@ -195,6 +196,7 @@ class TestSavePT(unittest.TestCase): mock_torch_save.assert_called_once_with(self.tensor, self.filepath) 
mock_change_mode.assert_called_once_with(self.filepath, FileCheckConst.DATA_FILE_AUTHORITY) + class TestSavePT(unittest.TestCase): def setUp(self): diff --git a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py index b079e646c4a8f4098bb233e3e6259ef3ebea9c94..4eda1d6d974bdc4f6699808946fafb4b136cf98e 100644 --- a/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py +++ b/debug/accuracy_tools/msprobe/test/pytorch_ut/compare/test_pt_compare.py @@ -3,13 +3,10 @@ import os import shutil import unittest -import numpy as np import torch -from msprobe.core.common.const import Const from msprobe.core.common.utils import CompareException -from msprobe.core.compare.acc_compare import ModeConfig -from msprobe.pytorch.compare.pt_compare import PTComparator, compare +from msprobe.pytorch.compare.pt_compare import compare from msprobe.test.core_ut.compare.test_acc_compare import generate_dump_json, generate_stack_json @@ -40,36 +37,6 @@ class TestUtilsMethods(unittest.TestCase): if os.path.exists(base_dir2): shutil.rmtree(base_dir2) - def test_read_npy_data_bf16(self): - generate_bf16_pt(base_dir1) - - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - - pt_comparator = PTComparator(mode_config) - result = pt_comparator.read_npy_data(base_dir1, 'bf16.pt') - - target_result = torch.tensor([1, 2, 3, 4], dtype=torch.float32).numpy() - self.assertTrue(np.array_equal(result, target_result)) - - def test_read_npy_data_dict(self): - generate_dict_pt(base_dir1) - - stack_mode = True - auto_analyze = True - fuzzy_match = False - dump_mode = Const.ALL - mode_config = ModeConfig(stack_mode, auto_analyze, fuzzy_match, dump_mode) - - pt_comparator = PTComparator(mode_config) - - with self.assertRaises(CompareException) as context: - result = 
pt_comparator.read_npy_data(base_dir1, 'dict.pt') - self.assertEqual(context.exception.code, CompareException.DETACH_ERROR) - def test_compare(self): generate_dump_json(base_dir2) generate_stack_json(base_dir2) diff --git a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json index b55f9e0699fe6329ceeb09a51fe20118c65545e7..153d84e7d117b5be89dfdb522edc39dc066929cb 100644 --- a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json +++ b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/mindspore/dump.json @@ -1,6 +1,7 @@ { "task": "statistics", "level": "mix", + "framework": "mindspore", "dump_data_dir": null, "data": { "Cell.network_with_loss.module.language_model.embedding.word_embeddings.VocabParallelEmbedding.forward.0": { diff --git a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json index d7dd1c0c38e2d24c8b0d19c346a50eb33437d232..02239176a9d690c4ce70c06cc6ab117a3c122811 100644 --- a/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json +++ b/debug/accuracy_tools/msprobe/test/resources/layer_mapping/pytorch/dump.json @@ -1,6 +1,7 @@ { "task": "statistics", "level": "mix", + "framework": "pytorch", "dump_data_dir": null, "data": { "Module.module.module.language_model.embedding.word_embeddings.VocabParallelEmbedding.forward.0": { diff --git a/dynolog_npu/README.md b/dynolog_npu/README.md index d6ebd6f7ff04f0fa40601500eaf66b89ed7a7f97..86a23b7f82925079c26623b070936538768d9b8c 100644 --- a/dynolog_npu/README.md +++ b/dynolog_npu/README.md @@ -51,6 +51,8 @@ sudo yum install -y cmake ninja ### 3. 编译 +- dynolog编译 + 默认编译生成dyno和dynolog二进制文件, -t参数可以支持将二进制文件打包成deb包或rpm包. 
```bash @@ -64,6 +66,10 @@ bash scripts/build.sh -t deb bash scripts/build.sh -t rpm ``` +- dynolog_npu_plugin wheel包编译 + +dynolog_npu_plugin wheel包提供IPCMonitor,MsptiMonitor等公共能力,使用nputrace和npu-monitor功能前必须安装该wheel包,具体编译安装指导可参考dynolog_npu\plugin\README.md。 + ## 使用方式 ### Profiler trace dump功能 @@ -112,7 +118,9 @@ nputrace子命令支持的参数选项 - nputrace使用方法 -Step1: 拉起dynolog daemon进程 +Step0: 参考`3.编译`章节完成dynolog的编译,以及dynolog_npu_plugin wheel包的编译和安装。 + +Step1:拉起dynolog daemon进程 ```bash # 方法1:使用systemd拉起service # 修改配置文件/etc/dynolog.gflags, 使能ipc_monitor diff --git a/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp b/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp index 940f5aae167f088361057fe2a7a389a76f5bb2b4..bba66d7297af1eec929a0149b0b2d1df35eaf843 100644 --- a/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp +++ b/dynolog_npu/plugin/ipc_monitor/DynoLogNpuMonitor.cpp @@ -1,7 +1,4 @@ #include "DynoLogNpuMonitor.h" - -#include - #include "utils.h" namespace dynolog_npu { @@ -10,13 +7,13 @@ namespace ipc_monitor { bool DynoLogNpuMonitor::Init() { if (isInitialized_) { - std::cout << "[WRARNING] DynoLog npu monitor already initialized" << std::endl; + LOG(ERROR) << "DynoLog npu monitor already initialized"; return true; } bool res = ipcClient_.RegisterInstance(npuId_); if (res) { isInitialized_ = true; - std::cout << "[INFO] DynoLog npu monitor initialized success !" << std::endl; + LOG(INFO) << "DynoLog npu monitor initialized success!"; } return res; } @@ -24,11 +21,6 @@ bool DynoLogNpuMonitor::Init() std::string DynoLogNpuMonitor::Poll() { std::string res = ipcClient_.IpcClientNpuConfig(); - if (res.empty()) { - std::cout << "[INFO] Request for dynolog server is empty !" 
<< std::endl; - return ""; - } - std::cout << "[INFO] Received NPU configuration successfully" << std::endl; return res; } diff --git a/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp b/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp index 97966e8eeacc7276426feb237aa122eb8dee046f..ca2429f1e368ad996b8a8a954810ed7439c78bea 100644 --- a/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp +++ b/dynolog_npu/plugin/ipc_monitor/NpuIpcClient.cpp @@ -1,6 +1,5 @@ #include "NpuIpcClient.h" -#include namespace dynolog_npu { namespace ipc_monitor { @@ -15,14 +14,14 @@ bool IpcClient::RegisterInstance(int32_t id) std::unique_ptr message = Message::ConstructMessage(context, "ctxt"); try { if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) { - std::cout << "[WARNING]Failed to send register ctxt for pid " << context.pid << " with dyno" << std::endl; + LOG(ERROR) << "Failed to send register ctxt for pid " << context.pid << " with dyno"; return false; } } catch (const std::exception &e) { - std::cout << "[WARNING] Error when SyncSendMessage: " << e.what() << std::endl; + LOG(ERROR) << " Error when SyncSendMessage: " << e.what(); return false; } - std::cout << "[INFO] Resigter pid " << context.pid << " for dynolog success !" << std::endl; + LOG(INFO) << "Resigter pid " << context.pid << " for dynolog success !"; return true; } std::string IpcClient::IpcClientNpuConfig() @@ -37,7 +36,7 @@ std::string IpcClient::IpcClientNpuConfig() } std::unique_ptr message = Message::ConstructMessage(*req, "req", size); if (!SyncSendMessage(*message, std::string(DYNO_IPC_NAME))) { - std::cout << "[WARNING] Failed to send config to dyno server fail !" << std::endl; + LOG(ERROR) << " Failed to send config to dyno server fail !"; free(req); req = nullptr; return ""; @@ -45,7 +44,7 @@ std::string IpcClient::IpcClientNpuConfig() free(req); message = PollRecvMessage(MAX_IPC_RETRIES, MAX_SLEEP_US); if (!message) { - std::cout << "[WARNING] Failed to receive on-demand config !" 
<< std::endl; + LOG(ERROR) << " Failed to receive on-demand config !"; return ""; } std::string res = std::string(ReinterpretConvert(message->buf.get()), message->metadata.size); @@ -65,7 +64,7 @@ std::unique_ptr IpcClient::ReceiveMessage() bool IpcClient::SyncSendMessage(const Message &message, const std::string &destName, int numRetry, int seepTimeUs) { if (destName.empty()) { - std::cout << "[WARNING] Can not send to empty socket name !" << std::endl; + LOG(ERROR) << " Can not send to empty socket name !"; return false; } int i = 0; @@ -79,7 +78,7 @@ bool IpcClient::SyncSendMessage(const Message &message, const std::string &destN seepTimeUs *= 2; // 2: double sleep time } } catch (const std::exception &e) { - std::cout << "[ERROR] Error when SyncSendMessage: " << e.what() << std::endl; + LOG(ERROR) << " Error when SyncSendMessage: " << e.what(); return false; } return i < numRetry; @@ -94,7 +93,7 @@ bool IpcClient::Recv() try { successFlag = ep_.TryPeekMessage(*peekCtxt); } catch (std::exception &e) { - std::cout << "[ERROR] Error when TryPeekMessage: " << e.what() << std::endl; + LOG(ERROR) << " Error when TryPeekMessage: " << e.what(); return false; } if (successFlag) { @@ -108,7 +107,7 @@ bool IpcClient::Recv() try { successFlag = ep_.TryRcvMessage(*recvCtxt); } catch (std::exception &e) { - std::cout << "[ERROR] Error when TryRecvMsg: " << e.what() << std::endl; + LOG(ERROR) << " Error when TryRecvMsg: " << e.what(); return false; } if (successFlag) { @@ -118,7 +117,7 @@ bool IpcClient::Recv() } } } catch (std::exception &e) { - std::cout << "[ERROR] Error in Recv(): " << e.what() << std::endl; + LOG(ERROR) << " Error in Recv(): " << e.what(); return false; } return false; diff --git a/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h b/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h index 8b5f88abf9d2cf589bec685cd3a520729afe8dd5..0471a70a3419eeeee2986d1d18710ee112c70313 100644 --- a/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h +++ 
b/dynolog_npu/plugin/ipc_monitor/PyDynamicMonitorProxy.h @@ -1,7 +1,7 @@ #ifndef PYDYNAMIC_MONITOR_PROXY_H #define PYDYNAMIC_MONITOR_PROXY_H -#include +#include #include #include "MonitorBase.h" #include "DynoLogNpuMonitor.h" @@ -14,15 +14,21 @@ public: PyDynamicMonitorProxy() = default; bool InitDyno(int npuId) { - try { - monitor_ = DynoLogNpuMonitor::GetInstance(); - monitor_->SetNpuId(npuId); - bool res = monitor_->Init(); - return res; - } catch (const std::exception &e) { - std::cout << "[ERROR] Error when init dyno " << e.what() << std::endl; - return false; - } + try { + if (!google::IsGoogleLoggingInitialized()) { + google::InitGoogleLogging("DynoLogNpuMonitor"); + google::SetLogDestination(google::GLOG_INFO, "/var/log/dynolog_npu_"); + google::SetLogFilenameExtension(".log"); + } + monitor_ = DynoLogNpuMonitor::GetInstance(); + monitor_->SetNpuId(npuId); + bool res = monitor_->Init(); + LOG(ERROR) << res; + return res; + } catch (const std::exception &e) { + LOG(ERROR) << "Error when init dyno " << e.what(); + return false; + } } std::string PollDyno() diff --git a/dynolog_npu/plugin/ipc_monitor/utils.cpp b/dynolog_npu/plugin/ipc_monitor/utils.cpp index 936821fd34bc34bc9db9e09515132e8af39ba57a..b57942082e0fd52426ddce47bfc70620bf19019f 100644 --- a/dynolog_npu/plugin/ipc_monitor/utils.cpp +++ b/dynolog_npu/plugin/ipc_monitor/utils.cpp @@ -68,11 +68,11 @@ std::pair GetParentPidAndCommand(int32_t pid) if (std::getline(statFile, line)) { int ret = sscanf(line.c_str(), "%*d (%[^)]) %*c %d", command.data(), &parentPid); if (ret == 2) { // 2: 接收到2个字符 - std::cout << "[INFO] Success to get parent pid: " << parentPid << std::endl; + LOG(INFO) << "Success to get parent pid: " << parentPid; return std::make_pair(parentPid, command); } } - std::cout << "[WARNING] Failed to parse /proc/" << pid << "/stat" << std::endl; + LOG(ERROR) << " Failed to parse /proc/" << pid << "/stat"; return std::make_pair(0, ""); } diff --git a/dynolog_npu/plugin/ipc_monitor/utils.h 
b/dynolog_npu/plugin/ipc_monitor/utils.h index 0d8ceb8cfd0bf81b6d8b807c6ac1b505276ddf83..2374a27d417f91bc23108a892c6eb25cbb5039d8 100644 --- a/dynolog_npu/plugin/ipc_monitor/utils.h +++ b/dynolog_npu/plugin/ipc_monitor/utils.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include diff --git a/dynolog_npu/plugin/setup.py b/dynolog_npu/plugin/setup.py index 151b9b3fb3fa1a42e147685f632163c8b3f5a564..55e924c6b6950c2a9f8f466159ea56184f77e1a6 100644 --- a/dynolog_npu/plugin/setup.py +++ b/dynolog_npu/plugin/setup.py @@ -13,25 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from glob import glob from setuptools import setup from pybind11.setup_helpers import Pybind11Extension BASE_DIR = os.path.dirname(os.path.realpath(__file__)) +DYNOLOG_PATH = os.path.join(os.path.dirname(BASE_DIR), "third_party", "dynolog") +GLOG_INC_PATH = os.path.join(DYNOLOG_PATH, "third_party", "glog", "src") +GLOG_LIB_PATH = os.path.join(DYNOLOG_PATH, "build", "third_party", "glog") # Define the extension module ext_modules = [ Pybind11Extension( "IPCMonitor", # Name of the Python module - sources=["bindings.cpp", - "ipc_monitor/utils.cpp", - "ipc_monitor/DynoLogNpuMonitor.cpp", - "ipc_monitor/NpuIpcClient.cpp", - ], # Source files - include_dirs=[os.path.join(BASE_DIR, "ipc_monitor")], # Include Pybind11 headers + sources=["bindings.cpp"] + list(glob("ipc_monitor/*.cpp")), # Source files + include_dirs=[os.path.join(BASE_DIR, "ipc_monitor"), GLOG_INC_PATH, GLOG_LIB_PATH], # Include Pybind11 headers + library_dirs=[GLOG_LIB_PATH], + libraries=["glog"], language="c++", # Specify the language ), ] + # Set up the package setup( name="dynolog_npu_plugin", diff --git a/profiler/msprof_analyze/cluster_analyse/README.md b/profiler/msprof_analyze/cluster_analyse/README.md index 325a0984793297dfac28673f04a582ea7b4316b9..6612d0f1989c1028ae560e0a6e260f3b673a959d 100644 --- 
a/profiler/msprof_analyze/cluster_analyse/README.md +++ b/profiler/msprof_analyze/cluster_analyse/README.md @@ -79,6 +79,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig( | compute_op_sum | 集群场景性能数据的device运行算子信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/ComputeOpSum目录下输出交付件stats.ipynb;可根据实际情况决定是否是否打开--exclude_op_name。 | 否 | | hccl_sum | 集合通信算子耗时分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/HcclSum目录下输出交付件stats.ipynb。 | 否 | | mstx_sum | 集群场景mstx打点信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。--export_type为db时,输出交付件cluster_analysis.db;--export_type为notebook时,在cluster_analysis_output/MstxSum目录下输出交付件stats.ipynb。 | 否 | + | freq_analysis | 集群场景aicore frequency信息汇总分析,输入性能数据需要基于ascend_pytorch_profiler_{rank_id}.db文件。打屏输出是否存在aicore存在空闲(频率为800MHz)、异常(频率不为1800MHz或800MHz)的现象。如果有,则在输出交付件cluster_analysis.db增加对应的卡和频率信息。 | 否 | | 自定义分析参数 | 与cann_api_sum、compute_op_sum、hccl_sum等参数功能类似,用户可自定义一套性能数据的分析规则,需要详细了解性能分析的开发人员,具体开发指导请参见“[自定义分析规则开发指导](#自定义分析规则开发指导)”。 | 否 | --parallel_mode参数示例如下: diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py index 2ad5797cc924c8bfb51326387d72dceb043fc2ad..3839fe66aac2cf91f6ef08d38270a3d84143d6ee 100644 --- a/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/analysis/comm_matrix_analysis.py @@ -22,6 +22,8 @@ from msprof_analyze.prof_common.db_manager import DBManager from msprof_analyze.cluster_analyse.common_func.utils import increase_shared_value from msprof_analyze.prof_common.constant import Constant from msprof_analyze.prof_common.logger import get_logger +from msprof_analyze.cluster_analyse.common_func.utils import double_hash +from 
msprof_analyze.prof_common.file_manager import FileManager logger = get_logger() @@ -70,30 +72,46 @@ class CommMatrixAnalysis(BaseAnalysis): self.combine_link_info(step_dict) def merge_same_links(self, step_dict: dict): - def process_link_key(rank_id, rank_dict): + def update_rank_map(step_dict): + for op_name, op_dict in step_dict.items(): + group_name = op_name.split("@")[-1] + for rank_id, rank_dict in op_dict.items(): + for link_key in rank_dict: + if '-' not in link_key: + logger.warning("%s has an invalid link key %s!", str(op_name), str(link_key)) + break + src_rank = link_key.split('-')[0] + dst_rank = link_key.split('-')[1] + if src_rank == dst_rank: + if src_rank not in project_local_global_rank_map.get(group_name, {}): + project_local_global_rank_map.setdefault(group_name, {})[src_rank] = rank_id + elif project_local_global_rank_map.get(group_name, {}).get(src_rank) != rank_id: + logger.warning(f"In the same communication group {group_name}, global rank {rank_id} " + f"and {project_local_global_rank_map.get(group_name, {}).get(src_rank)} " + f"get the same local rank {src_rank}!") + + def process_link_key(rank_dict): for link_key in rank_dict: if '-' not in link_key: logger.warning("%s has an invalid link key %s!", str(op_name), str(link_key)) break - src_rank = link_key.split('-')[0] - dst_rank = link_key.split('-')[1] - if src_rank == dst_rank: - if src_rank not in project_local_global_rank_map: - project_local_global_rank_map[src_rank] = rank_id - elif project_local_global_rank_map.get(src_rank) != rank_id: - logger.warning("In the same communication group, local ranks projecting to global ranks " - "repeat!") self.combine_link(link_info[link_key], rank_dict[link_key]) - def convert_local_to_global_rank(): + def convert_local_to_global_rank(rank_map): tmp_link = {} for link_key, link_dict in link_info.items(): src_rank = link_key.split('-')[0] dst_rank = link_key.split('-')[1] - src_rank = project_local_global_rank_map[src_rank] \ - if src_rank in 
project_local_global_rank_map else src_rank - dst_rank = project_local_global_rank_map[dst_rank] \ - if dst_rank in project_local_global_rank_map else dst_rank + if src_rank not in rank_map: + logger.warning(f"The src local rank {src_rank} of the operator {op_name} " + f"cannot be mapped to the global rank.") + continue + if dst_rank not in rank_map: + logger.warning(f"The dst local rank {dst_rank} of the operator {op_name} " + f"cannot be mapped to the global rank.") + continue + src_rank = rank_map[src_rank] + dst_rank = rank_map[dst_rank] link_dict[Constant.BANDWIDTH_GB_S] = \ self.compute_ratio(link_dict.get(Constant.TRANSIT_SIZE_MB, 0), link_dict.get(Constant.TRANSIT_TIME_MS, 0)) @@ -106,12 +124,14 @@ class CommMatrixAnalysis(BaseAnalysis): Constant.TRANSIT_SIZE_MB: 0, Constant.OP_NAME: '' } + project_local_global_rank_map = self.get_parallel_group_info() + update_rank_map(step_dict) for op_name, op_dict in step_dict.items(): link_info = defaultdict(lambda: copy.deepcopy(default_value)) - project_local_global_rank_map = dict() - for rank_id, rank_dict in op_dict.items(): - process_link_key(rank_id, rank_dict) - step_dict[op_name] = convert_local_to_global_rank() + group_name = op_name.split("@")[-1] + for rank_dict in op_dict.values(): + process_link_key(rank_dict) + step_dict[op_name] = convert_local_to_global_rank(project_local_global_rank_map.get(group_name, {})) def combine_link_info(self, step_dict: dict): default_value = { @@ -131,6 +151,19 @@ class CommMatrixAnalysis(BaseAnalysis): link_dict.get(Constant.TRANSIT_TIME_MS, 0)) step_dict[Constant.TOTAL_OP_INFO] = total_op_info + def get_parallel_group_info(self): + parallel_group_info = {} + for profiler_path in self.data_map.values(): + meta_json = os.path.join(profiler_path, "profiler_metadata.json") + if os.path.exists(meta_json): + meta_data = FileManager.read_json_file(meta_json) + for group_name, group_info in meta_data.get("parallel_group_info", {}).items(): + global_ranks = 
group_info.get("global_ranks") + if isinstance(global_ranks, list) and global_ranks: + global_ranks.sort() + parallel_group_info[double_hash(group_name)] = dict(enumerate(global_ranks)) + return parallel_group_info + class CommMatrixAnalysisOptimized(CommMatrixAnalysis): SAVED_JSON = "cluster_communication_matrix.json" diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/__init__.py b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..0bc7afa393a2a6734104847efb8daaaa9223d9b4 --- /dev/null +++ b/profiler/msprof_analyze/cluster_analyse/recipes/freq_analysis/freq_analysis.py @@ -0,0 +1,114 @@ +# Copyright (c) 2024, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from collections import defaultdict +import pandas as pd + +from msprof_analyze.cluster_analyse.recipes.base_recipe_analysis import BaseRecipeAnalysis +from msprof_analyze.prof_common.constant import Constant +from msprof_analyze.prof_common.logger import get_logger +from msprof_analyze.prof_common.database_service import DatabaseService + +logger = get_logger() + + +class FreqAnalysis(BaseRecipeAnalysis): + COMMON_FREQ = 1800 + FREE_FREQ = 800 + + def __init__(self, params): + super().__init__(params) + self.free_freq_ranks = [] + self.abnormal_freq_ranks = [] + self.abnormal_freq_ranks_map = {} + + @property + def base_dir(self): + return os.path.basename(os.path.dirname(__file__)) + + def reducer_func(self, mapper_res): + if self._is_msprof: + logger.warning("Freq analysis do not support msprof db now.") + return + + mapper_res = list(filter(lambda res: res is not None, mapper_res)) + if not mapper_res: + logger.error("Mapper data is None, load profiling data failed.") + return + + for freqs, rank_id in mapper_res: + if freqs == [self.COMMON_FREQ]: + continue + elif set(freqs) == {self.COMMON_FREQ, self.FREE_FREQ}: + self.free_freq_ranks.append(rank_id) + else: + self.abnormal_freq_ranks.append(rank_id) + self.abnormal_freq_ranks_map[rank_id] = str(freqs) + + self.free_freq_ranks.sort() + self.abnormal_freq_ranks.sort() + + def save_db(self): + if len(self.free_freq_ranks) > 0: + logger.info(f"Found {len(self.free_freq_ranks)} ranks with free time, " + f"aicore frequency in {[self.FREE_FREQ, self.COMMON_FREQ]}.") + free_ranks_df = pd.DataFrame() + free_ranks_df["rankId"] = self.free_freq_ranks + free_ranks_df["aicoreFrequency"] = str([self.FREE_FREQ, self.COMMON_FREQ]) + free_ranks_df.set_index(["rankId"], inplace=True) + self.dump_data(free_ranks_df, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "FreeFrequencyRanks") + else: + logger.info("No rank found with free time.") + if len(self.abnormal_freq_ranks) > 0: + logger.info(f"Found 
{len(self.abnormal_freq_ranks)} ranks with abnormal aicore frequency.") + + abnormal_ranks_df = pd.DataFrame.from_dict(self.abnormal_freq_ranks_map, + orient="index", columns=["aicoreFrequency"]) + abnormal_ranks_df = abnormal_ranks_df.reset_index().rename(columns={"index": "rankId"}) + abnormal_ranks_df.set_index(["rankId"], inplace=True) + self.dump_data(abnormal_ranks_df, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER, "AbnormalFrequencyRanks") + else: + logger.info("No rank found with abnormal aicore frequency.") + if len(self.free_freq_ranks) > 0 or len(self.abnormal_freq_ranks) > 0: + logger.info("Please verify result in output file.") + + def run(self, context): + mapper_res = self.mapper_func(context) + self.reducer_func(mapper_res) + self.save_db() + + def _mapper_func(self, data_map, analysis_class): + profiler_db_path = data_map.get(Constant.PROFILER_DB_PATH) + service = DatabaseService(profiler_db_path, None) + service.add_table_for_query("AICORE_FREQ", ["deviceId", "freq"]) + service.add_table_for_query("RANK_DEVICE_MAP", ["rankId"]) + service_res = service.query_data() + aic_freq = service_res.get("AICORE_FREQ", None) + rank_id = service_res.get("RANK_DEVICE_MAP", None) + + if aic_freq is None or aic_freq.empty: + logger.error(f"No aic freq data found in {profiler_db_path}.") + return None + + if rank_id is None or rank_id.empty: + logger.error(f"No rank_id data found in {profiler_db_path}.") + return None + + rank_id = rank_id["rankId"].values[0] + freq_arr = aic_freq["freq"].values + freqs = list(set(freq_arr)) + freqs.sort() + return freqs, rank_id diff --git a/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py b/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..0a559b79178d03879df6901703c66c7cfcd03663 --- /dev/null +++ b/profiler/msprof_analyze/test/ut/cluster_analyse/recipes/test_freq_analysis.py @@ -0,0 +1,83 @@ +# Copyright 
(c) 2025, Huawei Technologies Co., Ltd. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import random +import unittest + +import pandas as pd + +from msprof_analyze.cluster_analyse.recipes.freq_analysis.freq_analysis import FreqAnalysis + + +class TestFreqAnalysis(unittest.TestCase): + + freq = [1800] + free_freq = [800, 1800] + abnormal_freq = [1200, 1300, 1800] + + def test_no_error_freq(self): + params = {} + recipe = FreqAnalysis(params) + mapper_res = [(self.freq, 0)] * 10 + recipe.reducer_func(mapper_res) + self.assertEqual(recipe.free_freq_ranks, []) + self.assertEqual(recipe.abnormal_freq_ranks, []) + self.assertEqual(recipe.abnormal_freq_ranks_map, {}) + + + def test_free_rank_map(self): + params = {} + recipe = FreqAnalysis(params) + mapper_res = [ + (self.freq, 0), + (self.free_freq, 1), + (self.free_freq, 2), + (self.freq, 3) + ] + recipe.reducer_func(mapper_res) + self.assertEqual(recipe.free_freq_ranks, [1, 2]) + self.assertEqual(recipe.abnormal_freq_ranks, []) + self.assertEqual(recipe.abnormal_freq_ranks_map, {}) + + def test_abnormal_rank_map(self): + params = {} + recipe = FreqAnalysis(params) + mapper_res = [ + (self.freq, 0), + (self.abnormal_freq, 1), + (self.abnormal_freq, 2), + (self.freq, 3) + ] + + recipe.reducer_func(mapper_res) + self.assertEqual(recipe.free_freq_ranks, []) + self.assertEqual(recipe.abnormal_freq_ranks, [1, 2]) + + def test_mix_freq_case(self): + params = {} + recipe = FreqAnalysis(params) + 
mapper_res = [] + rank_case = [[], [], []] + random_freq = {0: self.freq, 1: self.free_freq, 2: self.abnormal_freq} + + for i in range(1000): + random_num = random.choice([0, 1, 2]) + mapper_res.append((random_freq.get(random_num, self.freq), i)) + rank_case[random_num].append(i) + + recipe.reducer_func(mapper_res) + self.assertEqual(recipe.free_freq_ranks, rank_case[1]) + self.assertEqual(recipe.abnormal_freq_ranks, rank_case[2])