From ce01da9ee5d2318983108091e4b226050b7ed474 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Mon, 31 Mar 2025 15:01:10 +0800 Subject: [PATCH 01/12] =?UTF-8?q?=E8=B0=83=E7=94=A8torch=E5=87=BD=E6=95=B0?= =?UTF-8?q?=E5=89=8D=E6=B7=BB=E5=8A=A0=E6=9D=83=E9=99=90=E5=88=A4=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/ccsrc/base/Environment.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp b/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp index 3a31e03cf8..00c59afa66 100644 --- a/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp +++ b/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include "utils/CPythonUtils.hpp" #include "DebuggerConfig.hpp" #include "Environment.hpp" @@ -21,11 +22,42 @@ namespace MindStudioDebugger { namespace Environment { +static bool IsOthersWritable(const char* module_name) +{ + Py_Initialize(); + PyObject* pName = PyUnicode_DecodeFSDefault(module_name); + PyObject* pModule = PyImport_Import(pName); + Py_DECREF(pName); + if (pModule == nullptr) { + Py_Finalize(); + return true; + } + PyObject* pFile = PyObject_GetAttrString(pModule, "__file__"); + if (pFile == nullptr) { + Py_DECREF(pModule); + Py_Finalize(); + return true; + } + const char* filepath = PyUnicode_AsUTF8(pFile); + Py_DECREF(pFile); + Py_DECREF(pModule); + Py_Finalize(); + + struct stat fileStat; + if (stat(filepath, &fileStat) < 0) { + return true; + } + return (fileStat.st_mode & S_IWOTH) != 0; +} + static int32_t GetRankID_PT() { /* if torch.distributed.is_initialized(): * return torch.distributed.get_rank() */ + if (IsOthersWritable("torch.distributed")) { + return -1; + } CPythonUtils::PythonObject torch = CPythonUtils::PythonObject::Import("torch"); if (!torch.IsModule()) { return -1; -- Gitee From 40d233c94fb0dbde6ac5c3e250fd090d1877eff8 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Fri, 11 Apr 2025 10:40:03 +0800 Subject: [PATCH 02/12] =?UTF-8?q?=E5=8F=AF=E8=A7=86=E5=8C=96=E4=B8=B2?= =?UTF-8?q?=E8=A1=8C=E6=94=B9=E5=B9=B6=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../visualization/builder/graph_builder.py | 17 ++ .../msprobe/visualization/graph_service.py | 281 ++++++++++++------ .../msprobe/visualization/utils.py | 19 ++ 3 files changed, 223 insertions(+), 94 deletions(-) diff --git a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py index bec99d675f..3479a9d6e3 100644 --- a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py +++ b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py @@ -285,3 +285,20 @@ class GraphExportConfig: self.micro_steps = micro_steps self.task = task self.overflow_check = overflow_check + + +class GraphInfo: + def __init__(self, graph:Graph, construct_path:str, data_path:str, stack_path:str): + self.graph = graph + self.construct_path = construct_path + self.data_path = data_path + self.stack_path = stack_path + + +class BuildGraphTaskInfo: + def __init__(self, graph_info_n:GraphInfo, graph_info_b:GraphInfo, npu_rank, bench_rank, time_str): + self.graph_info_n = graph_info_n + self.graph_info_b = graph_info_b + self.npu_rank = npu_rank + self.bench_rank = bench_rank + self.time_str = time_str diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index d971320a59..80e8e031e2 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -16,14 +16,15 @@ import os import time import json +from multiprocessing import cpu_count, Pool from msprobe.core.common.file_utils import (check_file_type, create_directory, FileChecker, check_file_or_directory_path, load_json) from msprobe.core.common.const import FileCheckConst, Const from msprobe.core.common.utils import CompareException from msprobe.core.overflow_check.checker import AnomalyDetector from msprobe.visualization.compare.graph_comparator import GraphComparator -from msprobe.visualization.utils import GraphConst, check_directory_content -from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig +from msprobe.visualization.utils import GraphConst, check_directory_content, SerializableArgs +from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig, GraphInfo, BuildGraphTaskInfo from msprobe.core.common.log import logger from msprobe.visualization.graph.node_colors import NodeColors from msprobe.core.compare.layer_mapping import generate_api_mapping_by_layer_mapping @@ -33,65 +34,66 @@ from msprobe.visualization.graph.distributed_analyzer import DistributedAnalyzer current_time = time.strftime("%Y%m%d%H%M%S") -def _compare_graph(input_param, args): - logger.info('Start building model graphs...') - # 对两个数据进行构图 - dump_path_n = input_param.get('npu_path') - dump_path_b = input_param.get('bench_path') - construct_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.CONSTRUCT_FILE), - FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() - construct_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.CONSTRUCT_FILE), - FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() - data_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.DUMP_FILE), FileCheckConst.FILE, - FileCheckConst.READ_ABLE).common_check() - data_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.DUMP_FILE), FileCheckConst.FILE, - FileCheckConst.READ_ABLE).common_check() - stack_path_n = FileChecker(os.path.join(dump_path_n, GraphConst.STACK_FILE), FileCheckConst.FILE, - FileCheckConst.READ_ABLE).common_check() - stack_path_b = FileChecker(os.path.join(dump_path_b, GraphConst.STACK_FILE), FileCheckConst.FILE, - FileCheckConst.READ_ABLE).common_check() - graph_n = GraphBuilder.build(construct_path_n, data_path_n, stack_path_n, complete_stack=args.complete_stack) - graph_b = GraphBuilder.build(construct_path_b, data_path_b, stack_path_b, complete_stack=args.complete_stack) - logger.info('Model graphs built successfully, start Comparing graphs...') - # 基于graph、stack和data进行比较 +def _compare_graph(graph_n:GraphInfo, graph_b:GraphInfo, input_param, args): dump_path_param = { - 'npu_json_path': data_path_n, - 'bench_json_path': data_path_b, - 'stack_json_path': stack_path_n, + 'npu_json_path': graph_n.data_path, + 'bench_json_path': graph_b.data_path, + 'stack_json_path': graph_n.stack_path, 'is_print_compare_log': input_param.get("is_print_compare_log", True) } mapping_dict = None if args.layer_mapping: yaml_path = FileChecker(args.layer_mapping, FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() try: - mapping_dict = generate_api_mapping_by_layer_mapping(data_path_n, data_path_b, yaml_path) + mapping_dict = generate_api_mapping_by_layer_mapping(graph_n.data_path, graph_b.data_path, yaml_path) except Exception: logger.warning('The layer mapping file parsing failed, please check file format, mapping is not effective.') - graph_comparator = GraphComparator([graph_n, graph_b], dump_path_param, args, mapping_dict=mapping_dict) + graph_comparator = GraphComparator([graph_n.graph, graph_b.graph], dump_path_param, args, mapping_dict=mapping_dict) graph_comparator.compare() - micro_steps = graph_n.paging_by_micro_step(graph_b) + return graph_comparator + + +def _compare_graph_result(input_param, args): + logger.info('Start building model graphs...') + # 对两个数据进行构图 + graph_n = _build_graph_info(input_param.get('npu_path'), args) + graph_b = _build_graph_info(input_param.get('bench_path'), args) + logger.info('Model graphs built successfully, start Comparing graphs...') + # 基于graph、stack和data进行比较 + graph_comparator = _compare_graph(graph_n, graph_b, input_param, args) + # 增加micro step标记 + micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph) # 开启溢出检测 if args.overflow_check: - graph_n.overflow_check() - graph_b.overflow_check() + graph_n.graph.overflow_check() + graph_b.graph.overflow_check() - return CompareGraphResult(graph_n, graph_b, graph_comparator, micro_steps) + return CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps) -def _export_compare_graph_result(args, graphs, graph_comparator, micro_steps, - output_file_name=f'compare_{current_time}.vis'): - create_directory(args.output_path) +def _export_compare_graph_result(args, result): + graphs = [result.graph_n, result.graph_b] + graph_comparator = result.graph_comparator + micro_steps = result.micro_steps + output_file_name = result.output_file_name + if not output_file_name: + output_file_name = f'compare_{current_time}.vis' + logger.info(f'Start exporting compare graph result, file name: {output_file_name}...') output_path = os.path.join(args.output_path, output_file_name) task = GraphConst.GRAPHCOMPARE_MODE_TO_DUMP_MODE_TO_MAPPING.get(graph_comparator.ma.compare_mode) export_config = GraphExportConfig(graphs[0], graphs[1], graph_comparator.ma.get_tool_tip(), NodeColors.get_node_colors(graph_comparator.ma.compare_mode), micro_steps, task, args.overflow_check) - GraphBuilder.to_json(output_path, export_config) - logger.info(f'Model graphs compared successfully, the result file is saved in {output_path}') + try: + GraphBuilder.to_json(output_path, export_config) + logger.info(f'Exporting compare graph result successfully, the result file is saved in {output_path}') + return '' + except RuntimeError as e: + logger.error(f'Failed to export compare graph result, file: {output_file_name}, error: {e}') + return output_file_name -def _build_graph(dump_path, args): - logger.info('Start building model graph...') +def _build_graph_info(dump_path, args): construct_path = FileChecker(os.path.join(dump_path, GraphConst.CONSTRUCT_FILE), FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() data_path = FileChecker(os.path.join(dump_path, GraphConst.DUMP_FILE), FileCheckConst.FILE, @@ -99,6 +101,12 @@ def _build_graph(dump_path, args): stack_path = FileChecker(os.path.join(dump_path, GraphConst.STACK_FILE), FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() graph = GraphBuilder.build(construct_path, data_path, stack_path, complete_stack=args.complete_stack) + return GraphInfo(graph, construct_path, data_path, stack_path) + + +def _build_graph_result(dump_path, args): + graph = _build_graph_info(dump_path, args).graph + # 增加micro step标记 micro_steps = graph.paging_by_micro_step() # 开启溢出检测 if args.overflow_check: @@ -106,12 +114,70 @@ def _build_graph(dump_path, args): return BuildGraphResult(graph, micro_steps) -def _export_build_graph_result(out_path, graph, micro_steps, overflow_check, - output_file_name=f'build_{current_time}.vis'): - create_directory(out_path) +def _run_build_graph_compare(input_param, args, nr, br): + logger.info(f'Start building graph for {nr}...') + graph_n = _build_graph_info(input_param.get('npu_path'), args) + graph_b = _build_graph_info(input_param.get('bench_path'), args) + logger.info(f'Building graph for {nr} finished.') + return BuildGraphTaskInfo(graph_n, graph_b, nr, br, current_time) + + +def _run_build_graph_single(dump_ranks_path, rank, step, args): + logger.info(f'Start building graph for {rank}...') + dump_path = os.path.join(dump_ranks_path, rank) + output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis' + result = _build_graph_result(dump_path, args) + result.output_file_name = output_file_name + if rank != Const.RANK: + try: + result.rank = int(rank.replace(Const.RANK, "")) + except Exception as e: + logger.error('The folder name format is incorrect, expected rank+number.') + raise CompareException(CompareException.INVALID_PATH_ERROR) from e + logger.info(f'Building graph for {rank} finished.') + return result + + +def _run_graph_compare(graph_task_info, input_param, args, output_file_name): + logger.info(f'Start comparing data for {graph_task_info.npu_rank}...') + graph_n = graph_task_info.graph_info_n + graph_b = graph_task_info.graph_info_b + nr = graph_task_info.npu_rank + graph_comparator = _compare_graph(graph_n, graph_b, input_param, args) + micro_steps = graph_n.graph.paging_by_micro_step(graph_b.graph) + # 开启溢出检测 + if args.overflow_check: + graph_n.graph.overflow_check() + graph_b.graph.overflow_check() + graph_result = CompareGraphResult(graph_n.graph, graph_b.graph, graph_comparator, micro_steps) + graph_result.output_file_name = output_file_name + if nr != Const.RANK: + try: + graph_result.rank = int(nr.replace(Const.RANK, "")) + except Exception as e: + logger.error('The folder name format is incorrect, expected rank+number.') + raise CompareException(CompareException.INVALID_PATH_ERROR) from e + logger.info(f'Comparing data for {graph_task_info.npu_rank} finished.') + return graph_result + + +def _export_build_graph_result(args, result): + out_path = args.output_path + graph = result.graph + micro_steps = result.micro_steps + overflow_check = args.overflow_check + output_file_name = result.output_file_name + if not output_file_name: + output_file_name=f'build_{current_time}.vis' + logger.info(f'Start exporting graph for {output_file_name}...') output_path = os.path.join(out_path, output_file_name) - GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, overflow_check=overflow_check)) - logger.info(f'Model graph built successfully, the result file is saved in {output_path}') + try: + GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, overflow_check=overflow_check)) + logger.info(f'Model graph exported successfully, the result file is saved in {output_path}') + return None + except RuntimeError as e: + logger.error(f'Failed to export model graph, file: {output_file_name}, error: {e}') + return output_file_name def _compare_graph_ranks(input_param, args, step=None): @@ -122,33 +188,48 @@ def _compare_graph_ranks(input_param, args, step=None): if npu_ranks != bench_ranks: logger.error('The number of ranks in the two runs are different. Unable to match the ranks.') raise CompareException(CompareException.INVALID_PATH_ERROR) + mp_res_dict = {} compare_graph_results = [] - for nr, br in zip(npu_ranks, bench_ranks): - logger.info(f'Start processing data for {nr}...') - input_param['npu_path'] = os.path.join(dump_rank_n, nr) - input_param['bench_path'] = os.path.join(dump_rank_b, br) - output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis' - result = _compare_graph(input_param, args) - result.output_file_name = output_file_name - if nr != Const.RANK: + with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: + def err_call(err): + logger.error(f'Error occurred while comparing graph ranks: {err}') try: - result.rank = int(nr.replace(Const.RANK, "")) - except Exception as e: - logger.error('The folder name format is incorrect, expected rank+number.') - raise CompareException(CompareException.INVALID_PATH_ERROR) from e - # 暂存所有rank的graph,用于匹配rank间的分布式节点 - compare_graph_results.append(result) - - # 匹配rank间的分布式节点 - if len(compare_graph_results) > 1: - DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results}, - args.overflow_check).distributed_match() - DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results}, - args.overflow_check).distributed_match() - - for result in compare_graph_results: - _export_compare_graph_result(args, [result.graph_n, result.graph_b], result.graph_comparator, - result.micro_steps, output_file_name=result.output_file_name) + pool.terminate() + except OSError as e: + logger.error(f'Error occurred while terminating the pool: {e}') + + serializable_args = SerializableArgs(args) + for nr, br in zip(npu_ranks, bench_ranks): + input_param['npu_path'] = os.path.join(dump_rank_n, nr) + input_param['bench_path'] = os.path.join(dump_rank_b, br) + output_file_name = f'compare_{step}_{nr}_{current_time}.vis' if step else f'compare_{nr}_{current_time}.vis' + mp_res_dict[output_file_name] = pool.apply_async(_run_build_graph_compare, + args=(input_param, serializable_args, nr, br), + error_callback=err_call) + + for output_file_name, mp_res in mp_res_dict.items(): + # 暂存所有rank的graph,用于匹配rank间的分布式节点 + compare_graph_results.append(_run_graph_compare(mp_res.get(), input_param, serializable_args, output_file_name)) + + # 匹配rank间的分布式节点 + if len(compare_graph_results) > 1: + DistributedAnalyzer({obj.rank: obj.graph_n for obj in compare_graph_results}, + args.overflow_check).distributed_match() + DistributedAnalyzer({obj.rank: obj.graph_b for obj in compare_graph_results}, + args.overflow_check).distributed_match() + + export_res_task_list = [] + create_directory(args.output_path) + for result in compare_graph_results: + export_res_task_list.append(pool.apply_async(_export_compare_graph_result, + args=(serializable_args, result), + error_callback=err_call)) + export_res_list = [res.get() for res in export_res_task_list] + if any(export_res_list): + failed_names = list(filter(lambda x: x, export_res_list)) + logger.error(f'Unable to export compare graph results: {", ".join(failed_names)}.') + else: + logger.info('Successfully exported compare graph results.') def _compare_graph_steps(input_param, args): @@ -172,28 +253,39 @@ def _compare_graph_steps(input_param, args): def _build_graph_ranks(dump_ranks_path, args, step=None): ranks = sorted(check_and_return_dir_contents(dump_ranks_path, Const.RANK)) - build_graph_results = [] - for rank in ranks: - logger.info(f'Start processing data for {rank}...') - dump_path = os.path.join(dump_ranks_path, rank) - output_file_name = f'build_{step}_{rank}_{current_time}.vis' if step else f'build_{rank}_{current_time}.vis' - result = _build_graph(dump_path, args) - result.output_file_name = output_file_name - if rank != Const.RANK: + serializable_args = SerializableArgs(args) + with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: + def err_call(err): + logger.error(f'Error occurred while comparing graph ranks: {err}') try: - result.rank = int(rank.replace(Const.RANK, "")) - except Exception as e: - logger.error('The folder name format is incorrect, expected rank+number.') - raise CompareException(CompareException.INVALID_PATH_ERROR) from e - build_graph_results.append(result) - - if len(build_graph_results) > 1: - DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results}, - args.overflow_check).distributed_match() + pool.terminate() + except OSError as e: + logger.error(f'Error occurred while terminating the pool: {e}') + + build_graph_tasks = [] + for rank in ranks: + build_graph_tasks.append(pool.apply_async(_run_build_graph_single, + args=(dump_ranks_path, rank, step, serializable_args)), + error_callback=err_call) + build_graph_results = [task.get() for task in build_graph_tasks] + + if len(build_graph_results) > 1: + DistributedAnalyzer({obj.rank: obj.graph for obj in build_graph_results}, + args.overflow_check).distributed_match() + + create_directory(args.output_path) + export_build_graph_tasks = [] + for result in build_graph_results: + export_build_graph_tasks.append(pool.apply_async(_export_build_graph_result, + args=(serializable_args, result), + error_callback=err_call)) + export_build_graph_result = [task.get() for task in export_build_graph_tasks] + if any(export_build_graph_result): + failed_names = list(filter(lambda x: x, export_build_graph_result)) + logger.error(f'Unable to export build graph results: {", ".join(failed_names)}.') + else: + logger.info(f'Successfully exported build graph results.') - for result in build_graph_results: - _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check, - result.output_file_name) def _build_graph_steps(dump_steps_path, args): @@ -233,8 +325,9 @@ def _graph_service_command(args): elif content == GraphConst.STEPS: _build_graph_steps(npu_path, args) else: - result = _build_graph(npu_path, args) - _export_build_graph_result(args.output_path, result.graph, result.micro_steps, args.overflow_check) + result = _build_graph_result(npu_path, args) + create_directory(args.output_path) + _export_build_graph_result(args, result) elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR: content_n = check_directory_content(npu_path) content_b = check_directory_content(bench_path) @@ -245,9 +338,9 @@ def _graph_service_command(args): elif content_n == GraphConst.STEPS: _compare_graph_steps(input_param, args) else: - result = _compare_graph(input_param, args) - _export_compare_graph_result(args, [result.graph_n, result.graph_b], - result.graph_comparator, result.micro_steps) + result = _compare_graph_result(input_param, args) + create_directory(args.output_path) + _export_compare_graph_result(args, result) else: logger.error("The npu_path or bench_path should be a folder.") raise CompareException(CompareException.INVALID_COMPARE_MODE) diff --git a/debug/accuracy_tools/msprobe/visualization/utils.py b/debug/accuracy_tools/msprobe/visualization/utils.py index 5f428697bd..059d3eaac0 100644 --- a/debug/accuracy_tools/msprobe/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/visualization/utils.py @@ -16,6 +16,7 @@ import os import re import json +import pickle from msprobe.core.common.file_utils import FileOpen from msprobe.core.common.const import CompareConst, Const from msprobe.core.compare.acc_compare import Comparator, ModeConfig @@ -192,3 +193,21 @@ class GraphConst: OP = 'op' PEER = 'peer' GROUP_ID = 'group_id' + + +def is_serializable(obj): + """ + Check if an object is serializable + """ + try: + pickle.dumps(obj) + return True + except (pickle.PicklingError, TypeError): + return False + + +class SerializableArgs: + def __init__(self, args): + for k, v in vars(args).items(): + if is_serializable(v): + setattr(self, k, v) -- Gitee From 9df4f5bb141db0454e726d155e4ed6c00326bcac Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Fri, 11 Apr 2025 16:53:05 +0800 Subject: [PATCH 03/12] =?UTF-8?q?=E6=89=93=E5=8D=B0=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E6=A0=A1=E9=AA=8C=E6=97=A5=E5=BF=97=E5=8F=AA=E6=89=93=E4=B8=80?= =?UTF-8?q?=E6=AC=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/common/file_utils.py | 18 ++++++++++++------ .../msprobe/visualization/utils.py | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index 991c49c62d..caea85a806 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -43,12 +43,13 @@ class FileChecker: file_type(str): The correct file type for file """ - def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True, printed_file_dict={}): self.file_path = file_path self.path_type = self._check_path_type(path_type) self.ability = ability self.file_type = file_type self.is_script = is_script + self.printed_file_dict = printed_file_dict @staticmethod def _check_path_type(path_type): @@ -74,7 +75,7 @@ class FileChecker: check_common_file_size(self.file_path) check_file_suffix(self.file_path, self.file_type) if self.path_type == FileCheckConst.FILE: - check_dirpath_before_read(self.file_path) + check_dirpath_before_read(self.file_path, self.printed_file_dict) return self.file_path def check_path_ability(self): @@ -99,11 +100,12 @@ class FileOpen: SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] - def __init__(self, file_path, mode, encoding='utf-8'): + def __init__(self, file_path, mode, encoding='utf-8', printed_file_dict={}): self.file_path = file_path self.mode = mode self.encoding = encoding self._handle = None + self.printed_file_dict = printed_file_dict def __enter__(self): self.check_file_path() @@ -130,7 +132,7 @@ class FileOpen: check_path_pattern_valid(self.file_path) if os.path.exists(self.file_path): check_common_file_size(self.file_path) - check_dirpath_before_read(self.file_path) + check_dirpath_before_read(self.file_path, self.printed_file_dict) def check_ability_and_owner(self): if self.mode in self.SUPPORT_READ_MODE: @@ -295,11 +297,15 @@ def check_path_before_create(path): 'The file path {} contains special characters.'.format(path)) -def check_dirpath_before_read(path): +def check_dirpath_before_read(path, printed_dict): path = os.path.realpath(path) dirpath = os.path.dirname(path) if check_others_writable(dirpath): - logger.warning(f"The directory is writable by others: {dirpath}.") + printed_set = printed_dict.get('check_dirpath_before_read', set()) + if dirpath not in printed_set: + logger.warning(f"The directory is writable by others: {dirpath}.") + printed_set.add(dirpath) + printed_dict['check_dirpath_before_read'] = printed_set try: check_path_owner_consistent(dirpath) except FileCheckException: diff --git a/debug/accuracy_tools/msprobe/visualization/utils.py b/debug/accuracy_tools/msprobe/visualization/utils.py index 059d3eaac0..4193207ee3 100644 --- a/debug/accuracy_tools/msprobe/visualization/utils.py +++ b/debug/accuracy_tools/msprobe/visualization/utils.py @@ -202,7 +202,7 @@ def is_serializable(obj): try: pickle.dumps(obj) return True - except (pickle.PicklingError, TypeError): + except Exception: return False -- Gitee From 438b5216d5a3a61cd7a098b509543d1c2263f9d3 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Fri, 11 Apr 2025 18:52:17 +0800 Subject: [PATCH 04/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut=E5=AF=B9=E4=B8=9A?= =?UTF-8?q?=E5=8A=A1=E4=BB=A3=E7=A0=81=E7=9A=84=E5=BD=B1=E5=93=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pytorch/api_accuracy_checker/run_ut/multi_run_ut.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py index 4ac42ac81e..763a4505b2 100644 --- a/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py +++ b/debug/accuracy_tools/msprobe/pytorch/api_accuracy_checker/run_ut/multi_run_ut.py @@ -87,10 +87,6 @@ def signal_handler(signum, frame): raise KeyboardInterrupt() -signal.signal(signal.SIGINT, signal_handler) -signal.signal(signal.SIGTERM, signal_handler) - - ParallelUTConfig = namedtuple('ParallelUTConfig', ['api_files', 'out_path', 'num_splits', 'save_error_data_flag', 'jit_compile_flag', 'device_id', 'result_csv_path', 'total_items', 'config_path']) @@ -217,6 +213,8 @@ def prepare_config(args): def main(): + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) parser = argparse.ArgumentParser(description='Run UT in parallel') _run_ut_parser(parser) parser.add_argument('-n', '--num_splits', type=int, choices=range(1, 65), default=8, -- Gitee From 0c21739546ee00c202ac4f4bb33170de4bb78a8f Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Mon, 14 Apr 2025 16:50:56 +0800 Subject: [PATCH 05/12] =?UTF-8?q?=E5=9B=9E=E9=80=80=E4=B9=8B=E5=89=8D?= =?UTF-8?q?=E7=9A=84=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/ccsrc/base/Environment.cpp | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp b/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp index 00c59afa66..3a31e03cf8 100644 --- a/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp +++ b/debug/accuracy_tools/msprobe/ccsrc/base/Environment.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include "utils/CPythonUtils.hpp" #include "DebuggerConfig.hpp" #include "Environment.hpp" @@ -22,42 +21,11 @@ namespace MindStudioDebugger { namespace Environment { -static bool IsOthersWritable(const char* module_name) -{ - Py_Initialize(); - PyObject* pName = PyUnicode_DecodeFSDefault(module_name); - PyObject* pModule = PyImport_Import(pName); - Py_DECREF(pName); - if (pModule == nullptr) { - Py_Finalize(); - return true; - } - PyObject* pFile = PyObject_GetAttrString(pModule, "__file__"); - if (pFile == nullptr) { - Py_DECREF(pModule); - Py_Finalize(); - return true; - } - const char* filepath = PyUnicode_AsUTF8(pFile); - Py_DECREF(pFile); - Py_DECREF(pModule); - Py_Finalize(); - - struct stat fileStat; - if (stat(filepath, &fileStat) < 0) { - return true; - } - return (fileStat.st_mode & S_IWOTH) != 0; -} - static int32_t GetRankID_PT() { /* if torch.distributed.is_initialized(): * return torch.distributed.get_rank() */ - if (IsOthersWritable("torch.distributed")) { - return -1; - } CPythonUtils::PythonObject torch = CPythonUtils::PythonObject::Import("torch"); if (!torch.IsModule()) { return -1; -- Gitee From fb66f7d005d1872af5534959fefb3154eacaf796 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 10:52:46 +0800 Subject: [PATCH 06/12] =?UTF-8?q?=E5=9B=9E=E9=80=80=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/core/common/file_utils.py | 18 ++++------- .../visualization/builder/graph_builder.py | 4 +-- .../msprobe/visualization/graph_service.py | 30 ++++++++++++------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/debug/accuracy_tools/msprobe/core/common/file_utils.py b/debug/accuracy_tools/msprobe/core/common/file_utils.py index caea85a806..991c49c62d 100644 --- a/debug/accuracy_tools/msprobe/core/common/file_utils.py +++ b/debug/accuracy_tools/msprobe/core/common/file_utils.py @@ -43,13 +43,12 @@ class FileChecker: file_type(str): The correct file type for file """ - def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True, printed_file_dict={}): + def __init__(self, file_path, path_type, ability=None, file_type=None, is_script=True): self.file_path = file_path self.path_type = self._check_path_type(path_type) self.ability = ability self.file_type = file_type self.is_script = is_script - self.printed_file_dict = printed_file_dict @staticmethod def _check_path_type(path_type): @@ -75,7 +74,7 @@ class FileChecker: check_common_file_size(self.file_path) check_file_suffix(self.file_path, self.file_type) if self.path_type == FileCheckConst.FILE: - check_dirpath_before_read(self.file_path, self.printed_file_dict) + check_dirpath_before_read(self.file_path) return self.file_path def check_path_ability(self): @@ -100,12 +99,11 @@ class FileOpen: SUPPORT_WRITE_MODE = ["w", "wb", "a", "ab"] SUPPORT_READ_WRITE_MODE = ["r+", "rb+", "w+", "wb+", "a+", "ab+"] - def __init__(self, file_path, mode, encoding='utf-8', printed_file_dict={}): + def __init__(self, file_path, mode, encoding='utf-8'): self.file_path = file_path self.mode = mode self.encoding = encoding self._handle = None - self.printed_file_dict = printed_file_dict def __enter__(self): self.check_file_path() @@ -132,7 +130,7 @@ class FileOpen: check_path_pattern_valid(self.file_path) if os.path.exists(self.file_path): check_common_file_size(self.file_path) - check_dirpath_before_read(self.file_path, self.printed_file_dict) + check_dirpath_before_read(self.file_path) def check_ability_and_owner(self): if self.mode in self.SUPPORT_READ_MODE: @@ -297,15 +295,11 @@ def check_path_before_create(path): 'The file path {} contains special characters.'.format(path)) -def check_dirpath_before_read(path, printed_dict): +def check_dirpath_before_read(path): path = os.path.realpath(path) dirpath = os.path.dirname(path) if check_others_writable(dirpath): - printed_set = printed_dict.get('check_dirpath_before_read', set()) - if dirpath not in printed_set: - logger.warning(f"The directory is writable by others: {dirpath}.") - printed_set.add(dirpath) - printed_dict['check_dirpath_before_read'] = printed_set + logger.warning(f"The directory is writable by others: {dirpath}.") try: check_path_owner_consistent(dirpath) except FileCheckException: diff --git a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py index 3479a9d6e3..07e7400e8d 100644 --- a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py +++ b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py @@ -288,7 +288,7 @@ class GraphExportConfig: class GraphInfo: - def __init__(self, graph:Graph, construct_path:str, data_path:str, stack_path:str): + def __init__(self, graph: Graph, construct_path: str, data_path: str, stack_path: str): self.graph = graph self.construct_path = construct_path self.data_path = data_path @@ -296,7 +296,7 @@ class GraphInfo: class BuildGraphTaskInfo: - def __init__(self, graph_info_n:GraphInfo, graph_info_b:GraphInfo, npu_rank, bench_rank, time_str): + def __init__(self, graph_info_n: GraphInfo, graph_info_b: GraphInfo, npu_rank, bench_rank, time_str): self.graph_info_n = graph_info_n self.graph_info_b = graph_info_b self.npu_rank = npu_rank diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index 80e8e031e2..af7c0c4363 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -25,7 +25,7 @@ from msprobe.core.overflow_check.checker import AnomalyDetector from msprobe.visualization.compare.graph_comparator import GraphComparator from msprobe.visualization.utils import GraphConst, check_directory_content, SerializableArgs from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig, GraphInfo, BuildGraphTaskInfo -from msprobe.core.common.log import logger +from msprobe.core.common.log import logger, LogService from msprobe.visualization.graph.node_colors import NodeColors from msprobe.core.compare.layer_mapping import generate_api_mapping_by_layer_mapping from msprobe.core.compare.utils import check_and_return_dir_contents @@ -34,7 +34,7 @@ from msprobe.visualization.graph.distributed_analyzer import DistributedAnalyzer current_time = time.strftime("%Y%m%d%H%M%S") -def _compare_graph(graph_n:GraphInfo, graph_b:GraphInfo, input_param, args): +def _compare_graph(graph_n: GraphInfo, graph_b: GraphInfo, input_param, args): dump_path_param = { 'npu_json_path': graph_n.data_path, 'bench_json_path': graph_b.data_path, @@ -168,11 +168,12 @@ def _export_build_graph_result(args, result): overflow_check = args.overflow_check output_file_name = result.output_file_name if not output_file_name: - output_file_name=f'build_{current_time}.vis' + output_file_name = f'build_{current_time}.vis' logger.info(f'Start exporting graph for {output_file_name}...') output_path = os.path.join(out_path, output_file_name) try: - GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, overflow_check=overflow_check)) + GraphBuilder.to_json(output_path, GraphExportConfig(graph, micro_steps=micro_steps, + overflow_check=overflow_check)) logger.info(f'Model graph exported successfully, the result file is saved in {output_path}') return None except RuntimeError as e: @@ -190,7 +191,8 @@ def _compare_graph_ranks(input_param, args, step=None): raise CompareException(CompareException.INVALID_PATH_ERROR) mp_res_dict = {} compare_graph_results = [] - with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: + with Pool(processes=max(int((cpu_count() + 1) // 4), 1), initializer=LogService.set_queue, + initargs=(LogService.queue,)) as pool: def err_call(err): logger.error(f'Error occurred while comparing graph ranks: {err}') try: @@ -209,7 +211,8 @@ def _compare_graph_ranks(input_param, args, step=None): for output_file_name, mp_res in mp_res_dict.items(): # 暂存所有rank的graph,用于匹配rank间的分布式节点 - compare_graph_results.append(_run_graph_compare(mp_res.get(), input_param, serializable_args, output_file_name)) + compare_graph_results.append(_run_graph_compare(mp_res.get(), input_param, serializable_args, + output_file_name)) # 匹配rank间的分布式节点 if len(compare_graph_results) > 1: @@ -254,7 +257,8 @@ def _compare_graph_steps(input_param, args): def _build_graph_ranks(dump_ranks_path, args, step=None): ranks = sorted(check_and_return_dir_contents(dump_ranks_path, Const.RANK)) serializable_args = SerializableArgs(args) - with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: + with Pool(processes=max(int((cpu_count() + 1) // 4), 1), initializer=LogService.set_queue, + initargs=(LogService.queue,)) as pool: def err_call(err): logger.error(f'Error occurred while comparing graph ranks: {err}') try: @@ -265,8 +269,8 @@ def _build_graph_ranks(dump_ranks_path, args, step=None): build_graph_tasks = [] for rank in ranks: build_graph_tasks.append(pool.apply_async(_run_build_graph_single, - args=(dump_ranks_path, rank, step, serializable_args)), - error_callback=err_call) + args=(dump_ranks_path, rank, step, serializable_args), + error_callback=err_call)) build_graph_results = [task.get() for task in build_graph_tasks] if len(build_graph_results) > 1: @@ -327,7 +331,9 @@ def _graph_service_command(args): else: result = _build_graph_result(npu_path, args) create_directory(args.output_path) - _export_build_graph_result(args, result) + file_name = _export_build_graph_result(args, result) + if file_name: + logger.error('Failed to export model build graph.') elif check_file_type(npu_path) == FileCheckConst.DIR and check_file_type(bench_path) == FileCheckConst.DIR: content_n = check_directory_content(npu_path) content_b = check_directory_content(bench_path) @@ -340,7 +346,9 @@ def _graph_service_command(args): else: result = _compare_graph_result(input_param, args) create_directory(args.output_path) - _export_compare_graph_result(args, result) + file_name = _export_compare_graph_result(args, result) + if file_name: + logger.error('Failed to export model compare graph.') else: logger.error("The npu_path or bench_path should be a folder.") raise CompareException(CompareException.INVALID_COMPARE_MODE) -- Gitee From 0d3d3dfee63174ea59791401835ad506b3168344 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 10:54:51 +0800 Subject: [PATCH 07/12] =?UTF-8?q?=E5=9B=9E=E9=80=80=E6=97=A5=E5=BF=97?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../accuracy_tools/msprobe/visualization/graph_service.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index af7c0c4363..a057bd0a28 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -25,7 +25,7 @@ from msprobe.core.overflow_check.checker import AnomalyDetector from msprobe.visualization.compare.graph_comparator import GraphComparator from msprobe.visualization.utils import GraphConst, check_directory_content, SerializableArgs from msprobe.visualization.builder.graph_builder import GraphBuilder, GraphExportConfig, GraphInfo, BuildGraphTaskInfo -from msprobe.core.common.log import logger, LogService +from msprobe.core.common.log import logger from msprobe.visualization.graph.node_colors import NodeColors from msprobe.core.compare.layer_mapping import generate_api_mapping_by_layer_mapping from msprobe.core.compare.utils import check_and_return_dir_contents @@ -191,8 +191,7 @@ def _compare_graph_ranks(input_param, args, step=None): raise CompareException(CompareException.INVALID_PATH_ERROR) mp_res_dict = {} compare_graph_results = [] - with Pool(processes=max(int((cpu_count() + 1) // 4), 1), initializer=LogService.set_queue, - initargs=(LogService.queue,)) as pool: + with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: def err_call(err): logger.error(f'Error occurred while comparing graph ranks: {err}') try: @@ -257,8 +256,7 @@ def _compare_graph_steps(input_param, args): def _build_graph_ranks(dump_ranks_path, args, step=None): ranks = sorted(check_and_return_dir_contents(dump_ranks_path, Const.RANK)) serializable_args = SerializableArgs(args) - with Pool(processes=max(int((cpu_count() + 1) // 4), 1), initializer=LogService.set_queue, - initargs=(LogService.queue,)) as pool: + with Pool(processes=max(int((cpu_count() + 1) // 4), 1)) as pool: def err_call(err): logger.error(f'Error occurred while comparing graph ranks: {err}') try: -- Gitee From ab61f7fd5ea96ea6a9ed6a7c0445a2427ecaeb85 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 11:35:17 +0800 Subject: [PATCH 08/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/visualization_ut/test_graph_service.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py index 7dfd9564eb..91dbe84b73 100644 --- a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py +++ b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py @@ -7,7 +7,7 @@ import argparse from dataclasses import dataclass from unittest.mock import patch -from msprobe.visualization.graph_service import _compare_graph, _build_graph, _compare_graph_ranks, \ +from msprobe.visualization.graph_service import _compare_graph_result, _build_graph_result, _compare_graph_ranks, \ _compare_graph_steps, _build_graph_ranks, _build_graph_steps, _graph_service_command, _graph_service_parser from msprobe.core.common.utils import CompareException @@ -48,27 +48,27 @@ class TestGraphService(unittest.TestCase): self.assertTrue(os.path.exists(os.path.join(self.output, matches[0]))) @patch('msprobe.core.common.log.logger.info') - def test_compare_graph(self, mock_log_info): + def test_compare_graph_result(self, mock_log_info): args = Args(output_path=self.output, framework='pytorch') - result = _compare_graph(self.input_param, args) + result = _compare_graph_result(self.input_param, args) self.assertEqual(mock_log_info.call_count, 2) self.assertIsNotNone(result) args = Args(output_path=self.output, framework='mindspore') - result = _compare_graph(self.input_param, args) + result = _compare_graph_result(self.input_param, args) self.assertIsNotNone(result) args = Args(output_path=self.output, framework='pytorch', layer_mapping=self.layer_mapping) - result = _compare_graph(self.input_param, args) + result = _compare_graph_result(self.input_param, args) self.assertIsNotNone(result) args = Args(output_path=self.output, framework='pytorch', overflow_check=True) - result = _compare_graph(self.input_param, args) + result = _compare_graph_result(self.input_param, args) self.assertIsNotNone(result) @patch('msprobe.core.common.log.logger.info') - def test_build_graph(self, mock_log_info): - result = _build_graph(os.path.join(self.input, 'step0', 'rank0'), Args(overflow_check=True)) + def test_build_graph_result(self, mock_log_info): + result = _build_graph_result(os.path.join(self.input, 'step0', 'rank0'), Args(overflow_check=True)) self.assertEqual(mock_log_info.call_count, 1) self.assertIsNotNone(result) -- Gitee From 15ecd85b2b209ce08c54ce36919f9d1a5616bb97 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 16:07:25 +0800 Subject: [PATCH 09/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../test/visualization_ut/test_graph_service.py | 14 +++++++------- .../msprobe/visualization/graph_service.py | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py index 91dbe84b73..278d695c71 100644 --- a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py +++ b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py @@ -81,7 +81,7 @@ class TestGraphService(unittest.TestCase): } args = Args(output_path=self.output, framework='pytorch') _compare_graph_ranks(input_param, args) - self.assert_log_info(mock_log_info) + self.assert_log_info(mock_log_info, 'Successfully exported compare graph results.') input_param1 = { 'npu_path': os.path.join(self.input, 'step0'), @@ -101,7 +101,7 @@ class TestGraphService(unittest.TestCase): } args = Args(output_path=self.output, framework='pytorch') _compare_graph_steps(input_param, args) - self.assert_log_info(mock_log_info) + self.assert_log_info(mock_log_info, 'Successfully exported compare graph results.') input_param1 = { 'npu_path': self.input, @@ -115,12 +115,12 @@ class TestGraphService(unittest.TestCase): @patch('msprobe.core.common.log.logger.info') def test_build_graph_ranks(self, mock_log_info): _build_graph_ranks(os.path.join(self.input, 'step0'), Args(output_path=self.output)) - self.assert_log_info(mock_log_info, "Model graph built successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Successfully exported build graph results.") @patch('msprobe.core.common.log.logger.info') def test_build_graph_steps(self, mock_log_info): _build_graph_steps(self.input, Args(output_path=self.output)) - self.assert_log_info(mock_log_info, "Model graph built successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Successfully exported build graph results.") @patch('msprobe.core.common.log.logger.info') def test_graph_service_command(self, mock_log_info): @@ -129,7 +129,7 @@ class TestGraphService(unittest.TestCase): args = Args(input_path=self.output_json[0], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info) + self.assert_log_info(mock_log_info, 'Exporting compare graph result successfully, the result file is saved in') input_param1 = { 'npu_path': os.path.join(self.input, 'step0', 'rank0'), @@ -150,7 +150,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param2, f, indent=4) args = Args(input_path=self.output_json[2], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info) + self.assert_log_info(mock_log_info, 'Exporting compare graph result successfully, the result file is saved in') input_param3 = { 'npu_path': self.input, @@ -161,7 +161,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param3, f, indent=4) args = Args(input_path=self.output_json[3], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info) + self.assert_log_info(mock_log_info, 'Exporting compare graph result successfully, the result file is saved in') input_param4 = { 'npu_path': os.path.join(self.input, 'step0'), diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index a057bd0a28..31831cd1ae 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -105,6 +105,7 @@ def _build_graph_info(dump_path, args): def _build_graph_result(dump_path, args): + logger.info('Start building model graphs...') graph = _build_graph_info(dump_path, args).graph # 增加micro step标记 micro_steps = graph.paging_by_micro_step() -- Gitee From c2fa97794f75616163c68916c24c83152d00c9c7 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 16:28:37 +0800 Subject: [PATCH 10/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/test/visualization_ut/test_graph_service.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py index 278d695c71..e9cc4238de 100644 --- a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py +++ b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py @@ -45,7 +45,8 @@ class TestGraphService(unittest.TestCase): last_call_args = mock_log_info.call_args[0][0] self.assertIn(log_info, last_call_args) matches = re.findall(self.pattern, last_call_args) - self.assertTrue(os.path.exists(os.path.join(self.output, matches[0]))) + if matches: + self.assertTrue(os.path.exists(os.path.join(self.output, matches[0]))) @patch('msprobe.core.common.log.logger.info') def test_compare_graph_result(self, mock_log_info): @@ -139,7 +140,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param1, f, indent=4) args = Args(input_path=self.output_json[1], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, "Model graph built successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Model graph exported successfully, the result file is saved in") input_param2 = { 'npu_path': os.path.join(self.input, 'step0'), @@ -171,7 +172,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param4, f, indent=4) args = Args(input_path=self.output_json[4], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, "Model graph built successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Model graph exported successfully, the result file is saved in") input_param5 = { 'npu_path': self.input, @@ -181,7 +182,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param5, f, indent=4) args = Args(input_path=self.output_json[5], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, "Model graph built successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Model graph exported successfully, the result file is saved in") input_param6 = { 'npu_path': self.input, -- Gitee From 55ed047cf6a5eaf215469e66f5d4e561362f9411 Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 16:43:26 +0800 Subject: [PATCH 11/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/test/visualization_ut/test_graph_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py index e9cc4238de..2a669bf8f5 100644 --- a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py +++ b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py @@ -151,7 +151,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param2, f, indent=4) args = Args(input_path=self.output_json[2], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, 'Exporting compare graph result successfully, the result file is saved in') + self.assert_log_info(mock_log_info, 'Successfully exported compare graph results.') input_param3 = { 'npu_path': self.input, @@ -162,7 +162,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param3, f, indent=4) args = Args(input_path=self.output_json[3], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, 'Exporting compare graph result successfully, the result file is saved in') + self.assert_log_info(mock_log_info, 'Successfully exported compare graph results.') input_param4 = { 'npu_path': os.path.join(self.input, 'step0'), -- Gitee From 5f5e39461e8c3b7e7312bad5bdf498ed448caaff Mon Sep 17 00:00:00 2001 From: wangchao_the_second Date: Tue, 15 Apr 2025 16:58:22 +0800 Subject: [PATCH 12/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9ut?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/test/visualization_ut/test_graph_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py index 2a669bf8f5..f9ca5592aa 100644 --- a/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py +++ b/debug/accuracy_tools/msprobe/test/visualization_ut/test_graph_service.py @@ -172,7 +172,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param4, f, indent=4) args = Args(input_path=self.output_json[4], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, "Model graph exported successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Successfully exported build graph results.") input_param5 = { 'npu_path': self.input, @@ -182,7 +182,7 @@ class TestGraphService(unittest.TestCase): json.dump(input_param5, f, indent=4) args = Args(input_path=self.output_json[5], output_path=self.output, framework='pytorch') _graph_service_command(args) - self.assert_log_info(mock_log_info, "Model graph exported successfully, the result file is saved in") + self.assert_log_info(mock_log_info, "Successfully exported build graph results.") input_param6 = { 'npu_path': self.input, -- Gitee