diff --git a/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_checker.py b/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_checker.py index b301e343f131faa6262faed5a99976c78065ffa9..5a7f9e3f95c2e283e69a3a4b3e60b5c8ac179d5f 100644 --- a/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_checker.py +++ b/profiler/msprof_analyze/advisor/analyzer/computation/ai_core_performance/ai_core_performance_checker.py @@ -538,7 +538,9 @@ class AICorePerformanceChecker: suggestion = "" if "varlen" in op.lower(): # 处理变长算子 如果不亲和则affinity_flag为False - inner_axis = convert_to_int_with_exception(shape.split("-")[0].split(";")[0].split(",")[2]) + inner_axis = 0 + if len(shape.split("-")[0].split(";")[0].split(",")) >= 3: + inner_axis = convert_to_int_with_exception(shape.split("-")[0].split(";")[0].split(",")[2]) if inner_axis % self.INNER_AXIS_128 != 0: affinity_flag = True suggestion = self._fa_affinity_desc_head_dim_128 @@ -550,7 +552,9 @@ class AICorePerformanceChecker: else: # 处理定长算子 如果不亲和则affinity_flag为False head_dim = 0 - seq_len = convert_to_int_with_exception(shape.split("-")[1].split(";")[0].split(",")[2]) + seq_len = 0 + if len(shape.split("-")[1].split(";")[0].split(",")) >= 3: + seq_len = convert_to_int_with_exception(shape.split("-")[1].split(";")[0].split(",")[2]) input_first_tensor = shape.split("-")[0].split(";")[0].split(",") if len(input_first_tensor) == 3: head_dim = safe_division(convert_to_int_with_exception(input_first_tensor[2]), diff --git a/profiler/msprof_analyze/advisor/common/graph/graph_match.py b/profiler/msprof_analyze/advisor/common/graph/graph_match.py index 1cf2fe170d2ab8d3e29429785c7b6398cc0dd964..86b130e56818b439958f9f56552fc908867dfd13 100644 --- a/profiler/msprof_analyze/advisor/common/graph/graph_match.py +++ b/profiler/msprof_analyze/advisor/common/graph/graph_match.py @@ -297,6 +297,8 @@ def 
get_next_candidates(config: CandidateArgsConfig) -> List[Dict[Hashable, Hash # Find a longer backbone node nodes_with_maximum_backbone.append(query_node_id) + if not nodes_with_maximum_backbone: + return [] # next_node is connected to the current backbone. next_node = max(nodes_with_maximum_backbone, key=lambda x: node_priority.get(x, 0)) diff --git a/profiler/msprof_analyze/advisor/config/config.py b/profiler/msprof_analyze/advisor/config/config.py index 80057b2a5d664c38e5bc428e5b70065df074f4c7..f01a502fd176dab0d0b38847921c832d7639cfba 100644 --- a/profiler/msprof_analyze/advisor/config/config.py +++ b/profiler/msprof_analyze/advisor/config/config.py @@ -16,6 +16,7 @@ import logging import os +import html from msprof_analyze.advisor.utils.utils import Timer from msprof_analyze.prof_common.singleton import singleton @@ -107,42 +108,42 @@ class Config: @property def timeline_api_doc_url(self) -> str: try: - return self.config.get("URL", "timeline_api_doc_url") + return html.escape(self.config.get("URL", "timeline_api_doc_url")) except Exception: return "" @property def timeline_with_stack_doc_url(self) -> str: try: - return self.config.get("URL", "timeline_with_stack_doc_url") + return html.escape(self.config.get("URL", "timeline_with_stack_doc_url")) except Exception: return "" @property def pytorch_aoe_operator_tune_url(self) -> str: try: - return self.config.get("URL", "pytorch_aoe_operator_tune_url") + return html.escape(self.config.get("URL", "pytorch_aoe_operator_tune_url")) except Exception: return "" @property def mslite_infer_aoe_operator_tune_url(self) -> str: try: - return self.config.get("URL", "mslite_infer_aoe_operator_tune_url") + return html.escape(self.config.get("URL", "mslite_infer_aoe_operator_tune_url")) except Exception: return "" @property def enable_compiled_tune_url(self) -> str: try: - return self.config.get("URL", "enable_compiled_tune_url") + return html.escape(self.config.get("URL", "enable_compiled_tune_url")) except Exception: return 
"" @property def ascend_profiler_url(self) -> str: try: - return self.config.get("URL", "ascend_profiler_url") + return html.escape(self.config.get("URL", "ascend_profiler_url")) except Exception: return "" diff --git a/profiler/msprof_analyze/advisor/dataset/timeline_event_dataset.py b/profiler/msprof_analyze/advisor/dataset/timeline_event_dataset.py index 512a7ae16354be0dea91f824e14241c4592438d2..f676d82caf005ab8f637b8eebf1b9247a287e00b 100644 --- a/profiler/msprof_analyze/advisor/dataset/timeline_event_dataset.py +++ b/profiler/msprof_analyze/advisor/dataset/timeline_event_dataset.py @@ -148,15 +148,23 @@ class BaseTimelineEventDataset(Dataset): return True def parse_from_db(self): - db_helper = TimelineDBHelper(self.timeline_file) - if not db_helper.init_timeline_db_helper(): + db_helper = None + try: + db_helper = TimelineDBHelper(self.timeline_file) + if not db_helper.init_timeline_db_helper(): + return False + for _, collector in tqdm(self.collector_map.items(), leave=False, + desc="Building dataset for timeline analysis"): + for event_type in collector.get_event_type(): + df = db_helper.query_timeline_event(event_type) + collector.add_op_from_db(df) + except Exception: + logger.warning("Error %s while parsing from db, file %s", traceback.format_exc(), + self.timeline_file) return False - for _, collector in tqdm(self.collector_map.items(), leave=False, - desc="Building dataset for timeline analysis"): - for event_type in collector.get_event_type(): - df = db_helper.query_timeline_event(event_type) - collector.add_op_from_db(df) - db_helper.destroy_db_connection() + finally: + if db_helper: + db_helper.destroy_db_connection() return True def parse_data_with_generator(self, func): diff --git a/profiler/msprof_analyze/advisor/display/prompt/cn/dynamic_shape_prompt.py b/profiler/msprof_analyze/advisor/display/prompt/cn/dynamic_shape_prompt.py index c525422b9a4749d4987b245c83211efe8f5df83f..37355d9ceb503608f26c40f579a74d4ac3217a13 100644 --- 
a/profiler/msprof_analyze/advisor/display/prompt/cn/dynamic_shape_prompt.py +++ b/profiler/msprof_analyze/advisor/display/prompt/cn/dynamic_shape_prompt.py @@ -20,4 +20,4 @@ class DynamicShapePrompt(object): ENABLE_COMPILED_SUGGESTION = "在python脚本入口加入以下代码关闭在线编译:\n" \ "'torch_npu.npu.set_compile_mode(jit_compile=False) \n " \ "torch_npu.npu.config.allow_internal_format = False' \n" - RELEASE_SUGGESTION = "详细信息请参考:链接" \ No newline at end of file + RELEASE_SUGGESTION = "详细信息请参考:链接" diff --git a/profiler/msprof_analyze/advisor/display/prompt/en/operator_prompt.py b/profiler/msprof_analyze/advisor/display/prompt/en/operator_prompt.py index b1fb76f9b79e359fe20762128287df00f8839f84..e056ed1b7989b7b53658f6b621b65a91545d6ee1 100644 --- a/profiler/msprof_analyze/advisor/display/prompt/en/operator_prompt.py +++ b/profiler/msprof_analyze/advisor/display/prompt/en/operator_prompt.py @@ -22,10 +22,10 @@ class OperatorPrompt(object): "converter_lite --fmk=ONNX --optimize=ascend_oriented --saveType=MINDIR " \ "--modelFile=$user_model.onnx --outputFile=user_model " \ "--configFile=./config.txt\n" - PYTORCH_RELEASE_SUGGESTION = "for details please refer to link : LINK" + PYTORCH_RELEASE_SUGGESTION = "for details please refer to link : LINK" MSLITE_RELEASE_SUGGESTION = "\nThe config file for MSLite AOE usage is as follows:\n" \ "[ascend_context]\n" \ "aoe_mode=\"operator tuning\"\n" \ "--tune_ops_file={}\n" \ "\nFor details please refer to link : LINK" + "\"{}\" target='_blank'>LINK" diff --git a/profiler/msprof_analyze/advisor/utils/file.py b/profiler/msprof_analyze/advisor/utils/file.py index 516077ee72ea431201d0bcc3ae7aa217b3982602..8eb4240b0c2f42d7dc847ee39c911ffaa02abbfc 100644 --- a/profiler/msprof_analyze/advisor/utils/file.py +++ b/profiler/msprof_analyze/advisor/utils/file.py @@ -73,7 +73,10 @@ class FdOpen: def __exit__(self, exc_type, exc_val, exc_tb): if self.file_open: - self.file_open.close() + try: + self.file_open.close() + except Exception: + os.close(self.fd) elif 
self.fd: os.close(self.fd) diff --git a/profiler/msprof_analyze/advisor/utils/utils.py b/profiler/msprof_analyze/advisor/utils/utils.py index 001949d6e29c04837f7bb527a3bce2547f58f116..9f84263ab9c6a3361f70331ed67b3775edf874db 100644 --- a/profiler/msprof_analyze/advisor/utils/utils.py +++ b/profiler/msprof_analyze/advisor/utils/utils.py @@ -346,7 +346,6 @@ class SafeOpen: def __exit__(self, exc_type, exc_val, exc_tb): if self.file: self.file.close() - return True def get_file_path_by_walk(root, filename): diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/communication_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/communication_analysis.py index e8ca793f525b0279053bc9848f99f21016ea6295..62e24bebe03d65c0afb6abad8ffe278826eb6810 100644 --- a/profiler/msprof_analyze/cluster_analyse/analysis/communication_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/analysis/communication_analysis.py @@ -68,9 +68,11 @@ class CommunicationAnalysis(BaseAnalysis): result_db = os.path.join(output_path, Constant.DB_CLUSTER_COMMUNICATION_ANALYZER) DBManager.create_tables(result_db, self.COMMUNICATION_TIME_TABLE, self.COMMUNICATION_BANDWIDTH_TABLE) conn, cursor = DBManager.create_connect_db(result_db) - self.execute(conn, res_comm_time, self.COMMUNICATION_TIME_TABLE) - self.execute(conn, res_comm_bandwidth, self.COMMUNICATION_BANDWIDTH_TABLE) - DBManager.destroy_db_connect(conn, cursor) + try: + self.execute(conn, res_comm_time, self.COMMUNICATION_TIME_TABLE) + self.execute(conn, res_comm_bandwidth, self.COMMUNICATION_BANDWIDTH_TABLE) + finally: + DBManager.destroy_db_connect(conn, cursor) def compute_total_info(self, comm_ops: dict): if not comm_ops: diff --git a/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py b/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py index f751de56fe3d622e705c481220cf4a6760b163d0..4d23292bf4629b027acef0f164154efa517f893f 100644 --- 
a/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py +++ b/profiler/msprof_analyze/cluster_analyse/cluster_data_preprocess/msprof_data_preprocessor.py @@ -14,6 +14,7 @@ # limitations under the License. import os import re +import shlex from collections import defaultdict from msprof_analyze.cluster_analyse.cluster_data_preprocess.data_preprocessor import DataPreprocessor @@ -56,6 +57,8 @@ class MsprofDataPreprocessor(DataPreprocessor): prof_data_uid = defaultdict(list) prof_data_rank = defaultdict(list) for dir_name in self.path_list: + # 对dir_name进行转义处理,防止命令注入 + escaped_dir = shlex.quote(dir_name) info_json_file = self._find_info_json_file(dir_name) if not info_json_file: logger.error(f"Profiling data in not completed, please check the info.json file in the path {dir_name}") @@ -68,12 +71,12 @@ class MsprofDataPreprocessor(DataPreprocessor): self.data_type.add(Constant.TEXT) else: logger.error(f"The profiling data has not been fully parsed. You can parse it by executing " - f"the following command: msprof --analyze=on --output={dir_name}") + f"the following command: msprof --analyze=on --output={escaped_dir}") continue else: logger.error(f"The profiling data has not been fully parsed. 
You can parse it by executing " - f"the following command: msprof --export=on --output={dir_name}; " - f"msprof --analyze=on --output={dir_name}") + f"the following command: msprof --export=on --output={escaped_dir}; " + f"msprof --analyze=on --output={escaped_dir}") continue info_json = FileManager.read_json_file(info_json_file) rank_id = info_json.get("rank_id") diff --git a/profiler/msprof_analyze/compare_tools/compare_backend/compare_bean/origin_data_bean/operator_memory_bean.py b/profiler/msprof_analyze/compare_tools/compare_backend/compare_bean/origin_data_bean/operator_memory_bean.py index 9678b91abd1a973f4720511b9f640740bd7236cf..2f07bab67d3d1ab34bd4514d393e24ef663f7ea6 100644 --- a/profiler/msprof_analyze/compare_tools/compare_backend/compare_bean/origin_data_bean/operator_memory_bean.py +++ b/profiler/msprof_analyze/compare_tools/compare_backend/compare_bean/origin_data_bean/operator_memory_bean.py @@ -22,7 +22,7 @@ class OperatorMemoryBean: NA = "N/A" def __init__(self, data: dict): - self._data = data + self._data = data.copy() self._name = "" self._size = 0.0 self._allocation_time = Decimal(0) diff --git a/profiler/msprof_analyze/compare_tools/compare_backend/profiling_parser/npu_profiling_db_parser.py b/profiler/msprof_analyze/compare_tools/compare_backend/profiling_parser/npu_profiling_db_parser.py index b68029a57ec8398cd1e934d556034608afb0ba9b..592f2a11a29a2e3241b8875642336df26069f93a 100644 --- a/profiler/msprof_analyze/compare_tools/compare_backend/profiling_parser/npu_profiling_db_parser.py +++ b/profiler/msprof_analyze/compare_tools/compare_backend/profiling_parser/npu_profiling_db_parser.py @@ -75,6 +75,12 @@ class NPUProfilingDbParser: self.comm_task_data = [] self.compute_op_data = [] + def __del__(self): + try: + DBManager.destroy_db_connect(self.conn, self.cursor) + except Exception: + logger.warning(f"Failed to release database connection in NPUProfilingDbParser.") + def load_data(self) -> ProfilingResult: self._prepare_data() if 
self._enable_communication_compare: diff --git a/profiler/msprof_analyze/prof_common/path_manager.py b/profiler/msprof_analyze/prof_common/path_manager.py index 05970362ba2410da3ea281a4fb9a8812c9d1575a..c6ac5a1dcd4eb75029e40d7e8cab5262ea6869b9 100644 --- a/profiler/msprof_analyze/prof_common/path_manager.py +++ b/profiler/msprof_analyze/prof_common/path_manager.py @@ -179,7 +179,7 @@ class PathManager: if os.path.exists(path): return try: - os.makedirs(path, mode=cls.DATA_DIR_AUTHORITY) + os.makedirs(path, mode=cls.DATA_DIR_AUTHORITY, exist_ok=True) except Exception as err: raise RuntimeError(msg) from err