From 6f141f6ec130f3b487b220cff9788d91903ea2e6 Mon Sep 17 00:00:00 2001 From: z30043230 Date: Sat, 19 Jul 2025 17:51:20 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dlevel0=E5=9C=BA=E6=99=AF?= =?UTF-8?q?=E4=B8=8Bcluster=5Fstep=5Ftrace=5Ftime.csv=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E7=94=9F=E6=88=90=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- profiler/msprof_analyze/cluster_analyse/README.md | 2 +- .../analysis/stage_group_analysis.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/profiler/msprof_analyze/cluster_analyse/README.md b/profiler/msprof_analyze/cluster_analyse/README.md index 2ab5cf6461..99bbff91e7 100644 --- a/profiler/msprof_analyze/cluster_analyse/README.md +++ b/profiler/msprof_analyze/cluster_analyse/README.md @@ -120,7 +120,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig( A列: Step数,是采集性能数据时设置的,一般来说集群性能数据采集一个step足够,如果采集多个step,需要先筛选一下。 -B列: Type,主要分两种,rank和stage,和后面的Index强相关,可以理解为一个是单卡rank,一个是rank group(pp 并行的stage),如果type为stage,则后面D-K列信息为rank group下的最大值。 +B列: Type,主要分两种,rank和stage,和后面的Index强相关,可以理解为一个是单卡rank,一个是rank group(pp 并行的stage),如果type为stage,则后面D-K列信息为rank group下的最大值(**Level0场景下没有stage类型**)。 C列:Index,与type相关,表示卡号。 diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py index bdf832d3f8..0a386e7871 100644 --- a/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py +++ b/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py @@ -70,11 +70,17 @@ class StageInfoAnalysis: return None # read comm_group_parallel_info from communication_group.json group_data = FileManager.read_json_file(communication_group_json) - if Constant.KEY_COMM_GROUP_PARALLEL_INFO not in group_data: + if (Constant.KEY_COMM_GROUP_PARALLEL_INFO not in group_data or not + group_data.get(Constant.KEY_COMM_GROUP_PARALLEL_INFO)): logger.warning(f"{Constant.KEY_COMM_GROUP_PARALLEL_INFO} not in {Constant.COMMUNICATION_GROUP_JSON}") return None # convert to dataframe comm_group_df = pd.DataFrame(group_data.get(Constant.KEY_COMM_GROUP_PARALLEL_INFO)) + expected_columns = [TableConstant.TYPE, TableConstant.RANK_SET, TableConstant.GROUP_NAME, + TableConstant.GROUP_ID, TableConstant.PG_NAME] + if list(comm_group_df.columns) != expected_columns: + logger.error(f"{Constant.COMMUNICATION_GROUP_JSON} has unexpected columns: {comm_group_df.columns}") + return None comm_group_df[TableConstant.RANK_SET] = comm_group_df[TableConstant.RANK_SET].apply(set) return comm_group_df @@ -95,7 +101,11 @@ class StageInfoAnalysis: if comm_group_df is None or comm_group_df.empty: logger.error(f"There is no {table_communication_group} data in {cluster_analysis_db}.") return None - + expected_columns = [TableConstant.TYPE, TableConstant.RANK_SET, TableConstant.GROUP_NAME, + TableConstant.GROUP_ID, TableConstant.PG_NAME] + if list(comm_group_df.columns) != expected_columns: + logger.error(f"{Constant.COMMUNICATION_GROUP_JSON} has unexpected columns: {comm_group_df.columns}") + return None # process rank_set comm_group_df[TableConstant.RANK_SET] = comm_group_df[TableConstant.RANK_SET].apply( lambda s: set(map(int, s.strip('()').split(',')))) -- Gitee