From 6f141f6ec130f3b487b220cff9788d91903ea2e6 Mon Sep 17 00:00:00 2001
From: z30043230 <zhaiyibo@huawei.com>
Date: Sat, 19 Jul 2025 17:51:20 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dlevel0=E5=9C=BA=E6=99=AF?=
 =?UTF-8?q?=E4=B8=8Bcluster=5Fstep=5Ftrace=5Ftime.csv=E6=97=A0=E6=B3=95?=
 =?UTF-8?q?=E7=94=9F=E6=88=90=E7=9A=84=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 profiler/msprof_analyze/cluster_analyse/README.md  |  2 +-
 .../analysis/stage_group_analysis.py               | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/profiler/msprof_analyze/cluster_analyse/README.md b/profiler/msprof_analyze/cluster_analyse/README.md
index 2ab5cf6461..99bbff91e7 100644
--- a/profiler/msprof_analyze/cluster_analyse/README.md
+++ b/profiler/msprof_analyze/cluster_analyse/README.md
@@ -120,7 +120,7 @@ experimental_config = torch_npu.profiler._ExperimentalConfig(
 
 A列： Step数，是采集性能数据时设置的，一般来说集群性能数据采集一个step足够，如果采集多个step，需要先筛选一下。
 
-B列： Type，主要分两种，rank和stage，和后面的Index强相关，可以理解为一个是单卡rank，一个是rank group（pp 并行的stage），如果type为stage，则后面D-K列信息为rank group下的最大值。
+B列： Type，主要分两种，rank和stage，和后面的Index强相关，可以理解为一个是单卡rank，一个是rank group（pp 并行的stage），如果type为stage，则后面D-K列信息为rank group下的最大值（**Level0场景下没有stage类型**）。
 
 C列：Index，与type相关，表示卡号。
 
diff --git a/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py b/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py
index bdf832d3f8..0a386e7871 100644
--- a/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py
+++ b/profiler/msprof_analyze/cluster_analyse/analysis/stage_group_analysis.py
@@ -70,11 +70,20 @@ class StageInfoAnalysis:
             return None
         # read comm_group_parallel_info from communication_group.json
         group_data = FileManager.read_json_file(communication_group_json)
-        if Constant.KEY_COMM_GROUP_PARALLEL_INFO not in group_data:
-            logger.warning(f"{Constant.KEY_COMM_GROUP_PARALLEL_INFO} not in {Constant.COMMUNICATION_GROUP_JSON}")
+        # a missing key and an empty value are equally unusable, so one falsy check covers both
+        if not group_data.get(Constant.KEY_COMM_GROUP_PARALLEL_INFO):
+            logger.warning(f"{Constant.KEY_COMM_GROUP_PARALLEL_INFO} is missing or empty in "
+                           f"{Constant.COMMUNICATION_GROUP_JSON}")
             return None
         # convert to dataframe
         comm_group_df = pd.DataFrame(group_data.get(Constant.KEY_COMM_GROUP_PARALLEL_INFO))
+        # validate by membership, not ordered equality: key order or extra keys must not reject usable data
+        expected_columns = [TableConstant.TYPE, TableConstant.RANK_SET, TableConstant.GROUP_NAME,
+                            TableConstant.GROUP_ID, TableConstant.PG_NAME]
+        missing_columns = [col for col in expected_columns if col not in comm_group_df.columns]
+        if missing_columns:
+            logger.error(f"{Constant.COMMUNICATION_GROUP_JSON} is missing columns: {missing_columns}")
+            return None
         comm_group_df[TableConstant.RANK_SET] = comm_group_df[TableConstant.RANK_SET].apply(set)
         return comm_group_df
 
@@ -95,7 +101,15 @@ class StageInfoAnalysis:
         if comm_group_df is None or comm_group_df.empty:
             logger.error(f"There is no {table_communication_group} data in {cluster_analysis_db}.")
             return None
-
+        # validate by membership, not ordered equality: column order or extra columns must not reject usable data
+        expected_columns = [TableConstant.TYPE, TableConstant.RANK_SET, TableConstant.GROUP_NAME,
+                            TableConstant.GROUP_ID, TableConstant.PG_NAME]
+        missing_columns = [col for col in expected_columns if col not in comm_group_df.columns]
+        if missing_columns:
+            # the frame comes from the cluster analysis DB, so report the table, not the JSON file
+            logger.error(f"{table_communication_group} in {cluster_analysis_db} "
+                         f"is missing columns: {missing_columns}")
+            return None
         # process rank_set
         comm_group_df[TableConstant.RANK_SET] = comm_group_df[TableConstant.RANK_SET].apply(
             lambda s: set(map(int, s.strip('()').split(','))))
-- 
Gitee