From dc24db4b5193842e1951b42f2b05bd228e111c13 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Tue, 2 Sep 2025 17:16:32 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E4=BC=98=E5=8C=96=E3=80=91=E5=88=86?= =?UTF-8?q?=E7=BA=A7=E5=8F=AF=E8=A7=86=E5=8C=96=E8=BD=ACdb=E4=BC=98?= =?UTF-8?q?=E5=8C=96stack=E4=BF=A1=E6=81=AF=E5=AD=98=E5=82=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/docs/21.visualization_PyTorch.md | 37 ------------ .../docs/22.visualization_MindSpore.md | 56 ------------------- .../visualization/builder/graph_builder.py | 38 +------------ .../msprobe/visualization/db_utils.py | 27 ++++++++- .../msprobe/visualization/graph_service.py | 8 +-- 5 files changed, 29 insertions(+), 137 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md b/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md index b02dd7fdb8..b5a208b874 100644 --- a/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md @@ -52,7 +52,6 @@ msprobe -f pytorch graph -i ./compare.json -o ./output | -lm 或 --layer_mapping | 跨套件比对,例如同一个模型分别使用了DeepSpeed和Megatron套件的比对场景。配置该参数时表示开启跨套件Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer),如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。 配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。

可参考的实际案例:[MindSpeed&LLamaFactory数据采集和自动比对](./visualization/mindspeed_llamafactory_mapping.md) | 否 | | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 | | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 | -| -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 | #### 3.1.1 匹配说明 @@ -506,42 +505,6 @@ yaml文件中只需配置待调试侧与标杆侧模型代码中功能一致但 ![ms_dump](./img/ms_layer.png) -### 7.2 堆栈信息说明 - -**精简堆栈** - -保留一条当前模块或api的调用信息 - -```json -{ - "Module.layer1.0.bn1.BatchNorm2d.forward.0": [ - "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)" - ] -} -``` - -**完整堆栈** - -当前模块或api完整的调用信息 - -```json -{ - "Module.layer1.0.bn1.BatchNorm2d.forward.0": [ - "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)", - "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/torch/nn/modules/container.py, line 215, in forward, \n input = module(input)", - "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/torchvision/models/resnet.py, line 273, in _forward_impl, \n x = self.layer1(x)", - "File /home/torchvision/models/resnet.py, line 285, in forward, \n return self._forward_impl(x)", - "File /home/torch/nn/modules/module.py, line 1527, in _call_impl, \n return forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/visualization/resnet18.py, line 40, in , \n outputs = model(inputs)" - ] -} - -``` # FAQ 1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理? 
diff --git a/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md b/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md index 7abf8df95f..4efd0532a2 100644 --- a/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md @@ -52,7 +52,6 @@ msprobe -f mindspore graph -i ./compare.json -o ./output | -lm 或 --layer_mapping| 跨框架比对,MindSpore和PyTorch的比对场景。配置该参数时表示开启跨框架Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer), 如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。 | 否 | | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 | | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 | -| -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 | #### 3.1.1 匹配说明 @@ -502,61 +501,6 @@ yaml文件中只需配置MindSpore与PyTorch模型代码中功能一致但名称 ![ms_dump](./img/ms_layer.png) -### 7.2 堆栈信息说明 - -**精简堆栈** - -保留一条当前模块或api的调用信息 - -```json -{ - "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [ - "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)" - ] -} -``` - -**完整堆栈** - -当前模块或api完整的调用信息 - -```json -{ - "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [ - "File /home/mindspore/nn/cell.py, line 507, in _run_construct, \n output = self._run_forward_hook(inputs, output)", - "File /home/mindspore/nn/cell.py, line 759, in _complex_call, \n output = self._run_construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)", - "File /home/mindspore/nn/cell.py, line 2462, in _backward_hook_construct, \n outputs = self.construct(outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 151, in construct, \n embeddings = self.word_embeddings(input_ids)", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 391, in construct, \n text_embedding_out = self.embedding(enc_input_ids, enc_position_ids,", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n 
output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/model/gpt_model.py, line 104, in construct, \n lm_output = self.language_model(tokens,", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/pipeline_cell.py, line 429, in construct, \n return self.model(*inputs)", - "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 121, in run_forward, \n output_tensor = model(*input_data, recv_data=None)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 735, in forward_backward_pipelining_without_interleaving, \n micro_input_data = run_forward(*micro_input_data,", - "File /home/mindformers/experimental/distri_cores/training.py, line 409, in forward_backward_with_pipelining, \n loss, logits, grads = forward_backward_pipelining_without_interleaving(", - "File /home/mindformers/experimental/distri_cores/training.py, line 533, in construct, \n (loss, _), grads = self.forward_backward_func(*inputs_tuple, loss_scale=current_step_loss_scale, **inputs_dict)", - "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/training.py, line 655, in train, \n loss, is_finite, loss_scale, learning_rate = train_one_step_cell(**data)", - "File /home/model/pretrain_gpt.py, line 303, in main, \n train(", - "File /home/model/pretrain_gpt.py, line 316, in , \n main()" - ] -} -``` # FAQ 1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理? 
diff --git a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py index d7120072d4..1312d244f7 100644 --- a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py +++ b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py @@ -35,7 +35,7 @@ class GraphBuilder: template_pattern = re.compile(r'\b([A-Z][a-zA-Z]*Template|api_template|api_instance)\(') @staticmethod - def build(construct_path, data_path, stack_path, model_name='DefaultModel', complete_stack=False): + def build(construct_path, data_path, stack_path, model_name='DefaultModel'): """ GraphBuilder的对外提供的构图方法 Args: @@ -43,7 +43,6 @@ class GraphBuilder: data_path: dump.json路径 stack_path: stack.json路径 model_name: 模型名字,依赖外部输入 - complete_stack: 完整的堆栈信息 Returns: Graph,代表图的数据结构 """ construct_dict = load_json(construct_path) @@ -54,8 +53,6 @@ class GraphBuilder: raise RuntimeError dump_dict = load_json(data_path) stack_dict = load_stack_json(stack_path) - if not complete_stack: - GraphBuilder._simplify_stack(stack_dict) data_dict = dump_dict.get(GraphConst.DATA_KEY, {}) graph = Graph(model_name, data_path=dump_dict.get('dump_data_dir', ''), dump_data=data_dict) GraphBuilder._init_nodes(graph, construct_dict, data_dict, stack_dict) @@ -76,39 +73,6 @@ class GraphBuilder: node_to_db(config.graph_b, filename) config_to_db(config, filename) - @staticmethod - def _simplify_stack(stack_dict): - """ - 精简堆栈内容,模块级保留包含"模块名("的堆栈,api级保留"xxxTemplate("的下一行堆栈 - - 例如模块 Module.layer3.0.bn2.BatchNorm2d.forward.0,模块名为bn2,匹配"bn2(", - 保留堆栈"File /home/models/resnet.py, line 97, in forward, \n out = self.bn2(out)" - - 例如Api Tensor.__iadd__.4.forward,堆栈为: - "File /home/wrap_tensor.py, line 61, return TensorOPTemplate(op_name, hook)(*args, **kwargs)", - "File /home/torchvision/models/resnet.py, line 102, in forward, \n out += identity", - 匹配到第一行的"TensorOPTemplate(",保留下一行堆栈 - """ - module_pattern = re.compile(op_patterns[0]) - for dump_name, stack_list in stack_dict.items(): - if not isinstance(stack_list, list): - continue - if module_pattern.match(dump_name): - parts = dump_name.split(Const.SEP) - if len(parts) < abs(Const.LAYER_NAME_INDEX): - continue - module_name = parts[Const.LAYER_NAME_INDEX] - for stack in stack_list: - if re.search(module_name + r'\(', stack): - stack_list = [stack] - break - else: - for index, stack in enumerate(stack_list): - if GraphBuilder.template_pattern.search(stack) and index < len(stack_list) - 1: - stack_list = [stack_list[index + 1]] - break - stack_dict[dump_name] = stack_list - @staticmethod def _handle_backward_upnode_missing(construct_dict, subnode_id, upnode_id): """ diff --git a/debug/accuracy_tools/msprobe/visualization/db_utils.py b/debug/accuracy_tools/msprobe/visualization/db_utils.py index 3988ebecc8..98907b88f0 100644 --- a/debug/accuracy_tools/msprobe/visualization/db_utils.py +++ b/debug/accuracy_tools/msprobe/visualization/db_utils.py @@ -41,7 +41,7 @@ node_columns = { 'overflow_level': TEXT, 'micro_step_id': INTEGER_NOT_NULL, 'matched_node_link': TEXT, - 'stack_info': TEXT, + 'stack_id': TEXT, 'parallel_merge_info': TEXT, 'matched_distributed': TEXT, 'modified': INTEGER_NOT_NULL, @@ -65,6 +65,11 @@ config_columns = { 'step_list': TEXT_NOT_NULL } +stack_columns = { + 'id': TEXT_PRIMARY_KEY, + 'stack_info': TEXT +} + indexes = { "index1": ["step", "rank", "data_source", "up_node", "node_order"], "index2": ["step", "rank", "data_source", "node_name"], @@ -197,19 +202,24 @@ def node_to_db(graph, db_name): 
create_table_sql = create_table_sql_from_dict('tb_nodes', node_columns) insert_sql = create_insert_sql_from_dict('tb_nodes', node_columns) data = [] + stack_dict = {} for i, node in enumerate(graph.get_sorted_nodes()): + stack_info_text = json.dumps(node.stack_info) + if stack_info_text not in stack_dict: + stack_dict[stack_info_text] = get_stack_unique_id(graph, stack_dict) data.append((get_node_unique_id(graph, node), get_graph_unique_id(graph), i, node.id, node.op.value, node.upnode.id if node.upnode else '', json.dumps([node.id for node in node.subnodes]) if node.subnodes else '', node.data.get(GraphConst.JSON_INDEX_KEY), node.data.get(GraphConst.OVERFLOW_LEVEL), node.micro_step_id if node.micro_step_id is not None else 0, json.dumps(node.matched_node_link), - json.dumps(node.stack_info), + stack_dict.get(stack_info_text), json.dumps(node.parallel_merge_info) if node.parallel_merge_info else '', json.dumps(node.matched_distributed), 0, json.dumps(format_node_data(node.input_data, node.id, graph.compare_mode)), json.dumps(format_node_data(node.output_data, node.id, graph.compare_mode)), graph.data_source, graph.data_path, graph.step, graph.rank)) to_db(db_name, create_table_sql, insert_sql, data) + stack_to_db(stack_dict, db_name) def config_to_db(config, db_name): @@ -221,9 +231,22 @@ def config_to_db(config, db_name): to_db(db_name, create_table_sql, insert_sql, data) +def stack_to_db(stack_dict, db_name): + create_table_sql = create_table_sql_from_dict('tb_stack', stack_columns) + insert_sql = create_insert_sql_from_dict('tb_stack', stack_columns) + data = [] + for stack_info_text, unique_id in stack_dict.items(): + data.append((unique_id, stack_info_text)) + to_db(db_name, create_table_sql, insert_sql, data) + + def get_graph_unique_id(graph): return f'{graph.data_source}_{graph.step}_{graph.rank}' def get_node_unique_id(graph, node): return f'{get_graph_unique_id(graph)}_{node.id}' + + +def get_stack_unique_id(graph, stack_dict): + return f'{get_graph_unique_id(graph)}_{len(stack_dict)}' diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index 9e9604746d..1f14aa94d8 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -113,7 +113,7 @@ def _build_graph_info(dump_path, args, graph=None): stack_path = FileChecker(os.path.join(dump_path, GraphConst.STACK_FILE), FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() if not graph: - graph = GraphBuilder.build(construct_path, data_path, stack_path, complete_stack=args.complete_stack) + graph = GraphBuilder.build(construct_path, data_path, stack_path) return GraphInfo(graph, construct_path, data_path, stack_path) @@ -256,8 +256,8 @@ def _get_compare_graph_results(input_param, serializable_args, step, pool, err_c build_key = f'{step}_{nr}' if step else f'{nr}' input_param_copy = deepcopy(input_param) mp_task_dict[build_key] = pool.apply_async(_run_build_graph_compare, - args=(input_param_copy, serializable_args, nr, br), - error_callback=err_call) + args=(input_param_copy, serializable_args, nr, br), + error_callback=err_call) mp_res_dict = {k: v.get() for k, v in mp_task_dict.items()} for mp_res in mp_res_dict.values(): @@ -434,8 +434,6 @@ def _graph_service_parser(parser): help=" whether open overflow_check for graph.", required=False) parser.add_argument("-f", "--fuzzy_match", dest="fuzzy_match", action="store_true", help=" Whether to perform a fuzzy match on 
the api name.", required=False)
-    parser.add_argument("-cs", "--complete_stack", dest="complete_stack", action="store_true",
-                        help=" Whether to use complete stack information.", required=False)
 
 
 def _graph_service_command(args):
-- 
Gitee
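
Reader's note on the new storage layout: after this patch, `tb_nodes` no longer carries the stack text per node. Each distinct serialized stack is written once to `tb_stack`, and a node references it through `stack_id` (an id prefixed with the graph's `data_source_step_rank` key from `get_graph_unique_id`, so graphs written into the same db cannot collide). Deduplicating on the JSON-encoded stack text is what is intended to shrink the db, since many nodes under the same module share an identical call stack. The sketch below is a minimal consumer-side example, not part of the patch: it assumes the `.vis.db` output is an ordinary SQLite file, that `tb_nodes` exposes a `node_name` column (the index definitions in `db_utils.py` reference one), and uses a hypothetical file path plus the node name from the PyTorch doc example.

```python
import json
import sqlite3

# Illustrative values only: real outputs are named like build_{timestamp}.vis.db / compare_{timestamp}.vis.db,
# and the node name is taken from the docs' ResNet example.
DB_PATH = "./output/build_20250902174500.vis.db"
NODE_NAME = "Module.layer1.0.bn1.BatchNorm2d.forward.0"


def load_stack_for_node(db_path, node_name):
    """Resolve a node's call stack by joining tb_nodes.stack_id against tb_stack.id."""
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT s.stack_info "
            "FROM tb_nodes AS n JOIN tb_stack AS s ON n.stack_id = s.id "
            "WHERE n.node_name = ? LIMIT 1",
            (node_name,),
        ).fetchone()
    finally:
        conn.close()
    # stack_info holds the JSON text produced by json.dumps(node.stack_info): a list of call-site strings.
    return json.loads(row[0]) if row and row[0] else []


if __name__ == "__main__":
    for frame in load_stack_for_node(DB_PATH, NODE_NAME):
        print(frame)
```

One join replaces the per-row stack text; given how often sibling APIs share the exact same call stack, that trade is usually a net win for db size without changing what the front end can display.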