From dc24db4b5193842e1951b42f2b05bd228e111c13 Mon Sep 17 00:00:00 2001 From: l30044004 Date: Tue, 2 Sep 2025 17:16:32 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E4=BC=98=E5=8C=96=E3=80=91=E5=88=86?= =?UTF-8?q?=E7=BA=A7=E5=8F=AF=E8=A7=86=E5=8C=96=E8=BD=ACdb=E4=BC=98?= =?UTF-8?q?=E5=8C=96stack=E4=BF=A1=E6=81=AF=E5=AD=98=E5=82=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprobe/docs/21.visualization_PyTorch.md | 37 ------------ .../docs/22.visualization_MindSpore.md | 56 ------------------- .../visualization/builder/graph_builder.py | 38 +------------ .../msprobe/visualization/db_utils.py | 27 ++++++++- .../msprobe/visualization/graph_service.py | 8 +-- 5 files changed, 29 insertions(+), 137 deletions(-) diff --git a/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md b/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md index b02dd7fdb8..b5a208b874 100644 --- a/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md +++ b/debug/accuracy_tools/msprobe/docs/21.visualization_PyTorch.md @@ -52,7 +52,6 @@ msprobe -f pytorch graph -i ./compare.json -o ./output | -lm 或 --layer_mapping | 跨套件比对,例如同一个模型分别使用了DeepSpeed和Megatron套件的比对场景。配置该参数时表示开启跨套件Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer),如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。 配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。

可参考的实际案例:[MindSpeed&LLamaFactory数据采集和自动比对](./visualization/mindspeed_llamafactory_mapping.md) | 否 | | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 | | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 | -| -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 | #### 3.1.1 匹配说明 @@ -506,42 +505,6 @@ yaml文件中只需配置待调试侧与标杆侧模型代码中功能一致但 ![ms_dump](./img/ms_layer.png) -### 7.2 堆栈信息说明 - -**精简堆栈** - -保留一条当前模块或api的调用信息 - -```json -{ - "Module.layer1.0.bn1.BatchNorm2d.forward.0": [ - "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)" - ] -} -``` - -**完整堆栈** - -当前模块或api完整的调用信息 - -```json -{ - "Module.layer1.0.bn1.BatchNorm2d.forward.0": [ - "File /home/torchvision/models/resnet.py, line 93, in forward, \n out = self.bn1(out)", - "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/torch/nn/modules/container.py, line 215, in forward, \n input = module(input)", - "File /home/torch/nn/modules/module.py, line 1568, in _call_impl, \n result = forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/torchvision/models/resnet.py, line 273, in _forward_impl, \n x = self.layer1(x)", - "File /home/torchvision/models/resnet.py, line 285, in forward, \n return self._forward_impl(x)", - "File /home/torch/nn/modules/module.py, line 1527, in _call_impl, \n return forward_call(*args, **kwargs)", - "File /home/torch/nn/modules/module.py, line 1518, in _wrapped_call_impl, \n return self._call_impl(*args, **kwargs)", - "File /home/visualization/resnet18.py, line 40, in , \n outputs = model(inputs)" - ] -} - -``` # FAQ 1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理? 
diff --git a/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md b/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md index 7abf8df95f..4efd0532a2 100644 --- a/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md +++ b/debug/accuracy_tools/msprobe/docs/22.visualization_MindSpore.md @@ -52,7 +52,6 @@ msprobe -f mindspore graph -i ./compare.json -o ./output | -lm 或 --layer_mapping| 跨框架比对,MindSpore和PyTorch的比对场景。配置该参数时表示开启跨框架Layer层的比对功能,指定模型代码中的Layer层后,可以识别对应dump数据中的模块或API。需要指定自定义映射文件*.yaml。自定义映射文件的格式请参见[自定义映射文件(Layer)](#71-自定义映射文件layer), 如何配置自定义映射文件请参考[模型分级可视化如何配置layer mapping映射文件](./visualization/layer_mapping_example.md)。配置该参数后,将仅按节点名称进行比对,忽略节点的 type 和 shape。如果调试侧和标杆侧有名称不同的节点,则需要配置自定义映射文件,-lm参数传入自定义映射文件路径;如果调试侧和标杆侧节点名称相同,则仅指定-lm即可。 | 否 | | -oc 或 --overflow_check | 是否开启溢出检测模式,开启后会在输出db文件中(`compare_{timestamp}.vis.db或build_{timestamp}.vis.db`)对每个溢出节点进行标记溢出等级,溢出等级说明参考[溢出等级说明](#312-溢出等级说明) | 否 | | -f 或 --fuzzy_match | 是否开启模糊匹配,bool类型。模糊匹配说明参考[匹配说明](#311-匹配说明) | 否 | -| -cs 或 --complete_stack | 是否使用完整的堆栈信息,bool类型。默认使用精简的堆栈信息,数据量小有助于增加流畅度。完整堆栈和精简堆栈信息参考[堆栈信息说明](#72-堆栈信息说明) | 否 | #### 3.1.1 匹配说明 @@ -502,61 +501,6 @@ yaml文件中只需配置MindSpore与PyTorch模型代码中功能一致但名称 ![ms_dump](./img/ms_layer.png) -### 7.2 堆栈信息说明 - -**精简堆栈** - -保留一条当前模块或api的调用信息 - -```json -{ - "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [ - "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)" - ] -} -``` - -**完整堆栈** - -当前模块或api完整的调用信息 - -```json -{ - "Cell.model.language_model.embedding.word_embeddings.reduce_scatter_to_sp_region.ReduceScatterToSequenceParallelRegion.forward.0": [ - "File /home/mindspore/nn/cell.py, line 507, in _run_construct, \n output = self._run_forward_hook(inputs, output)", - "File /home/mindspore/nn/cell.py, line 759, in _complex_call, \n output = self._run_construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/tensor_parallel/layers.py, line 770, in construct, \n output = self.reduce_scatter_to_sp_region(output_parallel)", - "File /home/mindspore/nn/cell.py, line 2462, in _backward_hook_construct, \n outputs = self.construct(outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 151, in construct, \n embeddings = self.word_embeddings(input_ids)", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/transformer/language_model.py, line 391, in construct, \n text_embedding_out = self.embedding(enc_input_ids, enc_position_ids,", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n 
output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/model/gpt_model.py, line 104, in construct, \n lm_output = self.language_model(tokens,", - "File /home/mindspore/nn/cell.py, line 2460, in _backward_hook_construct, \n outputs = self.construct(*outputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 498, in _run_construct, \n output = self._backward_hook_construct(*inputs, **kwargs)", - "File /home/mindspore/nn/cell.py, line 745, in __call__, \n return self._run_construct(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/pipeline_cell.py, line 429, in construct, \n return self.model(*inputs)", - "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 121, in run_forward, \n output_tensor = model(*input_data, recv_data=None)", - "File /home/mindformers/experimental/distri_cores/pipeline_parallel/schedules.py, line 735, in forward_backward_pipelining_without_interleaving, \n micro_input_data = run_forward(*micro_input_data,", - "File /home/mindformers/experimental/distri_cores/training.py, line 409, in forward_backward_with_pipelining, \n loss, logits, grads = forward_backward_pipelining_without_interleaving(", - "File /home/mindformers/experimental/distri_cores/training.py, line 533, in construct, \n (loss, _), grads = self.forward_backward_func(*inputs_tuple, loss_scale=current_step_loss_scale, **inputs_dict)", - "File /home/mindspore/nn/cell.py, line 757, in _complex_call, \n output = self.construct(*args, **kwargs)", - "File /home/mindspore/nn/cell.py, line 747, in __call__, \n return self._complex_call(*args, **kwargs)", - "File /home/mindformers/experimental/distri_cores/training.py, line 655, in train, \n loss, is_finite, loss_scale, learning_rate = train_one_step_cell(**data)", - "File /home/model/pretrain_gpt.py, line 303, in main, \n train(", - "File /home/model/pretrain_gpt.py, line 316, in , \n main()" - ] -} -``` # FAQ 1. 图比对场景,节点呈现灰色,且没有精度比对数据,怎么处理? 
diff --git a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py index d7120072d4..1312d244f7 100644 --- a/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py +++ b/debug/accuracy_tools/msprobe/visualization/builder/graph_builder.py @@ -35,7 +35,7 @@ class GraphBuilder: template_pattern = re.compile(r'\b([A-Z][a-zA-Z]*Template|api_template|api_instance)\(') @staticmethod - def build(construct_path, data_path, stack_path, model_name='DefaultModel', complete_stack=False): + def build(construct_path, data_path, stack_path, model_name='DefaultModel'): """ GraphBuilder的对外提供的构图方法 Args: @@ -43,7 +43,6 @@ class GraphBuilder: data_path: dump.json路径 stack_path: stack.json路径 model_name: 模型名字,依赖外部输入 - complete_stack: 完整的堆栈信息 Returns: Graph,代表图的数据结构 """ construct_dict = load_json(construct_path) @@ -54,8 +53,6 @@ class GraphBuilder: raise RuntimeError dump_dict = load_json(data_path) stack_dict = load_stack_json(stack_path) - if not complete_stack: - GraphBuilder._simplify_stack(stack_dict) data_dict = dump_dict.get(GraphConst.DATA_KEY, {}) graph = Graph(model_name, data_path=dump_dict.get('dump_data_dir', ''), dump_data=data_dict) GraphBuilder._init_nodes(graph, construct_dict, data_dict, stack_dict) @@ -76,39 +73,6 @@ class GraphBuilder: node_to_db(config.graph_b, filename) config_to_db(config, filename) - @staticmethod - def _simplify_stack(stack_dict): - """ - 精简堆栈内容,模块级保留包含"模块名("的堆栈,api级保留"xxxTemplate("的下一行堆栈 - - 例如模块 Module.layer3.0.bn2.BatchNorm2d.forward.0,模块名为bn2,匹配"bn2(", - 保留堆栈"File /home/models/resnet.py, line 97, in forward, \n out = self.bn2(out)" - - 例如Api Tensor.__iadd__.4.forward,堆栈为: - "File /home/wrap_tensor.py, line 61, return TensorOPTemplate(op_name, hook)(*args, **kwargs)", - "File /home/torchvision/models/resnet.py, line 102, in forward, \n out += identity", - 匹配到第一行的"TensorOPTemplate(",保留下一行堆栈 - """ - module_pattern = re.compile(op_patterns[0]) - for dump_name, stack_list in stack_dict.items(): - if not isinstance(stack_list, list): - continue - if module_pattern.match(dump_name): - parts = dump_name.split(Const.SEP) - if len(parts) < abs(Const.LAYER_NAME_INDEX): - continue - module_name = parts[Const.LAYER_NAME_INDEX] - for stack in stack_list: - if re.search(module_name + r'\(', stack): - stack_list = [stack] - break - else: - for index, stack in enumerate(stack_list): - if GraphBuilder.template_pattern.search(stack) and index < len(stack_list) - 1: - stack_list = [stack_list[index + 1]] - break - stack_dict[dump_name] = stack_list - @staticmethod def _handle_backward_upnode_missing(construct_dict, subnode_id, upnode_id): """ diff --git a/debug/accuracy_tools/msprobe/visualization/db_utils.py b/debug/accuracy_tools/msprobe/visualization/db_utils.py index 3988ebecc8..98907b88f0 100644 --- a/debug/accuracy_tools/msprobe/visualization/db_utils.py +++ b/debug/accuracy_tools/msprobe/visualization/db_utils.py @@ -41,7 +41,7 @@ node_columns = { 'overflow_level': TEXT, 'micro_step_id': INTEGER_NOT_NULL, 'matched_node_link': TEXT, - 'stack_info': TEXT, + 'stack_id': TEXT, 'parallel_merge_info': TEXT, 'matched_distributed': TEXT, 'modified': INTEGER_NOT_NULL, @@ -65,6 +65,11 @@ config_columns = { 'step_list': TEXT_NOT_NULL } +stack_columns = { + 'id': TEXT_PRIMARY_KEY, + 'stack_info': TEXT +} + indexes = { "index1": ["step", "rank", "data_source", "up_node", "node_order"], "index2": ["step", "rank", "data_source", "node_name"], @@ -197,19 +202,24 @@ def node_to_db(graph, db_name): 
create_table_sql = create_table_sql_from_dict('tb_nodes', node_columns) insert_sql = create_insert_sql_from_dict('tb_nodes', node_columns) data = [] + stack_dict = {} for i, node in enumerate(graph.get_sorted_nodes()): + stack_info_text = json.dumps(node.stack_info) + if stack_info_text not in stack_dict: + stack_dict[stack_info_text] = get_stack_unique_id(graph, stack_dict) data.append((get_node_unique_id(graph, node), get_graph_unique_id(graph), i, node.id, node.op.value, node.upnode.id if node.upnode else '', json.dumps([node.id for node in node.subnodes]) if node.subnodes else '', node.data.get(GraphConst.JSON_INDEX_KEY), node.data.get(GraphConst.OVERFLOW_LEVEL), node.micro_step_id if node.micro_step_id is not None else 0, json.dumps(node.matched_node_link), - json.dumps(node.stack_info), + stack_dict.get(stack_info_text), json.dumps(node.parallel_merge_info) if node.parallel_merge_info else '', json.dumps(node.matched_distributed), 0, json.dumps(format_node_data(node.input_data, node.id, graph.compare_mode)), json.dumps(format_node_data(node.output_data, node.id, graph.compare_mode)), graph.data_source, graph.data_path, graph.step, graph.rank)) to_db(db_name, create_table_sql, insert_sql, data) + stack_to_db(stack_dict, db_name) def config_to_db(config, db_name): @@ -221,9 +231,22 @@ def config_to_db(config, db_name): to_db(db_name, create_table_sql, insert_sql, data) +def stack_to_db(stack_dict, db_name): + create_table_sql = create_table_sql_from_dict('tb_stack', stack_columns) + insert_sql = create_insert_sql_from_dict('tb_stack', stack_columns) + data = [] + for stack_info_text, unique_id in stack_dict.items(): + data.append((unique_id, stack_info_text)) + to_db(db_name, create_table_sql, insert_sql, data) + + def get_graph_unique_id(graph): return f'{graph.data_source}_{graph.step}_{graph.rank}' def get_node_unique_id(graph, node): return f'{get_graph_unique_id(graph)}_{node.id}' + + +def get_stack_unique_id(graph, stack_dict): + return f'{get_graph_unique_id(graph)}_{len(stack_dict)}' diff --git a/debug/accuracy_tools/msprobe/visualization/graph_service.py b/debug/accuracy_tools/msprobe/visualization/graph_service.py index 9e9604746d..1f14aa94d8 100644 --- a/debug/accuracy_tools/msprobe/visualization/graph_service.py +++ b/debug/accuracy_tools/msprobe/visualization/graph_service.py @@ -113,7 +113,7 @@ def _build_graph_info(dump_path, args, graph=None): stack_path = FileChecker(os.path.join(dump_path, GraphConst.STACK_FILE), FileCheckConst.FILE, FileCheckConst.READ_ABLE).common_check() if not graph: - graph = GraphBuilder.build(construct_path, data_path, stack_path, complete_stack=args.complete_stack) + graph = GraphBuilder.build(construct_path, data_path, stack_path) return GraphInfo(graph, construct_path, data_path, stack_path) @@ -256,8 +256,8 @@ def _get_compare_graph_results(input_param, serializable_args, step, pool, err_c build_key = f'{step}_{nr}' if step else f'{nr}' input_param_copy = deepcopy(input_param) mp_task_dict[build_key] = pool.apply_async(_run_build_graph_compare, - args=(input_param_copy, serializable_args, nr, br), - error_callback=err_call) + args=(input_param_copy, serializable_args, nr, br), + error_callback=err_call) mp_res_dict = {k: v.get() for k, v in mp_task_dict.items()} for mp_res in mp_res_dict.values(): @@ -434,8 +434,6 @@ def _graph_service_parser(parser): help=" whether open overflow_check for graph.", required=False) parser.add_argument("-f", "--fuzzy_match", dest="fuzzy_match", action="store_true", help=" Whether to perform a fuzzy match on 
the api name.", required=False)
-    parser.add_argument("-cs", "--complete_stack", dest="complete_stack", action="store_true",
-                        help=" Whether to use complete stack information.", required=False)
 
 
 def _graph_service_command(args):
-- 
Gitee
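
Reader's note on the new storage layout: after this patch, `tb_nodes` no longer carries the stack text per node. Each distinct serialized stack is written once to `tb_stack`, and a node references it through `stack_id` (an id prefixed with the graph's `data_source_step_rank` key from `get_graph_unique_id`, so graphs written into the same db cannot collide). Deduplicating on the JSON-encoded stack text is what is intended to shrink the db, since many nodes under the same module share an identical call stack. The sketch below is a minimal consumer-side example, not part of the patch: it assumes the `.vis.db` output is an ordinary SQLite file, that `tb_nodes` exposes a `node_name` column (the index definitions in `db_utils.py` reference one), and uses a hypothetical file path plus the node name from the PyTorch doc example.

```python
import json
import sqlite3

# Illustrative values only: real outputs are named like build_{timestamp}.vis.db / compare_{timestamp}.vis.db,
# and the node name is taken from the docs' ResNet example.
DB_PATH = "./output/build_20250902174500.vis.db"
NODE_NAME = "Module.layer1.0.bn1.BatchNorm2d.forward.0"


def load_stack_for_node(db_path, node_name):
    """Resolve a node's call stack by joining tb_nodes.stack_id against tb_stack.id."""
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT s.stack_info "
            "FROM tb_nodes AS n JOIN tb_stack AS s ON n.stack_id = s.id "
            "WHERE n.node_name = ? LIMIT 1",
            (node_name,),
        ).fetchone()
    finally:
        conn.close()
    # stack_info holds the JSON text produced by json.dumps(node.stack_info): a list of call-site strings.
    return json.loads(row[0]) if row and row[0] else []


if __name__ == "__main__":
    for frame in load_stack_for_node(DB_PATH, NODE_NAME):
        print(frame)
```

One join replaces the per-row stack text; given how often sibling APIs share the exact same call stack, that trade is usually a net win for db size without changing what the front end can display.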