From ce63f06bf69930ae486807294f5d909f11faff04 Mon Sep 17 00:00:00 2001 From: mookie Date: Thu, 12 Jun 2025 15:16:28 +0800 Subject: [PATCH] =?UTF-8?q?ppchart=E6=B7=BB=E5=8A=A0=E9=9D=9Edualpipe?= =?UTF-8?q?=E7=9A=84=E9=80=82=E9=85=8D=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../msprof_analyze/cluster_analyse/README.md | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/profiler/msprof_analyze/cluster_analyse/README.md b/profiler/msprof_analyze/cluster_analyse/README.md index 48f67b790dd..cef71d91d4a 100644 --- a/profiler/msprof_analyze/cluster_analyse/README.md +++ b/profiler/msprof_analyze/cluster_analyse/README.md @@ -741,8 +741,26 @@ output_dir #### 打点 -以DualpipeV2为例,找到前反向代码,在dualpipev_schedules.py里面添加如下代码(仅为示例,需要注意这段代码添加的位置): +1. 传统pipeline(不开dualpipe),在```megatron/core/pipeline_parallel/schedules.py```里面添加如下代码(添加在```backward_step```函数定义的后面): +```python +import torch_npu +def step_wrapper(func, msg: str): + def wrapper(*args, **kwargs): + new_msg = {"name": msg} + mstx_state_step_range_id = torch_npu.npu.mstx.range_start(str(new_msg), torch_npu.npu.current_stream()) + out = func(*args, **kwargs) + if mstx_state_step_range_id is not None: + torch_npu.npu.mstx.range_end(mstx_state_step_range_id) + mstx_state_step_range_id = None + return out + return wrapper + +forward_step = step_wrapper(forward_step, "forward_step") +backward_step = step_wrapper(backward_step, "backward_step") ``` + +2. 
DualpipeV2,找到前反向代码,在```mindspeed/core/pipeline_parallel/dualpipev/dualpipev_schedules.py```里面添加如下代码(添加在```forward_backward_pipeline_with_cutinhalf```函数定义的前面): +```python import torch_npu def step_wrapper(func, msg: str): def wrapper(*args, **kwargs): @@ -769,8 +787,7 @@ backward_step = step_wrapper(backward_step, "backward_step") WeightGradStore.pop = step_wrapper(WeightGradStore.pop, "WeightGradStore.pop") ``` -同时,采集profiling数据时,需要添加metadata: - +同时,采集profiling数据时,如果使用的是MindSpeed,未使用MindSpeed-LLM,需要在prof定义(```prof = torch_npu.profiler.profile(...```)的后面添加metadata代码: ``` prof.add_metadata('pp_info', json.dumps( { @@ -778,7 +795,16 @@ prof.add_metadata('pp_info', json.dumps( 'microbatch_num': 10, } )) -# microbatch_num需要替换成实际的值 +# microbatch_num根据公式计算实际的值:microbatch_num = global_batch_size // micro_batch_size // data_parallel_size +``` +如果使用MindSpeed-LLM,在```mindspeed-llm/training/training.py```中```prof.add_metadata_json('distributed_args'...```的后面添加metadata代码: +``` +prof.add_metadata('pp_info', json.dumps( + { + 'pp_type': args.schedules_method, + 'microbatch_num': args.global_batch_size // args.micro_batch_size // args.data_parallel_size + } +)) ``` #### StepTaskInfo -- Gitee