From 2aeddd266b4a08bda22c96b9ff6d7a5c00b0baae Mon Sep 17 00:00:00 2001
From: RanZheng <364167184@qq.com>
Date: Tue, 22 Jul 2025 18:54:57 +0800
Subject: [PATCH 1/3] [feature] monitor print_struct: support saving model info from non-rank0 ranks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
(cherry picked from commit 06e0024a8b180f3da812b64a9f29dc100d51fb43)
---
.../accuracy_tools/msprobe/docs/19.monitor.md | 10 ++++----
.../msprobe/mindspore/monitor/module_hook.py | 3 ++-
.../msprobe/pytorch/monitor/module_hook.py | 24 ++++---------------
.../msprobe/pytorch/monitor/module_metric.py | 3 ++-
4 files changed, 14 insertions(+), 26 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/docs/19.monitor.md b/debug/accuracy_tools/msprobe/docs/19.monitor.md
index f57fadf9af..f7c9028b26 100644
--- a/debug/accuracy_tools/msprobe/docs/19.monitor.md
+++ b/debug/accuracy_tools/msprobe/docs/19.monitor.md
@@ -228,7 +228,7 @@ monitor.monitor_gnorm_with_ad(
module_name可以通过nn.Module的接口named_modules()获取。
#### 打印模型结构
-工具提供可选项`print_struct`打印模型结构,帮助配置targets。工具会在在第一个step后打印结构并停止训练进程,模型结构默认打印在`$MONITOR_OUTPUT_DIR/module_struct.json`。
+工具提供可选项`print_struct`打印模型结构,帮助配置targets。工具会在第一个step后打印结构并停止训练进程,每张卡上的模型结构默认保存在`$MONITOR_OUTPUT_DIR/module_struct/rank{rank}/module_struct.json`,其中{rank}为对应的卡号。
```json
{
"print_struct": true
@@ -701,7 +701,7 @@ TrainerMon.monitor_gnorm_with_ad(model, grad_acc_steps, optimizer, dp_group, tp_
| "collect_times" | 可选 | 设置采集次数,达到该次数后停止监控,默认值为100000000,目的是一直采集。 |
| "start_step" | 可选 | 设置开始采集step,模型训练达到start_step后开始监控采集,默认值为0,表示从step0开始监控采集。注:在动态启停模式下该设置不生效,只会从下一步开始监控采集。 |
| "step_interval" | 可选 | 设置采集step间隔,默认值为1,表示每个step均采集监控数据。 |
-| "print_struct" | 可选 | 设置为true后监控工具会打印模型中torch module的名字和详细结构,并在第1个step后退出。不填默认为false。**仅PyTorch场景支持此参数**。 |
+| "print_struct" | 可选 | 设置为true后监控工具会打印每张卡模型中module的名字和详细结构,并在第1个step后退出。不填默认为false。 |
| "module_ranks" | 可选 | 用于在分布式训练场景中希望控制在哪些rank开启module监控。如果不填,则默认在所有rank开启。 列表内rank要求为int类型。 |
| "ur_distribution" | 可选 | 若为true则会统计adam优化器指定模块(targets中指定)参数的update和ratio向量的数值分布,并展示在heatmap里,默认为false,同时format字段必须设置为tensorboard。
依赖histc算子, 需要CANN8.0.rc2以上版本, 否则会有严重的性能问题。**仅PyTorch场景支持此参数**。 |
| "xy_distribution" | 可选 | 若为true则会监控指定module(targets中指定)的输入输出张量。 默认为false。 |
@@ -710,7 +710,7 @@ TrainerMon.monitor_gnorm_with_ad(model, grad_acc_steps, optimizer, dp_group, tp_
| "backward_only" | 可选 | 开启xy_distribution后生效,若为true,仅监控指定module的反向,targets中的input、output不生效。默认为false。 |
| "mv_distribution" | 可选 | 若为true则会监控指定模块中的参数的优化器状态, 默认为false。版本= '2.0'
if not torch_version_above_or_equal_2:
@@ -1005,19 +999,11 @@ class TrainerMon:
logger.info(msg)
def _save_module_struct(self):
- if MPU_IMPORT:
- pp_group = mpu.get_pipeline_model_parallel_group()
- pp_group_list = torch.distributed.get_process_group_ranks(pp_group)
- save_module_struct = not dist.is_initialized() or dist.get_rank() in pp_group_list
- else:
- save_module_struct = (not dist.is_initialized()
- or (self.module_rank_list and dist.get_rank() == min(self.module_rank_list))
- or (not self.module_rank_list and dist.get_rank() == 0))
- if save_module_struct:
- module_struct_file = os.path.realpath(
- os.path.join(get_output_base_dir(), f'{dist.get_rank()}_module_struct.json'))
- save_json(module_struct_file, self.module_struct, indent=2)
- logger.info(f"> save module struct to {module_struct_file}")
+ output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+ os.makedirs(output_dir, exist_ok=True)
+ module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+ save_json(module_struct_file, self.module_struct, indent=2)
+ logger.info(f"> save module struct to {module_struct_file}")
self.struct_printed = True
def _is_target_param(self, param_name, param, prefix):
diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
index d0ff16a97d..ad10580ad5 100644
--- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
+++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
@@ -34,7 +34,8 @@ def squash_param_name(param_name, enable=True):
if not enable:
return param_name
name = ''
- for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
+ for pattern in ['^.*\.(layers?\..*)', '^.*\.(embeddings?\..*)', '^.*\.(final.*)', '^.*\.(output.*)',
+ '^.*\.(norm.*)']:
match = re.findall(pattern, param_name)
if match:
name += match[0]
--
Gitee
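For reference, a minimal standalone snippet illustrating the effect of the regex change in `squash_param_name` above. The parameter name below is an invented example, not taken from the patch: under the old pattern the captured group drops the `layers.` prefix, while the new anchored pattern keeps it and only strips the leading module path.

```python
import re

# Invented example name for illustration; not taken from the patch itself.
param_name = "module.module.language_model.encoder.layers.0.mlp.dense_h_to_4h.weight"

old_pattern = r'layers?\.(.*)'        # pattern removed by this patch
new_pattern = r'^.*\.(layers?\..*)'   # pattern added by this patch

print(re.findall(old_pattern, param_name))
# ['0.mlp.dense_h_to_4h.weight']        -> the 'layers.' prefix is lost in the capture
print(re.findall(new_pattern, param_name))
# ['layers.0.mlp.dense_h_to_4h.weight'] -> only the leading module path is stripped
```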
From 45e43b3cd53a142aae052a5abe200b0e585ce211 Mon Sep 17 00:00:00 2001
From: RanZheng <364167184@qq.com>
Date: Thu, 24 Jul 2025 16:53:12 +0800
Subject: [PATCH 2/3] Add the corresponding changes on the MindSpore side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../msprobe/mindspore/monitor/module_hook.py | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
index 1baf9ad29c..bcb146ea82 100644
--- a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
@@ -701,14 +701,11 @@ class TrainerMon:
index += 1
def _save_module_struct(self):
- save_module_struct = (not comm_is_initialized()
- or (self.module_rank_list and get_rank() == min(self.module_rank_list))
- or (not self.module_rank_list and get_rank() == 0))
-
- if save_module_struct:
- module_struct_file = os.path.realpath(os.path.join(get_output_base_dir(), 'module_struct.json'))
- save_json(module_struct_file, self.module_struct, indent=2)
- logger.info(f"> save module struct to {module_struct_file}")
+ output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+ os.makedirs(output_dir, exist_ok=True)
+ module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+ save_json(module_struct_file, self.module_struct, indent=2)
+ logger.info(f"> save module struct to {module_struct_file}")
self.struct_printed = True
def _hook_module(self, target_names, module, vpp_stage=''):
--
Gitee
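A minimal standalone sketch of the per-rank layout that both the PyTorch and MindSpore hunks converge on, using plain `os`/`json` stand-ins for msprobe's `get_output_base_dir` and `save_json` helpers, so it is illustrative rather than the tool's actual code path:

```python
import json
import os


def save_module_struct(module_struct, rank, output_base_dir):
    """Write one module_struct.json per rank under <base>/module_struct/rank{rank}/."""
    output_dir = os.path.join(output_base_dir, 'module_struct', f'rank{rank}')
    os.makedirs(output_dir, exist_ok=True)
    module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
    with open(module_struct_file, 'w', encoding='utf-8') as f:
        json.dump(module_struct, f, indent=2)
    return module_struct_file


# Example: every rank writes its own file instead of only rank 0 (or the
# smallest monitored rank) writing a single shared one.
# save_module_struct({"0:module": {}}, rank=3, output_base_dir="./monitor_output")
# -> ./monitor_output/module_struct/rank3/module_struct.json
```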
From d0e691dd44e9dec73e04a41c1c3cd46669a566c7 Mon Sep 17 00:00:00 2001
From: RanZheng <364167184@qq.com>
Date: Thu, 24 Jul 2025 20:00:39 +0800
Subject: [PATCH 3/3] Use the safe make_dir helper for directory creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
(cherry picked from commit d59fa0a3e1ffb797a48e985b60e147bcd448b8b8)
---
debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py | 4 ++--
debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
index bcb146ea82..375314bff2 100644
--- a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
@@ -27,7 +27,7 @@ from mindspore import nn, _no_grad
from msprobe.core.common.log import logger
from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.file_utils import load_json, save_json, make_dir
from msprobe.core.monitor.utils import validate_config, get_output_base_dir, get_target_output_dir
from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
from msprobe.mindspore.common.utils import is_mindtorch
@@ -702,7 +702,7 @@ class TrainerMon:
def _save_module_struct(self):
output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
- os.makedirs(output_dir, exist_ok=True)
+ make_dir(output_dir)
module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
save_json(module_struct_file, self.module_struct, indent=2)
logger.info(f"> save module struct to {module_struct_file}")
diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
index 875492687f..61e9ed4b00 100644
--- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
@@ -26,7 +26,7 @@ import pandas as pd
from torch.utils.hooks import BackwardHook
from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.file_utils import load_json, save_json, make_dir
from msprobe.core.common.decorator import recursion_depth_decorator
from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
from msprobe.core.common.file_utils import write_df_to_csv
@@ -1001,7 +1001,7 @@ class TrainerMon:
def _save_module_struct(self):
output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
- os.makedirs(output_dir, exist_ok=True)
+ make_dir(output_dir)
module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
save_json(module_struct_file, self.module_struct, indent=2)
logger.info(f"> save module struct to {module_struct_file}")
--
Gitee
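The patch only shows the call site, so the behavior of msprobe's `make_dir` is not visible here. As a rough, assumed illustration of what such a "safe" directory helper typically adds over a bare `os.makedirs` call (symlink resolution, restrictive permissions, a type check), a sketch under those assumptions, not the msprobe implementation:

```python
import os


def make_dir_sketch(path):
    """Hypothetical sketch of a hardened directory-creation helper (assumption,
    not msprobe's make_dir): resolve the path, create it with restrictive
    permissions, and fail loudly if a non-directory already occupies it."""
    real_path = os.path.realpath(path)                 # resolve symlinks first
    os.makedirs(real_path, mode=0o750, exist_ok=True)  # owner rwx, group rx only
    if not os.path.isdir(real_path):
        raise OSError(f"{real_path} exists and is not a directory")
    return real_path
```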