diff --git a/debug/accuracy_tools/msprobe/docs/19.monitor.md b/debug/accuracy_tools/msprobe/docs/19.monitor.md
index f57fadf9afda3b4718da21cbce8714e0632e006e..f7c9028b26f557ad9ad40418d0593c0a158905b4 100644
--- a/debug/accuracy_tools/msprobe/docs/19.monitor.md
+++ b/debug/accuracy_tools/msprobe/docs/19.monitor.md
@@ -228,7 +228,7 @@ monitor.monitor_gnorm_with_ad(
 module_name可以通过nn.Module的接口named_modules()获取。
 
 #### 打印模型结构
-工具提供可选项`print_struct`打印模型结构，帮助配置targets。工具会在在第一个step后打印结构并停止训练进程，模型结构默认打印在`$MONITOR_OUTPUT_DIR/module_struct.json`。
+工具提供可选项`print_struct`打印模型结构，帮助配置targets。工具会在第一个step后打印结构并停止训练进程，每张卡上的模型结构默认保存在`$MONITOR_OUTPUT_DIR/module_struct/rank{rank}/module_struct.json`，其中`{rank}`为对应的卡号。
 ```json
 {
     "print_struct": true
@@ -701,7 +701,7 @@ TrainerMon.monitor_gnorm_with_ad(model, grad_acc_steps, optimizer, dp_group, tp_
 | "collect_times"         | 可选     | 设置采集次数，达到该次数后停止监控，默认值为100000000，目的是一直采集。                                                                                                                                                                                                                                                                                                                                        |
 | "start_step"            | 可选     | 设置开始采集step，模型训练达到start_step后开始监控采集，默认值为0，表示从step0开始监控采集。注：在动态启停模式下该设置不生效，只会从下一步开始监控采集。                                                                                                                                                                                                                                                                                          |
 | "step_interval"         | 可选     | 设置采集step间隔，默认值为1，表示每个step均采集监控数据。                                                                                                                                                                                                                                                                                                                                               |
-| "print_struct"          | 可选     | 设置为true后监控工具会打印模型中torch module的名字和详细结构，并在第1个step后退出。不填默认为false。**仅PyTorch场景支持此参数**。                                                                                                                                                                                                                                                                                             |
+| "print_struct"          | 可选     | 设置为true后监控工具会打印每张卡模型中module的名字和详细结构，并在第1个step后退出。不填默认为false。                                                                                                                                                                                                                                                                                                                    |
 | "module_ranks"          | 可选     | 用于在分布式训练场景中希望控制在哪些rank开启module监控。如果不填，则默认在所有rank开启。 列表内rank要求为int类型。                                                                                                                                                                                                                                                                                                            |
 | "ur_distribution"       | 可选     | 若为true则会统计adam优化器指定模块（targets中指定）参数的update和ratio向量的数值分布，并展示在heatmap里，默认为false，同时format字段必须设置为tensorboard。<br/>依赖histc算子， 需要CANN8.0.rc2以上版本， 否则会有严重的性能问题。**仅PyTorch场景支持此参数**。                                                                                                                                                                                                    |
 | "xy_distribution"       | 可选     | 若为true则会监控指定module（targets中指定）的输入输出张量。 默认为false。                                                                                                                                                                                                                                                                                                                                |
@@ -710,7 +710,7 @@ TrainerMon.monitor_gnorm_with_ad(model, grad_acc_steps, optimizer, dp_group, tp_
 | "backward_only"         | 可选     | 开启xy_distribution后生效，若为true，仅监控指定module的反向，targets中的input、output不生效。默认为false。                                                                                                                                                                                                                                                                                                   |
 | "mv_distribution"       | 可选     | 若为true则会监控指定模块中的参数的优化器状态， 默认为false。版本<msprobe1.2.2时需要在TrainerMon构造函数正确指定opt_ty。                                                                                                                                                                                                                                                                                                 |
 | "wg_distribution"       | 可选     | 若为true则会监控指定模块的参数梯度， 默认为false。                                                                                                                                                                                                                                                                                                                                                  |
-| "monitor_mbs_grad" | 可选     | 若为true则会监控mbs粒度梯度统计量，默认为false。                                                                                                                                                                                                                                                                                                                       |
+| "monitor_mbs_grad" | 可选     | 若为true则会监控mbs粒度梯度统计量，默认为false。                                                                                                                                                                                                                                                                                                                                                  |
 | "param_distribution"    | 可选     | 若为true则会监控指定模块的参数， 默认为false。                                                                                                                                                                                                                                                                                                                                                    |
 | "alert"                 | 可选     | "rules": 指定自动报警的异常检测机制及其相应的阈值。目前实现的异常检测是AnomalyTurbulence， 如果统计标量超出历史均值的指定浮动范围（threshold 0.5意味着上浮或者下浮50%）则在控制台打印报警信息。当"dump"字段配置为true表示异常事件写入文件，默认为false。**仅PyTorch场景支持此参数**。                                                                                                                                                                                                   |
 | "cc_distribution"       | 可选     | 其中"enable"字段控制通信监控模块的开关，仅支持在多卡训练时开启；需要监控通信算子时，务必尽量早地实例化`TrainerMon`, 因为监控通过劫持原始func后挂hook实现，部分加速库初始化时会保存原始function，避免监控失效。"cc_codeline"字段指定监控的代码行，如:`train.py\\[23\\]`，默认为空列表，不特别指定；"cc_pre_hook"字段控制是否监控通输入； 模块会在第二个optimize.step之前打印通信日志，包括通信api的调用栈、输入dtype、通信group。 "cc_log_only"为true时，仅打印日志，不监控通信的输入输出，并在打印后中断训练。可以根据通信日志设置"cc_codeline"，规避与训练过程不相关的通信，比如一些时间、metrics的同步。 |
@@ -718,8 +718,8 @@ TrainerMon.monitor_gnorm_with_ad(model, grad_acc_steps, optimizer, dp_group, tp_
 | "format"                | 可选     | 数据落盘格式，默认值为"csv"，可选 \["csv", "tensorboard", "api"\]。仅PyThon和MindSpore动态图场景支持此参数，且MindSpore动态图场景仅支持\["csv"\]。                                                                                                                                                                                                                                                                    |
 | "ops"                   | 可选     | 类型为list，与ur_distribution、xy_distribution、mv_distribution、wg_distribution、mg_direction、cc_distribution配合，监控所选张量的统计指标，目前支持"min"、"max"、"norm"、"mean"、"zeros"、"nans"。其中，zeros代表监控所选张量的元素小于eps的比例，nans代表张量中nan的数量。当ops中无有效指标时，默认监控norm指标。                                                                                                                                            |
 | "eps"                   | 可选     | 若ops里包含"zeros"则需要配置，默认为1e-8。                                                                                                                                                                                                                                                                                                                                                    |
-| "ndigits"               | 可选     | "format"为"csv"时，设置落盘文件中的小数位数，默认为6。                                                                                                                                                                                                                                                                                                                          |
+| "ndigits"               | 可选     | "format"为"csv"时，设置落盘文件中的小数位数，默认为6。                                                                                                                                                                                                                                                                                                                                              |
 | "step_count_per_record" | 可选     | "format"为"csv"时生效，每个csv记录多少个step的数据，默认为1。                                                                                                                                                                                                                                                                                                                                       |
 | "append_output"         | 可选     | 适用于断点续训场景。多卡场景下生效，指定两个时间戳，将输出续写到这两个时间戳范围间的输出件中，不在范围内的rank不被续写。时间戳应来自原有输出件目录前缀，例如["Dec03_21-34-40", "Dec03_21-34-41"]。默认为[]，不续写。**仅PyTorch场景支持此参数**。                                                                                                                                                                                                                             |
-| "squash_name"           | 可选     | 是否简化参数名/模块名，多模态场景建议关闭，默认为True                                                                                                                                                                                                                                                                                                                                                   |
+| "squash_name"           | 可选     | 是否简化参数名/模块名，多模态场景建议关闭，默认为True。                                                                                                                                                                                                                                                                                                                                                  |
 
diff --git a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
index 82eeed19864e9caee4122138a1d12e1ba6af53ea..375314bff28bd25ed251baea68156395a2bc9ad9 100644
--- a/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/mindspore/monitor/module_hook.py
@@ -27,7 +27,7 @@ from mindspore import nn, _no_grad
 
 from msprobe.core.common.log import logger
 from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.file_utils import load_json, save_json, make_dir
 from msprobe.core.monitor.utils import validate_config, get_output_base_dir, get_target_output_dir
 from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
 from msprobe.mindspore.common.utils import is_mindtorch
@@ -77,7 +77,8 @@ def param_is_data_parallel_duplicate(dp_group):
 
 
 def squash_param_name(param_name):
-    for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
+    for pattern in ['^.*\.(layers?\..*)', '^.*\.(embeddings?\..*)', '^.*\.(final.*)', '^.*\.(output.*)',
+                    '^.*\.(norm.*)']:
         match = re.findall(pattern, param_name)
         if match:
             return match[0]
@@ -700,14 +701,11 @@ class TrainerMon:
                 index += 1
 
     def _save_module_struct(self):
-        save_module_struct = (not comm_is_initialized()
-                              or (self.module_rank_list and get_rank() == min(self.module_rank_list))
-                              or (not self.module_rank_list and get_rank() == 0))
-
-        if save_module_struct:
-            module_struct_file = os.path.realpath(os.path.join(get_output_base_dir(), 'module_struct.json'))
-            save_json(module_struct_file, self.module_struct, indent=2)
-            logger.info(f"> save module struct to {module_struct_file}")
+        output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+        make_dir(output_dir)
+        module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+        save_json(module_struct_file, self.module_struct, indent=2)
+        logger.info(f"> save module struct to {module_struct_file}")
         self.struct_printed = True
 
     def _hook_module(self, target_names, module, vpp_stage=''):
diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
index edda3d4b59f26b0ad837f13ef74447a2f1eab3a2..61e9ed4b00d2e285849a68d200db6bfa137b850a 100644
--- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
+++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_hook.py
@@ -26,7 +26,7 @@ import pandas as pd
 from torch.utils.hooks import BackwardHook
 
 from msprobe.core.common.const import MonitorConst, Const
-from msprobe.core.common.file_utils import load_json, save_json
+from msprobe.core.common.file_utils import load_json, save_json, make_dir
 from msprobe.core.common.decorator import recursion_depth_decorator
 from msprobe.core.monitor.anomaly_processor import AnomalyScanner, AnomalyDataFactory, AnomalyDataWriter
 from msprobe.core.common.file_utils import write_df_to_csv
@@ -46,12 +46,6 @@ from msprobe.pytorch.monitor.module_metric import get_metrics, get_summary_write
 from msprobe.pytorch.monitor.optimizer_collect import OptimizerMonFactory
 from msprobe.pytorch.monitor.visualizer import HeatmapVisualizer
 
-try:
-    from megatron.core import mpu
-except ImportError:
-    MPU_IMPORT = False
-else:
-    MPU_IMPORT = True
 
 torch_version_above_or_equal_2 = torch.__version__.split('+')[0] >= '2.0'
 if not torch_version_above_or_equal_2:
@@ -1006,19 +1000,11 @@ class TrainerMon:
             logger.info(msg)
 
     def _save_module_struct(self):
-        if MPU_IMPORT:
-            pp_group = mpu.get_pipeline_model_parallel_group()
-            pp_group_list = torch.distributed.get_process_group_ranks(pp_group)
-            save_module_struct = not dist.is_initialized() or dist.get_rank() in pp_group_list
-        else:
-            save_module_struct = (not dist.is_initialized()
-                                  or (self.module_rank_list and dist.get_rank() == min(self.module_rank_list))
-                                  or (not self.module_rank_list and dist.get_rank() == 0))
-        if save_module_struct:
-            module_struct_file = os.path.realpath(
-                os.path.join(get_output_base_dir(), f'{dist.get_rank()}_module_struct.json'))
-            save_json(module_struct_file, self.module_struct, indent=2)
-            logger.info(f"> save module struct to {module_struct_file}")
+        output_dir = os.path.join(get_output_base_dir(), 'module_struct', f'rank{self.rank}')
+        make_dir(output_dir)
+        module_struct_file = os.path.realpath(os.path.join(output_dir, 'module_struct.json'))
+        save_json(module_struct_file, self.module_struct, indent=2)
+        logger.info(f"> save module struct to {module_struct_file}")
         self.struct_printed = True
 
     def _is_target_param(self, param_name, param, prefix):
diff --git a/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py b/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
index d0ff16a97d2410362a883429becdd49559674ff3..ad10580ad55b47f8604ee4dfc27a3290ed693bf9 100644
--- a/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
+++ b/debug/accuracy_tools/msprobe/pytorch/monitor/module_metric.py
@@ -34,7 +34,8 @@ def squash_param_name(param_name, enable=True):
     if not enable:
         return param_name
     name = ''
-    for pattern in ['layers?\.(.*)', 'embeddings?\.(.*)', 'final.*', 'output.*', 'norm.*']:
+    for pattern in ['^.*\.(layers?\..*)', '^.*\.(embeddings?\..*)', '^.*\.(final.*)', '^.*\.(output.*)',
+                    '^.*\.(norm.*)']:
         match = re.findall(pattern, param_name)
         if match:
             name += match[0]