diff --git a/docs/distributed_debug.md b/docs/distributed_debug.md
index 1aa5ad1ed2de4b40adc0255488e6daebf69794a3..68bc072c33a09b61db50ea6df8dd788ff22c9e81 100644
--- a/docs/distributed_debug.md
+++ b/docs/distributed_debug.md
@@ -16,9 +16,33 @@
    exit
    ```

+   If the PyTorch backend is used, add the following two lines before `torch.distributed.launch pretrain_gpt.py` instead:
+
+   ```bash
+   # add these two lines
+   source tools/debug_utils/dist_debug_pt.sh
+   exit
+   ```
+
    This loads the debug configuration before launch and pauses the normal flow until debugging completes.

-2. **Add debug breakpoints in the code**
+2. **Modify the launch parameters**
+   In dist_debug.sh / dist_debug_pt.sh, change the Python entry script and its arguments inside the `run_pretrain` function to match your original launch script.
+   For example:
+
+   ```bash
+   python pretrain_gpt.py \
+       $GPT_ARGS \
+       $DATA_ARGS \
+       $OUTPUT_ARGS \
+       $MLA_ARGS \
+       $ROPE_ARGS \
+       $MOE_ARGS \
+       --distributed-backend nccl \
+       --ai-framework mindspore
+   ```
+
+3. **Add debug breakpoints in the code**

    Add the following two lines at the place in the Python code you want to debug:

@@ -33,7 +57,9 @@
    breakpoint_(False)
    ```

-3. **Start debugging**
+   This makes the other processes stop at the same position, preventing other ranks from running into the error and causing the scheduler to kill the cluster.
+
+4. **Start debugging**

    Launch the pretrain_xxx.sh script as usual and wait until the breakpoint is reached. Interactive output like the following indicates that breakpoint debugging mode has been entered:
    ```bash
@@ -44,15 +70,13 @@
    (Pdb)
    ```

-4. **After debugging here, call clear_() to unblock the other processes**
+   **Note: after debugging at this point, if `breakpoint_(False)` was used, you must additionally call clear_() to unblock the other processes.**

    ```python
    clear_()
    ```

-   This makes the other processes stop at the same position, preventing other ranks from running into the error and causing the scheduler to kill the cluster.
-
-5. **Common usage scenarios**:
+### Common usage scenarios

    Scenario 1) The code core dumps and the error stack cannot be obtained:

@@ -65,7 +89,9 @@
    Scenario 3) Accuracy issues with complex reproduction conditions, where variables at several locations must be printed over multiple rounds before the problem can be located:

-   Run the case, print the hash of the variable at the point where accuracy diverges, save it, and then add an if condition at the corresponding location
+   Run the case, print the hash (md5) of the variable whose accuracy diverges, save it, and then add an if condition at the corresponding location.
+
+   **Suggestion: for accuracy alignment, build a case that needs at most 4 cards, read and write data with interfaces such as np.save/np.load, and compare the values in a single-node interactive pdb session to locate the source of the difference.**

 ### Common pdb debugging commands

@@ -98,12 +124,14 @@ Common pdb commands:
 1. **The msrun distributed command**

    msrun is essentially a wrapper around dynamic cluster setup: it starts a scheduler process on the master node, which manages the creation of, and communication between, all worker processes,
-   and starts a number of worker processes on every node to execute the training task.
+   and starts a number of worker processes on every node to execute the training task. See the [msrun implementation](https://gitee.com/mindspore/mindspore/blob/master/mindspore/python/mindspore/parallel/cluster/process_entity/_api.py#L47).

    Taking a single-node 8-card Atlas 800TA2 job as an example, msrun starts one scheduler process and 8 worker processes on each node; by default, all worker processes run in the background.

    Because pdb requires the interactively debugged process to run in the foreground, an environment variable is used to specify the rank to debug, so that its process runs in the foreground while the other ranks keep running in the background as usual.

+   Note: `torch.distributed.launch` is likewise a wrapper around dynamic cluster setup; see the [torchrun implementation](https://github.com/pytorch/pytorch/blob/main/torch/distributed/run.py#L207).
+
 2. **Program execution flow**

    The flow chart below shows the state transitions of each process during the whole procedure.

@@ -133,7 +161,7 @@ Common pdb commands:
              ▼                               ▼
 ┌────────────────────────┐      ┌────────────────────────┐
 │     Debug process      │      │   Non-debug process    │
-│ (rank == MS_DEBUG_RANK)│      │ (rank ≠ MS_DEBUG_RANK) │
+│ (rank == RANK_TO_DEBUG)│      │ (rank ≠ RANK_TO_DEBUG) │
 └────────────────────────┘      └────────────────────────┘
             │                               │
             │                               │
diff --git a/tools/debug_utils/dist_debug.sh b/tools/debug_utils/dist_debug.sh
index 967be52b3c18231baefc797345b87407d139658d..44cea2c6f0a5da940ab9803dd8f8d897161f5526 100644
--- a/tools/debug_utils/dist_debug.sh
+++ b/tools/debug_utils/dist_debug.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 export PYTHONPATH=$PYTHONPATH:$THIS_SCRIPT_DIR
-export MS_DEBUG_RANK=0 # TO SET !!!!!!!!!
+export RANK_TO_DEBUG=0 # TO SET !!!!!!!!!
 export MS_WORKER_NUM=$WORLD_SIZE
 export CLUSTER_TIME_OUT=76800

@@ -14,12 +14,16 @@ if [ "$MS_SCHED_HOST" = "localhost" ]; then
 fi

 # original command, replace `msrun` with `python`
-run_pretrain() {
+run_pretrain() { # TO SET !!!!!!!!! e.g. posttrain_gpt.py
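+    # Example invocation only: substitute the entry script (e.g. posttrain_gpt.py)
+    # and the argument groups from your own pretrain_xxx.sh launch script here.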
     python pretrain_gpt.py \
-        ${GPT_ARGS} \
-        ${DATA_ARGS} \
-        ${OUTPUT_ARGS} \
-        --distributed-backend nccl
+        $GPT_ARGS \
+        $DATA_ARGS \
+        $OUTPUT_ARGS \
+        $MLA_ARGS \
+        $ROPE_ARGS \
+        $MOE_ARGS \
+        --distributed-backend nccl \
+        --ai-framework mindspore
 }

 # run scheduler
@@ -33,7 +37,7 @@ START_RANK=$(( NODE_RANK * NPUS_PER_NODE ))
 END_RANK=$(( START_RANK + NPUS_PER_NODE ))

 for ((worker_rank=START_RANK; worker_rank<END_RANK; worker_rank++)); do
     ... > worker_${worker_rank}.log 2>&1 &
 done

-if [ $MS_DEBUG_RANK -ge $START_RANK ] && [ $MS_DEBUG_RANK -lt $END_RANK ]; then
-    export MS_NODE_ID=$MS_DEBUG_RANK
-    echo "DEBUGGING worker in current process, global rank=${MS_DEBUG_RANK}"
+if [[ $RANK_TO_DEBUG -ge $START_RANK ]] && [[ $RANK_TO_DEBUG -lt $END_RANK ]]; then
+    export MS_NODE_ID=$RANK_TO_DEBUG
+    echo "DEBUGGING worker in current process, global rank=${RANK_TO_DEBUG}"
     run_pretrain
 fi
\ No newline at end of file
diff --git a/tools/debug_utils/dist_debug_pt.sh b/tools/debug_utils/dist_debug_pt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7db0c1fbf8058d24a7d31ebfc678f0f0a8f5ea7f
--- /dev/null
+++ b/tools/debug_utils/dist_debug_pt.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+THIS_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+export PYTHONPATH=$PYTHONPATH:$THIS_SCRIPT_DIR
+export RANK_TO_DEBUG=0 # TO SET !!!!!!!!!
+export HCCL_EXEC_TIMEOUT=76800
+
+# avoid OpenMP thread and multi-process resource conflicts
+if [ -z "${OMP_NUM_THREADS}" ] && [ "${NPUS_PER_NODE}" -gt 1 ]; then
+    export OMP_NUM_THREADS=1
+fi
+
+export WORLD_SIZE=$(( NPUS_PER_NODE * NNODES ))  # global process count
+export LOCAL_WORLD_SIZE=$NPUS_PER_NODE           # local process count
+export NODE_RANK=$NODE_RANK
+
+# MASTER env, required by the torch multiprocessing logic
+export MASTER_ADDR=$MASTER_ADDR
+export MASTER_PORT=$MASTER_PORT
+
+export TORCHELASTIC_RESTART_COUNT="${TORCHELASTIC_RESTART_COUNT:-0}"  # restarts so far
+export TORCHELASTIC_MAX_RESTARTS="${TORCHELASTIC_MAX_RESTARTS:-0}"    # max restart count
+export TORCHELASTIC_RUN_ID="${TORCHELASTIC_RUN_ID:-$(uuidgen)}"       # unique run ID
+
+# since localhost can't always be resolved, replace it with 127.0.0.1
+if [ "$MASTER_ADDR" = "localhost" ]; then
+    export MASTER_ADDR="127.0.0.1"
+fi
+
+# original command, replace `torch.distributed.launch` with `python`
+run_pretrain() { # TO SET !!!!!!!!! e.g. posttrain_gpt.py
+    python pretrain_gpt.py \
+        $GPT_ARGS \
+        $DATA_ARGS \
+        $OUTPUT_ARGS \
+        $MLA_ARGS \
+        $ROPE_ARGS \
+        $MOE_ARGS \
+        --distributed-backend nccl
+}
+
+
+# run worker
+START_RANK=$(( NODE_RANK * NPUS_PER_NODE ))
+END_RANK=$(( START_RANK + NPUS_PER_NODE ))
+
+for ((worker_rank=START_RANK; worker_rank<END_RANK; worker_rank++)); do
+    ... > worker_${worker_rank}.log 2>&1 &
+done
+
+if [[ $RANK_TO_DEBUG -ge $START_RANK ]] && [[ $RANK_TO_DEBUG -lt $END_RANK ]]; then
+    export RANK=$RANK_TO_DEBUG
+    export LOCAL_RANK=$(( RANK_TO_DEBUG % NPUS_PER_NODE ))
+    export GROUP_RANK=$NODE_RANK
+    export ROLE_RANK=$RANK_TO_DEBUG
+    echo "DEBUGGING worker in current process, global rank=${RANK_TO_DEBUG}"
+    run_pretrain
+fi
\ No newline at end of file
diff --git a/tools/debug_utils/pdb_utils.py b/tools/debug_utils/pdb_utils.py
index d48931b7b7e2baab881c0cc4a8b8c9db5219b2cc..985a804bef85bdb0c83b9fa23d0bc054942344d4 100644
--- a/tools/debug_utils/pdb_utils.py
+++ b/tools/debug_utils/pdb_utils.py
@@ -36,10 +36,10 @@ def breakpoint_(non_block=True):
         block: whether to block other ranks. In case that
     """
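+    # Only the rank named by RANK_TO_DEBUG drops into an interactive pdb session;
+    # the other ranks either block until clear_() is called (non_block=False)
+    # or keep running (non_block=True).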
     current_rank = torch.distributed.get_rank()
-    debug_rank = os.environ.get("MS_DEBUG_RANK")
+    debug_rank = os.environ.get("RANK_TO_DEBUG")
     if current_rank is None or debug_rank is None:
-        raise Exception(f"MS_DEBUG_RANK/MS_NODE_ID can't be None in debug mode, "
-                        f"MS_NODE_ID: `{current_rank}`, MS_DEBUG_RANK: `{debug_rank}`")
+        raise Exception(f"RANK_TO_DEBUG/rank can't be None in debug mode, "
+                        f"rank: `{current_rank}`, RANK_TO_DEBUG: `{debug_rank}`")
     logging.info(f"current_rank: {current_rank}, debug_rank: {debug_rank}")
     counter_file = _get_counter_file()
@@ -49,7 +49,7 @@
         f.write(f"DEBUGGING{counter_file}")
         logging.info(f"[{time.time()}]counter_file created: {counter_file}")
         import pdb
-        pdb.set_trace() # press `n` and then `enter` to reach your code
+        pdb.set_trace() # press `n` and then `Enter` to reach your code
     elif not non_block:
         logging.info(f"[{time.time()}]waiting counter_file to be created...")
         while not os.path.exists(counter_file):
@@ -61,7 +61,7 @@
         logging.info(f"[{time.time()}]counter_file cleaned, continue")
     else:
         # otherwise, other ranks don't need to wait
-        pass
+        logging.info(f"[{time.time()}]non-blocking breakpoint_, background rank continues")


 def clear_():
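+    # Called from the pdb session on the debug rank once debugging is done, so that
+    # ranks blocked in breakpoint_(False) can continue (see docs/distributed_debug.md).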