From e0d8f4f37f97f1fc6bca26db9d89cc4bb6dade56 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 8 May 2025 22:04:19 +0800
Subject: [PATCH 01/15] fix pipeline mistakes: point test configs at /data/hf and /data/pipeline asset paths

---
 .../coverage/checkpoint/test_checkpoint.json  |  2 +-
 tests/coverage/evaluation/test_evaluate.json  |  4 +-
 tests/coverage/inference/test_inference.json  | 18 ++++-----
 .../test_process_instruction_data.json        | 18 ++++-----
 .../test_process_pretrain_data.json           |  6 +--
 .../pipeline/baichuan2-13B/param_config.json  |  4 +-
 tests/pipeline/common/test_checkpoint.json    |  8 ++--
 tests/pipeline/common/test_inference.json     | 38 +++++++++----------
 .../test_process_instruction_data_lf.json     | 16 ++++----
 .../test_process_instruction_pack_data.py     |  2 +-
 .../common/test_process_pairwise_data_lf.json |  6 +--
 tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh |  2 +-
 .../rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh  |  2 +-
 .../rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh |  2 +-
 .../tune_qwen7b_tp8_pp1_full_ptd.sh           |  2 +-
 15 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/tests/coverage/checkpoint/test_checkpoint.json b/tests/coverage/checkpoint/test_checkpoint.json
index 20fdc1cb9..b6055dead 100644
--- a/tests/coverage/checkpoint/test_checkpoint.json
+++ b/tests/coverage/checkpoint/test_checkpoint.json
@@ -91,7 +91,7 @@
                 "save-model-type":"mg",
                 "target-tensor-parallel-size": "4",
                 "target-pipeline-parallel-size": "2",
-                "load-dir":"/data/hf/qwen2.5-7B",
+                "load-dir":"/data/hf/Qwen2.5-7B",
                 "save-dir":"/data/ci/Qwen2.5-mg",
                 "tokenizer-model":"/data/ci/Qwen2.5-7B",
                 "use-mcore-models": null,
diff --git a/tests/coverage/evaluation/test_evaluate.json b/tests/coverage/evaluation/test_evaluate.json
index d7e752f3e..3a2fa47f1 100644
--- a/tests/coverage/evaluation/test_evaluate.json
+++ b/tests/coverage/evaluation/test_evaluate.json
@@ -29,7 +29,7 @@
                 "no-load-optim": null,
                 "load":"/data/ci/ckpt",
                 "tokenizer-type":"PretrainedFromHF",
-                "tokenizer-name-or-path":"/data/llama-2-7b-hf",
+                "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf",
                 "use-deter-comp": null
             }
         }
@@ -65,7 +65,7 @@
                 "no-load-optim": null,
                 "load":"/data/ci/ckpt",
                 "tokenizer-type":"PretrainedFromHF",
-                "tokenizer-name-or-path":"/data/llama-2-7b-hf",
+                "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf",
                 "use-deter-comp": null
             }
         }
diff --git a/tests/coverage/inference/test_inference.json b/tests/coverage/inference/test_inference.json
index 2f5602ae5..2050e713f 100644
--- a/tests/coverage/inference/test_inference.json
+++ b/tests/coverage/inference/test_inference.json
@@ -25,14 +25,14 @@
                 "normalization": "RMSNorm", 
                 "load":"/data/ci/ckpt",
                 "tokenizer-type": "PretrainedFromHF",  
-                "tokenizer-name-or-path":"/data/llama-2-7b-hf",
-                "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", 
+                "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf",
+                "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model",
                 "disable-bias-linear": null,
-                "attention-softmax-in-fp32": null, 
-                "untie-embeddings-and-output-weights": null, 
-                "no-masked-softmax-fusion": null, 
-                "no-load-optim": null, 
-                "no-load-rng": null, 
+                "attention-softmax-in-fp32": null,
+                "untie-embeddings-and-output-weights": null,
+                "no-masked-softmax-fusion": null,
+                "no-load-optim": null,
+                "no-load-rng": null,
                 "fp16": null,
                 "task":"greedy",
                 "use-deter-comp": null,
@@ -67,8 +67,8 @@
                 "normalization": "RMSNorm",
                 "load":"/data/ci/ckpt",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path":"/data/llama-2-7b-hf",
-                "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model",
+                "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf",
+                "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model",
                 "disable-bias-linear": null,
                 "attention-softmax-in-fp32": null,
                 "untie-embeddings-and-output-weights": null,
diff --git a/tests/coverage/process_data/test_process_instruction_data.json b/tests/coverage/process_data/test_process_instruction_data.json
index 772ebb42a..c553b2f75 100644
--- a/tests/coverage/process_data/test_process_instruction_data.json
+++ b/tests/coverage/process_data/test_process_instruction_data.json
@@ -17,7 +17,7 @@
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "GeneralInstructionHandler",
                 "output-prefix": "/data/process_dataset/test_ins_subs/part1", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000,
                 "append-eod": null
@@ -30,8 +30,8 @@
                 "input": "/data/process_dataset/0002-alpaca.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "GeneralInstructionHandler",
-                "output-prefix": "/data/process_dataset/test_ins_subs/part2", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "output-prefix": "/data/process_dataset/test_ins_subs/part2",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000,
                 "append-eod": null
@@ -61,8 +61,8 @@
                 "input": "/data/process_dataset/0001-alpaca.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000,
                 "overwrite-cache": null,
@@ -76,8 +76,8 @@
                 "input": "/data/process_dataset/0001-alpaca.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style_pack", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style_pack",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000,
                 "overwrite-cache": null,
@@ -94,8 +94,8 @@
                 "input": "/data/process_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "SharegptStyleInstructionHandler",
-                "output-prefix": "/data/process_dataset/test_instruction_handler/sharegpt_style", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "output-prefix": "/data/process_dataset/test_instruction_handler/sharegpt_style",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000,
                 "prompt-type" : "llama2"
diff --git a/tests/coverage/process_data/test_process_pretrain_data.json b/tests/coverage/process_data/test_process_pretrain_data.json
index 5c68471dc..84359d49a 100644
--- a/tests/coverage/process_data/test_process_pretrain_data.json
+++ b/tests/coverage/process_data/test_process_pretrain_data.json
@@ -16,7 +16,7 @@
                 "input": "/data/process_dataset/0001-alpaca.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "output-prefix": "/data/process_dataset/test_merge_subs/part1", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000
             }
@@ -27,8 +27,8 @@
             "params": {
                 "input": "/data/process_dataset/0002-alpaca.parquet",
                 "tokenizer-type": "PretrainedFromHF",
-                "output-prefix": "/data/process_dataset/test_merge_subs/part2", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
+                "output-prefix": "/data/process_dataset/test_merge_subs/part2",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
                 "workers": 4,
                 "log-interval": 1000
             }
diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json
index 2a6bc9f29..05ab4ec55 100644
--- a/tests/pipeline/baichuan2-13B/param_config.json
+++ b/tests/pipeline/baichuan2-13B/param_config.json
@@ -24,7 +24,7 @@
                 "fp16": null,
                 "no-load-rng": null,
                 "no-load-optim": null,
-                "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base",
+                "load": "/data/pipeline/Baichuan2-13B-tp8pp1-mcore-hf-layer2",
                 "tokenizer-type": "PretrainedFromHF",
                 "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/",
                 "use-deter-comp": null
@@ -85,7 +85,7 @@
                 "load-model-type": "hf",
                 "save-model-type": "mg",
                 "target-tensor-parallel-size": "8",
-                "load-dir": "/data/datasets/baichuan2-13B-data/enwiki/",
+                "load-dir": "/data/hf/baichuan2-13B-hf",
                 "save-dir": "/data/cache",
                 "tokenizer-model": "/data/hf/baichuan2-13B-hf/tokenizer.model",
                 "params-dtype": "bf16",
diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json
index 23c4f6a19..f5504ba7f 100644
--- a/tests/pipeline/common/test_checkpoint.json
+++ b/tests/pipeline/common/test_checkpoint.json
@@ -292,10 +292,10 @@
                 "target-tensor-parallel-size": "2",
                 "target-pipeline-parallel-size": "4",
                 "num-layer-list": "6,8,8,10",
-                "load-dir":"/data/llama-2-7b-hf",
-                "save-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test",
+                "load-dir":"/data/hf/llama-2-7b-hf",
+                "save-dir":"/data/hf/llama-2-7b-hf-hf2ml-tp2pp4dypp-test",
                 "model-type-hf": "llama2",
-                "tokenizer-model":"/data/llama-2-7b-hf/tokenizer.model"
+                "tokenizer-model":"/data/hf/llama-2-7b-hf/tokenizer.model"
             }
         },
         {
@@ -321,7 +321,7 @@
                 "target-tensor-parallel-size": "1",
                 "target-pipeline-parallel-size": "1",
                 "load-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test",
-                "save-dir":"/data/llama-2-7b-hf",
+                "save-dir":"/data/hf/llama-2-7b-hf",
                 "model-type-hf": "llama2"
             }
         },
diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json
index d76f57dab..58b6c2e95 100644
--- a/tests/pipeline/common/test_inference.json
+++ b/tests/pipeline/common/test_inference.json
@@ -15,18 +15,18 @@
                 "make-vocab-size-divisible-by": 1,
                 "normalization": "RMSNorm", 
                 "position-embedding-type": "rope", 
-                "load":"/data/llama2-7B-tp8-pp1",
-                "tokenizer-type": "PretrainedFromHF",  
-                "tokenizer-name-or-path":"/data/llama-2-7b-hf",
-                "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", 
+                "load":"/data/pipeline/llama2-7B-tp8-pp1",
+                "tokenizer-type": "PretrainedFromHF",
+                "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf",
+                "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model",
                 "disable-bias-linear": null,
-                "use-fused-rmsnorm": null, 
+                "use-fused-rmsnorm": null,
                 "swiglu": null,
-                "attention-softmax-in-fp32": null, 
-                "untie-embeddings-and-output-weights": null, 
-                "no-masked-softmax-fusion": null, 
-                "no-load-optim": null, 
-                "no-load-rng": null, 
+                "attention-softmax-in-fp32": null,
+                "untie-embeddings-and-output-weights": null,
+                "no-masked-softmax-fusion": null,
+                "no-load-optim": null,
+                "no-load-rng": null,
                 "fp16": null,
                 "task":"greedy",
                 "max-new-tokens": 30,
@@ -54,10 +54,10 @@
                 "num-attention-heads": 32,
                 "max-position-embeddings": 4096 ,
                 "swiglu": null,
-                "load": "/data/llama2-7B-tp8-pp1",
+                "load": "/data/pipeline/llama2-7B-tp8-pp1",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf",
-                "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf",
+                "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model",
                 "tokenizer-not-use-fast": null,
                 "fp16": null,
                 "normalization": "RMSNorm" ,
@@ -111,10 +111,10 @@
                 "max-new-tokens": 30,
                 "micro-batch-size": 1,
                 "global-batch-size": 1,
-                "load":"/data/chatglm3-6b-base-mg-tp1pp2-mcore-base",
+                "load":"/data/pipeline/chatglm3-6b-base-mg-tp1pp2-mcore-base",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path":"/data/chatglm3-6b-base-hf/",
-                "tokenizer-model": "/data/chatglm3-6b-base-hf/tokenizer.model",
+                "tokenizer-name-or-path":"/data/hf/chatglm3-6b-base-hf/",
+                "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model",
                 "tokenizer-not-use-fast": null,
                 "untie-embeddings-and-output-weights": null,
                 "attention-softmax-in-fp32": null,
@@ -158,10 +158,10 @@
                 "max-new-tokens": 30,
                 "micro-batch-size": 1,
                 "global-batch-size": 16,
-                "load":"/data/pipe/chatglm3-6b-base-mg-tp1pp2-legacy-base",
+                "load":"/data/pipeline/chatglm3-6b-base-mg-tp1pp2-legacy-base",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path":"/data/chatglm3-6b-base-hf/",
-                "tokenizer-model": "/data/chatglm3-6b-base-hf/tokenizer.model",
+                "tokenizer-name-or-path":"/data/hf/chatglm3-6b-base-hf/",
+                "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model",
                 "tokenizer-not-use-fast": null,
                 "untie-embeddings-and-output-weights": null,
                 "attention-softmax-in-fp32": null,
diff --git a/tests/pipeline/common/test_process_instruction_data_lf.json b/tests/pipeline/common/test_process_instruction_data_lf.json
index d50aaa4ca..65edde86f 100644
--- a/tests/pipeline/common/test_process_instruction_data_lf.json
+++ b/tests/pipeline/common/test_process_instruction_data_lf.json
@@ -5,7 +5,7 @@
                 "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/cache/tune_dataset/alpaca/alpaca",
+                "output-prefix": "/data/tune_dataset/alpaca/alpaca",
                 "overwrite-cache": null,
                 "tokenizer-name-or-path": "/data/qwen-7b/",
                 "workers": 4,
@@ -20,7 +20,7 @@
                 "input": "/data/tune_dataset/oaast_sft.json",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his", 
+                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his",
                 "tokenizer-name-or-path": "/data/qwen-7b/",
                 "overwrite-cache": null,
                 "workers": 4,
@@ -28,13 +28,13 @@
                 "prompt-type": "qwen",
                 "map-keys": "{\"history\":\"history\"}"
             }
-        }, 
+        },
         {
             "params": {
                 "input": "/data/tune_dataset/oaast_sft.json",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his_seq1024", 
+                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his_seq1024",
                 "tokenizer-name-or-path": "/data/qwen-7b/",
                 "overwrite-cache": null,
                 "workers": 4,
@@ -52,7 +52,7 @@
                 "input": "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "SharegptStyleInstructionHandler",
-                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", 
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
                 "tokenizer-name-or-path": "/data/qwen-7b/",
                 "workers": 4,
                 "overwrite-cache": null,
@@ -68,7 +68,7 @@
                 "input": "/data/tune_dataset/sss.json",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "SharegptStyleInstructionHandler",
-                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", 
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
                 "tokenizer-name-or-path": "/data/qwen-7b/",
                 "overwrite-cache": null,
                 "workers": 4,
@@ -84,8 +84,8 @@
                 "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "AlpacaStyleInstructionHandler",
-                "output-prefix": "/data/tune_dataset/prompt_abstract/alpaca", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf/",
+                "output-prefix": "/data/tune_dataset/prompt_abstract/alpaca",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf/",
                 "overwrite-cache": null,
                 "workers": 4,
                 "log-interval": 1000,
diff --git a/tests/pipeline/common/test_process_instruction_pack_data.py b/tests/pipeline/common/test_process_instruction_pack_data.py
index a5f7cf1cc..2dcaa0a3c 100644
--- a/tests/pipeline/common/test_process_instruction_pack_data.py
+++ b/tests/pipeline/common/test_process_instruction_pack_data.py
@@ -18,7 +18,7 @@ class TestProcessInstructionData:
             "--tokenizer-type", "PretrainedFromHF",
             "--handler-name", "GeneralInstructionHandler",
             "--output-prefix", "/data/tune_pack_dataset/alpaca_pack",
-            "--tokenizer-name-or-path", "/data/llama-2-7b-hf",
+            "--tokenizer-name-or-path", "/data/hf/llama-2-7b-hf",
             "--workers", "4",
             "--log-interval", "1000",
             "--append-eod",
diff --git a/tests/pipeline/common/test_process_pairwise_data_lf.json b/tests/pipeline/common/test_process_pairwise_data_lf.json
index bc76d7266..04d0cbcaf 100644
--- a/tests/pipeline/common/test_process_pairwise_data_lf.json
+++ b/tests/pipeline/common/test_process_pairwise_data_lf.json
@@ -5,7 +5,7 @@
                 "input": "/data/pairwise_dataset/orca_rlhf.jsonl",
                 "tokenizer-type": "PretrainedFromHF",
                 "output-prefix": "/data/pairwise_dataset/output/orca_rlhf/orca_rlhf", 
-                "tokenizer-name-or-path": "/data/llama-2-7b-hf/",
+                "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf/",
                 "workers": 4,
                 "log-interval": 1000,
                 "handler-name": "AlpacaStylePairwiseHandler",
@@ -21,8 +21,8 @@
                 "input": "/data/pairwise_dataset/dpo_en.json",
                 "tokenizer-type": "PretrainedFromHF",
                 "handler-name": "SharegptStylePairwiseHandler",
-                "output-prefix": "/data/pairwise_dataset/output/dpo_en/dpo_en", 
-                "tokenizer-name-or-path": "/data/chatglm3-6b-base-hf/",
+                "output-prefix": "/data/pairwise_dataset/output/dpo_en/dpo_en",
+                "tokenizer-name-or-path": "/data/hf/chatglm3-6b-base-hf/",
                 "workers": 4,
                 "log-interval": 1000,
                 "prompt-type": "chatglm3"
diff --git a/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh b/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh
index 6ed5692c2..afe21fbe4 100644
--- a/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh
+++ b/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh
@@ -12,7 +12,7 @@ basepath=$(cd `dirname $0`; cd ../../../; pwd)
 
 CKPT_SAVE_DIR="./chatglm3_reward_ckpt"
 DATA_PATH="/data/pairwise_dataset/baseline/dpo_en/dpo_en"
-TOKENIZER_PATH="/data/chatglm3-6b-base-hf/"
+TOKENIZER_PATH="/data/hf/chatglm3-6b-base-hf/"
 CKPT_LOAD_DIR="/data/chatglm3_reward_ckpt"
 
 TP=2
diff --git a/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh b/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh
index 00b9ab197..86433f819 100644
--- a/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh
+++ b/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh
@@ -17,7 +17,7 @@ DISTRIBUTED_ARGS="
 "
 
 DATA_PATH="/data/ci/orm/dpo-en-llama-2-7b/dpo_en"
-TOKENIZER_PATH="/data/llama-2-7b-hf/"
+TOKENIZER_PATH="/data/hf/llama-2-7b-hf/"
 CKPT_LOAD_DIR="/data/ci/orm/llama-2-7b-layers8-rm-mcore_pp2vpp2/"
 
 basepath=$(cd `dirname $0`; cd ../../../; pwd)
diff --git a/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh b/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh
index 30dfbf6e1..525ff4262 100644
--- a/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh
+++ b/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh
@@ -11,7 +11,7 @@ WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
 basepath=$(cd `dirname $0`; cd ../../../; pwd)
 
 DATA_PATH="/data/llama2_prm_data/math_shepherd_prm"
-TOKENIZER_MODEL="/data/llama-2-7b-hf/"
+TOKENIZER_MODEL="/data/hf/llama-2-7b-hf/"
 CKPT_LOAD_DIR="/data/llama-2-7b-mcore-tp1-pp8/"
 
 TP=1
diff --git a/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh b/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh
index d24e2511a..23c8413cc 100644
--- a/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh
+++ b/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh
@@ -12,7 +12,7 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
 
 DATA_PATH="/data/tune_dataset/alpaca/alpaca"
 TOKENIZER_PATH="/data/qwen-7b/"
-CKPT_LOAD_DIR="/data/Qwen-7B-tp8-pp1/"
+CKPT_LOAD_DIR="/data/pipeline/Qwen-7B-tp8-pp1/"
 
 basepath=$(cd `dirname $0`; cd ../../../; pwd)
 
-- 
Gitee


From aab3a8d5405c8ea1cd0c9d97283862598d84770f Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Tue, 13 May 2025 16:51:04 +0800
Subject: [PATCH 02/15] fix pipeline mistakes: use /data/hf model paths and per-model cache save-dirs

---
 tests/pipeline/baichuan2-13B/param_config.json |  2 +-
 tests/pipeline/chatglm3-6B/param_config.json   |  2 +-
 tests/pipeline/common/test_checkpoint.json     | 14 +++++++-------
 tests/pipeline/common/test_inference.json      |  6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json
index 05ab4ec55..e88834923 100644
--- a/tests/pipeline/baichuan2-13B/param_config.json
+++ b/tests/pipeline/baichuan2-13B/param_config.json
@@ -86,7 +86,7 @@
                 "save-model-type": "mg",
                 "target-tensor-parallel-size": "8",
                 "load-dir": "/data/hf/baichuan2-13B-hf",
-                "save-dir": "/data/cache",
+                "save-dir": "/data/cache/baichuan2",
                 "tokenizer-model": "/data/hf/baichuan2-13B-hf/tokenizer.model",
                 "params-dtype": "bf16",
                 "w-pack":"True",
diff --git a/tests/pipeline/chatglm3-6B/param_config.json b/tests/pipeline/chatglm3-6B/param_config.json
index a395c0743..c826ccb34 100644
--- a/tests/pipeline/chatglm3-6B/param_config.json
+++ b/tests/pipeline/chatglm3-6B/param_config.json
@@ -105,7 +105,7 @@
                 "target-tensor-parallel-size": "2",
                 "target-pipeline-parallel-size": "4",
                 "load-dir": "/data/hf/chatglm3-6b-base-hf/",
-                "save-dir": "/data/cache/",
+                "save-dir": "/data/cache/chatglm3_tp2pp4",
                 "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model"
             }
         },
diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json
index f5504ba7f..0524bf6b9 100644
--- a/tests/pipeline/common/test_checkpoint.json
+++ b/tests/pipeline/common/test_checkpoint.json
@@ -9,12 +9,12 @@
                 "target-pipeline-parallel-size": "2", 
                 "target-expert-parallel-size": "2",
                 "num-layer-list": "6,10",
-                "load-dir":"/data/Mixtral-8x7B-v0.1",
+                "load-dir":"/data/hf/Mixtral-legacy-hf",
                 "save-dir":"/data/wttest/test/hf2mc_mixtral_tp2pp2ep2dypp",
                 "use-mcore-models": null,
                 "model-type-hf": "mixtral",
                 "params-dtype": "bf16",
-                "tokenizer-model":"/data/Mixtral-8x7B-v0.1/tokenizer.model"
+                "tokenizer-model":"/data/hf/Mixtral-legacy-hf/tokenizer.model"
             }
         },
         {
@@ -40,7 +40,7 @@
                 "target-tensor-parallel-size": "1",
                 "target-pipeline-parallel-size": "1",
                 "target-expert-parallel-size": "1",
-                "save-dir":"/data/Mixtral-8x7B-v0.1/",
+                "save-dir":"/data/hf/Mixtral-legacy-hf/",
                 "load-dir":"/data/wttest/base/hf2mc_mixtral_tp1pp4ep2vpp2",
                 "use-mcore-models": null,
                 "model-type-hf": "mixtral",
@@ -127,13 +127,13 @@
                 "save-model-type": "mg",
                 "target-tensor-parallel-size": "8",
                 "target-pipeline-parallel-size": "1",
-                "load-dir": "/data/gemma2-9b-hf/",
+                "load-dir": "/data/hf/gemma2-9b-hf/",
                 "save-dir": "/data/gemma2-9b-mg-tp8pp1-mcore-test/",
                 "use-mcore-models": null,
                 "post-norm": null,
                 "model-type-hf": "gemma2",
                 "params-dtype": "bf16",
-                "tokenizer-model": "/data/gemma2-9b-hf/tokenizer.json",
+                "tokenizer-model": "/data/hf/gemma2-9b-hf/tokenizer.json",
                 "spec":"mindspeed_llm.tasks.models.spec.gemma2_spec layer_spec"
             }
         },
@@ -266,7 +266,7 @@
                 "target-tensor-parallel-size": "2",
                 "target-pipeline-parallel-size": "2",
                 "save-dir":"/data/llama-3-8b-hf-nooplayer-tp2pp2vpp2-mcore-test/",
-                "load-dir":"/data/llama-3-8b-hf-layer14/",
+                "load-dir":"/data/hf/llama-3-8b-hf-layer14/",
                 "num-layers-per-virtual-pipeline-stage": "2",
                 "noop-layers": "1,15",
                 "params-dtype": "bf16",
@@ -426,7 +426,7 @@
                 "target-expert-parallel-size": "2",
                 "load-dir":"/data/ci/orm/mixtral-8x7b-layers4-rm-hf/",
                 "save-dir":"/data/ci/orm/mixtral-8x7b-layers4-rm-mcore_tp2pp2ep2_test/",
-                "tokenizer-model":"/data/Mixtral-8x7B-v0.1/tokenizer.model",
+                "tokenizer-model":"/data/hf/Mixtral-legacy-hf/tokenizer.model",
                 "use-mcore-models": null,
                 "model-type-hf": "mixtral",
                 "orm": null
diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json
index 58b6c2e95..fa2e87d58 100644
--- a/tests/pipeline/common/test_inference.json
+++ b/tests/pipeline/common/test_inference.json
@@ -201,7 +201,7 @@
                 "swiglu": null,
                 "load": "/data/Baichuan2-13B-tp8pp1-mcore-hf",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path": "/data/baichuan2-13B-hf/",
+                "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/",
                 "tokenizer-not-use-fast": null,
                 "fp16": null,
                 "normalization": "RMSNorm" ,
@@ -219,7 +219,7 @@
             }
         }
     ],
-    
+
     "test_baichuan2_legacy_greedy_search": [
         {
             "param": {
@@ -242,7 +242,7 @@
                 "swiglu": null,
                 "load": "/data/pipe/baichuan2-13b-tp8pp1-legacy-base",
                 "tokenizer-type": "PretrainedFromHF",
-                "tokenizer-name-or-path": "/data/baichuan2-13B-hf/",
+                "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/",
                 "tokenizer-not-use-fast": null,
                 "fp16": null,
                 "normalization": "RMSNorm" ,
-- 
Gitee


From 99d8e8d97403698b1467dd88d3517af0093ea94a Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 15 May 2025 16:27:43 +0800
Subject: [PATCH 03/15] fix pipeline mistakes: restore baichuan2 legacy-base load path, fix llama2 hf2ml load-dir

---
 tests/pipeline/baichuan2-13B/param_config.json | 2 +-
 tests/pipeline/common/test_checkpoint.json     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json
index e88834923..4417c0fe6 100644
--- a/tests/pipeline/baichuan2-13B/param_config.json
+++ b/tests/pipeline/baichuan2-13B/param_config.json
@@ -24,7 +24,7 @@
                 "fp16": null,
                 "no-load-rng": null,
                 "no-load-optim": null,
-                "load": "/data/pipeline/Baichuan2-13B-tp8pp1-mcore-hf-layer2",
+                "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base",
                 "tokenizer-type": "PretrainedFromHF",
                 "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/",
                 "use-deter-comp": null
diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json
index 0524bf6b9..93e4bf7b9 100644
--- a/tests/pipeline/common/test_checkpoint.json
+++ b/tests/pipeline/common/test_checkpoint.json
@@ -320,7 +320,7 @@
                 "save-model-type":"hf",
                 "target-tensor-parallel-size": "1",
                 "target-pipeline-parallel-size": "1",
-                "load-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test",
+                "load-dir":"/data/hf/llama-2-7b-hf-hf2ml-tp2pp4dypp-test",
                 "save-dir":"/data/hf/llama-2-7b-hf",
                 "model-type-hf": "llama2"
             }
-- 
Gitee


From e9cd8f16d67dd891578c03354042d401b4701909 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 15 May 2025 16:32:54 +0800
Subject: [PATCH 04/15] fix pipeline mistakes: use /data/pipeline for baichuan2 legacy inference checkpoint

---
 tests/pipeline/common/test_inference.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json
index fa2e87d58..8eaa07513 100644
--- a/tests/pipeline/common/test_inference.json
+++ b/tests/pipeline/common/test_inference.json
@@ -240,7 +240,7 @@
                 "square-alibi-mask": null,
                 "fill-neg-inf": null,
                 "swiglu": null,
-                "load": "/data/pipe/baichuan2-13b-tp8pp1-legacy-base",
+                "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base",
                 "tokenizer-type": "PretrainedFromHF",
                 "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/",
                 "tokenizer-not-use-fast": null,
-- 
Gitee


From af9efe29b2556e0242d74e7115caabcbc50a6363 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Mon, 19 May 2025 14:28:42 +0800
Subject: [PATCH 05/15] fix coverage module mistakes

---
 tests/coverage/evaluation/test_evaluate.py | 36 +++++++++++-----------
 tests/run_coverage.sh                      | 24 +++++++++++++--
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/tests/coverage/evaluation/test_evaluate.py b/tests/coverage/evaluation/test_evaluate.py
index 3014459b5..10ee5084c 100644
--- a/tests/coverage/evaluation/test_evaluate.py
+++ b/tests/coverage/evaluation/test_evaluate.py
@@ -45,8 +45,8 @@ class TestEvaluate(DistributedTest):
     world_size = 8
     test_config = create_testconfig(Path(__file__).with_suffix(".json"))
 
-    @pytest.mark.parametrize("test_params", test_config["test_llama2_mcore_agieval_evaluate"])
-    def test_llama2_mcore_agieval_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_llama2_mcore_agieval_evaluate"])
+    def test_llama2_mcore_agieval_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         os.environ["CLOSE_MATMUL_K_SHIFT"] = "1"
 
@@ -62,8 +62,8 @@ class TestEvaluate(DistributedTest):
             assert math.isclose(expected_score, 0.192771,
                                 abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_llama2_mcore_bbh_evaluate"])
-    def test_llama2_mcore_bbh_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_llama2_mcore_bbh_evaluate"])
+    def test_llama2_mcore_bbh_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         os.environ["CLOSE_MATMUL_K_SHIFT"] = "1"
         if dist.get_rank() == 0:
@@ -78,8 +78,8 @@ class TestEvaluate(DistributedTest):
             assert math.isclose(expected_score, 0.744186,
                                 abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_qwen2_mcore_needlebench_evaluate"])
-    def test_qwen2_mcore_needlebench_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_qwen2_mcore_needlebench_evaluate"])
+    def test_qwen2_mcore_needlebench_evaluate(self, build_args, params):
 
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         os.environ["CLOSE_MATMUL_K_SHIFT"] = "1"
@@ -99,8 +99,8 @@ class TestEvaluateWorldSize1(DistributedTest):
     world_size = 1
     test_config = create_testconfig(Path(__file__).with_suffix(".json"))
 
-    @pytest.mark.parametrize("test_params", test_config["test_qwen2_mcore_mmlu_evaluate"])
-    def test_qwen2_mcore_mmlu_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_qwen2_mcore_mmlu_evaluate"])
+    def test_qwen2_mcore_mmlu_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
@@ -113,8 +113,8 @@ class TestEvaluateWorldSize1(DistributedTest):
             expected_score = acquire_score(log_capture)
             assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_cmmlu_evaluate"])
-    def test_cmmlu_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_cmmlu_evaluate"])
+    def test_cmmlu_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
@@ -127,8 +127,8 @@ class TestEvaluateWorldSize1(DistributedTest):
             expected_score = acquire_score(log_capture)
             assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_humaneval_evaluate"])
-    def test_humaneval_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_humaneval_evaluate"])
+    def test_humaneval_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
@@ -141,8 +141,8 @@ class TestEvaluateWorldSize1(DistributedTest):
             expected_score = acquire_score(log_capture)
             assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_ceval_evaluate"])
-    def test_ceval_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_ceval_evaluate"])
+    def test_ceval_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
@@ -155,8 +155,8 @@ class TestEvaluateWorldSize1(DistributedTest):
             expected_score = acquire_score(log_capture)
             assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_boolq_evaluate"])
-    def test_boolq_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_boolq_evaluate"])
+    def test_boolq_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
@@ -169,8 +169,8 @@ class TestEvaluateWorldSize1(DistributedTest):
             expected_score = acquire_score(log_capture)
             assert math.isclose(expected_score, 0.62171, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!"
 
-    @pytest.mark.parametrize("test_params", test_config["test_gsm8k_evaluate"])
-    def test_gsm8k_evaluate(self, build_args, test_params):
+    @pytest.mark.parametrize("params", test_config["test_gsm8k_evaluate"])
+    def test_gsm8k_evaluate(self, build_args, params):
         os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
         if dist.get_rank() == 0:
             handler, log_capture = setup_logger(PATTERN)
diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh
index b171e0779..99390344c 100644
--- a/tests/run_coverage.sh
+++ b/tests/run_coverage.sh
@@ -70,6 +70,15 @@ add_coverage() {
     sed -i "/    main()/a\    cov.stop()" pretrain_gpt.py
     sed -i "/    cov.stop()/a\    cov.save()" pretrain_gpt.py
 
+    sed -i "1a\import random" convert_ckpt.py
+    sed -i "2a\import time" convert_ckpt.py
+    sed -i "3a\import coverage" convert_ckpt.py
+    sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' convert_ckpt.py
+
+    sed -i "/    main()/i\    cov.start()" convert_ckpt.py
+    sed -i "/    main()/a\    cov.stop()" convert_ckpt.py
+    sed -i "/    cov.stop()/a\    cov.save()" convert_ckpt.py
+
     sed -i "1a\import random" posttrain_gpt.py
     sed -i "2a\import time" posttrain_gpt.py
     sed -i "3a\import coverage" posttrain_gpt.py
@@ -99,6 +108,15 @@ remove_coverage() {
     sed -i "/    cov.stop()/d" pretrain_gpt.py
     sed -i "/    cov.save()/d" pretrain_gpt.py
 
+    sed -i "2d" convert_ckpt.py
+    sed -i "2d" convert_ckpt.py
+    sed -i "2d" convert_ckpt.py
+    sed -i "2d" convert_ckpt.py
+
+    sed -i "/    cov.start()/d" convert_ckpt.py
+    sed -i "/    cov.stop()/d" convert_ckpt.py
+    sed -i "/    cov.save()/d" convert_ckpt.py
+
     sed -i "2d" posttrain_gpt.py
     sed -i "2d" posttrain_gpt.py
     sed -i "2d" posttrain_gpt.py
@@ -118,6 +136,8 @@ remove_coverage() {
     sed -i "/    cov.save()/d" ray_gpt.py
 }
 
+add_coverage
+
 # run the coverage for python files in the pipeline
 find "$PIPELINE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
     if [ -d "$dir" ]; then
@@ -132,14 +152,12 @@ pytest -xs ${UT_DIR}
 find "$UT_DIR" -mindepth 0 -maxdepth 1 -type d | while read -r dir; do
     if [ -d "$dir" ]; then
         find "$dir" -type f -name "*.py" | while read -r file; do
-          echo "${file}"
+            echo "${file}"
             coverage run -p --source=$SOURCE_DIR $file
         done
     fi
 done
 
-add_coverage
-
 # run the coverage for shell scripts in the st
 for test_case in "$ST_DIR"/*.sh; do
     file_name=$(basename "${test_case}")
-- 
Gitee


From 424fd7b8ac2054c98af6cc8d5ff2f9e91fb1ed27 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Mon, 19 May 2025 14:39:17 +0800
Subject: [PATCH 06/15] fix coverage module mistakes

---
 convert_ckpt.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert_ckpt.py b/convert_ckpt.py
index d5c22f952..da22a3b1d 100644
--- a/convert_ckpt.py
+++ b/convert_ckpt.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2024, HUAWEI CORPORATION.  All rights reserved.
 import argparse
 import importlib
 import os
-- 
Gitee


From 692abeae9525f7058df0bc2b77ee3ab835ec7735 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Mon, 19 May 2025 15:53:55 +0800
Subject: [PATCH 07/15] fix coverage module mistakes

---
 tests/run_coverage.sh | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh
index 99390344c..ec94a8aa1 100644
--- a/tests/run_coverage.sh
+++ b/tests/run_coverage.sh
@@ -96,6 +96,15 @@ add_coverage() {
 
     sed -i "/    main()/a\    cov.stop()" ray_gpt.py
     sed -i "/    cov.stop()/a\    cov.save()" ray_gpt.py
+
+    sed -i "1a\import random" preprocess_data.py
+    sed -i "2a\import time" preprocess_data.py
+    sed -i "3a\import coverage" preprocess_data.py
+    sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' preprocess_data.py
+
+    sed -i "/def main():/a\    cov.start()" preprocess_data.py
+    sed -i "/                os.remove(idx_file.replace('.idx', '.bin'))/a\    cov.stop()" preprocess_data.py
+    sed -i "/    cov.stop()/a\    cov.save()" preprocess_data.py
 }
 
 remove_coverage() {
@@ -134,6 +143,15 @@ remove_coverage() {
 
     sed -i "/    cov.stop()/d" ray_gpt.py
     sed -i "/    cov.save()/d" ray_gpt.py
+
+    sed -i "2d" preprocess_data.py
+    sed -i "2d" preprocess_data.py
+    sed -i "2d" preprocess_data.py
+    sed -i "2d" preprocess_data.py
+
+    sed -i "/    cov.start()/d" preprocess_data.py
+    sed -i "/    cov.stop()/d" preprocess_data.py
+    sed -i "/    cov.save()/d" preprocess_data.py
 }
 
 add_coverage
-- 
Gitee


From fb44a445d7a117ae0b5c9c68c986e75aa71d9e6d Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Mon, 26 May 2025 10:07:06 +0800
Subject: [PATCH 08/15] fix gloo feature patch mistakes

---
 mindspeed_llm/training/arguments.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py
index 315a2cff1..729e36346 100644
--- a/mindspeed_llm/training/arguments.py
+++ b/mindspeed_llm/training/arguments.py
@@ -1380,7 +1380,6 @@ def _add_dummy_args(args):
     args.recompute_in_bubble = False
     args.use_nanopipe = False
     args.moe_without_activation = False
-    args.disable_gloo_group = None
     args.ampipe_degree = 0
     args.attention_mask_type = args.cp_attention_mask_type
     args.hccl_group_buffer_adaptive = False
-- 
Gitee


From 9b82ad1b6d0d76ff02c974a81f1e903008ccc33a Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 29 May 2025 11:11:44 +0800
Subject: [PATCH 09/15] add daily pipeline log

---
 tests/pipeline/pipe_run.sh | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh
index 881800c0a..cd30eedce 100644
--- a/tests/pipeline/pipe_run.sh
+++ b/tests/pipeline/pipe_run.sh
@@ -29,9 +29,11 @@ BASE_DIR=$(dirname "$(readlink -f "$0")")
 CURRENT_TIME=$(date "+%Y-%m-%d")
 BASELINE_DIR="$BASE_DIR/baseline"
 GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME"
+ERROR_LOG_DIR="$GENERATE_LOG_DIR/exec_error_log_files"
 
 #mkdir cache to store product and will be removed after test
 mkdir -p "$GENERATE_LOG_DIR"
+mkdir -p "$ERROR_LOG_DIR"
 touch "$GENERATE_LOG_DIR/exec_error.log"
 echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log
 
@@ -54,9 +56,12 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
                     --generate-json $GENERATE_LOG_DIR/$name.json
                 PYTEST_EXITCODE=$?
                 if [ $PYTEST_EXITCODE -ne 0 ]; then
+                    cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/"
+                    cp -r $GENERATE_LOG_DIR/$name.json "${ERROR_LOG_DIR}/"
                     echo "${name}.sh compare to baseline has failed, check it!" >> $GENERATE_LOG_DIR/exec_error.log
                 fi
             else
+                cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/"
                 echo "${name}.sh Script has failed. Exit!" >> $GENERATE_LOG_DIR/exec_error.log
             fi
         done
@@ -64,9 +69,15 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
         # python test testing
         find "$dir" -type f -name "*.py" | while read -r file; do
             echo "running $file"
-            if ! pytest --log-level=INFO "$file"; then
+            pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1
+            PYTEST_EXITCODE=${PIPESTATUS[0]}
+            if [ $PYTEST_EXITCODE -ne 0 ]; then
+                cp -r ${file}.log "${ERROR_LOG_DIR}/"
                 echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log"
             fi
         done
     fi
 done
+
+echo "=================tar error log=================="
+tar -czvf "$GENERATE_LOG_DIR/err_log.tar.gz" "${ERROR_LOG_DIR}/"
\ No newline at end of file
-- 
Gitee


From e66d840b2e9545cdb0f692517997bd0f30c165ca Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 29 May 2025 11:42:40 +0800
Subject: [PATCH 10/15] add daily pipeline log

---
 tests/pipeline/pipe_run.sh | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh
index cd30eedce..3e4d3234b 100644
--- a/tests/pipeline/pipe_run.sh
+++ b/tests/pipeline/pipe_run.sh
@@ -28,12 +28,11 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./
 BASE_DIR=$(dirname "$(readlink -f "$0")")
 CURRENT_TIME=$(date "+%Y-%m-%d")
 BASELINE_DIR="$BASE_DIR/baseline"
+LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log"
 GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME"
-ERROR_LOG_DIR="$GENERATE_LOG_DIR/exec_error_log_files"
 
 #mkdir cache to store product and will be removed after test
 mkdir -p "$GENERATE_LOG_DIR"
-mkdir -p "$ERROR_LOG_DIR"
 touch "$GENERATE_LOG_DIR/exec_error.log"
 echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log
 
@@ -56,12 +55,9 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
                     --generate-json $GENERATE_LOG_DIR/$name.json
                 PYTEST_EXITCODE=$?
                 if [ $PYTEST_EXITCODE -ne 0 ]; then
-                    cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/"
-                    cp -r $GENERATE_LOG_DIR/$name.json "${ERROR_LOG_DIR}/"
                     echo "${name}.sh compare to baseline has failed, check it!" >> $GENERATE_LOG_DIR/exec_error.log
                 fi
             else
-                cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/"
                 echo "${name}.sh Script has failed. Exit!" >> $GENERATE_LOG_DIR/exec_error.log
             fi
         done
@@ -72,7 +68,6 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
             pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1
             PYTEST_EXITCODE=${PIPESTATUS[0]}
             if [ $PYTEST_EXITCODE -ne 0 ]; then
-                cp -r ${file}.log "${ERROR_LOG_DIR}/"
                 echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log"
             fi
         done
@@ -80,4 +75,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
 done
 
 echo "=================tar error log=================="
-tar -czvf "$GENERATE_LOG_DIR/err_log.tar.gz" "${ERROR_LOG_DIR}/"
\ No newline at end of file
+tar -czvf "${LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/"
\ No newline at end of file
-- 
Gitee


From 60fc7d252405f54dae867544324e4dc1d42b0954 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 29 May 2025 14:30:24 +0800
Subject: [PATCH 11/15] add daily pipeline log

---
 tests/pipeline/pipe_run.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh
index 3e4d3234b..fd09462cc 100644
--- a/tests/pipeline/pipe_run.sh
+++ b/tests/pipeline/pipe_run.sh
@@ -28,7 +28,6 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./
 BASE_DIR=$(dirname "$(readlink -f "$0")")
 CURRENT_TIME=$(date "+%Y-%m-%d")
 BASELINE_DIR="$BASE_DIR/baseline"
-LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log"
 GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME"
 
 #mkdir cache to store product and will be removed after test
@@ -75,4 +74,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
 done
 
 echo "=================tar error log=================="
-tar -czvf "${LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/"
\ No newline at end of file
+tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/"
\ No newline at end of file
-- 
Gitee


From 26bbbea5d2fe408f75e4b2b4b22929d22e2b4160 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 29 May 2025 15:26:02 +0800
Subject: [PATCH 12/15] add daily pipeline log

---
 tests/pipeline/pipe_run.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh
index fd09462cc..50874576a 100644
--- a/tests/pipeline/pipe_run.sh
+++ b/tests/pipeline/pipe_run.sh
@@ -64,7 +64,9 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
         # python test testing
         find "$dir" -type f -name "*.py" | while read -r file; do
             echo "running $file"
-            pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1
+            tmp_file_name="${file#*MindSpeed-LLM/}"
+            file_name="${tmp_file_name//\//_}"
+            pytest --log-level=INFO "$file" 2>&1 | tee "${GENERATE_LOG_DIR}/${file_name}.log"  # 2>&1 goes before the pipe so pytest's stderr is logged by tee too
             PYTEST_EXITCODE=${PIPESTATUS[0]}
             if [ $PYTEST_EXITCODE -ne 0 ]; then
                 echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log"
@@ -74,4 +76,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
 done
 
 echo "=================tar error log=================="
-tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/"
\ No newline at end of file
+tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/"
\ No newline at end of file
-- 
Gitee


From 4e2b6e550af17278037542df6d3214fbfa52dc22 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Fri, 30 May 2025 10:53:20 +0800
Subject: [PATCH 13/15] fix add daily pipeline log tar exception

---
 tests/pipeline/pipe_run.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh
index 50874576a..8e2e0877d 100644
--- a/tests/pipeline/pipe_run.sh
+++ b/tests/pipeline/pipe_run.sh
@@ -28,7 +28,8 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./
 BASE_DIR=$(dirname "$(readlink -f "$0")")
 CURRENT_TIME=$(date "+%Y-%m-%d")
 BASELINE_DIR="$BASE_DIR/baseline"
-GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME"
+GENERATE_LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log"
+GENERATE_LOG_DIR="$GENERATE_LOG_BASE_DIR/$CURRENT_TIME"
 
 #mkdir cache to store product and will be removed after test
 mkdir -p "$GENERATE_LOG_DIR"
@@ -76,4 +77,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
 done
 
 echo "=================tar error log=================="
-tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/"
\ No newline at end of file
+tar -czvf "${GENERATE_LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/"  # archive is written outside the directory being packed
-- 
Gitee


From 8e7a47643d55c7a5518979d0f9185523220025f6 Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Fri, 30 May 2025 14:34:54 +0800
Subject: [PATCH 14/15] generate run coverage log

---
 tests/run_coverage.sh | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh
index ec94a8aa1..fa4c161b7 100644
--- a/tests/run_coverage.sh
+++ b/tests/run_coverage.sh
@@ -11,6 +11,13 @@ PIPELINE_DIR="empty"
 UT_DIR="empty"
 ST_DIR="empty"
 
+# Create the log directory. NOTE(review): UT_DIR is still the "empty" placeholder assigned above, so this resolves to "empty/logs" - confirm whether it should run after argument parsing.
+GENERATE_LOG_DIR="$UT_DIR/logs"
+mkdir -p "$GENERATE_LOG_DIR"
+touch "$GENERATE_LOG_DIR/exec_error.log"
+echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log
+
+
 # 带参1用于区分运行场景
 if [ -z "$1" ]; then
     echo "请提供一个参数（ST、PIPELINE、UT、all）"
@@ -166,11 +173,18 @@ find "$PIPELINE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do
 done
 
 # run the coverage for python files in the unit tests
-pytest -xs ${UT_DIR}
 find "$UT_DIR" -mindepth 0 -maxdepth 1 -type d | while read -r dir; do
     if [ -d "$dir" ]; then
         find "$dir" -type f -name "*.py" | while read -r file; do
-            echo "${file}"
+            echo "running ${file}"
+            filename=$(basename "$file")
+            extension="${filename##*.}"
+            name="${filename%.$extension}"
+            pytest -xs "$file" 2>&1 | tee "$GENERATE_LOG_DIR/$name.log"  # 2>&1 goes before the pipe so pytest's stderr is logged by tee too
+            PYTEST_EXITCODE=${PIPESTATUS[0]}
+            if [ $PYTEST_EXITCODE -ne 0 ]; then
+                echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log"
+            fi
             coverage run -p --source=$SOURCE_DIR $file
         done
     fi
-- 
Gitee


From f146e68ae17fa57c519246bd161bc37892b40ffb Mon Sep 17 00:00:00 2001
From: LuQingyun <luqingyun1@huawei.com>
Date: Thu, 5 Jun 2025 20:43:51 +0800
Subject: [PATCH 15/15] fix run coverage tools

---
 tests/run_coverage.sh | 54 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh
index fa4c161b7..8346e9068 100644
--- a/tests/run_coverage.sh
+++ b/tests/run_coverage.sh
@@ -77,6 +77,15 @@ add_coverage() {
     sed -i "/    main()/a\    cov.stop()" pretrain_gpt.py
     sed -i "/    cov.stop()/a\    cov.save()" pretrain_gpt.py
 
+    sed -i "1a\import random" pretrain_mamba.py
+    sed -i "2a\import time" pretrain_mamba.py
+    sed -i "3a\import coverage" pretrain_mamba.py
+    sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' pretrain_mamba.py
+    sed -i "5a\cov.start()" pretrain_mamba.py
+
+    sed -i "/    main()/a\    cov.stop()" pretrain_mamba.py
+    sed -i "/    cov.stop()/a\    cov.save()" pretrain_mamba.py
+
     sed -i "1a\import random" convert_ckpt.py
     sed -i "2a\import time" convert_ckpt.py
     sed -i "3a\import coverage" convert_ckpt.py
@@ -86,6 +95,24 @@ add_coverage() {
     sed -i "/    main()/a\    cov.stop()" convert_ckpt.py
     sed -i "/    cov.stop()/a\    cov.save()" convert_ckpt.py
 
+    sed -i "1a\import random" evaluation.py
+    sed -i "2a\import time" evaluation.py
+    sed -i "3a\import coverage" evaluation.py
+    sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' evaluation.py
+
+    sed -i "/def main():/a\    cov.start()" evaluation.py
+    sed -i "/            logger.info(f'NeedleBench_eval Running Time: {time.time() - a}')/a\    cov.stop()" evaluation.py
+    sed -i "/    cov.stop()/a\    cov.save()" evaluation.py
+
+    sed -i "1a\import random" inference.py
+    sed -i "2a\import time" inference.py
+    sed -i "3a\import coverage" inference.py
+    sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' inference.py
+
+    sed -i "/def main():/a\    cov.start()" inference.py
+    sed -i "/    task_factory(args, model)/a\    cov.stop()" inference.py
+    sed -i "/    cov.stop()/a\    cov.save()" inference.py
+
     sed -i "1a\import random" posttrain_gpt.py
     sed -i "2a\import time" posttrain_gpt.py
     sed -i "3a\import coverage" posttrain_gpt.py
@@ -124,6 +151,15 @@ remove_coverage() {
     sed -i "/    cov.stop()/d" pretrain_gpt.py
     sed -i "/    cov.save()/d" pretrain_gpt.py
 
+    sed -i "2d" pretrain_mamba.py
+    sed -i "2d" pretrain_mamba.py
+    sed -i "2d" pretrain_mamba.py
+    sed -i "2d" pretrain_mamba.py
+    sed -i "2d" pretrain_mamba.py
+
+    sed -i "/    cov.stop()/d" pretrain_mamba.py
+    sed -i "/    cov.save()/d" pretrain_mamba.py
+
     sed -i "2d" convert_ckpt.py
     sed -i "2d" convert_ckpt.py
     sed -i "2d" convert_ckpt.py
@@ -133,6 +169,24 @@ remove_coverage() {
     sed -i "/    cov.stop()/d" convert_ckpt.py
     sed -i "/    cov.save()/d" convert_ckpt.py
 
+    sed -i "2d" evaluation.py
+    sed -i "2d" evaluation.py
+    sed -i "2d" evaluation.py
+    sed -i "2d" evaluation.py
+
+    sed -i "/    cov.start()/d" evaluation.py
+    sed -i "/    cov.stop()/d" evaluation.py
+    sed -i "/    cov.save()/d" evaluation.py
+
+    sed -i "2d" inference.py
+    sed -i "2d" inference.py
+    sed -i "2d" inference.py
+    sed -i "2d" inference.py
+
+    sed -i "/    cov.start()/d" inference.py
+    sed -i "/    cov.stop()/d" inference.py
+    sed -i "/    cov.save()/d" inference.py
+
     sed -i "2d" posttrain_gpt.py
     sed -i "2d" posttrain_gpt.py
     sed -i "2d" posttrain_gpt.py
-- 
Gitee