From e0d8f4f37f97f1fc6bca26db9d89cc4bb6dade56 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 8 May 2025 22:04:19 +0800 Subject: [PATCH 01/15] fix pipeline mistakes --- .../coverage/checkpoint/test_checkpoint.json | 2 +- tests/coverage/evaluation/test_evaluate.json | 4 +- tests/coverage/inference/test_inference.json | 18 ++++----- .../test_process_instruction_data.json | 18 ++++----- .../test_process_pretrain_data.json | 6 +-- .../pipeline/baichuan2-13B/param_config.json | 4 +- tests/pipeline/common/test_checkpoint.json | 8 ++-- tests/pipeline/common/test_inference.json | 38 +++++++++---------- .../test_process_instruction_data_lf.json | 16 ++++---- .../test_process_instruction_pack_data.py | 2 +- .../common/test_process_pairwise_data_lf.json | 6 +-- tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh | 2 +- .../rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh | 2 +- .../rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh | 2 +- .../tune_qwen7b_tp8_pp1_full_ptd.sh | 2 +- 15 files changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/coverage/checkpoint/test_checkpoint.json b/tests/coverage/checkpoint/test_checkpoint.json index 20fdc1cb9..b6055dead 100644 --- a/tests/coverage/checkpoint/test_checkpoint.json +++ b/tests/coverage/checkpoint/test_checkpoint.json @@ -91,7 +91,7 @@ "save-model-type":"mg", "target-tensor-parallel-size": "4", "target-pipeline-parallel-size": "2", - "load-dir":"/data/hf/qwen2.5-7B", + "load-dir":"/data/hf/Qwen2.5-7B", "save-dir":"/data/ci/Qwen2.5-mg", "tokenizer-model":"/data/ci/Qwen2.5-7B", "use-mcore-models": null, diff --git a/tests/coverage/evaluation/test_evaluate.json b/tests/coverage/evaluation/test_evaluate.json index d7e752f3e..3a2fa47f1 100644 --- a/tests/coverage/evaluation/test_evaluate.json +++ b/tests/coverage/evaluation/test_evaluate.json @@ -29,7 +29,7 @@ "no-load-optim": null, "load":"/data/ci/ckpt", "tokenizer-type":"PretrainedFromHF", - "tokenizer-name-or-path":"/data/llama-2-7b-hf", + "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf", "use-deter-comp": null } } @@ -65,7 +65,7 @@ "no-load-optim": null, "load":"/data/ci/ckpt", "tokenizer-type":"PretrainedFromHF", - "tokenizer-name-or-path":"/data/llama-2-7b-hf", + "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf", "use-deter-comp": null } } diff --git a/tests/coverage/inference/test_inference.json b/tests/coverage/inference/test_inference.json index 2f5602ae5..2050e713f 100644 --- a/tests/coverage/inference/test_inference.json +++ b/tests/coverage/inference/test_inference.json @@ -25,14 +25,14 @@ "normalization": "RMSNorm", "load":"/data/ci/ckpt", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path":"/data/llama-2-7b-hf", - "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", + "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf", + "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model", "disable-bias-linear": null, - "attention-softmax-in-fp32": null, - "untie-embeddings-and-output-weights": null, - "no-masked-softmax-fusion": null, - "no-load-optim": null, - "no-load-rng": null, + "attention-softmax-in-fp32": null, + "untie-embeddings-and-output-weights": null, + "no-masked-softmax-fusion": null, + "no-load-optim": null, + "no-load-rng": null, "fp16": null, "task":"greedy", "use-deter-comp": null, @@ -67,8 +67,8 @@ "normalization": "RMSNorm", "load":"/data/ci/ckpt", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path":"/data/llama-2-7b-hf", - "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", + "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf", + "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model", "disable-bias-linear": null, "attention-softmax-in-fp32": null, "untie-embeddings-and-output-weights": null, diff --git a/tests/coverage/process_data/test_process_instruction_data.json b/tests/coverage/process_data/test_process_instruction_data.json index 772ebb42a..c553b2f75 100644 --- a/tests/coverage/process_data/test_process_instruction_data.json +++ b/tests/coverage/process_data/test_process_instruction_data.json @@ -17,7 +17,7 @@ "tokenizer-type": "PretrainedFromHF", "handler-name": "GeneralInstructionHandler", "output-prefix": "/data/process_dataset/test_ins_subs/part1", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000, "append-eod": null @@ -30,8 +30,8 @@ "input": "/data/process_dataset/0002-alpaca.parquet", "tokenizer-type": "PretrainedFromHF", "handler-name": "GeneralInstructionHandler", - "output-prefix": "/data/process_dataset/test_ins_subs/part2", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "output-prefix": "/data/process_dataset/test_ins_subs/part2", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000, "append-eod": null @@ -61,8 +61,8 @@ "input": "/data/process_dataset/0001-alpaca.parquet", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000, "overwrite-cache": null, @@ -76,8 +76,8 @@ "input": "/data/process_dataset/0001-alpaca.parquet", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style_pack", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "output-prefix": "/data/process_dataset/test_instruction_handler/alpaca_style_pack", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000, "overwrite-cache": null, @@ -94,8 +94,8 @@ "input": "/data/process_dataset/sharegpt_formatted_data-evol-gpt4.jsonl", "tokenizer-type": "PretrainedFromHF", "handler-name": "SharegptStyleInstructionHandler", - "output-prefix": "/data/process_dataset/test_instruction_handler/sharegpt_style", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "output-prefix": "/data/process_dataset/test_instruction_handler/sharegpt_style", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000, "prompt-type" : "llama2" diff --git a/tests/coverage/process_data/test_process_pretrain_data.json b/tests/coverage/process_data/test_process_pretrain_data.json index 5c68471dc..84359d49a 100644 --- a/tests/coverage/process_data/test_process_pretrain_data.json +++ b/tests/coverage/process_data/test_process_pretrain_data.json @@ -16,7 +16,7 @@ "input": "/data/process_dataset/0001-alpaca.parquet", "tokenizer-type": "PretrainedFromHF", "output-prefix": "/data/process_dataset/test_merge_subs/part1", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000 } @@ -27,8 +27,8 @@ "params": { "input": "/data/process_dataset/0002-alpaca.parquet", "tokenizer-type": "PretrainedFromHF", - "output-prefix": "/data/process_dataset/test_merge_subs/part2", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", + "output-prefix": "/data/process_dataset/test_merge_subs/part2", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", "workers": 4, "log-interval": 1000 } diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json index 2a6bc9f29..05ab4ec55 100644 --- a/tests/pipeline/baichuan2-13B/param_config.json +++ b/tests/pipeline/baichuan2-13B/param_config.json @@ -24,7 +24,7 @@ "fp16": null, "no-load-rng": null, "no-load-optim": null, - "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base", + "load": "/data/pipeline/Baichuan2-13B-tp8pp1-mcore-hf-layer2", "tokenizer-type": "PretrainedFromHF", "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/", "use-deter-comp": null @@ -85,7 +85,7 @@ "load-model-type": "hf", "save-model-type": "mg", "target-tensor-parallel-size": "8", - "load-dir": "/data/datasets/baichuan2-13B-data/enwiki/", + "load-dir": "/data/hf/baichuan2-13B-hf", "save-dir": "/data/cache", "tokenizer-model": "/data/hf/baichuan2-13B-hf/tokenizer.model", "params-dtype": "bf16", diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json index 23c4f6a19..f5504ba7f 100644 --- a/tests/pipeline/common/test_checkpoint.json +++ b/tests/pipeline/common/test_checkpoint.json @@ -292,10 +292,10 @@ "target-tensor-parallel-size": "2", "target-pipeline-parallel-size": "4", "num-layer-list": "6,8,8,10", - "load-dir":"/data/llama-2-7b-hf", - "save-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test", + "load-dir":"/data/hf/llama-2-7b-hf", + "save-dir":"/data/hf/llama-2-7b-hf-hf2ml-tp2pp4dypp-test", "model-type-hf": "llama2", - "tokenizer-model":"/data/llama-2-7b-hf/tokenizer.model" + "tokenizer-model":"/data/hf/llama-2-7b-hf/tokenizer.model" } }, { @@ -321,7 +321,7 @@ "target-tensor-parallel-size": "1", "target-pipeline-parallel-size": "1", "load-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test", - "save-dir":"/data/llama-2-7b-hf", + "save-dir":"/data/hf/llama-2-7b-hf", "model-type-hf": "llama2" } }, diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json index d76f57dab..58b6c2e95 100644 --- a/tests/pipeline/common/test_inference.json +++ b/tests/pipeline/common/test_inference.json @@ -15,18 +15,18 @@ "make-vocab-size-divisible-by": 1, "normalization": "RMSNorm", "position-embedding-type": "rope", - "load":"/data/llama2-7B-tp8-pp1", - "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path":"/data/llama-2-7b-hf", - "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", + "load":"/data/pipeline/llama2-7B-tp8-pp1", + "tokenizer-type": "PretrainedFromHF", + "tokenizer-name-or-path":"/data/hf/llama-2-7b-hf", + "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model", "disable-bias-linear": null, - "use-fused-rmsnorm": null, + "use-fused-rmsnorm": null, "swiglu": null, - "attention-softmax-in-fp32": null, - "untie-embeddings-and-output-weights": null, - "no-masked-softmax-fusion": null, - "no-load-optim": null, - "no-load-rng": null, + "attention-softmax-in-fp32": null, + "untie-embeddings-and-output-weights": null, + "no-masked-softmax-fusion": null, + "no-load-optim": null, + "no-load-rng": null, "fp16": null, "task":"greedy", "max-new-tokens": 30, @@ -54,10 +54,10 @@ "num-attention-heads": 32, "max-position-embeddings": 4096 , "swiglu": null, - "load": "/data/llama2-7B-tp8-pp1", + "load": "/data/pipeline/llama2-7B-tp8-pp1", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path": "/data/llama-2-7b-hf", - "tokenizer-model": "/data/llama-2-7b-hf/tokenizer.model", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf", + "tokenizer-model": "/data/hf/llama-2-7b-hf/tokenizer.model", "tokenizer-not-use-fast": null, "fp16": null, "normalization": "RMSNorm" , @@ -111,10 +111,10 @@ "max-new-tokens": 30, "micro-batch-size": 1, "global-batch-size": 1, - "load":"/data/chatglm3-6b-base-mg-tp1pp2-mcore-base", + "load":"/data/pipeline/chatglm3-6b-base-mg-tp1pp2-mcore-base", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path":"/data/chatglm3-6b-base-hf/", - "tokenizer-model": "/data/chatglm3-6b-base-hf/tokenizer.model", + "tokenizer-name-or-path":"/data/hf/chatglm3-6b-base-hf/", + "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model", "tokenizer-not-use-fast": null, "untie-embeddings-and-output-weights": null, "attention-softmax-in-fp32": null, @@ -158,10 +158,10 @@ "max-new-tokens": 30, "micro-batch-size": 1, "global-batch-size": 16, - "load":"/data/pipe/chatglm3-6b-base-mg-tp1pp2-legacy-base", + "load":"/data/pipeline/chatglm3-6b-base-mg-tp1pp2-legacy-base", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path":"/data/chatglm3-6b-base-hf/", - "tokenizer-model": "/data/chatglm3-6b-base-hf/tokenizer.model", + "tokenizer-name-or-path":"/data/hf/chatglm3-6b-base-hf/", + "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model", "tokenizer-not-use-fast": null, "untie-embeddings-and-output-weights": null, "attention-softmax-in-fp32": null, diff --git a/tests/pipeline/common/test_process_instruction_data_lf.json b/tests/pipeline/common/test_process_instruction_data_lf.json index d50aaa4ca..65edde86f 100644 --- a/tests/pipeline/common/test_process_instruction_data_lf.json +++ b/tests/pipeline/common/test_process_instruction_data_lf.json @@ -5,7 +5,7 @@ "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/cache/tune_dataset/alpaca/alpaca", + "output-prefix": "/data/tune_dataset/alpaca/alpaca", "overwrite-cache": null, "tokenizer-name-or-path": "/data/qwen-7b/", "workers": 4, @@ -20,7 +20,7 @@ "input": "/data/tune_dataset/oaast_sft.json", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his", + "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his", "tokenizer-name-or-path": "/data/qwen-7b/", "overwrite-cache": null, "workers": 4, @@ -28,13 +28,13 @@ "prompt-type": "qwen", "map-keys": "{\"history\":\"history\"}" } - }, + }, { "params": { "input": "/data/tune_dataset/oaast_sft.json", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his_seq1024", + "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his_seq1024", "tokenizer-name-or-path": "/data/qwen-7b/", "overwrite-cache": null, "workers": 4, @@ -52,7 +52,7 @@ "input": "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl", "tokenizer-type": "PretrainedFromHF", "handler-name": "SharegptStyleInstructionHandler", - "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", + "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", "tokenizer-name-or-path": "/data/qwen-7b/", "workers": 4, "overwrite-cache": null, @@ -68,7 +68,7 @@ "input": "/data/tune_dataset/sss.json", "tokenizer-type": "PretrainedFromHF", "handler-name": "SharegptStyleInstructionHandler", - "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", + "output-prefix": "/data/tune_dataset/sharegpt/sharegpt", "tokenizer-name-or-path": "/data/qwen-7b/", "overwrite-cache": null, "workers": 4, @@ -84,8 +84,8 @@ "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet", "tokenizer-type": "PretrainedFromHF", "handler-name": "AlpacaStyleInstructionHandler", - "output-prefix": "/data/tune_dataset/prompt_abstract/alpaca", - "tokenizer-name-or-path": "/data/llama-2-7b-hf/", + "output-prefix": "/data/tune_dataset/prompt_abstract/alpaca", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf/", "overwrite-cache": null, "workers": 4, "log-interval": 1000, diff --git a/tests/pipeline/common/test_process_instruction_pack_data.py b/tests/pipeline/common/test_process_instruction_pack_data.py index a5f7cf1cc..2dcaa0a3c 100644 --- a/tests/pipeline/common/test_process_instruction_pack_data.py +++ b/tests/pipeline/common/test_process_instruction_pack_data.py @@ -18,7 +18,7 @@ class TestProcessInstructionData: "--tokenizer-type", "PretrainedFromHF", "--handler-name", "GeneralInstructionHandler", "--output-prefix", "/data/tune_pack_dataset/alpaca_pack", - "--tokenizer-name-or-path", "/data/llama-2-7b-hf", + "--tokenizer-name-or-path", "/data/hf/llama-2-7b-hf", "--workers", "4", "--log-interval", "1000", "--append-eod", diff --git a/tests/pipeline/common/test_process_pairwise_data_lf.json b/tests/pipeline/common/test_process_pairwise_data_lf.json index bc76d7266..04d0cbcaf 100644 --- a/tests/pipeline/common/test_process_pairwise_data_lf.json +++ b/tests/pipeline/common/test_process_pairwise_data_lf.json @@ -5,7 +5,7 @@ "input": "/data/pairwise_dataset/orca_rlhf.jsonl", "tokenizer-type": "PretrainedFromHF", "output-prefix": "/data/pairwise_dataset/output/orca_rlhf/orca_rlhf", - "tokenizer-name-or-path": "/data/llama-2-7b-hf/", + "tokenizer-name-or-path": "/data/hf/llama-2-7b-hf/", "workers": 4, "log-interval": 1000, "handler-name": "AlpacaStylePairwiseHandler", @@ -21,8 +21,8 @@ "input": "/data/pairwise_dataset/dpo_en.json", "tokenizer-type": "PretrainedFromHF", "handler-name": "SharegptStylePairwiseHandler", - "output-prefix": "/data/pairwise_dataset/output/dpo_en/dpo_en", - "tokenizer-name-or-path": "/data/chatglm3-6b-base-hf/", + "output-prefix": "/data/pairwise_dataset/output/dpo_en/dpo_en", + "tokenizer-name-or-path": "/data/hf/chatglm3-6b-base-hf/", "workers": 4, "log-interval": 1000, "prompt-type": "chatglm3" diff --git a/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh b/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh index 6ed5692c2..afe21fbe4 100644 --- a/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh +++ b/tests/rlxf/train_orm_chatglm3_tp2_pp4_full.sh @@ -12,7 +12,7 @@ basepath=$(cd `dirname $0`; cd ../../../; pwd) CKPT_SAVE_DIR="./chatglm3_reward_ckpt" DATA_PATH="/data/pairwise_dataset/baseline/dpo_en/dpo_en" -TOKENIZER_PATH="/data/chatglm3-6b-base-hf/" +TOKENIZER_PATH="/data/hf/chatglm3-6b-base-hf/" CKPT_LOAD_DIR="/data/chatglm3_reward_ckpt" TP=2 diff --git a/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh b/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh index 00b9ab197..86433f819 100644 --- a/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh +++ b/tests/rlxf/train_orm_llama2_7b_pp2_vpp2_dp2.sh @@ -17,7 +17,7 @@ DISTRIBUTED_ARGS=" " DATA_PATH="/data/ci/orm/dpo-en-llama-2-7b/dpo_en" -TOKENIZER_PATH="/data/llama-2-7b-hf/" +TOKENIZER_PATH="/data/hf/llama-2-7b-hf/" CKPT_LOAD_DIR="/data/ci/orm/llama-2-7b-layers8-rm-mcore_pp2vpp2/" basepath=$(cd `dirname $0`; cd ../../../; pwd) diff --git a/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh b/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh index 30dfbf6e1..525ff4262 100644 --- a/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh +++ b/tests/rlxf/train_prm_llama2_tp1_pp8_full_ptd.sh @@ -11,7 +11,7 @@ WORLD_SIZE=$((NPUS_PER_NODE*$NNODES)) basepath=$(cd `dirname $0`; cd ../../../; pwd) DATA_PATH="/data/llama2_prm_data/math_shepherd_prm" -TOKENIZER_MODEL="/data/llama-2-7b-hf/" +TOKENIZER_MODEL="/data/hf/llama-2-7b-hf/" CKPT_LOAD_DIR="/data/llama-2-7b-mcore-tp1-pp8/" TP=1 diff --git a/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh b/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh index d24e2511a..23c8413cc 100644 --- a/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh +++ b/tests/st/shell_scripts/tune_qwen7b_tp8_pp1_full_ptd.sh @@ -12,7 +12,7 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) DATA_PATH="/data/tune_dataset/alpaca/alpaca" TOKENIZER_PATH="/data/qwen-7b/" -CKPT_LOAD_DIR="/data/Qwen-7B-tp8-pp1/" +CKPT_LOAD_DIR="/data/pipeline/Qwen-7B-tp8-pp1/" basepath=$(cd `dirname $0`; cd ../../../; pwd) -- Gitee From aab3a8d5405c8ea1cd0c9d97283862598d84770f Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Tue, 13 May 2025 16:51:04 +0800 Subject: [PATCH 02/15] fix pipeline mistakes --- tests/pipeline/baichuan2-13B/param_config.json | 2 +- tests/pipeline/chatglm3-6B/param_config.json | 2 +- tests/pipeline/common/test_checkpoint.json | 14 +++++++------- tests/pipeline/common/test_inference.json | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json index 05ab4ec55..e88834923 100644 --- a/tests/pipeline/baichuan2-13B/param_config.json +++ b/tests/pipeline/baichuan2-13B/param_config.json @@ -86,7 +86,7 @@ "save-model-type": "mg", "target-tensor-parallel-size": "8", "load-dir": "/data/hf/baichuan2-13B-hf", - "save-dir": "/data/cache", + "save-dir": "/data/cache/baichuan2", "tokenizer-model": "/data/hf/baichuan2-13B-hf/tokenizer.model", "params-dtype": "bf16", "w-pack":"True", diff --git a/tests/pipeline/chatglm3-6B/param_config.json b/tests/pipeline/chatglm3-6B/param_config.json index a395c0743..c826ccb34 100644 --- a/tests/pipeline/chatglm3-6B/param_config.json +++ b/tests/pipeline/chatglm3-6B/param_config.json @@ -105,7 +105,7 @@ "target-tensor-parallel-size": "2", "target-pipeline-parallel-size": "4", "load-dir": "/data/hf/chatglm3-6b-base-hf/", - "save-dir": "/data/cache/", + "save-dir": "/data/cache/chatglm3_tp2pp4", "tokenizer-model": "/data/hf/chatglm3-6b-base-hf/tokenizer.model" } }, diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json index f5504ba7f..0524bf6b9 100644 --- a/tests/pipeline/common/test_checkpoint.json +++ b/tests/pipeline/common/test_checkpoint.json @@ -9,12 +9,12 @@ "target-pipeline-parallel-size": "2", "target-expert-parallel-size": "2", "num-layer-list": "6,10", - "load-dir":"/data/Mixtral-8x7B-v0.1", + "load-dir":"/data/hf/Mixtral-legacy-hf", "save-dir":"/data/wttest/test/hf2mc_mixtral_tp2pp2ep2dypp", "use-mcore-models": null, "model-type-hf": "mixtral", "params-dtype": "bf16", - "tokenizer-model":"/data/Mixtral-8x7B-v0.1/tokenizer.model" + "tokenizer-model":"/data/hf/Mixtral-legacy-hf/tokenizer.model" } }, { @@ -40,7 +40,7 @@ "target-tensor-parallel-size": "1", "target-pipeline-parallel-size": "1", "target-expert-parallel-size": "1", - "save-dir":"/data/Mixtral-8x7B-v0.1/", + "save-dir":"/data/hf/Mixtral-legacy-hf/", "load-dir":"/data/wttest/base/hf2mc_mixtral_tp1pp4ep2vpp2", "use-mcore-models": null, "model-type-hf": "mixtral", @@ -127,13 +127,13 @@ "save-model-type": "mg", "target-tensor-parallel-size": "8", "target-pipeline-parallel-size": "1", - "load-dir": "/data/gemma2-9b-hf/", + "load-dir": "/data/hf/gemma2-9b-hf/", "save-dir": "/data/gemma2-9b-mg-tp8pp1-mcore-test/", "use-mcore-models": null, "post-norm": null, "model-type-hf": "gemma2", "params-dtype": "bf16", - "tokenizer-model": "/data/gemma2-9b-hf/tokenizer.json", + "tokenizer-model": "/data/hf/gemma2-9b-hf/tokenizer.json", "spec":"mindspeed_llm.tasks.models.spec.gemma2_spec layer_spec" } }, @@ -266,7 +266,7 @@ "target-tensor-parallel-size": "2", "target-pipeline-parallel-size": "2", "save-dir":"/data/llama-3-8b-hf-nooplayer-tp2pp2vpp2-mcore-test/", - "load-dir":"/data/llama-3-8b-hf-layer14/", + "load-dir":"/data/hf/llama-3-8b-hf-layer14/", "num-layers-per-virtual-pipeline-stage": "2", "noop-layers": "1,15", "params-dtype": "bf16", @@ -426,7 +426,7 @@ "target-expert-parallel-size": "2", "load-dir":"/data/ci/orm/mixtral-8x7b-layers4-rm-hf/", "save-dir":"/data/ci/orm/mixtral-8x7b-layers4-rm-mcore_tp2pp2ep2_test/", - "tokenizer-model":"/data/Mixtral-8x7B-v0.1/tokenizer.model", + "tokenizer-model":"/data/hf/Mixtral-legacy-hf/tokenizer.model", "use-mcore-models": null, "model-type-hf": "mixtral", "orm": null diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json index 58b6c2e95..fa2e87d58 100644 --- a/tests/pipeline/common/test_inference.json +++ b/tests/pipeline/common/test_inference.json @@ -201,7 +201,7 @@ "swiglu": null, "load": "/data/Baichuan2-13B-tp8pp1-mcore-hf", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path": "/data/baichuan2-13B-hf/", + "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/", "tokenizer-not-use-fast": null, "fp16": null, "normalization": "RMSNorm" , @@ -219,7 +219,7 @@ } } ], - + "test_baichuan2_legacy_greedy_search": [ { "param": { @@ -242,7 +242,7 @@ "swiglu": null, "load": "/data/pipe/baichuan2-13b-tp8pp1-legacy-base", "tokenizer-type": "PretrainedFromHF", - "tokenizer-name-or-path": "/data/baichuan2-13B-hf/", + "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/", "tokenizer-not-use-fast": null, "fp16": null, "normalization": "RMSNorm" , -- Gitee From 99d8e8d97403698b1467dd88d3517af0093ea94a Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 15 May 2025 16:27:43 +0800 Subject: [PATCH 03/15] fix pipeline mistakes --- tests/pipeline/baichuan2-13B/param_config.json | 2 +- tests/pipeline/common/test_checkpoint.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pipeline/baichuan2-13B/param_config.json b/tests/pipeline/baichuan2-13B/param_config.json index e88834923..4417c0fe6 100644 --- a/tests/pipeline/baichuan2-13B/param_config.json +++ b/tests/pipeline/baichuan2-13B/param_config.json @@ -24,7 +24,7 @@ "fp16": null, "no-load-rng": null, "no-load-optim": null, - "load": "/data/pipeline/Baichuan2-13B-tp8pp1-mcore-hf-layer2", + "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base", "tokenizer-type": "PretrainedFromHF", "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/", "use-deter-comp": null diff --git a/tests/pipeline/common/test_checkpoint.json b/tests/pipeline/common/test_checkpoint.json index 0524bf6b9..93e4bf7b9 100644 --- a/tests/pipeline/common/test_checkpoint.json +++ b/tests/pipeline/common/test_checkpoint.json @@ -320,7 +320,7 @@ "save-model-type":"hf", "target-tensor-parallel-size": "1", "target-pipeline-parallel-size": "1", - "load-dir":"/data/llama-2-7b-hf-hf2ml-tp2pp4dypp-test", + "load-dir":"/data/hf/llama-2-7b-hf-hf2ml-tp2pp4dypp-test", "save-dir":"/data/hf/llama-2-7b-hf", "model-type-hf": "llama2" } -- Gitee From e9cd8f16d67dd891578c03354042d401b4701909 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 15 May 2025 16:32:54 +0800 Subject: [PATCH 04/15] fix pipeline mistakes --- tests/pipeline/common/test_inference.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/common/test_inference.json b/tests/pipeline/common/test_inference.json index fa2e87d58..8eaa07513 100644 --- a/tests/pipeline/common/test_inference.json +++ b/tests/pipeline/common/test_inference.json @@ -240,7 +240,7 @@ "square-alibi-mask": null, "fill-neg-inf": null, "swiglu": null, - "load": "/data/pipe/baichuan2-13b-tp8pp1-legacy-base", + "load": "/data/pipeline/baichuan2-13b-tp8pp1-legacy-base", "tokenizer-type": "PretrainedFromHF", "tokenizer-name-or-path": "/data/hf/baichuan2-13B-hf/", "tokenizer-not-use-fast": null, -- Gitee From af9efe29b2556e0242d74e7115caabcbc50a6363 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Mon, 19 May 2025 14:28:42 +0800 Subject: [PATCH 05/15] fix coverage module mistakes --- tests/coverage/evaluation/test_evaluate.py | 36 +++++++++++----------- tests/run_coverage.sh | 24 +++++++++++++-- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/tests/coverage/evaluation/test_evaluate.py b/tests/coverage/evaluation/test_evaluate.py index 3014459b5..10ee5084c 100644 --- a/tests/coverage/evaluation/test_evaluate.py +++ b/tests/coverage/evaluation/test_evaluate.py @@ -45,8 +45,8 @@ class TestEvaluate(DistributedTest): world_size = 8 test_config = create_testconfig(Path(__file__).with_suffix(".json")) - @pytest.mark.parametrize("test_params", test_config["test_llama2_mcore_agieval_evaluate"]) - def test_llama2_mcore_agieval_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_llama2_mcore_agieval_evaluate"]) + def test_llama2_mcore_agieval_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" os.environ["CLOSE_MATMUL_K_SHIFT"] = "1" @@ -62,8 +62,8 @@ class TestEvaluate(DistributedTest): assert math.isclose(expected_score, 0.192771, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_llama2_mcore_bbh_evaluate"]) - def test_llama2_mcore_bbh_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_llama2_mcore_bbh_evaluate"]) + def test_llama2_mcore_bbh_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" os.environ["CLOSE_MATMUL_K_SHIFT"] = "1" if dist.get_rank() == 0: @@ -78,8 +78,8 @@ class TestEvaluate(DistributedTest): assert math.isclose(expected_score, 0.744186, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_qwen2_mcore_needlebench_evaluate"]) - def test_qwen2_mcore_needlebench_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_qwen2_mcore_needlebench_evaluate"]) + def test_qwen2_mcore_needlebench_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" os.environ["CLOSE_MATMUL_K_SHIFT"] = "1" @@ -99,8 +99,8 @@ class TestEvaluateWorldSize1(DistributedTest): world_size = 1 test_config = create_testconfig(Path(__file__).with_suffix(".json")) - @pytest.mark.parametrize("test_params", test_config["test_qwen2_mcore_mmlu_evaluate"]) - def test_qwen2_mcore_mmlu_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_qwen2_mcore_mmlu_evaluate"]) + def test_qwen2_mcore_mmlu_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) @@ -113,8 +113,8 @@ class TestEvaluateWorldSize1(DistributedTest): expected_score = acquire_score(log_capture) assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_cmmlu_evaluate"]) - def test_cmmlu_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_cmmlu_evaluate"]) + def test_cmmlu_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) @@ -127,8 +127,8 @@ class TestEvaluateWorldSize1(DistributedTest): expected_score = acquire_score(log_capture) assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_humaneval_evaluate"]) - def test_humaneval_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_humaneval_evaluate"]) + def test_humaneval_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) @@ -141,8 +141,8 @@ class TestEvaluateWorldSize1(DistributedTest): expected_score = acquire_score(log_capture) assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_ceval_evaluate"]) - def test_ceval_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_ceval_evaluate"]) + def test_ceval_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) @@ -155,8 +155,8 @@ class TestEvaluateWorldSize1(DistributedTest): expected_score = acquire_score(log_capture) assert math.isclose(expected_score, 0.0, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_boolq_evaluate"]) - def test_boolq_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_boolq_evaluate"]) + def test_boolq_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) @@ -169,8 +169,8 @@ class TestEvaluateWorldSize1(DistributedTest): expected_score = acquire_score(log_capture) assert math.isclose(expected_score, 0.62171, abs_tol=1e-2), f"score {expected_score}, forward pass has been changed, check it!" - @pytest.mark.parametrize("test_params", test_config["test_gsm8k_evaluate"]) - def test_gsm8k_evaluate(self, build_args, test_params): + @pytest.mark.parametrize("params", test_config["test_gsm8k_evaluate"]) + def test_gsm8k_evaluate(self, build_args, params): os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" if dist.get_rank() == 0: handler, log_capture = setup_logger(PATTERN) diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh index b171e0779..99390344c 100644 --- a/tests/run_coverage.sh +++ b/tests/run_coverage.sh @@ -70,6 +70,15 @@ add_coverage() { sed -i "/ main()/a\ cov.stop()" pretrain_gpt.py sed -i "/ cov.stop()/a\ cov.save()" pretrain_gpt.py + sed -i "1a\import random" convert_ckpt.py + sed -i "2a\import time" convert_ckpt.py + sed -i "3a\import coverage" convert_ckpt.py + sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' convert_ckpt.py + + sed -i "/ main()/i\ cov.start()" convert_ckpt.py + sed -i "/ main()/a\ cov.stop()" convert_ckpt.py + sed -i "/ cov.stop()/a\ cov.save()" convert_ckpt.py + sed -i "1a\import random" posttrain_gpt.py sed -i "2a\import time" posttrain_gpt.py sed -i "3a\import coverage" posttrain_gpt.py @@ -99,6 +108,15 @@ remove_coverage() { sed -i "/ cov.stop()/d" pretrain_gpt.py sed -i "/ cov.save()/d" pretrain_gpt.py + sed -i "2d" convert_ckpt.py + sed -i "2d" convert_ckpt.py + sed -i "2d" convert_ckpt.py + sed -i "2d" convert_ckpt.py + + sed -i "/ cov.start()/d" convert_ckpt.py + sed -i "/ cov.stop()/d" convert_ckpt.py + sed -i "/ cov.save()/d" convert_ckpt.py + sed -i "2d" posttrain_gpt.py sed -i "2d" posttrain_gpt.py sed -i "2d" posttrain_gpt.py @@ -118,6 +136,8 @@ remove_coverage() { sed -i "/ cov.save()/d" ray_gpt.py } +add_coverage + # run the coverage for python files in the pipeline find "$PIPELINE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do if [ -d "$dir" ]; then @@ -132,14 +152,12 @@ pytest -xs ${UT_DIR} find "$UT_DIR" -mindepth 0 -maxdepth 1 -type d | while read -r dir; do if [ -d "$dir" ]; then find "$dir" -type f -name "*.py" | while read -r file; do - echo "${file}" + echo "${file}" coverage run -p --source=$SOURCE_DIR $file done fi done -add_coverage - # run the coverage for shell scripts in the st for test_case in "$ST_DIR"/*.sh; do file_name=$(basename "${test_case}") -- Gitee From 424fd7b8ac2054c98af6cc8d5ff2f9e91fb1ed27 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Mon, 19 May 2025 14:39:17 +0800 Subject: [PATCH 06/15] fix coverage module mistakes --- convert_ckpt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_ckpt.py b/convert_ckpt.py index d5c22f952..da22a3b1d 100644 --- a/convert_ckpt.py +++ b/convert_ckpt.py @@ -1,3 +1,4 @@ +# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved. import argparse import importlib import os -- Gitee From 692abeae9525f7058df0bc2b77ee3ab835ec7735 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Mon, 19 May 2025 15:53:55 +0800 Subject: [PATCH 07/15] fix coverage module mistakes --- tests/run_coverage.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh index 99390344c..ec94a8aa1 100644 --- a/tests/run_coverage.sh +++ b/tests/run_coverage.sh @@ -96,6 +96,15 @@ add_coverage() { sed -i "/ main()/a\ cov.stop()" ray_gpt.py sed -i "/ cov.stop()/a\ cov.save()" ray_gpt.py + + sed -i "1a\import random" preprocess_data.py + sed -i "2a\import time" preprocess_data.py + sed -i "3a\import coverage" preprocess_data.py + sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' preprocess_data.py + + sed -i "/def main():/a\ cov.start()" preprocess_data.py + sed -i "/ os.remove(idx_file.replace('.idx', '.bin'))/a\ cov.stop()" preprocess_data.py + sed -i "/ cov.stop()/a\ cov.save()" preprocess_data.py } remove_coverage() { @@ -134,6 +143,15 @@ remove_coverage() { sed -i "/ cov.stop()/d" ray_gpt.py sed -i "/ cov.save()/d" ray_gpt.py + + sed -i "2d" preprocess_data.py + sed -i "2d" preprocess_data.py + sed -i "2d" preprocess_data.py + sed -i "2d" preprocess_data.py + + sed -i "/ cov.start()/d" preprocess_data.py + sed -i "/ cov.stop()/d" preprocess_data.py + sed -i "/ cov.save()/d" preprocess_data.py } add_coverage -- Gitee From fb44a445d7a117ae0b5c9c68c986e75aa71d9e6d Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Mon, 26 May 2025 10:07:06 +0800 Subject: [PATCH 08/15] fix gloo feature patch mistakes --- mindspeed_llm/training/arguments.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mindspeed_llm/training/arguments.py b/mindspeed_llm/training/arguments.py index 315a2cff1..729e36346 100644 --- a/mindspeed_llm/training/arguments.py +++ b/mindspeed_llm/training/arguments.py @@ -1380,7 +1380,6 @@ def _add_dummy_args(args): args.recompute_in_bubble = False args.use_nanopipe = False args.moe_without_activation = False - args.disable_gloo_group = None args.ampipe_degree = 0 args.attention_mask_type = args.cp_attention_mask_type args.hccl_group_buffer_adaptive = False -- Gitee From 9b82ad1b6d0d76ff02c974a81f1e903008ccc33a Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 29 May 2025 11:11:44 +0800 Subject: [PATCH 09/15] add daily pipeline log --- tests/pipeline/pipe_run.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh index 881800c0a..cd30eedce 100644 --- a/tests/pipeline/pipe_run.sh +++ b/tests/pipeline/pipe_run.sh @@ -29,9 +29,11 @@ BASE_DIR=$(dirname "$(readlink -f "$0")") CURRENT_TIME=$(date "+%Y-%m-%d") BASELINE_DIR="$BASE_DIR/baseline" GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME" +ERROR_LOG_DIR="$GENERATE_LOG_DIR/exec_error_log_files" #mkdir cache to store product and will be removed after test mkdir -p "$GENERATE_LOG_DIR" +mkdir -p "$ERROR_LOG_DIR" touch "$GENERATE_LOG_DIR/exec_error.log" echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log @@ -54,9 +56,12 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do --generate-json $GENERATE_LOG_DIR/$name.json PYTEST_EXITCODE=$? if [ $PYTEST_EXITCODE -ne 0 ]; then + cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/" + cp -r $GENERATE_LOG_DIR/$name.json "${ERROR_LOG_DIR}/" echo "${name}.sh compare to baseline has failed, check it!" >> $GENERATE_LOG_DIR/exec_error.log fi else + cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/" echo "${name}.sh Script has failed. Exit!" >> $GENERATE_LOG_DIR/exec_error.log fi done @@ -64,9 +69,15 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do # python test testing find "$dir" -type f -name "*.py" | while read -r file; do echo "running $file" - if ! pytest --log-level=INFO "$file"; then + pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1 + PYTEST_EXITCODE=${PIPESTATUS[0]} + if [ $PYTEST_EXITCODE -ne 0 ]; then + cp -r ${file}.log "${ERROR_LOG_DIR}/" echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log" fi done fi done + +echo "=================tar error log==================" +tar -czvf "$GENERATE_LOG_DIR/err_log.tar.gz" "${ERROR_LOG_DIR}/" \ No newline at end of file -- Gitee From e66d840b2e9545cdb0f692517997bd0f30c165ca Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 29 May 2025 11:42:40 +0800 Subject: [PATCH 10/15] add daily pipeline log --- tests/pipeline/pipe_run.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh index cd30eedce..3e4d3234b 100644 --- a/tests/pipeline/pipe_run.sh +++ b/tests/pipeline/pipe_run.sh @@ -28,12 +28,11 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./ BASE_DIR=$(dirname "$(readlink -f "$0")") CURRENT_TIME=$(date "+%Y-%m-%d") BASELINE_DIR="$BASE_DIR/baseline" +LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log" GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME" -ERROR_LOG_DIR="$GENERATE_LOG_DIR/exec_error_log_files" #mkdir cache to store product and will be removed after test mkdir -p "$GENERATE_LOG_DIR" -mkdir -p "$ERROR_LOG_DIR" touch "$GENERATE_LOG_DIR/exec_error.log" echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log @@ -56,12 +55,9 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do --generate-json $GENERATE_LOG_DIR/$name.json PYTEST_EXITCODE=$? if [ $PYTEST_EXITCODE -ne 0 ]; then - cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/" - cp -r $GENERATE_LOG_DIR/$name.json "${ERROR_LOG_DIR}/" echo "${name}.sh compare to baseline has failed, check it!" >> $GENERATE_LOG_DIR/exec_error.log fi else - cp -r $GENERATE_LOG_DIR/$name.log "${ERROR_LOG_DIR}/" echo "${name}.sh Script has failed. Exit!" >> $GENERATE_LOG_DIR/exec_error.log fi done @@ -72,7 +68,6 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1 PYTEST_EXITCODE=${PIPESTATUS[0]} if [ $PYTEST_EXITCODE -ne 0 ]; then - cp -r ${file}.log "${ERROR_LOG_DIR}/" echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log" fi done @@ -80,4 +75,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do done echo "=================tar error log==================" -tar -czvf "$GENERATE_LOG_DIR/err_log.tar.gz" "${ERROR_LOG_DIR}/" \ No newline at end of file +tar -czvf "${LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/" \ No newline at end of file -- Gitee From 60fc7d252405f54dae867544324e4dc1d42b0954 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 29 May 2025 14:30:24 +0800 Subject: [PATCH 11/15] add daily pipeline log --- tests/pipeline/pipe_run.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh index 3e4d3234b..fd09462cc 100644 --- a/tests/pipeline/pipe_run.sh +++ b/tests/pipeline/pipe_run.sh @@ -28,7 +28,6 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./ BASE_DIR=$(dirname "$(readlink -f "$0")") CURRENT_TIME=$(date "+%Y-%m-%d") BASELINE_DIR="$BASE_DIR/baseline" -LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log" GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME" #mkdir cache to store product and will be removed after test @@ -75,4 +74,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do done echo "=================tar error log==================" -tar -czvf "${LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/" \ No newline at end of file +tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/" \ No newline at end of file -- Gitee From 26bbbea5d2fe408f75e4b2b4b22929d22e2b4160 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 29 May 2025 15:26:02 +0800 Subject: [PATCH 12/15] add daily pipeline log --- tests/pipeline/pipe_run.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh index fd09462cc..50874576a 100644 --- a/tests/pipeline/pipe_run.sh +++ b/tests/pipeline/pipe_run.sh @@ -64,7 +64,9 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do # python test testing find "$dir" -type f -name "*.py" | while read -r file; do echo "running $file" - pytest --log-level=INFO "$file" | tee "${file}.log" 2>&1 + tmp_file_name="${file#*MindSpeed-LLM/}" + file_name="${tmp_file_name//\//_}" + pytest --log-level=INFO "$file" | tee "${GENERATE_LOG_DIR}/${file_name}.log" 2>&1 PYTEST_EXITCODE=${PIPESTATUS[0]} if [ $PYTEST_EXITCODE -ne 0 ]; then echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log" @@ -74,4 +76,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do done echo "=================tar error log==================" -tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${$GENERATE_LOG_DIR}/" \ No newline at end of file +tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/" \ No newline at end of file -- Gitee From 4e2b6e550af17278037542df6d3214fbfa52dc22 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Fri, 30 May 2025 10:53:20 +0800 Subject: [PATCH 13/15] fix add daily pipeline log tar exception --- tests/pipeline/pipe_run.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/pipeline/pipe_run.sh b/tests/pipeline/pipe_run.sh index 50874576a..8e2e0877d 100644 --- a/tests/pipeline/pipe_run.sh +++ b/tests/pipeline/pipe_run.sh @@ -28,7 +28,8 @@ cp -rf /home/master_branch/Megatron-LM/megatron ./ BASE_DIR=$(dirname "$(readlink -f "$0")") CURRENT_TIME=$(date "+%Y-%m-%d") BASELINE_DIR="$BASE_DIR/baseline" -GENERATE_LOG_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log/$CURRENT_TIME" +GENERATE_LOG_BASE_DIR="/$(echo "$BASE_DIR" | cut -d'/' -f2)/pipeline_log" +GENERATE_LOG_DIR="$GENERATE_LOG_BASE_DIR/$CURRENT_TIME" #mkdir cache to store product and will be removed after test mkdir -p "$GENERATE_LOG_DIR" @@ -76,4 +77,4 @@ find "$BASE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do done echo "=================tar error log==================" -tar -czvf "${GENERATE_LOG_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/" \ No newline at end of file +tar -czvf "${GENERATE_LOG_BASE_DIR}/${CURRENT_TIME}.tar.gz" "${GENERATE_LOG_DIR}/" \ No newline at end of file -- Gitee From 8e7a47643d55c7a5518979d0f9185523220025f6 Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Fri, 30 May 2025 14:34:54 +0800 Subject: [PATCH 14/15] generate run coverage log --- tests/run_coverage.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh index ec94a8aa1..fa4c161b7 100644 --- a/tests/run_coverage.sh +++ b/tests/run_coverage.sh @@ -11,6 +11,13 @@ PIPELINE_DIR="empty" UT_DIR="empty" ST_DIR="empty" +# 创建日志目录 +GENERATE_LOG_DIR="$UT_DIR/logs" +mkdir -p "$GENERATE_LOG_DIR" +touch "$GENERATE_LOG_DIR/exec_error.log" +echo "core0.8.0 Execution Results" > $GENERATE_LOG_DIR/exec_error.log + + # 带参1用于区分运行场景 if [ -z "$1" ]; then echo "请提供一个参数(ST、PIPELINE、UT、all)" @@ -166,11 +173,18 @@ find "$PIPELINE_DIR" -mindepth 1 -maxdepth 1 -type d | while read -r dir; do done # run the coverage for python files in the unit tests -pytest -xs ${UT_DIR} find "$UT_DIR" -mindepth 0 -maxdepth 1 -type d | while read -r dir; do if [ -d "$dir" ]; then find "$dir" -type f -name "*.py" | while read -r file; do - echo "${file}" + echo "running ${file}" + filename=$(basename "$file") + extension="${filename##*.}" + name="${filename%.$extension}" + pytest -xs $file | tee "$GENERATE_LOG_DIR/$name.log" 2>&1 + PYTEST_EXITCODE=${PIPESTATUS[0]} + if [ $PYTEST_EXITCODE -ne 0 ]; then + echo "$file has failed, check it!" >> "$GENERATE_LOG_DIR/exec_error.log" + fi coverage run -p --source=$SOURCE_DIR $file done fi -- Gitee From f146e68ae17fa57c519246bd161bc37892b40ffb Mon Sep 17 00:00:00 2001 From: LuQingyun Date: Thu, 5 Jun 2025 20:43:51 +0800 Subject: [PATCH 15/15] fix run coverage tools --- tests/run_coverage.sh | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/run_coverage.sh b/tests/run_coverage.sh index fa4c161b7..8346e9068 100644 --- a/tests/run_coverage.sh +++ b/tests/run_coverage.sh @@ -77,6 +77,15 @@ add_coverage() { sed -i "/ main()/a\ cov.stop()" pretrain_gpt.py sed -i "/ cov.stop()/a\ cov.save()" pretrain_gpt.py + sed -i "1a\import random" pretrain_mamba.py + sed -i "2a\import time" pretrain_mamba.py + sed -i "3a\import coverage" pretrain_mamba.py + sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' pretrain_mamba.py + sed -i "5a\cov.start()" pretrain_mamba.py + + sed -i "/ main()/a\ cov.stop()" pretrain_mamba.py + sed -i "/ cov.stop()/a\ cov.save()" pretrain_mamba.py + sed -i "1a\import random" convert_ckpt.py sed -i "2a\import time" convert_ckpt.py sed -i "3a\import coverage" convert_ckpt.py @@ -86,6 +95,24 @@ add_coverage() { sed -i "/ main()/a\ cov.stop()" convert_ckpt.py sed -i "/ cov.stop()/a\ cov.save()" convert_ckpt.py + sed -i "1a\import random" evaluation.py + sed -i "2a\import time" evaluation.py + sed -i "3a\import coverage" evaluation.py + sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' evaluation.py + + sed -i "/def main():/a\ cov.start()" evaluation.py + sed -i "/ logger.info(f'NeedleBench_eval Running Time: {time.time() - a}')/a\ cov.stop()" evaluation.py + sed -i "/ cov.stop()/a\ cov.save()" evaluation.py + + sed -i "1a\import random" inference.py + sed -i "2a\import time" inference.py + sed -i "3a\import coverage" inference.py + sed -i '4a\cov = coverage.Coverage(data_suffix=f"usecase-{time.time_ns()}_{random.randint(0, 100)}")' inference.py + + sed -i "/def main():/a\ cov.start()" inference.py + sed -i "/ task_factory(args, model)/a\ cov.stop()" inference.py + sed -i "/ cov.stop()/a\ cov.save()" inference.py + sed -i "1a\import random" posttrain_gpt.py sed -i "2a\import time" posttrain_gpt.py sed -i "3a\import coverage" posttrain_gpt.py @@ -124,6 +151,15 @@ remove_coverage() { sed -i "/ cov.stop()/d" pretrain_gpt.py sed -i "/ cov.save()/d" pretrain_gpt.py + sed -i "2d" pretrain_mamba.py + sed -i "2d" pretrain_mamba.py + sed -i "2d" pretrain_mamba.py + sed -i "2d" pretrain_mamba.py + sed -i "2d" pretrain_mamba.py + + sed -i "/ cov.stop()/d" pretrain_mamba.py + sed -i "/ cov.save()/d" pretrain_mamba.py + sed -i "2d" convert_ckpt.py sed -i "2d" convert_ckpt.py sed -i "2d" convert_ckpt.py @@ -133,6 +169,24 @@ remove_coverage() { sed -i "/ cov.stop()/d" convert_ckpt.py sed -i "/ cov.save()/d" convert_ckpt.py + sed -i "2d" evaluation.py + sed -i "2d" evaluation.py + sed -i "2d" evaluation.py + sed -i "2d" evaluation.py + + sed -i "/ cov.start()/d" evaluation.py + sed -i "/ cov.stop()/d" evaluation.py + sed -i "/ cov.save()/d" evaluation.py + + sed -i "2d" inference.py + sed -i "2d" inference.py + sed -i "2d" inference.py + sed -i "2d" inference.py + + sed -i "/ cov.start()/d" inference.py + sed -i "/ cov.stop()/d" inference.py + sed -i "/ cov.save()/d" inference.py + sed -i "2d" posttrain_gpt.py sed -i "2d" posttrain_gpt.py sed -i "2d" posttrain_gpt.py -- Gitee