diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index d670bc0df0a5e4bdf54438e684257f04a26899c3..72069bf798764f622e190b5beafb3b4d505ebcc6 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,2 +1,2 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250329/master_20250329010013_3c6d4175fec4a7b16ecefc0882f88f25ebb725e5_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250526/master_20250526010017_b6f48709740e52a1daf206553f742609a0998bae_newest/' \ No newline at end of file diff --git a/ci/access_control_test.py b/ci/access_control_test.py deleted file mode 100644 index 58a90dfc63f804a17869cc6448ae0c4ff401b854..0000000000000000000000000000000000000000 --- a/ci/access_control_test.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import stat -import sys -import unittest -from pathlib import Path -import xmlrunner - - -# ============================= -# ST test, run with shell -# ============================= -def success_check(res): - if res != 0: - sys.exit(1) - - -def success_check_ut(res): - if len(res.failures) + len(res.errors) != 0: - sys.exit(1) - - -class ST_Test: - def __init__(self): - self.shell_file_list = [] - - - def run_shell(self): - for shell_file in self.shell_file_list: - success_check(os.system("sh {}".format(shell_file))) - -# =============================================== -# UT test, run with pytest, waiting for more ... -# =============================================== - - -if __name__ == "__main__": - st_test = ST_Test() - st_test.run_shell() - test_loader = unittest.TestLoader() - discover = test_loader.discover(start_dir="./", pattern="test*.py") - - runner = unittest.TextTestRunner() - success_check_ut(runner.run(discover)) diff --git a/scripts/set_env_rl.sh b/tests/scripts/set_env_rl.sh similarity index 81% rename from scripts/set_env_rl.sh rename to tests/scripts/set_env_rl.sh index c8cd4b1a04979d2be044075bc73315c518ce1493..e84629ddfc487a91de9ef1465c59d2536caa88ca 100644 --- a/scripts/set_env_rl.sh +++ b/tests/scripts/set_env_rl.sh @@ -1,7 +1,8 @@ #!/bin/bash script_path=$(realpath "${BASH_SOURCE[0]}") script_dir=$(dirname "$script_path") -MindSpeed_Core_MS_PATH=$(dirname $script_dir) +parent_dir=$(dirname "$script_dir") +MindSpeed_Core_MS_PATH=$(dirname "$parent_dir") mkdir ${MindSpeed_Core_MS_PATH}/RL cp -r ${MindSpeed_Core_MS_PATH}/test_convert_rl.sh ${MindSpeed_Core_MS_PATH}/RL cp -r ${MindSpeed_Core_MS_PATH}/tools ${MindSpeed_Core_MS_PATH}/RL diff --git a/scripts/set_path.sh b/tests/scripts/set_path.sh similarity index 84% rename from scripts/set_path.sh rename to tests/scripts/set_path.sh index 9090d4e84c028202f090743a8bbbd225c567ff5d..52ce6fc55ed944260a6a099e70ebb9d80a62f32c 100644 --- a/scripts/set_path.sh +++ b/tests/scripts/set_path.sh @@ -1,7 +1,8 @@ #!/bin/bash script_path=$(realpath "${BASH_SOURCE[0]}") script_dir=$(dirname "$script_path") -MindSpeed_Core_MS_PATH=$(dirname $script_dir) +parent_dir=$(dirname "$script_dir") +MindSpeed_Core_MS_PATH=$(dirname "$parent_dir") export PYTHONPATH=${MindSpeed_Core_MS_PATH}/msadapter/mindtorch/:$PYTHONPATH export PYTHONPATH=${MindSpeed_Core_MS_PATH}/MindSpeed-LLM/:${MindSpeed_Core_MS_PATH}/Megatron-LM/:${MindSpeed_Core_MS_PATH}/MindSpeed/:${MindSpeed_Core_MS_PATH}/transformers/src/:$PYTHONPATH echo "..............................................done set PYTHONPATH" diff --git a/scripts/set_path_rl.sh b/tests/scripts/set_path_rl.sh similarity index 91% rename from scripts/set_path_rl.sh rename to tests/scripts/set_path_rl.sh index 15d6535cc732cf490eb7a989efe79ed010f84425..661f9ac07d45677b952b3e6d765288cd43118059 100644 --- a/scripts/set_path_rl.sh +++ b/tests/scripts/set_path_rl.sh @@ -6,7 +6,8 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 script_path=$(realpath "${BASH_SOURCE[0]}") script_dir=$(dirname "$script_path") -MindSpeed_Core_MS_PATH=$(dirname $script_dir) +parent_dir=$(dirname "$script_dir") +MindSpeed_Core_MS_PATH=$(dirname "$parent_dir") export PYTHONPATH=${MindSpeed_Core_MS_PATH}/RL/Megatron-LM/:${MindSpeed_Core_MS_PATH}/RL/MindSpeed/:${MindSpeed_Core_MS_PATH}/RL/MindSpeed-LLM/:${MindSpeed_Core_MS_PATH}/RL/MindSpeed-RL/:$PYTHONPATH export PYTHONPATH=${MindSpeed_Core_MS_PATH}/RL/msadapter/mindtorch/:${MindSpeed_Core_MS_PATH}/RL/transformers/src:${MindSpeed_Core_MS_PATH}/RL/vllm/:${MindSpeed_Core_MS_PATH}/RL/vllm-ascend/:$PYTHONPATH export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/opp/built-in/op_impl/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$PYTHONPATH diff --git a/tests/st/net/test_ds3_grpo/test_ds3_grpo.sh b/tests/st/net/test_ds3_grpo/test_ds3_grpo.sh index ff2916c104e25cb9fb2b143c248039e15188ea4f..f2496562b44f7dc8ffd6c853acebda226deee45c 100644 --- a/tests/st/net/test_ds3_grpo/test_ds3_grpo.sh +++ b/tests/st/net/test_ds3_grpo/test_ds3_grpo.sh @@ -1,5 +1,5 @@ #!/bin/bash -source ../../../../scripts/set_path_rl.sh +source ../../../scripts/set_path_rl.sh MindSpeed_RL_PATH=../../../../RL/MindSpeed-RL backup() { diff --git a/tests/st/net/test_ds3_pretrain/pta_non_det.txt b/tests/st/net/test_ds3_pretrain/pta_non_det.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1481572ce8eb15c71a0bbba1f411b5a22308cbf --- /dev/null +++ b/tests/st/net/test_ds3_pretrain/pta_non_det.txt @@ -0,0 +1,26 @@ +training ... +(min, max) time across ranks (ms): + model-and-optimizer-setup ......................: (13754.94, 13759.72) + train/valid/test-data-iterators-setup ..........: (149.72, 186.70) +[before the start of training step] datetime: 2025-05-21 10:49:37 +WARNING:megatron.core.models.common.embeddings.rotary_pos_embedding:Setting apply_rope_fusion to false because its implementation is not included in Apex. Try upgrading to the latest version +[W compiler_depend.ts:41] Warning: Warning: kernel [ArgSort] can not support dtype int32 or int64 on AiCore, Now this kernel is running on AiCpu.If you are more concerned about high-performance execution,please cast dtype to float32. (function operator()) +Number of parameters in transformer layers in billions: 26.19 +Number of parameters in embedding layers in billions: 1.85 +Total number of parameters in billions: 28.04 +Number of parameters in most loaded shard in billions: 14.0214 +Number of parameters in other shards in billions: 13.0947 +Theoretical memory footprints: weight and optimizer=120346.30 MB + [2025-05-21 10:49:52] iteration 1/ 10 | consumed samples: 8 | elapsed time per iteration (ms): 15074.1 | learning rate: 9.757730E-06 | global batch size: 8 | lm loss: 13.4216022491455078 | loss scale: 1.0 | grad norm: 11.4071564614665011 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[Rank 0] (after 1 iterations) memory (MB) | allocated: 17975.59765625 | max allocated: 24781.76806640625 | reserved: 28276.0 | max reserved: 28276.0 +[Rank 4] (after 1 iterations) memory (MB) | allocated: 31935.140625 | max allocated: 41007.94775390625 | reserved: 42298.0 | max reserved: 42298.0 + [2025-05-21 10:49:55] iteration 2/ 10 | consumed samples: 16 | elapsed time per iteration (ms): 2768.4 | learning rate: 9.054634E-06 | global batch size: 8 | lm loss: 13.3998327255249023 | loss scale: 1.0 | grad norm: 10.8173617529537101 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:49:58] iteration 3/ 10 | consumed samples: 24 | elapsed time per iteration (ms): 2756.7 | learning rate: 7.959537E-06 | global batch size: 8 | lm loss: 13.5661659240722656 | loss scale: 1.0 | grad norm: 22.3005382840660218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:00] iteration 4/ 10 | consumed samples: 32 | elapsed time per iteration (ms): 2763.0 | learning rate: 6.579634E-06 | global batch size: 8 | lm loss: 13.3748970031738281 | loss scale: 1.0 | grad norm: 12.7191987954328987 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:03] iteration 5/ 10 | consumed samples: 40 | elapsed time per iteration (ms): 2777.3 | learning rate: 5.050000E-06 | global batch size: 8 | lm loss: 13.6673517227172852 | loss scale: 1.0 | grad norm: 12.5808988367714285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:06] iteration 6/ 10 | consumed samples: 48 | elapsed time per iteration (ms): 2791.8 | learning rate: 3.520366E-06 | global batch size: 8 | lm loss: 13.4905300140380859 | loss scale: 1.0 | grad norm: 15.5625511383074375 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:09] iteration 7/ 10 | consumed samples: 56 | elapsed time per iteration (ms): 2776.0 | learning rate: 2.140463E-06 | global batch size: 8 | lm loss: 13.4095611572265625 | loss scale: 1.0 | grad norm: 15.2414261057317315 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:12] iteration 8/ 10 | consumed samples: 64 | elapsed time per iteration (ms): 2823.4 | learning rate: 1.045366E-06 | global batch size: 8 | lm loss: 13.3672180175781250 | loss scale: 1.0 | grad norm: 14.1250909616697893 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:14] iteration 9/ 10 | consumed samples: 72 | elapsed time per iteration (ms): 2809.7 | learning rate: 3.422702E-07 | global batch size: 8 | lm loss: 13.2752037048339844 | loss scale: 1.0 | grad norm: 10.1879575310687986 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 10:50:17] iteration 10/ 10 | consumed samples: 80 | elapsed time per iteration (ms): 2810.7 | learning rate: 1.000000E-07 | global batch size: 8 | lm loss: 13.0997333526611328 | loss scale: 1.0 | grad norm: 11.8447491871022272 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[after training is done] datetime: 2025-05-21 10:50:17 diff --git a/tests/st/net/test_ds3_pretrain/run_ms_nondetermin.sh b/tests/st/net/test_ds3_pretrain/run_ms_nondetermin.sh new file mode 100644 index 0000000000000000000000000000000000000000..9aa01f3b72e8c2e5f0c88e94b7c90860c61417b3 --- /dev/null +++ b/tests/st/net/test_ds3_pretrain/run_ms_nondetermin.sh @@ -0,0 +1,25 @@ +#!/bin/bash +backup() { + fname=$1 + cp $fname $fname'_back' + echo '======'$fname 'backuped!' +} + +recover() { + fname=$1 + cp $fname'_back' $fname + echo '======'$fname 'recovered!!!!' +} + +memRecord() { + recordFile=$1 + bash mem.sh $recordFile > mem.txt 2>&1& +} + + +# 关闭确定性计算跑一遍 +export HCCL_DETERMINISTIC=false # HCCL确定性 +export ASCEND_LAUNCH_BLOCKING= # 硬件确定性 +export NCCL_DETERMINISTIC= +bash test_ds3_pretrain.sh > ms_non_det.txt +cat ms_non_det.txt \ No newline at end of file diff --git a/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.py b/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.py index 4779c5f305f092156b37383175bdd886fbbc042c..517e24d59ab7451c5296c64a20c56b4cc222c098 100644 --- a/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.py +++ b/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.py @@ -16,40 +16,78 @@ import os import sys import pytest +from datetime import datetime sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from utils import parse_log_file +def run_mindspore_ds3_pretrain_determinstic(): + """ + Feature: run mindspore pretrain_ds3 + Description: run mindspore pretrain_ds3 to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_determin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_det.log" + + +def run_mindspore_ds3_pretrain_nondeterminstic(): + """ + Feature: run mindspore pretrain_ds3 + Description: run mindspore pretrain_ds3 to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_nondetermin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_non_det.log" + + +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.level0 +@pytest.mark.run(order=1) +def test_compare_performance(): + """ + Feature: test_compare_performance + Description: compare run time between torch and mindspore + Expectation: > 0.95pta + """ + run_mindspore_ds3_pretrain_nondeterminstic() + data_pt = parse_log_file('pta_non_det.txt') + data_ms = parse_log_file('ms_non_det.txt') + tformat = '%Y-%m-%d %H:%M:%S' + dt_ms = datetime.strptime(data_ms[10][0], tformat) - datetime.strptime(data_ms[5][0], tformat) + dt_pt = datetime.strptime(data_pt[10][0], tformat) - datetime.strptime(data_pt[5][0], tformat) + # 关闭确定性计算,统计5-10步,ms性能 > 0.95pta性能 + print("pt_time: %s s" % dt_pt.total_seconds()) + print("ms_time: %s s" % dt_ms.total_seconds()) + ratio = 0.95 + assert dt_ms.total_seconds() <= dt_pt.total_seconds()/ratio + + @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single -class TestDS3Pretrain: - @pytest.mark.level0 - @pytest.mark.run(order=1) - def test_mindspore_ds3_pretrain_determinstic(self): - """ - Feature: test mindspore pretrain_ds3 - Description: run mindspore pretrain_ds3 to generate pynative loss - Expectation: test success - """ - scripts_name = "run_ms_determin.sh" - - test_path = os.path.split(os.path.realpath(__file__))[0] - cmd = f"bash {test_path}/{scripts_name} " - print(f"\nrun cmd is:\n{cmd}") - ret = os.system(cmd) - assert ret == 0, f"msrun failed, please check ms_det.log" - - @pytest.mark.level0 - @pytest.mark.run(order=2) - def test_compare_res(self): - """ - Feature: test_compare_res - Description: compare relative error between torch loss and mindspore loss - Expectation: no error - """ - loss_pt = parse_log_file('pta_det.txt') - loss_ms = parse_log_file('ms_det.txt') - # 开确定性计算,精度对齐 - for i in loss_pt: - print("loss:", loss_pt[i][2], loss_ms[i][2]) - assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file +@pytest.mark.level0 +@pytest.mark.run(order=2) +def test_compare_accuracy(): + """ + Feature: test_compare_accuracy + Description: compare relative error between torch loss and mindspore loss + Expectation: no error + """ + run_mindspore_ds3_pretrain_determinstic() + loss_pt = parse_log_file('pta_det.txt') + loss_ms = parse_log_file('ms_det.txt') + # 开确定性计算,精度对齐 + for i in loss_pt: + print("loss:", loss_pt[i][2], loss_ms[i][2]) + assert loss_pt[i][2] == loss_ms[i][2] diff --git a/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.sh b/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.sh index 86b5fdfe6a1dfab34cf9563c3a948ada4509e093..512d8a0c9e05135e948e3dff055314a6a228aa74 100644 --- a/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.sh +++ b/tests/st/net/test_ds3_pretrain/test_ds3_pretrain.sh @@ -3,13 +3,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_CONNECT_TIMEOUT=360 -source ../../../../scripts/set_path.sh +source ../../../scripts/set_path.sh MindSpeed_LLM_PATH=../../../../MindSpeed-LLM -export HCCL_DETERMINISTIC=true # HCCL确定性 -export ASCEND_LAUNCH_BLOCKING=1 # 硬件确定性 -export NCCL_DETERMINISTIC=1 - NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6088 diff --git a/tests/st/net/test_ds3_sft/pta_det.txt b/tests/st/net/test_ds3_sft/pta_det.txt index c82a8142797c2746e3ac362fd891d3b5ebc3c75d..1aa830b6e64ed5f6af9586b3520bd98625a61842 100644 --- a/tests/st/net/test_ds3_sft/pta_det.txt +++ b/tests/st/net/test_ds3_sft/pta_det.txt @@ -1,85 +1,3 @@ -training ... -[before the start of training step] datetime: 2025-05-06 16:14:48 -WARNING:megatron.core.models.common.embeddings.rotary_pos_embedding:Setting apply_rope_fusion to false because its implementation is not included in Apex. Try upgrading to the latest version -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/npu_moe_token_permute/build.ninja... -Building extension module npu_moe_token_permute... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/grouped_matmul/build.ninja... -Building extension module grouped_matmul... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module grouped_matmul... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Loading extension module grouped_matmul... -Loading extension module grouped_matmul... -Loading extension module grouped_matmul... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/npu_moe_token_unpermute/build.ninja... -Building extension module npu_moe_token_unpermute... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... - -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/npu_moe_token_permute/build.ninja... -Building extension module npu_moe_token_permute... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Loading extension module npu_moe_token_permute... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root...Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... - -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/grouped_matmul/build.ninja... -Building extension module grouped_matmul... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module grouped_matmul... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Loading extension module grouped_matmul... -Loading extension module grouped_matmul... -Loading extension module grouped_matmul... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Using /home/jenkins/.cache/torch_extensions/py39_cpu as PyTorch extensions root... -Emitting ninja build file /home/jenkins/.cache/torch_extensions/py39_cpu/npu_moe_token_unpermute/build.ninja... -Building extension module npu_moe_token_unpermute... -Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) -ninja: no work to do. -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -Loading extension module npu_moe_token_unpermute... -[W compiler_depend.ts:2573] Warning: Tensor not is not allocated by NPUCachingAllocator, skip eraseStream. (function operator()) -Number of parameters in transformer layers in billions: 26.19 -Number of parameters in embedding layers in billions: 1.85 -Total number of parameters in billions: 28.04 -Number of parameters in most loaded shard in billions: 14.0214 -Number of parameters in other shards in billions: 13.0947 -Theoretical memory footprints: weight and optimizer=120346.30 MB [2025-05-06 16:15:06] iteration 1/ 10 | consumed samples: 8 | elapsed time per iteration (ms): 17382.0 | learning rate: 1.000000E-05 | global batch size: 8 | lm loss: 2.5055196285247803 | loss scale: 1.0 | grad norm: 30.5073025200452719 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | [Rank 0] (after 1 iterations) memory (MB) | allocated: 16805.6318359375 | max allocated: 17467.81298828125 | reserved: 18310.0 | max reserved: 18310.0 [Rank 4] (after 1 iterations) memory (MB) | allocated: 16640.2607421875 | max allocated: 17088.26318359375 | reserved: 17432.0 | max reserved: 17432.0 diff --git a/tests/st/net/test_ds3_sft/pta_non_det.txt b/tests/st/net/test_ds3_sft/pta_non_det.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffb0a80e4f3697b0c3bc9cd2276c196dcff469f1 --- /dev/null +++ b/tests/st/net/test_ds3_sft/pta_non_det.txt @@ -0,0 +1,13 @@ + [2025-05-21 14:14:39] iteration 1/ 10 | consumed samples: 8 | elapsed time per iteration (ms): 17388.6 | learning rate: 1.000000E-05 | global batch size: 8 | lm loss: 2.5055196285247803 | loss scale: 1.0 | grad norm: 30.5078306307964660 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[Rank 0] (after 1 iterations) memory (MB) | allocated: 16805.6318359375 | max allocated: 17467.81298828125 | reserved: 18310.0 | max reserved: 18310.0 +[Rank 4] (after 1 iterations) memory (MB) | allocated: 16640.2607421875 | max allocated: 17088.26318359375 | reserved: 17432.0 | max reserved: 17432.0 + [2025-05-21 14:14:39] iteration 2/ 10 | consumed samples: 16 | elapsed time per iteration (ms): 576.3 | learning rate: 9.701478E-06 | global batch size: 8 | lm loss: 4.0182037353515625 | loss scale: 1.0 | grad norm: 24.5773605935942321 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:40] iteration 3/ 10 | consumed samples: 24 | elapsed time per iteration (ms): 506.0 | learning rate: 8.841920E-06 | global batch size: 8 | lm loss: 6.6553277969360352 | loss scale: 1.0 | grad norm: 26.3783260616633832 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:40] iteration 4/ 10 | consumed samples: 32 | elapsed time per iteration (ms): 513.8 | learning rate: 7.525000E-06 | global batch size: 8 | lm loss: 7.3994584083557129 | loss scale: 1.0 | grad norm: 29.8324946884116002 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:41] iteration 5/ 10 | consumed samples: 40 | elapsed time per iteration (ms): 514.8 | learning rate: 5.909558E-06 | global batch size: 8 | lm loss: 7.8787059783935547 | loss scale: 1.0 | grad norm: 45.1551120707076095 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:41] iteration 6/ 10 | consumed samples: 48 | elapsed time per iteration (ms): 518.6 | learning rate: 4.190442E-06 | global batch size: 8 | lm loss: 7.0007390975952148 | loss scale: 1.0 | grad norm: 65.0190342543524054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:42] iteration 7/ 10 | consumed samples: 56 | elapsed time per iteration (ms): 494.9 | learning rate: 2.575000E-06 | global batch size: 8 | lm loss: 8.9156417846679688 | loss scale: 1.0 | grad norm: 67.4179611679047355 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:42] iteration 8/ 10 | consumed samples: 64 | elapsed time per iteration (ms): 502.0 | learning rate: 1.258080E-06 | global batch size: 8 | lm loss: 8.3894538879394531 | loss scale: 1.0 | grad norm: 112.8329331932316677 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:43] iteration 9/ 10 | consumed samples: 72 | elapsed time per iteration (ms): 502.1 | learning rate: 3.985215E-07 | global batch size: 8 | lm loss: 10.7227420806884766 | loss scale: 1.0 | grad norm: 39.1689933139906685 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:14:43] iteration 10/ 10 | consumed samples: 80 | elapsed time per iteration (ms): 506.4 | learning rate: 1.000000E-07 | global batch size: 8 | lm loss: 10.0526390075683594 | loss scale: 1.0 | grad norm: 68.0683604678219467 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[after training is done] datetime: 2025-05-21 14:14:43 \ No newline at end of file diff --git a/tests/st/net/test_ds3_sft/run_ms_nondetermin.sh b/tests/st/net/test_ds3_sft/run_ms_nondetermin.sh new file mode 100644 index 0000000000000000000000000000000000000000..81b84fbe4074d0302cf6cd3ebe513cef80c956a7 --- /dev/null +++ b/tests/st/net/test_ds3_sft/run_ms_nondetermin.sh @@ -0,0 +1,25 @@ +#!/bin/bash +backup() { + fname=$1 + cp $fname $fname'_back' + echo '======'$fname 'backuped!' +} + +recover() { + fname=$1 + cp $fname'_back' $fname + echo '======'$fname 'recovered!!!!' +} + +memRecord() { + recordFile=$1 + bash mem.sh $recordFile > mem.txt 2>&1& +} + + +# 关闭确定性计算跑一遍 +export HCCL_DETERMINISTIC=false # HCCL确定性 +export ASCEND_LAUNCH_BLOCKING= # 硬件确定性 +export NCCL_DETERMINISTIC= +bash test_ds3_sft.sh > ms_non_det.txt +cat ms_non_det.txt \ No newline at end of file diff --git a/tests/st/net/test_ds3_sft/test_ds3_sft.py b/tests/st/net/test_ds3_sft/test_ds3_sft.py index cadc173da80fce45f39dd0392e49ec639b0e4358..c6beb6505b5b0e77328ec2040e69320361cba10e 100644 --- a/tests/st/net/test_ds3_sft/test_ds3_sft.py +++ b/tests/st/net/test_ds3_sft/test_ds3_sft.py @@ -16,40 +16,80 @@ import os import sys import pytest +from datetime import datetime sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from utils import parse_log_file +def run_mindspore_ds3_sft_determinstic(): + """ + Feature: test mindspore pretrain_glm + Description: run mindspore ds3_sft to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_determin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_det.log" + + +def run_mindspore_ds3_sft_nondeterminstic(): + """ + Feature: test mindspore pretrain_glm + Description: run mindspore ds3_sft to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_nondetermin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_non_det.log" + + +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.level0 +@pytest.mark.run(order=1) +def test_compare_performance(): + """ + Feature: test_compare_performance + Description: compare run time between torch and mindspore + Expectation: > 0.95pta + """ + run_mindspore_ds3_sft_nondeterminstic() + data_pt = parse_log_file('pta_non_det.txt') + data_ms = parse_log_file('ms_non_det.txt') + tformat = '%Y-%m-%d %H:%M:%S' + dt_ms = datetime.strptime(data_ms[10][0], tformat) - datetime.strptime(data_ms[5][0], tformat) + dt_pt = datetime.strptime(data_pt[10][0], tformat) - datetime.strptime(data_pt[5][0], tformat) + # 关闭确定性计算,统计5-10步,ms性能 > 0.95pta性能 + print("pt_time: %s s" % dt_pt.total_seconds()) + print("ms_time: %s s" % dt_ms.total_seconds()) + ratio = dt_ms.total_seconds() / dt_pt.total_seconds() + print("Ratio(ms_time/pt_time): %s" % ratio) + ratio = 0.95 + assert dt_ms.total_seconds() <= dt_pt.total_seconds()/ratio + + @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single -class TestDS3Sft: - @pytest.mark.level0 - @pytest.mark.run(order=1) - def test_mindspore_ds3_sft_determinstic(self): - """ - Feature: test mindspore pretrain_glm - Description: run mindspore ds3_sft to generate pynative loss - Expectation: test success - """ - scripts_name = "run_ms_determin.sh" - - test_path = os.path.split(os.path.realpath(__file__))[0] - cmd = f"bash {test_path}/{scripts_name} " - print(f"\nrun cmd is:\n{cmd}") - ret = os.system(cmd) - assert ret == 0, f"msrun failed, please check ms_det.log" - - @pytest.mark.level0 - @pytest.mark.run(order=2) - def test_compare_res(self): - """ - Feature: test_compare_res - Description: compare relative error between torch loss and mindspore loss - Expectation: no error - """ - loss_pt = parse_log_file('pta_det.txt') - loss_ms = parse_log_file('ms_det.txt') - # 开确定性计算,精度对齐 - for i in loss_pt: - print("loss:", loss_pt[i][2], loss_ms[i][2]) - assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file +@pytest.mark.level0 +@pytest.mark.run(order=2) +def test_compare_accuracy(): + """ + Feature: test_compare_accuracy + Description: compare relative error between torch loss and mindspore loss + Expectation: no error + """ + run_mindspore_ds3_sft_determinstic() + loss_pt = parse_log_file('pta_det.txt') + loss_ms = parse_log_file('ms_det.txt') + # 开确定性计算,精度对齐 + for i in loss_pt: + print("loss:", loss_pt[i][2], loss_ms[i][2]) + assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file diff --git a/tests/st/net/test_ds3_sft/test_ds3_sft.sh b/tests/st/net/test_ds3_sft/test_ds3_sft.sh index 8c415c116605ce2ac02e2be20f413765cf5d9126..307edd7ec81a3da95946919144c77a8efb01503c 100644 --- a/tests/st/net/test_ds3_sft/test_ds3_sft.sh +++ b/tests/st/net/test_ds3_sft/test_ds3_sft.sh @@ -3,11 +3,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_CONNECT_TIMEOUT=3600 -export HCCL_DETERMINISTIC=true # HCCL确定性 -export ASCEND_LAUNCH_BLOCKING=1 # 硬件确定性 -export NCCL_DETERMINISTIC=1 - -source ../../../../scripts/set_path.sh +source ../../../scripts/set_path.sh MindSpeed_LLM_PATH=../../../../MindSpeed-LLM GPUS_PER_NODE=8 @@ -23,7 +19,6 @@ DATA_PATH="/home/workspace/mindspore_dataset/msadapter/test_input/net/test_ds3_s TOKENIZER_PATH="/home/workspace/mindspore_dataset/msadapter/test_input/net/test_ds3_sft/tokenizer" CKPT_LOAD_DIR="/home/workspace/mindspore_dataset/msadapter/test_input/net/test_ds3_sft/load" - TP=1 PP=2 EP=4 diff --git a/tests/st/net/test_qwen_grpo/test_qwen_grpo.py b/tests/st/net/test_qwen_grpo/test_qwen_grpo.py index 3689d720de9bc20bdabaaeec47d29b29f70be94a..074a8fd106ed1dfcbbf7c295eb5f0e171db65a0a 100644 --- a/tests/st/net/test_qwen_grpo/test_qwen_grpo.py +++ b/tests/st/net/test_qwen_grpo/test_qwen_grpo.py @@ -36,7 +36,7 @@ def parse_log_file(file): @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single class TestQwenGRPO: - @pytest.mark.level0 + @pytest.mark.level1 @pytest.mark.run(order=1) def test_qwen_grpo(self): """ @@ -52,7 +52,7 @@ class TestQwenGRPO: ret = os.system(cmd) assert ret == 0, f"msrun failed, please check ms_det.log" - @pytest.mark.level0 + @pytest.mark.level1 @pytest.mark.run(order=2) def test_compare_res(self): """ diff --git a/tests/st/net/test_qwen_grpo/test_qwen_grpo.sh b/tests/st/net/test_qwen_grpo/test_qwen_grpo.sh index 2ee9ba5b7612f9b572c0d43211c36cc9959a5d25..a06a265a42934d4f120cebef4d75eac8b526e1b1 100644 --- a/tests/st/net/test_qwen_grpo/test_qwen_grpo.sh +++ b/tests/st/net/test_qwen_grpo/test_qwen_grpo.sh @@ -1,5 +1,5 @@ #!/bin/bash -source ../../../../scripts/set_path_rl.sh +source ../../../scripts/set_path_rl.sh MindSpeed_RL_PATH=../../../../RL/MindSpeed-RL backup() { diff --git a/tests/st/net/test_qwen_sft/pta_non_det.txt b/tests/st/net/test_qwen_sft/pta_non_det.txt new file mode 100644 index 0000000000000000000000000000000000000000..16d04560655c8cca460c3c1bc25994dae1e63ec8 --- /dev/null +++ b/tests/st/net/test_qwen_sft/pta_non_det.txt @@ -0,0 +1,29 @@ +training ... +[before the start of training step] datetime: 2025-05-21 14:18:53 +WARNING:megatron.core.models.common.embeddings.rotary_pos_embedding:Setting apply_rope_fusion to false because its implementation is not included in Apex. Try upgrading to the latest version +[W compiler_depend.ts:2573] Warning: Tensor not is not allocated by NPUCachingAllocator, skip eraseStream. (function operator()) +Number of parameters in transformer layers in billions: 6.53 +Number of parameters in embedding layers in billions: 1.09 +Total number of parameters in billions: 7.62 +Number of parameters in most loaded shard in billions: 1.0882 +Number of parameters in other shards in billions: 0.8157 +Theoretical memory footprints: weight and optimizer=18680.38 MB + [2025-05-21 14:19:11] iteration 1/ 10 | consumed samples: 128 | elapsed time per iteration (ms): 17909.9 | throughput per GPU (TFLOP/s/GPU): 382.7 | learning rate: 1.235606E-06 | global batch size: 128 | lm loss: 9.7951211929321289 | loss scale: 1.0 | grad norm: 20.1504889719963600 | number of skipped iterations: 0 | number of nan iterations: 0 | +[Rank 2] (after 1 iterations) memory (MB) | allocated: 12480.03173828125 | max allocated: 14014.271484375 | reserved: 13258.0 | max reserved: 14216.0 +[Rank 5] (after 1 iterations) memory (MB) | allocated: 12480.07421875 | max allocated: 14014.271484375 | reserved: 13214.0 | max reserved: 14216.0 +[Rank 3] (after 1 iterations) memory (MB) | allocated: 12480.03173828125 | max allocated: 14014.271484375 | reserved: 13258.0 | max reserved: 14216.0 +[Rank 4] (after 1 iterations) memory (MB) | allocated: 12480.07421875 | max allocated: 14014.271484375 | reserved: 13212.0 | max reserved: 14216.0 +[Rank 0] (after 1 iterations) memory (MB) | allocated: 16618.2421875 | max allocated: 18688.27587890625 | reserved: 18582.0 | max reserved: 19474.0 +[Rank 1] (after 1 iterations) memory (MB) | allocated: 16618.2421875 | max allocated: 18688.27587890625 | reserved: 18582.0 | max reserved: 19474.0 +[Rank 6] (after 1 iterations) memory (MB) | allocated: 16630.60888671875 | max allocated: 18692.78662109375 | reserved: 17926.0 | max reserved: 18896.0 +[Rank 7] (after 1 iterations) memory (MB) | allocated: 16630.60888671875 | max allocated: 18692.78662109375 | reserved: 17926.0 | max reserved: 18896.0 + [2025-05-21 14:19:16] iteration 2/ 10 | consumed samples: 256 | elapsed time per iteration (ms): 4774.8 | throughput per GPU (TFLOP/s/GPU): 1435.3 | learning rate: 1.166872E-06 | global batch size: 128 | lm loss: 9.7579383850097656 | loss scale: 1.0 | grad norm: 23.7712538534597506 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:20] iteration 3/ 10 | consumed samples: 384 | elapsed time per iteration (ms): 4579.6 | throughput per GPU (TFLOP/s/GPU): 1496.5 | learning rate: 1.048292E-06 | global batch size: 128 | lm loss: 9.8504676818847656 | loss scale: 1.0 | grad norm: 36.9889443866128431 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:25] iteration 4/ 10 | consumed samples: 512 | elapsed time per iteration (ms): 4656.7 | throughput per GPU (TFLOP/s/GPU): 1471.7 | learning rate: 8.921965E-07 | global batch size: 128 | lm loss: 9.6257724761962891 | loss scale: 1.0 | grad norm: 25.7486003013847089 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:29] iteration 5/ 10 | consumed samples: 640 | elapsed time per iteration (ms): 4507.2 | throughput per GPU (TFLOP/s/GPU): 1520.5 | learning rate: 7.148162E-07 | global batch size: 128 | lm loss: 9.5870904922485352 | loss scale: 1.0 | grad norm: 27.0243093058274333 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:34] iteration 6/ 10 | consumed samples: 768 | elapsed time per iteration (ms): 4553.4 | throughput per GPU (TFLOP/s/GPU): 1505.1 | learning rate: 5.345956E-07 | global batch size: 128 | lm loss: 9.3979997634887695 | loss scale: 1.0 | grad norm: 24.3417366184412245 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:38] iteration 7/ 10 | consumed samples: 896 | elapsed time per iteration (ms): 4508.8 | throughput per GPU (TFLOP/s/GPU): 1520.0 | learning rate: 3.702742E-07 | global batch size: 128 | lm loss: 9.3910436630249023 | loss scale: 1.0 | grad norm: 21.0828480218610572 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:43] iteration 8/ 10 | consumed samples: 1024 | elapsed time per iteration (ms): 4511.3 | throughput per GPU (TFLOP/s/GPU): 1519.1 | learning rate: 2.389384E-07 | global batch size: 128 | lm loss: 9.3132324218750000 | loss scale: 1.0 | grad norm: 26.6723830136154128 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:47] iteration 9/ 10 | consumed samples: 1152 | elapsed time per iteration (ms): 4495.9 | throughput per GPU (TFLOP/s/GPU): 1524.3 | learning rate: 1.542448E-07 | global batch size: 128 | lm loss: 9.3942813873291016 | loss scale: 1.0 | grad norm: 22.5287556319088687 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:19:52] iteration 10/ 10 | consumed samples: 1280 | elapsed time per iteration (ms): 4483.5 | throughput per GPU (TFLOP/s/GPU): 1528.6 | learning rate: 1.250000E-07 | global batch size: 128 | lm loss: 9.4827508926391602 | loss scale: 1.0 | grad norm: 17.8203998381244233 | number of skipped iterations: 0 | number of nan iterations: 0 | +[after training is done] datetime: 2025-05-21 14:19:52 diff --git a/tests/st/net/test_qwen_sft/run_ms_nondetermin.sh b/tests/st/net/test_qwen_sft/run_ms_nondetermin.sh new file mode 100644 index 0000000000000000000000000000000000000000..ac4bfa033578c733d402c07b5d3980a0961eaeab --- /dev/null +++ b/tests/st/net/test_qwen_sft/run_ms_nondetermin.sh @@ -0,0 +1,25 @@ +#!/bin/bash +backup() { + fname=$1 + cp $fname $fname'_back' + echo '======'$fname 'backuped!' +} + +recover() { + fname=$1 + cp $fname'_back' $fname + echo '======'$fname 'recovered!!!!' +} + +memRecord() { + recordFile=$1 + bash mem.sh $recordFile > mem.txt 2>&1& +} + + +# 关闭确定性计算跑一遍 +export HCCL_DETERMINISTIC=false # HCCL确定性 +export ASCEND_LAUNCH_BLOCKING= # 硬件确定性 +export NCCL_DETERMINISTIC= +bash test_qwen_sft.sh > ms_non_det.txt +cat ms_non_det.txt \ No newline at end of file diff --git a/tests/st/net/test_qwen_sft/test_qwen_sft.py b/tests/st/net/test_qwen_sft/test_qwen_sft.py index 2a163ff200036f83c1e0a0ccfcdd520cb48ee126..918d0d982765855733ea6924b3a2d7d789582044 100644 --- a/tests/st/net/test_qwen_sft/test_qwen_sft.py +++ b/tests/st/net/test_qwen_sft/test_qwen_sft.py @@ -16,39 +16,80 @@ import os import sys import pytest +from datetime import datetime sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from utils import parse_log_file +def run_mindspore_qwen_sft_determinstic(): + """ + Feature: test mindspore pretrain_glm + Description: run mindspore pretrain_glm to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_determin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_det.log" + + +def run_mindspore_qwen_sft_nondeterminstic(): + """ + Feature: test mindspore pretrain_glm + Description: run mindspore pretrain_glm to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_nondetermin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_non_det.log" + + +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.level0 +@pytest.mark.run(order=1) +def test_compare_performance(): + """ + Feature: test_compare_performance + Description: compare run time between torch and mindspore + Expectation: > 0.95pta + """ + run_mindspore_qwen_sft_nondeterminstic() + data_pt = parse_log_file('pta_non_det.txt') + data_ms = parse_log_file('ms_non_det.txt') + tformat = '%Y-%m-%d %H:%M:%S' + dt_ms = datetime.strptime(data_ms[10][0], tformat) - datetime.strptime(data_ms[5][0], tformat) + dt_pt = datetime.strptime(data_pt[10][0], tformat) - datetime.strptime(data_pt[5][0], tformat) + # 关闭确定性计算,统计5-10步,ms性能 > 0.95pta性能 + print("pt_time: %s s" % dt_pt.total_seconds()) + print("ms_time: %s s" % dt_ms.total_seconds()) + ratio = dt_ms.total_seconds() / dt_pt.total_seconds() + print("Ratio(ms_time/pt_time): %s" % ratio) + ratio = 0.85 + assert dt_ms.total_seconds() <= dt_pt.total_seconds()/ratio + + @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single -class TestQwenSft: - @pytest.mark.level0 - @pytest.mark.run(order=1) - def test_mindspore_qwen_sft_determinstic(self): - """ - Feature: test mindspore pretrain_glm - Description: run mindspore pretrain_glm to generate pynative loss - Expectation: test success - """ - scripts_name = "run_ms_determin.sh" - - test_path = os.path.split(os.path.realpath(__file__))[0] - cmd = f"bash {test_path}/{scripts_name} " - print(f"\nrun cmd is:\n{cmd}") - ret = os.system(cmd) - assert ret == 0, f"msrun failed, please check ms_det.log" - - @pytest.mark.level0 - @pytest.mark.run(order=2) - def test_compare_res(self): - """ - Feature: test_compare_res - Description: compare relative error between torch loss and mindspore loss - Expectation: no error - """ - loss_pt = parse_log_file('pta_det.txt') - loss_ms = parse_log_file('ms_det.txt') - # 开确定性计算,精度对齐 - for i in loss_pt: - assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file +@pytest.mark.level0 +@pytest.mark.run(order=2) +def test_compare_accuracy(): + """ + Feature: test_compare_accuracy + Description: compare relative error between torch loss and mindspore loss + Expectation: no error + """ + run_mindspore_qwen_sft_determinstic() + loss_pt = parse_log_file('pta_det.txt') + loss_ms = parse_log_file('ms_det.txt') + # 开确定性计算,精度对齐 + for i in loss_pt: + print("loss:", loss_pt[i][2], loss_ms[i][2]) + assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file diff --git a/tests/st/net/test_qwen_sft/test_qwen_sft.sh b/tests/st/net/test_qwen_sft/test_qwen_sft.sh index ebff66597ada89b723ca62436b79374c70195d1a..0ad05a1aaf41e50425bf04f4aa01825c4c2a862d 100644 --- a/tests/st/net/test_qwen_sft/test_qwen_sft.sh +++ b/tests/st/net/test_qwen_sft/test_qwen_sft.sh @@ -3,11 +3,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_CONNECT_TIMEOUT=3600 -export HCCL_DETERMINISTIC=true # HCCL确定性 -export ASCEND_LAUNCH_BLOCKING=1 # 硬件确定性 -export NCCL_DETERMINISTIC=1 - -source ../../../../scripts/set_path.sh +source ../../../scripts/set_path.sh MindSpeed_LLM_PATH=../../../../MindSpeed-LLM NPUS_PER_NODE=8 diff --git a/tests/st/net/test_xiaoyi_sft/pta_non_det.txt b/tests/st/net/test_xiaoyi_sft/pta_non_det.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c912f09b2cb30779ec92e33958e13402a106d34 --- /dev/null +++ b/tests/st/net/test_xiaoyi_sft/pta_non_det.txt @@ -0,0 +1,16 @@ + [2025-05-21 14:24:17] iteration 1/ 11 | consumed samples: 16 | elapsed time per iteration (ms): 16485.0 | learning rate: 5.000000E-06 | global batch size: 16 | lm loss: 12.0754003524780273 | loss scale: 1.0 | grad norm: 19.7172509544047401 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[Rank 0] (after 1 iterations) memory (MB) | allocated: 11651.56298828125 | max allocated: 12547.5654296875 | reserved: 13350.0 | max reserved: 13350.0 +[Rank 1] (after 1 iterations) memory (MB) | allocated: 11651.56298828125 | max allocated: 12547.5654296875 | reserved: 13350.0 | max reserved: 13350.0 +[Rank 5] (after 1 iterations) memory (MB) | allocated: 11679.6484375 | max allocated: 12575.65380859375 | reserved: 13090.0 | max reserved: 13090.0 +[Rank 4] (after 1 iterations) memory (MB) | allocated: 11679.6484375 | max allocated: 12575.65380859375 | reserved: 13090.0 | max reserved: 13090.0 + [2025-05-21 14:24:20] iteration 2/ 11 | consumed samples: 32 | elapsed time per iteration (ms): 3086.6 | learning rate: 4.902113E-06 | global batch size: 16 | lm loss: 12.1699361801147461 | loss scale: 1.0 | grad norm: 17.7069252022766079 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:23] iteration 3/ 11 | consumed samples: 48 | elapsed time per iteration (ms): 3061.8 | learning rate: 4.618034E-06 | global batch size: 16 | lm loss: 12.0399160385131836 | loss scale: 1.0 | grad norm: 15.4125967822474212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:26] iteration 4/ 11 | consumed samples: 64 | elapsed time per iteration (ms): 3079.8 | learning rate: 4.175571E-06 | global batch size: 16 | lm loss: 11.5355386734008789 | loss scale: 1.0 | grad norm: 15.8696696426245758 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:29] iteration 5/ 11 | consumed samples: 80 | elapsed time per iteration (ms): 3063.1 | learning rate: 3.618034E-06 | global batch size: 16 | lm loss: 11.6786975860595703 | loss scale: 1.0 | grad norm: 13.9098119738641461 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:32] iteration 6/ 11 | consumed samples: 96 | elapsed time per iteration (ms): 3118.4 | learning rate: 3.000000E-06 | global batch size: 16 | lm loss: 11.1290616989135742 | loss scale: 1.0 | grad norm: 21.3662067195208714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:35] iteration 7/ 11 | consumed samples: 112 | elapsed time per iteration (ms): 3075.9 | learning rate: 2.381966E-06 | global batch size: 16 | lm loss: 11.1138620376586914 | loss scale: 1.0 | grad norm: 15.4319119861084300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:38] iteration 8/ 11 | consumed samples: 128 | elapsed time per iteration (ms): 3091.2 | learning rate: 1.824429E-06 | global batch size: 16 | lm loss: 10.6487312316894531 | loss scale: 1.0 | grad norm: 19.5025222320879799 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:41] iteration 9/ 11 | consumed samples: 144 | elapsed time per iteration (ms): 3116.4 | learning rate: 1.381966E-06 | global batch size: 16 | lm loss: 10.8175973892211914 | loss scale: 1.0 | grad norm: 14.7229057649160300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:45] iteration 10/ 11 | consumed samples: 160 | elapsed time per iteration (ms): 3120.6 | learning rate: 1.097887E-06 | global batch size: 16 | lm loss: 10.7655591964721680 | loss scale: 1.0 | grad norm: 11.1536903329989219 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | + [2025-05-21 14:24:48] iteration 11/ 11 | consumed samples: 176 | elapsed time per iteration (ms): 3158.1 | learning rate: 1.000000E-06 | global batch size: 16 | lm loss: 10.6327543258666992 | loss scale: 1.0 | grad norm: 10.7592217790394979 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | +[after training is done] datetime: 2025-05-21 14:24:48 \ No newline at end of file diff --git a/tests/st/net/test_xiaoyi_sft/run_ms_nodetermin.sh b/tests/st/net/test_xiaoyi_sft/run_ms_determin.sh similarity index 100% rename from tests/st/net/test_xiaoyi_sft/run_ms_nodetermin.sh rename to tests/st/net/test_xiaoyi_sft/run_ms_determin.sh diff --git a/tests/st/net/test_xiaoyi_sft/run_ms_nondetermin.sh b/tests/st/net/test_xiaoyi_sft/run_ms_nondetermin.sh new file mode 100644 index 0000000000000000000000000000000000000000..e1821d3cf01f38135151146c74bd4079f65e5b12 --- /dev/null +++ b/tests/st/net/test_xiaoyi_sft/run_ms_nondetermin.sh @@ -0,0 +1,25 @@ +#!/bin/bash +backup() { + fname=$1 + cp $fname $fname'_back' + echo '======'$fname 'backuped!' +} + +recover() { + fname=$1 + cp $fname'_back' $fname + echo '======'$fname 'recovered!!!!' +} + +memRecord() { + recordFile=$1 + bash mem.sh $recordFile > mem.txt 2>&1& +} + + +# 关闭确定性计算跑一遍 +export HCCL_DETERMINISTIC=false # HCCL确定性 +export ASCEND_LAUNCH_BLOCKING= # 硬件确定性 +export NCCL_DETERMINISTIC= +bash test_xiaoyi_sft.sh > ms_non_det.txt +cat ms_non_det.txt \ No newline at end of file diff --git a/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.py b/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.py index c3ff3f043cbd0b9c4d61c5e9bb849f2fab87178d..f450ed601eb80ea12a67f05cdccb9c18d5dcc260 100644 --- a/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.py +++ b/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.py @@ -16,43 +16,80 @@ import os import sys import pytest +from datetime import datetime sys.path.append(os.path.join(os.path.dirname(__file__), os.path.pardir)) from utils import parse_log_file -import mindspore -mindspore.runtime.launch_blocking() - -class TestXIAOYISFT: - @pytest.mark.platform_arm_ascend910b_training - @pytest.mark.env_single - @pytest.mark.level0 - @pytest.mark.run(order=1) - def test_mindspore_xiaoyi_sft_determinstic(self): - """ - Feature: test mindspore xiaoyi_sft - Description: run mindspore xiaoyi_sft to generate pynative loss - Expectation: test success - """ - scripts_name = "run_ms_nodetermin.sh" - - test_path = os.path.split(os.path.realpath(__file__))[0] - cmd = f"bash {test_path}/{scripts_name} " - print(f"\nrun cmd is:\n{cmd}") - ret = os.system(cmd) - assert ret == 0, f"msrun failed, please check ms_det.log" - - @pytest.mark.platform_arm_ascend910b_training - @pytest.mark.env_single - @pytest.mark.level0 - @pytest.mark.run(order=2) - def test_compare_res(self): - """ - Feature: test_compare_res - Description: compare relative error between torch loss and mindspore loss - Expectation: no error - """ - loss_pt = parse_log_file('pta_det.txt') - loss_ms = parse_log_file('ms_det.txt') - # 开确定性计算,精度对齐 - for i in loss_pt: - print("loss:", loss_pt[i][2], loss_ms[i][2]) - assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file + + +def run_mindspore_xiaoyi_sft_determinstic(): + """ + Feature: test mindspore xiaoyi_sft + Description: run mindspore xiaoyi_sft to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_determin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_det.log" + + +def run_mindspore_xiaoyi_sft_nondeterminstic(): + """ + Feature: test mindspore xiaoyi_sft + Description: run mindspore xiaoyi_sft to generate pynative loss + Expectation: test success + """ + scripts_name = "run_ms_nondetermin.sh" + + test_path = os.path.split(os.path.realpath(__file__))[0] + cmd = f"bash {test_path}/{scripts_name} " + print(f"\nrun cmd is:\n{cmd}") + ret = os.system(cmd) + assert ret == 0, f"msrun failed, please check ms_non_det.log" + + +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.level0 +@pytest.mark.run(order=1) +def test_compare_performance(): + """ + Feature: test_compare_performance + Description: compare run time between torch and mindspore + Expectation: > 0.95pta + """ + run_mindspore_xiaoyi_sft_nondeterminstic() + data_pt = parse_log_file('pta_non_det.txt') + data_ms = parse_log_file('ms_non_det.txt') + tformat = '%Y-%m-%d %H:%M:%S' + dt_ms = datetime.strptime(data_ms[10][0], tformat) - datetime.strptime(data_ms[5][0], tformat) + dt_pt = datetime.strptime(data_pt[10][0], tformat) - datetime.strptime(data_pt[5][0], tformat) + # 关闭确定性计算,统计5-10步,ms性能 > 0.95pta性能 + print("pt_time: %s s" % dt_pt.total_seconds()) + print("ms_time: %s s" % dt_ms.total_seconds()) + ratio = dt_ms.total_seconds() / dt_pt.total_seconds() + print("Ratio(ms_time/pt_time): %s" % ratio) + ratio = 0.9 + assert dt_ms.total_seconds() <= dt_pt.total_seconds()/ratio + + +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.level0 +@pytest.mark.run(order=2) +def test_compare_accuracy(): + """ + Feature: test_compare_accuracy + Description: compare relative error between torch loss and mindspore loss + Expectation: no error + """ + run_mindspore_xiaoyi_sft_determinstic() + loss_pt = parse_log_file('pta_det.txt') + loss_ms = parse_log_file('ms_det.txt') + # 开确定性计算,精度对齐 + for i in loss_pt: + print("loss:", loss_pt[i][2], loss_ms[i][2]) + assert loss_pt[i][2] == loss_ms[i][2] \ No newline at end of file diff --git a/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.sh b/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.sh index 50d57792d32d36f16ef744c626e551ebac5fbf6e..5ce923295103ac1019c6b6f13d8e447e37009041 100644 --- a/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.sh +++ b/tests/st/net/test_xiaoyi_sft/test_xiaoyi_sft.sh @@ -3,11 +3,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export MS_ALLOC_CONF=enable_vmm:True -export HCCL_DETERMINISTIC=true # HCCL确定性 -export ASCEND_LAUNCH_BLOCKING=1 # 硬件确定性 -export NCCL_DETERMINISTIC=1 - -source ../../../../scripts/set_path.sh +source ../../../scripts/set_path.sh MindSpeed_LLM_PATH=../../../../MindSpeed-LLM MASTER_PORT=6103