From d640809797707d65c42e2546a2576d07bb355f67 Mon Sep 17 00:00:00 2001
From: Zn
Date: Tue, 24 May 2022 17:18:10 +0800
Subject: [PATCH] [Self-developed][PyTorch] mBART_ID2372_for_PyTorch: for the
 model training launch scripts, keep only the shell scripts under test/ and
 remove the rest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Zn
---
 .../nlp/mBART_ID2372_for_PyTorch/README.md    | 23 +++++++++-----
 .../{test => }/generate_on_en_de.sh           |  2 +-
 .../{test => }/generate_on_en_ro.sh           |  6 ++--
 .../test/train_full_8p.sh                     | 31 +++++++++++++------
 .../test/train_performance_1p.sh              |  2 +-
 .../test/train_performance_8p.sh              |  4 +--
 6 files changed, 43 insertions(+), 25 deletions(-)
 rename PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/{test => }/generate_on_en_de.sh (97%)
 rename PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/{test => }/generate_on_en_ro.sh (94%)

diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/README.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/README.md
index ad6fa55819..9f487365fd 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/README.md
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/README.md
@@ -107,13 +107,20 @@ fairseq-preprocess \
   --workers 70
 ```
 
-
+# Performance testing
+Single-device training: set up the environment, then run:
+```bash
+bash ./test/train_performance_1p.sh --data_path=/train_data/en_ro  # dataset path on the current disk, down to the en_ro directory
+```
+Multi-device training: set up the environment, then run:
+```bash
+bash ./test/train_performance_8p.sh --data_path=/train_data/en_ro  # dataset path on the current disk, down to the en_ro directory
+```
 # Fine-tuning on the dataset
-
 ```bash
-1. In ./test/train_full_8p.sh, set PRETRAIN to the model path and DATA_PATH to the dataset path (train_data/en_ro or train_data/en_de)
+1. In ./test/train_full_8p.sh, set DATA_PATH to the dataset path (train_data/en_ro or train_data/en_de), PRETRAIN to the path of model.pt, and BPE_PATH to the path of sentence.bpe.model;
 [To train on the en_de dataset, set dropout to 0.1, set total-num-update and max-update to 300000, and set target-lang to de_DE in train_full_8p.sh]
-2. Run: bash ./test/train_full_8p.sh
+2. Run: bash ./test/train_full_8p.sh --data_path=DATA_PATH
 ```
 
 # Evaluating on the dataset
@@ -129,11 +136,11 @@ pip3.7 install sacrebleu==1.5.1
 
 ```bash
 Validate en_ro accuracy
-1. In ./test/generate_on_en_ro.sh, set DATA_PATH to the dataset path, BPE_PATH to the path of sentence.bpe.model, SCRIPTS to the path of mosesdecoder/scripts, and WMT16_SCRIPTS to the path of wmt16-scripts
-2. Run bash ./test/generate_on_en_ro.sh checkpoints/checkpoint_best.pt to validate the en_ro training accuracy
+1. In generate_on_en_ro.sh, set DATA_PATH to the dataset path, BPE_PATH to the path of sentence.bpe.model, SCRIPTS to the path of mosesdecoder/scripts, and WMT16_SCRIPTS to the path of wmt16-scripts
+2. Run bash generate_on_en_ro.sh checkpoints/checkpoint_best.pt to validate the en_ro training accuracy
 Validate en_de accuracy
-1. In ./test/generate_on_en_de.sh, set DATA_PATH to the dataset path, BPE_PATH to the path of sentence.bpe.model, and DETOKENIZER to the path of mosesdecoder/scripts/tokenizer/detokenizer.perl
-2. Run bash ./test/generate_on_en_de.sh checkpoints/checkpoint_best.pt to validate the en_de training accuracy
+1. In generate_on_en_de.sh, set DATA_PATH to the dataset path, BPE_PATH to the path of sentence.bpe.model, and DETOKENIZER to the path of mosesdecoder/scripts/tokenizer/detokenizer.perl
+2. Run bash generate_on_en_de.sh checkpoints/checkpoint_best.pt to validate the en_de training accuracy
 ```
 
 # Docker container training
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_de.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_de.sh
similarity index 97%
rename from PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_de.sh
rename to PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_de.sh
index 7e13b08739..6b4547e41a 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_de.sh
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_de.sh
@@ -1,4 +1,4 @@
-source env.sh
+source ./test/env_npu.sh
 DATA_PATH=path_of_data # fix it to your own train data path
 BPE_PATH=/path/sentence.bpe.model # fix it to your own sentence.bpe.model path
 langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_ro.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh
similarity index 94%
rename from PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_ro.sh
rename to PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh
index 18e527b108..e9efd429e9 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/generate_on_en_ro.sh
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh
@@ -1,4 +1,4 @@
-source env.sh
+source ./test/env_npu.sh
 DATA_PATH=path_of_data # fix it to your own train data path
 BPE_PATH=/path/sentence.bpe.model # fix it to your own sentence.bpe.model path
 SCRIPTS=mosesdecoder/scripts # fix it to your own mosesdecoder path
@@ -34,8 +34,8 @@ for f in $HYP $REF
     perl $REPLACE_UNICODE_PUNCT | \
     perl $NORM_PUNC -l ro | \
     perl $REM_NON_PRINT_CHAR | \
-    python3 $NORMALIZE_ROMANIAN | \
-    python3 $REMOVE_DIACRITICS | \
+    python3.7 $NORMALIZE_ROMANIAN | \
+    python3.7 $REMOVE_DIACRITICS | \
     perl $TOKENIZER -no-escape -threads 16 -a -l ro >"en_ro."$f
 done
 sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_full_8p.sh
index eb9b5ffa21..cc9262dcc6 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_full_8p.sh
@@ -11,6 +11,16 @@ export RANK_SIZE=8
 Network="mBART_for_PyTorch"
 # training batch size
 token_size=1024
+# Dataset path; keep it empty here, do not modify
+data_path=""
+
+# Parameter check: data_path is required; other parameters may be added or removed as the model needs, and any parameter added here must be defined and assigned above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
 
 # Check that data_path has been passed in; do not modify
 if [[ $data_path == "" ]];then
@@ -18,7 +28,8 @@
     exit 1
 fi
 
-# cd to the directory at the same level as the test file and run the script there to improve compatibility; test_path_dir is the path containing the test folder
+############### Specify the training script execution path ###############
+# cd to the directory at the same level as the test folder and run the script there to improve compatibility; test_path_dir is the path containing the test folder
 cur_path=`pwd`
 cur_path_last_dirname=${cur_path##*/}
 if [ x"${cur_path_last_dirname}" == x"test" ]; then
@@ -51,7 +62,7 @@ if [ x"${etp_flag}" != x"true" ];then
 fi
 
 # Place the corresponding data and model at the paths below, or modify the paths to fit local training
-DATA_PATH=train_data/en_ro
+DATA_PATH=train_data/en_ro 
 PRETRAIN=mbart.cc25/model.pt
 BPE_PATH=mbart.cc25/sentence.bpe.model
 model_dir=checkpoints/checkpoint_best.pt
@@ -84,7 +95,7 @@ do
     then
         let a=0+RANK_ID*24
         let b=23+RANK_ID*24
-        taskset -c $a-$b fairseq-train $DATA_PATH --fp16 --distributed-world-size 8 --npu \
+        nohup taskset -c $a-$b fairseq-train $DATA_PATH --fp16 --distributed-world-size 8 --npu \
         --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 40000 \
         --encoder-normalize-before --decoder-normalize-before \
         --arch mbart_large --layernorm-embedding \
@@ -100,9 +111,9 @@
         --restore-file $PRETRAIN \
         --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
         --langs $langs \
-        --ddp-backend no_c10d > ${test_path_dir}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 &
+        --ddp-backend no_c10d > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
     else
-        fairseq-train $DATA_PATH --fp16 --distributed-world-size 8 --npu \
+        nohup fairseq-train $DATA_PATH --fp16 --distributed-world-size 8 --npu \
         --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 40000 \
         --encoder-normalize-before --decoder-normalize-before \
         --arch mbart_large --layernorm-embedding \
@@ -118,7 +129,7 @@
         --restore-file $PRETRAIN \
         --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \
         --langs $langs \
-        --ddp-backend no_c10d > ${test_path_dir}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 &
+        --ddp-backend no_c10d > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
     fi
 done
 wait
@@ -127,7 +138,7 @@ wait
 end_time=$(date +%s)
 e2e_time=$(( $end_time - $start_time ))
 
-fairseq-generate $DATA_PATH \
+nohup fairseq-generate $DATA_PATH \
     --fp16 --path $model_dir --max-tokens 4096 \
     --task translation_from_pretrained_bart \
     --gen-subset test \
@@ -146,15 +157,15 @@ for f in $HYP $REF
     perl $REPLACE_UNICODE_PUNCT | \
     perl $NORM_PUNC -l ro | \
     perl $REM_NON_PRINT_CHAR | \
-    python3 $NORMALIZE_ROMANIAN | \
-    python3 $REMOVE_DIACRITICS | \
+    python3.7 $NORMALIZE_ROMANIAN | \
+    python3.7 $REMOVE_DIACRITICS | \
     perl $TOKENIZER -no-escape -threads 16 -a -l ro >"en_ro."$f
 done
 sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp > res.log
 wait
 
 ASCEND_DEVICE_ID=0
-
+cp ${cur_path}/nohup.out ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log
 # Print the result; do not modify
 echo "------------------ Final result ------------------"
 # Output performance FPS; review and modify for the model
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh
index 70974d82c2..d9b7154ca9 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh
@@ -94,7 +94,7 @@ wait
 pip3.7 install --editable ./
 
 start=$(date +%s)
-nohup python3.7 ${cur_path}/train.py $data_path/en_ro/ \
+nohup python3.7 ${cur_path}/train.py $data_path/ \
     --distributed-world-size 1 --npu --npu-id $ASCEND_DEVICE_ID --fp16 --encoder-normalize-before --decoder-normalize-before \
     --arch mbart_large --layernorm-embedding \
     --task translation_from_pretrained_bart \
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh
index c4bdd78c24..b9de8aada6 100644
--- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh
@@ -97,7 +97,7 @@ do
     then
        let a=0+RANK_ID*24
        let b=23+RANK_ID*24
-       taskset -c $a-$b nohup python3.7 ${cur_path}/train.py $data_path/en_ro/ --fp16 --distributed-world-size 8 --npu \
+       taskset -c $a-$b nohup python3.7 ${cur_path}/train.py $data_path/ --fp16 --distributed-world-size 8 --npu \
        --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 50 \
        --encoder-normalize-before --decoder-normalize-before \
        --arch mbart_large --layernorm-embedding \
@@ -116,7 +116,7 @@
        --langs $langs \
        --ddp-backend no_c10d > ${test_path_dir}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 &
    else
-       nohup python3.7 ${cur_path}/train.py $data_path/en_ro/ --fp16 --distributed-world-size 8 --npu \
+       nohup python3.7 ${cur_path}/train.py $data_path/ --fp16 --distributed-world-size 8 --npu \
        --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 50 \
        --encoder-normalize-before --decoder-normalize-before \
        --arch mbart_large --layernorm-embedding \
-- 
Gitee
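
Reviewer note: a minimal sketch of the workflow this patch documents, assembled from the README commands changed above. It is illustrative only; it assumes the preprocessed en_ro dataset at /train_data/en_ro (a placeholder path) and the mbart.cc25 pretrained files are already in place, and that DATA_PATH, PRETRAIN, and BPE_PATH have been set inside train_full_8p.sh as described.

```bash
# Illustrative post-patch workflow (not part of the patch; paths are placeholders).
cd PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch

# Performance runs; --data_path points at the en_ro dataset directory.
bash ./test/train_performance_1p.sh --data_path=/train_data/en_ro
bash ./test/train_performance_8p.sh --data_path=/train_data/en_ro

# Full 8P fine-tuning (DATA_PATH, PRETRAIN, and BPE_PATH set inside the script first).
bash ./test/train_full_8p.sh --data_path=/train_data/en_ro

# The generate_on_* scripts now live at the model root (moved out of test/) and
# source ./test/env_npu.sh themselves; pass the checkpoint to evaluate.
bash generate_on_en_ro.sh checkpoints/checkpoint_best.pt
```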