From 8467e56d19900d7fe7757e6fb78d18bc2ffd4e32 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 05:44:27 +0000 Subject: [PATCH 01/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/train.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 1f7e5fd95..849af2acb 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -280,6 +280,9 @@ def parse_args(): help="size of train data") parser.add_argument("--display_step", default= config.display_step, help="display step") + parser.add_argument('--precision_mode', default='allow_mix_precision', + help='allow_fp32_to_fp16/force_fp16/ ' + 'must_keep_origin_dtype/allow_mix_precision.') args = parser.parse_args() '''args, unknown_args = parser.parse_known_args() if len(unknown_args) > 0: @@ -291,7 +294,7 @@ def parse_args(): if __name__ == '__main__': display_step = config.display_step - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" tag = algo Base_path = config.BASE_DIR @@ -336,9 +339,11 @@ if __name__ == '__main__': #custom_op.parameter_map["mix_compile_mode"].b = True #开启混合计算,根据实际情况配置 custom_op.parameter_map["use_off_line"].b = True custom_op.parameter_map["min_group_size"].b = 1 - custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(args.precision_mode) custom_op.parameter_map["hcom_parallel"].b = True custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop + if args.precision_mode == "allow_mix_precision": + custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") #aic err debug -- Gitee From 04e4846bfddabec1fe26c459a2c9dcbe53bbcd03 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 05:46:27 +0000 Subject: [PATCH 02/11] fp32 Signed-off-by: limingxing517 --- .../test/train_performance_1p.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh index a09fdda8a..c3d592a71 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh @@ -28,7 +28,7 @@ display_step=10 #维持参数,以下不需要修改 over_dump=False - +precision_mode="allow_mix_precision" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then echo"usage:./train_performance_1p.sh " @@ -97,7 +97,11 @@ cp configs/config.py configs/config.py.run #训练执行 start=$(date +%s) -nohup python3 train.py --data_path=$data_path --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt --train_size =$train_size --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & wait end=$(date +%s) e2e_time=$(( $end - $start )) -- Gitee From d2f3ae628981f441cb6ba081361e2b6f6aad2898 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 06:32:59 +0000 Subject: [PATCH 03/11] fp32 Signed-off-by: limingxing517 --- .../recommendation/WideDeep_ID2712_for_TensorFlow/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 849af2acb..6e98e5d33 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -344,6 +344,8 @@ if __name__ == '__main__': custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop if args.precision_mode == "allow_mix_precision": custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") + print("******************************") + print("args.precision_mode= ", args.precision_mode) custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") #aic err debug -- Gitee From 8dd7f6aa366e5bc676a38bf6cab4eb0b90947046 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 06:38:47 +0000 Subject: [PATCH 04/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh index c3d592a71..048b908db 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh @@ -94,7 +94,7 @@ sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run - +echo "precision_mode: $precision_mode" #训练执行 start=$(date +%s) nohup python3 train.py --data_path=$data_path \ -- Gitee From 2c17c43a927c621976fd385dafa722b731f7ffe4 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 06:56:11 +0000 Subject: [PATCH 05/11] fp32 Signed-off-by: limingxing517 --- .../recommendation/WideDeep_ID2712_for_TensorFlow/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 6e98e5d33..68ae2d2f5 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -293,6 +293,8 @@ def parse_args(): if __name__ == '__main__': + print("******************************") + print("args.precision_mode= ", args.precision_mode) display_step = config.display_step #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" @@ -344,8 +346,6 @@ if __name__ == '__main__': custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop if args.precision_mode == "allow_mix_precision": custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") - print("******************************") - print("args.precision_mode= ", args.precision_mode) custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") #aic err debug -- Gitee From af74b1225f4f51ec62b18b929248a8a8f02f56d2 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 07:01:04 +0000 Subject: [PATCH 06/11] fp32 Signed-off-by: limingxing517 --- .../recommendation/WideDeep_ID2712_for_TensorFlow/train.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 68ae2d2f5..1b076e41b 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -293,8 +293,6 @@ def parse_args(): if __name__ == '__main__': - print("******************************") - print("args.precision_mode= ", args.precision_mode) display_step = config.display_step #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" @@ -320,7 +318,8 @@ if __name__ == '__main__': sess_config.gpu_options.allow_growth = True args = parse_args() - + print("******************************") + print("args.precision_mode= ", args.precision_mode) #""" # for npu custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() @@ -346,6 +345,8 @@ if __name__ == '__main__': custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop if args.precision_mode == "allow_mix_precision": custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") + print("******************************") + print("args.precision_mode= ", args.precision_mode) custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") #aic err debug -- Gitee From e79c1f5598f792958c4e3c99bc10f96c3561cbb2 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 09:14:43 +0000 Subject: [PATCH 07/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/train.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py index 1b076e41b..0865576db 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/train.py @@ -318,8 +318,7 @@ if __name__ == '__main__': sess_config.gpu_options.allow_growth = True args = parse_args() - print("******************************") - print("args.precision_mode= ", args.precision_mode) + #""" # for npu custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() @@ -345,9 +344,6 @@ if __name__ == '__main__': custom_op.parameter_map["iterations_per_loop"].i = config.iterations_per_loop if args.precision_mode == "allow_mix_precision": custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") - print("******************************") - print("args.precision_mode= ", args.precision_mode) - custom_op.parameter_map["modify_mixlist"].s = tf.compat.as_bytes("ops_info.json") custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("fusion_switch.cfg") #aic err debug # custom_op.parameter_map["enable_exception_dump"].i = 1 @@ -544,4 +540,4 @@ if __name__ == '__main__': if mode == 'train' and rank_size > 1: sess.run(npu_shutdown) ############### for hccl $^?################ - sess.close() + sess.close() \ No newline at end of file -- Gitee From 31b96b1ee91ec50c0b0c2ce2984626ea1dbc4a97 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 09:20:31 +0000 Subject: [PATCH 08/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh index 048b908db..506c1fd93 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_1p.sh @@ -94,7 +94,6 @@ sed -i "s%59761827%${train_size}%p" configs/config.py sed -i "s%display_step = 100%display_step = $display_step%p" configs/config.py #echo `cat configs/config.py |uniq > configs/config.py; cp -f configs/config.py configs/config.py.run` cp configs/config.py configs/config.py.run -echo "precision_mode: $precision_mode" #训练执行 start=$(date +%s) nohup python3 train.py --data_path=$data_path \ -- Gitee From 8238ebc5c7eaead32c78842b7edcdb3dd141074f Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 09:23:12 +0000 Subject: [PATCH 09/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/test/train_full_1p.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_1p.sh index beb1e3c47..5d93f3a41 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_1p.sh @@ -27,7 +27,7 @@ display_step=10 #维持参数,以下不需要修改 over_dump=False - +precision_mode="allow_mix_precision" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then @@ -104,7 +104,11 @@ cp configs/config.py configs/config.py.run cd $cur_path/../ start=$(date +%s) -nohup python3 train.py --data_path=$data_path --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & +nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & wait end=$(date +%s) e2e_time=$(( $end - $start )) -- Gitee From 725cb1bb6d10767b4051f7df30dfb48517be5ae6 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 10:38:31 +0000 Subject: [PATCH 10/11] fp32 Signed-off-by: limingxing517 --- .../test/train_performance_8p.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh index 35585d560..abfb0abcd 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_8p.sh @@ -30,7 +30,7 @@ n_epoches=4 #维持参数,以下不需要修改 over_dump=False - +precision_mode="allow_mix_precision" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then echo"usage:./train_performance_1p.sh " @@ -100,7 +100,11 @@ do #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 - nohup python3 train.py --data_path=$data_path --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt --train_size =$train_size --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait -- Gitee From aa8e9fe9864caff906bdeb2286439e7ccb1886d4 Mon Sep 17 00:00:00 2001 From: limingxing517 Date: Wed, 1 Feb 2023 10:44:04 +0000 Subject: [PATCH 11/11] fp32 Signed-off-by: limingxing517 --- .../WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh index 776939b22..d74293f30 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_full_8p.sh @@ -31,7 +31,7 @@ n_epoches=8 #维持参数,以下不需要修改 over_dump=False - +precision_mode="allow_mix_precision" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then @@ -122,7 +122,11 @@ do else mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} fi - nohup python3 train.py --data_path=$data_path --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & + nohup python3 train.py --data_path=$data_path \ + --ckpt_path=$cur_path/output/$ASCEND_DEVICE_ID/ckpt \ + --train_size=$train_size \ + --precision_mode=$precision_mode \ + --display_step=$display_step > $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & done wait end=$(date +%s) -- Gitee