From 11dc62d03cefc19d20be5e64cd72872c5c787189 Mon Sep 17 00:00:00 2001 From: wang-gq <3224925783@qq.com> Date: Fri, 19 Aug 2022 17:13:24 +0800 Subject: [PATCH] 8-19 --- .../test/set_ranktable.py | 1740 +++++++++++++++++ .../test/train_performance_16np.sh | 216 ++ .../test/train_performance_16p.sh | 211 ++ .../test/train_performance_16np.sh | 11 +- .../test/train_performance_16p.sh | 4 +- .../test/set_ranktable.py | 1740 +++++++++++++++++ .../test/train_performance_16np.sh | 189 ++ .../test/train_performance_16p.sh | 184 ++ .../test/train_performance_16np.sh | 11 +- .../test/train_performance_16p.sh | 6 +- .../test/train_performance_16np.sh | 9 +- .../test/train_performance_16p.sh | 4 +- .../test/train_full_16P_256bs_SGD.sh | 4 +- .../test/train_performance_16P_256bs_SGD.sh | 4 +- .../test/train_performance_16np_256bs_SGD.sh | 11 +- 15 files changed, 4326 insertions(+), 18 deletions(-) create mode 100644 TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/set_ranktable.py create mode 100644 TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16np.sh create mode 100644 TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16p.sh create mode 100644 TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/set_ranktable.py create mode 100644 TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16np.sh create mode 100644 TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16p.sh diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/set_ranktable.py b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/set_ranktable.py new file mode 100644 index 000000000..216dd25a5 --- /dev/null +++ b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/set_ranktable.py @@ -0,0 +1,1740 @@ +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('-n', 
'--npu_nums', type=int, default='2', help='nums of npu') +parser.add_argument('-c', '--conf_path', type=str, default='./', help='the path of server_info') +FLAGS = parser.parse_args() + +import json +import os +server = [] +server_conf = [] +server_list = ["0", "1", "2", "3", "4", "5", "6", "7"] +if os.path.isdir(FLAGS.conf_path): + for f in os.listdir(FLAGS.conf_path): + if (f.split("_")[-1]).split(".")[0] in server_list and (f.split("_")[-1]).split(".")[1] == 'info' and f.split("_")[0] == 'server': + server_conf.append(f) + + + + + + +rank_address = [] +for i in range(FLAGS.npu_nums): + for x in server_conf: + if (x.split("_")[-1]).split(".")[0] == str(i): + server.append(x.split("_")[1]) + l = FLAGS.conf_path + "/" + x + with open(l, "r") as a: + s = a.readlines() + for s_ in s: + if 'address_0' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_1' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_2' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_3' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_4' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_5' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_6' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_7' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + +if FLAGS.npu_nums == 1: + rank = { + "server_count":"1", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + 
"device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 2: + rank = { + "server_count":"2", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]} + ], + + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 3: + rank = { + "server_count":"3", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + 
"device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 4: + rank = { + "server_count":"4", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + 
}, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + 
"device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 5: + rank = { + "server_count":"5", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + 
"server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]} + ], + 
"status":"completed", + "version":"1.0" + } + + + +elif FLAGS.npu_nums == 6: + rank = { + "server_count":"6", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + 
"rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + 
"rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 7: + rank = { + "server_count":"7", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + 
"device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + 
"device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + "device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + "device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +elif FLAGS.npu_nums == 8: + rank = { + "server_count":"8", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + 
"rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + 
"rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + "device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + "device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]}, + { + "server_id":server[7], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[56], + "rank_id":"56" + }, + { + "device_id":"1", + "device_ip":rank_address[57], + "rank_id":"57" + }, + { + "device_id":"2", + "device_ip":rank_address[58], + "rank_id":"58" + }, + { + "device_id":"3", + "device_ip":rank_address[59], + 
"rank_id":"59" + }, + { + "device_id":"4", + "device_ip":rank_address[60], + "rank_id":"60" + }, + { + "device_id":"5", + "device_ip":rank_address[61], + "rank_id":"61" + }, + { + "device_id":"6", + "device_ip":rank_address[62], + "rank_id":"62" + }, + { + "device_id":"7", + "device_ip":rank_address[63], + "rank_id":"63" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +with open("rank_table.json", "w") as f: + json.dump(rank, f) + + + + + + diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16np.sh new file mode 100644 index 000000000..33b283dbe --- /dev/null +++ b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16np.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=16 +export JOB_ID=99990001 +export RANK_ID=8p +export SLOG_PRINT_TO_STDOUT=0 +#export RANK_TABLE_FILE=${cur_path}/../npu_config/8p.json +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 + +# 数据集路径,保持为空,不需要修改 +data_path="" + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="SSD-Resnet34_ID0048_for_TensorFlow" +#训练epoch +train_epochs=8 +#训练batch_size +batch_size=32 +#训练step +train_steps=1000 +#学习率 +learning_rate= + +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_8p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if 
or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --devices_num* ]];then + devices_num=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` + fi +done + +linux_num=$servers_num + +export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` +rank_size=8 +if [[ $conf_path != "" ]];then + nohup 
python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +fi +wait +export RANK_TABLE_FILE=${cur_path}/test/rank_table.json + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +#autotune时,先开启autotune执行单P训练,不需要修改 +if [[ $autotune == True ]]; then + train_full_1p.sh --autotune=$autotune --data_path=$data_path + wait + autotune=False +fi + +#训练开始时间,不需要修改 +start_time=$(date +%s) + +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../ +for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export DEVICE_INDEX=$RANK_ID + export DEVICE_ID=$RANK_ID + export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + # # 绑核,不需要的绑核的模型删除,需要模型审视修改 + # corenum=`cat /proc/cpuinfo |grep "processor"|wc -l` + # let a=RANK_ID*${corenum}/${RANK_SIZE} + # let b=RANK_ID+1 + # let c=b*${corenum}/${RANK_SIZE}-1 + + # #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + # #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path + # if [ "x${bind_core}" != x ];then + # bind_core="taskset -c $a-$c" + # fi + # nohup ${bind_core} python3.7 ${cur_path}/../ssd_main.py --mode=train \ + nohup python3.7 ${cur_path}/../ssd_main.py --mode=train \ + --train_batch_size=${batch_size} \ + --training_file_pattern=${data_path}/coco_official_2017/tfrecord/train2017* \ + --resnet_checkpoint=${data_path}/resnet34_pretrain/model.ckpt-28152 \ + --validation_file_pattern=${data_path}/coco_official_2017/tfrecord/val2017* \ + 
--val_json_file=${data_path}/coco_official_2017/annotations/instances_val2017.json \ + --eval_batch_size=${batch_size} \ + --num_epochs=${train_epochs} \ + --num_examples_per_epoch=64000 \ + --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "\] FPS:" | awk -F "FPS: " '{print $2}' | awk -F "," '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "\] FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16p.sh
new file mode 100644
index 000000000..0c0e0895a
--- /dev/null
+++ b/TensorFlow/built-in/cv/detection/SSD-Resnet34_ID0048_for_TensorFlow/test/train_performance_16p.sh
@@ -0,0 +1,211 @@
+#!/bin/bash
+# 16-rank performance launcher for SSD-ResNet34: starts 8 training processes on this server (ranks rank_size*server_index .. +7); all paths are resolved relative to `pwd`.
+# Current working directory; no need to modify
+cur_path=`pwd`
+
+# Collective-communication parameters; no need to modify
+export RANK_SIZE=16
+export JOB_ID=99990001
+export RANK_ID=8p
+export SLOG_PRINT_TO_STDOUT=0
+#export RANK_TABLE_FILE=${cur_path}/../npu_config/8p.json
+export HCCL_CONNECT_TIMEOUT=600
+RANK_ID_START=0
+
+# Dataset path; keep empty here (it is supplied via --data_path); no need to modify
+data_path=""
+
+# Default log level; no need to modify
+export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="SSD-Resnet34_ID0048_for_TensorFlow"
+# Training epochs
+train_epochs=8
+# Training batch size
+batch_size=32
+# Training steps
+train_steps=1000
+# Learning rate
+learning_rate=
+
+# Debug parameters; precision_mode should be reviewed per model
+#precision_mode="allow_mix_precision"
+# Maintenance parameters; nothing below needs to be modified
+over_dump=False
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+autotune=False
+
+# Help message; no need to modify
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_performance_16p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# Argument parsing; "$@" keeps each argument intact even if it contains spaces
+for para in "$@"
+do
+    if [[ $para == --precision_mode* ]];then
+        precision_mode=`echo ${para#*=}`
+    elif [[ $para == --over_dump* ]];then
+        over_dump=`echo ${para#*=}`
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=`echo ${para#*=}`
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=`echo ${para#*=}`
+    elif [[ $para == --profiling* ]];then
+        profiling=`echo ${para#*=}`
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --autotune* ]];then
+        autotune=`echo ${para#*=}`
+        mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak  # NOTE(review): install_path is never assigned in this script, and the cp commands below read the "custom" dirs renamed here -- confirm intent
+        mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak
+        autotune_dump_path=${cur_path}/output/autotune_dump
+        mkdir -p ${autotune_dump_path}/GA
+        mkdir -p ${autotune_dump_path}/RL
+        cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/
+        cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/
+    elif [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --bind_core* ]]; then
+        bind_core=`echo ${para#*=}`
+        name_bind="_bindcore"
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    fi
+done
+
+rank_size=8
+if [[ $conf_path != "" ]];then
+    nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path="$conf_path"
+fi
+wait
+export RANK_TABLE_FILE=${cur_path}/rank_table.json
+
+# Verify that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+# With autotune, first run single-device training with autotune enabled; no need to modify
+if [[ $autotune == True ]]; then
+    train_full_1p.sh --autotune=$autotune --data_path=$data_path
+    wait
+    autotune=False
+fi
+
+# Training start time; no need to modify
+start_time=$(date +%s)
+
+# Enter the training-script directory; review and adjust per model
+cd "$cur_path/../"
+for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++));
+do
+    # Set environment variables; no need to modify
+    echo "Device ID: $RANK_ID"
+    export RANK_ID=$RANK_ID
+    export DEVICE_INDEX=$RANK_ID
+    export DEVICE_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))`
+    ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))`
+
+    # Create a fresh per-device output directory; no need to modify
+    if [ -d "${cur_path}/output/${ASCEND_DEVICE_ID}" ];then
+        rm -rf "${cur_path}/output/${ASCEND_DEVICE_ID}"
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    else
+        mkdir -p "${cur_path}/output/$ASCEND_DEVICE_ID/ckpt"
+    fi
+
+    # # Core binding; delete for models that do not need it; review and adjust per model
+    # corenum=`cat /proc/cpuinfo |grep "processor"|wc -l`
+    # let a=RANK_ID*${corenum}/${RANK_SIZE}
+    # let b=RANK_ID+1
+    # let c=b*${corenum}/${RANK_SIZE}-1
+
+    # # Run the training script; the arguments listed below need no change, others should be reviewed per model
+    # #--data_dir, --model_dir, --precision_mode, --over_dump, --over_dump_path,--data_dump_flag,--data_dump_step,--data_dump_path,--profiling,--profiling_dump_path
+    # if [ "x${bind_core}" != x ];then
+    #     bind_core="taskset -c $a-$c"
+    # fi
+    # nohup ${bind_core} python3.7 ${cur_path}/../ssd_main.py --mode=train \
+    nohup python3.7 ${cur_path}/../ssd_main.py --mode=train \
+        --train_batch_size=${batch_size} \
+        --training_file_pattern=${data_path}/coco_official_2017/tfrecord/train2017* \
+        --resnet_checkpoint=${data_path}/resnet34_pretrain/model.ckpt-28152 \
+        --validation_file_pattern=${data_path}/coco_official_2017/tfrecord/val2017* \
+        --val_json_file=${data_path}/coco_official_2017/annotations/instances_val2017.json \
+        --eval_batch_size=${batch_size} \
+        --num_epochs=${train_epochs} \
+        --num_examples_per_epoch=64000 \
+        --model_dir=${cur_path}/output/${ASCEND_DEVICE_ID}/d_solution/ckpt${ASCEND_DEVICE_ID} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+done
+wait
+
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Average FPS from the log of the last device launched above (first sample skipped); review per model
+FPS=`cat ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | grep "\] FPS:" | awk -F "FPS: " '{print $2}' | awk -F "," '{print $1}' | tail -n +2 | awk '{sum+=$1} END {print sum/NR}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Training accuracy output; review per model
+#train_accuracy=`grep -A 1 top1 $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $3}'`
+# Print; no need to modify
+#echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of stability/accuracy monitoring results
+# Training case information; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}${name_bind}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+## Collect performance data
+# Throughput; no need to modify
+ActualFPS=${FPS}
+# Training time per iteration; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*'${RANK_SIZE}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep "\] FPS:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss: " '{print $2}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss value of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16np.sh
index dff0633e2..1d095599f 100644
--- a/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16np.sh
+++ b/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16np.sh
@@ -33,14 +33,19 @@ do
         conf_path=`echo ${para#*=}`
     elif [[ $para == --devices_num* ]];then
         devices_num=`echo ${para#*=}`
+    elif [[ $para == --servers_num* ]];then
+        servers_num=`echo ${para#*=}`
     fi
 done
 
-one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
-linux_num=`find $conf_path -name "server_*.info" |wc -l`
+#one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+#linux_num=`find $conf_path -name "server_*.info" |wc -l`
+linux_num=$servers_num
 # 自动生成ranktable的脚本
 export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'`
 rank_size=8
-nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path
+if [[ $conf_path != "" ]];then
+    nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path
+fi
 wait
 export
RANK_TABLE_FILE=${cur_path}/test/rank_table.json diff --git a/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16p.sh index 2c22303b3..0069b7cc9 100644 --- a/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/cv/image_classification/Densenet_3D_ID0121_for_TensorFlow/test/train_performance_16p.sh @@ -36,7 +36,9 @@ done # 自动生成ranktable的脚本 rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=${cur_path}/test/rank_table.json diff --git a/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/set_ranktable.py b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/set_ranktable.py new file mode 100644 index 000000000..216dd25a5 --- /dev/null +++ b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/set_ranktable.py @@ -0,0 +1,1740 @@ +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('-n', '--npu_nums', type=int, default='2', help='nums of npu') +parser.add_argument('-c', '--conf_path', type=str, default='./', help='the path of server_info') +FLAGS = parser.parse_args() + +import json +import os +server = [] +server_conf = [] +server_list = ["0", "1", "2", "3", "4", "5", "6", "7"] +if os.path.isdir(FLAGS.conf_path): + for f in os.listdir(FLAGS.conf_path): + if (f.split("_")[-1]).split(".")[0] in server_list and (f.split("_")[-1]).split(".")[1] == 'info' and f.split("_")[0] == 'server': + server_conf.append(f) + + + + + + +rank_address = [] +for i in range(FLAGS.npu_nums): + for x in server_conf: + if (x.split("_")[-1]).split(".")[0] == str(i): + server.append(x.split("_")[1]) + l = 
FLAGS.conf_path + "/" + x + with open(l, "r") as a: + s = a.readlines() + for s_ in s: + if 'address_0' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_1' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_2' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_3' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_4' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_5' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_6' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_7' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + +if FLAGS.npu_nums == 1: + rank = { + "server_count":"1", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 2: + rank = { + "server_count":"2", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + 
"rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]} + ], + + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 3: + rank = { + "server_count":"3", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + 
"device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 4: + rank = { + "server_count":"4", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + 
}, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 5: + rank = { + "server_count":"5", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + 
"device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + 
"device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + +elif FLAGS.npu_nums == 6: + rank = { + "server_count":"6", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + 
"device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + 
"device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 7: + rank = { + "server_count":"7", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + 
"rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" 
+ }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + "device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + 
"device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +elif FLAGS.npu_nums == 8: + rank = { + "server_count":"8", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + 
"device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + 
"device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + "device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + "device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]}, + { + "server_id":server[7], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[56], + "rank_id":"56" + }, + { + "device_id":"1", + "device_ip":rank_address[57], + "rank_id":"57" + }, + { + "device_id":"2", + "device_ip":rank_address[58], + "rank_id":"58" + }, + { + "device_id":"3", + "device_ip":rank_address[59], + "rank_id":"59" + }, + { + "device_id":"4", + "device_ip":rank_address[60], + "rank_id":"60" + }, + { + "device_id":"5", + "device_ip":rank_address[61], + "rank_id":"61" + }, + { + "device_id":"6", + "device_ip":rank_address[62], + "rank_id":"62" + }, + { + "device_id":"7", + "device_ip":rank_address[63], + "rank_id":"63" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +with open("rank_table.json", "w") as f: + json.dump(rank, f) + + + + + + diff --git a/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16np.sh new file mode 100644 index 000000000..7d3821a2a --- /dev/null +++ 
b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16np.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +#export RANK_SIZE=16 +#export RANK_TABLE_FILE=$cur_path/${RANK_SIZE}p.json +export JOB_ID=10087 +export DEVICE_INDEX=0 +RANK_ID_START=0 + +data_path="" +data_file="/rsc15_train_40000.txt" +#设置默认日志级别,不需要修改 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="GRU4Rec_ID0128_for_TensorFlow" +#训练epoch +train_epochs=10 +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para 
== --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + elif [[ $para == --devices_num* ]];then + devices_num=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` + fi +done + +linux_num=$servers_num + +export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` +rank_size=8 +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +fi +wait +export RANK_TABLE_FILE=${cur_path}/test/rank_table.json + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +BatchSize=4096 +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#训练开始时间,不需要修改 +start_time=$(date +%s) +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../gru4rec_BP + + +#############执行训练######################### + +for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p 
${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + + nohup python3 main.py \ + --path_to_train=${data_path} \ + --path_to_test=${data_path} \ + --train=1 \ + --lr=0.008 \ + --npu_nums=8 \ + --epoch=${train_epochs} \ + --train_dataset_file=${data_file} \ + > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + +wait + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +#FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $2}'` +temp1=`grep "npu time is:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'1000'*'${temp1}'}'` +#TrainingTime=`echo "1000 * ${temp1}"|bc` +FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${BatchSize}'/'${temp1}'}'` +#FPS=`echo "scale=2;${BatchSize} / ${temp1}"|bc` +#打印,不需要修改 +#echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +# train_accuracy=`grep train_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'|cut -c 1-5` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +DeviceType=`uname -m` +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +grep Each $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'>>$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_time.txt +#TrainingTime=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_time.txt` +#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` +#TrainningTime=`grep Each $cur_path/output/${ASCEND_DEVICE_ID}/train.log|awk 'END {print $6}'` +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +#grep loss 
$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $8}'>>$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16p.sh new file mode 100644 index 000000000..960c94093 --- /dev/null +++ b/TensorFlow/built-in/nlp/GRU4Rec_ID0128_for_TensorFlow/test/train_performance_16p.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +#集合通信参数,不需要修改 +export RANK_SIZE=16 +#export 
RANK_TABLE_FILE=$cur_path/${RANK_SIZE}p.json +export JOB_ID=10087 +export DEVICE_INDEX=0 +RANK_ID_START=0 + +data_path="" +data_file="/rsc15_train_40000.txt" +#设置默认日志级别,不需要修改 +#export ASCEND_GLOBAL_LOG_LEVEL=3 +#基础参数,需要模型审视修改 +#网络名称,同目录名称 +Network="GRU4Rec_ID0128_for_TensorFlow" +#训练epoch +train_epochs=10 +#TF2.X独有,不需要修改 +#export NPU_LOOP_SIZE=${train_steps} +#维测参数,precision_mode需要模型审视修改 +#precision_mode="allow_mix_precision" +#维持参数,以下不需要修改 +over_dump=False +data_dump_flag=False +data_dump_step="10" +profiling=False +autotune=False + +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_full_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --autotune whether to enable autotune, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv 
$install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + fi +done + +rank_size=8 +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi +wait +export RANK_TABLE_FILE=${cur_path}/rank_table.json + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +BatchSize=4096 +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#训练开始时间,不需要修改 +start_time=$(date +%s) +#进入训练脚本目录,需要模型审视修改 +cd $cur_path/../gru4rec_BP + + +#############执行训练######################### + +for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +do + #设置环境变量,不需要修改 + echo "Device ID: $ASCEND_DEVICE_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + + #创建DeviceID输出目录,不需要修改 + if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID} + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + + #执行训练脚本,以下传参不需要修改,其他需要模型审视修改 + + nohup python3 main.py \ + --path_to_train=${data_path} \ + --path_to_test=${data_path} \ + --train=1 \ + --lr=0.008 \ + --npu_nums=8 \ + --epoch=${train_epochs} \ + --train_dataset_file=${data_file} \ + > 
${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + +wait + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +#FPS=`grep TimeHistory $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $2}'` +temp1=`grep "npu time is:" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $6}'` +TrainingTime=`awk 'BEGIN{printf "%.2f\n",'1000'*'${temp1}'}'` +#TrainingTime=`echo "1000 * ${temp1}"|bc` +FPS=`awk 'BEGIN{printf "%.2f\n",'${RANK_SIZE}'*'${BatchSize}'/'${temp1}'}'` +#FPS=`echo "scale=2;${BatchSize} / ${temp1}"|bc` +#打印,不需要修改 +#echo "Final Performance images/sec : $FPS" +#输出训练精度,需要模型审视修改 +# train_accuracy=`grep train_accuracy $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $8}'|cut -c 1-5` +#打印,不需要修改 +#echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" +#稳定性精度看护结果汇总 +#训练用例信息,不需要修改 +DeviceType=`uname -m` +##获取性能数据 +#吞吐量,不需要修改 +ActualFPS=${FPS} +#单迭代训练时长,不需要修改 +grep Each $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print $6}'>>$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_time.txt +#TrainingTime=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_time.txt` +#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*1000/'${FPS}'}'` +#TrainningTime=`grep Each $cur_path/output/${ASCEND_DEVICE_ID}/train.log|awk 'END {print $6}'` +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +#grep loss $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v BatchTimestamp|awk '{print $10}'|sed 's/,//g'|sed '/^$/d' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep Epoch $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk '{print 
$8}'>>$cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +#echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +sed -i -e '/ModuleNotFoundError/d' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log \ No newline at end of file diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh index 815c806fc..a3c242772 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16np.sh @@ -57,10 +57,13 @@ elif [[ $para == --over_dump* ]];then conf_path=`echo ${para#*=}` elif [[ $para == --devices_num* ]];then devices_num=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` fi done -one_node_ip=`find $conf_path -name "server_111._0.info"|awk -F "server_" 
'{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` +#one_node_ip=`find $conf_path -name "server_111._0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +#linux_num=`find $conf_path -name "server_*.info" |wc -l` +linux_num=$servers_num #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" @@ -69,7 +72,9 @@ fi export RANK_SIZE=16 rank_size=8 export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` -nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 diff --git a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh index b59193bb2..45e768d86 100644 --- a/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh +++ b/TensorFlow/built-in/recommendation/WideDeep_ID2712_for_TensorFlow/test/train_performance_16p.sh @@ -45,7 +45,7 @@ for para in $* do if [[ $para == --precision_mode* ]];then precision_mode=`echo ${para#*=}` -elif [[ $para == --over_dump* ]];then + elif [[ $para == --over_dump* ]];then over_dump=`echo ${para#*=}` over_dump_path=${cur_path}/output/overflow_dump mkdir -p ${over_dump_path} @@ -65,7 +65,9 @@ if [[ $data_path == "" ]];then fi export RANK_SIZE=16 rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 diff --git 
a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16np.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16np.sh index ba1b3fc1f..2c8c62f8a 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16np.sh +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16np.sh @@ -99,10 +99,11 @@ do conf_path=`echo ${para#*=}` elif [[ $para == --devices_num* ]];then devices_num=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` fi done -one_node_ip=`find $conf_path -name "server_111._0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` +linux_num=$servers_num #校验是否传入data_path,不需要修改 if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be specified" @@ -112,7 +113,9 @@ export eval_batch_size=`awk 'BEGIN{printf "%.0f\n",'16'*'${linux_num}'}'` export RANK_SIZE=`awk 'BEGIN{printf "%.0f\n",'${devices_num}'*'${linux_num}'}'` rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=$cur_path/rank_table.json export JOB_ID=10087 diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh index d2221c8d3..ea44d86b7 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh @@ -60,7 +60,9 @@ done export RANK_SIZE=16 export JOB_ID=10087 rank_size=8 -nohup python3 $cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) 
--conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 $cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi export RANK_TABLE_FILE=${cur_path}/rank_table.json export HCCL_CONNECT_TIMEOUT=600 RANK_ID_START=0 diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_full_16P_256bs_SGD.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_full_16P_256bs_SGD.sh index 3a3c34f2c..e32557438 100644 --- a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_full_16P_256bs_SGD.sh +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_full_16P_256bs_SGD.sh @@ -90,7 +90,9 @@ fi # 自动生成ranktable的脚本 rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=${cur_path}/rank_table.json diff --git a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16P_256bs_SGD.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16P_256bs_SGD.sh index 3755c6ff9..39f047dbb 100644 --- a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16P_256bs_SGD.sh +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16P_256bs_SGD.sh @@ -90,7 +90,9 @@ fi # 自动生成ranktable的脚本 rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=${cur_path}/rank_table.json diff --git 
a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16np_256bs_SGD.sh b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16np_256bs_SGD.sh index ec273cace..33ea2b949 100644 --- a/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16np_256bs_SGD.sh +++ b/TensorFlow2/built-in/cv/image_classification/ResNet50_ID0360_for_TensorFlow2.X/test/train_performance_16np_256bs_SGD.sh @@ -76,10 +76,13 @@ do data_path=`echo ${para#*=}` elif [[ $para == --devices_num* ]];then devices_num=`echo ${para#*=}` + elif [[ $para == --servers_num* ]];then + servers_num=`echo ${para#*=}` fi done -one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` -linux_num=`find $conf_path -name "server_*.info" |wc -l` +#one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'` +#linux_num=`find $conf_path -name "server_*.info" |wc -l` +linux_num=$servers_num if [[ $data_path == "" ]];then echo "[Error] para \"data_path\" must be confing" exit 1 @@ -99,7 +102,9 @@ export NPU_LOOP_SIZE=${train_steps} rank_size=8 -nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +if [[ $conf_path != "" ]];then + nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path=$conf_path +fi wait export RANK_TABLE_FILE=$cur_path/rank_table.json #训练开始时间,不需要修改 -- Gitee