"""Generate ``rank_table.json`` from per-server ``server_<id>_<index>.info`` files.

Each info file is expected to contain lines of the form ``address_<k>=<ip>``
for the eight devices of one server.  Servers are placed in the table in
ascending ``<index>`` order and every device receives a consecutive global
rank id.

Usage::

    python3 set_ranktable.py --npu_nums=<server count> --conf_path=<dir>
"""
import argparse
import json
import os

# Every server entry in the rank table describes exactly eight devices.
DEVICES_PER_SERVER = 8
# Written to the current working directory, as callers expect.
RANK_TABLE_FILE = "rank_table.json"


def find_server_confs(conf_path):
    """Group the ``server_<id>_<index>.info`` files in *conf_path* by index.

    Args:
        conf_path: directory that holds the server info files.

    Returns:
        dict mapping the ``<index>`` string to the matching file names in
        sorted order.  Empty when *conf_path* is not a directory.
    """
    confs = {}
    if not os.path.isdir(conf_path):
        return confs
    # Sorted so the result does not depend on the order os.listdir() returns.
    for file_name in sorted(os.listdir(conf_path)):
        parts = file_name.split("_")
        tail = parts[-1].split(".")
        if parts[0] != "server" or len(parts) < 2:
            continue
        # Guard len(tail): names without a dot used to raise IndexError.
        if len(tail) < 2 or tail[1] != "info" or not tail[0].isdigit():
            continue
        confs.setdefault(tail[0], []).append(file_name)
    return confs


def read_device_addresses(lines):
    """Extract the device addresses from the lines of one server info file.

    Addresses are returned ordered by device number (``address_0`` first).
    The value is whatever follows the last ``=`` on the line, stripped of
    surrounding whitespace, so a final line without a newline is kept whole.
    """
    addresses = []
    for device in range(DEVICES_PER_SERVER):
        key = "address_{}".format(device)
        addresses.extend(
            line.split("=")[-1].strip() for line in lines if key in line
        )
    return addresses


def build_rank_table(server_ids, rank_addresses, server_count):
    """Build the rank table dictionary for *server_count* servers.

    Args:
        server_ids: server identifiers, one per server, in rank order.
        rank_addresses: flat list of device addresses, eight per server,
            in the same order as *server_ids*.
        server_count: number of servers to describe.

    Returns:
        dict with the keys ``server_count``, ``server_list``, ``status`` and
        ``version``; all ids and counts are stored as strings.

    Raises:
        ValueError: if *server_count* is not positive, or fewer server ids or
            addresses are available than the table needs.
    """
    if server_count < 1:
        raise ValueError("npu_nums must be at least 1, got %d" % server_count)
    if len(server_ids) < server_count:
        raise ValueError(
            "expected %d server info files, found %d"
            % (server_count, len(server_ids))
        )
    needed = server_count * DEVICES_PER_SERVER
    if len(rank_addresses) < needed:
        raise ValueError(
            "expected %d device addresses, found %d"
            % (needed, len(rank_addresses))
        )

    server_list = []
    for server_index in range(server_count):
        first_rank = server_index * DEVICES_PER_SERVER
        devices = [
            {
                "device_id": str(device),
                "device_ip": rank_addresses[first_rank + device],
                "rank_id": str(first_rank + device),
            }
            for device in range(DEVICES_PER_SERVER)
        ]
        server_list.append(
            {"server_id": server_ids[server_index], "device": devices}
        )
    return {
        "server_count": str(server_count),
        "server_list": server_list,
        "status": "completed",
        "version": "1.0",
    }


def main(argv=None):
    """Parse the command line, build the rank table and write it to disk.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The rank table that was written to ``rank_table.json``.
    """
    parser = argparse.ArgumentParser()
    # NOTE: despite its name, --npu_nums is used as the number of servers.
    parser.add_argument('-n', '--npu_nums', type=int, default=2, help='nums of npu')
    parser.add_argument('-c', '--conf_path', type=str, default='./', help='the path of server_info')
    flags = parser.parse_args(argv)

    confs = find_server_confs(flags.conf_path)
    server_ids = []
    rank_addresses = []
    for index in range(flags.npu_nums):
        for file_name in confs.get(str(index), []):
            # The second "_"-separated field of the file name is the server id.
            server_ids.append(file_name.split("_")[1])
            with open(os.path.join(flags.conf_path, file_name), "r") as conf_file:
                rank_addresses.extend(read_device_addresses(conf_file.readlines()))

    try:
        rank_table = build_rank_table(server_ids, rank_addresses, flags.npu_nums)
    except ValueError as exc:
        # Exit with a readable message instead of an IndexError/NameError.
        parser.error(str(exc))

    with open(RANK_TABLE_FILE, "w") as out_file:
        json.dump(rank_table, out_file)
    return rank_table


if __name__ == "__main__":
    main()
#!/bin/bash
# Performance run of MiniGo across several servers: builds rank_table.json from
# the server_*.info files, bootstraps the model, starts one train.py process
# per local device, then summarises the log of the last device started.
#
# Options (all given as --name=value):
#   --data_path     training data directory
#   --conf_path     directory holding the server_*.info files (required)
#   --server_index  index of this server among the participating servers
#   --devices_num   number of devices per server
#   --bind_core     any non-empty value pins each process to a CPU range
#   --dynamic_input forwarded to train.py

cur_path=`pwd`/../
rm -f "$cur_path"/outputs/models/*
rm -f "$cur_path"/estimator_working_dir/*

# Basic parameters; review them when adapting the script to another model.
# Batch size
batch_size=128
# Network name, same as the directory name
Network="MiniGo_ID0629_for_TensorFlow"
# Number of devices reported in the case name and the FPS figure
RankSize=16
# Training epochs, optional
train_epochs=
# Training steps
train_steps=500
# Learning rate
learning_rate=
# Dynamic input mode, no need to modify
dynamic_input=""

# NPU parameters
precision_mode="allow_fp32_to_fp16"
over_dump=False
data_dump_flag=False
data_dump_step="10"
profiling=False

# Default data location. The previous value "$./outputs/..." contained a
# literal "$." because "$." is not a shell expansion.
data_path="${cur_path}/outputs/data/selfplay"

# Defaults for options that used to be left unset when not passed.
server_index=0
devices_num=8
conf_path=""
bind_core=""

if [[ $1 == --help || $1 == -h ]];then
    echo "usage: ./train_performance_16p.sh --data_path=<dir> --conf_path=<dir> --server_index=<n> --devices_num=<n> [--bind_core=1] [--dynamic_input=<mode>]"
    exit 1
fi

for para in "$@"
do
    if [[ $para == --data_path* ]];then
        data_path=${para#*=}
    elif [[ $para == --bind_core* ]]; then
        bind_core=${para#*=}
        name_bind="_bindcore"
    elif [[ $para == --dynamic_input* ]];then
        dynamic_input=${para#*=}
    elif [[ $para == --server_index* ]];then
        server_index=${para#*=}
    elif [[ $para == --conf_path* ]];then
        conf_path=${para#*=}
    elif [[ $para == --devices_num* ]];then
        devices_num=${para#*=}
    fi
done

if [[ $conf_path == "" ]];then
    echo "[Error] para \"conf_path\" must be config"
    exit 1
fi
if [[ $data_path == "" ]];then
    echo "[Error] para \"data_path\" must be config"
    exit 1
fi

# one_node_ip is not read again in this script; kept for external tooling.
one_node_ip=`find "$conf_path" -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
# One server_*.info file per participating server.
linux_num=`find "$conf_path" -name "server_*.info" |wc -l`

export RANK_SIZE=$((devices_num * linux_num))
# Number of training processes started on this server.
rank_size=8
# Runs in the foreground; stop if the rank table could not be generated.
if ! nohup python3 set_ranktable.py --npu_nums=$linux_num --conf_path="$conf_path"; then
    echo "[Error] failed to generate rank_table.json"
    exit 1
fi
export RANK_TABLE_FILE=$cur_path/test/rank_table.json
export JOB_ID=10087
export DEVICE_INDEX=0

############## Run training ##########
cd "$cur_path"

# (Step 1) Initialisation; this step must run first.
python3 bootstrap.py --work_dir=$cur_path/estimator_working_dir --export_path=$cur_path/outputs/models/000000-bootstrap

start=$(date +%s)

# One process per local device; RANK_ID is the global rank of that process.
first_rank=$((rank_size * server_index))
last_rank=$(((server_index + 1) * rank_size))
corenum=`grep -c 'processor' /proc/cpuinfo`
for((RANK_ID=first_rank;RANK_ID<last_rank;RANK_ID++));
do
    # Environment for this process
    export ASCEND_DEVICE_ID=$((RANK_ID - first_rank))
    export RANK_ID=$RANK_ID
    echo "Device ID: $ASCEND_DEVICE_ID"

    # Start from an empty per-device output directory.
    rm -rf "$cur_path/test/output/$ASCEND_DEVICE_ID"
    mkdir -p "$cur_path/test/output/$ASCEND_DEVICE_ID"
    echo $ASCEND_DEVICE_ID

    # (Step 3) Training
    # The CPU range is derived from the local device index; using the global
    # RANK_ID pointed past the last CPU on every server but the first.
    bind_cmd=""
    if [ "x${bind_core}" != x ];then
        first_core=$((ASCEND_DEVICE_ID * corenum / rank_size))
        last_core=$(((ASCEND_DEVICE_ID + 1) * corenum / rank_size - 1))
        bind_cmd="taskset -c ${first_core}-${last_core}"
    fi
    ${bind_cmd} python3 train.py \
        --training_data_path=$data_path \
        --steps_to_train=$train_steps \
        --train_batch_size=$batch_size \
        --work_dir=$cur_path/estimator_working_dir \
        --export_path=$cur_path/outputs/models/000001-first_generation \
        --dynamic_input=${dynamic_input} > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
done
# Wait for all training processes started above.
wait

end=$(date +%s)
e2etime=$(( $end - $start ))

#echo "Final Performance ms/step : $average_perf"
echo "Final Training Duration sec : $e2etime"


### The fields below are used by the smoke-test monitoring.
# ASCEND_DEVICE_ID still holds the last device started by the loop, so the
# figures below are read from that device's log.
BatchSize=${batch_size}
# Device type, detected automatically
DeviceType=`uname -m`
# Case name, derived automatically
CaseName=${Network}_bs${BatchSize}_${RankSize}'p'_'perf'

# Performance: last reported global_step/sec value
TrainingTime=`grep "tensorflow:global_step/sec" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk 'END {print $2}'`
ActualFPS=`awk 'BEGIN{printf "%.2f\n", '${BatchSize}'*'${RankSize}'*'${TrainingTime}'}'`

# Extract the loss values from train_*.log into ${CaseName}_loss.txt
grep "] loss" $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $7}' |cut -d , -f 1 >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt
ActualLoss=`awk 'END {print $1}' $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`

# Key information written to ${CaseName}.log; no need to modify.
echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "RankSize = ${RankSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "BatchSize = ${batch_size}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "TrainAccuracy = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
echo "DynamicInput = ${dynamic_input}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import ast
import json
import math
import os
import re
import sys

import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE
_NUM_TRAIN_FILES = 15

# The model package is looked up relative to the current working directory,
# so the script has to be launched from a sibling directory of
# "swintransformer" (or from inside it).
sys.path.append('../swintransformer')
from model import SwinTransformerModel
from model import SwinTransformer

import npu_device

parser = argparse.ArgumentParser()
parser.add_argument("--epochs", default=12, type=int, help="train epochs")
parser.add_argument("--train_steps", default=0, type=int, help="train steps per epoch")
parser.add_argument("--nproc_per_node", default=1, type=int, help="npu_device_nums")
parser.add_argument("--batch_size", default=16, type=int, help="global batch_size")
parser.add_argument("--rank_size", default=1, type=int, help="rank size")
parser.add_argument("--device_id", default=0, type=int, help="Ascend device id")
parser.add_argument('--data_path', default="/home/data", type=str,
                    help='the path to load data')
parser.add_argument('--ckpt_path', default="/home/data", type=str,
                    help='the path to load ckpt')
parser.add_argument('--model_name', default="swin_large_224", type=str,
                    help='model_name')
parser.add_argument('--precision_mode', default="allow_mix_precision", type=str,
                    help='NPU precision mode, default is allow_mix_precision')
# The boolean switches below are parsed with ast.literal_eval, so they are
# passed as Python literals on the command line (e.g. "--over_dump True").
# An explicit default makes the documented "default is False" actually hold;
# argparse only runs `type` on string defaults, so False is stored as-is.
parser.add_argument('--over_dump', dest='over_dump', type=ast.literal_eval, default=False,
                    help='if or not over detection, default is False')
parser.add_argument('--data_dump_flag', dest='data_dump_flag', type=ast.literal_eval, default=False,
                    help='data dump flag, default is False')
# Kept as a string: it is forwarded unchanged to dump_config.dump_step.
parser.add_argument('--data_dump_step', default="10",
                    help='data dump step, default is 10')
parser.add_argument('--profiling', dest='profiling', type=ast.literal_eval, default=False,
                    help='if or not profiling for performance debug, default is False')
parser.add_argument('--profiling_dump_path', default="/home/data", type=str,
                    help='the path to save profiling data')
parser.add_argument('--over_dump_path', default="/home/data", type=str,
                    help='the path to save over dump data')
parser.add_argument('--data_dump_path', default="/home/data", type=str,
                    help='the path to save dump data')
parser.add_argument('--use_mixlist', dest='use_mixlist', type=ast.literal_eval, default=False,
                    help='use_mixlist flag, default is False')
parser.add_argument('--fusion_off_flag', dest='fusion_off_flag', type=ast.literal_eval, default=False,
                    help='fusion_off flag, default is False')
parser.add_argument('--mixlist_file', default="ops_info.json", type=str,
                    help='mixlist file name, default is ops_info.json')
parser.add_argument('--fusion_off_file', default="fusion_switch.cfg", type=str,
                    help='fusion_off file name, default is fusion_switch.cfg')
args = parser.parse_args()


def npu_config():
    """Copy the parsed command-line switches into the NPU global options and open the device.

    Reads the module-level ``args``. Every option is written before
    ``npu_device.open()`` is called at the end of the function.
    """
    if args.data_dump_flag:
        npu_device.global_options().dump_config.enable_dump = True
        npu_device.global_options().dump_config.dump_path = args.data_dump_path
        npu_device.global_options().dump_config.dump_step = args.data_dump_step
        npu_device.global_options().dump_config.dump_mode = "all"

    if args.over_dump:
        npu_device.global_options().dump_config.enable_dump_debug = True
        # NOTE: data dump and overflow dump share one dump_path field; when both
        # flags are enabled the overflow path assigned here wins.
        npu_device.global_options().dump_config.dump_path = args.over_dump_path
        npu_device.global_options().dump_config.dump_debug_mode = "all"

    if args.profiling:
        npu_device.global_options().profiling_config.enable_profiling = True
        # Serialise with json.dumps so the output path is escaped correctly and
        # no source-indentation whitespace ends up inside the option string.
        profiling_options = json.dumps({
            "output": args.profiling_dump_path,
            "training_trace": "on",
            "task_trace": "on",
            "aicpu": "on",
            "aic_metrics": "PipeUtilization",
            "fp_point": "",
            "bp_point": "",
        })
        npu_device.global_options().profiling_config.profiling_options = profiling_options

    npu_device.global_options().precision_mode = args.precision_mode
    # The mix list is only applied in mixed-precision mode.
    if args.use_mixlist and args.precision_mode == 'allow_mix_precision':
        npu_device.global_options().modify_mixlist = args.mixlist_file
    if args.fusion_off_flag:
        npu_device.global_options().fusion_switch_file = args.fusion_off_file
    npu_device.open().as_default()


npu_config()

try:
    # Use a TPU strategy when a TPU cluster can be resolved ...
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # ... otherwise fall back to TensorFlow's default (no-op) strategy.
    strategy = tf.distribute.get_strategy()

print("Number of accelerators: ", strategy.num_replicas_in_sync)

# Root directory of the dataset; must contain the tfrecords-jpeg-<size>x<size> folders.
GCS_DS_PATH = args.data_path

# Input resolution; must be one of the keys of GCS_PATH_SELECT.
IMAGE_SIZE = [224, 224]
EPOCHS = args.epochs
BATCH_SIZE = args.batch_size
TOTAL_BATCH_SIZE = args.nproc_per_node * args.batch_size

GCS_PATH_SELECT = {  # available image sizes
    192: GCS_DS_PATH + '/tfrecords-jpeg-192x192',
    224: GCS_DS_PATH + '/tfrecords-jpeg-224x224',
    331: GCS_DS_PATH + '/tfrecords-jpeg-331x331',
    512: GCS_DS_PATH + '/tfrecords-jpeg-512x512'
}
GCS_PATH = GCS_PATH_SELECT[IMAGE_SIZE[0]]

TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/train/*.tfrec')
VALIDATION_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/val/*.tfrec')
TEST_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/test/*.tfrec')  # unlabeled images (ids only)

CLASSES = ['pink primrose', 'hard-leaved pocket orchid', 'canterbury bells', 'sweet pea', 'wild geranium', 'tiger lily', 'moon orchid', 'bird of paradise', 'monkshood', 'globe thistle',  # 00 - 09
           'snapdragon', "colt's foot", 'king protea', 'spear thistle', 'yellow iris', 'globe-flower', 'purple coneflower', 'peruvian lily', 'balloon flower', 'giant white arum lily',  # 10 - 19
           'fire lily', 'pincushion flower', 'fritillary', 'red ginger', 'grape hyacinth', 'corn poppy', 'prince of wales feathers', 'stemless gentian', 'artichoke', 'sweet william',  # 20 - 29
           'carnation', 'garden phlox', 'love in the mist', 'cosmos', 'alpine sea holly', 'ruby-lipped cattleya', 'cape flower', 'great masterwort', 'siam tulip', 'lenten rose',  # 30 - 39
           'barberton daisy', 'daffodil', 'sword lily', 'poinsettia', 'bolero deep blue', 'wallflower', 'marigold', 'buttercup', 'daisy', 'common dandelion',  # 40 - 49
           'petunia', 'wild pansy', 'primula', 'sunflower', 'lilac hibiscus', 'bishop of llandaff', 'gaura', 'geranium', 'orange dahlia', 'pink-yellow dahlia',  # 50 - 59
           'cautleya spicata', 'japanese anemone', 'black-eyed susan', 'silverbush', 'californian poppy', 'osteospermum', 'spring crocus', 'iris', 'windflower', 'tree poppy',  # 60 - 69
           'gazania', 'azalea', 'water lily', 'rose', 'thorn apple', 'morning glory', 'passion flower', 'lotus', 'toad lily', 'anthurium',  # 70 - 79
           'frangipani', 'clematis', 'hibiscus', 'columbine', 'desert-rose', 'tree mallow', 'magnolia', 'cyclamen ', 'watercress', 'canna lily',  # 80 - 89
           'hippeastrum ', 'bee balm', 'pink quill', 'foxglove', 'bougainvillea', 'camellia', 'mallow', 'mexican petunia', 'bromelia', 'blanket flower',  # 90 - 99
           'trumpet creeper', 'blackberry lily', 'common tulip', 'wild rose']  # 100 - 103

# numpy and matplotlib defaults
np.set_printoptions(threshold=15, linewidth=80)


def batch_to_numpy_images_and_labels(data):
    """Convert an ``(images, labels)`` tensor pair to numpy.

    When the second element holds byte strings (object dtype, i.e. image ids
    rather than class indices) the labels are replaced by a list of ``None``
    of the same length as the image batch.
    """
    images, labels = data
    numpy_images = images.numpy()
    numpy_labels = labels.numpy()
    if numpy_labels.dtype == object:  # binary strings: these are image ids, not labels
        numpy_labels = [None] * len(numpy_images)
    return numpy_images, numpy_labels


def title_from_label_and_target(label, correct_label):
    """Return ``(title, is_correct)`` for a prediction and its optional ground truth."""
    if correct_label is None:
        return CLASSES[label], True
    correct = (label == correct_label)
    return "{} [{}{}{}]".format(CLASSES[label], 'OK' if correct else 'NO', "\u2192" if not correct else '',
                                CLASSES[correct_label] if not correct else ''), correct


def display_one_flower(image, title, subplot, red=False, titlesize=16):
    """Draw one image into ``subplot`` and return the subplot triple advanced by one cell."""
    plt.subplot(*subplot)
    plt.axis('off')
    plt.imshow(image)
    if len(title) > 0:
        plt.title(title,
                  fontsize=int(titlesize) if not red else int(titlesize / 1.2),
                  color='red' if red else 'black',
                  fontdict={'verticalalignment': 'center'},
                  pad=int(titlesize / 1.5))
    return (subplot[0], subplot[1], subplot[2] + 1)


def display_batch_of_images(databatch, predictions=None):
    """Show a batch of images in a square-ish grid.

    ``databatch`` must be a 2-tuple ``(images, labels)`` or ``(images, ids)``
    as yielded by the datasets in this file; it is unpacked by
    ``batch_to_numpy_images_and_labels``. When ``predictions`` is given, each
    title shows the predicted class and, if labels are available, whether the
    prediction was right. An empty batch is silently ignored.
    """
    images, labels = batch_to_numpy_images_and_labels(databatch)
    if labels is None:
        labels = [None] * len(images)

    # Nothing to draw; also avoids a division by zero in the grid computation.
    if len(images) == 0:
        return

    # auto-squaring: this will drop data that does not fit into square or square-ish rectangle
    rows = int(math.sqrt(len(images)))
    cols = len(images) // rows

    # size and spacing
    FIGSIZE = 13.0
    SPACING = 0.1
    subplot = (rows, cols, 1)
    if rows < cols:
        plt.figure(figsize=(FIGSIZE, FIGSIZE / cols * rows))
    else:
        plt.figure(figsize=(FIGSIZE / rows * cols, FIGSIZE))

    # display (rows >= 1 and cols >= 1 here, so the loop runs at least once)
    for i, (image, label) in enumerate(zip(images[:rows * cols], labels[:rows * cols])):
        title = '' if label is None else CLASSES[label]
        correct = True
        if predictions is not None:
            title, correct = title_from_label_and_target(predictions[i], label)
        dynamic_titlesize = FIGSIZE * SPACING / max(rows, cols) * 40 + 3  # tuned for 1x1 to 10x10 grids
        subplot = display_one_flower(image, title, subplot, not correct, titlesize=dynamic_titlesize)

    # layout: `label` is the last displayed label; without titles the grid is packed tightly
    plt.tight_layout()
    if label is None and predictions is None:
        plt.subplots_adjust(wspace=0, hspace=0)
    else:
        plt.subplots_adjust(wspace=SPACING, hspace=SPACING)
    plt.show()


def display_confusion_matrix(cmat, score, precision, recall):
    """Plot a confusion matrix with class-name ticks and an optional metrics caption."""
    plt.figure(figsize=(15, 15))
    ax = plt.gca()
    ax.matshow(cmat, cmap='Reds')
    ax.set_xticks(range(len(CLASSES)))
    ax.set_xticklabels(CLASSES, fontdict={'fontsize': 7})
    plt.setp(ax.get_xticklabels(), rotation=45, ha="left", rotation_mode="anchor")
    ax.set_yticks(range(len(CLASSES)))
    ax.set_yticklabels(CLASSES, fontdict={'fontsize': 7})
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    titlestring = ""
    if score is not None:
        titlestring += 'f1 = {:.3f} '.format(score)
    if precision is not None:
        titlestring += '\nprecision = {:.3f} '.format(precision)
    if recall is not None:
        titlestring += '\nrecall = {:.3f} '.format(recall)
    if len(titlestring) > 0:
        ax.text(101, 1, titlestring,
                fontdict={'fontsize': 18, 'horizontalalignment': 'right', 'verticalalignment': 'top', 'color': '#804040'})
    plt.show()


def display_training_curves(training, validation, title, subplot):
    """Plot training and validation curves for one metric into ``subplot``."""
    if subplot % 10 == 1:  # set up the subplots on the first call
        plt.subplots(figsize=(10, 10), facecolor='#F0F0F0')
        plt.tight_layout()
    ax = plt.subplot(subplot)
    ax.set_facecolor('#F8F8F8')
    ax.plot(training)
    ax.plot(validation)
    ax.set_title('model ' + title)
    ax.set_ylabel(title)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid.'])


def decode_image(image_data):
    """Decode a JPEG byte string into a uint8 tensor of shape ``[*IMAGE_SIZE, 3]``."""
    image = tf.image.decode_jpeg(image_data, channels=3)  # image format uint8 [0,255]
    image = tf.reshape(image, [*IMAGE_SIZE, 3])  # make the static shape explicit
    return image


def read_labeled_tfrecord(example):
    """Parse one serialized example into an ``(image, int32 label)`` pair."""
    LABELED_TFREC_FORMAT = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "class": tf.io.FixedLenFeature([], tf.int64),  # shape [] means single element
    }
    example = tf.io.parse_single_example(example, LABELED_TFREC_FORMAT)
    image = decode_image(example['image'])
    label = tf.cast(example['class'], tf.int32)
    return image, label


def read_unlabeled_tfrecord(example):
    """Parse one serialized example into an ``(image, id string)`` pair (no class feature)."""
    UNLABELED_TFREC_FORMAT = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string means bytestring
        "id": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
    }
    example = tf.io.parse_single_example(example, UNLABELED_TFREC_FORMAT)
    image = decode_image(example['image'])
    idnum = example['id']
    return image, idnum


def load_dataset(filenames, labeled=True, ordered=False):
    """Build an unbatched dataset from TFRecord files.

    Yields ``(image, label)`` pairs when ``labeled`` is true and
    ``(image, id)`` pairs otherwise. Unless ``ordered`` is set, deterministic
    ordering is switched off so records are used as soon as they stream in.
    """
    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False  # disable order, increase speed

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)  # interleaves reads from multiple files
    dataset = dataset.with_options(ignore_order)
    dataset = dataset.map(read_labeled_tfrecord if labeled else read_unlabeled_tfrecord, num_parallel_calls=AUTO)
    return dataset


def data_augment(image, label):
    """Apply the training-time augmentation (random horizontal flip) to one sample."""
    image = tf.image.random_flip_left_right(image)
    return image, label


def get_training_dataset():
    """Return the sharded, augmented, endlessly repeating and batched training dataset."""
    dataset = load_dataset(TRAINING_FILENAMES, labeled=True)
    # The second return value is used as this process's batch size; it is kept in
    # a separate local so it cannot be confused with the module-level BATCH_SIZE.
    dataset, per_device_batch_size = npu_device.distribute.shard_and_rebatch_dataset(dataset, TOTAL_BATCH_SIZE)
    print('Per-device training batch size:', per_device_batch_size)
    dataset = dataset.map(data_augment, num_parallel_calls=AUTO)
    dataset = dataset.repeat()  # the training dataset must repeat for several epochs
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(per_device_batch_size, drop_remainder=True)
    dataset = dataset.prefetch(AUTO)  # prefetch next batch while training (autotune prefetch buffer size)
    return dataset


def get_validation_dataset(ordered=False):
    """Return the validation dataset in full batches of BATCH_SIZE (the remainder is dropped)."""
    dataset = load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    dataset = dataset.cache()
    dataset = dataset.prefetch(AUTO)
    return dataset


def get_test_dataset(ordered=False):
    """Return the unlabeled test dataset in batches of BATCH_SIZE (last batch may be partial)."""
    dataset = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset


# The number of items in a file is encoded in its name: "flowers00-230.tfrec" holds 230 items.
_ITEM_COUNT_RE = re.compile(r"-([0-9]+)\.")


def count_data_items(filenames):
    """Return the total item count encoded in the given .tfrec file names.

    Raises:
        ValueError: if a file name does not contain a "-<count>." component.
    """
    total = 0
    for filename in filenames:
        match = _ITEM_COUNT_RE.search(filename)
        if match is None:
            raise ValueError("cannot read the item count from file name: {!r}".format(filename))
        total += int(match.group(1))
    return total


NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
NUM_VALIDATION_IMAGES = count_data_items(VALIDATION_FILENAMES)
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
STEPS_PER_EPOCH = args.train_steps if args.train_steps else NUM_TRAINING_IMAGES // BATCH_SIZE
# get_validation_dataset() drops the incomplete last batch, so only the full
# batches can be consumed; rounding up here would request a batch that never exists.
VALIDATION_STEPS = NUM_VALIDATION_IMAGES // BATCH_SIZE
TEST_STEPS = -(-NUM_TEST_IMAGES // BATCH_SIZE)  # the "-(-//)" trick rounds up instead of down
print('Dataset: {} training images, {} validation images, {} unlabeled test images'.format(NUM_TRAINING_IMAGES, NUM_VALIDATION_IMAGES, NUM_TEST_IMAGES))

'''
# data dump
print("Training data shapes:")
for image, label in get_training_dataset().take(1):
    print(image.numpy().shape, label.numpy().shape)
    print(image.numpy())
    print(label.numpy())
print("Training data label examples:", label.numpy())
print("Validation data shapes:")
for image, label in get_validation_dataset().take(3):
    print(image.numpy().shape, label.numpy().shape)
print("Validation data label examples:", label.numpy())
print("Test data shapes:")
for image, idnum in get_test_dataset().take(3):
    print(image.numpy().shape, idnum.numpy().shape)
print("Test data IDs:", idnum.numpy().astype('U')) # U=unicode string
'''

# Iterators for visually inspecting the data with display_batch_of_images(next(...)).
# NOTE(review): nothing below consumes train_batch / test_batch; they look like
# notebook leftovers - confirm before removing.
training_dataset = get_training_dataset()
training_dataset = training_dataset.unbatch().batch(20)
train_batch = iter(training_dataset)

test_dataset = get_test_dataset()
test_dataset = test_dataset.unbatch().batch(20)
test_batch = iter(test_dataset)

with strategy.scope():
    # Cast to float32 and normalise with the "torch" preprocessing mode before the backbone.
    img_adjust_layer = tf.keras.layers.Lambda(
        lambda data: tf.keras.applications.imagenet_utils.preprocess_input(tf.cast(data, tf.float32), mode="torch"),
        input_shape=[*IMAGE_SIZE, 3])
    pretrained_model = SwinTransformer(args.ckpt_path, args.model_name, num_classes=len(CLASSES),
                                       include_top=False, pretrained=True, use_tpu=False)

    model = tf.keras.Sequential([
        img_adjust_layer,
        pretrained_model,
        tf.keras.layers.Dense(len(CLASSES), activation='softmax')
    ])
opt = tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-8)
opt = npu_device.distribute.npu_distributed_keras_optimizer_wrapper(opt)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)

# Start every rank from rank 0's weights.
training_vars = model.trainable_variables
npu_device.distribute.broadcast(training_vars, root_rank=0)
model.summary()

# Train one epoch per fit() call; a fresh training dataset is built for every epoch.
for current_iteration in range(EPOCHS):
    model.fit(get_training_dataset(), initial_epoch=current_iteration, epochs=current_iteration + 1,
              steps_per_epoch=STEPS_PER_EPOCH)

# Images and labels are iterated separately below, so the order has to be deterministic.
cmdataset = get_validation_dataset(ordered=True)
+images_ds = cmdataset.map(lambda image, label: image) +labels_ds = cmdataset.map(lambda image, label: label).unbatch() +cm_correct_labels = next(iter(labels_ds.batch(NUM_VALIDATION_IMAGES))).numpy() # get everything as one batch +cm_probabilities = model.predict(images_ds, steps=VALIDATION_STEPS) +cm_predictions = np.argmax(cm_probabilities, axis=-1) +print("Correct labels: ", cm_correct_labels.shape, cm_correct_labels) +print("Predicted labels: ", cm_predictions.shape, cm_predictions) + +cmat = confusion_matrix(cm_correct_labels, cm_predictions, labels=range(len(CLASSES))) +score = f1_score(cm_correct_labels, cm_predictions, labels=range(len(CLASSES)), average='macro') +precision = precision_score(cm_correct_labels, cm_predictions, labels=range(len(CLASSES)), average='macro') +recall = recall_score(cm_correct_labels, cm_predictions, labels=range(len(CLASSES)), average='macro') +cmat = (cmat.T / cmat.sum(axis=1)).T # normalized +#display_confusion_matrix(cmat, score, precision, recall) +print('f1 score: {:.3f}, precision: {:.3f}, recall: {:.3f}'.format(score, precision, recall)) # 100 - 102 diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/8p.json b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/8p.json new file mode 100644 index 0000000000000000000000000000000000000000..2e2fc25f515295921d79bd23bd95cc3f4607aff8 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/8p.json @@ -0,0 +1,52 @@ +{ + "server_count":"1", + "server_list":[ + { + "server_id":"127.0.0.1", + "device":[ + { + "device_id":"0", + "device_ip":"192.168.100.101", + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":"192.168.101.101", + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":"192.168.102.101", + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":"192.168.103.101", + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":"192.168.100.100", + 
"rank_id":"4" + }, + { + "device_id":"5", + "device_ip":"192.168.101.100", + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":"192.168.102.100", + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":"192.168.103.100", + "rank_id":"7" + } + ] + } + ], + "status":"completed", + "version":"1.0" +} diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/set_ranktable.py b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/set_ranktable.py new file mode 100644 index 0000000000000000000000000000000000000000..c25b51462c5df2325462786688d4a206ee29fb9a --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/set_ranktable.py @@ -0,0 +1,1740 @@ +import argparse +parser = argparse.ArgumentParser() +parser.add_argument('-n', '--npu_nums', type=int, default='2', help='nums of npu') +parser.add_argument('-c', '--conf_path', type=str, default='./', help='the path of server_info') +FLAGS = parser.parse_args() + +import json +import os +server = [] +server_conf = [] +server_list = ["0", "1", "2", "3", "4", "5", "6", "7"] +if os.path.isdir(FLAGS.conf_path): + for f in os.listdir(FLAGS.conf_path): + if (f.split("_")[-1]).split(".")[0] in server_list and (f.split("_")[-1]).split(".")[1] == 'info' and f.split("_")[0] == 'server': + server_conf.append(f) + + + + + + +rank_address = [] +for i in range(FLAGS.npu_nums): + for x in server_conf: + if (x.split("_")[-1]).split(".")[0] == str(i): + server.append(x.split("_")[1]) + l = FLAGS.conf_path + "/" + x + with open(l, "r") as a: + s = a.readlines() + for s_ in s: + if 'address_0' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_1' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_2' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_3' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_4' in 
s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_5' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_6' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + for s_ in s: + if 'address_7' in s_: + rank_address.append(s_.split("=")[-1][:-1]) + +if FLAGS.npu_nums == 1: + rank = { + "server_count":"1", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 2: + rank = { + "server_count":"2", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + 
{ + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]} + ], + + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 3: + rank = { + "server_count":"3", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + 
"device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 4: + rank = { + "server_count":"4", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + 
"device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]} + ], + "status":"completed", + "version":"1.0" + } +elif FLAGS.npu_nums == 5: + rank = { + "server_count":"5", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + 
}, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + 
"device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + +elif FLAGS.npu_nums == 6: + rank = { + "server_count":"6", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + 
"device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + 
"device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + +elif FLAGS.npu_nums == 7: + rank = { + "server_count":"7", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + "device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + 
}, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + "device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + 
"device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + "device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + "device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +elif FLAGS.npu_nums == 8: + rank = { + "server_count":"8", + "server_list":[ + { + "server_id":server[0], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[0], + "rank_id":"0" + }, + { + "device_id":"1", + 
"device_ip":rank_address[1], + "rank_id":"1" + }, + { + "device_id":"2", + "device_ip":rank_address[2], + "rank_id":"2" + }, + { + "device_id":"3", + "device_ip":rank_address[3], + "rank_id":"3" + }, + { + "device_id":"4", + "device_ip":rank_address[4], + "rank_id":"4" + }, + { + "device_id":"5", + "device_ip":rank_address[5], + "rank_id":"5" + }, + { + "device_id":"6", + "device_ip":rank_address[6], + "rank_id":"6" + }, + { + "device_id":"7", + "device_ip":rank_address[7], + "rank_id":"7" + } + ]}, + + + { + "server_id":server[1], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[8], + "rank_id":"8" + }, + { + "device_id":"1", + "device_ip":rank_address[9], + "rank_id":"9" + }, + { + "device_id":"2", + "device_ip":rank_address[10], + "rank_id":"10" + }, + { + "device_id":"3", + "device_ip":rank_address[11], + "rank_id":"11" + }, + { + "device_id":"4", + "device_ip":rank_address[12], + "rank_id":"12" + }, + { + "device_id":"5", + "device_ip":rank_address[13], + "rank_id":"13" + }, + { + "device_id":"6", + "device_ip":rank_address[14], + "rank_id":"14" + }, + { + "device_id":"7", + "device_ip":rank_address[15], + "rank_id":"15" + } + ]}, + { + "server_id":server[2], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[16], + "rank_id":"16" + }, + { + "device_id":"1", + "device_ip":rank_address[17], + "rank_id":"17" + }, + { + "device_id":"2", + "device_ip":rank_address[18], + "rank_id":"18" + }, + { + "device_id":"3", + "device_ip":rank_address[19], + "rank_id":"19" + }, + { + "device_id":"4", + "device_ip":rank_address[20], + "rank_id":"20" + }, + { + "device_id":"5", + "device_ip":rank_address[21], + "rank_id":"21" + }, + { + "device_id":"6", + "device_ip":rank_address[22], + "rank_id":"22" + }, + { + "device_id":"7", + "device_ip":rank_address[23], + "rank_id":"23" + } + ]}, + { + "server_id":server[3], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[24], + "rank_id":"24" + }, + { + "device_id":"1", + 
"device_ip":rank_address[25], + "rank_id":"25" + }, + { + "device_id":"2", + "device_ip":rank_address[26], + "rank_id":"26" + }, + { + "device_id":"3", + "device_ip":rank_address[27], + "rank_id":"27" + }, + { + "device_id":"4", + "device_ip":rank_address[28], + "rank_id":"28" + }, + { + "device_id":"5", + "device_ip":rank_address[29], + "rank_id":"29" + }, + { + "device_id":"6", + "device_ip":rank_address[30], + "rank_id":"30" + }, + { + "device_id":"7", + "device_ip":rank_address[31], + "rank_id":"31" + } + ]}, + { + "server_id":server[4], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[32], + "rank_id":"32" + }, + { + "device_id":"1", + "device_ip":rank_address[33], + "rank_id":"33" + }, + { + "device_id":"2", + "device_ip":rank_address[34], + "rank_id":"34" + }, + { + "device_id":"3", + "device_ip":rank_address[35], + "rank_id":"35" + }, + { + "device_id":"4", + "device_ip":rank_address[36], + "rank_id":"36" + }, + { + "device_id":"5", + "device_ip":rank_address[37], + "rank_id":"37" + }, + { + "device_id":"6", + "device_ip":rank_address[38], + "rank_id":"38" + }, + { + "device_id":"7", + "device_ip":rank_address[39], + "rank_id":"39" + } + ]}, + { + "server_id":server[5], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[40], + "rank_id":"40" + }, + { + "device_id":"1", + "device_ip":rank_address[41], + "rank_id":"41" + }, + { + "device_id":"2", + "device_ip":rank_address[42], + "rank_id":"42" + }, + { + "device_id":"3", + "device_ip":rank_address[43], + "rank_id":"43" + }, + { + "device_id":"4", + "device_ip":rank_address[44], + "rank_id":"44" + }, + { + "device_id":"5", + "device_ip":rank_address[45], + "rank_id":"45" + }, + { + "device_id":"6", + "device_ip":rank_address[46], + "rank_id":"46" + }, + { + "device_id":"7", + "device_ip":rank_address[47], + "rank_id":"47" + } + ]}, + { + "server_id":server[6], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[48], + "rank_id":"48" + }, + { + "device_id":"1", + 
"device_ip":rank_address[49], + "rank_id":"49" + }, + { + "device_id":"2", + "device_ip":rank_address[50], + "rank_id":"50" + }, + { + "device_id":"3", + "device_ip":rank_address[51], + "rank_id":"51" + }, + { + "device_id":"4", + "device_ip":rank_address[52], + "rank_id":"52" + }, + { + "device_id":"5", + "device_ip":rank_address[53], + "rank_id":"53" + }, + { + "device_id":"6", + "device_ip":rank_address[54], + "rank_id":"54" + }, + { + "device_id":"7", + "device_ip":rank_address[55], + "rank_id":"55" + } + ]}, + { + "server_id":server[7], + "device":[ + { + "device_id":"0", + "device_ip":rank_address[56], + "rank_id":"56" + }, + { + "device_id":"1", + "device_ip":rank_address[57], + "rank_id":"57" + }, + { + "device_id":"2", + "device_ip":rank_address[58], + "rank_id":"58" + }, + { + "device_id":"3", + "device_ip":rank_address[59], + "rank_id":"59" + }, + { + "device_id":"4", + "device_ip":rank_address[60], + "rank_id":"60" + }, + { + "device_id":"5", + "device_ip":rank_address[61], + "rank_id":"61" + }, + { + "device_id":"6", + "device_ip":rank_address[62], + "rank_id":"62" + }, + { + "device_id":"7", + "device_ip":rank_address[63], + "rank_id":"63" + } + ]} + ], + "status":"completed", + "version":"1.0" + } + + + + +with open("rank_table.json", "w") as f: + json.dump(rank, f) + + + + + +
diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_16p.sh b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1db12d29f1c866a33ac05d09e95d91870fac7d81
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_16p.sh
@@ -0,0 +1,239 @@
+#!/bin/bash
+# 16-device (multi-server) performance run for Swin-Transformer on Ascend NPUs.
+# Calls set_ranktable.py to write rank_table.json, starts one training process per
+# local device (8 per server) for the server chosen by --server_index, then parses
+# the last device's log into output/<device>/${CaseName}.log.
+cur_path=$(pwd)
+# Print failing cases to the screen
+export RANK_SIZE=16
+export JOB_ID=10087
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="Swin-Transformer_ID2412_for_TensorFlow2.X"
+# Number of devices; defaults to 1 for a single card
+RankSize=1
+# Batch size
+batch_size=256
+# Training epochs (optional)
+epochs=3
+# Training steps
+train_steps=10
+conf_path=""
+server_index=""
+############ Debug/maintenance parameters ##############
+precision_mode="allow_mix_precision"
+# Fixed parameters; nothing below needs to be modified
+over_dump=False
+if [[ $over_dump == True ]];then
+    over_dump_path=$cur_path/overflow_dump # cur_path here is the code root directory
+    mkdir -p ${over_dump_path}
+fi
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+use_mixlist=True
+mixlist_file="../configs/ops_info.json"
+fusion_off_flag=False
+fusion_off_file="../configs/fusion_switch.cfg"
+############ Debug/maintenance parameters ##############
+
+# Parameter configuration
+data_path=""
+ckpt_path=""
+model_name=""
+
+if [[ $1 == --help || $1 == -h || $1 == --h ]];then
+    echo "usage: ./train_performance_16p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data of training
+    --ckpt_path              source ckpt of training
+    --server_index           0-based index of this server; selects ranks 8*index .. 8*index+7
+    --conf_path              directory handed to set_ranktable.py as --conf_path
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+############## Run training ##########
+cd $cur_path
+if [ -d $cur_path/output ];then
+    rm -rf $cur_path/output/*
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=${para#*=}
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --use_mixlist* ]];then
+        use_mixlist=${para#*=}
+    elif [[ $para == --mixlist_file* ]];then
+        mixlist_file=${para#*=}
+    elif [[ $para == --fusion_off_flag* ]];then
+        fusion_off_flag=${para#*=}
+    elif [[ $para == --fusion_off_file* ]];then
+        fusion_off_file=${para#*=}
+    elif [[ $para == --server_index* ]];then
+        server_index=${para#*=}
+    elif [[ $para == --conf_path* ]];then
+        conf_path=${para#*=}
+    fi
+done
+
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+elif [[ $ckpt_path == "" ]];then
+    echo "[Error] para \"ckpt_path\" must be config"
+    exit 1
+elif [[ $server_index == "" ]];then
+    echo "[Error] para \"server_index\" must be config"
+    exit 1
+elif [[ $conf_path == "" ]];then
+    echo "[Error] para \"conf_path\" must be config"
+    exit 1
+fi
+rank_size=8
+#export RANK_TABLE_FILE=$cur_path/../scripts/rank_table_16p.json
+nohup python3 $cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path || { echo "[Error] failed to generate rank_table.json"; exit 1; }
+export RANK_TABLE_FILE=${cur_path}/rank_table.json
+export HCCL_CONNECT_TIMEOUT=600
+export RANK_INDEX=0
+RANK_ID_START=0
+RANK_SIZE=16
+
+
+
+
+
+
+
+start=$(date +%s)
+
+RANK_ID_START=0
+for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++));
+do
+    # Set environment variables
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$((RANK_ID - rank_size*server_index)) # local device id = rank offset within this server
+    export DEVICE_ID=${ASCEND_DEVICE_ID}
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID
+    fi
+
+    python3 ../swintransformer/train_main_16p.py \
+        --data_path=${data_path} \
+        --ckpt_path=${ckpt_path} \
+        --model_name="swin_large_224" \
+        --epochs=${epochs} \
+        --train_steps=${train_steps} \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path=${data_dump_path} \
+        --profiling=${profiling} \
+        --use_mixlist=${use_mixlist} \
+        --fusion_off_flag=${fusion_off_flag} \
+        --mixlist_file=${mixlist_file} \
+        --fusion_off_file=${fusion_off_file} \
+        --profiling_dump_path=${profiling_dump_path} \
+        --batch_size 16 \
+        --nproc_per_node=16 >$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+### The fields below are used for smoke-test monitoring
+BatchSize=${batch_size}
+# Device type, detected automatically
+DeviceType=`uname -m`
+# Case name, derived automatically
+CaseName=${Network}_bs${BatchSize}_${RankSize}'p'_'perf'
+
+echo "------------------ Final result ------------------"
+# Output performance FPS; review and adjust per model
+cp -f $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak
+sed -i 's/\x0d/\n/g' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak
+single_batch_step_sec=`grep ms/step $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak | awk 'END {print $5}' | awk -F 'm' '{print $1}'`
+
+FPS=`echo ${single_batch_step_sec} ${batch_size} | awk '{print $2 * 1000 / $1}'`
+
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+grep "ms/step" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log.bak|awk '{print$8}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+# Output training accuracy; review and adjust per model
+grep "f1 score" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_acc.txt
+train_accuracy=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_acc.txt |awk '{print $3}' |awk -F "," '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2etime"
+
+## Collect performance data
+# Throughput; no need to modify
+ActualFPS=${FPS}
+# Training time per iteration; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RankSize}'*1000/'${FPS}'}'`
+
+## Collect error information
+# System error message
+error_msg="the shape of grad must equal with var"
+# Check whether the error message matches the known historical state; no need to modify
+Status=`grep "${error_msg}" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | wc -l`
+# Failure stage; one of 图准备FAIL/图拆分FAIL/图优化FAIL/图编译FAIL/图执行FAIL/流程OK
+ModelStatus="流程OK"
+# DTS ticket number or issue link
+DTS_Number=""
+
+# Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RankSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
diff --git a/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_8p.sh b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_8p.sh new file mode 100644 index
0000000000000000000000000000000000000000..9ea83e280f03da2f33cd0f83f99a548c7812677b
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/Swin-Transformer_ID2412_for_TensorFlow2.X/test/train_performance_8p.sh
@@ -0,0 +1,213 @@
+#!/bin/bash
+# Single-server 8-device performance run for Swin-Transformer on Ascend NPUs.
+# Starts one background training process per device (RANK_ID 0..RANK_SIZE-1),
+# waits for all of them, then parses the last device's log into
+# output/<device>/${CaseName}.log.
+cur_path=$(pwd)
+
+# Print failing cases to the screen
+export RANK_SIZE=8
+export JOB_ID=10087
+export RANK_TABLE_FILE=8p.json
+# Basic parameters; review and adjust per model
+# Network name, same as the directory name
+Network="Swin-Transformer_ID2412_for_TensorFlow2.X"
+# Number of devices; defaults to 1 for a single card
+RankSize=1
+# Batch size
+batch_size=128
+# Training epochs (optional)
+epochs=3
+# Training steps
+train_steps=10
+
+############ Debug/maintenance parameters ##############
+precision_mode="allow_mix_precision"
+# Fixed parameters; nothing below needs to be modified
+over_dump=False
+if [[ $over_dump == True ]];then
+    over_dump_path=$cur_path/overflow_dump # cur_path here is the code root directory
+    mkdir -p ${over_dump_path}
+fi
+data_dump_flag=False
+data_dump_step="10"
+profiling=False
+use_mixlist=True
+mixlist_file="../configs/ops_info.json"
+fusion_off_flag=False
+fusion_off_file="../configs/fusion_switch.cfg"
+############ Debug/maintenance parameters ##############
+
+# Parameter configuration
+data_path="/npu/traindata/dataset_swin/"
+ckpt_path="/npu/traindata/pretrain/"
+model_name=""
+
+if [[ $1 == --help || $1 == -h || $1 == --h ]];then
+    echo "usage: ./train_performance_8p.sh <args>"
+    echo " "
+    echo "parameter explain:
+    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
+    --over_dump              if or not over detection, default is False
+    --data_dump_flag         data dump flag, default is False
+    --data_dump_step         data dump step, default is 10
+    --profiling              if or not profiling for performance debug, default is False
+    --autotune               whether to enable autotune, default is False
+    --data_path              source data of training
+    --ckpt_path              source ckpt of training
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+############## Run training ##########
+cd $cur_path
+if [ -d $cur_path/output ];then
+    rm -rf $cur_path/output/*
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+wait
+
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --ckpt_path* ]];then
+        ckpt_path=${para#*=}
+    elif [[ $para == --precision_mode* ]];then
+        precision_mode=${para#*=}
+    elif [[ $para == --over_dump* ]];then
+        over_dump=${para#*=}
+        over_dump_path=${cur_path}/output/overflow_dump
+        mkdir -p ${over_dump_path}
+    elif [[ $para == --data_dump_flag* ]];then
+        data_dump_flag=${para#*=}
+        data_dump_path=${cur_path}/output/data_dump
+        mkdir -p ${data_dump_path}
+    elif [[ $para == --data_dump_step* ]];then
+        data_dump_step=${para#*=}
+    elif [[ $para == --profiling* ]];then
+        profiling=${para#*=}
+        profiling_dump_path=${cur_path}/output/profiling
+        mkdir -p ${profiling_dump_path}
+    elif [[ $para == --use_mixlist* ]];then
+        use_mixlist=${para#*=}
+    elif [[ $para == --mixlist_file* ]];then
+        mixlist_file=${para#*=}
+    elif [[ $para == --fusion_off_flag* ]];then
+        fusion_off_flag=${para#*=}
+    elif [[ $para == --fusion_off_file* ]];then
+        fusion_off_file=${para#*=}
+    fi
+done
+
+
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+elif [[ $ckpt_path == "" ]];then
+    echo "[Error] para \"ckpt_path\" must be config"
+    exit 1
+fi
+
+
+start=$(date +%s)
+
+RANK_ID_START=0
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+    export RANK_ID=$RANK_ID
+    export ASCEND_DEVICE_ID=$RANK_ID
+    echo "Device ID: $ASCEND_DEVICE_ID"
+    if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+        rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID
+    else
+        mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID
+    fi
+
+    python3 ../swintransformer/train_main_16p.py \
+        --data_path=${data_path} \
+        --ckpt_path=${ckpt_path} \
+        --model_name="swin_large_224" \
+        --epochs=${epochs} \
+        --train_steps=${train_steps} \
+        --precision_mode=${precision_mode} \
+        --over_dump=${over_dump} \
+        --over_dump_path=${over_dump_path} \
+        --data_dump_flag=${data_dump_flag} \
+        --data_dump_step=${data_dump_step} \
+        --data_dump_path=${data_dump_path} \
+        --profiling=${profiling} \
+        --use_mixlist=${use_mixlist} \
+        --fusion_off_flag=${fusion_off_flag} \
+        --mixlist_file=${mixlist_file} \
+        --fusion_off_file=${fusion_off_file} \
+        --profiling_dump_path=${profiling_dump_path} \
+        --batch_size 16 \
+        --nproc_per_node=8 >$cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 &
+done
+wait
+end=$(date +%s)
+e2etime=$(( $end - $start ))
+
+#echo "Final Performance ms/step : $average_perf"
+echo "Final Training Duration sec : $e2etime"
+
+### The fields below are used for smoke-test monitoring
+BatchSize=${batch_size}
+# Device type, detected automatically
+DeviceType=`uname -m`
+# Case name, derived automatically
+CaseName=${Network}_bs${BatchSize}_${RankSize}'p'_'perf'
+
+echo "------------------ Final result ------------------"
+# Output performance FPS; review and adjust per model
+cp -f $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak
+sed -i 's/\x0d/\n/g' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak
+single_batch_step_sec=`grep ms/step $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.bak | awk 'END {print $5}' | awk -F 'm' '{print $1}'`
+
+FPS=`echo ${single_batch_step_sec} ${batch_size} | awk '{print $2 * 1000 / $1}'`
+
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+grep "ms/step" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log.bak|awk '{print$8}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+# Output training accuracy; review and adjust per model
+grep "f1 score" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_acc.txt
+train_accuracy=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_acc.txt |awk '{print $3}' |awk -F "," '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2etime"
+
+## Collect performance data
+# Throughput; no need to modify
+ActualFPS=${FPS}
+# Training time per iteration; no need to modify
+TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${BatchSize}'*'${RankSize}'*1000/'${FPS}'}'`
+
+## Collect error information
+# System error message
+error_msg="the shape of grad must equal with var"
+# Check whether the error message matches the known historical state; no need to modify
+Status=`grep "${error_msg}" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | wc -l`
+# Failure stage; one of 图准备FAIL/图拆分FAIL/图优化FAIL/图编译FAIL/图执行FAIL/流程OK
+ModelStatus="流程OK"
+# DTS ticket number or issue link
+DTS_Number=""
+
+# Write the key information to ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RankSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2etime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+