diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/.gitignore b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/YOLOv5_ID1719_for_TensorFlow2.X.iml b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/YOLOv5_ID1719_for_TensorFlow2.X.iml new file mode 100644 index 0000000000000000000000000000000000000000..8e5446ac9594d6e198c2a2923123566d13b94bf9 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/YOLOv5_ID1719_for_TensorFlow2.X.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/inspectionProfiles/profiles_settings.xml b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/misc.xml b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..c456a360d0a3da01f06efa38811b6106fd900e56 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git 
a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/modules.xml b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..ee76a2b335f986f7d0460c5b6524094fc9804def --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/core/dataset.py b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/core/dataset.py index e7f0ad5784f820e1cfdab7f8106d5094341be8d4..c8348d839403e115a8ed22766a18f3af4a274acd 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/core/dataset.py +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/core/dataset.py @@ -63,7 +63,7 @@ class Dataset(object): self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE self.max_bbox_per_scale = 450 - self.annotations = self.load_annotations() + self.annotations = self.load_annotations(FLAGS) self.num_samples = len(self.annotations) self.num_batchs = int(np.ceil(self.num_samples / self.batch_size)) self.batch_count = 0 @@ -71,7 +71,7 @@ class Dataset(object): self._data_buff = dict() - def load_annotations(self): + def load_annotations(self,FLAGS): with open(self.annot_path, "r") as f: txt = f.readlines() if self.dataset_type == "converted_coco": @@ -106,6 +106,12 @@ class Dataset(object): annotations.append(image_path + string) np.random.shuffle(annotations) + + # shard + if FLAGS.rank_size > 1: + len_annotations = len(annotations) + annotations = annotations[int(len_annotations//int(FLAGS.rank_size))*FLAGS.rank:int(len_annotations//int(FLAGS.rank_size))*(FLAGS.rank+1)] + return annotations def __iter__(self): diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/set_ranktable.py 
b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/set_ranktable.py
new file mode 100644
index 0000000000000000000000000000000000000000..641b0b8b5157989ca002a150c68fb418075a7b51
--- /dev/null
+++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/set_ranktable.py
@@ -0,0 +1,106 @@
+"""Generate the rank table (rank_table.json) for multi-server training.
+
+Every server is described by one file named ``server_<server id>_<index>.info``
+inside ``--conf_path``; the file lists the device IPs on lines containing
+``address_0`` .. ``address_7``. Ranks are numbered ``index * 8 + device``.
+"""
+import argparse
+import json
+import os
+import re
+
+# Number of devices described by each server info file.
+DEVICES_PER_SERVER = 8
+# server_<server id>_<index>.info; the id is the text between the first two "_".
+CONF_NAME = re.compile(r"server_(?P<id>[^_]*)_(?:.*_)?(?P<index>0|[1-9][0-9]*)\.info")
+
+
+def find_server_confs(conf_path):
+    """Return {server index: [(server id, file name), ...]} found in conf_path."""
+    confs = {}
+    if not os.path.isdir(conf_path):
+        return confs
+    for name in sorted(os.listdir(conf_path)):
+        match = CONF_NAME.fullmatch(name)
+        if match is not None:
+            entry = (match.group("id"), name)
+            confs.setdefault(int(match.group("index")), []).append(entry)
+    return confs
+
+
+def read_device_ips(info_file):
+    """Return the DEVICES_PER_SERVER device IPs listed in one server info file.
+
+    The IP is the text after the last "=" on the line containing ``address_<n>``.
+    Raises ValueError unless every device has exactly one such line, because a
+    missing or repeated entry would shift every following rank.
+    """
+    with open(info_file, "r") as handle:
+        lines = handle.readlines()
+    ips = []
+    for device in range(DEVICES_PER_SERVER):
+        key = "address_{}".format(device)
+        # strip() rather than [:-1]: the last line may lack a trailing newline.
+        found = [line.split("=")[-1].strip() for line in lines if key in line]
+        if len(found) != 1:
+            raise ValueError("{}: expected one {} entry, found {}".format(
+                info_file, key, len(found)))
+        ips.append(found[0])
+    return ips
+
+
+def build_rank_table(server_count, conf_path):
+    """Build the rank table dict for servers 0 .. server_count - 1.
+
+    Raises ValueError when server_count < 1 or when a server index does not have
+    exactly one matching info file in conf_path.
+    """
+    if server_count < 1:
+        raise ValueError("npu_nums must be at least 1, got {}".format(server_count))
+    confs = find_server_confs(conf_path)
+    server_list = []
+    for index in range(server_count):
+        candidates = confs.get(index, [])
+        if len(candidates) != 1:
+            raise ValueError("expected one server_*_{}.info in {}, found {}".format(
+                index, conf_path, len(candidates)))
+        server_id, name = candidates[0]
+        ips = read_device_ips(os.path.join(conf_path, name))
+        first_rank = index * DEVICES_PER_SERVER
+        server_list.append({
+            "server_id": server_id,
+            "device": [
+                {
+                    "device_id": str(device),
+                    "device_ip": ip,
+                    "rank_id": str(first_rank + device),
+                }
+                for device, ip in enumerate(ips)
+            ],
+        })
+    return {
+        "server_count": str(server_count),
+        "server_list": server_list,
+        "status": "completed",
+        "version": "1.0",
+    }
+
+
+def main():
+    """Parse the command line and write rank_table.json to the current directory."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-n', '--npu_nums', type=int, default=2,
+                        help='number of servers (8 devices each)')
+    parser.add_argument('-c', '--conf_path', type=str, default='./',
+                        help='the path of server_info')
+    flags = parser.parse_args()
+    try:
+        rank = build_rank_table(flags.npu_nums, flags.conf_path)
+    except ValueError as err:
+        parser.error(str(err))
+    with open('rank_table.json', 'w') as f:
+        json.dump(rank, f)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh new file mode 100644 index 0000000000000000000000000000000000000000..d2221c8d32cc1cd04290bed70be4c27198c4d7a2 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_16p.sh @@ -0,0 +1,225 @@ +#!/bin/bash + +#当前路径,不需要修改 +cur_path=`pwd` + +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + -h/--help show help message + " + exit 1 +fi + +#参数校验,不需要修改 +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == 
--data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --autotune* ]];then + autotune=`echo ${para#*=}` + mv $install_path/fwkacllib/data/rl/Ascend910/custom $install_path/fwkacllib/data/rl/Ascend910/custom_bak + mv $install_path/fwkacllib/data/tiling/Ascend910/custom $install_path/fwkacllib/data/tiling/Ascend910/custom_bak + autotune_dump_path=${cur_path}/output/autotune_dump + mkdir -p ${autotune_dump_path}/GA + mkdir -p ${autotune_dump_path}/rl + cp -rf $install_path/fwkacllib/data/tiling/Ascend910/custom ${autotune_dump_path}/GA/ + cp -rf $install_path/fwkacllib/data/rl/Ascend910/custom ${autotune_dump_path}/RL/ + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]];then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + elif [[ $para == --server_index* ]];then + server_index=`echo ${para#*=}` + elif [[ $para == --conf_path* ]];then + conf_path=`echo ${para#*=}` + fi +done + +#export ASCEND_SLOG_PRINT_TO_STDOUT=1 +export RANK_SIZE=16 +export JOB_ID=10087 +rank_size=8 +nohup python3 $cur_path/set_ranktable.py --npu_nums=$((RANK_SIZE/rank_size)) --conf_path=$conf_path +export RANK_TABLE_FILE=${cur_path}/rank_table.json +export HCCL_CONNECT_TIMEOUT=600 +RANK_ID_START=0 +RANK_SIZE=16 +# 数据集路径,保持为空,不需要修改 +data_path="/npu/traindata/COCO2017" + +anno_converted='/npu/traindata/COCO2017/val2017.txt' +gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json' + +#屏蔽TF2.4升级到TF2.6图差异带来的性能下降 +export NPU_EXECUTE_OP_BY_ACL=false + +#设置默认日志级别,不需要修改 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +#基础参数 需要模型审视修改 +#网络名称,同目录名称 +Network="YOLOv5_ID1719_for_TensorFlow2.X" + +# 训练epoch 
+stage1_epoch=0 +stage2_epoch=1 + +# Training batch size +batch_size=8 + +train_worker_num=8 + +# TF2.X only; no need to modify +export NPU_LOOPSIZE=1 + +# Precision mode (default only; a value parsed from --precision_mode is kept) +precision_mode=${precision_mode:-allow_mix_precision} +# Maintenance parameters: defaults only, so values parsed from the command line earlier in this script are not overwritten +over_dump=${over_dump:-False} +over_dump_path=${over_dump_path:-} +data_dump_flag=${data_dump_flag:-False} +data_dump_path=${data_dump_path:-} +data_dump_step=${data_dump_step:-1} +profiling=${profiling:-False} +autotune=${autotune:-False} +perf=20 + + +# Check that data_path is set; no need to modify +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be specified" + exit 1 +fi + +# Training start time; no need to modify +start_time=$(date +%s) +bind_core=1 +# Launch one training process per local device; review and adjust per model +#for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +for((RANK_ID=$((rank_size*server_index));RANK_ID<$((((server_index+1))*rank_size));RANK_ID++)); +do + # Set environment variables; no need to modify + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + ASCEND_DEVICE_ID=`expr ${RANK_ID} - $((rank_size*server_index))` + export DEVICE_ID=${ASCEND_DEVICE_ID} + #echo 'DEVICE_ID: '$ASCEND_DEVICE_ID + RANK_ID_core=$RANK_ID + + # Create the per-DeviceID output directory; no need to modify + if [ -d ${cur_path}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${cur_path}/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + cd ${cur_path}/output/$ASCEND_DEVICE_ID/ + # Map the global rank onto a local core slot (0-7) for CPU binding; review and adjust per model + if [ ${RANK_ID_core} -gt 7 ];then + RANK_ID_core=$((RANK_ID_core-8)) + fi + + #echo 'RANK_ID_core is: '$RANK_ID_core + + # Run the training script; review and adjust per model + corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` + let a=RANK_ID_core*${corenum}/8 + let b=RANK_ID_core+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + #${bind_core} python3 ../../../train.py --weights='' \ + nohup ${bind_core} python3 ../../../train.py --weights='' \ + --perf=$perf \ + --model=yolov5m \ + --rank=${RANK_ID} \ + --rank_size=${RANK_SIZE} \ + --train_worker_num=${train_worker_num} \ + --data_path=${data_path} \ + --anno_converted=${anno_converted} \ + 
--gt_anno_path=${gt_anno_path} \ + --batch_size=${batch_size} \ + --precision_mode=${precision_mode} \ + --stage1_epoch=${stage1_epoch} \ + --stage2_epoch=${stage2_epoch} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +# Training end time; no need to modify +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# Report performance (FPS); review and adjust per model +epoch_duration=`grep epoch_duration $cur_path/output/0/train_0.log | awk '{print $2}'` +first_step=`grep duration: $cur_path/output/0/train_0.log |head -1| awk 'END{print $18}'` +FPS=`awk 'BEGIN{printf "%.2f\n",('$perf'+'$train_worker_num'-2)/('$epoch_duration'-'$first_step')*'$batch_size'*16}'` +echo "Final Performance imgs/sec : $FPS" + +# Training accuracy, extracted by keyword from train_$ASCEND_DEVICE_ID.log; review and adjust per model +# li=`cat $cur_path/output/0/train_0.log | wc -l` +# num=$(($li - 1)) +# train_accuracy=`sed -n "${num}p" $cur_path/output/0/train_0.log | awk '{print $3}'` +# echo "Final Train Accuracy : ${train_accuracy}" +# End-to-end training duration, computed directly; no need to modify +echo "E2E training Duration sec: $e2e_time" + +# Test case information; no need to modify +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + +## Collect performance data; no need to modify +# Throughput +ActualFPS=${FPS} +# Training time per iteration +TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'` + +## Extract the loss by keyword from train_*.log; review per model +grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# Loss value of the last iteration; no need to modify +ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'` + +# Write the key information to ${CaseName}.log; no need to modify +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo 
"BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +# echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime= ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((rank_size+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log # logs are stored per local device id (0..rank_size-1), not per global rank +done \ No newline at end of file diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_1p.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..c55b4030a46f8b78d10e2b27e1d9f8fb3324b609 --- /dev/null +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_1p.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# Current path; no need to modify +cur_path=`pwd` + +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +# Collective communication parameters; no need to modify +# Make sure the rank table file rank_table_8p.json is located in the configs directory next to test +export RANK_SIZE=1 +#export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json +export JOB_ID=10087 +RANK_ID_START=0 +RANK_SIZE=1 + +# Dataset path; keep empty here (it is supplied via --data_path), no need to modify +data_path="" + +anno_converted='/npu/traindata/COCO2017/val2017.txt' +gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json' + +# Work around the performance drop caused by graph differences between TF2.4 and TF2.6 +#export NPU_EXECUTE_OP_BY_ACL=false + +# Set the default log level; no need to modify +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 + +# Basic parameters; review and adjust per model 
+# Network name, same as the directory name +Network="YOLOv5_ID1719_for_TensorFlow2.X" + +# Training epochs +stage1_epoch=0 +stage2_epoch=1 + +# Training batch size +batch_size=8 + +train_worker_num=8 + +# TF2.X only; no need to modify +export NPU_LOOPSIZE=1 + +# Precision mode (default; overridden by --precision_mode below) +precision_mode='allow_mix_precision' +# Maintenance parameters (defaults; overridden by the argument loop below); no need to modify +over_dump=False +over_dump_path='' +data_dump_flag=False +data_dump_path='' +data_dump_step="1" +profiling=False +autotune=False +perf=20 + +# Help message; no need to modify +if [[ $1 == --help || $1 == -h ]];then + echo "usage:./train_performance_1p.sh " + echo " " + echo "parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 1 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message + " + exit 1 +fi + +# Parse the --key=value command-line arguments; no need to modify +for para in $* +do + if [[ $para == --precision_mode* ]];then + precision_mode=`echo ${para#*=}` + elif [[ $para == --over_dump* ]];then + over_dump=`echo ${para#*=}` + over_dump_path=${cur_path}/output/overflow_dump + mkdir -p ${over_dump_path} + elif [[ $para == --data_dump_flag* ]];then + data_dump_flag=`echo ${para#*=}` + data_dump_path=${cur_path}/output/data_dump + mkdir -p ${data_dump_path} + elif [[ $para == --data_dump_step* ]];then + data_dump_step=`echo ${para#*=}` + elif [[ $para == --profiling* ]];then + profiling=`echo ${para#*=}` + profiling_dump_path=${cur_path}/output/profiling + mkdir -p ${profiling_dump_path} + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --bind_core* ]]; then + bind_core=`echo ${para#*=}` + name_bind="_bindcore" + fi +done + +# Check that data_path was provided; no need to modify +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be specified" + exit 1 +fi + +# Training start time; no need to modify +start_time=$(date +%s) +bind_core=1 +# Launch the training processes; review and adjust per model 
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + # Set environment variables; no need to modify + echo "Device ID: $RANK_ID" + export RANK_ID=$RANK_ID + export ASCEND_DEVICE_ID=$RANK_ID + ASCEND_DEVICE_ID=$RANK_ID + + # Create the per-DeviceID output directory (any previous one is removed first); no need to modify + if [ -d ${cur_path}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${cur_path}/output/$ASCEND_DEVICE_ID + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + else + mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt + fi + cd ${cur_path}/output/$ASCEND_DEVICE_ID/ + # Run the training script, pinned with taskset to this rank's 1/8 share of the CPU cores; review and adjust per model + corenum=`cat /proc/cpuinfo |grep 'processor' |wc -l` + let a=RANK_ID*${corenum}/8 + let b=RANK_ID+1 + let c=b*${corenum}/8-1 + if [ "x${bind_core}" != x ];then + bind_core="taskset -c $a-$c" + fi + #${bind_core} python3 ../../../train.py --weights='' \ + nohup ${bind_core} python3 ../../../train.py --weights='' \ + --perf=$perf \ + --model=yolov5m \ + --rank=${RANK_ID} \ + --rank_size=${RANK_SIZE} \ + --train_worker_num=${train_worker_num} \ + --data_path=${data_path} \ + --anno_converted=${anno_converted} \ + --gt_anno_path=${gt_anno_path} \ + --batch_size=${batch_size} \ + --precision_mode=${precision_mode} \ + --stage1_epoch=${stage1_epoch} \ + --stage2_epoch=${stage2_epoch} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done +wait + +# Training end time; no need to modify +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# Report performance (FPS), parsed from the device 0 training log; review and adjust per model +epoch_duration=`grep epoch_duration $cur_path/output/0/train_0.log | awk '{print $2}'` +first_step=`grep duration: $cur_path/output/0/train_0.log |head -1| awk 'END{print $17}'` +FPS=`awk 'BEGIN{printf "%.2f\n",('$perf'+'$train_worker_num'-2)/('$epoch_duration'-'$first_step')*'$batch_size'*1}'` +echo "Final Performance imgs/sec : $FPS" + 
+# Training accuracy, extracted by keyword from train_$ASCEND_DEVICE_ID.log; review and adjust per model +# li=`cat $cur_path/output/0/train_0.log | wc -l` +# num=$(($li - 1)) +# train_accuracy=`sed -n "${num}p" $cur_path/output/0/train_0.log | awk '{print $3}'` +# echo "Final Train Accuracy : ${train_accuracy}" +# End-to-end training duration, computed directly; no need to modify +echo "E2E training Duration sec: $e2e_time" + +# Test case information; no need to modify +DeviceType=`uname -m` +CaseName=${Network}${name_bind}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + +## Collect performance data; no need to modify +# Throughput +ActualFPS=${FPS} +# Training time per iteration +TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'` + +## Extract the loss by keyword from train_*.log; review per model +grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +# Loss value of the last iteration; no need to modify +ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'` + +# Write the key information to ${CaseName}.log; no need to modify +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +# echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime= ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); +do + sed -i "/AttributeError/d" $cur_path/output/${RANK_ID}/train_${RANK_ID}.log +done \ No newline at end of file 
diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_8p.sh b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_8p.sh index 744bdd6505d098848a7384dfdacf4d93bc2ae255..6f24266d5bf266bfd778043d527358be8ab81edf 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_8p.sh +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/test/train_performance_8p.sh @@ -11,7 +11,7 @@ export RANK_SIZE=8 export RANK_TABLE_FILE=${cur_path}/../configs/rank_table_8p.json export JOB_ID=10087 RANK_ID_START=0 - +RANK_SIZE=8 # 数据集路径,保持为空,不需要修改 data_path="" @@ -22,7 +22,7 @@ gt_anno_path='/npu/traindata/COCO2017/annotations/instances_val2017.json' export NPU_EXECUTE_OP_BY_ACL=false #设置默认日志级别,不需要修改 -export ASCEND_GLOBAL_LOG_LEVEL=3 +export ASCEND_GLOBAL_LOG_LEVEL_ETP=3 #基础参数 需要模型审视修改 #网络名称,同目录名称 @@ -103,7 +103,7 @@ fi #训练开始时间,不需要修改 start_time=$(date +%s) - +bind_core=1 #进入训练脚本目录,需要模型审视修改 for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); do @@ -129,22 +129,24 @@ do if [ "x${bind_core}" != x ];then bind_core="taskset -c $a-$c" fi - ${bind_core} python3 ../../../train.py --weights='' \ - --perf=$perf \ - --model=yolov5m \ - --rank=${RANK_ID} \ - --train_worker_num=${train_worker_num} \ - --data_path=${data_path} \ - --anno_converted=${anno_converted} \ - --gt_anno_path=${gt_anno_path} \ - --batch_size=${batch_size} \ - --precision_mode=${precision_mode} \ - --stage1_epoch=${stage1_epoch} \ - --stage2_epoch=${stage2_epoch} \ - --over_dump=${over_dump} \ - --over_dump_path=${over_dump_path} \ - --data_dump_flag=${data_dump_flag} \ - --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + #${bind_core} python3 ../../../train.py --weights='' \ + nohup ${bind_core} python3 ../../../train.py --weights='' \ + --perf=$perf \ + --model=yolov5m \ + --rank=${RANK_ID} \ + 
--rank_size=${RANK_SIZE} \ + --train_worker_num=${train_worker_num} \ + --data_path=${data_path} \ + --anno_converted=${anno_converted} \ + --gt_anno_path=${gt_anno_path} \ + --batch_size=${batch_size} \ + --precision_mode=${precision_mode} \ + --stage1_epoch=${stage1_epoch} \ + --stage2_epoch=${stage2_epoch} \ + --over_dump=${over_dump} \ + --over_dump_path=${over_dump_path} \ + --data_dump_flag=${data_dump_flag} \ + --data_dump_step=${data_dump_step} > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & done wait @@ -178,7 +180,7 @@ ActualFPS=${FPS} TrainingTime=`awk 'BEGIN{printf "%.2f\n",('$epoch_duration'-'$first_step')/('$perf'+'$train_worker_num'-2)}'` ##获取Loss,通过train_*.log中关键字,需要根据模型审视 -grep loss $cur_path/output/0/train_0.log|awk '{print $13}' >> $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt +grep loss $cur_path/output/0/train_0.log|awk '{print $13}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt #最后一个迭代loss值,不需要修改 ActualLoss=`grep total_loss: $cur_path/output/0/train_0.log | awk 'END{print $13}'` diff --git a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/train.py b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/train.py index 9bb4994f498812b9b4f7440e8dd0d86f4ded8ad3..d17bc0cc806951471ebe9f14c236a55ae8a5c223 100644 --- a/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/train.py +++ b/TensorFlow2/built-in/cv/detection/YOLOv5_ID1719_for_TensorFlow2.X/train.py @@ -65,6 +65,7 @@ flags.DEFINE_integer('train_worker_num', 8, 'train worker num') flags.DEFINE_boolean('eval_only', False, 'skip train process') flags.DEFINE_boolean('mosaic', True, 'activate mosaic data augmentation') flags.DEFINE_integer('rank', 0, 'rank of current device') +flags.DEFINE_integer('rank_size', 1, 'rank size of current device') flags.DEFINE_integer('perf', 0, 'run steps for perf') tic = 0 e_tic = 0 @@ -169,8 +170,8 @@ def main(_argv): lr = cfg.TRAIN.LR_END + 0.5 * 
(cfg.TRAIN.LR_INIT - cfg.TRAIN.LR_END) * ((1 + tf.cos((global_steps - warmup_steps) / (total_steps - warmup_steps) * np.pi))) optimizer.lr.assign(lr.numpy()) ciou_loss, conf_loss, prob_loss, total_loss = train_execute(image_data, target) - if FLAGS.rank == 0: - print("=> STEP %4d/%4d lr: %.6f ciou_loss: %4.2f conf_loss: %4.2f prob_loss: %4.2f total_loss: %4.2f" % (global_steps, total_steps, optimizer.lr.numpy(), ciou_loss, conf_loss, prob_loss, total_loss), end='', flush=True) + # if FLAGS.rank == 0: + print("=> STEP %4d/%4d lr: %.6f ciou_loss: %4.2f conf_loss: %4.2f prob_loss: %4.2f total_loss: %4.2f" % (global_steps, total_steps, optimizer.lr.numpy(), ciou_loss, conf_loss, prob_loss, total_loss), end='', flush=True) global_steps.assign_add(1) if not FLAGS.eval_only: @@ -204,13 +205,13 @@ def main(_argv): break image_data, target, _, _, _, _ = fetcher.process_annotations(annotations) with mutex_sess_run: - if FLAGS.rank == 0: - rstart = time.time() + # if FLAGS.rank == 0: + rstart = time.time() train_step(image_data, target) - if FLAGS.rank == 0: - duration = time.time() - tic - print(' ,global_step/sec: %.2f ,duration: %.2f'%((1 / duration), duration), flush=True) - tic = time.time() + # if FLAGS.rank == 0: + duration = time.time() - tic + print(' ,global_step/sec: %.2f ,duration: %.2f'%((1 / duration), duration), flush=True) + tic = time.time() if FLAGS.perf and (FLAGS.perf < global_steps.numpy()): break threads = [] @@ -222,11 +223,11 @@ def main(_argv): for t in threads: t.join() - if FLAGS.rank == 0: - print('epoch_duration: %d'%(time.time() - e_tic), flush=True) - e_tic = time.time() - print('saving checkpoints', flush=True) - checkpoint.save(checkpoint_dir+'/model.ckpt') + # if FLAGS.rank == 0: + print('epoch_duration: %d'%(time.time() - e_tic), flush=True) + e_tic = time.time() + # print('saving checkpoints', flush=True) + # checkpoint.save(checkpoint_dir+'/model.ckpt') if not FLAGS.perf and FLAGS.rank == 0: evaluator = COCOevaluator(model, testset, 
cfg.TRAIN.INPUT_SIZE, NUM_CLASS, FLAGS) evaluator.evaluate()