diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/README.md b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/README.md index 7a20326fde0ec59d9d0c8ede43dd5546120420a1..6487b607580c69c94b4fc7f5236a6da09092b4ec 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/README.md +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/README.md @@ -48,8 +48,13 @@ │ ├── test.sh ----推理 │ ├── validate_gpu.sh ----gpu(v100)上验证模型精度 │ └── validate_npu.sh ----npu(modelarts)上验证模型精度 -│ ├── boot_modelarts.py -│ ├── help_modelarts.py +│ ├── test ----新模板(训练入口) +│ ├── train_full_1p.sh ----gpu(v100)上验证模型精度 +│ └── train_performance_1p.sh ----npu(modelarts)上验证模型精度 +│ ├── boot_modelarts.py(已过期,可用于旧版训练) +│ ├── help_modelarts.py(已过期,可用于旧版训练) +│ ├── modelarts_entry_acc.py ----训练启动文件 +│ ├── modelarts_entry_perf.py ----性能测试启动文件 │ ├── test_bsrn.py ----测试模型 │ ├── train.py ----训练模型 │ ├── output.txt ----训练输出(gpu训练自动生成) @@ -90,12 +95,8 @@ nohup bash scripts/run_gpu.sh > output.txt 2>&1 & ``` - 训练之前需修改`boot_modelarts.py`中第77行代码为bash_header = os.path.join(code_dir, 'scripts/run_gpu.sh') -### GPU离线推理 【在线推理待完善】 -命令行切换路径到`tf-bsrn-sr/`,执行以下命令,详细的参数设置请参考脚本中的注释 -```shell -bash scripts/test.sh -``` + ### GPU评估 命令行切换路径到`tf-bsrn-sr/`,执行以下命令,详细的参数设置请参考脚本中的注释 @@ -112,17 +113,19 @@ ModelArts的使用请参考[模型开发向导_昇腾CANN社区版(5.0.2.alpha00 配置方式请参考: +> 旧版,已废弃 + modelarts配置 +> 下图是使用新模板后的更新配置图: + +image-20220311145441757 + (修改`boot_modelarts.py`中第77行代码bash_header = os.path.join(code_dir, 'scripts/run_npu.sh'),可以设置在NPU上跑还是在GPU上跑) ### 指标对比 均使用相同的训练集以及测试集,训练参数都相同。 -NPU Checkpoints: ([百度云链接,提取码:xxxx]()) 【链接待完善】 - -GPU Checkpoints: ([百度云链接,提取码:xxxx]()) 【链接待完善】 - 作者论文中提供的各项指标值为: | | PSNR | SSIM | @@ -131,8 +134,8 @@ GPU Checkpoints: ([百度云链接,提取码:xxxx]()) 【 **(PSNR, SSIMscores for scale x4 on BSD100 dataset.)** +##### *×*4-scale BSRN model -##### *×*4-scale BSRN model 【bsrn gpu, npu指标 待完善】 @@ -148,16 +151,15 @@ GPU Checkpoints: ([百度云链接,提取码:xxxx]()) - - - - + + + +
metrics
BSRN 26.444 27.387 0.680 0.702
- ### 性能对比 展示bsrn模型在DIV2K 数据集上的训练性能 @@ -168,8 +170,14 @@ GPU Checkpoints: ([百度云链接,提取码:xxxx]()) 【 NPU性能log截图 +> 旧版训练截图 + NPU性能 +> 下图为使用新模板(1.15)后的训练截图 + +image-20220311144957252 + GPU性能log截图 NPU性能 @@ -179,27 +187,3 @@ GPU Checkpoints: ([百度云链接,提取码:xxxx]()) 【 | NPU | 8 | 0.739 | | GPU V100 | 8 | 0.828 | -#### 推理性能 【待完善】 - -NPU性能log截图 - -GPU性能log截图 - - - -| 平台 | BatchSize | 训练性能(fps) | -| :------: | :-------: | :-----------: | -| NPU | | | -| GPU V100 | | | - -#### 性能调优 【待完善】 - -##### NPU AutoTune性能 - -训练时开启AutoTune: - -npu训练性能(命令行截图) - -| 平台 | BatchSize | 训练性能(fps) | -| :--: | :-------: | :-----------: | -| NPU | | | diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/boot_modelarts.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/boot_modelarts.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/boot_modelarts.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/boot_modelarts.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/__init__.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__init__.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/__init__.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__init__.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/__init__.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f99b593115bb8fcca17250998a71d1b3285b87cf Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/__init__.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/base_loader.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/base_loader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4186db84b08a1f8bc1d4396ce4b3d20bdc7d939b Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/base_loader.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/basic_loader.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/basic_loader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41caff3a12fda12d1377aebc83b37177e4271303 Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/basic_loader.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/div2k_loader.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/div2k_loader.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7558655bba756c81eca3d0302b8a1fbcf296e583 Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/__pycache__/div2k_loader.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/base_loader.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/base_loader.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/base_loader.py rename to 
TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/base_loader.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/basic_loader.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/basic_loader.py similarity index 97% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/basic_loader.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/basic_loader.py index bd8e58c2bf40888d3018b9b7f2b493c949b159b4..1da40d23fe5299c2447eeacb70da8d04b3b9d051 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/basic_loader.py +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/basic_loader.py @@ -182,8 +182,9 @@ class BasicLoader(BaseLoader): has_cached = True if (image is None): - image_name_truth = image_name.split("x4")[0]+".png" #add - image_path = os.path.join(FLAGS.data_truth_path, image_name_truth) #add + # image_name_truth = image_name.split("x4")[0]+".png" #add + # image_path = os.path.join(FLAGS.data_truth_path, image_name_truth) #add + image_path = os.path.join(FLAGS.data_truth_path, image_name) image = self.tf_image_session.run(self.tf_image, feed_dict={self.tf_image_path:image_path}) if (FLAGS.data_cached and (not has_cached)): diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/div2k_loader.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/div2k_loader.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/dataloaders/div2k_loader.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/dataloaders/div2k_loader.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/help_modelarts.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/help_modelarts.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/help_modelarts.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/help_modelarts.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_acc.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_acc.py new file mode 100644 index 0000000000000000000000000000000000000000..1245d57140da14a640cd6dd7d9d43100b8617b67 --- /dev/null +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_acc.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import sys + +# 解析输入参数data_url +parser = argparse.ArgumentParser() +parser.add_argument("--data_url", type=str, default="/home/ma-user/modelarts/inputs/data_url_0") +parser.add_argument("--train_url", type=str, default="/home/ma-user/modelarts/outputs/train_url_0/") +config = parser.parse_args() + +print("[CANN-Modelzoo] code_dir path is [%s]" % (sys.path[0])) +code_dir = sys.path[0] +os.chdir(code_dir) +print("[CANN-Modelzoo] work_dir path is [%s]" % (os.getcwd())) + +print("[CANN-Modelzoo] before train - list my run files:") +os.system("ls -al /usr/local/Ascend/ascend-toolkit/") + +print("[CANN-Modelzoo] before train - list my dataset files:") +os.system("ls -al %s" % config.data_url) + +print("[CANN-Modelzoo] start run train shell") +# 设置sh文件格式为linux可执行 +os.system("dos2unix ./test/*") + +# 执行train_full_1p.sh或者train_performance_1p.sh,需要用户自己指定 +# full和performance的差异,performance只需要执行很少的step,控制在15分钟以内,主要关注性能FPS +os.system("bash ./test/train_full_1p.sh --data_path=%s --output_path=%s " % (config.data_url, config.train_url)) + +print("[CANN-Modelzoo] finish run train shell") + +# 将当前执行目录所有文件拷贝到obs的output进行备份 +print("[CANN-Modelzoo] after train - list my output files:") +os.system("cp -r %s %s " % (code_dir, config.train_url)) +os.system("ls -al %s" % config.train_url) diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_perf.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_perf.py new file mode 100644 index 0000000000000000000000000000000000000000..e2d23455d4cdec2d46fc273177a247905c751b73 --- /dev/null +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelarts_entry_perf.py @@ -0,0 +1,63 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import argparse +import sys + +# 解析输入参数data_url +parser = argparse.ArgumentParser() +parser.add_argument("--data_url", type=str, default="/home/ma-user/modelarts/inputs/data_url_0") +parser.add_argument("--train_url", type=str, default="/home/ma-user/modelarts/outputs/train_url_0/") +config = parser.parse_args() + +print("[CANN-Modelzoo] code_dir path is [%s]" % (sys.path[0])) +code_dir = sys.path[0] +os.chdir(code_dir) +print("[CANN-Modelzoo] work_dir path is [%s]" % (os.getcwd())) + +print("[CANN-Modelzoo] before train - list my run files:") +os.system("ls -al /usr/local/Ascend/ascend-toolkit/") + +print("[CANN-Modelzoo] before train - list my dataset files:") +os.system("ls -al %s" % config.data_url) + +print("[CANN-Modelzoo] start run train shell") +# 设置sh文件格式为linux可执行 +os.system("dos2unix ./test/*") + +# 执行train_full_1p.sh或者train_performance_1p.sh,需要用户自己指定 +# full和performance的差异,performance只需要执行很少的step,控制在15分钟以内,主要关注性能FPS +os.system("bash ./test/train_performance_1p.sh --data_path=%s --output_path=%s " % (config.data_url, config.train_url)) + +print("[CANN-Modelzoo] finish run train shell") + +# 将当前执行目录所有文件拷贝到obs的output进行备份 +print("[CANN-Modelzoo] after train - list my output files:") +os.system("cp -r %s %s " % (code_dir, config.train_url)) +os.system("ls -al %s" % config.train_url) diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/__init__.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__init__.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/__init__.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__init__.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/__init__.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..742959f176e0da72d53c0289e47c479742509df6 Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/base_model.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/base_model.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6309b9ba9523fdcc3a1fa7b1c3d714ec0bc0688a Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/base_model.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/bsrn.cpython-37.pyc b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/bsrn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16dcf3a95205e81362f636ad49ba243b3c278853 Binary files /dev/null and b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/__pycache__/bsrn.cpython-37.pyc differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/base_model.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/base_model.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/base_model.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/base_model.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/bsrn.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/bsrn.py similarity index 100% rename from 
TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/models/bsrn.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/models/bsrn.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelzoo_level.txt b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelzoo_level.txt index 7074c830f89620c714ac12871d0a6bb82c26344d..1a8f8652695550373b522c5012f226f703369be1 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelzoo_level.txt +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/modelzoo_level.txt @@ -1,5 +1,5 @@ FuncStatus:OK -PrecisionStatus:POK -PerfStatus:POK +PrecisionStatus:OK +PerfStatus:OK GPUStatus:OK -NPUMigrationStatus:POK \ No newline at end of file +NPUMigrationStatus:OK \ No newline at end of file diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_gpu.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_gpu.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_gpu.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_gpu.sh diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_npu.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_npu.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_npu.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_npu.sh diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_npu_restore.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_npu_restore.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/run_npu_restore.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/run_npu_restore.sh diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/test.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/test.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/test.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/test.sh diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/validate_gpu.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/validate_gpu.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/validate_gpu.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/validate_gpu.sh diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/validate_npu.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/validate_npu.sh similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/scripts/validate_npu.sh rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/scripts/validate_npu.sh diff --git "a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/GPU\346\200\247\350\203\275.jpg" "b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/GPU\346\200\247\350\203\275.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..772ea2b971585daa23a5831062d8b4783e86dee6 Binary files /dev/null and "b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/GPU\346\200\247\350\203\275.jpg" differ diff --git "a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/NPU\346\200\247\350\203\275.jpg" 
"b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/NPU\346\200\247\350\203\275.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..a526cbe98a7d00dce89931642cb9829e0c963e58 Binary files /dev/null and "b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/statics/NPU\346\200\247\350\203\275.jpg" differ diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_full_1p.sh index 5475b928f201b8960c2c099cd4c01af169b433c8..838e8c376a12b5835effef47737f95eb6c6e60b9 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_full_1p.sh @@ -1,173 +1,223 @@ #!/bin/bash -#当前路径,不需要修改 -cur_path=`pwd`/../ +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` -#集合通信参数,不需要修改 +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` export RANK_SIZE=1 +export RANK_ID=0 export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path='' -#预训练模型地址 -ckpt_path='' - -#设置默认日志级别,不需要改 -#export ASCEND_GLOBAL_LOG_LEVEL=3 -#export ASCEND_DEVICE_ID=4 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="BSRN_ID1296_for_TensorFlow" -#训练epoch -epochs=1 -#训练batch_size -batch_size=8 - -#TF2.X独有,需要模型审视修改 -export NPU_LOOP_SIZE=${train_steps} - -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False +# 路径参数初始化 +data_path="" +output_path="" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then echo"usage:./train_performance_1P.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - --ckpt_path model - -h/--help show help message + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message " exit 1 fi -#参数校验,不需要修改 +# 参数校验,不需要修改 for para in $* do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/test/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/test/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/test/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then + if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` - fi + elif [[ $para == --output_path* 
]];then + output_path=`echo ${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi done -# #校验是否传入data_path,不需要修改 -# if [[$data_path == ""]];then -# echo "[Error] para \"data_path\" must be confing" -# exit 1 -# fi -#训练开始时间,不需要修改 -start_time=$(date +%s) +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/tf-bsrn-sr +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" +fi -#创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt -else - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt +# 设置打屏日志文件名,请保留,文件名为${print_log} +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +modelarts_flag=${MODELARTS_MODEL_PATH} +if [ x"${modelarts_flag}" != x ]; +then + echo "running without etp..." + print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` + print_log="/home/ma-user/modelarts/log/${print_log_name}" fi +echo "### get your log here : ${print_log}" + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + else + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' + fi +} + +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} + +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +batch_size=8 -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -python3 train.py \ - --data_input_path=${data_path}/dataset/DIV2K/DIV2K_train_LR_bicubic\ - --data_truth_path=${data_path}/dataset/DIV2K/DIV2K_train_HR \ - --train_path='./train' \ +#if [ x"${modelarts_flag}" != x ]; +#then +# echo ${data_path} +# ls ${data_path} +# relative_path_LR="DIV2K/DIV2K_train_LR_bicubic" +# relative_path_HR="DIV2K/DIV2K_train_HR" +# +# python3.7 ./train.py \ +# --data_input_path=${data_path}${relative_path_LR} --data_truth_path=${data_path}${relative_path_HR} --train_path=${output_path} \ +# --chip='npu' \ +# --model='bsrn' \ +# --dataloader='div2k_loader' \ +# --batch_size=8 \ +# --max_steps=100000 \ +# --save_freq=1000 \ +# --scales='4' +#else +relative_path_LR="/dataset/DIV2K/DIV2K_train_LR_bicubic" +relative_path_HR="/dataset/DIV2K/DIV2K_train_HR" +python3.7 ./train.py \ + --data_input_path=${data_path}${relative_path_LR}\ + --data_truth_path=${data_path}${relative_path_HR} \ + --train_path=./checkpoints \ --chip='npu' \ --model='bsrn' \ --dataloader='div2k_loader' \ --batch_size=8 \ - --scales='4' \ - --max_steps=100000 
\ - --save_freq=10000 > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 -wait -python3 validate_bsrn.py \ - --dataloader=basic_loader \ - --data_input_path=${data_path}/dataset/BSD100/LR_bicubic \ - --data_truth_path=${data_path}/dataset/BSD100/original \ - --restore_path=./train/model.ckpt-100000 \ - --model=bsrn \ - --scales=4 \ - --save_path=./result/result-pictures \ - --chip='npu' > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log 2>&1 -#训练结束时间,不需要修改 + --max_steps=300000\ + --save_freq=10 \ + --scales='4' 1>${print_log} 2>&1 + +relative_path_LR="/dataset/BSD100/LR" +relative_path_HR="/dataset/BSD100/SR" +# after training, load the model to check the performance +relative_path_checkpoint='model.ckpt-300000' + +python3.7 ./validate_bsrn.py \ + --dataloader=basic_loader \ + --data_input_path=${data_path}${relative_path_LR} --data_truth_path=${data_path}${relative_path_HR} \ + --restore_path=./checkpoints/${relative_path_checkpoint} \ + --model=bsrn \ + --scales=4 \ + --save_path=./result-pictures 1>>${print_log} 2>&1 +#fi +cat ${print_log} +# 性能相关数据计算 +StepTime=`grep "sec/batch" ${print_log} | tail -n 20 | awk '{print $(NF-2)}' | awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` +# 精度相关数据计算 +PSNR=`grep "Final PSNR" ${print_log} | awk '{print $NF}'` +SSIM=`grep "Final SSIM" ${print_log} | awk '{print $NF}'` +# 提取所有loss打印信息 +grep "loss" ${print_log} | awk -F ":" '{print $4}'| grep "loss" |awk -F "," '{print $3}'|awk '{print $2}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt + + +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 判断本次执行是否正确使用Ascend NPU +use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." 
+ echo "------------------ INFO NOTICE END------------------" +fi + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -#结果打印,不需要修改 echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -TrainingTime=`grep 'fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $12}'` -FPS=`grep 'fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $14}'` -#打印,不需要修改 -echo "Final Performance TrainingTime : $TrainingTime" +# 输出性能FPS/单step耗时/端到端耗时 echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep -A1 RMSE: $cur_path/test/output/${ASCEND_DEVICE_ID}/test_${ASCEND_DEVICE_ID}.log|grep -v RMSE:|awk '{print $NF}'` - -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Performance sec/step : $StepTime" echo "E2E Training Duration sec : $e2e_time" -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${FPS}'/69}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'loss' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $10}' >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需修改 -echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log +# 输出训练精度 +echo "Final Train Accuracy : ${PSNR}" +echo "Final SSIM : ${SSIM}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_performance_1p.sh index f035bc5dd7d17cd0fc8ae05655c802b8179fa462..d8e210b0fcb0fc7f77767ea5b77efaeff8cc6089 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_performance_1p.sh +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test/train_performance_1p.sh @@ -1,165 +1,210 @@ #!/bin/bash -#当前路径,不需要修改 -cur_path=`pwd`/../ +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` -#集合通信参数,不需要修改 +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` export RANK_SIZE=1 +export RANK_ID=0 export JOB_ID=10087 -RANK_ID_START=0 - - -# 数据集路径,保持为空,不需要修改 -data_path='' -#预训练模型地址 -ckpt_path='' - -#设置默认日志级别,不需要改 -#export ASCEND_GLOBAL_LOG_LEVEL=3 -#export ASCEND_DEVICE_ID=4 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="BSRN_ID1296_for_TensorFlow" -#训练epoch -epochs=1 -#训练batch_size -batch_size=8 - - -#TF2.X独有,需要模型审视修改 -export NPU_LOOP_SIZE=${train_steps} -#维测参数,precision_mode需要模型审视修改 -precision_mode="allow_mix_precision" -#维持参数,以下不需要修改 -over_dump=False -data_dump_flag=False -data_dump_step="10" -profiling=False +# 路径参数初始化 +data_path="" +output_path="" # 帮助信息,不需要修改 if [[ $1 == --help || $1 == -h ]];then echo"usage:./train_performance_1P.sh " echo " " echo "parameter explain: - --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) - --over_dump if or not over detection, default is False - --data_dump_flag data dump flag, default is False - --data_dump_step data dump step, default is 10 - --profiling if or not profiling for performance debug, default is False - --data_path source data of training - --ckpt_path model - -h/--help show help message + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message " exit 1 fi -#参数校验,不需要修改 +# 参数校验,不需要修改 for para in $* do - if [[ $para == --precision_mode* ]];then - precision_mode=`echo ${para#*=}` - elif [[ $para == --over_dump* ]];then - over_dump=`echo ${para#*=}` - over_dump_path=${cur_path}/test/output/overflow_dump - mkdir -p ${over_dump_path} - elif [[ $para == --data_dump_flag* ]];then - data_dump_flag=`echo ${para#*=}` - data_dump_path=${cur_path}/test/output/data_dump - mkdir -p ${data_dump_path} - elif [[ $para == --data_dump_step* ]];then - data_dump_step=`echo ${para#*=}` - elif [[ $para == --profiling* ]];then - profiling=`echo ${para#*=}` - profiling_dump_path=${cur_path}/test/output/profiling - mkdir -p ${profiling_dump_path} - elif [[ $para == --data_path* ]];then + if [[ $para == --data_path* ]];then data_path=`echo ${para#*=}` - elif [[ $para == --ckpt_path* ]];then - ckpt_path=`echo ${para#*=}` - fi + elif [[ $para == --output_path* ]];then + output_path=`echo 
${para#*=}` + elif [[ $para == --train_steps* ]];then + train_steps=`echo ${para#*=}` + elif [[ $para == --train_epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + batch_size=`echo ${para#*=}` + fi done -# #校验是否传入data_path,不需要修改 -# if [[$data_path == ""]];then -# echo "[Error] para \"data_path\" must be confing" -# exit 1 -# fi -#训练开始时间,不需要修改 -start_time=$(date +%s) +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be config" + exit 1 +fi -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/tf-bsrn-sr +# 校验是否传入output_path,不需要修改 +if [[ $output_path == "" ]];then + output_path="./test/output/${ASCEND_DEVICE_ID}" +fi -#创建DeviceID输出目录,不需要修改 -if [ -d ${cur_path}/test/output/${ASCEND_DEVICE_ID} ];then - rm -rf ${cur_path}/test/output/${ASCEND_DEVICE_ID} - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt -else - mkdir -p ${cur_path}/test/output/$ASCEND_DEVICE_ID/ckpt +# 设置打屏日志文件名,请保留,文件名为${print_log} +print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" +modelarts_flag=${MODELARTS_MODEL_PATH} +if [ x"${modelarts_flag}" != x ]; +then + echo "running with modelarts..." + print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` + print_log="/home/ma-user/modelarts/log/${print_log_name}" fi +echo "### get your log here : ${print_log}" + +CaseName="" +function get_casename() +{ + if [ x"${perf_flag}" = x1 ]; + then + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' + else + CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' + fi +} + +# 跳转到code目录 +cd ${cur_path}/../ +rm -rf ./test/output/${ASCEND_DEVICE_ID} +mkdir -p ./test/output/${ASCEND_DEVICE_ID} + +# 训练开始时间记录,不需要修改 +start_time=$(date +%s) +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## + +#========================================================= +#========================================================= +#========训练执行命令,需要根据您的网络进行修改============== +#========================================================= +#========================================================= +# 基础参数,需要模型审视修改 +# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 +# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 +# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 +train_epochs=2 +train_steps=100 +batch_size=8 -#执行训练脚本,以下传参不需要修改,其他需要模型审视修改 -python3 train.py \ - --data_input_path=${data_path}/dataset/DIV2K/DIV2K_train_LR_bicubic\ - --data_truth_path=${data_path}/dataset/DIV2K/DIV2K_train_HR \ - --train_path='./train' \ +#if [ x"${modelarts_flag}" != x ]; +#then +# echo ${data_path} +# ls ${data_path} +# relative_path_LR="DIV2K/DIV2K_train_LR_bicubic" +# relative_path_HR="DIV2K/DIV2K_train_HR" +# python3.7 ./train.py \ +# --data_input_path=${data_path}${relative_path_LR} --data_truth_path=${data_path}${relative_path_HR} --train_path=${output_path} \ +# --chip='npu' \ +# --model='bsrn' \ +# --dataloader='div2k_loader' \ +# --batch_size=8 \ +# --max_steps=${train_steps} \ +# --save_freq=1000 \ +# --scales='4' +#else +relative_path_LR="/dataset/DIV2K/DIV2K_train_LR_bicubic" +relative_path_HR="/dataset/DIV2K/DIV2K_train_HR" +python3.7 ./train.py \ + --data_input_path=${data_path}${relative_path_LR} --data_truth_path=${data_path}${relative_path_HR} --train_path=${output_path} \ --chip='npu' \ --model='bsrn' \ --dataloader='div2k_loader' \ --batch_size=8 \ - --max_steps=100 \ - 
--save_freq=50 \ - --scales='4' > ${cur_path}/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 -wait + --max_steps=${train_steps} \ + --save_freq=1000 \ + --scales='4' 1>${print_log} 2>&1 -#训练结束时间,不需要修改 + + +# 性能相关数据计算 +StepTime=`grep "sec/batch" ${print_log} | tail -n 20 | awk '{print $(NF-2)}' | awk '{sum+=$1} END {print sum/NR}'` +FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'` + +# 精度相关数据计算 +PSNR=`grep "Final PSNR" ${print_log} | awk '{print $NF}'` +SSIM=`grep "Final SSIM" ${print_log} | awk '{print $NF}'` +# 提取所有loss打印信息 +grep "loss" ${print_log} | awk -F ":" '{print $4}'| grep "loss" |awk -F "," '{print $3}'|awk '{print $2}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt + + +########################################################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 判断本次执行是否正确使用Ascend NPU +use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." + echo "------------------ INFO NOTICE END------------------" +fi + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 end_time=$(date +%s) e2e_time=$(( $end_time - $start_time )) -#结果打印,不需要修改 echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -TrainingTime=`grep 'fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $12}'` -FPS=`grep 'fps' $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print $14}'` -#打印,不需要修改 -echo "Final Performance TrainingTime : $TrainingTime" +# 输出性能FPS/单step耗时/端到端耗时 echo "Final Performance images/sec : $FPS" - -#输出训练精度,需要模型审视修改 -#train_accuracy=`grep val_loss $cur_path/test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep step|grep -v ETA|awk 'END {print}'|awk '{print $11}'` - -#打印,不需要修改 -#echo "Final Train Accuracy : ${train_accuracy}" +echo "Final Performance sec/step : $StepTime" echo "E2E Training Duration sec : $e2e_time" -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} -#单迭代训练时长 -#TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${FPS}'/69}'` - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep 'loss' $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk '{print $10}' >> $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需修改 -echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo 
"BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -#echo "Accuracy = ${train_accuracy}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file +# 输出训练精度 +echo "Final Train Accuracy : ${PSNR}" +echo "Final SSIM : ${SSIM}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/test_bsrn.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test_bsrn.py similarity index 100% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/test_bsrn.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/test_bsrn.py diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/train.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/train.py similarity index 98% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/train.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/train.py index 3f44d3dc3e888bf642832839209a2bfaa02b30fb..ffa526a551328be09ddd07e9ba0fa7d670c60ef1 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/train.py +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/train.py @@ -82,6 +82,8 @@ if __name__ == '__main__': pre_parsed = pre_parser.parse_known_args()[0] if (pre_parsed.dataloader is not None): + print( "dataloader: ",pre_parsed.dataloader) + tf.logging.info("dataloader: ",pre_parsed.dataloader) DATALOADER_MODULE = importlib.import_module('dataloaders.' + pre_parsed.dataloader) if (pre_parsed.model is not None): MODEL_MODULE = importlib.import_module('models.' 
+ pre_parsed.model) diff --git a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/validate_bsrn.py b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/validate_bsrn.py similarity index 86% rename from TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/validate_bsrn.py rename to TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/validate_bsrn.py index 97bd8e6fb22f50dc7fbccbdec2c3e75a87f9edd5..7012829fa13d19741e8481549993034a89a5b789 100644 --- a/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/tf-bsrn-sr/validate_bsrn.py +++ b/TensorFlow/contrib/cv/BSRN_ID1296_for_TensorFlow/validate_bsrn.py @@ -31,6 +31,8 @@ import os import time import numpy as np import tensorflow as tf +from scipy.ndimage import gaussian_filter + import dataloaders import models @@ -59,7 +61,7 @@ if __name__ == '__main__': tf.flags.DEFINE_integer('shave_size', 4, 'Amount of pixels to crop the borders of the images before calculating quality metrics.') tf.flags.DEFINE_boolean('ensemble_only', False, 'Calculate (and save) ensembled image only.') - tf.flags.DEFINE_string("chip", "gpu", "Run on which chip, (npu or gpu or cpu)") + tf.flags.DEFINE_string("chip", "cpu", "Run on which chip, (npu or gpu or cpu)") tf.flags.DEFINE_string("platform", "linux", 'the platform this code is running on') # parse data loader and model first and import them @@ -116,7 +118,55 @@ def _image_rmse2(output_image, truth_image): diff = yr_tr - yr_out rmse = np.sqrt(np.mean(diff ** 2)) return rmse - +def _image_ssim(X, Y): + """ + Computes the mean structural similarity between two images. + """ + assert (X.shape == Y.shape), "Image-patche provided have different dimensions" + nch = 1 if X.ndim == 2 else X.shape[-1] + mssim = [] + for ch in range(nch): + Xc, Yc = X[..., ch].astype(np.float64), Y[..., ch].astype(np.float64) + mssim.append(compute_ssim(Xc, Yc)) + return np.mean(mssim) + + +def compute_ssim(X, Y): + """ + Compute the structural similarity per single channel (given two images) + """ + # variables are initialized as suggested in the paper + K1 = 0.01 + K2 = 0.03 + sigma = 1.5 + win_size = 5 + + # means + ux = gaussian_filter(X, sigma) + uy = gaussian_filter(Y, sigma) + + # variances and covariances + uxx = gaussian_filter(X * X, sigma) + uyy = gaussian_filter(Y * Y, sigma) + uxy = gaussian_filter(X * Y, sigma) + + # normalize by unbiased estimate of std dev + N = win_size ** X.ndim + unbiased_norm = N / (N - 1) # eq. 4 of the paper + vx = (uxx - ux * ux) * unbiased_norm + vy = (uyy - uy * uy) * unbiased_norm + vxy = (uxy - ux * uy) * unbiased_norm + + R = 255 + C1 = (K1 * R) ** 2 + C2 = (K2 * R) ** 2 + # compute SSIM (eq. 
13 of the paper) + sim = (2 * ux * uy + C1) * (2 * vxy + C2) + D = (ux ** 2 + uy ** 2 + C1) * (vx + vy + C2) + SSIM = sim / D + mssim = SSIM.mean() + + return mssim def main(unused_argv): # initialize @@ -240,9 +290,9 @@ def main(unused_argv): output_image_shaved = _shave_image(output_image, shave_size=FLAGS.shave_size) psnr = _image_psnr(output_image=output_image_shaved, truth_image=truth_image_shaved) - rmse = _image_rmse(output_image=output_image_shaved, truth_image=truth_image_shaved) + rmse = _image_ssim(output_image_shaved, truth_image_shaved) - tf.logging.info('t%d, x%d, %d/%d, psnr=%.2f, rmse=%.2f' % (num_recursions, scale, image_index+1, num_images, psnr, rmse)) + tf.logging.info('t%d, x%d, %d/%d, psnr=%.2f, ssim=%.2f' % (num_recursions, scale, image_index+1, num_images, psnr, rmse)) psnr_list[i].append(psnr) rmse_list[i].append(rmse) @@ -262,9 +312,9 @@ def main(unused_argv): output_image_shaved = _shave_image(output_image, shave_size=FLAGS.shave_size) psnr = _image_psnr(output_image=output_image_shaved, truth_image=truth_image_shaved) - rmse = _image_rmse(output_image=output_image_shaved, truth_image=truth_image_shaved) + rmse = _image_ssim(output_image_shaved, truth_image_shaved) - tf.logging.info('ensemble, x%d, %d/%d, psnr=%.2f, rmse=%.2f' % (scale, image_index+1, num_images, psnr, rmse)) + tf.logging.info('ensemble, x%d, %d/%d, psnr=%.2f, ssim=%.2f' % (scale, image_index+1, num_images, psnr, rmse)) psnr_list[num_total_outputs].append(psnr) rmse_list[num_total_outputs].append(rmse) @@ -279,10 +329,10 @@ def main(unused_argv): # finalize tf.logging.info('finished') for scale in scale_list: - print('- x%d, PSNR and RMSE:' % (scale)) - print(' '.join([('%.3f' % x) for x in modules_average_psnr_dict[scale]])) + print('- x%d, PSNR and SSIM:' % (scale)) + print("Final PSNR: ",' '.join([('%.3f' % x) for x in modules_average_psnr_dict[scale]])) print('') - print(' '.join([('%.3f' % x) for x in modules_average_rmse_dict[scale]])) + print("Final SSIM: ",' '.join([('%.3f' % x) for x in modules_average_rmse_dict[scale]])) if FLAGS.platform.lower() == 'modelarts': from help_modelarts import modelarts_result2obs
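The last hunk replaces the RMSE metric in `validate_bsrn.py` with a Gaussian-window SSIM (`_image_ssim` / `compute_ssim`). Below is a minimal standalone sketch of that computation for a single channel, assuming only numpy and scipy are installed; the constants mirror the patch (K1=0.01, K2=0.03, sigma=1.5, win_size=5, dynamic range 255), and the toy images are made up for illustration rather than taken from the BSD100 validation set.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def ssim_single_channel(x, y, sigma=1.5, win_size=5, k1=0.01, k2=0.03, r=255):
    """Mean SSIM of two single-channel images, following the patched compute_ssim."""
    x = x.astype(np.float64)
    y = y.astype(np.float64)
    # local means
    ux, uy = gaussian_filter(x, sigma), gaussian_filter(y, sigma)
    # local (co)variances with the unbiased normalization from the patch
    uxx = gaussian_filter(x * x, sigma)
    uyy = gaussian_filter(y * y, sigma)
    uxy = gaussian_filter(x * y, sigma)
    n = win_size ** x.ndim
    norm = n / (n - 1)
    vx = (uxx - ux * ux) * norm
    vy = (uyy - uy * uy) * norm
    vxy = (uxy - ux * uy) * norm
    c1, c2 = (k1 * r) ** 2, (k2 * r) ** 2
    ssim_map = ((2 * ux * uy + c1) * (2 * vxy + c2)) / ((ux ** 2 + uy ** 2 + c1) * (vx + vy + c2))
    return ssim_map.mean()

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, size=(64, 64)).astype(np.float64)
    noisy = np.clip(img + rng.normal(0, 10, img.shape), 0, 255)
    print("SSIM(identical) =", ssim_single_channel(img, img))    # ~1.0
    print("SSIM(noisy)     =", ssim_single_channel(img, noisy))  # < 1.0
```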