From 10df269066d8155402772eeb778a7c3392ad9683 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Thu, 18 Aug 2022 07:18:27 +0000 Subject: [PATCH 01/20] =?UTF-8?q?save=5Fmodel=E6=A8=A1=E5=9E=8B=E5=9B=BA?= =?UTF-8?q?=E5=8C=96=E4=B8=BApb=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ab_dx_z <> --- .../cv/SVD_ID2019_for_Tensorflow/frezze.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py new file mode 100644 index 000000000..6aeb2e1aa --- /dev/null +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py @@ -0,0 +1,48 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tensorflow as tf +from tensorflow.python.tools import freeze_graph +from npu_bridge.npu_init import * + +freeze_graph.freeze_graph( + input_saved_model_dir='save_model_path', + output_node_names='L2Loss', + output_graph='test.pb', + initializer_nodes='', + input_graph= None, + input_saver= False, + input_binary=False, + input_checkpoint=None, + restore_op_name=None, + filename_tensor_name=None, + clear_devices=False, + input_meta_graph=False) + +# if __name__ == '__main__': +# main() \ No newline at end of file -- Gitee From 05be221da8af1c987ab79e25d0148bf37d2560cc Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Thu, 18 Aug 2022 07:20:22 +0000 Subject: [PATCH 02/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?= =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/README.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/SVD_ID2019_for_Tensorflow/README.md | 253 ------------------ 1 file changed, 253 deletions(-) delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md deleted file mode 100644 index 4a337f831..000000000 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md +++ /dev/null @@ -1,253 +0,0 @@ -

-## Basic Information
-
-**Publisher: Huawei**
-
-**Application Domain: CV**
-
-**Version:**
-
-**Modified: 2022.05.05**
-
-**Framework: TensorFlow 1.15.0**
-
-**Processor: Ascend 910**
-
-**Description: Use a trained SVD model to evaluate how well symmetric orthogonalization works for point cloud alignment.**
-

-## Overview
-
-Given two 3D point clouds, the network is asked to predict the 3D rotation that best aligns them, using the SVD orthogonalization procedure SVDO+(M) to project its output onto SO(3). The training logic saves a checkpoint every 100k steps and evaluates its accuracy on the test set; the final comparison always uses the 2.6M-step model.
-
-- Open-source code:
-
-  https://github.com/google-research/google-research/tree/master/special_orthogonalization
-
-- Reference paper:
-
-  [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
-
-- Reference implementation:
-  The final GPU-trained model is at
-
-  obs://cann-id2019/gpu
-
-  The data is at
-  obs://cann-id2019/dataset/
-
-  There are three datasets:
-
-  the training set points
-
-  the test set points_test
-
-  the dataset rotated in step one: test_points_modified
-
-  The NPU-trained model is at
-
-  obs://cann-id2019/dataset/output
-
-
-  All related code has been uploaded and can be found in the NPU and GPU folders
-
-- Migration notes:
-  When migrating the code to the NPU, the training input is point cloud data of shape (N, 3), where N is not fixed, so training hits a dynamic-shape problem on the NPU and cannot run. We considered three solutions: 1. Find the smallest N over all point clouds and, for clouds larger than N, feed only the first N rows into training. 2. Find the largest N and zero-pad smaller clouds up to N before feeding them to the network. 3. Find the largest N and, for smaller clouds, repeat one point from the original data until the row count reaches N, then feed the data to the network. All three approaches solve the dynamic-shape problem, but the first discards sample points, so the trained model's accuracy is poor; the second loses no sample information but fills the data with many zeros, changing the original code logic, so its accuracy is also mediocre. The third loses no sample information and, since it only repeats one existing sample point per cloud, does not change the original code logic; it achieved good accuracy in the end.
-
-- To obtain the code at a specific commit_id via Git:
-
-  ```
-  git clone {repository_url}        # clone the repository
-  cd {repository_name}              # enter the model's code directory
-  git checkout {branch}             # switch to the branch
-  git reset --hard {commit_id}      # reset the code to the commit_id
-  cd {code_path}                    # enter the model code path; skip if the repo contains only this model
-  ```
-
-## Default Configuration
-
-- Dataset: obs://cann-id2019/dataset/
-
-- Training hyperparameters
-
-  - log_step_count=200
-  - save_summaries_steps=25000
-  - train_steps=2600000
-  - save_checkpoints_steps=100000
-  - eval_examples=39900
-
-
-

-## Training Environment Setup
-
-1. For hardware setup, see the hardware product documentation "[Driver and Firmware Installation and Upgrade Guide](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)". Firmware and drivers matching the CANN version must be installed on the device.
-2. Install Docker on the host and log in to [Ascend Hub](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) to obtain the image.
-
-   The images supported by this model are listed in [Table 1](#zh-cn_topic_0000001074498056_table1519011227314).
-
-   **Table 1** Image list
-
-   | Image name | Image version | Compatible CANN version |
-   | ---------- | ------------- | ----------------------- |
-   | [ascend-tensorflow-arm](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) | 20.2.0 | 20.2 |
-
-
-
-## Script Parameters
-
-```
-Parameters in gen_pt_test_data_gpu.py
-
---input_test_files         point cloud dataset to rotate
---output_directory         directory in which to store the rotated point clouds
---random_rotation_axang    boolean; if true, the input dataset is rotated with the specified sampling method. Defaults to true
---num_rotations_per_file   number of random-rotation augmentations per test point cloud. Defaults to 100
-
-
-Parameters in main_point_cloud_gpu.py
-
---pt_cloud_test_files      test dataset path
---pt_cloud_train_files     training dataset path
---method                   method used to predict rotations: "svd", "svd-inf", or "gs". Defaults to "svd"
---checkpoint_dir           where trained models are stored
---train_steps              number of training iterations. Defaults to 2600000
---save_checkpoints_steps   how often to save checkpoints. Defaults to 10000
---log_step_count           number of steps between log entries. Defaults to 200
---save_summaries_steps     number of steps between summary saves. Defaults to 5000
---learning_rate            defaults to 1e-5
---lr_decay                 if true, decay the learning rate. Defaults to false
---lr_decay_steps           learning-rate decay steps. Defaults to 35000
---lr_decay_rate            learning-rate decay rate. Defaults to 0.95
---predict_all_test         if true, run an eval job on the latest checkpoint and print the error for each input. Defaults to false
---eval_examples            number of test examples. Defaults to 0
---print_variable_names     print model variable names. Defaults to false
---num_train_augmentations  number of random rotations used to augment each input point cloud. Defaults to 10
-
-```
-
-
-
-## Running on GPU
-
-The GPU commands are as follows:
-
-**Generate test data**
-
-Note: the generated test_points_modified and points folders are already included in the dataset folder.
-```bash
-# Point to the test point cloud files
-IN_FILES=/points_test/*.pts
-
-NEW_TEST_FILES_DIR=/test_points_modified
-
-# Determines the axis-angle distribution of the random rotations
-AXANG_SAMPLING=True
-
-python -m special_orthogonalization.gen_pt_test_data_gpu --input_test_files=$IN_FILES --output_directory=$NEW_TEST_FILES_DIR --random_rotation_axang=$AXANG_SAMPLING
-```
-
-**Training and evaluation**
-```bash
-# Point to the original training data
-TRAIN_FILES=/points/*.pts
-
-# Point to the rotated test data
-TEST_FILES=$NEW_TEST_FILES_DIR/*.pts
-
-# Rotation-prediction method
-METHOD=svd
-
-# Where checkpoints, summaries, evaluation results, etc. are stored
-OUT_DIR=/path/to/model
-
-python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --log_step_count=200 --save_summaries_steps=25000 --pt_cloud_train_files=$TRAIN_FILES --pt_cloud_test_files=$TEST_FILES --train_steps=2600000 --save_checkpoints_steps=100000 --eval_examples=39900
-```
-
-**Generate statistics over all test samples**
-```bash
-# Print the mean, median, standard deviation, and percentiles
-python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --pt_cloud_test_files=$TEST_FILES --predict_all_test=True
-```
-## Running on NPU
-
-The NPU workflow is as follows:
-
-For all three steps, the data path configured for the OBS bucket in the ModelArts plugin must point to the directory that actually contains the data.
-For example, if the dataset folder contains points, points_test, and the other data folders,
-set the data path in the ModelArts plugin to obs://cann-id2019/dataset/
-
-**Generate test data**
-
-This step uses gen_pt_test_data.py, modelarts_entry_Axang.py, and genTestData.sh;
-none of these files needs to be modified.
-The rotated data is written to the job's output path in the OBS bucket under the name test_points_modified;
-for step two (training), move the rotated files to the data path in the OBS bucket.
-
-Note: make sure the data folder in the OBS bucket is named points_test.
-
-**Training and evaluation**
-
-This step uses main_point_cloud_boostPerf.py, modelarts_entry_acc_train.py, and train_full_1p.sh;
-none of these files needs to be modified.
-
-Because mixed precision is used to improve training performance, some operators overflow; the switch_config.txt file is therefore added and must sit in the same directory as the code.
-
-The training output is written to the job's output path in the OBS bucket;
-for step three, move the generated output folder to the data path in the OBS bucket.
-
-Note: the trained model is saved in this job's OBS folder, and step three requires starting a new ModelArts job, so the latest model path recorded in the
-checkpoint file inside output must be changed to
-"/home/ma-user/modelarts/inputs/data_url_0/output"
-so that step three reports the correct accuracy.
-
-
-**Generate statistics over all test samples**
-
-This step uses main_point_cloud_boostPerf.py, modelarts_entry_stat.py, and genStatistical.sh;
-none of these files needs to be modified.
-
-On success, the accuracy statistics are printed to the screen.
-
-
-
-## Training Results
-**Accuracy comparison:**
-
-
-
-| Geodesic error (°) | Paper | GPU (original, unmodified code) | GPU (measured) | NPU (measured) |
-| ------------------------ | ------- | ----- | --------- |----------|
-| Mean | 1.63 | 2.58 | 3.98 | 2.92 |
-| Median | 0.89 | 1.68 | 2.6 | 1.7 |
-| Std | 6.70 | 6.93 | 9.36 | 8.45 |
-
-Compared with the paper, the accuracy measured after the NPU migration still shows a sizable gap, but it is close to what the unmodified original code achieves on GPU,
-and for the same code changes the NPU accuracy is better than the GPU accuracy. Note that when running on the NPU we used mixed precision to speed up
-training, which triggered an unknown error that kept the accuracy metric mean_degree_err at 0 throughout training, so we cannot tell how this metric
-decreased over the 2.6M NPU training steps. Fortunately, the model trained on the NPU can be evaluated on the GPU, and its accuracy is decent.
-
-**Performance comparison:**
-
-To compare performance, we take the average global_step/sec over the first 20k steps of the GPU run (on a V100) and over the first 10k steps of the NPU run.
-For the NPU performance run, use main_point_cloud_boostPerf.py, modelarts_entry_perf.py, and train_performance_1p.sh; their parameters are
-already set and need no changes. We also uploaded calc_perf.py, which computes the performance figure; download the generated log file from OBS and pass its local path to the script.
-| Metric | Paper | GPU (measured) | NPU (measured) |
-| ------------------- | ------- | ------ | ------ |
-| global_step/sec| n/a | 87.64 | 116.77 |
-
--
Gitee

From cb719d0c6fa03d147c5ffa3a2ef2c56b89d7fff4 Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Thu, 18 Aug 2022 07:20:54 +0000
Subject: [PATCH 03/20] =?UTF-8?q?=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: ab_dx_z <>
---
 .../cv/SVD_ID2019_for_Tensorflow/README.md    | 256 ++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md

diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
new file mode 100644
index 000000000..5de256f0f
--- /dev/null
+++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
@@ -0,0 +1,256 @@
+## Basic Information

+
+**Publisher: Huawei**
+
+**Application Domain: CV**
+
+**Version:**
+
+**Modified: 2022.05.05**
+
+**Framework: TensorFlow 1.15.0**
+
+**Processor: Ascend 910**
+
+**Description: Use a trained SVD model to evaluate how well symmetric orthogonalization works for point cloud alignment.**
+

+## Overview
+
+Given two 3D point clouds, the network is asked to predict the 3D rotation that best aligns them, using the SVD orthogonalization procedure SVDO+(M) to project its output onto SO(3) (a minimal TF sketch of this step is given after the default configuration below). The training logic saves a checkpoint every 100k steps and evaluates its accuracy on the test set; the final comparison always uses the 2.6M-step model.
+
+- Open-source code:
+
+  https://github.com/google-research/google-research/tree/master/special_orthogonalization
+
+- Reference paper:
+
+  [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
+
+- Reference implementation:
+  The final GPU-trained model is at
+
+  obs://cann-id2019/gpu
+
+  The data is at
+  obs://cann-id2019/dataset/
+
+  There are three datasets:
+
+  the training set points
+
+  the test set points_test
+
+  the dataset rotated in step one: test_points_modified
+
+  The NPU-trained model is at
+
+  obs://cann-id2019/dataset/output
+
+
+  All related code has been uploaded and can be found in the NPU and GPU folders
+
+- Migration notes:
+  When migrating the code to the NPU, the training input is point cloud data of shape (N, 3), where N is not fixed, so training hits a dynamic-shape problem on the NPU and cannot run. We considered three solutions: 1. Find the smallest N over all point clouds and, for clouds larger than N, feed only the first N rows into training. 2. Find the largest N and zero-pad smaller clouds up to N before feeding them to the network. 3. Find the largest N and, for smaller clouds, repeat one point from the original data until the row count reaches N, then feed the data to the network. All three approaches solve the dynamic-shape problem, but the first discards sample points, so the trained model's accuracy is poor; the second loses no sample information but fills the data with many zeros, changing the original code logic, so its accuracy is also mediocre. The third loses no sample information and, since it only repeats one existing sample point per cloud, does not change the original code logic; it achieved good accuracy in the end.
+
+- To obtain the code at a specific commit_id via Git:
+
+  ```
+  git clone {repository_url}        # clone the repository
+  cd {repository_name}              # enter the model's code directory
+  git checkout {branch}             # switch to the branch
+  git reset --hard {commit_id}      # reset the code to the commit_id
+  cd {code_path}                    # enter the model code path; skip if the repo contains only this model
+  ```
+
+## Default Configuration
+
+- Dataset: obs://cann-id2019/dataset/
+
+- Training hyperparameters
+
+  - log_step_count=200
+  - save_summaries_steps=25000
+  - train_steps=2600000
+  - save_checkpoints_steps=100000
+  - eval_examples=39900
+
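+For reference, the SVDO+(M) step mentioned in the overview can be sketched in TF 1.x as follows. This is a minimal illustration assumed to be equivalent to the repository's `utils.symmetric_orthogonalization`, not a copy of it:
+
+```python
+import tensorflow as tf
+
+def svd_orthogonalize(m9):
+  """Maps [batch, 9] network outputs to rotation matrices in SO(3)."""
+  m = tf.reshape(m9, (-1, 3, 3))
+  # m = u @ diag(s) @ v^T; tf.linalg.svd returns v untransposed.
+  s, u, v = tf.linalg.svd(m)
+  # Discard the singular values and use diag(1, 1, det(u v^T)) so the
+  # product is a proper rotation (det = +1) rather than a reflection.
+  det = tf.linalg.det(tf.matmul(u, v, transpose_b=True))
+  d = tf.stack([tf.ones_like(det), tf.ones_like(det), det], axis=-1)
+  return tf.matmul(tf.matmul(u, tf.linalg.diag(d)), v, transpose_b=True)
+```
+
+The "gs" method option instead applies Gram-Schmidt orthogonalization to a 6-dimensional network output.
+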

+## Training Environment Setup
+
+1. For hardware setup, see the hardware product documentation "[Driver and Firmware Installation and Upgrade Guide](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)". Firmware and drivers matching the CANN version must be installed on the device.
+2. Install Docker on the host and log in to [Ascend Hub](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) to obtain the image.
+
+   The images supported by this model are listed in [Table 1](#zh-cn_topic_0000001074498056_table1519011227314).
+
+   **Table 1** Image list
+
+   | Image name | Image version | Compatible CANN version |
+   | ---------- | ------------- | ----------------------- |
+   | [ascend-tensorflow-arm](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) | 20.2.0 | 20.2 |
+
+
+
+## Script Parameters
+
+```
+Parameters in gen_pt_test_data_gpu.py
+
+--input_test_files         point cloud dataset to rotate
+--output_directory         directory in which to store the rotated point clouds
+--random_rotation_axang    boolean; if true, the input dataset is rotated with the specified sampling method. Defaults to true
+--num_rotations_per_file   number of random-rotation augmentations per test point cloud. Defaults to 100
+
+
+Parameters in main_point_cloud_gpu.py
+
+--pt_cloud_test_files      test dataset path
+--pt_cloud_train_files     training dataset path
+--method                   method used to predict rotations: "svd", "svd-inf", or "gs". Defaults to "svd"
+--checkpoint_dir           where trained models are stored
+--train_steps              number of training iterations. Defaults to 2600000
+--save_checkpoints_steps   how often to save checkpoints. Defaults to 10000
+--log_step_count           number of steps between log entries. Defaults to 200
+--save_summaries_steps     number of steps between summary saves. Defaults to 5000
+--learning_rate            defaults to 1e-5
+--lr_decay                 if true, decay the learning rate. Defaults to false
+--lr_decay_steps           learning-rate decay steps. Defaults to 35000
+--lr_decay_rate            learning-rate decay rate. Defaults to 0.95
+--predict_all_test         if true, run an eval job on the latest checkpoint and print the error for each input. Defaults to false
+--eval_examples            number of test examples. Defaults to 0
+--print_variable_names     print model variable names. Defaults to false
+--num_train_augmentations  number of random rotations used to augment each input point cloud. Defaults to 10
+
+```
+
+
+
+## Running on GPU
+
+The GPU commands are as follows:
+
+**Generate test data**
+
+Note: the generated test_points_modified and points folders are already included in the dataset folder.
+```bash
+# Point to the test point cloud files
+IN_FILES=/points_test/*.pts
+
+NEW_TEST_FILES_DIR=/test_points_modified
+
+# Determines the axis-angle distribution of the random rotations
+AXANG_SAMPLING=True
+
+python -m special_orthogonalization.gen_pt_test_data_gpu --input_test_files=$IN_FILES --output_directory=$NEW_TEST_FILES_DIR --random_rotation_axang=$AXANG_SAMPLING
+```
+
+**Training and evaluation**
+```bash
+# Point to the original training data
+TRAIN_FILES=/points/*.pts
+
+# Point to the rotated test data
+TEST_FILES=$NEW_TEST_FILES_DIR/*.pts
+
+# Rotation-prediction method
+METHOD=svd
+
+# Where checkpoints, summaries, evaluation results, etc. are stored
+OUT_DIR=/path/to/model
+
+python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --log_step_count=200 --save_summaries_steps=25000 --pt_cloud_train_files=$TRAIN_FILES --pt_cloud_test_files=$TEST_FILES --train_steps=2600000 --save_checkpoints_steps=100000 --eval_examples=39900
+```
+
+**Generate statistics over all test samples**
+```bash
+# Print the mean, median, standard deviation, and percentiles
+python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --pt_cloud_test_files=$TEST_FILES --predict_all_test=True
+```
+## Running on NPU
+
+The NPU workflow is as follows:
+
+For all three steps, the data path configured for the OBS bucket in the ModelArts plugin must point to the directory that actually contains the data.
+For example, if the dataset folder contains points, points_test, and the other data folders,
+set the data path in the ModelArts plugin to obs://cann-id2019/dataset/
+
+**Generate test data**
+
+This step uses gen_pt_test_data.py, modelarts_entry_Axang.py, and genTestData.sh;
+none of these files needs to be modified.
+The rotated data is written to the job's output path in the OBS bucket under the name test_points_modified;
+for step two (training), move the rotated files to the data path in the OBS bucket.
+
+Note: make sure the data folder in the OBS bucket is named points_test.
+
+**Training and evaluation**
+
+This step uses main_point_cloud_boostPerf.py, modelarts_entry_acc_train.py, and train_full_1p.sh;
+none of these files needs to be modified.
+
+Because mixed precision is used to improve training performance, some operators overflow; the switch_config.txt file is therefore added and must sit in the same directory as the code.
+
+The training output is written to the job's output path in the OBS bucket;
+for step three, move the generated output folder to the data path in the OBS bucket.
+
+Note: the trained model is saved in this job's OBS folder, and step three requires starting a new ModelArts job, so the latest model path recorded in the
+checkpoint file inside output must be changed to
+"/home/ma-user/modelarts/inputs/data_url_0/output"
+so that step three reports the correct accuracy.
+
+
+**Generate statistics over all test samples**
+
+This step uses main_point_cloud_boostPerf.py, modelarts_entry_stat.py, and genStatistical.sh;
+none of these files needs to be modified.
+
+On success, the accuracy statistics are printed to the screen.
+
+
+
+## Training Results
+**Accuracy comparison:**
+
+
+
+| Geodesic error (°) | Paper | GPU (original, unmodified code) | GPU (measured) | NPU (measured) |
+| ------------------------ | ------- | ----- | --------- |----------|
+| Mean | 1.63 | 2.58 | 3.98 | 2.92 |
+| Median | 0.89 | 1.68 | 2.6 | 1.7 |
+| Std | 6.70 | 6.93 | 9.36 | 8.45 |
+
+Here the geodesic error between a predicted rotation R and the ground truth R* is the relative angle θ = arccos((trace(RᵀR*) − 1) / 2), reported in degrees.
+
+Compared with the paper, the accuracy measured after the NPU migration still shows a sizable gap, but it is close to what the unmodified original code achieves on GPU,
+and for the same code changes the NPU accuracy is better than the GPU accuracy. Note that when running on the NPU we used mixed precision to speed up
+training, which triggered an unknown error that kept the accuracy metric mean_degree_err at 0 throughout training, so we cannot tell how this metric
+decreased over the 2.6M NPU training steps. Fortunately, the model trained on the NPU can be evaluated on the GPU, and its accuracy is decent.
+
+**Performance comparison:**
+
+To compare performance, we take the average global_step/sec over the first 20k steps of the GPU run (on a V100) and over the first 10k steps of the NPU run.
+For the NPU performance run, use main_point_cloud_boostPerf.py, modelarts_entry_perf.py, and train_performance_1p.sh; their parameters are
+already set and need no changes. We also uploaded calc_perf.py, which computes the performance figure; download the generated log file from OBS and pass its local path to the script.
+| Metric | Paper | GPU (measured) | NPU (measured) |
+| ------------------- | ------- | ------ | ------ |
+| global_step/sec| n/a | 87.64 | 116.77 |
+
+## Offline Inference
+Use frezze.py to freeze the saved_model into a pb model
+
+
--
Gitee

From 3044deedba1022cd70ec21a2b224a46445c62f6c Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Thu, 18 Aug 2022 07:21:29 +0000
Subject: [PATCH 04/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?=
 =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/README.md?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../cv/SVD_ID2019_for_Tensorflow/README.md    | 256 ------------------
 1 file changed, 256 deletions(-)
 delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md

diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
deleted file mode 100644
index 5de256f0f..000000000
--- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
+++ /dev/null
@@ -1,256 +0,0 @@
-## Basic Information

-
-**Publisher: Huawei**
-
-**Application Domain: CV**
-
-**Version:**
-
-**Modified: 2022.05.05**
-
-**Framework: TensorFlow 1.15.0**
-
-**Processor: Ascend 910**
-
-**Description: Use a trained SVD model to evaluate how well symmetric orthogonalization works for point cloud alignment.**
-

-## Overview
-
-Given two 3D point clouds, the network is asked to predict the 3D rotation that best aligns them, using the SVD orthogonalization procedure SVDO+(M) to project its output onto SO(3). The training logic saves a checkpoint every 100k steps and evaluates its accuracy on the test set; the final comparison always uses the 2.6M-step model.
-
-- Open-source code:
-
-  https://github.com/google-research/google-research/tree/master/special_orthogonalization
-
-- Reference paper:
-
-  [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
-
-- Reference implementation:
-  The final GPU-trained model is at
-
-  obs://cann-id2019/gpu
-
-  The data is at
-  obs://cann-id2019/dataset/
-
-  There are three datasets:
-
-  the training set points
-
-  the test set points_test
-
-  the dataset rotated in step one: test_points_modified
-
-  The NPU-trained model is at
-
-  obs://cann-id2019/dataset/output
-
-
-  All related code has been uploaded and can be found in the NPU and GPU folders
-
-- Migration notes:
-  When migrating the code to the NPU, the training input is point cloud data of shape (N, 3), where N is not fixed, so training hits a dynamic-shape problem on the NPU and cannot run. We considered three solutions: 1. Find the smallest N over all point clouds and, for clouds larger than N, feed only the first N rows into training. 2. Find the largest N and zero-pad smaller clouds up to N before feeding them to the network. 3. Find the largest N and, for smaller clouds, repeat one point from the original data until the row count reaches N, then feed the data to the network. All three approaches solve the dynamic-shape problem, but the first discards sample points, so the trained model's accuracy is poor; the second loses no sample information but fills the data with many zeros, changing the original code logic, so its accuracy is also mediocre. The third loses no sample information and, since it only repeats one existing sample point per cloud, does not change the original code logic; it achieved good accuracy in the end.
-
-- To obtain the code at a specific commit_id via Git:
-
-  ```
-  git clone {repository_url}        # clone the repository
-  cd {repository_name}              # enter the model's code directory
-  git checkout {branch}             # switch to the branch
-  git reset --hard {commit_id}      # reset the code to the commit_id
-  cd {code_path}                    # enter the model code path; skip if the repo contains only this model
-  ```
-
-## Default Configuration
-
-- Dataset: obs://cann-id2019/dataset/
-
-- Training hyperparameters
-
-  - log_step_count=200
-  - save_summaries_steps=25000
-  - train_steps=2600000
-  - save_checkpoints_steps=100000
-  - eval_examples=39900
-
-
-

-## Training Environment Setup
-
-1. For hardware setup, see the hardware product documentation "[Driver and Firmware Installation and Upgrade Guide](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)". Firmware and drivers matching the CANN version must be installed on the device.
-2. Install Docker on the host and log in to [Ascend Hub](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) to obtain the image.
-
-   The images supported by this model are listed in [Table 1](#zh-cn_topic_0000001074498056_table1519011227314).
-
-   **Table 1** Image list
-
-   | Image name | Image version | Compatible CANN version |
-   | ---------- | ------------- | ----------------------- |
-   | [ascend-tensorflow-arm](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) | 20.2.0 | 20.2 |
-
-
-
-## Script Parameters
-
-```
-Parameters in gen_pt_test_data_gpu.py
-
---input_test_files         point cloud dataset to rotate
---output_directory         directory in which to store the rotated point clouds
---random_rotation_axang    boolean; if true, the input dataset is rotated with the specified sampling method. Defaults to true
---num_rotations_per_file   number of random-rotation augmentations per test point cloud. Defaults to 100
-
-
-Parameters in main_point_cloud_gpu.py
-
---pt_cloud_test_files      test dataset path
---pt_cloud_train_files     training dataset path
---method                   method used to predict rotations: "svd", "svd-inf", or "gs". Defaults to "svd"
---checkpoint_dir           where trained models are stored
---train_steps              number of training iterations. Defaults to 2600000
---save_checkpoints_steps   how often to save checkpoints. Defaults to 10000
---log_step_count           number of steps between log entries. Defaults to 200
---save_summaries_steps     number of steps between summary saves. Defaults to 5000
---learning_rate            defaults to 1e-5
---lr_decay                 if true, decay the learning rate. Defaults to false
---lr_decay_steps           learning-rate decay steps. Defaults to 35000
---lr_decay_rate            learning-rate decay rate. Defaults to 0.95
---predict_all_test         if true, run an eval job on the latest checkpoint and print the error for each input. Defaults to false
---eval_examples            number of test examples. Defaults to 0
---print_variable_names     print model variable names. Defaults to false
---num_train_augmentations  number of random rotations used to augment each input point cloud. Defaults to 10
-
-```
-
-
-
-## Running on GPU
-
-The GPU commands are as follows:
-
-**Generate test data**
-
-Note: the generated test_points_modified and points folders are already included in the dataset folder.
-```bash
-# Point to the test point cloud files
-IN_FILES=/points_test/*.pts
-
-NEW_TEST_FILES_DIR=/test_points_modified
-
-# Determines the axis-angle distribution of the random rotations
-AXANG_SAMPLING=True
-
-python -m special_orthogonalization.gen_pt_test_data_gpu --input_test_files=$IN_FILES --output_directory=$NEW_TEST_FILES_DIR --random_rotation_axang=$AXANG_SAMPLING
-```
-
-**Training and evaluation**
-```bash
-# Point to the original training data
-TRAIN_FILES=/points/*.pts
-
-# Point to the rotated test data
-TEST_FILES=$NEW_TEST_FILES_DIR/*.pts
-
-# Rotation-prediction method
-METHOD=svd
-
-# Where checkpoints, summaries, evaluation results, etc. are stored
-OUT_DIR=/path/to/model
-
-python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --log_step_count=200 --save_summaries_steps=25000 --pt_cloud_train_files=$TRAIN_FILES --pt_cloud_test_files=$TEST_FILES --train_steps=2600000 --save_checkpoints_steps=100000 --eval_examples=39900
-```
-
-**Generate statistics over all test samples**
-```bash
-# Print the mean, median, standard deviation, and percentiles
-python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --pt_cloud_test_files=$TEST_FILES --predict_all_test=True
-```
-## Running on NPU
-
-The NPU workflow is as follows:
-
-For all three steps, the data path configured for the OBS bucket in the ModelArts plugin must point to the directory that actually contains the data.
-For example, if the dataset folder contains points, points_test, and the other data folders,
-set the data path in the ModelArts plugin to obs://cann-id2019/dataset/
-
-**Generate test data**
-
-This step uses gen_pt_test_data.py, modelarts_entry_Axang.py, and genTestData.sh;
-none of these files needs to be modified.
-The rotated data is written to the job's output path in the OBS bucket under the name test_points_modified;
-for step two (training), move the rotated files to the data path in the OBS bucket.
-
-Note: make sure the data folder in the OBS bucket is named points_test.
-
-**Training and evaluation**
-
-This step uses main_point_cloud_boostPerf.py, modelarts_entry_acc_train.py, and train_full_1p.sh;
-none of these files needs to be modified.
-
-Because mixed precision is used to improve training performance, some operators overflow; the switch_config.txt file is therefore added and must sit in the same directory as the code.
-
-The training output is written to the job's output path in the OBS bucket;
-for step three, move the generated output folder to the data path in the OBS bucket.
-
-Note: the trained model is saved in this job's OBS folder, and step three requires starting a new ModelArts job, so the latest model path recorded in the
-checkpoint file inside output must be changed to
-"/home/ma-user/modelarts/inputs/data_url_0/output"
-so that step three reports the correct accuracy.
-
-
-**Generate statistics over all test samples**
-
-This step uses main_point_cloud_boostPerf.py, modelarts_entry_stat.py, and genStatistical.sh;
-none of these files needs to be modified.
-
-On success, the accuracy statistics are printed to the screen.
-
-
-
-## Training Results
-**Accuracy comparison:**
-
-
-
-| Geodesic error (°) | Paper | GPU (original, unmodified code) | GPU (measured) | NPU (measured) |
-| ------------------------ | ------- | ----- | --------- |----------|
-| Mean | 1.63 | 2.58 | 3.98 | 2.92 |
-| Median | 0.89 | 1.68 | 2.6 | 1.7 |
-| Std | 6.70 | 6.93 | 9.36 | 8.45 |
-
-Compared with the paper, the accuracy measured after the NPU migration still shows a sizable gap, but it is close to what the unmodified original code achieves on GPU,
-and for the same code changes the NPU accuracy is better than the GPU accuracy. Note that when running on the NPU we used mixed precision to speed up
-training, which triggered an unknown error that kept the accuracy metric mean_degree_err at 0 throughout training, so we cannot tell how this metric
-decreased over the 2.6M NPU training steps. Fortunately, the model trained on the NPU can be evaluated on the GPU, and its accuracy is decent.
-
-**Performance comparison:**
-
-To compare performance, we take the average global_step/sec over the first 20k steps of the GPU run (on a V100) and over the first 10k steps of the NPU run.
-For the NPU performance run, use main_point_cloud_boostPerf.py, modelarts_entry_perf.py, and train_performance_1p.sh; their parameters are
-already set and need no changes. We also uploaded calc_perf.py, which computes the performance figure; download the generated log file from OBS and pass its local path to the script.
-| Metric | Paper | GPU (measured) | NPU (measured) |
-| ------------------- | ------- | ------ | ------ |
-| global_step/sec| n/a | 87.64 | 116.77 |
-
-## Offline Inference
-Use frezze.py to freeze the saved_model into a pb model
-
-
--
Gitee

From 715137c0637952a53c2e7cb6f329c3a0559fd631 Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Thu, 18 Aug 2022 07:21:40 +0000
Subject: [PATCH 05/20] =?UTF-8?q?=E6=9B=B4=E6=96=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: ab_dx_z <>
---
 .../cv/SVD_ID2019_for_Tensorflow/README.md    | 256 ++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md

diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
new file mode 100644
index 000000000..917ab7f44
--- /dev/null
+++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
@@ -0,0 +1,256 @@
+## Basic Information

+
+**Publisher: Huawei**
+
+**Application Domain: CV**
+
+**Version:**
+
+**Modified: 2022.05.05**
+
+**Framework: TensorFlow 1.15.0**
+
+**Processor: Ascend 910**
+
+**Description: Use a trained SVD model to evaluate how well symmetric orthogonalization works for point cloud alignment.**
+

+## Overview
+
+Given two 3D point clouds, the network is asked to predict the 3D rotation that best aligns them, using the SVD orthogonalization procedure SVDO+(M) to project its output onto SO(3). The training logic saves a checkpoint every 100k steps and evaluates its accuracy on the test set; the final comparison always uses the 2.6M-step model.
+
+- Open-source code:
+
+  https://github.com/google-research/google-research/tree/master/special_orthogonalization
+
+- Reference paper:
+
+  [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
+
+- Reference implementation:
+  The final GPU-trained model is at
+
+  obs://cann-id2019/gpu
+
+  The data is at
+  obs://cann-id2019/dataset/
+
+  There are three datasets:
+
+  the training set points
+
+  the test set points_test
+
+  the dataset rotated in step one: test_points_modified
+
+  The NPU-trained model is at
+
+  obs://cann-id2019/dataset/output
+
+
+  All related code has been uploaded and can be found in the NPU and GPU folders
+
+- Migration notes:
+  When migrating the code to the NPU, the training input is point cloud data of shape (N, 3), where N is not fixed, so training hits a dynamic-shape problem on the NPU and cannot run. We considered three solutions: 1. Find the smallest N over all point clouds and, for clouds larger than N, feed only the first N rows into training. 2. Find the largest N and zero-pad smaller clouds up to N before feeding them to the network. 3. Find the largest N and, for smaller clouds, repeat one point from the original data until the row count reaches N, then feed the data to the network. All three approaches solve the dynamic-shape problem, but the first discards sample points, so the trained model's accuracy is poor; the second loses no sample information but fills the data with many zeros, changing the original code logic, so its accuracy is also mediocre. The third loses no sample information and, since it only repeats one existing sample point per cloud, does not change the original code logic; it achieved good accuracy in the end (a short sketch of this padding scheme is given after the default configuration below).
+
+- To obtain the code at a specific commit_id via Git:
+
+  ```
+  git clone {repository_url}        # clone the repository
+  cd {repository_name}              # enter the model's code directory
+  git checkout {branch}             # switch to the branch
+  git reset --hard {commit_id}      # reset the code to the commit_id
+  cd {code_path}                    # enter the model code path; skip if the repo contains only this model
+  ```
+
+## Default Configuration
+
+- Dataset: obs://cann-id2019/dataset/
+
+- Training hyperparameters
+
+  - log_step_count=200
+  - save_summaries_steps=25000
+  - train_steps=2600000
+  - save_checkpoints_steps=100000
+  - eval_examples=39900
+
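+The third padding approach from the migration notes can be sketched as follows. It mirrors the `tf.tile`/`tf.concat` logic in this repository's input pipeline, where 1500 is the fixed row count used for the training data (the test pipeline uses 1414 and repeats `values[4, :]` instead):
+
+```python
+import tensorflow as tf
+
+def pad_by_repeating(points, target_n=1500):
+  """Pads an [N, 3] point cloud to [target_n, 3] by repeating one point.
+
+  Assumes target_n >= N for every cloud in the dataset, so every padded
+  cloud has the same static row count and dynamic shapes are avoided on
+  the NPU.
+  """
+  diff_num = target_n - tf.shape(points)[0]
+  repeat_pts = tf.tile(tf.reshape(points[1, :], (1, -1)), [diff_num, 1])
+  return tf.concat([points, repeat_pts], axis=0)
+```
+
+Because the repeated rows are real sample points, the padded cloud keeps the original geometry, which is why this variant preserved accuracy better than zero-padding.
+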

+## Training Environment Setup
+
+1. For hardware setup, see the hardware product documentation "[Driver and Firmware Installation and Upgrade Guide](https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)". Firmware and drivers matching the CANN version must be installed on the device.
+2. Install Docker on the host and log in to [Ascend Hub](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) to obtain the image.
+
+   The images supported by this model are listed in [Table 1](#zh-cn_topic_0000001074498056_table1519011227314).
+
+   **Table 1** Image list
+
+   | Image name | Image version | Compatible CANN version |
+   | ---------- | ------------- | ----------------------- |
+   | [ascend-tensorflow-arm](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm) | 20.2.0 | 20.2 |
+
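+For reference, pulling the image typically looks like the following; the exact repository path must be taken from the Ascend Hub page linked above, so treat the path below as a placeholder only:
+
+```bash
+# Log in with your Ascend Hub account, then pull the listed image/tag.
+docker login ascendhub.huawei.com
+docker pull ascendhub.huawei.com/public-ascendhub/ascend-tensorflow-arm:20.2.0
+```
+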
+
+
+## Script Parameters
+
+```
+Parameters in gen_pt_test_data_gpu.py
+
+--input_test_files         point cloud dataset to rotate
+--output_directory         directory in which to store the rotated point clouds
+--random_rotation_axang    boolean; if true, the input dataset is rotated with the specified sampling method. Defaults to true
+--num_rotations_per_file   number of random-rotation augmentations per test point cloud. Defaults to 100
+
+
+Parameters in main_point_cloud_gpu.py
+
+--pt_cloud_test_files      test dataset path
+--pt_cloud_train_files     training dataset path
+--method                   method used to predict rotations: "svd", "svd-inf", or "gs". Defaults to "svd"
+--checkpoint_dir           where trained models are stored
+--train_steps              number of training iterations. Defaults to 2600000
+--save_checkpoints_steps   how often to save checkpoints. Defaults to 10000
+--log_step_count           number of steps between log entries. Defaults to 200
+--save_summaries_steps     number of steps between summary saves. Defaults to 5000
+--learning_rate            defaults to 1e-5
+--lr_decay                 if true, decay the learning rate. Defaults to false
+--lr_decay_steps           learning-rate decay steps. Defaults to 35000
+--lr_decay_rate            learning-rate decay rate. Defaults to 0.95
+--predict_all_test         if true, run an eval job on the latest checkpoint and print the error for each input. Defaults to false
+--eval_examples            number of test examples. Defaults to 0
+--print_variable_names     print model variable names. Defaults to false
+--num_train_augmentations  number of random rotations used to augment each input point cloud. Defaults to 10
+
+```
+
+
+
+## Running on GPU
+
+The GPU commands are as follows:
+
+**Generate test data**
+
+Note: the generated test_points_modified and points folders are already included in the dataset folder.
+```bash
+# Point to the test point cloud files
+IN_FILES=/points_test/*.pts
+
+NEW_TEST_FILES_DIR=/test_points_modified
+
+# Determines the axis-angle distribution of the random rotations
+AXANG_SAMPLING=True
+
+python -m special_orthogonalization.gen_pt_test_data_gpu --input_test_files=$IN_FILES --output_directory=$NEW_TEST_FILES_DIR --random_rotation_axang=$AXANG_SAMPLING
+```
+
+**Training and evaluation**
+```bash
+# Point to the original training data
+TRAIN_FILES=/points/*.pts
+
+# Point to the rotated test data
+TEST_FILES=$NEW_TEST_FILES_DIR/*.pts
+
+# Rotation-prediction method
+METHOD=svd
+
+# Where checkpoints, summaries, evaluation results, etc. are stored
+OUT_DIR=/path/to/model
+
+python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --log_step_count=200 --save_summaries_steps=25000 --pt_cloud_train_files=$TRAIN_FILES --pt_cloud_test_files=$TEST_FILES --train_steps=2600000 --save_checkpoints_steps=100000 --eval_examples=39900
+```
+
+**Generate statistics over all test samples**
+```bash
+# Print the mean, median, standard deviation, and percentiles
+python -m special_orthogonalization.main_point_cloud_gpu --method=$METHOD --checkpoint_dir=$OUT_DIR --pt_cloud_test_files=$TEST_FILES --predict_all_test=True
+```
+## Running on NPU
+
+The NPU workflow is as follows:
+
+For all three steps, the data path configured for the OBS bucket in the ModelArts plugin must point to the directory that actually contains the data.
+For example, if the dataset folder contains points, points_test, and the other data folders,
+set the data path in the ModelArts plugin to obs://cann-id2019/dataset/
+
+**Generate test data**
+
+This step uses gen_pt_test_data.py, modelarts_entry_Axang.py, and genTestData.sh;
+none of these files needs to be modified.
+The rotated data is written to the job's output path in the OBS bucket under the name test_points_modified;
+for step two (training), move the rotated files to the data path in the OBS bucket.
+
+Note: make sure the data folder in the OBS bucket is named points_test.
+
+**Training and evaluation**
+
+This step uses main_point_cloud_boostPerf.py, modelarts_entry_acc_train.py, and train_full_1p.sh;
+none of these files needs to be modified.
+
+Because mixed precision is used to improve training performance, some operators overflow; the switch_config.txt file is therefore added and must sit in the same directory as the code.
+
+The training output is written to the job's output path in the OBS bucket;
+for step three, move the generated output folder to the data path in the OBS bucket.
+
+Note: the trained model is saved in this job's OBS folder, and step three requires starting a new ModelArts job, so the latest model path recorded in the
+checkpoint file inside output must be changed to
+"/home/ma-user/modelarts/inputs/data_url_0/output"
+so that step three reports the correct accuracy.
+
+
+**Generate statistics over all test samples**
+
+This step uses main_point_cloud_boostPerf.py, modelarts_entry_stat.py, and genStatistical.sh;
+none of these files needs to be modified.
+
+On success, the accuracy statistics are printed to the screen.
+
+
+
+## Training Results
+**Accuracy comparison:**
+
+
+
+| Geodesic error (°) | Paper | GPU (original, unmodified code) | GPU (measured) | NPU (measured) |
+| ------------------------ | ------- | ----- | --------- |----------|
+| Mean | 1.63 | 2.58 | 3.98 | 2.92 |
+| Median | 0.89 | 1.68 | 2.6 | 1.7 |
+| Std | 6.70 | 6.93 | 9.36 | 8.45 |
+
+Here the geodesic error between a predicted rotation R and the ground truth R* is the relative angle θ = arccos((trace(RᵀR*) − 1) / 2), reported in degrees.
+
+Compared with the paper, the accuracy measured after the NPU migration still shows a sizable gap, but it is close to what the unmodified original code achieves on GPU,
+and for the same code changes the NPU accuracy is better than the GPU accuracy. Note that when running on the NPU we used mixed precision to speed up
+training, which triggered an unknown error that kept the accuracy metric mean_degree_err at 0 throughout training, so we cannot tell how this metric
+decreased over the 2.6M NPU training steps. Fortunately, the model trained on the NPU can be evaluated on the GPU, and its accuracy is decent.
+
+**Performance comparison:**
+
+To compare performance, we take the average global_step/sec over the first 20k steps of the GPU run (on a V100) and over the first 10k steps of the NPU run.
+For the NPU performance run, use main_point_cloud_boostPerf.py, modelarts_entry_perf.py, and train_performance_1p.sh; their parameters are
+already set and need no changes. We also uploaded calc_perf.py, which computes the performance figure; download the generated log file from OBS and pass its local path to the script.
+| Metric | Paper | GPU (measured) | NPU (measured) |
+| ------------------- | ------- | ------ | ------ |
+| global_step/sec| n/a | 87.64 | 116.77 |
+
+## Offline Inference
+Use frezze.py to freeze the saved_model into a pb model
+
+
--
Gitee

From 5dbedcf7f5a4eee30aea06a133c85fc1f3ff53d3 Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 13:12:15 +0000
Subject: [PATCH 06/20] update
 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md.

Signed-off-by: ab_dx_z <>
---
 .../contrib/cv/SVD_ID2019_for_Tensorflow/README.md | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
index 917ab7f44..e688a48f0 100644
--- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
+++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md
@@ -27,12 +27,8 @@
   [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
 
 - Reference implementation:
-  The final GPU-trained model is at
-
-  obs://cann-id2019/gpu
-
-  The data is at
-  obs://cann-id2019/dataset/
+  Baidu Netdisk link for the data:
 
   There are three datasets:
@@ -42,12 +38,10 @@
   the dataset rotated in step one: test_points_modified
 
-  The NPU-trained model is at
+  Baidu Netdisk link for the NPU-trained model:
+
-  obs://cann-id2019/dataset/output
-
-  All related code has been uploaded and can be found in the NPU and GPU folders
 
 - Migration notes:
@@ -64,7 +58,7 @@
 ## Default Configuration
 
-- Dataset: obs://cann-id2019/dataset/
+- Baidu Netdisk link for the dataset:
 
 - Training hyperparameters

--
Gitee

From ac0326c89065712c63ab23d94f5c233a83a66ff9 Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 13:15:51 +0000
Subject: [PATCH 07/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?=
 =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/main=5Fpoin?=
 =?UTF-8?q?t=5Fcloud=5FboostPerf.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../main_point_cloud_boostPerf.py             | 462 ------------------
 1 file changed, 462 deletions(-)
 delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_boostPerf.py

diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_boostPerf.py b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_boostPerf.py
deleted file mode 100644
index 54f2fffa0..000000000
--- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_boostPerf.py
+++ /dev/null
@@ -1,462 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The Google Research Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Training and evaluation for the point cloud alignment experiment.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import utils -from absl import flags -import numpy as np -import tensorflow as tf -import glob -import pathlib -import datetime -from npu_bridge.npu_init import * -from npu_bridge.estimator.npu.npu_estimator import NPUEstimatorSpec -from npu_bridge.estimator.npu.npu_estimator import NPUEstimator -from npu_bridge.estimator.npu.npu_config import NPURunConfig -from npu_bridge.estimator.npu.npu_config import ProfilingConfig -#import precision_tool.tf_config as npu_tf_config - -#import precision_tool.config.config as CONFIG - -import os -FLAGS = tf.app.flags.FLAGS - -# General flags. -flags.DEFINE_string('method', 'svd', - 'Specifies the method to use for predicting rotations. ' - 'Choices are "svd", "svd-inf", or "gs".') -flags.DEFINE_string('checkpoint_dir', '', - 'Locations for checkpoints, summaries, etc.') -flags.DEFINE_integer('train_steps', 2600000, 'Number of training iterations.') -flags.DEFINE_integer('save_checkpoints_steps', 10000, - 'How often to save checkpoints') -flags.DEFINE_integer('log_step_count', 500, 'How often to log the step count') -flags.DEFINE_integer('save_summaries_steps', 5000, - 'How often to save summaries.') -flags.DEFINE_float('learning_rate', 1e-5, 'Learning rate') -flags.DEFINE_boolean('lr_decay', False, 'Decay the learning rate if True.') -flags.DEFINE_integer('lr_decay_steps', 35000, - 'Learning rate decays steps.') -flags.DEFINE_float('lr_decay_rate', 0.95, - 'Learning rate decays rate.') -flags.DEFINE_boolean('predict_all_test', False, - 'If true, runs an eval job on latest checkpoint and ' - 'prints the error for each input.') -flags.DEFINE_integer('eval_examples', 39900, 'Number of test examples.') -flags.DEFINE_boolean('print_variable_names', False, - 'Print model variable names.') - -# Flags only used in the point cloud alignment experiment. -flags.DEFINE_integer('num_train_augmentations', 10, - 'Number of random rotations for augmenting each input ' - 'point cloud.') -flags.DEFINE_string('pt_cloud_train_files', '', - 'Expression matching all training point files, e.g. ' - '/path/to/files/pc_plane/points/*.pts') -flags.DEFINE_string('pt_cloud_test_files','', - 'Expression matching all modified test point files, e.g. ' - '/path/to/files/pc_plane/points_test/*.pts') -flags.DEFINE_boolean('random_rotation_axang', True, - 'If true, samples random rotations using the method ' - 'from the original benchmark code. 
Otherwise samples ' - 'by Haar measure.') -flags.DEFINE_boolean('Profiling',True, - 'parse NPU operator performance') - -flags.DEFINE_boolean('Dump',False, - 'overflow test') - - -def pt_features(batch_pts): - """Input shape: [B, N, 3], output shape: [B, 1024].""" - with tf.variable_scope('ptenc', reuse=tf.AUTO_REUSE): - f1 = tf.layers.conv1d(inputs=batch_pts, filters=64, kernel_size=1) - f1 = tf.nn.leaky_relu(f1) - f2 = tf.layers.conv1d(inputs=f1, filters=128, kernel_size=1) - f2 = tf.nn.leaky_relu(f2) - f3 = tf.layers.conv1d(inputs=f2, filters=1024, kernel_size=1) - f = tf.reduce_max(f3, axis=1, keep_dims=False) - return f - - -def regress_from_features(batch_features, out_dim): - """Regress to a rotation representation from point cloud encodings. - - In Zhou et al, CVPR19, the paper describes this regression network as an MLP - mapping 2048->512->512->out_dim, but the associated code implements it with - one less layer: 2048->512->out_dim. We mimic the code. - - Args: - batch_features: [batch_size, in_dim]. - out_dim: desired output dimensionality. - - Returns: - A [batch_size, out_dim] tensor. - """ - f1 = tf.layers.dense(batch_features, 512) - f1 = tf.nn.leaky_relu(f1) - f2 = tf.layers.dense(f1, out_dim) - return f2 - - -def net_point_cloud(points1, points2, mode): - """Predict a relative rotation given two point clouds. - - Args: - points1: [batch_size, N, 3] float tensor. - points2: [batch_size, N, 3] float tensor. - mode: tf.estimator.ModeKeys. - - Returns: - [batch_size, 3, 3] matrices. - """ - f1 = pt_features(points1) - f2 = pt_features(points2) - f = tf.concat([f1, f2], axis=-1) - - if FLAGS.method == 'svd': - p = regress_from_features(f, 9) - return utils.symmetric_orthogonalization(p) - - if FLAGS.method == 'svd-inf': - p = regress_from_features(f, 9) - if mode == tf.estimator.ModeKeys.TRAIN: - return tf.reshape(p, (-1, 3, 3)) - else: - return utils.symmetric_orthogonalization(p) - - if FLAGS.method == 'gs': - p = regress_from_features(f, 6) - return utils.gs_orthogonalization(p) - - -def model_fn(features, labels, mode, params): - """The model_fn used to construct the tf.Estimator.""" - del labels, params # Unused. - if mode == tf.estimator.ModeKeys.TRAIN: - # Training data has point cloud of size [1, N, 3] and random rotations - # of size [1, FLAGS.num_train_augmentations, 3, 3] - rot = features['rot'][0] - num_rot = FLAGS.num_train_augmentations - batch_pts1 = tf.tile(features['data'], [num_rot, 1, 1]) - # In this experiment it does not matter if we pre or post-multiply the - # rotation as long as we are consistent between training and eval. - batch_pts2 = tf.matmul(batch_pts1, rot) # post-multiplying! - else: - # Test data has point cloud of size [1, N, 3] and a single random - # rotation of size [1, 3, 3] - batch_pts1 = features['data'] - rot = features['rot'] - batch_pts2 = tf.matmul(batch_pts1, rot) - rot = tf.reshape(rot, (-1, 3, 3)) - - # Predict the rotation. - r = net_point_cloud(batch_pts1, batch_pts2, mode) - - # Compute the loss. - loss = tf.nn.l2_loss(rot - r) - - # Compute the relative angle in radians. - theta = utils.relative_angle(r, rot) - - # Mean angle error over the batch. - mean_theta = tf.reduce_mean(theta) - mean_theta_deg = mean_theta * 180.0 / np.pi - - # Train, eval, or predict depending on mode. 
- if mode == tf.estimator.ModeKeys.TRAIN: - tf.summary.scalar('train/loss', loss) - tf.summary.scalar('train/theta', mean_theta_deg) - global_step = tf.train.get_or_create_global_step() - - if FLAGS.lr_decay: - learning_rate = tf.train.exponential_decay( - FLAGS.learning_rate, - global_step, - FLAGS.lr_decay_steps, - FLAGS.lr_decay_rate) - else: - learning_rate = FLAGS.learning_rate - - tf.summary.scalar('train/learning_rate', learning_rate) - optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) - update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - with tf.control_dependencies(update_ops): - train_op = optimizer.minimize(loss, global_step=global_step) - #train_op = util.set_graph_exec_config(train_op, dynamic_input=True,dynamic_graph_execute_mode="lazy_recompile") - - return NPUEstimatorSpec( - mode=mode, - loss=loss, - train_op=train_op) - - if mode == tf.estimator.ModeKeys.EVAL: - if FLAGS.predict_all_test: - print_error_op = tf.print('error:', mean_theta_deg) - with tf.control_dependencies([print_error_op]): - eval_metric_ops = { - 'mean_degree_err': tf.metrics.mean(mean_theta_deg), - } - else: - eval_metric_ops = { - 'mean_degree_err': tf.metrics.mean(mean_theta_deg), - } - - return NPUEstimatorSpec( - mode=mode, - loss=loss, - eval_metric_ops=eval_metric_ops) - - if mode == tf.estimator.ModeKeys.PREDICT: - pred = {'error': mean_theta_deg} - return NPUEstimatorSpec( - mode=mode, - predictions=pred) - - -def train_input_fn(): - """Generate training data iterator from the .pts files.""" - def _file_to_matrix(pts_path): - """Read Nx3 point cloud from a .pts file.""" - file_buffer = tf.read_file(pts_path) - lines = tf.string_split([file_buffer], delimiter='\n') - lines1 = tf.string_split(lines.values, delimiter='\r') - values = tf.stack(tf.decode_csv(lines1.values, - record_defaults=[[0.0], [0.0], [0.0]], - field_delim=' ')) - values = tf.transpose(values) # 3xN --> Nx3. - values = values[:tf.shape(values)[0] // 2, :] - - # modified by Lyoung - diff_num = 1500-tf.shape(values)[0] - repeat_pts = tf.tile(tf.reshape(values[1, :], (1, -1)), [diff_num, 1]) - values = tf.concat([values,repeat_pts], axis=0) - # The experiment code in - # github.com/papagina/RotationContinuity/.../shapenet/code/train_pointnet.py - # only used the first half of the points in each file. - return values - - def _random_rotation(pts): - """Attach N random rotations to a point cloud.""" - if FLAGS.random_rotation_axang: - rotations = utils.random_rotation_benchmark(FLAGS.num_train_augmentations) - else: - rotations = utils.random_rotation(FLAGS.num_train_augmentations) - return pts,rotations - - pts_paths = tf.gfile.Glob(FLAGS.pt_cloud_train_files) - dataset = tf.data.Dataset.from_tensor_slices(pts_paths) - dataset = dataset.map(_file_to_matrix) - dataset = dataset.cache() # Comment out if memory cannot hold all the data. - dataset = dataset.shuffle(buffer_size=50, reshuffle_each_iteration=True) - dataset = dataset.repeat() - dataset = dataset.map(_random_rotation) - dataset = dataset.batch(1) - - iterator = tf.data.make_one_shot_iterator(dataset) - batch_data, batch_rot = iterator.get_next() - features_dict = {'data': batch_data, 'rot': batch_rot} - batch_size = tf.shape(batch_data)[0] - batch_labels_dummy = tf.zeros(shape=(batch_size, 1)) - return (features_dict, batch_labels_dummy) - - -def eval_input_fn(): - """Generate test data from *modified* .pts files. - - See README and comments below for details on how the data is modified. - - Returns: - A tuple of features and associated labels. 
- """ - def _file_to_matrix(pts_path): - """Read Nx3 point cloud and 3x3 rotation matrix from a .pts file. - - The test data is a modified version of the original files. For each .pts - file we have (1) added a 3x3 rotation matrix for testing, and (2) removed - the second half of the point cloud since it is not used at all. - - Args: - pts_path: path to a .pts file. - - Returns: - A Nx3 point cloud. - A 3x3 rotation matrix. - """ - file_buffer = tf.read_file(pts_path) - lines = tf.string_split([file_buffer], delimiter='\n') - lines1 = tf.string_split(lines.values,delimiter='\r') - values = tf.stack(tf.decode_csv(lines1.values, - record_defaults=[[0.0], [0.0], [0.0]],field_delim=' ')) - values = tf.transpose(values)# 3xN --> Nx3. - - #modified by Lyoung - diff_num = 1414-tf.shape(values)[0] - repeat_pts = tf.tile(tf.reshape(values[4, :], (1, -1)), [diff_num, 1]) - values = tf.concat([values, repeat_pts], axis=0) - # First three rows are the rotation matrix, remaining rows the point cloud. - rot = values[:3, :] - - return values[4:, :], rot - - pts_paths = tf.io.gfile.glob(FLAGS.pt_cloud_test_files) - dataset = tf.data.Dataset.from_tensor_slices(pts_paths) - dataset = dataset.map(_file_to_matrix) - dataset = dataset.batch(1) - iterator = tf.data.make_one_shot_iterator(dataset) - batch_data, batch_rot = iterator.get_next() - - features_dict = {'data': batch_data, 'rot': batch_rot} - batch_size = tf.shape(batch_data)[0] - batch_labels_dummy = tf.zeros(shape=(batch_size, 1)) - return (features_dict, batch_labels_dummy) - - -def print_variable_names(): - """Print variable names in a model.""" - params = {'dummy': 0} - estimator = NPUEstimator( - model_fn=model_fn, - model_dir=FLAGS.checkpoint_dir, - params=params) - - names = estimator.get_variable_names() - for name in names: - print(name) - - -def predict_all_test(): - """Print error statistics for the test dataset.""" - - config = NPURunConfig( - save_summary_steps=None, - save_checkpoints_steps=None, - log_step_count_steps=None, - keep_checkpoint_max=None) - - params = {'dummy': 0} - estimator = NPUEstimator( - model_fn=model_fn, - model_dir=FLAGS.checkpoint_dir, - params=params, - config=config) - evals = estimator.predict(input_fn=eval_input_fn, yield_single_examples=True) - - # Print error statistics. 
- - all_errors = [x['error'] for x in evals] - errors = np.array(all_errors) - print('Evaluated %d examples'%np.size(errors)) - print('Mean error: %f degrees',np.mean(errors)) - print('Median error: %f degrees', np.median(errors)) - print('Std: %f degrees', np.std(errors)) - sorted_errors = np.sort(errors) - n = np.size(sorted_errors) - print('\nPercentiles:') - for perc in range(1, 101): - index = np.int32(np.float32(n * perc) / 100.0) - 1 - print('%3d%%: %f'%(perc, sorted_errors[index])) - - -def train_and_eval(): - """Train and evaluate a model.""" - save_summary_steps = FLAGS.save_summaries_steps - save_checkpoints_steps = FLAGS.save_checkpoints_steps - log_step_count = FLAGS.log_step_count - - - # dump_config = npu_tf_config.estimator_dump_config(action='overflow') - # - profilingPath=os.path.join(FLAGS.checkpoint_dir,'npu_profiling') - if not os.path.exists(profilingPath): - os.makedirs(profilingPath) - - profiling_options= '{"output":"%s",\ - "task_trace":"on",\ - "aicpu":"on"}'%(profilingPath) - - profiling_config=ProfilingConfig(enable_profiling=True,profiling_options=profiling_options) - session_config=tf.ConfigProto() - - if FLAGS.Profiling: - config = NPURunConfig( - save_summary_steps=save_summary_steps, - save_checkpoints_steps=save_checkpoints_steps, - log_step_count_steps=log_step_count, - keep_checkpoint_max=None, - precision_mode = "allow_mix_precision", - profiling_config=profiling_config, - session_config=session_config, - customize_dtypes="/home/ma-user/modelarts/user-job-dir/code/switch_config.txt") - # if FLAGS.Dump == True: - # config = NPURunConfig( - # save_summary_steps=save_summary_steps, - # save_checkpoints_steps=save_checkpoints_steps, - # log_step_count_steps=log_step_count, - # keep_checkpoint_max=None, - # precision_mode="allow_mix_precision", - # dump_config=dump_config) - else: - config = NPURunConfig( - save_summary_steps=save_summary_steps, - save_checkpoints_steps=save_checkpoints_steps, - log_step_count_steps=log_step_count, - keep_checkpoint_max=None, - precision_mode="allow_mix_precision", - customize_dtypes="/home/ma-user/modelarts/user-job-dir/code/switch_config.txt") - - params = {'dummy': 0} - estimator = NPUEstimator( - model_fn=model_fn, - model_dir=FLAGS.checkpoint_dir, - config=config, - params=params) - - - train_spec = tf.estimator.TrainSpec( - input_fn=train_input_fn, - max_steps=FLAGS.train_steps) - - - eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, - start_delay_secs=60, - steps=FLAGS.eval_examples, - throttle_secs=60) - - - - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - - - -def main(argv=None): # pylint: disable=unused-argument - - if FLAGS.print_variable_names: - print_variable_names() - return - - if FLAGS.predict_all_test: - predict_all_test() - else: - train_and_eval() - - -if __name__ == '__main__': - - tf.app.run() -- Gitee From 5219bb67c323571b58c86b13448951cb51106800 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:17:50 +0000 Subject: [PATCH 08/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?= =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/frezze.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cv/SVD_ID2019_for_Tensorflow/frezze.py | 48 ------------------- 1 file changed, 48 deletions(-) delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py 
b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py deleted file mode 100644 index 6aeb2e1aa..000000000 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/frezze.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tensorflow as tf -from tensorflow.python.tools import freeze_graph -from npu_bridge.npu_init import * - -freeze_graph.freeze_graph( - input_saved_model_dir='save_model_path', - output_node_names='L2Loss', - output_graph='test.pb', - initializer_nodes='', - input_graph= None, - input_saver= False, - input_binary=False, - input_checkpoint=None, - restore_op_name=None, - filename_tensor_name=None, - clear_devices=False, - input_meta_graph=False) - -# if __name__ == '__main__': -# main() \ No newline at end of file -- Gitee From f22355ab77aca3ea177a263a2da6c559eb8530d5 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:18:35 +0000 Subject: [PATCH 09/20] update TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md. 
Signed-off-by: ab_dx_z <> --- TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md index e688a48f0..78822ea33 100644 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md @@ -245,6 +245,6 @@ checkpoint文件中最新模型的路径修改 | global_step/sec| 无 | 87.64 | 116.77 | ## 离线推理 -利用frezze.py文件将save_model模型保存为pb模型 +参考 SVD_ID2019_for_ACL -- Gitee From b42c9e065d94dac38e49335f0c05a55ba3622d5c Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:18:54 +0000 Subject: [PATCH 10/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?= =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/train=5Fful?= =?UTF-8?q?l=5F1p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train_full_1p.sh | 209 ------------------ 1 file changed, 209 deletions(-) delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh deleted file mode 100644 index edd5eed9a..000000000 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh +++ /dev/null @@ -1,209 +0,0 @@ -#!/bin/bash - -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## -# shell脚本所在路径 -cur_path=`echo $(cd $(dirname $0);pwd)` - -# 判断当前shell是否是performance -perf_flag=`echo $0 | grep performance | wc -l` - -# 当前执行网络的名称 -Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` - -export RANK_SIZE=1 -export RANK_ID=0 -export JOB_ID=10087 - -# 路径参数初始化 -data_path="" -output_path="" - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --data_path # dataset of training - --output_path # output of training - --train_steps # max_step for training - --train_epochs # max_epoch for training - --batch_size # batch size - -h/--help show help message - " - exit 1 -fi - -# 参数校验,不需要修改 -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --output_path* ]];then - output_path=`echo ${para#*=}` - elif [[ $para == --train_steps* ]];then - train_steps=`echo ${para#*=}` - elif [[ $para == --train_epochs* ]];then - train_epochs=`echo ${para#*=}` - elif [[ $para == --batch_size* ]];then - batch_size=`echo ${para#*=}` - fi -done - -# 校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be config" - exit 1 -fi - -# 校验是否传入output_path,不需要修改 -if [[ $output_path == "" ]];then - output_path="./test/output/${ASCEND_DEVICE_ID}" -fi - -# 设置打屏日志文件名,请保留,文件名为${print_log} -print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" -etp_flag=${etp_running_flag} -if [ x"${etp_flag}" != xtrue ]; -then - echo "running without etp..." 
- print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` - print_log="/home/ma-user/modelarts/log/${print_log_name}" -fi -echo ${print_log} - -CaseName="" -function get_casename() -{ - if [ x"${perf_flag}" = x1 ]; - then - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' - else - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' - fi -} - -# 跳转到code目录 -cd ${cur_path}/../ -rm -rf ./test/output/${ASCEND_DEVICE_ID} -mkdir -p ./test/output/${ASCEND_DEVICE_ID} - -# 训练开始时间记录,不需要修改 -start_time=$(date +%s) -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## - -#========================================================= -#========================================================= -#========训练执行命令,需要根据您的网络进行修改============== -#========================================================= -#========================================================= -# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 -# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 -# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 -batch_size=1 - -echo "------------------***PRINT DATA PATH***-----------------" -train_data="points/*.pts" -test_data="test_points_modified/*.pts" - -trainData_Path="$data_path$train_data" -testData_Path="$data_path$test_data" - - -if [ x"${modelarts_flag}" != x ]; -then - python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_boostPerf.py \ - --method=svd \ - --checkpoint_dir=${output_path} \ - --log_step_count=200 \ - --save_summaries_steps=25000 \ - --pt_cloud_train_files=${trainData_Path} \ - --pt_cloud_test_files=${testData_Path} \ - --train_steps=2600000 \ - --save_checkpoints_steps=100000 \ - --eval_examples=39900 -else - python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_boostPerf.py \ - --method=svd \ - --checkpoint_dir=${output_path} \ - --log_step_count=200 \ - --save_summaries_steps=25000 \ - --pt_cloud_train_files=${trainData_Path}\ - --pt_cloud_test_files=${testData_Path} \ - --train_steps=2600000 \ - --save_checkpoints_steps=100000 \ - --eval_examples=39900 > ${print_log} -fi - - -# 性能相关数据计算 -StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{ "%.2f\n", '${batch_size}'/'${StepTime}'}'` - -# 精度相关数据计算 -train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'` -# 提取所有loss打印信息 -grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt - - -########################################################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -########################################################### - -# 判断本次执行是否正确使用Ascend NPU -use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` -if [ x"${use_npu_flag}" == x0 ]; -then - echo "------------------ ERROR NOTICE START ------------------" - echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." - echo "------------------ ERROR NOTICE END------------------" -else - echo "------------------ INFO NOTICE START------------------" - echo "INFO, your task have used Ascend NPU, please check your result." 
- echo "------------------ INFO NOTICE END------------------" -fi - -# 获取最终的casename,请保留,case文件名为${CaseName} -get_casename - -# 重命名loss文件 -if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; -then - mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt -fi - -# 训练端到端耗时 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -echo "------------------ Final result ------------------" -# 输出性能FPS/单step耗时/端到端耗时 -echo "Final Performance images/sec : $FPS" -echo "Final Performance sec/step : $StepTime" -echo "E2E Training Duration sec : $e2e_time" - -# 输出训练精度 -echo "Final Train Accuracy : ${train_accuracy}" - -# 最后一个迭代loss值,不需要修改 -ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From c3fb510bc15b3e29a795774f623b9cc692457cd8 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:19:59 +0000 Subject: [PATCH 11/20] =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E6=94=B9=E4=B8=BAsaved=5Fmodel=E5=BD=A2=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ab_dx_z <> --- .../main_point_cloud_perf.py | 441 ++++++++++++++++++ .../train_full_1p.sh | 209 +++++++++ 2 files changed, 650 insertions(+) create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_perf.py create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_perf.py b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_perf.py new file mode 100644 index 000000000..316d7da20 --- /dev/null +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/main_point_cloud_perf.py @@ -0,0 +1,441 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
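+
+# Typical invocation (a sketch; the paths below are placeholders rather than
+# checked-in defaults):
+#   python main_point_cloud_perf.py --method=svd \
+#     --checkpoint_dir=/path/to/output \
+#     --pt_cloud_train_files='/path/to/points/*.pts' \
+#     --pt_cloud_test_files='/path/to/test_points_modified/*.pts'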
+ +"""Training and evaluation for the point cloud alignment experiment.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import utils +from absl import flags +import numpy as np +import tensorflow as tf +import glob +import pathlib +import datetime +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu.npu_estimator import NPUEstimatorSpec +from npu_bridge.estimator.npu.npu_estimator import NPUEstimator +from npu_bridge.estimator.npu.npu_config import NPURunConfig +import os +FLAGS = tf.app.flags.FLAGS + +# General flags. +flags.DEFINE_string('method', 'svd', + 'Specifies the method to use for predicting rotations. ' + 'Choices are "svd", "svd-inf", or "gs".') +flags.DEFINE_string('checkpoint_dir', '', + 'Locations for checkpoints, summaries, etc.') +flags.DEFINE_integer('train_steps', 2600000, 'Number of training iterations.') +flags.DEFINE_integer('save_checkpoints_steps', 10000, + 'How often to save checkpoints') +flags.DEFINE_integer('log_step_count', 500, 'How often to log the step count') +flags.DEFINE_integer('save_summaries_steps', 5000, + 'How often to save summaries.') +flags.DEFINE_float('learning_rate', 1e-5, 'Learning rate') +flags.DEFINE_boolean('lr_decay', False, 'Decay the learning rate if True.') +flags.DEFINE_integer('lr_decay_steps', 35000, + 'Learning rate decays steps.') +flags.DEFINE_float('lr_decay_rate', 0.95, + 'Learning rate decays rate.') +flags.DEFINE_boolean('predict_all_test', False, + 'If true, runs an eval job on latest checkpoint and ' + 'prints the error for each input.') +flags.DEFINE_integer('eval_examples', 39900, 'Number of test examples.') +flags.DEFINE_boolean('print_variable_names', False, + 'Print model variable names.') + +# Flags only used in the point cloud alignment experiment. +flags.DEFINE_integer('num_train_augmentations', 10, + 'Number of random rotations for augmenting each input ' + 'point cloud.') +flags.DEFINE_string('pt_cloud_train_files', '', + 'Expression matching all training point files, e.g. ' + '/path/to/files/pc_plane/points/*.pts') +flags.DEFINE_string('pt_cloud_test_files','', + 'Expression matching all modified test point files, e.g. ' + '/path/to/files/pc_plane/points_test/*.pts') +flags.DEFINE_boolean('random_rotation_axang', True, + 'If true, samples random rotations using the method ' + 'from the original benchmark code. Otherwise samples ' + 'by Haar measure.') + + +def pt_features(batch_pts): + """Input shape: [B, N, 3], output shape: [B, 1024].""" + with tf.variable_scope('ptenc', reuse=tf.AUTO_REUSE): + f1 = tf.layers.conv1d(inputs=batch_pts, filters=64, kernel_size=1) + f1 = tf.nn.leaky_relu(f1) + f2 = tf.layers.conv1d(inputs=f1, filters=128, kernel_size=1) + f2 = tf.nn.leaky_relu(f2) + f3 = tf.layers.conv1d(inputs=f2, filters=1024, kernel_size=1) + f = tf.reduce_max(f3, axis=1, keep_dims=False) + return f + + +def regress_from_features(batch_features, out_dim): + """Regress to a rotation representation from point cloud encodings. + + In Zhou et al, CVPR19, the paper describes this regression network as an MLP + mapping 2048->512->512->out_dim, but the associated code implements it with + one less layer: 2048->512->out_dim. We mimic the code. + + Args: + batch_features: [batch_size, in_dim]. + out_dim: desired output dimensionality. + + Returns: + A [batch_size, out_dim] tensor. 
+ """ + f1 = tf.layers.dense(batch_features, 512) + f1 = tf.nn.leaky_relu(f1) + f2 = tf.layers.dense(f1, out_dim) + return f2 + + +def net_point_cloud(points1, points2, mode): + """Predict a relative rotation given two point clouds. + + Args: + points1: [batch_size, N, 3] float tensor. + points2: [batch_size, N, 3] float tensor. + mode: tf.estimator.ModeKeys. + + Returns: + [batch_size, 3, 3] matrices. + """ + f1 = pt_features(points1) + f2 = pt_features(points2) + f = tf.concat([f1, f2], axis=-1) + + if FLAGS.method == 'svd': + p = regress_from_features(f, 9) + return utils.symmetric_orthogonalization(p) + + if FLAGS.method == 'svd-inf': + p = regress_from_features(f, 9) + if mode == tf.estimator.ModeKeys.TRAIN: + return tf.reshape(p, (-1, 3, 3)) + else: + return utils.symmetric_orthogonalization(p) + + if FLAGS.method == 'gs': + p = regress_from_features(f, 6) + return utils.gs_orthogonalization(p) + + +def model_fn(features, labels, mode, params): + """The model_fn used to construct the tf.Estimator.""" + del params # Unused. + if mode == tf.estimator.ModeKeys.TRAIN: + # Training data has point cloud of size [1, N, 3] and random rotations + # of size [1, FLAGS.num_train_augmentations, 3, 3] + rot = labels[0] + data = features + num_rot = FLAGS.num_train_augmentations + batch_pts1 = tf.tile(data, [num_rot, 1, 1]) + # In this experiment it does not matter if we pre or post-multiply the + # rotation as long as we are consistent between training and eval. + batch_pts2 = tf.matmul(batch_pts1, rot) # post-multiplying! + else: + # Test data has point cloud of size [1, N, 3] and a single random + # rotation of size [1, 3, 3] + batch_pts1 = features['data'] + rot = features['rot'] + batch_pts2 = tf.matmul(batch_pts1, rot) + rot = tf.reshape(rot, (-1, 3, 3)) + + # Predict the rotation. + + + r = net_point_cloud(batch_pts1, batch_pts2, mode) + + unit_one = tf.constant(1.0,dtype=tf.float32) + + rotation_matrix = tf.multiply(r,unit_one,name='rotation_matrix') + # Compute the loss. + loss = tf.nn.l2_loss(rot - r) + + # Compute the relative angle in radians. + theta = utils.relative_angle(rot, r) + + # Mean angle error over the batch. + mean_theta = tf.reduce_mean(theta) + + mean_theta_deg = mean_theta * 180.0 / np.pi + + # Train, eval, or predict depending on mode. 
+ if mode == tf.estimator.ModeKeys.TRAIN: + tf.summary.scalar('train/loss', loss) + tf.summary.scalar('train/theta', mean_theta_deg) + global_step = tf.train.get_or_create_global_step() + + if FLAGS.lr_decay: + learning_rate = tf.train.exponential_decay( + FLAGS.learning_rate, + global_step, + FLAGS.lr_decay_steps, + FLAGS.lr_decay_rate) + else: + learning_rate = FLAGS.learning_rate + + tf.summary.scalar('train/learning_rate', learning_rate) + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(loss, global_step=global_step) + #train_op = util.set_graph_exec_config(train_op, dynamic_input=True,dynamic_graph_execute_mode="lazy_recompile") + + return NPUEstimatorSpec( + mode=mode, + loss=loss, + train_op=train_op, + predictions=rotation_matrix) + + if mode == tf.estimator.ModeKeys.EVAL: + if FLAGS.predict_all_test: + print_error_op = tf.print('error:', mean_theta_deg) + with tf.control_dependencies([print_error_op]): + eval_metric_ops = { + 'mean_degree_err': tf.metrics.mean(mean_theta_deg), + } + else: + eval_metric_ops = { + 'mean_degree_err': tf.metrics.mean(mean_theta_deg), + } + + return NPUEstimatorSpec( + mode=mode, + loss=loss, + eval_metric_ops=eval_metric_ops) + + if mode == tf.estimator.ModeKeys.PREDICT: + pred = {'error': mean_theta_deg} + return NPUEstimatorSpec( + mode=mode, + predictions=pred) + + +def train_input_fn(): + """Generate training data iterator from the .pts files.""" + def _file_to_matrix(pts_path): + """Read Nx3 point cloud from a .pts file.""" + file_buffer = tf.read_file(pts_path) + lines = tf.string_split([file_buffer], delimiter='\n') + lines1 = tf.string_split(lines.values, delimiter='\r') + values = tf.stack(tf.decode_csv(lines1.values, + record_defaults=[[0.0], [0.0], [0.0]], + field_delim=' ')) + values = tf.transpose(values) # 3xN --> Nx3. + values = values[:tf.shape(values)[0] // 2, :] + + # modified by Lyoung + diff_num = 1500-tf.shape(values)[0] + repeat_pts = tf.tile(tf.reshape(values[1, :], (1, -1)), [diff_num, 1]) + values = tf.concat([values,repeat_pts], axis=0) + # The experiment code in + # github.com/papagina/RotationContinuity/.../shapenet/code/train_pointnet.py + # only used the first half of the points in each file. + return values + + def _random_rotation(pts): + """Attach N random rotations to a point cloud.""" + if FLAGS.random_rotation_axang: + rotations = utils.random_rotation_benchmark(FLAGS.num_train_augmentations) + else: + rotations = utils.random_rotation(FLAGS.num_train_augmentations) + return pts,rotations + + pts_paths = tf.gfile.Glob(FLAGS.pt_cloud_train_files) + dataset = tf.data.Dataset.from_tensor_slices(pts_paths) + dataset = dataset.map(_file_to_matrix) + dataset = dataset.cache() # Comment out if memory cannot hold all the data. + dataset = dataset.shuffle(buffer_size=50, reshuffle_each_iteration=True) + dataset = dataset.repeat() + dataset = dataset.map(_random_rotation) + dataset = dataset.batch(1) + iterator = tf.data.make_one_shot_iterator(dataset) + batch_data, batch_rot = iterator.get_next() + #features_dict = {'data': batch_data, 'rot': batch_rot} + #batch_size = tf.shape(batch_data)[0] + #batch_labels_dummy = tf.zeros(shape=(batch_size, 1)) + return batch_data,batch_rot + + +def eval_input_fn(): + """Generate test data from *modified* .pts files. + + See README and comments below for details on how the data is modified. 
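+
+  In short: each modified .pts file stores a 3x3 rotation matrix in its first
+  three rows, followed by the point cloud (the unused second half of each
+  original cloud has been removed); see _file_to_matrix() below.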
+ + Returns: + A tuple of features and associated labels. + """ + def _file_to_matrix(pts_path): + """Read Nx3 point cloud and 3x3 rotation matrix from a .pts file. + + The test data is a modified version of the original files. For each .pts + file we have (1) added a 3x3 rotation matrix for testing, and (2) removed + the second half of the point cloud since it is not used at all. + + Args: + pts_path: path to a .pts file. + + Returns: + A Nx3 point cloud. + A 3x3 rotation matrix. + """ + file_buffer = tf.read_file(pts_path) + lines = tf.string_split([file_buffer], delimiter='\n') + lines1 = tf.string_split(lines.values,delimiter='\r') + values = tf.stack(tf.decode_csv(lines1.values, + record_defaults=[[0.0], [0.0], [0.0]],field_delim=' ')) + values = tf.transpose(values)# 3xN --> Nx3. + + #modified by Lyoung + diff_num = 1414-tf.shape(values)[0] + repeat_pts = tf.tile(tf.reshape(values[4, :], (1, -1)), [diff_num, 1]) + values = tf.concat([values, repeat_pts], axis=0) + # First three rows are the rotation matrix, remaining rows the point cloud. + rot = values[:3, :] + + return values[4:, :], rot + + pts_paths = tf.io.gfile.glob(FLAGS.pt_cloud_test_files) + dataset = tf.data.Dataset.from_tensor_slices(pts_paths) + dataset = dataset.map(_file_to_matrix) + dataset = dataset.batch(1) + iterator = tf.data.make_one_shot_iterator(dataset) + batch_data, batch_rot = iterator.get_next() + features_dict = {'data': batch_data, 'rot': batch_rot} + #batch_size = tf.shape(batch_data)[0] + #batch_labels_dummy = tf.zeros(shape=(batch_size, 1)) + return features_dict + + +def print_variable_names(): + """Print variable names in a model.""" + params = {'dummy': 0} + estimator = NPUEstimator( + model_fn=model_fn, + model_dir=FLAGS.checkpoint_dir, + params=params) + + names = estimator.get_variable_names() + for name in names: + print(name) + + +def predict_all_test(): + """Print error statistics for the test dataset.""" + + config = NPURunConfig( + save_summary_steps=None, + save_checkpoints_steps=None, + log_step_count_steps=None, + keep_checkpoint_max=None) + + params = {'dummy': 0} + estimator = NPUEstimator( + model_fn=model_fn, + model_dir=FLAGS.checkpoint_dir, + params=params, + config=config) + evals = estimator.predict(input_fn=eval_input_fn, yield_single_examples=True) + + # Print error statistics. 
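+  # (estimator.predict() yields one {'error': degrees} dict per test example;
+  # see the PREDICT branch of model_fn.)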
+
+  all_errors = [x['error'] for x in evals]
+  errors = np.array(all_errors)
+  print('Evaluated %d examples' % np.size(errors))
+  print('Mean error: %f degrees' % np.mean(errors))
+  print('Median error: %f degrees' % np.median(errors))
+  print('Std: %f degrees' % np.std(errors))
+  sorted_errors = np.sort(errors)
+  n = np.size(sorted_errors)
+  print('\nPercentiles:')
+  for perc in range(1, 101):
+    index = np.int32(np.float32(n * perc) / 100.0) - 1
+    print('%3d%%: %f' % (perc, sorted_errors[index]))
+
+def serving_input_fn():
+  # Receiver used by export_savedmodel(); the placeholders mirror the eval
+  # features dict ('data' and 'rot'). model_fn reshapes 'rot' to [-1, 3, 3].
+  input_data = tf.placeholder(tf.float32, [None, None, 3], name='data')
+  input_rot = tf.placeholder(tf.float32, [None, None, 3], name='rot')
+  input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
+      'data': input_data,
+      'rot': input_rot
+  })()
+  return input_fn
+
+
+def train_and_eval():
+  """Train and evaluate a model."""
+  save_summary_steps = FLAGS.save_summaries_steps
+  save_checkpoints_steps = FLAGS.save_checkpoints_steps
+  log_step_count = FLAGS.log_step_count
+
+  config = NPURunConfig(
+      save_summary_steps=save_summary_steps,
+      save_checkpoints_steps=save_checkpoints_steps,
+      log_step_count_steps=log_step_count,
+      keep_checkpoint_max=None,
+      precision_mode="allow_mix_precision")
+
+  params = {'dummy': 0}
+  estimator = NPUEstimator(
+      model_fn=model_fn,
+      model_dir=FLAGS.checkpoint_dir,
+      config=config,
+      params=params)
+
+  # train_spec = tf.estimator.TrainSpec(
+  #     input_fn=train_input_fn,
+  #     max_steps=FLAGS.train_steps)
+  #
+  # eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
+  #                                   start_delay_secs=60,
+  #                                   steps=FLAGS.eval_examples,
+  #                                   throttle_secs=60)
+
+  # tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
+
+  estimator.train(input_fn=train_input_fn,
+                  max_steps=FLAGS.train_steps)
+
+  estimator.export_savedmodel(FLAGS.checkpoint_dir,
+                              serving_input_fn)
+
+  estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.eval_examples)
+
+
+def main(argv=None):  # pylint: disable=unused-argument
+  if FLAGS.print_variable_names:
+    print_variable_names()
+    return
+
+  if FLAGS.predict_all_test:
+    predict_all_test()
+  else:
+    train_and_eval()
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh
new file mode 100644
index 000000000..137538832
--- /dev/null
+++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_full_1p.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+
+##########################################################
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+##########################################################
+# shell脚本所在路径
+cur_path=`echo $(cd $(dirname $0);pwd)`
+
+# 判断当前shell是否是performance
+perf_flag=`echo $0 | grep performance | wc -l`
+
+# 当前执行网络的名称
+Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'`
+
+export RANK_SIZE=1
+export RANK_ID=0
+export JOB_ID=10087
+
+# 路径参数初始化
+data_path=""
+output_path=""
+
+# 帮助信息,不需要修改
+if [[ $1 == --help || $1 == -h ]];then
+    echo "usage:./train_full_1p.sh "
+    echo " "
+    echo "parameter explain:
+    --data_path              # dataset of training
+    --output_path            # output of training
+    --train_steps            # max_step for training
+    --train_epochs           # max_epoch for training
+    --batch_size             # batch size
+    -h/--help                show help message
+    "
+    exit 1
+fi
+
+# 参数校验,不需要修改
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --output_path* ]];then
+        output_path=`echo ${para#*=}`
+    elif [[ $para == --train_steps* ]];then
+        train_steps=`echo ${para#*=}`
+    elif [[ $para == --train_epochs* ]];then
+        train_epochs=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    fi
+done
+
+# 校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# 校验是否传入output_path,不需要修改
+if [[ $output_path == "" ]];then
+    output_path="./test/output/${ASCEND_DEVICE_ID}"
+fi
+
+# 设置打屏日志文件名,请保留,文件名为${print_log}
+print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log"
+etp_flag=${etp_running_flag}
+if [ x"${etp_flag}" != xtrue ];
+then
+    echo "running without etp..."
+    print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank`
+    print_log="/home/ma-user/modelarts/log/${print_log_name}"
+fi
+echo ${print_log}
+
+CaseName=""
+function get_casename()
+{
+    if [ x"${perf_flag}" = x1 ];
+    then
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+    else
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc'
+    fi
+}
+
+# 跳转到code目录
+cd ${cur_path}/../
+rm -rf ./test/output/${ASCEND_DEVICE_ID}
+mkdir -p ./test/output/${ASCEND_DEVICE_ID}
+
+# 训练开始时间记录,不需要修改
+start_time=$(date +%s)
+##########################################################
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+##########################################################
+
+#=========================================================
+#=========================================================
+#========训练执行命令,需要根据您的网络进行修改==============
+#=========================================================
+#=========================================================
+# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取
+# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取
+# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值
+batch_size=1
+
+echo "------------------***PRINT DATA PATH***-----------------"
+train_data="points/*.pts"
+test_data="test_points_modified/*.pts"
+
+trainData_Path="$data_path$train_data"
+testData_Path="$data_path$test_data"
+
+
+if [ x"${modelarts_flag}" != x ];
+then
+    python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${output_path} \
+        --log_step_count=200 \
+        --save_summaries_steps=25000 \
+        --pt_cloud_train_files=${trainData_Path} \
+        --pt_cloud_test_files=${testData_Path} \
+        --train_steps=2600000 \
+        --save_checkpoints_steps=100000 \
+        --eval_examples=39900
+else
+    python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${output_path} \
+        --log_step_count=200 \
+        --save_summaries_steps=25000 \
+        --pt_cloud_train_files=${trainData_Path} \
+        --pt_cloud_test_files=${testData_Path} \
+        --train_steps=2600000 \
+        --save_checkpoints_steps=100000 \
+        --eval_examples=39900 > ${print_log}
+fi
+
+
+# 性能相关数据计算
+StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'`
+FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'`
+
+# 精度相关数据计算
+train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'`
+# 提取所有loss打印信息
+grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt
+
+
+###########################################################
+#########后面的所有内容请不要修改###########
+#########后面的所有内容请不要修改########################### +#########后面的所有内容请不要修改########################### +########################################################### + +# 判断本次执行是否正确使用Ascend NPU +use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." + echo "------------------ INFO NOTICE END------------------" +fi + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# 输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" + +# 输出训练精度 +echo "Final Train Accuracy : ${train_accuracy}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 0cc9ac9f5d7f548020e045066036dc66fa0dac9e Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:22:44 +0000 Subject: [PATCH 12/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?= =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/genStatisti?= =?UTF-8?q?cal.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../genStatistical.sh | 203 ------------------ 1 file changed, 203 deletions(-) delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh deleted file mode 100644 index 990ca29a5..000000000 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash - -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## -# shell脚本所在路径 -cur_path=`echo $(cd $(dirname 
$0);pwd)` - -# 判断当前shell是否是performance -perf_flag=`echo $0 | grep performance | wc -l` - -# 当前执行网络的名称 -Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` - -export RANK_SIZE=1 -export RANK_ID=0 -export JOB_ID=10087 - -# 路径参数初始化 -data_path="" -output_path="" - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --data_path # dataset of training - --output_path # output of training - --train_steps # max_step for training - --train_epochs # max_epoch for training - --batch_size # batch size - -h/--help show help message - " - exit 1 -fi - -# 参数校验,不需要修改 -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --output_path* ]];then - output_path=`echo ${para#*=}` - elif [[ $para == --train_steps* ]];then - train_steps=`echo ${para#*=}` - elif [[ $para == --train_epochs* ]];then - train_epochs=`echo ${para#*=}` - elif [[ $para == --batch_size* ]];then - batch_size=`echo ${para#*=}` - fi -done - -# 校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be config" - exit 1 -fi - -# 校验是否传入output_path,不需要修改 -if [[ $output_path == "" ]];then - output_path="./test/output/${ASCEND_DEVICE_ID}" -fi - -# 设置打屏日志文件名,请保留,文件名为${print_log} -print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" -etp_flag=${etp_running_flag} -if [ x"${etp_flag}" != xtrue ]; -then - echo "running without etp..." - print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` - print_log="/home/ma-user/modelarts/log/${print_log_name}" -fi -echo ${print_log} - -CaseName="" -function get_casename() -{ - if [ x"${perf_flag}" = x1 ]; - then - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' - else - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' - fi -} - -# 跳转到code目录 -cd ${cur_path}/../ -rm -rf ./test/output/${ASCEND_DEVICE_ID} -mkdir -p ./test/output/${ASCEND_DEVICE_ID} - -# 训练开始时间记录,不需要修改 -start_time=$(date +%s) -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## - -#========================================================= -#========================================================= -#========训练执行命令,需要根据您的网络进行修改============== -#========================================================= -#========================================================= -# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 -# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 -# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 -batch_size=1 - -echo "------------------***PRINT DATA PATH***-----------------" - -docuClass="test_points_modified/*.pts" -modelPath="output" - -testData_Path="$data_path$docuClass" -outputPath="$data_path$modelPath" - -echo "$outputPath" -echo "$testData_Path" - - -if [ x"${modelarts_flag}" != x ]; -then - python /home/ma-user/modelarts/user-job-dir/code/main_point_cloud.py \ - --method=svd \ - --checkpoint_dir=${outputPath} \ - --pt_cloud_test_files=${testData_Path} \ - --predict_all_test=True -else - python /home/ma-user/modelarts/user-job-dir/code/main_point_cloud.py \ - --method=svd \ - --checkpoint_dir=${outputPath} \ - --pt_cloud_test_files=${testData_Path} \ - --predict_all_test=True -fi - - -# 性能相关数据计算 -StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 
'BEGIN{ "%.2f\n", '${batch_size}'/'${StepTime}'}'` - -# 精度相关数据计算 -train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'` -# 提取所有loss打印信息 -grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt - - -########################################################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -########################################################### - -# 判断本次执行是否正确使用Ascend NPU -use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` -if [ x"${use_npu_flag}" == x0 ]; -then - echo "------------------ ERROR NOTICE START ------------------" - echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." - echo "------------------ ERROR NOTICE END------------------" -else - echo "------------------ INFO NOTICE START------------------" - echo "INFO, your task have used Ascend NPU, please check your result." - echo "------------------ INFO NOTICE END------------------" -fi - -# 获取最终的casename,请保留,case文件名为${CaseName} -get_casename - -# 重命名loss文件 -if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; -then - mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt -fi - -# 训练端到端耗时 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -echo "------------------ Final result ------------------" -# 输出性能FPS/单step耗时/端到端耗时 -echo "Final Performance images/sec : $FPS" -echo "Final Performance sec/step : $StepTime" -echo "E2E Training Duration sec : $e2e_time" - -# 输出训练精度 -echo "Final Train Accuracy : ${train_accuracy}" - -# 最后一个迭代loss值,不需要修改 -ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From 66d31493976ea8e3f7a54f163e24f3e181a5f5e4 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:22:53 +0000 Subject: [PATCH 13/20] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20Te?= =?UTF-8?q?nsorFlow/contrib/cv/SVD=5FID2019=5Ffor=5FTensorflow/train=5Fper?= =?UTF-8?q?formance=5F1p.sh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../train_performance_1p.sh | 209 ------------------ 1 file changed, 209 deletions(-) delete mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh deleted file mode 100644 
index ab8be4ed0..000000000 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh +++ /dev/null @@ -1,209 +0,0 @@ -#!/bin/bash - -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## -# shell脚本所在路径 -cur_path=`echo $(cd $(dirname $0);pwd)` - -# 判断当前shell是否是performance -perf_flag=`echo $0 | grep performance | wc -l` - -# 当前执行网络的名称 -Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` - -export RANK_SIZE=1 -export RANK_ID=0 -export JOB_ID=10087 - -# 路径参数初始化 -data_path="" -output_path="" - -# 帮助信息,不需要修改 -if [[ $1 == --help || $1 == -h ]];then - echo"usage:./train_performance_1P.sh " - echo " " - echo "parameter explain: - --data_path # dataset of training - --output_path # output of training - --train_steps # max_step for training - --train_epochs # max_epoch for training - --batch_size # batch size - -h/--help show help message - " - exit 1 -fi - -# 参数校验,不需要修改 -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --output_path* ]];then - output_path=`echo ${para#*=}` - elif [[ $para == --train_steps* ]];then - train_steps=`echo ${para#*=}` - elif [[ $para == --train_epochs* ]];then - train_epochs=`echo ${para#*=}` - elif [[ $para == --batch_size* ]];then - batch_size=`echo ${para#*=}` - fi -done - -# 校验是否传入data_path,不需要修改 -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be config" - exit 1 -fi - -# 校验是否传入output_path,不需要修改 -if [[ $output_path == "" ]];then - output_path="./test/output/${ASCEND_DEVICE_ID}" -fi - -# 设置打屏日志文件名,请保留,文件名为${print_log} -print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log" -etp_flag=${etp_running_flag} -if [ x"${etp_flag}" != xtrue ]; -then - echo "running without etp..." 
- print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank` - print_log="/home/ma-user/modelarts/log/${print_log_name}" -fi -echo ${print_log} - -CaseName="" -function get_casename() -{ - if [ x"${perf_flag}" = x1 ]; - then - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf' - else - CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc' - fi -} - -# 跳转到code目录 -cd ${cur_path}/../ -rm -rf ./test/output/${ASCEND_DEVICE_ID} -mkdir -p ./test/output/${ASCEND_DEVICE_ID} - -# 训练开始时间记录,不需要修改 -start_time=$(date +%s) -########################################################## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -#########第3行 至 100行,请一定不要、不要、不要修改########## -########################################################## - -#========================================================= -#========================================================= -#========训练执行命令,需要根据您的网络进行修改============== -#========================================================= -#========================================================= -# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取 -# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取 -# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值 -batch_size=1 - -echo "------------------***PRINT DATA PATH***-----------------" -train_data="points/*.pts" -test_data="sub_test/*.pts" - -trainData_Path="$data_path$train_data" -testData_Path="$data_path$test_data" - - -if [ x"${modelarts_flag}" != x ]; -then - python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_boostPerf.py \ - --method=svd \ - --checkpoint_dir=${output_path} \ - --log_step_count=200 \ - --save_summaries_steps=250 \ - --pt_cloud_train_files=${trainData_Path} \ - --pt_cloud_test_files=${testData_Path} \ - --train_steps=10000 \ - --save_checkpoints_steps=10000 \ - --eval_examples=399 -else - python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_boostPerf.py \ - --method=svd \ - --checkpoint_dir=${output_path} \ - --log_step_count=200 \ - --save_summaries_steps=250 \ - --pt_cloud_train_files=${trainData_Path}\ - --pt_cloud_test_files=${testData_Path} \ - --train_steps=10000 \ - --save_checkpoints_steps=10000 \ - --eval_examples=399> ${print_log} -fi - - -# 性能相关数据计算 -StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'` -FPS=`awk 'BEGIN{ "%.2f\n", '${batch_size}'/'${StepTime}'}'` - -# 精度相关数据计算 -train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'` -# 提取所有loss打印信息 -grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt - - -########################################################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -#########后面的所有内容请不要修改########################### -########################################################### - -# 判断本次执行是否正确使用Ascend NPU -use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l` -if [ x"${use_npu_flag}" == x0 ]; -then - echo "------------------ ERROR NOTICE START ------------------" - echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." - echo "------------------ ERROR NOTICE END------------------" -else - echo "------------------ INFO NOTICE START------------------" - echo "INFO, your task have used Ascend NPU, please check your result." 
- echo "------------------ INFO NOTICE END------------------" -fi - -# 获取最终的casename,请保留,case文件名为${CaseName} -get_casename - -# 重命名loss文件 -if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; -then - mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt -fi - -# 训练端到端耗时 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - -echo "------------------ Final result ------------------" -# 输出性能FPS/单step耗时/端到端耗时 -echo "Final Performance images/sec : $FPS" -echo "Final Performance sec/step : $StepTime" -echo "E2E Training Duration sec : $e2e_time" - -# 输出训练精度 -echo "Final Train Accuracy : ${train_accuracy}" - -# 最后一个迭代loss值,不需要修改 -ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_ti_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file -- Gitee From c0d20fee066cbf673d8b1d2df6129ca7f56cb5b5 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 13:23:36 +0000 Subject: [PATCH 14/20] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ab_dx_z <> --- .../genStatistical.sh | 203 +++++++++++++++++ .../train_performance_1p.sh | 209 ++++++++++++++++++ 2 files changed, 412 insertions(+) create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh create mode 100644 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh new file mode 100644 index 000000000..f9a760ef8 --- /dev/null +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/genStatistical.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` + +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` + +export RANK_SIZE=1 +export RANK_ID=0 +export JOB_ID=10087 + +# 路径参数初始化 +data_path="" +output_path="" + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message + " + exit 
1
+fi
+
+# 参数校验,不需要修改
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --output_path* ]];then
+        output_path=`echo ${para#*=}`
+    elif [[ $para == --train_steps* ]];then
+        train_steps=`echo ${para#*=}`
+    elif [[ $para == --train_epochs* ]];then
+        train_epochs=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    fi
+done
+
+# 校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# 校验是否传入output_path,不需要修改
+if [[ $output_path == "" ]];then
+    output_path="./test/output/${ASCEND_DEVICE_ID}"
+fi
+
+# 设置打屏日志文件名,请保留,文件名为${print_log}
+print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log"
+etp_flag=${etp_running_flag}
+if [ x"${etp_flag}" != xtrue ];
+then
+    echo "running without etp..."
+    print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank`
+    print_log="/home/ma-user/modelarts/log/${print_log_name}"
+fi
+echo ${print_log}
+
+CaseName=""
+function get_casename()
+{
+    if [ x"${perf_flag}" = x1 ];
+    then
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+    else
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc'
+    fi
+}
+
+# 跳转到code目录
+cd ${cur_path}/../
+rm -rf ./test/output/${ASCEND_DEVICE_ID}
+mkdir -p ./test/output/${ASCEND_DEVICE_ID}
+
+# 训练开始时间记录,不需要修改
+start_time=$(date +%s)
+##########################################################
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+##########################################################
+
+#=========================================================
+#=========================================================
+#========训练执行命令,需要根据您的网络进行修改==============
+#=========================================================
+#=========================================================
+# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取
+# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取
+# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值
+batch_size=1
+
+echo "------------------***PRINT DATA PATH***-----------------"
+
+docuClass="test_points_modified/*.pts"
+modelPath="output"
+
+testData_Path="$data_path$docuClass"
+outputPath="$data_path$modelPath"
+
+echo "$outputPath"
+echo "$testData_Path"
+
+
+if [ x"${modelarts_flag}" != x ];
+then
+    python /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${outputPath} \
+        --pt_cloud_test_files=${testData_Path} \
+        --predict_all_test=True
+else
+    python /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${outputPath} \
+        --pt_cloud_test_files=${testData_Path} \
+        --predict_all_test=True
+fi
+
+
+# 性能相关数据计算
+StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'`
+FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'`
+
+# 精度相关数据计算
+train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'`
+# 提取所有loss打印信息
+grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt
+
+
+###########################################################
+#########后面的所有内容请不要修改###########################
+#########后面的所有内容请不要修改###########################
+#########后面的所有内容请不要修改###########################
+###########################################################
+
+# 判断本次执行是否正确使用Ascend NPU
+use_npu_flag=`grep "The
model has been compiled on the Ascend AI processor" ${print_log} | wc -l` +if [ x"${use_npu_flag}" == x0 ]; +then + echo "------------------ ERROR NOTICE START ------------------" + echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration." + echo "------------------ ERROR NOTICE END------------------" +else + echo "------------------ INFO NOTICE START------------------" + echo "INFO, your task have used Ascend NPU, please check your result." + echo "------------------ INFO NOTICE END------------------" +fi + +# 获取最终的casename,请保留,case文件名为${CaseName} +get_casename + +# 重命名loss文件 +if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ]; +then + mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt +fi + +# 训练端到端耗时 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +echo "------------------ Final result ------------------" +# 输出性能FPS/单step耗时/端到端耗时 +echo "Final Performance images/sec : $FPS" +echo "Final Performance sec/step : $StepTime" +echo "E2E Training Duration sec : $e2e_time" + +# 输出训练精度 +echo "Final Train Accuracy : ${train_accuracy}" + +# 最后一个迭代loss值,不需要修改 +ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`) + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh new file mode 100644 index 000000000..330245b56 --- /dev/null +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/train_performance_1p.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +########################################################## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +#########第3行 至 100行,请一定不要、不要、不要修改########## +########################################################## +# shell脚本所在路径 +cur_path=`echo $(cd $(dirname $0);pwd)` + +# 判断当前shell是否是performance +perf_flag=`echo $0 | grep performance | wc -l` + +# 当前执行网络的名称 +Network=`echo $(cd $(dirname $0);pwd) | awk -F"/" '{print $(NF-1)}'` + +export RANK_SIZE=1 +export RANK_ID=0 +export JOB_ID=10087 + +# 路径参数初始化 +data_path="" +output_path="" + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo"usage:./train_performance_1P.sh " + echo " " + echo "parameter explain: + --data_path # dataset of training + --output_path # output of training + --train_steps # max_step for training + --train_epochs # max_epoch for training + --batch_size # batch size + -h/--help show help message + " + exit 1 +fi + +# 参数校验,不需要修改 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --output_path* ]];then + output_path=`echo 
${para#*=}`
+    elif [[ $para == --train_steps* ]];then
+        train_steps=`echo ${para#*=}`
+    elif [[ $para == --train_epochs* ]];then
+        train_epochs=`echo ${para#*=}`
+    elif [[ $para == --batch_size* ]];then
+        batch_size=`echo ${para#*=}`
+    fi
+done
+
+# 校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+# 校验是否传入output_path,不需要修改
+if [[ $output_path == "" ]];then
+    output_path="./test/output/${ASCEND_DEVICE_ID}"
+fi
+
+# 设置打屏日志文件名,请保留,文件名为${print_log}
+print_log="./test/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log"
+etp_flag=${etp_running_flag}
+if [ x"${etp_flag}" != xtrue ];
+then
+    echo "running without etp..."
+    print_log_name=`ls /home/ma-user/modelarts/log/ | grep proc-rank`
+    print_log="/home/ma-user/modelarts/log/${print_log_name}"
+fi
+echo ${print_log}
+
+CaseName=""
+function get_casename()
+{
+    if [ x"${perf_flag}" = x1 ];
+    then
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'perf'
+    else
+        CaseName=${Network}_bs${batch_size}_${RANK_SIZE}'p'_'acc'
+    fi
+}
+
+# 跳转到code目录
+cd ${cur_path}/../
+rm -rf ./test/output/${ASCEND_DEVICE_ID}
+mkdir -p ./test/output/${ASCEND_DEVICE_ID}
+
+# 训练开始时间记录,不需要修改
+start_time=$(date +%s)
+##########################################################
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+#########第3行 至 100行,请一定不要、不要、不要修改##########
+##########################################################
+
+#=========================================================
+#=========================================================
+#========训练执行命令,需要根据您的网络进行修改==============
+#=========================================================
+#=========================================================
+# 您的训练数据集在${data_path}路径下,请直接使用这个变量获取
+# 您的训练输出目录在${output_path}路径下,请直接使用这个变量获取
+# 您的其他基础参数,可以自定义增加,但是batch_size请保留,并且设置正确的值
+batch_size=1
+
+echo "------------------***PRINT DATA PATH***-----------------"
+train_data="points/*.pts"
+test_data="sub_test/*.pts"
+
+trainData_Path="$data_path$train_data"
+testData_Path="$data_path$test_data"
+
+
+if [ x"${modelarts_flag}" != x ];
+then
+    python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${output_path} \
+        --log_step_count=200 \
+        --save_summaries_steps=250 \
+        --pt_cloud_train_files=${trainData_Path} \
+        --pt_cloud_test_files=${testData_Path} \
+        --train_steps=10000 \
+        --save_checkpoints_steps=10000 \
+        --eval_examples=399
+else
+    python3.7 /home/ma-user/modelarts/user-job-dir/code/main_point_cloud_perf.py \
+        --method=svd \
+        --checkpoint_dir=${output_path} \
+        --log_step_count=200 \
+        --save_summaries_steps=250 \
+        --pt_cloud_train_files=${trainData_Path} \
+        --pt_cloud_test_files=${testData_Path} \
+        --train_steps=10000 \
+        --save_checkpoints_steps=10000 \
+        --eval_examples=399 > ${print_log}
+fi
+
+
+# 性能相关数据计算
+StepTime=`grep "sec/step :" ${print_log} | tail -n 10 | awk '{print $NF}' | awk '{sum+=$1} END {print sum/NR}'`
+FPS=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'/'${StepTime}'}'`
+
+# 精度相关数据计算
+train_accuracy=`grep "Final Accuracy accuracy" ${print_log} | awk '{print $NF}'`
+# 提取所有loss打印信息
+grep "loss :" ${print_log} | awk -F ":" '{print $4}' | awk -F "-" '{print $1}' > ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt
+
+
+###########################################################
+#########后面的所有内容请不要修改###########################
+#########后面的所有内容请不要修改###########################
+#########后面的所有内容请不要修改###########################
+###########################################################
+
+# 判断本次执行是否正确使用Ascend NPU
+use_npu_flag=`grep "The model has been compiled on the Ascend AI processor" ${print_log} | wc -l`
+if [ x"${use_npu_flag}" == x0 ];
+then
+    echo "------------------ ERROR NOTICE START ------------------"
+    echo "ERROR, your task haven't used Ascend NPU, please check your npu Migration."
+    echo "------------------ ERROR NOTICE END------------------"
+else
+    echo "------------------ INFO NOTICE START------------------"
+    echo "INFO, your task have used Ascend NPU, please check your result."
+    echo "------------------ INFO NOTICE END------------------"
+fi
+
+# 获取最终的casename,请保留,case文件名为${CaseName}
+get_casename
+
+# 重命名loss文件
+if [ -f ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ];
+then
+    mv ./test/output/${ASCEND_DEVICE_ID}/my_output_loss.txt ./test/output/${ASCEND_DEVICE_ID}/${CaseName}_loss.txt
+fi
+
+# 训练端到端耗时
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+echo "------------------ Final result ------------------"
+# 输出性能FPS/单step耗时/端到端耗时
+echo "Final Performance images/sec : $FPS"
+echo "Final Performance sec/step : $StepTime"
+echo "E2E Training Duration sec : $e2e_time"
+
+# 输出训练精度
+echo "Final Train Accuracy : ${train_accuracy}"
+
+# 最后一个迭代loss值,不需要修改
+ActualLoss=(`awk 'END {print $NF}' $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}_loss.txt`)
+
+#关键信息打印到${CaseName}.log中,不需要修改
+echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${batch_size}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = `uname -m`" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${FPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${StepTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
-- Gitee

From 075fa9c1b7001beac4b37a349f15be2fe370751f Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 14:08:14 +0000
Subject: [PATCH 15/20] update
 TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md.
Signed-off-by: ab_dx_z <> --- TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md index 78822ea33..cbb95c3f1 100644 --- a/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md +++ b/TensorFlow/contrib/cv/SVD_ID2019_for_Tensorflow/README.md @@ -28,7 +28,8 @@ - 参考实现: - 数据下载百度网盘链接: + 数据下载百度网盘链接:https://pan.baidu.com/s/1up1HW6McgSor3JF0yqQZSA +提取码:2019 共有3个数据集 @@ -38,7 +39,8 @@ 第一步旋转后的数据集 test_points_modified - npu训练出来的模型下载百度网盘链接: + npu训练出来的模型下载百度网盘链接:https://pan.baidu.com/s/1JU1koZR7uGlkKfRYIk8tsw +提取码:2019 -- Gitee From 483f27ac20d25c7fe3ed550f930645437413942d Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 14:11:54 +0000 Subject: [PATCH 16/20] =?UTF-8?q?=E6=96=B0=E5=BB=BA=20SVD-ID2019=5Ffor=5FA?= =?UTF-8?q?CL?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/.keep diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/.keep b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/.keep new file mode 100644 index 000000000..e69de29bb -- Gitee From c16d449cbf1b7d915668af9923085549e0d425e1 Mon Sep 17 00:00:00 2001 From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com> Date: Sat, 22 Oct 2022 14:12:33 +0000 Subject: [PATCH 17/20] =?UTF-8?q?=E7=A6=BB=E7=BA=BF=E6=8E=A8=E7=90=86?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ab_dx_z <> --- .../cv/SVD-ID2019_for_ACL/utils_gpu.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/utils_gpu.py diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/utils_gpu.py b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/utils_gpu.py new file mode 100644 index 000000000..ff47d69c2 --- /dev/null +++ b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/utils_gpu.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions.""" +import numpy as np +from scipy.stats import special_ortho_group +import tensorflow as tf + + +def relative_angle(r1, r2): + """Relative angle (radians) between 3D rotation matrices.""" + rel_rot = tf.matmul(tf.transpose(r1, perm=[0, 2, 1]), r2) + trace = rel_rot[:, 0, 0] + rel_rot[:, 1, 1] + rel_rot[:, 2, 2] + cos_theta = (trace - 1.0) / 2.0 + cos_theta = tf.minimum(cos_theta, tf.ones_like(cos_theta)) + cos_theta = tf.maximum(cos_theta, (-1.0) * tf.ones_like(cos_theta)) + theta = tf.acos(cos_theta) + return theta + + +def random_rotation_benchmark_np(n): + """Sample a random 3D rotation by method used in Zhou et al, CVPR19. 
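+
+  Note: this axis-angle scheme matches the original benchmark and is not the
+  Haar (uniform) measure on SO(3); see random_rotation() below for Haar
+  sampling.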
+
+  This numpy function is a copy of the PyTorch function
+  get_sampled_rotation_matrices_by_axisAngle() in the code made available
+  for Zhou et al, CVPR19, at https://github.com/papagina/RotationContinuity/.
+
+  Args:
+    n: the number of rotation matrices to return.
+
+  Returns:
+    [n, 3, 3] np array.
+  """
+  theta = np.random.uniform(-1, 1, n) * np.pi
+  sin = np.sin(theta)
+  axis = np.random.randn(n, 3)
+  axis = axis / np.maximum(np.linalg.norm(axis, axis=-1, keepdims=True), 1e-7)
+  qw = np.cos(theta)
+  qx = axis[:, 0] * sin
+  qy = axis[:, 1] * sin
+  qz = axis[:, 2] * sin
+
+  xx = qx*qx
+  yy = qy*qy
+  zz = qz*qz
+  xy = qx*qy
+  xz = qx*qz
+  yz = qy*qz
+  xw = qx*qw
+  yw = qy*qw
+  zw = qz*qw
+
+  row0 = np.stack((1-2*yy-2*zz, 2*xy-2*zw, 2*xz+2*yw), axis=-1)
+  row1 = np.stack((2*xy+2*zw, 1-2*xx-2*zz, 2*yz-2*xw), axis=-1)
+  row2 = np.stack((2*xz-2*yw, 2*yz+2*xw, 1-2*xx-2*yy), axis=-1)
+  matrix = np.stack((row0, row1, row2), axis=1)
+
+  return matrix
+
+
+def random_rotation_benchmark(n):
+  """A TF wrapper for random_rotation_benchmark_np()."""
+  mat = tf.py_func(
+      func=lambda t: np.float32(random_rotation_benchmark_np(t)),
+      inp=[n],
+      Tout=tf.float32,
+      stateful=True)
+  return tf.reshape(mat, (n, 3, 3))
+
+
+def random_rotation(n):
+  """Sample rotations from a uniform distribution on SO(3)."""
+  mat = tf.py_func(
+      func=lambda t: np.float32(special_ortho_group.rvs(3, size=t)),
+      inp=[n],
+      Tout=tf.float32,
+      stateful=True)
+  return tf.reshape(mat, (n, 3, 3))
+
+
+def symmetric_orthogonalization(x):
+  """Maps 9D input vectors onto SO(3) via symmetric orthogonalization."""
+  # Inner dimensions of the input should be 3x3 matrices.
+  m = tf.reshape(x, (-1, 3, 3))
+  _, u, v = tf.svd(m)
+  # Flip the sign of the last column of u when det(u v^T) = -1, so the result
+  # is a proper rotation (determinant +1) rather than a reflection.
+  det = tf.linalg.det(tf.matmul(u, v, transpose_b=True))
+  r = tf.matmul(
+      tf.concat([u[:, :, :-1], u[:, :, -1:] * tf.reshape(det, [-1, 1, 1])], 2),
+      v, transpose_b=True)
+  return r
+
+
+def gs_orthogonalization(p6):
+  """Gram-Schmidt orthogonalization from a 6D input."""
+  # The input should be [batch_size, 6].
+  x = p6[:, 0:3]
+  y = p6[:, 3:6]
+  xn = tf.math.l2_normalize(x, axis=-1)
+  z = tf.linalg.cross(xn, y)
+  zn = tf.math.l2_normalize(z, axis=-1)
+  y = tf.linalg.cross(zn, xn)
+  r = tf.stack([xn, y, zn], -1)
+  return r
-- Gitee

From 56a0181f45a11c1cc0c8be6502df82b5caa17fc7 Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 14:13:56 +0000
Subject: [PATCH 18/20] add ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL.

Signed-off-by: ab_dx_z <>
---
 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md

diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md
new file mode 100644
index 000000000..e69de29bb
-- Gitee

From 805be11a12385b4d1d381096ef85e57f3c52331b Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 14:14:32 +0000
Subject: [PATCH 19/20] update ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md.
Signed-off-by: ab_dx_z <>
---
 .../contrib/cv/SVD-ID2019_for_ACL/README.md | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md
index e69de29bb..25ea1bb75 100644
--- a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md
+++ b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/README.md
@@ -0,0 +1,89 @@
+# SVD_ID2019_for_ACL
+
+#### Overview
+Given two 3D point clouds, the SVD orthogonalization procedure SVDO+(M) projects the network output onto SO(3), and the network is asked to predict the 3D rotation that best aligns the two clouds.
+
+- Open-source code (used for training):
+
+  https://github.com/google-research/google-research/tree/master/special_orthogonalization
+
+- Reference paper:
+
+  [An Analysis of SVD for Deep Rotation Estimation](https://arxiv.org/abs/2006.14616)
+
+#### Datasets
+
+Training dataset: points
+
+Test dataset: points_test
+
+Rotated dataset: points_test_modified
+
+#### Freezing the model
+- Direct download
+
+Download directly from Baidu Netdisk.
+Link: https://pan.baidu.com/s/17zKWq2aY06cF9IQW6htn_A
+Extraction code: 2019
+
+- Obtain through training
+
+Baidu Netdisk link for the saved_model produced by training: https://pan.baidu.com/s/1Y4ato6Ob-6-rcXr31AvgoA
+Extraction code: 2019
+
+1. Train following the procedure in SVD_ID2019_for_Tensorflow; the model is saved in saved_model format.
+
+2. Freeze the saved_model files into a pb file (the paths in frezze.py must be modified first):
+
+python frezze.py
+
+This produces svd.pb
+
+#### Converting the pb file to an om model with the ATC tool
+Example command line:
+
+atc --model=/home/test_user04/svd.pb --framework=3 --output=/home/test_user04/svd --soc_version=Ascend310 --input_shape="data_1:1,1410,3;rot_1:1,3,3"
+
+Mind the Ascend chip model of the machine you are using.
+
+Baidu Netdisk link to download the converted model directly: https://pan.baidu.com/s/14-m0ZhPQyIr8enpUgVytpg
+Extraction code: 2019
+
+This produces svd.om
+
+#### Building the dataset
+- Direct download: the dataset is in svd_inference/data_1
+
+- Build it yourself
+
+Link to the raw data: https://pan.baidu.com/s/1aGAO3os8ifDnYm1yXrxndQ
+Extraction code: 2019
+
+Use pts2txt.py to build the dataset (remember to modify the data path, which should be /xxx/points_test_modified/*.pts, as well as the output path for the generated dataset):
+
+python pts2txt.py
+
+#### Getting the offline-inference output bin files
+
+The inference files are in the svd_inference archive. Baidu Netdisk download link: https://pan.baidu.com/s/1OfCxHMUJcnyqp2IcvV3eWg
+Extraction code: 2019
+
+The script is in the src folder; run it directly:
+
+python svdom_inference.py
+
+Baidu Netdisk link to download the inference results directly: https://pan.baidu.com/s/1NFNfJkTUW4u7YJcaHK9mLw
+Extraction code: 2019
+
+#### Verifying inference accuracy with the output bin files
+
+Run the script:
+
+python calc_acc.py
+
+This yields an inference accuracy (mean angular error in degrees) of 3.150504164928697,
+
+which is close to the online-inference accuracy.
+
+Baidu Netdisk link covering all of the files above: https://pan.baidu.com/s/1sR8gYK8jM6xCZwbq7eK50A
+Extraction code: 2019
\ No newline at end of file
-- Gitee

From 7a0b464ef4805ea6bbbdd10245145e3eec0fed9a Mon Sep 17 00:00:00 2001
From: ab_dx_z <10158481+ab_dx_z@user.noreply.gitee.com>
Date: Sat, 22 Oct 2022 14:15:37 +0000
Subject: [PATCH 20/20] =?UTF-8?q?=E7=A6=BB=E7=BA=BF=E6=8E=A8=E7=90=86?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: ab_dx_z <>
---
 .../contrib/cv/SVD-ID2019_for_ACL/calc_acc.py | 33 ++++++++++++
 .../contrib/cv/SVD-ID2019_for_ACL/frezze.py   | 50 +++++++++++++++++++
 .../contrib/cv/SVD-ID2019_for_ACL/pts2txt.py  | 44 ++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/calc_acc.py
 create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/frezze.py
 create mode 100644 ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/pts2txt.py

diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/calc_acc.py b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/calc_acc.py
new file mode 100644
index 000000000..b71c645e3
--- /dev/null
+++ b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/calc_acc.py
@@ -0,0 +1,33 @@
+import numpy as np
+import glob
+import pathlib
+import os
+import utils_gpu
+import tensorflow as tf
+
+INFERENCE_DIR = "C:/Users/1young/Desktop/svd_output/svd_output/*.bin"
+TEST_DIR = "D:/svd_code2/data_1"
+tf.enable_eager_execution()
+
+
+def main():
+    input_test_files = glob.glob(INFERENCE_DIR)
+    mean_err = []
+
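+    # Each inference output .bin shares its file stem with a ground-truth .txt
+    # in TEST_DIR; the first three rows of that file hold the true rotation.
+    # The error reported per file is the mean relative angle, in degrees,
+    # between the predicted and true rotations.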
+    for in_file in input_test_files:
+        out_file_prefix = pathlib.Path(in_file).stem
+        rot_path = os.path.join(TEST_DIR, '%s.txt' % out_file_prefix)
+        rot = np.loadtxt(rot_path)[:3, :].reshape((-1, 3, 3))
+        r = np.loadtxt(in_file).reshape((-1, 3, 3))
+        theta = utils_gpu.relative_angle(rot, r)
+        mean_theta = tf.reduce_mean(theta)
+        mean_theta_deg = mean_theta * 180.0 / np.pi
+        mean_theta_deg = mean_theta_deg.numpy()
+        mean_err.append(mean_theta_deg)
+    print("mean error (degrees):")
+    print(np.mean(np.array(mean_err)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/frezze.py b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/frezze.py
new file mode 100644
index 000000000..70014c274
--- /dev/null
+++ b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/frezze.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+from tensorflow.python.tools import freeze_graph
+# from npu_bridge.npu_init import *
+
+saved_model_path = 'C:/Users/1young/Desktop/1666004668'
+
+
+def main():
+    # Freeze the saved_model into a single pb graph; 'rotation_matrix' is the
+    # output node holding the predicted rotation.
+    freeze_graph.freeze_graph(
+        input_saved_model_dir=saved_model_path,
+        output_node_names='rotation_matrix',
+        output_graph='svd.pb',
+        initializer_nodes='',
+        input_graph=None,
+        input_saver=False,
+        input_binary=False,
+        input_checkpoint=None,
+        restore_op_name=None,
+        filename_tensor_name=None,
+        clear_devices=False,
+        input_meta_graph=False)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/pts2txt.py b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/pts2txt.py
new file mode 100644
index 000000000..01f84fb3a
--- /dev/null
+++ b/ACL_TensorFlow/contrib/cv/SVD-ID2019_for_ACL/pts2txt.py
@@ -0,0 +1,44 @@
+import os
+import numpy as np
+import tensorflow as tf
+import glob
+import pathlib
+
+# Raw strings keep the Windows backslashes from being read as escape sequences.
+inputpath = r"D:\special_orthogonalization\points_test_modified\*.pts"
+outputpath = r"D:\svd_code2\data_1"
+
+
+tf.enable_eager_execution()
+
+
+def data_processing(pts_path):
+    # Split the .pts file into lines and strip stray Windows '\r' characters.
+    file_buffer = tf.read_file(pts_path)
+    lines = tf.string_split([file_buffer], delimiter='\n')
+    lines1 = tf.string_split(lines.values, delimiter='\r')
+    values = tf.stack(tf.decode_csv(lines1.values,
+                                    record_defaults=[[0.0], [0.0], [0.0]],
+                                    field_delim=' '))
+    values = tf.transpose(values)  # 3xN --> Nx3.
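+    # Pad to a fixed row count (1414) by repeating one of the points, so every
+    # sample has the same shape for the offline model's static inputs.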
+    diff_num = 1414 - tf.shape(values)[0]
+    repeat_pts = tf.tile(tf.reshape(values[4, :], (1, -1)), [diff_num, 1])
+
+    values = tf.concat([values, repeat_pts], axis=0)
+    # First three rows are the rotation matrix, remaining rows the point cloud.
+    return values.numpy()
+
+
+def file_save(path, datapath):
+    input_test_files = glob.glob(path)
+    for in_file in input_test_files:
+        out_file_prefix = pathlib.Path(in_file).stem
+        values = data_processing(in_file)
+        out_file1 = os.path.join(
+            datapath, '%s.txt' % out_file_prefix)
+        np.savetxt(out_file1, values)
+
+
+def main():
+    os.makedirs(outputpath, exist_ok=True)
+    file_save(inputpath, outputpath)
+
+
+if __name__ == '__main__':
+    main()
-- Gitee