From b5e0202a0705f7777f09492003a3bc5db1a27fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 07:12:51 +0000 Subject: [PATCH 01/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh. --- .../SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh index fe078086b..978a48725 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh @@ -91,7 +91,7 @@ echo "Final Train Accuracy : ${train_accuracy}" #训练用例信息,不需要修改 BatchSize=${batch_size} DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' ##获取性能数据,不需要修改 #吞吐量 -- Gitee From 2a6f192d836181b28ef20575b2866f0b3d045dc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 07:21:52 +0000 Subject: [PATCH 02/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index 5cb5099ea..fc1fb726a 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -131,7 +131,9 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 - 数据集准备 1. 模型训练使用Pascal VOC数据集,数据集请用户自行获取。 - + ``` + bash download-data.sh + ``` 2. 
数据集训练前需要做预处理操作 ``` ./process_dataset.py -- Gitee From 653a4b9204e2bf5efa1a5d402de3c5b57f005a25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 07:37:38 +0000 Subject: [PATCH 03/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../SSD-VGG_ID1619_for_TensorFlow/README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index fc1fb726a..218cf7e02 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -154,7 +154,24 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 cd test; bash train_full_1p.sh --data_path=./data/ ``` + 启动训练。 + 启动单卡训练 (脚本为SSD-VGG_ID1619_for_TensorFlow/test/train_full_1p.sh) + + ``` + bash train_full_1p.sh + ``` + +

训练结果

+ +- 精度结果比对 + +取训练最后1000个steps的loss,计算平均值,进行结果比对。 + +|精度指标项|GPU实测|NPU实测| +|---|---|---| +|d_loss|2.619421507950002|2.7996314894200007| +|g_loss|4.192780654629998|4.389258856830003|

高级参考

- 脚本和示例代码 -- Gitee From a030c621c072a7f8bb53b27b14b39c81a638a54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 08:09:47 +0000 Subject: [PATCH 04/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index 218cf7e02..aa56d32fe 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -166,12 +166,11 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 - 精度结果比对 -取训练最后1000个steps的loss,计算平均值,进行结果比对。 +训练200epoch,去最后一个值,结果如下: |精度指标项|GPU实测|NPU实测| |---|---|---| -|d_loss|2.619421507950002|2.7996314894200007| -|g_loss|4.192780654629998|4.389258856830003| +|loss|6.06|6.06|

高级参考

- 脚本和示例代码 -- Gitee From 989e8b1f44370b35872aab55e2824eb84fa05976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 12:46:28 +0000 Subject: [PATCH 05/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index aa56d32fe..e3e9cb751 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -162,15 +162,6 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 bash train_full_1p.sh ``` -

训练结果

- -- 精度结果比对 - -训练200epoch,去最后一个值,结果如下: - -|精度指标项|GPU实测|NPU实测| -|---|---|---| -|loss|6.06|6.06|

高级参考

- 脚本和示例代码 -- Gitee From 641011e86b467ca85506d308271bf3a2e6561af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 12:55:06 +0000 Subject: [PATCH 06/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../SSD-VGG_ID1619_for_TensorFlow/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index e3e9cb751..ded3ece5e 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -97,6 +97,11 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 train_full_1p.sh --precision_mode=allow_fp32_to_fp16 --data_path=./ ``` +相关代码示例: + ``` + parser.add_argument('--precision_mode', type=str, default='allow_fp32_to_fp16', + help='precision mode, default is allow_fp32_to_fp16') + ```

训练环境准备

@@ -125,7 +130,11 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 - +- 运行以下命令安装依赖。 +``` +pip3 install -r requirements.txt +``` +说明:依赖配置文件requirements.txt文件位于模型的根目录

快速上手

@@ -212,3 +221,6 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 --dump_path dump path,default='/home/HwHiAiUser/' ``` +## 训练过程 + +通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡,8卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。以单卡训练为例,loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file -- Gitee From 4ab75972a469f3b0d3539e60718138f62fbc8b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 13:14:25 +0000 Subject: [PATCH 07/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index ded3ece5e..c53a4ac45 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -138,19 +138,18 @@ pip3 install requirements.txt

快速上手

-- 数据集准备 - 1. 模型训练使用Pascal VOC数据集,数据集请用户自行获取。 - ``` +## 数据集准备 + 1、用户自行准备好数据集,模型训练使用Pascal VOC数据集,数据集请用户自行获取 + ``` bash download-data.sh ``` 2. 数据集训练前需要做预处理操作 ``` ./process_dataset.py - ``` - + ``` 3. 数据集处理后,放入模型目录下,在训练脚本中指定数据集路径,可正常使用。 -- 模型训练 +## 模型训练 1. 单击“立即下载”,并选择合适的下载方式下载源码包。 -- Gitee From 4e8e55603791b69e50de087fd3d3b653dd8be953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 13:26:06 +0000 Subject: [PATCH 08/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index c53a4ac45..4b91d3d11 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -25,6 +25,8 @@

概述

+## 简述 + SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方法。模型主要特点: 1、多尺度特征映射。将卷积特征层添加到截取的基础网络的末端。大尺度的特征图有较多的信息,可以用来检测小物体。而小尺度的特征图用来检测较大的物体。允许在多个尺度上对检测结果进行预测。 2、采用卷积层作为预测器。代替了全连接层,直接采用卷积对不同的特征图进行提取检测结果。 -- Gitee From 4b7b38c622b0e164f34088c838630f28763e7aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 13:27:23 +0000 Subject: [PATCH 09/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index 4b91d3d11..24974fb28 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -1,4 +1,9 @@ - +- [基本信息](#基本信息.md) +- [概述](#概述.md) +- [训练环境准备](#训练环境准备.md) +- [快速上手](#快速上手.md) +- [迁移学习指导](#迁移学习指导.md) +- [高级参考](#高级参考.md)

基本信息

**发布者(Publisher):Huawei** -- Gitee From e0a8c4a15309fc5feec643b2efe0ed926c67ddf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Thu, 28 Jul 2022 13:35:50 +0000 Subject: [PATCH 10/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index 24974fb28..6b120a5f7 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -59,7 +59,7 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 git reset --hard {commit_id} # 代码设置到对应的commit_id cd {code_path} # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 ``` -

默认配置

+## 默认配置 - 训练数据集预处理(以Pascal VOC训练集为例,仅作为用户参考示例): @@ -73,7 +73,7 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 - 随机变换图像通道 - 随机变换饱和度 -- 训练超参 +- 训练超参(单卡): - Batch size: 8 - Momentum: 0.9 @@ -83,8 +83,7 @@ SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方 - Weight decay: 0.0005 - Train epoch: 200 - -

支持特性

+## 支持特性 | 特性列表 | 是否支持 | |-------|------| -- Gitee From b56935555a4f21447094c6cc42b5b35f5041838e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Fri, 29 Jul 2022 03:39:40 +0000 Subject: [PATCH 11/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. --- .../SSD-VGG_ID1619_for_TensorFlow/README.md | 228 ++++++++---------- 1 file changed, 103 insertions(+), 125 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index 6b120a5f7..d6e139047 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -32,110 +32,83 @@ ## 简述 -SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方法。模型主要特点: -1、多尺度特征映射。将卷积特征层添加到截取的基础网络的末端。大尺度的特征图有较多的信息,可以用来检测小物体。而小尺度的特征图用来检测较大的物体。允许在多个尺度上对检测结果进行预测。 -2、采用卷积层作为预测器。代替了全连接层,直接采用卷积对不同的特征图进行提取检测结果。 +SSD-VGG是采用单个深度神经网络模型实现目标检测和识别的方法。模型主要特点:多尺度特征映射。将卷积特征层添加到截取的基础网络的末端。大尺度的特征图有较多的信息,可以用来检测小物体。而小尺度的特征图用来检测较大的物体。允许在多个尺度上对检测结果进行预测。采用卷积层作为预测器。代替了全连接层,直接采用卷积对不同的特征图进行提取检测结果。 - 参考论文: - https://arxiv.org/pdf/1512.02325.pdf + [https://arxiv.org/abs/1512.02325](https://arxiv.org/abs/1512.02325) - 参考实现: - https://github.com/ljanyst/ssd-tensorflow + https://github.com/ljanyst/ssd-tensorflow - 适配昇腾 AI 处理器的实现: - - https://gitee.com/ascend/modelzoo/tree/master/built-in/TensorFlow/Research/cv/detection/SSD-VGG_ID1619_for_TensorFlow - - + + https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow - 通过Git获取对应commit\_id的代码方法如下: + + git clone {repository_url} # 克隆仓库的代码 + cd {repository_name} # 切换到模型的代码仓目录 + git checkout {branch} # 切换到对应分支 + git reset --hard {commit_id} # 代码设置到对应的commit_id + cd {code_path} # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 - ``` - 
git clone {repository_url} # 克隆仓库的代码 - cd {repository_name} # 切换到模型的代码仓目录 - git checkout {branch} # 切换到对应分支 - git reset --hard {commit_id} # 代码设置到对应的commit_id - cd {code_path} # 切换到模型代码所在路径,若仓库下只有该模型,则无需切换 - ``` -## 默认配置 -- 训练数据集预处理(以Pascal VOC训练集为例,仅作为用户参考示例): - - - 图像的输入尺寸为300*300 - - 图像输入格式:pickle - - 随机排序 - - 随机缩放 - - 随机变换亮度 - - 随机变换对比度 - - 随机变换色调 - - 随机变换图像通道 - - 随机变换饱和度 +## 默认配置 +- 网络结构 + - 24-layer, 1024-hidden, 16-heads, 340M parameters - 训练超参(单卡): + - Batch size: 8 + - Momentum: 0.9 + - LR scheduler: cosine + - Learning rate(LR): 0.00075;0.0001;0.0001 + - Optimizer: MomentumOptimizer + - Weight decay: 0.0005 + - Train epoch: 200 - - Batch size: 8 - - Momentum: 0.9 - - LR scheduler: cosine - - Learning rate(LR): 0.00075;0.0001;0.0001 - - Optimizer: MomentumOptimizer - - Weight decay: 0.0005 - - Train epoch: 200 ## 支持特性 -| 特性列表 | 是否支持 | -|-------|------| -| 分布式训练 | 否 | -| 混合精度 | 是 | -| 并行数据 | 是 | +| 特性列表 | 是否支持 | +| ---------- | -------- | +| 分布式训练 | 是 | +| 混合精度 | 是 | +| 数据并行 | 是 | -- 混合精度训练 + +## 混合精度训练 昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 -- 开启混合精度 +## 开启混合精度 -脚本已默认开启混合精度,设置precision_mode参数的命令行参考如下。 +拉起脚本中,传入--precision_mode='allow_mix_precision' - ``` - train_full_1p.sh --precision_mode=allow_fp32_to_fp16 --data_path=./ - ``` +``` + ./train_full_1p.sh --help + +parameter explain: + --precision_mode precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision) + --over_dump if or not over detection, default is False + --data_dump_flag data dump flag, default is False + --data_dump_step data dump step, default is 10 + --profiling if or not profiling for performance debug, default is False + --data_path source data of training + -h/--help show help message +``` 相关代码示例: - ``` - parser.add_argument('--precision_mode', type=str, default='allow_fp32_to_fp16', + +``` +parser.add_argument('--precision_mode', type=str, default='allow_fp32_to_fp16', help='precision 
mode, default is allow_fp32_to_fp16') - ``` +```

训练环境准备

-1. 硬件环境准备请参见各硬件产品文档"[驱动和固件安装升级指南]( https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)"。需要在硬件设备上安装与CANN版本配套的固件与驱动。 -2. 宿主机上需要安装Docker并登录[Ascend Hub中心](https://ascendhub.huawei.com/#/detail?name=ascend-tensorflow-arm)获取镜像。 - - 当前模型支持的镜像列表如[表1](#zh-cn_topic_0000001074498056_table1519011227314)所示。 - - **表 1** 镜像列表 - - - - - - - - - - - - -

镜像名称

-

镜像版本

-

配套CANN版本

-
-

21.0.2

-

5.0.2

-
+- 硬件环境和运行环境准备请参见《[CANN软件安装指南](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update)》 - 运行以下命令安装依赖。 ``` pip3 install requirements.txt @@ -145,24 +118,32 @@ pip3 install requirements.txt

快速上手

## 数据集准备 - 1、用户自行准备好数据集,模型训练使用Pascal VOC数据集,数据集请用户自行获取 +1、用户自行准备好数据集,模型训练使用Pascal VOC数据集,数据集请用户自行获取 ``` bash download-data.sh ``` - 2. 数据集训练前需要做预处理操作 +2. 数据集训练前需要做预处理操作 ``` ./process_dataset.py ``` - 3. 数据集处理后,放入模型目录下,在训练脚本中指定数据集路径,可正常使用。 - +3. 数据集处理后,放入模型目录下,在训练脚本中指定数据集路径,可正常使用。 + + + ## 模型训练 - 1. 单击“立即下载”,并选择合适的下载方式下载源码包。 +- 单击“立即下载”,并选择合适的下载方式下载源码包。 +- 开始训练。 + + - 启动训练之前,首先要配置程序运行相关环境变量。 + + 环境变量配置信息参见: - 2. 启动训练之前,首先要配置程序运行相关环境变量。环境变量配置信息参见: - [Ascend 910训练平台环境变量设置](https://gitee.com/ascend/modelzoo/wikis/Ascend%20910%E8%AE%AD%E7%BB%83%E5%B9%B3%E5%8F%B0%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F%E8%AE%BE%E7%BD%AE?sort_id=3148819) - 3. 单卡训练 - 以数据集为./data/pascal-voc/train-samples.pkl、./data/pascal-voc/valid-samples.pkl、./data/pascal-voc/training-data.pkl为例,backbone模型为./data/vgg_graph/saved_model.pb、./data/vgg_graph/variables、./data/vgg_graph/vgg为例(vgg参考ssdvgg.py的__download_vgg下载) + [Ascend 910训练平台环境变量设置](https://gitee.com/ascend/modelzoo/wikis/Ascend%20910%E8%AE%AD%E7%BB%83%E5%B9%B3%E5%8F%B0%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F%E8%AE%BE%E7%BD%AE?sort_id=3148819) + + - 单卡训练 + + 以数据集为./data/pascal-voc/train-samples.pkl、./data/pascal-voc/valid-samples.pkl、./data/pascal-voc/training-data.pkl为例,backbone模型为./data/vgg_graph/saved_model.pb、./data/vgg_graph/variables、./data/vgg_graph/vgg为例(vgg参考ssdvgg.py的__download_vgg下载) ``` cd test; @@ -176,56 +157,53 @@ pip3 install requirements.txt bash train_full_1p.sh ``` + + +

高级参考

-- 脚本和示例代码 +## 脚本和示例代码 ``` - . - ├── test - │   ├── train_full_1p.sh // 执行全量训练脚本 - │   └── train_performance_1p.sh - ├── LICENSE - ├── README.md - ├── average_precision.py - ├── data_queue.py - ├── detect.py - ├── download-data.sh - ├── export_model.py - ├── infer.py - ├── modelzoo_level.txt - ├── pascal_summary.py - ├── process_dataset.py - ├── run_1p.sh - ├── source_pascal_voc.py - ├── ssdutils.py - ├── ssdvgg.py - ├── train.py - ├── training_data.py - ├── transforms.py - └── utils.py +|--LICENSE +|--README.md #说明文档 +|--input_pipeline.py +|--model_saving_utils.py +|--run_pretraining.py +|--run_squad.py #训练代码 +|--requirements.txt #所需依赖 +|--squad_lib.py +|--test #训练脚本目录 +| |--train_performance_bertlarge_8p_512bs_lamb_phase1.sh +| |--train_performance_bertlarge_8p_96bs_lamb_phase2.sh +| |--train_performance_bertlarge_8p_128bs_adam.sh +| |--train_performance_squad1.1_large_bs24_1p.sh +| |--train_performance_squad2.0_large_bs24_1p.sh ``` -- 脚本参数 +## 脚本参数 ``` ---data_path data path,default is the path of train.py ---name project name,default='ckpt' ---epochs train epochs,default=200 ---batch-size batch size,default=8 ---checkpoint-interval checkpoint interval,default=200 ---lr-values learning rate values,default='0.00075;0.0001;0.00001' ---lr-boundaries learning rate change boundaries (in batches),default='320000;400000' ---momentum momentum for the optimizer,default=0.9 ---weight-decay L2 normalization factor,default=0.0005 ---continue-training continue training from the latest checkpoint,default='False' ---num-workers number of parallel generators,default=mp.cpu_count() ---precision_mode precision mode, default is allow_fp32_to_fp16 ---over_dump over flow dump, True or False, default is False ---data_dump data dump, True or False, default is False ---dump_path dump path,default='/home/HwHiAiUser/' - +--data_path # the path to train data +--epochs # epochs of training +--ckpt_save_path # directory to ckpt +--batch_size # batch size for 1p +--log_steps # log frequency 
+--bert_config_file +--precision_mode # the path to save over dump data +--over_dump # if or not over detection, default is False +--data_dump_flag # data dump flag, default is False +--data_dump_step # data dump step, default is 10 +--profiling # if or not profiling for performance debug, default is False +--profiling_dump_path # the path to save profiling data +--over_dump_path # the path to save over dump data +--data_dump_path # the path to save dump data +--use_mixlist # use_mixlist flag, default is False +--fusion_off_flag # fusion_off flag, default is False +--mixlist_file # mixlist file name, default is ops_info.json +--fusion_off_file # fusion_off file name, default is fusion_switch.cfg ``` + ## 训练过程 -通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡,8卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。以单卡训练为例,loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file +通过“模型训练”中的训练指令启动单卡或者多卡训练。单卡和多卡通过运行不同脚本,支持单卡,8卡网络训练。模型存储路径为${cur_path}/output/$ASCEND_DEVICE_ID,包括训练的log以及checkpoints文件。以8卡训练为例,loss信息在文件${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log中。 \ No newline at end of file -- Gitee From 943d7ae809e5c8e92f202c1348cf903fa5007d99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=B4=8B=E6=B4=8B?= <584244991@qq.com> Date: Fri, 29 Jul 2022 06:11:53 +0000 Subject: [PATCH 12/12] update TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md. 
--- .../SSD-VGG_ID1619_for_TensorFlow/README.md | 72 ++++++++++--------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md index d6e139047..aefae98b5 100644 --- a/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md +++ b/TensorFlow/built-in/cv/detection/SSD-VGG_ID1619_for_TensorFlow/README.md @@ -165,43 +165,49 @@ pip3 install requirements.txt ## 脚本和示例代码 ``` -|--LICENSE -|--README.md #说明文档 -|--input_pipeline.py -|--model_saving_utils.py -|--run_pretraining.py -|--run_squad.py #训练代码 -|--requirements.txt #所需依赖 -|--squad_lib.py -|--test #训练脚本目录 -| |--train_performance_bertlarge_8p_512bs_lamb_phase1.sh -| |--train_performance_bertlarge_8p_96bs_lamb_phase2.sh -| |--train_performance_bertlarge_8p_128bs_adam.sh -| |--train_performance_squad1.1_large_bs24_1p.sh -| |--train_performance_squad2.0_large_bs24_1p.sh -``` + ├── test + │   ├── train_full_1p.sh // 执行全量训练脚本 + │   └── train_performance_1p.sh + ├── LICENSE + ├── README.md + ├── average_precision.py + ├── data_queue.py + ├── detect.py + ├── download-data.sh + ├── export_model.py + ├── infer.py + ├── modelzoo_level.txt + ├── pascal_summary.py + ├── process_dataset.py + ├── run_1p.sh + ├── source_pascal_voc.py + ├── ssdutils.py + ├── ssdvgg.py + ├── train.py + ├── training_data.py + ├── transforms.py + └── utils.py +``` ## 脚本参数 ``` ---data_path # the path to train data ---epochs # epochs of training ---ckpt_save_path # directory to ckpt ---batch_size # batch size for 1p ---log_steps # log frequency ---bert_config_file ---precision_mode # the path to save over dump data ---over_dump # if or not over detection, default is False ---data_dump_flag # data dump flag, default is False ---data_dump_step # data dump step, default is 10 ---profiling # if or not profiling for performance debug, default is False ---profiling_dump_path # the path to save 
profiling data ---over_dump_path # the path to save over dump data ---data_dump_path # the path to save dump data ---use_mixlist # use_mixlist flag, default is False ---fusion_off_flag # fusion_off flag, default is False ---mixlist_file # mixlist file name, default is ops_info.json ---fusion_off_file # fusion_off file name, default is fusion_switch.cfg +--data_path data path,default is the path of train.py +--name project name,default='ckpt' +--epochs train epochs,default=200 +--batch-size batch size,default=8 +--checkpoint-interval checkpoint interval,default=200 +--lr-values learning rate values,default='0.00075;0.0001;0.00001' +--lr-boundaries learning rate change boundaries (in batches),default='320000;400000' +--momentum momentum for the optimizer,default=0.9 +--weight-decay L2 normalization factor,default=0.0005 +--continue-training continue training from the latest checkpoint,default='False' +--num-workers number of parallel generators,default=mp.cpu_count() +--precision_mode precision mode, default is allow_fp32_to_fp16 +--over_dump over flow dump, True or False, default is False +--data_dump data dump, True or False, default is False +--dump_path dump path,default='/home/HwHiAiUser/' + ``` ## 训练过程 -- Gitee