From e16589292bd192450ed9c5920c0773daa25ad8c8 Mon Sep 17 00:00:00 2001
From: Ziyan
Date: Mon, 20 Apr 2020 15:16:46 +0800
Subject: [PATCH] add parameter description in distributed training

---
 .../source_en/advanced_use/distributed_training.md    | 11 ++++++-----
 .../source_zh_cn/advanced_use/distributed_training.md | 11 ++++++-----
 .../resnet50_distributed_training.py                  |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/tutorials/source_en/advanced_use/distributed_training.md b/tutorials/source_en/advanced_use/distributed_training.md
index 22079e575c..2882b4703b 100644
--- a/tutorials/source_en/advanced_use/distributed_training.md
+++ b/tutorials/source_en/advanced_use/distributed_training.md
@@ -29,7 +29,8 @@ Among them:
 
 In this tutorial, we will learn how to train the ResNet-50 network in `DATA_PARALLEL` or `AUTO_PARALLEL` mode on MindSpore.
 
-> The current sample is for the Ascend 910 AI processor. You can find the complete executable sample code at:.
+> The current sample is for the Ascend 910 AI processor. CPU and GPU processors are not supported for now.
+> You can find the complete executable sample code at:.
 
 ## Preparations
 
@@ -221,11 +222,11 @@ opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, mome
 
 ## Training the Network
 
-`context.set_auto_parallel_context()` is an API provided for users to set parallel parameters. The parameters are as follows:
+`context.set_auto_parallel_context()` is an API provided for users to set parallel parameters and must be invoked before `Model` is initialized. If users do not set the parameters, MindSpore automatically sets them to empirical values according to the parallel mode. For example, `parameter_broadcast` defaults to `True` in data parallel mode. The parameters are as follows:
 
-- `parallel_mode`: distributed parallel mode. The options are `ParallelMode.DATA_PARALLEL` and `ParallelMode.AUTO_PARALLEL`.
-- `mirror_mean`: During backward computation, the framework collects gradients of parameters in data parallel mode across multiple machines, obtains the global gradient value, and transfers the global gradient value to the optimizer for update.
-The value True indicates the `allreduce_mean` operation that would be applied, and the value False indicates the `allreduce_sum` operation that would be applied.
+- `parallel_mode`: distributed parallel mode. The default value is `ParallelMode.STAND_ALONE`. The options are `ParallelMode.DATA_PARALLEL` and `ParallelMode.AUTO_PARALLEL`.
+- `parameter_broadcast`: specifies whether to broadcast initialized parameters. The default value is `False` in non-data parallel mode.
+- `mirror_mean`: During backward computation, the framework collects gradients of parameters in data parallel mode across multiple machines, obtains the global gradient value, and transfers the global gradient value to the optimizer for update. The default value is `False`, which indicates that the `allreduce_sum` operation is applied; the value `True` indicates that the `allreduce_mean` operation is applied.
 
 In the following example, the parallel mode is set to `AUTO_PARALLEL`. `dataset_sink_mode=False` indicates that the non-sink mode is used. `LossMonitor` can return the loss value through the callback function.
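As context for reviewers, the snippet below is a minimal sketch of how the three parameters documented in the hunk above are typically passed to `context.set_auto_parallel_context()`. It is illustration only, not part of the patch; the `ParallelMode` import path and the chosen values are assumptions based on the tutorial's MindSpore version.

```python
# Minimal sketch (not part of this patch): configure data parallel training
# with the parameters described in the updated documentation above.
# This call must happen before the `Model` object is created.
from mindspore import context
# NOTE: import path assumed to match the tutorial's MindSpore version.
from mindspore.train.model import ParallelMode

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

context.set_auto_parallel_context(
    parallel_mode=ParallelMode.DATA_PARALLEL,  # default would be STAND_ALONE
    parameter_broadcast=True,                  # broadcast initialized parameters
    mirror_mean=True)                          # allreduce_mean instead of allreduce_sum
```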
diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training.md b/tutorials/source_zh_cn/advanced_use/distributed_training.md
index ed6bd7dc93..288829bf83 100644
--- a/tutorials/source_zh_cn/advanced_use/distributed_training.md
+++ b/tutorials/source_zh_cn/advanced_use/distributed_training.md
@@ -28,7 +28,8 @@ MindSpore supports data parallel and auto parallel. Auto parallel combines
 - Cost model: models the training time based on both the memory-related computation cost and the communication cost, and designs an efficient algorithm to find a parallel strategy with a shorter training time.
 
 In this tutorial, we mainly learn how to train the ResNet-50 network on MindSpore in data parallel or auto parallel mode.
-> This example targets the Ascend 910 AI processor hardware platform. You can download the complete sample code here:
+> This example targets the Ascend 910 AI processor hardware platform. CPU and GPU scenarios are not supported for now.
+> You can download the complete sample code here:
 
 ## Preparations
 
@@ -220,11 +221,11 @@ opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, mome
 
 ## Training the Network
 
-`context.set_auto_parallel_context()` is the interface provided for users to set parallel parameters. The main parameters include:
+`context.set_auto_parallel_context()` is the interface for configuring parallel training parameters and must be called before `Model` is initialized. If users do not specify the parameters, the framework automatically sets them to empirical values according to the parallel mode. For example, `parameter_broadcast` is enabled by default in data parallel mode. The main parameters include:
 
-- `parallel_mode`: distributed parallel mode. The options are data parallel `ParallelMode.DATA_PARALLEL` and auto parallel `ParallelMode.AUTO_PARALLEL`.
-- `mirror_mean`: During backward computation, the framework collects the gradients of the data parallel parameters scattered across multiple machines, obtains the global gradient value, and then passes it to the optimizer for update.
-Setting it to True corresponds to the `allreduce_mean` operation, and False corresponds to the `allreduce_sum` operation.
+- `parallel_mode`: distributed parallel mode. The default is the standalone mode `ParallelMode.STAND_ALONE`. The options are data parallel `ParallelMode.DATA_PARALLEL` and auto parallel `ParallelMode.AUTO_PARALLEL`.
+- `parameter_broadcast`: specifies whether to broadcast initialized parameters. The default value is `False` in non-data parallel mode.
+- `mirror_mean`: During backward computation, the framework collects the gradients of the data parallel parameters scattered across multiple machines, obtains the global gradient value, and then passes it to the optimizer for update. The default value is `False`, which corresponds to the `allreduce_sum` operation; setting it to `True` corresponds to the `allreduce_mean` operation.
 
 In the following example, we set the parallel mode to auto parallel, where `dataset_sink_mode=False` indicates that the non-sink data mode is used, and `LossMonitor` can return the loss value through the callback function.
diff --git a/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py b/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
index 54bd1bc3aa..36cdcb0117 100644
--- a/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
+++ b/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
@@ -53,7 +53,7 @@ def create_dataset(repeat_num=1, batch_size=32, rank_id=0, rank_size=1):
     # get rank_id and rank_size
     rank_id = get_rank()
     rank_size = get_group_size()
-    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shared_id=rank_id)
+    data_set = ds.Cifar10Dataset(data_path, num_shards=rank_size, shard_id=rank_id)
 
     # define map operations
     random_crop_op = vision.RandomCrop((32, 32), (4, 4, 4, 4))
--
Gitee
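As a usage note on the keyword corrected in the last hunk (`shared_id` to `shard_id`), the sketch below shows how each device loads its own shard of CIFAR-10. It is illustration only, not part of the patch; the data path is a placeholder, and the `init()`/`get_rank()`/`get_group_size()` calls follow the tutorial script touched above.

```python
# Minimal sketch (not part of this patch): per-device sharded loading of CIFAR-10.
import mindspore.dataset as ds
from mindspore.communication.management import init, get_rank, get_group_size

init()                        # set up the collective communication backend
rank_id = get_rank()          # index of this device within the group
rank_size = get_group_size()  # total number of devices in the group

# num_shards splits the dataset into rank_size pieces; shard_id (the corrected
# keyword) picks the piece this device reads, so devices see disjoint samples.
data_set = ds.Cifar10Dataset("./cifar-10-batches-bin",  # placeholder path
                             num_shards=rank_size, shard_id=rank_id)
```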