From 6015b428d3ae62a37d74fc0c8802b8c5528fefdd Mon Sep 17 00:00:00 2001
From: l30062829
Date: Tue, 11 Feb 2025 10:34:03 +0800
Subject: [PATCH] Change graphcast multi-card parallel launch to msrun; fix fuxi bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../medium-range/fuxi/src/fuxi.py              |  2 -
 .../medium-range/fuxi/src/fuxi_net.py          |  2 +-
 .../medium-range/graphcast/README.md           | 11 ++---
 .../medium-range/graphcast/README_CN.md        | 12 ++++--
 .../scripts/run_distributed_train.sh           | 43 +++++++++----------
 .../medium-range/graphcast/src/utils.py        |  8 +---
 6 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/MindEarth/applications/medium-range/fuxi/src/fuxi.py b/MindEarth/applications/medium-range/fuxi/src/fuxi.py
index 083ff4a13..4fae170f4 100644
--- a/MindEarth/applications/medium-range/fuxi/src/fuxi.py
+++ b/MindEarth/applications/medium-range/fuxi/src/fuxi.py
@@ -130,9 +130,7 @@ class CubeEmbed(nn.Cell):
         x_surface = x_surface.transpose(0, 3, 1, 2)
         pad_zeros = ops.zeros((self.batch_size, self.level_feature_size, 1, self.h_size, self.w_size), dtype=x.dtype)
         x = ops.concat((pad_zeros, x), axis=2)
-        x = ops.cast(x, self.conv3d_dtype)
         x = self.cube3d(x)
-        x = ops.cast(x, x_surface.dtype)
         x_surface = self.conv2d(x_surface)
         x_surface = x_surface.reshape(self.batch_size, self.in_channels, 1, self.h_size // 4, self.w_size // 4)
         x = ops.concat((x, x_surface), axis=2)
diff --git a/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py b/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
index aca976037..31ac2e2d5 100644
--- a/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
+++ b/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
@@ -115,7 +115,7 @@ class FuXiNet(nn.Cell):
         """
         out = self.cube_embed(inputs)
         out_down_sample = self.down_sample(out)
-        batch_size, z_size, h_size, w_size = out_down_sample.shape
+        batch_size, z_size, h_size, w_size, _ = out_down_sample.shape
         out_skip = out_down_sample.reshape(batch_size, -1, self.out_channels)
         out_swin_block = out_skip
         for swin_block in self.swin_block:
diff --git a/MindEarth/applications/medium-range/graphcast/README.md b/MindEarth/applications/medium-range/graphcast/README.md
index ad0223b4a..f2d6d02ab 100644
--- a/MindEarth/applications/medium-range/graphcast/README.md
+++ b/MindEarth/applications/medium-range/graphcast/README.md
@@ -48,6 +48,8 @@ Where $tp$ represents the original value of `Total Precipitation`, in our experi
 
 ### Base Backbone
 
+MindSpore 2.4 is required for validation and testing of this model.
+
 Before running, it is necessary to prepare the training data, which can be downloaded in [1.40625°](https://download.mindspore.cn/mindscience/mindearth/dataset/WeatherBench_1.4_69/) or [0.25°](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/ERA5_0_25_tiny400/) as needed, and saved in the `./dataset`.
 
 #### Quick Start
@@ -74,22 +76,21 @@ You can use [Chinese](https://gitee.com/mindspore/mindscience/raw/master/MindEar
 
 #### Multi-Card Parallel
 
+Multi-card parallel training of GraphCast is launched with [msrun](https://www.mindspore.cn/docs/zh-CN/r2.4.10/model_train/parallel/msrun_launcher.html).
 Running in Multi-Card parallel mode requires setting the `distribute` in the configuration file specified by `config_file_path` to `True`.
 
 ```shell
-bash ./scripts/run_distributed_train.sh $rank_table_file $device_num $device_start_id $config_file_path
+bash ./scripts/run_distributed_train.sh $rank_table_file $config_file_path $device_num
 ```
 
 where:
 
 `--rank_table_file` [path to the networking information file](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.2/parallel/rank_table.html#%E6%A6%82%E8%BF%B0).
 
-`--device_num` the numbers of networking device.
-
-`--device_start_id` the start ID of networking device.
-
 `--config_file_path` the path of config file.
 
+`--device_num` the number of networking devices.
+
 ### Medium-range Precipitation
 
 Before running, it is necessary to prepare the training data, which can be downloaded in [ERA5_0_5_tiny400](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/medium_precipitation/tiny_datasets/), and saved in the `./dataset`. In addition, it is necessary to set the path of GraphCast pretrain ckpt in the file `GraphCastTp.yaml`, which is `backbone_ckpt_path`. Users can download the provided 0.5° data for 4-year pretrain [ckpt](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/medium_precipitation/tiny_datasets/ckpt/). You can set it up or train yourself to obtain CKPT for configuration.
diff --git a/MindEarth/applications/medium-range/graphcast/README_CN.md b/MindEarth/applications/medium-range/graphcast/README_CN.md
index 5e416b95b..e592c5b97 100644
--- a/MindEarth/applications/medium-range/graphcast/README_CN.md
+++ b/MindEarth/applications/medium-range/graphcast/README_CN.md
@@ -48,6 +48,8 @@ $$
 
 ### 基础Backbone
 
+本模型需要在MindSpore 2.4环境下运行。
+
 运行前需先准备训练使用的数据,可根据需要下载[1.40625°](https://download.mindspore.cn/mindscience/mindearth/dataset/WeatherBench_1.4_69/)或[0.25°](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/ERA5_0_25_tiny400/)数据并保存在`./dataset`。
 
 #### 快速开始
@@ -74,21 +76,23 @@ bash ./scripts/run_standalone_train.sh $device_id $device_target $config_file_pa
 
 #### 多卡并行
 
+采用[msrun启动方式](https://www.mindspore.cn/docs/zh-CN/r2.4.10/model_train/parallel/msrun_launcher.html)
+
 多卡并行需要将`config_file_path`指定的配置文件中`distribute`字段设置为`True`。
 
 ```shell
-bash ./scripts/run_distributed_train.sh $rank_table_file $device_num $device_start_id $config_file_path
+bash ./scripts/run_distributed_train.sh $rank_table_file $config_file_path $device_num
 ```
 
 其中:
 
 `--rank_table_file` [组网信息文件](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.2/parallel/rank_table.html#%E6%A6%82%E8%BF%B0)。
 
-`--device_num` 表示组网设备的数量。
+`--config_file_path` 配置文件的路径。
 
-`--device_start_id` 表示组网设备起始ID。
+`--device_num` 表示组网设备的数量。
 
-`--config_file_path` 配置文件的路径。
+之后在`msrun`文件夹中查看运行信息。
 
 ### 中期降水
 
diff --git a/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh b/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
index b2e8dfe87..bd50b885a 100644
--- a/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
+++ b/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-if [ $# != 4 ]
+if [ $# != 3 ]
 then
-    echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DEVICE_START_ID] [CONFIG_FILE]"
+    echo "Usage: bash run_distributed_train.sh [RANK_TABLE_FILE] [CONFIG_FILE] [DEVICE_NUM]"
 exit 1
 fi
 
@@ -25,26 +25,25 @@ then
 exit 1
 fi
 
-RANK_TABLE_FILE=$(realpath $1)
-DEVICE_NUM=$2
+RANK_TABLE_FILE=$(realpath $1)
+export MINDSPORE_HCCL_CONFIG_PATH=${RANK_TABLE_FILE}
 echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
-DEVICE_START_ID=$3
-CONFIG_FILE=$(realpath $4)
+CONFIG_FILE=$(realpath $2)
 echo "CONFIG_FILE=${CONFIG_FILE}"
+DEVICE_NUM=$3
+echo "DEVICE_NUM=${DEVICE_NUM}"
 
-for((i=$DEVICE_START_ID;i<$[$DEVICE_NUM+$DEVICE_START_ID];i++))
-do
-    rm -rf device$i
-    mkdir device$i
-    cp ./main.py ./device$i
-    cp ${CONFIG_FILE} ./device$i
-    cp -r ./src ./device$i
-    cd ./device$i
-    export DEVICE_ID=$i
-    export RANK_ID=$[$i-$DEVICE_START_ID]
-    export GLOG_v=3
-    echo "start training for device $i"
-    env > env$i.log
-    nohup python -u main.py --device_id $i --config_file_path ${CONFIG_FILE} >train${i}.log 2>&1 &
-    cd ../
-done
\ No newline at end of file
+i=1
+rm -rf msrun
+mkdir msrun
+cp ./main.py ./msrun
+cp ${CONFIG_FILE} ./msrun
+cp -r ./src ./msrun
+cd ./msrun
+export DEVICE_ID=$i
+export RANK_ID=$[$i-$i]
+export GLOG_v=3
+echo "start training for msrun"
+env > env.log
+nohup msrun --worker_num=${DEVICE_NUM} --local_worker_num=${DEVICE_NUM} --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 main.py --device_id $i --config_file_path ${CONFIG_FILE} >train${i}.log 2>&1 &
+cd ../
diff --git a/MindEarth/applications/medium-range/graphcast/src/utils.py b/MindEarth/applications/medium-range/graphcast/src/utils.py
index a3fa67010..1f80de9b6 100644
--- a/MindEarth/applications/medium-range/graphcast/src/utils.py
+++ b/MindEarth/applications/medium-range/graphcast/src/utils.py
@@ -17,8 +17,8 @@ import os
 
 import matplotlib.pyplot as plt
 import numpy as np
+import mindspore as ms
 import mindspore.common.dtype as mstype
-import mindspore.communication.management as D
 from mindspore.communication import init
 from mindspore import context, Tensor, ops, nn
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -63,12 +63,8 @@ def amp_convert(network, black_list=None):
 def init_data_parallel(use_ascend):
     """Init data parallel for model running"""
     if use_ascend:
+        ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
         init()
-        device_num = D.get_group_size()
-        os.environ['HCCL_CONNECT_TIMEOUT'] = "7200"
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=context.ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                          device_num=device_num, parameter_broadcast=False)
     else:
         init("nccl")
         context.reset_auto_parallel_context()
--
Gitee
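
Reviewer note: below is a minimal, illustrative sketch of how the reworked `init_data_parallel` in `src/utils.py` is driven once the training entry point is started through msrun. It covers only the Ascend path touched by this patch; the file name `check_parallel_init.py`, the `__main__` block, and the rank printout are assumptions added for demonstration and are not part of the patch.

```python
# check_parallel_init.py -- hypothetical standalone check, not part of this patch.
# Under msrun, rank and group-size information comes from the environment prepared
# by the launcher, so the helper only has to set the data-parallel context and
# call init(); the old rank_table / DEVICE_START_ID bookkeeping is no longer needed.
import mindspore as ms
from mindspore.communication import init, get_rank, get_group_size


def init_data_parallel():
    """Ascend data-parallel init, mirroring the patched utils.py."""
    ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                 gradients_mean=True)
    init()  # HCCL is initialized from the msrun-provided environment


if __name__ == "__main__":
    ms.set_context(device_target="Ascend")
    init_data_parallel()
    # Each worker spawned by msrun reports its own rank within the group.
    print(f"worker {get_rank()} of {get_group_size()} initialized")
```

Launched, for example, with `msrun --worker_num=${DEVICE_NUM} --local_worker_num=${DEVICE_NUM} --master_port=8118 check_parallel_init.py`, i.e. the same flags that the updated `run_distributed_train.sh` passes when it starts `main.py`.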