From 6015b428d3ae62a37d74fc0c8802b8c5528fefdd Mon Sep 17 00:00:00 2001
From: l30062829
Date: Tue, 11 Feb 2025 10:34:03 +0800
Subject: [PATCH] Change graphcast multi-card parallel launch to msrun; fix fuxi bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../medium-range/fuxi/src/fuxi.py              |  2 -
 .../medium-range/fuxi/src/fuxi_net.py          |  2 +-
 .../medium-range/graphcast/README.md           | 11 ++---
 .../medium-range/graphcast/README_CN.md        | 12 ++++--
 .../scripts/run_distributed_train.sh           | 43 +++++++++----------
 .../medium-range/graphcast/src/utils.py        |  8 +---
 6 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/MindEarth/applications/medium-range/fuxi/src/fuxi.py b/MindEarth/applications/medium-range/fuxi/src/fuxi.py
index 083ff4a13..4fae170f4 100644
--- a/MindEarth/applications/medium-range/fuxi/src/fuxi.py
+++ b/MindEarth/applications/medium-range/fuxi/src/fuxi.py
@@ -130,9 +130,7 @@ class CubeEmbed(nn.Cell):
         x_surface = x_surface.transpose(0, 3, 1, 2)
         pad_zeros = ops.zeros((self.batch_size, self.level_feature_size, 1, self.h_size, self.w_size), dtype=x.dtype)
         x = ops.concat((pad_zeros, x), axis=2)
-        x = ops.cast(x, self.conv3d_dtype)
         x = self.cube3d(x)
-        x = ops.cast(x, x_surface.dtype)
         x_surface = self.conv2d(x_surface)
         x_surface = x_surface.reshape(self.batch_size, self.in_channels, 1, self.h_size // 4, self.w_size // 4)
         x = ops.concat((x, x_surface), axis=2)
diff --git a/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py b/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
index aca976037..31ac2e2d5 100644
--- a/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
+++ b/MindEarth/applications/medium-range/fuxi/src/fuxi_net.py
@@ -115,7 +115,7 @@ class FuXiNet(nn.Cell):
         """
         out = self.cube_embed(inputs)
         out_down_sample = self.down_sample(out)
-        batch_size, z_size, h_size, w_size = out_down_sample.shape
+        batch_size, z_size, h_size, w_size, _ = out_down_sample.shape
         out_skip = out_down_sample.reshape(batch_size, -1, self.out_channels)
         out_swin_block = out_skip
         for swin_block in self.swin_block:
diff --git a/MindEarth/applications/medium-range/graphcast/README.md b/MindEarth/applications/medium-range/graphcast/README.md
index ad0223b4a..f2d6d02ab 100644
--- a/MindEarth/applications/medium-range/graphcast/README.md
+++ b/MindEarth/applications/medium-range/graphcast/README.md
@@ -48,6 +48,8 @@ Where $tp$ represents the original value of `Total Precipitation`, in our experi
 
 ### Base Backbone
 
+MindSpore 2.4 is required for validation and testing of this model.
+
 Before running, it is necessary to prepare the training data, which can be downloaded in [1.40625°](https://download.mindspore.cn/mindscience/mindearth/dataset/WeatherBench_1.4_69/) or [0.25°](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/ERA5_0_25_tiny400/) as needed, and saved in the `./dataset`.
 
 #### Quick Start
@@ -74,22 +76,21 @@ You can use [Chinese](https://gitee.com/mindspore/mindscience/raw/master/MindEar
 
 #### Multi-Card Parallel
 
+Multi-card parallel training of GraphCast is launched with [msrun](https://www.mindspore.cn/docs/zh-CN/r2.4.10/model_train/parallel/msrun_launcher.html).
 Running in Multi-Card parallel mode requires setting the `distribute` in the configuration file specified by `config_file_path` to `True`.
 
 ```shell
-bash ./scripts/run_distributed_train.sh $rank_table_file $device_num $device_start_id $config_file_path
+bash ./scripts/run_distributed_train.sh $rank_table_file $config_file_path $device_num
 ```
 
 where:
 
 `--rank_table_file` [path to the networking information file](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.2/parallel/rank_table.html#%E6%A6%82%E8%BF%B0).
 
-`--device_num` the numbers of networking device.
-
-`--device_start_id` the start ID of networking device.
-
 `--config_file_path` the path of config file.
 
+`--device_num` the number of networking devices.
+
 ### Medium-range Precipitation
 
 Before running, it is necessary to prepare the training data, which can be downloaded in [ERA5_0_5_tiny400](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/medium_precipitation/tiny_datasets/), and saved in the `./dataset`. In addition, it is necessary to set the path of GraphCast pretrain ckpt in the file `GraphCastTp.yaml`, which is `backbone_ckpt_path`. Users can download the provided 0.5° data for 4-year pretrain [ckpt](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/medium_precipitation/tiny_datasets/ckpt/). You can set it up or train yourself to obtain CKPT for configuration.
diff --git a/MindEarth/applications/medium-range/graphcast/README_CN.md b/MindEarth/applications/medium-range/graphcast/README_CN.md
index 5e416b95b..e592c5b97 100644
--- a/MindEarth/applications/medium-range/graphcast/README_CN.md
+++ b/MindEarth/applications/medium-range/graphcast/README_CN.md
@@ -48,6 +48,8 @@ $$
 
 ### 基础Backbone
 
+本模型需要在MindSpore 2.4环境下运行。
+
 运行前需先准备训练使用的数据,可根据需要下载[1.40625°](https://download.mindspore.cn/mindscience/mindearth/dataset/WeatherBench_1.4_69/)或[0.25°](https://download-mindspore.osinfra.cn/mindscience/mindearth/dataset/ERA5_0_25_tiny400/)数据并保存在`./dataset`。
 
 #### 快速开始
@@ -74,21 +76,23 @@ bash ./scripts/run_standalone_train.sh $device_id $device_target $config_file_pa
 
 #### 多卡并行
 
+采用[msrun启动方式](https://www.mindspore.cn/docs/zh-CN/r2.4.10/model_train/parallel/msrun_launcher.html)
+
 多卡并行需要将`config_file_path`指定的配置文件中`distribute`字段设置为`True`。
 
 ```shell
-bash ./scripts/run_distributed_train.sh $rank_table_file $device_num $device_start_id $config_file_path
+bash ./scripts/run_distributed_train.sh $rank_table_file $config_file_path $device_num
 ```
 
 其中:
 
 `--rank_table_file` [组网信息文件](https://www.mindspore.cn/tutorials/experts/zh-CN/r2.2/parallel/rank_table.html#%E6%A6%82%E8%BF%B0)。
 
-`--device_num` 表示组网设备的数量。
+`--config_file_path` 配置文件的路径。
 
-`--device_start_id` 表示组网设备起始ID。
+`--device_num` 表示组网设备的数量。
 
-`--config_file_path` 配置文件的路径。
+之后在`msrun`文件夹中查看运行信息。
 
 ### 中期降水
 
diff --git a/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh b/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
index b2e8dfe87..bd50b885a 100644
--- a/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
+++ b/MindEarth/applications/medium-range/graphcast/scripts/run_distributed_train.sh
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-if [ $# != 4 ]
+if [ $# != 3 ]
 then
-    echo "Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DEVICE_NUM] [DEVICE_START_ID] [CONFIG_FILE]"
+    echo "Usage: bash run_distributed_train.sh [RANK_TABLE_FILE] [CONFIG_FILE] [DEVICE_NUM]"
 exit 1
 fi
 
@@ -25,26 +25,25 @@ then
 exit 1
 fi
 
-RANK_TABLE_FILE=$(realpath $1)
-DEVICE_NUM=$2
+RANK_TABLE_FILE=$(realpath $1)
+export MINDSPORE_HCCL_CONFIG_PATH=${RANK_TABLE_FILE}
 echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
-DEVICE_START_ID=$3
-CONFIG_FILE=$(realpath $4)
+CONFIG_FILE=$(realpath $2)
 echo "CONFIG_FILE=${CONFIG_FILE}"
+DEVICE_NUM=$3
+echo "DEVICE_NUM=${DEVICE_NUM}"
 
-for((i=$DEVICE_START_ID;i<$[$DEVICE_NUM+$DEVICE_START_ID];i++))
-do
-    rm -rf device$i
-    mkdir device$i
-    cp ./main.py ./device$i
-    cp ${CONFIG_FILE} ./device$i
-    cp -r ./src ./device$i
-    cd ./device$i
-    export DEVICE_ID=$i
-    export RANK_ID=$[$i-$DEVICE_START_ID]
-    export GLOG_v=3
-    echo "start training for device $i"
-    env > env$i.log
-    nohup python -u main.py --device_id $i --config_file_path ${CONFIG_FILE} >train${i}.log 2>&1 &
-    cd ../
-done
\ No newline at end of file
+i=1
+rm -rf msrun
+mkdir msrun
+cp ./main.py ./msrun
+cp ${CONFIG_FILE} ./msrun
+cp -r ./src ./msrun
+cd ./msrun
+export DEVICE_ID=$i
+export RANK_ID=$[$i-$i]
+export GLOG_v=3
+echo "start training for msrun"
+env > env.log
+nohup msrun --worker_num=${DEVICE_NUM} --local_worker_num=${DEVICE_NUM} --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 main.py --device_id $i --config_file_path ${CONFIG_FILE} >train${i}.log 2>&1 &
+cd ../
diff --git a/MindEarth/applications/medium-range/graphcast/src/utils.py b/MindEarth/applications/medium-range/graphcast/src/utils.py
index a3fa67010..1f80de9b6 100644
--- a/MindEarth/applications/medium-range/graphcast/src/utils.py
+++ b/MindEarth/applications/medium-range/graphcast/src/utils.py
@@ -17,8 +17,8 @@ import os
 
 import matplotlib.pyplot as plt
 import numpy as np
+import mindspore as ms
 import mindspore.common.dtype as mstype
-import mindspore.communication.management as D
 from mindspore.communication import init
 from mindspore import context, Tensor, ops, nn
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
@@ -63,12 +63,8 @@ def amp_convert(network, black_list=None):
 def init_data_parallel(use_ascend):
     """Init data parallel for model running"""
     if use_ascend:
+        ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL, gradients_mean=True)
         init()
-        device_num = D.get_group_size()
-        os.environ['HCCL_CONNECT_TIMEOUT'] = "7200"
-        context.reset_auto_parallel_context()
-        context.set_auto_parallel_context(parallel_mode=context.ParallelMode.DATA_PARALLEL, gradients_mean=True,
-                                          device_num=device_num, parameter_broadcast=False)
     else:
         init("nccl")
         context.reset_auto_parallel_context()
--
Gitee
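
Reviewer note: below is a minimal, illustrative sketch of how the reworked `init_data_parallel` in `src/utils.py` is driven once the training entry point is started through msrun. It covers only the Ascend path touched by this patch; the file name `check_parallel_init.py`, the `__main__` block, and the rank printout are assumptions added for demonstration and are not part of the patch.

```python
# check_parallel_init.py -- hypothetical standalone check, not part of this patch.
# Under msrun, rank and group-size information comes from the environment prepared
# by the launcher, so the helper only has to set the data-parallel context and
# call init(); the old rank_table / DEVICE_START_ID bookkeeping is no longer needed.
import mindspore as ms
from mindspore.communication import init, get_rank, get_group_size


def init_data_parallel():
    """Ascend data-parallel init, mirroring the patched utils.py."""
    ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.DATA_PARALLEL,
                                 gradients_mean=True)
    init()  # HCCL is initialized from the msrun-provided environment


if __name__ == "__main__":
    ms.set_context(device_target="Ascend")
    init_data_parallel()
    # Each worker spawned by msrun reports its own rank within the group.
    print(f"worker {get_rank()} of {get_group_size()} initialized")
```

Launched, for example, with `msrun --worker_num=${DEVICE_NUM} --local_worker_num=${DEVICE_NUM} --master_port=8118 check_parallel_init.py`, i.e. the same flags that the updated `run_distributed_train.sh` passes when it starts `main.py`.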