From 865487749b295128f5e005e89d6998618d6a2604 Mon Sep 17 00:00:00 2001 From: Tridu33 Date: Sat, 16 Nov 2024 16:51:57 +0800 Subject: [PATCH] bugfix: fix issue https://gitee.com/mindspore/mindspore/issues/IB455C?from=project-issue , where --worker_num and --local_worker_num must be the same number, like 8 or 4. --- docs/sample_code/startup_method/msrun_1.sh | 2 +- docs/sample_code/startup_method/msrun_2.sh | 2 +- docs/sample_code/startup_method/msrun_single.sh | 2 +- docs/sample_code/startup_method/run_mpirun_1.sh | 2 +- docs/sample_code/startup_method/run_mpirun_2.sh | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/sample_code/startup_method/msrun_1.sh b/docs/sample_code/startup_method/msrun_1.sh index 76d4602c33..9c9fd2334e 100644 --- a/docs/sample_code/startup_method/msrun_1.sh +++ b/docs/sample_code/startup_method/msrun_1.sh @@ -18,4 +18,4 @@ rm -rf msrun_log mkdir msrun_log echo "start training" -msrun --worker_num=8 --local_worker_num=4 --master_addr= --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py +msrun --worker_num=8 --local_worker_num=8 --master_addr= --master_port=8118 --node_rank=0 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py diff --git a/docs/sample_code/startup_method/msrun_2.sh b/docs/sample_code/startup_method/msrun_2.sh index 210da5f81d..19e3bdeeba 100644 --- a/docs/sample_code/startup_method/msrun_2.sh +++ b/docs/sample_code/startup_method/msrun_2.sh @@ -18,4 +18,4 @@ rm -rf msrun_log mkdir msrun_log echo "start training" -msrun --worker_num=8 --local_worker_num=4 --master_addr= --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py +msrun --worker_num=8 --local_worker_num=8 --master_addr= --master_port=8118 --node_rank=1 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py diff --git a/docs/sample_code/startup_method/msrun_single.sh b/docs/sample_code/startup_method/msrun_single.sh index 6d0cb6007f..7bb7f0ff5c 
100644 --- a/docs/sample_code/startup_method/msrun_single.sh +++ b/docs/sample_code/startup_method/msrun_single.sh @@ -18,4 +18,4 @@ rm -rf msrun_log mkdir msrun_log echo "start training" -msrun --worker_num=8 --local_worker_num=4 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py +msrun --worker_num=8 --local_worker_num=8 --master_port=8118 --log_dir=msrun_log --join=True --cluster_time_out=300 net.py diff --git a/docs/sample_code/startup_method/run_mpirun_1.sh b/docs/sample_code/startup_method/run_mpirun_1.sh index db3689ab9f..6016cedddd 100644 --- a/docs/sample_code/startup_method/run_mpirun_1.sh +++ b/docs/sample_code/startup_method/run_mpirun_1.sh @@ -14,5 +14,5 @@ if [ ! -d "${EXEC_PATH}/MNIST_Data" ]; then fi export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/ -mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename log_output \ +mpirun -n 8 -H 192.168.*.1:8,192.168.*.2:8 --output-filename ./mpirun_log/log_output \ --merge-stderr-to-stdout python net.py diff --git a/docs/sample_code/startup_method/run_mpirun_2.sh b/docs/sample_code/startup_method/run_mpirun_2.sh index e5ca9cdeb5..101c28615e 100644 --- a/docs/sample_code/startup_method/run_mpirun_2.sh +++ b/docs/sample_code/startup_method/run_mpirun_2.sh @@ -15,5 +15,5 @@ fi export DATA_PATH=${EXEC_PATH}/MNIST_Data/train/ HOSTFILE=$1 -mpirun -n 16 --hostfile $HOSTFILE --output-filename log_output \ +mpirun -n 16 --hostfile $HOSTFILE --output-filename ./mpirun_log/log_output \ --merge-stderr-to-stdout python net.py -- Gitee