From db954e0574d98cd6a2e762442d9e690fc9de88e4 Mon Sep 17 00:00:00 2001 From: gitee_code_template Date: Tue, 26 Dec 2023 14:49:34 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=96=B0=E5=A2=9Ezero3=E5=92=8C=E5=8D=95?= =?UTF-8?q?=E6=9C=BA=E8=AE=AD=E7=BB=83=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/Baichuan-13B/README.md | 21 +++++++++++-- .../Baichuan-13B/ds_config_zero3.json | 28 +++++++++++++++++ .../Baichuan-13B/run_baichuan_sft_1m.sh | 30 +++++++++++++++++++ .../foundation/Baichuan-13B/utils/misc.py | 6 ++-- .../Baichuan-13B/utils/modeling_baichuan.py | 3 ++ .../Baichuan-13B/utils/train_bash.py | 9 ++++-- 6 files changed, 88 insertions(+), 9 deletions(-) create mode 100644 PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json create mode 100644 PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh diff --git a/PyTorch/built-in/foundation/Baichuan-13B/README.md b/PyTorch/built-in/foundation/Baichuan-13B/README.md index 8031324445..1302c4e220 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/README.md +++ b/PyTorch/built-in/foundation/Baichuan-13B/README.md @@ -178,8 +178,24 @@ ssh root@ip1 ssh root@ip2 ``` -## 开始训练 +## 开始训练 +**单机启动** + +1、将项目根目录下的`run_baichuan_sft_1m.sh`、`ds_config_zero3.json`文件拷贝到`${模型文件夹名称}`路径下。 +```shell +cp ../run_baichuan_sft_1m.sh . +cp ../ds_config_zero3.json . +``` + +2、启动脚本 +该模型单机8卡微调,执行如下命令启动训练。 +```shell +sh run_baichuan_sft_1m.sh +``` + + +**双机启动** 1、将项目根目录下的`run_baichuan_sft_2m.sh`、`ds_config_zero2.json`、`hostfile`文件拷贝到`${模型文件夹名称}`路径下。 ```shell @@ -193,6 +209,7 @@ cp ../hostfile . ```shell sh run_baichuan_sft_2m.sh ``` + 模型训练部分参数说明如下: ``` @@ -207,7 +224,7 @@ sh run_baichuan_sft_2m.sh --fp16 //使用fp16精度浮点数进行训练。 ``` - **注**:为确保双机训练成功,请保证双机环境及路径一致,包括项目路径、conda环境、cann和驱动等。 + **注**:zero3策略下也可以双机执行训练。为确保双机训练成功,请保证双机环境及路径一致,包括项目路径、conda环境、cann和驱动等。 训练完成后,权重文件保存`--output_dir`参数指定的路径下,并输出模型训练相关信息。 ## 训练结果展示 diff --git a/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json b/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json new file mode 100644 index 0000000000..b8af47551c --- /dev/null +++ b/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json @@ -0,0 +1,28 @@ +{ + "train_micro_batch_size_per_gpu": "auto", + "zero_allow_untested_optimizer": true, + "fp16": { + "enabled": true, + "loss_scale": 16384, + "initial_scale_power": 16, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients" : true + }, + "bf16": { + "enabled": false + }, + "gradient_accumulation_steps": 16, + "gradient_clipping": "auto", + "train_batch_size": "auto", + "wall_clock_breakdown": false +} diff --git a/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh b/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh new file mode 100644 index 0000000000..0572ce503e --- /dev/null +++ b/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=7200 +export INF_NAN_MODE_ENABLE=1 + +NUM_WORKERS=1 +NUM_GPUS_PER_WORKER=8 +MASTER_PORT=6669 + +HCCL_CONNECT_TIMEOUT=1200 deepspeed --num_gpus ${NUM_GPUS_PER_WORKER} src/train_bash.py \ + --stage sft \ + --model_name_or_path ./model_weight \ + --deepspeed ./ds_config_zero3.json \ + --do_train \ + --dataset alpaca_gpt4_en,alpaca_gpt4_zh \ + --template default \ + --finetuning_type full \ + --output_dir ./output_sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --lr_scheduler_type cosine \ + --logging_steps 1 \ + --save_steps 10000000 \ + --learning_rate 1e-6 \ + --num_train_epochs 5.0 \ + --max_grad_norm 0.5 \ + --plot_loss \ + --fp16 | tee logs/train.log diff --git a/PyTorch/built-in/foundation/Baichuan-13B/utils/misc.py b/PyTorch/built-in/foundation/Baichuan-13B/utils/misc.py index 2f1a174748..0bb73a27af 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/utils/misc.py +++ b/PyTorch/built-in/foundation/Baichuan-13B/utils/misc.py @@ -12,12 +12,11 @@ try: is_torch_cuda_available, is_torch_npu_available ) + # only for fp16 _is_fp16_available = is_torch_npu_available() or is_torch_cuda_available() - #_is_bf16_available = is_torch_bf16_gpu_available() or is_torch_bf16_cpu_available _is_bf16_available = False except ImportError: _is_fp16_available = torch.cuda.is_available() - #_is_bf16_available = torch.cuda.is_bf16_supported() _is_bf16_available = False if TYPE_CHECKING: @@ -100,9 +99,8 @@ def torch_gc() -> None: def dispatch_model(model: "PreTrainedModel") -> "PreTrainedModel": r""" Dispatches a pre-trained model to GPUs with balanced memory. - Borrowed from: https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/modeling_utils.py#L2803 """ - if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): # do nothing + if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False): return model if torch.cuda.device_count() > 1: diff --git a/PyTorch/built-in/foundation/Baichuan-13B/utils/modeling_baichuan.py b/PyTorch/built-in/foundation/Baichuan-13B/utils/modeling_baichuan.py index 115d81f455..50b6b1fdf6 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/utils/modeling_baichuan.py +++ b/PyTorch/built-in/foundation/Baichuan-13B/utils/modeling_baichuan.py @@ -30,10 +30,12 @@ def _get_interleave(n): return _get_interleave_power_of_2(closest_power_of_2) + \ _get_interleave(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] + def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) + def _gen_alibi_mask(n_head, max_pos): """used in inference only""" slopes = torch.Tensor(_get_interleave(n_head)) @@ -46,6 +48,7 @@ def _gen_alibi_mask(n_head, max_pos): alibi_mask = alibi_mask.unsqueeze(0) + alibi return alibi_mask + def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): """used in training only""" dim = tensor.size(1) diff --git a/PyTorch/built-in/foundation/Baichuan-13B/utils/train_bash.py b/PyTorch/built-in/foundation/Baichuan-13B/utils/train_bash.py index d6b40ae93e..6dc033ee30 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/utils/train_bash.py +++ b/PyTorch/built-in/foundation/Baichuan-13B/utils/train_bash.py @@ -4,7 +4,6 @@ import torch_npu from torch_npu.contrib import transfer_to_npu - def main(): run_exp() @@ -12,8 +11,12 @@ def main(): def _mp_fn(index): # For xla_spawn (TPUs) main() - + + def setup_seeds(seed=42): + """ + random seed + """ import random import numpy as np import torch.backends.cudnn as cudnn @@ -28,12 +31,12 @@ def setup_seeds(seed=42): cudnn.deterministic = True - if __name__ == "__main__": import deepspeed import deepspeed_npu setup_seeds(42) + # 二进制开启 torch.npu.set_compile_mode(jit_compile=False) deepspeed.init_distributed('hccl') -- Gitee From ca0c29da8876ef21bd4a70a5cac67f402507e5ca Mon Sep 17 00:00:00 2001 From: gitee_code_template Date: Tue, 26 Dec 2023 15:04:44 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=96=B0=E5=A2=9Ezero3=E5=92=8C=E5=8D=95?= =?UTF-8?q?=E6=9C=BA=E8=AE=AD=E7=BB=83=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json | 4 ++-- .../built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json b/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json index b8af47551c..d3d8c6797a 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json +++ b/PyTorch/built-in/foundation/Baichuan-13B/ds_config_zero3.json @@ -10,7 +10,7 @@ "min_loss_scale": 1 }, "zero_optimization": { - "stage": 2, + "stage": 3, "allgather_partitions": true, "allgather_bucket_size": 5e8, "overlap_comm": false, @@ -21,7 +21,7 @@ "bf16": { "enabled": false }, - "gradient_accumulation_steps": 16, + "gradient_accumulation_steps": 4, "gradient_clipping": "auto", "train_batch_size": "auto", "wall_clock_breakdown": false diff --git a/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh b/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh index 0572ce503e..197b058cfe 100644 --- a/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh +++ b/PyTorch/built-in/foundation/Baichuan-13B/run_baichuan_sft_1m.sh @@ -1,6 +1,6 @@ #!/bin/bash -export HCCL_CONNECT_TIMEOUT=7200 +export HCCL_CONNECT_TIMEOUT=1200 export INF_NAN_MODE_ENABLE=1 NUM_WORKERS=1 -- Gitee