From a7294b8dce11691a9afc2dc7cde1ac0c1b649cbd Mon Sep 17 00:00:00 2001
From: baoxiang
Date: Thu, 21 Sep 2023 10:48:40 +0800
Subject: [PATCH 1/3] modify readme, remove inference part

---
 .../built-in/foundation/LLaMA-13B/README.md   | 26 ++-----------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/PyTorch/built-in/foundation/LLaMA-13B/README.md b/PyTorch/built-in/foundation/LLaMA-13B/README.md
index 67159388dc..f4d8023566 100644
--- a/PyTorch/built-in/foundation/LLaMA-13B/README.md
+++ b/PyTorch/built-in/foundation/LLaMA-13B/README.md
@@ -161,7 +161,7 @@ applied on top of the original LLaMA weights, in two main steps:
    python3 -m fastchat.model.apply_delta \
        --base-model-path /path/to/llama-7b \
        --target-model-path /output/path/to/vicuna-7b \
-       --delta-path lmsys/vicuna-7b-delta-v1.1
+       --delta-path lmsys/vicuna-7b-delta-v0
    ```
 
 #### Vicuna-13B
@@ -171,7 +171,7 @@
    python3 -m fastchat.model.apply_delta \
        --base-model-path /path/to/llama-13b \
        --target-model-path /output/path/to/vicuna-13b \
-       --delta-path lmsys/vicuna-13b-delta-v1.1
+       --delta-path lmsys/vicuna-13b-delta-v0
    ```
 
 After the download completes, the corresponding pretrained-weight folders can be found in the root directory of the source package.
@@ -254,28 +254,6 @@
 | 13B-Competitor A | - | 1386 | 3 | zero2 |
 | 13B-NPU | - | 1498 | 3 | zero2 |
 
-# Model Inference
-
-## Supported Models
-
-- Vicuna, LLaMA
-
-## Running Inference
-
-Because the fused scaledmaskedsoftmax operator currently has limitations on the NPU, inference requires replacing the corresponding file in the transformers installation directory (based on transformers 4.28.1) with the following file from the transformers_modify folder in the source package root:
-
- ```
- modeling_llama_eval.py -> transformers/models/llama/modeling_llama.py
- ```
-
-Run the following commands to perform inference (on a single NPU, the 13B model needs roughly 28 GB of device memory and the 7B model roughly 14 GB):
-
- ```
- source /usr/local/Ascend/ascend_toolkit/set_env.sh
- python3 -m fastchat.serve.cli --model-path path/to/FastChat/7B-vicuna --num-gpus 1 --conv-template conv_one_shot
- python3 -m fastchat.serve.cli --model-path path/to/FastChat/13B-vicuna --num-gpus 1 --conv-template conv_one_shot
- ```
-
 # Release Notes
 
 ## Changes
--
Gitee

From 28722aadf3652be5ebbc3e3d05ab5c97867aa665 Mon Sep 17 00:00:00 2001
From: baoxiang
Date: Mon, 9 Oct 2023 20:20:22 +0800
Subject: [PATCH 2/3] fix bug for CrossEntropyLoss on overflow

---
 .../foundation/LLaMA-13B/transformers_modify/modeling_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/built-in/foundation/LLaMA-13B/transformers_modify/modeling_llama.py b/PyTorch/built-in/foundation/LLaMA-13B/transformers_modify/modeling_llama.py
index 8a00bf45ca..ed2a1c7c49 100644
--- a/PyTorch/built-in/foundation/LLaMA-13B/transformers_modify/modeling_llama.py
+++ b/PyTorch/built-in/foundation/LLaMA-13B/transformers_modify/modeling_llama.py
@@ -689,7 +689,7 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
         shift_labels = shift_labels.view(-1)
         # Enable model parallelism
         shift_labels = shift_labels.to(shift_logits.device)
-        loss = loss_fct(shift_logits, shift_labels)
+        loss = loss_fct(shift_logits.float(), shift_labels).half()
 
         if not return_dict:
             output = (logits,) + outputs[1:]
--
Gitee
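Why patch 2 helps: with fp16 logits, the log-softmax reduction inside `CrossEntropyLoss` over LLaMA's 32000-entry vocabulary runs in half precision, where intermediates can exceed fp16's maximum of 65504 and overflow to inf. The one-line change upcasts the logits to fp32 for the loss and casts the scalar result back to half, so the surrounding fp16 training loop still sees the dtype it expects. A minimal sketch of that pattern in isolation; the batch dimension here is an illustrative assumption, not a value from the patch:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 32000  # LLaMA's vocabulary size
# Hypothetical fp16 logits as they would arrive in LlamaForCausalLM,
# flattened to (batch * seq_len, vocab_size); 8 is an arbitrary choice.
shift_logits = torch.randn(8, vocab_size, dtype=torch.float16)
shift_labels = torch.randint(0, vocab_size, (8,))

loss_fct = CrossEntropyLoss()

# Before the patch the loss was computed directly on fp16 logits, where
# the log-sum-exp can overflow. The patched pattern does the numerically
# sensitive work in fp32, then returns the scalar to half precision:
loss = loss_fct(shift_logits.float(), shift_labels).half()
print(loss.dtype)  # torch.float16
```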
From b34e3ae1d5f16f0bfe20e9305825e2410fca34aa Mon Sep 17 00:00:00 2001
From: 鲍翔
Date: Tue, 31 Oct 2023 09:00:38 +0000
Subject: [PATCH 3/3] update PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 鲍翔
---
 PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh b/PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh
index bd5e76e339..3775299f05 100644
--- a/PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh
+++ b/PyTorch/built-in/foundation/LLaMA-13B/13B_finetune.sh
@@ -17,9 +17,9 @@ run_cmd="HCCL_CONNECT_TIMEOUT=1200 deepspeed --master_port ${MASTER_PORT} --num_
     --fp16 True \
     --output_dir ./ckpt_16p \
     --num_train_epochs 1 \
-    --per_device_train_batch_size 8 \
+    --per_device_train_batch_size 4 \
     --per_device_eval_batch_size 1 \
-    --gradient_accumulation_steps 16 \
+    --gradient_accumulation_steps 32 \
     --evaluation_strategy "no" \
     --save_strategy "steps" \
     --save_steps 500 \
--
Gitee
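Patch 3 halves the per-device micro-batch (8 to 4) and doubles gradient accumulation (16 to 32), so the number of samples consumed per optimizer step is unchanged; only peak activation memory per NPU drops. A quick sanity check of that arithmetic; the world size of 16 is an assumption inferred from the `./ckpt_16p` output directory, not something the patch states:

```python
# Samples per optimizer step = micro_batch * accumulation_steps * world_size.
world_size = 16  # assumption: "./ckpt_16p" suggests a 16-process run

before = 8 * 16 * world_size  # per_device_train_batch_size=8, accumulation=16
after = 4 * 32 * world_size   # per_device_train_batch_size=4, accumulation=32
assert before == after == 2048

# The optimizer-step batch size (and so the per-step LR schedule) is
# preserved; the smaller micro-batch trades more forward/backward passes
# per step for lower peak activation memory on each device.
```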