diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
index 7886ada80dc0c113df4dd2c5b2dba3f34d5fb809..26edd676cebb02f8dc1f25336bd91233bb954750 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
@@ -259,6 +259,10 @@ parser.add_argument('-t', '--fine-tuning', action='store_true',
                     help='transfer learning + fine tuning - train only the last FC layer.')
 
+# graph mode
+parser.add_argument('--graph_mode',
+                    action='store_true',
+                    help='whether to enable graph mode.')
 best_acc1 = 0
 
 def nvidia_model_config(args):
 
@@ -682,6 +686,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
     if args.benchmark == 1 :
         optimizer.zero_grad()
     for i, (images, target) in enumerate(train_loader):
+        # graph mode
+        if args.graph_mode:
+            print("graph mode on")
+            torch.npu.enable_graph_mode()
         # measure data loading time
         data_time.update(time.time() - end)
 
@@ -689,8 +697,15 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
 
         if args.device == 'npu':
             loc = 'npu:{}'.format(args.gpu)
-            images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
-            target = target.to(torch.int32).to(loc, non_blocking=True)
+            # graph mode
+            if args.graph_mode:
+                images = images.to(loc, non_blocking=True)
+                target = target.to(loc, non_blocking=True)
+                images = images.to(torch.float).sub(mean).div(std)
+                target = target.to(torch.int32)
+            else:
+                images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+                target = target.to(torch.int32).to(loc, non_blocking=True)
         else:
             images = images.cuda(args.gpu, non_blocking=True)
             target = target.cuda(args.gpu, non_blocking=True)
@@ -701,10 +716,12 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
         loss = criterion(output, target)
 
         # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
+        # graph mode
+        if not args.graph_mode:
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
 
         # compute gradient and do SGD step
         if args.benchmark == 0 :
@@ -727,7 +744,13 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
                 optimizer.zero_grad()
             torch.npu.synchronize()
-
+
+        # graph mode
+        if args.graph_mode:
+            print("graph mode launch")
+            torch.npu.launch_graph()
+            if i == len(train_loader) - 1:
+                torch.npu.synchronize()
         # measure elapsed time
         batch_time.update(time.time() - end)
         end = time.time()
 
@@ -736,7 +759,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
         if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                     and args.rank % ngpus_per_node == 0):
             progress.display(i)
-
+    # graph mode
+    if args.graph_mode:
+        print("graph mode off")
+        torch.npu.disable_graph_mode()
     if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                 and args.rank % ngpus_per_node == 0):
         print("[npu id:",args.gpu,"]", "batch_size:", ngpus_per_node*args.batch_size,
               'Time: {:.3f}'.format(batch_time.avg), '* FPS@all {:.3f}'.format(
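Review note on the Python hunks above: graph capture is enabled at the top of every iteration, the captured ops are dispatched with launch_graph() after the optimizer step, and capture is disabled once the loop exits; the accuracy/loss bookkeeping is skipped in graph mode because loss.item() and accuracy() copy results back to the host, which forces a synchronization in the middle of capture. A minimal standalone sketch of the same pattern, assuming the Ascend torch_npu plugin that provides the torch.npu.* calls used in the diff (model, criterion, optimizer, loader, and device are hypothetical stand-ins, not names from the patch):

```python
import torch
import torch_npu  # Ascend plugin assumed installed; provides the torch.npu.* graph-mode API

def train_one_epoch_graph_mode(loader, model, criterion, optimizer, device):
    """Sketch of the per-iteration graph-mode pattern used in the patch."""
    num_steps = len(loader)
    for i, (images, target) in enumerate(loader):
        # Start capturing this iteration's ops into a graph.
        torch.npu.enable_graph_mode()

        # Host-to-device copies first; the dtype casts then run on device,
        # mirroring the graph_mode branch of the patched input pipeline.
        images = images.to(device, non_blocking=True).to(torch.float)
        target = target.to(device, non_blocking=True).to(torch.int32)

        output = model(images)
        loss = criterion(output, target)
        # Deliberately no loss.item()/accuracy() here: a host read-back
        # would synchronize and defeat the graph capture.

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Dispatch everything captured since enable_graph_mode().
        torch.npu.launch_graph()
        if i == num_steps - 1:
            torch.npu.synchronize()  # drain outstanding work on the last step
    torch.npu.disable_graph_mode()
```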
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0013d695905b43a64bd7a73c8d9c5f9e341b726f
--- /dev/null
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_ID3071_ResNet50_performance_8p.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+################ Basic configuration parameters; review and modify per model ##################
+# Required fields (parameters that must be defined here): Network batch_size RANK_SIZE
+# Network name, same as the directory name
+Network="ResNet50_ID3071_for_PyTorch"
+# Training batch_size
+batch_size=4096
+# Number of NPU devices used for training
+export RANK_SIZE=8
+# Dataset path; keep empty here, no need to modify
+data_path=""
+
+# Training epochs (90 for a full run)
+train_epochs=3
+# Number of data-loading workers
+workers=128
+
+# Parameter check: data_path is required; other parameters may be added or removed per model. Any new parameter must be defined and assigned above.
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+
+# Verify that data_path was passed in; no need to modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Resolve the training script execution path ###############
+# cd to the directory at the same level as the test folder before running, for compatibility; test_path_dir is the path containing the test folder
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ];then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+
+
+################# Create the log output directory; no need to modify #################
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+
+################# Launch the training script #################
+# Training start time; no need to modify
+start_time=$(date +%s)
+# Source environment variables when not running on the platform
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+
+python3.7 ./DistributedResnet50/main_apex_d76_npu.py \
+    --data ${data_path} \
+    --addr=$(hostname -I |awk '{print $1}') \
+    --seed=49 \
+    --workers=${workers} \
+    --learning-rate=1.6 \
+    --warmup=8 \
+    --label-smoothing=0.0 \
+    --mom=0.9 \
+    --weight-decay=1.0e-04 \
+    --static-loss-scale=128 \
+    --print-freq=1 \
+    --dist-url='tcp://127.0.0.1:50000' \
+    --dist-backend='hccl' \
+    --multiprocessing-distributed \
+    --world-size=1 \
+    --rank=0 \
+    --benchmark=0 \
+    --device='npu' \
+    --graph_mode \
+    --epochs=${train_epochs} \
+    --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+
+################## Collect training results ################
+# Training end time; no need to modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Training case info; no need to modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+# Print results; no need to modify
+echo "------------------ Final result ------------------"
+# Report performance (FPS); review and modify per model
+grep "FPS@all" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $11}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_fps.log
+FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'`
+# Print; no need to modify
+echo "Final Performance images/sec : $FPS"
+
+# Report training accuracy; review and modify per model
+train_accuracy=`grep -a '* Acc@1' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk 'END {print}'|awk -F "Acc@1" '{print $NF}'|awk -F " " '{print $1}'`
+# Print; no need to modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Performance monitoring summary
+# Collect performance data; no need to modify
+# Throughput
+ActualFPS=${FPS}
+# Training time per iteration (ms)
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep Epoch: ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v Test|awk -F "Loss" '{print $NF}' | awk -F " " '{print $1}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the last iteration; no need to modify
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key info into ${CaseName}.log; no need to modify
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
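For reference, the new script takes only --data_path; a typical launch from the model root directory would be `bash test/train_ID3071_ResNet50_performance_8p.sh --data_path=/path/to/imagenet` (the dataset location is a placeholder). Note that `TrainingTime` is milliseconds per global-batch iteration, computed as batch_size * 1000 / FPS: at batch_size=4096 and a measured 10000 images/sec, for example, that yields 409.60 ms.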