From 193b750a67e366b373b1377a72d8ad9c36831482 Mon Sep 17 00:00:00 2001
From: "rrrr.cao@hotmail.com" <rrrr.cao@hotmail.com>
Date: Thu, 24 Mar 2022 20:48:23 +0800
Subject: [PATCH 1/7] add graph mode option to resnet50 training

---
 .../DistributedResnet50/main_apex_d76_npu.py  | 33 ++++++++--
 .../pytorch_resnet50_apex.py                  | 64 +++++++++++++++----
 .../test/train_full_1p.sh                     |  9 +++
 .../test/train_full_8p.sh                     |  9 +++
 .../test/train_performance_1p.sh              | 10 +++
 .../test/train_performance_8p.sh              |  9 +++
 6 files changed, 116 insertions(+), 18 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
index 7886ada80d..d05fe4ed88 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/DistributedResnet50/main_apex_d76_npu.py
@@ -682,6 +682,10 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
     if args.benchmark == 1 :
         optimizer.zero_grad()
     for i, (images, target) in enumerate(train_loader):
+        # Graph mode: turned on at the top of every step; the matching
+        # torch.npu.launch_graph() call is at the end of the loop body.
+        if args.graph_mode:
+            torch.npu.enable_graph_mode()
         # measure data loading time
         data_time.update(time.time() - end)
 
@@ -689,8 +693,15 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
 
         if args.device == 'npu':
             loc = 'npu:{}'.format(args.gpu)
-            images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
-            target = target.to(torch.int32).to(loc, non_blocking=True)
+            # Graph mode: copy both tensors to the device first, then cast/normalize there.
+            if args.graph_mode:
+                images = images.to(loc, non_blocking=True)
+                target = target.to(loc, non_blocking=True)
+                images = images.to(torch.float).sub(mean).div(std)
+                target = target.to(torch.int32)
+            else:
+                images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
+                target = target.to(torch.int32).to(loc, non_blocking=True)
         else:
             images = images.cuda(args.gpu, non_blocking=True)
             target = target.cuda(args.gpu, non_blocking=True)
@@ -701,10 +712,12 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
         loss = criterion(output, target)
 
         # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
+        # Graph mode: skip accuracy() and loss.item(); the meters are not updated.
+        if not args.graph_mode:
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
 
         # compute gradient and do SGD step
         if args.benchmark == 0 :
@@ -727,7 +740,13 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
                 optimizer.zero_grad()
 
         torch.npu.synchronize()
-        
+
+        # Graph mode: launch the graph for this step. After the launch,
+        # synchronize() is called only at step 100 (NOTE(review): confirm intent).
+        if args.graph_mode:
+            torch.npu.launch_graph()
+            if i == 100:
+                torch.npu.synchronize()
         # measure elapsed time
         batch_time.update(time.time() - end)
         end = time.time()
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/pytorch_resnet50_apex.py b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/pytorch_resnet50_apex.py
index 695f9de34e..2932965ffd 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/pytorch_resnet50_apex.py
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/pytorch_resnet50_apex.py
@@ -181,10 +181,14 @@ parser.add_argument('-t',
                     '--fine-tuning',
                     action='store_true',
                     help='transfer learning + fine tuning - train only the last FC layer.')
+# Graph mode switch: a store_true flag, so it is off unless --graph_mode is passed.
+parser.add_argument('--graph_mode',
+                    action='store_true',
+                    help='whether to enable graph mode.')
 best_acc1 = 0
-
+args = parser.parse_args()  # module-level: LabelSmoothing.forward reads args.graph_mode
 def main():
-    args = parser.parse_args()
+    
     if args.npu is None:
         args.npu = 0
     global CALCULATE_DEVICE
@@ -428,6 +432,11 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
     optimizer.zero_grad()
     end = time.time()
     for i, (images, target) in enumerate(train_loader):
+        # Graph mode: turned on at the top of every step; the matching
+        # torch.npu.launch_graph() call is at the end of the loop body.
+        if args.graph_mode:
+            torch.npu.enable_graph_mode()
+
         if i > 100:
             pass
         # measure data loading time
@@ -438,20 +447,34 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
 
         images = images.to(CALCULATE_DEVICE, non_blocking=True)
         if args.label_smoothing == 0.0:
-            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
-
+            # Graph mode: copy to the device first, then cast to int32 there.
+            if args.graph_mode:
+                target = target.to(CALCULATE_DEVICE, non_blocking=True).to(torch.int32)
+            else:
+                # Otherwise: cast to int32 first, then copy to the device.
+                target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
         # compute output
         output = model(images)
         loss = criterion(output, target)
 
         if args.label_smoothing > 0.0:
-            target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+            # Graph mode: copy to the device first, then cast to int32 there.
+            if args.graph_mode:
+                target = target.to(CALCULATE_DEVICE, non_blocking=True).to(torch.int32)
+            else:
+                # Otherwise: cast to int32 first, then copy to the device.
+                target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True)
+        
+        
 
         # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
+        # Graph mode: skip accuracy() and loss.item() below, so the
+        # losses/top1/top5 meters are not updated for this step.
+        if not args.graph_mode:
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            losses.update(loss.item(), images.size(0))
+            top1.update(acc1[0], images.size(0))
+            top5.update(acc5[0], images.size(0))
 
         # compute gradient and do SGD step
         with amp.scale_loss(loss, optimizer) as scaled_loss:
@@ -464,6 +487,13 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
                         param.grad /= batch_size_multiplier
             optimizer.step()
             optimizer.zero_grad()
+        
+        # Graph mode: launch the graph for this step. After the launch,
+        # synchronize() is called only at step 100 (NOTE(review): confirm intent).
+        if args.graph_mode:
+            torch.npu.launch_graph()
+            if i == 100:
+                torch.npu.synchronize()
 
         # measure elapsed time
         batch_time.update(time.time() - end)
@@ -474,6 +504,10 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
 
         if i == TRAIN_STEP:
             break
+    # Graph mode: switch graph mode off again once the step loop has finished,
+    # before the summary below is printed.
+    if args.graph_mode:
+        torch.npu.disable_graph_mode()
 
     print("batch_size:", args.batch_size, 'Time: {:.3f}'.format(batch_time.avg), '* FPS@all {:.3f}'.format(
             args.batch_size/batch_time.avg))
@@ -615,12 +649,20 @@ class LabelSmoothing(nn.Module):
         self.smoothing = smoothing
 
     def forward(self, x, target):
-        logprobs = torch.nn.functional.log_softmax(x, dim=-1).to("cpu")
+        # Graph mode keeps the loss math on x's device; otherwise it moves to CPU.
+        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
+        if not args.graph_mode:
+            logprobs = logprobs.to("cpu")
+        target = target.to(logprobs.device)  # gather() index must share logprobs' device
         nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
         nll_loss = nll_loss.squeeze(1)
         smooth_loss = -logprobs.mean(dim=-1)
         loss = self.confidence * nll_loss + self.smoothing * smooth_loss
-        return loss.mean().to(CALCULATE_DEVICE)
+        mean_loss = loss.mean()
+        if args.graph_mode:
+            return mean_loss
+        # Non-graph path computed the loss on CPU; return it on CALCULATE_DEVICE.
+        return mean_loss.to(CALCULATE_DEVICE)
 
 def lr_policy(lr_fn, logger=None):
     if logger is not None:
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh
index 37fd0fd4b8..ce1bf775b0 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_1p.sh
@@ -25,9 +25,17 @@ do
         device_id=`echo ${para#*=}`
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[  $para == --graph_mode* ]];then
+        graph_mode=`echo ${para#*=}`
     fi
 done
 
+# Graph mode: pass --graph_mode to the Python script only when --graph_mode=true is given
+graph=""
+if [[ x"${graph_mode}" == x"true" ]];then
+  graph="--graph_mode"
+fi
+
 # 校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -86,6 +94,7 @@ python3.7 ./pytorch_resnet50_apex.py \
     --warmup 5 \
     --label-smoothing=0.1 \
     --epochs ${train_epochs} \
+    ${graph} \
     --optimizer-batch-size 512 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
 wait
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
index 56b6147f44..169217917c 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_full_8p.sh
@@ -22,9 +22,17 @@ for para in $*
 do
     if [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[  $para == --graph_mode* ]];then
+        graph_mode=`echo ${para#*=}`
     fi
 done
 
+# Graph mode: pass --graph_mode to the Python script only when --graph_mode=true is given
+graph=""
+if [[ x"${graph_mode}" == x"true" ]];then
+  graph="--graph_mode"
+fi
+
 # 校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -98,6 +106,7 @@ python3.7 ./DistributedResnet50/main_apex_d76_npu.py \
         --benchmark=0 \
         --device='npu' \
         --epochs=${train_epochs} \
+        ${graph} \
         --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
 wait
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh
index 96226ecf32..8e3b8c9d2a 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_1p.sh
@@ -25,9 +25,17 @@ do
         device_id=`echo ${para#*=}`
     elif [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[  $para == --graph_mode* ]];then
+        graph_mode=`echo ${para#*=}`
     fi
 done
 
+# Graph mode: pass --graph_mode to the Python script only when --graph_mode=true is given
+graph=""
+if [[ x"${graph_mode}" == x"true" ]];then
+  graph="--graph_mode"
+fi
+
 # 校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -79,6 +87,7 @@ etp_flag=`echo ${check_etp_flag#*=}`
 if [ x"${etp_flag}" != x"true" ];then
     source ${test_path_dir}/env_npu.sh
 fi
+
 python3.7 ./pytorch_resnet50_apex.py \
     --data ${data_path} \
     --npu ${ASCEND_DEVICE_ID} \
@@ -88,6 +97,7 @@ python3.7 ./pytorch_resnet50_apex.py \
     --warmup 5 \
     --label-smoothing=0.1 \
     --epochs ${train_epochs} \
+    ${graph} \
     --optimizer-batch-size 512 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
 wait
diff --git a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
index 850d3dda9e..4c1704fbc4 100644
--- a/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/built-in/cv/classification/ResNet50_for_PyTorch/test/train_performance_8p.sh
@@ -21,9 +21,17 @@ for para in $*
 do
     if [[ $para == --data_path* ]];then
         data_path=`echo ${para#*=}`
+    elif [[  $para == --graph_mode* ]];then
+        graph_mode=`echo ${para#*=}`
     fi
 done
 
+# Graph mode: pass --graph_mode to the Python script only when --graph_mode=true is given
+graph=""
+if [[ x"${graph_mode}" == x"true" ]];then
+  graph="--graph_mode"
+fi
+
 # 校验是否传入data_path,不需要修改
 if [[ $data_path == "" ]];then
     echo "[Error] para \"data_path\" must be confing"
@@ -83,6 +91,7 @@ python3.7 ./DistributedResnet50/main_apex_d76_npu.py \
         --benchmark=0 \
         --device='npu' \
         --epochs=${train_epochs} \
+        ${graph} \
         --batch-size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
 
 wait
-- 
Gitee


From 0ede7c15d269c755fe921de65da19b89363f2a32 Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Fri, 25 Mar 2022 06:28:18 +0000
Subject: [PATCH 2/7] update run_squad.py.

---
 .../nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py        | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
index a5203e848d..12ca1fc130 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
@@ -1108,6 +1108,7 @@ def main():
             train_iter = train_dataloader
             step_start_time = time.time()
             for step, batch in enumerate(train_iter):
+                torch.npu.enable_graph_mode()
                 # Terminate early for benchmarking
                 data_time = time.time() - step_start_time
                 if args.max_steps > 0 and global_step > args.max_steps:
@@ -1150,7 +1151,10 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                final_loss = loss.item()
+                # final_loss = loss.item()
+                final_loss = 0
+                torch.npu.launch_graph()
+                torch.npu.synchronize()
                 step_time = time.time() - step_start_time
                 if step % args.log_freq == 0:
                     # dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
@@ -1162,7 +1166,7 @@ def main():
                                        "step_loss": round(final_loss, 4),
                                        "learning_rate": round(optimizer.param_groups[0]['lr'], 10)})
                 step_start_time = time.time()
-
+            torch.npu.disable_graph_mode()
         time_to_train = time.time() - train_start
 
     if args.do_train and is_main_process() and not args.skip_checkpoint:
-- 
Gitee


From 5cd11bc0857ba49cb7af1686e1ceeee635f2f208 Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Fri, 25 Mar 2022 07:20:00 +0000
Subject: [PATCH 3/7] update
 PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh.

---
 .../Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
index 1ab48782e3..3a3c738ca5 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
@@ -8,7 +8,8 @@ export BMMV2_ENABLE=1
 export RANK_SIZE=1
 export JOB_ID=10087
 RANK_ID_START=0
-
+export ASCEND_SLOG_PRINT_TO_STDOUT=1
+export ASCEND_GLOBAL_LOG_LEVEL=1
 
 # 数据集路径,保持为空,不需要修改
 data_path=""
-- 
Gitee


From 068df456342b501819fd23a8543f93eb50ed9007 Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Fri, 25 Mar 2022 07:33:20 +0000
Subject: [PATCH 4/7] update  train_performance_1p.sh.

---
 .../test/train_performance_1p.sh                              | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
index 3a3c738ca5..16d8a05c16 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
@@ -8,8 +8,8 @@ export BMMV2_ENABLE=1
 export RANK_SIZE=1
 export JOB_ID=10087
 RANK_ID_START=0
-export ASCEND_SLOG_PRINT_TO_STDOUT=1
-export ASCEND_GLOBAL_LOG_LEVEL=1
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+#export ASCEND_GLOBAL_LOG_LEVEL=1
 
 # 数据集路径,保持为空,不需要修改
 data_path=""
-- 
Gitee


From a0f8fc207740a2acbcc224c4c77fb47081657e7e Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Fri, 25 Mar 2022 08:54:36 +0000
Subject: [PATCH 5/7] update
 PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py.

---
 .../nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py         | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
index 12ca1fc130..2327cbcce4 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/run_squad.py
@@ -1108,7 +1108,6 @@ def main():
             train_iter = train_dataloader
             step_start_time = time.time()
             for step, batch in enumerate(train_iter):
-                torch.npu.enable_graph_mode()
                 # Terminate early for benchmarking
                 data_time = time.time() - step_start_time
                 if args.max_steps > 0 and global_step > args.max_steps:
@@ -1151,10 +1150,7 @@ def main():
                     optimizer.zero_grad()
                     global_step += 1
 
-                # final_loss = loss.item()
-                final_loss = 0
-                torch.npu.launch_graph()
-                torch.npu.synchronize()
+                final_loss = loss.item()
                 step_time = time.time() - step_start_time
                 if step % args.log_freq == 0:
                     # dllogger.log(step=(epoch, global_step,), data={"step_loss": final_loss,
@@ -1166,7 +1162,6 @@ def main():
                                        "step_loss": round(final_loss, 4),
                                        "learning_rate": round(optimizer.param_groups[0]['lr'], 10)})
                 step_start_time = time.time()
-            torch.npu.disable_graph_mode()
         time_to_train = time.time() - train_start
 
     if args.do_train and is_main_process() and not args.skip_checkpoint:
-- 
Gitee


From 5157fbb4bcedb30522b959b0d80d1ff955b18368 Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Fri, 25 Mar 2022 08:55:01 +0000
Subject: [PATCH 6/7] update
 PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh.

---
 .../Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
index 16d8a05c16..37d3c83637 100644
--- a/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/built-in/nlp/Bert-Squad_ID0470_for_PyTorch/test/train_performance_1p.sh
@@ -8,8 +8,6 @@ export BMMV2_ENABLE=1
 export RANK_SIZE=1
 export JOB_ID=10087
 RANK_ID_START=0
-#export ASCEND_SLOG_PRINT_TO_STDOUT=1
-#export ASCEND_GLOBAL_LOG_LEVEL=1
 
 # 数据集路径,保持为空,不需要修改
 data_path=""
-- 
Gitee


From 108507be10759808eb38d86c889d87d0df229812 Mon Sep 17 00:00:00 2001
From: Ryan <rrrr.cao@hotmail.com>
Date: Mon, 28 Mar 2022 05:59:00 +0000
Subject: [PATCH 7/7] update mobilenetv2_8p_main_anycard.py.

---
 .../train/mobilenetv2_8p_main_anycard.py      | 116 ++++++++++--------
 1 file changed, 62 insertions(+), 54 deletions(-)

diff --git a/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
index b38bdd8a85..d631020f0d 100644
--- a/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
+++ b/PyTorch/built-in/cv/classification/MobileNetV2_for_PyTorch/train/mobilenetv2_8p_main_anycard.py
@@ -341,61 +341,69 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
     steps_per_epoch = train_loader_len
     print('==========step per epoch======================', steps_per_epoch)
     for i, (images, target) in enumerate(train_loader):
-        if i > 200 :
-            pass
-        # measure data loading time
-        data_time.update(time.time() - end)
-
-        global_step = epoch * steps_per_epoch + i
-        lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
-
-        target = target.to(torch.int32)
-        images = images.to(loc, non_blocking=True).to(torch.float).sub(mean).div(std)
-        target = target.to(loc, non_blocking=True)
-
-        # compute output
-        output = model(images)
-        stream = torch.npu.current_stream()
-        stream.synchronize()
-
-        loss = criterion(output, target)
-        stream = torch.npu.current_stream()
-        stream.synchronize()
-
-        # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
-
-        # compute gradient and do SGD step
-        if args.benchmark == 0:
-            optimizer.zero_grad()
-
-        if args.amp:
-            with amp.scale_loss(loss, optimizer) as scaled_loss:
-                scaled_loss.backward()
-        else:
-            loss.backward()
-
-        stream = torch.npu.current_stream()
-        stream.synchronize()
-
-        if args.benchmark == 0:
-            optimizer.step()
-        elif args.benchmark == 1:
-            batch_size_multiplier = int(OPTIMIZER_BATCH_SIZE / args.batch_size)
-            bm_optimizer_step = ((i + 1) % batch_size_multiplier) == 0
-            if bm_optimizer_step:
-                for param_group in optimizer.param_groups:
-                    for param in param_group['params']:
-                        param.grad /= batch_size_multiplier
-                optimizer.step()
+        #with torch.autograd.profiler.profile(use_npu=False) as prof:
+        if True:
+            torch.npu.enable_graph_mode()
+            if i > 200 :
+                pass
+            # measure data loading time
+            data_time.update(time.time() - end)
+
+            global_step = epoch * steps_per_epoch + i
+            lr = adjust_learning_rate(optimizer, global_step, steps_per_epoch, args)
+
+            images = images.to(loc, non_blocking=True)
+            target = target.to(loc, non_blocking=True)
+            images = images.to(torch.float).sub(mean).div(std)
+            target = target.to(torch.int32)         
+
+            # compute output
+            output = model(images)
+            # stream = torch.npu.current_stream()
+            # stream.synchronize()
+
+            loss = criterion(output, target)
+            # stream = torch.npu.current_stream()
+            # stream.synchronize()
+
+            # measure accuracy and record loss
+            acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            # NOTE(review): the losses/top1/top5 meter updates are disabled in this
+            # loop, so those meters are never updated here and acc1/acc5 look
+            # unused — confirm whether accuracy() should be skipped as well.
+
+            # compute gradient and do SGD step
+            if args.benchmark == 0:
                 optimizer.zero_grad()
-        stream = torch.npu.current_stream()
-        stream.synchronize()
 
-        # measure elapsed time
+            if args.amp:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            # stream = torch.npu.current_stream()
+            # stream.synchronize()
+
+            if args.benchmark == 0:
+                optimizer.step()
+            elif args.benchmark == 1:
+                batch_size_multiplier = int(OPTIMIZER_BATCH_SIZE / args.batch_size)
+                bm_optimizer_step = ((i + 1) % batch_size_multiplier) == 0
+                if bm_optimizer_step:
+                    for param_group in optimizer.param_groups:
+                        for param in param_group['params']:
+                            param.grad /= batch_size_multiplier
+                    optimizer.step()
+                    optimizer.zero_grad()
+            # Graph mode: the per-stage stream.synchronize() calls are disabled in
+            # this loop; the graph is launched once per step instead.
+            torch.npu.launch_graph()
+            if i == 200:
+                torch.npu.synchronize()
+            # measure elapsed time
+        #prof.export_chrome_trace('./npu_profile_%d.json'%i)
+
         batch_time.update(time.time() - end)
         end = time.time()
 
@@ -403,7 +411,7 @@ def train(train_loader, train_loader_len, model, criterion, optimizer, epoch, ar
             if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                         and args.rank % ngpus_per_node == 0):
                 progress.display(i)
-
+    torch.npu.disable_graph_mode()
     if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                 and args.rank % ngpus_per_node == 0):
         print("[npu id:", args.gpu, "]", '* FPS@all {:.3f}'.format(ngpus_per_node * args.batch_size / batch_time.avg))
-- 
Gitee