From 2192270ba3f3d96207376e0db7d1ca898e6d064f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:12:32 +0000
Subject: [PATCH 01/22] update

---
 .../Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
index 118737861f..70300b545e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
@@ -3,7 +3,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; do not modify
 export RANK_SIZE=1
-
+export BMMV2_ENABLE=1
 # Dataset path; keep empty, do not modify
 data_path=""
 
-- 
Gitee

From bc9f56f2367d11e07ec72c05de525813320438f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:37:21 +0000
Subject: [PATCH 02/22] update

---
 .../models/transformer.py | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 3995699393..eefbd6489f 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -29,7 +29,7 @@ from torch import Tensor
 from typing import Optional, Dict
 torch.set_printoptions(threshold=500000000, linewidth=1024)
 from utils import options
-
+import torch.nn.functional as F
 from modules import (
     MultiheadAttention, SinusoidalPositionalEmbedding
 )
@@ -184,8 +184,8 @@ class TransformerEncoder(nn.Module):
         x = self.embed_scale * self.embed_tokens(src_tokens)
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        # if self.training:
+            x = F.dropout(x, self.seed, p=self.dropout)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -266,8 +266,8 @@ class TransformerDecoder(IncrementalDecoder):
         x = self.embed_scale * self.embed_tokens(prev_output_tokens)
         if positions is not None:
             x += positions
-        if self.training:
-            x,_,_ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        # if self.training:
+            x= F.dropout(x, self.seed, p=self.dropout)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -319,18 +319,18 @@ class TransformerEncoderLayer(nn.Module):
                                  incremental_state=None,
                                  need_weights=False,
                                  static_kv=False)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        # if self.training:
+            x = F.dropout(x, self.seed, p=self.dropout)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        # if self.training:
+        x = F.dropout(x, self.seed, p=self.relu_dropout)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p =self.dropout)
+        # if self.training:
+        x = F.dropout(x, self.seed, p =self.dropout)
         x = residual + x
         x = self.ln2(x)
         return x
@@ -383,8 +383,8 @@ class TransformerDecoderLayer(nn.Module):
             static_kv=False
         )
 
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        # if self.training:
+        x = F.dropout(x, self.seed, p=self.dropout)
         x = residual + x
         x = self.self_attn_layer_norm(x)
@@ -402,19 +402,19 @@ class TransformerDecoderLayer(nn.Module):
                 mask_future_timesteps=False,
                 need_weights=(not self.training and self.need_attn),
             )
-            if self.training:
-                x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+            # if self.training:
+            x = F.dropout(x, self.seed, p=self.dropout)
             x = residual + x
             x = self.encoder_attn_layer_norm(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.relu_dropout)
+        # if self.training:
+        x = F.dropout(x, self.seed, p=self.relu_dropout)
         x = self.fc2(x)
-        if self.training:
-            x, _, _ = torch.npu_dropoutV2(x, self.seed, p=self.dropout)
+        # if self.training:
+        x = F.dropout(x, self.seed, p=self.dropout)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
-- 
Gitee

From e574fd1f4374d54ad43b437375cd63503ceb70eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:39:37 +0000
Subject: [PATCH 03/22] update

---
 .../train_1p.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index b1c4229807..fc7e4352b6 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,15 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>100:pass
-        if i < num_batches - 1 and (i + 1) % update_freq > 0:
-            # buffer updates according to --update-freq
-            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-            continue
-        else:
-            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-        if loss != None:
-            losses.update(loss)
+        if i>20:break
+        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+            if i < num_batches - 1 and (i + 1) % update_freq > 0:
+                # buffer updates according to --update-freq
+                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+                continue
+            else:
+                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+            if loss != None:
+                losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From fe0312febcf599a6fbdad70a8e659280ed280f02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:40:56 +0000
Subject: [PATCH 04/22] update

---
 .../nlp/Transformer_ID0105_for_PyTorch/models/transformer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index eefbd6489f..3edd80f899 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -320,7 +320,7 @@ class TransformerEncoderLayer(nn.Module):
                                  need_weights=False,
                                  static_kv=False)
         # if self.training:
-            x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout)
         x = residual + x
         x = self.ln1(x)
-- 
Gitee
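Note: Patches 02-04 replace the Ascend-only torch.npu_dropoutV2 with the portable F.dropout, but keep passing self.seed as the second positional argument. F.dropout takes no seed at all; its signature is dropout(input, p=0.5, training=True, inplace=False), so self.seed lands in the p slot and collides with the explicit p= keyword. A minimal sketch of the mismatch (the npu_dropoutV2 call shape follows the lines removed above; patches 08 and 10 below converge on the corrected call):

    import torch
    import torch.nn.functional as F

    x, seed, p = torch.randn(4, 8), 12345, 0.1

    # torch.npu_dropoutV2 (removed above) takes a seed and returns a tuple:
    #     out, mask, new_seed = torch.npu_dropoutV2(x, seed, p=p)
    # A straight swap to F.dropout pushes the seed into the `p` slot:
    try:
        F.dropout(x, seed, p=p)
    except TypeError as e:
        print(e)  # dropout() got multiple values for argument 'p'

    out = F.dropout(x, p=p, training=True)  # the form patch 10 settles on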
From 69f8322f42f831ace0898c9499d2044ebbac7bca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:42:05 +0000
Subject: [PATCH 05/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index fc7e4352b6..d0e1cb7b2e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>20:break
-        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-            if i < num_batches - 1 and (i + 1) % update_freq > 0:
-                # buffer updates according to --update-freq
-                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-                continue
-            else:
-                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-            if loss != None:
-                losses.update(loss)
+        if i>100:pass
+        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+        if i < num_batches - 1 and (i + 1) % update_freq > 0:
+            # buffer updates according to --update-freq
+            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+            continue
+        else:
+            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+        if loss != None:
+            losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From 7601a346e67ac9baddae6b5741b1c679d3c1e522 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:49:14 +0000
Subject: [PATCH 06/22] update

---
 .../nlp/Transformer_ID0105_for_PyTorch/models/transformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 3edd80f899..0a29e71f2b 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -185,7 +185,7 @@ class TransformerEncoder(nn.Module):
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
         # if self.training:
-            x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -267,7 +267,7 @@ class TransformerDecoder(IncrementalDecoder):
         if positions is not None:
             x += positions
         # if self.training:
-            x= F.dropout(x, self.seed, p=self.dropout)
+        x= F.dropout(x, self.seed, p=self.dropout)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
-- 
Gitee

From 5697d58faf90abb1613b863fdf58197e46c5aec3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:50:29 +0000
Subject: [PATCH 07/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index d0e1cb7b2e..00713d6a95 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
    for i, sample in enumerate(itr):
-        if i>100:pass
-        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-        if i < num_batches - 1 and (i + 1) % update_freq > 0:
-            # buffer updates according to --update-freq
-            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-            continue
-        else:
-            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-        if loss != None:
-            losses.update(loss)
+        if i>10:break
+        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+            if i < num_batches - 1 and (i + 1) % update_freq > 0:
+                # buffer updates according to --update-freq
+                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+                continue
+            else:
+                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+            if loss != None:
+                losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From 8ca916d4b9dbd34ba1960894af5fc2ebb8bb9052 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:58:30 +0000
Subject: [PATCH 08/22] update

---
 .../models/transformer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 0a29e71f2b..3d080b4725 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -185,7 +185,7 @@ class TransformerEncoder(nn.Module):
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout,training=self.training)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -267,7 +267,7 @@ class TransformerDecoder(IncrementalDecoder):
         if positions is not None:
             x += positions
         # if self.training:
-        x= F.dropout(x, self.seed, p=self.dropout)
+        x= F.dropout(x, self.seed, p=self.dropout, training=self.training)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -320,17 +320,17 @@ class TransformerEncoderLayer(nn.Module):
                                  need_weights=False,
                                  static_kv=False)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, self.seed, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
         # if self.training:
-        x = F.dropout(x, self.seed, p =self.dropout)
+        x = F.dropout(x, self.seed, p =self.dropout, training=self.training)
         x = residual + x
         x = self.ln2(x)
         return x
@@ -384,7 +384,7 @@ class TransformerDecoderLayer(nn.Module):
         )
 
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
@@ -403,7 +403,7 @@ class TransformerDecoderLayer(nn.Module):
                 need_weights=(not self.training and self.need_attn),
             )
             # if self.training:
-            x = F.dropout(x, self.seed, p=self.dropout)
+            x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
             x = residual + x
             x = self.encoder_attn_layer_norm(x)
@@ -411,10 +411,10 @@ class TransformerDecoderLayer(nn.Module):
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.relu_dropout)
+        x = F.dropout(x, self.seed, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout)
+        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
-- 
Gitee
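Note: Patch 08 threads training=self.training through every F.dropout call. This matters because F.dropout defaults to training=True: without the flag, these modules would keep dropping activations at eval time, whereas the original if self.training: guard (and nn.Dropout) disable dropout outside training. A small self-contained check of the gated form:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class Block(nn.Module):
        def __init__(self, p=0.5):
            super().__init__()
            self.p = p

        def forward(self, x):
            # Gated functional dropout: a no-op in eval mode, like nn.Dropout.
            return F.dropout(x, p=self.p, training=self.training)

    m, x = Block(), torch.ones(2, 4)
    m.train()
    print(bool((m(x) == x).all()))  # usually False: elements dropped and rescaled
    m.eval()
    print(bool((m(x) == x).all()))  # True: dropout disabled at eval time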
From 54e395c01867e096e027103ee771cfb7e03bc4ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 01:59:47 +0000
Subject: [PATCH 09/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index 00713d6a95..d0e1cb7b2e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>10:break
-        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-            if i < num_batches - 1 and (i + 1) % update_freq > 0:
-                # buffer updates according to --update-freq
-                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-                continue
-            else:
-                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-            if loss != None:
-                losses.update(loss)
+        if i>100:pass
+        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+        if i < num_batches - 1 and (i + 1) % update_freq > 0:
+            # buffer updates according to --update-freq
+            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+            continue
+        else:
+            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+        if loss != None:
+            losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From 06f3e88cab4fafb02982dc4363cf25776b91e20c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 02:07:13 +0000
Subject: [PATCH 10/22] update

---
 .../models/transformer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index 3d080b4725..f8a7d0a6d9 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -185,7 +185,7 @@ class TransformerEncoder(nn.Module):
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout,training=self.training)
+        x = F.dropout(x, p=self.dropout,training=self.training)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
         # B x T x C -> T x B x C
@@ -267,7 +267,7 @@ class TransformerDecoder(IncrementalDecoder):
         if positions is not None:
             x += positions
         # if self.training:
-        x= F.dropout(x, self.seed, p=self.dropout, training=self.training)
+        x= F.dropout(x, p=self.dropout, training=self.training)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -320,17 +320,17 @@ class TransformerEncoderLayer(nn.Module):
                                  need_weights=False,
                                  static_kv=False)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.relu_dropout, training=self.training)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
         # if self.training:
-        x = F.dropout(x, self.seed, p =self.dropout, training=self.training)
+        x = F.dropout(x, p =self.dropout, training=self.training)
         x = residual + x
         x = self.ln2(x)
         return x
@@ -384,7 +384,7 @@ class TransformerDecoderLayer(nn.Module):
         )
 
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
@@ -403,7 +403,7 @@ class TransformerDecoderLayer(nn.Module):
                 need_weights=(not self.training and self.need_attn),
             )
             # if self.training:
-            x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
+            x = F.dropout(x, p=self.dropout, training=self.training)
             x = residual + x
             x = self.encoder_attn_layer_norm(x)
@@ -411,10 +411,10 @@ class TransformerDecoderLayer(nn.Module):
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.relu_dropout, training=self.training)
+        x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
         # if self.training:
-        x = F.dropout(x, self.seed, p=self.dropout, training=self.training)
+        x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
-- 
Gitee

From 20448fd14e3229ffa51869731a53f47f18c91c59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 02:17:44 +0000
Subject: [PATCH 11/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index d0e1cb7b2e..00713d6a95 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>100:pass
-        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-        if i < num_batches - 1 and (i + 1) % update_freq > 0:
-            # buffer updates according to --update-freq
-            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-            continue
-        else:
-            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-        if loss != None:
-            losses.update(loss)
+        if i>10:break
+        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+            if i < num_batches - 1 and (i + 1) % update_freq > 0:
+                # buffer updates according to --update-freq
+                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+                continue
+            else:
+                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+            if loss != None:
+                losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From 7136f330dd4d6da943abe28068862800b40f1edc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 02:31:19 +0000
Subject: [PATCH 12/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index 00713d6a95..d0e1cb7b2e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>10:break
-        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-            if i < num_batches - 1 and (i + 1) % update_freq > 0:
-                # buffer updates according to --update-freq
-                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-                continue
-            else:
-                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-            if loss != None:
-                losses.update(loss)
+        if i>100:pass
+        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+        if i < num_batches - 1 and (i + 1) % update_freq > 0:
+            # buffer updates according to --update-freq
+            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+            continue
+        else:
+            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+        if loss != None:
+            losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee
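Note: Patches 09-12 (and 13-14 below) flip train_1p.py back and forth between two states: a short profiling run that wraps each step in torch.npu.profile and breaks after roughly ten iterations, and the normal full run with the profiler commented out. The pattern being toggled is roughly the sketch below; train_step is a stand-in for trainer.train_step, while the torch.npu.profile call and its arguments are taken verbatim from the diffs (it is the Ascend torch_npu context manager, so this only runs on an NPU build):

    import torch  # on Ascend, torch_npu must also be imported for torch.npu.*

    def train_step(sample):          # stand-in for trainer.train_step(...)
        return sample.sum()

    itr = [torch.randn(8) for _ in range(100)]
    profile_steps = 10               # the patches hard-code this bound

    for i, sample in enumerate(itr):
        if i > profile_steps:
            break                    # capture only the first few steps, then stop
        with torch.npu.profile(profiler_result_path="./results",
                               use_e2e_profiler=True):
            loss = train_step(sample)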
From 041a59a586f6ca597c11eb6063189932b3fc83c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 02:59:19 +0000
Subject: [PATCH 13/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index d0e1cb7b2e..00713d6a95 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>100:pass
-        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-        if i < num_batches - 1 and (i + 1) % update_freq > 0:
-            # buffer updates according to --update-freq
-            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-            continue
-        else:
-            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-        if loss != None:
-            losses.update(loss)
+        if i>10:break
+        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+            if i < num_batches - 1 and (i + 1) % update_freq > 0:
+                # buffer updates according to --update-freq
+                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+                continue
+            else:
+                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+            if loss != None:
+                losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From f30b7fcf6078c329f05118135b7d50cead7e94c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 03:16:51 +0000
Subject: [PATCH 14/22] update

---
 .../train_1p.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index 00713d6a95..d0e1cb7b2e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -217,16 +217,16 @@ def train(args, trainer, datasets, epoch_itr):
 
     trainer.get_throughput_meter().reset()
     for i, sample in enumerate(itr):
-        if i>10:break
-        with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
-            if i < num_batches - 1 and (i + 1) % update_freq > 0:
-                # buffer updates according to --update-freq
-                loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-                continue
-            else:
-                loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
-            if loss != None:
-                losses.update(loss)
+        if i>100:pass
+        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
+        if i < num_batches - 1 and (i + 1) % update_freq > 0:
+            # buffer updates according to --update-freq
+            loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
+            continue
+        else:
+            loss = trainer.train_step(sample, update_params=True, last_step=(i == len(itr) - 1))
+        if loss != None:
+            losses.update(loss)
 
         if i >= 10:
             t = time.time()
-- 
Gitee

From 9fc776477f496c187dd28b4c343c3faa161e13d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 03:17:34 +0000
Subject: [PATCH 15/22] update

---
 .../nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index 06828617bf..9ccd3f32ff 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -10,7 +10,7 @@ data_path=""
 
 # Network name, same as the directory name; modify after model review
 Network="Transformer_ID0105_for_PyTorch"
-
+export BMMV2_ENABLE=1
 # Number of training epochs
 train_epochs=30
 # Training batch_size; modify after model review
-- 
Gitee

From c4edb381e676c97960bbd421c9b9192732b50a74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 03:25:15 +0000
Subject: [PATCH 16/22] update

---
 .../nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
index 8942bbc29c..b1ef8b10f4 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
@@ -3,7 +3,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; do not modify
 export RANK_SIZE=1
-
+export BMMV2_ENABLE=1
 # Dataset path; keep empty, do not modify
 data_path=""
 
-- 
Gitee

From b8d18d39e71eccfaa24809031da5ccb79d76ec6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:29:52 +0000
Subject: [PATCH 17/22] update

---
 .../Transformer_ID0105_for_PyTorch/models/transformer.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
index f8a7d0a6d9..2671386f66 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/models/transformer.py
@@ -184,7 +184,6 @@ class TransformerEncoder(nn.Module):
         x = self.embed_scale * self.embed_tokens(src_tokens)
         if self.embed_positions is not None:
             x += self.embed_positions(src_tokens)
-        # if self.training:
         x = F.dropout(x, p=self.dropout,training=self.training)
 
         # B:batch size ; T: seq length ; C: embedding dim 512
@@ -266,7 +265,6 @@ class TransformerDecoder(IncrementalDecoder):
         x = self.embed_scale * self.embed_tokens(prev_output_tokens)
         if positions is not None:
             x += positions
-        # if self.training:
         x= F.dropout(x, p=self.dropout, training=self.training)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
@@ -319,17 +317,14 @@ class TransformerEncoderLayer(nn.Module):
                                  incremental_state=None,
                                  need_weights=False,
                                  static_kv=False)
-        # if self.training:
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.ln1(x)
 
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        # if self.training:
         x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        # if self.training:
         x = F.dropout(x, p =self.dropout, training=self.training)
         x = residual + x
         x = self.ln2(x)
@@ -383,7 +378,6 @@ class TransformerDecoderLayer(nn.Module):
             static_kv=False
         )
 
-        # if self.training:
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.self_attn_layer_norm(x)
@@ -402,7 +396,6 @@ class TransformerDecoderLayer(nn.Module):
                 mask_future_timesteps=False,
                 need_weights=(not self.training and self.need_attn),
             )
-            # if self.training:
             x = F.dropout(x, p=self.dropout, training=self.training)
             x = residual + x
 
@@ -410,10 +403,8 @@ class TransformerDecoderLayer(nn.Module):
         residual = x
         x = F.threshold(self.fc1(x), 0.0, 0.0)
-        # if self.training:
         x = F.dropout(x, p=self.relu_dropout, training=self.training)
         x = self.fc2(x)
-        # if self.training:
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
         x = self.layer_norm(x)
         return x, attn
-- 
Gitee
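Note: Alongside the dropout cleanup, patches 01, 15, 16, and 20 export BMMV2_ENABLE=1 in every launch script. The scripts themselves never read it; judging by the name, it appears to switch batched matrix multiplication onto the BatchMatMulV2 operator inside the Ascend PyTorch adapter, but that reading is an assumption -- the series does not document the variable. All the scripts rely on is that the training process inherits it, so the adapter can read it from the environment at runtime, e.g.:

    import os

    # Hypothetical consumer-side lookup: how an on/off flag like BMMV2_ENABLE
    # is typically read (the real check lives inside the NPU adapter, not here).
    bmmv2_enabled = os.environ.get("BMMV2_ENABLE", "0") == "1"
    print("BMMV2 kernel path enabled:", bmmv2_enabled)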
From b1cc6b0359883a291fc2bfa67d81bd62dd600f13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:31:45 +0000
Subject: [PATCH 18/22] update

---
 .../dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
index b1ef8b10f4..2b6134790e 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_1p.sh
@@ -3,6 +3,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; do not modify
 export RANK_SIZE=1
+
 export BMMV2_ENABLE=1
 # Dataset path; keep empty, do not modify
 data_path=""
-- 
Gitee

From 2eea6acee416aa45f11ff0a65ee0d48619bfa526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:32:25 +0000
Subject: [PATCH 19/22] update

---
 .../dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index 9ccd3f32ff..dc8f9b79a7 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -10,6 +10,7 @@ data_path=""
 
 # Network name, same as the directory name; modify after model review
 Network="Transformer_ID0105_for_PyTorch"
+
 export BMMV2_ENABLE=1
 # Number of training epochs
 train_epochs=30
-- 
Gitee

From 6928741d7845f831ed2fd4ba5ed0ec0408ef6a74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:32:59 +0000
Subject: [PATCH 20/22] update

---
 .../Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
index b10fe5ec3b..f4b26836cd 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
@@ -7,6 +7,7 @@ export RANK_SIZE=8
 export MASTER_ADDR=localhost
 export MASTER_PORT=29688
 export HCCL_WHITELIST_DISABLE=1
+export BMMV2_ENABLE=1
 
 # Dataset path; keep empty, do not modify
 data_path=""
-- 
Gitee

From 5678dd480fb03f94c746f420f4b96fc8aad65070 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:33:31 +0000
Subject: [PATCH 21/22] update

---
 .../Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
index 70300b545e..48e7fb3af7 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
@@ -3,6 +3,7 @@ cur_path=`pwd`
 
 # Collective communication parameters; do not modify
 export RANK_SIZE=1
+
 export BMMV2_ENABLE=1
 # Dataset path; keep empty, do not modify
 data_path=""
-- 
Gitee

From 54ae5bf667b9ca61956e95953f5e847b9e6904e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=90=89=E5=AE=8F=E6=A2=85?= <591861959@qq.com>
Date: Wed, 30 Mar 2022 08:35:05 +0000
Subject: [PATCH 22/22] update

---
 PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
index d0e1cb7b2e..b1c4229807 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.py
@@ -218,7 +218,6 @@ def train(args, trainer, datasets, epoch_itr):
 
     for i, sample in enumerate(itr):
         if i>100:pass
-        # with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True):
         if i < num_batches - 1 and (i + 1) % update_freq > 0:
             # buffer updates according to --update-freq
             loss = trainer.train_step(sample, update_params=False, last_step=(i == len(itr) - 1))
-- 
Gitee
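Note: With the profiler experiments reverted by patch 22, what remains in train_1p.py is the --update-freq gradient-accumulation loop: every batch runs forward/backward, but parameters are updated only on every update_freq-th batch and on the last one (the leading "if i>100:pass" is a no-op). A self-contained sketch of that accumulation logic, with a toy model and optimizer standing in for the trainer:

    import torch

    model = torch.nn.Linear(4, 1)                      # stand-in for the trainer
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    batches = [torch.randn(2, 4) for _ in range(8)]
    update_freq, num_batches = 4, len(batches)

    for i, batch in enumerate(batches):
        loss = model(batch).pow(2).mean()
        loss.backward()                                # gradients accumulate
        if i < num_batches - 1 and (i + 1) % update_freq > 0:
            continue                                   # buffered step (update_params=False)
        opt.step()                                     # apply the accumulated update
        opt.zero_grad()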