diff --git a/graphengine b/graphengine index b56450bde6d5afa1c557437ebf154487afe355f0..236001806129e36c0f48b240c4f61b2e1d92c470 160000 --- a/graphengine +++ b/graphengine @@ -1 +1 @@ -Subproject commit b56450bde6d5afa1c557437ebf154487afe355f0 +Subproject commit 236001806129e36c0f48b240c4f61b2e1d92c470 diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h index 1df19e6d37d1ac8096e0ce439cd9bb8a65468100..8daaa4c12d2c0714cf65c0d54d7b18163f2f2a11 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/ms_kernel/logit.h @@ -24,10 +24,8 @@ class LogitCpuKernel : public CpuKernel { public: LogitCpuKernel() = default; ~LogitCpuKernel() override = default; - protected: uint32_t Compute(CpuKernelContext &ctx) override; - private: template <typename T> uint32_t LogitCompute(CpuKernelContext &ctx); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h index c3b600b234ff591bcf1a385c2eea08d862854a72..aabd3121c43ef07606af24bc6e64ca60706812e0 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/attention_parameter.h @@ -23,6 +23,7 @@ typedef struct AttentionParameter { int head_num_; int head_size_; bool cross_; + float scale_; } AttentionParameter; typedef struct RelativePositionAttentionParameter { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h new file mode 100644 index 0000000000000000000000000000000000000000..4c3254fc2f5652e584e3254d5978aa420c933c78 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/decoder_layer_parameter.h @@ -0,0 +1,38 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ + +#include "nnacl/op_base.h" + +typedef struct DecoderLayerParameter { + OpParameter op_parameter_; + int head_num_; + int head_size_; + bool post_layernorm_; + float eps_layernorm1_; + float eps_layernorm2_; + float eps_layernorm3_; + int ffn_hidden_size_; + bool position_bias1_; + bool position_bias2_; + float scale1_; + float scale2_; + ActType act_type_; +} DecoderLayerParameter; + +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_DECODER_LAYER_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c new file mode 100644 index 0000000000000000000000000000000000000000..401acdea7ab089ddc3e77a37aaac3268236403c7 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.c @@ -0,0 +1,37 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "nnacl/infer/decoder_layer_infer.h" +#include "nnacl/infer/infer_register.h" +#include "nnacl/decoder_layer_parameter.h" + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter) { + int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, C16NUM, C1NUM); + if (check_ret != NNACL_OK) { + return check_ret; + } + const TensorC *input = inputs[FIRST_INPUT]; + TensorC *output0 = outputs[FIRST_INPUT]; + SetDataTypeFormat(output0, input); + if (!InferFlag(inputs, inputs_size)) { + return NNACL_INFER_INVALID; + } + SetShapeTensor(output0, input); + return NNACL_OK; +} + +REG_INFER(DecoderLayer, PrimType_Inner_DecoderLayer, DecoderLayerInferShape) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h new file mode 100644 index 0000000000000000000000000000000000000000..facdcc50662785ff9a86cab7e49980b08f245311 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/decoder_layer_infer.h @@ -0,0 +1,31 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ +#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ + +#include "nnacl/infer/common_infer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int DecoderLayerInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, + OpParameter *parameter); + +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_NNACL_INFER_DECODER_LAYER_INFER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c index f9ddf940dd2be79af5b7cb1ad6213b9a21ce73d4..042ccf149dcaaa385c20051258ec6a086e8c9ef4 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/infer_register.c @@ -42,6 +42,8 @@ #include "nnacl/infer/common_infer.h" #include "nnacl/infer/concat_infer.h" #include "nnacl/infer/constant_of_shape_infer.h" +#include "nnacl/infer/decoder_layer_infer.h" + #ifdef MSLITE_ENABLE_CONTROLFLOW #include "nnacl/infer/control/tensor_array_infer.h" #include "nnacl/infer/control/tensor_array_read_infer.h" @@ -404,7 +406,7 @@ void RegAllInferFunc5() { #ifndef RUNTIME_PASS_CLIP g_inner_op_infer_func[PrimType_Inner_ShapeFusion - PrimType_InnerOpMin] = ShapeFusionInferShape; g_inner_op_infer_func[PrimType_Inner_EncoderLayer - PrimType_InnerOpMin] = EncoderLayerInferShape; - + g_inner_op_infer_func[PrimType_Inner_DecoderLayer - PrimType_InnerOpMin] = DecoderLayerInferShape; #endif g_inner_op_infer_func[PrimType_Inner_ToFormat - PrimType_InnerOpMin] = NULL; } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index ad88e3cc7053a7218dbdcf8c0cd09b8f4ef897ee..2ed8cb8cdbdb61a52a0e2f9a884ed7ca436bbeea 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -41,9 +41,15 @@ #define C12NUM 12 #define C13NUM 13 #define C14NUM 14 +#define C15NUM 15 #define C16NUM 16 +#define C17NUM 17 +#define C18NUM 18 +#define C19NUM 19 #define C20NUM 20 #define C21NUM 21 +#define C22NUM 22 +#define C23NUM 23 #define C24NUM 24 #define C28NUM 28 #define C32NUM 32 @@ -536,6 +542,7 @@ enum PrimType { PrimType_Inner_GraphKernel = 10004, PrimType_Inner_SplitReduceConcatFusion = 10005, PrimType_Inner_EncoderLayer = 10006, + PrimType_Inner_DecoderLayer = 10007, PrimType_InnerOpMax, PrimType_InnerOpMin = PrimType_Inner_ToFormat }; @@ -654,7 +661,7 @@ typedef struct QuantMulArg { } QuantMulArg; typedef enum ReductionType { Reduction_Sum, Reduction_Mean, Reduction_None } ReductionType; -typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu } ActType; +typedef enum ActType { ActType_No, ActType_Relu, ActType_Sigmod, ActType_Relu6, ActType_Prelu, ActType_Gelu } ActType; typedef enum PadMode { Pad_pad, Pad_same, Pad_valid } PadMode; typedef enum RoundingMode { Rounding_No, Rounding_Away_from_zero, Rounding_Up } RoundingMode; typedef enum CalFixedMultiplierMode { diff --git a/mindspore/core/ops/attention.cc b/mindspore/core/ops/attention.cc index e6f26cbb28a66945ad0036ca65b71d1bdbe2ca59..715bb985e285e20099527b848b58112b02087601 100644 --- a/mindspore/core/ops/attention.cc +++ b/mindspore/core/ops/attention.cc @@ -34,7 +34,7 @@ void Attention::set_cross(bool cross) { (void)this->AddAttr(kCross, 
api::MakeVal void Attention::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void Attention::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } int64_t Attention::get_head_num() const { auto value_ptr = this->GetAttr(kAttentionNumHeads); return GetValue(value_ptr); @@ -54,12 +54,16 @@ bool Attention::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - -void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross) { +float Attention::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); +} +void Attention::Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross, float scale) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_cross(cross); this->set_position_bias(position_bias); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameAttention, Attention); } // namespace mindspore::ops diff --git a/mindspore/core/ops/attention.h b/mindspore/core/ops/attention.h index 24b0a98f3f62edfd59b955bc4788042e8ab73d6f..838c04a381715ce30682c6c18bee637dceba7c4c 100644 --- a/mindspore/core/ops/attention.h +++ b/mindspore/core/ops/attention.h @@ -41,15 +41,17 @@ class MIND_API Attention : public BaseOperator { /// \param[in] head_size Define size per head. /// \param[in] cross Define is cross attention. Default false. /// \param[in] position_bias Define is position bias attention. - void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false); + void Init(int64_t head_num, int64_t head_size, bool position_bias, bool cross = false, float scale = 1.0f); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_cross(bool cross); void set_position_bias(bool position_bias); + void set_scale(float scale); int64_t get_head_num() const; int64_t get_head_size() const; bool get_cross() const; bool get_position_bias() const; + float get_scale() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc new file mode 100644 index 0000000000000000000000000000000000000000..91d725c5b1522b24be5515fa767e3078087019a7 --- /dev/null +++ b/mindspore/core/ops/decoder_layer.cc @@ -0,0 +1,129 @@ + +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ops/decoder_layer.h" +#include "ops/primitive_c.h" +#include "ops/op_utils.h" +#include "mindapi/src/helper.h" + +namespace mindspore::ops { +MIND_API_OPERATOR_IMPL(DecoderLayer, BaseOperator); + +void DecoderLayer::set_head_num(int64_t head_num) { + (void)this->AddAttr(kDecoderLayerNumHeads, api::MakeValue(head_num)); +} + +void DecoderLayer::set_head_size(int64_t head_size) { + (void)this->AddAttr(kDecoderLayerSizePerHead, api::MakeValue(head_size)); +} + +void DecoderLayer::set_post_layernorm(bool post_layernorm) { + (void)this->AddAttr(kDecoderLayerPostLayernorm, api::MakeValue(post_layernorm)); +} +void DecoderLayer::set_eps_layernorm1(float eps_layernorm1) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm1, api::MakeValue(eps_layernorm1)); +} +void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2)); +} +void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) { + (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3)); +} +void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { + (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size)); +} +void DecoderLayer::set_position_bias1(bool position_bias1) { + (void)this->AddAttr(kDecoderLayerPositionBias1, api::MakeValue(position_bias1)); +} +void DecoderLayer::set_position_bias2(bool position_bias2) { + (void)this->AddAttr(kDecoderLayerPositionBias2, api::MakeValue(position_bias2)); +} +void DecoderLayer::set_scale1(float scale1) { (void)this->AddAttr(kDecoderLayerScale1, api::MakeValue(scale1)); } +void DecoderLayer::set_scale2(float scale2) { (void)this->AddAttr(kDecoderLayerScale2, api::MakeValue(scale2)); } +void DecoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type)); } +int64_t DecoderLayer::get_head_num() const { + auto value_ptr = this->GetAttr(kDecoderLayerNumHeads); + return GetValue(value_ptr); +} + +int64_t DecoderLayer::get_head_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerSizePerHead); + return GetValue(value_ptr); +} + +bool DecoderLayer::get_post_layernorm() const { + auto value_ptr = this->GetAttr(kDecoderLayerPostLayernorm); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm1() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm1); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm2() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm2); + return GetValue(value_ptr); +} +float DecoderLayer::get_eps_layernorm3() const { + auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm3); + return GetValue(value_ptr); +} +int64_t DecoderLayer::get_ffn_hidden_size() const { + auto value_ptr = this->GetAttr(kDecoderLayerFfnHiddenSize); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias1() const { + auto value_ptr = this->GetAttr(kDecoderLayerPositionBias1); + return GetValue(value_ptr); +} +bool DecoderLayer::get_position_bias2() const { + auto value_ptr = this->GetAttr(kDecoderLayerPositionBias2); + return GetValue(value_ptr); +} +float DecoderLayer::get_scale1() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale1); + return GetValue(value_ptr); +} +float DecoderLayer::get_scale2() const { + auto value_ptr = this->GetAttr(kDecoderLayerScale2); + return GetValue(value_ptr); +} +ActType DecoderLayer::get_act_type() const { + auto value_ptr = GetAttr(kActivationType); + if (value_ptr == nullptr) { + 
return ActType::ActType_No; + } + return ActType(GetValue(value_ptr)); +} + +void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, + float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, + bool post_layernorm, float scale1, float scale2, ActType act_type) { + this->set_head_num(head_num); + this->set_head_size(head_size); + this->set_post_layernorm(post_layernorm); + this->set_eps_layernorm1(eps_layernorm1); + this->set_eps_layernorm2(eps_layernorm2); + this->set_eps_layernorm3(eps_layernorm3); + this->set_ffn_hidden_size(ffn_hidden_size); + this->set_position_bias1(position_bias1); + this->set_position_bias2(position_bias2); + this->set_act_type(act_type); + this->set_scale1(scale1); + this->set_scale2(scale2); +} +REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer); +} // namespace mindspore::ops diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h new file mode 100644 index 0000000000000000000000000000000000000000..b196689eb2f37d575abd34409e2f7f070439a93e --- /dev/null +++ b/mindspore/core/ops/decoder_layer.h @@ -0,0 +1,103 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_CORE_OPS_DECODER_LAYER_H_ +#define MINDSPORE_CORE_OPS_DECODER_LAYER_H_ +#include +#include +#include +#include +#include "nnacl/op_base.h" + +#include "ops/base_operator.h" +#include "mindapi/base/types.h" + +namespace mindspore { +namespace ops { +constexpr auto kNameDecoderLayer = "DecoderLayer"; +/// \brief MultiHead-Attention op in MindIR. +class MIND_API DecoderLayer : public BaseOperator { + public: + MIND_API_BASE_MEMBER(DecoderLayer); + /// \brief Constructor. + DecoderLayer() : BaseOperator(kNameDecoderLayer) { + InitIOName({"input", + "gamma1", + "beta1", + "weight_qkv", + "bias_attn_qkv", + "input_mask", + "weight_attn_o", + "bias_attn_o", + "gamma2", + "beta2", + "encoder_output", + "weight_attn_q", + "weight_attn_kv", + "bias_attn_cross_qkv", + "cross_mask", + "weight_attn_cross_o", + "bias_attn_cross_o", + "gamma3", + "beta3", + "weight_m", + "bias_m", + "weight_p", + "bias_p"}, + {"output"}); + } + /// \brief Initialize DecoderLayer op. + /// \param[in] head_num Define head number. + /// \param[in] head_size Define size per head. + /// \param[in] eps_layernorm1 Define eps layernorm1. + /// \param[in] eps_layernorm2 Define eps layernorm2. + /// \param[in] eps_layernorm3 Define eps layernorm3. + /// \param[in] ffn_hidden_size Define ffn hidden size. + /// \param[in] position_bias1 Define position_bias1. + /// \param[in] position_bias2 Define position_bias2. + /// \param[in] scale1 Define scale1. + /// \param[in] scale2 Define scale2. + /// \param[in] act_type Define act_type. 
+ void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3, + int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, float scale1 = 1.0f, + float scale2 = 1.0f, ActType act_type = ActType::ActType_Gelu); + void set_head_num(int64_t head_num); + void set_head_size(int64_t head_size); + void set_post_layernorm(bool post_layernorm); + void set_eps_layernorm1(float eps_layernorm1); + void set_eps_layernorm2(float eps_layernorm2); + void set_eps_layernorm3(float eps_layernorm2); + void set_ffn_hidden_size(int64_t ffn_hidden_size); + void set_position_bias1(bool position_bias1); + void set_position_bias2(bool position_bias2); + void set_scale1(float scale1); + void set_scale2(float scale2); + void set_act_type(ActType act_type); + int64_t get_head_num() const; + int64_t get_head_size() const; + bool get_post_layernorm() const; + float get_eps_layernorm1() const; + float get_eps_layernorm2() const; + float get_eps_layernorm3() const; + int64_t get_ffn_hidden_size() const; + bool get_position_bias1() const; + bool get_position_bias2() const; + float get_scale1() const; + float get_scale2() const; + ActType get_act_type() const; +}; +} // namespace ops +} // namespace mindspore +#endif // MINDSPORE_CORE_OPS_DECODER_LAYER_H_ diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc index 1a2a9f0aa189f9f6209409912683e0f2fa682be0..276d10de2238aa95c752c2c90c0b6c2a234be503 100644 --- a/mindspore/core/ops/encoder_layer.cc +++ b/mindspore/core/ops/encoder_layer.cc @@ -46,7 +46,8 @@ void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) { void EncoderLayer::set_position_bias(bool position_bias) { (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias)); } - +void EncoderLayer::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); } +void EncoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type));} int64_t EncoderLayer::get_head_num() const { auto value_ptr = this->GetAttr(kEncoderLayerNumHeads); return GetValue(value_ptr); @@ -77,9 +78,20 @@ bool EncoderLayer::get_position_bias() const { auto value_ptr = this->GetAttr(kPositionBias); return GetValue(value_ptr); } - +float EncoderLayer::get_scale() const { + auto value_ptr = this->GetAttr(kScale); + return GetValue(value_ptr); +} +ActType EncoderLayer::get_act_type() const { + auto value_ptr = GetAttr(kActivationType); + if (value_ptr == nullptr) { + return ActType::ActType_No; + } + return ActType(GetValue(value_ptr)); +} void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, - int64_t ffn_hidden_size, bool position_bias, bool post_layernorm = false) { + int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale, + ActType act_type) { this->set_head_num(head_num); this->set_head_size(head_size); this->set_post_layernorm(post_layernorm); @@ -87,6 +99,8 @@ void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm this->set_eps_layernorm2(eps_layernorm2); this->set_ffn_hidden_size(ffn_hidden_size); this->set_position_bias(position_bias); + this->set_act_type(act_type); + this->set_scale(scale); } REGISTER_PRIMITIVE_C(kNameEncoderLayer, EncoderLayer); } // namespace mindspore::ops diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h index f2f2a9286136efb0a99aae0a720baf985d23b2a6..b0466be467af79f7d7cbea4ac5916ca8dc24a9b8 100644 --- 
a/mindspore/core/ops/encoder_layer.h +++ b/mindspore/core/ops/encoder_layer.h @@ -22,6 +22,7 @@ #include "ops/base_operator.h" #include "mindapi/base/types.h" +#include "nnacl/op_base.h" namespace mindspore { namespace ops { @@ -42,9 +43,11 @@ class MIND_API EncoderLayer : public BaseOperator { /// \param[in] eps_layernorm1 Define eps layernorm1. /// \param[in] eps_layernorm2 Define eps layernorm2. /// \param[in] ffn_hidden_size Define ffn hidden size. - /// \param[in] position_bias Define ffn position_bias. + /// \param[in] position_bias Define position_bias. + /// \param[in] scale Define scale. + /// \param[in] act_type Define act_type. void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size, - bool position_bias, bool post_layernorm); + bool position_bias, bool post_layernorm, float scale = 1.0f, ActType act_type = ActType::ActType_Gelu); void set_head_num(int64_t head_num); void set_head_size(int64_t head_size); void set_post_layernorm(bool post_layernorm); @@ -52,6 +55,8 @@ class MIND_API EncoderLayer : public BaseOperator { void set_eps_layernorm2(float eps_layernorm2); void set_ffn_hidden_size(int64_t ffn_hidden_size); void set_position_bias(bool position_bias); + void set_scale(float scale); + void set_act_type(ActType act_type); int64_t get_head_num() const; int64_t get_head_size() const; bool get_post_layernorm() const; @@ -59,6 +64,8 @@ class MIND_API EncoderLayer : public BaseOperator { float get_eps_layernorm2() const; int64_t get_ffn_hidden_size() const; bool get_position_bias() const; + float get_scale() const; + ActType get_act_type() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index 4fd47b9f0fb424716fa37415e6aa0a39e3f26a89..caf14dc257aa0ddb524c31041716754b1661f3bb 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -378,12 +378,24 @@ constexpr auto kSampleNum = "sample_num"; constexpr auto kRoiEndMode = "roi_end_mode"; constexpr auto kUpper = "upper"; constexpr auto kConjugate = "conjugate"; +constexpr auto kScalar = "scalar"; constexpr auto kEncoderLayerNumHeads = "head_num"; constexpr auto kEncoderLayerSizePerHead = "head_size"; constexpr auto kEncoderLayerPostLayernorm = "post_layernorm"; constexpr auto kEncoderLayerFfnHiddenSize = "ffn_hidden_size"; constexpr auto kEncoderLayerEpsLayerNorm1 = "eps_layernorm1"; constexpr auto kEncoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerNumHeads = "head_num"; +constexpr auto kDecoderLayerSizePerHead = "head_size"; +constexpr auto kDecoderLayerPostLayernorm = "post_layernorm"; +constexpr auto kDecoderLayerFfnHiddenSize = "ffn_hidden_size"; +constexpr auto kDecoderLayerEpsLayerNorm1 = "eps_layernorm1"; +constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2"; +constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3"; +constexpr auto kDecoderLayerPositionBias1 = "position_bias1"; +constexpr auto kDecoderLayerPositionBias2 = "position_bias2"; +constexpr auto kDecoderLayerScale1 = "scale1"; +constexpr auto kDecoderLayerScale2 = "scale2"; constexpr auto kPositionBias = "position_bias"; constexpr auto KExclusive = "exclusive"; constexpr auto KReverse = "reverse"; diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index e04470c778b3d2f5bb77fe3db5c4d877aebfe68f..3a24ac6fc442ab1abba02277a072bf38a5a62c10 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -395,6 +395,7 @@ table Attention 
{ head_num: long; head_size: long; cross: bool; + scale: float; } table Conv2DBackpropFilterFusion { diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index 7a8fa84392dbd8f0e9bb26a94f18aa7313a65ee5..2de89604156a294ed803c48418d00379668a776b 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -395,6 +395,7 @@ OP_SCHEMA_DEF(Attention) OP_ATTR(head_num, long) OP_ATTR(head_size, long); OP_ATTR(cross, bool) +OP_ATTR(scale, float) OP_SCHEMA_DEF_END(Attention) OP_SCHEMA_DEF(Conv2DBackpropFilterFusion) diff --git a/mindspore/lite/src/common/ops/ops_func_declare.h b/mindspore/lite/src/common/ops/ops_func_declare.h index b00ed666fadd54c3ee807f6e9b123723f2b57c04..3b151b36caa415d62711a6108b7f9403d7ac622e 100644 --- a/mindspore/lite/src/common/ops/ops_func_declare.h +++ b/mindspore/lite/src/common/ops/ops_func_declare.h @@ -260,6 +260,8 @@ #include "ops/format_transpose.h" #include "ops/gather_d.h" #include "ops/tensor_scatter_add.h" +#include "ops/decoder_layer.h" +#include "ops/encoder_layer.h" #include "ops/scatter_elements.h" namespace mindspore::lite::ops { diff --git a/mindspore/lite/src/common/ops/populate/custom_populate.cc b/mindspore/lite/src/common/ops/populate/custom_populate.cc index 4f855ce999ad0d00faac597187556aca99e92d95..e35357ee67319b9728fa95b4d4bff63f72bdf837 100644 --- a/mindspore/lite/src/common/ops/populate/custom_populate.cc +++ b/mindspore/lite/src/common/ops/populate/custom_populate.cc @@ -107,6 +107,15 @@ OpParameter *PopulateCustomParameter(const void *prim) { memset(param, 0, sizeof(OpParameter)); param->type_ = PrimType_Inner_EncoderLayer; return reinterpret_cast<OpParameter *>(param); + } else if (type == "DecoderLayer") { + auto *param = reinterpret_cast<OpParameter *>(malloc(sizeof(OpParameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "malloc DecoderLayer failed."; + return nullptr; + } + memset(param, 0, sizeof(OpParameter)); + param->type_ = PrimType_Inner_DecoderLayer; + return reinterpret_cast<OpParameter *>(param); } else { MS_LOG(ERROR) << "Unsupported custom type: " << type; } diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc index b7276233b763a1294b8051b96d1e3b1af9488483..c5be3ff571493dbcf24acb73519c37fd3f4868fc 100644 --- a/mindspore/lite/src/common/prim_util.cc +++ b/mindspore/lite/src/common/prim_util.cc @@ -28,9 +28,9 @@ static std::set<schema::PrimitiveType> kTensorListOps = { schema::PrimitiveType_TensorListReserve, schema::PrimitiveType_TensorListSetItem, schema::PrimitiveType_TensorListStack}; -static const char *const kInnerOpNames[6] = { +static const char *const kInnerOpNames[8] = { "Inner_ToFormat", "Inner_GltextureToOpencl", "Inner_Identity", - "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", + "Inner_ShapeFusion", "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", "Inner_EncoderLayer", "Inner_DecoderLayer", }; int GetPrimitiveType(const void *primitive, int schema_version) { if (primitive == nullptr) { diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc new file mode 100755 index 0000000000000000000000000000000000000000..22f0e43a091785902996e1679beec86cc89701fd --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc @@ -0,0 +1,262 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); +
* you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h" +#include +#include +#include +#include +#include +#include +#include +#include "NvInferRuntimeCommon.h" +#include "ops/decoder_layer.h" +#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include "src/fastertransformer/kernels/activation_kernels.h" +#include "src/fastertransformer/utils/cuda_utils.h" +#include "src/fastertransformer/utils/allocator.h" +#include "src/fastertransformer/kernels/layernorm_kernels.h" + +namespace mindspore::lite { +namespace { +constexpr std::size_t kTwo = 2; +} // namespace + +int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) { + if (in_tensors.size() != C23NUM && in_tensors.size() != C16NUM) { + MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); + return RET_ERROR; + } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } + return RET_OK; +} +nvinfer1::ITensor *DecoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, + const std::string &op_name) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is null for ConvertConstantTensor"; + return nullptr; + } + nvinfer1::Dims dims = ConvertCudaDims(ms_tensor.Shape()); + if (dims.nbDims == -1) { + MS_LOG(INFO) << ms_tensor.Name() << " ConvertCudaDims failed, convert as scalar."; + dims.nbDims = 1; + dims.d[0] = 1; + } + nvinfer1::DataType data_type = ConvertDataType(ms_tensor.DataType()); + if (!ms_tensor.IsConst()) { + MS_LOG(ERROR) << "ConvertConstantTensor from a MSTensor with nullptr data: " << ms_tensor.Name(); + return nullptr; + } + nvinfer1::Weights weights{data_type, ms_tensor.Data(), ms_tensor.ElementNum()}; + if (data_type == nvinfer1::DataType::kFLOAT && is_ffn_fp16_) { + void *data_float16 = malloc(ms_tensor.ElementNum() * sizeof(float)); + if (data_float16 == nullptr) { + MS_LOG(ERROR) << "Malloc buffer failed."; + return nullptr; + } + auto src = static_cast(ms_tensor.Data()); + auto dst = static_cast(data_float16); + for (int i = 0; i < ms_tensor.ElementNum(); i++) { + dst[i] = static_cast(src[i]); + } + weights.values = data_float16; + } + nvinfer1::IConstantLayer *constant_tensor = ctx->network()->addConstant(dims, weights); + if (constant_tensor == nullptr) { + MS_LOG(ERROR) << "create constant_tensor failed."; + return nullptr; + } + ctx->RegisterLayer(constant_tensor, ms_tensor.Name() + "_" + op_name); + auto tensor_ptr = constant_tensor->getOutput(0); + return tensor_ptr; +} +int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { + if (ctx == nullptr || ctx->network() == nullptr) { + MS_LOG(ERROR) << "context or network is invalid"; + return RET_ERROR; + } + auto decoder_op = AsOps(); + if (decoder_op == nullptr) { + MS_LOG(ERROR) << "op action convert failed"; + return RET_ERROR; + } + fastertransformer::decoderParamRun params; + cublasHandle_t 
cublas_handle = GetCublasHandle(); + params.common_param.cublas_handle = cublas_handle; + params.common_param.head_num = decoder_op->get_head_num(); + params.common_param.head_size = decoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + params.decoder.layernorm_post = decoder_op->get_post_layernorm(); + params.decoder.eps1 = decoder_op->get_eps_layernorm1(); + params.decoder.eps2 = decoder_op->get_eps_layernorm2(); + params.decoder.eps3 = decoder_op->get_eps_layernorm3(); + params.ffn_param.ffn_param.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); + params.attn1.attn.position_bias = decoder_op->get_position_bias1(); + params.ffn_param.ffn_param.ffn_bias = !params.attn1.attn.position_bias; + params.attn1.attn.qkv_bias = !params.attn1.attn.position_bias; + params.attn1.attn.projection_bias = !params.attn1.attn.position_bias; + params.attn1.attn.is_cross = false; + params.attn1.attn.scale = decoder_op->get_scale1(); + params.attn1.attn.mask = true; + params.attn2.attn.position_bias = decoder_op->get_position_bias2(); + params.attn2.attn.qkv_bias = !params.attn2.attn.position_bias; + params.attn2.attn.projection_bias = !params.attn2.attn.position_bias; + params.attn2.attn.is_cross = true; + params.attn2.attn.scale = decoder_op->get_scale2(); + params.attn2.attn.mask = true; + params.decoder.has_beta = !params.attn1.attn.position_bias; + auto compute_type = runtime_->GetRuntimePrecisionMode(); + if (is_ffn_fp16_) { + size_t start_fp16 = (params.attn1.attn.position_bias) ? C13NUM : C18NUM; + size_t end_fp16 = (params.attn1.attn.position_bias) ? 
C16NUM : C22NUM; + for (size_t i = 0; i < in_tensors_.size(); i++) { + auto in_tensor = input(ctx, i); + if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { + if (i > start_fp16 && i < end_fp16) { + in_tensor.trt_tensor_ = castTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } else { + in_tensor.trt_tensor_ = lite::ConvertConstantTensor(ctx, in_tensors_[i], op_name_); + ctx->RegisterTensor(in_tensor, in_tensors_[i].Name()); + } + } + } + } + nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); + const int input_number = inputs().size(); + nvinfer1::ITensor *inputTensors[input_number]; + for (int i = 0; i < input_number; i++) { + inputTensors[i] = input(ctx, i).trt_tensor_; + } + nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + if (decoder_layer == nullptr) { + MS_LOG(ERROR) << "add decoder op failed for TensorRT."; + return RET_ERROR; + } + decoder_layer->setName((op_name_ + "plugin_decoder_layer").c_str()); + nvinfer1::ITensor *decoder_tensor = decoder_layer->getOutput(0); + ctx->RegisterTensor(ITensorHelper{decoder_tensor, Format::NCHW, true}, out_tensors_[0].Name()); + this->layer_ = decoder_layer; + return RET_OK; +} + +REGISTER_TENSORRT_PLUGIN(DecoderPluginCreater); +template class TensorRTPluginCreater; +template +nvinfer1::PluginFieldCollection TensorRTPluginCreater::field_collection_{}; +template +std::vector TensorRTPluginCreater::fields_; + +int DecoderPlugin::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, + cudaStream_t stream) noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } else { + return RunCudaDecoder(inputDesc, outputDesc, inputs, outputs, workspace, stream, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + } +} +template +int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, + const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { + params_.common_param.algo = algoId; + params_.common_param.stream = stream; + void *inputs_forward[num_of_inputs_]; + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); + } + void *outputs_forward[] = {outputs[0]}; + fastertransformer::forwardDecoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, + workspace); + return RET_OK; +} + +bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept { + auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + for (int i = 0; i < pos; i++) { + if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; + } + bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); + return res; +} + +void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { + const int request_batch_size = static_cast(in[0].desc.dims.d[0]); + const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); + const int request_tgt_seq_len = request_src_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; + num_of_inputs_ = nbInputs; + num_of_outputs_ = nbOutputs; +} +size_t DecoderPlugin::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept { + if (compute_type_ == RuntimePrecisionMode_FP16) { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } else { + return fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); + } +} + +nvinfer1::DimsExprs DecoderPlugin::getOutputDimensions(int32_t index, const nvinfer1::DimsExprs *inputs, + int nbInputDims, nvinfer1::IExprBuilder &exprBuilder) noexcept { + nvinfer1::DimsExprs dims; + if (index == 0) { + int num_dims = inputs[0].nbDims; + dims.nbDims = num_dims; + for (int i = 0; i < num_dims; i++) { + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); + } + } + return dims; +} + +nvinfer1::IPluginV2DynamicExt *DecoderPlugin::clone() const noexcept { + auto *plugin = new DecoderPlugin(*this); + if (plugin == nullptr) { + MS_LOG(ERROR) << "plugin is null"; + return nullptr; + } + plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.attn1.common_param = &plugin->params_.common_param; + plugin->params_.attn2.common_param = &plugin->params_.common_param; + plugin->params_.ffn_param.common_param = &plugin->params_.common_param; + return plugin; +} + +size_t DecoderPlugin::getSerializationSize() const noexcept { + return sizeof(int) + sizeof(fastertransformer::decoderParamRun); +} + +void DecoderPlugin::serialize(void *buffer) const noexcept { + SerializeValue(&buffer, &compute_type_, sizeof(int)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::decoderParamRun)); +} +REGISTER_TENSORRT_CREATOR(ops::kNameDecoderLayer, DecoderTensorRT) +} // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h new file mode 100644 index 0000000000000000000000000000000000000000..d9d5f458383c35a1033c70c07431140438f62b9a --- /dev/null +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.h @@ -0,0 +1,110 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ +#define MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ + +#include +#include +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" +#include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" +#include "src/fastertransformer/layers/ms_layers/decoder.h" + +namespace mindspore::lite { +class DecoderTensorRT : public TensorRTOp { + public: + DecoderTensorRT(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors, std::string name) + : TensorRTOp(base_operator, in_tensors, out_tensors, name) {} + + ~DecoderTensorRT() override = default; + bool IsWeightInputHanledInner() const override { return is_ffn_fp16_; } + int AddInnerOp(TensorRTContext *ctx) override; + + int IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, + const std::vector &out_tensors) override; + + private: + nvinfer1::ITensor *castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name); + bool is_ffn_fp16_ = false; +}; + +constexpr auto DECODER_PLUGIN_NAME{"DecoderPlugin"}; +class DecoderPlugin : public TensorRTPlugin { + public: + DecoderPlugin(const std::string name, int compute_type, fastertransformer::decoderParamRun params, uint32_t device_id) + : TensorRTPlugin(name, std::string(DECODER_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + const nvinfer1::PluginField *fields = fc->fields; + compute_type_ = static_cast(fields[0].data)[0]; + params_ = static_cast(fields[1].data)[0]; + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin(const char *name, const void *serialData, size_t serialLength) + : TensorRTPlugin(std::string(name), std::string(DECODER_PLUGIN_NAME)) { + DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::decoderParamRun)); + params_.attn1.common_param = ¶ms_.common_param; + params_.attn2.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } + + DecoderPlugin() = delete; + + ~DecoderPlugin() override {} + + nvinfer1::IPluginV2DynamicExt *clone() const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void *buffer) const noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const noexcept override; + nvinfer1::DimsExprs getOutputDimensions(int index, const nvinfer1::DimsExprs *inputs, int nbInputDims, + nvinfer1::IExprBuilder &exprBuilder) noexcept override; + void 
configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, + int nbOutputs) noexcept override; + + private: + std::string name_space_; + int compute_type_; + mutable fastertransformer::decoderParamRun params_; + int num_of_inputs_; + int num_of_outputs_; + template + int RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, + const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, + cublasGemmAlgo_t algoId); +}; +class DecoderPluginCreater : public TensorRTPluginCreater { + public: + DecoderPluginCreater() : TensorRTPluginCreater(std::string(DECODER_PLUGIN_NAME)) {} +}; +} // namespace mindspore::lite +#endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_OP_DECODER_TENSORRT_H_ \ No newline at end of file diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc old mode 100644 new mode 100755 index 398ccea3fb9ed0f121babb1b6f43b134bd1972f2..7eb3653cceb163a858b4f673208f523257b6f1aa --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -30,22 +30,26 @@ #include "src/fastertransformer/utils/cuda_utils.h" #include "src/fastertransformer/utils/allocator.h" #include "src/fastertransformer/kernels/layernorm_kernels.h" +#include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" namespace mindspore::lite { namespace { constexpr std::size_t kTwo = 2; -constexpr std::size_t kThree = 3; } // namespace -// Multi Head Attention TensorRT op int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM) { + if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } + if (out_tensors.size() != 1) { + MS_LOG(ERROR) << "Unsupported output tensor size, size is " << out_tensors.size(); + return RET_ERROR; + } return RET_OK; } + nvinfer1::ITensor *EncoderTensorRT::castTensor(TensorRTContext *ctx, const TensorInfo &ms_tensor, const std::string &op_name) { if (ctx == nullptr || ctx->network() == nullptr) { @@ -97,25 +101,38 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - fastertransformer::encoderParamT params; - memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = encoder_op->get_head_num(); - params.head_size = encoder_op->get_head_size(); - params.layernorm_post = encoder_op->get_post_layernorm(); - params.eps1 = encoder_op->get_eps_layernorm1(); - params.eps2 = encoder_op->get_eps_layernorm2(); - params.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); - params.is_cross = false; - params.ffn_fp16 = is_ffn_fp16_; - params.position_bias = encoder_op->get_position_bias(); - params.cublas_handle = GetCublasHandle(); - params.qkv_bias = !params.position_bias; - params.projection_bias = !params.position_bias; - params.hidden_size = params.head_num * params.head_size; + cublasHandle_t cublas_handle = GetCublasHandle(); + fastertransformer::encoderParamRun params; + //update commonparam + 
params.common_param.cublas_handle =cublas_handle; + params.common_param.head_num = encoder_op->get_head_num(); + params.common_param.head_size = encoder_op->get_head_size(); + params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; + //connect commonparam to attention and ffn + + //update encoder_param_ + params.encoder.layernorm_post = encoder_op->get_post_layernorm(); + params.encoder.eps1 = encoder_op->get_eps_layernorm1(); + params.encoder.eps2 = encoder_op->get_eps_layernorm2(); + params.ffn_param.ffn_param.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); + params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; + params.attn.attn.is_cross = false; + params.attn.attn.position_bias = encoder_op->get_position_bias(); + params.attn.attn.projection_bias = !params.attn.attn.position_bias; + params.attn.attn.qkv_bias = !params.attn.attn.position_bias; + params.encoder.has_beta = !params.attn.attn.position_bias; + params.ffn_param.ffn_param.ffn_bias = !params.attn.attn.position_bias; + params.attn.attn.mask = true; + params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(encoder_op->get_act_type()); + params.attn.attn.scale = encoder_op->get_scale(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { - size_t start_fp16 = (params.layernorm_post) ? C7NUM : C9NUM; - size_t end_fp16 = (params.layernorm_post) ? C11NUM : C13NUM; + size_t start_fp16 = (params.encoder.layernorm_post) ? C7NUM : C9NUM; + size_t end_fp16 = (params.encoder.layernorm_post) ? C11NUM : C13NUM; + if (params.attn.attn.position_bias) { + start_fp16 = C6NUM; + end_fp16 = C9NUM; + } for (size_t i = 0; i < in_tensors_.size(); i++) { auto in_tensor = input(ctx, i); if (in_tensors_[i].IsConst() || in_tensor.trt_tensor_ == nullptr) { @@ -131,7 +148,7 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -172,14 +189,12 @@ template int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - params_.stream = stream; - params_.algo = algoId; - void *inputs_forward[] = { - const_cast(inputs[0]), const_cast(inputs[1]), const_cast(inputs[2]), - const_cast(inputs[3]), const_cast(inputs[4]), const_cast(inputs[5]), - const_cast(inputs[6]), const_cast(inputs[7]), const_cast(inputs[8]), - const_cast(inputs[9]), const_cast(inputs[10]), const_cast(inputs[11]), - const_cast(inputs[12]), const_cast(inputs[13])}; + params_.common_param.algo = algoId; + params_.common_param.stream = stream; + void *inputs_forward[num_of_inputs_]; + for (int i = 0; i < num_of_inputs_; i++) { + inputs_forward[i] = const_cast(inputs[i]); + } void *outputs_forward[] = {outputs[0]}; fastertransformer::forwardEncoder(inputs_forward, num_of_inputs_, outputs_forward, num_of_outputs_, ¶ms_, workspace); @@ -195,14 +210,15 @@ bool EncoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTen bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); return res; } + void 
EncoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); const int request_tgt_seq_len = request_src_seq_len; - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; + params_.common_param.batch_size = request_batch_size; + params_.common_param.src_seq_len = request_src_seq_len; + params_.common_param.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -221,13 +237,8 @@ nvinfer1::DimsExprs EncoderPlugin::getOutputDimensions(int32_t index, const nvin if (index == 0) { int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; - if (num_dims == INPUT_SIZE2) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - } else if (num_dims == INPUT_SIZE3) { - dims.d[0] = exprBuilder.constant(inputs[0].d[0]->getConstantValue()); - dims.d[1] = exprBuilder.constant(inputs[0].d[1]->getConstantValue()); - dims.d[kTwo] = exprBuilder.constant(inputs[0].d[kTwo]->getConstantValue()); + for(int i = 0; i < num_dims; i++ ) { + dims.d[i] = exprBuilder.constant(inputs[index].d[i]->getConstantValue()); } } return dims; @@ -240,16 +251,18 @@ nvinfer1::IPluginV2DynamicExt *EncoderPlugin::clone() const noexcept { return nullptr; } plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.attn.common_param = &plugin->params_.common_param; + plugin->params_.ffn_param.common_param = &plugin->params_.common_param; return plugin; } size_t EncoderPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::encoderParamRun); } void EncoderPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamRun)); } REGISTER_TENSORRT_CREATOR(ops::kNameEncoderLayer, EncoderTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h index 45da8ab88a5837eb5f3aef04b32f84c50310a852..ae6133c0ef40373b1cd58769f61175fe3b300da8 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/encoder.h" + namespace mindspore::lite { class EncoderTensorRT : public TensorRTOp { public: @@ -45,27 +46,29 @@ class EncoderTensorRT : public TensorRTOp { constexpr auto ENCODER_PLUGIN_NAME{"EncoderPlugin"}; class EncoderPlugin : public TensorRTPlugin { public: - EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) - : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), - 
compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + EncoderPlugin(const std::string name, int compute_type, fastertransformer::encoderParamRun params, uint32_t device_id) + : TensorRTPlugin(name, std::string(ENCODER_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; + } EncoderPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; - cublaslt_handle_ = static_cast(fields[2].data)[0]; + params_ = static_cast(fields[1].data)[0]; + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; } EncoderPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(ENCODER_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamRun)); + params_.attn.common_param = ¶ms_.common_param; + params_.ffn_param.common_param = ¶ms_.common_param; } - EncoderPlugin() = delete; ~EncoderPlugin() override {} @@ -85,11 +88,9 @@ class EncoderPlugin : public TensorRTPlugin { int nbOutputs) noexcept override; private: - const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::encoderParamRun params_; int num_of_inputs_; int num_of_outputs_; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc index 859bf9dc426c5a1a59cb347c79c75b3522727754..75cb121d986924b271f08975724ddd28f7b3cf85 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc @@ -71,18 +71,22 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { bool is_cross = mha_op->get_cross(); bool is_position_bias = mha_op->get_position_bias(); nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; - fastertransformer::encoderParamT params; + fastertransformer::attentionParamRun params; + fastertransformer::CommonParam common_param; + memset_s(&common_param, sizeof(common_param), 0, sizeof(common_param)); memset_s(¶ms, sizeof(params), 0, sizeof(params)); - params.head_num = head_number; - params.head_size = head_size; - params.hidden_size = head_number * head_size; - params.cublas_handle = GetCublasHandle(); - params.qkv_bias = !is_position_bias; - params.projection_bias = !is_position_bias; - params.is_cross = is_cross; - params.position_bias = is_position_bias; - auto plugin = - std::make_shared(input_tensor->getName(), compute_type, params, GetCublasLtHandle(), device_id_); + cublasHandle_t cublas_handle = GetCublasHandle(); + common_param.cublas_handle = cublas_handle; + common_param.head_num = head_number; + common_param.head_size = head_size; + common_param.hidden_size = head_number * head_size; + params.attn.qkv_bias = !is_position_bias; + params.attn.projection_bias = !is_position_bias; + params.attn.is_cross = is_cross; + 
params.attn.position_bias = is_position_bias; + params.attn.scale = mha_op->get_scale(); + params.attn.mask = true; + auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, common_param, device_id_); const int input_number = inputs().size(); nvinfer1::ITensor *inputTensors[input_number]; for (int i = 0; i < input_number; i++) { @@ -95,39 +99,8 @@ int MhaTensorRT::AddInnerOp(TensorRTContext *ctx) { } mha_layer->setName((op_name_ + "plugin_attention").c_str()); nvinfer1::ITensor *attn_tensor = mha_layer->getOutput(0); -#ifndef TEST_ ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name()); -#else /* TEST_ */ - ctx->RegisterTensor(ITensorHelper{attn_tensor, Format::NCHW, true}, out_tensors_[0].Name() + "attn"); -#endif /* TEST_ */ this->layer_ = mha_layer; -#ifdef TEST_ - auto weight_projection = input(ctx, 4).trt_tensor_; - auto bias_projection = input(ctx, 6).trt_tensor_; -#endif /* TEST_ */ - -#ifdef TEST_ - auto matmul_layer = ctx->network()->addMatrixMultiply(*attn_tensor, nvinfer1::MatrixOperation::kNONE, - *weight_projection, nvinfer1::MatrixOperation::kNONE); - if (matmul_layer == nullptr) { - MS_LOG(ERROR) << "failed to add matmul layer"; - return RET_ERROR; - } - matmul_layer->setName((op_name_ + "_matmul").c_str()); - auto matmul_tensor = matmul_layer->getOutput(0); - auto shuffle_layer = ctx->network()->addShuffle(*bias_projection); - const auto size = bias_projection->getDimensions().d[0]; - shuffle_layer->setReshapeDimensions(nvinfer1::Dims{2, {1, size}}); - auto shuffle_tensor = shuffle_layer->getOutput(0); - auto addbias = ctx->network()->addElementWise(*matmul_tensor, *shuffle_tensor, nvinfer1::ElementWiseOperation::kSUM); - if (addbias == nullptr) { - MS_LOG(ERROR) << "failed to add bias layer"; - return RET_ERROR; - } - addbias->setName((op_name_ + "_bias").c_str()); - auto bias_out = addbias->getOutput(0); - ctx->RegisterTensor(ITensorHelper{bias_out, Format::NCHW, true}, out_tensors_[0].Name()); -#endif /* TEST_ */ return RET_OK; } @@ -152,36 +125,36 @@ template int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream, cublasGemmAlgo_t algoId) { - int cross_tensor_offset = (params_.is_cross) ? 1 : 0; + int cross_tensor_offset = (params_.attn.is_cross) ? 
1 : 0; const int weight_projection_tensor_idx = 4 + cross_tensor_offset; const int bias_projection_tensor_idx = 6 + cross_tensor_offset; const int attn_mask_tensor_idx = 7 + cross_tensor_offset; const int bias_qkv_tensor_idx = 5 + cross_tensor_offset; const int weight_qkv_tensor_idx = 3; const int position_bias_tensor_idx = 6 + cross_tensor_offset; - params_.stream = stream; - params_.algo = algoId; + common_param_.algo = algoId; + common_param_.stream = stream; void *inputs_attn[num_of_inputs_]; int index = 0; inputs_attn[index++] = const_cast(inputs[0]); - if (params_.is_cross) { + if (params_.attn.is_cross) { inputs_attn[index++] = const_cast(inputs[1]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx + 1]); } else { inputs_attn[index++] = const_cast(inputs[weight_qkv_tensor_idx]); } - if (params_.qkv_bias) { + if (params_.attn.qkv_bias) { inputs_attn[index++] = const_cast(inputs[bias_qkv_tensor_idx]); } - if (params_.position_bias) { + if (params_.attn.position_bias) { inputs_attn[index++] = const_cast(inputs[position_bias_tensor_idx]); inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx - C2NUM]); } else { inputs_attn[index++] = const_cast(inputs[attn_mask_tensor_idx]); } inputs_attn[index++] = const_cast(inputs[weight_projection_tensor_idx]); - if (params_.projection_bias) { + if (params_.attn.projection_bias) { inputs_attn[index++] = const_cast(inputs[bias_projection_tensor_idx]); } void *outputs_attn[] = {outputs[0]}; @@ -204,15 +177,15 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { int cross_tensor_offset = 0; int position_bias_tensor_offsets = 0; - if (params_.is_cross) cross_tensor_offset = 1; - if (params_.position_bias) position_bias_tensor_offsets = 1; + if (params_.attn.is_cross) cross_tensor_offset = 1; + if (params_.attn.position_bias) position_bias_tensor_offsets = 1; const int attn_mask_tensor_idx = 7 + cross_tensor_offset - position_bias_tensor_offsets; const int request_batch_size = static_cast(in[attn_mask_tensor_idx].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[1]); const int request_tgt_seq_len = static_cast(in[attn_mask_tensor_idx].desc.dims.d[2]); - params_.batch_size = request_batch_size; - params_.src_seq_len = request_src_seq_len; - params_.tgt_seq_len = request_tgt_seq_len; + common_param_.batch_size = request_batch_size; + common_param_.src_seq_len = request_src_seq_len; + common_param_.tgt_seq_len = request_tgt_seq_len; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } @@ -230,34 +203,26 @@ nvinfer1::DimsExprs MhaPlugin::getOutputDimensions(int32_t index, const nvinfer1 nvinfer1::IExprBuilder &exprBuilder) noexcept { nvinfer1::DimsExprs dims; if (index == 0) { -#ifndef TEST_ int num_dims = inputs[0].nbDims; dims.nbDims = num_dims; if (num_dims == INPUT_SIZE2) { dims.d[0] = exprBuilder.constant(inputs[nbInputDims - 1].d[0]->getConstantValue() * inputs[nbInputDims - 1].d[1]->getConstantValue()); - auto hidden_size = exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[1] = hidden_size; } else if (num_dims == INPUT_SIZE3) { dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch dims.d[1] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = 
exprBuilder.constant(params_.head_size * params_.head_num); + auto hidden_size = exprBuilder.constant(common_param_.head_size * common_param_.head_num); dims.d[kTwo] = hidden_size; } } else { dims.nbDims = INPUT_SIZE4; dims.d[0] = inputs[nbInputDims - 1].d[0]; // batch - dims.d[1] = exprBuilder.constant(params_.head_num); + dims.d[1] = exprBuilder.constant(common_param_.head_num); dims.d[kTwo] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - dims.d[kThree] = exprBuilder.constant(params_.head_size); - } -#else - dims.nbDims = C2NUM; - dims.d[0] = inputs[nbInputDims - 1].d[(inputs[nbInputDims - 1].nbDims) - 1]; - auto hidden_size = exprBuilder.constant(head_size_ * head_number_); - dims.d[1] = hidden_size; + dims.d[kThree] = exprBuilder.constant(common_param_.head_size); } -#endif return dims; } @@ -268,6 +233,7 @@ nvinfer1::IPluginV2DynamicExt *MhaPlugin::clone() const noexcept { return nullptr; } plugin->setPluginNamespace(name_space_.c_str()); + plugin->params_.common_param = &plugin->common_param_; return plugin; } @@ -276,12 +242,13 @@ int MhaPlugin::initialize() noexcept { return 0; } void MhaPlugin::terminate() noexcept {} size_t MhaPlugin::getSerializationSize() const noexcept { - return sizeof(int) + sizeof(fastertransformer::encoderParamT); + return sizeof(int) + sizeof(fastertransformer::attentionParamRun) + sizeof(fastertransformer::CommonParam); } void MhaPlugin::serialize(void *buffer) const noexcept { SerializeValue(&buffer, &compute_type_, sizeof(int)); - SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::encoderParamT)); + SerializeValue(&buffer, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + SerializeValue(&buffer, &common_param_, sizeof(fastertransformer::CommonParam)); } REGISTER_TENSORRT_CREATOR(ops::kNameAttention, MhaTensorRT) } // namespace mindspore::lite diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h index d755690927acbbc3fd8f86783e321148732df181..3370d2bc18850115e5c2b3c85d8d056a40d8f978 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.h @@ -22,7 +22,8 @@ #include "src/extendrt/delegate/tensorrt/op/tensorrt_op.h" #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h" #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h" -#include "src/fastertransformer/layers/encoder_layers/encoder.h" +#include "src/fastertransformer/layers/ms_layers/attention.h" +#include "src/fastertransformer/layers/ms_layers/param.h" namespace mindspore::lite { class MhaTensorRT : public TensorRTOp { @@ -43,24 +44,28 @@ class MhaTensorRT : public TensorRTOp { constexpr auto MHA_PLUGIN_NAME{"AttentionPlugin"}; class MhaPlugin : public TensorRTPlugin { public: - MhaPlugin(const std::string name, int compute_type, fastertransformer::encoderParamT params, - cublasLtHandle_t cublaslt_handle, uint32_t device_id) - : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), - compute_type_(compute_type), - params_(params), - cublaslt_handle_(cublaslt_handle) {} + MhaPlugin(const std::string name, int compute_type, fastertransformer::attentionParamRun params, + fastertransformer::CommonParam common_param, uint32_t device_id) + : TensorRTPlugin(name, std::string(MHA_PLUGIN_NAME), device_id), compute_type_(compute_type) { + params_ = params; + common_param_ = common_param; + } MhaPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) : 
TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { const nvinfer1::PluginField *fields = fc->fields; compute_type_ = static_cast(fields[0].data)[0]; - params_ = static_cast(fields[1].data)[0]; + params_ = static_cast(fields[1].data)[0]; + common_param_ = static_cast(fields[2].data)[0]; + params_.common_param = &common_param_; } MhaPlugin(const char *name, const void *serialData, size_t serialLength) : TensorRTPlugin(std::string(name), std::string(MHA_PLUGIN_NAME)) { DeserializeValue(&serialData, &serialLength, &compute_type_, sizeof(int)); - DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::encoderParamT)); + DeserializeValue(&serialData, &serialLength, ¶ms_, sizeof(fastertransformer::attentionParamRun)); + DeserializeValue(&serialData, &serialLength, &common_param_, sizeof(fastertransformer::CommonParam)); + params_.common_param = &common_param_; } MhaPlugin() = delete; @@ -91,8 +96,9 @@ class MhaPlugin : public TensorRTPlugin { const std::string layer_name_; std::string name_space_; int compute_type_; - mutable fastertransformer::encoderParamT params_; - cublasLtHandle_t cublaslt_handle_; + mutable fastertransformer::attentionParamRun params_; + mutable fastertransformer::CommonParam common_param_; + cublasLtHandle_t *cublaslt_handle_; int num_of_inputs_; int num_of_outputs_; }; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h index 18baf21654fc86a87544b329058b937c58f470af..d112051b9f966f594e7b0f5ecd4c94b711f972f7 100644 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/tensorrt_utils.h @@ -216,5 +216,6 @@ void Data2Vector(std::vector *dst, const void *src) { dst->at(i) = static_cast(src_ptr[i]); } } + } // namespace mindspore::lite #endif // MINDSPORE_LITE_SRC_EXTENDRT_DELEGATE_TENSORRT_TENSORRT_UTILS_H_ diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index a296d6054aa320c9b2ac3f36540557446ef0f588..aa3a624b3609a9f5ae589e5cdcb8a9ae0e1052a4 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -52,7 +52,9 @@ #include "tools/optimizer/fusion/tensor_dot_fusion.h" #include "tools/optimizer/fusion/multi_head_attention_fusion.h" #include "tools/optimizer/fusion/encoder_layer_fusion.h" +#include "tools/optimizer/fusion/decoder_layer_fusion.h" #include "tools/optimizer/fusion/glu_fusion.h" + #include "tools/optimizer/fusion/tflite_rel_pos_multi_head_attention_fusion.h" #include "tools/optimizer/fusion/matmul_add_fusion.h" #include "tools/optimizer/fusion/matmul_mul_fusion.h" @@ -323,6 +325,7 @@ int AnfTransform::RunFusionPass(const FuncGraphPtr &old_graph, const std::shared if (param->optimize_transformer) { fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); + fusions.push_back(std::make_shared()); } for (size_t index = 0; index < fusions.size(); index++) { auto pass_ptr = fusions.at(index); diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf8276147149ca0033d59c7719716fda733b7039 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.cc @@ -0,0 +1,518 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define USE_DEPRECATED_API +#include "tools/optimizer/fusion/decoder_layer_fusion.h" +#include +#include +#include +#include +#include "tools/optimizer/common/gllo_utils.h" +#include "nnacl/op_base.h" +#include "ops/tuple_get_item.h" +#include "tools/common/tensor_util.h" +#include "ops/op_utils.h" + +namespace mindspore::opt { +namespace { +const auto &p1 = std::placeholders::_1; +} // namespace + +bool DecoderLayerFusion::Init() const { + hidden_stats_ = std::make_shared("input"); + MS_CHECK_TRUE_RET(hidden_stats_ != nullptr, false); + encoder_output_ = std::make_shared("input"); + MS_CHECK_TRUE_RET(encoder_output_ != nullptr, false); + beta1_ = std::make_shared("beta1"); + MS_CHECK_TRUE_RET(beta1_ != nullptr, false); + gamma1_ = std::make_shared("gamma1"); + MS_CHECK_TRUE_RET(gamma1_ != nullptr, false); + beta2_ = std::make_shared("beta2"); + MS_CHECK_TRUE_RET(beta2_ != nullptr, false); + gamma2_ = std::make_shared("gamma2"); + MS_CHECK_TRUE_RET(gamma2_ != nullptr, false); + beta3_ = std::make_shared("beta3"); + MS_CHECK_TRUE_RET(beta3_ != nullptr, false); + gamma3_ = std::make_shared("gamma3"); + MS_CHECK_TRUE_RET(gamma3_ != nullptr, false); + weight_attn_qkv_ = std::make_shared("weight_attn_qkv"); + MS_CHECK_TRUE_RET(weight_attn_qkv_ != nullptr, false); + weight_attn_q_ = std::make_shared("weight_attn_q_"); + MS_CHECK_TRUE_RET(weight_attn_q_ != nullptr, false); + weight_attn_kv_ = std::make_shared("weight_attn_kv_"); + MS_CHECK_TRUE_RET(weight_attn_kv_ != nullptr, false); + weight_attn_o_ = std::make_shared(IsParamNode, "weight_attn_o"); + MS_CHECK_TRUE_RET(weight_attn_o_ != nullptr, false); + weight_attn_cross_o_ = std::make_shared(IsParamNode, "weight_attn_cross_o_"); + MS_CHECK_TRUE_RET(weight_attn_cross_o_ != nullptr, false); + weight_m_ = std::make_shared(IsParamNode, "weight_m"); + MS_CHECK_TRUE_RET(weight_m_ != nullptr, false); + weight_p_ = std::make_shared(IsParamNode, "weight_p"); + MS_CHECK_TRUE_RET(weight_p_ != nullptr, false); + bias_attn_qkv_ = std::make_shared("bias_attn_qkv"); + MS_CHECK_TRUE_RET(bias_attn_qkv_ != nullptr, false); + bias_attn_o_ = std::make_shared(IsParamNode, "bias_attn_o"); + MS_CHECK_TRUE_RET(bias_attn_o_ != nullptr, false); + bias_attn_cross_qkv_ = std::make_shared("bias_attn_cross_qkv_"); + MS_CHECK_TRUE_RET(bias_attn_cross_qkv_ != nullptr, false); + bias_attn_cross_o_ = std::make_shared(IsParamNode, "bias_attn_cross_o_"); + MS_CHECK_TRUE_RET(bias_attn_cross_o_ != nullptr, false); + bias_m_ = std::make_shared(IsParamNode, "bias_m"); + MS_CHECK_TRUE_RET(bias_m_ != nullptr, false); + bias_p_ = std::make_shared(IsParamNode, "bias_p"); + MS_CHECK_TRUE_RET(bias_p_ != nullptr, false); + mask_ = std::make_shared("mask"); + MS_CHECK_TRUE_RET(mask_ != nullptr, false); + cross_mask_ = std::make_shared("cross_mask_"); + MS_CHECK_TRUE_RET(cross_mask_ != nullptr, false); + is_attention_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAttention), "is_attention"); + MS_CHECK_TRUE_RET(is_attention_ != nullptr, false); + is_attention_cross_ = 
std::make_shared(std::bind(IsOpType, p1, prim::kPrimAttention), "is_attention_cross"); + MS_CHECK_TRUE_RET(is_attention_cross_ != nullptr, false); + is_layernorm1_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm1"); + MS_CHECK_TRUE_RET(is_layernorm1_ != nullptr, false); + is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); + MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + is_layernorm3_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm3"); + MS_CHECK_TRUE_RET(is_layernorm3_ != nullptr, false); + position_bias_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); + position_bias_cross_ = std::make_shared("position_bias_cross_"); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); + is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); + MS_CHECK_TRUE_RET(is_act_ != nullptr, false); + eps1_ = std::make_shared("eps1_"); + MS_CHECK_TRUE_RET(eps1_ != nullptr, false); + eps2_ = std::make_shared("eps2_"); + MS_CHECK_TRUE_RET(eps2_ != nullptr, false); + eps3_ = std::make_shared("eps3_"); + MS_CHECK_TRUE_RET(eps3_ != nullptr, false); + return true; +} + +VectorRef DecoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusion = false, + bool is_position_bias = false) const { + auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); + VectorRef layer_norm, tuple; + if (!layernorm_fusion) { + return DefineLayerNorm(reshape1, gamma1_, beta1_, eps1_); + } + layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); + auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); + auto var_tuple = std::make_shared("var_tuple"); + tuple = VectorRef({is_tuple, layer_norm, var_tuple}); + return tuple; +} + +VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const { + auto is_sqr = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSquare), "sqr2"); + MS_CHECK_TRUE_RET(is_sqr != nullptr, {}); + auto sqr = VectorRef({is_sqr, input}); + auto var1 = std::make_shared("var1"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); + MS_CHECK_TRUE_RET(is_reduce != nullptr, {}); + auto reduce = VectorRef({is_reduce, sqr, var1}); + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto add = VectorRef({is_add, reduce, eps}); + auto is_sqrt = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); + MS_CHECK_TRUE_RET(is_sqrt != nullptr, {}); + auto sqrt = VectorRef({is_sqrt, add}); + auto is_div = std::make_shared(std::bind(IsOpType, p1, prim::kPrimRealDiv), "real-div"); + MS_CHECK_TRUE_RET(is_div != nullptr, {}); + auto real_div = VectorRef({is_div, input, sqrt}); + auto is_mul = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMulFusion), "mul"); + MS_CHECK_TRUE_RET(is_mul != nullptr, {}); + auto mul = VectorRef({is_mul, real_div, gamma}); + return mul; +} + +VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, + bool 
is_position_bias = false, bool mask = true) const { + auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); + MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); + auto var1 = std::make_shared("var1-reshape"); + MS_CHECK_TRUE_RET(var1 != nullptr, {}); + auto reshape1 = VectorRef({is_reshape1, hidden_stats_, var1}); + VectorRef inputs, input_cross, tuple2, tuple3, matmul2, tuple4, tuple5; + if (is_position_bias) { + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); + } else { + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); + } + if (mask) inputs.push_back(mask_); + auto attention = VectorRef(inputs); + // return attention; + if (is_position_bias) { + tuple4 = attention; + } else { + auto is_tuple4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item4"); + auto var_tuple4 = std::make_shared("var_tuple4"); + tuple4 = VectorRef({is_tuple4, attention, var_tuple4}); + } + auto is_add2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add2"); + auto add2 = (post_layernorm) + ? VectorRef({is_add2, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple4}) + : VectorRef({is_add2, reshape1, tuple4}); + if (layernorm_fusion) { + auto layer_norm2 = VectorRef({is_layernorm2_, add2, gamma2_, beta2_}); + auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); + auto var_tuple2 = std::make_shared("var_tuple2"); + tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(add2, gamma2_, beta2_, eps2_); + } + auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder2"); + MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); + auto var2 = std::make_shared("var2"); + MS_CHECK_TRUE_RET(var2 != nullptr, {}); + auto reshape2 = VectorRef({is_reshape2, encoder_output_, var2}); + if (is_position_bias) { + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, position_bias_cross_}); + } else { + input_cross = VectorRef({is_attention_cross_, tuple2, reshape2, reshape2, weight_attn_q_, weight_attn_kv_, + weight_attn_cross_o_, bias_attn_cross_qkv_, bias_attn_cross_o_}); + } + if (mask) input_cross.push_back(cross_mask_); + auto attention_cross = VectorRef(input_cross); + if (is_position_bias) { + tuple5 = attention_cross; + } else { + auto is_tuple5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item5"); + auto var_tuple5 = std::make_shared("var_tuple5"); + tuple5 = VectorRef({is_tuple5, attention_cross, var_tuple5}); + } + auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); + MS_CHECK_TRUE_RET(is_add2 != nullptr, {}); + auto add3 = (post_layernorm) ? 
VectorRef({is_add3, tuple2, tuple5}) : VectorRef({is_add3, add2, tuple5}); + if (layernorm_fusion) { + auto layer_norm3 = VectorRef({is_layernorm3_, add3, gamma3_, beta3_}); + auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); + auto var_tuple3 = std::make_shared("var_tuple3"); + tuple3 = VectorRef({is_tuple3, layer_norm3, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(add3, gamma3_, beta3_, eps3_); + } + auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); + MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); + auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul2"); + MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); + if (!is_position_bias) { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_, bias_m_}); + auto act = VectorRef({is_act_, matmul1}); + matmul2 = VectorRef({is_matmul2, act, weight_p_, bias_p_}); + } else { + auto matmul1 = VectorRef({is_matmul1, tuple3, weight_m_}); + matmul2 = VectorRef({is_matmul2, matmul1, weight_p_}); + } + auto is_reshape3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder3"); + MS_CHECK_TRUE_RET(is_reshape3 != nullptr, {}); + auto var3 = std::make_shared("var3"); + MS_CHECK_TRUE_RET(var3 != nullptr, {}); + auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); + auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder4"); + MS_CHECK_TRUE_RET(is_reshape4 != nullptr, {}); + auto var4 = std::make_shared("var4"); + MS_CHECK_TRUE_RET(var4 != nullptr, {}); + auto reshape4 = (post_layernorm) ? VectorRef({is_reshape4, tuple3, var4}) : VectorRef({is_reshape4, add3, var4}); + auto is_add4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add4"); + auto add4 = VectorRef({is_add4, reshape4, reshape3}); + return add4; +} + +std::unordered_map DecoderLayerFusion::DefinePatterns() const { + std::unordered_map patterns; + if (!Init()) { + MS_LOG(ERROR) << "initial member failed."; + return patterns; + } + patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false, true); + patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false, true); + patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false, true); + patterns[kPatternDecoderLayerNormPost] = DefinePatternDecoderLayer(true, false, false, true); + patterns[kPatternDecoderT5Pre] = DefinePatternDecoderLayer(false, false, true, true); + patterns[kPatternDecoderT5Post] = DefinePatternDecoderLayer(true, false, true, true); + return patterns; +} + +AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mindspore::FuncGraphPtr &func_graph, + const mindspore::AnfNodePtr &node, const mindspore::EquivPtr &equiv) const { + if (func_graph == nullptr || node == nullptr || equiv == nullptr) { + return nullptr; + } + if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) { + is_position_bias_ = true; + } + if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost) { + is_layernorm_fusion_ = true; + } + bool mask = true; + bool post_layernorm = false; + if (pattern_name == kPatternDecoderLayerPost || pattern_name == kPatternDecoderT5Post || + pattern_name == kPatternDecoderLayerNormPost) { + post_layernorm = true; + } + return CreateMaskedDecoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); +} // namespace mindspore::opt + 
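Editorial aside (not part of this patch): the Process() hunk above dispatches on the matched pattern name to decide which decoder-layer variant was found before calling CreateMaskedDecoderLayerFusionNode. Below is a minimal standalone C++ sketch of that dispatch; the struct, helper name, and flag semantics are inferred from the hunk and from the pattern-name strings declared in decoder_layer_fusion.h, and are illustrative only.

#include <iostream>
#include <string>

// Illustrative model of the flags DecoderLayerFusion::Process derives from a pattern name.
struct DecoderFusionFlags {
  bool post_layernorm = false;    // layernorm applied after the residual add
  bool layernorm_fusion = false;  // pattern matched the fused LayerNorm primitive
  bool position_bias = false;     // T5-style relative position bias (no qkv/projection biases)
};

// Hypothetical helper mirroring the if-chains in Process(); not the actual pass code.
DecoderFusionFlags FlagsForPattern(const std::string &pattern_name) {
  DecoderFusionFlags f;
  f.position_bias = (pattern_name == "PatternDecoderT5Pre" || pattern_name == "PatternDecoderT5Post");
  f.layernorm_fusion = (pattern_name == "PatternDecoderLayerPre" || pattern_name == "PatternDecoderLayerPost");
  f.post_layernorm = (pattern_name == "PatternDecoderLayerPost" || pattern_name == "PatternDecoderT5Post" ||
                      pattern_name == "kPatternDecoderLayerNormPost");
  return f;
}

int main() {
  for (const char *name : {"PatternDecoderLayerPre", "PatternDecoderT5Post", "kPatternDecoderLayerNormPost"}) {
    DecoderFusionFlags f = FlagsForPattern(name);
    std::cout << name << ": post_layernorm=" << f.post_layernorm
              << ", layernorm_fusion=" << f.layernorm_fusion
              << ", position_bias=" << f.position_bias << "\n";
  }
  return 0;
}

The T5 patterns carry a relative position bias and therefore no attention biases or beta vectors, which is why the node-building code that follows branches on is_position_bias_ when assembling the fused node inputs.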
+bool DecoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const { + auto act_input = GetAttribute(func_graph, equiv, is_act_); + MS_ASSERT(act_input != nullptr); + auto act_primitive = ops::GetOperator(act_input); + MS_CHECK_TRUE_RET(act_primitive != nullptr, false); + auto act_primitive_c = act_primitive->GetPrim(); + if (act_primitive_c->GetAttr(ops::kActivationType) == nullptr || + act_primitive->get_activation_type() != mindspore::GELU) { + return false; + } + return true; +} + +AnfNodePtr DecoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + VarPtr node_name) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << "is not AnfNodePtr"; + return nullptr; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (node == nullptr || !utils::isa(node)) { + auto manager = func_graph->manager(); + if (manager == nullptr) { + return nullptr; + } + auto users = manager->node_users(); + auto it = users.find(node); + if (it != users.end()) { + node = it->second.front().first; + } + if (node == nullptr || !utils::isa(node)) { + return nullptr; + } + } + auto cnode = utils::cast(node); + MS_ASSERT(cnode != nullptr); + auto input = cnode->input(0); + return input; +} + +STATUS DecoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } + } + return RET_ERROR; +} + +STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, + int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1, + bool *is_position_bias2, float *scale1, float *scale2) const { + auto attn_input = GetAttribute(func_graph, equiv, is_attention_); + MS_ASSERT(attn_input != nullptr); + auto attn_prim = ops::GetOperator(attn_input); + if (attn_prim->GetAttr(ops::kDecoderLayerNumHeads) != nullptr) { + *head_num = attn_prim->get_head_num(); + } + if (attn_prim->GetAttr(ops::kAttentionSizePerHead) != nullptr) { + *head_size = attn_prim->get_head_size(); + } + if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias1 = attn_prim->get_position_bias(); + } + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale1 = attn_prim->get_scale(); + } + if ((*equiv)[is_attention_] == nullptr || !utils::isa((*equiv)[is_attention_])) { + MS_LOG(ERROR) << "is_attention_ is not AnfNodePtr"; + return RET_ERROR; + } + auto attn_cross_input = GetAttribute(func_graph, equiv, is_attention_cross_); + MS_ASSERT(attn_cross_input != nullptr); + auto attn_cross_prim = ops::GetOperator(attn_cross_input); + if (attn_cross_prim->GetAttr(ops::kPositionBias) != nullptr) { + *is_position_bias2 = attn_cross_prim->get_position_bias(); + } + if (attn_cross_prim->GetAttr(ops::kScale) != nullptr) { + *scale2 = attn_cross_prim->get_scale(); + } + if (is_layernorm_fusion_) { + auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = 
ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } + auto layrn3_input = GetAttribute(func_graph, equiv, is_layernorm3_); + auto layrn3_prim = ops::GetOperator(layrn3_input); + if (layrn3_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps3 = layrn3_prim->get_epsilon(); + } + } else { + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; + return RET_ERROR; + } + if (GetEps(equiv, eps2_, eps2) != RET_OK) { + MS_LOG(ERROR) << "not found eps2"; + return RET_ERROR; + } + if (GetEps(equiv, eps3_, eps3) != RET_OK) { + MS_LOG(ERROR) << "not found eps3"; + return RET_ERROR; + } + } + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv)) { + return RET_ERROR; + } + act_type_ = ActType::ActType_Gelu; + } else { + act_type_ = ActType::ActType_Relu; + } + return RET_OK; +} + +std::shared_ptr DecoderLayerFusion::CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const { + auto decoder_layer_prim = std::make_shared(); + if (decoder_layer_prim == nullptr) { + MS_LOG(ERROR) << "Build decoder layer primitive failed."; + return nullptr; + } + int head_num = 0; + int head_size = 0; + float eps1 = 1e-6; + float eps2 = 1e-6; + float eps3 = 1e-6; + bool is_position_bias1 = false; + bool is_position_bias2 = false; + float scale1 = 1.0f; + float scale2 = 1.0f; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1, + &is_position_bias2, &scale1, &scale2)) { + return nullptr; + } + decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1, is_position_bias2, + post_layernorm, scale1, scale2, act_type_); + return decoder_layer_prim; +} + +CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const { + MS_ASSERT(func_graph != nullptr); + MS_ASSERT(equiv != nullptr); + MS_ASSERT(node != nullptr); + auto input = utils::cast((*equiv)[hidden_stats_]); + MS_ASSERT(input != nullptr); + auto encoder_output = utils::cast((*equiv)[encoder_output_]); + MS_ASSERT(encoder_output != nullptr); + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3, + bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross; + auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); + auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); + auto weight_attn_q = utils::cast((*equiv)[weight_attn_q_]); + auto weight_attn_kv = utils::cast((*equiv)[weight_attn_kv_]); + auto weight_attn_cross_o = utils::cast((*equiv)[weight_attn_cross_o_]); + auto weight_m = utils::cast((*equiv)[weight_m_]); + auto weight_p = utils::cast((*equiv)[weight_p_]); + if (is_position_bias_) { + position_bias = utils::cast((*equiv)[position_bias_]); + position_bias_cross = utils::cast((*equiv)[position_bias_cross_]); + } else { + bias_attn_o = utils::cast((*equiv)[bias_attn_o_]); + bias_attn_qkv = utils::cast((*equiv)[bias_attn_qkv_]); + bias_attn_cross_qkv = utils::cast((*equiv)[bias_attn_cross_qkv_]); + bias_attn_cross_o = utils::cast((*equiv)[bias_attn_cross_o_]); + bias_m = 
utils::cast((*equiv)[bias_m_]); + bias_p = utils::cast((*equiv)[bias_p_]); + beta1 = utils::cast((*equiv)[beta1_]); + beta2 = utils::cast((*equiv)[beta2_]); + beta3 = utils::cast((*equiv)[beta3_]); + } + auto gamma1 = utils::cast((*equiv)[gamma1_]); + auto gamma2 = utils::cast((*equiv)[gamma2_]); + auto gamma3 = utils::cast((*equiv)[gamma3_]); + input_mask = mask ? utils::cast((*equiv)[mask_]) : nullptr; + auto cross_mask = utils::cast((*equiv)[cross_mask_]); + auto base_shape_ptr = weight_m->Shape(); + MS_EXCEPTION_IF_NULL(base_shape_ptr); + auto input_shape_ptr = base_shape_ptr->cast(); + MS_EXCEPTION_IF_NULL(input_shape_ptr); + auto input_shape = input_shape_ptr->shape(); + MS_ASSERT(input_shape != nullptr); + int ffn_hidden_size = (int64_t)input_shape[1]; + auto decoder_layer_prim = CreatePrim(func_graph, equiv, post_layernorm, ffn_hidden_size); + MS_CHECK_TRUE_RET(decoder_layer_prim != nullptr, nullptr); + auto decoder_layer_prim_c = decoder_layer_prim->GetPrim(); + MS_CHECK_TRUE_RET(decoder_layer_prim_c != nullptr, nullptr); + auto value_node = NewValueNode(decoder_layer_prim_c); + MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); + std::vector new_node_inputs = {value_node, input, gamma1}; + if (is_position_bias_) { + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias, weight_attn_o, gamma2, encoder_output, weight_attn_q, weight_attn_kv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {position_bias_cross, weight_attn_cross_o, gamma3, weight_m, weight_p}); + } else { + new_node_inputs.insert(new_node_inputs.end(), {beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma2, beta2, encoder_output, + weight_attn_q, weight_attn_kv, bias_attn_cross_qkv}); + if (mask) new_node_inputs.push_back(cross_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_cross_o, bias_attn_cross_o, gamma3, beta3, weight_m, bias_m, weight_p, bias_p}); + } + auto new_node = func_graph->NewCNode(new_node_inputs); + MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/decoder_layer"); + return new_node; +} +} // namespace mindspore::opt diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..f5faec283e2fca3114817db87edb7e554b6e14e6 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h @@ -0,0 +1,113 @@ +// /** +// * Copyright 2021 Huawei Technologies Co., Ltd +// * +// * Licensed under the Apache License, Version 2.0 (the "License"); +// * you may not use this file except in compliance with the License. +// * You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ +#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ + +#include +#include +#include +#include +#include "tools/optimizer/common/multiple_pattern_process_pass.h" +#include "include/common/utils/utils.h" +#include "include/errorcode.h" +#include "ops/decoder_layer.h" +#include "ops/fusion/layer_norm_fusion.h" +#include "ops/fusion/activation.h" +#include "tools/optimizer/fusion/multi_head_attention_fusion.h" + +namespace mindspore { +namespace opt { +class DecoderLayerFusion : public MultiplePatternProcessPass { + public: + explicit DecoderLayerFusion(const std::string &name = "DecoderLayerFusion", bool multigraph = true) + : MultiplePatternProcessPass(name, multigraph) {} + + ~DecoderLayerFusion() override = default; + + AnfNodePtr Process(const std::string &pattern_name, const FuncGraphPtr &, const AnfNodePtr &, + const EquivPtr &) const override; + std::unordered_map DefinePatterns() const override; + + protected: + virtual bool Init() const; + + private: + VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias, + bool mask) const; + VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; + CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + const AnfNodePtr &node, bool post_layernorm, bool mask) const; + std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, + bool post_layernorm, int64_t ffn_hidden_size) const; + lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, + float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2, + float *scale1, float *scale2) const; + AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; + bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; + + protected: + const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; + const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; + const std::string kPatternDecoderLayerNormPre = "kPatternDecoderLayerNormPre"; + const std::string kPatternDecoderLayerNormPost = "kPatternDecoderLayerNormPost"; + const std::string kPatternDecoderT5Pre = "PatternDecoderT5Pre"; + const std::string kPatternDecoderT5Post = "PatternDecoderT5Post"; + mutable VarPtr hidden_stats_{nullptr}; + mutable VarPtr encoder_output_{nullptr}; + mutable VarPtr position_bias_{nullptr}; + mutable VarPtr beta1_{nullptr}; + mutable VarPtr gamma1_{nullptr}; + mutable VarPtr beta2_{nullptr}; + mutable VarPtr gamma2_{nullptr}; + mutable VarPtr gamma3_{nullptr}; + mutable VarPtr beta3_{nullptr}; + mutable VarPtr weight_attn_qkv_{nullptr}; + mutable VarPtr weight_attn_qkv_cross_{nullptr}; + mutable VarPtr weight_attn_o_{nullptr}; + mutable VarPtr weight_m_{nullptr}; + mutable VarPtr weight_p_{nullptr}; + mutable VarPtr bias_attn_qkv_{nullptr}; + mutable VarPtr bias_attn_o_{nullptr}; + mutable VarPtr bias_attn_cross_qkv_{nullptr}; + mutable VarPtr bias_attn_cross_o_{nullptr}; + mutable VarPtr bias_m_{nullptr}; + mutable VarPtr bias_p_{nullptr}; 
+ mutable VarPtr mask_{nullptr}; + mutable VarPtr is_attention_{nullptr}; + mutable VarPtr is_attention_cross_{nullptr}; + mutable VarPtr weight_attn_q_{nullptr}; + mutable VarPtr weight_attn_kv_{nullptr}; + mutable VarPtr weight_attn_cross_o_{nullptr}; + mutable VarPtr position_bias_cross_{nullptr}; + mutable VarPtr cross_mask_{nullptr}; + mutable VarPtr reshape_k_{nullptr}; + mutable VarPtr reshape_v_{nullptr}; + mutable VarPtr is_layernorm1_{nullptr}; + mutable VarPtr is_layernorm2_{nullptr}; + mutable VarPtr is_layernorm3_{nullptr}; + mutable VarPtr is_act_{nullptr}; + mutable VarPtr eps1_{nullptr}; + mutable VarPtr eps2_{nullptr}; + mutable VarPtr eps3_{nullptr}; + mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; + mutable ActType act_type_{ActType::ActType_No}; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_DECODER_LAYER_FUSION_H_ diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index 233f878781bfe4992f9392ce043bcdb8c32f511c..ada4b7e856ea534fec4812e3eba94f2677231365 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -67,9 +67,13 @@ bool EncoderLayerFusion::Init() const { is_layernorm2_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimLayerNormFusion), "layer_norm2"); MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); position_bias_ = std::make_shared("position_bias"); - MS_CHECK_TRUE_RET(is_layernorm2_ != nullptr, false); + MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), "activation"); MS_CHECK_TRUE_RET(is_act_ != nullptr, {}); + eps1_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps1_ != nullptr, false); + eps2_ = std::make_shared("position_bias"); + MS_CHECK_TRUE_RET(eps2_ != nullptr, false); return true; } @@ -80,11 +84,11 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio auto var1 = std::make_shared("var1-reshape"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, input_, var1}); - if (post_layernorm) { + if (post_layernorm && !is_position_bias) { return reshape1; } - if (layernorm_fusion) { - return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_); + if (!layernorm_fusion) { + return DefineLayerNorm(is_position_bias, reshape1, gamma1_, beta1_, eps1_); } auto layer_norm = VectorRef({is_layernorm1_, reshape1, gamma1_, beta1_}); auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); @@ -93,7 +97,8 @@ VectorRef EncoderLayerFusion::getTuple(bool post_layernorm, bool layernorm_fusio return tuple; } -VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const { +VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, + VarPtr eps) const { auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto is_reduce = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce"); @@ -110,11 +115,9 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i auto is_reduce2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReduceFusion), "reduce2"); MS_CHECK_TRUE_RET(is_reduce2 != nullptr, {}); auto reduce2 = VectorRef({is_reduce2, 
sqr, var2}); - auto var3 = std::make_shared("var3"); - MS_CHECK_TRUE_RET(var3 != nullptr, {}); auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is-add"); MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto add = VectorRef({is_add, reduce2, var3}); + auto add = VectorRef({is_add, reduce2, eps}); auto is_sqr2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSqrt), "sqr2"); MS_CHECK_TRUE_RET(is_sqr2 != nullptr, {}); auto sqr2 = VectorRef({is_sqr2, add}); @@ -136,24 +139,27 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i } VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false) const { - VectorRef attention, tuple, tuple2, tuple3, reshape2, matmul1; + bool is_position_bias = false, bool mask = true) const { + VectorRef tuple, tuple2, tuple3, reshape2, matmul1, inputs; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1"); MS_CHECK_TRUE_RET(var1 != nullptr, {}); auto reshape1 = VectorRef({is_reshape1, input_, var1}); if (!is_position_bias) { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, bias_attn_qkv_, bias_attn_o_, mask_}); + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + bias_attn_qkv_, bias_attn_o_}); } else { - attention = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), - getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, - weight_attn_o_, position_bias_, mask_}); + inputs = VectorRef({is_attention_, getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), + getTuple(post_layernorm, layernorm_fusion, is_position_bias), weight_attn_qkv_, weight_attn_o_, + position_bias_}); } + // return attention; + if (mask) inputs.push_back(mask_); + auto attention = VectorRef(inputs); if (!is_position_bias) { auto is_tuple = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_itme"); auto var_tuple = std::make_shared("var_tuple"); @@ -162,14 +168,16 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr tuple = attention; } auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add"); - auto add = VectorRef({is_add, reshape1, tuple}); + auto add = (is_position_bias && post_layernorm) + ? 
VectorRef({is_add, getTuple(post_layernorm, layernorm_fusion, is_position_bias), tuple}) + : VectorRef({is_add, reshape1, tuple}); if (layernorm_fusion) { - tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_); - } else { auto layer_norm2 = VectorRef({is_layernorm2_, add, gamma2_, beta2_}); auto is_tuple2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item2"); auto var_tuple2 = std::make_shared("var_tuple2"); tuple2 = VectorRef({is_tuple2, layer_norm2, var_tuple2}); + } else { + tuple2 = DefineLayerNorm(is_position_bias, add, gamma2_, beta2_, eps2_); } auto is_reshape2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder2"); MS_CHECK_TRUE_RET(is_reshape2 != nullptr, {}); @@ -178,9 +186,13 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion), "is_matmul1"); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); if (is_position_bias) { - reshape2 = VectorRef({is_reshape2, add, var2}); + if (post_layernorm) { + reshape2 = VectorRef({is_reshape2, tuple2, var2}); + } else { + reshape2 = VectorRef({is_reshape2, add, var2}); + } matmul1 = VectorRef({is_matmul1, tuple2, weight_m_}); - } else if (post_layernorm || layernorm_fusion) { + } else if (post_layernorm || !layernorm_fusion) { reshape2 = VectorRef({is_reshape2, tuple2, var2}); matmul1 = VectorRef({is_matmul1, tuple2, weight_m_, bias_m_}); } else { @@ -199,7 +211,7 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); auto add3 = VectorRef({is_add3, reshape2, reshape3}); - if (!post_layernorm || layernorm_fusion) { + if (!post_layernorm || !layernorm_fusion) { return add3; } auto is_reshape4 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); @@ -208,12 +220,12 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr MS_CHECK_TRUE_RET(var4 != nullptr, {}); auto reshape4 = VectorRef({is_reshape4, add3, var4}); if (layernorm_fusion) { - tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_); - } else { auto layer_norm = VectorRef({is_layernorm1_, reshape4, gamma1_, beta1_}); auto is_tuple3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTupleGetItem), "tuple_get_item3"); auto var_tuple3 = std::make_shared("var_tuple3"); tuple3 = VectorRef({is_tuple3, layer_norm, var_tuple3}); + } else { + tuple3 = DefineLayerNorm(is_position_bias, reshape4, gamma1_, beta1_, eps1_); } auto is_reshape5 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape5 != nullptr, {}); @@ -233,7 +245,8 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() patterns[kPatternEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); patterns[kPatternEncoderLayerPreNorm] = DefinePatternEncoderLayer(false, true); - patterns[kPatternEncoderLayerT5] = DefinePatternEncoderLayer(false, true, true); + patterns[kPatternEncoderLayerT5Pre] = DefinePatternEncoderLayer(false, false, true, true); + patterns[kPatternEncoderLayerT5Post] = DefinePatternEncoderLayer(true, false, true, true); return patterns; } @@ -242,15 +255,16 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == 
nullptr || node == nullptr || equiv == nullptr) { return nullptr; } - if (pattern_name == kPatternEncoderLayerPost || pattern_name == kPatternEncoderLayerPostNorm) { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, true); - } else if (pattern_name == kPatternEncoderLayerPre || pattern_name == kPatternEncoderLayerPreNorm) { - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } else if (pattern_name == kPatternEncoderLayerT5) { - is_position_bias_ = true; - return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, false); - } - return nullptr; + if (pattern_name == kPatternEncoderLayerPostNorm || pattern_name == kPatternEncoderLayerPreNorm) + is_layernorm_fusion_ = true; + if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; + bool mask = true; + bool post_layernorm = false; + if (pattern_name == kPatternEncoderLayerPost || pattern_name == kPatternEncoderLayerPostNorm || + pattern_name == kPatternEncoderLayerT5Post) + post_layernorm = true; + + return CreateMaskedEncoderLayerFusionNode(func_graph, equiv, node, post_layernorm, mask); } bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, @@ -267,6 +281,26 @@ bool EncoderLayerFusion::IsActGELU(const FuncGraphPtr &func_graph, const EquivPt return true; } +STATUS EncoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const { + if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { + MS_LOG(ERROR) << node_name << " is not anfnodeptr"; + return RET_ERROR; + } + AnfNodePtr node = utils::cast((*equiv)[node_name]); + MS_ASSERT(node != nullptr); + if (utils::isa(node)) { + auto value_ptr_node = utils::cast(node); + auto value_node = utils::cast(value_ptr_node->value()); + if (value_node->isa()) { + auto tensor = value_node->cast(); + MS_EXCEPTION_IF_NULL(tensor); + *eps = *reinterpret_cast(tensor->data().data()); + return RET_OK; + } + } + return RET_ERROR; +} + AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const { if ((*equiv)[node_name] == nullptr || !utils::isa((*equiv)[node_name])) { @@ -294,8 +328,9 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons auto input = cnode->input(0); return input; } + STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2) const { + int *head_size, float *eps1, float *eps2, float *scale) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -308,18 +343,38 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq if (attn_prim->GetAttr(ops::kPositionBias) != nullptr) { is_position_bias_ = attn_prim->get_position_bias(); } - auto layrn1_input = GetAttribute(func_graph, equiv, is_layernorm1_); - auto layrn1_prim = ops::GetOperator(layrn1_input); - if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps1 = layrn1_prim->get_epsilon(); + if (attn_prim->GetAttr(ops::kScale) != nullptr) { + *scale = attn_prim->get_scale(); } - auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); - auto layrn2_prim = ops::GetOperator(layrn2_input); - if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { - *eps2 = layrn2_prim->get_epsilon(); + if (is_layernorm_fusion_) { + auto layrn1_input = 
GetAttribute(func_graph, equiv, is_layernorm1_); + auto layrn1_prim = ops::GetOperator(layrn1_input); + if (layrn1_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps1 = layrn1_prim->get_epsilon(); + } + auto layrn2_input = GetAttribute(func_graph, equiv, is_layernorm2_); + auto layrn2_prim = ops::GetOperator(layrn2_input); + if (layrn2_prim->GetAttr(ops::kEpsilon) != nullptr) { + *eps2 = layrn2_prim->get_epsilon(); + } + } else { + if (GetEps(equiv, eps1_, eps1) != RET_OK) { + MS_LOG(ERROR) << "not found eps1"; + return RET_ERROR; + } + + if (GetEps(equiv, eps2_, eps2) != RET_OK) { + MS_LOG(ERROR) << "not found eps2"; + return RET_ERROR; + } } - if (!IsActGELU(func_graph, equiv, is_act_)) { - return false; + if (!is_position_bias_) { + if (!IsActGELU(func_graph, equiv, is_act_)) { + return RET_ERROR; + } + act_type_ = ActType::ActType_Gelu; + } else { + act_type_ = ActType::ActType_Relu; } return RET_OK; } @@ -333,18 +388,20 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap } int head_num = 0; int head_size = 0; - float eps1 = 1e-6; - float eps2 = 1e-6; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2)) { + float eps1 = 1e-5; + float eps2 = 1e-5; + float scale = 1.0f; + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scale)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scale, + act_type_); return encoder_layer_prim; } CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, - bool post_layernorm = true) const { + const AnfNodePtr &node, bool post_layernorm, + bool mask) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); @@ -364,9 +421,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP } auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); - if (mask_) { - input_mask = utils::cast((*equiv)[mask_]); - } + input_mask = mask ? 
utils::cast((*equiv)[mask_]) : nullptr; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); auto input_shape_ptr = base_shape_ptr->cast(); @@ -380,24 +435,23 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP MS_CHECK_TRUE_RET(encoder_layer_prim_c != nullptr, nullptr); auto value_node = NewValueNode(encoder_layer_prim_c); MS_CHECK_TRUE_RET(value_node != nullptr, nullptr); - std::vector new_node_inputs; - ParameterPtr c_bias_m_param, c_weight_p_param, c_bias_p_param, c_weight_m_param; + std::vector new_node_inputs = {value_node, input}; if (is_position_bias_) { position_bias = utils::cast((*equiv)[position_bias_]); - if (!post_layernorm) - new_node_inputs = {value_node, input, gamma1, weight_qkv, input_mask, - weight_attn_o, gamma2, weight_m, weight_p, position_bias}; - else - new_node_inputs = {value_node, input, weight_qkv, input_mask, weight_attn_o, - gamma1, weight_m, weight_p, gamma2, position_bias}; + new_node_inputs.insert(new_node_inputs.end(), {gamma1, weight_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {position_bias, weight_attn_o, gamma2, weight_m, weight_p}); } else { if (!post_layernorm) { - new_node_inputs = {value_node, input, gamma1, beta1, weight_qkv, bias_attn_qkv, input_mask, weight_attn_o, - bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}; + new_node_inputs.insert(new_node_inputs.end(), {gamma1, beta1, weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), + {weight_attn_o, bias_attn_o, gamma2, beta2, weight_m, bias_m, weight_p, bias_p}); } else { - new_node_inputs = {value_node, input, weight_qkv, bias_attn_qkv, input_mask, - weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, - bias_m, weight_p, bias_p, gamma2, beta2}; + new_node_inputs.insert(new_node_inputs.end(), {weight_qkv, bias_attn_qkv}); + if (mask) new_node_inputs.push_back(input_mask); + new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, bias_m, + weight_p, bias_p, gamma2, beta2}); } } auto new_node = func_graph->NewCNode(new_node_inputs); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 56945b850cc67579517cdc1e1bd2cf767eaf22ff..9f93f60396226caa25d3b40d5b73b63a104f3df1 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -49,16 +49,19 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternEncoderLayerPre = "PatternTEncoderLayerPre"; const std::string kPatternEncoderLayerPostNorm = "PatternTEncoderLayerPostNorm"; const std::string kPatternEncoderLayerPreNorm = "PatternTEncoderLayerPreNorm"; - const std::string kPatternEncoderLayerT5 = "PatternEncoderLayerT5"; - VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_) const; - VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; - VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta) const; + const std::string kPatternEncoderLayerT5Post = "kPatternEncoderLayerT5Post"; + const std::string kPatternEncoderLayerT5Pre = "kPatternEncoderLayerT5Pre"; + VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, + bool mask) 
const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const AnfNodePtr &node, bool post_layernorm) const; + const AnfNodePtr &node, bool post_layernorm = true, + bool mask = true) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; + lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2) const; + float *eps1, float *eps2, float *scale) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; @@ -83,7 +86,11 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_layernorm1_{nullptr}; mutable VarPtr is_layernorm2_{nullptr}; mutable bool is_position_bias_{false}; + mutable bool is_layernorm_fusion_{false}; + mutable ActType act_type_{ActType::ActType_No}; mutable VarPtr is_act_{nullptr}; + mutable VarPtr eps1_{nullptr}; + mutable VarPtr eps2_{nullptr}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index 5da662e2abed86a4db5b657fd48da0430071882e..d9b2ed45a1cf7b9602d6a7e3e9a520e66cf2c3ff 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -386,7 +386,7 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternT5New(bool transpose, return matmul3; } -VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { +VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA(bool mask) const { VectorRef k_embedding, v_embedding; auto q_transpose = std::make_shared(std::bind(IsOpType, p1, prim::kPrimTranspose)); MS_CHECK_TRUE_RET(q_transpose != nullptr, {}); @@ -399,14 +399,21 @@ VectorRef MultiHeadAttentionFusion::DefineMPWithMaskPatternPA() const { auto is_matmul1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul1 != nullptr, {}); auto matmul1 = VectorRef({is_matmul1, q_embedding, k_embedding}); - auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); - MS_CHECK_TRUE_RET(is_add != nullptr, {}); - auto mask = DefineMask(mask_); - MS_CHECK_TRUE_RET(!mask.empty(), {}); - auto add = VectorRef({is_add, mask, matmul1}); - auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); - MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); - auto softmax = VectorRef({is_softmax, add}); + VectorRef softmax; + if (mask) { + auto is_add = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion)); + MS_CHECK_TRUE_RET(is_add != nullptr, {}); + auto mask = DefineMask(mask_); + MS_CHECK_TRUE_RET(!mask.empty(), {}); + auto add = VectorRef({is_add, mask, matmul1}); + auto is_softmax = std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, add}); + } else { + auto is_softmax = 
std::make_shared(std::bind(IsOpType, p1, prim::kPrimSoftmax)); + MS_CHECK_TRUE_RET(is_softmax != nullptr, {}); + softmax = VectorRef({is_softmax, matmul1}); + } auto is_matmul2 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimMatMulFusion)); MS_CHECK_TRUE_RET(is_matmul2 != nullptr, {}); auto matmul2 = VectorRef({is_matmul2, softmax, v_embedding}); @@ -574,6 +581,7 @@ std::unordered_map MultiHeadAttentionFusion::DefinePatte patterns[kMPAWithMaskPatternName] = DefineMPWithMaskPattern(); patterns[kMPAPatternName] = DefineMPWithMaskPattern(false); patterns[kMPAWithMaskPatternNamePA] = DefineMPWithMaskPatternPA(); + patterns[kMPAPatternNamePA] = DefineMPWithMaskPatternPA(false); patterns[kMPAWithMaskPatternNameT5] = DefineMPWithMaskPatternT5(); patterns[kMPAWithMaskPatternNameT5New] = DefineMPWithMaskPatternT5New(false); patterns[kMPAWithMaskPatternNameT5New2] = DefineMPWithMaskPatternT5New(true, true); @@ -605,6 +613,7 @@ bool MultiHeadAttentionFusion::CheckPattern(const EquivPtr &equiv, int *head_num } *head_num = out.at(0); *head_size = out.at(1); + scale_ = 1.0f / sqrtf(*head_size * 1.0f); return true; } @@ -620,10 +629,12 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (pattern_name == kMPAWithMaskPatternNameT5New || pattern_name == kMPAWithMaskTransposePatternNameT5New || pattern_name == kMPAWithMaskPatternNameT5New2) { t5_x_ = true; + scale_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ? 1.0f : scale_; } return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); } - if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2) + if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || + pattern_name == kMPAPatternNamePA) return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), false); return nullptr; } @@ -758,7 +769,7 @@ std::shared_ptr MultiHeadAttentionFusion::CreatePrim(const Equiv if (!CheckPattern(equiv, &head_num, &head_size)) { return nullptr; } - attention_prim->Init(head_num, head_size, t5_x_, cross); + attention_prim->Init(head_num, head_size, t5_x_, cross, scale_); return attention_prim; } diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index ebe365273de2b24443ee432b400879b6f2b98f48..345616ed4aee9b1859074d49ca5bb55d1b446a5a 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -48,7 +48,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { private: // define patterns VectorRef DefineMPWithMaskPattern(bool mask = true) const; - VectorRef DefineMPWithMaskPatternPA() const; + VectorRef DefineMPWithMaskPatternPA(bool mask = true) const; VectorRef DefineMPWithMaskPatternT5() const; VectorRef DefineMPWithMaskPatternT5New(bool transpose = true, bool no_div_flag = false) const; VectorRef DefineMPPatternSwin(bool flag = true) const; @@ -91,7 +91,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { const std::string kMPAWithMaskTransposePatternNameT5New = "MPAWithMaskTransposePatternT5New"; const std::string kMPAPatternNameSwin1 = "MPAPatternNameSwin1"; const std::string kMPAPatternNameSwin2 = "MPAPatternNameSwin2"; - + const std::string kMPAPatternNamePA = "kMPAPatternNamePA"; mutable VarPtr 
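Aside: the scale recorded by CheckPattern above, scale_ = 1.0f / sqrtf(head_size), is the usual scaled-dot-product factor applied to the QK^T logits before softmax (the T5New2 pattern resets it to 1.0f in Process). A minimal standalone numeric sketch of that formula follows; it is not MindSpore code, and the head_size and logit values are hypothetical.

// Standalone sketch: attention scale 1/sqrt(head_size) applied to one QK^T row before softmax.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int head_size = 64;                                              // example value
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_size));   // same formula as CheckPattern

  std::vector<float> logits = {8.0f, 16.0f, 24.0f};                      // hypothetical QK^T row
  float denom = 0.0f;
  for (float& x : logits) {
    x *= scale;                                                          // scaled logits
    denom += std::exp(x);
  }
  for (float x : logits) {
    std::printf("%.4f ", std::exp(x) / denom);                           // softmax over the scaled row
  }
  std::printf("\n");
  return 0;
}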
input_q_{nullptr}; mutable VarPtr input_k_{nullptr}; mutable VarPtr input_v_{nullptr}; @@ -120,6 +120,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { mutable VarPtr k_transpose_{nullptr}; mutable bool t5_x_{false}; + mutable float scale_{true}; }; } // namespace opt } // namespace mindspore diff --git a/third_party/patch/fast_transformer/001-fast_transformer.patch b/third_party/patch/fast_transformer/001-fast_transformer.patch index 8816cc4e9bcd658e9d55dd3e8b6bae5d8541c15c..355db7c1bb0317f4a3571d12b45e48969cce501b 100644 --- a/third_party/patch/fast_transformer/001-fast_transformer.patch +++ b/third_party/patch/fast_transformer/001-fast_transformer.patch @@ -132,7 +132,7 @@ index 8707220..c9369e0 100644 target_link_libraries(trt_fused_multi_head_attention PUBLIC -lcublas -lcudart) set_property(TARGET trt_fused_multi_head_attention PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/CMakeLists.txt b/CMakeLists.txt -index ea21014..f9e08b8 100644 +index ea21014..e3d61e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,9 @@ @@ -199,7 +199,7 @@ index ea21014..f9e08b8 100644 $ $ $ -+ $ ++ $ $ $ $ @@ -221,7 +221,7 @@ index ea21014..f9e08b8 100644 - $ - $ - $ -+ $ ++ $ $ - $ $ @@ -351,10 +351,10 @@ index a60983c..45b5374 100644 diff --git a/deploy.sh b/deploy.sh new file mode 100755 -index 0000000..ac54401 +index 0000000..0e60c1a --- /dev/null +++ b/deploy.sh -@@ -0,0 +1,32 @@ +@@ -0,0 +1,27 @@ +#copy cuda folder (once) +base=`git rev-parse --show-toplevel` +server=10.10.10.174 @@ -371,16 +371,11 @@ index 0000000..ac54401 +shift +rsync -v ${file} ${server}:${file} +echo "file=${file}" -+rsync -v ${base}/../mindspore/trc/transformer/*.fp32 ${server}:${base}/build/bin ++rsync -v ${base}/../mindspore/trc/transformer/*.fp* ${server}:${base}/build/bin +rsync -v ${base}/build/lib/*.so ${server}:${base}/build/lib +# echo "cd ${base}/build/bin/" +command=$(cat <<-ENDM -+<<<<<<< HEAD -+ CUDA_VISIBLE_DEVICES=0 \ -+ NVIDIA_TF32_OVERRIDE=0 \ -+======= + CUDA_VISIBLE_DEVICES=3 \ -+>>>>>>> origin/bert1 + LD_LIBRARY_PATH=${base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 \ + ${file} $@ +ENDM @@ -427,7 +422,7 @@ index cacb09e..5fec0c9 100644 else if (std::is_same::value) { diff --git a/examples/cpp/ms/CMakeLists.txt b/examples/cpp/ms/CMakeLists.txt new file mode 100644 -index 0000000..eb47b5c +index 0000000..33e562b --- /dev/null +++ b/examples/cpp/ms/CMakeLists.txt @@ -0,0 +1,22 @@ @@ -448,23 +443,23 @@ index 0000000..eb47b5c +add_executable(ms_benchmark ms.cc) +if (SPARSITY_SUPPORT) +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart -lcusparse -lcusparseLt GptContextAttentionLayer MSLayer) +else() +# target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart transformer-shared) -+target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer EncoderLayer) ++target_link_libraries(ms_benchmark PUBLIC -lcublas -lcublasLt -lcudart GptContextAttentionLayer MSLayer) +endif() diff --git a/examples/cpp/ms/initialize.h b/examples/cpp/ms/initialize.h new file mode 100644 -index 0000000..9bcf4eb +index 0000000..8ee1c95 --- /dev/null +++ b/examples/cpp/ms/initialize.h -@@ -0,0 +1,643 @@ +@@ -0,0 +1,969 @@ +#pragma once + -+#include 
"src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" +using namespace fastertransformer; +struct opt_arg { + size_t batch_size; @@ -476,7 +471,11 @@ index 0000000..9bcf4eb + size_t size_per_head; + float eps1; + float eps2; ++ float eps3; ++ bool position_bias1; ++ bool position_bias2; + bool post_layernorm_residual; ++ bool is_ffn_fp16; + bool is_remove_padding; + std::string model_name; + std::string compute_type; @@ -491,7 +490,7 @@ index 0000000..9bcf4eb + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseAttentionLayer* Attn; ++ MSBaseLayer* Attn; + // +}; +template @@ -501,17 +500,29 @@ index 0000000..9bcf4eb + std::vector output_tensors; // GPU + std::vector output_python_tensors; // CPU + std::vector w_tensors; -+ BaseEncoderLayer* Encoder; ++ MSBaseLayer* Encoder; ++ // ++}; ++template ++struct DecriptorDecoderLayer { ++ std::vector input_tensors; // GPU ++ std::vector input_python_tensors; // CPU ++ std::vector output_tensors; // GPU ++ std::vector output_python_tensors; // CPU ++ std::vector w_tensors; ++ MSBaseLayer* Decoder; + // +}; -+ +typedef enum { -+ MHA_X1 = 1, // AttnIn + AttnMask -+ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask -+ MHA_CROSS, // AttnIn + EncOut + AttnMAsk -+ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias -+ MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias -+ TEL, // transformer encoder layer ++ MHA_X1 = 1, // AttnIn + AttnMask ++ MHA_X2, // AttnIn + EncOut -- same seq size + AttnMask ++ MHA_CROSS, // AttnIn + EncOut + AttnMAsk ++ MHA_T5, // AttnIn + EncOut + AttnMAsk + position_bias ++ MHA_T5_CROSS, // AttnIn + EncOut + AttnMAsk + position_bias ++ TEL, // transformer encoder layer ++ TEL_T5, // transformer encoder layer ++ TDL, ++ TDL_T5, +} MODEL_TEST_ID_E; + +int ModelNum(std::string model_name) @@ -527,11 +538,23 @@ index 0000000..9bcf4eb + } + else if (model_name == "mha_T5") { + return MHA_T5; -+ } else if (model_name == "mha_T5_cross") { ++ } ++ else if (model_name == "mha_T5_cross") { + return MHA_T5_CROSS; -+ } else if (model_name == "transformer_encoder_layer") { ++ } ++ else if (model_name == "transformer_encoder_layer") { + return TEL; -+ } else { ++ } ++ else if (model_name == "transformer_encoder_layer_t5") { ++ return TEL_T5; ++ } ++ else if (model_name == "transformer_decoder_layer") { ++ return TDL; ++ } ++ else if (model_name == "transformer_decoder_layer_t5") { ++ return TDL_T5; ++ } ++ else { + return -1; + } +} @@ -547,37 +570,29 @@ index 0000000..9bcf4eb + + // TODO Nizzan - check if need to be + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ 
allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); + // desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, + // getTensorType(), + // std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, @@ -630,23 +645,21 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); + + // GPU RESULTS + desc.output_tensors.push_back(Tensor{ @@ -695,55 +708,53 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ false); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ false); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_tensors.push_back(Tensor{ -+ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_GPU, 
getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size*opt_a->seq_len, hidden_units}, -+ 0}); + desc.input_python_tensors.push_back(Tensor{ -+ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size* opt_a->tgt_seq_len, hidden_units}, 0}); ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); + -+ + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); -+ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -764,68 +775,67 @@ index 0000000..9bcf4eb + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ false, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ 
std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->seq_len}, 0}); + ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // 
MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); @@ -833,88 +843,89 @@ index 0000000..9bcf4eb + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); +} + -+template ++template +void InitializeAttnT5Cross(opt_arg* opt_a, -+ DecriptorTest &desc, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ Allocator* allocator) { ++ DecriptorTest& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ Allocator* allocator) ++{ + const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; + + desc.Attn = new MSMHALayer(opt_a->batch_size, -+ opt_a->seq_len, -+ opt_a->tgt_seq_len, -+ opt_a->head_num, -+ opt_a->size_per_head, -+ stream, -+ cublas_wrapper, -+ allocator, -+ false, // free buffer after fwd -+ true, // is_qk_buf_float_ -+ true, //is_cross -+ false, // sparse -+ true); // is_position_bias ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ stream, ++ cublas_wrapper, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ true, // is_cross ++ false, // sparse ++ true); // is_position_bias + -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); + + desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_tensors.push_back(Tensor{MEMORY_GPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->seq_len,hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); -+ -+ desc.input_python_tensors.push_back(Tensor{MEMORY_CPU, -+ getTensorType(), -+ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, -+ 0}); ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_tensors.push_back( ++ Tensor{MEMORY_GPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), 
std::vector{opt_a->batch_size * opt_a->seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size * opt_a->tgt_seq_len, hidden_units}, 0}); ++ ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, 1, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + ++ desc.input_python_tensors.push_back( ++ Tensor{MEMORY_CPU, ++ getTensorType(), ++ std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, ++ 0}); + + // GPU RESULTS + + desc.output_tensors.push_back(Tensor{ + MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_tensors.push_back(Tensor{ -+ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len},0}); ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len},0}); + + desc.output_python_tensors.push_back(Tensor{ + MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, hidden_units}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, opt_a->size_per_head}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->tgt_seq_len, ++ // opt_a->size_per_head}, 0}); + // desc.output_python_tensors.push_back(Tensor{ -+ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, ++ // opt_a->tgt_seq_len}, 0}); + + desc.w_tensors.push_back( + Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0}); @@ -944,6 +955,8 @@ index 0000000..9bcf4eb + opt_a->eps1, + opt_a->eps2, + opt_a->post_layernorm_residual, ++ false, ++ opt_a->is_ffn_fp16, + stream, + cublas_wrapper, + cublas_handle, @@ -985,6 +998,256 @@ index 0000000..9bcf4eb + Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->ffn_hidden_size, opt_a->hidden_size}, 0}); + desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0}); +} ++template ++void InitializeEncoderT5(opt_arg* opt_a, ++ DecriptorEncoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ // const size_t hidden_units = opt_a->head_num * 
opt_a->size_per_head; ++ const size_t hidden_units = opt_a->hidden_size; ++ // TODO Nizzan - check if need to be ++ desc.Encoder = new MSELayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->post_layernorm_residual, ++ true, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0}); //g1 ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, 3 * hidden_units}, 0}); //wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{hidden_units, hidden_units}, 0});//wp ++ desc.w_tensors.push_back( ++ Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size}, 0});//g2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->hidden_size, opt_a->ffn_hidden_size}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{opt_a->ffn_hidden_size, opt_a->hidden_size}, 0}); ++ ++ ++} ++ ++template ++void InitializeDecoder(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ 
desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //B1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ 3 * hidden_units}, 0});//bt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0});//bt2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units * 3}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units}, 0});//bp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//b3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->ffn_hidden_size}, 0});//bm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), 
std::vector{ ++ opt_a->hidden_size}, 0});//bp ++} ++template ++void InitializeDecoderT5(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ const size_t hidden_units = opt_a->head_num * opt_a->size_per_head; ++ std::cout<<"hidden_units: "< ++ desc.Decoder = new MSDLayer(opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->tgt_seq_len, ++ opt_a->head_num, ++ opt_a->size_per_head, ++ opt_a->ffn_hidden_size, ++ opt_a->eps1, ++ opt_a->eps2, ++ opt_a->eps3, ++ opt_a->post_layernorm_residual, ++ opt_a->position_bias1, ++ opt_a->position_bias2, ++ opt_a->is_ffn_fp16, ++ stream, ++ cublas_wrapper, ++ cublas_handle, ++ allocator, ++ false, // free buffer after fwd ++ true, // is_qk_buf_float_ ++ false); // sparse ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU,getTensorType(),std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->hidden_size},0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->tgt_seq_len, opt_a->seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ desc.input_python_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->head_num, opt_a->seq_len, opt_a->tgt_seq_len}, 0}); ++ ++ // desc.output_tensors.push_back(Tensor{ ++ // MEMORY_GPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ // desc.output_python_tensors.push_back(Tensor{ ++ // MEMORY_CPU, getTensorType(), std::vector{opt_a->batch_size, opt_a->seq_len, opt_a->hidden_size}, 0}); ++ ++ desc.output_tensors.push_back(Tensor{ ++ MEMORY_GPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.output_python_tensors.push_back(Tensor{ ++ MEMORY_CPU, getTensorType(), std::vector{640/4}, 0}); ++ ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0}); //G1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, 3 * hidden_units}, 0});//wt ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 
0});//wp ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g1 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units , hidden_units * 2}, 0}); ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ hidden_units, hidden_units}, 0});//wp2 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size}, 0});//g3 ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});//wm ++ desc.w_tensors.push_back(Tensor{MEMORY_GPU, getTensorType(), std::vector{ ++ opt_a->hidden_size, opt_a->ffn_hidden_size}, 0});;//wp ++} + +template +void Init(opt_arg* opt_a, @@ -999,32 +1262,16 @@ index 0000000..9bcf4eb + InitializeAttn(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_X2: -+ InitializeAttnX2(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnX2(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_CROSS: -+ InitializeAttnCross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnCross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5: -+ InitializeAttnT5(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5(opt_a, desc, stream, cublas_wrapper, allocator); + break; + case MHA_T5_CROSS: -+ InitializeAttnT5Cross(opt_a, -+ desc, -+ stream, -+ cublas_wrapper, -+ allocator); ++ InitializeAttnT5Cross(opt_a, desc, stream, cublas_wrapper, allocator); + break; + default: + break; @@ -1043,13 +1290,37 @@ index 0000000..9bcf4eb + case TEL: + InitializeEncoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); + break; ++ case TEL_T5: ++ InitializeEncoderT5(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; ++ default: ++ break; ++ } ++} ++ ++template ++void InitD(opt_arg* opt_a, ++ DecriptorDecoderLayer& desc, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ Allocator* allocator) ++{ ++ int model_num = ModelNum(opt_a->model_name); ++ switch (model_num) { ++ case TDL: ++ InitializeDecoder(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; ++ case TDL_T5: ++ InitializeDecoderT5(opt_a, desc, stream, cublas_wrapper, cublas_handle, allocator); ++ break; + default: + break; + } +} + +template -+void InitWeight(opt_arg* opt_a, AttentionWeight& attn_weights, std::vector w_tensors) ++void InitWeight(opt_arg* opt_a, AttentionLayerWeight& attn_weights, std::vector w_tensors) +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == MHA_X1) { @@ -1064,18 +1335,21 @@ index 0000000..9bcf4eb + attn_weights.key_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[3].data; + attn_weights.attention_output_weight.bias = (const T*)w_tensors[4].data; -+ } else if (modelId==MHA_T5) { ++ } ++ else if (modelId == MHA_T5) { + attn_weights.query_weight.kernel = (const T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else if (modelId==MHA_T5_CROSS) { ++ } ++ else if (modelId == MHA_T5_CROSS) { + attn_weights.query_weight.kernel = (const 
T*)w_tensors[0].data; + attn_weights.query_weight.bias = nullptr; + attn_weights.key_weight.kernel = (const T*)w_tensors[1].data; + attn_weights.attention_output_weight.kernel = (const T*)w_tensors[2].data; + attn_weights.attention_output_weight.bias = nullptr; -+ } else { ++ } ++ else { + // return ERROR illegal model ! + } +} @@ -1085,10 +1359,10 @@ index 0000000..9bcf4eb +{ + int modelId = ModelNum(opt_a->model_name); + if (modelId == TEL) { -+ encoder_weights.qkv_weight.kernel = (const T*)w_tensors[2].data; -+ encoder_weights.qkv_weight.bias = (const T*)w_tensors[3].data; -+ encoder_weights.attention_layer_output_weight.kernel = (const T*)w_tensors[4].data; -+ encoder_weights.attention_layer_output_weight.bias = (const T*)w_tensors[5].data; ++ encoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ encoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ encoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ encoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; + encoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; + encoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; + encoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; @@ -1098,16 +1372,63 @@ index 0000000..9bcf4eb + encoder_weights.encoder_output_mapping.bias = (const T*)w_tensors[9].data; + encoder_weights.encoder_output_projection.bias = (const T*)w_tensors[11].data; + } ++ else if (modelId == TEL_T5){ ++ encoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ encoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[3].data; ++ encoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ encoder_weights.layernorm2.gamma = (const T*)w_tensors[4].data; ++ encoder_weights.encoder_output_mapping.kernel = (const T*)w_tensors[5].data; ++ encoder_weights.encoder_output_projection.kernel = (const T*)w_tensors[6].data; ++ } ++} ++template ++void InitWeightDecoder(opt_arg* opt_a, DecoderLayerWeight& decoder_weights, std::vector w_tensors) ++{ ++ int modelId = ModelNum(opt_a->model_name); ++ if (modelId == TDL) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.layernorm1.beta = (const T*)w_tensors[1].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.attention.query_weight.bias = (const T*)w_tensors[3].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.attention.attention_output_weight.bias = (const T*)w_tensors[5].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm2.beta = (const T*)w_tensors[7].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[9].data; ++ decoder_weights.cross_attention.query_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.key_weight.bias = (const T*)w_tensors[10].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[11].data; ++ decoder_weights.cross_attention.attention_output_weight.bias = (const T*)w_tensors[12].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[13].data; ++ decoder_weights.layernorm3.beta = (const T*)w_tensors[14].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[15].data; ++ 
decoder_weights.decoder_output_mapping.bias = (const T*)w_tensors[16].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[17].data; ++ decoder_weights.decoder_output_projection.bias = (const T*)w_tensors[18].data; ++ } ++ else if (modelId == TDL_T5) { ++ decoder_weights.layernorm1.gamma = (const T*)w_tensors[0].data; ++ decoder_weights.attention.query_weight.kernel = (const T*)w_tensors[1].data; ++ decoder_weights.attention.attention_output_weight.kernel = (const T*)w_tensors[2].data; ++ decoder_weights.layernorm2.gamma = (const T*)w_tensors[3].data; ++ decoder_weights.cross_attention.query_weight.kernel = (const T*)w_tensors[4].data; ++ decoder_weights.cross_attention.key_weight.kernel = (const T*)w_tensors[5].data; ++ decoder_weights.cross_attention.attention_output_weight.kernel = (const T*)w_tensors[6].data; ++ decoder_weights.layernorm3.gamma = (const T*)w_tensors[7].data; ++ decoder_weights.decoder_output_mapping.kernel = (const T*)w_tensors[8].data; ++ decoder_weights.decoder_output_projection.kernel = (const T*)w_tensors[9].data; ++ } + else { + // return ERROR illegal model ! + } +} diff --git a/examples/cpp/ms/ms.cc b/examples/cpp/ms/ms.cc new file mode 100644 -index 0000000..2b12bd5 +index 0000000..4ad059a --- /dev/null +++ b/examples/cpp/ms/ms.cc -@@ -0,0 +1,591 @@ +@@ -0,0 +1,671 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * @@ -1124,10 +1445,10 @@ index 0000000..2b12bd5 + * limitations under the License. + */ +#include "examples/cpp/ms/initialize.h" -+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h" -+#include "src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/attention_layers/MSLayerWeight.h" ++// #include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++// #include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" +#include "src/fastertransformer/utils/logger.h" +#include +#include @@ -1147,7 +1468,7 @@ index 0000000..2b12bd5 +bool read_args(int argc, char* argv[], opt_arg* opt_a) +{ + int opt; -+ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:e1:e2")) != -1) { ++ while ((opt = getopt(argc, argv, "b:l:s:t:H:S:p:m:T:W:F:i:w:f:P:x:1:2:3")) != -1) { + switch (opt) { + case 'b': + opt_a->batch_size = atoi(optarg); @@ -1188,6 +1509,9 @@ index 0000000..2b12bd5 + case '2': + opt_a->eps2 = atoi(optarg); + break; ++ case '3': ++ opt_a->eps3 = atoi(optarg); ++ break; + case 'P': + if (atoi(optarg) == 1) + opt_a->post_layernorm_residual=true; @@ -1197,8 +1521,14 @@ index 0000000..2b12bd5 + case 'p': + opt_a->is_remove_padding = bool(optarg); + break; -+ case 'i': -+ case 'w': ++ case 'x': ++ if (atoi(optarg) == 1) ++ opt_a->is_ffn_fp16=true; ++ else if (atoi(optarg) == 0) ++ opt_a->is_ffn_fp16=false; ++ break; ++ case 'i': ++ case 'w': + break; + case 'h': + default: @@ -1227,13 +1557,14 @@ index 0000000..2b12bd5 + opt_a.ffn_hidden_size = -1; + opt_a.eps1 = 1e-6f; + opt_a.eps2 = 1e-6f; ++ opt_a.eps3 = 1e-6f; + opt_a.post_layernorm_residual = true; + opt_a.is_remove_padding = false; + opt_a.model_name = ""; + opt_a.compute_type = "fp32"; + opt_a.w_compute_type = "fp32"; + opt_a.s_compute_type = "fp32"; -+ ++ opt_a.is_ffn_fp16 = false; + + if (read_args(argc, argv, &opt_a)) { 
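Aside on the switches parsed by read_args above: the visible cases add -x (is_ffn_fp16), -P (post-layernorm residual) and numeric switches for the layernorm eps values, but the eps arguments are read with atoi, so fractional values such as 1e-6 truncate to zero, and the optstring appears to end in "3" without the trailing colon an argument-taking switch needs. The self-contained sketch below shows the same getopt pattern with strtof and colons on all three eps switches; the struct name, flag letters and the assumption that '1' maps to eps1 are illustrative only.

// Sketch of the option-handling pattern, with hypothetical flag letters.
#include <cstdio>
#include <cstdlib>
#include <unistd.h>

struct Args {
  float eps1 = 1e-6f, eps2 = 1e-6f, eps3 = 1e-6f;
  bool post_layernorm_residual = true;
  bool is_ffn_fp16 = false;
};

int main(int argc, char* argv[]) {
  Args a;
  int opt;
  while ((opt = getopt(argc, argv, "1:2:3:P:x:")) != -1) {
    switch (opt) {
      case '1': a.eps1 = strtof(optarg, nullptr); break;  // strtof keeps fractional eps values
      case '2': a.eps2 = strtof(optarg, nullptr); break;
      case '3': a.eps3 = strtof(optarg, nullptr); break;
      case 'P': a.post_layernorm_residual = atoi(optarg) == 1; break;
      case 'x': a.is_ffn_fp16 = atoi(optarg) == 1; break;
      default: std::fprintf(stderr, "unknown option\n"); return 1;
    }
  }
  std::printf("eps1=%g eps2=%g eps3=%g post_ln=%d ffn_fp16=%d\n",
              a.eps1, a.eps2, a.eps3, a.post_layernorm_residual, a.is_ffn_fp16);
  return 0;
}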
+ bool c_type_fp32 = (opt_a.compute_type.compare("fp32") == 0); @@ -1545,75 +1876,64 @@ index 0000000..2b12bd5 + cublas_wrapper.setFP32GemmConfig(); + } + } -+ -+ if (opt_a->model_name != "transformer_encoder_layer") { -+ DecriptorTest desc; -+ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ if(opt_a->model_name == "transformer_decoder_layer" || opt_a->model_name == "transformer_decoder_layer_t5") { ++ DecriptorDecoderLayer desc; ++ InitD(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); + FT_CHECK(!res); + res = ReadTensors(desc.input_python_tensors, std::string("input"), opt_a); + FT_CHECK(!res); -+ + res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); + FT_CHECK(!res); -+ + res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); + FT_CHECK(!res); -+ + res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); + FT_CHECK(!res); -+ -+ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; -+ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; -+ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; -+ -+ AttentionWeight attn_weights; -+ InitWeight(opt_a, attn_weights, desc.w_tensors); -+ -+ // test for BE !! -+ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+ ++ DecoderLayerWeight decoder_weights; ++ InitWeightDecoder(opt_a, decoder_weights, desc.w_tensors); ++ // // test for BE !! ++ desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); + + CompareOutput(desc.output_python_tensors, desc.output_tensors); -+ -+// #define DO_TIME -+// #ifdef DO_TIME -+// // warmup -+// for (int i = 0; i < 10; i++) { -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// } -+// // profile time -+// const int ite = 1000; -+// CudaTimer cuda_timer(stream); -+// cuda_timer.start(); ++#define DO_TIME ++#ifdef DO_TIME ++ // warmup ++ for (int i = 0; i < 10; i++) { ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ } ++ // profile time ++ const int ite = 1000; ++ CudaTimer cuda_timer(stream); ++ cuda_timer.start(); + -+// for (int i = 0; i < ite; i++) { -+// for (int i = 0; i < desc.input_tensors.size(); i++) { -+// int size = desc.input_tensors[i].size(); -+// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), -+// size); -+// } ++ for (int i = 0; i < ite; i++) { ++ // for (int i = 0; i < desc.input_tensors.size(); i++) { ++ // int size = desc.input_tensors[i].size(); ++ // cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++ // size); ++ // } + -+// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); -+// for (int i = 0; i < desc.output_tensors.size(); i++) { -+// int size = desc.output_tensors[i].size(); -+// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), -+// const_cast(reinterpret_cast(desc.output_tensors[i].data)), -+// size); -+// } -+// } -+// float total_time = cuda_timer.stop(); -+// printf("batch_size %ld seq_len %ld layer %ld " -+// "AVG FT-CPP-time %.2f ms (%d iterations) " -+// "Total Time %.2f ms\n", -+// opt_a->batch_size, -+// opt_a->seq_len, -+// opt_a->num_layers, 
-+// total_time / ite, -+// ite, -+// total_time); -+// #endif ++ // desc.Decoder->forward(&desc.output_tensors, &desc.input_tensors, &decoder_weights); ++ // for (int i = 0; i < desc.output_tensors.size(); i++) { ++ // int size = desc.output_tensors[i].size(); ++ // cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++ // const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++ // size); ++ // } ++ } ++ float total_time = cuda_timer.stop(); ++ ++ printf("batch_size %ld seq_len %ld layer %ld " ++ "AVG FT-CPP-time %.2f ms (%d iterations) " ++ "Total Time %.2f ms\n", ++ opt_a->batch_size, ++ opt_a->seq_len, ++ opt_a->num_layers, ++ total_time / ite, ++ ite, ++ total_time); ++#endif + +#ifdef SPARSITY_ENABLED + cusparseLtDestroy(&cusparselt_handle); @@ -1624,8 +1944,10 @@ index 0000000..2b12bd5 + FreeDesc(desc.input_tensors); + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); ++ return 0; + } -+ else { ++ else if (opt_a->model_name == "transformer_encoder_layer"|| opt_a->model_name == "transformer_encoder_layer_t5") { ++ + DecriptorEncoderLayer desc; + InitE(opt_a, desc, stream, &cublas_wrapper, &cublas_handle, &allocator); + int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); @@ -1697,6 +2019,85 @@ index 0000000..2b12bd5 + FreeDesc(desc.output_python_tensors); + FreeDesc(desc.w_tensors); + } ++ else { ++ DecriptorTest desc; ++ Init(opt_a, desc, stream, &cublas_wrapper, &allocator); ++ int res = ReadTensors(desc.input_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ res = ReadTensors(desc.input_python_tensors, std::string("input"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_tensors, std::string("output"), opt_a, false); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.output_python_tensors, std::string("output"), opt_a); ++ FT_CHECK(!res); ++ ++ res = ReadTensors(desc.w_tensors, std::string("weight"), opt_a); ++ FT_CHECK(!res); ++ ++ std::cout << "inputs size not encoder: " << CalcTensorsSize(desc.input_tensors) << std::endl; ++ std::cout << "weights size not encoder: " << CalcTensorsSize(desc.w_tensors) << std::endl; ++ std::cout << "ouputs size not encoder: " << CalcTensorsSize(desc.output_tensors) << std::endl; ++ ++ AttentionLayerWeight attn_weights; ++ InitWeight(opt_a, attn_weights, desc.w_tensors); ++ ++ // test for BE !! 
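// [editor's note] Illustrative sketch, not part of the patch: the DO_TIME block above follows the
// usual "warm up, then time N iterations" pattern through the repo's CudaTimer helper. The same
// measurement can be written with plain CUDA events; TimeLayerMs and RunLayer below are
// hypothetical stand-ins for desc.Decoder->forward(...).
#include <cuda_runtime.h>
#include <cstdio>
#include <functional>

inline float TimeLayerMs(cudaStream_t stream, const std::function<void()>& RunLayer,
                         int warmup = 10, int iters = 1000) {
  for (int i = 0; i < warmup; ++i) RunLayer();   // warm-up iterations, not measured
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, stream);
  for (int i = 0; i < iters; ++i) RunLayer();    // measured region
  cudaEventRecord(stop, stream);
  cudaEventSynchronize(stop);
  float total_ms = 0.f;
  cudaEventElapsedTime(&total_ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  printf("AVG time %.2f ms (%d iterations), total %.2f ms\n", total_ms / iters, iters, total_ms);
  return total_ms;
}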
++ desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++ ++ ++ CompareOutput(desc.output_python_tensors, desc.output_tensors); ++ ++// #define DO_TIME ++// #ifdef DO_TIME ++// // warmup ++// for (int i = 0; i < 10; i++) { ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// } ++// // profile time ++// const int ite = 1000; ++// CudaTimer cuda_timer(stream); ++// cuda_timer.start(); ++ ++// for (int i = 0; i < ite; i++) { ++// for (int i = 0; i < desc.input_tensors.size(); i++) { ++// int size = desc.input_tensors[i].size(); ++// cudaH2Dcpy(const_cast(reinterpret_cast(desc.input_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.input_python_tensors[i].data)), ++// size); ++// } ++ ++// desc.Attn->forward(&desc.output_tensors, &desc.input_tensors, &attn_weights); ++// for (int i = 0; i < desc.output_tensors.size(); i++) { ++// int size = desc.output_tensors[i].size(); ++// cudaD2Hcpy(const_cast(reinterpret_cast(desc.output_python_tensors[i].data)), ++// const_cast(reinterpret_cast(desc.output_tensors[i].data)), ++// size); ++// } ++// } ++// float total_time = cuda_timer.stop(); ++// printf("batch_size %ld seq_len %ld layer %ld " ++// "AVG FT-CPP-time %.2f ms (%d iterations) " ++// "Total Time %.2f ms\n", ++// opt_a->batch_size, ++// opt_a->seq_len, ++// opt_a->num_layers, ++// total_time / ite, ++// ite, ++// total_time); ++// #endif ++ ++#ifdef SPARSITY_ENABLED ++ cusparseLtDestroy(&cusparselt_handle); ++#endif ++ delete cublas_algo_map; ++ delete cublas_wrapper_mutex; ++ FreeDesc(desc.output_tensors); ++ FreeDesc(desc.input_tensors); ++ FreeDesc(desc.output_python_tensors); ++ FreeDesc(desc.w_tensors); ++ } + return 0; +} diff --git a/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer b/examples/pytorch/swin/Swin-Transformer-Quantization/SwinTransformer @@ -1806,10 +2207,10 @@ index 7ff8e0f..e1be64c 100644 template void invokeAddBias(float* out, const float* bias, const int m, const int n, cudaStream_t stream); diff --git a/src/fastertransformer/kernels/add_residual_kernels.cu b/src/fastertransformer/kernels/add_residual_kernels.cu -index 4cd9f0f..1bf2be3 100644 +index 4cd9f0f..42c9216 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.cu +++ b/src/fastertransformer/kernels/add_residual_kernels.cu -@@ -29,6 +29,18 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const +@@ -29,6 +29,30 @@ __global__ void addBiasResidual(T* output, const T* input, const T* bias, const } } @@ -1824,11 +2225,23 @@ index 4cd9f0f..1bf2be3 100644 + (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); + } +} ++ ++template ++__global__ void addBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n) ++{ ++ S *out_cast = (S*)out; ++ const int col_index = blockIdx.y * blockDim.x + threadIdx.x; ++ if (col_index < n) { ++ T bias_val = (bias == nullptr) ? 
(T)(0.0f) : bias[col_index]; ++ out_cast[blockIdx.x * n + col_index] = ++ (S)((T)output[blockIdx.x * n + col_index] + (T)input[blockIdx.x * n + col_index] + bias_val); ++ } ++} + template void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m, const int n, cudaStream_t stream) { -@@ -38,6 +50,20 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m +@@ -38,6 +62,31 @@ void invokeAddBiasResidual(T* output, const T* input, const T* bias, const int m addBiasResidual<<>>(output, input, bias, m, n); } @@ -1841,15 +2254,26 @@ index 4cd9f0f..1bf2be3 100644 + addBiasResidualCast<<>>(output, input, out, bias, m, n); +} + ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream) ++{ ++ int blocks_per_row = ceil(float(n) / 1024); ++ dim3 grid(m, blocks_per_row); ++ dim3 block(min(n, 1024)); ++ addBiasResidualSameTypeCast<<>>(output, input, out, bias, m, n); ++} ++ +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(float* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); +template void invokeAddBiasResidualCast(half* output, const float* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); ++ ++template void invokeAddBiasResidualSameTypeCast(half* output, const half* input, float* out, const float* bias, const int m, const int n, cudaStream_t stream); + template __global__ void addBiasAttentionFfnResidual(T* block_output, const T* ffn_output, -@@ -88,11 +114,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, +@@ -88,11 +137,9 @@ void invokeAddBiasAttentionFfnResidual(T* block_output, } } @@ -1864,7 +2288,7 @@ index 4cd9f0f..1bf2be3 100644 #ifdef ENABLE_BF16 template void invokeAddBiasResidual(__nv_bfloat16* output, diff --git a/src/fastertransformer/kernels/add_residual_kernels.h b/src/fastertransformer/kernels/add_residual_kernels.h -index edd8179..7ab8eb4 100644 +index edd8179..afa5a77 100644 --- a/src/fastertransformer/kernels/add_residual_kernels.h +++ b/src/fastertransformer/kernels/add_residual_kernels.h @@ -27,6 +27,9 @@ namespace fastertransformer { @@ -1877,12 +2301,15 @@ index edd8179..7ab8eb4 100644 template void invokeT5AddResidual(T* output, const T* input, const int m, const int n, cudaStream_t stream); -@@ -65,4 +68,8 @@ void invokeAddBiasResidualCol32(T* output, +@@ -65,4 +68,11 @@ void invokeAddBiasResidualCol32(T* output, const float* input1_amax_ptr, const int scale_is_vector = 0); +template +void invokeAddBiasResidualCast(U* output, const T* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); ++ ++template ++void invokeAddBiasResidualSameTypeCast(U* output, const U* input, T* out, const T* bias, const int m, const int n, cudaStream_t stream); + } // namespace fastertransformer + @@ -5489,14 +5916,14 @@ index be8b178..e9b4310 100644 + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/CMakeLists.txt b/src/fastertransformer/layers/CMakeLists.txt -index cbaf4fa..00a46d4 100644 +index cbaf4fa..49779bf 100644 --- a/src/fastertransformer/layers/CMakeLists.txt +++ 
b/src/fastertransformer/layers/CMakeLists.txt @@ -14,6 +14,7 @@ cmake_minimum_required(VERSION 3.8) -+add_subdirectory(encoder_layers) ++add_subdirectory(ms_layers) add_subdirectory(attention_layers) add_subdirectory(attention_layers_int8) add_subdirectory(xlnet_attention_layers) @@ -5551,7 +5978,7 @@ index b21e3a7..746cb71 100644 cublasMMWrapper* cublas_wrapper, IAllocator* allocator, diff --git a/src/fastertransformer/layers/attention_layers/CMakeLists.txt b/src/fastertransformer/layers/attention_layers/CMakeLists.txt -index 9cef315..f9c9cde 100644 +index 9cef315..7170af4 100644 --- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt +++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt @@ -42,8 +42,8 @@ target_link_libraries(DecoderSelfAttentionLayer PUBLIC -lcublas -lcudart cublasM @@ -5560,7 +5987,7 @@ index 9cef315..f9c9cde 100644 set_property(TARGET GptContextAttentionLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels) - -+target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels EncoderLayer) ++target_link_libraries(GptContextAttentionLayer PUBLIC -lcublas -lcudart cublasMMWrapper memory_utils unfused_attention_kernels activation_kernels) +if(EXAMPLES) add_library(TensorParallelDecoderSelfAttentionLayer STATIC TensorParallelDecoderSelfAttentionLayer.cc) set_property(TARGET TensorParallelDecoderSelfAttentionLayer PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -5576,7 +6003,7 @@ index 9cef315..f9c9cde 100644 diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc old mode 100644 new mode 100755 -index bada640..3dca224 +index bada640..2415ac2 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc @@ -16,10 +16,39 @@ @@ -5645,159 +6072,157 @@ index bada640..3dca224 sync_check_cuda_error(); T scalar = 1 / sqrtf(size_per_head_ * 1.0f); invokeMaskedSoftMax(qk_buf_, -@@ -428,4 +456,148 @@ template class GptContextAttentionLayer; +@@ -428,4 +456,146 @@ template class GptContextAttentionLayer; template class GptContextAttentionLayer<__nv_bfloat16>; #endif +// HAIM Playground MS-MHA + -+template -+MSMHALayer::MSMHALayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse, -+ bool is_position_bias): -+ BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) -+{ -+cublasHandle_t cublas_handle; -+ cublasCreate(&cublas_handle); -+ cublasSetStream(cublas_handle, stream); -+ -+ params_.batch_size = max_batch_size; -+ params_.src_seq_len = max_src_seq_len; -+ params_.tgt_seq_len = max_tgt_seq_len; -+ params_.head_num = head_num; -+ params_.head_size = size_per_head; -+ params_.hidden_size = head_num * size_per_head; -+ params_.cublas_handle = cublas_handle; -+ params_.stream = stream; -+ // ctrls -+ params_.in_idx = 0; -+ params_.qkv_bias = !is_position_bias; -+ params_.projection_bias = !is_position_bias; -+ params_.is_cross = is_cross; -+ params_.position_bias = is_position_bias; 
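// [editor's note] Illustrative sketch, not part of the patch: the MSMHALayer constructor touched by
// this hunk mostly copies its arguments into a parameter struct; the only derived quantities are
// hidden_size = head_num * head_size and the 1/sqrt(head_size) softmax scale used by the masked
// softmax above. The struct and helper below are hypothetical stand-ins for the repo's types.
#include <cmath>
#include <cstddef>

struct AttnShapeParam {
  size_t batch_size, src_seq_len, tgt_seq_len;
  size_t head_num, head_size, hidden_size;
  float softmax_scale;
  bool qkv_bias, projection_bias, is_cross, position_bias;
};

inline AttnShapeParam MakeAttnShapeParam(size_t batch, size_t src_len, size_t tgt_len,
                                         size_t head_num, size_t head_size,
                                         bool is_cross, bool is_position_bias) {
  AttnShapeParam p{};
  p.batch_size = batch;
  p.src_seq_len = src_len;
  p.tgt_seq_len = tgt_len;
  p.head_num = head_num;
  p.head_size = head_size;
  p.hidden_size = head_num * head_size;  // model width
  p.softmax_scale = 1.0f / std::sqrt(static_cast<float>(head_size));
  p.qkv_bias = !is_position_bias;        // T5-style relative position bias drops the linear biases
  p.projection_bias = !is_position_bias;
  p.is_cross = is_cross;
  p.position_bias = is_position_bias;
  return p;
}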
-+ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; -+} -+ -+template -+void MSMHALayer::allocateBuffer() -+{ -+ if (buf_ == nullptr) { -+ size_t buff_size = GetAttnWorkspaceSize(¶ms_); -+ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); -+ } -+} -+ -+template -+void MSMHALayer::forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) -+{ -+ // input_tensors: use 1 gemm -- multi head attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] -+ -+ // input_tensors: use 2 gemm -- cross attention -+ // input_query [batch_size * seq_len, hidden_dimension] -+ // enc_output [batch_size * tgt_len, hidden_dimension] -+ // attention_mask [batch_size, 1, seq_len, seq_len] ++// template ++// MSMHALayer::MSMHALayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse, ++// bool is_position_bias): ++// BaseAttentionLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++// { ++// cublasHandle_t cublas_handle; ++// cublasCreate(&cublas_handle); ++// cublasSetStream(cublas_handle, stream); ++ ++// // params_.batch_size = max_batch_size; ++// // params_.src_seq_len = max_src_seq_len; ++// // params_.tgt_seq_len = max_tgt_seq_len; ++// // params_.head_num = head_num; ++// // params_.head_size = size_per_head; ++// // params_.hidden_size = head_num * size_per_head; ++// // params_.cublas_handle = cublas_handle; ++// // params_.stream = stream; ++// // // ctrls ++// // params_.in_idx = 0; ++// // params_.qkv_bias = !is_position_bias; ++// // params_.projection_bias = !is_position_bias; ++// // params_.is_cross = is_cross; ++// // params_.position_bias = is_position_bias; ++// // params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++// } ++// template ++// void MSMHALayer::allocateBuffer() ++// { ++// if (buf_ == nullptr) { ++// // size_t buff_size = GetAttnWorkspaceSize(¶ms_); ++// // buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++// } ++// } ++// template ++// void MSMHALayer::forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) ++// { ++// // input_tensors: use 1 gemm -- multi head attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // input_tensors: use 2 gemm -- cross attention ++// // input_query [batch_size * seq_len, hidden_dimension] ++// // enc_output [batch_size * tgt_len, hidden_dimension] ++// // attention_mask [batch_size, 1, seq_len, seq_len] ++ ++// // output_tensors: ++// // attention_out [batch_size * seq_len, hidden_dimension] ++// // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] ++// // value_cache [batch, local_head_num, max_seq_len, size_per_head] ++ ++// int in_tensor_number = input_tensors->size(); ++// allocateBuffer(); // only once ++// // if (params_.position_bias) ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // 
(void*)attention_weights->key_weight.kernel, ++// // (void*)input_tensors->at(2).data, ++// // (void*)input_tensors->at(3).data, ++// // (void*)attention_weights->attention_output_weight.kernel}; ++// // forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = { ++// // (void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)input_tensors->at(1).data, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel ++// // }; ++// // forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // if (params_.is_cross) { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->key_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(2).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias ++// // }; ++// // forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// // else { ++// // void* outputs[] = {(void*)output_tensors->at(0).data}; ++// // void* inputs[] = {(void*)input_tensors->at(0).data, ++// // (void*)attention_weights->query_weight.kernel, ++// // (void*)attention_weights->query_weight.bias, ++// // (void*)input_tensors->at(1).data, ++// // (void*)attention_weights->attention_output_weight.kernel, ++// // (void*)attention_weights->attention_output_weight.bias}; ++// // forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++// // } ++// } ++ ++ ++// // template ++// // MSMHALayer::~MSMHALayer() ++// // { ++// // // cublas_wrapper_ = nullptr; ++// // freeBuffer(); ++// // } ++ ++// template ++// void MSMHALayer::freeBuffer() ++// { ++// if (buf_ != nullptr) { ++// allocator_->free(buf_); ++// buf_ = nullptr; ++// } ++// } + -+ // output_tensors: -+ // attention_out [batch_size * seq_len, hidden_dimension] -+ // key_cache [batch, local_head_num, size_per_head // x, max_seq_len, x] -+ // value_cache [batch, local_head_num, max_seq_len, size_per_head] -+ -+ int in_tensor_number = input_tensors->size(); -+ allocateBuffer(); // only once -+ if (params_.position_bias) -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)input_tensors->at(2).data, -+ (void*)input_tensors->at(3).data, -+ (void*)attention_weights->attention_output_weight.kernel}; -+ -+ forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = { -+ (void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)input_tensors->at(1).data, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel -+ }; -+ forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ else { -+ if (params_.is_cross) { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ 
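// [editor's note] Illustrative sketch, not part of the patch: the forward() body kept here in
// commented-out form flattens activations and weights into a void* array whose length and order
// depend on is_cross / position_bias, and the fused path then consumes the pointers positionally
// via params->in_idx. A hypothetical helper that makes the 6-pointer self-attention-with-bias
// ordering explicit:
#include <vector>

// Order must match what the fused attention entry point reads through in_idx.
inline std::vector<void*> PackSelfAttnInputs(void* hidden_states, void* qkv_weight, void* qkv_bias,
                                             void* attention_mask, void* out_proj_weight,
                                             void* out_proj_bias) {
  return {hidden_states, qkv_weight, qkv_bias, attention_mask, out_proj_weight, out_proj_bias};
}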
(void*)input_tensors->at(1).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->key_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(2).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias -+ }; -+ forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } else { -+ void* outputs[] = {(void*)output_tensors->at(0).data}; -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)attention_weights->query_weight.kernel, -+ (void*)attention_weights->query_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)attention_weights->attention_output_weight.kernel, -+ (void*)attention_weights->attention_output_weight.bias}; -+ forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); -+ } -+ } -+} -+ -+template -+MSMHALayer::~MSMHALayer() -+{ -+ cublas_wrapper_ = nullptr; -+ freeBuffer(); -+} -+ -+template -+void MSMHALayer::freeBuffer() -+{ -+ if (buf_ != nullptr) { -+ allocator_->free(buf_); -+ buf_ = nullptr; -+ } -+} -+ -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; -+template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; ++ // template class MSMHALayer; + } // namespace fastertransformer diff --git a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h old mode 100644 new mode 100755 -index 92e2175..f7fa5ca +index 92e2175..39c49c0 --- a/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h +++ b/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h @@ -18,7 +18,7 @@ @@ -5805,7 +6230,7 @@ index 92e2175..f7fa5ca #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h" - -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++// #include "src/fastertransformer/layers/encoder_layers/encoder.h" namespace fastertransformer { template @@ -5815,132 +6240,132 @@ index 92e2175..f7fa5ca + +// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSMHALayer: public BaseAttentionLayer { -+private: -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ -+ using BaseAttentionLayer::is_free_buffer_after_forward_; -+ using BaseAttentionLayer::is_allocate_buffer_; -+ using BaseAttentionLayer::cublas_wrapper_; -+ using BaseAttentionLayer::allocator_; -+ -+protected: -+ using BaseAttentionLayer::stream_; -+ using BaseAttentionLayer::sparse_; -+ T* buf_ = nullptr; -+ encoderParamT params_; -+ -+public: -+ MSMHALayer(size_t batch_size, -+ size_t src_seq_len, -+ size_t tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool is_cross, -+ bool sparse = false, -+ bool is_position_bias=false); -+ MSMHALayer(MSMHALayer const& attention_layer); -+ virtual ~MSMHALayer(); -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const AttentionWeight* attention_weights) override; -+}; ++// template ++// 
class MSMHALayer: public BaseAttentionLayer { ++// private: ++// void allocateBuffer() override; ++// void freeBuffer() override; ++ ++// using BaseAttentionLayer::is_free_buffer_after_forward_; ++// using BaseAttentionLayer::is_allocate_buffer_; ++// using BaseAttentionLayer::cublas_wrapper_; ++// using BaseAttentionLayer::allocator_; ++ ++// protected: ++// using BaseAttentionLayer::stream_; ++// using BaseAttentionLayer::sparse_; ++// T* buf_ = nullptr; ++// // encoderParamT params_; ++ ++// public: ++// MSMHALayer(size_t batch_size, ++// size_t src_seq_len, ++// size_t tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool is_cross, ++// bool sparse = false, ++// bool is_position_bias=false); ++// MSMHALayer(MSMHALayer const& attention_layer); ++// virtual ~MSMHALayer(); ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const AttentionWeight* attention_weights) override; ++// }; + } // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +diff --git a/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h new file mode 100644 -index 0000000..3b43391 +index 0000000..0a60835 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h ++++ b/src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h @@ -0,0 +1,76 @@ -+/* -+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ -+ -+#pragma once -+ -+#include -+#include -+ -+#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" -+#include "src/fastertransformer/layers/BaseLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" -+#include "src/fastertransformer/utils/Tensor.h" -+#include "src/fastertransformer/utils/allocator.h" -+#include "src/fastertransformer/utils/cublasMMWrapper.h" -+#include "src/fastertransformer/utils/memory_utils.h" -+ -+namespace fastertransformer { -+ -+enum class EncoderLayerType { -+ UNFUSED_ENCODER_LAYER, -+ FUSED_ENCODER_LAYER -+}; -+ -+template -+EncoderLayerType getEncoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const bool is_fuse = true) { -+ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) -+ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } else { -+ return remove_padding ? 
EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } -+} -+ -+template -+EncoderLayerType getEncoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, -+ const int max_seq_len, const int int8_mode) { -+ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 -+ && max_seq_len <= 384) { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } else { -+ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; -+ } -+} ++// /* ++// * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. ++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. ++// */ ++ ++// #pragma once ++ ++// #include ++// #include ++ ++// #include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++// #include "src/fastertransformer/layers/BaseLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h" ++// #include "src/fastertransformer/utils/Tensor.h" ++// #include "src/fastertransformer/utils/allocator.h" ++// #include "src/fastertransformer/utils/cublasMMWrapper.h" ++// #include "src/fastertransformer/utils/memory_utils.h" ++ ++// namespace fastertransformer { ++ ++// enum class DecoderLayerType { ++// UNFUSED_DECODER_LAYER, ++// FUSED_DECODER_LAYER ++// }; ++ ++// template ++// DecoderLayerType getDecoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const bool is_fuse = true) { ++// if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+template -+class BaseEncoderLayer: public BaseLayer { ++// template ++// DecoderLayerType getDecoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++// const int max_seq_len, const int int8_mode) { ++// if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++// && max_seq_len <= 384) { ++// return remove_padding ? DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } else { ++// return remove_padding ? 
DecoderLayerType::FUSED_DECODER_LAYER : DecoderLayerType::FUSED_DECODER_LAYER; ++// } ++// } + -+public: -+ virtual void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_layer_weights) = 0; -+ BaseEncoderLayer(cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool sparse = false): -+ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) -+ { -+ } -+ virtual ~BaseEncoderLayer() = default; -+}; -+} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/CMakeLists.txt b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt ++// template ++// class BaseDecoderLayer: public BaseLayer { ++ ++// public: ++// virtual void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_layer_weights) = 0; ++// BaseDecoderLayer(cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool sparse = false): ++// BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++// { ++// } ++// virtual ~BaseDecoderLayer() = default; ++// }; ++// } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/CMakeLists.txt b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt new file mode 100644 -index 0000000..1a3af85 +index 0000000..e343db9 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt ++++ b/src/fastertransformer/layers/decoder_layers/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. +# @@ -5958,17 +6383,17 @@ index 0000000..1a3af85 + +cmake_minimum_required(VERSION 3.8) + -+add_library(EncoderLayer STATIC encoder.cc MSEncoderLayer.cc) -+set_property(TARGET EncoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) -+set_property(TARGET EncoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) -+target_link_libraries(EncoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++add_library(DecoderLayer STATIC decoder.cc MSDecoderLayer.cc) ++set_property(TARGET DecoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET DecoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(DecoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels + layernorm_kernels add_residual_kernels bert_preprocess_kernels) -diff --git a/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +diff --git a/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h new file mode 100644 -index 0000000..c441b23 +index 0000000..bd31438 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h -@@ -0,0 +1,33 @@ ++++ b/src/fastertransformer/layers/decoder_layers/DecoderLayerWeight.h +@@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 
+ * @@ -5991,23 +6416,27 @@ index 0000000..c441b23 +#include "src/fastertransformer/kernels/layernorm_kernels.h" +namespace fastertransformer { + -+template -+struct EncoderLayerWeight { -+ DenseWeight qkv_weight; -+ DenseWeight attention_layer_output_weight; -+ DenseWeight encoder_output_mapping; -+ DenseWeight encoder_output_projection; -+ LayerNormWeight layernorm1; -+ LayerNormWeight layernorm2; -+}; ++// template ++// struct DecoderLayerWeight { ++// DenseWeight attention_qkv_weight; ++// DenseWeight attention_layer_output_weight; ++// DenseWeight attention_cross_q_weight; ++// DenseWeight attention_cross_kv_weight; ++// DenseWeight attention_cross_layer_output_weight; ++// DenseWeight decoder_output_mapping; ++// DenseWeight decoder_output_projection; ++// LayerNormWeight layernorm1; ++// LayerNormWeight layernorm2; ++// LayerNormWeight layernorm3; ++// }; + +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc new file mode 100644 -index 0000000..a3442da +index 0000000..ae8875d --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc -@@ -0,0 +1,164 @@ ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.cc +@@ -0,0 +1,208 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. @@ -6025,7 +6454,7 @@ index 0000000..a3442da + * limitations under the License. + */ + -+#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h" +#include "src/fastertransformer/kernels/activation_kernels.h" + +namespace fastertransformer { @@ -6050,7 +6479,7 @@ index 0000000..a3442da + free(input_host); +} +template -+MSELayer::MSELayer(size_t max_batch_size, ++MSDLayer::MSDLayer(size_t max_batch_size, + size_t max_src_seq_len, + size_t max_tgt_seq_len, + size_t head_num, @@ -6058,7 +6487,11 @@ index 0000000..a3442da + size_t ffn_hidden_size, + float eps1, + float eps2, ++ float eps3, + bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, + cudaStream_t stream, + cublasMMWrapper* cublas_wrapper, + cublasHandle_t* cublas_handle, @@ -6067,7 +6500,7 @@ index 0000000..a3442da + bool is_qk_buf_float, + bool sparse): + -+ BaseEncoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++ BaseDecoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) +{ + params_.batch_size = max_batch_size; + params_.src_seq_len = max_src_seq_len; @@ -6078,31 +6511,60 @@ index 0000000..a3442da + params_.ffn_hidden_size = ffn_hidden_size; + params_.eps1 = eps1; + params_.eps2 = eps2; ++ params_.eps3 = eps3; + params_.layernorm_post = post_layernorm; + // handle + params_.cublas_handle = *cublas_handle; + params_.stream = stream; -+ params_.ffn_fp16 = true; ++ params_.ffn_fp16 = is_ffn_fp16; + // ctrls + params_.in_idx = 0; -+ params_.qkv_bias = true; -+ params_.projection_bias = true; -+ params_.is_cross = false; -+ params_.position_bias = false; + params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.projection_bias = true; ++ ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ 
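// [editor's note] Illustrative sketch, not part of the patch: the MSDLayer constructor below
// configures two attention descriptors, attn1 for the decoder's masked self-attention
// (is_cross = false, queries/keys/values all come from the decoder hidden states) and attn2 for
// encoder-decoder cross-attention (is_cross = true, keys/values come from the encoder output).
// A hypothetical helper expressing that split:
#include <cstddef>

struct AttnBlockConfig {
  bool is_cross;
  size_t query_len;   // decoder sequence length in both blocks
  size_t memory_len;  // decoder length for self-attention, encoder length for cross-attention
};

inline AttnBlockConfig MakeSelfAttnConfig(size_t dec_len) { return {false, dec_len, dec_len}; }
inline AttnBlockConfig MakeCrossAttnConfig(size_t dec_len, size_t enc_len) { return {true, dec_len, enc_len}; }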
params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = false; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = false; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; +} + +template -+void MSELayer::allocateBuffer() ++void MSDLayer::allocateBuffer() +{ + if (buf_ == nullptr) { -+ size_t buff_size = GetEncoderLayerWorkspaceSize(¶ms_); -+ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ size_t buff_size = GetDecoderLayerWorkspaceSize(¶ms_); ++ std::cout<<"buff_size: "<(allocator_->reMalloc(buf_, buff_size, true)); + } +} + +template -+void MSELayer::freeBuffer() ++void MSDLayer::freeBuffer() +{ + if (buf_ != nullptr) { + allocator_->free(buf_); @@ -6111,181 +6573,200 @@ index 0000000..a3442da +} + +template -+MSELayer::~MSELayer() ++MSDLayer::~MSDLayer() +{ + cublas_wrapper_ = nullptr; + freeBuffer(); +} + +template -+void MSELayer::forward(std::vector* output_tensors, ++void MSDLayer::forward(std::vector* output_tensors, + const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_weights) -+{ ++ const DecoderLayerWeight* decoder_weights) ++{ ++ std::cout<<"forward\n"; + allocateBuffer(); // only once + void* outputs[] = {(void*)output_tensors->at(0).data}; -+ if (!params_.layernorm_post) { -+ void* inputs[] = {(void*)input_tensors->at(0).data, -+ (void*)encoder_weights->layernorm1.gamma, -+ (void*)encoder_weights->layernorm1.beta, -+ (void*)encoder_weights->qkv_weight.kernel, -+ (void*)encoder_weights->qkv_weight.bias, -+ (void*)input_tensors->at(1).data, -+ (void*)encoder_weights->attention_layer_output_weight.kernel, -+ (void*)encoder_weights->attention_layer_output_weight.bias, -+ (void*)encoder_weights->layernorm2.gamma, -+ (void*)encoder_weights->layernorm2.beta, -+ (void*)encoder_weights->encoder_output_mapping.kernel, -+ (void*)encoder_weights->encoder_output_mapping.bias, -+ (void*)encoder_weights->encoder_output_projection.kernel, -+ (void*)encoder_weights->encoder_output_projection.bias}; -+ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); -+ } -+ else { ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, -+ (void*)encoder_weights->qkv_weight.kernel, -+ (void*)encoder_weights->qkv_weight.bias, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention_qkv_weight.kernel, ++ (void*)decoder_weights->attention_qkv_weight.bias, + (void*)input_tensors->at(1).data, -+ (void*)encoder_weights->attention_layer_output_weight.kernel, -+ (void*)encoder_weights->attention_layer_output_weight.bias, -+ 
(void*)encoder_weights->layernorm1.gamma, -+ (void*)encoder_weights->layernorm1.beta, -+ (void*)encoder_weights->encoder_output_mapping.kernel, -+ (void*)encoder_weights->encoder_output_mapping.bias, -+ (void*)encoder_weights->encoder_output_projection.kernel, -+ (void*)encoder_weights->encoder_output_projection.bias, -+ (void*)encoder_weights->layernorm2.gamma, -+ (void*)encoder_weights->layernorm2.beta}; -+ forwardEncoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ (void*)decoder_weights->attention_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->attention_cross_q_weight.kernel, ++ (void*)decoder_weights->attention_cross_kv_weight.kernel, ++ (void*)decoder_weights->attention_cross_q_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->attention_cross_layer_output_weight.kernel, ++ (void*)decoder_weights->attention_cross_layer_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ // } ++ // else { ++ // void* inputs[] = {(void*)input_tensors->at(0).data, ++ // (void*)decoder_weights->qkv_weight.kernel, ++ // (void*)decoder_weights->qkv_weight.bias, ++ // (void*)input_tensors->at(1).data, ++ // (void*)decoder_weights->attention_layer_output_weight.kernel, ++ // (void*)decoder_weights->attention_layer_output_weight.bias, ++ // (void*)decoder_weights->layernorm1.gamma, ++ // (void*)decoder_weights->layernorm1.beta, ++ // (void*)decoder_weights->decoder_output_mapping.kernel, ++ // (void*)decoder_weights->decoder_output_mapping.bias, ++ // (void*)decoder_weights->decoder_output_projection.kernel, ++ // (void*)decoder_weights->decoder_output_projection.bias, ++ // (void*)decoder_weights->layernorm2.gamma, ++ // (void*)decoder_weights->layernorm2.beta}; ++ // forwardDecoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ // } + } -+ + return; +} + -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; -+template class MSELayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; + +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +diff --git a/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h new file mode 100644 -index 0000000..afc6a5a +index 0000000..8908141 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h -@@ -0,0 +1,69 @@ -+/* -+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -+ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. 
-+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+ */ ++++ b/src/fastertransformer/layers/decoder_layers/MSDecoderLayer.h +@@ -0,0 +1,74 @@ ++// /* ++// * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++// * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++// * ++// * Licensed under the Apache License, Version 2.0 (the "License"); ++// * you may not use this file except in compliance with the License. ++// * You may obtain a copy of the License at ++// * ++// * http://www.apache.org/licenses/LICENSE-2.0 ++// * ++// * Unless required by applicable law or agreed to in writing, software ++// * distributed under the License is distributed on an "AS IS" BASIS, ++// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// * See the License for the specific language governing permissions and ++// * limitations under the License. ++// */ ++ ++// #pragma once ++ ++// #include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++// #include "src/fastertransformer/layers/decoder_layers/decoder.h" ++ ++// namespace fastertransformer { ++ ++// // TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++// template ++// class MSDLayer: public BaseDecoderLayer { ++// private: ++// mutable decoderParamT params_; ++ ++// void allocateBuffer() override; ++// void freeBuffer() override; ++// void* buf_; ++// using BaseDecoderLayer::is_free_buffer_after_forward_; ++// using BaseDecoderLayer::is_allocate_buffer_; ++// using BaseDecoderLayer::cublas_wrapper_; ++// using BaseDecoderLayer::allocator_; ++ ++// protected: ++// using BaseDecoderLayer::stream_; ++// using BaseDecoderLayer::sparse_; ++ ++// public: ++// MSDLayer(size_t max_batch_size, ++// size_t max_src_seq_len, ++// size_t max_tgt_seq_len, ++// size_t head_num, ++// size_t size_per_head, ++// size_t ffn_hidden_size, ++// float eps1, ++// float eps2, ++// float eps3, ++// bool post_layernorm, ++// bool position_bias1, ++// bool position_bias2, ++// bool is_ffn_fp16, ++// cudaStream_t stream, ++// cublasMMWrapper* cublas_wrapper, ++// cublasHandle_t* cublas_handle, ++// IAllocator* allocator, ++// bool is_free_buffer_after_forward, ++// bool is_qk_buf_float, ++// bool sparse); ++ ++// MSDLayer(MSDLayer const& decoder_layer); ++ ++// virtual ~MSDLayer(); ++ ++// void forward(std::vector* output_tensors, ++// const std::vector* input_tensors, ++// const DecoderLayerWeight* decoder_weights) override; ++// }; ++ ++// } // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.cc b/src/fastertransformer/layers/decoder_layers/decoder.cc +new file mode 100644 +index 0000000..bb5c615 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.cc +@@ -0,0 +1,421 @@ + -+#pragma once ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include 
"src/fastertransformer/layers/encoder_layers/encoder.h" + -+#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" -+ -+namespace fastertransformer { -+ -+// TODO(haim): Add template according to "mix" compute type (fp32, fp16) -+template -+class MSELayer: public BaseEncoderLayer { -+private: -+ encoderParamT params_; -+ void allocateBuffer() override; -+ void freeBuffer() override; -+ void* buf_; -+ using BaseEncoderLayer::is_free_buffer_after_forward_; -+ using BaseEncoderLayer::is_allocate_buffer_; -+ using BaseEncoderLayer::cublas_wrapper_; -+ using BaseEncoderLayer::allocator_; -+ -+protected: -+ using BaseEncoderLayer::stream_; -+ using BaseEncoderLayer::sparse_; -+ -+public: -+ MSELayer(size_t max_batch_size, -+ size_t max_src_seq_len, -+ size_t max_tgt_seq_len, -+ size_t head_num, -+ size_t size_per_head, -+ size_t ffn_hidden_size, -+ float eps1, -+ float eps2, -+ bool post_layernorm, -+ cudaStream_t stream, -+ cublasMMWrapper* cublas_wrapper, -+ cublasHandle_t* cublas_handle, -+ IAllocator* allocator, -+ bool is_free_buffer_after_forward, -+ bool is_qk_buf_float, -+ bool sparse); -+ -+ MSELayer(MSELayer const& encoder_layer); -+ -+ virtual ~MSELayer(); -+ -+ void forward(std::vector* output_tensors, -+ const std::vector* input_tensors, -+ const EncoderLayerWeight* encoder_weights) override; -+}; -+ -+} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc -new file mode 100644 -index 0000000..004718e ---- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.cc -@@ -0,0 +1,814 @@ -+ -+#include "src/fastertransformer/layers/encoder_layers/encoder.h" -+#include "src/fastertransformer/kernels/activation_kernels.h" -+#include "src/fastertransformer/kernels/add_residual_kernels.h" -+#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" -+#include "src/fastertransformer/kernels/layernorm_kernels.h" -+#include "src/fastertransformer/kernels/unfused_attention_kernels.h" +#include -+ +namespace fastertransformer { + +#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -+#define ALIGN(x, y) (UP_DIV(x, y) * (y)) ++// #define UP_DIV(x, y) (x) +#define ALIGN_SIZE 16 + +template -+void printTensor(const std::string& str, T* input, int size) ++void printTensor(char* str, T* input, int size) +{ -+ std::cout << str; ++ printf("%s ", str); + T* input_device = input; -+ auto input_host = std::make_unique(size); -+ cudaD2Hcpy(input_host.get(), input_device, size); -+ for (int k = 0, index = 0; k < size; k++) { -+ if (index != 0) -+ std::cout << ','; -+ std::cout << input_host[k]; -+ index++; -+ if (index == 10) { ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) + std::cout << std::endl; -+ index = 0; -+ } + } ++ + std::cout << std::endl; ++ ++ free(input_host); +} + +template @@ -6295,107 +6776,1078 @@ index 0000000..004718e + << " size is " << size; + T* input_device = input; + T* input_host = (T*)malloc(size * sizeof(T)); -+ cudaD2Hcpy(input_host, input_device, size); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ + for (int k = 0; k < (int)size; k++) { + if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { + std::cout << 
"found NAN or INF"; + break; + } + } ++ + std::cout << std::endl; + free(input_host); +} ++ ++template ++size_t GetAttnWorkspaceSize(decoderParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++} ++ ++template size_t GetAttnWorkspaceSize(decoderParamT* param); ++template size_t GetAttnWorkspaceSize(decoderParamT* param); +template -+T checksum(const T* tensor, int size) ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) +{ -+ if constexpr (std::is_floating_point()) { -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ for (int i = 0; i < size; i++) { -+ // sum += (double)ptr[i]*i; -+ sum += ptr[i]; -+ } -+ return static_cast(sum); ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ ; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? 
attn_out + attn2_out : attn_out * 2 + attn2_out * 2; ++ return (std::max(GetAttnWorkspaceSize(param) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T) ++ + GetAttnWorkspaceSize(param)*2); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; + } -+ else -+ return static_cast(0.f); ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); +} + +template -+T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) +{ -+ if constexpr (std::is_floating_point()) { -+ int hidden_size; -+ if (ffn) { -+ hidden_size = param->ffn_hidden_size; ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* attn2_out = reinterpret_cast(attn_ws) + GetAttnWorkspaceSize(param); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; ++ T* ffn_ws = normed_attn2_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16 == true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * 
param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ param->in_idx = param->attn1.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } + else { -+ hidden_size = param->hidden_size; -+ } -+ const int size = param->batch_size * param->src_seq_len * hidden_size; -+ int head_size = hidden_size / param->head_num; -+ auto tensor_host = std::make_unique(size); -+ double sum = 0.; -+ T* ptr = tensor_host.get(); -+ try { -+ cudaD2Hcpy(ptr, tensor, size); -+ } -+ catch (...) { -+ std::cout << "copy tensor failed" << std::endl; -+ return static_cast(0.f); -+ } -+ bool compressed = param->eft && zp; -+ if (!compressed) { -+ if (cross) { -+ std::cout << "cross sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->head_num; j++) { -+ for (int k = 0; k < param->src_seq_len / 2; k++) { -+ for (int l = 0; l < head_size; l++) { -+ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; -+ } -+ } -+ } -+ } -+ } -+ else { -+ std::cout << "grid sum:" << std::endl; -+ for (int i = 0; i < param->batch_size; i++) { -+ for (int j = 0; j < param->src_seq_len / 2; j++) { -+ for (int k = 0; k < hidden_size; k++) { -+ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; -+ } -+ } -+ } -+ } ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ param->in_idx = param->attn2.in_idx + in_idx; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); + } -+ else { -+ std::cout << "compress sum:" << std::endl; -+ for (int i = 0; i < param->h_token_num * hidden_size; i++) { -+ sum += ptr[i]; -+ } ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if 
(param->ffn_fp16 == false) { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } else { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ attn2_out = param->layernorm_post ? normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } else{ ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); + } -+ return static_cast(sum); ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); + } + else { -+ return static_cast(0.f); ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); + } -+} -+ -+template -+void saveTensor(const std::string& name, T* tensor, int size) -+{ -+ auto tensor_host = std::make_unique(size); -+ T* ptr = tensor_host.get(); -+ cudaD2Hcpy(ptr, tensor, size); -+ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); -+ wf.write(reinterpret_cast(ptr), size * sizeof(T)); -+ wf.close(); -+} ++ gemm_ops[0] = CUBLAS_OP_T; + -+void CublasGemmWrapper(const void* a_addr, ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ 
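// The strided-batched GEMM configured next multiplies the softmaxed scores by V:
// per (batch, head), context[src_seq_len, head_size] = probs[src_seq_len, tgt_seq_len] * V[tgt_seq_len, head_size].
// Because cuBLAS is column-major, a row-major [rows, cols] buffer is passed as a cols x rows
// operand, which is why the dims below come out as m = head_size, n = src_seq_len, k = tgt_seq_len.
// Minimal sketch of that mapping, assuming the same convention as CublasGemmStridedBatchedWrapper
// (local helper for illustration only, not used by the surrounding code):
struct CtxGemmShape { int m, n, k, lda, ldb, ldc; };
auto ctx_gemm_shape = [](int head_size, int src_len, int tgt_len) {
    return CtxGemmShape{head_size, src_len, tgt_len,    // m, n, k
                        head_size, tgt_len, head_size}; // lda (V), ldb (probs), ldc (context)
};
(void)ctx_gemm_shape;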
gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ ++ fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); ++ ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/decoder_layers/decoder.h b/src/fastertransformer/layers/decoder_layers/decoder.h +new file mode 100644 +index 0000000..c302ea8 +--- /dev/null ++++ b/src/fastertransformer/layers/decoder_layers/decoder.h +@@ -0,0 +1,112 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int in_idx; ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct { ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; // 4 * param->hidden_size; ++// bool ffn_fp16; ++// float 
eps1; ++// float eps2; ++// float eps3; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// bool projection_bias; // ture ++ ++// int in_idx; ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// typedef struct{ ++// public: ++// size_t batch_size; ++// size_t src_seq_len; ++// size_t tgt_seq_len; ++// size_t head_num; ++// size_t head_size; ++// size_t hidden_size; ++// size_t h_token_num; ++// size_t ffn_hidden_size; ++// // handle ++// cublasHandle_t cublas_handle; ++// cudaStream_t stream; ++// cublasGemmAlgo_t algo; ++// // ctrls ++// int *padding_offset; ++// int in_idx; ++ ++// } ParamT; ++ ++// typedef struct : ParamT{ ++ ++// // ctrls ++// bool qkv_bias; // ture ++// bool projection_bias; // ture ++// bool is_cross; // false ++// bool position_bias; ++// int *padding_offset; ++// } attentionParamT; ++ ++// typedef struct : ParamT{ ++ ++// bool ffn_fp16; ++// float eps1; ++// float eps2; ++// float eps3; ++ ++// bool projection_bias; // ture ++ ++// mutable attentionParamT attn1; ++// mutable attentionParamT attn2; ++// bool layernorm_post; ++// int *padding_offset; ++// } decoderParamT; ++// template ++// size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++// template ++// size_t GetAttnWorkspaceSize(decoderParamT* param); ++// template ++// void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++// template ++// void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++// void forwardDecoder(std::vector > const* ++// inputs); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +new file mode 100644 +index 0000000..3b43391 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++ ++namespace fastertransformer { ++ ++enum class EncoderLayerType { ++ UNFUSED_ENCODER_LAYER, ++ FUSED_ENCODER_LAYER ++}; ++ ++template ++EncoderLayerType getEncoderLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } else { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } ++} ++ ++template ++EncoderLayerType getEncoderLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } else { ++ return remove_padding ? EncoderLayerType::FUSED_ENCODER_LAYER : EncoderLayerType::FUSED_ENCODER_LAYER; ++ } ++} ++ ++template ++class BaseEncoderLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_layer_weights) = 0; ++ BaseEncoderLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~BaseEncoderLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/CMakeLists.txt b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt +new file mode 100644 +index 0000000..1a3af85 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(EncoderLayer STATIC encoder.cc MSEncoderLayer.cc) ++set_property(TARGET EncoderLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET EncoderLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(EncoderLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +new file mode 100644 +index 0000000..c441b23 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/EncoderLayerWeight.h +@@ -0,0 +1,33 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++struct EncoderLayerWeight { ++ DenseWeight qkv_weight; ++ DenseWeight attention_layer_output_weight; ++ DenseWeight encoder_output_mapping; ++ DenseWeight encoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +new file mode 100644 +index 0000000..4075695 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.cc +@@ -0,0 +1,198 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSELayer::MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ BaseEncoderLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.in_idx = 0; ++ params_.position_bias = position_bias; ++ params_.qkv_bias = !params_.position_bias; ++ params_.projection_bias = !params_.position_bias; ++ params_.is_cross = false; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++} ++ ++template ++void MSELayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = GetEncoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ } ++} ++ ++template ++void MSELayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSELayer::~MSELayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSELayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_weights) ++{ ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ if (!params_.layernorm_post) { ++ if (params_.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)input_tensors->at(2).data, ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)encoder_weights->qkv_weight.bias, ++ (void*)input_tensors->at(1).data, ++ 
(void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->attention_layer_output_weight.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias}; ++ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ else { ++ if (params_.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)input_tensors->at(2).data, ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->qkv_weight.kernel, ++ (void*)encoder_weights->qkv_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention_layer_output_weight.kernel, ++ (void*)encoder_weights->attention_layer_output_weight.bias, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta}; ++ forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ ++ return; ++} ++ ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +new file mode 100644 +index 0000000..33de2ba +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/MSEncoderLayer.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSELayer: public BaseEncoderLayer { ++private: ++ encoderParamT params_; ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using BaseEncoderLayer::is_free_buffer_after_forward_; ++ using BaseEncoderLayer::is_allocate_buffer_; ++ using BaseEncoderLayer::cublas_wrapper_; ++ using BaseEncoderLayer::allocator_; ++ ++protected: ++ using BaseEncoderLayer::stream_; ++ using BaseEncoderLayer::sparse_; ++ ++public: ++ MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSELayer(MSELayer const& encoder_layer); ++ ++ virtual ~MSELayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const EncoderLayerWeight* encoder_weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/encoder.cc b/src/fastertransformer/layers/encoder_layers/encoder.cc +new file mode 100644 +index 0000000..c0b4f37 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/encoder.cc +@@ -0,0 +1,815 @@ ++ ++#include "src/fastertransformer/layers/encoder_layers/encoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/bert_preprocess_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++ ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++#define ALIGN(x, y) (UP_DIV(x, y) * (y)) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(const std::string& str, T* input, int size) ++{ ++ std::cout << str; ++ T* input_device = input; ++ auto input_host = std::make_unique(size); ++ cudaD2Hcpy(input_host.get(), input_device, size); ++ for (int k = 0, index = 0; k < size; k++) { ++ if (index != 0) ++ std::cout << ','; ++ std::cout << input_host[k]; ++ index++; ++ if (index == 10) { ++ std::cout << std::endl; ++ index = 0; ++ } ++ } ++ std::cout << std::endl; ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " ++ << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ cudaD2Hcpy(input_host, input_device, size); ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ std::cout << std::endl; ++ free(input_host); ++} ++template ++T checksum(const T* tensor, int size) ++{ ++ if constexpr (std::is_floating_point()) { ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ for (int i = 0; i < size; i++) { ++ // sum += (double)ptr[i]*i; ++ sum += ptr[i]; ++ } ++ 
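// Accumulating in double keeps the sum stable enough to compare an FP16-FFN run against the
// FP32 reference of the same layer; checksum()/checksumGrid()/saveTensor() are host-side debug
// aids built on cudaD2Hcpy. A typical (hypothetical) probe from forwardEncoder() would be:
//   std::cout << "attn_out " << checksum(attn_out, param->h_token_num * param->hidden_size) << std::endl;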
return static_cast(sum); ++ } ++ else ++ return static_cast(0.f); ++} ++ ++template ++T checksumGrid(const T* tensor, const encoderParamT* param, bool zp = false, bool cross = false, bool ffn = false) ++{ ++ if constexpr (std::is_floating_point()) { ++ int hidden_size; ++ if (ffn) { ++ hidden_size = param->ffn_hidden_size; ++ } ++ else { ++ hidden_size = param->hidden_size; ++ } ++ const int size = param->batch_size * param->src_seq_len * hidden_size; ++ int head_size = hidden_size / param->head_num; ++ auto tensor_host = std::make_unique(size); ++ double sum = 0.; ++ T* ptr = tensor_host.get(); ++ try { ++ cudaD2Hcpy(ptr, tensor, size); ++ } ++ catch (...) { ++ std::cout << "copy tensor failed" << std::endl; ++ return static_cast(0.f); ++ } ++ bool compressed = param->eft && zp; ++ if (!compressed) { ++ if (cross) { ++ std::cout << "cross sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->head_num; j++) { ++ for (int k = 0; k < param->src_seq_len / 2; k++) { ++ for (int l = 0; l < head_size; l++) { ++ sum += ptr[(((i * param->head_num) + j) * param->src_seq_len + k) * head_size + l]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "grid sum:" << std::endl; ++ for (int i = 0; i < param->batch_size; i++) { ++ for (int j = 0; j < param->src_seq_len / 2; j++) { ++ for (int k = 0; k < hidden_size; k++) { ++ sum += ptr[((i * param->src_seq_len) + j) * hidden_size + k]; ++ } ++ } ++ } ++ } ++ } ++ else { ++ std::cout << "compress sum:" << std::endl; ++ for (int i = 0; i < param->h_token_num * hidden_size; i++) { ++ sum += ptr[i]; ++ } ++ } ++ return static_cast(sum); ++ } ++ else { ++ return static_cast(0.f); ++ } ++} ++ ++template ++void saveTensor(const std::string& name, T* tensor, int size) ++{ ++ auto tensor_host = std::make_unique(size); ++ T* ptr = tensor_host.get(); ++ cudaD2Hcpy(ptr, tensor, size); ++ std::ofstream wf(name + ".bin", std::ofstream::out | std::ofstream::binary); ++ wf.write(reinterpret_cast(ptr), size * sizeof(T)); ++ wf.close(); ++} ++ ++void CublasGemmWrapper(const void* a_addr, + const void* b_addr, + void* c_addr, + const int* params, @@ -6407,397 +7859,2549 @@ index 0000000..004718e + cublasHandle_t cublas_handle, + cublasGemmAlgo_t algo) +{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ cublasGemmEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ b_addr, ++ type_b, ++ ldb, ++ beta, ++ 
c_addr, ++ type_c, ++ ldc, ++ compute_type, ++ algo); ++} ++ ++void CublasGemmStridedBatchedWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const int* strides, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ int batch, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ const int stride_a = strides[0]; ++ const int stride_b = strides[1]; ++ const int stride_c = strides[2]; ++ cublasGemmStridedBatchedEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ stride_a, ++ b_addr, ++ type_b, ++ ldb, ++ stride_b, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ stride_c, ++ batch, ++ compute_type, ++ algo); ++} ++ ++template ++size_t GetAttnWorkspaceSize(encoderParamT* param) ++{ ++ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); ++ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t qk_buf_len = ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t attn_out_size = ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ++} ++ ++template size_t GetAttnWorkspaceSize(encoderParamT* param); ++template size_t GetAttnWorkspaceSize(encoderParamT* param); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param) ++{ ++ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); ++ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); ++ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); ++ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); ++ size_t d_token_len = ALIGN(1,ALIGN_SIZE); ++ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); ++ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; ++} ++ ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ size_t inter_size = 
param->ffn_hidden_size; ++ size_t h_token_num = param->h_token_num; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ reinterpret_cast(inputs[param->in_idx++]), ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = inter_size; ++ gemm_lds[2] = param->hidden_size; ++ CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++} ++ ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ param->h_token_num = h_token_num; ++ param->padding_offset = nullptr; ++ int* d_sequence_lengths = nullptr; ++ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* from_tensor = input_tensor; ++ T* compress_buffer; ++ compress_buffer = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); ++ int* padding_offset = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); ++ d_sequence_lengths = reinterpret_cast(ws); ++ param->d_sequence_length = d_sequence_lengths; ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); ++ size_t* d_token_num = reinterpret_cast(ws); ++ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); ++ invokeBuildSequnceLength( ++ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); ++ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); ++ invokeGetPaddingOffset(&h_token_num, ++ d_token_num, ++ padding_offset, ++ d_sequence_lengths, ++ param->batch_size, ++ param->src_seq_len, ++ param->stream); ++ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; ++ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { ++ param->eft = true; ++ invokeRemovePadding(compress_buffer, ++ (const T*)from_tensor, ++ padding_offset, ++ h_token_num, ++ param->head_num * param->head_size, ++ param->stream); ++ param->h_token_num = h_token_num; ++ param->padding_offset = padding_offset; ++ from_tensor = compress_buffer; ++ } ++ h_token_num = param->h_token_num; ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = ++ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); ++ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* normed_attn_out = normed_from_tensor; ++ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { ++ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); ++ } ++ T* tmp_out1 = reinterpret_cast(output[0]); ++ T* out_buf = tmp_out; ++ if (param->padding_offset != nullptr) { ++ tmp_out1 = compress_buffer; ++ } ++ if (param->layernorm_post == false) { ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ } ++ else { ++ normed_from_tensor = from_tensor; ++ } ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an encoder - fuse the bias to next layer normalization ++ bool projection_bias = param->projection_bias; ++ param->projection_bias = false; ++ int in_idx = param->in_idx; ++ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, param, attn_ws); ++ param->in_idx += in_idx; ++ param->projection_bias = projection_bias; ++ if (param->projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->layernorm_post == false) { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeAddBiasResidualLayerNorm(attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ normed_attn_out = attn_out; ++ } ++ else { ++ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); ++ } ++ } ++ } ++ else { ++ // without projection bias ++ } ++ // forward ffn ++ // simulate attention inputs ++ inputs[--param->in_idx] = normed_attn_out; ++ if (param->ffn_fp16 == false) { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ else { ++ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ if (param->layernorm_post == true) { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ 
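// In this branch the FFN output (tmp_out) is already in T (either the model runs in half, or the
// FFN stayed in the model precision), so the fused add-bias + residual + LayerNorm kernel can run
// on it directly. The else branch covers the float-model / fp16-FFN mix, where the *_Cast kernel
// converts the half FFN output back to float while applying the same bias, residual and LayerNorm.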
invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } ++ else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(tmp_out1), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, ++ reinterpret_cast(inputs[param->in_idx++]), // Gamma ++ reinterpret_cast(inputs[param->in_idx++]), // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ out_buf = tmp_out1; ++ } ++ } ++ else { ++ if (std::is_same::value || param->ffn_fp16 == false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ else { ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn_out), ++ reinterpret_cast(tmp_out1), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } ++ } ++ if (param->padding_offset != nullptr) { ++ cudaMemsetAsync(output[0], ++ 0, ++ param->batch_size * param->src_seq_len * param->head_size * param->head_num * sizeof(T), ++ param->stream); ++ invokeRebuildPadding( ++ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); ++ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); ++ size_t qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* q_buf_2 = (T*)ws; ++ T* output1 = static_cast(ws) + q_buf_2_len; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ T* qkv_buf = static_cast(output2) + extra_tmp_size; ++ T* qk_buf = qkv_buf; ++ T* qkv_buf_2 = q_buf_2; ++ T* qkv_buf_3 = qk_buf; ++ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ 
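// Cross attention: Q is projected here from `from_tensor` (src_seq_len tokens) with a
// [hidden, hidden] weight; K and V are produced just below from the encoder output
// (tgt_seq_len tokens) in one fused GEMM against a [hidden, 2 * hidden] weight, which is
// why the dims switch to 2 * hidden_size for that second call.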
gemm_lds[2] = param->hidden_size; ++ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ ++ CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ ++ CublasGemmWrapper(weight_kv, ++ encoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ if (param->padding_offset == nullptr) { ++ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ else { ++ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->h_token_num, ++ param->padding_offset, ++ param->stream); ++ } ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ if (param->padding_offset != nullptr) ++ invokeBuildEncoderAttentionMask( ++ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); ++ T* position_bias = nullptr; ++ if (param->position_bias) { ++ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ } ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ 
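// When the batch was compacted in forwardEncoder() (the "effective transformer" path),
// param->padding_offset maps each kept token back to its slot in the padded layout; it is
// consumed below by invokeTransposeAttentionOutRemovePadding. A host-side sketch of the
// offset computation behind invokeGetPaddingOffset (illustrative helper only, assuming
// padding_offset[i] = number of pad slots preceding the i-th kept token):
auto sketch_padding_offsets = [](const int* seq_lens, int batch, int max_seq_len, int* offsets) {
    int kept = 0;         // running count of valid tokens; equals h_token_num at the end
    int pads_so_far = 0;  // pad slots skipped in the padded [batch, max_seq_len] layout
    for (int b = 0; b < batch; ++b) {
        for (int t = 0; t < seq_lens[b]; ++t) {
            offsets[kept++] = pads_so_far;
        }
        pads_so_far += max_seq_len - seq_lens[b];
    }
    return kept;
};
(void)sketch_padding_offsets;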
gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ ++ CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->padding_offset == nullptr) { ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, ++ qkv_buf_3, ++ param->h_token_num, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->padding_offset, ++ param->stream); ++ } ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->h_token_num; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->h_token_num; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++ ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++template void ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h +new file mode 100644 +index 0000000..2ae0ad3 +--- /dev/null ++++ b/src/fastertransformer/layers/encoder_layers/encoder.h +@@ -0,0 +1,50 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++typedef struct { ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ size_t ffn_hidden_size; // 4 * param->hidden_size; ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ // ctrls ++ int in_idx; ++ bool qkv_bias; // true ++ bool projection_bias; // true ++ bool is_cross; // false ++ bool position_bias; // false 
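// Control-flow notes: in_idx is a cursor into the flat inputs[] array that forwardEncoder() and
// forward_attn() advance as they consume tensors, which lets the same entry points accept the
// different input packings built in MSELayer::forward() (with/without biases, position bias,
// cross attention). padding_offset is reset to nullptr on every call and only set when
// forwardEncoder() compacts the batch (the eft / "effective transformer" path);
// d_sequence_length points at the per-batch valid lengths computed in the workspace.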
++ bool layernorm_post; // dont care ++ bool eft; // false - effective fast trn ++ int *padding_offset; ++ int *d_sequence_length; ++} encoderParamT; ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++ ++template ++size_t GetAttnWorkspaceSize(encoderParamT* param); ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws); ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++// void forwardEncoder(std::vector > const* ++// inputs); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/CMakeLists.txt b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +new file mode 100644 +index 0000000..36abaf8 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/CMakeLists.txt +@@ -0,0 +1,21 @@ ++# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. ++ ++cmake_minimum_required(VERSION 3.8) ++ ++add_library(MSLayer STATIC MSDecoderLayer.cc MSEncoderLayer.cc MSAttentionLayer.cc decoder.cc encoder.cc ffn.cc gemm.cc attention.cc) ++set_property(TARGET MSLayer PROPERTY POSITION_INDEPENDENT_CODE ON) ++set_property(TARGET MSLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) ++target_link_libraries(MSLayer PUBLIC -lcublas -lcudart unfused_attention_kernels activation_kernels ++ layernorm_kernels add_residual_kernels bert_preprocess_kernels) +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +new file mode 100755 +index 0000000..97daa1b +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.cc +@@ -0,0 +1,171 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSAttentionLayer.h" ++ ++namespace fastertransformer { ++ ++template ++static void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++MSMHALayer::MSMHALayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse, ++ bool is_position_bias): ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse) ++{ ++ cublasHandle_t cublas_handle; ++ cublasCreate(&cublas_handle); ++ cublasSetStream(cublas_handle, stream); ++ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.cublas_handle = cublas_handle; ++ params_.stream = stream; ++ // ctrls ++ params_.in_idx = 0; ++ params_.qkv_bias = !is_position_bias; ++ params_.projection_bias = !is_position_bias; ++ params_.is_cross = is_cross; ++ params_.position_bias = is_position_bias; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++} ++template ++void MSMHALayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetAttnWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++template ++void MSMHALayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const AttentionLayerWeight* attention_weights = dynamic_cast*>(weights); ++ if(attention_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ } ++ allocateBuffer(); // only once ++ if (params_.position_bias) ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)input_tensors->at(2).data, ++ (void*)input_tensors->at(3).data, ++ (void*)attention_weights->attention_output_weight.kernel}; ++ fastertransformer::forward_attn((T**)inputs, 7, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel ++ }; ++ fastertransformer::forward_attn((T**)inputs, 5, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ if (params_.is_cross) { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)input_tensors->at(1).data, ++ 
(void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->key_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(2).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias ++ }; ++ fastertransformer::forward_attn((T**)inputs, 8, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ else { ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)attention_weights->query_weight.kernel, ++ (void*)attention_weights->query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)attention_weights->attention_output_weight.kernel, ++ (void*)attention_weights->attention_output_weight.bias}; ++ fastertransformer::forward_attn((T**)inputs, 6, (T**)outputs, 1, ¶ms_, (void*)buf_); ++ } ++ } ++} ++ ++ template ++ MSMHALayer::~MSMHALayer() ++ { ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++ } ++ ++ template ++ void MSMHALayer::freeBuffer() ++ { ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++ } ++ ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ template class MSMHALayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +new file mode 100755 +index 0000000..e448fb4 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSAttentionLayer.h +@@ -0,0 +1,63 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
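[Editorial sketch, not part of the patch] MSMHALayer::forward above flattens activations and weights into a single void* array whose layout depends on is_cross and position_bias, and forward_attn later consumes that array in order via param->in_idx. Below is a hedged illustration of the caller-side packing for the self-attention path with QKV and projection biases and no position bias (six pointers, in the order used above); DummyWeights and all pointers are placeholders, not the patch's types.

// Sketch of the 6-pointer input layout consumed by forward_attn for
// self-attention with biases and no position bias; order taken from
// MSMHALayer::forward above, names are placeholders.
#include <vector>

struct DummyWeights {            // stand-in for the AttentionLayerWeight fields used below
  void* qkv_kernel; void* qkv_bias;
  void* proj_kernel; void* proj_bias;
};

std::vector<void*> PackSelfAttentionInputs(void* from_tensor, void* attention_mask,
                                           const DummyWeights& w) {
  return {from_tensor,     // input hidden states
          w.qkv_kernel,    // fused QKV weight (query_weight.kernel)
          w.qkv_bias,      // fused QKV bias   (query_weight.bias)
          attention_mask,  // attention mask
          w.proj_kernel,   // output projection weight
          w.proj_bias};    // output projection bias
}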
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSMHALayer: public MSBaseLayer { ++private: ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ T* buf_ = nullptr; ++ attentionParamT params_; ++ ++public: ++ MSMHALayer(size_t batch_size, ++ size_t src_seq_len, ++ size_t tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool is_cross, ++ bool sparse = false, ++ bool is_position_bias=false); ++ MSMHALayer(MSMHALayer const& attention_layer); ++ virtual ~MSMHALayer(); ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSBaseLayer.h b/src/fastertransformer/layers/ms_layers/MSBaseLayer.h +new file mode 100644 +index 0000000..4056480 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSBaseLayer.h +@@ -0,0 +1,76 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include "3rdparty/trt_fused_multihead_attention/fused_multihead_attention_common.h" ++#include "src/fastertransformer/layers/BaseLayer.h" ++#include "src/fastertransformer/utils/Tensor.h" ++#include "src/fastertransformer/utils/allocator.h" ++#include "src/fastertransformer/utils/cublasMMWrapper.h" ++#include "src/fastertransformer/utils/memory_utils.h" ++#include "src/fastertransformer/layers/ms_layers/MSLayerWeight.h" ++ ++namespace fastertransformer { ++ ++enum class MSLayerType { ++ UNFUSED_MS_LAYER, ++ FUSED_MS_LAYER ++}; ++ ++template ++MSLayerType getMSLayerType(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const bool is_fuse = true) { ++ if (std::is_same::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm == kSM_72) ++ && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? 
MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++MSLayerType getMSLayerTypeINT8(size_t size_per_head, const int sm, const bool remove_padding, ++ const int max_seq_len, const int int8_mode) { ++ if ((int8_mode == 1 || int8_mode == 2) && (sm == kSM_86 || sm == kSM_80 || sm == kSM_75) && size_per_head == 64 ++ && max_seq_len <= 384) { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } else { ++ return remove_padding ? MSLayerType::FUSED_MS_LAYER : MSLayerType::FUSED_MS_LAYER; ++ } ++} ++ ++template ++class MSBaseLayer: public BaseLayer { ++ ++public: ++ virtual void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* layer_weights) = 0; ++ MSBaseLayer(cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool sparse = false): ++ BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse) ++ { ++ } ++ virtual ~MSBaseLayer() = default; ++}; ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +new file mode 100644 +index 0000000..2198115 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.cc +@@ -0,0 +1,210 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSDecoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSDLayer::MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.eps3 = eps3; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.in_idx = 0; ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.in_idx = 0; ++ params_.attn1.batch_size = max_batch_size; ++ params_.attn1.src_seq_len = max_src_seq_len; ++ params_.attn1.tgt_seq_len = max_tgt_seq_len; ++ params_.attn1.head_num = head_num; ++ params_.attn1.head_size = size_per_head; ++ params_.attn1.hidden_size = head_num * size_per_head; ++ params_.attn1.qkv_bias = true; ++ params_.attn1.projection_bias = true; ++ params_.attn1.is_cross = false; ++ params_.attn1.position_bias = false; ++ params_.attn1.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn1.cublas_handle = *cublas_handle; ++ params_.attn1.stream = stream; ++ ++ params_.attn2.in_idx = 0; ++ params_.attn2.batch_size = max_batch_size; ++ params_.attn2.src_seq_len = max_src_seq_len; ++ params_.attn2.tgt_seq_len = max_tgt_seq_len; ++ params_.attn2.head_num = head_num; ++ params_.attn2.head_size = size_per_head; ++ params_.attn2.hidden_size = head_num * size_per_head; ++ params_.attn2.qkv_bias = true; ++ params_.attn2.projection_bias = true; ++ params_.attn2.is_cross = true; ++ params_.attn2.position_bias = false; ++ params_.attn2.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.attn2.cublas_handle = *cublas_handle; ++ params_.attn2.stream = stream; ++ ++} ++ ++template ++void MSDLayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetDecoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, buff_size, true)); ++ } ++} ++ ++template ++void MSDLayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSDLayer::~MSDLayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSDLayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ 
const MSLayerWeight* weights) ++{ ++ const DecoderLayerWeight* decoder_weights = dynamic_cast*>(weights); ++ if(weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ // std::cout<qkv_bias<< params_.attn2->qkv_bias<< !params_.attn1->position_bias<< !params_.attn2->position_bias<at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->layernorm1.beta, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)decoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)decoder_weights->layernorm2.beta, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)decoder_weights->cross_attention.query_weight.bias, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->cross_attention.attention_output_weight.bias, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->layernorm3.beta, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_mapping.bias, ++ (void*)decoder_weights->decoder_output_projection.kernel, ++ (void*)decoder_weights->decoder_output_projection.bias}; ++ fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } ++ if (params_.attn1.position_bias && params_.attn2.position_bias) { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)decoder_weights->layernorm1.gamma, ++ (void*)decoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(4).data, ++ (void*)input_tensors->at(1).data, ++ (void*)decoder_weights->attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm2.gamma, ++ (void*)input_tensors->at(2).data, ++ (void*)decoder_weights->cross_attention.query_weight.kernel, ++ (void*)decoder_weights->cross_attention.key_weight.kernel, ++ (void*)input_tensors->at(5).data, ++ (void*)input_tensors->at(3).data, ++ (void*)decoder_weights->cross_attention.attention_output_weight.kernel, ++ (void*)decoder_weights->layernorm3.gamma, ++ (void*)decoder_weights->decoder_output_mapping.kernel, ++ (void*)decoder_weights->decoder_output_projection.kernel}; ++ // fastertransformer::forwardDecoder(inputs, 23, outputs, 1, ¶ms_, buf_); ++ } ++ else{} ++ return; ++} ++ ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++template class MSDLayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +new file mode 100644 +index 0000000..53d7675 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSDecoderLayer.h +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/decoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSDLayer: public MSBaseLayer { ++private: ++ mutable decoderParamT params_; ++ ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ ++public: ++ MSDLayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ float eps3, ++ bool post_layernorm, ++ bool position_bias1, ++ bool position_bias2, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSDLayer(MSDLayer const& decoder_layer); ++ ++ virtual ~MSDLayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +new file mode 100644 +index 0000000..12b4657 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.cc +@@ -0,0 +1,215 @@ ++/* ++ * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#include "src/fastertransformer/layers/ms_layers/MSEncoderLayer.h" ++ ++namespace fastertransformer { ++template ++void printTensor(char* str, T* input, int size) ++{ ++ printf("%s ", str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++template ++MSELayer::MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse): ++ ++ MSBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse), buf_(nullptr) ++{ ++ params_.batch_size = max_batch_size; ++ params_.src_seq_len = max_src_seq_len; ++ params_.tgt_seq_len = max_tgt_seq_len; ++ params_.head_num = head_num; ++ params_.head_size = size_per_head; ++ params_.hidden_size = head_num * size_per_head; ++ params_.ffn_hidden_size = ffn_hidden_size; ++ params_.eps1 = eps1; ++ params_.eps2 = eps2; ++ params_.layernorm_post = post_layernorm; ++ // handle ++ params_.cublas_handle = *cublas_handle; ++ params_.stream = stream; ++ params_.ffn_fp16 = is_ffn_fp16; ++ // ctrls ++ params_.algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP; ++ params_.has_bias=!position_bias; ++ params_.has_beta=!position_bias; ++ params_.attn.in_idx = 0; ++ params_.attn.batch_size = max_batch_size; ++ params_.attn.src_seq_len = max_src_seq_len; ++ params_.attn.tgt_seq_len = max_tgt_seq_len; ++ params_.attn.head_num = head_num; ++ params_.attn.head_size = size_per_head; ++ params_.attn.hidden_size = head_num * size_per_head; ++ params_.attn.qkv_bias = !position_bias; ++ params_.attn.projection_bias = !position_bias; ++ params_.attn.is_cross = false; ++ std::cout<<"position_bias"< ++void MSELayer::allocateBuffer() ++{ ++ if (buf_ == nullptr) { ++ size_t buff_size = fastertransformer::GetEncoderLayerWorkspaceSize(¶ms_); ++ buf_ = reinterpret_cast(allocator_->reMalloc(buf_, sizeof(T) * buff_size, true)); ++ } ++} ++ ++template ++void MSELayer::freeBuffer() ++{ ++ if (buf_ != nullptr) { ++ allocator_->free(buf_); ++ buf_ = nullptr; ++ } ++} ++ ++template ++MSELayer::~MSELayer() ++{ ++ cublas_wrapper_ = nullptr; ++ freeBuffer(); ++} ++ ++template ++void MSELayer::forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) ++{ ++ const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ // EncoderLayerWeight* encoder_weights = dynamic_cast*>(const_cast*>(weights)); ++ // const EncoderLayerWeight* encoder_weights = dynamic_cast*>(weights); ++ if(encoder_weights == NULL){ ++ std::cout<<"cast EncoderLayerWeight not sucsses"; ++ return ;} ++ allocateBuffer(); // only once ++ void* outputs[] = {(void*)output_tensors->at(0).data}; ++ if (!params_.layernorm_post) { ++ if (params_.attn.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, 
++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel ++ ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } ++ else{ ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias}; ++ fastertransformer::forwardEncoder(inputs, 14, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ else { ++ if (params_.attn.position_bias) { ++ void* inputs[] = { ++ (void*)input_tensors->at(0).data, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)input_tensors->at(1).data, ++ (void*)input_tensors->at(2).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->layernorm2.gamma ++ }; ++ forwardEncoder(inputs, 9, outputs, 1, ¶ms_, buf_); ++ } else { ++ void* inputs[] = {(void*)input_tensors->at(0).data, ++ (void*)encoder_weights->attention.query_weight.kernel, ++ (void*)encoder_weights->attention.query_weight.bias, ++ (void*)input_tensors->at(1).data, ++ (void*)encoder_weights->attention.attention_output_weight.kernel, ++ (void*)encoder_weights->attention.attention_output_weight.bias, ++ (void*)encoder_weights->layernorm1.gamma, ++ (void*)encoder_weights->layernorm1.beta, ++ (void*)encoder_weights->encoder_output_mapping.kernel, ++ (void*)encoder_weights->encoder_output_mapping.bias, ++ (void*)encoder_weights->encoder_output_projection.kernel, ++ (void*)encoder_weights->encoder_output_projection.bias, ++ (void*)encoder_weights->layernorm2.gamma, ++ (void*)encoder_weights->layernorm2.beta}; ++ fastertransformer::forwardEncoder(inputs, 3, outputs, 1, ¶ms_, buf_); ++ } ++ } ++ ++ return; ++} ++ ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++template class MSELayer; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +new file mode 100644 +index 0000000..95c598f +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSEncoderLayer.h +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. ++ * Copyright (c) 2021, NAVER Corp. Authored by CLOVA. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++ ++namespace fastertransformer { ++ ++// TODO(haim): Add template according to "mix" compute type (fp32, fp16) ++template ++class MSELayer: public MSBaseLayer { ++private: ++ encoderParamT params_; ++ void allocateBuffer() override; ++ void freeBuffer() override; ++ void* buf_; ++ using MSBaseLayer::is_free_buffer_after_forward_; ++ using MSBaseLayer::is_allocate_buffer_; ++ using MSBaseLayer::cublas_wrapper_; ++ using MSBaseLayer::allocator_; ++ ++protected: ++ using MSBaseLayer::stream_; ++ using MSBaseLayer::sparse_; ++ ++public: ++ MSELayer(size_t max_batch_size, ++ size_t max_src_seq_len, ++ size_t max_tgt_seq_len, ++ size_t head_num, ++ size_t size_per_head, ++ size_t ffn_hidden_size, ++ float eps1, ++ float eps2, ++ bool post_layernorm, ++ bool position_bias, ++ bool is_ffn_fp16, ++ cudaStream_t stream, ++ cublasMMWrapper* cublas_wrapper, ++ cublasHandle_t* cublas_handle, ++ IAllocator* allocator, ++ bool is_free_buffer_after_forward, ++ bool is_qk_buf_float, ++ bool sparse); ++ ++ MSELayer(MSELayer const& encoder_layer); ++ ++ virtual ~MSELayer(); ++ ++ void forward(std::vector* output_tensors, ++ const std::vector* input_tensors, ++ const MSLayerWeight* weights) override; ++}; ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/MSLayerWeight.h b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +new file mode 100644 +index 0000000..8915136 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/MSLayerWeight.h +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ */ ++ ++#pragma once ++ ++#include "src/fastertransformer/layers/DenseWeight.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++namespace fastertransformer { ++ ++template ++ struct MSLayerWeight{ ++ virtual ~MSLayerWeight() {} ++}; ++template ++struct AttentionLayerWeight:MSLayerWeight{ ++ DenseWeight query_weight; ++ DenseWeight key_weight; ++ DenseWeight value_weight; ++ DenseWeight attention_output_weight; ++}; ++template ++struct DecoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ AttentionLayerWeight cross_attention; ++ // DenseWeight attention_qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ // DenseWeight attention_cross_q_weight; ++ // DenseWeight attention_cross_kv_weight; ++ // DenseWeight attention_cross_layer_output_weight; ++ DenseWeight decoder_output_mapping; ++ DenseWeight decoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++ LayerNormWeight layernorm3; ++}; ++ ++template ++struct EncoderLayerWeight:MSLayerWeight{ ++ AttentionLayerWeight attention; ++ // DenseWeight qkv_weight; ++ // DenseWeight attention_layer_output_weight; ++ DenseWeight encoder_output_mapping; ++ DenseWeight encoder_output_projection; ++ LayerNormWeight layernorm1; ++ LayerNormWeight layernorm2; ++}; ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.cc b/src/fastertransformer/layers/ms_layers/attention.cc +new file mode 100644 +index 0000000..40e8d6e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.cc +@@ -0,0 +1,300 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param) ++{ ++ size_t size_q = UP_DIV((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t 
qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ size_t attn_out_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ return (qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len + 2 * attn_out_size) * sizeof(T); ++ ++} ++ ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++template size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ auto extra_tmp_size = ++ UP_DIV(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_q = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_k = UP_DIV(param->batch_size * param->tgt_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t size_v = size_k; ++ ++ size_t qkv_len = size_q + size_k + size_v; ++ size_t q_buf_2_len = size_q; ++ size_t qk_buf_len = ++ UP_DIV(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_2_len = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t qkv_buf_3_len = qkv_buf_2_len; ++ auto buff_size = qkv_len + q_buf_2_len + qk_buf_len + qkv_buf_2_len + qkv_buf_3_len; ++ T* qkv_buf = (T*)ws; ++ T* q_buf_2 = static_cast(qkv_buf) + qkv_len; ++ T* qk_buf = static_cast(q_buf_2) + q_buf_2_len; ++ T* qkv_buf_2 = static_cast(qk_buf) + qk_buf_len; ++ T* qkv_buf_3 = static_cast(qkv_buf_2) + qkv_buf_2_len; ++ T* output1 = static_cast(ws) + buff_size; ++ T* output2 = static_cast(output1) + extra_tmp_size; ++ int gemm_dims[] = { ++ 3 * (int)param->hidden_size, (int)param->batch_size * (int)param->src_seq_len, (int)param->hidden_size}; ++ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if (std::is_same::value) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ T alpha = 1.0f; ++ T beta = 0.0f; ++ ++ if (param->is_cross) { ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); ++ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_q, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ gemm_dims[0] = 2 * param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->tgt_seq_len; ++ gemm_lds[0] = 2 * param->hidden_size; ++ gemm_lds[2] = 2 * param->hidden_size; ++ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_kv, ++ encoder_output, ++ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, 
++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, ++ output1, ++ output2, ++ qkv_buf, ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ } ++ else { ++ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(weight_qkv, ++ from_tensor, ++ qkv_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ fastertransformer::invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), ++ static_cast(output1), ++ static_cast(output2), ++ static_cast(qkv_buf), ++ bias_qkv, ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ 0, ++ param->stream); ++ } ++ gemm_ops[0] = CUBLAS_OP_T; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->head_size; ++ gemm_lds[2] = param->tgt_seq_len; ++ ++ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->head_size), ++ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ ++ gemm_dims[0] = param->tgt_seq_len; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->head_size; ++ ++ fastertransformer::CublasGemmStridedBatchedWrapper(output1, ++ q_buf_2, ++ qk_buf, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); ++ T* position_bias = (param->position_bias) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); ++ fastertransformer::invokeMixMaskedSoftMax(static_cast(qk_buf), ++ attention_mask, ++ position_bias, ++ param->batch_size, ++ param->src_seq_len, ++ param->tgt_seq_len, ++ param->head_num, ++ scalar, ++ param->stream); ++ ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->head_size; ++ gemm_dims[1] = param->src_seq_len; ++ gemm_dims[2] = param->tgt_seq_len; ++ ++ gemm_lds[0] = param->head_size; ++ gemm_lds[1] = param->tgt_seq_len; ++ gemm_lds[2] = param->head_size; ++ ++ gemm_strides[0] = param->tgt_seq_len * param->head_size; ++ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; ++ gemm_strides[2] = param->src_seq_len * param->head_size; ++ fastertransformer::CublasGemmStridedBatchedWrapper(output2, ++ qk_buf, ++ qkv_buf_2, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_strides, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->batch_size * param->head_num, ++ param->cublas_handle, ++ param->algo); ++ ++ invokeTransposeQKV(static_cast(qkv_buf_3), ++ static_cast(qkv_buf_2), ++ param->batch_size, ++ param->src_seq_len, ++ param->head_num, ++ param->head_size, ++ param->stream); ++ gemm_ops[0] = CUBLAS_OP_N; ++ gemm_ops[1] = CUBLAS_OP_N; ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = param->batch_size * param->src_seq_len; ++ gemm_dims[2] = param->hidden_size; ++ ++ gemm_lds[0] = param->hidden_size; ++ gemm_lds[1] = param->hidden_size; ++ gemm_lds[2] = param->hidden_size; ++ fastertransformer::CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), ++ qkv_buf_3, ++ static_cast(output[0]), ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ const_cast(gemm_data_types), ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ ++ if (param->projection_bias) { ++ int len = param->batch_size * param->src_seq_len; ++ invokeAddBias( ++ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); ++ } ++ return; ++} ++ ++template void ++forward_attn(float* inputs[], int in_len, float* output[], int out_len, attentionParamT* param, void* ws); ++template void ++forward_attn(half* inputs[], int in_len, half* output[], int out_len, attentionParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/attention.h b/src/fastertransformer/layers/ms_layers/attention.h +new file mode 100644 +index 0000000..04623a1 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/attention.h +@@ -0,0 +1,19 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++ ++#include ++#include ++ ++namespace fastertransformer { ++ ++ ++template ++size_t GetAttnWorkspaceSize(attentionParamT* param); ++ ++template ++void forward_attn(T* inputs[], int in_len, T* output[], int out_len, attentionParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.cc b/src/fastertransformer/layers/ms_layers/decoder.cc +new file mode 100644 +index 0000000..3ee6389 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.cc +@@ -0,0 +1,216 @@ ++ ++#include "src/fastertransformer/layers/decoder_layers/decoder.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" 
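[Editorial sketch, not part of the patch] The decoder.cc added below sizes every scratch buffer with UP_DIV(x, ALIGN_SIZE) * ALIGN_SIZE, i.e. each logical buffer is rounded up to a multiple of 16 elements and the aligned lengths are then summed into one workspace allocation that forwardDecoder slices into sub-buffers. A self-contained illustration of that sizing idiom follows; only UP_DIV and the 16-element alignment come from the patch, the parameter values are hypothetical.

// Sketch of the alignment idiom used by GetDecoderLayerWorkspaceSize /
// GetEncoderLayerWorkspaceSize: round each logical buffer up to a multiple
// of ALIGN_SIZE elements, then sum the aligned lengths.
#include <cstddef>
#include <cstdio>

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
constexpr size_t kAlignSize = 16;

static size_t AlignedLen(size_t elems) { return UP_DIV(elems, kAlignSize) * kAlignSize; }

int main() {
  // Example parameters only: batch 8, sequence 128, hidden 768, FFN 3072.
  size_t batch = 8, seq = 128, hidden = 768, ffn_hidden = 3072;
  size_t attn_out = AlignedLen(batch * seq * hidden);      // attention output buffer
  size_t ffn_buf  = AlignedLen(batch * seq * ffn_hidden);  // FFN intermediate buffer
  size_t total_elems = attn_out + ffn_buf;
  std::printf("workspace elements: %zu (float bytes: %zu)\n",
              total_elems, total_elems * sizeof(float));
  return 0;
}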
++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++ ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; + } -+ cublasGemmEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ b_addr, -+ type_b, -+ ldb, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ compute_type, -+ algo); ++ ++ std::cout << std::endl; ++ ++ free(input_host); +} + -+void CublasGemmStridedBatchedWrapper(const void* a_addr, -+ const void* b_addr, -+ void* c_addr, -+ const int* params, -+ const int* lds, -+ const cublasOperation_t* operations, -+ const int* strides, -+ const cudaDataType* data_types, -+ void* alpha, -+ void* beta, -+ int batch, -+ cublasHandle_t cublas_handle, -+ cublasGemmAlgo_t algo) ++template ++void isNan(char* str, T* input, int size) +{ -+ const int m = params[0]; -+ const int n = params[1]; -+ const int k = params[2]; -+ cublasOperation_t trans_a = operations[0]; -+ cublasOperation_t trans_b = operations[1]; -+ const int lda = lds[0]; -+ const int ldb = lds[1]; -+ const int ldc = lds[2]; -+ cudaDataType type_a = data_types[0]; -+ cudaDataType type_b = data_types[1]; -+ cudaDataType type_c = data_types[2]; -+ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; -+ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); + -+ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { -+ compute_type = CUBLAS_COMPUTE_16F; ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } + } -+ const int stride_a = strides[0]; -+ const int stride_b = strides[1]; -+ const int stride_c = strides[2]; -+ cublasGemmStridedBatchedEx(cublas_handle, -+ trans_a, -+ trans_b, -+ m, -+ n, -+ k, -+ alpha, -+ a_addr, -+ type_a, -+ lda, -+ stride_a, -+ b_addr, -+ type_b, -+ ldb, -+ stride_b, -+ beta, -+ c_addr, -+ type_c, -+ ldc, -+ stride_c, -+ batch, -+ compute_type, -+ algo); ++ ++ std::cout << std::endl; ++ free(input_host); +} + +template -+size_t GetAttnWorkspaceSize(encoderParamT* param) ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param) +{ -+ size_t size_q = ALIGN((param->batch_size * param->src_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_k = ALIGN((param->batch_size * param->tgt_seq_len * param->hidden_size), ALIGN_SIZE); -+ size_t size_v = size_k; -+ size_t qkv_len = size_q + size_k + size_v; -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t 
qkv_buf_2_len = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t attn_out_size = -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ return (qkv_buf_2_len + 2 * attn_out_size + std::max(qkv_len, qk_buf_len)) * sizeof(T); ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t attn2_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out + attn2_out : attn_out * 2 + attn2_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn1)) * 2, ffn_size * sizeof(T)) + out_size * sizeof(T)+ fastertransformer::GetAttnWorkspaceSize(&(param->attn1))*4); ++} ++ ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++template size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws) ++{ ++ param->in_idx = 0; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); ++ T* attn_out = reinterpret_cast(ws); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn_ws = reinterpret_cast(normed_from_tensor) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn_out = normed_from_tensor; ++ T* attn2_out = reinterpret_cast(attn_ws) + fastertransformer::GetAttnWorkspaceSize(&(param->attn1)); ++ T* normed_from_tensor2 = reinterpret_cast(attn2_out) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* attn2_ws = reinterpret_cast(normed_from_tensor2) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* normed_attn2_out = normed_from_tensor2; ++ T* ffn_ws = attn2_ws + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ T* tmp_out = reinterpret_cast(output[0]); ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ } ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ inputs[--param->in_idx] = normed_from_tensor; ++ // if attention is embedded inside an decoder - fuse the bias to next layer normalization ++ int in_idx = param->in_idx; ++ bool projection_bias = param->attn1.projection_bias; ++ param->attn1.projection_bias=false; ++ ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn1), attn_ws); ++ param->attn1.projection_bias = projection_bias; ++ param->in_idx = param->attn1.in_idx + in_idx; ++ if (param->attn1.projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* 
gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); ++ from_tensor = param->layernorm_post ? normed_from_tensor : from_tensor; ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn_out; ++ in_idx = param->in_idx; ++ projection_bias = param->attn2.projection_bias; ++ param->attn2.projection_bias=false; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn2_out, 1, &(param->attn2), attn2_ws); ++ ++ param->attn2.projection_bias = projection_bias; ++ param->in_idx = param->attn2.in_idx + in_idx; ++ if (param->attn2.projection_bias) { ++ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = reinterpret_cast(inputs[param->in_idx++]); ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn2_out, ++ normed_attn2_out, ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ ++ ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn2_out, ++ reinterpret_cast(normed_attn2_out), ++ attn_out, ++ gamma3, // gamma ++ beta3, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps3); ++ } ++ } else { ++ // without projection bias ++ } ++ inputs[--param->in_idx] = normed_attn2_out; ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ ++ } else { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ } ++ attn2_out = param->layernorm_post ? 
normed_attn2_out : attn2_out; ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidual(reinterpret_cast(tmp_out), ++ attn2_out, ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } else { ++ if(param->layernorm_post){ ++ invokeAddBiasResidualSameTypeCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } else{ ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn2_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ ++ } ++ } ++ return; ++} ++ ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++template void ++forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/decoder.h b/src/fastertransformer/layers/ms_layers/decoder.h +new file mode 100644 +index 0000000..7c2ea9e +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/decoder.h +@@ -0,0 +1,17 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ ++#include "src/fastertransformer/layers/decoder_layers/BaseDecoderLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++template ++size_t GetDecoderLayerWorkspaceSize(decoderParamT* param); ++ ++template ++void forwardDecoder(void* inputs[], int in_len, void* output[], int out_len, decoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.cc b/src/fastertransformer/layers/ms_layers/encoder.cc +new file mode 100644 +index 0000000..6e7f546 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.cc +@@ -0,0 +1,235 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/encoder.h" ++#include "src/fastertransformer/layers/ms_layers/attention.h" ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) ++// #define UP_DIV(x, y) (x) ++#define ALIGN_SIZE 16 ++ ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ } ++ ++ std::cout << std::endl; ++ ++ free(input_host); ++} ++ ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); ++ ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); ++ ++ for (int k = 0; k < (int)size; k++) { ++ if 
(std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } ++ } ++ ++ std::cout << std::endl; ++ free(input_host); +} + -+template size_t GetAttnWorkspaceSize(encoderParamT* param); -+template size_t GetAttnWorkspaceSize(encoderParamT* param); +template +size_t GetEncoderLayerWorkspaceSize(encoderParamT* param) +{ -+ size_t max_hidden = ALIGN(std::max(param->hidden_size, param->ffn_hidden_size),ALIGN_SIZE); -+ size_t compress_buffer_len = ALIGN(param->batch_size * param->src_seq_len * max_hidden,ALIGN_SIZE); -+ size_t padding_len = ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE); -+ size_t offset_len = ALIGN(param->batch_size,ALIGN_SIZE); -+ size_t d_token_len = ALIGN(1,ALIGN_SIZE); -+ size_t eft_size = compress_buffer_len * sizeof(T) + (padding_len + offset_len) * sizeof(int) + d_token_len * sizeof(size_t); -+ size_t attn_out = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t ffn = ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ return (std::max(GetAttnWorkspaceSize(param), ffn * sizeof(T)) + (attn_out * 3) * sizeof(T)) + eft_size; ++ size_t attn_out = UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE;; ++ size_t ffn = UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ size_t ffn_size = (param->layernorm_post) ? ffn : (attn_out + ffn); ++ size_t out_size = (param->layernorm_post) ? attn_out : attn_out * 2; ++ return (std::max(fastertransformer::GetAttnWorkspaceSize(&(param->attn)), ffn_size * sizeof(T)) + out_size * sizeof(T)); +} + +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); +template size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); + -+template -+void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) -+{ -+ size_t inter_size = param->ffn_hidden_size; -+ size_t h_token_num = param->h_token_num; -+ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; -+ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; -+ if ((std::is_same::value) || (std::is_same::value)) { -+ gemm_data_types[0] = CUDA_R_16F; -+ gemm_data_types[1] = CUDA_R_16F; -+ gemm_data_types[2] = CUDA_R_16F; -+ } -+ S alpha = 1.0f; -+ S beta = 0.0f; -+ -+ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; -+ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; -+ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); -+ CublasGemmWrapper(inputs[param->in_idx++], -+ normed_attn_out, -+ ws, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+ invokeAddBiasGelu(reinterpret_cast(ws), -+ reinterpret_cast(inputs[param->in_idx++]), -+ h_token_num, -+ inter_size, -+ param->stream); -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = h_token_num; -+ gemm_dims[2] = inter_size; -+ gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = inter_size; -+ gemm_lds[2] = param->hidden_size; -+ CublasGemmWrapper(inputs[param->in_idx++], -+ ws, -+ output[0], -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+} -+ +template +void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws) +{ ++ // 
std::cout<has_bias<has_beta<attn.position_bias<attn.projection_bias<in_idx = 0; + size_t h_token_num = param->batch_size * param->src_seq_len; -+ param->h_token_num = h_token_num; -+ param->padding_offset = nullptr; -+ int* d_sequence_lengths = nullptr; -+ T* input_tensor = reinterpret_cast(inputs[param->in_idx++]); -+ T* from_tensor = input_tensor; -+ T* compress_buffer; -+ compress_buffer = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(h_token_num * param->hidden_size,ALIGN_SIZE)); -+ int* padding_offset = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len,ALIGN_SIZE)); -+ d_sequence_lengths = reinterpret_cast(ws); -+ param->d_sequence_length = d_sequence_lengths; -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(param->batch_size,ALIGN_SIZE)); -+ size_t* d_token_num = reinterpret_cast(ws); -+ ws = reinterpret_cast(reinterpret_cast(ws) + ALIGN(1,ALIGN_SIZE)); -+ invokeBuildSequnceLength( -+ from_tensor, param->batch_size, d_sequence_lengths, param->src_seq_len, param->hidden_size, param->stream); -+ // printTensor("seq_len=",d_sequence_lengths,param->batch_size); -+ invokeGetPaddingOffset(&h_token_num, -+ d_token_num, -+ padding_offset, -+ d_sequence_lengths, -+ param->batch_size, -+ param->src_seq_len, -+ param->stream); -+ // std::cout << "token=" << h_token_num << "m=" << param->batch_size * param->src_seq_len << std::endl; -+ if (h_token_num * 2 <= param->batch_size * param->src_seq_len) { -+ param->eft = true; -+ invokeRemovePadding(compress_buffer, -+ (const T*)from_tensor, -+ padding_offset, -+ h_token_num, -+ param->head_num * param->head_size, -+ param->stream); -+ param->h_token_num = h_token_num; -+ param->padding_offset = padding_offset; -+ from_tensor = compress_buffer; -+ } -+ h_token_num = param->h_token_num; ++ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); + T* attn_out = reinterpret_cast(ws); -+ T* normed_from_tensor = -+ reinterpret_cast(ws) + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* normed_from_tensor = reinterpret_cast(ws) + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* attn_ws_offset = (param->layernorm_post) ? 
reinterpret_cast(ws) : reinterpret_cast(normed_from_tensor); -+ T* attn_ws = attn_ws_offset + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); ++ T* attn_ws = attn_ws_offset + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + T* normed_attn_out = normed_from_tensor; -+ T* ffn_ws = normed_attn_out + ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ ++ T* ffn_ws = normed_attn_out + UP_DIV(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE) * ALIGN_SIZE; ++ + T* tmp_out = reinterpret_cast(output[0]); -+ if (param->padding_offset != nullptr || (std::is_same::value && param->ffn_fp16 == true)) { -+ tmp_out = ffn_ws + ALIGN(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE); -+ } -+ T* tmp_out1 = reinterpret_cast(output[0]); -+ T* out_buf = tmp_out; -+ if (param->padding_offset != nullptr) { -+ tmp_out1 = compress_buffer; ++ if (std::is_same::value && param->ffn_fp16==true) { ++ tmp_out = ffn_ws + UP_DIV(param->batch_size * param->src_seq_len * param->ffn_hidden_size, ALIGN_SIZE) * ALIGN_SIZE; + } ++ + if (param->layernorm_post == false) { -+ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); -+ T* beta1 = reinterpret_cast(inputs[param->in_idx++]); -+ -+ invokeGeneralLayerNorm(normed_from_tensor, -+ reinterpret_cast(from_tensor), // from tensor -+ gamma1, // Gamma -+ beta1, // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); -+ } -+ else { ++ T* gamma1 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta1 = (param->has_beta) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeGeneralLayerNorm(normed_from_tensor, ++ reinterpret_cast(from_tensor), // from tensor ++ gamma1, // Gamma ++ beta1, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); ++ } else { + normed_from_tensor = from_tensor; + } ++ + inputs[--param->in_idx] = normed_from_tensor; + // if attention is embedded inside an encoder - fuse the bias to next layer normalization -+ bool projection_bias = param->projection_bias; -+ param->projection_bias = false; + int in_idx = param->in_idx; -+ forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, param, attn_ws); -+ param->in_idx += in_idx; -+ param->projection_bias = projection_bias; -+ if (param->projection_bias) { -+ T* projection_bias = reinterpret_cast(inputs[param->in_idx++]); -+ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); -+ T* beta2 = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->layernorm_post == false) { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, -+ normed_attn_out, -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { -+ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, -+ reinterpret_cast(normed_attn_out), -+ from_tensor, -+ gamma2, // gamma -+ beta2, // beta -+ projection_bias, -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } ++ bool is_projection_bias = param->attn.projection_bias; ++ param->attn.projection_bias = false; ++ fastertransformer::forward_attn(reinterpret_cast(&inputs[param->in_idx]), in_len, &attn_out, 1, &(param->attn), attn_ws); ++ param->attn.projection_bias = is_projection_bias; ++ param->in_idx = param->attn.in_idx + in_idx; ++ // std::cout<<"index: 
"<in_idx<attn.projection_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ T* gamma2 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta2 = (param->has_beta) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ // std::cout<<"index: "<in_idx<layernorm_post == false) { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeGeneralAddBiasResidualPreLayerNorm(attn_out, ++ normed_attn_out, ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ } else { ++ invokeGeneralAddBiasResidualPreLayerNormCast(attn_out, ++ reinterpret_cast(normed_attn_out), ++ from_tensor, ++ gamma2, // gamma ++ beta2, // beta ++ projection_bias, ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { -+ invokeAddBiasResidualLayerNorm(attn_out, -+ from_tensor, -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { ++ invokeAddBiasResidualLayerNorm( ++ attn_out, ++ from_tensor, ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); + normed_attn_out = attn_out; -+ } -+ else { -+ invokeAddBiasResidualLayerNormCast(reinterpret_cast(attn_out), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(from_tensor), -+ projection_bias, -+ gamma2, // gamma -+ beta2, // beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps1); ++ } else { ++ invokeAddBiasResidualLayerNormCast( ++ reinterpret_cast(attn_out), ++ reinterpret_cast(normed_attn_out), ++ reinterpret_cast(from_tensor), ++ projection_bias, ++ gamma2, // gamma ++ beta2, // beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps1); + // isNan((char*)"LN 1 model", (half*)attn_out, h_token_num * param->hidden_size); -+ } + } + } -+ else { -+ // without projection bias -+ } -+ // forward ffn ++ // forward ffn + // simulate attention inputs + inputs[--param->in_idx] = normed_attn_out; -+ if (param->ffn_fp16 == false) { -+ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); -+ } -+ else { -+ forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ if (param->ffn_fp16==false) { ++ fastertransformer::forward_ffn(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); ++ // std::cout<<"index: "<in_idx<(reinterpret_cast(inputs), in_len, &tmp_out, 1, param, ffn_ws); + } ++ T* ffn_bias = (param->attn.projection_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ // std::cout<<"index: "<in_idx<layernorm_post == true) { -+ if (std::is_same::value || param->ffn_fp16 == false) { ++ T* gamma3 = reinterpret_cast(inputs[param->in_idx++]); ++ T* beta3 = (param->has_beta) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ if (std::is_same::value || param->ffn_fp16==false) { + invokeAddBiasResidualLayerNorm(reinterpret_cast(tmp_out), -+ attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ } -+ else { ++ attn_out, ++ ffn_bias, // FFN bias, ++ gamma3, // Gamma ++ beta3, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); ++ ++ } else { + invokeAddBiasResidualLayerNormCast( -+ reinterpret_cast(tmp_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(normed_attn_out), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias, -+ reinterpret_cast(inputs[param->in_idx++]), // Gamma -+ reinterpret_cast(inputs[param->in_idx++]), // Beta -+ h_token_num, -+ param->hidden_size, -+ param->stream, -+ param->eps2); -+ out_buf = tmp_out1; ++ reinterpret_cast(tmp_out), ++ reinterpret_cast(output[0]), ++ reinterpret_cast(normed_attn_out), ++ ffn_bias, // FFN bias, ++ gamma3, // Gamma ++ beta3, // Beta ++ h_token_num, ++ param->hidden_size, ++ param->stream, ++ param->eps2); + } -+ } -+ else { -+ if (std::is_same::value || param->ffn_fp16 == false) { ++ } else { ++ if (std::is_same::value || param->ffn_fp16==false) { + invokeAddBiasResidual(reinterpret_cast(tmp_out), + attn_out, -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias ++ ffn_bias, // FFN bias + h_token_num, + param->hidden_size, + param->stream); + } + else { -+ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), -+ reinterpret_cast(attn_out), -+ reinterpret_cast(tmp_out1), -+ reinterpret_cast(inputs[param->in_idx++]), // FFN bias -+ h_token_num, -+ param->hidden_size, -+ param->stream); -+ } -+ } -+ if (param->padding_offset != nullptr) { -+ cudaMemsetAsync(output[0], -+ 0, -+ param->batch_size * param->src_seq_len * param->head_size * param->head_num * sizeof(T), -+ param->stream); -+ invokeRebuildPadding( -+ (T*)output[0], out_buf, param->padding_offset, h_token_num, param->hidden_size, param->stream); ++ invokeAddBiasResidualCast(reinterpret_cast(tmp_out), ++ reinterpret_cast(attn_out), ++ reinterpret_cast(output[0]), ++ ffn_bias, // FFN bias ++ h_token_num, ++ param->hidden_size, ++ param->stream); ++ } + } ++ + return; +} + @@ -6805,323 +10409,373 @@ index 0000000..004718e +forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); +template void +forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/encoder.h b/src/fastertransformer/layers/ms_layers/encoder.h +new file mode 100644 +index 0000000..bdfa7be +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/encoder.h +@@ -0,0 +1,16 @@ ++#pragma once + -+template -+void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws) -+{ -+ param->in_idx = 0; -+ auto extra_tmp_size = -+ ALIGN(param->batch_size * param->head_num * param->head_size * param->tgt_seq_len, ALIGN_SIZE); -+ size_t size_q = ALIGN(param->batch_size * param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ size_t q_buf_2_len = size_q; -+ size_t qk_buf_len = -+ ALIGN(param->batch_size * param->head_num * param->src_seq_len * param->tgt_seq_len, ALIGN_SIZE); -+ size_t qkv_buf_2_len = ALIGN(param->batch_size * 
param->src_seq_len * param->hidden_size, ALIGN_SIZE); -+ T* q_buf_2 = (T*)ws; -+ T* output1 = static_cast(ws) + q_buf_2_len; -+ T* output2 = static_cast(output1) + extra_tmp_size; -+ T* qkv_buf = static_cast(output2) + extra_tmp_size; -+ T* qk_buf = qkv_buf; -+ T* qkv_buf_2 = q_buf_2; -+ T* qkv_buf_3 = qk_buf; -+ int gemm_dims[] = {3 * (int)param->hidden_size, (int)param->h_token_num, (int)param->hidden_size}; -+ int gemm_lds[] = {3 * (int)param->hidden_size, (int)param->hidden_size, 3 * (int)param->hidden_size}; -+ T* from_tensor = reinterpret_cast(inputs[param->in_idx++]); -+ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; -+ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; -+ if (std::is_same::value) { -+ gemm_data_types[0] = CUDA_R_16F; -+ gemm_data_types[1] = CUDA_R_16F; -+ gemm_data_types[2] = CUDA_R_16F; -+ } -+ T alpha = 1.0f; -+ T beta = 0.0f; -+ -+ if (param->is_cross) { -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = param->batch_size * param->src_seq_len; -+ gemm_dims[2] = param->hidden_size; -+ gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = param->hidden_size; -+ gemm_lds[2] = param->hidden_size; -+ T* encoder_output = reinterpret_cast(inputs[param->in_idx++]); -+ T* weight_q = reinterpret_cast(inputs[param->in_idx++]); ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++#include ++#include + -+ CublasGemmWrapper(weight_q, -+ from_tensor, -+ qkv_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); -+ gemm_dims[0] = 2 * param->hidden_size; -+ gemm_dims[1] = param->batch_size * param->tgt_seq_len; -+ gemm_lds[0] = 2 * param->hidden_size; -+ gemm_lds[2] = 2 * param->hidden_size; -+ T* weight_kv = reinterpret_cast(inputs[param->in_idx++]); ++namespace fastertransformer { + -+ CublasGemmWrapper(weight_kv, -+ encoder_output, -+ qkv_buf + (param->batch_size * param->src_seq_len) * param->hidden_size, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_data_types, -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); ++template ++size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); + -+ T* bias_qkv = (param->qkv_bias) ? reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ invokeCrossAddFusedQKVBiasTranspose(q_buf_2, -+ output1, -+ output2, -+ qkv_buf, -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); -+ } -+ else { -+ T* weight_qkv = reinterpret_cast(inputs[param->in_idx++]); -+ CublasGemmWrapper(weight_qkv, -+ from_tensor, -+ qkv_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->cublas_handle, -+ param->algo); ++template ++void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/ffn.cc b/src/fastertransformer/layers/ms_layers/ffn.cc +new file mode 100644 +index 0000000..9dc7f04 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/ffn.cc +@@ -0,0 +1,114 @@ + -+ T* bias_qkv = (param->qkv_bias) ? 
reinterpret_cast(inputs[param->in_idx++]) : nullptr; -+ if (param->padding_offset == nullptr) { -+ invokeAddFusedQKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ 0, -+ param->stream); -+ } -+ else { -+ invokeAddFusedZP_QKVBiasTranspose(static_cast(q_buf_2), -+ static_cast(output1), -+ static_cast(output2), -+ static_cast(qkv_buf), -+ bias_qkv, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->h_token_num, -+ param->padding_offset, -+ param->stream); -+ } -+ } -+ gemm_ops[0] = CUBLAS_OP_T; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->tgt_seq_len; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->head_size; ++#include "src/fastertransformer/layers/ms_layers/ffn.h" ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/add_residual_kernels.h" ++#include "src/fastertransformer/kernels/layernorm_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { + -+ gemm_lds[0] = param->head_size; -+ gemm_lds[1] = param->head_size; -+ gemm_lds[2] = param->tgt_seq_len; ++template ++void printTensor(char* str, T* input, int size) { ++ printf("%s ",str); ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * sizeof(T)); + -+ int gemm_strides[] = {(int)(param->tgt_seq_len * param->head_size), -+ (int)(param->src_seq_len * param->head_size), -+ (int)(param->src_seq_len * param->tgt_seq_len)}; ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); + -+ CublasGemmStridedBatchedWrapper(output1, -+ q_buf_2, -+ qk_buf, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_strides, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->batch_size * param->head_num, -+ param->cublas_handle, -+ param->algo); ++ for (int k = 0; k < (int)size; k++) { + -+ T* attention_mask = reinterpret_cast(inputs[param->in_idx++]); -+ if (param->padding_offset != nullptr) -+ invokeBuildEncoderAttentionMask( -+ attention_mask, param->d_sequence_length, param->batch_size, param->src_seq_len, param->stream); -+ T* position_bias = nullptr; -+ if (param->position_bias) { -+ position_bias = reinterpret_cast(inputs[param->in_idx++]); ++ std::cout << input_host[k] << ","; ++ if (k % 10 == 0) ++ std::cout << std::endl; ++ if (k % 10 == 0) ++ std::cout << std::endl; + } -+ T scalar = static_cast(1.0f / sqrtf(param->head_size * 1.0f)); -+ invokeMixMaskedSoftMax(static_cast(qk_buf), -+ attention_mask, -+ position_bias, -+ param->batch_size, -+ param->src_seq_len, -+ param->tgt_seq_len, -+ param->head_num, -+ scalar, -+ param->stream); + -+ gemm_ops[0] = CUBLAS_OP_N; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->head_size; -+ gemm_dims[1] = param->src_seq_len; -+ gemm_dims[2] = param->tgt_seq_len; ++ std::cout << std::endl; + -+ gemm_lds[0] = param->head_size; -+ gemm_lds[1] = param->tgt_seq_len; -+ gemm_lds[2] = param->head_size; ++ free(input_host); ++} + -+ gemm_strides[0] = param->tgt_seq_len * param->head_size; -+ gemm_strides[1] = param->src_seq_len * param->tgt_seq_len; -+ gemm_strides[2] = param->src_seq_len * param->head_size; ++template ++void isNan(char* str, T* input, int size) ++{ ++ std::cout << str << " " << " size is " << size; ++ T* input_device = input; ++ T* input_host = (T*)malloc(size * 
sizeof(T)); + -+ CublasGemmStridedBatchedWrapper(output2, -+ qk_buf, -+ qkv_buf_2, -+ gemm_dims, -+ gemm_lds, -+ gemm_ops, -+ gemm_strides, -+ const_cast(gemm_data_types), -+ &alpha, -+ &beta, -+ param->batch_size * param->head_num, -+ param->cublas_handle, -+ param->algo); ++ fastertransformer::cudaD2Hcpy(input_host, input_device, size); + -+ if (param->padding_offset == nullptr) { -+ invokeTransposeQKV(static_cast(qkv_buf_3), -+ static_cast(qkv_buf_2), -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->stream); -+ } -+ else { -+ invokeTransposeAttentionOutRemovePadding(qkv_buf_2, -+ qkv_buf_3, -+ param->h_token_num, -+ param->batch_size, -+ param->src_seq_len, -+ param->head_num, -+ param->head_size, -+ param->padding_offset, -+ param->stream); ++ for (int k = 0; k < (int)size; k++) { ++ if (std::isnan((float)input_host[k]) || std ::isinf((float)input_host[k])) { ++ std::cout << "found NAN or INF"; ++ break; ++ } + } -+ gemm_ops[0] = CUBLAS_OP_N; -+ gemm_ops[1] = CUBLAS_OP_N; -+ gemm_dims[0] = param->hidden_size; -+ gemm_dims[1] = param->h_token_num; -+ gemm_dims[2] = param->hidden_size; + ++ std::cout << std::endl; ++ free(input_host); ++} ++ ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws) ++{ ++ size_t inter_size = param->ffn_hidden_size; ++ size_t h_token_num = param->batch_size * param->src_seq_len; ++ cublasOperation_t gemm_ops[] = {CUBLAS_OP_N, CUBLAS_OP_N}; ++ cudaDataType gemm_data_types[] = {CUDA_R_32F, CUDA_R_32F, CUDA_R_32F}; ++ if ((std::is_same::value) || (std::is_same::value)) { ++ gemm_data_types[0] = CUDA_R_16F; ++ gemm_data_types[1] = CUDA_R_16F; ++ gemm_data_types[2] = CUDA_R_16F; ++ } ++ S alpha = 1.0f; ++ S beta = 0.0f; ++ ++ int gemm_dims[] = {(int)inter_size, (int)h_token_num, (int)param->hidden_size}; ++ int gemm_lds[] = {(int)inter_size, (int)param->hidden_size, (int)inter_size}; ++ T* normed_attn_out = reinterpret_cast(inputs[param->in_idx++]); ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ normed_attn_out, ++ ws, ++ gemm_dims, ++ gemm_lds, ++ gemm_ops, ++ gemm_data_types, ++ &alpha, ++ &beta, ++ param->cublas_handle, ++ param->algo); ++ S* bias = (param->has_bias) ? 
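The `printTensor`/`isNan` helpers above are debug aids that copy a device buffer back to the host and scan it. A hedged equivalent using only the plain CUDA runtime (no `cudaD2Hcpy` wrapper), shown for `float`; it synchronizes the stream, so it is only suitable for debugging builds:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Debug-only: copy a device buffer to the host and report the first NaN/Inf.
inline bool HasNanOrInf(const char* tag, const float* device_ptr, std::size_t count,
                        cudaStream_t stream) {
  std::vector<float> host(count);
  cudaStreamSynchronize(stream);  // make sure the producing kernels have finished
  cudaMemcpy(host.data(), device_ptr, count * sizeof(float), cudaMemcpyDeviceToHost);
  for (std::size_t i = 0; i < count; ++i) {
    if (std::isnan(host[i]) || std::isinf(host[i])) {
      std::printf("%s: NaN/Inf at element %zu\n", tag, i);
      return true;
    }
  }
  return false;
}
```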
reinterpret_cast(inputs[param->in_idx++]) : nullptr; ++ invokeAddBiasGelu(reinterpret_cast(ws), ++ bias, ++ h_token_num, ++ inter_size, ++ param->stream); ++ gemm_dims[0] = param->hidden_size; ++ gemm_dims[1] = h_token_num; ++ gemm_dims[2] = inter_size; + gemm_lds[0] = param->hidden_size; -+ gemm_lds[1] = param->hidden_size; ++ gemm_lds[1] = inter_size; + gemm_lds[2] = param->hidden_size; -+ CublasGemmWrapper(reinterpret_cast(inputs[param->in_idx++]), -+ qkv_buf_3, -+ static_cast(output[0]), ++ fastertransformer::CublasGemmWrapper(inputs[param->in_idx++], ++ ws, ++ output[0], + gemm_dims, + gemm_lds, + gemm_ops, -+ const_cast(gemm_data_types), ++ gemm_data_types, + &alpha, + &beta, + param->cublas_handle, + param->algo); -+ if (param->projection_bias) { -+ int len = param->h_token_num; -+ invokeAddBias( -+ static_cast(output[0]), (const T*)(inputs[param->in_idx++]), len, param->hidden_size, param->stream); -+ } -+ return; +} + -+template void -+forward_attn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); -+template void -+forward_attn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); + +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(half* inputs[], int in_len, half* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(half* inputs[], int in_len, half* output[], int out_len, ParamT* param, void* ws); +template void -+forward_ffn(float* inputs[], int in_len, float* output[], int out_len, encoderParamT* param, void* ws); ++forward_ffn(float* inputs[], int in_len, float* output[], int out_len, ParamT* param, void* ws); +} // namespace fastertransformer -diff --git a/src/fastertransformer/layers/encoder_layers/encoder.h b/src/fastertransformer/layers/encoder_layers/encoder.h +diff --git a/src/fastertransformer/layers/ms_layers/ffn.h b/src/fastertransformer/layers/ms_layers/ffn.h new file mode 100644 -index 0000000..ffba081 +index 0000000..9498dc8 --- /dev/null -+++ b/src/fastertransformer/layers/encoder_layers/encoder.h -@@ -0,0 +1,49 @@ ++++ b/src/fastertransformer/layers/ms_layers/ffn.h +@@ -0,0 +1,14 @@ +#pragma once + +#include "src/fastertransformer/kernels/activation_kernels.h" -+#include "src/fastertransformer/layers/encoder_layers/BaseEncoderLayer.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include "src/fastertransformer/layers/ms_layers/param.h" ++ +#include +#include + +namespace fastertransformer { + -+typedef struct { -+ size_t batch_size; -+ size_t src_seq_len; -+ size_t tgt_seq_len; -+ size_t head_num; -+ size_t head_size; -+ size_t hidden_size; -+ size_t h_token_num; -+ size_t ffn_hidden_size; // 4 * param->hidden_size; ++template ++void forward_ffn(T* inputs[], int in_len, T* output[], int out_len, ParamT* param, void* ws); ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.cc b/src/fastertransformer/layers/ms_layers/gemm.cc +new file mode 100644 +index 0000000..aabafb7 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.cc +@@ -0,0 +1,117 @@ ++ ++#include "src/fastertransformer/layers/ms_layers/gemm.h" ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/kernels/unfused_attention_kernels.h" ++#include ++namespace fastertransformer { ++ ++void 
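`forward_ffn` above is two GEMMs with a fused bias-plus-GELU in between, `[tokens x hidden] -> [tokens x ffn_hidden] -> [tokens x hidden]`, and the whole block can run in fp16 (`ffn_fp16`) even when the surrounding layer computes in fp32. A row-major host reference of the same math, purely to make the shapes concrete; the erf-based GELU here is an assumption (the fused kernel may use the tanh approximation), and the bias is unconditional whereas the patch makes it optional via `has_bias`.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

using Mat = std::vector<float>;  // row-major, size rows * cols

// a: n x k, b: k x m, returns c: n x m
static Mat MatMul(const Mat& a, const Mat& b, std::size_t n, std::size_t k, std::size_t m) {
  Mat c(n * m, 0.f);
  for (std::size_t i = 0; i < n; ++i)
    for (std::size_t p = 0; p < k; ++p)
      for (std::size_t j = 0; j < m; ++j) c[i * m + j] += a[i * k + p] * b[p * m + j];
  return c;
}

static float Gelu(float x) { return 0.5f * x * (1.f + std::erf(x / std::sqrt(2.f))); }

// X[tokens x hidden] -> GELU(X * W1 + b1)[tokens x ffn_hidden] -> (* W2)[tokens x hidden]
Mat FfnReference(const Mat& x, const Mat& w1, const Mat& b1, const Mat& w2,
                 std::size_t tokens, std::size_t hidden, std::size_t ffn_hidden) {
  Mat inter = MatMul(x, w1, tokens, hidden, ffn_hidden);   // first GEMM
  for (std::size_t i = 0; i < tokens; ++i)                 // fused bias + GELU
    for (std::size_t j = 0; j < ffn_hidden; ++j)
      inter[i * ffn_hidden + j] = Gelu(inter[i * ffn_hidden + j] + b1[j]);
  return MatMul(inter, w2, tokens, ffn_hidden, hidden);    // second GEMM
}
```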
CublasGemmWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ cublasGemmEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ b_addr, ++ type_b, ++ ldb, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ compute_type, ++ algo); ++} ++ ++void CublasGemmStridedBatchedWrapper(const void* a_addr, ++ const void* b_addr, ++ void* c_addr, ++ const int* params, ++ const int* lds, ++ const cublasOperation_t* operations, ++ const int* strides, ++ const cudaDataType* data_types, ++ void* alpha, ++ void* beta, ++ int batch, ++ cublasHandle_t cublas_handle, ++ cublasGemmAlgo_t algo) ++{ ++ const int m = params[0]; ++ const int n = params[1]; ++ const int k = params[2]; ++ cublasOperation_t trans_a = operations[0]; ++ cublasOperation_t trans_b = operations[1]; ++ const int lda = lds[0]; ++ const int ldb = lds[1]; ++ const int ldc = lds[2]; ++ cudaDataType type_a = data_types[0]; ++ cudaDataType type_b = data_types[1]; ++ cudaDataType type_c = data_types[2]; ++ cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; ++ // cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F_FAST_16F; ++ ++ if ((type_a == CUDA_R_16F) && (type_b == CUDA_R_16F) && (type_c == CUDA_R_16F)) { ++ compute_type = CUBLAS_COMPUTE_16F; ++ } ++ const int stride_a = strides[0]; ++ const int stride_b = strides[1]; ++ const int stride_c = strides[2]; ++ cublasGemmStridedBatchedEx(cublas_handle, ++ trans_a, ++ trans_b, ++ m, ++ n, ++ k, ++ alpha, ++ a_addr, ++ type_a, ++ lda, ++ stride_a, ++ b_addr, ++ type_b, ++ ldb, ++ stride_b, ++ beta, ++ c_addr, ++ type_c, ++ ldc, ++ stride_c, ++ batch, ++ compute_type, ++ algo); ++} ++ ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/gemm.h b/src/fastertransformer/layers/ms_layers/gemm.h +new file mode 100644 +index 0000000..21dd35c +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/gemm.h +@@ -0,0 +1,13 @@ ++#pragma once ++ ++#include "src/fastertransformer/kernels/activation_kernels.h" ++#include "src/fastertransformer/layers/ms_layers/MSBaseLayer.h" ++#include ++#include ++ ++namespace fastertransformer { ++ ++void CublasGemmWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const cudaDataType* data_types, void* alpha, void* beta, cublasHandle_t cublas_handle, cublasGemmAlgo_t algo); ++void CublasGemmStridedBatchedWrapper(const void* a_addr, const void* b_addr, void* c_addr, const int* params, const int* lds, const cublasOperation_t* operations, const int* strides, const cudaDataType* data_types, void* alpha, void* beta, int batch, cublasHandle_t 
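`CublasGemmWrapper` above packs `(m, n, k)`, the leading dimensions, the transpose flags and the dtypes into small arrays and selects `CUBLAS_COMPUTE_16F` only when all three dtypes are fp16 (TF32 fast path otherwise). For reference, a direct `cublasGemmEx` call equivalent to the first FFN GEMM issued through the wrapper, showing how a row-major product maps onto cuBLAS's column-major convention; using fp32 accumulation over fp16 data is a choice of this sketch, not of the wrapper.

```cpp
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Row-major X[tokens x hidden] * W1[hidden x ffn] = Y[tokens x ffn] is expressed
// column-major as Y^T = W1^T * X^T, i.e. (m, n, k) = (ffn, tokens, hidden) with
// lds = (ffn, hidden, ffn) -- exactly the layout packed into gemm_dims / gemm_lds.
// Pointers are assumed to be device memory; error handling omitted for brevity.
inline cublasStatus_t FfnGemm1Fp16(cublasHandle_t handle, const __half* w1, const __half* x,
                                   __half* y, int tokens, int hidden, int ffn_hidden) {
  const float alpha = 1.0f, beta = 0.0f;
  return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      /*m=*/ffn_hidden, /*n=*/tokens, /*k=*/hidden,
                      &alpha,
                      w1, CUDA_R_16F, /*lda=*/ffn_hidden,
                      x,  CUDA_R_16F, /*ldb=*/hidden,
                      &beta,
                      y,  CUDA_R_16F, /*ldc=*/ffn_hidden,
                      CUBLAS_COMPUTE_32F,   // fp16 data, fp32 accumulation
                      CUBLAS_GEMM_DEFAULT);
}
```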
cublas_handle, cublasGemmAlgo_t algo); ++ ++} // namespace fastertransformer +diff --git a/src/fastertransformer/layers/ms_layers/param.h b/src/fastertransformer/layers/ms_layers/param.h +new file mode 100644 +index 0000000..09af694 +--- /dev/null ++++ b/src/fastertransformer/layers/ms_layers/param.h +@@ -0,0 +1,55 @@ ++#pragma once ++namespace fastertransformer { ++typedef struct{ ++ public: ++ size_t batch_size; ++ size_t src_seq_len; ++ size_t tgt_seq_len; ++ size_t head_num; ++ size_t head_size; ++ size_t hidden_size; ++ size_t h_token_num; ++ // handle ++ cublasHandle_t cublas_handle; ++ cudaStream_t stream; ++ cublasGemmAlgo_t algo; ++ size_t ffn_hidden_size; ++ // ctrls ++ int *padding_offset; ++ int in_idx; ++ bool has_bias; ++ ++} ParamT; ++ ++typedef struct : ParamT{ ++ bool qkv_bias; // ture ++ bool projection_bias; // ture ++ bool is_cross; // false ++ bool position_bias; ++ int *padding_offset; ++} attentionParamT; ++ ++typedef struct : ParamT{ ++ + bool ffn_fp16; + float eps1; + float eps2; -+ // handle -+ cublasHandle_t cublas_handle; -+ cudaStream_t stream; -+ cublasGemmAlgo_t algo; -+ // ctrls -+ int in_idx; -+ bool qkv_bias; // true -+ bool projection_bias; // true -+ bool is_cross; // false -+ bool position_bias; // false -+ bool layernorm_post; // dont care -+ bool eft; // false - effective fast trn ++ float eps3; ++ attentionParamT attn1; ++ attentionParamT attn2; ++ bool layernorm_post; ++ bool has_beta; + int *padding_offset; -+ int *d_sequence_length; -+} encoderParamT; ++} decoderParamT; + -+template -+size_t GetEncoderLayerWorkspaceSize(encoderParamT* param); ++typedef struct : ParamT{ + -+template -+size_t GetAttnWorkspaceSize(encoderParamT* param); -+template -+void forward_attn(T* inputs[], int in_len, T* output[], int out_len, encoderParamT* param, void* ws); -+template -+void forwardEncoder(void* inputs[], int in_len, void* output[], int out_len, encoderParamT* param, void* ws); -+// void forwardEncoder(std::vector > const* -+// inputs); -+} // namespace fastertransformer ++ bool ffn_fp16; ++ float eps1; ++ float eps2; ++ attentionParamT attn; ++ bool layernorm_post; ++ bool has_beta; ++ int *padding_offset; ++} encoderParamT; ++} +\ No newline at end of file diff --git a/src/fastertransformer/models/CMakeLists.txt b/src/fastertransformer/models/CMakeLists.txt index af33e76..97fc471 100644 --- a/src/fastertransformer/models/CMakeLists.txt diff --git a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py old mode 100644 new mode 100755 index bfc72ef3123496f5b70ccdd0b0a08d1d680bb915..286bc5b75d17d7003b70b68a045b6fa6a2e00bfe --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -24,7 +24,8 @@ __all__ = [ "MultiHeadAttentionX", "FeedForwardX", "TransformerEncoderLayerX", - "_LayerNormX" + "_LayerNormX", + "TransformerDecoderLayerX" ] @@ -44,7 +45,7 @@ class _LayerNormX(Cell): Tensor of shape :math:`(batch, seq_length, hidden_size)`. 
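`param.h` above layers `encoderParamT`/`decoderParamT` on top of a shared `ParamT` base and embeds the attention parameters by value (`attn`, `attn1`, `attn2`), which is what lets `forwardEncoder` hand `&(param->attn)` straight to `forward_attn`. A condensed sketch of that layout keeping only the control-flow fields; names are capitalized here, the handles (`cublasHandle_t`, `cudaStream_t`, algo), the duplicated `padding_offset` members and the workspace bookkeeping are omitted, and the defaults are illustrative.

```cpp
#include <cstddef>

struct ParamT {
  std::size_t batch_size, src_seq_len, tgt_seq_len;
  std::size_t head_num, head_size, hidden_size, ffn_hidden_size, h_token_num;
  int in_idx = 0;        // cursor into the flat inputs[] array
  bool has_bias = true;
};

struct AttentionParamT : ParamT {
  bool qkv_bias = true;
  bool projection_bias = true;
  bool is_cross = false;
  bool position_bias = false;
};

struct EncoderParamT : ParamT {
  bool ffn_fp16 = false;
  bool layernorm_post = false;
  bool has_beta = true;
  float eps1 = 1e-5f, eps2 = 1e-5f;
  AttentionParamT attn;   // passed as &param.attn to the attention forward
};

struct DecoderParamT : ParamT {
  bool ffn_fp16 = false;
  bool layernorm_post = false;
  bool has_beta = true;
  float eps1 = 1e-5f, eps2 = 1e-5f, eps3 = 1e-5f;
  AttentionParamT attn1;  // self-attention
  AttentionParamT attn2;  // cross-attention
};
```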
""" - def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=True): + def __init__(self, normalized_shape, eps=1e-4, param_init_type=mstype.float32, is_self_defined=False): super(_LayerNormX, self).__init__() if param_init_type not in [mstype.float32, mstype.float16]: raise TypeError("The type of parameter 'param_init_type' should in [float32, float16], " @@ -540,39 +541,39 @@ class MultiHeadAttentionX(Cell): # # key and value for current token(s) key_present = key value_present = value - # if self.use_past: - # # The first graph with the input size of (bs, seq_length) - # if self.is_first_iteration: - # # Get the valid input length without padding - # valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) - # # Cover the key and value numbers corresponding to the padding position - # key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) - # value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) - # # The second graph with the inpus size of (bs, 1) - # # the shape of query is (bs, num_heads, 1, size_per_head) - # # the shape of key is (bs, num_heads, size_per_head, 1) - # # the shape of value is (bs, num_heads, 1, size_per_head) - # else: - # # Get the current token position index - # valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), - # (F.shape(key_tensor)[0], 1, 1, - # self.src_seq_length), - # (1, 1, 1, 1)), - # 0), mstype.float32), (1, 2, 3)) - # valid_length = F.reshape(valid_length, (-1, 1, 1)) - # valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) - # # Pad the key and value to seq_length with only the position index not zero - # current_key = self.mul1(self.tile(key, (1, 1, 1, self.seq_length)), - # self.expand_dims(valid_length_vector, 2)) - # current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), - # self.expand_dims(valid_length_vector, 3)) - # # Concat the previous saved state and current state - # key = self.add(key_past, current_key) - # value = self.add(value_past, current_value) - # # Update key_present and value_present for state update - # key_present = key - # value_present = value - # attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) + if self.use_past: + # The first graph with the input size of (bs, seq_length) + if self.is_first_iteration: + # Get the valid input length without padding + valid_length_vector = F.cast(self.less(self.range, batch_valid_length.view(-1, 1, 1)), self.dtype) + # Cover the key and value numbers corresponding to the padding position + key_present = self.mul1(key, self.expand_dims(valid_length_vector, 2)) + value_present = self.mul1(value, self.expand_dims(valid_length_vector, 3)) + # The second graph with the inpus size of (bs, 1) + # the shape of query is (bs, num_heads, 1, size_per_head) + # the shape of key is (bs, num_heads, size_per_head, 1) + # the shape of value is (bs, num_heads, 1, size_per_head) + else: + # Get the current token position index + valid_length = self.reducesum(F.cast(self.not_equal(self.slice(key_past, (0, 0, 0, 0), + (F.shape(key_tensor)[0], 1, 1, + self.src_seq_length), + (1, 1, 1, 1)), + 0), mstype.float32), (1, 2, 3)) + valid_length = F.reshape(valid_length, (-1, 1, 1)) + valid_length_vector = F.cast(self.equal(valid_length, self.range), self.dtype) + # Pad the key and value to seq_length with only the position index not zero + current_key = self.mul1(self.tile(key, (1, 1, 1, 
self.seq_length)), + self.expand_dims(valid_length_vector, 2)) + current_value = self.mul1(self.tile(value, (1, 1, self.seq_length, 1)), + self.expand_dims(valid_length_vector, 3)) + # Concat the previous saved state and current state + key = self.add(key_past, current_key) + value = self.add(value_past, current_value) + # Update key_present and value_present for state update + key_present = key + value_present = value + attention_mask = F.reshape(self.attention_mask, (self.seq_length, self.seq_length, 1, 1)) layer_present = (key_present, value_present) # # multi head attention considering attention mask @@ -857,9 +858,9 @@ class FeedForwardX(Cell): param_init_type=mstype.float32, parallel_config=default_dpmp_config): super(FeedForwardX, self).__init__() - if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): - raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " - f"but got {hidden_act}.") + # if hidden_act is None or not (isinstance(hidden_act, str) or issubclass(hidden_act, nn.Cell)): + # raise TypeError(f"For FeedForward cell, the hidden_act should str type or nn.Cell type, " + # f"but got {hidden_act}.") if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): _check_config(parallel_config) mp = parallel_config.model_parallel @@ -886,7 +887,7 @@ class FeedForwardX(Cell): # Project to ffn_hidden_size self.mapping = _Linear(in_channels=input_size, out_channels=output_size, - activation=hidden_act, + activation=None, transpose_b=False, # expert_num=expert_num, # expert_group_size=expert_group_size, @@ -1309,6 +1310,7 @@ class TransformerEncoderLayerX(Cell): else: input_x = self.layernorm1(x) input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states key_reset = None value_reset = None @@ -1415,3 +1417,462 @@ class TransformerEncoderLayerX(Cell): _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) return True +class TransformerDecoderLayerX(Cell): + r""" + Transformer Decoder Layer. This is an implementation of the single layer of the transformer + decoder layer, including self-attention, cross attention and feedward layer. When the encoder_output is None, + the cross attention will not be effective. + + Args: + hidden_size(int): The hidden size of the input. + ffn_hidden_size(int): The hidden size of bottleneck in the feedforward layer. + num_heads(int): The number of the heads. + batch_size(int): The batch size of the input tensor. + src_seq_length(int): The input source sequence length. + tgt_seq_length(int): The input target sequence length. + attention_dropout_rate(float): The dropout rate of the attention scores. Default:0.1. + hidden_dropout_rate(float): The dropout rate of the final output of the layer. Default:0.1. + post_layernorm_residual(bool): Do residuals adds before the layernorm. Default False. + use_past(bool): Use the past state to compute, used for incremental prediction. Default False. + layernorm_compute_type(dtype.Number): The computation type of the layernorm. + Should be dtype.float32 or dtype.float16. Default dtype.float32. + softmax_compute_type(dtype.Number): The computation type of the softmax in the attention. + Should be dtype.float32 or dtype.float16. Default mstype.float32. + param_init_type(dtype.Number): The parameter initialization type of the module. + Should be dtype.float32 or dtype.float16. Default dtype.float32. 
+ hidden_act(str): The activation of the internal feedforward layer. Supports 'relu', + 'relu6', 'tanh', 'gelu', 'fast_gelu', 'elu', 'sigmoid', 'prelu', 'leakyrelu', 'hswish', + 'hsigmoid', 'logsigmoid' and so on. Default: gelu. + moe_config(MoEConfig): The configuration of MoE (Mixture of Expert). Default is an instance of MoEConfig + with default values. Please see `MoEConfig`. + parallel_config(OpParallelConfig, MoEParallelConfig): The parallel configure. When MoE is applied, + MoEParallelConfig is effective, otherwise OpParallelConfig is effective. Default `default_dpmp_config`, + an instance of `OpParallelConfig` with default args. + + Inputs: + - **hidden_stats** (Tensor) - The input tensor with shape [batch_size, tgt_seq_length, hidden_size] or + [batch_size * tgt_seq_length, hidden_size]. + - **decoder_mask** (Tensor) - The attention mask for decoder with shape [batch_size, src_seq_length, + seq_length]. + - **encoder_output** (Tensor) - The output of the encoder with shape [batch_size, seq_length, hidden_size] + or [batch_size * seq_length, hidden_size]. + Note this args can not be passed by None when the net is in outermost layer. Default None. + - **memory_mask** (Tensor) - The memory mask of the cross attention with shape [batch, tgt_seq_length, + src_seq_length] where tgt_seq_length is the length of the decoder. Note this args can not be passed by + None when the net is in outermost layer. Default None. + - **init_reset** (Tensor) - A bool tensor with shape [1], used to clear the past key parameter and + past value parameter used in the incremental prediction. Only valid when use_past is True. Default True. + - **batch_valid_length** (Tensor) - Int32 tensor with shape [batch_size] the past calculated the index. + Used for incremental prediction when the use_past is True. Default None. + + Outputs: + Tuple, a tuple contains(`output`, `layer_present`) + + - **output** (Tensor) - The output logit of this layer. The shape is [batch, seq_length, hidden_size] or + [batch * seq_length, hidden_size]. + - **layer_present** (Tuple) - A tuple, where each tuple is the tensor of the projected key and value + vector in self attention with shape ((batch_size, num_heads, size_per_head, tgt_seq_length), + (batch_size, num_heads, tgt_seq_length, size_per_head), and of the projected key and value vector + in cross attention with shape (batch_size, num_heads, size_per_head, src_seq_length), + (batch_size, num_heads, src_seq_length, size_per_head)). + + Supported Platforms: + ``Ascend`` ``GPU`` + + Examples: + >>> import numpy as np + >>> from mindspore import dtype as mstype + >>> from mindspore.nn.transformer import TransformerDecoderLayer + >>> from mindspore import Tensor + >>> model = TransformerDecoderLayer(batch_size=2, hidden_size=64, ffn_hidden_size=64, num_heads=2, + ... 
src_seq_length=20, tgt_seq_length=10) + >>> encoder_input_value = Tensor(np.ones((2, 20, 64)), mstype.float32) + >>> decoder_input_value = Tensor(np.ones((2, 10, 64)), mstype.float32) + >>> decoder_input_mask = Tensor(np.ones((2, 10, 10)), mstype.float16) + >>> memory_mask = Tensor(np.ones((2, 10, 20)), mstype.float16) + >>> output, past = model(decoder_input_value, decoder_input_mask, encoder_input_value, memory_mask) + >>> print(output.shape) + (2, 10, 64) + >>> print(past[0].shape) + (2, 2, 32, 10) + >>> print(past[1].shape) + (2, 2, 10, 32) + >>> print(past[2].shape) + (2, 2, 32, 20) + >>> print(past[3].shape) + (2, 2, 20, 32) + """ + # @_LogActionOnce(logger=logger, key='TransformerDecoderLayer', + # no_warning=_get_parallel_mode() in (ParallelMode.STAND_ALONE,)) + @_args_type_validator_check(batch_size=Validator.check_positive_int, + hidden_size=Validator.check_positive_int, + num_heads=Validator.check_positive_int, + ffn_hidden_size=Validator.check_positive_int, + src_seq_length=Validator.check_positive_int, + tgt_seq_length=Validator.check_positive_int, + attention_dropout_rate=Validator.check_non_negative_float, + hidden_dropout_rate=Validator.check_non_negative_float, + hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + post_layernorm_residual=Validator.check_bool, + layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + softmax_compute_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + param_init_type=_valid_value_checks([mstype.float32, mstype.float16], + "TransformerDecoderLayer"), + parallel_config=_valid_type_checks([OpParallelConfig, MoEParallelConfig], + "TransformerDecoderLayer"), + use_past=Validator.check_bool) + def __init__(self, hidden_size, + ffn_hidden_size, + num_heads, + batch_size, + src_seq_length, + tgt_seq_length, + attention_dropout_rate=0.1, + hidden_dropout_rate=0.1, + post_layernorm_residual=False, + use_past=False, + layernorm_compute_type=mstype.float32, + softmax_compute_type=mstype.float32, + param_init_type=mstype.float32, + hidden_act='gelu', + moe_config=default_moe_config, + parallel_config=default_dpmp_config): + super(TransformerDecoderLayerX, self).__init__() + _check_moe_config(moe_config, parallel_config) + self.use_moe = (moe_config.expert_num > 1) + config_to_attention = parallel_config.dpmp if self.use_moe else parallel_config + if _get_parallel_mode() in (ParallelMode.AUTO_PARALLEL,) and _is_sharding_propagation(): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." 
+ .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add() + self.add_3d = P.Add() + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = (batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + elif _get_parallel_mode() not in (ParallelMode.AUTO_PARALLEL,): + _check_config(parallel_config) + if num_heads % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'num_heads' must be divisibled by " + "'parallel_config.model_parallel', but got the num_heads is {} and " + "parallel_config.model_parallel is {}.".format(num_heads, + parallel_config.model_parallel)) + if hidden_size % parallel_config.model_parallel != 0: + raise 
ValueError( + "For 'TransformerDecoderLayer', the class variable 'hidden_size' must be divisibled by " + "'parallel_config.model_parallel', but got the hidden_size is {} and " + "parallel_config.model_parallel is {}." + .format(hidden_size, parallel_config.model_parallel)) + if ffn_hidden_size % parallel_config.model_parallel != 0: + raise ValueError("For 'TransformerDecoderLayer', the class variable 'ffn_hidden_size' must be " + "divisibled by 'parallel_config.model_parallel', but got the ffn_hidden_size is {} " + "and parallel_config.model_parallel is {}." + .format(ffn_hidden_size, parallel_config.model_parallel)) + if use_past: + raise ValueError(f"The {self.cls_name} does not support use_past=True.") + self.batch_size = batch_size + self.use_past = use_past + self.softmax_compute_type = softmax_compute_type + + self.src_seq_length = src_seq_length + self.tgt_seq_length = tgt_seq_length + self.use_past = use_past + self.hidden_size = hidden_size + + self.layernorm1 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) + self.layernorm2 = _LayerNormX((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm2.shard(((parallel_config.data_parallel, 1),)) + self.attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=tgt_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + use_past=use_past, + softmax_compute_type=softmax_compute_type, + param_init_type=param_init_type, + parallel_config=config_to_attention) + + # Cross attention with the output of encoder as memory tensor + self.cross_attention = MultiHeadAttentionX(hidden_size=hidden_size, + num_heads=num_heads, + batch_size=batch_size, + src_seq_length=tgt_seq_length, + tgt_seq_length=src_seq_length, + hidden_dropout_rate=hidden_dropout_rate, + attention_dropout_rate=attention_dropout_rate, + softmax_compute_type=softmax_compute_type, + use_past=use_past, + param_init_type=param_init_type, + parallel_config=config_to_attention) + self.cross_attention_layernorm = _LayerNormX((hidden_size,)).to_float( + layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) + + if self.use_moe: + self.output = MoE(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + param_init_type=param_init_type, + hidden_act=hidden_act, + moe_config=moe_config, + parallel_config=parallel_config) + else: + # Feed Forward Network, FFN + self.output = FeedForwardX(hidden_size=hidden_size, + dropout_rate=hidden_dropout_rate, + ffn_hidden_size=ffn_hidden_size, + hidden_act=hidden_act, + param_init_type=param_init_type, + parallel_config=parallel_config) + self.post_layernorm_residual = post_layernorm_residual + self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) + self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) + self.dtype = mstype.float16 + self.key_past = None + self.value_past = None + if self.use_past: + # operator used for state reuse + self.reducesum = P.ReduceSum().shard(((1, 1, 1, 1),)) + self.not_equal = P.NotEqual().shard(((1, 1, 1, 1), ())) + self.slice = P.StridedSlice().shard(((1, 1, 1, 1),)) + size_per_head = hidden_size // num_heads + self.key_shape = (batch_size, num_heads, size_per_head, tgt_seq_length) + self.value_shape = 
(batch_size, num_heads, tgt_seq_length, size_per_head) + # parameters saving key and value states + self.key_past = Parameter(Tensor(np.zeros(shape=self.key_shape), self.dtype), name="key_past") + self.value_past = Parameter(Tensor(np.zeros(shape=self.value_shape), self.dtype), name="value_past") + self.tile = P.Tile().shard(((1, 1),)) + self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) + self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) + else: + raise RuntimeError(f"The {self.cls_name} only support sharding propagation or " + f"semi-auto parallel mode now.") + + def construct(self, hidden_stats, + decoder_mask, + encoder_output=None, + memory_mask=None, + init_reset=True, batch_valid_length=None): + self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) + # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] + hidden_shape = F.shape(hidden_stats) + hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) + input_x = self.layernorm1(hidden_stats) + + input_x = F.cast(input_x, self.dtype) + # indicate whether reset saved states + key_reset = None + value_reset = None + if self.use_past: + # reset states, init_reset True for reuse and False for reset + key_reset = self.assign(self.key_past, self.mul(self.key_past, F.cast(init_reset, self.dtype))) + value_reset = self.assign(self.value_past, self.mul(self.value_past, F.cast(init_reset, self.dtype))) + # add dependency for desired execution order + input_x = F.depend(input_x, key_reset) + input_x = F.depend(input_x, value_reset) + + attention, layer_present = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, + self.value_past, batch_valid_length) + # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm + if self.post_layernorm_residual: + x = self.add(input_x, attention) + # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer + else: + x = self.add(hidden_stats, attention) + middle_output = None + cross_attn_output = None + if encoder_output is not None: + middle_output = self.cross_attention_layernorm(x) + middle_output = F.cast(middle_output, self.dtype) + encoder_output = F.cast(encoder_output, self.dtype) + cross_attn_output, cross_layer_present = self.cross_attention(middle_output, encoder_output, + encoder_output, + memory_mask, self.key_past, + self.value_past, batch_valid_length) + layer_present += cross_layer_present + if self.post_layernorm_residual: + x = self.add(middle_output, cross_attn_output) + else: + x = self.add(x, cross_attn_output) + + output_x = self.layernorm2(x) + output_x = F.cast(output_x, self.dtype) + aux_loss = None + if self.use_moe: + mlp_logit, aux_loss = self.output(output_x) + else: + mlp_logit = self.output(output_x) + # return mlp_logit + + value_update = None + key_update = None + if self.use_past: + # current key and value + key_present, value_present = layer_present + # update key and value calculated this step + key_update = self.assign(self.key_past, key_present) + value_update = self.assign(self.value_past, value_present) + # add dependency for desired execution order + key_update = F.depend(key_update, key_reset) + value_update = F.depend(value_update, value_reset) + + # add dependency for desired execution order + mlp_logit = F.depend(mlp_logit, value_update) + mlp_logit = F.depend(mlp_logit, key_update) + + # if shape is 3d, we reshape the inputs of the add + if len(hidden_shape) 
== 3: + output_x = P.Reshape()(output_x, hidden_shape) + mlp_logit = P.Reshape()(mlp_logit, hidden_shape) + x = P.Reshape()(x, hidden_shape) + + if self.post_layernorm_residual: + output = self.add_3d(output_x, mlp_logit) + else: + output = self.add_3d(x, mlp_logit) + else: + if self.post_layernorm_residual: + output = self.add(output_x, mlp_logit) + else: + output = self.add(x, mlp_logit) + output = F.reshape(output, hidden_shape) + + if self.use_moe: + return output#, layer_present, aux_loss + return output#, layer_present + + def _check_input(self, hidden_states, attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): + r"""Check inputs""" + if not self.use_past or (self.use_past and self.is_first_iteration): + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [[self.batch_size, self.tgt_seq_length, self.hidden_size], + [self.batch_size * self.tgt_seq_length, self.hidden_size]]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.tgt_seq_length]) + + else: + _check_shape_equal(F.shape(hidden_states), "hidden_states", self.cls_name, + [self.batch_size, 1, self.hidden_size]) + _check_shape_equal(F.shape(attention_mask), "attention_mask", self.cls_name, + [self.batch_size, 1, self.tgt_seq_length]) + _check_input_dtype(F.dtype(hidden_states), "hidden_states", [mstype.float32, mstype.float16], self.cls_name) + _check_input_dtype(F.dtype(attention_mask), "attention_mask", [mstype.float32, mstype.float16], self.cls_name) + if encoder_output is not None: + _check_shape_equal(F.shape(encoder_output), "encoder_output", self.cls_name, + [[self.batch_size, self.src_seq_length, self.hidden_size], + [self.batch_size * self.src_seq_length, self.hidden_size]]) + _check_input_dtype(F.dtype(encoder_output), "encoder_output", + [mstype.float32, mstype.float16], self.cls_name) + if memory_mask is not None: + _check_shape_equal(F.shape(memory_mask), "memory_mask", self.cls_name, + [self.batch_size, self.tgt_seq_length, self.src_seq_length]) + _check_input_dtype(F.dtype(memory_mask), "memory_mask", + [mstype.float32, mstype.float16], self.cls_name) + + init_reset_is_tensor = isinstance(init_reset, Tensor) + init_reset_is_default = init_reset is True + batch_valid_length_is_tensor = isinstance(batch_valid_length, Tensor) + batch_is_default = batch_valid_length is None + _check_past_none_input_none(self.use_past, "init_reset", self.cls_name, True, init_reset_is_tensor, + init_reset_is_default) + _check_past_none_input_none(self.use_past, "batch_valid_length", self.cls_name, None, + batch_valid_length_is_tensor, batch_is_default) + + if self.use_past: + _check_shape_equal(F.shape(init_reset), "init_reset", self.cls_name, [1]) + _check_input_dtype(F.dtype(init_reset), "init_reset", [mstype.bool_], self.cls_name) + _check_shape_equal(F.shape(batch_valid_length), "batch_valid_length", self.cls_name, [self.batch_size]) + _check_input_dtype(F.dtype(batch_valid_length), "batch_valid_length", [mstype.int32], self.cls_name) + return True + + diff --git a/trc/transformer/T5/transformer.py b/trc/transformer/T5/transformer.py index a369b9d0aeddd4c9f8d0d8330dd993fbd2320c38..84d11f1298dda18a2b79553243b344d6907f2629 100644 --- a/trc/transformer/T5/transformer.py +++ b/trc/transformer/T5/transformer.py @@ -407,7 +407,7 @@ class FeedForward(Cell): @_args_type_validator_check(hidden_size=Validator.check_positive_int, ffn_hidden_size=Validator.check_positive_int, 
dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "FeedForward"), + # hidden_act=_valid_type_checks([str], "FeedForward"), param_init_type=_valid_value_checks([mstype.float32, mstype.float16], "FeedForward"), parallel_config=_valid_type_checks([OpParallelConfig], @@ -415,13 +415,14 @@ class FeedForward(Cell): def __init__(self, hidden_size, ffn_hidden_size, dropout_rate, - hidden_act='gelu', + hidden_act=None, has_bias=True, expert_num=1, param_init_type=mstype.float32, parallel_config=default_dpmp_config): super(FeedForward, self).__init__() _check_config(parallel_config) + self.dtype = param_init_type dp = parallel_config.data_parallel mp = parallel_config.model_parallel if ffn_hidden_size % mp != 0: @@ -480,7 +481,7 @@ class FeedForward(Cell): def construct(self, x): _check_input_shape(F.shape(x), "x", self.cls_name, [2, 3]) _check_input_dtype(F.dtype(x), "x", [mstype.float32, mstype.float16], self.cls_name) - x = self.cast(x, mstype.float16) + x = self.cast(x, self.dtype) # returned shape is [bs, seq_length, ffn_hidden_size] or [bs * seq_length, ffn_hidden_size] hidden = self.mapping(x) output = self.projection(hidden) @@ -794,7 +795,6 @@ class MultiHeadAttention(Cell): tgt_seq_length, hidden_size, num_heads, - # app, hidden_dropout_rate=0.1, attention_dropout_rate=0.1, compute_dtype=mstype.float32, @@ -804,7 +804,8 @@ class MultiHeadAttention(Cell): use_past=False, is_decoder=False, has_relative_attention_bias=False, - parallel_config=default_dpmp_config): + parallel_config=default_dpmp_config, + num_outputs=1): super(MultiHeadAttention, self).__init__() _check_config(parallel_config) self.is_parallel_mode = _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) @@ -812,7 +813,7 @@ class MultiHeadAttention(Cell): self.tgt_seq_length = tgt_seq_length self.hidden_size = hidden_size self.batch_size = batch_size - # self.app=app + self.num_outputs = num_outputs if hidden_dropout_rate < 0 or hidden_dropout_rate >= 1: raise ValueError("For 'MultiHeadAttention', the class variable 'hidden_dropout_rate' must be " "in range [0, 1.0), but got the value : {}.".format(hidden_dropout_rate)) @@ -1017,11 +1018,11 @@ class MultiHeadAttention(Cell): output = self.projection(attention) output = self.dropout(output) output = F.reshape(output, ori_shape) - # if self.app=="trc": - # return output, layer_present, position_bias + if self.num_outputs==1: + return output + return output, layer_present, position_bias # else: - # return output - return output + # return output def _check_inputs(self, query_tensor, key_tensor, value_tensor, attention_mask, key_past=None, value_past=None, batch_valid_length=None): @@ -1322,7 +1323,7 @@ class TransformerEncoderLayer(Cell): seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerEncoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerEncoderLayer"), @@ -1388,7 +1389,8 @@ class TransformerEncoderLayer(Cell): use_past=use_past, is_decoder=False, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) if self.use_moe 
is True: @@ -1430,7 +1432,7 @@ class TransformerEncoderLayer(Cell): self.mul = P.Mul().shard(((1, 1, 1, 1), (1,))) self.assign = P.Assign().shard(((1, 1, 1, 1), (1, 1, 1, 1))) - def construct(self, x, input_mask, init_reset=True, batch_valid_length=None, position_bias=None): + def construct(self, x, input_mask, position_bias=None, init_reset=True, batch_valid_length=None): self._check_input(x, input_mask, init_reset, batch_valid_length) x_shape = F.shape(x) x = F.reshape(x, (-1, x_shape[-1])) @@ -1449,8 +1451,8 @@ class TransformerEncoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, - self.key_past, self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, input_mask, position_bias, + self.key_past, self.value_past, batch_valid_length) # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) @@ -1465,6 +1467,7 @@ class TransformerEncoderLayer(Cell): mlp_logit, aux_loss = self.output(output_x) else: mlp_logit = self.output(output_x) + # return mlp_logit value_update = None key_update = None @@ -1500,8 +1503,8 @@ class TransformerEncoderLayer(Cell): output = F.reshape(output, x_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias def _check_input(self, x, input_mask, init_reset, batch_valid_length): r"""Check inputs""" @@ -1628,7 +1631,7 @@ class TransformerDecoderLayer(Cell): tgt_seq_length=Validator.check_positive_int, attention_dropout_rate=Validator.check_non_negative_float, hidden_dropout_rate=Validator.check_non_negative_float, - hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), + # hidden_act=_valid_type_checks([str], "TransformerDecoderLayer"), post_layernorm_residual=Validator.check_bool, layernorm_compute_type=_valid_value_checks([mstype.float32, mstype.float16], "TransformerDecoderLayer"), @@ -1685,6 +1688,7 @@ class TransformerDecoderLayer(Cell): self.hidden_size = hidden_size self.layernorm1 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.layernorm1.shard(((parallel_config.data_parallel, 1),)) self.layernorm2 = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) self.layernorm2.shard(((parallel_config.data_parallel, 1),)) @@ -1702,7 +1706,8 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) # Cross attention with the output of encoder as memory tensor self.cross_attention = MultiHeadAttention(hidden_size=hidden_size, num_heads=num_heads, @@ -1717,8 +1722,10 @@ class TransformerDecoderLayer(Cell): param_init_type=param_init_type, is_decoder=True, has_relative_attention_bias=has_relative_attention_bias, - parallel_config=parallel_config) + parallel_config=parallel_config, + num_outputs=3) self.cross_attention_layernorm = T5LayerNorm((hidden_size,)).to_float(layernorm_compute_type) + self.cross_attention_layernorm.shard(((parallel_config.data_parallel, 1),)) _check_moe_config(moe_config, parallel_config) self.use_moe = (moe_config.expert_num > 1) @@ -1742,7 +1749,7 @@ class 
TransformerDecoderLayer(Cell): self.post_layernorm_residual = post_layernorm_residual self.add = P.Add().shard(((parallel_config.data_parallel, 1), (parallel_config.data_parallel, 1))) self.add_3d = P.Add().shard(((parallel_config.data_parallel, 1, 1), (parallel_config.data_parallel, 1, 1))) - self.dtype = mstype.float16 + self.dtype = mstype.float32 self.key_past = None self.value_past = None if self.use_past: @@ -1764,15 +1771,16 @@ class TransformerDecoderLayer(Cell): decoder_mask, encoder_output=None, memory_mask=None, - init_reset=True, batch_valid_length=None, - position_bias=None, encoder_decoder_position_bias=None): - #self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) + position_bias=None, encoder_decoder_position_bias=None, + init_reset=True, batch_valid_length=None): + # self._check_input(hidden_stats, decoder_mask, encoder_output, memory_mask, init_reset, batch_valid_length) # the returned shape is [bs, seq_length, embedding_size] or [bs * seq_length, embedding_size] hidden_shape = F.shape(hidden_stats) hidden_stats = F.reshape(hidden_stats, (-1, hidden_shape[-1])) input_x = self.layernorm1(hidden_stats) input_x = F.cast(input_x, self.dtype) - + init_reset = True + batch_valid_length=None # indicate whether reset saved states key_reset = None value_reset = None @@ -1784,23 +1792,25 @@ class TransformerDecoderLayer(Cell): input_x = F.depend(input_x, key_reset) input_x = F.depend(input_x, value_reset) - attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, self.key_past, - self.value_past, batch_valid_length, position_bias) + attention, layer_present, position_bias = self.attention(input_x, input_x, input_x, decoder_mask, position_bias, self.key_past, + self.value_past, batch_valid_length) + # return attention # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) # For pre-layernorm the inputs for residual path are output of self-attention and input of this layer else: x = self.add(hidden_stats, attention) - middle_output = None + cross_attn_output = None if encoder_output is not None: middle_output = self.cross_attention_layernorm(x) middle_output = F.cast(middle_output, self.dtype) cross_attn_output, cross_layer_present, encoder_decoder_position_bias = self.cross_attention(middle_output, encoder_output, encoder_output, - memory_mask, self.key_past, - self.value_past, batch_valid_length, encoder_decoder_position_bias) + memory_mask, encoder_decoder_position_bias, self.key_past, + self.value_past, batch_valid_length) + # return cross_attn_output layer_present += cross_layer_present if self.post_layernorm_residual: x = self.add(middle_output, cross_attn_output) @@ -1849,8 +1859,8 @@ class TransformerDecoderLayer(Cell): output = F.reshape(output, hidden_shape) if self.use_moe is True: - return output, layer_present, aux_loss - return output, layer_present, position_bias, encoder_decoder_position_bias + return output#, layer_present, aux_loss + return output#, layer_present, position_bias, encoder_decoder_position_bias def _check_input(self, hidden_states, attention_mask, encoder_output, memory_mask, init_reset, batch_valid_length): r"""Check inputs""" diff --git a/trc/transformer/build.sh b/trc/transformer/build.sh index e8b8e2e2c65c93b5f149f2a6698572a2e0a0f67a..72ddec2dc44955d8594f3c199179bc1d22030ac6 100755 --- a/trc/transformer/build.sh +++ b/trc/transformer/build.sh 
@@ -13,7 +13,7 @@ MSLITE_ENABLE_TESTCASES=off \ MSLITE_ENABLE_GPU=on \ MSLITE_MINDDATA_IMPLEMENT=full \ MSLITE_GPU_BACKEND=tensorrt \ -MSLITE_GPU_ARCH=60 \ +MSLITE_GPU_ARCH=80 \ TENSORRT_PATH=/usr/lib/x86_64-linux-gnu \ MSLIBS_SERVER=localHost \ ${base}/build.sh -I x86_64 $@ diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 2f318d6c2a4a39ce9bb3e5f0f468558cb8bc4503..8f66f66f642dcf01e140a25443f2b329a3994fad 100755 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,3 +1,3 @@ [gpu_context] -input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128] +input_shape=input_ids:[1,20];token_type_ids:[1,20];input_mask:[1,20] diff --git a/trc/transformer/convert_fp32.sh b/trc/transformer/convert_fp32.sh index c0bb22e19b83340b212a144a5fdebe23ac146925..95c5bdcf6c48a5c1603b883865acd089d51e5eab 100755 --- a/trc/transformer/convert_fp32.sh +++ b/trc/transformer/convert_fp32.sh @@ -1,3 +1,4 @@ +#!/bin/bash base=`git rev-parse --show-toplevel` version=$(cat ${base}/version.txt) fusion=true @@ -9,8 +10,8 @@ while getopts "n" opt ; do echo "Unknown option ${opt}!" ;; esac done -if [ "${fusion}" = "true" ]; then - optimize="--optimizeTransformer=true" +if [ "${fusion}" = "false" ]; then + optimize="--optimizeTransformer=false" fi shift $(($OPTIND - 1)) file_name=$(basename $1) @@ -41,6 +42,5 @@ ${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linux-x64/t --modelFile=$1 \ --outputFile=${base}/trc/transformer/convv_${file_name} \ --configFile=${base}/trc/transformer/t.config \ - --encryption=false \ - ${optimize} + --encryption=false fi diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py index a3cba44dd0cbea0b85c3591e61aa2c1caf038a62..cc0cfcebbb1a84e39f3683fa5b2211d455e8a1fd 100755 --- a/trc/transformer/ftBench.py +++ b/trc/transformer/ftBench.py @@ -13,13 +13,13 @@ system = f'{base}/trc/system_test/release/ubuntu_x86/mindspore-lite-{version}-li benchmark = f'{system}/tools/benchmark' work_dir=f'{base}/trc/transformer' image = "private_transformer:0.1" -server = "10.10.10.174" +server = "caspi" enable_fp16 = "false" suffix="fp32" usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1' app='ch' act='be' -cuda_visible_dev=3 +cuda_visible_dev=6 loop_count=1 if len(sys.argv)>2 or len(sys.argv)==1: parameters=sys.argv[1:] @@ -53,20 +53,26 @@ for i in range(len(parameters)) : print('loop count=',loop_count) inputs_file = open("models.txt") models_arg = inputs_file.readlines() -# import subprocess def find_output_name(ms_model, output_file): - output_name = os.popen(f"../readers/flatbuf/readfb {ms_model} -O").read() - print(output_name) - output_name = output_name[:-1] - print(output_name) + os.system(f"../readers/flatbuf/readfb {ms_model} > readmodel.txt") + file = open('readmodel.txt', 'r') + lines = file.readlines() + file.close() + line_of_output = [i for i,s in enumerate(lines) if "outputs:#" in s][0] + outputs = lines[line_of_output+1].split() + outputs_name=[] + for out in outputs: + output = [i for i,s in enumerate(lines) if "tensor #"+out in s][0] + output_name = lines[output+2].split()[2] + outputs_name.append(output_name) with open(output_file, 'r') as file: data = file.read() - for i,out in enumerate(output_name.split()): - print(out) - data = data.replace('output'+str(i+1), out) + for i,out1 in enumerate(outputs_name): + data = data.replace('output'+str(i+1), out1) with open(output_file, 'w') as 
file: file.write(data) - print(output_name) + print(outputs_name) +numcount=0 for line_model_arg in models_arg: if line_model_arg[0] == '#' or line_model_arg == '\n': continue line_model_arg=line_model_arg[:-1] @@ -92,13 +98,12 @@ for line_model_arg in models_arg: if batch_size!='1': model_name+=batch_size os.system(f"rm -f {base}/trc/transformer/{model_name}* {base}/trc/transformer/convv_{model_name}*") + os.system(f"cp /home/batya/git-proj/transformer_repo/transformer/models/t5/T5Transformer.py .") ret = os.system(f"docker run --user \"$(id -u):$(id -g)\" -w {base}/trc/transformer --runtime=nvidia -v {base}/../:{base}/../ -v /opt/share:/opt/share --privileged=true {image} python {base}/trc/transformer/train_transformer_export.py {line_model_arg} " ) ret=0 if ret != 0: exit() input_files='' output_file='' - # os.system(f"./convert_fp32.sh {model_name}_fwd.mindir") - # find_output_name(f'convv_{model_name}_fwd.mindir', f'{model_name}_output.txt') if app=='ch': ret=0 if act == 'be': @@ -113,19 +118,24 @@ for line_model_arg in models_arg: os.system('./trc/release.sh x86') os.system(f"cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}" ) else: - - with open(f'cfg_{model_name}.config','w') as f: - if model_name == 'bert': - f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") - elif model_name == 'transformer_encoder_layer': - f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") + if model_name in ['bert','transformer_encoder_layer']: + with open(f'cfg_{model_name}.config','w') as f: + if model_name == 'bert': + f.write(f"[gpu_context]\ninput_shape=input_ids:[{batch_size},{seq}];token_type_ids:[{batch_size},{seq}];input_mask:[{batch_size},{seq}]") + elif model_name == 'transformer_encoder_layer': + f.write(f"[gpu_context]\ninput_shape=x:[{batch_size},{seq},{hidden_size}];input_mask:[{batch_size},{seq},{seq}]") os.system(f"ssh {server} 'rm -f {system}/../mindspore-lite-{version}-linux-x64.tar.gz {work_dir}/*{model_name}*'") os.system(f"ssh {server} 'mkdir -p {benchmark}'") os.system(f"rsync -v {system}/../mindspore-lite-{version}-linux-x64.tar.gz {server}:{system}/..") os.system(f"ssh {server} 'cd {system}/.. 
&& tar -xzf {system}/../mindspore-lite-{version}-linux-x64.tar.gz'") os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/trc/transformer/") os.system(f"./deploy.sh convv_{model_name}_fwd.mindir") - #os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + os.system(f"./deploy.sh convv_{model_name}_fwd.mindir run") + # os.system(f"ssh {server} 'cd {benchmark} && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={system}/runtime/lib:{system}/tools/converter/lib ./benchmark {benchmark_args}'" ) + # os.system(f"mkdir {base}/trc/transformer/{model_name}{numcount}") + # os.system(f"cp {base}/trc/transformer/{model_name}* {base}/trc/transformer/{model_name}{numcount}/") + numcount+=1 + elif app=='trc': #if loop count =1 app=be else app = runtime @@ -140,9 +150,9 @@ for line_model_arg in models_arg: else: print("run trc caspi") print("line model arg=", line_model_arg) - os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/{model_name}*'") + os.system(f"ssh {server} 'rm -f {base}/../FasterTransformer/build/bin/ms_benchmark {base}/../FasterTransformer/build/bin/*{model_name}*'") os.system(f"rsync -v {base}/../FasterTransformer/build/bin/ms_benchmark {server}:{base}/../FasterTransformer/build/bin/ms_benchmark" ) - os.system(f"rsync -v {base}/trc/transformer/{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) + os.system(f"rsync -v {base}/trc/transformer/*{model_name}* {server}:{base}/../FasterTransformer/build/bin" ) os.system(f'rsync -v {base}/../FasterTransformer/build/lib/libtransformer-shared.so caspi:{base}/../FasterTransformer/build/lib/.') os.system(f"ssh {server} 'cd {base}/../FasterTransformer/build/bin && CUDA_VISIBLE_DEVICES={cuda_visible_dev} LD_LIBRARY_PATH={base}/../FasterTransformer:/usr/local/cuda-11.7/lib64 ./ms_benchmark {line_model_arg}' " ) diff --git a/trc/transformer/get_output_by_mindir.py b/trc/transformer/get_output_by_mindir.py old mode 100644 new mode 100755 diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt index 41bbc1106f85786ba64d14b0e891aa08e625b9ca..b69028b155b878826aae6f5042efd36ed0908c41 100755 --- a/trc/transformer/models.txt +++ b/trc/transformer/models.txt @@ -1,28 +1,77 @@ +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 + + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer #run the following tests before push -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 + +-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross +-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross +-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 +-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross + #-b 1 -l 12 
-H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 32 -l 12 -H 12 -S 768 -s 128 -P 0 -f 3072 -m bert -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -f 3072 -m bert + +#-b 1 -l 3 -H 12 -S 768 -s 128 -m T5 + +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5 +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5 + +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer +-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer + #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m mha_x1 +#-b 1 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 8 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 16 -l 24 -H 16 -S 1024 -s 128 -P 0 -m bert +#-b 32 -l 24 -H 16 -S 1024 -s 128 -P 1 -m bert +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_encoder_layer #-b 1 -l 66 -s 1 -H 8 -S 512 -p 0 -m mha_x1 #-b 3 -l 66 -s 20 -H 3 -S 15 -p -m mha_x2 #-b 3 -l 66 -s 20 -t 40 -H 3 -S 15 -p 0 -m mha_x1 #-b 1 -l 66 -s 128 -H 4 -S 1024 -p 0 -m mha_x1 -#-b 1 -l 6 -s 128 -H 8 -S 1024 -m T5 +#-b 1 -l 2 -s 12 -t 12 -H 2 -S 4 -m transformer #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -f 3072 -P 1 -m transformer_encoder_layer -#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer -#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_encoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_encoder_layer_t5 +# +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 0 -m transformer_decoder_layer +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_decoder_layer_t5 +#-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -x 0 -m transformer_decoder_layer_t5 +#-b 8 -l 12 -H 12 -S 768 -s 128 -P 1 -x 1 -m transformer_encoder_layer_t5 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer #-b 1 -l 12 -H 4 -S 512 -s 128 -P 1 -f 3072 -m transformer_encoder_layer #-b 4 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer @@ -39,7 +88,7 @@ #-b 1 -l 66 -s 20 -t 30 -H 3 -S 15 -p 0 -m mha_cross #-b 1 -l 66 -s 
20 -H 4 -S 768 -p 0 -m mha_T5 #-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross --b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer +#-b 1 -l 12 -H 12 -S 768 -s 128 -P 1 -m transformer_encoder_layer #-b 8 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer #-b 16 -l 16 -H 8 -S 1024 -s 64 -P 1 -f 1024 -m transformer_encoder_layer #-b 32 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer @@ -49,10 +98,10 @@ #-b 8 -l 12 -H 4 -S 512 -s 64 -m bert # ----------------------------------------------------------------------------- -#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P False -m transformer_encoder_layer -#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert -#-b 1 -l 12 -H 2 -S 8 -s 20 -m T5 +#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert +#-b 1 -l 2 -H 2 -S 2 -s 128 -m T5 +#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P 1 -m bert #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert @@ -60,7 +109,7 @@ #-b 32 -l 12 -H 12 -S 768 -s 128 -m bert #num=12 head_size=64 -#-b 16 -l 12 -H 16 -S 512 -s 128 -m bert +#-b 16 -l 12 -H 12 -S 768 -s 128 -m bert #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert #-b 1 -l 12 -H 12 -S 768 -s 64 -m bert #-b 1 -l 24 -H 16 -S 1024 -s 128 -m bert @@ -68,7 +117,7 @@ #-b 32 -l 24 -H 16 -S 1024 -s 128 -m bert #-b 8 -l 24 -H 16 -S 1024 -s 128 -m bert # -s 64 128 512 1024 -# -b 1 16 32 +#-b 1 16 32 ##-s 128 -H 4 8 -S 1024 2048 #-b 1 -l 66 -s 20 -H 3 -S 15 -p 0 -m test #-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m test -T fp32 -W fp32 -F fp32 @@ -81,24 +130,17 @@ #-b 1 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 #-b 8 -l 66 -s 256 -H 12 -S 768 -t 512 -p 0 -m mha_cross -T fp16 #-b 8 -l 66 -s 9 -H 12 -S 768 -t 21 -p 0 -m mha_cross -# -T fp16 -#-b 1 -l 2 -s 128 -H 8 -S 1024 -m T5 -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -T fp16 -W fp32 -F fp32 -#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5 -#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross - -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 - -#-b 1 -l 66 -s 2 -H 96 -S 12288 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 1 -l 66 -s 128 -t 64 -H 12 -S 768 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 +#-b 1 -l 1 -s 20 -H 2 -S 4 -m bert +# T5 tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m T5 +# +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m T5 +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m T5 -#-b 1 -l 66 -s 128 -t 64 -H 12 -S 768 -p 0 -m mha_cross -#-b 5 -l 66 -s 256 -H 4 -S 1024 -p 0 -m mha_x1 -T fp32 -W fp32 -F fp32 -#-b 8 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 256 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 -#-b 8 -l 66 -s 64 -H 8 -S 512 -p 0 -m mha_x1 -T fp16 -#-b 1 -l 66 -s 256 -H 12 -S 768 -p 0 -m mha_x1 -T fp16 +# transformer tests +#-b 1 -l 6 -s 128 -t 128 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer +#-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer +#-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer diff --git a/trc/transformer/t.config b/trc/transformer/t.config index d4450ed345c54537a2c402e91c33a785af8ab5f1..0fad133d432b210e3d49d70c6a36f480ff877951 100755 --- a/trc/transformer/t.config +++ b/trc/transformer/t.config @@ -1,3 +1,5 @@ [registry] 
-#fusion_blacklists="MultiHeadAttentionFusion","EncoderLayerFusion" - +#fusion_blacklists="MultiHeadAttentionFusion" +#fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion" +#fusion_blacklists="DecoderLayerFusion" +#fusion_blacklists="EncoderLayerFusion" diff --git a/trc/transformer/test_tr.py b/trc/transformer/test_tr.py old mode 100644 new mode 100755 diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py index af10b53c392a146b936ea7ce71de2b2c27ad596b..275aeaf906b1813febedeab1edcbf3a60bc7a342 100755 --- a/trc/transformer/train_transformer_export.py +++ b/trc/transformer/train_transformer_export.py @@ -11,7 +11,8 @@ model_zoo_path=os.environ['CLOUD_MODEL_ZOO'] sys.path.append(model_zoo_path) sys.path.append("../../../transformer/transformer/models") sys.path.append("./T5") -from MultiHeadTester import MultiHeadAttentionX,TransformerEncoderLayerX,FeedForwardX +from MultiHeadTester import MultiHeadAttentionX, TransformerDecoderLayerX,TransformerEncoderLayerX,FeedForwardX +import T5Transformer as T from mindspore.common.parameter import Parameter from mindspore.common.initializer import Tensor import mindspore as M @@ -53,8 +54,6 @@ M.context.set_context(mode=M.context.GRAPH_MODE,device_target="GPU", save_graphs # # y = model(encoder_input_value, encoder_input_mask)# _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask) # for i in range (2): # y = model(encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask) - -# # print("y=", y) # export(model, encoder_input_value, encoder_input_mask, decoder_input_value, decoder_input_mask, memory_mask, file_name= name + "_fwd", file_format='MINDIR') def get_gpu_memory(): @@ -106,6 +105,7 @@ eps2=1e-6 post_layernorm=True ffn_hidden_size=-1 app="ch" +ffn_fp16 = False compress = False def read_args(): global batch @@ -124,6 +124,7 @@ def read_args(): global in_type global w_type global app + global ffn_fp16 global compress print("sys argv = ", sys.argv) for i in range(len(sys.argv)) : @@ -213,7 +214,14 @@ def read_args(): print("error: illegal compute type {}".format(sys.argv[i + 1]) ) else: app = sys.argv[i + 1] - print("app=",app) + elif sys.argv[i] == '-x': + if sys.argv[i + 1] not in ["0", "1"]: + print("error: illegal compute type {}".format(sys.argv[i + 1]) ) + else: + if sys.argv[i + 1]=='0': + ffn_fp16 = False + elif sys.argv[i + 1]=='1': + ffn_fp16 = True elif sys.argv[i] == '-c': if sys.argv[i+1] == 'true': compress = True @@ -259,7 +267,6 @@ def calc_seq_lengths(batch, max_seq_length,th): return seq_len def transformer_encoder_layer_create(): - post_layernorm=False name = "transformer_encoder_layer" if (post_layernorm): model = TransformerEncoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, @@ -320,11 +327,11 @@ def transformer_encoder_layer_create(): saveT(bp, name + "_weight6.fp" + suffix) saveT(gl2, name + "_weight7.fp" + suffix) saveT(bl2, name + "_weight8.fp" + suffix) - if app == 'trc': + if ffn_fp16 == True: saveTensorToHalf(omw, name + "_weight9.fp" + "16") saveTensorToHalf(omb, name + "_weight10.fp" + "16") saveTensorToHalf(opw, name + "_weight11.fp" + "16") - elif app == 'ch': + else: saveT(omw, name + "_weight9.fp" + suffix) saveT(omb, name + "_weight10.fp" + suffix) saveT(opw, name + "_weight11.fp" + suffix) @@ -343,11 +350,260 @@ def transformer_encoder_layer_create(): y = pruneTensor(y,seq_len,1) saveCalib(out_name, np.array(y), f_y) print("y.shape",np.array(y).shape) + 
f_y.close() # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims elif app=="trc": saveT(y, name + "_output1.fp" + suffix) +def transformer_encoder_layer_t5_create(): + name = "transformer_encoder_layer_t5" + if (post_layernorm): + print("post_layernorm") + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, post_layernorm_residual=True, has_bias=False, hidden_act='relu') + else: + model = T5_TF.TransformerEncoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, seq_length=seq, + num_heads=head_num, has_bias=False, hidden_act='relu') + encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32) + # encoder_input_mask = M.Tensor(np.zeros((batch, seq, seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + omw = model.output.mapping.weight + opw = model.output.projection.weight + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(encoder_input_value, name + "_input1.fp" + suffix) + saveT(encoder_input_mask, name + "_input2.fp" + suffix) + saveT(pos, name + "_input3.fp" + suffix) + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + if ffn_fp16 == True: + saveTensorToHalf(omw, name + "_weight5.fp" + "16") + saveTensorToHalf(opw, name + "_weight6.fp" + "16") + else: + saveT(omw, name + "_weight5.fp" + suffix) + saveT(opw, name + "_weight6.fp" + suffix) + _cell_graph_executor.compile(model, + encoder_input_value, + encoder_input_mask,pos) + y = model(encoder_input_value, encoder_input_mask,pos) + print('name=',name) + export(model, encoder_input_value, encoder_input_mask,pos, file_name= name + "_fwd", file_format='MINDIR') + # if app=="ch": + f_y=open(f'./{name}_output.txt','w') + out_name='output1' + print("name output:",out_name) + saveCalib(out_name, np.array(y), f_y) + print("y.shape",np.array(y).shape) + # saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims + f_y.close() + # elif app=="trc": + saveT(y, name + "_output1.fp" + suffix) + + +def transformer_decoder_layer_t5_create(): + name = "transformer_decoder_layer_t5" + if (post_layernorm): + print("post_layernorm true") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True, use_past=False, has_bias=False, hidden_act="relu") + else: + print("post_layernorm false") + model = T5_TF.TransformerDecoderLayer(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num,use_past=False, has_bias=False, hidden_act="relu") + hidden_stats = M.Tensor(np.random.normal(0., 
0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + encoder_pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32) + actual_seq = seq // 2 + if compress: + input_value = hidden_stats.asnumpy() + input_value[:,actual_seq:,:] = 0 + hidden_stats = M.Tensor.from_numpy(input_value) + decoder_input_mask_value = decoder_mask.asnumpy() + decoder_input_mask_value[:,:,actual_seq:] = 0 + decoder_mask = M.Tensor.from_numpy(decoder_input_mask_value) + encoder_output_value = encoder_output.asnumpy() + encoder_output_value[:,:,actual_seq:] = 0 + encoder_output = M.Tensor.from_numpy(encoder_output_value) + memory_mask_value = memory_mask.asnumpy() + memory_mask_value[:,:,actual_seq:] = 0 + memory_mask = M.Tensor.from_numpy(memory_mask_value) + pos_value = pos.asnumpy() + pos_value[:,:,actual_seq:] = 0 + pos = M.Tensor.from_numpy(pos_value) + encoder_pos_value = encoder_pos.asnumpy() + encoder_pos_value[:,:,actual_seq:] = 0 + encoder_pos = M.Tensor.from_numpy(encoder_pos_value) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + wp = model.attention.projection.weight + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 2xhid_size x hid_size + w2 = w2.transpose() # hid_size x 2xhid_size + wt2 = M.Tensor(w2, w_compute_type) + wp2 = model.cross_attention.projection.weight + omw = model.output.mapping.weight + print('omw.asnumpy().shape',omw.asnumpy().shape) + opw = model.output.projection.weight + + gl1 = model.layernorm1.weight + gl2 = model.layernorm2.weight + gl3 = model.cross_attention_layernorm.weight + + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(wt, name + "_weight2.fp" + suffix) + saveT(wp, name + "_weight3.fp" + suffix) + saveT(gl2, name + "_weight4.fp" + suffix) + saveT(qt2, name + "_weight5.fp" + suffix) + saveT(wt2, name + "_weight6.fp" + suffix) + saveT(wp2, name + "_weight7.fp" + suffix) + saveT(gl3, name + "_weight8.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight9.fp" + "16") + saveTensorToHalf(opw, name + "_weight10.fp" + "16") + else: + saveT(omw, name + "_weight9.fp" + suffix) + saveT(opw, name + "_weight10.fp" + suffix) + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + saveT(pos, name + "_input5.fp" + suffix) + saveT(encoder_pos, name + "_input6.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask, pos, encoder_pos) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask , pos, encoder_pos) + export(model, hidden_stats, decoder_mask, 
encoder_output, memory_mask, pos, encoder_pos, file_name= name + "_fwd", file_format='MINDIR') + if compress: + y_num = y.asnumpy() + y_num[:,actual_seq:,:] = 0 + y = M.Tensor.from_numpy(y_num) + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + +def transformer_decoder_layer_create(): + name = "transformer_decoder_layer" + if (post_layernorm): + print("post_layernorm true") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num, post_layernorm_residual=True) + else: + print("post_layernorm false") + model = TransformerDecoderLayerX(batch_size=batch, hidden_size=hid_size, ffn_hidden_size=ffn_hidden_size, src_seq_length=seq, + tgt_seq_length=tgt_seq_len,num_heads=head_num) + hidden_stats = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len, hid_size)), M.float32) + decoder_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32) + encoder_output = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32) + memory_mask = M.Tensor(np.random.normal(0., 0.5, (batch, tgt_seq_len,seq)), M.float32) + q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size + k = model.attention.dense2.weight.asnumpy()#.transpose() + v = model.attention.dense3.weight.asnumpy()#.transpose() + + w = np.concatenate((q, k, v)) # 3xhid_size x hid_size + w = w.transpose() # hid_size x 3xhid_size + wt = M.Tensor(w, w_compute_type) + bq = model.attention.dense1.bias.asnumpy() + bk = model.attention.dense2.bias.asnumpy() + bv = model.attention.dense3.bias.asnumpy() + bw = np.concatenate((bq, bk, bv)) #(3xhid) X 1 + bt =M.Tensor(bw, w_compute_type) + wp = model.attention.projection.weight + bp = model.attention.projection.bias + + qt2 = model.cross_attention.dense1.weight#.transpose() # hid_size x hid_size + k2 = model.cross_attention.dense2.weight.asnumpy()#.transpose() + v2 = model.cross_attention.dense3.weight.asnumpy()#.transpose() + + w2 = np.concatenate((k2, v2)) # 3xhid_size x hid_size + w2 = w2.transpose() # hid_size x 3xhid_size + wt2 = M.Tensor(w2, w_compute_type) + bq2 = model.cross_attention.dense1.bias.asnumpy() + bk2 = model.cross_attention.dense2.bias.asnumpy() + bv2 = model.cross_attention.dense3.bias.asnumpy() + bw2 = np.concatenate((bq2, bk2, bv2)) #(3xhid) X 1 + bt2 =M.Tensor(bw2, w_compute_type) + wp2 = model.cross_attention.projection.weight + bp2 = model.cross_attention.projection.bias + omw = model.output.mapping.weight + opw = model.output.projection.weight + omb = model.output.mapping.bias + opb = model.output.projection.bias + + gl1 = model.layernorm1.gamma + bl1 = model.layernorm1.beta + gl2 = model.layernorm2.gamma + bl2 = model.layernorm2.beta + gl3 = model.cross_attention_layernorm.gamma + bl3 = model.cross_attention_layernorm.beta + suffix = str(compute_type) + suffix = suffix[-2:] + saveT(hidden_stats, name + "_input1.fp" + suffix) + saveT(decoder_mask, name + "_input2.fp" + suffix) + saveT(encoder_output, name + "_input3.fp" + suffix) + saveT(memory_mask, name + "_input4.fp" + suffix) + + saveT(gl1, name + "_weight1.fp" + suffix) + saveT(bl1, name + "_weight2.fp" + suffix) + saveT(wt, name + "_weight3.fp" + suffix) + saveT(bt, name + "_weight4.fp" + suffix) + saveT(wp, name + "_weight5.fp" + suffix) + saveT(bp, name + "_weight6.fp" + suffix) + saveT(gl2, name + "_weight7.fp" + suffix) + saveT(bl2, name + "_weight8.fp" + suffix) 
+ saveT(qt2, name + "_weight9.fp" + suffix) + saveT(wt2, name + "_weight10.fp" + suffix) + saveT(bt2, name + "_weight11.fp" + suffix) + saveT(wp2, name + "_weight12.fp" + suffix) + saveT(bp2, name + "_weight13.fp" + suffix) + saveT(gl3, name + "_weight14.fp" + suffix) + saveT(bl3, name + "_weight15.fp" + suffix) + if(ffn_fp16): + saveTensorToHalf(omw, name + "_weight16.fp" + "16") + saveTensorToHalf(omb, name + "_weight17.fp" + "16") + saveTensorToHalf(opw, name + "_weight18.fp" + "16") + else: + saveT(omw, name + "_weight16.fp" + suffix) + saveT(omb, name + "_weight17.fp" + suffix) + saveT(opw, name + "_weight18.fp" + suffix) + saveT(opb, name + "_weight19.fp" + suffix) + _cell_graph_executor.compile(model, hidden_stats, decoder_mask, encoder_output, memory_mask) + y = model(hidden_stats, decoder_mask, encoder_output, memory_mask) + export(model, hidden_stats, decoder_mask, encoder_output, memory_mask, file_name= name + "_fwd", file_format='MINDIR') + f_y=open(f'./{name}_output.txt','w') + saveCalib("output1", np.array(y), f_y)#2 dims + f_y.close() + saveT(y, name + "_output1.fp" + suffix) + def build_transformer_encoder_layer_post_ture(): model = TransformerEncoderLayer(batch_size=2, seq_length=16, @@ -424,8 +680,8 @@ def mha_x1_create(): softmax_compute_type=s_compute_type, app=app ) - q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size k = model.dense2.weight.asnumpy()#.transpose() + q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size v = model.dense3.weight.asnumpy()#.transpose() w = np.concatenate((q, k, v)) # 3xhid_size x hid_size w = w.transpose() # hid_size x 3xhid_size @@ -678,6 +934,19 @@ def T5_create(): name = "T5" str=" " os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def vit_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "vit" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) +def transformer_create(): + repo = git.Repo('.', search_parent_directories=True) + base = repo.working_tree_dir + name = "transformer" + str=" " + os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " ) + def mha_T5_create(): # M.context.set_auto_parallel_context(parallel_mode=M.ParallelMode.SEMI_AUTO_PARALLEL) M.context.set_context(mode=M.context.PYNATIVE_MODE) @@ -692,8 +961,7 @@ def mha_T5_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) print('compute_type',compute_type) q = model.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size @@ -736,7 +1004,7 @@ def mha_T5_create(): # print(y.shape) # if app=="ch": f_y=open(f'./{name}_output.txt','w') - saveCalib('Default/projection-_Linear/MatMul-op58', np.array(y), f_y)#2 dims + saveCalib('output1', np.array(y), f_y)#2 dims # elif app=="trc": saveT(y, name + "_output1.fp" + suffix) # tmp = y[1][0].asnumpy().transpose(0, 1, 3, 2) @@ -759,8 +1027,7 @@ def mha_T5_cross_create(): compute_dtype=compute_type, param_init_type=w_compute_type, softmax_compute_type=s_compute_type, - has_bias=False, - app=app + has_bias=False ) qt = model.dense1.weight @@ -830,6 +1097,7 @@ def main(): for i in range(len(sys.argv)): if sys.argv[i]=='-m': model_name=sys.argv[i+1] + print("%s_create()" % model_name) eval("%s_create()" % model_name) if __name__ == "__main__":
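For reference, the naming convention that the new builders above rely on can be summarized in a small stand-alone sketch: a models.txt entry such as "-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5" is forwarded to train_transformer_export.py, and the value of "-m" selects the builder named "<model>_create()" via eval. The sketch below is illustrative only and is not part of the patch; it assumes nothing beyond the argv handling visible in the diff.

# Illustrative sketch (not part of the patch): how the -m <name> flag is mapped to a
# builder called <name>_create(), mirroring main() in train_transformer_export.py.
import sys

def transformer_decoder_layer_t5_create():
    # Hypothetical stand-in for the real builder added by this patch.
    print("would build and export transformer_decoder_layer_t5")

def main():
    model_name = None
    for i in range(len(sys.argv)):
        if sys.argv[i] == '-m':
            model_name = sys.argv[i + 1]
    if model_name is not None:
        # Same dispatch convention as the patch: eval("%s_create()" % model_name).
        eval("%s_create()" % model_name)

if __name__ == "__main__":
    # Example invocation: python sketch.py -m transformer_decoder_layer_t5
    main()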